% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/LogReg.R
\name{LogReg}
\alias{LogReg}
\title{Fast supervised classifier with m/z subsetting and optional sampling}
\usage{
LogReg(
  X,
  moz,
  Y,
  number = 2,
  repeats = 2,
  Metric = c("Kappa", "Accuracy", "F1", "AdjRankIndex", "MatthewsCorrelation"),
  kind = "linear",
  Sampling = c("no", "up", "down", "smote"),
  ncores = max(1L, parallel::detectCores() - 1L),
  num.trees = 500L,
  tuneLength = 5L,
  seed = 123L
)
}
\arguments{
\item{X}{Numeric matrix or data.frame with samples in rows and features (m/z) in columns.
Column names must be numeric (or coercible), e.g., "1234.567" or "mz_1234.567".
Non-finite values are set to 0.}

\item{moz}{Numeric vector of m/z values to keep. Only columns of \code{X} whose
numeric names match values in \code{moz} are used. An error is thrown if none match.}

\item{Y}{Factor (or coercible) of class labels; length must equal nrow(X).}

\item{number}{Integer; number of CV folds (k). Default 2.}

\item{repeats}{Integer; number of CV repeats. Default 2.}

\item{Metric}{Character; selection metric. One of "Kappa", "Accuracy", "F1",
"AdjRankIndex", "MatthewsCorrelation". For non-caret metrics, custom summary
functions are used.}

\item{kind}{Character; model type. One of "linear" (multinom), "nnet" (nnet),
"rf" (random forest), "svm" (svmLinear2), "xgb" (xgbTree). Default "linear".}

\item{Sampling}{Character; class-balancing strategy. One of "no", "up", "down",
"smote". For "smote", the function smote_classif(Y ~ ., data.frame(Y, X))
is used before training. For "up"/"down", caret's in-fold sampling is used.}

\item{ncores}{Integer; number of CPU cores to use for caret's parallel backend
(doParallel). Default is all but one core. Ignored if doParallel is unavailable.}

\item{num.trees}{Integer; number of trees for random forests (ranger engine). Default 500.
Used when kind = "rf" and either the caret "ranger" fallback is used or
the caret-free LogReg_rf_fast is available.}

\item{tuneLength}{Integer; size of the hyperparameter search (caret-based models).
Default 5 (compact grid).}

\item{seed}{Integer; random seed for reproducibility. Default 123.}
}
\value{
A list with:
\itemize{
\item train_mod: the fitted model (caret::train object) or, if kind = "rf" and
LogReg_rf_fast is available, the structure returned by LogReg_rf_fast
(contains the final ranger model and CV details).
\item boxplot: ggplot object of resampling metric distributions (caret paths) or
the boxplot returned by LogReg_rf_fast.
\item Confusion.Matrix: caret::confusionMatrix on the fitted model (caret paths) or
the confusion matrix returned by LogReg_rf_fast.
\item stats_global: data.frame summarizing per-fold metrics (Metric, Mean, Sd) for
caret paths; from LogReg_rf_fast otherwise.
}
}
\description{
Trains a multiclass classifier on a subset of m/z features using cross-validation.
For kind = "rf", it automatically delegates to a ranger-based algorithm
(LogReg_rf_fast) when available for maximum speed and parallelism; otherwise it
uses the caret R package with method = "ranger" as a fast fallback. Other kinds ("linear",
"nnet", "svm", "xgb") are trained via caret with compact grids and optional
parallelization. Features (columns) are selected by matching their numeric
column names to \code{moz}. Optional class balancing (up-sampling, down-sampling, or SMOTE) can be applied.
}
\details{
\itemize{
\item Feature subsetting: \code{X} is subset to columns whose numeric names match \code{moz}.
This avoids expensive joins/transposes and guarantees stable feature order.
\item Random forests: if the function LogReg_rf_fast is available in the namespace
(see its documentation), this function delegates the "rf" case to it for
maximum speed and Windows-friendly parallel CV. Otherwise, it uses caret with
method = "ranger" (still fast and parallelizable).
\item Sampling: "smote" is applied once, before training; "up"/"down" are applied
in-fold by caret via trainControl(sampling = ...). "no" leaves the data unchanged.
\item Parallelism: if ncores > 1 and doParallel is installed, a PSOCK cluster is
registered for caret. The fast RF engine (LogReg_rf_fast) internally handles
fold-level parallelism and ranger threading to avoid oversubscription.
}
}
\examples{
\dontrun{
set.seed(1)
X <- matrix(runif(2000), nrow = 100, ncol = 20)
colnames(X) <- as.character(round(seq(1000, 1190, length.out = 20), 4))
moz <- as.numeric(colnames(X))[seq(1, 20, by = 2)]
Y <- factor(sample(letters[1:3], 100, replace = TRUE))

# Fast RF (delegates to LogReg_rf_fast if available; else caret + ranger)
fit_rf <- LogReg(X, moz, Y, number = 3, repeats = 1, kind = "rf",
                 Metric = "Kappa", Sampling = "no", ncores = 4,
                 num.trees = 300, seed = 42)
fit_rf$Confusion.Matrix

# Linear (multinom) with macro F1 metric
fit_lin <- LogReg(X, moz, Y, number = 3, repeats = 1, kind = "linear",
                  Metric = "F1", Sampling = "no", ncores = 2)
fit_lin$stats_global
}

}
\seealso{
LogReg_rf_fast, ranger::ranger, caret::train, caret::confusionMatrix
}
