\name{eNetXplorer}
\alias{eNetXplorer}
\title{Generates a family of elastic net models for different values of alpha}
\description{
Elastic net uses a mixing parameter \code{alpha} to tune the penalty term continuously from ridge (\code{alpha=0}) to lasso (\code{alpha=1}). \code{eNetXplorer} generates a family of elastic net models over different values of \code{alpha} for the quantitative exploration of the effects of shrinkage. For each \code{alpha}, the regularization parameter \code{lambda} is chosen by optimizing a quality function based on out-of-bag cross-validation predictions. Statistical significance of each model, as well as that of individual features within a model, 
is assigned by comparison to a set of null models generated by random permutations of the response. \code{eNetXplorer} fits linear (gaussian), logistic (binomial) and multinomial models. 

}
\usage{
eNetXplorer(x, y, family=c("gaussian","binomial","multinomial"), 
alpha=seq(0,1,by=0.2), nlambda=100, nlambda.ext=NULL, seed=NULL, scaled=T, 
n_fold=5, n_run=100, n_perm_null=25, QF.FUN=NULL, QF_label=NULL, 
cor_method=c("pearson","kendall","spearman"), fold_distrib_fail.max=100, \dots)
}
\arguments{
  \item{x}{Input numerical matrix with instances as rows and features as columns. Instance and feature labels should be provided as row and column names, respectively. Can be in sparse matrix format (inherit from class \code{"sparseMatrix"} as in package \code{Matrix}). Cannot handle missing values.}
  \item{y}{Response variable. For \code{family="gaussian"}, numerical vector. For \code{family=}
  \code{"binomial"}, factor with two levels. For \code{family="multinomial"}, factor with two or more levels. For categorical families, if a vector is supplied, it will be coerced into a factor.}
\item{family}{Response type: \code{"gaussian"} (numerical), \code{"binomial"} (2-level factor), or\cr
\code{"multinomial"} (factor with >=2 levels).}
\item{alpha}{Sequence of values for the mixing parameter of the penalty term in the elastic net family. Default is \code{seq(0,1,by=0.2)}.}
\item{nlambda}{Number of values for 
the regularization parameter \code{lambda}. Default is 100. Irrespective of \code{nlambda}, the range of \code{lambda} values is assigned by \code{glmnet}.}
\item{nlambda.ext}{If set to a value larger than \code{nlambda}, this will be the number of values for \code{lambda} obtained by extending the range assigned by \code{glmnet} symmetrically while keeping the \code{lambda} density uniform in log scale. Default is \code{NULL}, which will not extend the range of \code{lambda} assigned by \code{glmnet}.}
\item{seed}{Sets the pseudo-random number seed to enforce reproducibility. Default is \code{NULL}.}
\item{scaled}{Z-score transformation of individual features across all instances. Default is \code{TRUE}.}
\item{n_fold}{Number of cross-validation folds per run. \code{lambda} is chosen based on the maximization of a quality function on out-of-bag instances averaged over all runs. Default is 5.}
\item{n_run}{Number of runs; for each run, instances are randomly assigned to cross-validation folds. Default is 100.}
\item{n_perm_null}{Number of random null-model permutations of the response per run. Default is 25.}
\item{QF.FUN}{User-defined quality function as maximization criterion to select \code{lambda} based on response vs out-of-bag predicted instances. For \code{family="gaussian"}, default is correlation; for \code{family="binomial"}, it is accuracy; for \code{family=}\code{"multinomial"}, it is average accuracy.}
\item{QF_label}{Label for user-defined quality function, if \code{QF.FUN} is provided.}
\item{cor_method}{For \code{family="gaussian"}, correlation method to be used in the default quality function \code{cor.test}. Default is \code{"pearson"}.}
\item{fold_distrib_fail.max}{For categorical models, maximum number of failed attempts per run to have all classes represented in each in-bag fold. If this number is exceeded, the execution is halted; try again with larger \code{n_fold}, by removing/reassigning classes of small size, and/or with larger \code{fold_distrib_fail.max}. Default is 100.}
  \item{\dots}{Accepts parameters from \code{glmnet.control(\dots)} to allow changes of factory default parameters in \code{glmnet}. If not explicitly set, it will use factory defaults.}
}
\details{
For each \code{alpha}, a set of \code{nlambda} values is 
obtained using the full data; if provided, \code{nlambda.ext} 
allows extending the range of \code{lambda} values symmetrically while keeping the \code{lambda} density uniform in log scale. Using these 
values of \code{lambda}, elastic net cross-validation models are generated for \code{n_run} random assignments of instances among \code{n_fold} folds; the best \code{lambda} is determined
by the maximization of a quality function that compares out-of-bag predictions against the response. User-defined quality functions can be provided via \code{QF.FUN}, otherwise sensible defaults are used (e.g. correlation for gaussian models).
For each run, using the same assignment of instances into folds, \code{n_perm_null} null models are generated by shuffling the response. By using the quality function to compare the out-of-bag performance of the model to that of the null models, 
an empirical significance p-value is assigned to the model.
Similar procedures are used to obtain p-values for individual features based on absolute coefficient magnitude and on the frequency of non-zero coefficients. 
A family of elastic net models is thus generated for multiple 
values of \code{alpha} spanning the range from   
ridge (\code{alpha=0}) to lasso (\code{alpha=1}). This function 
returns an \code{eNetXplorer} object on which summary, plotting 
and export functions in this package can be applied for further 
analysis. 
For details about the underlying elastic net models, please refer to the \code{glmnet} package and references therein.
}
\value{
An object with S3 class \code{"eNetXplorer"}.
\item{predictor}{Predictor matrix used for regression (in sparse matrix format).}  
\item{response}{Response variable used for regression.}
\item{family}{Input parameter.}
\item{alpha}{Input parameter.}
\item{nlambda}{Input parameter.}
\item{nlambda.ext}{Input parameter.}
\item{seed}{Input parameter.}
\item{scaled}{Input parameter.}
\item{n_fold}{Input parameter.}
\item{n_run}{Input parameter.}
\item{n_perm_null}{Input parameter.}
\item{QF_label}{Input parameter.}
\item{cor_method}{Input parameter.}
\item{fold_distrib_fail.max}{Input parameter.}
\item{instance}{Instance labels.}
\item{feature}{Feature labels.}
\item{glmnet_params}{\code{glmnet} parameters used for regression.}
\item{best_lambda}{\code{lambda} values chosen by cross-validation.}
\item{model_QF_est}{Quality function values obtained by cross-validation.}
\item{QF_model_vs_null_pval}{P-value from model vs null comparison to assess statistical significance.}
\item{lambda_values}{List of \code{lambda} values used for each \code{alpha}.}
\item{lambda_QF_est}{List of quality function values obtained for each \code{alpha}.}
\item{predicted_values}{List of out-of-bag predicted values for each \code{alpha}; rows are instances and columns are median/mad predictions (for linear regression) or class predictions (for binomial and multinomial regression).}
\item{feature_coef_wmean}{Mean of feature coefficients (over runs) weighted by non-zero frequency (over folds) in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
\item{feature_coef_wsd}{Standard deviation of feature coefficients (over runs) weighted by non-zero frequency (over folds) in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
\item{feature_freq_mean}{Mean of non-zero frequency in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
\item{feature_freq_sd}{Standard deviation of non-zero frequency in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
\item{null_feature_coef_wmean}{Analogous to \code{feature_coef_wmean} for null model permutations.}
\item{null_feature_coef_wsd}{Analogous to \code{feature_coef_wsd} for null model permutations.}
\item{null_feature_freq_mean}{Analogous to \code{feature_freq_mean} for null model permutations.}
\item{null_feature_freq_sd}{Analogous to \code{feature_freq_sd} for null model permutations.}
\item{feature_coef_model_vs_null_pval}{P-value from model vs null comparison to assess statistical significance of mean non-zero feature coefficients in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
\item{feature_freq_model_vs_null_pval}{P-value from model vs null comparison to assess statistical significance of mean non-zero feature frequencies in sparse matrix format, with features as rows and \code{alpha} values as columns. For multinomial regression, it is a list of matrices (one matrix for each class).}
}
\author{Julian Candia and John S. Tsang\cr 
Maintainer: Julian Candia \email{julian.candia@nih.gov}}
\references{Candia J and Tsang JS (2018)
  \emph{eNetXplorer: an R package for the quantitative exploration of elastic net families 
  for generalized linear models}, bioRxiv 305870 (under review).
}
\seealso{\code{\link{summary}}, \code{\link{plot}}, \code{\link{summaryPDF}}, \code{\link{export}}
}
\examples{
\dontshow{
set.seed(123)
fit = eNetXplorer(x=matrix(rnorm(75),ncol=3),y=rnorm(25),family="gaussian",
n_run=5,n_perm_null=4,alpha=c(0.5,1))
}
\donttest{
# Linear models (synthetic dataset comprised of 20 features and 75 instances):
data(QuickStartEx)
fit = eNetXplorer(x=QuickStartEx$predictor, y=QuickStartEx$response,
family="gaussian", n_run=20, n_perm_null=10, seed=111)
}
\donttest{
# Linear models to predict numerical day-70 H1N1 serum titers based on 
# day-7 cell population frequencies:
data(H1N1_Flow)
fit = eNetXplorer(x=H1N1_Flow$predictor_day7, y=H1N1_Flow$response_numer[rownames(
H1N1_Flow$predictor_day7)], family="gaussian", n_run=25, n_perm_null=15, seed=111)
}
\donttest{
# Binomial models to predict acute myeloid (AML) vs acute lymphoblastic (ALL) 
# leukemias: 
data(Leukemia_miR)
fit = eNetXplorer(x=Leukemia_miR$predictor, y=Leukemia_miR$response_binomial, 
family="binomial", n_run=25, n_perm_null=15, seed=111)
}
\donttest{
# Multinomial models to predict acute myeloid (AML), acute B-cell lymphoblastic 
# (B-ALL) and acute T-cell lymphoblastic (T-ALL) leukemias:
data(Leukemia_miR)
fit = eNetXplorer(x=Leukemia_miR$predictor, y=Leukemia_miR$response_multinomial,
family="multinomial", n_run=25, n_perm_null=15, seed=111)
}
\donttest{
# Binomial models to predict B-ALL vs T-ALL:
data(Leukemia_miR)
fit = eNetXplorer(x=Leukemia_miR$predictor[Leukemia_miR$response_multinomial!="AML",],
y=Leukemia_miR$response_multinomial[Leukemia_miR$response_multinomial!="AML"], 
family="binomial", n_run=25, n_perm_null=15, seed=111)
}
}

