################################
#### Principal components regression for binary and poisson regression
#### Selection of the number of principal components
#### via K-fold cross validation
#### Tsagris Michail 1/2016
#### mtsagris@yahoo.gr
################################
glmpcr.tune <- function(y, x, M = 10, maxk = 10, mat = NULL, ncores = 1, graph = TRUE) {
  ## y is the UNIVARIATE dependent variable
  ## y is either a binary variable (binary logistic regression)
  ## or a discrete variable (Poisson regression)
  ## x contains the independent variables
  ## fraction denotes the percentage of observations
  ## to be used as the test set
  ## the 1-fraction proportion of the data will be the training set
  ## R is the number of cross validations
  ## if ncores==1, then 1 processor is used, otherwise more are
  ## used (parallel computing)
  n <- dim(x)[1]
  p <- dim(x)[2]
  if ( maxk > p ) maxk <- p  ## just a check

  if ( is.null(mat) ) {
    nu <- sample(1:n, min( n, round(n / M) * M ) )
    ## It may be the case this new nu is not exactly the same
    ## as the one specified by the user
    ## to a matrix a warning message should appear
    options(warn = -1)
    mat <- matrix( nu, ncol = M )
  } else  mat <- mat

  M <- dim(mat)[2]
  msp <- matrix( nrow = M, ncol = maxk )
  ## deigma will contain the positions of the test set
  ## this is stored but not showed in the end
  ## the user can access it though by running
  ## the commands outside this function
  if ( length( Rfast::sort_unique(y) ) == 2 ) {
    oiko <- "binomial"
  } else oiko <- "poisson"

  if (ncores == 1) {
    runtime <- proc.time()
    for (vim in 1:M) {
      ytest <- y[mat[, vim] ]   ## test set dependent vars
      ytrain <- y[-mat[, vim] ]   ## train set dependent vars
      xtrain <- x[-mat[, vim], , drop = FALSE]   ## train set independent vars
      xtest <- x[mat[, vim], , drop = FALSE ]  ## test set independent vars
	    vec <- prcomp(xtrain, center = FALSE)$rotation
      z <- xtrain %*% vec  ## PCA scores

      for ( j in 1:maxk) {
        if (oiko == "binomial") {
          be <- Rfast::glm_logistic(z[, 1:j], ytrain)$be
        } else {
          be <- Rfast::glm_poisson(z[, 1:j], ytrain)$be
        }
        ztest <- xtest %*% vec[, 1:j, drop = FALSE]  ## PCA scores
        es <- as.vector( ztest %*% be[-1] ) + be[1]

        if (oiko == "binomial") {
          est <- as.vector(  exp(es) / ( 1 + exp(es) )  )
          ri <-  -2 *( ytest * log(est) + (1 - ytest) * log(1 - est) )
        } else {
          est <- as.vector( exp(es) )
          ri <- 2 * ytest * log(ytest / est)
        }
        msp[vim, j] <- sum( ri, na.rm = TRUE )
      }
    }
    runtime <- proc.time() - runtime

  } else {
    runtime <- proc.time()
    cl <- makePSOCKcluster(ncores)
    registerDoParallel(cl)
    er <- numeric(maxk)
    msp <- foreach(vim = 1:M, .combine = rbind, .packages = "Rfast", .export = c("glm_logistic", "glm_poisson") ) %dopar% {
      ytest <- y[mat[, vim] ]  ## test set dependent vars
      ytrain <-  y[-mat[, vim] ]   ## train set dependent vars
      xtrain <- x[-mat[, vim], , drop = FALSE]   ## train set independent vars
      xtest <- x[mat[, vim], , drop = FALSE]  ## test set independent vars
	    vec <- prcomp(xtrain, center = FALSE)$rotation
      z <- xtrain %*% vec  ## PCA scores

      for ( j in 1:maxk) {
        if (oiko == "binomial") {
          be <- Rfast::glm_logistic(z[, 1:j], ytrain)$be
        } else {
          be <- Rfast::glm_poisson(z[, 1:j], ytrain)$be
        }
        ztest <- xtest %*% vec[, 1:j, drop = FALSE]  ## PCA scores
        es <- as.vector( ztest %*% be[-1] ) + be[1]

        if (oiko == "binomial") {
          est <- exp(es) / ( 1 + exp(es) )
          ri <-  -2 *( ytest * log(est) + (1 - ytest) * log(1 - est) )
        } else {
          est <- exp(es)
          ri <- 2 * ytest * log(ytest / est)
        }
        er[j] <- sum( ri, na.rm = TRUE )
      }
      return(er)
    }
    stopCluster(cl)
    runtime <- proc.time() - runtime
  }

  mpd <- Rfast::colmeans(msp)
  if ( graph )  plot(1:maxk, mpd, xlab = "Number of principal components", ylab = "Mean predicted deviance", type = "b" ,cex.alb = 1.3)

  names(mpd) <- paste("PC", 1:maxk, sep = " ")
  performance <- min(mpd)
  names(performance) <- "MPD"
  list(msp = msp, mpd = mpd, k = which.min(mpd), performance = performance, runtime = runtime)
}
