% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/fold.R
\name{fold}
\alias{fold}
\alias{create_balanced_groups}
\title{Create balanced folds for cross-validation}
\usage{
fold(
  data,
  k = 5,
  cat_col = NULL,
  num_col = NULL,
  id_col = NULL,
  method = "n_dist",
  id_aggregation_fn = sum,
  extreme_pairing_levels = 1,
  num_fold_cols = 1,
  unique_fold_cols_only = TRUE,
  max_iters = 5,
  handle_existing_fold_cols = "keep_warn",
  parallel = FALSE
)
}
\arguments{
\item{data}{\code{data.frame}. Can be \emph{grouped}, in which case
the function is applied group-wise.}

\item{k}{\emph{Depends on \code{`method`}.}

Number of folds (default), fold size, or step size (see \code{`method`}).

When \code{`num_fold_cols` > 1}, \code{`k`} can also be a vector
with one k per fold column. This allows trying multiple \code{`k`} settings at a time. Note
that the generated fold columns are not guaranteed to be in the order of \code{`k`}.

Given as whole number or percentage (\code{0 < `k` < 1}).}

\item{cat_col}{Name of categorical variable to balance between folds.

E.g. when predicting a binary variable (a or b), we usually want
both classes represented in every fold.

N.B. If also passing an \code{`id_col`}, \code{`cat_col`} should be constant within each ID.}

\item{num_col}{Name of numerical variable to balance between folds.

N.B. When used with \code{`id_col`}, values for each ID are aggregated using
\code{`id_aggregation_fn`} before being balanced.

N.B. When passing \code{`num_col`}, the \code{`method`} parameter is ignored.}

\item{id_col}{Name of factor with IDs.
This will be used to keep all rows that share an ID in the same fold
(if possible).

E.g. If we have measured a participant multiple times and want to see the
effect of time, we want to have all observations of this participant in
the same fold.

N.B. When \code{`data`} is a \emph{grouped} \code{data.frame}
(see \code{\link[dplyr:group_by]{dplyr::group_by()}}), IDs that appear in multiple
groupings might end up in different folds in those groupings.}

\item{method}{\code{"n_dist"}, \code{"n_fill"}, \code{"n_last"},
\code{"n_rand"}, \code{"greedy"}, or \code{"staircase"}.

\strong{Notice}: examples are sizes of the generated groups
based on a vector with \code{57} elements.

\subsection{n_dist (default)}{Divides the data into a specified number of groups and
distributes excess data points across groups
(e.g. \eqn{11, 11, 12, 11, 12}).

\code{`k`} is number of groups}

\subsection{n_fill}{Divides the data into a specified number of groups and
fills up groups with excess data points from the beginning
(e.g. \eqn{12, 12, 11, 11, 11}).

\code{`k`} is number of groups}

\subsection{n_last}{Divides the data into a specified number of groups.
It finds the most equal group sizes possible,
using all data points. Only the last group is able to differ in size
(e.g. \eqn{11, 11, 11, 11, 13}).

\code{`k`} is number of groups}

\subsection{n_rand}{Divides the data into a specified number of groups.
Excess data points are placed randomly in groups (only 1 per group)
(e.g. \eqn{12, 11, 11, 11, 12}).

\code{`k`} is number of groups}

\subsection{greedy}{Divides up the data greedily given a specified group size
(e.g. \eqn{10, 10, 10, 10, 10, 7}).

\code{`k`} is group size}

\subsection{staircase}{Uses step size to divide up the data.
Group size increases with 1 step for every group,
until there is no more data
(e.g. \eqn{5, 10, 15, 20, 7}).

\code{`k`} is step size}}

\item{id_aggregation_fn}{Function for aggregating values in \code{`num_col`}
for each ID, before balancing \code{`num_col`}.

N.B. Only used when \code{`num_col`} and \code{`id_col`} are both specified.}

\item{extreme_pairing_levels}{How many levels of extreme pairing to do
when balancing folds by a numerical column (i.e. \code{`num_col`} is specified).

\strong{Extreme pairing}: Rows/pairs are ordered as smallest, largest,
second smallest, second largest, etc. If \code{`extreme_pairing_levels` > 1},
this is done "recursively" on the extreme pairs. See the \code{num_col} subsection in \emph{Details} for more.

N.B. Larger values work best with large datasets. If set too high,
the result might not be stochastic. Always check if an increase
actually makes the folds more balanced. See example.}

\item{num_fold_cols}{Number of fold columns to create.
Useful for repeated cross-validation.

If \code{`num_fold_cols` > 1}, columns will be named
\code{".folds_1"}, \code{".folds_2"}, etc.
Otherwise simply \code{".folds"}.

N.B. If \code{`unique_fold_cols_only`} is \code{TRUE},
we can end up with fewer columns than specified, see \code{`max_iters`}.

N.B. If \code{`data`} has existing fold columns, see \code{`handle_existing_fold_cols`}.}

\item{unique_fold_cols_only}{Check if fold columns are identical and
keep only unique columns.

As the column comparisons can be time consuming,
we can run this part in parallel. See \code{`parallel`}.

N.B. We can end up with fewer columns than specified in
\code{`num_fold_cols`}, see \code{`max_iters`}.

N.B. Only used when \code{`num_fold_cols` > 1} or \code{`data`} has existing fold columns.}

\item{max_iters}{Maximum number of attempts at reaching
\code{`num_fold_cols`} \emph{unique} fold columns.

When only keeping unique fold columns, we risk having fewer columns than expected.
Hence, we repeatedly create the missing columns and remove those that are not unique.
This is done until we have \code{`num_fold_cols`} unique fold columns
or we have attempted \code{`max_iters`} times.
In some cases, it is not possible to create \code{`num_fold_cols`}
unique combinations of the dataset, e.g.
when specifying \code{`cat_col`}, \code{`id_col`} and \code{`num_col`}.
\code{`max_iters`} specifies when to stop trying.
Note that we can end up with fewer columns than specified in \code{`num_fold_cols`}.

N.B. Only used when \code{`num_fold_cols` > 1}.

\item{handle_existing_fold_cols}{How to handle existing fold columns.
Either \code{"keep_warn"}, \code{"keep"}, or \code{"remove"}.

To \strong{add} extra fold columns, use \code{"keep"} or \code{"keep_warn"}.
Note that existing fold columns might be renamed.

To \strong{replace} the existing fold columns, use \code{"remove"}.}

\item{parallel}{Whether to parallelize the fold column comparisons,
when \code{`unique_fold_cols_only`} is \code{TRUE}.

Requires a registered parallel backend,
e.g. via \code{doParallel::registerDoParallel}.
}
\value{
\code{data.frame} with grouping factor for subsetting in cross-validation.
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("stable")}

Divides data into groups by a range of methods.
Balances a given categorical variable and/or numerical variable between folds and keeps (if possible)
all data points with a shared ID (e.g. participant_id) in the same fold.
Can create multiple unique fold columns for repeated cross-validation.
}
\details{
\subsection{cat_col}{
\enumerate{
\item \code{`data`} is subset by \code{`cat_col`}.
\item Subsets are grouped and merged.
}
}

\subsection{id_col}{
\enumerate{
\item Groups are created from unique IDs.
}
}

\subsection{num_col}{
\enumerate{
\item Rows are shuffled.
\strong{Note} that this will only affect rows with the same value in \code{`num_col`}.
\item Extreme pairing 1: Rows are ordered as \emph{smallest, largest, second smallest, second largest}, etc.
Each pair gets a group identifier.
\item If \code{`extreme_pairing_levels` > 1}: The group identifiers are reordered as \emph{smallest,
largest, second smallest, second largest}, etc., by the sum of \code{`num_col`} in the represented rows.
These pairs (of pairs) get a new set of group identifiers, and the process is repeated
\code{`extreme_pairing_levels`-2} times. Note that the group identifiers at the last level will represent
\code{2^`extreme_pairing_levels`} rows, which is why you should be careful when choosing that setting.
\item The final group identifiers are folded, and the fold identifiers are transferred to the rows.
}

N.B. When doing extreme pairing of an odd number of rows,
the row with the smallest value is placed in a group by itself, and the order is instead:
smallest, \emph{second smallest, largest, third smallest, second largest}, etc.
}

\subsection{cat_col AND id_col}{
\enumerate{
\item \code{`data`} is subset by \code{`cat_col`}.
\item Groups are created from unique IDs in each subset.
\item Subsets are merged.
}
}

\subsection{cat_col AND num_col}{
\enumerate{
\item \code{`data`} is subset by \code{`cat_col`}.
\item Subsets are grouped by \code{`num_col`}.
\item Subsets are merged such that the largest group
(by sum of \code{`num_col`}) from the first category
is merged with the smallest group from the second category, etc.
}
}

\subsection{num_col AND id_col}{
\enumerate{
\item Values in \code{`num_col`} are aggregated for each ID, using \code{`id_aggregation_fn`}.
\item The IDs are grouped, using the aggregated values as "\code{num_col}".
\item The groups of the IDs are transferred to the rows.
}
}

\subsection{cat_col AND num_col AND id_col}{
\enumerate{
\item Values in \code{`num_col`} are aggregated for each ID, using \code{`id_aggregation_fn`}.
\item IDs are subset by \code{`cat_col`}.
\item The IDs in each subset are grouped,
by using the aggregated values as "\code{num_col}".
\item The subsets are merged such that the largest group
(by sum of the aggregated values) from the first category
is merged with the smallest group from the second category, etc.
\item The groups of the IDs are transferred to the rows.
}
}
}
\examples{
# Attach packages
library(groupdata2)
library(dplyr)

# Create data frame with 6 participants, each measured in 3 sessions
# Note: `age` and `score` are randomly sampled
df <- data.frame(
  "participant" = factor(rep(c("1", "2", "3", "4", "5", "6"), 3)),
  "age" = rep(sample(c(1:100), 6), 3),
  "diagnosis" = factor(rep(c("a", "b", "a", "a", "b", "b"), 3)),
  "score" = sample(c(1:100), 3 * 6)
)
df <- df \%>\% arrange(participant)
df$session <- rep(c("1", "2", "3"), 6)

# Using fold()

## Without balancing
df_folded <- fold(data = df, k = 3, method = "n_dist")

## With cat_col
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  method = "n_dist"
)

## With id_col
df_folded <- fold(
  data = df,
  k = 3,
  id_col = "participant",
  method = "n_dist"
)

## With num_col
# Note: `method` is ignored when `num_col` is specified
df_folded <- fold(data = df, k = 3, num_col = "score")

## With cat_col and id_col
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  id_col = "participant", method = "n_dist"
)

## With cat_col, id_col and num_col
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  id_col = "participant", num_col = "score"
)

# Order by folds
df_folded <- df_folded \%>\% arrange(.folds)

## Multiple fold columns
# Useful for repeated cross-validation
# Note: Consider running in parallel
df_folded <- fold(
  data = df,
  k = 3,
  cat_col = "diagnosis",
  id_col = "participant",
  num_fold_cols = 5,
  unique_fold_cols_only = TRUE,
  max_iters = 4
)

## Different `k` per fold column
# Note: `length(k) == num_fold_cols`
df_folded <- fold(
  data = df,
  k = c(2, 3),
  cat_col = "diagnosis",
  id_col = "participant",
  num_fold_cols = 2,
  unique_fold_cols_only = TRUE,
  max_iters = 4
)

## Check if additional `extreme_pairing_levels`
## improve the numerical balance
# First with a single level of extreme pairing
set.seed(2) # Try with seed 1 as well
df_folded_1 <- fold(
  data = df,
  k = 3,
  num_col = "score",
  extreme_pairing_levels = 1
)
# Sum and mean of `score` in each fold
df_folded_1 \%>\%
  dplyr::group_by(.folds) \%>\%
  dplyr::summarise(
    sum_score = sum(score),
    mean_score = mean(score)
  )

# Then with two levels, using the same seed for comparison
set.seed(2)  # Try with seed 1 as well
df_folded_2 <- fold(
  data = df,
  k = 3,
  num_col = "score",
  extreme_pairing_levels = 2
)
# Sum and mean of `score` in each fold
df_folded_2 \%>\%
  dplyr::group_by(.folds) \%>\%
  dplyr::summarise(
    sum_score = sum(score),
    mean_score = mean(score)
  )
}
\seealso{
\code{\link{partition}} for balanced partitions

Other grouping functions: 
\code{\link{all_groups_identical}()},
\code{\link{group_factor}()},
\code{\link{group}()},
\code{\link{partition}()},
\code{\link{splt}()}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
\concept{grouping functions}
