# ==========================================
# File: predict_SDS.R
# Function: predict_SDS()
# Purpose: Predict Spectral Disease Severity (SDS) using standard linear regression (`lm()`)
# Author: Medhat Mahmoud
# Date: 24.02.2025
# ==========================================
#' Predict Spectral Disease Severity (SDS)
#'
#' Fits a standard linear regression model (`lm()`) on the reference rows of
#' `cleaned_data` and uses it to predict Spectral Disease Severity (SDS) for
#' every other row. Rows whose treatment is `0` (healthy) are assigned
#' `SDS = 0`, rows whose treatment is `1` (diseased) are assigned `SDS = 100`;
#' these rows form the training set. All remaining rows (including rows with a
#' missing treatment) receive the model prediction, constrained to the range
#' `[0, 100]`.
#'
#' Column names with special characters are handled by wrapping every term of
#' the model formula in backticks.
#'
#' @param cleaned_data A dataframe containing spectral measurements and a
#'   treatment column. The treatment column is the column named exactly
#'   `"treatment"`, or, failing that, the single column whose name contains
#'   `"treatment"`.
#' @param sf_test A dataframe containing selected important features (from
#'   statistical tests) in a column named `Variable`. Features that are not
#'   columns of `cleaned_data` are ignored.
#' @param fixed_effects A character vector of fixed effects to include
#'   (default: NULL). Every name must be a column of `cleaned_data`.
#'   Example: c("Scan.date").
#'
#' @return A dataframe holding the training rows (treatment 0 or 1) followed
#'   by the predicted rows, each group in its original order, with an `SDS`
#'   column rounded to two decimals and constrained between `0` and `100`.
#'   When a `Scan.time` column is present, `SDS` is placed directly after it;
#'   otherwise `SDS` keeps its position (last column unless `cleaned_data`
#'   already had an `SDS` column).
#' @export
#' @examples
#' # Create mock spectral data: 0 = healthy, 1 = diseased, 2 = to be predicted
#' set.seed(1)
#' cleaned_data <- data.frame(
#'   treatment = rep(c(0, 1, 2), times = c(40, 40, 20)),
#'   var1 = rnorm(100),
#'   var2 = rnorm(100),
#'   var3 = rnorm(100),
#'   Scan.date = sample(
#'     seq.Date(
#'       from = as.Date("2023-01-01"),
#'       to = as.Date("2023-12-31"),
#'       by = "day"
#'     ),
#'     100
#'   ),
#'   Scan.time = format(Sys.time(), "%H:%M:%S")
#' )
#' sf_test <- data.frame(Variable = c("var1", "var2"))
#' result <- predict_SDS(cleaned_data, sf_test)
#' head(result)
predict_SDS <- function(cleaned_data, sf_test, fixed_effects = NULL) {
  if (!is.data.frame(cleaned_data)) {
    stop("`cleaned_data` must be a data frame.", call. = FALSE)
  }
  col_names <- colnames(cleaned_data)

  # Step 1: Locate the treatment column.
  treatment_col <- grep("treatment", col_names, value = TRUE)
  if (length(treatment_col) == 0) {
    stop("The treatment column was not found in the dataset.")
  }
  if (length(treatment_col) > 1) {
    # Several columns contain "treatment": only an exact match is unambiguous.
    if (!"treatment" %in% treatment_col) {
      stop(
        "Several columns match 'treatment' (",
        paste(treatment_col, collapse = ", "),
        "); rename one of them to exactly 'treatment'.",
        call. = FALSE
      )
    }
    treatment_col <- "treatment"
  }

  # Step 2: Label the reference rows. `%in%` never yields NA, so rows with a
  # missing treatment are treated like any other treatment to be predicted.
  treatment <- cleaned_data[[treatment_col]]
  is_healthy <- treatment %in% 0
  is_diseased <- treatment %in% 1
  is_training <- is_healthy | is_diseased
  if (!any(is_training)) {
    stop(
      "No rows with treatment 0 or 1 are available to train the model.",
      call. = FALSE
    )
  }

  sds <- rep(NA_real_, nrow(cleaned_data))
  sds[is_healthy] <- 0    # Healthy plants
  sds[is_diseased] <- 100 # Diseased plants
  cleaned_data$SDS <- sds

  # Step 3: Keep the selected features that exist in cleaned_data.
  selected_features <- unique(as.character(sf_test$Variable))
  selected_features <- selected_features[selected_features %in% col_names]
  if (length(selected_features) == 0) {
    stop("No matching features found in the cleaned data. Check sf_test and column names.")
  }

  # Step 4: Validate fixed effects. Without this check a misspelled name could
  # silently resolve to an unrelated object through the formula environment.
  missing_effects <- setdiff(fixed_effects, col_names)
  if (length(missing_effects) > 0) {
    stop(
      "Fixed effect(s) not found in the dataset: ",
      paste(missing_effects, collapse = ", "),
      call. = FALSE
    )
  }

  # Step 5: Build the formula; backticks protect names with special characters.
  rhs_terms <- paste0("`", c(fixed_effects, selected_features), "`")
  model_formula <- stats::as.formula(
    paste("SDS ~", paste(rhs_terms, collapse = " + "))
  )

  # Step 6: Fit a standard linear model on healthy and diseased rows only.
  training_data <- cleaned_data[is_training, , drop = FALSE]
  lm_model <- stats::lm(model_formula, data = training_data)

  # Step 7: Predict SDS for the remaining treatments and clamp to [0, 100].
  if (any(!is_training)) {
    predicted <- stats::predict(
      lm_model,
      newdata = cleaned_data[!is_training, , drop = FALSE]
    )
    cleaned_data$SDS[!is_training] <- pmax(pmin(unname(predicted), 100), 0)
  }

  # Round every SDS value (training and predicted rows alike).
  cleaned_data$SDS <- round(cleaned_data$SDS, 2)

  # Step 8: Training rows first, then predicted rows, with fresh row names.
  row_order <- c(which(is_training), which(!is_training))
  final_data <- cleaned_data[row_order, , drop = FALSE]
  rownames(final_data) <- NULL

  # Step 9: Move SDS directly after Scan.time when that column exists.
  sds_idx <- match("SDS", colnames(final_data))
  other_idx <- setdiff(seq_len(ncol(final_data)), sds_idx)
  anchor <- match("Scan.time", colnames(final_data)[other_idx])
  if (!is.na(anchor)) {
    col_order <- append(other_idx, sds_idx, after = anchor)
    final_data <- final_data[, col_order, drop = FALSE]
  }

  final_data
}

