R/random_experiments.R

#' Run K random experiments
#'
#' Run K early terminated T-Rex (\doi{10.48550/arXiv.2110.06048}) random experiments and
#' compute the matrix of relative occurrences for all variables and all numbers of included variables before stopping.
#'
#' @param X Real valued predictor matrix.
#' @param y Response vector.
#' @param K Number of random experiments.
#' @param T_stop Number of included dummies after which the random experiments (i.e., forward selection processes) are stopped.
#' @param num_dummies Number of dummies that are appended to the predictor matrix.
#' @param method 'trex' for the T-Rex selector (\doi{10.48550/arXiv.2110.06048}),
#' 'trex+GVS' for the T-Rex+GVS selector (\doi{10.23919/EUSIPCO55093.2022.9909883}),
#' 'trex+DA+AR1' for the T-Rex+DA+AR1 selector,
#' 'trex+DA+equi' for the T-Rex+DA+equi selector,
#' 'trex+DA+BT' for the T-Rex+DA+BT selector (\doi{10.48550/arXiv.2401.15796}),
#' 'trex+DA+NN' for the T-Rex+DA+NN selector (\doi{10.48550/arXiv.2401.15139}).
#' @param GVS_type 'IEN' for the Informed Elastic Net (\doi{10.1109/CAMSAP58249.2023.10403489}),
#' 'EN' for the ordinary Elastic Net (\doi{10.1111/j.1467-9868.2005.00503.x}).
#' @param type 'lar' for 'LARS' and 'lasso' for Lasso.
#' @param corr_max Maximum allowed correlation between any two predictors from different clusters (for method = 'trex+GVS').
#' @param lambda_2_lars lambda_2-value for LARS-based Elastic Net.
#' @param early_stop Logical. If TRUE, then the forward selection process is stopped after T_stop dummies have been included. Otherwise
#' the entire solution path is computed.
#' @param lars_state_list If parallel_process = TRUE: List of state variables of the previous T-LARS steps of the K random experiments
#' (necessary for warm-starts, i.e., restarting the forward selection process exactly where it was previously terminated).
#' If parallel_process = FALSE: List of objects of the class tlars_cpp associated with the K random experiments
#' (necessary for warm-starts, i.e., restarting the forward selection process exactly where it was previously terminated).
#' @param verbose Logical. If TRUE progress in computations is shown.
#' @param intercept Logical. If TRUE an intercept is included.
#' @param standardize Logical. If TRUE the predictors are standardized and the response is centered.
#' @param dummy_coef Logical. If TRUE a matrix containing the terminal dummy coefficient vectors of all K random experiments as rows is returned.
#' @param parallel_process Logical. If TRUE random experiments are executed in parallel.
#' @param parallel_max_cores Maximum number of cores to be used for parallel processing.
#' @param seed Seed for random number generator (ignored if parallel_process = FALSE).
#' @param eps Numerical zero.
#'
#' @return List containing the results of the K random experiments.
#'
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach getDoParWorkers registerDoSEQ `%do%` `%dopar%` foreach
#' @importFrom doRNG `%dorng%`
#' @importFrom methods is
#'
#' @export
#'
#' @examples
#' set.seed(123)
#' data("Gauss_data")
#' X <- Gauss_data$X
#' y <- c(Gauss_data$y)
#' res <- random_experiments(X = X, y = y)
#' relative_occurrences_matrix <- res$phi_T_mat
#' relative_occurrences_matrix
random_experiments <- function(X,
                               y,
                               K = 20,
                               T_stop = 1,
                               num_dummies = ncol(X),
                               method = "trex",
                               GVS_type = "EN",
                               type = "lar",
                               corr_max = 0.5,
                               lambda_2_lars = NULL,
                               early_stop = TRUE,
                               lars_state_list,
                               verbose = TRUE,
                               intercept = FALSE,
                               standardize = TRUE,
                               dummy_coef = FALSE,
                               parallel_process = FALSE,
                               parallel_max_cores = min(K, max(1, parallel::detectCores(logical = FALSE))),
                               seed = NULL,
                               eps = .Machine$double.eps) {
  # Error control
  method <- match.arg(method, c("trex", "trex+GVS", "trex+DA+AR1", "trex+DA+equi", "trex+DA+BT", "trex+DA+NN"))

  type <- match.arg(type, c("lar", "lasso"))

  GVS_type <- match.arg(GVS_type, c("IEN", "EN"))

  if (!is.matrix(X)) {
    stop("'X' must be a matrix.")
  }

  if (!is.numeric(X)) {
    stop("'X' only allows numerical values.")
  }

  if (anyNA(X)) {
    stop("'X' contains NAs. Please remove or impute them before proceeding.")
  }

  if (!is.vector(drop(y))) {
    stop("'y' must be a vector.")
  }

  if (!is.numeric(y)) {
    stop("'y' only allows numerical values.")
  }

  if (anyNA(y)) {
    stop("'y' contains NAs. Please remove or impute them before proceeding.")
  }

  if (nrow(X) != length(drop(y))) {
    stop("Number of rows in X does not match length of y.")
  }

  if (length(K) != 1 ||
    K < 2 ||
    K %% 1 != 0) {
    stop("The number of random experiments 'K' must be an integer larger or equal to 2.")
  }

  if (method == "trex" || method == "trex+DA+AR1" || method == "trex+DA+equi" || method == "trex+DA+BT" || method == "trex+DA+NN") {
    if (length(num_dummies) != 1 ||
      num_dummies %% 1 != 0 ||
      num_dummies < 1) {
      stop("'num_dummies' must be an integer larger or equal to 1.")
    }
  }

  # Number of variables in X
  p <- ncol(X)

  # Continue error control
  if (method == "trex+GVS") {
    if (length(num_dummies) != 1 ||
      num_dummies %% p != 0 ||
      num_dummies < 1) {
      stop(
        "'num_dummies' must be a positive integer multiple of the total number of original predictors in X."
      )
    }
  }

  if (length(T_stop) != 1 ||
    !(T_stop %in% seq(1, num_dummies))) {
    stop(
      paste0(
        "Value of 'T_stop' not valid. 'T_stop' must be an integer from 1 to ",
        num_dummies,
        "."
      )
    )
  }

  if (method == "trex+GVS") {
    if (length(corr_max) != 1 ||
      corr_max < 0 ||
      corr_max > 1) {
      stop("'corr_max' must have a value between zero and one.")
    }

    if (!is.null(lambda_2_lars)) {
      if (length(lambda_2_lars) != 1 ||
        lambda_2_lars < eps) {
        stop("'lambda_2_lars' must be a number larger than zero.")
      }
    }
  }

  if (parallel_process &&
    (
      length(parallel_max_cores) != 1 ||
        parallel_max_cores %% 1 != 0 ||
        parallel_max_cores < 2
    )) {
    stop(
      "For parallel processing at least two workers have to be registered:
         'parallel_max_cores' must be an integer larger or equal to 2."
    )
  }

  if (parallel_process &&
    parallel_max_cores > min(K, max(1, parallel::detectCores(logical = FALSE)))) {
    parallel_max_cores_modified <-
      min(K, max(1, parallel::detectCores(logical = FALSE)))
    message(
      paste0(
        "For computing ",
        K,
        " random experiments, it is not useful/possible to register ",
        parallel_max_cores,
        " workers. Setting parallel_max_cores = ",
        min(K, max(
          1, parallel::detectCores(logical = FALSE)
        )),
        " (# physical cores) ...\n"
      )
    )
    parallel_max_cores <-
      min(K, max(1, parallel::detectCores(logical = FALSE)))
  }

  if (parallel_process && T_stop == 1 && num_dummies <= p) {
    message(
      "Computing random experiments in parallel...
            Note that this is only advantageous if you have at least a few thousand predictors and/or data points in 'X'.
            Otherwise, the overhead will slow down the computations in parallel. Thus, for small data sizes it is better
            to set parallel_process = FALSE.

      Be careful!"
    )
  }

  if (!(missing(lars_state_list) || is.null(lars_state_list))) {
    if (length(lars_state_list) != K) {
      stop("Length of 'lars_state_list' must be equal to number of random experiments 'K'.")
    }
  }

  # Create empty lars_state_list if missing or NULL
  if (missing(lars_state_list) || is.null(lars_state_list)) {
    lars_state_list <- vector(mode = "list", length = K)
  }

  # Combines Output Lists of Parallel 'foreach' Loop
  comb_fun <- function(x, ...) {
    lapply(
      seq_along(x),
      function(i) {
        c(x[[i]], lapply(list(...), function(y) {
          y[[i]]
        }))
      }
    )
  }

  # Setup cluster
  if (parallel_process && foreach::getDoParWorkers() == 1) {
    cl <- parallel::makeCluster(parallel_max_cores)
    doParallel::registerDoParallel(cl)
    on.exit(parallel::stopCluster(cl), add = TRUE)
    on.exit(foreach::registerDoSEQ(), add = TRUE)
  }

  `%par_exe%` <-
    ifelse(parallel_process, doRNG::`%dorng%`, foreach::`%do%`)
  h <- NULL
  res <- foreach::foreach(
    h = seq(K),
    .combine = comb_fun,
    .multicombine = TRUE,
    .init = list(list(), list(), list(), list()),
    .options.RNG = seed
  ) %par_exe% {
    # Recreate tlarsCpp object if necessary
    if (parallel_process &&
      !is.null(lars_state_list[[h]]) &&
      !methods::is(object = lars_state_list[[h]], class2 = tlars::tlars_cpp)) {
      lars_state <- tlars::tlars_model(lars_state = lars_state_list[[h]])
    } else {
      lars_state <- lars_state_list[[h]]
    }

    # Run random experiment
    lars_state <- lm_dummy(
      X = X,
      y = y,
      model_tlars = lars_state,
      T_stop = T_stop,
      num_dummies = num_dummies,
      method = method,
      GVS_type = GVS_type,
      type = type,
      corr_max = corr_max,
      lambda_2_lars = lambda_2_lars,
      early_stop = early_stop,
      verbose = verbose,
      intercept = intercept,
      standardize = standardize
    )

    # Extract T-LARS path
    lars_path <- do.call(cbind, lars_state$get_beta_path())

    # Extract content of object lars_state if performing parallel computations
    if (parallel_process) {
      lars_state <- lars_state$get_all()
    }

    # Number of included dummies along solution path
    dummy_num_path <- colSums(matrix(
      abs(lars_path[(p + 1):(p + num_dummies), ]) > eps,
      nrow = num_dummies,
      ncol = ncol(lars_path)
    ))

    # Number of included original variables along solution path
    var_num_path <- colSums(matrix(
      abs(lars_path[1:p, ]) > eps,
      nrow = p,
      ncol = ncol(lars_path)
    ))

    # Relative occurrences
    phi_T_mat <- matrix(0, nrow = p, ncol = T_stop)
    for (c in seq(T_stop)) {
      if (!any(dummy_num_path == c)) {
        ind_sol_path <- length(dummy_num_path)
        warning(
          paste(
            "For T_stop = ",
            c,
            " LARS is running until k = min(n, p) and stops there before selecting ",
            c,
            " dummies.",
            sep = ""
          )
        )
      } else {
        ind_sol_path <- which(as.numeric(dummy_num_path) == c)[1]
      }
      phi_T_mat[, c] <-
        (1 / K) * (abs(lars_path[1:p, ind_sol_path]) > eps)
    }

    # Last coefficient vectors of all random experiments after termination
    rand_exp_last_betas_mat <- lars_path[1:p, ncol(lars_path)]

    # Matrix containing the dummy coefficient vectors of all K random experiments as rows is returned.
    if (dummy_coef) {
      dummy_rand_exp_last_betas_mat <- lars_path[seq(p + 1, p + num_dummies), ncol(lars_path)]
    } else {
      dummy_rand_exp_last_betas_mat <- NULL
    }

    list(
      phi_T_mat,
      rand_exp_last_betas_mat,
      lars_state,
      dummy_rand_exp_last_betas_mat
    )
  }

  # Merging results of all random experiments
  lars_state_list <- res[[3]]
  names(lars_state_list) <-
    paste("lars_state (K = ", seq(K), ")", sep = "")

  phi_T_mat <- Reduce("+", res[[1]])
  rand_exp_last_betas_mat <- unname(Reduce(rbind, res[[2]]))
  Phi <- apply(abs(rand_exp_last_betas_mat) > eps, 2, sum) / K

  if (dummy_coef) {
    dummy_rand_exp_last_betas_mat <- unname(Reduce(rbind, res[[4]]))
  } else {
    dummy_rand_exp_last_betas_mat <- NULL
  }

  # List of results
  rand_exp_res <- list(
    phi_T_mat = phi_T_mat,
    rand_exp_last_betas_mat = rand_exp_last_betas_mat,
    dummy_rand_exp_last_betas_mat = dummy_rand_exp_last_betas_mat,
    Phi = Phi,
    lars_state_list = lars_state_list,
    K = K,
    T_stop = T_stop,
    num_dummies = num_dummies,
    method = method,
    GVS_type = GVS_type,
    type = type,
    corr_max = corr_max,
    lambda_2_lars = lambda_2_lars,
    seed = seed,
    eps = eps
  )

  return(rand_exp_res)
}