-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathadd_dummies_GVS.R
122 lines (110 loc) · 3.89 KB
/
add_dummies_GVS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#' Add dummy predictors to the original predictor matrix, as required by the T-Rex+GVS selector (\doi{10.23919/EUSIPCO55093.2022.9909883})
#'
#' Generate num_dummies dummy predictors as required for the T-Rex+GVS selector (\doi{10.23919/EUSIPCO55093.2022.9909883}) and append them to the predictor matrix X.
#'
#' @param X Real valued predictor matrix.
#' @param num_dummies Number of dummies that are appended to the predictor matrix. Has to be a multiple of the number of original variables.
#' @param corr_max Maximum allowed correlation between any two predictors from different clusters.
#'
#' @return Enlarged predictor matrix for the T-Rex+GVS selector.
#'
#' @importFrom stats cov as.dist hclust cutree aggregate
#' @importFrom MASS mvrnorm
#'
#' @export
#'
#' @examples
#' set.seed(123)
#' n <- 50
#' p <- 100
#' X <- matrix(stats::rnorm(n * p), nrow = n, ncol = p)
#' add_dummies_GVS(X = X, num_dummies = p)
add_dummies_GVS <- function(X,
num_dummies,
corr_max = 0.5) {
# Error control
if (!is.matrix(X)) {
stop("'X' must be a matrix.")
}
if (!is.numeric(X)) {
stop("'X' only allows numerical values.")
}
if (anyNA(X)) {
stop("'X' contains NAs. Please remove or impute them before proceeding.")
}
# Dimensions of the data
n <- nrow(X)
p <- ncol(X)
# Continue error control
if (length(num_dummies) != 1 ||
num_dummies %% p != 0 ||
num_dummies < 1) {
stop(
"'num_dummies' must be a positive integer multiple of the total number of original predictors in X."
)
}
if (length(corr_max) != 1 ||
corr_max < 0 ||
corr_max > 1) {
stop("'corr_max' must have a value between zero and one.")
}
# Add variable names
colnames(X) <- paste0("V", seq(p))
# Single linkage hierarchical clustering using the sample correlation as the similarity measure
sigma_X <- stats::cov(X)
sigma_X_dist <- stats::as.dist(1 - abs(stats::cov2cor(sigma_X)))
fit <- stats::hclust(sigma_X_dist, method = "single")
clusters <- stats::cutree(fit, h = 1 - corr_max)
max_clusters <- max(clusters)
clusters <- data.frame(
"Var" = names(clusters),
"Cluster_Nr." = unname(clusters)
)
clusters <-
stats::aggregate(clusters$"Var" ~ clusters$"Cluster_Nr.",
FUN = "c",
simplify = FALSE
)
cluster_sizes <- vector("numeric", length = max_clusters)
for (j in seq(max_clusters)) {
cluster_sizes[j] <- length(clusters$`clusters$Var`[[j]])
}
# Binary cluster identity vectors for T-Rex+GVS+IEN
IEN_cl_ind <- lapply(clusters$`clusters$Var`, FUN = function(x) as.numeric(sub(".", "", x)))
IEN_cl_id_vectors <- t(sapply(seq_along(IEN_cl_ind), FUN = function(x) {
cl_id_vec <- rep(FALSE, times = ncol(X))
cl_id_vec[IEN_cl_ind[[x]]] <- TRUE
cl_id_vec}))
# Generate dummy predictors and append them to the original predictor matrix X
w_max <- num_dummies / p
X_p_sub_dummy <- matrix(NA, nrow = n, ncol = p)
X_Dummy <- matrix(NA, nrow = n, ncol = p + num_dummies)
X_Dummy[, seq(p)] <- X
for (w in seq(w_max)) {
for (z in seq(max_clusters)) {
sub_X <- X[, clusters$`clusters$Var`[[z]], drop = FALSE]
sigma_sub_X <- stats::cov(sub_X)
idx <- cumsum(cluster_sizes)
mu <- rep(0, times = cluster_sizes[z])
if (z == 1) {
X_p_sub_dummy[, seq(idx[z])] <- MASS::mvrnorm(n,
mu = mu,
Sigma = sigma_sub_X,
empirical = FALSE
)
} else {
X_p_sub_dummy[, seq(idx[z - 1] + 1, idx[z])] <- MASS::mvrnorm(n,
mu = mu,
Sigma = sigma_sub_X,
empirical = FALSE
)
}
}
X_Dummy[, seq(w * p + 1, (w + 1) * p)] <- X_p_sub_dummy
}
add_dummies_res <- list(X_Dummy = X_Dummy,
max_clusters = max_clusters,
cluster_sizes = cluster_sizes,
IEN_cl_id_vectors = IEN_cl_id_vectors)
return(add_dummies_res)
}