Create R package tknock
- Add main R functions: tknock.R, random_experiments.R, lm_knockoff.R, and others
jasinmachkour committed Apr 4, 2022
0 parents commit 63168f0
Showing 58 changed files with 3,194 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,4 @@
^tknock\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^README\.Rmd$
6 changes: 6 additions & 0 deletions .gitignore
@@ -0,0 +1,6 @@
.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
inst/doc
37 changes: 37 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,37 @@
Package: tknock
Title: The T-Knock filter for fast high-dimensional variable selection with FDR control
Version: 0.0.0.9000
Authors@R:
c(
person("Jasin", "Machkour", , "[email protected]", role = c("aut", "cre")),
person("Simon", "Tien", , email = "[email protected]", role = "aut"),
person(c("Daniel", "P."), "Palomar", , "[email protected]", role = "aut"),
person("Michael", "Muma", , "[email protected]", role = "aut")
)
Maintainer: Jasin Machkour <[email protected]>
Description: Performs variable selection in high-dimensional settings where the number of variables is potentially higher than the number of observations (data points) while controlling the FDR at a user-defined target level.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.2
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Imports:
Matrix,
MASS,
stats,
mvnfast,
tlars,
parallel,
doParallel,
foreach,
doRNG,
snpStats,
methods
Depends:
R (>= 2.10)
VignetteBuilder: knitr
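With these DESCRIPTION dependencies in place, a minimal local install could look like the following sketch (assuming a checkout of this commit; the path "~/tknock" is an assumption, not something defined in this diff).

# Hedged sketch: build and install the package from a local source checkout.
# The path "~/tknock" is an assumption; point it at the actual source directory.
# install.packages("devtools")   # if devtools is not available yet
devtools::install("~/tknock", build_vignettes = FALSE)
library(tknock)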
595 changes: 595 additions & 0 deletions LICENSE.md

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions NAMESPACE
@@ -0,0 +1,24 @@
# Generated by roxygen2: do not edit by hand

export(FDP)
export(Phi_prime_fun)
export(TPP)
export(add_knockoffs)
export(add_knockoffs_GVS)
export(fdp_hat)
export(lm_knockoff)
export(random_experiments)
export(select_var_fun)
export(tknock)
import(MASS)
import(doParallel)
import(doRNG)
import(foreach)
import(methods)
import(mvnfast)
import(parallel)
import(snpStats)
import(stats)
import(tlars)
importFrom(Matrix,nearPD)
importFrom(Matrix,qr)
20 changes: 20 additions & 0 deletions R/FDP.R
@@ -0,0 +1,20 @@
#' False discovery proportion (FDP)
#'
#' @param beta_hat Estimated support vector.
#' @param beta True support vector.
#' @param eps Numerical zero.
#'
#' @return FDP (in percent).
#'
#' @export
FDP = function(beta_hat, beta, eps = .Machine$double.eps) {
if (sum(is.na(beta_hat)) == length(beta_hat)) {
fdp = NA
} else if (sum(abs(beta_hat) > eps) == 0) {
fdp = 0
} else{
fdp = 100 * (sum(abs(beta) < eps &
abs(beta_hat) > eps) / sum(abs(beta_hat) > eps))
}
return(fdp)
}
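A minimal usage sketch for FDP() on the bundled toy data; the estimate beta_hat below is a hand-made illustration, not the output of any tknock function.

# Hedged usage sketch for FDP(); assumes the tknock package is attached.
library(tknock)
data("Gauss_data")
beta = Gauss_data$beta                  # true parameter vector (3 non-zero entries)
beta_hat = numeric(length(beta))        # hypothetical estimate, chosen for illustration
beta_hat[c(1, 2, 10)] = 1               # two true positives, one false positive
FDP(beta_hat = beta_hat, beta = beta)   # 100 * 1/3, i.e., about 33.3 percent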
27 changes: 27 additions & 0 deletions R/Gauss_data.R
@@ -0,0 +1,27 @@
#' Toy data generated from a Gaussian linear model.
#'
#' A dataset containing a predictor matrix X with n = 50 observations
#' and p = 100 variables (predictors), and a sparse parameter vector beta
#' with associated support vector.
#'
#' Generated as follows:
#' set.seed(789)
#' n = 50
#' p = 100
#' X = matrix(rnorm(n * p), nrow = n, ncol = p)
#' beta = c(rep(5, times = 3), rep(0, times = 97))
#' supp = beta > 0
#' y = X %*% beta + rnorm(n)
#' Gauss_data = list(X = X,
#' y = y,
#' beta = beta,
#' supp = supp);
#'
#' @format A list containing a matrix X and vectors y, beta, and supp:
#' \describe{
#' \item{X}{Predictor matrix, n = 50, p = 100.}
#' \item{y}{Response vector.}
#' \item{beta}{Parameter vector.}
#' \item{supp}{Support vector.}
#' }
"Gauss_data"
54 changes: 54 additions & 0 deletions R/Phi_prime_fun.R
@@ -0,0 +1,54 @@
#' Computes the Deflated Relative Occurrences
#'
#' Computes the matrix of deflated relative occurrences for all variables (i.e., j = 1,..., p) and for T = 1, ..., T_stop.
#'
#' @param p Number of candidate variables.
#' @param T_stop Number of included knockoffs after which the random experiments (i.e., forward selection processes) are stopped.
#' @param L_val Number of knockoffs.
#' @param phi_T_mat Matrix of relative occurrences for all variables (i.e., j = 1,..., p) and for T = 1, ..., T_stop.
#' @param Phi Vector of relative occurrences for all variables (i.e., j = 1,..., p) at T = T_stop.
#' @param eps Numerical zero.
#'
#' @return Matrix of deflated relative occurrences for all variables (i.e., j = 1,..., p) and for T = 1, ..., T_stop.
#'
#' @export
#'
#' @examples
#' data("Gauss_data")
#' X = Gauss_data$X
#' y = c(Gauss_data$y)
#' rand_exp = random_experiments(X, y)
#' Phi_prime_fun(p = ncol(X),
#' T_stop = rand_exp$T_stop,
#' L_val = rand_exp$L_val,
#' phi_T_mat = rand_exp$phi_T_mat,
#' Phi = rand_exp$Phi,
#' eps = rand_exp$eps)
Phi_prime_fun = function(p,
T_stop,
L_val,
phi_T_mat,
Phi,
eps = .Machine$double.eps) {
av_num_var_sel = colSums(phi_T_mat)
fifty_phi_T_mat = phi_T_mat[Phi > 0.5, , drop = FALSE]
delta_av_num_var_sel = colSums(fifty_phi_T_mat)

if (T_stop > 1) {
delta_av_num_var_sel[2:T_stop] = delta_av_num_var_sel[2:T_stop] - delta_av_num_var_sel[1:(T_stop - 1)]
phi_T_mat[, 2:T_stop] = phi_T_mat[, 2:T_stop] - phi_T_mat[, 1:(T_stop - 1)]
}

phi_scale = rep(NA, times = length(delta_av_num_var_sel))
for (t in seq_along(delta_av_num_var_sel)) {
if (delta_av_num_var_sel[t] > eps) {
phi_scale[t] = 1 - (((p - av_num_var_sel[t]) / (L_val - t + 1)) / delta_av_num_var_sel[t])
} else{
phi_scale[t] = 0
}
}

Phi_prime = phi_T_mat %*% phi_scale

return(Phi_prime)
}
20 changes: 20 additions & 0 deletions R/TPP.R
@@ -0,0 +1,20 @@
#' True positive proportion (TPP)
#'
#' @param beta_hat Estimated support vector.
#' @param beta True support vector.
#' @param eps Numerical zero.
#'
#' @return TPP (in percent).
#'
#' @export
TPP = function(beta_hat, beta, eps = .Machine$double.eps) {
if (sum(is.na(beta_hat)) == length(beta_hat)) {
tpp = NA
} else if (sum(abs(beta) > eps) == 0) {
tpp = 0
} else{
tpp = 100 * (sum(abs(beta) > eps &
abs(beta_hat) > eps) / sum(abs(beta) > eps))
}
return(tpp)
}
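TPP() mirrors FDP() but measures the fraction of truly active variables that are recovered. A sketch using the same hand-made estimate as in the FDP example above:

# Hedged usage sketch for TPP(); assumes the tknock package is attached.
library(tknock)
data("Gauss_data")
beta = Gauss_data$beta
beta_hat = numeric(length(beta))
beta_hat[c(1, 2, 10)] = 1               # recovers 2 of the 3 active variables
TPP(beta_hat = beta_hat, beta = beta)   # 100 * 2/3, i.e., about 66.7 percent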
73 changes: 73 additions & 0 deletions R/add_knockoffs.R
@@ -0,0 +1,73 @@
#' Add knockoffs to the original predictor matrix
#'
#' Sample L_val knockoff vectors and append them to the predictor matrix X. By default, the knockoffs are drawn i.i.d. from the univariate standard normal distribution;
#' if cor.structure = TRUE and L_val = p, they are drawn from a multivariate normal distribution with the sample covariance matrix of X.
#'
#' @param X Real valued predictor matrix.
#' @param L_val Number of knockoffs.
#' @param cor.structure TRUE/FALSE. If TRUE and L_val = p, knockoffs are sampled from a multivariate normal distribution with the sample covariance matrix of X; otherwise they are sampled i.i.d. from the standard normal distribution.
#' @param empirical TRUE/FALSE. If TRUE (and cor.structure = TRUE), MASS::mvrnorm() is used with empirical = TRUE, so that the sample mean and covariance of the knockoffs match the targets exactly.
#' @param eps Numerical zero.
#'
#' @return Enlarged predictor matrix, i.e., original predictor matrix and knockoffs.
#'
#' @import stats
#' @importFrom Matrix qr
#' @importFrom Matrix nearPD
#' @import MASS
#' @import mvnfast
#'
#' @export
#'
#' @examples
#' n = 50
#' p = 100
#' add_knockoffs(X = matrix(rnorm(n * p), nrow = n, ncol = p), L_val = p)
add_knockoffs = function(X,
L_val,
cor.structure = FALSE,
empirical = FALSE,
eps = .Machine$double.eps) {
g = 1
n = nrow(X)
p = ncol(X)
mu = rep(0, times = p)
for (i in seq(g)) {
if (L_val != p) {
X_surrogate = matrix(
stats::rnorm(n * L_val),
nrow = n,
ncol = L_val,
byrow = FALSE
)
} else{
if (cor.structure) {
Sig = stats::cov(X)
if (Matrix::qr(Sig)$rank != p) {
Sig = as.matrix(Matrix::nearPD(Sig)$mat)
}
if (empirical) {
X_surrogate = MASS::mvrnorm(n,
mu = mu,
Sigma = Sig,
empirical = empirical)
} else{
X_surrogate = mvnfast::rmvn(
n,
mu = mu,
sigma = Sig,
ncores = 1,
isChol = FALSE,
A = NULL
)
}
} else{
X_surrogate = matrix(rnorm(n * p),
nrow = n,
ncol = p,
byrow = FALSE)
}
}
X_Knock = cbind(X, X_surrogate)
}
return(X_Knock)
}
69 changes: 69 additions & 0 deletions R/add_knockoffs_GVS.R
@@ -0,0 +1,69 @@
#' Add GVS-knockoffs to the original predictor matrix
#'
#' Sample L_val knockoff vectors for the T-Knock+GVS filter, mimicking the correlation structure within clusters of highly correlated predictors, and append them to the predictor matrix X.
#'
#' @param X Real valued predictor matrix.
#' @param L_val Number of knockoffs.
#' @param corr_max Maximum allowed correlation between any two predictors from different clusters.
#'
#' @return Enlarged predictor matrix for the T-Knock+GVS filter, i.e., original predictor matrix and knockoffs.
#'
#' @import stats
#' @import MASS
#'
#' @export
#'
#' @examples
#' n = 50
#' p = 100
#' add_knockoffs_GVS(X = matrix(rnorm(n * p), nrow = n, ncol = p), L_val = p)
add_knockoffs_GVS = function(X,
L_val,
corr_max = 0.5) {
n = nrow(X)
p = ncol(X)
if (is.null(colnames(X))) {
colnames(X) = paste0('V', seq(p))
}

# Single linkage hierarchical clustering using the sample correlation as the similarity measure
sigma_X = stats::cov(X)
sigma_X_dist = stats::as.dist(1 - abs(stats::cov2cor(sigma_X)))
fit = stats::hclust(sigma_X_dist, method = 'single')
clusters = stats::cutree(fit, h = 1 - corr_max)
max_clusters = max(clusters)
clusters = data.frame('Var' = names(clusters),
'Cluster_Nr.' = unname(clusters))
clusters = stats::aggregate(clusters$'Var' ~ clusters$'Cluster_Nr.', FUN = 'c')
cluster_sizes = vector('numeric', length = max_clusters)
for (j in seq(max_clusters)) {
cluster_sizes[j] = length(clusters$`clusters$Var`[[j]])
}
w_max = L_val / p

X_p_surrogate = matrix(NA, nrow = n, ncol = p)
X_Knock = matrix(NA, nrow = n, ncol = p + L_val)
X_Knock[, seq(p)] = X

for (w in seq(w_max)) {
for (z in seq(max_clusters)) {
sub_X = X[, clusters$`clusters$Var`[[z]], drop = FALSE]
sigma_sub_X = stats::cov(sub_X)
idx = cumsum(cluster_sizes)
mu = rep(0, times = cluster_sizes[z])
if (z == 1) {
X_p_surrogate[, seq(idx[z])] = MASS::mvrnorm(n,
mu = mu,
Sigma = sigma_sub_X,
empirical = FALSE)
} else{
X_p_surrogate[, seq(idx[z - 1] + 1, idx[z])] = MASS::mvrnorm(n,
mu = mu,
Sigma = sigma_sub_X,
empirical = FALSE)
}
}
X_Knock[, seq(w * p + 1, (w + 1) * p)] = X_p_surrogate
}
return(X_Knock)
}
29 changes: 29 additions & 0 deletions R/fdp_hat.R
@@ -0,0 +1,29 @@
#' Computes the conservative FDP estimate of the T-Knock filter
#'
#' @param V Voting level grid.
#' @param Phi Vector of relative occurrences.
#' @param Phi_prime Vector of deflated relative occurrences.
#' @param T_stop Number of included knockoffs after which the random experiments (i.e., forward selection processes) are stopped.
#' @param L_val Number of knockoffs.
#' @param eps Numerical zero.
#'
#' @return Vector of conservative FDP estimates for each value of the voting level grid.
#'
#' @export
fdp_hat = function(V,
Phi,
Phi_prime,
T_stop,
L_val,
eps = .Machine$double.eps) {
fdp_h = rep(NA, times = length(V))
for (i in seq_along(V)) {
num_sel_var = sum(Phi > V[i])
if (num_sel_var < eps) {
fdp_h[i] = 0
} else{
fdp_h[i] = min(1, (sum((1 - Phi_prime)[Phi > V[i]]) / (num_sel_var)))
}
}
return(fdp_h)
}
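A usage sketch for fdp_hat(); the voting-level grid V is an assumption chosen for illustration, while the remaining inputs come from random_experiments() and Phi_prime_fun() as in the Phi_prime_fun() example above.

# Hedged sketch: conservative FDP estimates over an assumed voting-level grid.
library(tknock)
data("Gauss_data")
X = Gauss_data$X
y = c(Gauss_data$y)
rand_exp = random_experiments(X, y)
Phi_prime = Phi_prime_fun(p = ncol(X),
                          T_stop = rand_exp$T_stop,
                          L_val = rand_exp$L_val,
                          phi_T_mat = rand_exp$phi_T_mat,
                          Phi = rand_exp$Phi,
                          eps = rand_exp$eps)
V = seq(0.5, 0.95, by = 0.05)   # assumed grid of voting levels
fdp_hat(V = V,
        Phi = rand_exp$Phi,
        Phi_prime = Phi_prime,
        T_stop = rand_exp$T_stop,
        L_val = rand_exp$L_val)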
15 changes: 15 additions & 0 deletions R/ground_truth.R
@@ -0,0 +1,15 @@
#' Ground truth for simulated GWAS.
#'
#' A table containing the disease SNPs (disease allele, index, position, heterozygote risk,
#' homozygote risk, and rs ID number).
#'
#' @format A dataframe containing six columns:
#' \describe{
#' \item{allele}{Disease allele (zero or one).}
#' \item{idx}{Index of disease SNP.}
#' \item{position}{Position of disease SNP.}
#' \item{risk_hete}{Heterozygote risk.}
#' \item{risk_homo}{Homozygote risk.}
#'   \item{rs}{rs ID number.}
#' }
"ground_truth"