Paper Author : Abhineet Agarwal, Yan Shuo Tan, Omer Ronen, Chandan Singh, Bin Yu
R Package Author: Haoxue Wang ([email protected])---University of Cambridge
This package is the R version of the Hierarchical Shrinkage algorithm, originally implemented in Python; there is another R package for the FIGS algorithm. Hopefully more R versions of imodels will be developed in the future. The introduction manual for the package is in Manual.
HSTree decreases MSPE under all shrinkage methods. It shows superior generalization, especially for random forests.
library(devtools)
install_github("wanghaoxue0/HSTree")
library(rpart)
library(randomForest)
library(gbm)
library(HSTree)
source("fit.R")
source("fitCV.R")
# Reproducible example: load the design matrix X and the response y, then
# select the best shrinkage penalty by 4-fold cross-validation.
set.seed(2023)
X <- read.csv("X.csv", header = FALSE)
y <- read.csv("y.csv", header = FALSE)
colnames(y) <- "y"
# reg_param is the grid of shrinkage penalties searched by CV;
# the default estimator is CART.
fit <- HSTreeRegressorCV(X, y, reg_param = c(0.1, 1, 10, 20, 50, 100, 500),
                         cv = 4, verbose = TRUE, shrinkage = "constant")
# printed by the CV fit above:
# the best regularization parameter is 1, its mean squared error is 908.8491
# Split the observations 3:1 into a training set and a hold-out test set.
n_obs <- nrow(X)
smp_size <- floor(0.75 * n_obs)
train_ind <- sample(seq_len(n_obs), size = smp_size)

X_train <- X[train_ind, ]
y_train <- data.frame(y = y[train_ind, ])  # single-column frame named "y"
X_test <- X[-train_ind, ]
y_test <- data.frame(y = y[-train_ind, ])
# CART comparison ----
# Baseline: an unshrunk depth-5 decision tree.
fit <- rpart(y ~ ., data = data.frame(X_train, y_train),
             control = rpart.control(maxdepth = 5))
# Hierarchical-shrinkage variants (the default estimator is CART).
fit1 <- HSTreeRegressor(X_train, y_train, shrinkage = "constant")
fit2 <- HSTreeRegressor(X_train, y_train, estimator = "CART")  # default shrinkage: node_based
# Mean squared prediction error on the hold-out set. The mean must be taken
# over the test rows; the original divided the test SSE by nrow(X).
msep <- mean((predict(fit, X_test) - y_test[[1]])^2)
msep1 <- mean((predict(fit1, X_test) - y_test[[1]])^2)
msep2 <- mean((predict(fit2, X_test) - y_test[[1]])^2)
plot(fit1)
text(fit1, use.n = TRUE)
# Random-forest comparison ----
# Baseline: an unshrunk forest of 50 small trees.
fit <- randomForest(X_train, y_train[[1]], ntree = 50, maxnodes = 5)
fit1 <- HSTreeRegressor(X_train, y_train, estimator = "RandomForest")  # default shrinkage: node_based
fit2 <- HSTreeRegressor(X_train, y_train, estimator = "RandomForest", shrinkage = "constant")
fit3 <- HSTreeRegressor(X_train, y_train, estimator = "RandomForest", shrinkage = "leaf_based")
# Hold-out MSPE: mean over the test rows (the original divided by nrow(X)).
msep <- mean((predict(fit, X_test) - y_test[[1]])^2)
msep1 <- mean((predict(fit1, X_test) - y_test[[1]])^2)
msep2 <- mean((predict(fit2, X_test) - y_test[[1]])^2)
msep3 <- mean((predict(fit3, X_test) - y_test[[1]])^2)
# Gradient-boosting comparison ----
# Baseline: 100 depth-2 boosted trees.
fit <- gbm(y ~ ., data = data.frame(X_train, y_train), n.trees = 100,
           interaction.depth = 2)
fit1 <- HSTreeRegressor(X_train, y_train, interaction.depth = 2,
                        estimator = "GradientBoosting")  # default shrinkage: node_based
fit2 <- HSTreeRegressor(X_train, y_train, interaction.depth = 2,
                        estimator = "GradientBoosting", shrinkage = "constant")
# Hold-out MSPE: mean over the test rows (the original divided by nrow(X)).
# predict.gbm needs an explicit n.trees; without it the function warns and
# picks a tree count itself.
msep <- mean((predict(fit, X_test, n.trees = 100) - y_test[[1]])^2)
msep1 <- mean((predict(fit1, X_test) - y_test[[1]])^2)
msep2 <- mean((predict(fit2, X_test) - y_test[[1]])^2)
# to print the structure of each tree:
# single <- pretty.gbm.tree(fit, i.tree = 1)