The RGF package is a wrapper of the Regularized Greedy Forest python package, which also includes a multi-core implementation (FastRGF). Portability from Python to R was made possible using the reticulate package, and the installation requires basic knowledge of Python. Except on the Linux Operating System, the installation (on Macintosh and Windows) might be somewhat cumbersome (on Windows the package can currently be used only from within the command prompt). Detailed installation instructions for all three Operating Systems can be found in the README.md file and in the rgf_python GitHub repository.
The Regularized Greedy Forest algorithm is explained in detail in the paper Rie Johnson and Tong Zhang, Learning Nonlinear Functions Using Regularized Greedy Forest. A small synopsis would be “… the resulting method, which we refer to as regularized greedy forest (RGF), integrates two ideas: one is to include tree-structured regularization into the learning formulation; and the other is to employ the fully-corrective regularized greedy algorithm ….”.
At the time of writing this Vignette (11 - 02 - 2018), there isn’t a corresponding implementation of the algorithm in the R language, so I decided to port the Python package to R, taking advantage of the reticulate package. In the next lines, I will explain the functionality of the package and compare RGF with other similar implementations, such as ranger (random forest algorithm) and xgboost (gradient boosting algorithm), in terms of time efficiency and error rate improvement.
The RGF package includes the following R6-classes / functions,
RGF_Regressor | RGF_Classifier | FastRGF_Regressor | FastRGF_Classifier |
---|---|---|---|
fit() | fit() | fit() | fit() |
predict() | predict() | predict() | predict() |
predict_proba() | predict_proba() | ||
cleanup() | cleanup() | cleanup() | cleanup() |
get_params() | get_params() | get_params() | get_params() |
score() | score() | score() | score() |
feature_importances() | feature_importances() | ||
dump_model() | dump_model() |
UPDATE 10-05-2018 : Beginning from version 1.0.3 the dgCMatrix_2scipy_sparse function was renamed to TO_scipy_sparse and now accepts either a dgCMatrix or a dgRMatrix as input. The appropriate format for the RGF package in case of sparse matrices is the dgCMatrix format (scipy.sparse.csc_matrix)
TO_scipy_sparse() |
---|
RGF_cleanup_temp_files() |
---|
mat_2scipy_sparse() |
---|
The package documentation includes details and examples for all R6-classes and functions. In the following code chunks, I’ll explain how a user can work with sparse matrices, since all RGF algorithms require that sparse input (as opposed to a dense matrix) be passed as a Python (scipy) sparse matrix.
The RGF package includes two functions (mat_2scipy_sparse and TO_scipy_sparse) which allow the user to convert from a matrix / sparse matrix (dgCMatrix, dgRMatrix) to a scipy sparse matrix (scipy.sparse.csc_matrix, scipy.sparse.csr_matrix),
library(RGF)
# conversion from a matrix object to a scipy sparse matrix
#----------------------------------------------------------

# fixed seed so that the random matrix is reproducible
set.seed(1)

# dense 100 x 10 matrix of uniform random numbers
x = matrix(runif(1000), nrow = 100, ncol = 10)

# convert the dense R matrix to a scipy sparse matrix (row-oriented format requested)
x_sparse = mat_2scipy_sparse(x, format = "sparse_row_matrix")

print(dim(x))
# [1] 100 10

# 'shape' is read from the returned python object
print(x_sparse$shape)
# (100, 10)
# conversion from a dgCMatrix object to a scipy sparse matrix
#-------------------------------------------------------------

# 9 values (5 of them non-zero) used to fill a 3 x 3 matrix row by row
data = c(1, 0, 2, 0, 0, 3, 4, 5, 6)

# 'dgCMatrix' sparse matrix
#--------------------------

dgcM = Matrix::Matrix(data = data, nrow = 3,
                      ncol = 3, byrow = TRUE,
                      sparse = TRUE)

print(dim(dgcM))
# [1] 3 3

# convert the R sparse matrix to a scipy sparse matrix
x_sparse = TO_scipy_sparse(dgcM)

print(x_sparse$shape)
# (3, 3)
# 'dgRMatrix' sparse matrix
#--------------------------

# coerce the previously created 'dgcM' object to the row-oriented sparse class
dgrM = as(dgcM, "RsparseMatrix")

class(dgrM)
# [1] "dgRMatrix"
# attr(,"package")
# [1] "Matrix"

print(dim(dgrM))
# [1] 3 3

# TO_scipy_sparse() is applied here to the dgRMatrix object
res_dgr = TO_scipy_sparse(dgrM)

print(res_dgr$shape)
# (3, 3)
First, the data, the libraries and the cross-validation function will be loaded (the MLmetrics library is also required),
# load the 'Boston' data set that ships with the KernelKnn package
data(Boston, package = 'KernelKnn')
# attach the three packages whose algorithms are compared below
library(RGF)
library(ranger)
library(xgboost)
# shuffling function for cross-validation folds
#-----------------------------------------------
#
# Returns a random permutation of 'vec'.
#
# vec   : vector to be shuffled
# times : number of times sample() is called. Every iteration re-samples the
#         ORIGINAL 'vec' (not the previous result), so only the last draw is
#         returned; the earlier iterations merely advance the random number
#         generator. The loop is kept as-is so that results obtained with a
#         given seed do not change.

func_shuffle = function(vec, times = 10) {

  for (i in 1:times) {

    out = sample(vec, length(vec))
  }

  out
}
# cross-validation folds [ regression]
#-------------------------------------
#
# Splits the row indices 1..length(RESP) into 'folds' groups.
#
# folds      : number of folds
# RESP       : numeric response vector (a factor raises an error)
# stratified : if TRUE the fold sizes are derived per interval of
#              cut(RESP, breaks = folds); if FALSE the fold labels are shuffled
#              with func_shuffle()
#
# Returns a named list ('fold_1', ..., 'fold_<folds>') of integer index vectors.
# Emits a warning if the folds differ in size and an error if the folds do not
# cover exactly length(RESP) indices.

regr_folds = function(folds, RESP, stratified = FALSE) {

  if (is.factor(RESP)) {

    stop(simpleError("This function is meant for regression.\nFor classification use the 'class_folds' function."))
  }

  # equal share of the observations for every fold
  samp_vec = rep(1/folds, folds)

  sort_names = paste0('fold_', seq_len(folds))

  if (stratified == TRUE) {

    # bin the response into 'folds' intervals
    stratif = cut(RESP, breaks = folds)

    # indices of the observations falling in each interval
    clas = lapply(unique(stratif), function(x) which(stratif == x))

    len = lapply(clas, function(x) length(x))

    # number of observations each fold receives from each interval
    prop = lapply(len, function(y) sapply(seq_along(samp_vec), function(x)
      round(y * samp_vec[x])))

    # fold labels, interval after interval
    repl = unlist(lapply(prop, function(x) sapply(seq_along(x), function(y)
      rep(paste0('fold_', y), x[y]))))

    # NOTE(review): the labels in 'repl' are ordered interval by interval, but
    # split() pairs them with the indices 1..length(RESP) by position; the folds
    # therefore follow the intervals only if RESP is ordered accordingly -
    # confirm that this is intended. Behaviour is left unchanged.
    # rounding can make 'repl' shorter / longer than RESP, hence suppressWarnings()
    spl = suppressWarnings(split(seq_along(RESP), repl))
  }

  else {

    # fold sizes computed from the total number of observations
    prop = lapply(length(RESP), function(y) sapply(seq_along(samp_vec),
      function(x) round(y * samp_vec[x])))

    # fold labels in random order
    repl = func_shuffle(unlist(lapply(prop, function(x)
      sapply(seq_along(x), function(y) rep(paste0('fold_', y), x[y])))))

    spl = suppressWarnings(split(seq_along(RESP), repl))
  }

  # return the folds in the order fold_1, fold_2, ...
  spl = spl[sort_names]

  if (length(table(unlist(lapply(spl, function(x) length(x))))) > 1) {

    warning('the folds are not equally split')
  }

  if (length(unlist(spl)) != length(RESP)) {

    stop(simpleError("the length of the splits are not equal with the length\nof the response"))
  }

  spl
}
single threaded [ small data set ] |
---|
In the next code chunk, I’ll perform 5-fold cross-validation using the Boston dataset and I’ll compare the execution time and error rate of all three algorithms (without doing hyper-parameter tuning),
# number of cross-validation folds
NUM_FOLDS = 5

set.seed(1)

# stratified folds computed on the 'medv' response
FOLDS = regr_folds(folds = NUM_FOLDS, Boston[, 'medv'], stratified = TRUE)

# per-fold containers for the run time and the test RMSE of each algorithm
boston_ranger_time = boston_ranger_te = boston_xgb_te = boston_rgf_time =
  boston_rgf_te = boston_xgb_time = rep(NA, NUM_FOLDS)
# 5-fold cross-validation on the Boston data : for every fold each algorithm is
# fitted on the remaining folds and evaluated (RMSE) on the held-out fold.
# The response is addressed both as the last column and as 'medv'
# (presumably the same column - verify against the data set).

for (i in seq_along(FOLDS)) {

  cat("fold : ", i, "\n")

  samp = unlist(FOLDS[-i])       # row indices used for training

  samp_ = unlist(FOLDS[i])       # row indices of the held-out fold

  # RGF
  #----

  rgf_start = Sys.time()

  init_regr = RGF_Regressor$new(l2 = 0.1)

  init_regr$fit(x = as.matrix(Boston[samp, -ncol(Boston)]), y = Boston[samp, ncol(Boston)])

  pr_te = init_regr$predict(as.matrix(Boston[samp_, -ncol(Boston)]))

  rgf_end = Sys.time()

  # force seconds : a plain 'end - start' picks its unit automatically (secs,
  # mins, ...) so that folds of different duration could not be summed
  boston_rgf_time[i] = as.numeric(difftime(rgf_end, rgf_start, units = "secs"))

  boston_rgf_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], pr_te)

  # ranger
  #-------

  ranger_start = Sys.time()

  fit = ranger(dependent.variable.name = "medv", data = Boston[samp, ], write.forest = TRUE,
               probability = FALSE, num.threads = 1, num.trees = 500, verbose = TRUE,
               classification = FALSE, mtry = NULL, min.node.size = 5, keep.inbag = TRUE)

  # NOTE(review): type = 'se' presumably also computes standard errors, which
  # would add to the run time measured for ranger - confirm
  pred_te = predict(fit, data = Boston[samp_, -ncol(Boston)], type = 'se')$predictions

  ranger_end = Sys.time()

  boston_ranger_time[i] = as.numeric(difftime(ranger_end, ranger_start, units = "secs"))

  boston_ranger_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], pred_te)

  # xgboost
  #--------

  xgb_start = Sys.time()

  dtrain <- xgb.DMatrix(data = as.matrix(Boston[samp, -ncol(Boston)]),
                        label = Boston[samp, ncol(Boston)])

  dtest <- xgb.DMatrix(data = as.matrix(Boston[samp_, -ncol(Boston)]),
                       label = Boston[samp_, ncol(Boston)])

  # NOTE(review): the held-out fold is part of the watchlist that is used
  # together with early stopping - confirm that this is intended
  watchlist <- list(train = dtrain, test = dtest)

  # NOTE(review): "reg:linear" and "bst:eta" look like legacy spellings - verify
  # that the installed xgboost version still honours them
  param = list("objective" = "reg:linear", "bst:eta" = 0.05, "max_depth" = 4,
               "subsample" = 0.85, "colsample_bytree" = 0.85, "booster" = "gbtree",
               "nthread" = 1)

  # 'nrounds' is spelled out in full (no partial argument matching)
  fit = xgb.train(param, dtrain, nrounds = 500, print_every_n = 100, watchlist = watchlist,
                  early_stopping_rounds = 20, maximize = FALSE, verbose = 0)

  # the predict() generic dispatches to the xgb.Booster method (no ':::' access needed)
  p_te = predict(fit, as.matrix(Boston[samp_, -ncol(Boston)]),
                 ntreelimit = fit$best_iteration)

  xgb_end = Sys.time()

  boston_xgb_time[i] = as.numeric(difftime(xgb_end, xgb_start, units = "secs"))

  boston_xgb_te[i] = MLmetrics::RMSE(Boston[samp_, 'medv'], p_te)
}
fold : 1
fold : 2
fold : 3
fold : 4
fold : 5
# summary of the Boston benchmark : summed per-fold run time and mean test RMSE
# for each of the three algorithms

cat(
  "total time rgf 5 fold cross-validation : ",
  sum(boston_rgf_time),
  " mean rmse on test data : ",
  mean(boston_rgf_te),
  "\n"
)

cat(
  "total time ranger 5 fold cross-validation : ",
  sum(boston_ranger_time),
  " mean rmse on test data : ",
  mean(boston_ranger_te),
  "\n"
)

cat(
  "total time xgb 5 fold cross-validation : ",
  sum(boston_xgb_time),
  " mean rmse on test data : ",
  mean(boston_xgb_te),
  "\n"
)
total time rgf 5 fold cross-validation : 0.7730639 mean rmse on test data : 3.832135
total time ranger 5 fold cross-validation : 3.826846 mean rmse on test data : 4.17419
total time xgb 5 fold cross-validation : 0.4316094 mean rmse on test data : 3.949122
5 threads [ high dimensional data set and presence of multicollinearity ] |
---|
For the high-dimensional data (can be downloaded from the following GitHub repository) we’ll use the FastRGF_Regressor rather than the RGF_Regressor (comparison without doing hyper-parameter tuning),
# download the data from the following GitHub repository
# (download.file() replaces the shell call to wget, so that the chunk does not
#  depend on wget being installed; mode = "wb" keeps the zip archive binary-safe)

download.file("https://raw.githubusercontent.com/mlampros/DataSets/master/africa_soil_train_data.zip",
              destfile = "africa_soil_train_data.zip", mode = "wb")

# load the data in the R session

train_dat = read.table(unz("africa_soil_train_data.zip", "train.csv"), nrows = 1157,
                       header = TRUE, quote = "\"", sep = ",")

# c("Ca", "P", "pH", "SOC", "Sand") : response variables

# exclude response-variables and factor variable
# (the first column is dropped as well)

x = train_dat[, -c(1, which(colnames(train_dat) %in%
                              c("Ca", "P", "pH", "SOC", "Sand", "Depth")))]

# take (randomly) the first of the responses for train

y = train_dat[, "Ca"]

# dataset for ranger (response and predictors in a single data.frame)

tmp_rg_dat = cbind(Ca = y, x)

# cross-validation folds

set.seed(2)

FOLDS = regr_folds(folds = NUM_FOLDS, y, stratified = TRUE)

# per-fold containers for the run time and the test RMSE of each algorithm
highdim_ranger_time = highdim_ranger_te = highdim_xgb_te = highdim_rgf_time =
  highdim_rgf_te = highdim_xgb_time = rep(NA, NUM_FOLDS)
# 5-fold cross-validation on the high-dimensional data : for every fold each
# algorithm is fitted (using 5 threads) on the remaining folds and evaluated
# (RMSE) on the held-out fold

for (i in seq_along(FOLDS)) {

  cat("fold : ", i, "\n")

  new_samp = unlist(FOLDS[-i])       # row indices used for training

  new_samp_ = unlist(FOLDS[i])       # row indices of the held-out fold

  # RGF
  #----

  rgf_start = Sys.time()

  init_regr = FastRGF_Regressor$new(n_jobs = 5, l2 = 0.1)        # added 'l2' regularization

  init_regr$fit(x = as.matrix(x[new_samp, ]), y = y[new_samp])

  pr_te = init_regr$predict(as.matrix(x[new_samp_, ]))

  rgf_end = Sys.time()

  # force seconds : a plain 'end - start' picks its unit automatically (secs,
  # mins, ...) so that folds of different duration could not be summed
  highdim_rgf_time[i] = as.numeric(difftime(rgf_end, rgf_start, units = "secs"))

  highdim_rgf_te[i] = MLmetrics::RMSE(y[new_samp_], pr_te)

  # ranger
  #-------

  ranger_start = Sys.time()

  fit = ranger(dependent.variable.name = "Ca", data = tmp_rg_dat[new_samp, ],
               write.forest = TRUE, probability = FALSE, num.threads = 5, num.trees = 500,
               verbose = TRUE, classification = FALSE, mtry = NULL, min.node.size = 5,
               keep.inbag = TRUE)

  # NOTE(review): type = 'se' presumably also computes standard errors, which
  # would add to the run time measured for ranger - confirm
  pred_te = predict(fit, data = x[new_samp_, ], type = 'se')$predictions

  ranger_end = Sys.time()

  highdim_ranger_time[i] = as.numeric(difftime(ranger_end, ranger_start, units = "secs"))

  highdim_ranger_te[i] = MLmetrics::RMSE(y[new_samp_], pred_te)

  # xgboost
  #--------

  xgb_start = Sys.time()

  dtrain <- xgb.DMatrix(data = as.matrix(x[new_samp, ]), label = y[new_samp])

  dtest <- xgb.DMatrix(data = as.matrix(x[new_samp_, ]), label = y[new_samp_])

  # NOTE(review): the held-out fold is part of the watchlist that is used
  # together with early stopping - confirm that this is intended
  watchlist <- list(train = dtrain, test = dtest)

  # NOTE(review): "reg:linear" and "bst:eta" look like legacy spellings - verify
  # that the installed xgboost version still honours them
  param = list("objective" = "reg:linear", "bst:eta" = 0.05, "max_depth" = 6,
               "subsample" = 0.85, "colsample_bytree" = 0.85, "booster" = "gbtree",
               "nthread" = 5)                # "lambda" = 0.1 does not improve RMSE

  # 'nrounds' is spelled out in full (no partial argument matching)
  fit = xgb.train(param, dtrain, nrounds = 500, print_every_n = 100, watchlist = watchlist,
                  early_stopping_rounds = 20, maximize = FALSE, verbose = 0)

  # the predict() generic dispatches to the xgb.Booster method (no ':::' access needed)
  p_te = predict(fit, as.matrix(x[new_samp_, ]),
                 ntreelimit = fit$best_iteration)

  xgb_end = Sys.time()

  highdim_xgb_time[i] = as.numeric(difftime(xgb_end, xgb_start, units = "secs"))

  highdim_xgb_te[i] = MLmetrics::RMSE(y[new_samp_], p_te)
}
fold : 1
fold : 2
fold : 3
fold : 4
fold : 5
# summary of the high-dimensional benchmark : summed per-fold run time and mean
# test RMSE for each of the three algorithms

cat(
  "total time rgf 5 fold cross-validation : ",
  sum(highdim_rgf_time),
  " mean rmse on test data : ",
  mean(highdim_rgf_te),
  "\n"
)

cat(
  "total time ranger 5 fold cross-validation : ",
  sum(highdim_ranger_time),
  " mean rmse on test data : ",
  mean(highdim_ranger_te),
  "\n"
)

cat(
  "total time xgb 5 fold cross-validation : ",
  sum(highdim_xgb_time),
  " mean rmse on test data : ",
  mean(highdim_xgb_te),
  "\n"
)
total time rgf 5 fold cross-validation : 92.31971 mean rmse on test data : 0.5155166
total time ranger 5 fold cross-validation : 27.32866 mean rmse on test data : 0.5394164
total time xgb 5 fold cross-validation : 30.48834 mean rmse on test data : 0.5453544