Improve speed of VarPro for largish data set

Convert all factors to real values

## first we roughly impute the data
## (get.na.roughfix is an internal randomForestSRC helper for fast rough imputation)
data(housing, package = "randomForestSRC")

## to speed up analysis, convert all factors to real values
## (data.matrix coerces factor columns to their underlying integer codes)
housing2 <- randomForestSRC:::get.na.roughfix(housing)
housing2 <- data.frame(data.matrix(housing2))

Use fewer trees and bigger nodesize

print(importance(unsupv(housing2, ntree = 50, nodesize = 150)))

                      mean        std          z
Year.Built     0.931866081 0.03534011 26.3685078
SalePrice      0.623965693 0.45256914  1.3787190
House.Style    0.553343223 0.45651108  1.2121134
MS.SubClass    0.513814524 0.46068239  1.1153335
X2nd.Flr.SF    0.505550442 0.47144671  1.0723385
Bldg.Type      0.412513544 0.43572544  0.9467282
Gr.Liv.Area    0.425758314 0.48535603  0.8772083
Garage.Type    0.316527428 0.44549829  0.7105020
X1st.Flr.SF    0.316414218 0.44618501  0.7091547
Garage.Cars    0.310521457 0.43816212  0.7086908
Garage.Yr.Blt  0.246561223 0.42035093  0.5865604
Full.Bath      0.210678710 0.40073971  0.5257246
TotRms.AbvGrd  0.193122256 0.39017740  0.4949601
Total.Bsmt.SF  0.168101881 0.36246045  0.4637799
Garage.Cond    0.167356809 0.36251698  0.4616523
Garage.Qual    0.154566372 0.35775327  0.4320474
Overall.Qual   0.149279794 0.34552415  0.4320387
Exter.Qual     0.152537100 0.35308526  0.4320121
Foundation     0.131105381 0.32831956  0.3993225
Bsmt.Qual      0.009415646 0.02853372  0.3299832
Kitchen.Qual   0.009380460 0.02842709  0.3299832
BsmtFin.Type.1 0.009290939 0.02815580  0.3299832
Bedroom.AbvGr  0.000000000 0.00000000        NaN
Bsmt.Exposure  0.000000000 0.00000000        NaN
BsmtFin.SF.1   0.000000000 0.00000000        NaN
BsmtFin.Type.2 0.000000000 0.00000000        NaN
Central.Air    0.000000000 0.00000000        NaN
Exterior.1st   0.000000000 0.00000000        NaN
Fireplaces     0.000000000 0.00000000        NaN
Garage.Area    0.000000000 0.00000000        NaN
Half.Bath      0.000000000 0.00000000        NaN
Kitchen.AbvGr  0.000000000 0.00000000        NaN
Land.Contour   0.000000000 0.00000000        NaN
Lot.Area       0.000000000 0.00000000        NaN
Lot.Frontage   0.000000000 0.00000000        NaN
MS.Zoning      0.000000000 0.00000000        NaN
Paved.Drive    0.000000000 0.00000000        NaN
Pool.Area      0.000000000 0.00000000        NaN
Pool.QC        0.000000000 0.00000000        NaN
Roof.Matl      0.000000000 0.00000000        NaN
Roof.Style     0.000000000 0.00000000        NaN
Year.Remod.Add 0.000000000 0.00000000        NaN

Use large nodesize

## simulate a largish regression data set:
## n = 25000 observations, p = 50 noise covariates (y independent of x)
n <- 25000
p <- 50
d <- data.frame(y = rnorm(n), x = matrix(rnorm(n * p), n))

## use large nodesize (timing reported below)
print(system.time(o <- varpro(y~., d, ntree = 100, nodesize = 200)))

   user  system elapsed 
 3.0281   0.064  3.383 
 
 
print(importance(o))

             z
x.43 0.8533468
x.47 0.8523626
x.1  0.8520027
x.39 0.8399330
x.45 0.8264873

Use smaller bootstrap

## use large nodesize, smaller bootstrap
## (sampsize = 100 subsamples each tree from only 100 of the n = 25000 rows)
print(system.time(o <- varpro(y~., d, ntree = 100, nodesize = 200,
           sampsize = 100)))

   user  system elapsed 
 1.323   0.049  1.492
 
 
print(importance(o))

             z
x.38 0.8454787
x.40 0.8260607
x.22 0.8227180
x.6  0.8196515
x.29 0.8087301

VarPro with cross-validation

The VarPro procedure with cross-validation works similarly:

## roughly impute the data (internal randomForestSRC helper)
data(housing, package = "randomForestSRC")
housing2 <- randomForestSRC:::get.na.roughfix(housing)

## use bigger nodesize; fast = TRUE enables the speedy cross-validation mode
print(cv.varpro(SalePrice~., housing2, fast = TRUE, ntree = 50, nodesize = 150))

$imp
       variable         z
1  Overall.Qual 10.195703
6   Gr.Liv.Area  7.822601
5 Total.Bsmt.SF  7.470481
2    Year.Built  3.933678
9   Garage.Area  1.812320
4  BsmtFin.SF.1  1.613374

$imp.conserve
       variable         z
1  Overall.Qual 10.195703
6   Gr.Liv.Area  7.822601
5 Total.Bsmt.SF  7.470481
2    Year.Built  3.933678

$imp.liberal
       variable         z
1  Overall.Qual 10.195703
6   Gr.Liv.Area  7.822601
5 Total.Bsmt.SF  7.470481
2    Year.Built  3.933678
9   Garage.Area  1.812320
4  BsmtFin.SF.1  1.613374

$err
         zcut nvar       err          sd
[1,] 0.100000    6 0.1660557 0.013879393
[2,] 1.651020    5 0.1695325 0.015993478
[3,] 1.844898    4 0.1735643 0.007493364

$zcut
[1] 0.1

$zcut.conserve
[1] 1.844898

$zcut.liberal
[1] 0.1

## use smaller bootstrap (sampsize = 250 rows per tree) for a further speedup
print(cv.varpro(SalePrice~., housing2, fast = TRUE, ntree = 50, nodesize = 150, sampsize = 250))

$imp
       variable         z
1  Overall.Qual 10.161676
6   Gr.Liv.Area  8.000638
5 Total.Bsmt.SF  7.895597
2    Year.Built  3.764146
4  BsmtFin.SF.1  2.405743

$imp.conserve
       variable         z
1  Overall.Qual 10.161676
6   Gr.Liv.Area  8.000638
5 Total.Bsmt.SF  7.895597
2    Year.Built  3.764146
4  BsmtFin.SF.1  2.405743

$imp.liberal
       variable         z
1  Overall.Qual 10.161676
6   Gr.Liv.Area  8.000638
5 Total.Bsmt.SF  7.895597
2    Year.Built  3.764146
4  BsmtFin.SF.1  2.405743
8   Garage.Area  1.791214

$err
         zcut nvar       err          sd
[1,] 0.100000    6 0.1582964 0.008467771
[2,] 1.806122    5 0.1559126 0.012814282

$zcut
[1] 1.806122

$zcut.conserve
[1] 1.806122

$zcut.liberal
[1] 0.1

Unsupervised problem with large p

The "unsupv" function offers three methods, some faster than the default settings. The default, method = "auto", runs a random forest autoencoder by fitting selected variables against themselves — a special type of multivariate forest. The second, method = "unsupv", runs unsupervised forests. The third, method = "rnd", uses random forests with pure random splitting. For very large data sets the autoencoder might be slow, while the other two methods are much faster.

## default method = "auto": random forest autoencoder (timing shown below)
data(vdv, package = "randomForestSRC")
print(system.time(o <- unsupv(vdv[,c(3:100)])))
   user  system elapsed 
 5.090   0.116  3.170 
 
print(importance(o))

                  mean        std         z
AB007962    0.12914048 0.24298406 0.5314772
AB020689    0.11329214 0.21852539 0.5184392
AB018324    0.10386398 0.21087383 0.4925409
AB014538    0.10968787 0.22292932 0.4920298
AB011115    0.07737236 0.16718163 0.4628042
AB011089    0.09426302 0.20416639 0.4616970
AB018305    0.10367288 0.22494634 0.4608783
AB011132    0.08747343 0.19211243 0.4553241
AB018260    0.08308302 0.18387317 0.4518496
AB024704    0.06988908 0.16715597 0.4181070
AB007883    0.05976462 0.15065662 0.3966943
AB007916    0.06164144 0.16047624 0.3841157
AB029001    0.05366293 0.14600870 0.3675324
AB007892    0.03440068 0.09578839 0.3591320
AB002448    0.02761796 0.07888187 0.3501180
AB011087    0.03174186 0.09290478 0.3416601
AB023152    0.01378631 0.04035098 0.3416598
AB023151    0.02419407 0.07081405 0.3416564
AA555029_RC 0.00000000 0.00000000       NaN
AA598803_RC 0.00000000 0.00000000       NaN
AB002301    0.00000000 0.00000000       NaN
AB002308    0.00000000 0.00000000       NaN
AB002331    0.00000000 0.00000000       NaN
AB002351    0.00000000 0.00000000       NaN
AB002445    0.00000000 0.00000000       NaN
...

## unsupervised splitting (method = "unsupv"): faster than the autoencoder
print(system.time(o <- unsupv(vdv[,c(3:100)], method = "unsupv")))
   user  system elapsed 
 2.969   0.089  1.694 
 
print(importance(o))

                  mean        std         z
AB024704    0.06873266 0.16325205 0.4210217
AB007883    0.05011575 0.12412446 0.4037541
AB011115    0.06032827 0.15294954 0.3944325
AB018271    0.05368750 0.13723027 0.3912220
AB011089    0.06048775 0.15478042 0.3907972
AB018260    0.04718282 0.12129511 0.3889919
AB007855    0.04950081 0.12869312 0.3846422
AB018305    0.05987620 0.15601187 0.3837926
AB011132    0.04622992 0.12246470 0.3774959
AB014538    0.05848944 0.15580698 0.3753968
AB006746    0.04643441 0.12764627 0.3637741
AB002308    0.02537020 0.07074907 0.3585942
AB002448    0.04070350 0.11354889 0.3584667
AB023152    0.01155402 0.03226723 0.3580729
AB007962    0.03688246 0.10301183 0.3580410
AB007899    0.03066386 0.08744720 0.3506558
AB007857    0.03109128 0.08929385 0.3481907
AB007916    0.03186857 0.09327556 0.3416604
AB029001    0.03078501 0.09010465 0.3416584
AB028998    0.01480819 0.04335023 0.3415944
AA555029_RC 0.00000000 0.00000000       NaN
AA598803_RC 0.00000000 0.00000000       NaN
AB002301    0.00000000 0.00000000       NaN
AB002331    0.00000000 0.00000000       NaN
...

## pure random splitting (method = "rnd"): the fastest option
print(system.time(o <- unsupv(vdv[,c(3:100)], method = "rnd")))
   user  system elapsed 
 2.713   0.092  1.590 
 
print(importance(o))

                   mean         std         z
AB007962    0.026057884 0.074484582 0.3498426
AB020677    0.015160471 0.044373754 0.3416540
AB007888    0.028651761 0.083862195 0.3416529
AB007916    0.001315760 0.003957185 0.3324990
AB018310    0.001200670 0.003611050 0.3324990
AB007954    0.001558233 0.004686429 0.3324990
AB020695    0.002136625 0.006425959 0.3324990
AA555029_RC 0.000000000 0.000000000       NaN
AA598803_RC 0.000000000 0.000000000       NaN
AB002301    0.000000000 0.000000000       NaN
AB002308    0.000000000 0.000000000       NaN
AB002331    0.000000000 0.000000000       NaN
AB002351    0.000000000 0.000000000       NaN
...


Cite this vignette as
M. Lu, A. Shear, U. B. Kogalur, and H. Ishwaran. 2025. “varPro: speedup varPro analyses vignette.” http://www.varprotools.org/articles/speedup.html.

@misc{LuSpeedup,
    author = "Min Lu and Aster Shear and Udaya B. Kogalur and Hemant Ishwaran",
    title = {{varPro}: speedup {varPro} analyses vignette},
    year = {2025},
    url = {http://www.varprotools.org/articles/speedup.html},
    howpublished = "\url{http://www.varprotools.org/articles/speedup.html}",
    note = "[accessed date]"
}