“Gradient boosting is one of the most successful machine learning algorithms for nonparametric regression and classification. Boosting adaptively combines a large number of relatively simple prediction models called base learners into an ensemble learner to achieve high prediction performance.”
“Its advantages are threefold. First, the model structure of TDboost is learned from data and not predetermined, thereby avoiding an explicit model specification. Non-linearities, discontinuities, complex and higher order interactions are naturally incorporated into the model to reduce the potential modeling bias and to produce high predictive performance, which enables TDboost to serve as a benchmark model in scoring insurance policies, guiding pricing practice, and facilitating marketing efforts. Feature selection is performed as an integral part of the procedure. In addition, TDboost handles the predictor and response variables of any type without the need for transformation, and it is highly robust to outliers. Missing values in the predictors are managed almost without loss of information.”
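In plainer terms, TDboost targets a response with a point mass at zero plus a continuous positive part, the shape typical of insurance claim costs: the claim total is a Poisson number of Gamma-distributed severities, which is a Tweedie exponential dispersion model, and the boosted predictor is an additive expansion of regression trees fitted by gradient descent on the Tweedie deviance (with a log link for the mean). A sketch in generic notation (the symbols below are not the package's argument names):

Y = \sum_{i=1}^{N} X_i, \qquad N \sim \mathrm{Poisson}(\lambda), \qquad X_i \overset{\text{iid}}{\sim} \mathrm{Gamma}

\operatorname{Var}(Y) = \phi \, \mu^{\rho}, \qquad 1 < \rho < 2

F_M(x) = F_0(x) + \nu \sum_{m=1}^{M} h_m(x), \qquad \mu(x) = \exp\{F_M(x)\}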
TDboost
Package notes: TDboost fits a boosted Tweedie compound Poisson model via gradient boosting. It can fit a flexible nonlinear Tweedie compound Poisson model (or a gamma model) and capture interactions among predictors.
CRAN: https://cran.r-project.org/web/packages/TDboost/TDboost.pdf
library(TDboost)
library(HDtweedie) # has example dataset
data("auto")
library(dplyr)
# `auto` is a list with a design matrix `x` and response `y`; as.data.frame()
# flattens it into x.* columns plus y
auto2 <- as_tibble(as.data.frame(auto))
# stratified train/test split on the outcome y, preserving the response distribution
# (a quick check of the split follows the dims below)
# http://topepo.github.io/caret/splitting.html
library(caret)
set.seed(3456)
trainIndex <- createDataPartition(auto2$y, p = .66,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 6
## [6,] 7
train_auto <- auto2[trainIndex, ]
dim(train_auto)
## [1] 1857 57
test_auto <- auto2[-trainIndex, ]
dim(test_auto)
## [1] 955 57
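As a quick sanity check on the stratified split, the response distribution should look similar in the two partitions (base R only):
# y should have a similar distribution in both partitions if the stratified split worked
summary(train_auto$y)
summary(test_auto$y)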
TDboost
# convert the tibbles back to plain data frames before fitting
train_auto <- as.data.frame(train_auto)
test_auto <- as.data.frame(test_auto)
fit <- TDboost(y ~ ., data = train_auto, cv.folds = 5, n.trees = 300, interaction.depth = 20)
## CV: 1
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.1324 8.5538 0.0010 0.0016
## 2 8.1309 8.5548 0.0010 -0.0006
## 3 8.1288 8.5532 0.0010 0.0014
## 4 8.1278 8.5517 0.0010 -0.0013
## 5 8.1260 8.5508 0.0010 0.0010
## 6 8.1243 8.5514 0.0010 0.0001
## 7 8.1223 8.5500 0.0010 0.0013
## 8 8.1203 8.5486 0.0010 0.0011
## 9 8.1187 8.5473 0.0010 0.0009
## 10 8.1175 8.5482 0.0010 -0.0015
## 100 7.9431 8.4607 0.0010 -0.0025
## 200 7.7854 8.4049 0.0010 0.0010
## 300 7.6470 8.3557 0.0010 0.0007
##
## CV: 2
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.2351 8.1419 0.0010 0.0017
## 2 8.2327 8.1405 0.0010 0.0017
## 3 8.2309 8.1400 0.0010 0.0010
## 4 8.2284 8.1390 0.0010 0.0019
## 5 8.2261 8.1382 0.0010 0.0016
## 6 8.2239 8.1368 0.0010 0.0014
## 7 8.2220 8.1374 0.0010 0.0007
## 8 8.2195 8.1378 0.0010 0.0010
## 9 8.2175 8.1370 0.0010 0.0012
## 10 8.2155 8.1359 0.0010 0.0014
## 100 8.0334 8.1102 0.0010 0.0011
## 200 7.8673 8.1166 0.0010 0.0010
## 300 7.7306 8.1709 0.0010 -0.0013
##
## CV: 3
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.2650 8.0270 0.0010 -0.0016
## 2 8.2631 8.0255 0.0010 0.0013
## 3 8.2608 8.0240 0.0010 0.0020
## 4 8.2587 8.0225 0.0010 0.0016
## 5 8.2567 8.0211 0.0010 0.0006
## 6 8.2543 8.0210 0.0010 0.0015
## 7 8.2519 8.0211 0.0010 0.0011
## 8 8.2499 8.0202 0.0010 0.0004
## 9 8.2477 8.0182 0.0010 0.0013
## 10 8.2458 8.0168 0.0010 0.0004
## 100 8.0828 7.9237 0.0010 0.0000
## 200 7.9223 7.8581 0.0010 0.0009
## 300 7.7881 7.8157 0.0010 0.0004
##
## CV: 4
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.1980 8.2908 0.0010 0.0016
## 2 8.1957 8.2889 0.0010 0.0016
## 3 8.1937 8.2885 0.0010 0.0006
## 4 8.1915 8.2868 0.0010 0.0017
## 5 8.1878 8.2837 0.0010 0.0031
## 6 8.1856 8.2819 0.0010 0.0016
## 7 8.1853 8.2810 0.0010 -0.0030
## 8 8.1846 8.2811 0.0010 -0.0015
## 9 8.1821 8.2793 0.0010 0.0019
## 10 8.1800 8.2789 0.0010 0.0006
## 100 8.0161 8.1743 0.0010 -0.0004
## 200 7.8609 8.0890 0.0010 0.0003
## 300 7.7263 8.0382 0.0010 0.0008
##
## CV: 5
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.2512 8.0774 0.0010 0.0012
## 2 8.2491 8.0756 0.0010 0.0014
## 3 8.2469 8.0740 0.0010 0.0016
## 4 8.2447 8.0723 0.0010 0.0016
## 5 8.2430 8.0729 0.0010 -0.0005
## 6 8.2410 8.0717 0.0010 0.0012
## 7 8.2383 8.0699 0.0010 0.0025
## 8 8.2358 8.0682 0.0010 0.0016
## 9 8.2337 8.0682 0.0010 0.0008
## 10 8.2315 8.0666 0.0010 0.0014
## 100 8.0548 7.9619 0.0010 0.0011
## 200 7.8894 7.8852 0.0010 0.0009
## 300 7.7527 7.8404 0.0010 0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 8.2160 nan 0.0010 0.0018
## 2 8.2140 nan 0.0010 0.0013
## 3 8.2123 nan 0.0010 0.0009
## 4 8.2106 nan 0.0010 0.0012
## 5 8.2088 nan 0.0010 0.0013
## 6 8.2067 nan 0.0010 0.0002
## 7 8.2047 nan 0.0010 0.0013
## 8 8.2029 nan 0.0010 0.0011
## 9 8.2028 nan 0.0010 -0.0032
## 10 8.2006 nan 0.0010 0.0010
## 100 8.0386 nan 0.0010 0.0006
## 200 7.8848 nan 0.0010 0.0010
## 300 7.7598 nan 0.0010 -0.0007
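The StepSize column in the trace is the shrinkage (learning rate), left at its default of 0.001 here, and with only 300 trees several CV folds are still improving at the last iteration. A longer, explicitly parameterized follow-up run is sketched below; the shrinkage and distribution arguments are assumed to follow the gbm-style interface documented on CRAN (alpha being the Tweedie index parameter, with 1 < alpha < 2 for the compound Poisson case), so the exact argument names are assumptions rather than verified calls.
# a longer run with the tuning knobs spelled out (argument names assumed from the CRAN manual)
fit2 <- TDboost(y ~ ., data = train_auto,
                distribution = list(name = "EDM", alpha = 1.5),  # Tweedie index parameter
                n.trees = 3000,           # give CV room to find a true minimum
                interaction.depth = 6,    # shallower trees than the depth-20 fit above
                shrinkage = 0.001,        # the StepSize shown in the trace
                cv.folds = 5)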
best.iter <- TDboost.perf(fit, method="test")
best.iter <- TDboost.perf(fit,method="cv")
# plot the performance
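TDboost.perf returns the selected iteration as a plain integer, which is reused for the influence summary and the predictions below; with several CV folds still improving at iteration 300 it may simply come back at the n.trees ceiling.
print(best.iter)  # iteration minimizing the cross-validated deviance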
# plot variable influence
summary(fit, n.trees = 1) # based on the first tree
## var rel.inf
## 1 x.REVOLKED 29.351645
## 2 x.MVR_PTS 27.070913
## 3 x.BLUEBOOK 10.417508
## 4 x.AGE3 7.397987
## 5 x.BLUEBOOK3 5.486209
## 6 x.AREA 3.616030
## 7 x.MAX_EDUC_2 3.229753
## 8 x.TRAVTIME 2.678981
## 9 x.JOBCLASS_7 2.623611
## 10 x.HOME_VAL2 2.186072
## 11 x.INCOME 1.841182
## 12 x.BLUEBOOK2 1.118888
## 13 x.HOME_VAL3 1.054774
## 14 x.SAMEHOME2 0.993953
## 15 x.HOME_VAL 0.932493
## 16 x.CAR_TYPE_2 0.000000
## 17 x.CAR_TYPE_3 0.000000
## 18 x.CAR_TYPE_4 0.000000
## 19 x.CAR_TYPE_5 0.000000
## 20 x.CAR_TYPE_6 0.000000
## 21 x.JOBCLASS_3 0.000000
## 22 x.JOBCLASS_4 0.000000
## 23 x.JOBCLASS_5 0.000000
## 24 x.JOBCLASS_6 0.000000
## 25 x.JOBCLASS_8 0.000000
## 26 x.JOBCLASS_9 0.000000
## 27 x.MAX_EDUC_3 0.000000
## 28 x.MAX_EDUC_4 0.000000
## 29 x.MAX_EDUC_5 0.000000
## 30 x.KIDSDRIV 0.000000
## 31 x.KIDSDRIV2 0.000000
## 32 x.KIDSDRIV3 0.000000
## 33 x.TRAVTIME2 0.000000
## 34 x.TRAVTIME3 0.000000
## 35 x.NPOLICY 0.000000
## 36 x.NPOLICY2 0.000000
## 37 x.NPOLICY3 0.000000
## 38 x.MVR_PTS2 0.000000
## 39 x.MVR_PTS3 0.000000
## 40 x.AGE 0.000000
## 41 x.AGE2 0.000000
## 42 x.HOMEKIDS 0.000000
## 43 x.HOMEKIDS2 0.000000
## 44 x.HOMEKIDS3 0.000000
## 45 x.YOJ 0.000000
## 46 x.YOJ2 0.000000
## 47 x.YOJ3 0.000000
## 48 x.INCOME2 0.000000
## 49 x.INCOME3 0.000000
## 50 x.SAMEHOME 0.000000
## 51 x.SAMEHOME3 0.000000
## 52 x.CAR_USE 0.000000
## 53 x.RED_CAR 0.000000
## 54 x.GENDER 0.000000
## 55 x.MARRIED 0.000000
## 56 x.PARENT1 0.000000
summary(fit, n.trees = best.iter) # based on the estimated best number of trees
## var rel.inf
## 1 x.REVOLKED 34.75532285
## 2 x.MVR_PTS 20.90084985
## 3 x.BLUEBOOK2 3.60367561
## 4 x.TRAVTIME2 3.45726498
## 5 x.BLUEBOOK 3.00356665
## 6 x.HOME_VAL3 2.16967149
## 7 x.INCOME 1.93941539
## 8 x.HOME_VAL2 1.91162078
## 9 x.HOME_VAL 1.87478172
## 10 x.INCOME3 1.86828797
## 11 x.AGE 1.78601375
## 12 x.CAR_USE 1.77605653
## 13 x.AGE2 1.76699488
## 14 x.INCOME2 1.70185223
## 15 x.BLUEBOOK3 1.55137845
## 16 x.TRAVTIME3 1.54019657
## 17 x.JOBCLASS_7 1.50680593
## 18 x.TRAVTIME 1.49150631
## 19 x.AGE3 1.47500617
## 20 x.YOJ3 1.35905074
## 21 x.SAMEHOME2 1.16758188
## 22 x.YOJ2 1.07284960
## 23 x.SAMEHOME3 0.97785583
## 24 x.SAMEHOME 0.95544938
## 25 x.YOJ 0.77660014
## 26 x.AREA 0.58298262
## 27 x.NPOLICY 0.51644298
## 28 x.MAX_EDUC_4 0.37871805
## 29 x.MAX_EDUC_3 0.30800492
## 30 x.HOMEKIDS 0.27854732
## 31 x.GENDER 0.24192356
## 32 x.MAX_EDUC_2 0.23989966
## 33 x.MARRIED 0.22518150
## 34 x.CAR_TYPE_3 0.20235934
## 35 x.CAR_TYPE_5 0.13154223
## 36 x.CAR_TYPE_2 0.11749727
## 37 x.CAR_TYPE_4 0.07014899
## 38 x.JOBCLASS_8 0.05862168
## 39 x.CAR_TYPE_6 0.05702679
## 40 x.KIDSDRIV 0.04873483
## 41 x.RED_CAR 0.04368389
## 42 x.JOBCLASS_9 0.03392040
## 43 x.JOBCLASS_6 0.02263796
## 44 x.JOBCLASS_3 0.01639964
## 45 x.MAX_EDUC_5 0.01360515
## 46 x.JOBCLASS_5 0.01221596
## 47 x.PARENT1 0.01024956
## 48 x.JOBCLASS_4 0.00000000
## 49 x.KIDSDRIV2 0.00000000
## 50 x.KIDSDRIV3 0.00000000
## 51 x.NPOLICY2 0.00000000
## 52 x.NPOLICY3 0.00000000
## 53 x.MVR_PTS2 0.00000000
## 54 x.MVR_PTS3 0.00000000
## 55 x.HOMEKIDS2 0.00000000
## 56 x.HOMEKIDS3 0.00000000
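Only about half the predictors receive nonzero influence at the best iteration. Since summary() returns the data frame printed above, the influential subset can be kept programmatically, and a partial dependence plot of the top variables is a natural next step; the plot call below assumes a gbm-style plot.TDboost(x, i.var, n.trees) method, so treat its signature as an assumption.
# keep only predictors that actually enter the boosted model
ri <- summary(fit, n.trees = best.iter)
ri[ri$rel.inf > 0, ]
# marginal effect of the two most influential predictors
# (assumes a gbm-style plot method; i.var may also take column indices)
plot(fit, i.var = "x.REVOLKED", n.trees = best.iter)
plot(fit, i.var = "x.MVR_PTS", n.trees = best.iter)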
# predict on the hold-out set at the selected number of trees
f.predict <- predict.TDboost(fit, test_auto, best.iter)
# total squared error on the hold-out set
print(sum((test_auto$y - f.predict)^2))
## [1] 64227.63
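The total squared error above scales with the size of the hold-out set; per-policy summaries are easier to compare across splits or models:
# per-policy error summaries on the hold-out set
mse <- mean((test_auto$y - f.predict)^2)
mae <- mean(abs(test_auto$y - f.predict))
c(MSE = mse, MAE = mae)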