library(car) #For vif()
library(leaps) #For regsubsets()
library(glmnet) #for glmnet()
library(statmod) #for tweedie()
library(tweedie) #for tweedie.profile()
library(boot) #for glm.diag.plots()
rm(list=ls())
#Dataset 2: Perovskite Stability
data2 = read.csv('Perovskite_Stability_with_features.csv')
head(data2)
dim(data2)
## [1] 1929 81
colnames(data2)
## [1] "ï..Material.Composition"
## [2] "A.site..1"
## [3] "A.site..2"
## [4] "A.site..3"
## [5] "B.site..1"
## [6] "B.site..2"
## [7] "B.site..3"
## [8] "X.site"
## [9] "Number.of.elements"
## [10] "energy_above_hull..meV.atom."
## [11] "formation_energy..eV.atom."
## [12] "num_of_atoms_host_Asite0"
## [13] "host_Asite0_Heat.of.Vaporization"
## [14] "host_Bsite0_at..wt."
## [15] "host_Bsite0_Ionization.Energy..kJ.mol."
## [16] "shannon_radii_AB_avg"
## [17] "Density_AB_avg"
## [18] "BCCefflatcnt_AB_avg"
## [19] "BCCvolume_padiff_AB_avg"
## [20] "GSenergy_pa_AB_avg"
## [21] "ICSDVolume_AB_avg"
## [22] "covalent.radius_AB_avg"
## [23] "Ionization.Energy..kJ.mol._AB_avg"
## [24] "Electron.Affinity..kJ.mol._AB_avg"
## [25] "Atomic.Volume..cm³.mol._AB_avg"
## [26] "MendeleevNumber_AB_avg"
## [27] "First.Ionization.Potential..V._AB_avg"
## [28] "thermal.conductivity_AB_avg"
## [29] "at..wt._AB_diff"
## [30] "specific.heat.capacity_AB_diff"
## [31] "electrical.conductivity_AB_diff"
## [32] "BCCefflatcnt_AB_ratio"
## [33] "Ionization.Energy..kJ.mol._AB_ratio"
## [34] "Heat.of.Vaporization_AB_ratio"
## [35] "Asite_BCCvolume_pa_weighted_avg"
## [36] "Asite_BCCvolume_padiff_weighted_avg"
## [37] "Asite_At..Radius....angstroms._weighted_avg"
## [38] "Asite_n_ws.third_weighted_avg"
## [39] "Bsite_.BP..K._weighted_avg"
## [40] "Bsite_At..Radius....angstroms._weighted_avg"
## [41] "Bsite_Second.Ionization.Potential...V._weighted_avg"
## [42] "Bsite_electrical.conductivity_weighted_avg"
## [43] "Asite_Ionic.Radius..angstroms._max"
## [44] "Asite_BCCenergy_pa_max"
## [45] "Asite_Atomic.Radius..Ã.._max"
## [46] "Asite_At..Radius....angstroms._max"
## [47] "Asite_Atomic.Volume..cm³.mol._max"
## [48] "Bsite_.BP..K._max"
## [49] "Bsite_At..Radius....angstroms._max"
## [50] "Bsite_First.Ionization.Potential..V._max"
## [51] "Bsite_Third.Ionization.Potential...V._max"
## [52] "Asite_shannon_radii_min"
## [53] "Asite_BCCenergy_pa_min"
## [54] "Asite_BCCenergydiff_min"
## [55] "Asite_GSmagmom_min"
## [56] "Asite_At..Radius....angstroms._min"
## [57] "Bsite_MendeleevNumber_min"
## [58] "Bsite_n_ws.third_min"
## [59] "Asite_shannon_radii_range"
## [60] "Asite_BCCefflatcnt_range"
## [61] "Asite_IsBoron_weighted_avg"
## [62] "Asite_IsHalogen_weighted_avg"
## [63] "Asite_IsPnictide_weighted_avg"
## [64] "Asite_IsRareEarth_weighted_avg"
## [65] "Asite_NfUnfilled_weighted_avg"
## [66] "Asite_NfValence_weighted_avg"
## [67] "Bsite_At..._weighted_avg"
## [68] "Bsite_Period_weighted_avg"
## [69] "Bsite_IsMetal_weighted_avg"
## [70] "Bsite_NdUnfilled_weighted_avg"
## [71] "Bsite_NpUnfilled_weighted_avg"
## [72] "host_Asite0_IsBCC"
## [73] "host_Asite0_IsCubic"
## [74] "host_Asite0_IsAlkali"
## [75] "host_Asite0_OrbitalD"
## [76] "host_Asite0_NsValence"
## [77] "host_Bsite0_At..."
## [78] "host_Bsite0_IsHexagonal"
## [79] "host_Bsite0_IsNoblegas"
## [80] "Asite_IsAlkali_max"
## [81] "Bsite_IsMetal_max"
# Number of unique elements in A.site (1,2,3) are (15,15,19)
length(unique(data2$A.site..1))
## [1] 15
length(unique(data2$A.site..2))
## [1] 19
length(unique(data2$A.site..3))
## [1] 5
# Number of unique elements in B.site (1,2,3) are (27,29,8)
length(unique(data2$B.site..1))
## [1] 27
length(unique(data2$B.site..2))
## [1] 29
length(unique(data2$B.site..3))
## [1] 8
#Always O in X.site
length(unique(data2$X.site))
## [1] 1
#Scaling features to have mean 0 and variance 1
data2[,12:81] = scale(data2[,12:81])
#Removing features with 0 SD
data2 = data2[,apply(data2,2,FUN = function(x){mean(is.na(x))}) != 1]
#Reorder feature columns alphabetically
reordered = data2[,(12:73)[order(colnames(data2)[12:73])]]
data2 = cbind(data2[,1:11],reordered)
slr_hull_fit = lm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,12:73)])
#Serious multicollinearity issues
vif(slr_hull_fit)
## Asite_At..Radius....angstroms._max
## 18.526347
## Asite_At..Radius....angstroms._min
## 37.707898
## Asite_At..Radius....angstroms._weighted_avg
## 5944.343689
## Asite_Atomic.Radius..Ã.._max
## 432.307276
## Asite_Atomic.Volume..cm³.mol._max
## 469.108078
## Asite_BCCefflatcnt_range
## 59.860994
## Asite_BCCenergy_pa_max
## 44.298279
## Asite_BCCenergy_pa_min
## 29.361962
## Asite_BCCenergydiff_min
## 356.767165
## Asite_BCCvolume_pa_weighted_avg
## 27154.203281
## Asite_BCCvolume_padiff_weighted_avg
## 89.416926
## Asite_Ionic.Radius..angstroms._max
## 46.638610
## Asite_IsAlkali_max
## 127.899654
## Asite_IsPnictide_weighted_avg
## 147.812322
## Asite_IsRareEarth_weighted_avg
## 51435.980075
## Asite_n_ws.third_weighted_avg
## 14748.571571
## Asite_NfUnfilled_weighted_avg
## 11476.286697
## Asite_NfValence_weighted_avg
## 2071.129958
## Asite_shannon_radii_min
## 271.736589
## Asite_shannon_radii_range
## 101.518182
## at..wt._AB_diff
## 1961.906842
## Atomic.Volume..cm³.mol._AB_avg
## 29041.229180
## BCCefflatcnt_AB_avg
## 6645.271065
## BCCefflatcnt_AB_ratio
## 646.099078
## BCCvolume_padiff_AB_avg
## 74.433831
## Bsite_.BP..K._max
## 12.471461
## Bsite_.BP..K._weighted_avg
## 54.818341
## Bsite_At..._weighted_avg
## 1655.531031
## Bsite_At..Radius....angstroms._max
## 23.692212
## Bsite_At..Radius....angstroms._weighted_avg
## 743.130799
## Bsite_electrical.conductivity_weighted_avg
## 2102.527734
## Bsite_First.Ionization.Potential..V._max
## 5.674401
## Bsite_IsMetal_weighted_avg
## 5.597967
## Bsite_MendeleevNumber_min
## 24.240071
## Bsite_n_ws.third_min
## 12.071546
## Bsite_NdUnfilled_weighted_avg
## 301.049343
## Bsite_NpUnfilled_weighted_avg
## 77.867742
## Bsite_Period_weighted_avg
## 2233.238185
## Bsite_Second.Ionization.Potential...V._weighted_avg
## 17.682964
## Bsite_Third.Ionization.Potential...V._max
## 6.774047
## covalent.radius_AB_avg
## 37026.060283
## Density_AB_avg
## 61.716436
## electrical.conductivity_AB_diff
## 2133.686893
## Electron.Affinity..kJ.mol._AB_avg
## 316.918391
## First.Ionization.Potential..V._AB_avg
## 897.144887
## GSenergy_pa_AB_avg
## 399.621229
## Heat.of.Vaporization_AB_ratio
## 35.932343
## host_Asite0_Heat.of.Vaporization
## 20.511384
## host_Asite0_IsBCC
## 6.266314
## host_Asite0_OrbitalD
## 19.541754
## host_Bsite0_At...
## 1416.796345
## host_Bsite0_at..wt.
## 1391.458383
## host_Bsite0_Ionization.Energy..kJ.mol.
## 14.902716
## host_Bsite0_IsHexagonal
## 7.086327
## ICSDVolume_AB_avg
## 23137.450032
## Ionization.Energy..kJ.mol._AB_avg
## 1542.052435
## Ionization.Energy..kJ.mol._AB_ratio
## 396.681867
## MendeleevNumber_AB_avg
## 314.038787
## num_of_atoms_host_Asite0
## 4.390351
## shannon_radii_AB_avg
## 377.869418
## specific.heat.capacity_AB_diff
## 104.089090
## thermal.conductivity_AB_avg
## 434.711982
cor_mat = cor(data2[,12:73])
diag(cor_mat) = 0
prob_feature_pair = data.frame(feature1 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,1]],
feature2 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,2]],
corr = cor_mat[which(abs(cor_mat) > 0.7, arr.ind = T)])
prob_feature_pair[!duplicated(prob_feature_pair$corr),]
Based on the variance inflation factors (VIF > 10), there are serious multicollinearity issues for the features.
To solve the multicollinearity issues, we consider stepwise/LASSO methods to drop some of the variables:
# Perform Stepwise variable selection to try and reduce the multicollinearity issue
#Forward Stepwise
slr_hull_fwd=regsubsets(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,12:73)], nvmax =62,
method ="forward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_hull_fwd)$rss, type = 'l'); points(which.min(summary(slr_hull_fwd)$rss),min(summary(slr_hull_fwd)$rss))
plot(summary(slr_hull_fwd)$adjr2, type = 'l') ; points(which.max(summary(slr_hull_fwd)$adjr2),max(summary(slr_hull_fwd)$adjr2))
plot(summary(slr_hull_fwd)$cp, type = 'l') ; points(which.min(summary(slr_hull_fwd)$cp),min(summary(slr_hull_fwd)$cp))
plot(summary(slr_hull_fwd)$bic, type = 'l') ; points(which.min(summary(slr_hull_fwd)$bic),min(summary(slr_hull_fwd)$bic))
#Backward Stepwise
slr_hull_bwd=regsubsets(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,12:73)], nvmax =62,
method ="backward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_hull_bwd)$rss, type = 'l'); points(which.min(summary(slr_hull_bwd)$rss),min(summary(slr_hull_bwd)$rss))
plot(summary(slr_hull_bwd)$adjr2, type = 'l') ; points(which.max(summary(slr_hull_bwd)$adjr2),max(summary(slr_hull_bwd)$adjr2))
plot(summary(slr_hull_bwd)$cp, type = 'l') ; points(which.min(summary(slr_hull_bwd)$cp),min(summary(slr_hull_bwd)$cp))
plot(summary(slr_hull_bwd)$bic, type = 'l') ; points(which.min(summary(slr_hull_bwd)$bic),min(summary(slr_hull_bwd)$bic))
#### Lasso ####
set.seed(1)
cv_slr_hull_lasso = cv.glmnet(x = model.matrix(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,12:73)])[,-1],
y = data2$energy_above_hull..meV.atom.,
alpha = 1, family = 'gaussian', standardize = FALSE)
cv_slr_hull_lasso$lambda.min
## [1] 0.01885837
cv_slr_hull_lasso$lambda.1se
## [1] 1.975632
out_slr_hull_lasso=glmnet(x = model.matrix(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,12:73)])[,-1],
y = data2$energy_above_hull..meV.atom.,
alpha = 1, family = 'gaussian', standardize = FALSE)
data.frame(lambda.min = as.vector(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min)), lambda.1se = as.vector(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.1se)), row.names = rownames(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min)))
plot(cv_slr_hull_lasso)
By using BIC as the selection criterion, forward stepwise selection method chose the model with 30 variables (hence dropped 32 variables), while the backward stepwise selection method resulted in a model with 37 variables (and 25 variables dropped). Whereas Lasso combined with 10-fold cross-validation to select the optimal tuning parameter \(\lambda\) resulted in a model with 59 variables (3 variables dropped) when using \(\lambda_{\min} =\) 0.0188584 and another model with 20 variables (hence dropped 42 variables) when using \(\lambda_{1,SE} =\) 1.9756323.
The variables that have been dropped based on each of the method are given as:
#Stepwise Forward Selection
colnames(data2)[12:73][!colnames(data2)[12:73] %in% names(coef(slr_hull_fwd, which.min(summary(slr_hull_fwd)$bic)))]
## [1] "Asite_At..Radius....angstroms._min"
## [2] "Asite_At..Radius....angstroms._weighted_avg"
## [3] "Asite_BCCefflatcnt_range"
## [4] "Asite_BCCenergy_pa_min"
## [5] "Asite_BCCvolume_pa_weighted_avg"
## [6] "Asite_BCCvolume_padiff_weighted_avg"
## [7] "Asite_Ionic.Radius..angstroms._max"
## [8] "Asite_IsAlkali_max"
## [9] "Asite_IsPnictide_weighted_avg"
## [10] "Asite_n_ws.third_weighted_avg"
## [11] "Asite_NfUnfilled_weighted_avg"
## [12] "Asite_NfValence_weighted_avg"
## [13] "at..wt._AB_diff"
## [14] "Atomic.Volume..cm³.mol._AB_avg"
## [15] "BCCefflatcnt_AB_avg"
## [16] "Bsite_.BP..K._weighted_avg"
## [17] "Bsite_At..Radius....angstroms._max"
## [18] "Bsite_At..Radius....angstroms._weighted_avg"
## [19] "Bsite_IsMetal_weighted_avg"
## [20] "Bsite_n_ws.third_min"
## [21] "Bsite_NdUnfilled_weighted_avg"
## [22] "Bsite_NpUnfilled_weighted_avg"
## [23] "covalent.radius_AB_avg"
## [24] "Electron.Affinity..kJ.mol._AB_avg"
## [25] "Heat.of.Vaporization_AB_ratio"
## [26] "host_Asite0_Heat.of.Vaporization"
## [27] "host_Asite0_IsBCC"
## [28] "host_Bsite0_At..."
## [29] "host_Bsite0_at..wt."
## [30] "host_Bsite0_Ionization.Energy..kJ.mol."
## [31] "ICSDVolume_AB_avg"
## [32] "Ionization.Energy..kJ.mol._AB_ratio"
#Stepwise Backward Selection
colnames(data2)[12:73][!colnames(data2)[12:73] %in% names(coef(slr_hull_bwd, which.min(summary(slr_hull_bwd)$bic)))]
## [1] "Asite_At..Radius....angstroms._max"
## [2] "Asite_At..Radius....angstroms._weighted_avg"
## [3] "Asite_BCCefflatcnt_range"
## [4] "Asite_BCCenergy_pa_max"
## [5] "Asite_BCCenergy_pa_min"
## [6] "Asite_BCCenergydiff_min"
## [7] "Asite_Ionic.Radius..angstroms._max"
## [8] "Asite_IsAlkali_max"
## [9] "Asite_shannon_radii_min"
## [10] "at..wt._AB_diff"
## [11] "Atomic.Volume..cm³.mol._AB_avg"
## [12] "Bsite_.BP..K._weighted_avg"
## [13] "Bsite_MendeleevNumber_min"
## [14] "Bsite_n_ws.third_min"
## [15] "Bsite_Second.Ionization.Potential...V._weighted_avg"
## [16] "Bsite_Third.Ionization.Potential...V._max"
## [17] "Electron.Affinity..kJ.mol._AB_avg"
## [18] "Heat.of.Vaporization_AB_ratio"
## [19] "host_Asite0_Heat.of.Vaporization"
## [20] "host_Asite0_IsBCC"
## [21] "host_Bsite0_At..."
## [22] "host_Bsite0_at..wt."
## [23] "host_Bsite0_Ionization.Energy..kJ.mol."
## [24] "host_Bsite0_IsHexagonal"
## [25] "num_of_atoms_host_Asite0"
#Lasso with lambda.min
rownames(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min))[which(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min) == 0)]
## [1] "Asite_IsAlkali_max" "host_Bsite0_at..wt." "ICSDVolume_AB_avg"
#Lasso with lambda.1se
rownames(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.1se))[which(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.1se) == 0)]
## [1] "Asite_At..Radius....angstroms._max"
## [2] "Asite_At..Radius....angstroms._min"
## [3] "Asite_At..Radius....angstroms._weighted_avg"
## [4] "Asite_Atomic.Radius..Ã.._max"
## [5] "Asite_BCCefflatcnt_range"
## [6] "Asite_BCCenergy_pa_max"
## [7] "Asite_BCCenergy_pa_min"
## [8] "Asite_BCCenergydiff_min"
## [9] "Asite_BCCvolume_pa_weighted_avg"
## [10] "Asite_BCCvolume_padiff_weighted_avg"
## [11] "Asite_Ionic.Radius..angstroms._max"
## [12] "Asite_IsAlkali_max"
## [13] "Asite_n_ws.third_weighted_avg"
## [14] "Asite_NfUnfilled_weighted_avg"
## [15] "Asite_NfValence_weighted_avg"
## [16] "Asite_shannon_radii_min"
## [17] "at..wt._AB_diff"
## [18] "Atomic.Volume..cm³.mol._AB_avg"
## [19] "BCCefflatcnt_AB_avg"
## [20] "BCCefflatcnt_AB_ratio"
## [21] "BCCvolume_padiff_AB_avg"
## [22] "Bsite_.BP..K._max"
## [23] "Bsite_.BP..K._weighted_avg"
## [24] "Bsite_At..._weighted_avg"
## [25] "Bsite_At..Radius....angstroms._max"
## [26] "Bsite_At..Radius....angstroms._weighted_avg"
## [27] "Bsite_electrical.conductivity_weighted_avg"
## [28] "Bsite_n_ws.third_min"
## [29] "Bsite_NdUnfilled_weighted_avg"
## [30] "Bsite_NpUnfilled_weighted_avg"
## [31] "Bsite_Second.Ionization.Potential...V._weighted_avg"
## [32] "covalent.radius_AB_avg"
## [33] "Density_AB_avg"
## [34] "host_Asite0_Heat.of.Vaporization"
## [35] "host_Asite0_IsBCC"
## [36] "host_Bsite0_At..."
## [37] "host_Bsite0_at..wt."
## [38] "host_Bsite0_Ionization.Energy..kJ.mol."
## [39] "ICSDVolume_AB_avg"
## [40] "Ionization.Energy..kJ.mol._AB_avg"
## [41] "Ionization.Energy..kJ.mol._AB_ratio"
## [42] "MendeleevNumber_AB_avg"
Therefore, the aggressiveness in terms of dropping variables from the strongest to the weakest are: Lasso-1se > Forward Stepwise > Backward Stepwise > Lasso-min.
The following presents the comparison of the residual sum of squares (RSS), adjusted R\(^2\) (AdjR2), Mallow CP and BIC for the models selected by backward stepwise and forward stepwise method, indicating that backward stepwise outperforms the forward stepwise method in terms of all criterion.
compare_fwd_bwd = data.frame(rbind(rbind(summary(slr_hull_fwd)$rss, summary(slr_hull_fwd)$adjr2, summary(slr_hull_fwd)$cp, summary(slr_hull_fwd)$bic)[,which.min(summary(slr_hull_fwd)$bic)],
rbind(summary(slr_hull_bwd)$rss, summary(slr_hull_bwd)$adjr2, summary(slr_hull_bwd)$cp, summary(slr_hull_bwd)$bic)[,which.min(summary(slr_hull_bwd)$bic)]
))
rownames(compare_fwd_bwd) = c('Forward','Backward')
colnames(compare_fwd_bwd) = c('RSS','AdjR2','Mallow-CP','BIC')
compare_fwd_bwd
Based on the four sets of selected variables - Forward, Backward, Lasso-1se, Lasso-min, we then examine their corresponding VIFs:
#VIF seems to be better compared to using all features, although still some problems with (Site2_MeltingT,Site2_BCCenergy_pa,BCCenergy_pa_composition_average)
var_all = colnames(data2)[12:73]
var_fwd = names(coef(slr_hull_fwd, which.min(summary(slr_hull_fwd)$bic)))[-1]
#var_fwd = var_fwd[-which(var_fwd %in% c('Asite_Atomic.Radius..Ã.._max','Asite_shannon_radii_min','Asite_Atomic.Volume..cm³.mol._max','First.Ionization.Potential..V._AB_avg','Bsite_At..._weighted_avg','Bsite_electrical.conductivity_weighted_avg'))]
var_bwd = names(coef(slr_hull_bwd, which.min(summary(slr_hull_bwd)$bic)))[-1]
var_lasso_1se = rownames(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.1se))[which(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.1se) != 0)][-1]
var_lasso_min = rownames(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min))[which(predict(out_slr_hull_lasso ,type="coefficients",s=cv_slr_hull_lasso$lambda.min) != 0)][-1]
slr_hull_fwd_fit = lm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_fwd))])
slr_hull_bwd_fit = lm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_bwd))])
slr_hull_lasso_1se_fit = lm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))])
slr_hull_lasso_min_fit = lm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))])
#All
vif(slr_hull_fit)
## Asite_At..Radius....angstroms._max
## 18.526347
## Asite_At..Radius....angstroms._min
## 37.707898
## Asite_At..Radius....angstroms._weighted_avg
## 5944.343689
## Asite_Atomic.Radius..Ã.._max
## 432.307276
## Asite_Atomic.Volume..cm³.mol._max
## 469.108078
## Asite_BCCefflatcnt_range
## 59.860994
## Asite_BCCenergy_pa_max
## 44.298279
## Asite_BCCenergy_pa_min
## 29.361962
## Asite_BCCenergydiff_min
## 356.767165
## Asite_BCCvolume_pa_weighted_avg
## 27154.203281
## Asite_BCCvolume_padiff_weighted_avg
## 89.416926
## Asite_Ionic.Radius..angstroms._max
## 46.638610
## Asite_IsAlkali_max
## 127.899654
## Asite_IsPnictide_weighted_avg
## 147.812322
## Asite_IsRareEarth_weighted_avg
## 51435.980075
## Asite_n_ws.third_weighted_avg
## 14748.571571
## Asite_NfUnfilled_weighted_avg
## 11476.286697
## Asite_NfValence_weighted_avg
## 2071.129958
## Asite_shannon_radii_min
## 271.736589
## Asite_shannon_radii_range
## 101.518182
## at..wt._AB_diff
## 1961.906842
## Atomic.Volume..cm³.mol._AB_avg
## 29041.229180
## BCCefflatcnt_AB_avg
## 6645.271065
## BCCefflatcnt_AB_ratio
## 646.099078
## BCCvolume_padiff_AB_avg
## 74.433831
## Bsite_.BP..K._max
## 12.471461
## Bsite_.BP..K._weighted_avg
## 54.818341
## Bsite_At..._weighted_avg
## 1655.531031
## Bsite_At..Radius....angstroms._max
## 23.692212
## Bsite_At..Radius....angstroms._weighted_avg
## 743.130799
## Bsite_electrical.conductivity_weighted_avg
## 2102.527734
## Bsite_First.Ionization.Potential..V._max
## 5.674401
## Bsite_IsMetal_weighted_avg
## 5.597967
## Bsite_MendeleevNumber_min
## 24.240071
## Bsite_n_ws.third_min
## 12.071546
## Bsite_NdUnfilled_weighted_avg
## 301.049343
## Bsite_NpUnfilled_weighted_avg
## 77.867742
## Bsite_Period_weighted_avg
## 2233.238185
## Bsite_Second.Ionization.Potential...V._weighted_avg
## 17.682964
## Bsite_Third.Ionization.Potential...V._max
## 6.774047
## covalent.radius_AB_avg
## 37026.060283
## Density_AB_avg
## 61.716436
## electrical.conductivity_AB_diff
## 2133.686893
## Electron.Affinity..kJ.mol._AB_avg
## 316.918391
## First.Ionization.Potential..V._AB_avg
## 897.144887
## GSenergy_pa_AB_avg
## 399.621229
## Heat.of.Vaporization_AB_ratio
## 35.932343
## host_Asite0_Heat.of.Vaporization
## 20.511384
## host_Asite0_IsBCC
## 6.266314
## host_Asite0_OrbitalD
## 19.541754
## host_Bsite0_At...
## 1416.796345
## host_Bsite0_at..wt.
## 1391.458383
## host_Bsite0_Ionization.Energy..kJ.mol.
## 14.902716
## host_Bsite0_IsHexagonal
## 7.086327
## ICSDVolume_AB_avg
## 23137.450032
## Ionization.Energy..kJ.mol._AB_avg
## 1542.052435
## Ionization.Energy..kJ.mol._AB_ratio
## 396.681867
## MendeleevNumber_AB_avg
## 314.038787
## num_of_atoms_host_Asite0
## 4.390351
## shannon_radii_AB_avg
## 377.869418
## specific.heat.capacity_AB_diff
## 104.089090
## thermal.conductivity_AB_avg
## 434.711982
#Stepwise Forward Selection
vif(slr_hull_fwd_fit)
## Asite_At..Radius....angstroms._max
## 8.587176
## Asite_Atomic.Radius..Ã.._max
## 216.004385
## Asite_Atomic.Volume..cm³.mol._max
## 185.311054
## Asite_BCCenergy_pa_max
## 14.622533
## Asite_BCCenergydiff_min
## 55.180037
## Asite_IsRareEarth_weighted_avg
## 16.827872
## Asite_shannon_radii_min
## 53.382067
## Asite_shannon_radii_range
## 38.834122
## BCCefflatcnt_AB_ratio
## 20.181720
## BCCvolume_padiff_AB_avg
## 4.110279
## Bsite_.BP..K._max
## 3.931401
## Bsite_At..._weighted_avg
## 46.985162
## Bsite_electrical.conductivity_weighted_avg
## 116.937020
## Bsite_First.Ionization.Potential..V._max
## 3.494056
## Bsite_MendeleevNumber_min
## 7.292043
## Bsite_Period_weighted_avg
## 47.492071
## Bsite_Second.Ionization.Potential...V._weighted_avg
## 4.562565
## Bsite_Third.Ionization.Potential...V._max
## 2.023713
## Density_AB_avg
## 26.073512
## electrical.conductivity_AB_diff
## 61.787318
## First.Ionization.Potential..V._AB_avg
## 248.588928
## GSenergy_pa_AB_avg
## 8.415448
## host_Asite0_OrbitalD
## 3.802189
## host_Bsite0_IsHexagonal
## 2.135208
## Ionization.Energy..kJ.mol._AB_avg
## 268.236425
## MendeleevNumber_AB_avg
## 11.644961
## num_of_atoms_host_Asite0
## 2.355069
## shannon_radii_AB_avg
## 15.903556
## specific.heat.capacity_AB_diff
## 15.443977
## thermal.conductivity_AB_avg
## 45.959112
#Stepwise Backward Selection
vif(slr_hull_bwd_fit)
## Asite_At..Radius....angstroms._min
## 10.046026
## Asite_Atomic.Radius..Ã.._max
## 307.153006
## Asite_Atomic.Volume..cm³.mol._max
## 292.981240
## Asite_BCCvolume_pa_weighted_avg
## 4409.460122
## Asite_BCCvolume_padiff_weighted_avg
## 39.657503
## Asite_IsPnictide_weighted_avg
## 22.417511
## Asite_IsRareEarth_weighted_avg
## 3034.504296
## Asite_n_ws.third_weighted_avg
## 810.517152
## Asite_NfUnfilled_weighted_avg
## 1012.984906
## Asite_NfValence_weighted_avg
## 160.931542
## Asite_shannon_radii_range
## 6.236888
## BCCefflatcnt_AB_avg
## 1839.936550
## BCCefflatcnt_AB_ratio
## 419.790501
## BCCvolume_padiff_AB_avg
## 35.558127
## Bsite_.BP..K._max
## 7.145015
## Bsite_At..._weighted_avg
## 287.274981
## Bsite_At..Radius....angstroms._max
## 10.307670
## Bsite_At..Radius....angstroms._weighted_avg
## 300.051662
## Bsite_electrical.conductivity_weighted_avg
## 539.945463
## Bsite_First.Ionization.Potential..V._max
## 3.741149
## Bsite_IsMetal_weighted_avg
## 3.265308
## Bsite_NdUnfilled_weighted_avg
## 113.720232
## Bsite_NpUnfilled_weighted_avg
## 25.183898
## Bsite_Period_weighted_avg
## 502.349464
## covalent.radius_AB_avg
## 3683.653538
## Density_AB_avg
## 41.460949
## electrical.conductivity_AB_diff
## 460.417872
## First.Ionization.Potential..V._AB_avg
## 461.954802
## GSenergy_pa_AB_avg
## 62.159833
## host_Asite0_OrbitalD
## 6.629950
## ICSDVolume_AB_avg
## 4743.993580
## Ionization.Energy..kJ.mol._AB_avg
## 684.000600
## Ionization.Energy..kJ.mol._AB_ratio
## 160.407048
## MendeleevNumber_AB_avg
## 59.995925
## shannon_radii_AB_avg
## 82.912983
## specific.heat.capacity_AB_diff
## 36.732503
## thermal.conductivity_AB_avg
## 138.832035
#Lasso 1se
vif(slr_hull_lasso_1se_fit)
## Asite_Atomic.Volume..cm³.mol._max
## 10.233934
## Asite_IsPnictide_weighted_avg
## 1.053859
## Asite_IsRareEarth_weighted_avg
## 4.909396
## Asite_shannon_radii_range
## 2.556826
## Bsite_First.Ionization.Potential..V._max
## 2.818421
## Bsite_IsMetal_weighted_avg
## 1.156702
## Bsite_MendeleevNumber_min
## 2.201740
## Bsite_Period_weighted_avg
## 5.166424
## Bsite_Third.Ionization.Potential...V._max
## 2.306072
## electrical.conductivity_AB_diff
## 3.995491
## Electron.Affinity..kJ.mol._AB_avg
## 5.627982
## First.Ionization.Potential..V._AB_avg
## 4.415987
## GSenergy_pa_AB_avg
## 2.277503
## Heat.of.Vaporization_AB_ratio
## 3.949192
## host_Asite0_OrbitalD
## 2.313357
## host_Bsite0_IsHexagonal
## 1.369712
## num_of_atoms_host_Asite0
## 1.974678
## shannon_radii_AB_avg
## 8.043730
## specific.heat.capacity_AB_diff
## 11.996748
## thermal.conductivity_AB_avg
## 4.553375
#Lasso min
vif(slr_hull_lasso_min_fit)
## Asite_At..Radius....angstroms._max
## 18.372555
## Asite_At..Radius....angstroms._min
## 37.337987
## Asite_At..Radius....angstroms._weighted_avg
## 2657.645766
## Asite_Atomic.Radius..Ã.._max
## 429.117797
## Asite_Atomic.Volume..cm³.mol._max
## 462.682197
## Asite_BCCefflatcnt_range
## 58.294369
## Asite_BCCenergy_pa_max
## 31.974854
## Asite_BCCenergy_pa_min
## 28.960664
## Asite_BCCenergydiff_min
## 144.422396
## Asite_BCCvolume_pa_weighted_avg
## 21652.859333
## Asite_BCCvolume_padiff_weighted_avg
## 86.189124
## Asite_Ionic.Radius..angstroms._max
## 44.758058
## Asite_IsPnictide_weighted_avg
## 68.906602
## Asite_IsRareEarth_weighted_avg
## 20794.445207
## Asite_n_ws.third_weighted_avg
## 12227.224194
## Asite_NfUnfilled_weighted_avg
## 4501.179149
## Asite_NfValence_weighted_avg
## 707.440817
## Asite_shannon_radii_min
## 261.817166
## Asite_shannon_radii_range
## 96.667167
## at..wt._AB_diff
## 1894.921205
## Atomic.Volume..cm³.mol._AB_avg
## 14541.743679
## BCCefflatcnt_AB_avg
## 5246.388836
## BCCefflatcnt_AB_ratio
## 642.758204
## BCCvolume_padiff_AB_avg
## 69.728562
## Bsite_.BP..K._max
## 12.453786
## Bsite_.BP..K._weighted_avg
## 49.638246
## Bsite_At..._weighted_avg
## 1476.898314
## Bsite_At..Radius....angstroms._max
## 23.463622
## Bsite_At..Radius....angstroms._weighted_avg
## 604.377961
## Bsite_electrical.conductivity_weighted_avg
## 1902.565678
## Bsite_First.Ionization.Potential..V._max
## 5.567805
## Bsite_IsMetal_weighted_avg
## 4.968652
## Bsite_MendeleevNumber_min
## 23.670064
## Bsite_n_ws.third_min
## 12.048272
## Bsite_NdUnfilled_weighted_avg
## 217.618729
## Bsite_NpUnfilled_weighted_avg
## 76.247082
## Bsite_Period_weighted_avg
## 1539.531331
## Bsite_Second.Ionization.Potential...V._weighted_avg
## 14.524441
## Bsite_Third.Ionization.Potential...V._max
## 6.593944
## covalent.radius_AB_avg
## 16699.604777
## Density_AB_avg
## 59.847037
## electrical.conductivity_AB_diff
## 1836.102390
## Electron.Affinity..kJ.mol._AB_avg
## 267.032957
## First.Ionization.Potential..V._AB_avg
## 888.194670
## GSenergy_pa_AB_avg
## 244.738995
## Heat.of.Vaporization_AB_ratio
## 34.273725
## host_Asite0_Heat.of.Vaporization
## 20.365887
## host_Asite0_IsBCC
## 6.227376
## host_Asite0_OrbitalD
## 19.467838
## host_Bsite0_At...
## 6.611423
## host_Bsite0_Ionization.Energy..kJ.mol.
## 14.390758
## host_Bsite0_IsHexagonal
## 5.917165
## Ionization.Energy..kJ.mol._AB_avg
## 1415.267916
## Ionization.Energy..kJ.mol._AB_ratio
## 388.002702
## MendeleevNumber_AB_avg
## 306.684898
## num_of_atoms_host_Asite0
## 4.343475
## shannon_radii_AB_avg
## 338.313957
## specific.heat.capacity_AB_diff
## 84.869308
## thermal.conductivity_AB_avg
## 405.562693
Compared to the full model, the model selected by different variable selection methods seem to have less multicollinearity issues indicated by the smaller VIFs, with the model selected by lasso.1se having VIFs that are mostly smaller than 10. It is worth noting that we can drop more variables (e.g. for the forward selection model by uncommenting the line of code for var_fwd
) to achieve much smaller VIFs if required , and the results in the following would still be largely similar.
Next, we look at the residual diagnostic plots for each of the models above:
#Diagnostic Plots
par(mfrow = c(2,2))
#All
plot(slr_hull_fit, which = c(1,2,4,5))
#Stepwise Forward Selection
plot(slr_hull_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(slr_hull_bwd_fit, which = c(1,2,4,5))
#Lasso 1se
plot(slr_hull_lasso_1se_fit, which = c(1,2,4,5))
#Lasso min
plot(slr_hull_lasso_min_fit, which = c(1,2,4,5))
All five models suffered from heteroskedasticity in the residual plot, and departure from normality at the lower and upper tails of the residuals. This motivates us to consider a square-root transformation for the response. Also, the apparent linear line forming the lower bound for the residuals is due to the observations with 0 \(E_{hull}\). To model these zeros, it might be useful to consider a Tweedie GLM or a zero-inflated model in the future.
In addition, all five Cook’s distance plots suggest that there are a few highly influential instances (two of which are related to the A-site Y element and B-site Fe element) as seen below:
data2[c(402,1000,1627,1682,1734),]
sqrt_slr_hull_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ . , data = data2[,c(10,which(colnames(data2) %in% var_all))])
sqrt_slr_hull_fwd_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ . , data = data2[,c(10,which(colnames(data2) %in% var_fwd))])
sqrt_slr_hull_bwd_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ . , data = data2[,c(10,which(colnames(data2) %in% var_bwd))])
sqrt_slr_hull_lasso_1se_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))])
sqrt_slr_hull_lasso_min_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))])
#Diagnostic Plots
par(mfrow = c(2,2))
plot(sqrt_slr_hull_fit, which = c(1,2,4,5))
plot(sqrt_slr_hull_fwd_fit, which = c(1,2,4,5))
plot(sqrt_slr_hull_bwd_fit, which = c(1,2,4,5))
plot(sqrt_slr_hull_lasso_1se_fit, which = c(1,2,4,5))
plot(sqrt_slr_hull_lasso_min_fit, which = c(1,2,4,5))
After applying a square-root transformation to \(E_{hull}\), the residual plots seem to satisfy the homoskedasticity assumption, while there are still some issues with departure from normality for the lower and upper tails of residuals although the issues are less severe now. The Cook’s distance plots suggest that there are a few highly influential instances (two of which are related to the A-site Y element and B-site Fe element, and two of which are related to the A-site Sr element and B-site V element) as seen below:
data2[c(402,1000,1626,1627,1682,1734),]
In this section, we make use of the above four sets of chosen features and fit regression model where we add quadratic terms of each selected feature into the model, while using the square-root transformed \(E_{hull}\) as the response.
quadratic_slr_hull_formula =as.formula(
paste(
'sqrt(energy_above_hull..meV.atom.)', '~', paste(
var_all, collapse = '+'
), '+', paste(
'I(',var_all,'^2)', collapse = '+'
)
)
)
quadratic_slr_hull_fwd_formula =as.formula(
paste(
'sqrt(energy_above_hull..meV.atom.)', '~', paste(
var_fwd, collapse = '+'
), '+', paste(
'I(',var_fwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_hull_bwd_formula =as.formula(
paste(
'sqrt(energy_above_hull..meV.atom.)', '~', paste(
var_bwd, collapse = '+'
), '+', paste(
'I(',var_bwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_hull_lasso_1se_formula =as.formula(
paste(
'sqrt(energy_above_hull..meV.atom.)', '~', paste(
var_lasso_1se, collapse = '+'
), '+', paste(
'I(',var_lasso_1se,'^2)', collapse = '+'
)
)
)
quadratic_slr_hull_lasso_min_formula =as.formula(
paste(
'sqrt(energy_above_hull..meV.atom.)', '~', paste(
var_lasso_min, collapse = '+'
), '+', paste(
'I(',var_lasso_min,'^2)', collapse = '+'
)
)
)
#All
quadratic_slr_hull_fit = lm(quadratic_slr_hull_formula, data = data2[,c(10,which(colnames(data2) %in% var_all))])
#quadratic_slr_hull_formula
#Stepwise Forward Selection
quadratic_slr_hull_fwd_fit = lm(quadratic_slr_hull_fwd_formula, data = data2[,c(10,which(colnames(data2) %in% var_fwd))])
#quadratic_slr_hull_fwd_formula
#Stepwise Backward Selection
quadratic_slr_hull_bwd_fit = lm(quadratic_slr_hull_bwd_formula, data = data2[,c(10,which(colnames(data2) %in% var_bwd))])
#quadratic_slr_hull_bwd_formula
#Lasso-1se
quadratic_slr_hull_lasso_1se_fit = lm(quadratic_slr_hull_lasso_1se_formula, data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))])
#quadratic_slr_hull_lasso_1se_formula
#Lasso-min
quadratic_slr_hull_lasso_min_fit = lm(quadratic_slr_hull_lasso_min_formula, data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))])
#quadratic_slr_hull_lasso_min_formula
The residual diagnostic plots of each fitted model with quadratic terms are given below:
par(mfrow = c(2,2))
#All
plot(quadratic_slr_hull_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(quadratic_slr_hull_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(quadratic_slr_hull_bwd_fit, which = c(1,2,4,5))
#Lasso-1se
plot(quadratic_slr_hull_lasso_1se_fit, which = c(1,2,4,5))
#Lasso-min
plot(quadratic_slr_hull_lasso_min_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
After including quadratic terms of features into the models, the residual plots seem similar as those with just the linear terms. In addition, the Cook’s distance plots suggest a similar set of influential observations (two of which are related to the A-site Y element and B-site Fe element):
data2[c(1000,1627,1682,1734),]
In this section, we make use of the above four sets of chosen features and fit regression model where we add all possible pairwise interaction terms of the selected features into the model, while using the square-root transformed \(E_{hull}\) as the response. The main motivation for using interaction terms is due to some features being related to the elements in A-site while other features are related to the elements in B-site.
interact_slr_hull_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_all))])
interact_slr_hull_fwd_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_fwd))])
interact_slr_hull_bwd_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_bwd))])
interact_slr_hull_lasso_1se_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))])
interact_slr_hull_lasso_min_fit = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))])
The residual diagnostic plots of each fitted model with interaction terms are given below:
par(mfrow = c(2,2))
#All
plot(interact_slr_hull_fit, which = c(1,2,4,5))
## Warning: not plotting observations with leverage one:
## 1, 2, 5, 17, 18, 28, 53, 54, 55, 57, 58, 59, 60, 61, 62, 89, 92, 112, 129, 232, 233, 234, 235, 236, 237, 238, 239, 256, 265, 269, 285, 286, 287, 299, 386, 419, 482, 483, 484, 485, 622, 641, 660, 661, 670, 781, 1000, 1019, 1368, 1369, 1472, 1508, 1513, 1514, 1515, 1523, 1532, 1540, 1541, 1542, 1543, 1544, 1547, 1616, 1626, 1634, 1644, 1682, 1734
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(interact_slr_hull_fwd_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Backward Selection
plot(interact_slr_hull_bwd_fit, which = c(1,2,4,5))
## Warning: not plotting observations with leverage one:
## 92, 484, 485, 1000, 1369, 1616, 1626, 1634, 1644, 1652, 1662, 1682, 1734
## Warning: NaNs produced
## Warning: NaNs produced
#Lasso-1se
plot(interact_slr_hull_lasso_1se_fit, which = c(1,2,4,5))
## Warning: not plotting observations with leverage one:
## 484, 485, 1682, 1734
## Warning: NaNs produced
## Warning: NaNs produced
#Lasso-min
plot(interact_slr_hull_lasso_min_fit, which = c(1,2,4,5))
## Warning: not plotting observations with leverage one:
## 2, 5, 17, 18, 28, 29, 53, 54, 55, 56, 57, 58, 61, 62, 92, 112, 232, 233, 234, 235, 236, 237, 238, 239, 256, 265, 285, 286, 287, 299, 420, 482, 483, 484, 485, 622, 641, 660, 661, 781, 1000, 1019, 1368, 1369, 1455, 1471, 1507, 1508, 1513, 1515, 1522, 1540, 1542, 1543, 1548, 1616, 1626, 1634, 1652, 1682, 1734
## Warning: NaNs produced
## Warning: NaNs produced
The residual plots and QQplots for models with interaction terms are similar to those of models with quadratic terms above. However, it can be seen that when interaction terms are used, the leverage of some observations are equal to one, resulting in undefined Cook’s distance. Therefore, the models with interaction terms might not be useful for identifying influential observations due to the undefined Cook’s distance. Note that observations 1682 and 1734 that are always identified as influential observations in the analyses above are found to have leverage one in all of the models with interaction terms (except for interact_slr_hull_fwd_fit
). Regardless, the Cook’s distance plots suggest the following influential observations (excluding those with leverage one):
data2[c(62,255,417,419,622,1000,1521,1606,1644,1662,1682,1734),]
Finally, the problem of observations with leverage one is due to the high discreteness of some of the features in the dataset: the number of unique values for some features is small relative to the number of observations. To solve this, we can remove those features and refit the models with pairwise interaction terms as follow:
non_unique_feature = names(which(apply(data2[,12:73], 2 , function(x){length(unique(x))}) <=40))
interact_slr_hull_fit2 = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_all & !colnames(data2) %in% non_unique_feature ))])
interact_slr_hull_fwd_fit2 = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_fwd & !colnames(data2) %in% non_unique_feature))])
interact_slr_hull_bwd_fit2 = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_bwd & !colnames(data2) %in% non_unique_feature))])
interact_slr_hull_lasso_1se_fit2 = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se & !colnames(data2) %in% non_unique_feature))])
interact_slr_hull_lasso_min_fit2 = lm(sqrt(energy_above_hull..meV.atom.) ~ .^2, data = data2[,c(10,which(colnames(data2) %in% var_lasso_min & !colnames(data2) %in% non_unique_feature))])
par(mfrow = c(2,2))
#All
plot(interact_slr_hull_fit2, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(interact_slr_hull_fwd_fit2, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(interact_slr_hull_bwd_fit2, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Lasso-1se
plot(interact_slr_hull_lasso_1se_fit2, which = c(1,2,4,5))
#Lasso-min
plot(interact_slr_hull_lasso_min_fit2, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
In comparison to the interaction term models including the features with high discreteness, the residual plots are quite similar, but now the influential observations identified using the Cook’s distance plots are different:
data2[c(62,402,441,472,1626,1682,1734),]
The following table compares the different fitted models in terms of the adjusted R\(^2\):
data.frame(Linear = c(summary(sqrt_slr_hull_fit)$adj.r.squared,
summary(sqrt_slr_hull_fwd_fit)$adj.r.squared,
summary(sqrt_slr_hull_bwd_fit)$adj.r.squared,
summary(sqrt_slr_hull_lasso_1se_fit)$adj.r.squared,
summary(sqrt_slr_hull_lasso_min_fit)$adj.r.squared),
Linear_and_Quadratic = c(summary(quadratic_slr_hull_fit)$adj.r.squared,
summary(quadratic_slr_hull_fwd_fit)$adj.r.squared,
summary(quadratic_slr_hull_bwd_fit)$adj.r.squared,
summary(quadratic_slr_hull_lasso_1se_fit)$adj.r.squared,
summary(quadratic_slr_hull_lasso_min_fit)$adj.r.squared),
Linear_and_Interaction = c(summary(interact_slr_hull_fit)$adj.r.squared,
summary(interact_slr_hull_fwd_fit)$adj.r.squared,
summary(interact_slr_hull_bwd_fit)$adj.r.squared,
summary(interact_slr_hull_lasso_1se_fit)$adj.r.squared,
summary(interact_slr_hull_lasso_min_fit)$adj.r.squared),
Linear_and_Interaction_nonDiscrete = c(summary(interact_slr_hull_fit2)$adj.r.squared,
summary(interact_slr_hull_fwd_fit2)$adj.r.squared,
summary(interact_slr_hull_bwd_fit2)$adj.r.squared,
summary(interact_slr_hull_lasso_1se_fit2)$adj.r.squared,
summary(interact_slr_hull_lasso_min_fit2)$adj.r.squared), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
The following table compares the different fitted models in terms of the number of parameters (regression coefficients + residual variance) to be estimated:
data.frame(Linear = c(sum(!is.na(sqrt_slr_hull_fit$coefficients)) + 1,
sum(!is.na(sqrt_slr_hull_fwd_fit$coefficients)) + 1,
sum(!is.na(sqrt_slr_hull_bwd_fit$coefficients)) + 1,
sum(!is.na(sqrt_slr_hull_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(sqrt_slr_hull_lasso_min_fit$coefficients)) + 1),
Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_hull_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_hull_fwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_hull_bwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_hull_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_hull_lasso_min_fit$coefficients)) + 1),
Linear_and_Interaction = c(sum(!is.na(interact_slr_hull_fit$coefficients)) + 1,
sum(!is.na(interact_slr_hull_fwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_hull_bwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_hull_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(interact_slr_hull_lasso_min_fit$coefficients)) + 1),
Linear_and_Interaction_nonDiscrete = c(sum(!is.na(interact_slr_hull_fit2$coefficients)) + 1,
sum(!is.na(interact_slr_hull_fwd_fit2$coefficients)) + 1,
sum(!is.na(interact_slr_hull_bwd_fit2$coefficients)) + 1,
sum(!is.na(interact_slr_hull_lasso_1se_fit2$coefficients)) + 1,
sum(!is.na(interact_slr_hull_lasso_min_fit2$coefficients)) + 1), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
In this section, we consider fitting Tweedie GLM to the data, as the Tweedie GLM can be used for non-negative continuous response with zeros. Since we are fitting a Tweedie GLM which consider a transformation for the mean of the response via a link function, we consider the raw response \(E_{hull}\):
# Tweedie GLM with E_hull #
var_p <- tweedie.profile(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_all))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
slr_hull_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_all))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
var_p_fwd <- tweedie.profile(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_fwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
slr_hull_fwd_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_fwd))], family = tweedie(var.power = var_p_fwd$p.max, link.power = 0 ))
var_p_bwd <- tweedie.profile(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_bwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
slr_hull_bwd_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_bwd))], family = tweedie(var.power = var_p_bwd$p.max, link.power = 0 ))
var_p_lasso_1se <- tweedie.profile(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
slr_hull_lasso_1se_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))], family = tweedie(var.power = var_p_lasso_1se$p.max, link.power = 0 ))
var_p_lasso_min <- tweedie.profile(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
slr_hull_lasso_min_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))], family = tweedie(var.power = var_p_lasso_min$p.max, link.power = 0 ))
c(var_p$p.max, var_p_fwd$p.max, var_p_bwd$p.max,var_p_lasso_1se$p.max,var_p_lasso_min$p.max)
## [1] 1.155918 1.155918 1.155918 1.155918 1.155918
par(mfrow = c(2,2))
#All
plot(slr_hull_fit_tw, which = c(1,2,4,5))
#Stepwise Forward Selection
plot(slr_hull_fwd_fit_tw, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(slr_hull_bwd_fit_tw, which = c(1,2,4,5))
#Lasso-1se
plot(slr_hull_lasso_1se_fit_tw, which = c(1,2,4,5))
#Lasso-min
plot(slr_hull_lasso_min_fit_tw, which = c(1,2,4,5))
In comparison to the fitted linear model using raw \(E_{hull}\), the residual plots of the fitted Tweedie GLM don’t have the issue of heteroskedasticity. On the other hand, the QQplots of the fitted Tweedie GLM also look better compared to those of the fitted linear models using square-root transformed \(E_{hull}\). Finally, the influential observations identified through the Cook’s distance plots of the fitted Tweedie GLM are largely similar as before:
data2[c(402,441,1000,1682,1734),]
In this section, we fit Tweedie GLM to the data using raw \(E_{hull}\) as the response, while including quadratic terms of the different sets of selected features. For var_all
, var_bwd
, and var_lasso_min
, we made use of the variance power parameter selected from the above since the algorithm (i.e. tweedie.profile
) to estimate them does not converge.
#Did not converge
#quadratic_var_p <- tweedie.profile(quadratic_slr_hull_formula , data = data2[,c(10,which(colnames(data2) %in% var_all))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
quadratic_slr_hull_fit_tw = glm(quadratic_slr_hull_formula , data = data2[,c(10,which(colnames(data2) %in% var_all))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
quadratic_var_p_fwd <- tweedie.profile(quadratic_slr_hull_fwd_formula , data = data2[,c(10,which(colnames(data2) %in% var_fwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
quadratic_slr_hull_fwd_fit_tw = glm(quadratic_slr_hull_fwd_formula , data = data2[,c(10,which(colnames(data2) %in% var_fwd))], family = tweedie(var.power = quadratic_var_p_fwd$p.max, link.power = 0 ))
#Did not converge
#quadratic_var_p_bwd <- tweedie.profile(quadratic_slr_hull_bwd_formula , data = data2[,c(10,which(colnames(data2) %in% var_bwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
quadratic_slr_hull_bwd_fit_tw = glm(quadratic_slr_hull_bwd_formula , data = data2[,c(10,which(colnames(data2) %in% var_bwd))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
quadratic_var_p_lasso_1se <- tweedie.profile(quadratic_slr_hull_lasso_1se_formula , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
## 1.011 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8
## .........Done.
quadratic_slr_hull_lasso_1se_fit_tw = glm(quadratic_slr_hull_lasso_1se_formula , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))], family = tweedie(var.power = quadratic_var_p_lasso_1se$p.max, link.power = 0 ))
#Did not converge
#quadratic_var_p_lasso_min <- tweedie.profile(quadratic_slr_hull_lasso_min_formula , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
quadratic_slr_hull_lasso_min_fit_tw = glm(quadratic_slr_hull_lasso_min_formula , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
c(var_p$p.max, quadratic_var_p_fwd$p.max, var_p$p.max,quadratic_var_p_lasso_1se$p.max,var_p$p.max)
## [1] 1.155918 1.334694 1.155918 1.107612 1.155918
par(mfrow = c(2,2))
#All
plot(quadratic_slr_hull_fit_tw, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(quadratic_slr_hull_fwd_fit_tw, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Backward Selection
plot(quadratic_slr_hull_bwd_fit_tw, which = c(1,2,4,5))
#Lasso-1se
plot(quadratic_slr_hull_lasso_1se_fit_tw, which = c(1,2,4,5))
#Lasso-min
plot(quadratic_slr_hull_lasso_min_fit_tw, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
In comparison to the fitted linear model with square-root transformed \(E_{hull}\) as the response and quadratic terms of features, the residual plots and QQplots of the fitted Tweedie GLM with quadratic terms are quite similar as before. In addition, the influential observations identified by the Cook’s distance plots are similar as before as well:
data2[c(420,441,1000,1627,1682,1734),]
In this section, we fit Tweedie GLM to the data using raw \(E_{hull}\) as the response, while including pairwise interaction terms of the different sets of selected features. To save computational time, we made use of the variance power parameter selected from the above, i.e. var_p$p.max
.
non_unique_feature = names(which(apply(data2[,12:73], 2 , function(x){length(unique(x))}) <=40))
#interact_var_p <- tweedie.profile(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_all))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
interact_slr_hull_fit_tw = glm(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_all & !colnames(data2) %in% non_unique_feature ))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
## Warning: glm.fit: algorithm did not converge
#interact_var_p_fwd <- tweedie.profile(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_fwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
interact_slr_hull_fwd_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_fwd & !colnames(data2) %in% non_unique_feature))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
#interact_var_p_bwd <- tweedie.profile(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_bwd))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
interact_slr_hull_bwd_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_bwd & !colnames(data2) %in% non_unique_feature))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
#interact_var_p_lasso_1se <- tweedie.profile(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
interact_slr_hull_lasso_1se_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_1se & !colnames(data2) %in% non_unique_feature))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
#interact_var_p_lasso_min <- tweedie.profile(energy_above_hull..meV.atom. ~ .^2 , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min))], p.vec=c(1.011,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8), method='series', do.ci=F)
interact_slr_hull_lasso_min_fit_tw = glm(energy_above_hull..meV.atom. ~ . , data = data2[,c(10,which(colnames(data2) %in% var_lasso_min & !colnames(data2) %in% non_unique_feature))], family = tweedie(var.power = var_p$p.max, link.power = 0 ))
par(mfrow = c(2,2))
#All
plot(interact_slr_hull_fit_tw, which = c(1,2,4,5))
## Warning: not plotting observations with leverage one:
## 92, 472, 1682, 1734
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(interact_slr_hull_fwd_fit_tw, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(interact_slr_hull_bwd_fit_tw, which = c(1,2,4,5))
#Lasso-1se
plot(interact_slr_hull_lasso_1se_fit_tw, which = c(1,2,4,5))
#Lasso-min
plot(interact_slr_hull_lasso_min_fit_tw, which = c(1,2,4,5))
For the Tweedie GLM with the interaction terms for all features (var_all
), the fitting function did not converge and hence resulted in a weird residual plot. As for the other four fitted Tweedie GLM models, their residual plots no longer show the obvious linear lower bound and their QQplots seem to be better at the lower tails compared to the diagnostic plots of the fitted linear model with interaction terms (excluding the highly discrete features). The influential observations identified through the Cook’s distance plots of the fitted Tweedie GLM are:
data2[c(402,1000,1627,1682),]
The table belows summarize the pseudo R\(^2\) (\(1 - Residual Deviance / Null Deviance\)) of the fitted Tweedie GLM models, which shows that incorporating quadratic/interaction terms does not really improve the model fit.
data.frame(Linear = c(1 - slr_hull_fit_tw$deviance/ slr_hull_fit_tw$null.deviance,
1 - slr_hull_fwd_fit_tw$deviance/ slr_hull_fwd_fit_tw$null.deviance,
1 - slr_hull_bwd_fit_tw$deviance/ slr_hull_bwd_fit_tw$null.deviance,
1 - slr_hull_lasso_1se_fit_tw$deviance/ slr_hull_lasso_1se_fit_tw$null.deviance,
1 - slr_hull_lasso_min_fit_tw$deviance/ slr_hull_lasso_min_fit_tw$null.deviance),
Linear_and_Quadratic = c(1 - quadratic_slr_hull_fit_tw$deviance/ quadratic_slr_hull_fit_tw$null.deviance,
1 - quadratic_slr_hull_fwd_fit_tw$deviance/ quadratic_slr_hull_fwd_fit_tw$null.deviance,
1 - quadratic_slr_hull_bwd_fit_tw$deviance/ quadratic_slr_hull_bwd_fit_tw$null.deviance,
1 - quadratic_slr_hull_lasso_1se_fit_tw$deviance/ quadratic_slr_hull_lasso_1se_fit_tw$null.deviance,
1 - quadratic_slr_hull_lasso_min_fit_tw$deviance/ quadratic_slr_hull_lasso_min_fit_tw$null.deviance),
Linear_and_Interaction_nonDiscrete = c(NA,
1 - interact_slr_hull_fwd_fit_tw$deviance/ interact_slr_hull_fwd_fit_tw$null.deviance,
1 - interact_slr_hull_bwd_fit_tw$deviance/ interact_slr_hull_bwd_fit_tw$null.deviance,
1 - interact_slr_hull_lasso_1se_fit_tw$deviance/ interact_slr_hull_lasso_1se_fit_tw$null.deviance,
1 - interact_slr_hull_lasso_min_fit_tw$deviance/ interact_slr_hull_lasso_min_fit_tw$null.deviance), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
The table belows summarize the AIC of the fitted Tweedie GLM models, which shows that it is worthwhile to incorporate quadratic terms resulting in lower AICs, while the interaction terms increase the complexity of the model but do not give much improvement in the fitting of the models.
data.frame(Linear = c(AICtweedie(slr_hull_fit_tw),
AICtweedie(slr_hull_fwd_fit_tw),
AICtweedie(slr_hull_bwd_fit_tw),
AICtweedie(slr_hull_lasso_1se_fit_tw),
AICtweedie(slr_hull_lasso_min_fit_tw)),
Linear_and_Quadratic = c(AICtweedie(quadratic_slr_hull_fit_tw),
AICtweedie(quadratic_slr_hull_fwd_fit_tw),
AICtweedie(quadratic_slr_hull_bwd_fit_tw),
AICtweedie(quadratic_slr_hull_lasso_1se_fit_tw),
AICtweedie(quadratic_slr_hull_lasso_min_fit_tw)),
Linear_and_Interaction_nonDiscrete = c(NA,
AICtweedie(interact_slr_hull_fwd_fit_tw),
AICtweedie(interact_slr_hull_bwd_fit_tw),
AICtweedie(interact_slr_hull_lasso_1se_fit_tw),
AICtweedie(interact_slr_hull_lasso_min_fit_tw)), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
The following table compares the different fitted Tweedie GLM in terms of the number of regression coefficients to be estimated:
data.frame(Linear = c(sum(!is.na(slr_hull_fit_tw$coefficients)) ,
sum(!is.na(slr_hull_fwd_fit_tw$coefficients)) ,
sum(!is.na(slr_hull_bwd_fit_tw$coefficients)) ,
sum(!is.na(slr_hull_lasso_1se_fit_tw$coefficients)) ,
sum(!is.na(slr_hull_lasso_min_fit_tw$coefficients)) ),
Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_hull_fit_tw$coefficients)) ,
sum(!is.na(quadratic_slr_hull_fwd_fit_tw$coefficients)) ,
sum(!is.na(quadratic_slr_hull_bwd_fit_tw$coefficients)) ,
sum(!is.na(quadratic_slr_hull_lasso_1se_fit_tw$coefficients)) ,
sum(!is.na(quadratic_slr_hull_lasso_min_fit_tw$coefficients)) ),
Linear_and_Interaction_nonDiscrete = c(NA ,
sum(!is.na(interact_slr_hull_fwd_fit_tw$coefficients)) ,
sum(!is.na(interact_slr_hull_bwd_fit_tw$coefficients)) ,
sum(!is.na(interact_slr_hull_lasso_1se_fit_tw$coefficients)) ,
sum(!is.na(interact_slr_hull_lasso_min_fit_tw$coefficients)) ), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
In summary, we made use of forward/backward/lasso to reduce the multicollinearity issue, square-root transformation for the response to solve the heteoskedasticity problem in the residual plot and slightly help with the normality issue of the residuals near their lower/upper tails. Quadratic/interaction terms of the features were included to improve the adjusted R\(^2\), although some features had to be removed when fitting the model with interaction terms due to their high discreteness. Finally, Tweedie GLMs were also fitted since the response is non-negative and continuous with some zeros, where the fitted Tweedie GLMs identified influential observations that are largely similar to those found in the linear models.