library(car) #For vif()
library(leaps) #For regsubsets()
library(glmnet) #for glmnet()
rm(list=ls())
#Dataset 1: Dilute_Solute Diffusion
data1 = read.csv('Dilute_Solute_Diffusion_with_features.csv')
#Number of Unique Host = 15
length(unique(data1$Material.compositions.1))
## [1] 15
#Number of Unique Solute = 58
length(unique(data1$Material.compositions.2))
## [1] 58
#All 7 Columns with labels beginning with Site1 are uniquely identified by the Host
host_data = cbind(data1$Material.compositions.1,data1[,substr(colnames(data1),1,5) == 'Site1'])
dim(unique(host_data))
## [1] 15 8
#All 7 Columns with labels beginning with Site2 are uniquely identified by the solute
solute_data = cbind(data1$Material.compositions.2,data1[,substr(colnames(data1),1,5) == 'Site2'])
dim(unique(solute_data))
## [1] 58 7
#The column 'BCCenergy_pa_arithmetic_average' is exactly the same as another column 'data1$BCCenergy_pa_composition_average'
max(abs(data1$BCCenergy_pa_arithmetic_average - data1$BCCenergy_pa_composition_average))
## [1] 0
data1 = data1[,-29]
# Scale all features to have sample mean 0 and sample variance 1
data1[,5:28] = scale(data1[,5:28])
#Reorder feature columns alphabetically
reordered = data1[,(5:28)[order(colnames(data1)[5:28])]]
data1 = cbind(data1[,1:4],reordered)
slr_raw_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,5:28)])
#There are serious multicollinearity issues:
vif(slr_raw_fit)
## BCCenergy_pa_composition_average BoilingT_max_value
## 31.658570 12.153347
## CovalentRadius_max_value GSestFCClatcnt_max_value
## 8.274496 5.564614
## IonicRadii_max_value MeltingT_min_value
## 3.414137 11.358956
## MiracleRadius_min_value n_ws.third_min_value
## 3.343098 9.016486
## NUnfilled_max_value Site1_BCCfermi
## 8.187786 4.073512
## Site1_CovalentRadii Site1_Electronegativity
## 48.185461 7.050570
## Site1_HHIr Site1_IonicRadii
## 3.938766 6.127765
## Site1_MendeleevNumber Site1_MiracleRadius
## 4.716981 44.400696
## Site2_BCCenergy_pa Site2_Density
## 31.670981 3.532424
## Site2_Group Site2_MeltingT
## 5.855864 27.343113
## Site2_NdUnfilled Site2_NUnfilled
## 8.001194 10.414374
## SpecificHeatCapacity_difference valence_arithmetic_average
## 2.322162 5.958953
cor_mat = cor(data1[,5:28])
diag(cor_mat) = 0
prob_feature_pair = data.frame(feature1 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,1]],
feature2 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,2]],
corr = cor_mat[which(abs(cor_mat) > 0.7, arr.ind = T)])
prob_feature_pair[!duplicated(prob_feature_pair$corr),]
Based on the variance inflation factors (VIF > 10), there are serious multicollinearity issues for the two groups of (Site1_CovalentRadii, Site1_MiracleRadius) and (BCCenergy_pa_composition_average, BoilingT_max_value, MeltingT_min_value, Site2_BCCenergy_pa, Site2_MeltingT, Site2_NUnfilled).
To solve the multicollinearity issues, we consider stepwise/LASSO methods to drop some of the variables:
# Perform Stepwise variable selection to try and reduce the multicollinearity issue
#Forward Stepwise
slr_raw_fwd=regsubsets (E_raw..eV. ~ . , data = data1[,c(4,5:28)], nvmax =24,
method ="forward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_raw_fwd)$rss, type = 'l'); points(which.min(summary(slr_raw_fwd)$rss),min(summary(slr_raw_fwd)$rss))
plot(summary(slr_raw_fwd)$adjr2, type = 'l') ; points(which.max(summary(slr_raw_fwd)$adjr2),max(summary(slr_raw_fwd)$adjr2))
plot(summary(slr_raw_fwd)$cp, type = 'l') ; points(which.min(summary(slr_raw_fwd)$cp),min(summary(slr_raw_fwd)$cp))
plot(summary(slr_raw_fwd)$bic, type = 'l') ; points(which.min(summary(slr_raw_fwd)$bic),min(summary(slr_raw_fwd)$bic))
#Backward Stepwise
slr_raw_bwd=regsubsets(E_raw..eV. ~ . , data = data1[,c(4,5:28)], nvmax =24,
method ="backward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_raw_bwd)$rss, type = 'l'); points(which.min(summary(slr_raw_bwd)$rss),min(summary(slr_raw_bwd)$rss))
plot(summary(slr_raw_bwd)$adjr2, type = 'l') ; points(which.max(summary(slr_raw_bwd)$adjr2),max(summary(slr_raw_bwd)$adjr2))
plot(summary(slr_raw_bwd)$cp, type = 'l') ; points(which.min(summary(slr_raw_bwd)$cp),min(summary(slr_raw_bwd)$cp))
plot(summary(slr_raw_bwd)$bic, type = 'l') ; points(which.min(summary(slr_raw_bwd)$bic),min(summary(slr_raw_bwd)$bic))
#Lasso
set.seed(1)
cv_slr_raw_lasso = cv.glmnet(x = model.matrix(E_raw..eV. ~ . , data = data1[,c(4,5:28)])[,-1],
y = data1$E_raw..eV.[],
alpha = 1, family = 'gaussian', standardize = FALSE)
cv_slr_raw_lasso$lambda.min
## [1] 0.0003021515
cv_slr_raw_lasso$lambda.1se
## [1] 0.01370221
out_slr_raw_lasso=glmnet(x = model.matrix(E_raw..eV. ~ . , data = data1[,c(4,5:28)])[,-1],
y = data1$E_raw..eV.,
alpha = 1, family = 'gaussian', standardize = FALSE)
data.frame(lambda.min = as.vector(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min)), lambda.1se = as.vector(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se)), row.names = rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min)))
plot(cv_slr_raw_lasso)
By using BIC as the selection criterion, forward stepwise selection method chose the model with 15 variables (hence dropped 9 variables), while the backward stepwise selection method resulted in a model with 13 variables (and 11 variables dropped). Whereas Lasso combined with 10-fold cross-validation to select the optimal tuning parameter \(\lambda\) resulted in a model with all variables included when using \(\lambda_{\min} =\) 3.021515410^{-4} and another model with 19 variables (hence dropped 5 variables) when using \(\lambda_{1,SE} =\) 0.0137022.
The variables that have been dropped based on each of the method are given as:
#Stepwise Forward Selection
colnames(data1)[5:28][!colnames(data1)[5:28] %in% names(coef(slr_raw_fwd, which.min(summary(slr_raw_fwd)$bic)))]
## [1] "CovalentRadius_max_value" "MiracleRadius_min_value"
## [3] "Site1_BCCfermi" "Site1_CovalentRadii"
## [5] "Site1_HHIr" "Site2_Density"
## [7] "Site2_Group" "Site2_NdUnfilled"
## [9] "Site2_NUnfilled"
#Stepwise Backward Selection
colnames(data1)[5:28][!colnames(data1)[5:28] %in% names(coef(slr_raw_bwd, which.min(summary(slr_raw_bwd)$bic)))]
## [1] "CovalentRadius_max_value" "MiracleRadius_min_value"
## [3] "Site1_BCCfermi" "Site1_CovalentRadii"
## [5] "Site1_HHIr" "Site1_IonicRadii"
## [7] "Site2_Density" "Site2_Group"
## [9] "Site2_NdUnfilled" "Site2_NUnfilled"
## [11] "valence_arithmetic_average"
#Lasso with lambda.min
colnames(data1)[5:28][!colnames(data1)[5:28] %in% rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min) )]
## character(0)
#Lasso with lambda.1se
rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se))[which(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se) == 0)]
## [1] "CovalentRadius_max_value" "Site1_BCCfermi"
## [3] "Site1_MiracleRadius" "Site2_Group"
## [5] "Site2_NUnfilled"
Therefore, the aggressiveness in terms of dropping variables from the strongest to the weakest are: Backward Stepwise > Forward Stepwise > Lasso-1se > Lasso-min.
The following presents the comparison of the residual sum of squares (RSS), adjusted R\(^2\) (AdjR2), Mallow CP and BIC for the models selected by backward stepwise and forward stepwise method.
compare_fwd_bwd = data.frame(rbind(rbind(summary(slr_raw_fwd)$rss, summary(slr_raw_fwd)$adjr2, summary(slr_raw_fwd)$cp, summary(slr_raw_fwd)$bic)[,which.min(summary(slr_raw_fwd)$bic)],
rbind(summary(slr_raw_bwd)$rss, summary(slr_raw_bwd)$adjr2, summary(slr_raw_bwd)$cp, summary(slr_raw_bwd)$bic)[,which.min(summary(slr_raw_bwd)$bic)]
))
rownames(compare_fwd_bwd) = c('Forward','Backward')
colnames(compare_fwd_bwd) = c('RSS','AdjR2','Mallow-CP','BIC')
compare_fwd_bwd
Based on the four sets of selected variables - Forward, Backward, Lasso-min, Lasso-1se, recalling that Lasso-min is equivalent to selecting all the variables, we then examine their corresponding VIFs:
#VIF seems to be better compared to using all features, although still some problems with (Site2_MeltingT,Site2_BCCenergy_pa,BCCenergy_pa_composition_average)
var_all = colnames(data1)[5:28]
var_fwd = names(coef(slr_raw_fwd, which.min(summary(slr_raw_fwd)$bic)))[-1]
var_bwd = names(coef(slr_raw_bwd, which.min(summary(slr_raw_bwd)$bic)))[-1]
#var_bwd = var_bwd[-which(var_bwd == "Site2_BCCenergy_pa")]
var_lasso_1se = rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se))[which(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se) != 0)][-1]
slr_raw_fwd_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
slr_raw_bwd_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
slr_raw_lasso_1se_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])
#All
vif(slr_raw_fit)
## BCCenergy_pa_composition_average BoilingT_max_value
## 31.658570 12.153347
## CovalentRadius_max_value GSestFCClatcnt_max_value
## 8.274496 5.564614
## IonicRadii_max_value MeltingT_min_value
## 3.414137 11.358956
## MiracleRadius_min_value n_ws.third_min_value
## 3.343098 9.016486
## NUnfilled_max_value Site1_BCCfermi
## 8.187786 4.073512
## Site1_CovalentRadii Site1_Electronegativity
## 48.185461 7.050570
## Site1_HHIr Site1_IonicRadii
## 3.938766 6.127765
## Site1_MendeleevNumber Site1_MiracleRadius
## 4.716981 44.400696
## Site2_BCCenergy_pa Site2_Density
## 31.670981 3.532424
## Site2_Group Site2_MeltingT
## 5.855864 27.343113
## Site2_NdUnfilled Site2_NUnfilled
## 8.001194 10.414374
## SpecificHeatCapacity_difference valence_arithmetic_average
## 2.322162 5.958953
#Stepwise Forward Selection
vif(slr_raw_fwd_fit)
## BCCenergy_pa_composition_average BoilingT_max_value
## 21.560966 8.286418
## GSestFCClatcnt_max_value IonicRadii_max_value
## 3.600398 2.333823
## MeltingT_min_value n_ws.third_min_value
## 8.231074 6.810415
## NUnfilled_max_value Site1_Electronegativity
## 1.920920 3.475194
## Site1_IonicRadii Site1_MendeleevNumber
## 2.519738 2.750901
## Site1_MiracleRadius Site2_BCCenergy_pa
## 2.619663 15.530467
## Site2_MeltingT SpecificHeatCapacity_difference
## 17.019635 2.101013
## valence_arithmetic_average
## 5.079535
#Stepwise Backward Selection
vif(slr_raw_bwd_fit)
## BCCenergy_pa_composition_average BoilingT_max_value
## 12.241168 6.888968
## GSestFCClatcnt_max_value IonicRadii_max_value
## 3.270643 1.976747
## MeltingT_min_value n_ws.third_min_value
## 6.400564 5.882975
## NUnfilled_max_value Site1_Electronegativity
## 1.889698 3.167231
## Site1_MendeleevNumber Site1_MiracleRadius
## 2.635655 1.941638
## Site2_BCCenergy_pa Site2_MeltingT
## 14.383640 16.680019
## SpecificHeatCapacity_difference
## 1.990185
#Lasso 1se
vif(slr_raw_lasso_1se_fit)
## BCCenergy_pa_composition_average BoilingT_max_value
## 23.759652 10.131998
## GSestFCClatcnt_max_value IonicRadii_max_value
## 3.588846 2.913302
## MeltingT_min_value MiracleRadius_min_value
## 9.918700 2.545862
## n_ws.third_min_value NUnfilled_max_value
## 8.106654 3.410028
## Site1_CovalentRadii Site1_Electronegativity
## 6.104965 5.062818
## Site1_HHIr Site1_IonicRadii
## 3.798445 4.874145
## Site1_MendeleevNumber Site2_BCCenergy_pa
## 3.271690 21.756775
## Site2_Density Site2_MeltingT
## 2.828103 20.720455
## Site2_NdUnfilled SpecificHeatCapacity_difference
## 5.837358 2.222165
## valence_arithmetic_average
## 5.623930
The VIF from the model selected by stepwise backward selection suffered from less problems, although the VIFs of (BCCenergy_pa_composition_average,Site2_BCCenergy_pa,Site2_MeltingT) are still greater than 10. This could be solved by removing Site2_BCCenergy_pa if required, and the following results would remain largely similar.
Next, we look at the residual diagnostic plots for each of the four models above:
#Diagnostic Plots
par(mfrow = c(2,2))
#All
plot(slr_raw_fit, which = c(1,2,4,5))
#Stepwise Forward Selection
plot(slr_raw_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(slr_raw_bwd_fit, which = c(1,2,4,5))
#Lasso 1se
plot(slr_raw_lasso_1se_fit, which = c(1,2,4,5))
All four models suffered from a U-shaped trend in the residual plot, together with deviation from normality for the residuals near the upper tail. This motivates us to consider adding quadratic or interaction terms into the model, which will be discussed later. In addition, all four Cook’s distance plots suggest that there are three highly influential instances which are related to the Mg host as seen below:
data1[c(179,190,192),]
In this section, we make use of the above four sets chosen features and fit regression model where we add quadratic terms of each selected feature into the model.
quadratic_slr_raw_formula =as.formula(
paste(
'E_raw..eV.', '~', paste(
var_all, collapse = '+'
), '+', paste(
'I(',var_all,'^2)', collapse = '+'
)
)
)
quadratic_slr_raw_fwd_formula =as.formula(
paste(
'E_raw..eV.', '~', paste(
var_fwd, collapse = '+'
), '+', paste(
'I(',var_fwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_raw_bwd_formula =as.formula(
paste(
'E_raw..eV.', '~', paste(
var_bwd, collapse = '+'
), '+', paste(
'I(',var_bwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_raw_lasso_1se_formula =as.formula(
paste(
'E_raw..eV.', '~', paste(
var_lasso_1se, collapse = '+'
), '+', paste(
'I(',var_lasso_1se,'^2)', collapse = '+'
)
)
)
#All
quadratic_slr_raw_fit = lm(quadratic_slr_raw_formula, data = data1[,c(4,which(colnames(data1) %in% var_all))])
quadratic_slr_raw_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value +
## CovalentRadius_max_value + GSestFCClatcnt_max_value + IonicRadii_max_value +
## MeltingT_min_value + MiracleRadius_min_value + n_ws.third_min_value +
## NUnfilled_max_value + Site1_BCCfermi + Site1_CovalentRadii +
## Site1_Electronegativity + Site1_HHIr + Site1_IonicRadii +
## Site1_MendeleevNumber + Site1_MiracleRadius + Site2_BCCenergy_pa +
## Site2_Density + Site2_Group + Site2_MeltingT + Site2_NdUnfilled +
## Site2_NUnfilled + SpecificHeatCapacity_difference + valence_arithmetic_average +
## I(BCCenergy_pa_composition_average^2) + I(BoilingT_max_value^2) +
## I(CovalentRadius_max_value^2) + I(GSestFCClatcnt_max_value^2) +
## I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(MiracleRadius_min_value^2) +
## I(n_ws.third_min_value^2) + I(NUnfilled_max_value^2) + I(Site1_BCCfermi^2) +
## I(Site1_CovalentRadii^2) + I(Site1_Electronegativity^2) +
## I(Site1_HHIr^2) + I(Site1_IonicRadii^2) + I(Site1_MendeleevNumber^2) +
## I(Site1_MiracleRadius^2) + I(Site2_BCCenergy_pa^2) + I(Site2_Density^2) +
## I(Site2_Group^2) + I(Site2_MeltingT^2) + I(Site2_NdUnfilled^2) +
## I(Site2_NUnfilled^2) + I(SpecificHeatCapacity_difference^2) +
## I(valence_arithmetic_average^2)
#Stepwise Forward Selection
quadratic_slr_raw_fwd_fit = lm(quadratic_slr_raw_fwd_formula, data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
quadratic_slr_raw_fwd_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value +
## GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value +
## n_ws.third_min_value + NUnfilled_max_value + Site1_Electronegativity +
## Site1_IonicRadii + Site1_MendeleevNumber + Site1_MiracleRadius +
## Site2_BCCenergy_pa + Site2_MeltingT + SpecificHeatCapacity_difference +
## valence_arithmetic_average + I(BCCenergy_pa_composition_average^2) +
## I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) +
## I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(n_ws.third_min_value^2) +
## I(NUnfilled_max_value^2) + I(Site1_Electronegativity^2) +
## I(Site1_IonicRadii^2) + I(Site1_MendeleevNumber^2) + I(Site1_MiracleRadius^2) +
## I(Site2_BCCenergy_pa^2) + I(Site2_MeltingT^2) + I(SpecificHeatCapacity_difference^2) +
## I(valence_arithmetic_average^2)
#Stepwise Backward Selection
quadratic_slr_raw_bwd_fit = lm(quadratic_slr_raw_bwd_formula, data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
quadratic_slr_raw_bwd_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value +
## GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value +
## n_ws.third_min_value + NUnfilled_max_value + Site1_Electronegativity +
## Site1_MendeleevNumber + Site1_MiracleRadius + Site2_BCCenergy_pa +
## Site2_MeltingT + SpecificHeatCapacity_difference + I(BCCenergy_pa_composition_average^2) +
## I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) +
## I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(n_ws.third_min_value^2) +
## I(NUnfilled_max_value^2) + I(Site1_Electronegativity^2) +
## I(Site1_MendeleevNumber^2) + I(Site1_MiracleRadius^2) + I(Site2_BCCenergy_pa^2) +
## I(Site2_MeltingT^2) + I(SpecificHeatCapacity_difference^2)
#Lasso-1se
quadratic_slr_raw_lasso_1se_fit = lm(quadratic_slr_raw_lasso_1se_formula, data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])
quadratic_slr_raw_lasso_1se_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value +
## GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value +
## MiracleRadius_min_value + n_ws.third_min_value + NUnfilled_max_value +
## Site1_CovalentRadii + Site1_Electronegativity + Site1_HHIr +
## Site1_IonicRadii + Site1_MendeleevNumber + Site2_BCCenergy_pa +
## Site2_Density + Site2_MeltingT + Site2_NdUnfilled + SpecificHeatCapacity_difference +
## valence_arithmetic_average + I(BCCenergy_pa_composition_average^2) +
## I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) +
## I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(MiracleRadius_min_value^2) +
## I(n_ws.third_min_value^2) + I(NUnfilled_max_value^2) + I(Site1_CovalentRadii^2) +
## I(Site1_Electronegativity^2) + I(Site1_HHIr^2) + I(Site1_IonicRadii^2) +
## I(Site1_MendeleevNumber^2) + I(Site2_BCCenergy_pa^2) + I(Site2_Density^2) +
## I(Site2_MeltingT^2) + I(Site2_NdUnfilled^2) + I(SpecificHeatCapacity_difference^2) +
## I(valence_arithmetic_average^2)
The residual diagnostic plots of each fitted model with quadratic terms are given below:
par(mfrow = c(2,2))
#All
plot(quadratic_slr_raw_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(quadratic_slr_raw_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(quadratic_slr_raw_bwd_fit, which = c(1,2,4,5))
#Lasso-1se
plot(quadratic_slr_raw_lasso_1se_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
After including quadratic terms of the features, it can be seen that there is no longer obvious quadratic trends in the residual plot and the residuals are now approximatelty normally distributed. All Cook’s distance plots suggest that observation no. 192 is an influential outlier, while only the Cook’s distance plot of the model using all features suggests that observation no. 74 and 357 is a potential outlier:
data1[c(74,192,357),]
In this section, we make use of the above four sets of chosen features and fit regression model where we add all possible pairwise interaction terms of the selected features into the model. The main motivation for using interaction terms is due to some features being related to the host element while other features are related to the solute.
interact_slr_raw_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_all))])
interact_slr_raw_fwd_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
interact_slr_raw_bwd_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
interact_slr_raw_lasso_1se_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])
The residual diagnostic plots of each fitted model with interaction terms are given below:
par(mfrow = c(2,2))
#All
plot(interact_slr_raw_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(interact_slr_raw_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(interact_slr_raw_bwd_fit, which = c(1,2,4,5))
#Lasso-1se
plot(interact_slr_raw_lasso_1se_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
By adding interaction terms, the residual plots no longer show any obvious trend and the QQ plots confirm the normality of the residuals. On the other hand, the Cook’s distance plots for the four fitted models indicate some possible outliers:
data1[c(19,74,75,76,77,192, 293),]
The following table compares the different fitted models in terms of the adjusted R\(^2\):
data.frame(Linear = c(summary(slr_raw_fit)$adj.r.squared,
summary(slr_raw_fwd_fit)$adj.r.squared,
summary(slr_raw_bwd_fit)$adj.r.squared,
summary(slr_raw_lasso_1se_fit)$adj.r.squared),
Linear_and_Quadratic = c(summary(quadratic_slr_raw_fit)$adj.r.squared,
summary(quadratic_slr_raw_fwd_fit)$adj.r.squared,
summary(quadratic_slr_raw_bwd_fit)$adj.r.squared,
summary(quadratic_slr_raw_lasso_1se_fit)$adj.r.squared),
Linear_and_Interaction = c(summary(interact_slr_raw_fit)$adj.r.squared,
summary(interact_slr_raw_fwd_fit)$adj.r.squared,
summary(interact_slr_raw_bwd_fit)$adj.r.squared,
summary(interact_slr_raw_lasso_1se_fit)$adj.r.squared), row.names = c('All','Forward','Backward','Lasso-1se')
)
The following table compares the different fitted models in terms of the number of parameters (regression coefficients + residual variance) to be estimated:
data.frame(Linear = c(sum(!is.na(slr_raw_fit$coefficients)) + 1,
sum(!is.na(slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(slr_raw_lasso_1se_fit$coefficients)) + 1),
Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_raw_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_lasso_1se_fit$coefficients)) + 1),
Linear_and_Interaction = c(sum(!is.na(interact_slr_raw_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_lasso_1se_fit$coefficients)) + 1), row.names = c('All','Forward','Backward','Lasso-1se')
)
In summary, making use of stepwise forward/backward selection method or the lasso helps to reduce the multicollinearity issue, while adding quadratic/interaction terms solves the problem of having obvious U-shaped trend in the residual plots and deviation from normality for the residuals near the upper tail.