library(car) #For vif()
library(leaps) #For regsubsets()
library(glmnet) #for glmnet()
rm(list=ls())

#Dataset 1: Dilute_Solute Diffusion
data1 = read.csv('Dilute_Solute_Diffusion_with_features.csv')

#Number of Unique Host = 15
length(unique(data1$Material.compositions.1))
## [1] 15
#Number of Unique Solute = 58
length(unique(data1$Material.compositions.2))
## [1] 58
#All 7 Columns with labels beginning with Site1 are uniquely identified by the Host
host_data = cbind(data1$Material.compositions.1,data1[,substr(colnames(data1),1,5) == 'Site1'])
dim(unique(host_data))
## [1] 15  8
#All 7 Columns with labels beginning with Site2 are uniquely identified by the solute
solute_data = cbind(data1$Material.compositions.2,data1[,substr(colnames(data1),1,5) == 'Site2'])
dim(unique(solute_data))
## [1] 58  7
#The column 'BCCenergy_pa_arithmetic_average' is exactly the same as another column 'data1$BCCenergy_pa_composition_average'
max(abs(data1$BCCenergy_pa_arithmetic_average - data1$BCCenergy_pa_composition_average))
## [1] 0
data1 = data1[,-29]

# Scale all features to have sample mean 0 and sample variance 1
data1[,5:28] = scale(data1[,5:28])

#Reorder feature columns alphabetically
reordered = data1[,(5:28)[order(colnames(data1)[5:28])]]
data1 = cbind(data1[,1:4],reordered)

E_raw

slr_raw_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,5:28)])

#There are serious multicollinearity issues:
vif(slr_raw_fit)
## BCCenergy_pa_composition_average               BoilingT_max_value 
##                        31.658570                        12.153347 
##         CovalentRadius_max_value         GSestFCClatcnt_max_value 
##                         8.274496                         5.564614 
##             IonicRadii_max_value               MeltingT_min_value 
##                         3.414137                        11.358956 
##          MiracleRadius_min_value             n_ws.third_min_value 
##                         3.343098                         9.016486 
##              NUnfilled_max_value                   Site1_BCCfermi 
##                         8.187786                         4.073512 
##              Site1_CovalentRadii          Site1_Electronegativity 
##                        48.185461                         7.050570 
##                       Site1_HHIr                 Site1_IonicRadii 
##                         3.938766                         6.127765 
##            Site1_MendeleevNumber              Site1_MiracleRadius 
##                         4.716981                        44.400696 
##               Site2_BCCenergy_pa                    Site2_Density 
##                        31.670981                         3.532424 
##                      Site2_Group                   Site2_MeltingT 
##                         5.855864                        27.343113 
##                 Site2_NdUnfilled                  Site2_NUnfilled 
##                         8.001194                        10.414374 
##  SpecificHeatCapacity_difference       valence_arithmetic_average 
##                         2.322162                         5.958953
cor_mat = cor(data1[,5:28])
diag(cor_mat) = 0
prob_feature_pair = data.frame(feature1 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,1]],
                               feature2 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,2]],
                               corr = cor_mat[which(abs(cor_mat) > 0.7, arr.ind = T)])
prob_feature_pair[!duplicated(prob_feature_pair$corr),]

Based on the variance inflation factors (VIF > 10), there are serious multicollinearity issues for the two groups of (Site1_CovalentRadii, Site1_MiracleRadius) and (BCCenergy_pa_composition_average, BoilingT_max_value, MeltingT_min_value, Site2_BCCenergy_pa, Site2_MeltingT, Site2_NUnfilled).

To solve the multicollinearity issues, we consider stepwise/LASSO methods to drop some of the variables:

# Perform Stepwise variable selection to try and reduce the multicollinearity issue
#Forward Stepwise
slr_raw_fwd=regsubsets (E_raw..eV. ~ . , data = data1[,c(4,5:28)], nvmax =24,
                        method ="forward")


#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_raw_fwd)$rss, type = 'l'); points(which.min(summary(slr_raw_fwd)$rss),min(summary(slr_raw_fwd)$rss))
plot(summary(slr_raw_fwd)$adjr2, type = 'l') ; points(which.max(summary(slr_raw_fwd)$adjr2),max(summary(slr_raw_fwd)$adjr2))
plot(summary(slr_raw_fwd)$cp, type = 'l') ; points(which.min(summary(slr_raw_fwd)$cp),min(summary(slr_raw_fwd)$cp))
plot(summary(slr_raw_fwd)$bic, type = 'l') ; points(which.min(summary(slr_raw_fwd)$bic),min(summary(slr_raw_fwd)$bic))

#Backward Stepwise
slr_raw_bwd=regsubsets(E_raw..eV. ~ . , data = data1[,c(4,5:28)], nvmax =24,
                        method ="backward")


#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_raw_bwd)$rss, type = 'l'); points(which.min(summary(slr_raw_bwd)$rss),min(summary(slr_raw_bwd)$rss))
plot(summary(slr_raw_bwd)$adjr2, type = 'l') ; points(which.max(summary(slr_raw_bwd)$adjr2),max(summary(slr_raw_bwd)$adjr2))
plot(summary(slr_raw_bwd)$cp, type = 'l') ; points(which.min(summary(slr_raw_bwd)$cp),min(summary(slr_raw_bwd)$cp))
plot(summary(slr_raw_bwd)$bic, type = 'l') ; points(which.min(summary(slr_raw_bwd)$bic),min(summary(slr_raw_bwd)$bic))

#Lasso
set.seed(1)
cv_slr_raw_lasso = cv.glmnet(x = model.matrix(E_raw..eV. ~ . , data = data1[,c(4,5:28)])[,-1],
                             y = data1$E_raw..eV.[],
                             alpha = 1, family = 'gaussian', standardize = FALSE)
cv_slr_raw_lasso$lambda.min
## [1] 0.0003021515
cv_slr_raw_lasso$lambda.1se
## [1] 0.01370221
out_slr_raw_lasso=glmnet(x = model.matrix(E_raw..eV. ~ . , data = data1[,c(4,5:28)])[,-1],
                         y = data1$E_raw..eV.,
                         alpha = 1, family = 'gaussian', standardize = FALSE)
data.frame(lambda.min = as.vector(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min)), lambda.1se = as.vector(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se)), row.names = rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min)))
plot(cv_slr_raw_lasso)

By using BIC as the selection criterion, forward stepwise selection method chose the model with 15 variables (hence dropped 9 variables), while the backward stepwise selection method resulted in a model with 13 variables (and 11 variables dropped). Whereas Lasso combined with 10-fold cross-validation to select the optimal tuning parameter \(\lambda\) resulted in a model with all variables included when using \(\lambda_{\min} =\) 3.021515410^{-4} and another model with 19 variables (hence dropped 5 variables) when using \(\lambda_{1,SE} =\) 0.0137022.

The variables that have been dropped based on each of the method are given as:

#Stepwise Forward Selection
colnames(data1)[5:28][!colnames(data1)[5:28] %in% names(coef(slr_raw_fwd, which.min(summary(slr_raw_fwd)$bic)))]
## [1] "CovalentRadius_max_value" "MiracleRadius_min_value" 
## [3] "Site1_BCCfermi"           "Site1_CovalentRadii"     
## [5] "Site1_HHIr"               "Site2_Density"           
## [7] "Site2_Group"              "Site2_NdUnfilled"        
## [9] "Site2_NUnfilled"
#Stepwise Backward Selection
colnames(data1)[5:28][!colnames(data1)[5:28] %in% names(coef(slr_raw_bwd, which.min(summary(slr_raw_bwd)$bic)))]
##  [1] "CovalentRadius_max_value"   "MiracleRadius_min_value"   
##  [3] "Site1_BCCfermi"             "Site1_CovalentRadii"       
##  [5] "Site1_HHIr"                 "Site1_IonicRadii"          
##  [7] "Site2_Density"              "Site2_Group"               
##  [9] "Site2_NdUnfilled"           "Site2_NUnfilled"           
## [11] "valence_arithmetic_average"
#Lasso with lambda.min
colnames(data1)[5:28][!colnames(data1)[5:28] %in% rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.min) )]
## character(0)
#Lasso with lambda.1se
rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se))[which(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se) == 0)]
## [1] "CovalentRadius_max_value" "Site1_BCCfermi"          
## [3] "Site1_MiracleRadius"      "Site2_Group"             
## [5] "Site2_NUnfilled"

Therefore, the aggressiveness in terms of dropping variables from the strongest to the weakest are: Backward Stepwise > Forward Stepwise > Lasso-1se > Lasso-min.

The following presents the comparison of the residual sum of squares (RSS), adjusted R\(^2\) (AdjR2), Mallow CP and BIC for the models selected by backward stepwise and forward stepwise method.

compare_fwd_bwd = data.frame(rbind(rbind(summary(slr_raw_fwd)$rss, summary(slr_raw_fwd)$adjr2, summary(slr_raw_fwd)$cp, summary(slr_raw_fwd)$bic)[,which.min(summary(slr_raw_fwd)$bic)],
                                   rbind(summary(slr_raw_bwd)$rss, summary(slr_raw_bwd)$adjr2, summary(slr_raw_bwd)$cp, summary(slr_raw_bwd)$bic)[,which.min(summary(slr_raw_bwd)$bic)]
))
rownames(compare_fwd_bwd) = c('Forward','Backward')
colnames(compare_fwd_bwd) = c('RSS','AdjR2','Mallow-CP','BIC')
compare_fwd_bwd

Based on the four sets of selected variables - Forward, Backward, Lasso-min, Lasso-1se, recalling that Lasso-min is equivalent to selecting all the variables, we then examine their corresponding VIFs:

#VIF seems to be better compared to using all features, although still some problems with (Site2_MeltingT,Site2_BCCenergy_pa,BCCenergy_pa_composition_average)
var_all = colnames(data1)[5:28]
var_fwd = names(coef(slr_raw_fwd, which.min(summary(slr_raw_fwd)$bic)))[-1]
var_bwd = names(coef(slr_raw_bwd, which.min(summary(slr_raw_bwd)$bic)))[-1]
#var_bwd = var_bwd[-which(var_bwd == "Site2_BCCenergy_pa")]


var_lasso_1se = rownames(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se))[which(predict(out_slr_raw_lasso ,type="coefficients",s=cv_slr_raw_lasso$lambda.1se) != 0)][-1]


slr_raw_fwd_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
slr_raw_bwd_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
slr_raw_lasso_1se_fit = lm(E_raw..eV. ~ . , data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])

#All
vif(slr_raw_fit)
## BCCenergy_pa_composition_average               BoilingT_max_value 
##                        31.658570                        12.153347 
##         CovalentRadius_max_value         GSestFCClatcnt_max_value 
##                         8.274496                         5.564614 
##             IonicRadii_max_value               MeltingT_min_value 
##                         3.414137                        11.358956 
##          MiracleRadius_min_value             n_ws.third_min_value 
##                         3.343098                         9.016486 
##              NUnfilled_max_value                   Site1_BCCfermi 
##                         8.187786                         4.073512 
##              Site1_CovalentRadii          Site1_Electronegativity 
##                        48.185461                         7.050570 
##                       Site1_HHIr                 Site1_IonicRadii 
##                         3.938766                         6.127765 
##            Site1_MendeleevNumber              Site1_MiracleRadius 
##                         4.716981                        44.400696 
##               Site2_BCCenergy_pa                    Site2_Density 
##                        31.670981                         3.532424 
##                      Site2_Group                   Site2_MeltingT 
##                         5.855864                        27.343113 
##                 Site2_NdUnfilled                  Site2_NUnfilled 
##                         8.001194                        10.414374 
##  SpecificHeatCapacity_difference       valence_arithmetic_average 
##                         2.322162                         5.958953
#Stepwise Forward Selection
vif(slr_raw_fwd_fit)
## BCCenergy_pa_composition_average               BoilingT_max_value 
##                        21.560966                         8.286418 
##         GSestFCClatcnt_max_value             IonicRadii_max_value 
##                         3.600398                         2.333823 
##               MeltingT_min_value             n_ws.third_min_value 
##                         8.231074                         6.810415 
##              NUnfilled_max_value          Site1_Electronegativity 
##                         1.920920                         3.475194 
##                 Site1_IonicRadii            Site1_MendeleevNumber 
##                         2.519738                         2.750901 
##              Site1_MiracleRadius               Site2_BCCenergy_pa 
##                         2.619663                        15.530467 
##                   Site2_MeltingT  SpecificHeatCapacity_difference 
##                        17.019635                         2.101013 
##       valence_arithmetic_average 
##                         5.079535
#Stepwise Backward Selection
vif(slr_raw_bwd_fit)
## BCCenergy_pa_composition_average               BoilingT_max_value 
##                        12.241168                         6.888968 
##         GSestFCClatcnt_max_value             IonicRadii_max_value 
##                         3.270643                         1.976747 
##               MeltingT_min_value             n_ws.third_min_value 
##                         6.400564                         5.882975 
##              NUnfilled_max_value          Site1_Electronegativity 
##                         1.889698                         3.167231 
##            Site1_MendeleevNumber              Site1_MiracleRadius 
##                         2.635655                         1.941638 
##               Site2_BCCenergy_pa                   Site2_MeltingT 
##                        14.383640                        16.680019 
##  SpecificHeatCapacity_difference 
##                         1.990185
#Lasso 1se
vif(slr_raw_lasso_1se_fit)
## BCCenergy_pa_composition_average               BoilingT_max_value 
##                        23.759652                        10.131998 
##         GSestFCClatcnt_max_value             IonicRadii_max_value 
##                         3.588846                         2.913302 
##               MeltingT_min_value          MiracleRadius_min_value 
##                         9.918700                         2.545862 
##             n_ws.third_min_value              NUnfilled_max_value 
##                         8.106654                         3.410028 
##              Site1_CovalentRadii          Site1_Electronegativity 
##                         6.104965                         5.062818 
##                       Site1_HHIr                 Site1_IonicRadii 
##                         3.798445                         4.874145 
##            Site1_MendeleevNumber               Site2_BCCenergy_pa 
##                         3.271690                        21.756775 
##                    Site2_Density                   Site2_MeltingT 
##                         2.828103                        20.720455 
##                 Site2_NdUnfilled  SpecificHeatCapacity_difference 
##                         5.837358                         2.222165 
##       valence_arithmetic_average 
##                         5.623930

The VIF from the model selected by stepwise backward selection suffered from less problems, although the VIFs of (BCCenergy_pa_composition_average,Site2_BCCenergy_pa,Site2_MeltingT) are still greater than 10. This could be solved by removing Site2_BCCenergy_pa if required, and the following results would remain largely similar.

Next, we look at the residual diagnostic plots for each of the four models above:

#Diagnostic Plots
par(mfrow = c(2,2))
#All
plot(slr_raw_fit, which = c(1,2,4,5))

#Stepwise Forward Selection
plot(slr_raw_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(slr_raw_bwd_fit, which = c(1,2,4,5))

#Lasso 1se
plot(slr_raw_lasso_1se_fit, which = c(1,2,4,5))

All four models suffered from a U-shaped trend in the residual plot, together with deviation from normality for the residuals near the upper tail. This motivates us to consider adding quadratic or interaction terms into the model, which will be discussed later. In addition, all four Cook’s distance plots suggest that there are three highly influential instances which are related to the Mg host as seen below:

data1[c(179,190,192),]

Quadratic Terms

In this section, we make use of the above four sets chosen features and fit regression model where we add quadratic terms of each selected feature into the model.

quadratic_slr_raw_formula =as.formula(
  paste(
    'E_raw..eV.', '~', paste(
      var_all, collapse = '+'
    ), '+', paste(
      'I(',var_all,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_raw_fwd_formula =as.formula(
  paste(
    'E_raw..eV.', '~', paste(
      var_fwd, collapse = '+'
    ), '+', paste(
      'I(',var_fwd,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_raw_bwd_formula =as.formula(
  paste(
    'E_raw..eV.', '~', paste(
      var_bwd, collapse = '+'
    ), '+', paste(
      'I(',var_bwd,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_raw_lasso_1se_formula =as.formula(
  paste(
    'E_raw..eV.', '~', paste(
      var_lasso_1se, collapse = '+'
    ), '+', paste(
      'I(',var_lasso_1se,'^2)', collapse = '+'
    )
  )
)

#All
quadratic_slr_raw_fit = lm(quadratic_slr_raw_formula, data = data1[,c(4,which(colnames(data1) %in% var_all))])
quadratic_slr_raw_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value + 
##     CovalentRadius_max_value + GSestFCClatcnt_max_value + IonicRadii_max_value + 
##     MeltingT_min_value + MiracleRadius_min_value + n_ws.third_min_value + 
##     NUnfilled_max_value + Site1_BCCfermi + Site1_CovalentRadii + 
##     Site1_Electronegativity + Site1_HHIr + Site1_IonicRadii + 
##     Site1_MendeleevNumber + Site1_MiracleRadius + Site2_BCCenergy_pa + 
##     Site2_Density + Site2_Group + Site2_MeltingT + Site2_NdUnfilled + 
##     Site2_NUnfilled + SpecificHeatCapacity_difference + valence_arithmetic_average + 
##     I(BCCenergy_pa_composition_average^2) + I(BoilingT_max_value^2) + 
##     I(CovalentRadius_max_value^2) + I(GSestFCClatcnt_max_value^2) + 
##     I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(MiracleRadius_min_value^2) + 
##     I(n_ws.third_min_value^2) + I(NUnfilled_max_value^2) + I(Site1_BCCfermi^2) + 
##     I(Site1_CovalentRadii^2) + I(Site1_Electronegativity^2) + 
##     I(Site1_HHIr^2) + I(Site1_IonicRadii^2) + I(Site1_MendeleevNumber^2) + 
##     I(Site1_MiracleRadius^2) + I(Site2_BCCenergy_pa^2) + I(Site2_Density^2) + 
##     I(Site2_Group^2) + I(Site2_MeltingT^2) + I(Site2_NdUnfilled^2) + 
##     I(Site2_NUnfilled^2) + I(SpecificHeatCapacity_difference^2) + 
##     I(valence_arithmetic_average^2)
#Stepwise Forward Selection
quadratic_slr_raw_fwd_fit = lm(quadratic_slr_raw_fwd_formula, data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
quadratic_slr_raw_fwd_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value + 
##     GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value + 
##     n_ws.third_min_value + NUnfilled_max_value + Site1_Electronegativity + 
##     Site1_IonicRadii + Site1_MendeleevNumber + Site1_MiracleRadius + 
##     Site2_BCCenergy_pa + Site2_MeltingT + SpecificHeatCapacity_difference + 
##     valence_arithmetic_average + I(BCCenergy_pa_composition_average^2) + 
##     I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) + 
##     I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(n_ws.third_min_value^2) + 
##     I(NUnfilled_max_value^2) + I(Site1_Electronegativity^2) + 
##     I(Site1_IonicRadii^2) + I(Site1_MendeleevNumber^2) + I(Site1_MiracleRadius^2) + 
##     I(Site2_BCCenergy_pa^2) + I(Site2_MeltingT^2) + I(SpecificHeatCapacity_difference^2) + 
##     I(valence_arithmetic_average^2)
#Stepwise Backward Selection
quadratic_slr_raw_bwd_fit = lm(quadratic_slr_raw_bwd_formula, data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
quadratic_slr_raw_bwd_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value + 
##     GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value + 
##     n_ws.third_min_value + NUnfilled_max_value + Site1_Electronegativity + 
##     Site1_MendeleevNumber + Site1_MiracleRadius + Site2_BCCenergy_pa + 
##     Site2_MeltingT + SpecificHeatCapacity_difference + I(BCCenergy_pa_composition_average^2) + 
##     I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) + 
##     I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(n_ws.third_min_value^2) + 
##     I(NUnfilled_max_value^2) + I(Site1_Electronegativity^2) + 
##     I(Site1_MendeleevNumber^2) + I(Site1_MiracleRadius^2) + I(Site2_BCCenergy_pa^2) + 
##     I(Site2_MeltingT^2) + I(SpecificHeatCapacity_difference^2)
#Lasso-1se
quadratic_slr_raw_lasso_1se_fit = lm(quadratic_slr_raw_lasso_1se_formula, data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])
quadratic_slr_raw_lasso_1se_formula
## E_raw..eV. ~ BCCenergy_pa_composition_average + BoilingT_max_value + 
##     GSestFCClatcnt_max_value + IonicRadii_max_value + MeltingT_min_value + 
##     MiracleRadius_min_value + n_ws.third_min_value + NUnfilled_max_value + 
##     Site1_CovalentRadii + Site1_Electronegativity + Site1_HHIr + 
##     Site1_IonicRadii + Site1_MendeleevNumber + Site2_BCCenergy_pa + 
##     Site2_Density + Site2_MeltingT + Site2_NdUnfilled + SpecificHeatCapacity_difference + 
##     valence_arithmetic_average + I(BCCenergy_pa_composition_average^2) + 
##     I(BoilingT_max_value^2) + I(GSestFCClatcnt_max_value^2) + 
##     I(IonicRadii_max_value^2) + I(MeltingT_min_value^2) + I(MiracleRadius_min_value^2) + 
##     I(n_ws.third_min_value^2) + I(NUnfilled_max_value^2) + I(Site1_CovalentRadii^2) + 
##     I(Site1_Electronegativity^2) + I(Site1_HHIr^2) + I(Site1_IonicRadii^2) + 
##     I(Site1_MendeleevNumber^2) + I(Site2_BCCenergy_pa^2) + I(Site2_Density^2) + 
##     I(Site2_MeltingT^2) + I(Site2_NdUnfilled^2) + I(SpecificHeatCapacity_difference^2) + 
##     I(valence_arithmetic_average^2)

The residual diagnostic plots of each fitted model with quadratic terms are given below:

par(mfrow = c(2,2))
#All
plot(quadratic_slr_raw_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

#Stepwise Forward Selection
plot(quadratic_slr_raw_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(quadratic_slr_raw_bwd_fit, which = c(1,2,4,5))

#Lasso-1se
plot(quadratic_slr_raw_lasso_1se_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

After including quadratic terms of the features, it can be seen that there is no longer obvious quadratic trends in the residual plot and the residuals are now approximatelty normally distributed. All Cook’s distance plots suggest that observation no. 192 is an influential outlier, while only the Cook’s distance plot of the model using all features suggests that observation no. 74 and 357 is a potential outlier:

data1[c(74,192,357),]

Interaction Terms

In this section, we make use of the above four sets of chosen features and fit regression model where we add all possible pairwise interaction terms of the selected features into the model. The main motivation for using interaction terms is due to some features being related to the host element while other features are related to the solute.

interact_slr_raw_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_all))])
interact_slr_raw_fwd_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_fwd))])
interact_slr_raw_bwd_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_bwd))])
interact_slr_raw_lasso_1se_fit = lm(E_raw..eV. ~ .^2, data = data1[,c(4,which(colnames(data1) %in% var_lasso_1se))])

The residual diagnostic plots of each fitted model with interaction terms are given below:

par(mfrow = c(2,2))
#All
plot(interact_slr_raw_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

#Stepwise Forward Selection
plot(interact_slr_raw_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(interact_slr_raw_bwd_fit, which = c(1,2,4,5))

#Lasso-1se
plot(interact_slr_raw_lasso_1se_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

By adding interaction terms, the residual plots no longer show any obvious trend and the QQ plots confirm the normality of the residuals. On the other hand, the Cook’s distance plots for the four fitted models indicate some possible outliers:

data1[c(19,74,75,76,77,192, 293),]

Comparison of Adjusted R\(^2\)

The following table compares the different fitted models in terms of the adjusted R\(^2\):

data.frame(Linear = c(summary(slr_raw_fit)$adj.r.squared,
summary(slr_raw_fwd_fit)$adj.r.squared,
summary(slr_raw_bwd_fit)$adj.r.squared,
summary(slr_raw_lasso_1se_fit)$adj.r.squared), 
Linear_and_Quadratic = c(summary(quadratic_slr_raw_fit)$adj.r.squared,
summary(quadratic_slr_raw_fwd_fit)$adj.r.squared,
summary(quadratic_slr_raw_bwd_fit)$adj.r.squared,
summary(quadratic_slr_raw_lasso_1se_fit)$adj.r.squared),
Linear_and_Interaction = c(summary(interact_slr_raw_fit)$adj.r.squared,
summary(interact_slr_raw_fwd_fit)$adj.r.squared,
summary(interact_slr_raw_bwd_fit)$adj.r.squared,
summary(interact_slr_raw_lasso_1se_fit)$adj.r.squared), row.names = c('All','Forward','Backward','Lasso-1se')
)

The following table compares the different fitted models in terms of the number of parameters (regression coefficients + residual variance) to be estimated:

data.frame(Linear = c(sum(!is.na(slr_raw_fit$coefficients)) + 1,
sum(!is.na(slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(slr_raw_lasso_1se_fit$coefficients)) + 1), 
Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_raw_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_raw_lasso_1se_fit$coefficients)) + 1),
Linear_and_Interaction = c(sum(!is.na(interact_slr_raw_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_fwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_bwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_raw_lasso_1se_fit$coefficients)) + 1), row.names = c('All','Forward','Backward','Lasso-1se')
)

In summary, making use of stepwise forward/backward selection method or the lasso helps to reduce the multicollinearity issue, while adding quadratic/interaction terms solves the problem of having obvious U-shaped trend in the residual plots and deviation from normality for the residuals near the upper tail.