library(car) #For vif()
library(leaps) #For regsubsets()
library(glmnet) #for glmnet
library(GGally) #for ggpairs
rm(list=ls())

#Dataset 3: Metallic Glass Forming
data3 = read.csv('Metallic_Glass_Forming_with_features.csv')
head(data3)
apply(data3,2,function(x){length(unique(x))})
##                 Material.compositions                          main_element 
##                                   585                                    27 
##                                   Trg           Density_composition_average 
##                                   174                                   585 
##           IsBoron_composition_average          IsDBlock_composition_average 
##                                    63                                   123 
## IsTransitionMetal_composition_average         NdValence_composition_average 
##                                   123                                   299 
##          NValance_composition_average            HeatVaporization_max_value 
##                                   369                                    25 
##                   BoilingT_difference           HeatVaporization_difference 
##                                    83                                    88 
##                   MeltingT_difference                  NdValence_difference 
##                                    87                                    11 
##                 NsUnfilled_difference                    valence_difference 
##                                     2                                     5 
##                         Site1_Density                Site1_HeatCapacityMass 
##                                    30                                    29 
##                      Site1_HeatFusion                        Site1_IsDBlock 
##                                    30                                     2 
##               Site1_IsTransitionMetal                       Site1_NdValence 
##                                     2                                     9 
##            Site1_SpecificHeatCapacity 
##                                    27
hist(data3$Trg)

summary(data3$Trg)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2230  0.5570  0.5840  0.5774  0.6090  0.6880
#
length(unique(data3$main_element))
## [1] 27
# Scale all features to have sample mean 0 and sample variance 1
data3[,4:23] = scale(data3[,4:23])

#Reorder feature columns alphabetically
reordered = data3[,(4:23)[order(colnames(data3)[4:23])]]
data3 = cbind(data3[,1:3],reordered)

#Site1_IsTransitionMetal Column is exactly the same as Site1_IsDBlock column
#IsDBlock_composition_average is exactly the same as IsTransitionMetal_composition_average
data3$Site1_IsDBlock - data3$Site1_IsTransitionMetal 
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
data3$IsDBlock_composition_average - data3$IsTransitionMetal_composition_average
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
data3 = data3[,-which(colnames(data3) == 'Site1_IsTransitionMetal')]
data3 = data3[,-which(colnames(data3) == 'IsTransitionMetal_composition_average')]


#NsUnfilled_difference and Site1_IsDBlock columns has only 2 unique values
#valence_difference column has only 5 unique values

Trg

slr_Trg_fit = lm(Trg ~ . , data = data3[,c(3,4:21)])
vif(slr_Trg_fit)
##           BoilingT_difference   Density_composition_average 
##                     14.567801                     28.925799 
##   HeatVaporization_difference    HeatVaporization_max_value 
##                     37.770249                     13.531181 
##   IsBoron_composition_average  IsDBlock_composition_average 
##                      2.288699                     21.110220 
##           MeltingT_difference NdValence_composition_average 
##                      5.466689                     14.528577 
##          NdValence_difference         NsUnfilled_difference 
##                      2.507814                      2.251670 
##  NValance_composition_average                 Site1_Density 
##                      7.435110                     22.696909 
##        Site1_HeatCapacityMass              Site1_HeatFusion 
##                  65719.798341                      6.526110 
##                Site1_IsDBlock               Site1_NdValence 
##                     20.592039                     14.086964 
##    Site1_SpecificHeatCapacity            valence_difference 
##                  65629.954937                      3.040346
#There are serious multicollinearity issues between Site1_SpecificHeatCapacity and Site1_HeatCapacityMass
cor_mat = cor(data3[,4:21])
diag(cor_mat) = 0
prob_feature_pair = data.frame(feature1 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,1]],
                               feature2 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,2]],
                               corr = cor_mat[which(abs(cor_mat) > 0.7, arr.ind = T)])

prob_feature_pair[!duplicated(prob_feature_pair$corr),]

Based on the variance inflation factors (VIF > 10), there are serious multicollinearity issue due to the high correlation between Site1_SpecificHeatCapacity and Site1_HeatCapacityMass.

To solve the multicollinearity issues, we consider stepwise/LASSO methods to drop some of the variables:

# Perform Stepwise variable selection to try and reduce the multicollinearity issue
#Forward Stepwise
slr_Trg_fwd=regsubsets(Trg ~ . , data = data3[,c(3,4:21)], nvmax =18,
                        method ="forward")

#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_Trg_fwd)$rss, type = 'l'); points(which.min(summary(slr_Trg_fwd)$rss),min(summary(slr_Trg_fwd)$rss))
plot(summary(slr_Trg_fwd)$adjr2, type = 'l') ; points(which.max(summary(slr_Trg_fwd)$adjr2),max(summary(slr_Trg_fwd)$adjr2))
plot(summary(slr_Trg_fwd)$cp, type = 'l') ; points(which.min(summary(slr_Trg_fwd)$cp),min(summary(slr_Trg_fwd)$cp))
plot(summary(slr_Trg_fwd)$bic, type = 'l') ; points(which.min(summary(slr_Trg_fwd)$bic),min(summary(slr_Trg_fwd)$bic))

#Backward Stepwise
slr_Trg_bwd=regsubsets(Trg ~ . , data = data3[,c(3,4:21)], nvmax =18,
                        method ="backward")


#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_Trg_bwd)$rss, type = 'l'); points(which.min(summary(slr_Trg_bwd)$rss),min(summary(slr_Trg_bwd)$rss))
plot(summary(slr_Trg_bwd)$adjr2, type = 'l') ; points(which.max(summary(slr_Trg_bwd)$adjr2),max(summary(slr_Trg_bwd)$adjr2))
plot(summary(slr_Trg_bwd)$cp, type = 'l') ; points(which.min(summary(slr_Trg_bwd)$cp),min(summary(slr_Trg_bwd)$cp))
plot(summary(slr_Trg_bwd)$bic, type = 'l') ; points(which.min(summary(slr_Trg_bwd)$bic),min(summary(slr_Trg_bwd)$bic))

#### Lasso ####
set.seed(1)
cv_slr_Trg_lasso = cv.glmnet(x = model.matrix(Trg ~ . , data = data3[,c(3,4:21)])[,-1],
                              y = data3$Trg,
                              alpha = 1, family = 'gaussian', standardize = FALSE)
cv_slr_Trg_lasso$lambda.min
## [1] 7.584325e-05
cv_slr_Trg_lasso$lambda.1se
## [1] 0.007239606
out_slr_Trg_lasso=glmnet(x = model.matrix(Trg ~ . , data = data3[,c(3,4:21)])[,-1],
                          y = data3$Trg,
                          alpha = 1, family = 'gaussian', standardize = FALSE)
data.frame(lambda.min = as.vector(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min)), lambda.1se = as.vector(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se)), row.names = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min)))
plot(cv_slr_Trg_lasso)

By using BIC as the selection criterion, forward stepwise selection method chose the model with 10 variables (hence dropped 8 variables), while the backward stepwise selection method resulted in a model with 11 variables (and 7 variables dropped). Whereas Lasso combined with 10-fold cross-validation to select the optimal tuning parameter \(\lambda\) resulted in a model with 17 variables (only 1 variable dropped) when using \(\lambda_{\min} =\) 7.584324810^{-5} and another model with 8 variables (hence dropped 10 variables) when using \(\lambda_{1,SE} =\) 0.0072396.

The variables that have been dropped based on each of the method are given as:

#Stepwise Forward Selection
colnames(data3)[4:21][!colnames(data3)[4:21] %in% names(coef(slr_Trg_fwd, which.min(summary(slr_Trg_fwd)$bic)))]
## [1] "HeatVaporization_max_value"   "IsDBlock_composition_average"
## [3] "MeltingT_difference"          "NsUnfilled_difference"       
## [5] "Site1_Density"                "Site1_HeatFusion"            
## [7] "Site1_IsDBlock"               "Site1_NdValence"
#Stepwise Backward Selection
colnames(data3)[4:21][!colnames(data3)[4:21] %in% names(coef(slr_Trg_bwd, which.min(summary(slr_Trg_bwd)$bic)))]
## [1] "HeatVaporization_max_value"    "IsDBlock_composition_average" 
## [3] "MeltingT_difference"           "NdValence_composition_average"
## [5] "NdValence_difference"          "NsUnfilled_difference"        
## [7] "Site1_SpecificHeatCapacity"
#Lasso with lambda.min
rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min) == 0)]#Lasso with lambda.1se
## [1] "Site1_SpecificHeatCapacity"
rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se) == 0)]
##  [1] "Density_composition_average"  "HeatVaporization_difference" 
##  [3] "HeatVaporization_max_value"   "MeltingT_difference"         
##  [5] "NdValence_difference"         "NValance_composition_average"
##  [7] "Site1_Density"                "Site1_HeatFusion"            
##  [9] "Site1_NdValence"              "Site1_SpecificHeatCapacity"

Therefore, the aggressiveness in terms of dropping variables from the strongest to the weakest are: Lasso-1se > Forward Stepwise > Backward Stepwise > Lasso-min. Interestingly, Lasso-1se/Backward Stepwise/Lasso-min dropped Site1_SpecificHeatCapacity, but not Forward Stepwise.

The following presents the comparison of the residual sum of squares (RSS), adjusted R\(^2\) (AdjR2), Mallow CP and BIC for the models selected by backward stepwise and forward stepwise method, indicating that backward stepwise outperforms the forward stepwise method in terms of all criterion.

compare_fwd_bwd = data.frame(rbind(rbind(summary(slr_Trg_fwd)$rss, summary(slr_Trg_fwd)$adjr2, summary(slr_Trg_fwd)$cp, summary(slr_Trg_fwd)$bic)[,which.min(summary(slr_Trg_fwd)$bic)],
                                   rbind(summary(slr_Trg_bwd)$rss, summary(slr_Trg_bwd)$adjr2, summary(slr_Trg_bwd)$cp, summary(slr_Trg_bwd)$bic)[,which.min(summary(slr_Trg_bwd)$bic)]
))
rownames(compare_fwd_bwd) = c('Forward','Backward')
colnames(compare_fwd_bwd) = c('RSS','AdjR2','Mallow-CP','BIC')
compare_fwd_bwd

Based on the four sets of selected variables - Forward, Backward, Lasso-1se, Lasso-min, we then examine their corresponding VIFs:

#VIF seems to be better compared to using all features, although still some problems with (Site2_MeltingT,Site2_BCCenergy_pa,BCCenergy_pa_composition_average)
var_all = colnames(data3)[4:21]
var_fwd = names(coef(slr_Trg_fwd, which.min(summary(slr_Trg_fwd)$bic)))[-1]

var_bwd = names(coef(slr_Trg_bwd, which.min(summary(slr_Trg_bwd)$bic)))[-1]
var_lasso_1se = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se) != 0)][-1]
var_lasso_min = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min) != 0)][-1]


slr_Trg_fwd_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_fwd))])
slr_Trg_bwd_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
slr_Trg_lasso_1se_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
slr_Trg_lasso_min_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])


#All
vif(slr_Trg_fit)
##           BoilingT_difference   Density_composition_average 
##                     14.567801                     28.925799 
##   HeatVaporization_difference    HeatVaporization_max_value 
##                     37.770249                     13.531181 
##   IsBoron_composition_average  IsDBlock_composition_average 
##                      2.288699                     21.110220 
##           MeltingT_difference NdValence_composition_average 
##                      5.466689                     14.528577 
##          NdValence_difference         NsUnfilled_difference 
##                      2.507814                      2.251670 
##  NValance_composition_average                 Site1_Density 
##                      7.435110                     22.696909 
##        Site1_HeatCapacityMass              Site1_HeatFusion 
##                  65719.798341                      6.526110 
##                Site1_IsDBlock               Site1_NdValence 
##                     20.592039                     14.086964 
##    Site1_SpecificHeatCapacity            valence_difference 
##                  65629.954937                      3.040346
#Stepwise Forward Selection
vif(slr_Trg_fwd_fit)
##           BoilingT_difference   Density_composition_average 
##                      9.788566                     11.276896 
##   HeatVaporization_difference   IsBoron_composition_average 
##                     12.700805                      1.555682 
## NdValence_composition_average          NdValence_difference 
##                      5.713336                      1.530404 
##  NValance_composition_average        Site1_HeatCapacityMass 
##                      6.235090                  38965.787462 
##    Site1_SpecificHeatCapacity            valence_difference 
##                  38920.907349                      2.334048
#Stepwise Backward Selection
vif(slr_Trg_bwd_fit)
##          BoilingT_difference  Density_composition_average 
##                    10.604841                    11.261591 
##  HeatVaporization_difference  IsBoron_composition_average 
##                    14.246700                     1.284963 
## NValance_composition_average                Site1_Density 
##                     6.406883                    12.084051 
##       Site1_HeatCapacityMass             Site1_HeatFusion 
##                     2.644843                     5.022383 
##               Site1_IsDBlock              Site1_NdValence 
##                    11.552154                     7.399463 
##           valence_difference 
##                     2.784167
#Lasso 1se
vif(slr_Trg_lasso_1se_fit)
##           BoilingT_difference   IsBoron_composition_average 
##                      1.468956                      1.371307 
##  IsDBlock_composition_average NdValence_composition_average 
##                      8.776673                      3.577632 
##         NsUnfilled_difference        Site1_HeatCapacityMass 
##                      1.446377                      1.743887 
##                Site1_IsDBlock            valence_difference 
##                      9.274189                      2.219276
#Lasso min
vif(slr_Trg_lasso_min_fit)
##           BoilingT_difference   Density_composition_average 
##                     13.671398                     28.579766 
##   HeatVaporization_difference    HeatVaporization_max_value 
##                     35.664099                     11.886105 
##   IsBoron_composition_average  IsDBlock_composition_average 
##                      1.857241                     18.362539 
##           MeltingT_difference NdValence_composition_average 
##                      5.458959                     14.078186 
##          NdValence_difference         NsUnfilled_difference 
##                      2.466051                      2.199454 
##  NValance_composition_average                 Site1_Density 
##                      7.299569                     22.431616 
##        Site1_HeatCapacityMass              Site1_HeatFusion 
##                      2.980744                      5.814029 
##                Site1_IsDBlock               Site1_NdValence 
##                     20.334514                     12.878039 
##            valence_difference 
##                      3.038898

Compared to the full model, the model selected by different variable selection methods seem to have less multicollinearity issues indicated by the smaller VIFs, except for the model selected by stepwise forward as it did not drop Site1_SpecificHeatCapacity nor Site1_HeatCapacityMass, and these two features are highly correlated.

Next, we look at the residual diagnostic plots for each of the models above:

#Diagnostic Plots
par(mfrow = c(2,2))
#All
plot(slr_Trg_fit, which = c(1,2,4,5))

#Stepwise Forward Selection
plot(slr_Trg_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(slr_Trg_bwd_fit, which = c(1,2,4,5))

#Lasso 1se
plot(slr_Trg_lasso_1se_fit, which = c(1,2,4,5))

#Lasso min
plot(slr_Trg_lasso_min_fit, which = c(1,2,4,5))

All residuals plots seem good, while the QQplots indicate some departure from normality at lower and upper tails.

In addition, the Cook’s distance plots suggest that there are a few highly influential instances which are related to Al/Au/Pd/Ti majority element.

data3[c(31,35,36,473,487),]

Through further investigation, it can be found that Pd95Si5 and Ti40Zr10Cu34Pd16 have large magnitude of standardized residuals, which caused them to be identified as influential instances. Furthermore, Al89.5Ni3.5Y7, Au77.8Si8.4Ge13.8 and Au77Si9.4Ge13.6 are also flagged as influential due to their Trg being in the range of 0.22 to 0.38 which are much smaller than the other instances, and their relatively large standardized residuals. Furthermore, Al89.5Ni3.5Y7, Au77.8Si8.4Ge13.8 and Au77Si9.4Ge13.6 have high leverages, which is caused by some of their features taking extremely large/small values. For example, Al89.5Ni3.5Y7 has the highest IsBoron_composition_average, Au77.8Si8.4Ge13.8 have the highest Density_composition_average and NValance_composition_average, and Au77Si9.4Ge13.6 has the lowest Site1_HeatCapacityMass

Quadratic Terms

In this section, we make use of the above four sets of chosen features and fit regression model where we add quadratic terms of each selected feature into the model.

quadratic_slr_Trg_formula =as.formula(
  paste(
    'Trg', '~', paste(
      var_all, collapse = '+'
    ), '+', paste(
      'I(',var_all,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_Trg_fwd_formula =as.formula(
  paste(
    'Trg', '~', paste(
      var_fwd, collapse = '+'
    ), '+', paste(
      'I(',var_fwd,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_Trg_bwd_formula =as.formula(
  paste(
    'Trg', '~', paste(
      var_bwd, collapse = '+'
    ), '+', paste(
      'I(',var_bwd,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_Trg_lasso_1se_formula =as.formula(
  paste(
    'Trg', '~', paste(
      var_lasso_1se, collapse = '+'
    ), '+', paste(
      'I(',var_lasso_1se,'^2)', collapse = '+'
    )
  )
)

quadratic_slr_Trg_lasso_min_formula =as.formula(
  paste(
    'Trg', '~', paste(
      var_lasso_min, collapse = '+'
    ), '+', paste(
      'I(',var_lasso_min,'^2)', collapse = '+'
    )
  )
)

#All
quadratic_slr_Trg_fit = lm(quadratic_slr_Trg_formula, data = data3[,c(3,which(colnames(data3) %in% var_all))])
#quadratic_slr_Trg_formula

#Stepwise Forward Selection
quadratic_slr_Trg_fwd_fit = lm(quadratic_slr_Trg_fwd_formula, data = data3[,c(3,which(colnames(data3) %in% var_fwd))])
#quadratic_slr_Trg_fwd_formula

#Stepwise Backward Selection
quadratic_slr_Trg_bwd_fit = lm(quadratic_slr_Trg_bwd_formula, data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
#quadratic_slr_Trg_bwd_formula


#Lasso-1se
quadratic_slr_Trg_lasso_1se_fit = lm(quadratic_slr_Trg_lasso_1se_formula, data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
#quadratic_slr_Trg_lasso_1se_formula

#Lasso-min
quadratic_slr_Trg_lasso_min_fit = lm(quadratic_slr_Trg_lasso_min_formula, data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])
#quadratic_slr_Trg_lasso_min_formula

The residual diagnostic plots of each fitted model with quadratic terms are given below:

par(mfrow = c(2,2))
#All
plot(quadratic_slr_Trg_fit, which = c(1,2,4,5))

#Stepwise Forward Selection
plot(quadratic_slr_Trg_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(quadratic_slr_Trg_bwd_fit, which = c(1,2,4,5))

#Lasso-1se
plot(quadratic_slr_Trg_lasso_1se_fit, which = c(1,2,4,5))

#Lasso-min
plot(quadratic_slr_Trg_lasso_min_fit, which = c(1,2,4,5))

In general, the residuals plot of the model using quadratic terms are similar to those of the model involving only linear terms, which demonstrate no obvious problem. Additionally, the QQplots of the model using quadratic terms are slightly improved especially at the upper tails. The Cook’s distance plots identified the following influential observations associated with Ca/Pd/Ti majority element, which mostly overlap with those identified from the Cook’s distance plots of the models involving only linear terms:

data3[c(32,473,487),]

Interaction Terms

In this section, we make use of the above four sets of chosen features and fit regression model where we add all possible pairwise interaction terms of the selected features into the model. The main motivation for using interaction terms is to allow for the possibility that the effect of a feature on Trg to be dependent on the values of other features.

interact_slr_Trg_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_all  ))])
interact_slr_Trg_fwd_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_fwd ))])
interact_slr_Trg_bwd_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
interact_slr_Trg_lasso_1se_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
interact_slr_Trg_lasso_min_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])

The residual diagnostic plots of each fitted model with interaction terms are given below:

par(mfrow = c(2,2))
#All
plot(interact_slr_Trg_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

#Stepwise Forward Selection
plot(interact_slr_Trg_fwd_fit, which = c(1,2,4,5))

#Stepwise Backward Selection
plot(interact_slr_Trg_bwd_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

#Lasso-1se
plot(interact_slr_Trg_lasso_1se_fit, which = c(1,2,4,5))

#Lasso-min
plot(interact_slr_Trg_lasso_min_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Again, the residuals plots of the model involving interaction terms are highly similar to those of the model involving only linear terms, which show no major issue. Also, the QQplots of the models involving interaction terms are slightly improved, especially near the upper tails. The Cook’s distance plots identified the following influential observations:

data3[c(32,84,231,303,473,482,476),]

Next, we provide some interpretation into the identification of outliers based on the model interact_slr_Trg_lasso_1se_fit, namely Ca65Ga35 and Pd95Si5. These two instances have large magnitude of standardized residuals with values around 5. Furthermore, their leverages are also relatively high compared to the other instances. In particular, Ca65Ga35 has the lowest IsDBlock_composition_average, while Pd95Si5 has the highest NdValence_composition_average together with the lowest BoilingT_difference and IsBoron_composition_average in the dataset. Therefore, the large standardized residuals combined with the relatively high leverages caused these two instances to be flagged as influential in the fitted model using pairwise interaction terms of features selected by Lasso-1se.

We also add some interpretation into the identification of outliers based on the model interact_slr_Trg_lasso_min_fit, i.e., Au35Ca65, Ga18Sr82 and Pt42.5Cu27Ni59.5P21. These instances all have leverages around 0.99, primarily due to some of their features taking extremely large/small values. For example, Au35Ca65 have the smallest IsBoron_composition_average and Site1_HeatCapacityMass, Ga18Sr82 has the smallest IsDBlock_composition_average, and Pt42.5Cu27Ni59.5P21 has the largest Site1_Density. Their extremely high leverages combined with their relatively large standardized residuals (1.44, 1.79 and -3.86 for Au35Ca65, Ga18Sr82 and Pt42.5Cu27Ni59.5P21, respetively) resulted in them being identifed as influential instances.

Comparison of Adjusted R\(^2\)

The following table compares the different fitted models in terms of the adjusted R\(^2\):

data.frame(Linear = c(summary(slr_Trg_fit)$adj.r.squared,
                      summary(slr_Trg_fwd_fit)$adj.r.squared,
                      summary(slr_Trg_bwd_fit)$adj.r.squared,
                      summary(slr_Trg_lasso_1se_fit)$adj.r.squared,
                      summary(slr_Trg_lasso_min_fit)$adj.r.squared), 
           Linear_and_Quadratic = c(summary(quadratic_slr_Trg_fit)$adj.r.squared,
                                    summary(quadratic_slr_Trg_fwd_fit)$adj.r.squared,
                                    summary(quadratic_slr_Trg_bwd_fit)$adj.r.squared,
                                    summary(quadratic_slr_Trg_lasso_1se_fit)$adj.r.squared,
                                    summary(quadratic_slr_Trg_lasso_min_fit)$adj.r.squared),
           Linear_and_Interaction = c(summary(interact_slr_Trg_fit)$adj.r.squared,
                                      summary(interact_slr_Trg_fwd_fit)$adj.r.squared,
                                      summary(interact_slr_Trg_bwd_fit)$adj.r.squared,
                                      summary(interact_slr_Trg_lasso_1se_fit)$adj.r.squared,
                                      summary(interact_slr_Trg_lasso_min_fit)$adj.r.squared), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)

The following table compares the different fitted models in terms of the number of parameters (regression coefficients + residual variance) to be estimated:

data.frame(Linear = c(sum(!is.na(slr_Trg_fit$coefficients)) + 1,
                      sum(!is.na(slr_Trg_fwd_fit$coefficients)) + 1,
                      sum(!is.na(slr_Trg_bwd_fit$coefficients)) + 1,
                      sum(!is.na(slr_Trg_lasso_1se_fit$coefficients)) + 1,
                      sum(!is.na(slr_Trg_lasso_min_fit$coefficients)) + 1), 
           Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_Trg_fit$coefficients)) + 1,
                                    sum(!is.na(quadratic_slr_Trg_fwd_fit$coefficients)) + 1,
                                    sum(!is.na(quadratic_slr_Trg_bwd_fit$coefficients)) + 1,
                                    sum(!is.na(quadratic_slr_Trg_lasso_1se_fit$coefficients)) + 1,
                                    sum(!is.na(quadratic_slr_Trg_lasso_min_fit$coefficients)) + 1),
           Linear_and_Interaction = c(sum(!is.na(interact_slr_Trg_fit$coefficients)) + 1,
                                      sum(!is.na(interact_slr_Trg_fwd_fit$coefficients)) + 1,
                                      sum(!is.na(interact_slr_Trg_bwd_fit$coefficients)) + 1,
                                      sum(!is.na(interact_slr_Trg_lasso_1se_fit$coefficients)) + 1,
                                      sum(!is.na(interact_slr_Trg_lasso_min_fit$coefficients)) + 1), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)