library(car) #For vif()
library(leaps) #For regsubsets()
library(glmnet) #for glmnet
library(GGally) #for ggpairs
rm(list=ls())
#Dataset 3: Metallic Glass Forming
data3 = read.csv('Metallic_Glass_Forming_with_features.csv')
head(data3)
apply(data3,2,function(x){length(unique(x))})
## Material.compositions main_element
## 585 27
## Trg Density_composition_average
## 174 585
## IsBoron_composition_average IsDBlock_composition_average
## 63 123
## IsTransitionMetal_composition_average NdValence_composition_average
## 123 299
## NValance_composition_average HeatVaporization_max_value
## 369 25
## BoilingT_difference HeatVaporization_difference
## 83 88
## MeltingT_difference NdValence_difference
## 87 11
## NsUnfilled_difference valence_difference
## 2 5
## Site1_Density Site1_HeatCapacityMass
## 30 29
## Site1_HeatFusion Site1_IsDBlock
## 30 2
## Site1_IsTransitionMetal Site1_NdValence
## 2 9
## Site1_SpecificHeatCapacity
## 27
hist(data3$Trg)
summary(data3$Trg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2230 0.5570 0.5840 0.5774 0.6090 0.6880
#
length(unique(data3$main_element))
## [1] 27
# Scale all features to have sample mean 0 and sample variance 1
data3[,4:23] = scale(data3[,4:23])
#Reorder feature columns alphabetically
reordered = data3[,(4:23)[order(colnames(data3)[4:23])]]
data3 = cbind(data3[,1:3],reordered)
#Site1_IsTransitionMetal Column is exactly the same as Site1_IsDBlock column
#IsDBlock_composition_average is exactly the same as IsTransitionMetal_composition_average
data3$Site1_IsDBlock - data3$Site1_IsTransitionMetal
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
data3$IsDBlock_composition_average - data3$IsTransitionMetal_composition_average
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
data3 = data3[,-which(colnames(data3) == 'Site1_IsTransitionMetal')]
data3 = data3[,-which(colnames(data3) == 'IsTransitionMetal_composition_average')]
#NsUnfilled_difference and Site1_IsDBlock columns has only 2 unique values
#valence_difference column has only 5 unique values
slr_Trg_fit = lm(Trg ~ . , data = data3[,c(3,4:21)])
vif(slr_Trg_fit)
## BoilingT_difference Density_composition_average
## 14.567801 28.925799
## HeatVaporization_difference HeatVaporization_max_value
## 37.770249 13.531181
## IsBoron_composition_average IsDBlock_composition_average
## 2.288699 21.110220
## MeltingT_difference NdValence_composition_average
## 5.466689 14.528577
## NdValence_difference NsUnfilled_difference
## 2.507814 2.251670
## NValance_composition_average Site1_Density
## 7.435110 22.696909
## Site1_HeatCapacityMass Site1_HeatFusion
## 65719.798341 6.526110
## Site1_IsDBlock Site1_NdValence
## 20.592039 14.086964
## Site1_SpecificHeatCapacity valence_difference
## 65629.954937 3.040346
#There are serious multicollinearity issues between Site1_SpecificHeatCapacity and Site1_HeatCapacityMass
cor_mat = cor(data3[,4:21])
diag(cor_mat) = 0
prob_feature_pair = data.frame(feature1 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,1]],
feature2 = colnames(cor_mat)[which(abs(cor_mat) > 0.7, arr.ind = T)[,2]],
corr = cor_mat[which(abs(cor_mat) > 0.7, arr.ind = T)])
prob_feature_pair[!duplicated(prob_feature_pair$corr),]
Based on the variance inflation factors (VIF > 10), there are serious multicollinearity issue due to the high correlation between Site1_SpecificHeatCapacity
and Site1_HeatCapacityMass
.
To solve the multicollinearity issues, we consider stepwise/LASSO methods to drop some of the variables:
# Perform Stepwise variable selection to try and reduce the multicollinearity issue
#Forward Stepwise
slr_Trg_fwd=regsubsets(Trg ~ . , data = data3[,c(3,4:21)], nvmax =18,
method ="forward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_Trg_fwd)$rss, type = 'l'); points(which.min(summary(slr_Trg_fwd)$rss),min(summary(slr_Trg_fwd)$rss))
plot(summary(slr_Trg_fwd)$adjr2, type = 'l') ; points(which.max(summary(slr_Trg_fwd)$adjr2),max(summary(slr_Trg_fwd)$adjr2))
plot(summary(slr_Trg_fwd)$cp, type = 'l') ; points(which.min(summary(slr_Trg_fwd)$cp),min(summary(slr_Trg_fwd)$cp))
plot(summary(slr_Trg_fwd)$bic, type = 'l') ; points(which.min(summary(slr_Trg_fwd)$bic),min(summary(slr_Trg_fwd)$bic))
#Backward Stepwise
slr_Trg_bwd=regsubsets(Trg ~ . , data = data3[,c(3,4:21)], nvmax =18,
method ="backward")
#Plotting RSS, Adjusted R-squared, Mallow Cp and BIC
par(mfrow = c(2,2))
plot(summary(slr_Trg_bwd)$rss, type = 'l'); points(which.min(summary(slr_Trg_bwd)$rss),min(summary(slr_Trg_bwd)$rss))
plot(summary(slr_Trg_bwd)$adjr2, type = 'l') ; points(which.max(summary(slr_Trg_bwd)$adjr2),max(summary(slr_Trg_bwd)$adjr2))
plot(summary(slr_Trg_bwd)$cp, type = 'l') ; points(which.min(summary(slr_Trg_bwd)$cp),min(summary(slr_Trg_bwd)$cp))
plot(summary(slr_Trg_bwd)$bic, type = 'l') ; points(which.min(summary(slr_Trg_bwd)$bic),min(summary(slr_Trg_bwd)$bic))
#### Lasso ####
set.seed(1)
cv_slr_Trg_lasso = cv.glmnet(x = model.matrix(Trg ~ . , data = data3[,c(3,4:21)])[,-1],
y = data3$Trg,
alpha = 1, family = 'gaussian', standardize = FALSE)
cv_slr_Trg_lasso$lambda.min
## [1] 7.584325e-05
cv_slr_Trg_lasso$lambda.1se
## [1] 0.007239606
out_slr_Trg_lasso=glmnet(x = model.matrix(Trg ~ . , data = data3[,c(3,4:21)])[,-1],
y = data3$Trg,
alpha = 1, family = 'gaussian', standardize = FALSE)
data.frame(lambda.min = as.vector(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min)), lambda.1se = as.vector(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se)), row.names = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min)))
plot(cv_slr_Trg_lasso)
By using BIC as the selection criterion, forward stepwise selection method chose the model with 10 variables (hence dropped 8 variables), while the backward stepwise selection method resulted in a model with 11 variables (and 7 variables dropped). Whereas Lasso combined with 10-fold cross-validation to select the optimal tuning parameter \(\lambda\) resulted in a model with 17 variables (only 1 variable dropped) when using \(\lambda_{\min} =\) 7.584324810^{-5} and another model with 8 variables (hence dropped 10 variables) when using \(\lambda_{1,SE} =\) 0.0072396.
The variables that have been dropped based on each of the method are given as:
#Stepwise Forward Selection
colnames(data3)[4:21][!colnames(data3)[4:21] %in% names(coef(slr_Trg_fwd, which.min(summary(slr_Trg_fwd)$bic)))]
## [1] "HeatVaporization_max_value" "IsDBlock_composition_average"
## [3] "MeltingT_difference" "NsUnfilled_difference"
## [5] "Site1_Density" "Site1_HeatFusion"
## [7] "Site1_IsDBlock" "Site1_NdValence"
#Stepwise Backward Selection
colnames(data3)[4:21][!colnames(data3)[4:21] %in% names(coef(slr_Trg_bwd, which.min(summary(slr_Trg_bwd)$bic)))]
## [1] "HeatVaporization_max_value" "IsDBlock_composition_average"
## [3] "MeltingT_difference" "NdValence_composition_average"
## [5] "NdValence_difference" "NsUnfilled_difference"
## [7] "Site1_SpecificHeatCapacity"
#Lasso with lambda.min
rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min) == 0)]#Lasso with lambda.1se
## [1] "Site1_SpecificHeatCapacity"
rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se) == 0)]
## [1] "Density_composition_average" "HeatVaporization_difference"
## [3] "HeatVaporization_max_value" "MeltingT_difference"
## [5] "NdValence_difference" "NValance_composition_average"
## [7] "Site1_Density" "Site1_HeatFusion"
## [9] "Site1_NdValence" "Site1_SpecificHeatCapacity"
Therefore, the aggressiveness in terms of dropping variables from the strongest to the weakest are: Lasso-1se > Forward Stepwise > Backward Stepwise > Lasso-min. Interestingly, Lasso-1se/Backward Stepwise/Lasso-min dropped Site1_SpecificHeatCapacity
, but not Forward Stepwise.
The following presents the comparison of the residual sum of squares (RSS), adjusted R\(^2\) (AdjR2), Mallow CP and BIC for the models selected by backward stepwise and forward stepwise method, indicating that backward stepwise outperforms the forward stepwise method in terms of all criterion.
compare_fwd_bwd = data.frame(rbind(rbind(summary(slr_Trg_fwd)$rss, summary(slr_Trg_fwd)$adjr2, summary(slr_Trg_fwd)$cp, summary(slr_Trg_fwd)$bic)[,which.min(summary(slr_Trg_fwd)$bic)],
rbind(summary(slr_Trg_bwd)$rss, summary(slr_Trg_bwd)$adjr2, summary(slr_Trg_bwd)$cp, summary(slr_Trg_bwd)$bic)[,which.min(summary(slr_Trg_bwd)$bic)]
))
rownames(compare_fwd_bwd) = c('Forward','Backward')
colnames(compare_fwd_bwd) = c('RSS','AdjR2','Mallow-CP','BIC')
compare_fwd_bwd
Based on the four sets of selected variables - Forward, Backward, Lasso-1se, Lasso-min, we then examine their corresponding VIFs:
#VIF seems to be better compared to using all features, although still some problems with (Site2_MeltingT,Site2_BCCenergy_pa,BCCenergy_pa_composition_average)
var_all = colnames(data3)[4:21]
var_fwd = names(coef(slr_Trg_fwd, which.min(summary(slr_Trg_fwd)$bic)))[-1]
var_bwd = names(coef(slr_Trg_bwd, which.min(summary(slr_Trg_bwd)$bic)))[-1]
var_lasso_1se = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.1se) != 0)][-1]
var_lasso_min = rownames(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min))[which(predict(out_slr_Trg_lasso ,type="coefficients",s=cv_slr_Trg_lasso$lambda.min) != 0)][-1]
slr_Trg_fwd_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_fwd))])
slr_Trg_bwd_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
slr_Trg_lasso_1se_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
slr_Trg_lasso_min_fit = lm(Trg ~ . , data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])
#All
vif(slr_Trg_fit)
## BoilingT_difference Density_composition_average
## 14.567801 28.925799
## HeatVaporization_difference HeatVaporization_max_value
## 37.770249 13.531181
## IsBoron_composition_average IsDBlock_composition_average
## 2.288699 21.110220
## MeltingT_difference NdValence_composition_average
## 5.466689 14.528577
## NdValence_difference NsUnfilled_difference
## 2.507814 2.251670
## NValance_composition_average Site1_Density
## 7.435110 22.696909
## Site1_HeatCapacityMass Site1_HeatFusion
## 65719.798341 6.526110
## Site1_IsDBlock Site1_NdValence
## 20.592039 14.086964
## Site1_SpecificHeatCapacity valence_difference
## 65629.954937 3.040346
#Stepwise Forward Selection
vif(slr_Trg_fwd_fit)
## BoilingT_difference Density_composition_average
## 9.788566 11.276896
## HeatVaporization_difference IsBoron_composition_average
## 12.700805 1.555682
## NdValence_composition_average NdValence_difference
## 5.713336 1.530404
## NValance_composition_average Site1_HeatCapacityMass
## 6.235090 38965.787462
## Site1_SpecificHeatCapacity valence_difference
## 38920.907349 2.334048
#Stepwise Backward Selection
vif(slr_Trg_bwd_fit)
## BoilingT_difference Density_composition_average
## 10.604841 11.261591
## HeatVaporization_difference IsBoron_composition_average
## 14.246700 1.284963
## NValance_composition_average Site1_Density
## 6.406883 12.084051
## Site1_HeatCapacityMass Site1_HeatFusion
## 2.644843 5.022383
## Site1_IsDBlock Site1_NdValence
## 11.552154 7.399463
## valence_difference
## 2.784167
#Lasso 1se
vif(slr_Trg_lasso_1se_fit)
## BoilingT_difference IsBoron_composition_average
## 1.468956 1.371307
## IsDBlock_composition_average NdValence_composition_average
## 8.776673 3.577632
## NsUnfilled_difference Site1_HeatCapacityMass
## 1.446377 1.743887
## Site1_IsDBlock valence_difference
## 9.274189 2.219276
#Lasso min
vif(slr_Trg_lasso_min_fit)
## BoilingT_difference Density_composition_average
## 13.671398 28.579766
## HeatVaporization_difference HeatVaporization_max_value
## 35.664099 11.886105
## IsBoron_composition_average IsDBlock_composition_average
## 1.857241 18.362539
## MeltingT_difference NdValence_composition_average
## 5.458959 14.078186
## NdValence_difference NsUnfilled_difference
## 2.466051 2.199454
## NValance_composition_average Site1_Density
## 7.299569 22.431616
## Site1_HeatCapacityMass Site1_HeatFusion
## 2.980744 5.814029
## Site1_IsDBlock Site1_NdValence
## 20.334514 12.878039
## valence_difference
## 3.038898
Compared to the full model, the model selected by different variable selection methods seem to have less multicollinearity issues indicated by the smaller VIFs, except for the model selected by stepwise forward as it did not drop Site1_SpecificHeatCapacity
nor Site1_HeatCapacityMass
, and these two features are highly correlated.
Next, we look at the residual diagnostic plots for each of the models above:
#Diagnostic Plots
par(mfrow = c(2,2))
#All
plot(slr_Trg_fit, which = c(1,2,4,5))
#Stepwise Forward Selection
plot(slr_Trg_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(slr_Trg_bwd_fit, which = c(1,2,4,5))
#Lasso 1se
plot(slr_Trg_lasso_1se_fit, which = c(1,2,4,5))
#Lasso min
plot(slr_Trg_lasso_min_fit, which = c(1,2,4,5))
All residuals plots seem good, while the QQplots indicate some departure from normality at lower and upper tails.
In addition, the Cook’s distance plots suggest that there are a few highly influential instances which are related to Al/Au/Pd/Ti majority element.
data3[c(31,35,36,473,487),]
Through further investigation, it can be found that Pd95Si5 and Ti40Zr10Cu34Pd16 have large magnitude of standardized residuals, which caused them to be identified as influential instances. Furthermore, Al89.5Ni3.5Y7, Au77.8Si8.4Ge13.8 and Au77Si9.4Ge13.6 are also flagged as influential due to their Trg being in the range of 0.22 to 0.38 which are much smaller than the other instances, and their relatively large standardized residuals. Furthermore, Al89.5Ni3.5Y7, Au77.8Si8.4Ge13.8 and Au77Si9.4Ge13.6 have high leverages, which is caused by some of their features taking extremely large/small values. For example, Al89.5Ni3.5Y7 has the highest IsBoron_composition_average
, Au77.8Si8.4Ge13.8 have the highest Density_composition_average
and NValance_composition_average
, and Au77Si9.4Ge13.6 has the lowest Site1_HeatCapacityMass
In this section, we make use of the above four sets of chosen features and fit regression model where we add quadratic terms of each selected feature into the model.
quadratic_slr_Trg_formula =as.formula(
paste(
'Trg', '~', paste(
var_all, collapse = '+'
), '+', paste(
'I(',var_all,'^2)', collapse = '+'
)
)
)
quadratic_slr_Trg_fwd_formula =as.formula(
paste(
'Trg', '~', paste(
var_fwd, collapse = '+'
), '+', paste(
'I(',var_fwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_Trg_bwd_formula =as.formula(
paste(
'Trg', '~', paste(
var_bwd, collapse = '+'
), '+', paste(
'I(',var_bwd,'^2)', collapse = '+'
)
)
)
quadratic_slr_Trg_lasso_1se_formula =as.formula(
paste(
'Trg', '~', paste(
var_lasso_1se, collapse = '+'
), '+', paste(
'I(',var_lasso_1se,'^2)', collapse = '+'
)
)
)
quadratic_slr_Trg_lasso_min_formula =as.formula(
paste(
'Trg', '~', paste(
var_lasso_min, collapse = '+'
), '+', paste(
'I(',var_lasso_min,'^2)', collapse = '+'
)
)
)
#All
quadratic_slr_Trg_fit = lm(quadratic_slr_Trg_formula, data = data3[,c(3,which(colnames(data3) %in% var_all))])
#quadratic_slr_Trg_formula
#Stepwise Forward Selection
quadratic_slr_Trg_fwd_fit = lm(quadratic_slr_Trg_fwd_formula, data = data3[,c(3,which(colnames(data3) %in% var_fwd))])
#quadratic_slr_Trg_fwd_formula
#Stepwise Backward Selection
quadratic_slr_Trg_bwd_fit = lm(quadratic_slr_Trg_bwd_formula, data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
#quadratic_slr_Trg_bwd_formula
#Lasso-1se
quadratic_slr_Trg_lasso_1se_fit = lm(quadratic_slr_Trg_lasso_1se_formula, data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
#quadratic_slr_Trg_lasso_1se_formula
#Lasso-min
quadratic_slr_Trg_lasso_min_fit = lm(quadratic_slr_Trg_lasso_min_formula, data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])
#quadratic_slr_Trg_lasso_min_formula
The residual diagnostic plots of each fitted model with quadratic terms are given below:
par(mfrow = c(2,2))
#All
plot(quadratic_slr_Trg_fit, which = c(1,2,4,5))
#Stepwise Forward Selection
plot(quadratic_slr_Trg_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(quadratic_slr_Trg_bwd_fit, which = c(1,2,4,5))
#Lasso-1se
plot(quadratic_slr_Trg_lasso_1se_fit, which = c(1,2,4,5))
#Lasso-min
plot(quadratic_slr_Trg_lasso_min_fit, which = c(1,2,4,5))
In general, the residuals plot of the model using quadratic terms are similar to those of the model involving only linear terms, which demonstrate no obvious problem. Additionally, the QQplots of the model using quadratic terms are slightly improved especially at the upper tails. The Cook’s distance plots identified the following influential observations associated with Ca/Pd/Ti majority element, which mostly overlap with those identified from the Cook’s distance plots of the models involving only linear terms:
data3[c(32,473,487),]
In this section, we make use of the above four sets of chosen features and fit regression model where we add all possible pairwise interaction terms of the selected features into the model. The main motivation for using interaction terms is to allow for the possibility that the effect of a feature on Trg to be dependent on the values of other features.
interact_slr_Trg_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_all ))])
interact_slr_Trg_fwd_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_fwd ))])
interact_slr_Trg_bwd_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_bwd))])
interact_slr_Trg_lasso_1se_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_lasso_1se))])
interact_slr_Trg_lasso_min_fit = lm(Trg ~ .^2, data = data3[,c(3,which(colnames(data3) %in% var_lasso_min))])
The residual diagnostic plots of each fitted model with interaction terms are given below:
par(mfrow = c(2,2))
#All
plot(interact_slr_Trg_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Stepwise Forward Selection
plot(interact_slr_Trg_fwd_fit, which = c(1,2,4,5))
#Stepwise Backward Selection
plot(interact_slr_Trg_bwd_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
#Lasso-1se
plot(interact_slr_Trg_lasso_1se_fit, which = c(1,2,4,5))
#Lasso-min
plot(interact_slr_Trg_lasso_min_fit, which = c(1,2,4,5))
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Again, the residuals plots of the model involving interaction terms are highly similar to those of the model involving only linear terms, which show no major issue. Also, the QQplots of the models involving interaction terms are slightly improved, especially near the upper tails. The Cook’s distance plots identified the following influential observations:
data3[c(32,84,231,303,473,482,476),]
Next, we provide some interpretation into the identification of outliers based on the model interact_slr_Trg_lasso_1se_fit
, namely Ca65Ga35 and Pd95Si5. These two instances have large magnitude of standardized residuals with values around 5. Furthermore, their leverages are also relatively high compared to the other instances. In particular, Ca65Ga35 has the lowest IsDBlock_composition_average
, while Pd95Si5 has the highest NdValence_composition_average
together with the lowest BoilingT_difference
and IsBoron_composition_average
in the dataset. Therefore, the large standardized residuals combined with the relatively high leverages caused these two instances to be flagged as influential in the fitted model using pairwise interaction terms of features selected by Lasso-1se.
We also add some interpretation into the identification of outliers based on the model interact_slr_Trg_lasso_min_fit
, i.e., Au35Ca65, Ga18Sr82 and Pt42.5Cu27Ni59.5P21. These instances all have leverages around 0.99, primarily due to some of their features taking extremely large/small values. For example, Au35Ca65 have the smallest IsBoron_composition_average
and Site1_HeatCapacityMass
, Ga18Sr82 has the smallest IsDBlock_composition_average
, and Pt42.5Cu27Ni59.5P21 has the largest Site1_Density
. Their extremely high leverages combined with their relatively large standardized residuals (1.44, 1.79 and -3.86 for Au35Ca65, Ga18Sr82 and Pt42.5Cu27Ni59.5P21, respetively) resulted in them being identifed as influential instances.
The following table compares the different fitted models in terms of the adjusted R\(^2\):
data.frame(Linear = c(summary(slr_Trg_fit)$adj.r.squared,
summary(slr_Trg_fwd_fit)$adj.r.squared,
summary(slr_Trg_bwd_fit)$adj.r.squared,
summary(slr_Trg_lasso_1se_fit)$adj.r.squared,
summary(slr_Trg_lasso_min_fit)$adj.r.squared),
Linear_and_Quadratic = c(summary(quadratic_slr_Trg_fit)$adj.r.squared,
summary(quadratic_slr_Trg_fwd_fit)$adj.r.squared,
summary(quadratic_slr_Trg_bwd_fit)$adj.r.squared,
summary(quadratic_slr_Trg_lasso_1se_fit)$adj.r.squared,
summary(quadratic_slr_Trg_lasso_min_fit)$adj.r.squared),
Linear_and_Interaction = c(summary(interact_slr_Trg_fit)$adj.r.squared,
summary(interact_slr_Trg_fwd_fit)$adj.r.squared,
summary(interact_slr_Trg_bwd_fit)$adj.r.squared,
summary(interact_slr_Trg_lasso_1se_fit)$adj.r.squared,
summary(interact_slr_Trg_lasso_min_fit)$adj.r.squared), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)
The following table compares the different fitted models in terms of the number of parameters (regression coefficients + residual variance) to be estimated:
data.frame(Linear = c(sum(!is.na(slr_Trg_fit$coefficients)) + 1,
sum(!is.na(slr_Trg_fwd_fit$coefficients)) + 1,
sum(!is.na(slr_Trg_bwd_fit$coefficients)) + 1,
sum(!is.na(slr_Trg_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(slr_Trg_lasso_min_fit$coefficients)) + 1),
Linear_and_Quadratic = c(sum(!is.na(quadratic_slr_Trg_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_Trg_fwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_Trg_bwd_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_Trg_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(quadratic_slr_Trg_lasso_min_fit$coefficients)) + 1),
Linear_and_Interaction = c(sum(!is.na(interact_slr_Trg_fit$coefficients)) + 1,
sum(!is.na(interact_slr_Trg_fwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_Trg_bwd_fit$coefficients)) + 1,
sum(!is.na(interact_slr_Trg_lasso_1se_fit$coefficients)) + 1,
sum(!is.na(interact_slr_Trg_lasso_min_fit$coefficients)) + 1), row.names = c('All','Forward','Backward','Lasso-1se','Lasso-min')
)