import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import tree
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from ELib import *
import warnings
warnings.filterwarnings('ignore')
import ResidualDecomposition as RD
np.random.seed(0)
data = pd.read_csv("../../Data/applied/Dilute_Solute_Diffusion_with_features.csv")
def plot_group(elements_subgroup, data_d, index_i):
g1 = d[d.iloc[:,index_i].isin(elements_subgroup)].index
g2 = d[~d.iloc[:,index_i].isin(elements_subgroup)].index
# plt.xlabel("Composition Mean")
# plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition, axis=0)[g2], np.mean(data_rf_contribution, axis=1)[g2], c='b', s=10)
plt.scatter(np.mean(data_rf_composition, axis=0)[g1], np.mean(data_rf_contribution, axis=1)[g1], c='r', s=10)
Y = data['E_raw (eV)']
Y_N = data['Enorm (eV)']
# Y = data['Enorm (eV)']
X = data.iloc[:,4:]
feature_names = X.columns
X = X.values
data.head(6)
Material compositions 1 | Material compositions 2 | Enorm (eV) | E_raw (eV) | Site2_MeltingT | Site1_MendeleevNumber | Site1_MiracleRadius | GSestFCClatcnt_max_value | Site2_BCCenergy_pa | Site1_BCCfermi | ... | BCCenergy_pa_composition_average | MiracleRadius_min_value | MeltingT_min_value | NUnfilled_max_value | Site2_Group | Site1_CovalentRadii | Site2_NUnfilled | SpecificHeatCapacity_difference | Site1_Electronegativity | BCCenergy_pa_arithmetic_average | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ag | Ag | 0.000000 | 1.824450 | -0.531814 | 0.402504 | 0.263045 | -0.626925 | 1.021337 | -1.217816 | ... | 1.386573 | 0.859808 | -0.046109 | -1.546561 | 0.626615 | 0.690287 | -1.036320 | -1.057898 | 0.099471 | 1.386573 |
1 | Ag | Co | -0.090142 | 1.734308 | 0.064051 | 0.402504 | 0.263045 | -0.626925 | -0.248457 | -1.217816 | ... | 0.484851 | -0.730180 | -0.046109 | -0.801582 | 0.116318 | 0.690287 | -0.391288 | -0.388610 | 0.099471 | 0.484851 |
2 | Ag | Cr | 0.259139 | 2.083589 | 0.524584 | 0.402504 | 0.263045 | -0.626925 | -1.000991 | -1.217816 | ... | -0.049547 | -0.311762 | -0.046109 | 0.315885 | -0.649128 | 0.690287 | 0.576260 | -0.287857 | 0.099471 | -0.049547 |
3 | Ag | Cu | -0.022200 | 1.802250 | -0.394504 | 0.402504 | 0.263045 | -0.626925 | 0.746249 | -1.217816 | ... | 1.191224 | -0.646497 | -0.046109 | -1.546561 | 0.626615 | 0.690287 | -1.036320 | -0.518150 | 0.099471 | 1.191224 |
4 | Ag | Fe | 0.317672 | 2.142122 | 0.112116 | 0.402504 | 0.263045 | -0.626925 | -0.637552 | -1.217816 | ... | 0.208542 | -0.730180 | -0.046109 | -0.429093 | -0.138831 | 0.690287 | -0.068772 | -0.287857 | 0.099471 | 0.208542 |
5 | Ag | Mn | 0.202186 | 2.026636 | -0.214281 | 0.402504 | 0.263045 | -0.626925 | -0.811899 | -1.217816 | ... | 0.084733 | -0.144395 | -0.046109 | -0.056604 | -0.393979 | 0.690287 | 0.253744 | -0.176309 | 0.099471 | 0.084733 |
6 rows × 29 columns
values1 = []
for l in np.unique(data['Material compositions 1']):
indices = np.where(data['Material compositions 1'] == l)
values1.append(np.mean(cooks[indices]))
values2 = []
for l in np.unique(data['Material compositions 2']):
indices = np.where(data['Material compositions 2'] == l)
values2.append(np.mean(cooks[indices]))
plt.figure(figsize=(14,10))
plt.subplot(223)
plt.title("Effect on the Model")
plt.xlabel("Material Composition 1")
plt.ylabel("Cooks Distance")
plt.bar(np.unique(data['Material compositions 1']), values1)
plt.axhline(np.mean(cooks), c='r')
plt.subplot(211)
plt.title("Effect on the Model")
plt.xlabel("Material Composition 2")
plt.ylabel("Cooks Distance")
plt.xticks(rotation=70)
plt.bar(np.unique(data['Material compositions 2']), values2)
<BarContainer object of 58 artists>
values1 = []
for l in np.unique(data['Material compositions 1']):
indices = np.where(data['Material compositions 1'] == l)
values1.append(np.mean(dffits[indices]))
values2 = []
for l in np.unique(data['Material compositions 2']):
indices = np.where(data['Material compositions 2'] == l)
values2.append(np.mean(dffits[indices]))
plt.figure(figsize=(14,10))
plt.subplot(223)
plt.title("Effect on the Model")
plt.xlabel("Material Composition 1")
plt.ylabel("Difference in Fits")
plt.bar(np.unique(data['Material compositions 1']), values1)
plt.axhline(np.mean(dffits), c='r')
plt.subplot(211)
plt.title("Effect on the Model")
plt.xlabel("Material Composition 2")
plt.ylabel("Difference in Fits")
plt.xticks(rotation=70)
plt.bar(np.unique(data['Material compositions 2']), values2)
<BarContainer object of 58 artists>
These diagrams immediately tell us that Lead (Pb) and Calcium (Ca) as the first material composition has the most effect on the model compared to the other elements.
We see that Potassium (K), Lithium (Li) and Thorium (Th) have the largest effect as the secondary material composition.
We confirm using the Residual Decomposition SHAPley values (RDSHAP) that linear regression is indeed a poor fit for this model below
data_lr = np.genfromtxt("Data/LR_DSD.csv", delimiter=',')
data_lr_composition = data_lr
summed_composition = np.sum(data_lr_composition, axis=0)
data_lr_contribution = ((data_lr_composition.T * -np.sign(summed_composition))).T
data_rf = np.genfromtxt("Data/RF_DSD.csv", delimiter=',')
data_rf_composition = data_rf
summed_composition = np.sum(data_rf_composition, axis=0)
data_rf_contribution = ((data_rf_composition.T * -np.sign(summed_composition))).T
data_rf_norm = np.genfromtxt("Data/RF_DSD_norm.csv", delimiter=',')
data_rf_composition_norm = data_rf_norm
summed_composition = np.sum(data_rf_composition_norm, axis=0)
data_rf_contribution_norm = ((data_rf_composition_norm.T * -np.sign(summed_composition))).T
plt.figure(figsize=(18,14))
plt.subplot(331)
# plt.title("CC-Plot for DSD using LR Colored by Enorm")
plt.xlabel("Composition Mean", fontsize=16)
plt.ylabel("Contribution Mean", fontsize=16)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_lr_composition, axis=0), np.mean(data_lr_contribution, axis=1), c=Y)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.subplot(332)
# plt.title("CC-Plot for DSD using RF Colored by Enorm")
plt.xlabel("Composition Mean", fontsize=16)
plt.ylabel("Contribution Mean", fontsize=16)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition, axis=0), np.mean(data_rf_contribution, axis=1), c=Y)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.subplot(333)
# plt.title("CC-Plot for DSD using RF Colored by Enorm")
plt.xlabel("Composition Mean", fontsize=16)
plt.ylabel("Contribution Mean", fontsize=16)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition_norm, axis=0), np.mean(data_rf_contribution_norm, axis=1), c=Y)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.savefig("Figures/CC_DSD_lr_rf.pdf", bbox_inches='tight')
elements, counts = np.unique(data.iloc[:,0], return_counts=True)
sorted_counts = np.argsort(counts)
plt.figure(figsize=(14,10))
plt.subplot(223)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 1'] == e)
influence_vals.append(np.mean(np.mean((data_lr_contribution)[indices], axis=0)))
plt.title("Relative Influence of the instances within each main element group")
plt.bar(elements, influence_vals)
plt.xlabel("Element (1)")
plt.ylabel("Relative Contribution (Influence) Level")
plt.axhline(np.mean(data_lr_contribution), c='r')
plt.subplot(224)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 1'] == e)[0]
influence_vals.append(np.mean(data['E_raw (eV)'][indices]))
plt.title("Average E_raw (eV) by element group")
plt.xlabel("Element (1)")
plt.ylabel("Average E_raw (eV)")
plt.bar(elements, influence_vals)
plt.subplot(211)
plt.xticks(rotation=90)
elements, counts = np.unique(data.iloc[:,1], return_counts=True)
sorted_counts = np.argsort(counts)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 2'] == e)
influence_vals.append(np.mean(np.mean((data_lr_contribution)[indices], axis=0)))
plt.title("Relative Influence of the instances within each sub element group")
plt.bar(elements, influence_vals)
<BarContainer object of 58 artists>
We see the non-uniformity in the CC-plot generated from a Linear Regression model on this DSD dataset, on the right we see the much more uniform CC-plot using a Random Forest model which demonstrates a better model fit for the data.
We also see that some yellow and purple instances with boundary values for eV tend to have a larger effect on this model. We visualise the effect of each group similar to the linear regression case.
elements, counts = np.unique(data.iloc[:,0], return_counts=True)
sorted_counts = np.argsort(counts)
plt.figure(figsize=(14,10))
plt.subplot(223)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 1'] == e)
influence_vals.append(np.mean(np.mean((data_rf_contribution)[indices], axis=0)))
plt.title("Relative Influence of the instances within each main element group")
plt.bar(elements, influence_vals)
plt.xlabel("Element (1)")
plt.ylabel("Relative Contribution (Influence) Level")
plt.axhline(np.mean(data_rf_contribution), c='r')
plt.subplot(224)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 1'] == e)[0]
influence_vals.append(np.mean(data['E_raw (eV)'][indices]))
plt.title("Average E_raw (eV) by element group")
plt.xlabel("Element (1)")
plt.ylabel("Average E_raw (eV)")
plt.bar(elements, influence_vals)
plt.subplot(211)
plt.xticks(rotation=90)
elements, counts = np.unique(data.iloc[:,1], return_counts=True)
sorted_counts = np.argsort(counts)
influence_vals = []
for e in elements:
indices = np.where(data['Material compositions 2'] == e)
influence_vals.append(np.mean(np.mean((data_rf_contribution)[indices], axis=0)))
plt.title("Relative Influence of the instances within each sub element group")
plt.bar(elements, influence_vals)
<BarContainer object of 58 artists>
We see that the Lead (Pb) Material composition 1 again has the greatest effect on the model, however, the rest of the elements tend to have lower effects, what is interesting is the Iridium (Ir) group has negative contribution which tends to decrease the model prediction errors.
We see that the material composition 2 values are much more varied in this case, with several elements with negative effects that decrease the residuals of the model.
We carry out further analysis of the CC plot to determine various the effects of various groupings of the data
# Grouping by https://pubchem.ncbi.nlm.nih.gov/periodic-table/
d = data
i = 0
plt.figure(figsize=(14,14))
plt.subplot(331)
plt.title("Row 1 Transition Metals")
r1_metals = ['Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu']
plot_group(r1_metals, d, i)
plt.subplot(332)
plt.title("Row 2 Transition Metals")
r2_metals = ['Zr', 'Nb', "Mo", 'Tc', "Ru", 'Rh', 'Pd', 'Ag']
plot_group(r2_metals, d, i)
plt.subplot(333)
plt.title("Row 3 Transition Metals")
r3_metals = ['Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au']
plot_group(r3_metals, d, i)
# plt.savefig("Figures/
r1_count = 0
for zzz in r1_metals:
r1_count += (len(data[data['Material compositions 1'] == zzz]))
r2_count = 0
for zzz in r2_metals:
r2_count += (len(data[data['Material compositions 1'] == zzz]))
r3_count = 0
for zzz in r3_metals:
r3_count += (len(data[data['Material compositions 1'] == zzz]))
uniq_elements = np.unique(data['Material compositions 1'])
uniq_elements_count = len(uniq_elements)
avc_matrix = np.zeros((uniq_elements_count, uniq_elements_count))
for y in range(0, uniq_elements_count):
for z in range(0, uniq_elements_count):
a = np.where(data['Material compositions 1'] == uniq_elements[y])[0]
b = np.where(data['Material compositions 1'] == uniq_elements[z])[0]
avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)
plt.figure(figsize=(12,10))
plt.xticks(ticks=np.arange(0, uniq_elements_count), labels=uniq_elements, fontsize=18)
plt.yticks(ticks=np.arange(0, uniq_elements_count), labels=uniq_elements, fontsize=18)
plt.imshow(avc_matrix)
cbar = plt.colorbar()
cbar.ax.tick_params()
ticklabs = cbar.ax.get_yticks()
cbar.ax.set_yticklabels(ticklabs, fontsize=18)
plt.savefig("Figures/DSD_heatmap_mc1.pdf", bbox_inches='tight')
plt.figure(figsize=(12,10))
uniq_elements = np.unique(data['Material compositions 2'])
uniq_elements_count = len(uniq_elements)
avc_matrix = np.zeros((uniq_elements_count, uniq_elements_count))
for y in range(0, uniq_elements_count):
for z in range(0, uniq_elements_count):
a = np.where(data['Material compositions 2'] == uniq_elements[y])[0]
b = np.where(data['Material compositions 2'] == uniq_elements[z])[0]
avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)
counter = 1
unique_elements_display = []
for i in uniq_elements:
if counter % 2 == 0:
unique_elements_display.append('{}⟼'.format(i))
else:
unique_elements_display.append(i)
counter += 1
plt.xticks(ticks=np.arange(0, uniq_elements_count), labels=unique_elements_display, rotation=90, fontsize=15)
plt.yticks(ticks=np.arange(0, uniq_elements_count), labels=unique_elements_display, fontsize=15)
plt.imshow(avc_matrix)
cbar = plt.colorbar()
cbar.ax.tick_params()
ticklabs = cbar.ax.get_yticks()
cbar.ax.set_yticklabels(ticklabs, fontsize=18)
plt.savefig("Figures/DSD_heatmap_mc2.pdf", bbox_inches='tight')