import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import tree
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.cm as cm
from sklearn.decomposition import PCA
from ELib import *
import warnings
warnings.filterwarnings('ignore')
import ResidualDecomposition as RD
np.random.seed(0)
data = pd.read_csv("../../Data/applied/Perovskite_Stability_with_features.csv")
data.head()
Material Composition | A site #1 | A site #2 | A site #3 | B site #1 | B site #2 | B site #3 | X site | Number of elements | energy_above_hull (meV/atom) | ... | host_Asite0_IsBCC | host_Asite0_IsCubic | host_Asite0_IsAlkali | host_Asite0_OrbitalD | host_Asite0_NsValence | host_Bsite0_At. # | host_Bsite0_IsHexagonal | host_Bsite0_IsNoblegas | Asite_IsAlkali_max | Bsite_IsMetal_max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ba1Sr7V8O24 | Ba | Sr | NaN | V | NaN | NaN | O | 4 | 29.747707 | ... | 0 | 0 | 0 | 0 | 2 | 23 | 0 | 0 | 1 | 1 |
1 | Ba2Bi2Pr4Co8O24 | Ba | Bi | Pr | Co | NaN | NaN | O | 5 | 106.702335 | ... | 0 | 0 | 0 | 0 | 2 | 27 | 1 | 0 | 1 | 1 |
2 | Ba2Ca6Fe8O24 | Ba | Ca | NaN | Fe | NaN | NaN | O | 4 | 171.608093 | ... | 0 | 0 | 0 | 0 | 2 | 26 | 0 | 0 | 1 | 1 |
3 | Ba2Cd2Pr4Ni8O24 | Ba | Cd | Pr | Ni | NaN | NaN | O | 5 | 284.898190 | ... | 0 | 0 | 0 | 0 | 2 | 28 | 0 | 0 | 1 | 1 |
4 | Ba2Dy6Fe8O24 | Ba | Dy | NaN | Fe | NaN | NaN | O | 4 | 270.007913 | ... | 0 | 0 | 0 | 0 | 2 | 26 | 0 | 0 | 1 | 1 |
5 rows × 81 columns
Y = data['energy_above_hull (meV/atom)']
X = data.iloc[:,12:]
X = scale_data(X).values
rf = RandomForestRegressor(n_estimators=25)
rf.fit(X, Y)
features_vals = np.sort(rf_best.feature_importances_)
features_args = np.argsort(rf_best.feature_importances_)
plt.xticks(rotation=90)
plt.barh(data.columns[4:][features_args][:20], features_vals[:20])
plt.title("Feature Importances according to Random Forest")
plt.xlabel("Relative Importance Value")
plt.ylabel("Feature Descriptor")
Text(0, 0.5, 'Feature Descriptor')
data_lr = np.genfromtxt("Data/LR_PS.csv", delimiter=',')
data_lr_composition = data_lr
summed_composition = np.sum(data_lr_composition, axis=0)
data_lr_contribution = ((data_lr_composition.T * -np.sign(summed_composition))).T
data_rf = np.genfromtxt("Data/RF_PS.csv", delimiter=',')
data_rf_composition = data_rf
summed_composition = np.sum(data_rf_composition, axis=0)
data_rf_contribution = ((data_rf_composition.T * -np.sign(summed_composition))).T
def plot_group(elements_subgroup, data_d, index_i):
g1 = d[d.iloc[:,index_i].isin(elements_subgroup)].index
g2 = d[~d.iloc[:,index_i].isin(elements_subgroup)].index
# plt.xlabel("Composition Mean")
# plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition, axis=0)[g2], np.mean(data_rf_contribution, axis=1)[g2], c='b', s=10)
plt.scatter(np.mean(data_rf_composition, axis=0)[g1], np.mean(data_rf_contribution, axis=1)[g1], c='r', s=10)
plt.figure(figsize=(14,10))
plt.subplot(221)
# plt.title("CC-Plot for DSD using RF Colored by Trg")
plt.xlabel("Composition Mean", fontsize=20)
plt.ylabel("Contribution Mean", fontsize=20)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_lr_composition, axis=0), np.mean(data_lr_contribution, axis=1), c=Y)
plt.xticks(fontsize=12, rotation=15)
plt.yticks(fontsize=12)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.subplot(222)
# plt.title("CC-Plot for DSD using RF Colored by Trg")
plt.xlabel("Composition Mean", fontsize=20)
plt.ylabel("Contribution Mean", fontsize=20)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition, axis=0), np.mean(data_rf_contribution, axis=1), c=Y)
plt.xticks(fontsize=12, rotation=15)
plt.yticks(fontsize=12)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.savefig("Figures/CC_PS_lr_rf.pdf", bbox_inches='tight')
plt.figure(figsize=(10,8))
elements, counts = np.unique(data.iloc[:,1], return_counts=True)
sorted_counts = np.argsort(counts)
plt.title("CC-Plot for PS grouped by A Site #1")
plt.xlabel("Composition Mean")
plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
cmap = cm.get_cmap('viridis', len(elements))
color_count = 0
for e in elements:
indices = np.where(data['A site #1'] == e)
plt.scatter(np.mean(data_rf_composition, axis=0)[indices], np.mean(data_rf_contribution, axis=1)[indices]\
,color=cmap.colors[color_count], label=e, s=20, marker=(3, 0, color_count*45))
color_count+=1
plt.legend()
<matplotlib.legend.Legend at 0x126fcec3be0>
plt.figure(figsize=(10,8))
elements, counts = np.unique(data.iloc[:,4], return_counts=True)
sorted_counts = np.argsort(counts)
plt.title("CC-Plot for PS grouped by B Site #1")
plt.xlabel("Composition Mean")
plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
cmap = cm.get_cmap('viridis', len(elements))
color_count = 0
for e in elements:
indices = np.where(data['B site #1'] == e)
plt.scatter(np.mean(data_rf_composition, axis=0)[indices], np.mean(data_rf_contribution, axis=1)[indices]\
, color=cmap.colors[color_count], label=e, s=20, marker=(3, 0, color_count*45))
color_count+=1
plt.legend()
<matplotlib.legend.Legend at 0x126c5724940>
uniq_elementsA = np.unique(data['A site #1'])
uniq_elements_countA = len(uniq_elementsA)
uniq_elementsB = np.unique(data['A site #1'])
uniq_elements_countB = len(uniq_elementsB)
avc_matrix = np.zeros((uniq_elements_countA, uniq_elements_countB))
for y in range(0, uniq_elements_countA):
for z in range(0, uniq_elements_countB):
a = np.where(data['A site #1'] == uniq_elementsA[y])[0]
b = np.where(data['A site #1'] == uniq_elementsB[z])[0]
avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)
plt.figure(figsize=(12,10))
plt.xticks(ticks=np.arange(0, uniq_elements_countA), labels=uniq_elementsA, fontsize=14)
plt.yticks(ticks=np.arange(0, uniq_elements_countB), labels=uniq_elementsB, fontsize=14)
plt.imshow(avc_matrix.T)
plt.ylabel("A Site Element Contributing Towards", fontsize=24)
plt.xlabel("A Site Element Contribution Target", fontsize=24)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.savefig("Figures/Perov_heatmap_Asite.pdf", bbox_inches='tight')
uniq_elementsA = np.unique(data['B site #1'])
uniq_elements_countA = len(uniq_elementsA)
uniq_elementsB = np.unique(data['B site #1'])
uniq_elements_countB = len(uniq_elementsB)
avc_matrix = np.zeros((uniq_elements_countA, uniq_elements_countB))
for y in range(0, uniq_elements_countA):
for z in range(0, uniq_elements_countB):
a = np.where(data['B site #1'] == uniq_elementsA[y])[0]
b = np.where(data['B site #1'] == uniq_elementsB[z])[0]
avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)
plt.figure(figsize=(12,10))
plt.xticks(ticks=np.arange(0, uniq_elements_countA), labels=uniq_elementsA, fontsize=14)
plt.yticks(ticks=np.arange(0, uniq_elements_countB), labels=uniq_elementsB, fontsize=14)
plt.imshow(avc_matrix.T)
plt.xlabel("B Site Elements", fontsize=24)
plt.ylabel("B Site Elements", fontsize=24)
cbar = plt.colorbar()
cbar.ax.tick_params()
plt.savefig("Figures/Perov_heatmap_Bsite.pdf", bbox_inches='tight')