In [36]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys

from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import tree
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.cm as cm
from sklearn.decomposition import PCA

from ELib import *

import warnings
warnings.filterwarnings('ignore')
import ResidualDecomposition as RD

np.random.seed(0)
In [37]:
data = pd.read_csv("../../Data/applied/Perovskite_Stability_with_features.csv")
In [38]:
data.head()
Out[38]:
Material Composition A site #1 A site #2 A site #3 B site #1 B site #2 B site #3 X site Number of elements energy_above_hull (meV/atom) ... host_Asite0_IsBCC host_Asite0_IsCubic host_Asite0_IsAlkali host_Asite0_OrbitalD host_Asite0_NsValence host_Bsite0_At. # host_Bsite0_IsHexagonal host_Bsite0_IsNoblegas Asite_IsAlkali_max Bsite_IsMetal_max
0 Ba1Sr7V8O24 Ba Sr NaN V NaN NaN O 4 29.747707 ... 0 0 0 0 2 23 0 0 1 1
1 Ba2Bi2Pr4Co8O24 Ba Bi Pr Co NaN NaN O 5 106.702335 ... 0 0 0 0 2 27 1 0 1 1
2 Ba2Ca6Fe8O24 Ba Ca NaN Fe NaN NaN O 4 171.608093 ... 0 0 0 0 2 26 0 0 1 1
3 Ba2Cd2Pr4Ni8O24 Ba Cd Pr Ni NaN NaN O 5 284.898190 ... 0 0 0 0 2 28 0 0 1 1
4 Ba2Dy6Fe8O24 Ba Dy NaN Fe NaN NaN O 4 270.007913 ... 0 0 0 0 2 26 0 0 1 1

5 rows × 81 columns

In [4]:
Y = data['energy_above_hull (meV/atom)']
X = data.iloc[:,12:]
X = scale_data(X).values
In [39]:
rf = RandomForestRegressor(n_estimators=25)
rf.fit(X, Y)

features_vals = np.sort(rf_best.feature_importances_)
features_args = np.argsort(rf_best.feature_importances_)

plt.xticks(rotation=90)
plt.barh(data.columns[4:][features_args][:20], features_vals[:20])
plt.title("Feature Importances according to Random Forest")
plt.xlabel("Relative Importance Value")
plt.ylabel("Feature Descriptor")
Out[39]:
Text(0, 0.5, 'Feature Descriptor')
In [40]:
data_lr = np.genfromtxt("Data/LR_PS.csv", delimiter=',')
data_lr_composition = data_lr
summed_composition = np.sum(data_lr_composition, axis=0)
data_lr_contribution = ((data_lr_composition.T * -np.sign(summed_composition))).T

data_rf = np.genfromtxt("Data/RF_PS.csv", delimiter=',')
data_rf_composition = data_rf
summed_composition = np.sum(data_rf_composition, axis=0)
data_rf_contribution = ((data_rf_composition.T * -np.sign(summed_composition))).T
In [41]:
def plot_group(elements_subgroup, data_d, index_i):
    g1 = d[d.iloc[:,index_i].isin(elements_subgroup)].index
    g2 = d[~d.iloc[:,index_i].isin(elements_subgroup)].index
    # plt.xlabel("Composition Mean")
    # plt.ylabel("Contribution Mean")
    plt.axhline(0, c='r')
    plt.axvline(0, c='orange')
    plt.scatter(np.mean(data_rf_composition, axis=0)[g2], np.mean(data_rf_contribution, axis=1)[g2], c='b', s=10)
    plt.scatter(np.mean(data_rf_composition, axis=0)[g1], np.mean(data_rf_contribution, axis=1)[g1], c='r', s=10)
In [42]:
plt.figure(figsize=(14,10))

plt.subplot(221)

# plt.title("CC-Plot for DSD using RF Colored by Trg")
plt.xlabel("Composition Mean", fontsize=20)
plt.ylabel("Contribution Mean", fontsize=20)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_lr_composition, axis=0), np.mean(data_lr_contribution, axis=1), c=Y)

plt.xticks(fontsize=12, rotation=15)
plt.yticks(fontsize=12)

cbar = plt.colorbar()
cbar.ax.tick_params()

plt.subplot(222)

# plt.title("CC-Plot for DSD using RF Colored by Trg")
plt.xlabel("Composition Mean", fontsize=20)
plt.ylabel("Contribution Mean", fontsize=20)
plt.axhline(0, c='r')
plt.axvline(0, c='orange')
plt.scatter(np.mean(data_rf_composition, axis=0), np.mean(data_rf_contribution, axis=1), c=Y)

plt.xticks(fontsize=12, rotation=15)
plt.yticks(fontsize=12)

cbar = plt.colorbar()
cbar.ax.tick_params()

plt.savefig("Figures/CC_PS_lr_rf.pdf", bbox_inches='tight')
In [43]:
plt.figure(figsize=(10,8))

elements, counts = np.unique(data.iloc[:,1], return_counts=True)
sorted_counts = np.argsort(counts)

plt.title("CC-Plot for PS grouped by A Site #1")
plt.xlabel("Composition Mean")
plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')

cmap = cm.get_cmap('viridis', len(elements))

color_count = 0
for e in elements:
    indices = np.where(data['A site #1'] == e)    
    plt.scatter(np.mean(data_rf_composition, axis=0)[indices], np.mean(data_rf_contribution, axis=1)[indices]\
                ,color=cmap.colors[color_count], label=e, s=20, marker=(3, 0, color_count*45))
    
    color_count+=1
    
plt.legend()
Out[43]:
<matplotlib.legend.Legend at 0x126fcec3be0>
In [44]:
plt.figure(figsize=(10,8))

elements, counts = np.unique(data.iloc[:,4], return_counts=True)
sorted_counts = np.argsort(counts)

plt.title("CC-Plot for PS grouped by B Site #1")
plt.xlabel("Composition Mean")
plt.ylabel("Contribution Mean")
plt.axhline(0, c='r')
plt.axvline(0, c='orange')

cmap = cm.get_cmap('viridis', len(elements))

color_count = 0
for e in elements:
    indices = np.where(data['B site #1'] == e)    
    plt.scatter(np.mean(data_rf_composition, axis=0)[indices], np.mean(data_rf_contribution, axis=1)[indices]\
                , color=cmap.colors[color_count], label=e, s=20, marker=(3, 0, color_count*45))
    
    color_count+=1
    
plt.legend()
Out[44]:
<matplotlib.legend.Legend at 0x126c5724940>
In [45]:
uniq_elementsA = np.unique(data['A site #1'])
uniq_elements_countA = len(uniq_elementsA)
uniq_elementsB = np.unique(data['A site #1'])
uniq_elements_countB = len(uniq_elementsB)
avc_matrix = np.zeros((uniq_elements_countA, uniq_elements_countB))

for y in range(0, uniq_elements_countA):
    for z in range(0, uniq_elements_countB):
        a = np.where(data['A site #1'] == uniq_elementsA[y])[0]
        b = np.where(data['A site #1'] == uniq_elementsB[z])[0]
        avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
        
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)

plt.figure(figsize=(12,10))
plt.xticks(ticks=np.arange(0, uniq_elements_countA), labels=uniq_elementsA, fontsize=14)
plt.yticks(ticks=np.arange(0, uniq_elements_countB), labels=uniq_elementsB, fontsize=14)
plt.imshow(avc_matrix.T)

plt.ylabel("A Site Element Contributing Towards", fontsize=24)
plt.xlabel("A Site Element Contribution Target", fontsize=24)

cbar = plt.colorbar()
cbar.ax.tick_params()

plt.savefig("Figures/Perov_heatmap_Asite.pdf", bbox_inches='tight')
In [46]:
uniq_elementsA = np.unique(data['B site #1'])
uniq_elements_countA = len(uniq_elementsA)
uniq_elementsB = np.unique(data['B site #1'])
uniq_elements_countB = len(uniq_elementsB)
avc_matrix = np.zeros((uniq_elements_countA, uniq_elements_countB))

for y in range(0, uniq_elements_countA):
    for z in range(0, uniq_elements_countB):
        a = np.where(data['B site #1'] == uniq_elementsA[y])[0]
        b = np.where(data['B site #1'] == uniq_elementsB[z])[0]
        avc_matrix[y,z] = np.mean(data_rf_contribution[a,:][:,b])
        
avc_matrix[np.where(avc_matrix > 0)] /= np.max(avc_matrix)
avc_matrix[np.where(avc_matrix < 0)] /= -np.min(avc_matrix)

plt.figure(figsize=(12,10))
plt.xticks(ticks=np.arange(0, uniq_elements_countA), labels=uniq_elementsA, fontsize=14)
plt.yticks(ticks=np.arange(0, uniq_elements_countB), labels=uniq_elementsB, fontsize=14)
plt.imshow(avc_matrix.T)

plt.xlabel("B Site Elements", fontsize=24)
plt.ylabel("B Site Elements", fontsize=24)

cbar = plt.colorbar()
cbar.ax.tick_params()

plt.savefig("Figures/Perov_heatmap_Bsite.pdf", bbox_inches='tight')
In [ ]: