In [1]:
import numpy as np

from ase.io import read
from ase.build import molecule
from ase import Atoms

from sklearn.metrics.pairwise import pairwise_distances

from dscribe.descriptors import SOAP
In [2]:
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.multioutput import RegressorChain
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
In [3]:
def calc_distances(p0, points):
    return ((p0 - points)**2).sum(axis=1)

Machine Learning Model¶

Read training data¶

Li2S1¶

In [4]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s1 = read("./FeN4C66-Li2S/fe1n4c66-li2s.2279str.xyz",":-50")
In [5]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S/fe1n4c66-li2s.2279energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s1 = []
for i in range(0,len(lines)-50):
    binding_energies_li2s1.append(lines[i][1])
    
binding_energies_li2s1 = np.asarray(binding_energies_li2s1)
In [6]:
esub = 12383.42949802
e_li2s1 = 708.7375253
binding_energies_li2s1 = binding_energies_li2s1 + esub + e_li2s1

Li2S2¶

isomer 1 substrate 1¶

In [7]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s2 = read("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220str.xyz",":-50")
In [8]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220energies.dat", comments="#", delimiter=" ", unpack=False)

binding_energies_li2s2 = []
for i in range(0,len(lines)-50):
    binding_energies_li2s2.append(lines[i][1])
    
binding_energies_li2s2 = np.asarray(binding_energies_li2s2)
In [9]:
esub = 12383.42949802
e_li2s2 = 1013.31309
binding_energies_li2s2 = binding_energies_li2s2 + esub + e_li2s2

li2s4¶

In [10]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s4 = read("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119str.xyz",":-50")
In [11]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s4 = []
for i in range(0,len(lines)-50):
    binding_energies_li2s4.append(lines[i][1])
    
binding_energies_li2s4 = np.asarray(binding_energies_li2s4)
In [12]:
esub = 12383.42949802
e_li2s4 = 1621.689849
binding_energies_li2s4 = binding_energies_li2s4 + esub + e_li2s4

li2s6¶

In [13]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s6 = read("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594str.xyz",":-50")
In [14]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s6 = []
for i in range(0,len(lines)-50):
    binding_energies_li2s6.append(lines[i][1])
    
binding_energies_li2s6 = np.asarray(binding_energies_li2s6)
In [15]:
esub = 12383.42949802
e_li2s6 = 2229.344944
binding_energies_li2s6 = binding_energies_li2s6 + esub + e_li2s6

li2s8¶

In [16]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s8 = read("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715str.xyz",":-50")
In [17]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s8 = []
for i in range(0,len(lines)-50):
    binding_energies_li2s8.append(lines[i][1])
    
binding_energies_li2s8 = np.asarray(binding_energies_li2s8)
In [18]:
esub = 12383.42949802
e_li2s8 = 2836.121134
binding_energies_li2s8 = binding_energies_li2s8 + esub + e_li2s8

start wrapping up stuff¶

In [19]:
binding_energies = []
for i in range(0,len(binding_energies_li2s1)):
    binding_energies.append(binding_energies_li2s1[i])
for i in range(0,len(binding_energies_li2s2)):
    binding_energies.append(binding_energies_li2s2[i])
for i in range(0,len(binding_energies_li2s4)):
    binding_energies.append(binding_energies_li2s4[i])
for i in range(0,len(binding_energies_li2s6)):
    binding_energies.append(binding_energies_li2s6[i])
for i in range(0,len(binding_energies_li2s8)):
    binding_energies.append(binding_energies_li2s8[i])
In [20]:
print(len(binding_energies))

binding_energies_untouched = binding_energies

purged = []
for i in range(0,len(binding_energies)):
    if 5 < binding_energies[i]  :
        purged.append(i)

for i in purged[::-1]:
    binding_energies = np.delete(binding_energies,i)

print(len(binding_energies_untouched))
print(len(binding_energies))
9677
9677
7793
In [21]:
print(len(binding_energies_untouched))
9677

Get Soap representation for Training and Testing¶

In [22]:
# Let's create a list of structures and gather the chemical elements that are in all the structures.
species = set()
for structure in structure_li2s2:
    species.update(structure.get_chemical_symbols())

soap = SOAP(
    species=species,
    periodic=False,
    rcut=6.5,
    nmax=6,
    lmax=7,
    rbf="gto",
    average="outer",
    sparse=False
)
In [23]:
feature_vectors = []
feature_vectorss = []
counter = -1

for structure in structure_li2s1:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched[counter]  :
        feature_vectors.append(soap.create(structure, n_jobs=1))

for structure in structure_li2s2:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched[counter]  :
        feature_vectors.append(soap.create(structure, n_jobs=1))

for structure in structure_li2s4:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched[counter]  :
        feature_vectors.append(soap.create(structure, n_jobs=1))
    
for structure in structure_li2s6:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched[counter]  :
        feature_vectors.append(soap.create(structure, n_jobs=1))
    
for structure in structure_li2s8:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched[counter]  :
        feature_vectors.append(soap.create(structure, n_jobs=1))
In [24]:
print(counter)
9676
In [25]:
feature_vectorss = np.vstack(feature_vectors)
print(len(feature_vectorss))
7793
In [26]:
binding_energies_m = binding_energies.mean()
binding_energies_centered = binding_energies - binding_energies_m
binding_energies_variance = binding_energies.std()
binding_energies_norm = binding_energies_centered / binding_energies_variance 
In [27]:
print(binding_energies_m)
1.2583438033998016
In [28]:
# here we reshape stuff to make it more friendly for coding
feature_vectors = np.asarray(feature_vectors)
print(feature_vectors.shape)
feature_vectorss = np.asarray(feature_vectorss)
print(feature_vectorss.shape)
(7793, 3720)
(7793, 3720)

Get Kernel Ridge Regressor for Training¶

In [80]:
x = feature_vectorss
yy = np.asarray(binding_energies_centered)

counter = 0

kf = KFold(n_splits=8,random_state=1010, shuffle=True)
kf.get_n_splits(x)

for train_index, test_index in kf.split(x):
    
    counter = counter + 1 
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = yy[train_index], yy[test_index]
    
    xx = []
    for i in range(0,len(X_train)):
        xx.append(X_train[i])
    for i in range(0,len(X_test)):
        xx.append(X_test[i])

    scaler = StandardScaler()
    scaled_x = scaler.fit_transform(X_train)
    scaled_y = scaler.transform(X_test)
 
  #  alphas = [1,0.5,0.25,0.125,0.0625,0.03125,0.0155,0.0008,0.0004,0.0002,0.0001]
  #  alphas = [0.015,0.014,0.013,0.012,0.011,0.01,0.009,0.008,0.007]
  #  alphas = [0.02,0.019,0.018,0.017,0.016,0.015]

  #  for alphaz in alphas: 

    tmaee = []
    trmsee = []
    tmaee2 = []
    trmsee2 = []
    r22 = []
    r222 = []

    regr = KernelRidge(alpha=0.02, kernel='rbf')

    y_pred = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_x[0:len(X_train)])
    y_pred2 = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_y[0:len(X_test)])

    aa = []
    aa2 = []

    for i in range(0,len(y_pred)):
        aa.append(y_pred[i])
    for i in range(0,len(y_pred2)):
        aa2.append(y_pred2[i])

    tmaee.append(metrics.mean_absolute_error(y_train, aa))
    trmsee.append(metrics.mean_squared_error(y_train, aa))
    r22.append(metrics.r2_score(y_train, aa))
    tmaee2.append(metrics.mean_absolute_error(y_test, aa2))
    trmsee2.append(metrics.mean_squared_error(y_test, aa2))
    r222.append(metrics.r2_score(y_test, aa2))

    print()
    print()
    print('train')
    print(tmaee)
    #print(trmsee)
    print(r22)
    print('test')
    print(tmaee2)
    #print(trmsee2)
    print(r222)

    print(binding_energies_variance)

train
[0.07236551567713013]
[0.9889543982201139]
test
[0.10985851935138376]
[0.979343291900827]
1.2802963593751946


train
[0.07176517591885477]
[0.9893375729261991]
test
[0.11869038095670066]
[0.9717590384454946]
1.2802963593751946


train
[0.07294055592847262]
[0.9890347675966049]
test
[0.1083920070386096]
[0.9758316284619863]
1.2802963593751946


train
[0.07200605819421883]
[0.9886937073779143]
test
[0.12250150596865342]
[0.977873476638365]
1.2802963593751946


train
[0.07085929706715868]
[0.9893036048285315]
test
[0.12252927952427305]
[0.9739397011824984]
1.2802963593751946


train
[0.0703655956411123]
[0.9900113548743525]
test
[0.12098043636709455]
[0.9654052756227417]
1.2802963593751946


train
[0.07094210364407319]
[0.9900602093645443]
test
[0.11887967270596825]
[0.964526644999453]
1.2802963593751946


train
[0.07264472021745924]
[0.988476521915738]
test
[0.1139855281913159]
[0.982610266986113]
1.2802963593751946
In [81]:
print(np.array(aa).shape)
print(np.array(aa2).shape)
(6819,)
(974,)
In [82]:
print(np.array(y_train).shape)
print(np.array(y_test).shape)
(6819,)
(974,)
In [97]:
import matplotlib.pyplot as plt


font = {'family': 'DejaVu Sans',
        'weight': 'normal',
        'size': 16,
        }

plt.rc('font',**font)

fig, ax = plt.subplots()
ax.set_ylim(-2,5.1,1)
ax.set_xlim(-2,5.1,1)

plt.errorbar(x=y_train+binding_energies_m, y=aa+binding_energies_m, c='blue', markersize=4., label='train', fmt='o')
plt.errorbar(x=y_test+binding_energies_m, y=aa2+binding_energies_m, c='green', markersize=4., label='validation', fmt='o')
plt.errorbar(x=binding_energies_val+binding_energies_m, y=bb+binding_energies_m, c='red', markersize=4., label='test', fmt='o')
ax.set_xlabel('DFT E$_{b}$ (eV)')
ax.set_ylabel('ML E$_{b}$ (eV)')
ax.legend(prop={'size': 12})

plt.text(-2.7, 3.2, 'B', fontsize=16)

#ax.text(2, -1, r'MAE=0.11 eV', fontsize=16)
#ax.text(2, -1.6, r'R2=0.99', fontsize=16)

plt.tight_layout()

plt.savefig('parity1.png', dpi=600, facecolor='w', edgecolor='w',
            orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()
No description has been provided for this image
In [79]:
print(min(np.array(aa))+binding_energies_m)
print(np.mean(aa))
-1.6371314509003574
-0.007957664860224778

Get predictions for validations set¶

In [32]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s1 = read("./FeN4C66-Li2S/fe1n4c66-li2s.2279str.xyz","-50:")
In [33]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S/fe1n4c66-li2s.2279energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s1_val = []
for i in range(len(lines)-50,len(lines)):
    binding_energies_li2s1_val.append(lines[i][1])
    
binding_energies_li2s1_val = np.asarray(binding_energies_li2s1_val)
In [34]:
esub = 12383.42949802
e_li2s1 = 708.7375253
binding_energies_li2s1_val = binding_energies_li2s1_val + esub + e_li2s1
In [35]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s2 = read("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220str.xyz","-50:")
In [36]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s2_val = []
for i in range(len(lines)-50,len(lines)):
    binding_energies_li2s2_val.append(lines[i][1])
    
binding_energies_li2s2_val = np.asarray(binding_energies_li2s2_val)
In [37]:
esub = 12383.42949802
e_li2s2 = 1013.31309
binding_energies_li2s2_val = binding_energies_li2s2_val + esub + e_li2s2
In [38]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s4 = read("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119str.xyz","-50:")
In [39]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s4_val = []
for i in range(len(lines)-50,len(lines)):
    binding_energies_li2s4_val.append(lines[i][1])
    
binding_energies_li2s4_val = np.asarray(binding_energies_li2s4_val)
In [40]:
esub = 12383.42949802
e_li2s4 = 1621.689849
binding_energies_li2s4_val = binding_energies_li2s4_val + esub + e_li2s4
In [41]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s6 = read("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594str.xyz","-50:")
In [42]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s6_val = []
for i in range(len(lines)-50,len(lines)):
    binding_energies_li2s6_val.append(lines[i][1])
    
binding_energies_li2s6_val = np.asarray(binding_energies_li2s6_val)
In [43]:
esub = 12383.42949802
e_li2s6 = 2229.344944
binding_energies_li2s6_val = binding_energies_li2s6_val + esub + e_li2s6
In [44]:
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s8 = read("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715str.xyz","-50:")
In [45]:
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s8_val = []
for i in range(len(lines)-50,len(lines)):
    binding_energies_li2s8_val.append(lines[i][1])
    
binding_energies_li2s8_val = np.asarray(binding_energies_li2s8_val)
In [46]:
esub = 12383.42949802
e_li2s8 = 2836.121134
binding_energies_li2s8_val = binding_energies_li2s8_val + esub + e_li2s8
In [47]:
binding_energies_val  = []
for i in range(0,len(binding_energies_li2s1_val)):
    binding_energies_val.append(binding_energies_li2s1_val[i])
for i in range(0,len(binding_energies_li2s2_val)):
    binding_energies_val.append(binding_energies_li2s2_val[i])
for i in range(0,len(binding_energies_li2s4_val)):
    binding_energies_val.append(binding_energies_li2s4_val[i])
for i in range(0,len(binding_energies_li2s6_val)):
    binding_energies_val.append(binding_energies_li2s6_val[i])
for i in range(0,len(binding_energies_li2s8_val)):
    binding_energies_val.append(binding_energies_li2s8_val[i])
In [48]:
print(len(binding_energies_val))

binding_energies_untouched_val = binding_energies_val

purged = []
for i in range(0,len(binding_energies_val)):
    if 5 < binding_energies_val[i]  :
        purged.append(i)

for i in purged[::-1]:
    binding_energies_val = np.delete(binding_energies_val,i)

print(len(binding_energies_untouched_val))
print(len(binding_energies_val))
250
250
195
In [49]:
binding_energies_val  = binding_energies_val - binding_energies_m
In [50]:
feature_vectors_val = []
counter = -1 

for structure in structure_li2s1:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched_val[counter]  :
        feature_vectors_val.append(soap.create(structure, n_jobs=1))

for structure in structure_li2s2:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched_val[counter]  :
        feature_vectors_val.append(soap.create(structure, n_jobs=1))

for structure in structure_li2s4:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched_val[counter]  :
        feature_vectors_val.append(soap.create(structure, n_jobs=1))
    
for structure in structure_li2s6:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched_val[counter]  :
        feature_vectors_val.append(soap.create(structure, n_jobs=1))
    
for structure in structure_li2s8:
    # Let's create SOAP feature vectors for each structure
    counter = counter + 1
    if 5 > binding_energies_untouched_val[counter]  :
        feature_vectors_val.append(soap.create(structure, n_jobs=1))
In [51]:
feature_vectorss_val = np.vstack(feature_vectors_val)
print(len(feature_vectorss_val))
195
In [52]:
x = feature_vectorss
yy = np.asarray(binding_energies_centered)

counter = 0
ii = []

kf = KFold(n_splits=8,random_state=1010, shuffle=True)
kf.get_n_splits(x)

for train_index, test_index in kf.split(x):
    
    counter = counter + 1 
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = yy[train_index], yy[test_index]
    
    xx = []
    for i in range(0,len(X_train)):
        xx.append(X_train[i])
    for i in range(0,len(X_test)):
        xx.append(X_test[i])

    scaler = StandardScaler()
    scaled_x = scaler.fit_transform(X_train)
    scaled_y = scaler.transform(feature_vectorss_val)

    regr = KernelRidge(alpha=0.02, kernel='rbf')

    y_pred2 = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_y[0:len(feature_vectors_val)])
    
    for i in range(0,len(y_pred2)):
        ii.append(y_pred2[i])
        
bb = []
cc = []

for j in range (0,len(y_pred2)):
    aa = []
    for i in range(0,counter):
        aa.append(ii[j+(i*len(y_pred2))])

    aa = np.asarray(aa)
    bb.append(aa.mean(0)) 
    cc.append(aa.std(0))
In [53]:
print(metrics.r2_score(binding_energies_val, bb))
print(metrics.mean_absolute_error(binding_energies_val, bb))
print(metrics.mean_squared_error(binding_energies_val, bb))
0.9853257741042919
0.10972795351100352
0.026416466640916726
In [54]:
print(len(bb))
print(len(binding_energies_val))
195
195
In [55]:
import matplotlib.pyplot as plt


font = {'family': 'DejaVu Sans',
        'weight': 'normal',
        'size': 16,
        }

plt.rc('font',**font)

fig, ax = plt.subplots()
plt.errorbar(x=binding_energies_val, y=bb, yerr=cc, c='red', markersize=4., label='', fmt='o')
ax.set_xlabel('DFT E$_{b}$ (eV)')
ax.set_ylabel('ML E$_{b}$ (eV)')
ax.set_ylim(-2,4)
ax.set_xlim(-2,4)
plt.text(-1.7, 3.2, 'B', fontsize=16)

ax.text(2, -1, r'MAE=0.11 eV', fontsize=16)
ax.text(2, -1.6, r'R2=0.99', fontsize=16)

plt.tight_layout()

plt.savefig('parity1.png', dpi=600, facecolor='w', edgecolor='w',
            orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()
No description has been provided for this image
In [56]:
print(binding_energies_m)
print(min(bb))
print(min(binding_energies_val))
1.2583438033998016
-2.2588176026617774
-2.170839323401159
In [47]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
plt.errorbar(x=bb, y=cc, c='orange', markersize=4., label='', fmt='o')
ax.set_xlabel('Predicted')
ax.set_ylabel('Uncertainty')
plt.savefig('parity1.png', dpi=None, facecolor='w', edgecolor='w', 
            orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()
No description has been provided for this image