import numpy as np
from ase.io import read
from ase.build import molecule
from ase import Atoms
from sklearn.metrics.pairwise import pairwise_distances
from dscribe.descriptors import SOAP
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.multioutput import RegressorChain
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
def calc_distances(p0, points):
return ((p0 - points)**2).sum(axis=1)
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s1 = read("./FeN4C66-Li2S/fe1n4c66-li2s.2279str.xyz",":-50")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S/fe1n4c66-li2s.2279energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s1 = []
for i in range(0,len(lines)-50):
binding_energies_li2s1.append(lines[i][1])
binding_energies_li2s1 = np.asarray(binding_energies_li2s1)
esub = 12383.42949802
e_li2s1 = 708.7375253
binding_energies_li2s1 = binding_energies_li2s1 + esub + e_li2s1
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s2 = read("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220str.xyz",":-50")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s2 = []
for i in range(0,len(lines)-50):
binding_energies_li2s2.append(lines[i][1])
binding_energies_li2s2 = np.asarray(binding_energies_li2s2)
esub = 12383.42949802
e_li2s2 = 1013.31309
binding_energies_li2s2 = binding_energies_li2s2 + esub + e_li2s2
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s4 = read("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119str.xyz",":-50")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s4 = []
for i in range(0,len(lines)-50):
binding_energies_li2s4.append(lines[i][1])
binding_energies_li2s4 = np.asarray(binding_energies_li2s4)
esub = 12383.42949802
e_li2s4 = 1621.689849
binding_energies_li2s4 = binding_energies_li2s4 + esub + e_li2s4
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s6 = read("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594str.xyz",":-50")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s6 = []
for i in range(0,len(lines)-50):
binding_energies_li2s6.append(lines[i][1])
binding_energies_li2s6 = np.asarray(binding_energies_li2s6)
esub = 12383.42949802
e_li2s6 = 2229.344944
binding_energies_li2s6 = binding_energies_li2s6 + esub + e_li2s6
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s8 = read("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715str.xyz",":-50")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s8 = []
for i in range(0,len(lines)-50):
binding_energies_li2s8.append(lines[i][1])
binding_energies_li2s8 = np.asarray(binding_energies_li2s8)
esub = 12383.42949802
e_li2s8 = 2836.121134
binding_energies_li2s8 = binding_energies_li2s8 + esub + e_li2s8
binding_energies = []
for i in range(0,len(binding_energies_li2s1)):
binding_energies.append(binding_energies_li2s1[i])
for i in range(0,len(binding_energies_li2s2)):
binding_energies.append(binding_energies_li2s2[i])
for i in range(0,len(binding_energies_li2s4)):
binding_energies.append(binding_energies_li2s4[i])
for i in range(0,len(binding_energies_li2s6)):
binding_energies.append(binding_energies_li2s6[i])
for i in range(0,len(binding_energies_li2s8)):
binding_energies.append(binding_energies_li2s8[i])
print(len(binding_energies))
binding_energies_untouched = binding_energies
purged = []
for i in range(0,len(binding_energies)):
if 5 < binding_energies[i] :
purged.append(i)
for i in purged[::-1]:
binding_energies = np.delete(binding_energies,i)
print(len(binding_energies_untouched))
print(len(binding_energies))
9677 9677 7793
print(len(binding_energies_untouched))
9677
# Let's create a list of structures and gather the chemical elements that are in all the structures.
species = set()
for structure in structure_li2s2:
species.update(structure.get_chemical_symbols())
soap = SOAP(
species=species,
periodic=False,
rcut=6.5,
nmax=6,
lmax=7,
rbf="gto",
average="outer",
sparse=False
)
feature_vectors = []
feature_vectorss = []
counter = -1
for structure in structure_li2s1:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched[counter] :
feature_vectors.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s2:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched[counter] :
feature_vectors.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s4:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched[counter] :
feature_vectors.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s6:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched[counter] :
feature_vectors.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s8:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched[counter] :
feature_vectors.append(soap.create(structure, n_jobs=1))
print(counter)
9676
feature_vectorss = np.vstack(feature_vectors)
print(len(feature_vectorss))
7793
binding_energies_m = binding_energies.mean()
binding_energies_centered = binding_energies - binding_energies_m
binding_energies_variance = binding_energies.std()
binding_energies_norm = binding_energies_centered / binding_energies_variance
print(binding_energies_m)
1.2583438033998016
# here we reshape stuff to make it more friendly for coding
feature_vectors = np.asarray(feature_vectors)
print(feature_vectors.shape)
feature_vectorss = np.asarray(feature_vectorss)
print(feature_vectorss.shape)
(7793, 3720) (7793, 3720)
x = feature_vectorss
yy = np.asarray(binding_energies_centered)
counter = 0
kf = KFold(n_splits=8,random_state=1010, shuffle=True)
kf.get_n_splits(x)
for train_index, test_index in kf.split(x):
counter = counter + 1
X_train, X_test = x[train_index], x[test_index]
y_train, y_test = yy[train_index], yy[test_index]
xx = []
for i in range(0,len(X_train)):
xx.append(X_train[i])
for i in range(0,len(X_test)):
xx.append(X_test[i])
scaler = StandardScaler()
scaled_x = scaler.fit_transform(X_train)
scaled_y = scaler.transform(X_test)
# alphas = [1,0.5,0.25,0.125,0.0625,0.03125,0.0155,0.0008,0.0004,0.0002,0.0001]
# alphas = [0.015,0.014,0.013,0.012,0.011,0.01,0.009,0.008,0.007]
# alphas = [0.02,0.019,0.018,0.017,0.016,0.015]
# for alphaz in alphas:
tmaee = []
trmsee = []
tmaee2 = []
trmsee2 = []
r22 = []
r222 = []
regr = KernelRidge(alpha=0.02, kernel='rbf')
y_pred = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_x[0:len(X_train)])
y_pred2 = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_y[0:len(X_test)])
aa = []
aa2 = []
for i in range(0,len(y_pred)):
aa.append(y_pred[i])
for i in range(0,len(y_pred2)):
aa2.append(y_pred2[i])
tmaee.append(metrics.mean_absolute_error(y_train, aa))
trmsee.append(metrics.mean_squared_error(y_train, aa))
r22.append(metrics.r2_score(y_train, aa))
tmaee2.append(metrics.mean_absolute_error(y_test, aa2))
trmsee2.append(metrics.mean_squared_error(y_test, aa2))
r222.append(metrics.r2_score(y_test, aa2))
print()
print()
print('train')
print(tmaee)
#print(trmsee)
print(r22)
print('test')
print(tmaee2)
#print(trmsee2)
print(r222)
print(binding_energies_variance)
train [0.07236551567713013] [0.9889543982201139] test [0.10985851935138376] [0.979343291900827] 1.2802963593751946 train [0.07176517591885477] [0.9893375729261991] test [0.11869038095670066] [0.9717590384454946] 1.2802963593751946 train [0.07294055592847262] [0.9890347675966049] test [0.1083920070386096] [0.9758316284619863] 1.2802963593751946 train [0.07200605819421883] [0.9886937073779143] test [0.12250150596865342] [0.977873476638365] 1.2802963593751946 train [0.07085929706715868] [0.9893036048285315] test [0.12252927952427305] [0.9739397011824984] 1.2802963593751946 train [0.0703655956411123] [0.9900113548743525] test [0.12098043636709455] [0.9654052756227417] 1.2802963593751946 train [0.07094210364407319] [0.9900602093645443] test [0.11887967270596825] [0.964526644999453] 1.2802963593751946 train [0.07264472021745924] [0.988476521915738] test [0.1139855281913159] [0.982610266986113] 1.2802963593751946
print(np.array(aa).shape)
print(np.array(aa2).shape)
(6819,) (974,)
print(np.array(y_train).shape)
print(np.array(y_test).shape)
(6819,) (974,)
import matplotlib.pyplot as plt
font = {'family': 'DejaVu Sans',
'weight': 'normal',
'size': 16,
}
plt.rc('font',**font)
fig, ax = plt.subplots()
ax.set_ylim(-2,5.1,1)
ax.set_xlim(-2,5.1,1)
plt.errorbar(x=y_train+binding_energies_m, y=aa+binding_energies_m, c='blue', markersize=4., label='train', fmt='o')
plt.errorbar(x=y_test+binding_energies_m, y=aa2+binding_energies_m, c='green', markersize=4., label='validation', fmt='o')
plt.errorbar(x=binding_energies_val+binding_energies_m, y=bb+binding_energies_m, c='red', markersize=4., label='test', fmt='o')
ax.set_xlabel('DFT E$_{b}$ (eV)')
ax.set_ylabel('ML E$_{b}$ (eV)')
ax.legend(prop={'size': 12})
plt.text(-2.7, 3.2, 'B', fontsize=16)
#ax.text(2, -1, r'MAE=0.11 eV', fontsize=16)
#ax.text(2, -1.6, r'R2=0.99', fontsize=16)
plt.tight_layout()
plt.savefig('parity1.png', dpi=600, facecolor='w', edgecolor='w',
orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()
print(min(np.array(aa))+binding_energies_m)
print(np.mean(aa))
-1.6371314509003574 -0.007957664860224778
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s1 = read("./FeN4C66-Li2S/fe1n4c66-li2s.2279str.xyz","-50:")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S/fe1n4c66-li2s.2279energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s1_val = []
for i in range(len(lines)-50,len(lines)):
binding_energies_li2s1_val.append(lines[i][1])
binding_energies_li2s1_val = np.asarray(binding_energies_li2s1_val)
esub = 12383.42949802
e_li2s1 = 708.7375253
binding_energies_li2s1_val = binding_energies_li2s1_val + esub + e_li2s1
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s2 = read("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220str.xyz","-50:")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S2/fe1n4c66-li2s2.2220energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s2_val = []
for i in range(len(lines)-50,len(lines)):
binding_energies_li2s2_val.append(lines[i][1])
binding_energies_li2s2_val = np.asarray(binding_energies_li2s2_val)
esub = 12383.42949802
e_li2s2 = 1013.31309
binding_energies_li2s2_val = binding_energies_li2s2_val + esub + e_li2s2
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s4 = read("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119str.xyz","-50:")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S4/fe1n4c66-li2s4.2119energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s4_val = []
for i in range(len(lines)-50,len(lines)):
binding_energies_li2s4_val.append(lines[i][1])
binding_energies_li2s4_val = np.asarray(binding_energies_li2s4_val)
esub = 12383.42949802
e_li2s4 = 1621.689849
binding_energies_li2s4_val = binding_energies_li2s4_val + esub + e_li2s4
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s6 = read("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594str.xyz","-50:")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S6/fe1n4c66-li2s6.1594energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s6_val = []
for i in range(len(lines)-50,len(lines)):
binding_energies_li2s6_val.append(lines[i][1])
binding_energies_li2s6_val = np.asarray(binding_energies_li2s6_val)
esub = 12383.42949802
e_li2s6 = 2229.344944
binding_energies_li2s6_val = binding_energies_li2s6_val + esub + e_li2s6
# Let's use ASE to read atomic structures as ase.Atoms objects.
structure_li2s8 = read("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715str.xyz","-50:")
from numpy import loadtxt
lines = loadtxt("./FeN4C66-Li2S8/fe1n4c66-li2s8.1715energies.dat", comments="#", delimiter=" ", unpack=False)
binding_energies_li2s8_val = []
for i in range(len(lines)-50,len(lines)):
binding_energies_li2s8_val.append(lines[i][1])
binding_energies_li2s8_val = np.asarray(binding_energies_li2s8_val)
esub = 12383.42949802
e_li2s8 = 2836.121134
binding_energies_li2s8_val = binding_energies_li2s8_val + esub + e_li2s8
binding_energies_val = []
for i in range(0,len(binding_energies_li2s1_val)):
binding_energies_val.append(binding_energies_li2s1_val[i])
for i in range(0,len(binding_energies_li2s2_val)):
binding_energies_val.append(binding_energies_li2s2_val[i])
for i in range(0,len(binding_energies_li2s4_val)):
binding_energies_val.append(binding_energies_li2s4_val[i])
for i in range(0,len(binding_energies_li2s6_val)):
binding_energies_val.append(binding_energies_li2s6_val[i])
for i in range(0,len(binding_energies_li2s8_val)):
binding_energies_val.append(binding_energies_li2s8_val[i])
print(len(binding_energies_val))
binding_energies_untouched_val = binding_energies_val
purged = []
for i in range(0,len(binding_energies_val)):
if 5 < binding_energies_val[i] :
purged.append(i)
for i in purged[::-1]:
binding_energies_val = np.delete(binding_energies_val,i)
print(len(binding_energies_untouched_val))
print(len(binding_energies_val))
250 250 195
binding_energies_val = binding_energies_val - binding_energies_m
feature_vectors_val = []
counter = -1
for structure in structure_li2s1:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched_val[counter] :
feature_vectors_val.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s2:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched_val[counter] :
feature_vectors_val.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s4:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched_val[counter] :
feature_vectors_val.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s6:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched_val[counter] :
feature_vectors_val.append(soap.create(structure, n_jobs=1))
for structure in structure_li2s8:
# Let's create SOAP feature vectors for each structure
counter = counter + 1
if 5 > binding_energies_untouched_val[counter] :
feature_vectors_val.append(soap.create(structure, n_jobs=1))
feature_vectorss_val = np.vstack(feature_vectors_val)
print(len(feature_vectorss_val))
195
x = feature_vectorss
yy = np.asarray(binding_energies_centered)
counter = 0
ii = []
kf = KFold(n_splits=8,random_state=1010, shuffle=True)
kf.get_n_splits(x)
for train_index, test_index in kf.split(x):
counter = counter + 1
X_train, X_test = x[train_index], x[test_index]
y_train, y_test = yy[train_index], yy[test_index]
xx = []
for i in range(0,len(X_train)):
xx.append(X_train[i])
for i in range(0,len(X_test)):
xx.append(X_test[i])
scaler = StandardScaler()
scaled_x = scaler.fit_transform(X_train)
scaled_y = scaler.transform(feature_vectorss_val)
regr = KernelRidge(alpha=0.02, kernel='rbf')
y_pred2 = regr.fit(scaled_x[0:len(X_train)],y_train).predict(scaled_y[0:len(feature_vectors_val)])
for i in range(0,len(y_pred2)):
ii.append(y_pred2[i])
bb = []
cc = []
for j in range (0,len(y_pred2)):
aa = []
for i in range(0,counter):
aa.append(ii[j+(i*len(y_pred2))])
aa = np.asarray(aa)
bb.append(aa.mean(0))
cc.append(aa.std(0))
print(metrics.r2_score(binding_energies_val, bb))
print(metrics.mean_absolute_error(binding_energies_val, bb))
print(metrics.mean_squared_error(binding_energies_val, bb))
0.9853257741042919 0.10972795351100352 0.026416466640916726
print(len(bb))
print(len(binding_energies_val))
195 195
import matplotlib.pyplot as plt
font = {'family': 'DejaVu Sans',
'weight': 'normal',
'size': 16,
}
plt.rc('font',**font)
fig, ax = plt.subplots()
plt.errorbar(x=binding_energies_val, y=bb, yerr=cc, c='red', markersize=4., label='', fmt='o')
ax.set_xlabel('DFT E$_{b}$ (eV)')
ax.set_ylabel('ML E$_{b}$ (eV)')
ax.set_ylim(-2,4)
ax.set_xlim(-2,4)
plt.text(-1.7, 3.2, 'B', fontsize=16)
ax.text(2, -1, r'MAE=0.11 eV', fontsize=16)
ax.text(2, -1.6, r'R2=0.99', fontsize=16)
plt.tight_layout()
plt.savefig('parity1.png', dpi=600, facecolor='w', edgecolor='w',
orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()
print(binding_energies_m)
print(min(bb))
print(min(binding_energies_val))
1.2583438033998016 -2.2588176026617774 -2.170839323401159
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
plt.errorbar(x=bb, y=cc, c='orange', markersize=4., label='', fmt='o')
ax.set_xlabel('Predicted')
ax.set_ylabel('Uncertainty')
plt.savefig('parity1.png', dpi=None, facecolor='w', edgecolor='w',
orientation='landscape', format=None, transparent=False, bbox_inches=None, pad_inches=0.1)
plt.show()