In [1]:
from molgx import *
import pandas as pd
import numpy as np
In [2]:
csvfn = './PCO2-Tg-Thd-data-all-simulated.csv'
df = pd.read_csv(csvfn)

# Because open source version molgx cannot deal with hompolymer, convert OPSIN SMILES to SMILES
df['SMILES'] = [opsim.replace('[*:1]', '').replace('[*:2]', '') for opsim in df['OPSIN SMILES']]
df.head()
Out[2]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Polymer_ID Polymer_name OPSIN SMILES Tg (K) Td_1/2 (K) log10(P_CO2 (Barrer)) SMILES
0 746 poly(silanediylmethylene) [Si](C[*:2])[*:1] 178.969311 453.171845 4.696604 [Si](C)
1 158 Poly(dimethylsilanediyl) [*:1][Si](C)(C)[Si](C)(C)[*:2] 170.533265 408.658103 4.429005 [Si](C)(C)[Si](C)(C)
2 735 poly(dimethylsilanediyl) [*:1][Si](C)(C)[Si](C)(C)[*:2] 170.533265 408.658103 4.429005 [Si](C)(C)[Si](C)(C)
3 747 poly[(methylsilanediyl)methylene] C[Si](C[*:2])[*:1] 220.097968 447.668763 4.010414 C[Si](C)
4 731 poly[(dimethylsilanediyl)methylene] [*:1][Si](C)(C)C[*:2] 175.053327 444.493538 3.831722 [Si](C)(C)C
In [3]:
# Save to another csv file for molgx
csvfn2 = './PCO2-Tg-Thd-data-all-simulated2.csv'
df2 = df.drop(columns=['OPSIN SMILES'])
df2.to_csv(csvfn2)
In [4]:
moldata = MolData.read_csv(csvfn2)
moldata.print_properties()
properties:['Polymer_ID', 'Polymer_name', 'Tg (K)', 'Td_1/2 (K)', 'log10(P_CO2 (Barrer))']

Feature Extraction¶

In [5]:
%%time

fs_atom = moldata.extract_features(HeavyAtomExtractor(moldata))
fs_ring = moldata.extract_features(RingExtractor(moldata))
fs_aring = moldata.extract_features(AromaticRingExtractor(moldata))
fs_fp_structure1 = moldata.extract_features(FingerPrintStructureExtractor(moldata, radius=1))
moldata.print_features()

mfp = moldata.merge_features([fs_atom.id, fs_ring.id, fs_aring.id, fs_fp_structure1.id])
moldata.print_merged_features()
feature set list:
  0: heavy_atom
  1: ring
  2: aromatic_ring
  3: finger_print_structure:radius=1
merged feature set list:
  0: |aromatic_ring|finger_print_structure:radius=1|heavy_atom|ring|
CPU times: user 11.7 s, sys: 15 µs, total: 11.7 s
Wall time: 12.7 s

Model prediction¶

In [6]:
target_property = {
    'Tg (K)': (490, np.inf),
    'Td_1/2 (K)': (590, np.inf),
    'log10(P_CO2 (Barrer))': (3, 4)
}

Lasso¶

In [7]:
# Tg (K)
model = moldata.optimize_and_select_features(
    LassoRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_lasso_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
optimized parameters: {'alpha': 1.0}
regression model cross validation target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.74 cv_score=0.65 (+/- 0.02)
feature selection target='Tg (K)': data_size=1169: model:Lasso:alpha=1.0:opt threshold=None
feature size:191 -> 102
regression model cross validation target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.74 cv_score=0.66 (+/- 0.03)
No description has been provided for this image
In [8]:
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
    LassoRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_lasso_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
optimized parameters: {'alpha': 1.0}
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.80 cv_score=0.75 (+/- 0.02)
feature selection target='Td_1/2 (K)': data_size=1169: model:Lasso:alpha=1.0:opt threshold=None
feature size:191 -> 82
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.80 cv_score=0.76 (+/- 0.02)
No description has been provided for this image
In [9]:
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
    LassoRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_lasso_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True
optimized parameters: {'alpha': 0.01}
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.84 cv_score=0.77 (+/- 0.05)
feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso:alpha=0.01:opt threshold=None
feature size:191 -> 109
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True
R^2 score=0.84 cv_score=0.79 (+/- 0.03)
No description has been provided for this image

ElasticNet¶

In [10]:
# Tg (K)
model = moldata.optimize_and_select_features(
    ElasticNetRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_enet_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
optimized parameters: {'alpha': 1.0, 'l1_ratio': 1.0}
regression model cross validation target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.74 cv_score=0.66 (+/- 0.02)
feature selection target='Tg (K)': data_size=1169: model:ElasticNet:alpha=1.0 l1_ratio=1.0:opt threshold=None
feature size:191 -> 51
regression model cross validation target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.73 cv_score=0.67 (+/- 0.03)
No description has been provided for this image
In [11]:
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
    ElasticNetRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_enet_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
optimized parameters: {'alpha': 0.1, 'l1_ratio': 0.0}
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.82 cv_score=0.75 (+/- 0.02)
feature selection target='Td_1/2 (K)': data_size=1169: model:ElasticNet:alpha=0.1 l1_ratio=0.0:opt threshold=None
feature size:191 -> 65
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.80 cv_score=0.76 (+/- 0.02)
No description has been provided for this image
In [12]:
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
    ElasticNetRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_enet_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
optimized parameters: {'alpha': 0.1, 'l1_ratio': 0.2}
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.82 cv_score=0.77 (+/- 0.02)
feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet:alpha=0.1 l1_ratio=0.2:opt threshold=None
feature size:191 -> 47
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True
R^2 score=0.81 cv_score=0.78 (+/- 0.02)
No description has been provided for this image

Ridge¶

In [13]:
# Tg (K)
model = moldata.optimize_and_select_features(
    RidgeRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_ridge_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
optimized parameters: {'alpha': 100.0}
regression model cross validation target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.75 cv_score=0.65 (+/- 0.03)
feature selection target='Tg (K)': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None
feature size:191 -> 59
regression model cross validation target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.72 cv_score=0.67 (+/- 0.03)
No description has been provided for this image
In [14]:
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
    RidgeRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_ridge_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
optimized parameters: {'alpha': 100.0}
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.82 cv_score=0.76 (+/- 0.02)
feature selection target='Td_1/2 (K)': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None
feature size:191 -> 66
regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.80 cv_score=0.75 (+/- 0.02)
No description has been provided for this image
In [15]:
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
    RidgeRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_ridge_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True
optimized parameters: {'alpha': 100.0}
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.84 cv_score=0.78 (+/- 0.02)
feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None
feature size:191 -> 55
regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True
R^2 score=0.82 cv_score=0.78 (+/- 0.03)
No description has been provided for this image

Model summary¶

In [16]:
df_models = {}
for target_p in target_property:
        df_m = moldata.get_regression_model_summary(target_p)
        df_models[target_p] = df_m
In [17]:
tp = list(target_property.keys())[0]
print(tp)
df_models[tp]
Tg (K)
Out[17]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
model model_id score cv_score cv_score(std) rmse size aromatic_ring (3) finger_print_structure:radius=1 (174) heavy_atom (9) ring (5)
0 <molgx.Prediction.ElasticNetRegressionModel ob... ElasticNet:alpha=1.0 l1_ratio=1.0:opt:select() 0.728669 0.667034 0.027880 54.086206 51 1 43 3 4
1 <molgx.Prediction.RidgeRegressionModel object ... Ridge:alpha=100.0:opt:select() 0.721891 0.666777 0.026832 54.757595 59 2 50 3 4
2 <molgx.Prediction.LassoRegressionModel object ... Lasso:alpha=1.0:opt:select() 0.741218 0.661274 0.026605 52.820661 102 1 93 4 4
In [18]:
tp = list(target_property.keys())[1]
print(tp)
df_models[tp]
Td_1/2 (K)
Out[18]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
model model_id score cv_score cv_score(std) rmse size aromatic_ring (3) finger_print_structure:radius=1 (174) heavy_atom (9) ring (5)
0 <molgx.Prediction.LassoRegressionModel object ... Lasso:alpha=1.0:opt:select() 0.802458 0.756285 0.021127 28.345927 82 1 73 5 3
1 <molgx.Prediction.ElasticNetRegressionModel ob... ElasticNet:alpha=0.1 l1_ratio=0.0:opt:select() 0.799326 0.755580 0.022499 28.569765 65 2 56 4 3
2 <molgx.Prediction.RidgeRegressionModel object ... Ridge:alpha=100.0:opt:select() 0.800708 0.755000 0.018856 28.471213 66 2 57 4 3
In [19]:
tp = list(target_property.keys())[2]
print(tp)
df_models[tp]
log10(P_CO2 (Barrer))
Out[19]:
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
model model_id score cv_score cv_score(std) rmse size aromatic_ring (3) finger_print_structure:radius=1 (174) heavy_atom (9) ring (5)
0 <molgx.Prediction.LassoRegressionModel object ... Lasso:alpha=0.01:opt:select() 0.840011 0.785739 0.025975 0.648070 109 1 98 6 4
1 <molgx.Prediction.ElasticNetRegressionModel ob... ElasticNet:alpha=0.1 l1_ratio=0.2:opt:select() 0.807608 0.783213 0.019711 0.710675 47 0 42 5 0
2 <molgx.Prediction.RidgeRegressionModel object ... Ridge:alpha=100.0:opt:select() 0.817974 0.782591 0.026398 0.691264 55 1 48 5 1
In [ ]: