from molgx import *
import pandas as pd
import numpy as np
csvfn = './PCO2-Tg-Thd-data-all-simulated.csv'
df = pd.read_csv(csvfn)
# Because open source version molgx cannot deal with hompolymer, convert OPSIN SMILES to SMILES
df['SMILES'] = [opsim.replace('[*:1]', '').replace('[*:2]', '') for opsim in df['OPSIN SMILES']]
df.head()
Polymer_ID | Polymer_name | OPSIN SMILES | Tg (K) | Td_1/2 (K) | log10(P_CO2 (Barrer)) | SMILES | |
---|---|---|---|---|---|---|---|
0 | 746 | poly(silanediylmethylene) | [Si](C[*:2])[*:1] | 178.969311 | 453.171845 | 4.696604 | [Si](C) |
1 | 158 | Poly(dimethylsilanediyl) | [*:1][Si](C)(C)[Si](C)(C)[*:2] | 170.533265 | 408.658103 | 4.429005 | [Si](C)(C)[Si](C)(C) |
2 | 735 | poly(dimethylsilanediyl) | [*:1][Si](C)(C)[Si](C)(C)[*:2] | 170.533265 | 408.658103 | 4.429005 | [Si](C)(C)[Si](C)(C) |
3 | 747 | poly[(methylsilanediyl)methylene] | C[Si](C[*:2])[*:1] | 220.097968 | 447.668763 | 4.010414 | C[Si](C) |
4 | 731 | poly[(dimethylsilanediyl)methylene] | [*:1][Si](C)(C)C[*:2] | 175.053327 | 444.493538 | 3.831722 | [Si](C)(C)C |
# Save to another csv file for molgx
csvfn2 = './PCO2-Tg-Thd-data-all-simulated2.csv'
df2 = df.drop(columns=['OPSIN SMILES'])
df2.to_csv(csvfn2)
moldata = MolData.read_csv(csvfn2)
moldata.print_properties()
properties:['Polymer_ID', 'Polymer_name', 'Tg (K)', 'Td_1/2 (K)', 'log10(P_CO2 (Barrer))']
%%time
fs_atom = moldata.extract_features(HeavyAtomExtractor(moldata))
fs_ring = moldata.extract_features(RingExtractor(moldata))
fs_aring = moldata.extract_features(AromaticRingExtractor(moldata))
fs_fp_structure1 = moldata.extract_features(FingerPrintStructureExtractor(moldata, radius=1))
moldata.print_features()
mfp = moldata.merge_features([fs_atom.id, fs_ring.id, fs_aring.id, fs_fp_structure1.id])
moldata.print_merged_features()
feature set list: 0: heavy_atom 1: ring 2: aromatic_ring 3: finger_print_structure:radius=1 merged feature set list: 0: |aromatic_ring|finger_print_structure:radius=1|heavy_atom|ring| CPU times: user 11.7 s, sys: 15 µs, total: 11.7 s Wall time: 12.7 s
target_property = {
'Tg (K)': (490, np.inf),
'Td_1/2 (K)': (590, np.inf),
'log10(P_CO2 (Barrer))': (3, 4)
}
# Tg (K)
model = moldata.optimize_and_select_features(
LassoRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_lasso_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True optimized parameters: {'alpha': 1.0} regression model cross validation target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.74 cv_score=0.65 (+/- 0.02) feature selection target='Tg (K)': data_size=1169: model:Lasso:alpha=1.0:opt threshold=None feature size:191 -> 102 regression model cross validation target='Tg (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.74 cv_score=0.66 (+/- 0.03)
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
LassoRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_lasso_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True optimized parameters: {'alpha': 1.0} regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.80 cv_score=0.75 (+/- 0.02) feature selection target='Td_1/2 (K)': data_size=1169: model:Lasso:alpha=1.0:opt threshold=None feature size:191 -> 82 regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.80 cv_score=0.76 (+/- 0.02)
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
LassoRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_lasso_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True optimized parameters: {'alpha': 0.01} regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.84 cv_score=0.77 (+/- 0.05) feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso:alpha=0.01:opt threshold=None feature size:191 -> 109 regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Lasso n_splits=3 shuffle=True R^2 score=0.84 cv_score=0.79 (+/- 0.03)
# Tg (K)
model = moldata.optimize_and_select_features(
ElasticNetRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_enet_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True optimized parameters: {'alpha': 1.0, 'l1_ratio': 1.0} regression model cross validation target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.74 cv_score=0.66 (+/- 0.02) feature selection target='Tg (K)': data_size=1169: model:ElasticNet:alpha=1.0 l1_ratio=1.0:opt threshold=None feature size:191 -> 51 regression model cross validation target='Tg (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.73 cv_score=0.67 (+/- 0.03)
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
ElasticNetRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_enet_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True optimized parameters: {'alpha': 0.1, 'l1_ratio': 0.0} regression model cross validation target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.82 cv_score=0.75 (+/- 0.02) feature selection target='Td_1/2 (K)': data_size=1169: model:ElasticNet:alpha=0.1 l1_ratio=0.0:opt threshold=None feature size:191 -> 65 regression model cross validation target='Td_1/2 (K)': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.80 cv_score=0.76 (+/- 0.02)
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
ElasticNetRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_enet_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True optimized parameters: {'alpha': 0.1, 'l1_ratio': 0.2} regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.82 cv_score=0.77 (+/- 0.02) feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet:alpha=0.1 l1_ratio=0.2:opt threshold=None feature size:191 -> 47 regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:ElasticNet n_splits=3 shuffle=True R^2 score=0.81 cv_score=0.78 (+/- 0.02)
# Tg (K)
model = moldata.optimize_and_select_features(
RidgeRegressionModel(moldata, list(target_property.keys())[0], mfp))
model.plot_estimate('molgx_ridge_tg.png')
regression model parameter optimization target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True optimized parameters: {'alpha': 100.0} regression model cross validation target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.75 cv_score=0.65 (+/- 0.03) feature selection target='Tg (K)': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None feature size:191 -> 59 regression model cross validation target='Tg (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.72 cv_score=0.67 (+/- 0.03)
# half decomposition temperature (K)
model = moldata.optimize_and_select_features(
RidgeRegressionModel(moldata, list(target_property.keys())[1], mfp))
model.plot_estimate('molgx_ridge_td.png')
regression model parameter optimization target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True optimized parameters: {'alpha': 100.0} regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.82 cv_score=0.76 (+/- 0.02) feature selection target='Td_1/2 (K)': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None feature size:191 -> 66 regression model cross validation target='Td_1/2 (K)': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.80 cv_score=0.75 (+/- 0.02)
# Permeability_CO2 (Barrer)
model = moldata.optimize_and_select_features(
RidgeRegressionModel(moldata, list(target_property.keys())[2], mfp))
model.plot_estimate('molgx_ridge_pco2.png')
regression model parameter optimization target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True optimized parameters: {'alpha': 100.0} regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.84 cv_score=0.78 (+/- 0.02) feature selection target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge:alpha=100.0:opt threshold=None feature size:191 -> 55 regression model cross validation target='log10(P_CO2 (Barrer))': data_size=1169: model:Ridge n_splits=3 shuffle=True R^2 score=0.82 cv_score=0.78 (+/- 0.03)
df_models = {}
for target_p in target_property:
df_m = moldata.get_regression_model_summary(target_p)
df_models[target_p] = df_m
tp = list(target_property.keys())[0]
print(tp)
df_models[tp]
Tg (K)
model | model_id | score | cv_score | cv_score(std) | rmse | size | aromatic_ring (3) | finger_print_structure:radius=1 (174) | heavy_atom (9) | ring (5) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | <molgx.Prediction.ElasticNetRegressionModel ob... | ElasticNet:alpha=1.0 l1_ratio=1.0:opt:select() | 0.728669 | 0.667034 | 0.027880 | 54.086206 | 51 | 1 | 43 | 3 | 4 |
1 | <molgx.Prediction.RidgeRegressionModel object ... | Ridge:alpha=100.0:opt:select() | 0.721891 | 0.666777 | 0.026832 | 54.757595 | 59 | 2 | 50 | 3 | 4 |
2 | <molgx.Prediction.LassoRegressionModel object ... | Lasso:alpha=1.0:opt:select() | 0.741218 | 0.661274 | 0.026605 | 52.820661 | 102 | 1 | 93 | 4 | 4 |
tp = list(target_property.keys())[1]
print(tp)
df_models[tp]
Td_1/2 (K)
model | model_id | score | cv_score | cv_score(std) | rmse | size | aromatic_ring (3) | finger_print_structure:radius=1 (174) | heavy_atom (9) | ring (5) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | <molgx.Prediction.LassoRegressionModel object ... | Lasso:alpha=1.0:opt:select() | 0.802458 | 0.756285 | 0.021127 | 28.345927 | 82 | 1 | 73 | 5 | 3 |
1 | <molgx.Prediction.ElasticNetRegressionModel ob... | ElasticNet:alpha=0.1 l1_ratio=0.0:opt:select() | 0.799326 | 0.755580 | 0.022499 | 28.569765 | 65 | 2 | 56 | 4 | 3 |
2 | <molgx.Prediction.RidgeRegressionModel object ... | Ridge:alpha=100.0:opt:select() | 0.800708 | 0.755000 | 0.018856 | 28.471213 | 66 | 2 | 57 | 4 | 3 |
tp = list(target_property.keys())[2]
print(tp)
df_models[tp]
log10(P_CO2 (Barrer))
model | model_id | score | cv_score | cv_score(std) | rmse | size | aromatic_ring (3) | finger_print_structure:radius=1 (174) | heavy_atom (9) | ring (5) | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | <molgx.Prediction.LassoRegressionModel object ... | Lasso:alpha=0.01:opt:select() | 0.840011 | 0.785739 | 0.025975 | 0.648070 | 109 | 1 | 98 | 6 | 4 |
1 | <molgx.Prediction.ElasticNetRegressionModel ob... | ElasticNet:alpha=0.1 l1_ratio=0.2:opt:select() | 0.807608 | 0.783213 | 0.019711 | 0.710675 | 47 | 0 | 42 | 5 | 0 |
2 | <molgx.Prediction.RidgeRegressionModel object ... | Ridge:alpha=100.0:opt:select() | 0.817974 | 0.782591 | 0.026398 | 0.691264 | 55 | 1 | 48 | 5 | 1 |