Import all the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit import DataStructs

Processing the data to be predicted

The code below demonstrates how to process the SMILES strings stored in an xlsx file. You can download the example file using the link above.

In [2]:
## Load the example input file (sheet "Sheet1") into a DataFrame.
## Later cells read its SMILES, Time, Guideline, Principle, Endpoint and Reliability columns.
df = pd.read_excel("example_AB_Regression.xlsx", sheet_name='Sheet1')
df.head()
Out[2]:
SMILES Time Guideline Principle Endpoint Reliability
0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 14 OECD 301A DOC die away Ready 1
1 CC1(C)OC[C@@H](CC=O)O1 28 OECD 301A DOC die away Ready 1
2 CC1(C)OC[C@@H](CCCO)O1 28 OECD 301A DOC die away Ready 1
3 CC1(C)OC[C@@H](CCI)O1 28 OECD 301A DOC die away Ready 1
4 CC1(C)OC[C@@H](CCO)O1 28 OECD 301A DOC die away Ready 1
In [3]:
## Convert SMILES to MACCS molecular fingerprints (the model we will be using was built based on MACCS fingerprints)
df['mol'] = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]

## MolFromSmiles returns None for a string it cannot parse. Stop here and list the offending
## rows instead of letting the fingerprint call fail on None with an opaque error.
invalid_smiles = df.loc[df['mol'].isna(), 'SMILES']
if not invalid_smiles.empty:
    raise ValueError(
        'Could not parse the SMILES in row(s) {}: {}'.format(
            invalid_smiles.index.tolist(), invalid_smiles.tolist()))

df['fp'] = [GetMACCSKeysFingerprint(mol) for mol in df['mol']]

## Keep the fingerprint first, followed by the five remaining feature columns: the cell that
## builds X selects them by position (column 0 and columns 1-5).
df = df[['fp', 'Time', 'Guideline', 'Principle', 'Endpoint', 'Reliability']].copy()
df.head()
Out[3]:
fp Time Guideline Principle Endpoint Reliability
0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 14 OECD 301A DOC die away Ready 1
1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A DOC die away Ready 1
2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A DOC die away Ready 1
3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A DOC die away Ready 1
4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A DOC die away Ready 1
In [4]:
## Manually encode the categorical data with the integer codes listed below
cat_dict_guideline = {'EU Method C.4-A': 0, 'EU Method C.4-C': 1, 'EU Method C.4-D': 2, 'EU Method C.4-E': 3,
                      'OECD 301A': 4, 'OECD 301B': 5, 'OECD 301C': 6,
                      'OECD 301D': 7, 'OECD 301E': 8, 'OECD 301F': 9,
                      'OECD 302B': 10, 'OECD 302C': 11, 'OECD 310': 12}
cat_dict_principle = {'DOC die away': 0, 'CO2 evolution': 1, 'Closed respirometer': 2, 'Closed bottle test': 3}
cat_dict_endpoint = {'Ready': 0, 'Inherent': 1}
category_codes = {'Guideline': cat_dict_guideline, 'Principle': cat_dict_principle, 'Endpoint': cat_dict_endpoint}

encoded_columns = {}
for column, codes in category_codes.items():
    ## A value is acceptable if it is a known label or already one of the integer codes
    ## (so numeric input, or re-running this cell, still works). Missing values are left as they are.
    known_values = set(codes) | set(codes.values())
    unknown = [value for value in df[column].dropna().unique() if value not in known_values]
    if unknown:
        ## Stop on a category that has no code instead of passing the raw text on to the feature matrix
        raise ValueError('Unknown {} value(s): {}'.format(column, unknown))
    ## Bind `codes` as a default argument so each column uses its own dictionary
    encoded_columns[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

df = df.assign(**encoded_columns)
df.head()
Out[4]:
fp Time Guideline Principle Endpoint Reliability
0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 14 4 0 0 1
1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 0 0 1
2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 0 0 1
3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 0 0 1
4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 0 0 1
In [5]:
## Assemble the model input: every row holds the fingerprint bits followed by the other feature columns
X_fp = np.array(df.iloc[:, 0])
X_other = np.array(df.iloc[:, 1:6])
feature_rows = [
    np.array(bits).tolist() + extras.tolist()  ## fingerprint bits first, then the remaining columns
    for bits, extras in zip(X_fp, X_other)
]
X = np.array(feature_rows)
X
Out[5]:
array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 2, 0, 2],
       [0, 0, 0, ..., 2, 0, 2],
       [0, 0, 0, ..., 2, 0, 2]])

Load the model and perform the prediction

In [6]:
## Load the model (you can download this model using the link above).
## NOTE: unpickling can execute code stored in the file, so only load a model file from a source you trust.
with open('AB_XGBRegression_model_21.pkl', 'rb') as model_file:  ## the with-block closes the file handle
    model = pickle.load(model_file)
In [7]:
## Perform the prediction. The results are kept in `prediction`; they are added to the original
## dataframe as a "Prediction" column in a later cell.
prediction = model.predict(X)

Calculate the prediction performance

The expected prediction performance is estimated from the highest similarity between the query compound and the compounds in the dataset used to build the model.

In [8]:
## Read the data set the model was built on (downloadable in the "Dataset" tab) and
## compute a MACCS fingerprint for each of its structures
model_data = pd.read_excel('model_data_AB_Regression_21.xlsx', sheet_name='Sheet1')
model_mols = list(map(AllChem.MolFromSmiles, model_data['Smiles']))
model_fp = list(map(GetMACCSKeysFingerprint, model_mols))
In [9]:
def prediction_acc(similarity):
    '''Return the expected prediction performance for a similarity score.

    The prediction performance is based on the similarity score.
    For example, during the model development, chemicals with a similarity score of >=0.9 with each other
    demonstrated an R2 of 0.79 and RMSE of 0.14 between the predicted and true values.

    Parameters
    ----------
    similarity : float
        Similarity score to look up.

    Returns
    -------
    tuple
        (R2, RMSE) of the highest band whose lower bound is not above `similarity`,
        or ('Out of AD', 'Out of AD') when no band matches (score below 0.5, or NaN).
    '''
    ## (lower bound, R2, RMSE), highest band first so that the first match wins
    performance_bands = (
        (0.9, 0.79, 0.14),
        (0.8, 0.66, 0.21),
        (0.7, 0.59, 0.23),
        (0.6, 0.44, 0.26),
        (0.5, 0.49, 0.26),
    )
    for lower_bound, R2, RMSE in performance_bands:
        if similarity >= lower_bound:
            return R2, RMSE
    return 'Out of AD', 'Out of AD'
In [10]:
## For every query compound, find its closest match in the model data and look up the
## expected performance for that similarity
similarity_list = []
R2_list = []
RMSE_list = []
for fp in df['fp']:
    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data
    ## Only the best match is needed, so take the maximum instead of sorting the whole list
    similarity = round(max(similarities), 2)
    R2, RMSE = prediction_acc(similarity)
    similarity_list.append(similarity)
    R2_list.append(R2)
    RMSE_list.append(RMSE)
In [11]:
## Reload the original table and attach the prediction, similarity and expected-accuracy columns
df_0 = pd.read_excel("example_AB_Regression.xlsx", sheet_name='Sheet1').assign(**{
    'Prediction': list(map('{:.1%}'.format, prediction)),  ## predictions shown as percentages with one decimal
    'Similarity': similarity_list,
    'Expected prediction R2': R2_list,
    'Expected prediction RMSE': RMSE_list,
})
df_0.head()
Out[11]:
SMILES Time Guideline Principle Endpoint Reliability Prediction Similarity Expected prediction R2 Expected prediction RMSE
0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 14 OECD 301A DOC die away Ready 1 40.6% 0.64 0.44 0.26
1 CC1(C)OC[C@@H](CC=O)O1 28 OECD 301A DOC die away Ready 1 52.5% 0.83 0.66 0.21
2 CC1(C)OC[C@@H](CCCO)O1 28 OECD 301A DOC die away Ready 1 48.9% 0.84 0.66 0.21
3 CC1(C)OC[C@@H](CCI)O1 28 OECD 301A DOC die away Ready 1 31.5% 0.74 0.59 0.23
4 CC1(C)OC[C@@H](CCO)O1 28 OECD 301A DOC die away Ready 1 40.6% 0.82 0.66 0.21

Save the results to a csv file

In [12]:
## Write the annotated table to disk. With to_csv's default settings the row index is written
## as the first column; pass index=False if that column is not wanted.
df_0.to_csv("prediction_result.csv")