Import all the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit import DataStructs

Processing the data to be predicted

The code below demonstrates how to process the SMILES strings stored in an xlsx file. You can download the file above.

In [2]:
## Load the query compounds from the "Sheet1" worksheet of the example workbook.
## The sheet must contain a "SMILES" column, which the fingerprinting step reads.
df = pd.read_excel("example.xlsx", sheet_name='Sheet1')
df.head()
Out[2]:
SMILES
0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...
1 CC1(C)OC[C@@H](CC=O)O1
2 CC1(C)OC[C@@H](CCCO)O1
3 CC1(C)OC[C@@H](CCI)O1
4 CC1(C)OC[C@@H](CCO)O1
In [3]:
## Convert SMILES to MACCS molecular fingerprints (the model we will be using was built on MACCS fingerprints)
## Non-string cells (e.g. empty spreadsheet cells read as NaN) are treated as unparseable.
mols = [AllChem.MolFromSmiles(smiles) if isinstance(smiles, str) else None for smiles in df['SMILES']]

## MolFromSmiles returns None instead of raising when it cannot parse a string.
## Report the offending rows here rather than letting GetMACCSKeysFingerprint fail
## later with an opaque argument-type error that does not say which row is bad.
invalid_rows = [(idx, smiles) for (idx, smiles), mol in zip(df['SMILES'].items(), mols) if mol is None]
if invalid_rows:
    raise ValueError(
        "Could not parse {} SMILES value(s); first offending (row index, value) pairs: {}".format(
            len(invalid_rows), invalid_rows[:10]
        )
    )

df_fp = [GetMACCSKeysFingerprint(mol) for mol in mols]
In [4]:
## Convert the list of fingerprints to a numpy array (one row per compound),
## so that the model can read it as input with the correct shape
X = np.array(df_fp)
X
Out[4]:
array([[0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]])

Load the model and perform the prediction

In [5]:
## Load the model (you can download this model using the link above).
## NOTE: unpickling can execute arbitrary code, so only load a model file obtained from a trusted source.
## The context manager closes the file handle as soon as the model has been read.
with open('AB_XGBClassifier_model_21.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
In [6]:
## The next two lines silence the FutureWarning raised by sklearn because of library version differences.
## The filter is installed globally, so it stays in effect for every cell executed after this one.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Perform the prediction and save the results to a column named "Prediction" in the original dataframe
df['Prediction'] = model.predict(X)
df.head()
Out[6]:
SMILES Prediction
0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0
1 CC1(C)OC[C@@H](CC=O)O1 0
2 CC1(C)OC[C@@H](CCCO)O1 0
3 CC1(C)OC[C@@H](CCI)O1 0
4 CC1(C)OC[C@@H](CCO)O1 0

Calculate the prediction accuracy

The prediction accuracy is based on the similarity between the query compound and the dataset used to build the model.

In [7]:
## Load the data that was used to build the model. It can be downloaded from the "Dataset" tab.
## Note that the structure column in this file is named "Smiles", not "SMILES" as in the query file.
model_data = pd.read_excel('model_data_AB_Classification_21.xlsx', sheet_name='Sheet1')
## Compute MACCS fingerprints for the model compounds, the same fingerprint type used for the
## query compounds, so that the two sets can be compared with each other.
## NOTE(review): presumably every entry parses; an unparseable SMILES would make the fingerprint call fail - confirm.
model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]
model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]
In [8]:
def prediction_acc(similarity):
    '''Map a similarity score to the prediction accuracy observed for that similarity band.

    The prediction accuracy is based on the similarity score.
    For example, during the model development, chemicals with a similarity score of >=0.9 with each other
    demonstrated a model prediction accuracy of 0.886.

    Parameters
    ----------
    similarity : float
        Similarity score of the query compound.

    Returns
    -------
    float or str
        The accuracy of the highest band whose lower bound the score reaches,
        or the placeholder '-' when the score is below 0.5.
    '''
    # (lower bound, accuracy) pairs ordered from the highest band down;
    # the first lower bound that the score reaches decides the result.
    accuracy_by_lower_bound = (
        (0.9, 0.886),
        (0.8, 0.827),
        (0.7, 0.862),
        (0.6, 0.800),
        (0.5, 0.732),
    )
    for lower_bound, band_accuracy in accuracy_by_lower_bound:
        if similarity >= lower_bound:
            return band_accuracy
    # No band applies (score below 0.5)
    return '-'
In [9]:
## For every query compound, find its closest match in the model data and look up
## the prediction accuracy that corresponds to that similarity score.
similarity_list = []
accuracy_list = []
for fp in df_fp:
    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data
    ## max() finds the largest score in a single pass; sorting the whole list is not needed.
    ## The score is rounded to two decimal points before the accuracy lookup.
    similarity = round(max(similarities), 2)
    accuracy = prediction_acc(similarity)
    similarity_list.append(similarity)
    accuracy_list.append(accuracy)
In [10]:
## Add the similarity and accuracy scores to the dataframe.
## "Accuracy" holds a number when prediction_acc found a matching similarity band and the
## placeholder '-' otherwise, so the column can contain mixed types.
df['Similarity'] = similarity_list
df['Accuracy'] = accuracy_list
df.head()
Out[10]:
SMILES Prediction Similarity Accuracy
0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0 0.63 0.800
1 CC1(C)OC[C@@H](CC=O)O1 0 0.83 0.827
2 CC1(C)OC[C@@H](CCCO)O1 0 0.84 0.827
3 CC1(C)OC[C@@H](CCI)O1 0 0.74 0.862
4 CC1(C)OC[C@@H](CCO)O1 0 0.82 0.827

Save the results to a csv file

In [11]:
## Write the dataframe, including the added Prediction, Similarity and Accuracy columns, to a csv file
df.to_csv("prediction_result.csv")