{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import all the necessary libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import pickle\n",
"from rdkit.Chem import AllChem\n",
"from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint\n",
"from rdkit import DataStructs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Processing the data to be predicted\n",
"Below codes demonstrate how to process the smiles strings in an xlsx file. You can download the file above."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SMILES | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... | \n",
"
\n",
" \n",
" 1 | \n",
" CC1(C)OC[C@@H](CC=O)O1 | \n",
"
\n",
" \n",
" 2 | \n",
" CC1(C)OC[C@@H](CCCO)O1 | \n",
"
\n",
" \n",
" 3 | \n",
" CC1(C)OC[C@@H](CCI)O1 | \n",
"
\n",
" \n",
" 4 | \n",
" CC1(C)OC[C@@H](CCO)O1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SMILES\n",
"0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...\n",
"1 CC1(C)OC[C@@H](CC=O)O1\n",
"2 CC1(C)OC[C@@H](CCCO)O1\n",
"3 CC1(C)OC[C@@H](CCI)O1\n",
"4 CC1(C)OC[C@@H](CCO)O1"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Load the file\n",
"df = pd.read_excel(\"example.xlsx\", sheet_name='Sheet1')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"## Convert smiles to MACCS molecular fingerprint (the model we will be using was built based on MACCS fingerprints)\n",
"mols = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]\n",
"df_fp = [GetMACCSKeysFingerprint(mol) for mol in mols]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 0, ..., 1, 1, 0],\n",
" [0, 0, 0, ..., 1, 1, 0],\n",
" [0, 0, 0, ..., 1, 1, 0],\n",
" ...,\n",
" [0, 0, 0, ..., 1, 1, 0],\n",
" [0, 0, 0, ..., 1, 1, 0],\n",
" [0, 0, 0, ..., 1, 1, 0]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Convert the fingerprint list to a numpy array, so that the model can read them as the input with the correct shape\n",
"X = np.array(df_fp)\n",
"X"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load the model and perform the prediction"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"## Load the model (you can download this model use the link above)\n",
"model = pickle.load(open('AB_XGBClassifier_model_21.pkl', 'rb'))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SMILES | \n",
" Prediction | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" CC1(C)OC[C@@H](CC=O)O1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" CC1(C)OC[C@@H](CCCO)O1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" CC1(C)OC[C@@H](CCI)O1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" CC1(C)OC[C@@H](CCO)O1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SMILES Prediction\n",
"0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0\n",
"1 CC1(C)OC[C@@H](CC=O)O1 0\n",
"2 CC1(C)OC[C@@H](CCCO)O1 0\n",
"3 CC1(C)OC[C@@H](CCI)O1 0\n",
"4 CC1(C)OC[C@@H](CCO)O1 0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Below two lines of codes are to ignore the futurewarning raised by sklearn due to the version reasons\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"\n",
"## Perform the prediction and save the results to a column named \"Prediction\" in the orginal dataframe\n",
"df['Prediction'] = model.predict(X)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Calculate the prediction accuracy\n",
"The prediction accuracy is based on the similarity between the query compound and the dataset used to build the model."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"## Load the data that was used to build the model. It can be downloaded in the \"Dataset\" tab\n",
"model_data = pd.read_excel('model_data_AB_Classification_21.xlsx', sheet_name='Sheet1')\n",
"model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]\n",
"model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"'''The prediction accuracy is based on the similarity score. \n",
"For example, during the model development, chemicals with a similarity score of >=0.9 with each other \n",
"demonstrated a model perdiction accuracy of 0.886.'''\n",
"def prediction_acc(similarity):\n",
" if similarity >= 0.9:\n",
" accuracy = 0.886\n",
" elif 0.8 <= similarity <= 0.9:\n",
" accuracy = 0.827\n",
" elif 0.7 <= similarity <= 0.8:\n",
" accuracy = 0.862\n",
" elif 0.6 <= similarity <= 0.7:\n",
" accuracy = 0.800\n",
" elif 0.5 <= similarity <= 0.6:\n",
" accuracy = 0.732\n",
" else:\n",
" accuracy = '-'\n",
" return accuracy"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"similarity_list = []\n",
"accuracy_list = []\n",
"for fp in df_fp:\n",
" similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data\n",
" similarities.sort()\n",
" similarity = round(similarities[-1], 2) ## Get the largest similarity score and round to two decimal points\n",
" accuracy = prediction_acc(similarity)\n",
" similarity_list.append(similarity)\n",
" accuracy_list.append(accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" SMILES | \n",
" Prediction | \n",
" Similarity | \n",
" Accuracy | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... | \n",
" 0 | \n",
" 0.63 | \n",
" 0.800 | \n",
"
\n",
" \n",
" 1 | \n",
" CC1(C)OC[C@@H](CC=O)O1 | \n",
" 0 | \n",
" 0.83 | \n",
" 0.827 | \n",
"
\n",
" \n",
" 2 | \n",
" CC1(C)OC[C@@H](CCCO)O1 | \n",
" 0 | \n",
" 0.84 | \n",
" 0.827 | \n",
"
\n",
" \n",
" 3 | \n",
" CC1(C)OC[C@@H](CCI)O1 | \n",
" 0 | \n",
" 0.74 | \n",
" 0.862 | \n",
"
\n",
" \n",
" 4 | \n",
" CC1(C)OC[C@@H](CCO)O1 | \n",
" 0 | \n",
" 0.82 | \n",
" 0.827 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" SMILES Prediction Similarity \\\n",
"0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0 0.63 \n",
"1 CC1(C)OC[C@@H](CC=O)O1 0 0.83 \n",
"2 CC1(C)OC[C@@H](CCCO)O1 0 0.84 \n",
"3 CC1(C)OC[C@@H](CCI)O1 0 0.74 \n",
"4 CC1(C)OC[C@@H](CCO)O1 0 0.82 \n",
"\n",
" Accuracy \n",
"0 0.800 \n",
"1 0.827 \n",
"2 0.827 \n",
"3 0.862 \n",
"4 0.827 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Add the similarity and accuracy scores to the dataframe\n",
"df['Similarity'] = similarity_list\n",
"df['Accuracy'] = accuracy_list\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Save the results to a csv file"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"prediction_result.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}