{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import all the necessary libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import pickle\n", "from rdkit.Chem import AllChem\n", "from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint\n", "from rdkit import DataStructs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Processing the data to be predicted\n", "The code below demonstrates how to process the SMILES strings in an xlsx file. You can download the file above." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SMILES
0CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...
1CC1(C)OC[C@@H](CC=O)O1
2CC1(C)OC[C@@H](CCCO)O1
3CC1(C)OC[C@@H](CCI)O1
4CC1(C)OC[C@@H](CCO)O1
\n", "
" ], "text/plain": [ " SMILES\n", "0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...\n", "1 CC1(C)OC[C@@H](CC=O)O1\n", "2 CC1(C)OC[C@@H](CCCO)O1\n", "3 CC1(C)OC[C@@H](CCI)O1\n", "4 CC1(C)OC[C@@H](CCO)O1" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Load the file\n", "df = pd.read_excel(\"example.xlsx\", sheet_name='Sheet1')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "## Convert SMILES strings to MACCS molecular fingerprints (the model we will be using was built based on MACCS fingerprints)\n", "mols = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]\n", "df_fp = [GetMACCSKeysFingerprint(mol) for mol in mols]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 1, 1, 0],\n", " [0, 0, 0, ..., 1, 1, 0],\n", " [0, 0, 0, ..., 1, 1, 0],\n", " ...,\n", " [0, 0, 0, ..., 1, 1, 0],\n", " [0, 0, 0, ..., 1, 1, 0],\n", " [0, 0, 0, ..., 1, 1, 0]])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Convert the fingerprint list to a numpy array, so that the model can read them as the input with the correct shape\n", "X = np.array(df_fp)\n", "X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load the model and perform the prediction" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "## Load the model (you can download this model using the link above)\n", "model = pickle.load(open('AB_XGBClassifier_model_21.pkl', 'rb'))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SMILESPrediction
0CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...0
1CC1(C)OC[C@@H](CC=O)O10
2CC1(C)OC[C@@H](CCCO)O10
3CC1(C)OC[C@@H](CCI)O10
4CC1(C)OC[C@@H](CCO)O10
\n", "
" ], "text/plain": [ " SMILES Prediction\n", "0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0\n", "1 CC1(C)OC[C@@H](CC=O)O1 0\n", "2 CC1(C)OC[C@@H](CCCO)O1 0\n", "3 CC1(C)OC[C@@H](CCI)O1 0\n", "4 CC1(C)OC[C@@H](CCO)O1 0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## The two lines below suppress the FutureWarning raised by sklearn due to version differences\n", "import warnings\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "## Perform the prediction and save the results to a column named \"Prediction\" in the original dataframe\n", "df['Prediction'] = model.predict(X)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Calculate the prediction accuracy\n", "The prediction accuracy is based on the similarity between the query compound and its most similar compound in the dataset used to build the model." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "## Load the data that was used to build the model. It can be downloaded from the \"Dataset\" tab\n", "model_data = pd.read_excel('model_data_AB_Classification_21.xlsx', sheet_name='Sheet1')\n", "model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]\n", "model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "'''The prediction accuracy is based on the similarity score. 
\n", "For example, during the model development, chemicals with a similarity score of >=0.9 with each other \n", "demonstrated a model prediction accuracy of 0.886.'''\n", "def prediction_acc(similarity):\n", " if similarity >= 0.9:\n", " accuracy = 0.886\n", " elif 0.8 <= similarity <= 0.9:\n", " accuracy = 0.827\n", " elif 0.7 <= similarity <= 0.8:\n", " accuracy = 0.862\n", " elif 0.6 <= similarity <= 0.7:\n", " accuracy = 0.800\n", " elif 0.5 <= similarity <= 0.6:\n", " accuracy = 0.732\n", " else:\n", " accuracy = '-'\n", " return accuracy" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "similarity_list = []\n", "accuracy_list = []\n", "for fp in df_fp:\n", " similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data\n", " similarities.sort()\n", " similarity = round(similarities[-1], 2) ## Get the largest similarity score and round to two decimal places\n", " accuracy = prediction_acc(similarity)\n", " similarity_list.append(similarity)\n", " accuracy_list.append(accuracy)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SMILESPredictionSimilarityAccuracy
0CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...00.630.800
1CC1(C)OC[C@@H](CC=O)O100.830.827
2CC1(C)OC[C@@H](CCCO)O100.840.827
3CC1(C)OC[C@@H](CCI)O100.740.862
4CC1(C)OC[C@@H](CCO)O100.820.827
\n", "
" ], "text/plain": [ " SMILES Prediction Similarity \\\n", "0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 0 0.63 \n", "1 CC1(C)OC[C@@H](CC=O)O1 0 0.83 \n", "2 CC1(C)OC[C@@H](CCCO)O1 0 0.84 \n", "3 CC1(C)OC[C@@H](CCI)O1 0 0.74 \n", "4 CC1(C)OC[C@@H](CCO)O1 0 0.82 \n", "\n", " Accuracy \n", "0 0.800 \n", "1 0.827 \n", "2 0.827 \n", "3 0.862 \n", "4 0.827 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Add the similarity and accuracy scores to the dataframe\n", "df['Similarity'] = similarity_list\n", "df['Accuracy'] = accuracy_list\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save the results to a csv file" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"prediction_result.csv\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }