{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import all the necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from rdkit.Chem import AllChem\n",
    "from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint\n",
    "from rdkit import DataStructs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing the data to be predicted\n",
    "Below codes demonstrate how to process the smiles strings in an xlsx file. You can download the file above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SMILES</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CC1(C)OC[C@@H](CC=O)O1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC1(C)OC[C@@H](CCCO)O1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CC1(C)OC[C@@H](CCI)O1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CC1(C)OC[C@@H](CCO)O1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              SMILES\n",
       "0  CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...\n",
       "1                             CC1(C)OC[C@@H](CC=O)O1\n",
       "2                             CC1(C)OC[C@@H](CCCO)O1\n",
       "3                              CC1(C)OC[C@@H](CCI)O1\n",
       "4                              CC1(C)OC[C@@H](CCO)O1"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Load the file\n",
    "df = pd.read_excel(\"example.xlsx\", sheet_name='Sheet1')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Convert smiles to MACCS molecular fingerprint (the model we will be using was built based on MACCS fingerprints)\n",
    "mols = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]\n",
    "df_fp = [GetMACCSKeysFingerprint(mol) for mol in mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 1, 1, 0],\n",
       "       [0, 0, 0, ..., 1, 1, 0],\n",
       "       [0, 0, 0, ..., 1, 1, 0],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 1, 1, 0],\n",
       "       [0, 0, 0, ..., 1, 1, 0],\n",
       "       [0, 0, 0, ..., 1, 1, 0]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Convert the fingerprint list to a numpy array, so that the model can read them as the input with the correct shape\n",
    "X = np.array(df_fp)\n",
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the model and perform the prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load the model (you can download this model use the link above)\n",
    "model = pickle.load(open('AB_XGBClassifier_model_21.pkl', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SMILES</th>\n",
       "      <th>Prediction</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CC1(C)OC[C@@H](CC=O)O1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC1(C)OC[C@@H](CCCO)O1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CC1(C)OC[C@@H](CCI)O1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CC1(C)OC[C@@H](CCO)O1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              SMILES  Prediction\n",
       "0  CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...           0\n",
       "1                             CC1(C)OC[C@@H](CC=O)O1           0\n",
       "2                             CC1(C)OC[C@@H](CCCO)O1           0\n",
       "3                              CC1(C)OC[C@@H](CCI)O1           0\n",
       "4                              CC1(C)OC[C@@H](CCO)O1           0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Below two lines of codes are to ignore the futurewarning raised by sklearn due to the version reasons\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "\n",
    "## Perform the prediction and save the results to a column named \"Prediction\" in the orginal dataframe\n",
    "df['Prediction'] = model.predict(X)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate the prediction accuracy\n",
    "The prediction accuracy is based on the similarity between the query compound and the dataset used to build the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load the data that was used to build the model. It can be downloaded in the \"Dataset\" tab\n",
    "model_data = pd.read_excel('model_data_AB_Classification_21.xlsx', sheet_name='Sheet1')\n",
    "model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]\n",
    "model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''The prediction accuracy is based on the similarity score. \n",
    "For example, during the model development, chemicals with a similarity score of >=0.9 with each other \n",
    "demonstrated a model perdiction accuracy of 0.886.'''\n",
    "def prediction_acc(similarity):\n",
    "    if similarity >= 0.9:\n",
    "        accuracy = 0.886\n",
    "    elif 0.8 <= similarity <= 0.9:\n",
    "        accuracy = 0.827\n",
    "    elif 0.7 <= similarity <= 0.8:\n",
    "        accuracy = 0.862\n",
    "    elif 0.6 <= similarity <= 0.7:\n",
    "        accuracy = 0.800\n",
    "    elif 0.5 <= similarity <= 0.6:\n",
    "        accuracy = 0.732\n",
    "    else:\n",
    "        accuracy = '-'\n",
    "    return accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "similarity_list = []\n",
    "accuracy_list = []\n",
    "for fp in df_fp:\n",
    "    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data\n",
    "    similarities.sort()\n",
    "    similarity = round(similarities[-1], 2) ## Get the largest similarity score and round to two decimal points\n",
    "    accuracy = prediction_acc(similarity)\n",
    "    similarity_list.append(similarity)\n",
    "    accuracy_list.append(accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SMILES</th>\n",
       "      <th>Prediction</th>\n",
       "      <th>Similarity</th>\n",
       "      <th>Accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CC1(C)OC[C@@H](CC=O)O1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.83</td>\n",
       "      <td>0.827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC1(C)OC[C@@H](CCCO)O1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.84</td>\n",
       "      <td>0.827</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CC1(C)OC[C@@H](CCI)O1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.74</td>\n",
       "      <td>0.862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CC1(C)OC[C@@H](CCO)O1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.827</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              SMILES  Prediction  Similarity  \\\n",
       "0  CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...           0        0.63   \n",
       "1                             CC1(C)OC[C@@H](CC=O)O1           0        0.83   \n",
       "2                             CC1(C)OC[C@@H](CCCO)O1           0        0.84   \n",
       "3                              CC1(C)OC[C@@H](CCI)O1           0        0.74   \n",
       "4                              CC1(C)OC[C@@H](CCO)O1           0        0.82   \n",
       "\n",
       "   Accuracy  \n",
       "0     0.800  \n",
       "1     0.827  \n",
       "2     0.827  \n",
       "3     0.862  \n",
       "4     0.827  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Add the similarity and accuracy scores to the dataframe\n",
    "df['Similarity'] = similarity_list\n",
    "df['Accuracy'] = accuracy_list\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save the results to a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(\"prediction_result.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}