{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import all the necessary libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from rdkit.Chem import AllChem\n",
    "from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint\n",
    "from rdkit import DataStructs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Processing the data to be predicted\n",
    "Below codes demonstrate how to process the smiles strings in an xlsx file. You can download the file above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SMILES</th>\n",
       "      <th>Time</th>\n",
       "      <th>Guideline</th>\n",
       "      <th>Principle</th>\n",
       "      <th>Endpoint</th>\n",
       "      <th>Reliability</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...</td>\n",
       "      <td>14</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CC1(C)OC[C@@H](CC=O)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC1(C)OC[C@@H](CCCO)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CC1(C)OC[C@@H](CCI)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CC1(C)OC[C@@H](CCO)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              SMILES  Time  Guideline  \\\n",
       "0  CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...    14  OECD 301A   \n",
       "1                             CC1(C)OC[C@@H](CC=O)O1    28  OECD 301A   \n",
       "2                             CC1(C)OC[C@@H](CCCO)O1    28  OECD 301A   \n",
       "3                              CC1(C)OC[C@@H](CCI)O1    28  OECD 301A   \n",
       "4                              CC1(C)OC[C@@H](CCO)O1    28  OECD 301A   \n",
       "\n",
       "      Principle Endpoint  Reliability  \n",
       "0  DOC die away    Ready            1  \n",
       "1  DOC die away    Ready            1  \n",
       "2  DOC die away    Ready            1  \n",
       "3  DOC die away    Ready            1  \n",
       "4  DOC die away    Ready            1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Load the file\n",
    "df = pd.read_excel(\"example_AB_Regression.xlsx\", sheet_name='Sheet1')\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fp</th>\n",
       "      <th>Time</th>\n",
       "      <th>Guideline</th>\n",
       "      <th>Principle</th>\n",
       "      <th>Endpoint</th>\n",
       "      <th>Reliability</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>14</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  fp  Time  Guideline  \\\n",
       "0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    14  OECD 301A   \n",
       "1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28  OECD 301A   \n",
       "2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28  OECD 301A   \n",
       "3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28  OECD 301A   \n",
       "4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28  OECD 301A   \n",
       "\n",
       "      Principle Endpoint  Reliability  \n",
       "0  DOC die away    Ready            1  \n",
       "1  DOC die away    Ready            1  \n",
       "2  DOC die away    Ready            1  \n",
       "3  DOC die away    Ready            1  \n",
       "4  DOC die away    Ready            1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Convert smiles to MACCS molecular fingerprint (the model we will be using was built based on MACCS fingerprints)\n",
    "df['mol'] = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]\n",
    "df['fp'] = [GetMACCSKeysFingerprint(mol) for mol in df['mol']]\n",
    "df = pd.concat([df['fp'], df['Time'], df['Guideline'], df['Principle'], df['Endpoint'], df['Reliability']], axis=1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fp</th>\n",
       "      <th>Time</th>\n",
       "      <th>Guideline</th>\n",
       "      <th>Principle</th>\n",
       "      <th>Endpoint</th>\n",
       "      <th>Reliability</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>14</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...</td>\n",
       "      <td>28</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  fp  Time  Guideline  \\\n",
       "0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    14          4   \n",
       "1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28          4   \n",
       "2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28          4   \n",
       "3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28          4   \n",
       "4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...    28          4   \n",
       "\n",
       "   Principle  Endpoint  Reliability  \n",
       "0          0         0            1  \n",
       "1          0         0            1  \n",
       "2          0         0            1  \n",
       "3          0         0            1  \n",
       "4          0         0            1  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Mannually encode the categorical data\n",
    "cat_dict_guideline = {'EU Method C.4-A': 0, 'EU Method C.4-C': 1, 'EU Method C.4-D': 2, 'EU Method C.4-E': 3,\n",
    "                      'OECD 301A': 4, 'OECD 301B': 5, 'OECD 301C': 6,\n",
    "                      'OECD 301D': 7, 'OECD 301E': 8, 'OECD 301F': 9,\n",
    "                      'OECD 302B': 10, 'OECD 302C': 11, 'OECD 310': 12}\n",
    "cat_dict_principle = {'DOC die away': 0, 'CO2 evolution': 1, 'Closed respirometer': 2, 'Closed bottle test': 3}\n",
    "cat_dict_endpoint = {'Ready': 0, 'Inherent': 1}\n",
    "df = df.replace({'Guideline': cat_dict_guideline, 'Principle': cat_dict_principle, 'Endpoint': cat_dict_endpoint})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       [0, 0, 0, ..., 0, 0, 1],\n",
       "       ...,\n",
       "       [0, 0, 0, ..., 2, 0, 2],\n",
       "       [0, 0, 0, ..., 2, 0, 2],\n",
       "       [0, 0, 0, ..., 2, 0, 2]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Obtain the final X_input for the model\n",
    "X = []\n",
    "X_fp = np.array(df.iloc[:, 0])\n",
    "X_other = np.array(df.iloc[:, 1:6])\n",
    "for i in range(len(df)):\n",
    "    record_fp = np.array(X_fp[i]).tolist()\n",
    "    other = np.array(X_other[i]).tolist()\n",
    "    for item in other:\n",
    "        record_fp.append(item)  ## Append each categorical data into fp\n",
    "    X.append(record_fp)\n",
    "X = np.array(X)\n",
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the model and perform the prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load the model (you can download this model use the link above)\n",
    "model = pickle.load(open('AB_XGBRegression_model_21.pkl', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Perform the prediction and save the results to a column named \"Prediction\" in the orginal dataframe\n",
    "prediction = model.predict(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate the prediction performance\n",
    "The prediction performance is based on the similarity between the query compound and the dataset used to build the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Load the data that was used to build the model. It can be downloaded in the \"Dataset\" tab\n",
    "model_data = pd.read_excel('model_data_AB_Regression_21.xlsx', sheet_name='Sheet1')\n",
    "model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]\n",
    "model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''The prediction performance is based on the similarity score. \n",
    "For example, during the model development, chemicals with a similarity score of >=0.9 with each other \n",
    "demonstrated an R2 or 0.79 and RMSE of 0.14 between the predicted and true values.'''\n",
    "def prediction_acc(similarity):\n",
    "    if similarity >= 0.9:\n",
    "        R2 = 0.79\n",
    "        RMSE = 0.14\n",
    "    elif 0.8 <= similarity <= 0.9:\n",
    "        R2 = 0.66\n",
    "        RMSE = 0.21\n",
    "    elif 0.7 <= similarity <= 0.8:\n",
    "        R2 = 0.59\n",
    "        RMSE = 0.23\n",
    "    elif 0.6 <= similarity <= 0.7:\n",
    "        R2 = 0.44\n",
    "        RMSE = 0.26\n",
    "    elif 0.5 <= similarity <= 0.6:\n",
    "        R2 = 0.49\n",
    "        RMSE = 0.26\n",
    "    else:\n",
    "        R2 = 'Out of AD'\n",
    "        RMSE = 'Out of AD'\n",
    "    return R2, RMSE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "similarity_list = []\n",
    "R2_list = []\n",
    "RMSE_list = []\n",
    "for fp in df['fp']:\n",
    "    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp) ## Compare the query compound with all the model data\n",
    "    similarities.sort()\n",
    "    similarity = round(similarities[-1], 2)\n",
    "    R2, RMSE = prediction_acc(similarity)\n",
    "    similarity_list.append(similarity)\n",
    "    R2_list.append(R2)\n",
    "    RMSE_list.append(RMSE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SMILES</th>\n",
       "      <th>Time</th>\n",
       "      <th>Guideline</th>\n",
       "      <th>Principle</th>\n",
       "      <th>Endpoint</th>\n",
       "      <th>Reliability</th>\n",
       "      <th>Prediction</th>\n",
       "      <th>Similarity</th>\n",
       "      <th>Expected prediction R2</th>\n",
       "      <th>Expected prediction RMSE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...</td>\n",
       "      <td>14</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "      <td>40.6%</td>\n",
       "      <td>0.64</td>\n",
       "      <td>0.44</td>\n",
       "      <td>0.26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CC1(C)OC[C@@H](CC=O)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "      <td>52.5%</td>\n",
       "      <td>0.83</td>\n",
       "      <td>0.66</td>\n",
       "      <td>0.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC1(C)OC[C@@H](CCCO)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "      <td>48.9%</td>\n",
       "      <td>0.84</td>\n",
       "      <td>0.66</td>\n",
       "      <td>0.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CC1(C)OC[C@@H](CCI)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "      <td>31.5%</td>\n",
       "      <td>0.74</td>\n",
       "      <td>0.59</td>\n",
       "      <td>0.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CC1(C)OC[C@@H](CCO)O1</td>\n",
       "      <td>28</td>\n",
       "      <td>OECD 301A</td>\n",
       "      <td>DOC die away</td>\n",
       "      <td>Ready</td>\n",
       "      <td>1</td>\n",
       "      <td>40.6%</td>\n",
       "      <td>0.82</td>\n",
       "      <td>0.66</td>\n",
       "      <td>0.21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              SMILES  Time  Guideline  \\\n",
       "0  CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...    14  OECD 301A   \n",
       "1                             CC1(C)OC[C@@H](CC=O)O1    28  OECD 301A   \n",
       "2                             CC1(C)OC[C@@H](CCCO)O1    28  OECD 301A   \n",
       "3                              CC1(C)OC[C@@H](CCI)O1    28  OECD 301A   \n",
       "4                              CC1(C)OC[C@@H](CCO)O1    28  OECD 301A   \n",
       "\n",
       "      Principle Endpoint  Reliability Prediction  Similarity  \\\n",
       "0  DOC die away    Ready            1      40.6%        0.64   \n",
       "1  DOC die away    Ready            1      52.5%        0.83   \n",
       "2  DOC die away    Ready            1      48.9%        0.84   \n",
       "3  DOC die away    Ready            1      31.5%        0.74   \n",
       "4  DOC die away    Ready            1      40.6%        0.82   \n",
       "\n",
       "   Expected prediction R2  Expected prediction RMSE  \n",
       "0                    0.44                      0.26  \n",
       "1                    0.66                      0.21  \n",
       "2                    0.66                      0.21  \n",
       "3                    0.59                      0.23  \n",
       "4                    0.66                      0.21  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Add the similarity and accuracy scores to the dataframe\n",
    "df_0 = pd.read_excel(\"example_AB_Regression.xlsx\", sheet_name='Sheet1')\n",
    "df_0['Prediction'] = ['{:.1%}'.format(i) for i in prediction]\n",
    "df_0['Similarity'] = similarity_list\n",
    "df_0['Expected prediction R2'] = R2_list\n",
    "df_0['Expected prediction RMSE'] = RMSE_list\n",
    "df_0.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save the results to a csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_0.to_csv(\"prediction_result.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}