{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import all the necessary libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import pickle\n", "from rdkit.Chem import AllChem\n", "from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint\n", "from rdkit import DataStructs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Processing the data to be predicted\n", "The code below demonstrates how to process the SMILES strings in an xlsx file. You can download the file above." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SMILESTimeGuidelinePrincipleEndpointReliability
0CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...14OECD 301ADOC die awayReady1
1CC1(C)OC[C@@H](CC=O)O128OECD 301ADOC die awayReady1
2CC1(C)OC[C@@H](CCCO)O128OECD 301ADOC die awayReady1
3CC1(C)OC[C@@H](CCI)O128OECD 301ADOC die awayReady1
4CC1(C)OC[C@@H](CCO)O128OECD 301ADOC die awayReady1
\n", "
" ], "text/plain": [ " SMILES Time Guideline \\\n", "0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 14 OECD 301A \n", "1 CC1(C)OC[C@@H](CC=O)O1 28 OECD 301A \n", "2 CC1(C)OC[C@@H](CCCO)O1 28 OECD 301A \n", "3 CC1(C)OC[C@@H](CCI)O1 28 OECD 301A \n", "4 CC1(C)OC[C@@H](CCO)O1 28 OECD 301A \n", "\n", " Principle Endpoint Reliability \n", "0 DOC die away Ready 1 \n", "1 DOC die away Ready 1 \n", "2 DOC die away Ready 1 \n", "3 DOC die away Ready 1 \n", "4 DOC die away Ready 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "## Read the compounds to be predicted from sheet 'Sheet1' of the example workbook\n",
 "df = pd.read_excel(io=\"example_AB_Regression.xlsx\", sheet_name='Sheet1')\n",
 "## Show the first five rows as a quick check of the loaded columns\n",
 "df.head(n=5)"
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fpTimeGuidelinePrincipleEndpointReliability
0[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...14OECD 301ADOC die awayReady1
1[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...28OECD 301ADOC die awayReady1
2[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...28OECD 301ADOC die awayReady1
3[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...28OECD 301ADOC die awayReady1
4[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...28OECD 301ADOC die awayReady1
\n", "
" ], "text/plain": [ " fp Time Guideline \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 14 OECD 301A \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 OECD 301A \n", "\n", " Principle Endpoint Reliability \n", "0 DOC die away Ready 1 \n", "1 DOC die away Ready 1 \n", "2 DOC die away Ready 1 \n", "3 DOC die away Ready 1 \n", "4 DOC die away Ready 1 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "## Convert SMILES to MACCS molecular fingerprints (the model we will be using was built based on MACCS fingerprints)\n",
 "mols = [AllChem.MolFromSmiles(smiles) for smiles in df['SMILES']]\n",
 "## MolFromSmiles returns None for a string it cannot parse; stop here with a readable message\n",
 "## instead of letting the fingerprint call fail on None\n",
 "invalid_smiles = [smiles for smiles, mol in zip(df['SMILES'], mols) if mol is None]\n",
 "if invalid_smiles:\n",
 "    raise ValueError(f\"{len(invalid_smiles)} SMILES string(s) could not be parsed, e.g. {invalid_smiles[:5]}\")\n",
 "fingerprints = [GetMACCSKeysFingerprint(mol) for mol in mols]\n",
 "## Keep the fingerprint as the first column, followed by the five test-condition columns\n",
 "## (later cells select these columns by position)\n",
 "df = df.assign(fp=fingerprints)[['fp', 'Time', 'Guideline', 'Principle', 'Endpoint', 'Reliability']]\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fpTimeGuidelinePrincipleEndpointReliability
0[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...144001
1[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...284001
2[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...284001
3[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...284001
4[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...284001
\n", "
" ], "text/plain": [ " fp Time Guideline \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 14 4 \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 28 4 \n", "\n", " Principle Endpoint Reliability \n", "0 0 0 1 \n", "1 0 0 1 \n", "2 0 0 1 \n", "3 0 0 1 \n", "4 0 0 1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "## Manually encode the categorical data\n",
 "cat_dict_guideline = {'EU Method C.4-A': 0, 'EU Method C.4-C': 1, 'EU Method C.4-D': 2, 'EU Method C.4-E': 3,\n",
 "                      'OECD 301A': 4, 'OECD 301B': 5, 'OECD 301C': 6,\n",
 "                      'OECD 301D': 7, 'OECD 301E': 8, 'OECD 301F': 9,\n",
 "                      'OECD 302B': 10, 'OECD 302C': 11, 'OECD 310': 12}\n",
 "cat_dict_principle = {'DOC die away': 0, 'CO2 evolution': 1, 'Closed respirometer': 2, 'Closed bottle test': 3}\n",
 "cat_dict_endpoint = {'Ready': 0, 'Inherent': 1}\n",
 "cat_dicts = {'Guideline': cat_dict_guideline, 'Principle': cat_dict_principle, 'Endpoint': cat_dict_endpoint}\n",
 "df = df.replace(cat_dicts)\n",
 "## replace() leaves any label that is missing from the dictionaries untouched, so check for\n",
 "## leftovers here instead of passing un-encoded strings on to the model\n",
 "for column, mapping in cat_dicts.items():\n",
 "    is_known_code = df[column].isin(list(mapping.values()))\n",
 "    unknown = df.loc[df[column].notna() & ~is_known_code, column].unique()\n",
 "    if len(unknown) > 0:\n",
 "        raise ValueError(f\"Unknown {column} value(s): {list(unknown)}; expected one of {list(mapping)}\")\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 1],\n", " [0, 0, 0, ..., 0, 0, 1],\n", " [0, 0, 0, ..., 0, 0, 1],\n", " ...,\n", " [0, 0, 0, ..., 2, 0, 2],\n", " [0, 0, 0, ..., 2, 0, 2],\n", " [0, 0, 0, ..., 2, 0, 2]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "## Obtain the final X_input for the model\n",
 "X_fp = np.array(df.iloc[:, 0])       ## first column: one fingerprint object per row\n",
 "X_other = np.array(df.iloc[:, 1:6])  ## the five columns that follow the fingerprint\n",
 "## Each input row is the fingerprint bits followed by the five remaining values\n",
 "X = np.array([np.array(fp).tolist() + np.array(other).tolist() for fp, other in zip(X_fp, X_other)])\n",
 "X"
] }, {
"cell_type": "markdown", "metadata": {}, "source": [ "# Load the model and perform the prediction" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "## Load the model (you can download this model using the link above)\n",
 "## NOTE: unpickling can execute arbitrary code, so only load a model file obtained from a trusted source\n",
 "with open('AB_XGBRegression_model_21.pkl', 'rb') as model_file:  ## 'with' closes the file after loading\n",
 "    model = pickle.load(model_file)"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "## Perform the prediction (the values are added to the results dataframe as the \"Prediction\" column further below)\n",
 "prediction = model.predict(X)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Calculate the prediction performance\n", "The prediction performance is based on the similarity between the query compound and the dataset used to build the model." ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
 "## Load the data that was used to build the model. It can be downloaded in the \"Dataset\" tab\n",
 "model_data = pd.read_excel('model_data_AB_Regression_21.xlsx', sheet_name='Sheet1')\n",
 "model_mols = [AllChem.MolFromSmiles(smiles) for smiles in model_data['Smiles']]\n",
 "model_fp = [GetMACCSKeysFingerprint(mol) for mol in model_mols]"
] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "def prediction_acc(similarity):\n",
 "    '''Return the expected (R2, RMSE) of a prediction for a given similarity score.\n",
 "\n",
 "    The prediction performance is based on the similarity score.\n",
 "    For example, during the model development, chemicals with a similarity score of >=0.9 with each other\n",
 "    demonstrated an R2 of 0.79 and RMSE of 0.14 between the predicted and true values.\n",
 "    Scores below 0.5 return ('Out of AD', 'Out of AD').\n",
 "    '''\n",
 "    ## (lower bound of the similarity bin, R2, RMSE), checked from the highest bin down\n",
 "    performance_bins = [(0.9, 0.79, 0.14),\n",
 "                        (0.8, 0.66, 0.21),\n",
 "                        (0.7, 0.59, 0.23),\n",
 "                        (0.6, 0.44, 0.26),\n",
 "                        (0.5, 0.49, 0.26)]\n",
 "    for lower_bound, R2, RMSE in performance_bins:\n",
 "        if similarity >= lower_bound:\n",
 "            return R2, RMSE\n",
 "    return 'Out of AD', 'Out of AD'"
] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
 "similarity_list = []\n",
 "R2_list = []\n",
 "RMSE_list = []\n",
 "for fp in df['fp']:\n",
 "    similarities = DataStructs.BulkTanimotoSimilarity(fp, model_fp)  ## Compare the query compound with all the model data\n",
 "    similarity = round(max(similarities), 2)  ## Only the highest similarity is used\n",
 "    R2, RMSE = prediction_acc(similarity)\n",
 "    similarity_list.append(similarity)\n",
 "    R2_list.append(R2)\n",
 "    RMSE_list.append(RMSE)"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SMILESTimeGuidelinePrincipleEndpointReliabilityPredictionSimilarityExpected prediction R2Expected prediction RMSE
0CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc...14OECD 301ADOC die awayReady140.6%0.640.440.26
1CC1(C)OC[C@@H](CC=O)O128OECD 301ADOC die awayReady152.5%0.830.660.21
2CC1(C)OC[C@@H](CCCO)O128OECD 301ADOC die awayReady148.9%0.840.660.21
3CC1(C)OC[C@@H](CCI)O128OECD 301ADOC die awayReady131.5%0.740.590.23
4CC1(C)OC[C@@H](CCO)O128OECD 301ADOC die awayReady140.6%0.820.660.21
\n", "
" ], "text/plain": [ " SMILES Time Guideline \\\n", "0 CC1(C)OC[C@@H](CC(OC(=O)[O-])C2c3ccccc3-c3cccc... 14 OECD 301A \n", "1 CC1(C)OC[C@@H](CC=O)O1 28 OECD 301A \n", "2 CC1(C)OC[C@@H](CCCO)O1 28 OECD 301A \n", "3 CC1(C)OC[C@@H](CCI)O1 28 OECD 301A \n", "4 CC1(C)OC[C@@H](CCO)O1 28 OECD 301A \n", "\n", " Principle Endpoint Reliability Prediction Similarity \\\n", "0 DOC die away Ready 1 40.6% 0.64 \n", "1 DOC die away Ready 1 52.5% 0.83 \n", "2 DOC die away Ready 1 48.9% 0.84 \n", "3 DOC die away Ready 1 31.5% 0.74 \n", "4 DOC die away Ready 1 40.6% 0.82 \n", "\n", " Expected prediction R2 Expected prediction RMSE \n", "0 0.44 0.26 \n", "1 0.66 0.21 \n", "2 0.66 0.21 \n", "3 0.59 0.23 \n", "4 0.66 0.21 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "## Re-read the input workbook and attach the prediction, similarity and expected-performance columns\n",
 "df_0 = pd.read_excel(\"example_AB_Regression.xlsx\", sheet_name='Sheet1')\n",
 "df_0 = df_0.assign(**{\n",
 "    'Prediction': list(map('{:.1%}'.format, prediction)),  ## shown as a percentage with one decimal\n",
 "    'Similarity': similarity_list,\n",
 "    'Expected prediction R2': R2_list,\n",
 "    'Expected prediction RMSE': RMSE_list,\n",
 "})\n",
 "df_0.head()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Save the results to a csv file" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "## Write the annotated table next to the notebook\n",
 "df_0.to_csv(path_or_buf=\"prediction_result.csv\")"
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }