From 6104741564df417ab59457ad16270bb996ab51cc Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Wed, 12 Feb 2025 19:46:28 +0100 Subject: [PATCH 1/8] feat: adaptive ms2 frag types --- nbs_trials/adapt_charged_fragtypes.ipynb | 1446 ++++++++++++++++++++++ peptdeep/model/ms2.py | 233 +++- 2 files changed, 1631 insertions(+), 48 deletions(-) create mode 100644 nbs_trials/adapt_charged_fragtypes.ipynb diff --git a/nbs_trials/adapt_charged_fragtypes.ipynb b/nbs_trials/adapt_charged_fragtypes.ipynb new file mode 100644 index 00000000..083d46a5 --- /dev/null +++ b/nbs_trials/adapt_charged_fragtypes.ipynb @@ -0,0 +1,1446 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "from peptdeep.model.ms2 import pDeepModel, normalize_fragment_intensities\n", + "from peptdeep.model.rt import IRT_PEPTIDE_DF\n", + "from alphabase.spectral_library.flat import SpecLibFlat\n", + "import numpy as np\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sequencepep_nameirtmodsmod_sitesnAAchargefrag_start_idxfrag_stop_idxnceinstrument
0LGGNEQVTRRT-pep a-24.92920830Lumos
3YILAGVENSKRT-pep d19.7910281730Lumos
4TPVISGGPYEYRRT-pep e28.71122172830Lumos
5TPVITGAPYEYRRT-pep f33.38122283930Lumos
8GTFIIDPGGVIRRT-pep i70.52122395030Lumos
\n", + "
" + ], + "text/plain": [ + " sequence pep_name irt mods mod_sites nAA charge frag_start_idx \\\n", + "0 LGGNEQVTR RT-pep a -24.92 9 2 0 \n", + "3 YILAGVENSK RT-pep d 19.79 10 2 8 \n", + "4 TPVISGGPYEYR RT-pep e 28.71 12 2 17 \n", + "5 TPVITGAPYEYR RT-pep f 33.38 12 2 28 \n", + "8 GTFIIDPGGVIR RT-pep i 70.52 12 2 39 \n", + "\n", + " frag_stop_idx nce instrument \n", + "0 8 30 Lumos \n", + "3 17 30 Lumos \n", + "4 28 30 Lumos \n", + "5 39 30 Lumos \n", + "8 50 30 Lumos " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_prediction_dataset():\n", + " df=IRT_PEPTIDE_DF.copy()\n", + " df['charge'] = 2\n", + " df['mods'] = ''\n", + " df['mod_sites'] = ''\n", + " # sort by nAA\n", + " df = df.sort_values('nAA')\n", + " idxes = np.zeros(len(df)+1,dtype=np.int64)\n", + " idxes[1:] = np.cumsum(df.nAA.values-1)\n", + " df['frag_start_idx'] = idxes[:-1]\n", + " df['frag_stop_idx'] = idxes[1:]\n", + " df['nce'] = 30\n", + " df['instrument'] = \"Lumos\"\n", + " # sort by \n", + " return df\n", + "get_prediction_dataset().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Legacy weights vs new weights\n", + "\n", + "- Both weights share the same exact underlying weights for the model, the only difference is with the new format we save the charged frag types used during training in the weights file.\n", + "- So both models are trained on frag types: 'b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "legacy_path = \"../legacy_pretrained_models/generic/ms2.pth\"\n", + "new_path = \"../new_pretrained_models/generic/ms2.pth\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ms2 Prediction " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## User importing a legacy 
model " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "a) Using incorrect *len* of frag types when initialization (Should raise mismatch error)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[6], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlegacy_path\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:638\u001b[0m, in \u001b[0;36mModelInterface.load\u001b[1;34m(self, model_file, model_path_in_zip, **kwargs)\u001b[0m\n\u001b[0;32m 636\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_zipfile(model_file, model_path_in_zip)\n\u001b[0;32m 637\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 638\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_pytorchfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 640\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_stream(model_file)\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:727\u001b[0m, in \u001b[0;36mModelInterface._load_model_from_pytorchfile\u001b[1;34m(self, model_file)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_model_from_pytorchfile\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_file):\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(model_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m pt_file:\n\u001b[1;32m--> 727\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpt_file\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:717\u001b[0m, in \u001b[0;36mpDeepModel._load_model_from_stream\u001b[1;34m(self, stream)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moverride_from_weights:\n\u001b[0;32m 715\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types \u001b[38;5;241m=\u001b[39m tensor_to_charged_frags(loaded_charged_frag_types)\n\u001b[1;32m--> 717\u001b[0m (missing_keys, unexpect_keys) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mto_be_loaded_state_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m 719\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_model_state()\n\u001b[0;32m 721\u001b[0m missing_keys \u001b[38;5;241m=\u001b[39m [key \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m missing_keys \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_supported_charged_frag_types\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m key]\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\envs\\peptdeep\\lib\\site-packages\\torch\\nn\\modules\\module.py:2153\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[1;34m(self, state_dict, strict, assign)\u001b[0m\n\u001b[0;32m 2148\u001b[0m error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[0;32m 2149\u001b[0m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2150\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[0;32m 2152\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2153\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[0;32m 2155\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n", + "\u001b[1;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for 
modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2])." + ] + } + ], + "source": [ + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", + "model.load(legacy_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "b) Using the correct *len* of frag types at initialization\n", + "- This is the ideal use case for the legacy weights, where users request exactly the same frag types used when training. \n", + "- It's important to notice that the old implementation won't raise an error if the user requests different frag types, as long as the number of frag types is the same." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Notice replacing the y_z1 and y_z2 with x_z1 and x_z2 and the model is loaded successfully and we get an incorrect prediction\n", + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'x_z1', 'x_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model.load(legacy_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2x_z1x_z2b_modloss_z1b_modloss_z2y_modloss_z1y_modloss_z2
00.0000000.01.0000000.0047390.00.00.00.0
10.1620340.00.3604140.0000000.00.00.00.0
20.0466600.00.1099200.0055160.00.00.00.0
30.0186280.00.2033260.0000000.00.00.00.0
40.0135300.00.2675070.0000000.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 x_z1 x_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 1.000000 0.004739 0.0 0.0 \n", + "1 0.162034 0.0 0.360414 0.000000 0.0 0.0 \n", + "2 0.046660 0.0 0.109920 0.005516 0.0 0.0 \n", + "3 0.018628 0.0 0.203326 0.000000 0.0 0.0 \n", + "4 0.013530 0.0 0.267507 0.000000 0.0 0.0 \n", + "\n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds = model.predict(get_prediction_dataset())\n", + "preds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Ideal use case requested frag types == training frag types\n", + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model.load(legacy_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2y_z1y_z2b_modloss_z1b_modloss_z2y_modloss_z1y_modloss_z2
00.0000000.01.0000000.0047390.00.00.00.0
10.1620340.00.3604140.0000000.00.00.00.0
20.0466600.00.1099200.0055160.00.00.00.0
30.0186280.00.2033260.0000000.00.00.00.0
40.0135300.00.2675070.0000000.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 1.000000 0.004739 0.0 0.0 \n", + "1 0.162034 0.0 0.360414 0.000000 0.0 0.0 \n", + "2 0.046660 0.0 0.109920 0.005516 0.0 0.0 \n", + "3 0.018628 0.0 0.203326 0.000000 0.0 0.0 \n", + "4 0.013530 0.0 0.267507 0.000000 0.0 0.0 \n", + "\n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 " + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legacy_full_preds = model.predict(get_prediction_dataset())\n", + "legacy_full_preds.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "If you don't have the new weights format, uncomment the following line and run the cell \n", + "after loading the legacy model weights with the correct frag types (last 2 cells)\n", + "it should then save the new weights in the new path.\n", + "\"\"\"\n", + "# model.save(new_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## User importing weights in the new format " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Using the correct *len* of frag types at initialization (ideal use case)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model.load(new_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2y_z1y_z2b_modloss_z1b_modloss_z2y_modloss_z1y_modloss_z2
00.0000000.01.0000000.0047390.00.00.00.0
10.1620340.00.3604140.0000000.00.00.00.0
20.0466600.00.1099200.0055160.00.00.00.0
30.0186280.00.2033260.0000000.00.00.00.0
40.0135300.00.2675070.0000000.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 1.000000 0.004739 0.0 0.0 \n", + "1 0.162034 0.0 0.360414 0.000000 0.0 0.0 \n", + "2 0.046660 0.0 0.109920 0.005516 0.0 0.0 \n", + "3 0.018628 0.0 0.203326 0.000000 0.0 0.0 \n", + "4 0.013530 0.0 0.267507 0.000000 0.0 0.0 \n", + "\n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_full_preds = model.predict(get_prediction_dataset())\n", + "# verify the predictions are the same\n", + "assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)\n", + "new_full_preds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using an incorrect *len* of frag types at initialization, but still a subset of what was used during training. \n", + "\n", + "This is the use case where a user requests a subset of the frag types used during training, for example:\n", + "\n", + "1) Excluding the modloss frags, previously done by setting mask_modloss = True" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Interface has charged_frag_types ['b_z1', 'b_z2', 'y_z1', 'y_z2']\n", + "Supported charged_frag_types in the loaded weights ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n" + ] + } + ], + "source": [ + "# Excluding the modloss fragment types\n", + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2'])\n", + "model.load(new_path)\n", + "print(f\"Model Interface has charged_frag_types {model.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2y_z1y_z2
00.0000000.01.0000000.004739
10.1620340.00.3604140.000000
20.0466600.00.1099200.005516
30.0186280.00.2033260.000000
40.0135300.00.2675070.000000
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 y_z1 y_z2\n", + "0 0.000000 0.0 1.000000 0.004739\n", + "1 0.162034 0.0 0.360414 0.000000\n", + "2 0.046660 0.0 0.109920 0.005516\n", + "3 0.018628 0.0 0.203326 0.000000\n", + "4 0.013530 0.0 0.267507 0.000000" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types\n", + "new_subset_preds = model.predict(get_prediction_dataset())\n", + "assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)\n", + "new_subset_preds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2) Excluding frag types that are not modloss (New feature)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "# Excluding the y fragments while keeping the modloss fragments\n", + "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", + "model.load(new_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2b_modloss_z1b_modloss_z2
00.0000000.00.00.0
10.1620340.00.00.0
20.0466600.00.00.0
30.0186280.00.00.0
40.0135300.00.00.0
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 b_modloss_z1 b_modloss_z2\n", + "0 0.000000 0.0 0.0 0.0\n", + "1 0.162034 0.0 0.0 0.0\n", + "2 0.046660 0.0 0.0 0.0\n", + "3 0.018628 0.0 0.0 0.0\n", + "4 0.013530 0.0 0.0 0.0" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types\n", + "new_subset_preds = model.predict(get_prediction_dataset())\n", + "assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)\n", + "new_subset_preds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the new format we have more semantics on what charged frag types are supported, so when a user request frag types that are not supported we can detect and raise an interpretable *error* (New feature)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. 
Please retrain the model or use a pretrained model with the correct charged_frag_types.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[48], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m model\u001b[38;5;241m.\u001b[39mload(new_path)\n\u001b[1;32m----> 4\u001b[0m new_subset_preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_prediction_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:654\u001b[0m, in \u001b[0;36mpDeepModel.predict\u001b[1;34m(self, precursor_df, batch_size, verbose, reference_frag_df, **kwargs)\u001b[0m\n\u001b[0;32m 644\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpredict\u001b[39m(\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 646\u001b[0m precursor_df: pd\u001b[38;5;241m.\u001b[39mDataFrame,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 651\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 652\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_safe_to_predict:\n\u001b[1;32m--> 654\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 655\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe model is not safe to use for prediction. This might mean that the requested charged_frag_types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m are not a subset of the charged_frag_types used to train the loaded pretrained model \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msupported_charged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Please retrain the model or use a pretrained model with the correct charged_frag_types.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 656\u001b[0m )\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mpredict(\n\u001b[0;32m 658\u001b[0m precursor_df,\n\u001b[0;32m 659\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 662\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 663\u001b[0m )\n", + "\u001b[1;31mValueError\u001b[0m: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types." 
+ ] + } + ], + "source": [ + "model = pDeepModel(charged_frag_types=['x_z1', 'x_z2'])\n", + "model.load(new_path)\n", + "\n", + "new_subset_preds = model.predict(get_prediction_dataset())\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even if the user requested correct *len* of frag types when initialization but the requested frag types are not a subset of what was used during training we should raise an *error*." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[49], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, 
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m model\u001b[38;5;241m.\u001b[39mload(new_path)\n\u001b[1;32m----> 4\u001b[0m new_subset_preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_prediction_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:654\u001b[0m, in \u001b[0;36mpDeepModel.predict\u001b[1;34m(self, precursor_df, batch_size, verbose, reference_frag_df, **kwargs)\u001b[0m\n\u001b[0;32m 644\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpredict\u001b[39m(\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 646\u001b[0m precursor_df: pd\u001b[38;5;241m.\u001b[39mDataFrame,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 651\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 652\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_safe_to_predict:\n\u001b[1;32m--> 654\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 655\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe model is not safe to use for prediction. 
This might mean that the requested charged_frag_types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m are not a subset of the charged_frag_types used to train the loaded pretrained model \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msupported_charged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Please retrain the model or use a pretrained model with the correct charged_frag_types.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 656\u001b[0m )\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mpredict(\n\u001b[0;32m 658\u001b[0m precursor_df,\n\u001b[0;32m 659\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 662\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 663\u001b[0m )\n", + "\u001b[1;31mValueError\u001b[0m: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types." 
+ ] + } + ], + "source": [ + "model = pDeepModel(charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model.load(new_path)\n", + "\n", + "new_subset_preds = model.predict(get_prediction_dataset())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User has a weights file and want to predict all fragment types used for training without knowing what exactly was used during training (New feature)\n", + "\n", + "Notice how the requested frag types are overridden in the model interface" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Interface has requested charged_frag_types ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", + "Supported charged_frag_types in the loaded weights ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n" + ] + } + ], + "source": [ + "model = pDeepModel(\n", + " charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'], # Will be overridden by the model weights\n", + " override_from_weights=True\n", + " )\n", + "model.load(new_path)\n", + "\n", + "print(f\"Model Interface has requested charged_frag_types {model.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
b_z1b_z2y_z1y_z2b_modloss_z1b_modloss_z2y_modloss_z1y_modloss_z2
00.0000000.01.0000000.0047390.00.00.00.0
10.1620340.00.3604140.0000000.00.00.00.0
20.0466600.00.1099200.0055160.00.00.00.0
30.0186280.00.2033260.0000000.00.00.00.0
40.0135300.00.2675070.0000000.00.00.00.0
\n", + "
" + ], + "text/plain": [ + " b_z1 b_z2 y_z1 y_z2 b_modloss_z1 b_modloss_z2 \\\n", + "0 0.000000 0.0 1.000000 0.004739 0.0 0.0 \n", + "1 0.162034 0.0 0.360414 0.000000 0.0 0.0 \n", + "2 0.046660 0.0 0.109920 0.005516 0.0 0.0 \n", + "3 0.018628 0.0 0.203326 0.000000 0.0 0.0 \n", + "4 0.013530 0.0 0.267507 0.000000 0.0 0.0 \n", + "\n", + " y_modloss_z1 y_modloss_z2 \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_full_preds = model.predict(get_prediction_dataset())\n", + "# verify the predictions are the same\n", + "assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)\n", + "new_full_preds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ms2 model training" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fragment types in the training data: Index(['a_z1', 'a_z2', 'b_z1', 'b_z2', 'c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1',\n", + " 'y_z2', 'z_z1', 'z_z2', 'b_H2O_z1', 'b_H2O_z2', 'b_NH3_z1', 'b_NH3_z2',\n", + " 'c_lossH_z1', 'c_lossH_z2', 'y_H2O_z1', 'y_H2O_z2', 'y_NH3_z1',\n", + " 'y_NH3_z2', 'z_addH_z1', 'z_addH_z2', 'b_modloss_z1', 'b_modloss_z2',\n", + " 'y_modloss_z1', 'y_modloss_z2'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "trainin_data_path = \"C:/Users/USER/Desktop/Germany/work/MPIB/alphadia/2oh_evidence_txt_0_batch_0.hdf\"\n", + "speclib = SpecLibFlat()\n", + "speclib.load_hdf(trainin_data_path)\n", + "speclib.fragment_intensity_df[\"b_modloss_z1\"] = 0\n", + "speclib.fragment_intensity_df[\"b_modloss_z2\"] = 0\n", + "speclib.fragment_intensity_df[\"y_modloss_z1\"] = 0\n", + "speclib.fragment_intensity_df[\"y_modloss_z2\"] = 0\n", + "frgament_types_in_data = speclib.fragment_intensity_df.columns\n", + "\n", + 
"speclib.precursor_df['nce'] = 30\n", + "speclib.precursor_df['instrument'] = \"Lumos\"\n", + "# sample only 100 samples\n", + "speclib.precursor_df = speclib.precursor_df.sample(100)\n", + "\n", + "# normalize intensity \n", + "normalize_fragment_intensities(speclib.precursor_df, speclib.fragment_intensity_df)\n", + "print(f\"Fragment types in the training data: {frgament_types_in_data}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## User importing a legacy model " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using correct *len* of frag types when initialization \n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-02-12 19:42:53> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, Mean Loss=0.0174776264175307\n" + ] + } + ], + "source": [ + "target_frag_types = ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", + "model = pDeepModel(charged_frag_types=target_frag_types)\n", + "model.load(legacy_path)\n", + "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using incorrect *len* of frag types when initialization (Should raise a mismatch error)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "ename": "RuntimeError", + "evalue": "Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for 
modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m target_frag_types \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 2\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39mtarget_frag_types)\n\u001b[1;32m----> 3\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlegacy_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m model\u001b[38;5;241m.\u001b[39mtrain(precursor_df\u001b[38;5;241m=\u001b[39mspeclib\u001b[38;5;241m.\u001b[39mprecursor_df, fragment_intensity_df\u001b[38;5;241m=\u001b[39mspeclib\u001b[38;5;241m.\u001b[39mfragment_intensity_df\u001b[38;5;241m.\u001b[39mloc[:,target_frag_types], epoch\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:638\u001b[0m, in \u001b[0;36mModelInterface.load\u001b[1;34m(self, model_file, model_path_in_zip, 
**kwargs)\u001b[0m\n\u001b[0;32m 636\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_zipfile(model_file, model_path_in_zip)\n\u001b[0;32m 637\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 638\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_pytorchfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 640\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_stream(model_file)\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:727\u001b[0m, in \u001b[0;36mModelInterface._load_model_from_pytorchfile\u001b[1;34m(self, model_file)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_model_from_pytorchfile\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_file):\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(model_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m pt_file:\n\u001b[1;32m--> 727\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpt_file\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:717\u001b[0m, in \u001b[0;36mpDeepModel._load_model_from_stream\u001b[1;34m(self, stream)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moverride_from_weights:\n\u001b[0;32m 715\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types \u001b[38;5;241m=\u001b[39m 
tensor_to_charged_frags(loaded_charged_frag_types)\n\u001b[1;32m--> 717\u001b[0m (missing_keys, unexpect_keys) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mto_be_loaded_state_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m 719\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_model_state()\n\u001b[0;32m 721\u001b[0m missing_keys \u001b[38;5;241m=\u001b[39m [key \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m missing_keys \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_supported_charged_frag_types\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m key]\n", + "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\envs\\peptdeep\\lib\\site-packages\\torch\\nn\\modules\\module.py:2153\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[1;34m(self, state_dict, strict, assign)\u001b[0m\n\u001b[0;32m 2148\u001b[0m error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[0;32m 2149\u001b[0m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2150\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[0;32m 2152\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2153\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[0;32m 2155\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n", + "\u001b[1;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for 
modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2])." + ] + } + ], + "source": [ + "target_frag_types = ['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2']\n", + "model = pDeepModel(charged_frag_types=target_frag_types)\n", + "model.load(legacy_path)\n", + "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## User importing a new model " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Training on new fragment types that were not part of the training of the original weights (New feature). \n", + "- This is not training from scratch but rather loading the pre-trained backbone and only the prediction heads are initialized from scratch which results in a much faster convergence and reduces the risk of overfitting.\n", + "- Notice how when the requested frag types are not a subset of the supported ones, the model is not safe to use for prediction, but after training the model is now safe to predict. \n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trying to predict when the requested fragment types are not supported by the pretrained model\n", + "Error: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. 
Please retrain the model or use a pretrained model with the correct charged_frag_types.\n", + "Training the model with the requested fragment types\n", + "2025-02-12 19:43:01> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, Mean Loss=0.08573874365538359\n", + "Trying to predict after training with the requested fragment types\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
a_z1a_z2b_H2O_z1b_H2O_z2
00.0000000.01.0000000.000000
10.1564330.00.3520000.000000
20.0425330.00.1016340.001894
30.0152290.00.1883990.000000
40.0099350.00.2505300.000000
\n", + "
" + ], + "text/plain": [ + " a_z1 a_z2 b_H2O_z1 b_H2O_z2\n", + "0 0.000000 0.0 1.000000 0.000000\n", + "1 0.156433 0.0 0.352000 0.000000\n", + "2 0.042533 0.0 0.101634 0.001894\n", + "3 0.015229 0.0 0.188399 0.000000\n", + "4 0.009935 0.0 0.250530 0.000000" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_frag_types = ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] \n", + "model = pDeepModel(charged_frag_types=target_frag_types)\n", + "model.load(new_path)\n", + "print(\"Trying to predict when the requested fragment types are not supported by the pretrained model\")\n", + "try: \n", + " # try to predict with the new model\n", + " model.predict(get_prediction_dataset())\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n", + "print(\"Training the model with the requested fragment types\")\n", + "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)\n", + "\n", + "print(\"Trying to predict after training with the requested fragment types\")\n", + "try: \n", + " # try to predict with the new model\n", + " preds = model.predict(get_prediction_dataset())\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n", + "preds.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After training the the underlying supported fragment types is aligned with teh requested frag types which can then be saved." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Interface has requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2']\n", + "Supported charged_frag_types in the loaded weights ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2']\n" + ] + } + ], + "source": [ + "print(f\"Model Interface has requested charged_frag_types {model.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "peptdeep", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index fa7b1035..39683462 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -12,6 +12,9 @@ update_sliced_fragment_dataframe, get_sliced_fragment_dataframe, get_charged_frag_types, + sort_charged_frag_types, + parse_charged_frag_type, + FRAGMENT_TYPES ) from peptdeep.utils import get_available_device @@ -21,7 +24,7 @@ parse_instrument_indices, get_batch_mod_feature, ) - +from peptdeep.utils import logging from peptdeep.settings import global_settings as settings, model_const import peptdeep.model.model_interface as model_interface @@ -151,9 +154,7 @@ class ModelMS2Bert(torch.nn.Module): def __init__( self, - num_frag_types, - num_modloss_types=0, - mask_modloss=True, + charged_frag_types, dropout=0.1, nlayers=4, hidden=256, @@ -161,14 +162,18 @@ def __init__( **kwargs, ): super().__init__() - + charged_frag_types = sort_charged_frag_types(charged_frag_types) self.dropout = torch.nn.Dropout(dropout) - 
- self._num_modloss_types = num_modloss_types - self._num_non_modloss = num_frag_types - num_modloss_types - self._mask_modloss = mask_modloss - if num_modloss_types == 0: - self._mask_modloss = True + num_frag_types = len(charged_frag_types) + + # register charged fragment types + self.register_buffer( + "_supported_charged_frag_types", + charged_frags_to_tensor(charged_frag_types), + ) + self._get_modloss_frags() + self._num_modloss_types = len(self._modloss_frag_types) + self._num_non_modloss = num_frag_types - self._num_modloss_types meta_dim = 8 self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding( @@ -189,8 +194,8 @@ def __init__( hidden, self._num_non_modloss, ) - - if num_modloss_types > 0: + + if self._num_modloss_types > 0: # for transfer learning of modloss frags self.modloss_nn = torch.nn.ModuleList( [ @@ -202,13 +207,18 @@ def __init__( ), building_block.Decoder_Linear( hidden, - num_modloss_types, + self._num_modloss_types, ), ] ) else: self.modloss_nn = None - + def _get_modloss_frags(self): + self._modloss_frag_types = [] + for i, frag in enumerate(self.supported_charged_frag_types): + frag_type, _ = parse_charged_frag_type(frag) + if FRAGMENT_TYPES[frag_type].modloss: + self._modloss_frag_types.append(i) @property def output_attentions(self): return self._output_attentions @@ -218,7 +228,10 @@ def output_attentions(self, val: bool): self._output_attentions = val self.hidden_nn.output_attentions = val self.modloss_nn[0].output_attentions = val - + @property + def supported_charged_frag_types(self): + return tensor_to_charged_frags(self._supported_charged_frag_types) + def forward( self, aa_indices, @@ -246,25 +259,12 @@ def forward( self.modloss_attentions = None if self._num_modloss_types > 0: - if self._mask_modloss: - out_x = torch.cat( - ( - out_x, - torch.zeros( - *out_x.size()[:2], - self._num_modloss_types, - device=in_x.device, - ), - ), - 2, - ) - else: - modloss_x = self.modloss_nn[0](in_x) - if self.output_attentions: - 
- self.modloss_attentions = modloss_x[-1] - modloss_x = modloss_x[0] + hidden_x - modloss_x = self.modloss_nn[-1](modloss_x) - out_x = torch.cat((out_x, modloss_x), 2) + modloss_x = self.modloss_nn[0](in_x) + if self.output_attentions: + self.modloss_attentions = modloss_x[-1] + modloss_x = modloss_x[0] + hidden_x + modloss_x = self.modloss_nn[-1](modloss_x) + out_x = torch.cat((out_x, modloss_x), 2) return out_x[:, 3:, :] @@ -379,42 +379,59 @@ def forward(self, pred, target): class pDeepModel(model_interface.ModelInterface): """ `ModelInterface` for MS2 prediction models + + Parameters + ---------- + charged_frag_types : List[str] + Charged fragment types to predict + dropout : float, optional + Dropout rate, by default 0.1 + model_class : torch.nn.Module, optional + Ms2 Model class, by default ModelMS2Bert + device : str, optional + Device to run the model, by default "gpu" + override_from_weights : bool, optional, by default False + Override the requested charged frag types when loading model from weights, this will + make the model always in a safe to predict state. + mask_modloss : bool, optional (deprecated) + Mask the modloss fragments, this is deprecated and will be removed in the future. To mask the modloss fragments, + the charged_frag_types should not include the modloss fragments. + """ def __init__( self, charged_frag_types=get_charged_frag_types(frag_types, max_frag_charge), dropout=0.1, - mask_modloss=True, - modloss_type="modloss", model_class: torch.nn.Module = ModelMS2Bert, device: str = "gpu", + mask_modloss: bool|None = None, + override_from_weights: bool = False, **kwargs, # model params ): super().__init__(device=device) + if mask_modloss is not None: + warnings.warn("mask_modloss is deprecated and will be removed in the future. 
To mask the modloss fragments, the charged_frag_types should not include the modloss fragments.") + + self.override_from_weights = override_from_weights self.charged_frag_types = charged_frag_types - self._get_modloss_frags(modloss_type) self.charge_factor = 0.1 self.NCE_factor = 0.01 self.model: ModelMS2Bert = None + self._model_kwargs = kwargs self.build( model_class, - num_frag_types=len(self.charged_frag_types), - num_modloss_types=len(self._modloss_frag_types), - mask_modloss=mask_modloss, + charged_frag_types=self.charged_frag_types, dropout=dropout, **kwargs, # other model params ) self.loss_func = torch.nn.L1Loss() self.min_inten = 1e-4 + self._safe_to_predict = True + self._safe_to_train = True - def _get_modloss_frags(self, modloss="modloss"): - self._modloss_frag_types = [] - for i, frag in enumerate(self.charged_frag_types): - if modloss in frag: - self._modloss_frag_types.append(i) def _prepare_train_data_df( self, @@ -493,6 +510,10 @@ def _set_batch_predict_data( apex_intens[apex_intens <= 0] = 1 predicts /= apex_intens.reshape((-1, 1, 1)) predicts[predicts < self.min_inten] = 0.0 + # mask out predicted charged frag types that are not in the requested charged_frag_types + columns_mask = np.isin(self.model.supported_charged_frag_types, self.charged_frag_types) + predicts = predicts[:, :, columns_mask] + if self._predict_in_order: self.predict_df.values[ batch_df.frag_start_idx.values[0] : batch_df.frag_stop_idx.values[-1], : @@ -504,6 +525,41 @@ def _set_batch_predict_data( predicts.reshape((-1, len(self.charged_frag_types))), batch_df[["frag_start_idx", "frag_stop_idx"]].values, ) + def _align_model_charged_frag_types(self): + """ + Align the underlying model charged_frag_types with the interface charged_frag_types, + this function is necessary for the model to be safe to train. 
+ Important: This function when called will most probably reset the last layer of the model + and randomly initialize it, so it might not be safe to use the model for prediction since + the last layer will be randomly initialized. + """ + loaded_model_state_dict = self.model.state_dict() + self.build( + self.model.__class__, + dropout = self.model.dropout.p, + charged_frag_types=self.charged_frag_types, + **self._model_kwargs, + ) + current_model_dict = self.model.state_dict() + # use the layers/modules that are shared from the original model instead of starting from scratch + filtered_params = {} + size_mismatches = [] + unexpected_keys = [] + for source_key, source_value in loaded_model_state_dict.items(): + if source_key in current_model_dict: + if source_value.size() == current_model_dict[source_key].size(): + filtered_params[source_key] = source_value + else: + size_mismatches.append(source_key) + else: + unexpected_keys.append(source_key) + self.model.load_state_dict(filtered_params, strict=False) + + if len(size_mismatches) > 0 or len(unexpected_keys) > 0: + self._safe_to_predict = False + + self._safe_to_train = True + def train_with_warmup( self, @@ -518,7 +574,9 @@ def train_with_warmup( verbose_each_epoch=False, **kwargs, ): - return super().train_with_warmup( + if not self._safe_to_train: + self._align_model_charged_frag_types() + super().train_with_warmup( precursor_df, fragment_intensity_df=fragment_intensity_df, batch_size=batch_size, @@ -529,6 +587,7 @@ def train_with_warmup( verbose_each_epoch=verbose_each_epoch, **kwargs, ) + self._safe_to_predict = True def test( self, @@ -566,7 +625,9 @@ def train( verbose_each_epoch=False, **kwargs, ): - return super().train( + if not self._safe_to_train: + self._align_model_charged_frag_types() + super().train( precursor_df, fragment_intensity_df=fragment_intensity_df, batch_size=batch_size, @@ -577,6 +638,8 @@ def train( verbose_each_epoch=verbose_each_epoch, **kwargs, ) + self._safe_to_predict = True + def 
predict( self, @@ -587,6 +650,10 @@ def predict( reference_frag_df=None, **kwargs, ) -> pd.DataFrame: + if not self._safe_to_predict: + raise ValueError( + f"The model is not safe to use for prediction. This might mean that the requested charged_frag_types {self.charged_frag_types} are not a subset of the charged_frag_types used to train the loaded pretrained model {self.model.supported_charged_frag_types}. Please retrain the model or use a pretrained model with the correct charged_frag_types." + ) return super().predict( precursor_df, batch_size=batch_size, @@ -632,7 +699,49 @@ def bootstrap_nce_search( ) nce_list.append(nce) return np.median(nce_list), instrument + def _load_model_from_stream(self, stream): + to_be_loaded_state_dict = torch.load(stream, map_location=self.device) + if "_supported_charged_frag_types" in to_be_loaded_state_dict: + loaded_charged_frag_types = to_be_loaded_state_dict["_supported_charged_frag_types"] + # build a model that has the same charged_frag_types as the loaded model + self.build( + self.model.__class__, + dropout = self.model.dropout.p, + charged_frag_types=tensor_to_charged_frags(loaded_charged_frag_types), + **self._model_kwargs, + ) + + if self.override_from_weights: + self.charged_frag_types = tensor_to_charged_frags(loaded_charged_frag_types) + (missing_keys, unexpect_keys) = self.model.load_state_dict( + to_be_loaded_state_dict, strict=False + ) + self._update_model_state() + missing_keys = [key for key in missing_keys if "_supported_charged_frag_types" not in key] + if len(missing_keys) > 0: + logging.warn( + f"nn parameters {missing_keys} are MISSING while loading models in {self.__class__}" + ) + if len(unexpect_keys) > 0: + logging.warn( + f"nn parameters {unexpect_keys} are UNEXPECTED while loading models in {self.__class__}" + ) + + def _update_model_state(self): + """ + Update the model state "safe_to_predict" and "safe_to_train". 
+ Depending on the interface (pDeepModel) charged frag types and the underlying model charged frag types, + the model state will be updated. + - safe_to_predict: True if the interface charged frag types are a subset of the underlying model charged frag types + - safe_to_train: True if the interface charged frag types are the same as the underlying model charged frag types + """ + self._safe_to_predict = set(self.charged_frag_types).issubset( + set(self.model.supported_charged_frag_types) + ) + self._safe_to_train = set(self.charged_frag_types) == set( + self.model.supported_charged_frag_types + ) def grid_nce_search( self, psm_df: pd.DataFrame, @@ -884,3 +993,31 @@ def calc_ms2_similarity( torch.cuda.empty_cache() return psm_df, metrics_describ + + +def charged_frags_to_tensor(charged_frags: List[str]) -> torch.Tensor: + """ + Convert a list of strings (charged fragment types, modloss fragment types) to a tensor + + Parameters + ---------- + charged_frags : List[str] + List of strings + """ + seperator = "," + string = seperator.join(charged_frags) + return torch.tensor( + [ord(char) for char in string], dtype=torch.int32 + ).unsqueeze(0) + +def tensor_to_charged_frags(tensor: torch.Tensor) -> List[str]: + """ + Convert a tensor to a list of strings (charged fragment types, modloss fragment types) + + Parameters + ---------- + tensor : torch.Tensor + Tensor of int32 + """ + string = "".join([chr(char) for char in tensor[0].tolist()]) + return string.split(",") \ No newline at end of file From 87c0b9bbb5f07f1f1c789e382c0a35904c9e5372 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 00:33:29 +0100 Subject: [PATCH 2/8] test: add unit tests --- tests/unit/test_ms2_adaptive_frag_types.py | 246 +++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 tests/unit/test_ms2_adaptive_frag_types.py diff --git a/tests/unit/test_ms2_adaptive_frag_types.py b/tests/unit/test_ms2_adaptive_frag_types.py new file mode 100644 index 00000000..21bd9762 --- 
/dev/null +++ b/tests/unit/test_ms2_adaptive_frag_types.py @@ -0,0 +1,246 @@ +import pytest +from peptdeep.model.ms2 import pDeepModel +import os +import tempfile +from typing import List, Optional +from peptdeep.model.rt import IRT_PEPTIDE_DF +import numpy as np + + +def get_legacy_model(charged_frag_types: Optional[List[str]] = None): + if charged_frag_types is None: + return pDeepModel() + else: + return pDeepModel(charged_frag_types=charged_frag_types) + + +def transform_weights_to_new_format(): + # TODO This is a temporary solution to transform the old weights to the new format until the new weights are uploaded + # load the legacy model with the correct charged fragment types and resave them to the new format + model = get_legacy_model( + charged_frag_types=[ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + ) + temp_dir = os.path.join(tempfile.gettempdir(), "peptDeep_models") + os.makedirs(temp_dir, exist_ok=True) + weights_dist = os.path.join(temp_dir, "new_weights.pth") + model.save(weights_dist) + + return weights_dist + + +def get_prediction_dataset(): + df = IRT_PEPTIDE_DF.copy() + df["charge"] = 2 + df["mods"] = "" + df["mod_sites"] = "" + # sort by nAA + df = df.sort_values("nAA") + idxes = np.zeros(len(df) + 1, dtype=np.int64) + idxes[1:] = np.cumsum(df.nAA.values - 1) + df["frag_start_idx"] = idxes[:-1] + df["frag_stop_idx"] = idxes[1:] + df["nce"] = 30 + df["instrument"] = "Lumos" + # sort by + return df + + +def test_legacy_weights_with_correct_frag_types(): + # Given a user requests exactly the same charged frag types used when training the model + charged_frag_types = [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + + # When the user loads the model from legacy weights + model = get_legacy_model(charged_frag_types=charged_frag_types) + + # Then the model should be safe to predict and train + assert 
model._safe_to_predict, ( + "Model was not safe to predict when loading legacy weights with correct charged frag types" + ) + assert model._safe_to_train, ( + "Model was not safe to train when loading legacy weights with correct charged frag types" + ) + + +def test_legacy_weights_complete_prediction(): + # Given a user requests exactly the same charged frag types used when training the model + charged_frag_types = [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + + # When the user loads the model from legacy weights and uses it for prediction + model = get_legacy_model(charged_frag_types=charged_frag_types) + df = get_prediction_dataset() + pred_df = model.predict(df) + + # Then the prediction should have all requested charged frag types + assert set(pred_df.columns) == set(charged_frag_types), ( + "Prediction did not have all requested charged frag types" + ) + # Non nan values should be present + assert not pred_df.isna().all().all(), "All values in the prediction were nan" + + +def test_new_weights_complete_prediction(): + # Given a user requests exactly the same charged frag types used when training the model + requested_charged_frag_types = [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + + # When the user loads the model from new weights and uses it for prediction + model = pDeepModel(charged_frag_types=requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + + df = get_prediction_dataset() + + pred_df = model.predict(df) + + # Then the prediction should have all requested charged frag types + assert set(pred_df.columns) == set(requested_charged_frag_types), ( + "Prediction did not have all requested charged frag types" + ) + # Non nan values should be present + assert not pred_df.isna().all().all(), "All values in the prediction were nan" + + +def test_new_state_subset_prediction(): + # Given a 
user requests a subset of the charged frag types used when training the model + requested_charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2"] + + # When the user loads the model from new weights + model = pDeepModel(charged_frag_types=requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + + # The model should be safe to predict but not to train + assert model._safe_to_predict, ( + "Model was not safe to predict when loading new weights with subset of charged frag types" + ) + # since theres a discrepancy between the requested and the trained charged frag types + assert not model._safe_to_train, ( + "Model was safe to train when loading new weights with subset of charged frag types" + ) + + +def test_new_state_unsupported_frag_types(): + # Given a user requests a charged frag types that are not supported by the loaded model + requested_charged_frag_types = [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + "b_modloss_z3", + ] + + # When the user loads the model from new weights + model = pDeepModel(charged_frag_types=requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + + # The model should not be safe to predict or train + assert not model._safe_to_predict, ( + "Model was safe to predict when loading new weights with unsupported charged frag types" + ) + assert not model._safe_to_train, ( + "Model was safe to train when loading new weights with unsupported charged frag types" + ) + + +def test_prediction_unsupported_frag_types(): + # Given a user requests a charged frag types that are not supported by the loaded model + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] + + # When the user loads the model from new weights and uses it for prediction + model = pDeepModel(requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + + df = get_prediction_dataset() + # Then the model should raise an error + with 
pytest.raises(ValueError): + model.predict(df) + + +def test_override_requested_frag_types(): + # Given a user requests any charged frag types + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] + + # When the user loads the model from new weights and uses it for prediction while using the supported charged frag types to override the requested ones + model = pDeepModel(requested_charged_frag_types, override_from_weights=True) + model.load(transform_weights_to_new_format()) + + # Then the requested charged frag types should be overridden by the supported ones and the model should be safe to predict + assert set(model.charged_frag_types) == set( + [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + ), "Overridden charged frag types were not as expected" + assert model._safe_to_predict, ( + "Model was not safe to predict when overriding requested charged frag types" + ) + + +def test_override_requested_frag_types_prediction(): + # Given a user requests any charged frag types + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] + + # When the user loads the model from new weights and uses it for prediction while using the supported charged frag types to override the requested ones + model = pDeepModel(requested_charged_frag_types, override_from_weights=True) + model.load(transform_weights_to_new_format()) + df = get_prediction_dataset() + pred_df = model.predict(df) + + # Then the prediction should have all charged frag types supported by the model + assert set(pred_df.columns) == set( + [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + ), "Prediction did not have all supported charged frag types" + # Non nan values should be present + assert not pred_df.isna().all().all(), "All values in the prediction were nan" From f26179c5d7b76404c83804381a53f9ba1429104b Mon Sep 17 00:00:00 2001 From: Mohamed Sameh 
Date: Thu, 27 Feb 2025 11:39:57 +0100 Subject: [PATCH 3/8] Fix: legacy mask loss --- peptdeep/model/ms2.py | 11 +++++--- tests/unit/test_ms2_adaptive_frag_types.py | 30 +++++++++++++++++++--- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index 39683462..b371cd7f 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -3,7 +3,7 @@ import numpy as np import warnings -from typing import List, Tuple, IO +from typing import List, Tuple, IO, Optional from tqdm import tqdm @@ -405,7 +405,7 @@ def __init__( dropout=0.1, model_class: torch.nn.Module = ModelMS2Bert, device: str = "gpu", - mask_modloss: bool|None = None, + mask_modloss: Optional[bool] = None, override_from_weights: bool = False, **kwargs, # model params ): @@ -426,7 +426,12 @@ def __init__( dropout=dropout, **kwargs, # other model params ) - + if mask_modloss: # To Be removed in the future + # remove modloss fragments from charged_frag_types + self.charged_frag_types = [ + frag for frag in self.charged_frag_types if "modloss" not in frag + ] + self.loss_func = torch.nn.L1Loss() self.min_inten = 1e-4 self._safe_to_predict = True diff --git a/tests/unit/test_ms2_adaptive_frag_types.py b/tests/unit/test_ms2_adaptive_frag_types.py index 21bd9762..f4e508d2 100644 --- a/tests/unit/test_ms2_adaptive_frag_types.py +++ b/tests/unit/test_ms2_adaptive_frag_types.py @@ -7,11 +7,11 @@ import numpy as np -def get_legacy_model(charged_frag_types: Optional[List[str]] = None): +def get_legacy_model(charged_frag_types: Optional[List[str]] = None, mask_modloss: Optional[bool] = None): if charged_frag_types is None: - return pDeepModel() + return pDeepModel(mask_modloss=mask_modloss) else: - return pDeepModel(charged_frag_types=charged_frag_types) + return pDeepModel(charged_frag_types=charged_frag_types, mask_modloss=mask_modloss) def transform_weights_to_new_format(): @@ -104,6 +104,29 @@ def test_legacy_weights_complete_prediction(): # Non nan values 
should be present assert not pred_df.isna().all().all(), "All values in the prediction were nan" +def test_legacy_weights_mask_modloss(): + # Given a user requests exactly the same charged frag types used when training the model + charged_frag_types = [ + "b_z1", + "b_z2", + "y_z1", + "y_z2", + "b_modloss_z1", + "b_modloss_z2", + "y_modloss_z1", + "y_modloss_z2", + ] + + # When the user loads the model from legacy weights and uses it for prediction with mask_modloss + model = get_legacy_model(charged_frag_types=charged_frag_types , mask_modloss=True) + df = get_prediction_dataset() + pred_df = model.predict(df) + + # Then the prediction should have all requested charged frag types + assert set(pred_df.columns) == set(["b_z1", "b_z2", "y_z1", "y_z2"]) + # Non nan values should be present + assert not pred_df.isna().all().all(), "All values in the prediction were nan" + def test_new_weights_complete_prediction(): # Given a user requests exactly the same charged frag types used when training the model @@ -244,3 +267,4 @@ def test_override_requested_frag_types_prediction(): ), "Prediction did not have all supported charged frag types" # Non nan values should be present assert not pred_df.isna().all().all(), "All values in the prediction were nan" + From 38f2a79304daea0666a1969c9290b1fdc928afc8 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 11:56:29 +0100 Subject: [PATCH 4/8] test: training test cases --- tests/unit/test_ms2_adaptive_frag_types.py | 42 +++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/tests/unit/test_ms2_adaptive_frag_types.py b/tests/unit/test_ms2_adaptive_frag_types.py index f4e508d2..d4ccf54a 100644 --- a/tests/unit/test_ms2_adaptive_frag_types.py +++ b/tests/unit/test_ms2_adaptive_frag_types.py @@ -5,7 +5,7 @@ from typing import List, Optional from peptdeep.model.rt import IRT_PEPTIDE_DF import numpy as np - +import pandas as pd def get_legacy_model(charged_frag_types: Optional[List[str]] = 
None, mask_modloss: Optional[bool] = None): if charged_frag_types is None: @@ -177,17 +177,7 @@ def test_new_state_subset_prediction(): def test_new_state_unsupported_frag_types(): # Given a user requests a charged frag types that are not supported by the loaded model - requested_charged_frag_types = [ - "b_z1", - "b_z2", - "y_z1", - "y_z2", - "b_modloss_z1", - "b_modloss_z2", - "y_modloss_z1", - "y_modloss_z2", - "b_modloss_z3", - ] + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] # x_z1 and x_z2 are not supported with the loaded weights # When the user loads the model from new weights model = pDeepModel(charged_frag_types=requested_charged_frag_types) @@ -268,3 +258,31 @@ def test_override_requested_frag_types_prediction(): # Non nan values should be present assert not pred_df.isna().all().all(), "All values in the prediction were nan" +def test_model_alignment_when_training(): + # Given user requests unsupported charged frag types + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] + precursor_df = get_prediction_dataset() + fragment_df = np.random.rand(precursor_df.iloc[-1]["frag_stop_idx"], len(requested_charged_frag_types)) + fragment_df = pd.DataFrame(fragment_df, columns=requested_charged_frag_types) + # When the user loads the model from new weights and uses it for training + model = pDeepModel(requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + model.train(precursor_df, fragment_df) + # Then the model should align the fragment_df with the supported charged frag types + assert set(model.charged_frag_types) == set(model.model.supported_charged_frag_types), "Model interface and underlying model are not aligned" + +def test_prediction_after_alignment(): + # Given user requests unsupported charged frag types + requested_charged_frag_types = ["b_z1", "b_z2", "x_z1", "x_z2"] + precursor_df = get_prediction_dataset() + fragment_df = np.random.rand(precursor_df.iloc[-1]["frag_stop_idx"], 
len(requested_charged_frag_types)) + fragment_df = pd.DataFrame(fragment_df, columns=requested_charged_frag_types) + # When the user loads the model from new weights and uses it for training + model = pDeepModel(requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + model.train(precursor_df, fragment_df) + pred_df = model.predict(precursor_df) + # Then the model should be now safe to predict + assert model._safe_to_predict, "Model was not safe to predict after alignment" + # And the prediction should have only the supported charged frag types + assert set(pred_df.columns) == set(model.model.supported_charged_frag_types), "Prediction did not have all supported charged frag types" From 036ed2ac906a94db6e3a06238ea8cf32eb40570b Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 17:05:11 +0100 Subject: [PATCH 5/8] minor changes --- peptdeep/model/ms2.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index b371cd7f..88691281 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -391,8 +391,7 @@ class pDeepModel(model_interface.ModelInterface): device : str, optional Device to run the model, by default "gpu" override_from_weights : bool, optional default False - Over ride the requested charged frag types when loading model from weights, this will - make the model always in a safe to predict state. + Override the requested charged frag types from the model weights on loading. This allows to predict all fragment types supported by the weights even if the user doesn't know what fragments types are supported by the weights. Thereby, the model will always be in a safe to predict state. mask_modloss : bool, optional (deprecated) Mask the modloss fragments, this is deprecated and will be removed in the future. To mask the modloss fragments, the charged_frag_types should not include the modloss fragments. 
@@ -437,7 +436,6 @@ def __init__( self._safe_to_predict = True self._safe_to_train = True - def _prepare_train_data_df( self, precursor_df: pd.DataFrame, @@ -530,13 +528,12 @@ def _set_batch_predict_data( predicts.reshape((-1, len(self.charged_frag_types))), batch_df[["frag_start_idx", "frag_stop_idx"]].values, ) - def _align_model_charged_frag_types(self): + def _adapt_model_prediction_head(self): """ Align the underlying model charged_frag_types with the interface charged_frag_types, this function is necessary for the model to be safe to train. - Important: This function when called will most probably reset the last layer of the model - and randomly initialize is it so it might not be safe to use the model for prediction since - the last layer will be randomly initialized. + Important: This function when called will reshape the prediction head of the model to match the requested charged_frag_types + and randomly initialize is it so it might not be safe to use the model for prediction before training. """ loaded_model_state_dict = self.model.state_dict() self.build( @@ -657,7 +654,7 @@ def predict( ) -> pd.DataFrame: if not self._safe_to_predict: raise ValueError( - f"The model is not safe to use for prediction. This might mean that the requested charged_frag_types {self.charged_frag_types} are not a subset of the charged_frag_types used to train the loaded pretrained model {self.model.supported_charged_frag_types}. Please retrain the model or use a pretrained model with the correct charged_frag_types." + f"The model is not safe to use for prediction. This might mean that the requested charged_frag_types {self.charged_frag_types} are not a subset of the charged_frag_types used to train the loaded pretrained model {self.model.supported_charged_frag_types}. Please choose a subset of the supported charged_frag_types or retrain the model with the requested charged_frag_types." 
) return super().predict( precursor_df, From a9bc8bd3d7acd85863dbc78d1896e89a8d962658 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 18:04:17 +0100 Subject: [PATCH 6/8] fix model alignment --- peptdeep/model/ms2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index 88691281..4eedba05 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -577,7 +577,7 @@ def train_with_warmup( **kwargs, ): if not self._safe_to_train: - self._align_model_charged_frag_types() + self._adapt_model_prediction_head() super().train_with_warmup( precursor_df, fragment_intensity_df=fragment_intensity_df, @@ -628,7 +628,7 @@ def train( **kwargs, ): if not self._safe_to_train: - self._align_model_charged_frag_types() + self._adapt_model_prediction_head() super().train( precursor_df, fragment_intensity_df=fragment_intensity_df, From cdc4575f1d1f0387b8f29da713e5863ba999139a Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 18:35:16 +0100 Subject: [PATCH 7/8] update notebook --- nbs_trials/adapt_charged_fragtypes.ipynb | 552 +++++++++++++++-------- peptdeep/model/ms2.py | 2 +- 2 files changed, 367 insertions(+), 187 deletions(-) diff --git a/nbs_trials/adapt_charged_fragtypes.ipynb b/nbs_trials/adapt_charged_fragtypes.ipynb index 083d46a5..02068078 100644 --- a/nbs_trials/adapt_charged_fragtypes.ipynb +++ b/nbs_trials/adapt_charged_fragtypes.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -24,9 +24,61 @@ "%autoreload 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Understanding the New Pretrained Weights Format \n", + "\n", + "This notebook explores various use cases and the reasons behind adopting the new pretrained weights format. \n", + "\n", + "## Summary \n", + "\n", + "The new format introduces two key improvements: \n", + "\n", + "1. 
**Fixes incorrect behavior** in certain valid use cases when using the legacy weight format. \n", + "2. **Enhances flexibility, adaptability, and transparency** when using the pretrained MS2 models in **PeptDeep**. \n", + "\n", + "## Key Considerations \n", + "\n", + "- The new weights format **was not obtained from retraining** of the models. \n", + "- So it **does not provide performance improvements** over the legacy weights but significantly expands the range of supported use cases. \n", + "\n", + "## Use Cases \n", + "\n", + "We categorize use cases based on two main tasks: \n", + "\n", + "1. **Prediction** \n", + "2. **Transfer Learning** \n", + "\n", + "For each task, we will: \n", + "- Show that legacy weights are **still** supported by the current implementation (Without the new use cases). \n", + "- Highlight the **limitations** of the legacy weights format. \n", + "- Present **new use cases** that are now supported with the updated format. \n", + "\n", + "\n", + "Supported use cases with the new format:\n", + "| Fragtypes use case | Override from weights (*) | Safe to predict | Safe to train |\n", + "|---------------------------------|--------------------------|----------------|--------------|\n", + "| requested = supported (1) | False | ✅ | ✅ |\n", + "| requested ⊆ supported (2) | False | ✅ | ❌ |\n", + "| requested ⊈ supported (3) | False | ❌ | ❌ |\n", + "| Any | True | ✅ | ✅ |\n", + "\n", + "(1) The ideal use case (only one supported by the old implementation) where users know and request exactly the same frag types supported in the model weights.\n", + "\n", + "(2) Users only need to predict a subset of the frag types supported by the loaded weights.
\n", + "\n", + "(3) Users requested charged frag types that are not supported (can easily be identified now since we don't only look at the number of requested frag types).\n", + "\n", + "(*) `Override from weights` is the new argument added to the MS2 model, this allow users to load models without knowing exactly what are the supported frag types in a pretrained model. So this overrides the requested frag types and uses all supported frag types by the loaded model.\n", + "\n", + "Safe to train: Any model is automatically set to 'safe to train' when the `train` function is called-- by modifying *only* the underlying model output head to align with the requested frag types. After training, the model will automatically be in a 'safe to predict' state. \n" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -154,7 +206,7 @@ "8 50 30 Lumos " ] }, - "execution_count": 4, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -182,10 +234,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Legacy weights vs new weights\n", + "#### What is actually **new** about the \"new\" weights ? \n", "\n", - "- Both weights share the same exact underlying weights for the model, the only difference is with the new format we save the charged frag types used during training in the weights file.\n", - "- So both models are trained on frag types: 'b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'\n" + "- Both formats share the same exact underlying weights for the model, the only difference is with the new format we save the charged frag types used during training in the weights file. 
Which will allow supporting the new use cases.\n", + "- Both models support the following fragment types:\n", + "'b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'\n" ] }, { @@ -202,72 +255,72 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Ms2 Prediction " + "# 1. Ms2 Prediction " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## User importing a legacy model " + "## 1.1 Starting with the Legacy weights format\n", + "To show what the new format solves, let's consider three different use cases of loading a pretrained model." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "a) Using incorrect *len* of frag types when initialization (Should raise mismatch error)" + "a) Using incorrect *len* of frag types when initialization (Will raise mismatch error)\n", + "- Which means users can not predict a subset of the fragment types supported by the loaded pretrained model" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [ { - "ename": "RuntimeError", - "evalue": "Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call
last)", - "Cell \u001b[1;32mIn[6], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlegacy_path\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:638\u001b[0m, in \u001b[0;36mModelInterface.load\u001b[1;34m(self, model_file, model_path_in_zip, **kwargs)\u001b[0m\n\u001b[0;32m 636\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_zipfile(model_file, model_path_in_zip)\n\u001b[0;32m 637\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 638\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_pytorchfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 640\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_stream(model_file)\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:727\u001b[0m, in \u001b[0;36mModelInterface._load_model_from_pytorchfile\u001b[1;34m(self, model_file)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_model_from_pytorchfile\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_file):\n\u001b[0;32m 726\u001b[0m 
\u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(model_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m pt_file:\n\u001b[1;32m--> 727\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpt_file\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:717\u001b[0m, in \u001b[0;36mpDeepModel._load_model_from_stream\u001b[1;34m(self, stream)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moverride_from_weights:\n\u001b[0;32m 715\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types \u001b[38;5;241m=\u001b[39m tensor_to_charged_frags(loaded_charged_frag_types)\n\u001b[1;32m--> 717\u001b[0m (missing_keys, unexpect_keys) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mto_be_loaded_state_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m 719\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_model_state()\n\u001b[0;32m 721\u001b[0m missing_keys \u001b[38;5;241m=\u001b[39m [key \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m missing_keys \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_supported_charged_frag_types\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;129;01min\u001b[39;00m key]\n", - "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\envs\\peptdeep\\lib\\site-packages\\torch\\nn\\modules\\module.py:2153\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[1;34m(self, state_dict, strict, assign)\u001b[0m\n\u001b[0;32m 2148\u001b[0m error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[0;32m 2149\u001b[0m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2150\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[0;32m 2152\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2153\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[0;32m 
2155\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n", - "\u001b[1;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2])." + "name": "stdout", + "output_type": "stream", + "text": [ + "Error(s) in loading state_dict for ModelMS2Bert:\n", + "\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n", + "\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n", + "\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n", + "\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n" ] } ], "source": [ - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", - "model.load(legacy_path)" + "try: \n", + " model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", + " model_interface.load(legacy_path)\n", + "except Exception as e:\n", + " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "b) Using the 
correct *len* of frag types when initialization\n", - "- This is the ideal use case for the legacy weights were users request exactly the same frag types used when training. \n", - "- It's important to notice that the old implementation won't raise an error if the user requested different frag types as long as the number of frag types are the same." + "b) Using the correct *len* of frag types during initialization ***EVEN*** if the requested fragment types are completely different. (**Worst** use case) \n", + "- It's important to notice that the old format **won't** raise an error and the prediction goes through (even with misleading column names)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# Notice replacing the y_z1 and y_z2 with x_z1 and x_z2 and the model is loaded successfully and we get an incorrect prediction\n", - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'x_z1', 'x_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", - "model.load(legacy_path)" + "model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'x_z1', 'x_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model_interface.load(legacy_path)" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -377,30 +430,37 @@ "4 0.0 0.0 " ] }, - "execution_count": 8, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds = model.predict(get_prediction_dataset())\n", + "preds = model_interface.predict(get_prediction_dataset())\n", "preds.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "c) Using exactly the same charged frags (and same order) as used during training. 
(**Ideal** use case and the only one supported by the legacy format)" + ] + }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "# Ideal use case requested frag types == training frag types\n", - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", - "model.load(legacy_path)\n" + "model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model_interface.load(legacy_path)\n" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -510,57 +570,68 @@ "4 0.0 0.0 " ] }, - "execution_count": 40, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "legacy_full_preds = model.predict(get_prediction_dataset())\n", + "legacy_full_preds = model_interface.predict(get_prediction_dataset())\n", "legacy_full_preds.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\nIf you don't have the new weights format uncomment the following line and run the cell \\nafter loading the legacy model weights with the correct frag types (last 2 cells)\\nit should save the new weights in the new_path.\\n\"" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\"\"\"\n", - "If you don't have the new weighst format uncomment the following line and run the cell \n", + "If you don't have the new weights format uncomment the following line and run the cell \n", "after loading the legacy model weights with the correct frag types (last 2 cells)\n", - "it should the save the new weights in the new path.\n", + "it should save the new weights in the new_path.\n", "\"\"\"\n", - "# 
model.save(new_path)" + "# model_interface.save(new_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## User importing weighst in the new format " + "## 1.2 Benefits of the new parameter format shown with different use-cases." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - " Using the correct *len* of frag types when initialization (ideal use case)" + "a) Using exactly the same charged frags (and same order) as used during training. (**Ideal** use case)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", - "model.load(new_path)" + "model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model_interface.load(new_path)" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -670,13 +741,13 @@ "4 0.0 0.0 " ] }, - "execution_count": 43, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_full_preds = model.predict(get_prediction_dataset())\n", + "new_full_preds = model_interface.predict(get_prediction_dataset())\n", "# verify the predictions are the same\n", "assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)\n", "new_full_preds.head()" @@ -686,16 +757,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using incorrect *len* of frag types when initialization but still a subset of what was used during training. \n", - "\n", - "This is use case where a user request a subset of the frag types used during training for example:\n", + "b) Using incorrect *len* of frag types during initialization but still a valid subset of what was used during training. 
\n", "\n", - "1) Excluding the modloss frags, preivously done by setting mask_modloss = True" + "Examples:\n", + "- 1) Excluding the modloss frags, previously done by setting mask_modloss = True" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -709,15 +779,15 @@ ], "source": [ "# Excluding the modloss fragment types\n", - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2'])\n", - "model.load(new_path)\n", - "print(f\"Model Interface has charged_frag_types {model.charged_frag_types}\")\n", - "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + "model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'y_z1', 'y_z2'])\n", + "model_interface.load(new_path)\n", + "print(f\"Model Interface has charged_frag_types {model_interface.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model_interface.model.supported_charged_frag_types}\")" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -796,14 +866,14 @@ "4 0.013530 0.0 0.267507 0.000000" ] }, - "execution_count": 45, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types\n", - "new_subset_preds = model.predict(get_prediction_dataset())\n", + "new_subset_preds = model_interface.predict(get_prediction_dataset())\n", "assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)\n", "new_subset_preds.head()" ] @@ -812,23 +882,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "2) Excluding frag types that are not modloss (New feature)" + "- 2) Excluding frag types that are not modloss (*New* use case)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ - "# Excluding the the y fragments while keeping the modloss fragments\n", - "model = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", - "model.load(new_path)" + "# Excluding the y fragments while keeping the modloss fragments\n", + "model_interface = pDeepModel(charged_frag_types=['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2'])\n", + "model_interface.load(new_path)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -907,14 +977,14 @@ "4 0.013530 0.0 0.0 0.0" ] }, - "execution_count": 47, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# verify that the predictions are the same with the legacy model for the selected subset of charged_frag_types\n", - "new_subset_preds = model.predict(get_prediction_dataset())\n", + "new_subset_preds = model_interface.predict(get_prediction_dataset())\n", "assert np.allclose(legacy_full_preds[new_subset_preds.columns], new_subset_preds)\n", "new_subset_preds.head()" ] @@ -923,79 +993,76 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Using the new format we have more semantics on what charged frag types are supported, so when a user request frag types that are not supported we can detect and raise an interpretable *error* (New feature)" + "c) Requesting fragment types that are not supported by the loaded pretrained model. \n", + "\n", + "Using the new format; the supported charged frag types are explicit and transparent, so when a user request frag types that are not supported we can detect and raise an interpretable *error* (**New** feature)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 57, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "The model is not safe to use for prediction. 
This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[48], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m model\u001b[38;5;241m.\u001b[39mload(new_path)\n\u001b[1;32m----> 4\u001b[0m new_subset_preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_prediction_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:654\u001b[0m, in \u001b[0;36mpDeepModel.predict\u001b[1;34m(self, precursor_df, batch_size, verbose, reference_frag_df, **kwargs)\u001b[0m\n\u001b[0;32m 644\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpredict\u001b[39m(\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 646\u001b[0m precursor_df: pd\u001b[38;5;241m.\u001b[39mDataFrame,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 651\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 652\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m 
pd\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_safe_to_predict:\n\u001b[1;32m--> 654\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 655\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe model is not safe to use for prediction. This might mean that the requested charged_frag_types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m are not a subset of the charged_frag_types used to train the loaded pretrained model \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msupported_charged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Please retrain the model or use a pretrained model with the correct charged_frag_types.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 656\u001b[0m )\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mpredict(\n\u001b[0;32m 658\u001b[0m precursor_df,\n\u001b[0;32m 659\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 662\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 663\u001b[0m )\n", - "\u001b[1;31mValueError\u001b[0m: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types." 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['x_z1', 'x_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please choose a subset of the supported charged_frag_types or retrain the model with the requested charged_frag_types.\n" ] } ], "source": [ - "model = pDeepModel(charged_frag_types=['x_z1', 'x_z2'])\n", - "model.load(new_path)\n", + "try:\n", + " model_interface = pDeepModel(charged_frag_types=['x_z1', 'x_z2'])\n", + " model_interface.load(new_path)\n", "\n", - "new_subset_preds = model.predict(get_prediction_dataset())\n", - "\n" + " new_subset_preds = model_interface.predict(get_prediction_dataset())\n", + "except Exception as e:\n", + " print(e)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Even if the user requested correct *len* of frag types when initialization but the requested frag types are not a subset of what was used during training we should raise an *error*." + "d) **Even** if the user requested correct *len* of frag types but the requested frag types are not a subset of what was used during training we should raise an *error* (Solving what was previously the **worst** case)" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 58, "metadata": {}, "outputs": [ { - "ename": "ValueError", - "evalue": "The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. 
Please retrain the model or use a pretrained model with the correct charged_frag_types.", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[49], line 4\u001b[0m\n\u001b[0;32m 1\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mc_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mx_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124my_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m model\u001b[38;5;241m.\u001b[39mload(new_path)\n\u001b[1;32m----> 4\u001b[0m new_subset_preds \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mget_prediction_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:654\u001b[0m, in \u001b[0;36mpDeepModel.predict\u001b[1;34m(self, precursor_df, batch_size, verbose, reference_frag_df, **kwargs)\u001b[0m\n\u001b[0;32m 644\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpredict\u001b[39m(\n\u001b[0;32m 645\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 646\u001b[0m precursor_df: 
pd\u001b[38;5;241m.\u001b[39mDataFrame,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 651\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 652\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_safe_to_predict:\n\u001b[1;32m--> 654\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 655\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe model is not safe to use for prediction. This might mean that the requested charged_frag_types \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m are not a subset of the charged_frag_types used to train the loaded pretrained model \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39msupported_charged_frag_types\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Please retrain the model or use a pretrained model with the correct charged_frag_types.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 656\u001b[0m )\n\u001b[0;32m 657\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mpredict(\n\u001b[0;32m 658\u001b[0m precursor_df,\n\u001b[0;32m 659\u001b[0m batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 662\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 663\u001b[0m )\n", - "\u001b[1;31mValueError\u001b[0m: The model is not safe to use for prediction. 
This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types." + "name": "stdout", + "output_type": "stream", + "text": [ + "The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['c_z1', 'c_z2', 'x_z1', 'x_z2', 'y_z1', 'y_z2', 'y_modloss_z1', 'y_modloss_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please choose a subset of the supported charged_frag_types or retrain the model with the requested charged_frag_types.\n" ] } ], "source": [ - "model = pDeepModel(charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", - "model.load(new_path)\n", + "try:\n", + " model_interface = pDeepModel(charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + " model_interface.load(new_path)\n", "\n", - "new_subset_preds = model.predict(get_prediction_dataset())\n" + " new_subset_preds = model_interface.predict(get_prediction_dataset())\n", + "except Exception as e:\n", + " print(e)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "User has a weights file and want to predict all fragment types used for training without knowing what exactly was used during training (New feature)\n", + "e) User has a weights file and wants to predict all fragment types supported **without** knowing what exactly was used during training (**New** use case)\n", "\n", "Notice how the requested frag types are overridden in the model interface" ] }, { "cell_type": "code", -
"execution_count": 50, + "execution_count": 59, "metadata": {}, "outputs": [ { @@ -1008,19 +1075,19 @@ } ], "source": [ - "model = pDeepModel(\n", - " charged_frag_types=['c_z1', 'c_z2', 'y_z1', 'y_z2', 'x_z1', 'x_z2', 'y_modloss_z1', 'y_modloss_z2'], # Will be overridden by the model weights\n", + "model_interface = pDeepModel(\n", + " charged_frag_types=['c_z1'], # Will be overridden by the supported charged_frag_types in the loaded weights\n", " override_from_weights=True\n", " )\n", - "model.load(new_path)\n", + "model_interface.load(new_path)\n", "\n", - "print(f\"Model Interface has requested charged_frag_types {model.charged_frag_types}\")\n", - "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + "print(f\"Model Interface has requested charged_frag_types {model_interface.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model_interface.model.supported_charged_frag_types}\")" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 60, "metadata": {}, "outputs": [ { @@ -1130,13 +1197,13 @@ "4 0.0 0.0 " ] }, - "execution_count": 51, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_full_preds = model.predict(get_prediction_dataset())\n", + "new_full_preds = model_interface.predict(get_prediction_dataset())\n", "# verify the predictions are the same\n", "assert np.allclose(legacy_full_preds.values, new_full_preds.values, atol=1e-5)\n", "new_full_preds.head()" @@ -1146,12 +1213,40 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Ms2 model training" + "f) Even If users requested the correct frag types but with an *incorrect* order, they are implicitly ordered using `alphabase.sort_charged_frag_types`" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Interface 
has requested charged_frag_types ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", + "Supported charged_frag_types in the loaded weights ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n" + ] + } + ], + "source": [ + "model_interface = pDeepModel(charged_frag_types=['y_z2', 'y_z1', 'b_z2', 'b_z1', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2'])\n", + "model_interface.load(new_path)\n", + "print(f\"Model Interface has requested charged_frag_types {model_interface.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model_interface.model.supported_charged_frag_types}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Ms2 transfer learning" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1168,7 +1263,7 @@ } ], "source": [ - "trainin_data_path = \"C:/Users/USER/Desktop/Germany/work/MPIB/alphadia/2oh_evidence_txt_0_batch_0.hdf\"\n", + "trainin_data_path = \"../data/2oh_evidence_txt_0_batch_0.hdf\"\n", "speclib = SpecLibFlat()\n", "speclib.load_hdf(trainin_data_path)\n", "speclib.fragment_intensity_df[\"b_modloss_z1\"] = 0\n", @@ -1191,91 +1286,133 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## User importing a legacy model " + "## 2.1 User loading a legacy model " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using correct *len* of frag types when initialization \n" + "Using an incorrect *len* of frag types during initialization (will raise a mismatch error)\n" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2025-02-12 19:42:53> Training with fixed sequence length: 0\n", - "[Training] Epoch=1, Mean Loss=0.0174776264175307\n" + "Error(s) in loading state_dict for ModelMS2Bert:\n", + "\tsize 
mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n", + "\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n", + "\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n", + "\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n" ] } ], "source": [ - "target_frag_types = ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", - "model = pDeepModel(charged_frag_types=target_frag_types)\n", - "model.load(legacy_path)\n", - "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)" + "try:\n", + " target_frag_types = ['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2']\n", + " model_interface = pDeepModel(charged_frag_types=target_frag_types)\n", + " model_interface.load(legacy_path)\n", + " model_interface.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)\n", + "except Exception as e:\n", + " print(e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using incorrect *len* of frag types when initialization (Should raise a mismatch error)\n" + "a) Using correct *len* of frag types when initialization (**Ideal** use case)\n" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 64, "metadata": {}, "outputs": [ { - "ename": "RuntimeError", - "evalue": "Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current 
model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[54], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m target_frag_types \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_z2\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z1\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mb_modloss_z2\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[0;32m 2\u001b[0m model \u001b[38;5;241m=\u001b[39m pDeepModel(charged_frag_types\u001b[38;5;241m=\u001b[39mtarget_frag_types)\n\u001b[1;32m----> 3\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlegacy_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 4\u001b[0m model\u001b[38;5;241m.\u001b[39mtrain(precursor_df\u001b[38;5;241m=\u001b[39mspeclib\u001b[38;5;241m.\u001b[39mprecursor_df, fragment_intensity_df\u001b[38;5;241m=\u001b[39mspeclib\u001b[38;5;241m.\u001b[39mfragment_intensity_df\u001b[38;5;241m.\u001b[39mloc[:,target_frag_types], epoch\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m, verbose\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", - "File 
\u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:638\u001b[0m, in \u001b[0;36mModelInterface.load\u001b[1;34m(self, model_file, model_path_in_zip, **kwargs)\u001b[0m\n\u001b[0;32m 636\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_zipfile(model_file, model_path_in_zip)\n\u001b[0;32m 637\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 638\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_pytorchfile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 639\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 640\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_load_model_from_stream(model_file)\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\model_interface.py:727\u001b[0m, in \u001b[0;36mModelInterface._load_model_from_pytorchfile\u001b[1;34m(self, model_file)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_model_from_pytorchfile\u001b[39m(\u001b[38;5;28mself\u001b[39m, model_file):\n\u001b[0;32m 726\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(model_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m pt_file:\n\u001b[1;32m--> 727\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_model_from_stream\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpt_file\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\users\\user\\desktop\\germany\\work\\mpib\\pepteep\\alphapeptdeep\\peptdeep\\model\\ms2.py:717\u001b[0m, in \u001b[0;36mpDeepModel._load_model_from_stream\u001b[1;34m(self, stream)\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moverride_from_weights:\n\u001b[0;32m 715\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcharged_frag_types \u001b[38;5;241m=\u001b[39m tensor_to_charged_frags(loaded_charged_frag_types)\n\u001b[1;32m--> 717\u001b[0m (missing_keys, unexpect_keys) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_state_dict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 718\u001b[0m \u001b[43m \u001b[49m\u001b[43mto_be_loaded_state_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m 719\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 720\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_update_model_state()\n\u001b[0;32m 721\u001b[0m missing_keys \u001b[38;5;241m=\u001b[39m [key \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m missing_keys \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_supported_charged_frag_types\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m key]\n", - "File \u001b[1;32mc:\\Users\\USER\\anaconda3\\envs\\peptdeep\\lib\\site-packages\\torch\\nn\\modules\\module.py:2153\u001b[0m, in \u001b[0;36mModule.load_state_dict\u001b[1;34m(self, state_dict, strict, assign)\u001b[0m\n\u001b[0;32m 2148\u001b[0m error_msgs\u001b[38;5;241m.\u001b[39minsert(\n\u001b[0;32m 2149\u001b[0m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMissing key(s) in state_dict: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m. 
\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2150\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mk\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m missing_keys)))\n\u001b[0;32m 2152\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(error_msgs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m-> 2153\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mError(s) in loading state_dict for \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(\n\u001b[0;32m 2154\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(error_msgs)))\n\u001b[0;32m 2155\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _IncompatibleKeys(missing_keys, unexpected_keys)\n", - "\u001b[1;31mRuntimeError\u001b[0m: Error(s) in loading state_dict for ModelMS2Bert:\n\tsize mismatch for output_nn.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for output_nn.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2]).\n\tsize mismatch for 
modloss_nn.1.nn.2.weight: copying a param with shape torch.Size([4, 64]) from checkpoint, the shape in current model is torch.Size([2, 64]).\n\tsize mismatch for modloss_nn.1.nn.2.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([2])." + "name": "stdout", + "output_type": "stream", + "text": [ + "2025-02-27 18:19:19> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, Mean Loss=0.01842992820052637\n", + "[Training] Epoch=2, Mean Loss=0.017078618622488446\n", + "[Training] Epoch=3, Mean Loss=0.01577050112084382\n" ] } ], "source": [ - "target_frag_types = ['b_z1', 'b_z2', 'b_modloss_z1', 'b_modloss_z2']\n", - "model = pDeepModel(charged_frag_types=target_frag_types)\n", - "model.load(legacy_path)\n", - "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)" + "target_frag_types = ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", + "model_interface = pDeepModel(charged_frag_types=target_frag_types)\n", + "model_interface.load(legacy_path)\n", + "model_interface.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=3, verbose=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## User importing a new model " + "## 2.2 Benefits of the new parameter format for transfer learning." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Trining on new fragment types that were not part of the training of the original weights (New feature). 
\n", - "- This is not training from scratch but rather loading the pre-trained backbone and only the prediction heads are initialized from scratch which results in a much faster convergence and reduce the risk of overfiting.\n", - "- Notice how when the requested frag types are not a subset of the supported the model is not safe to use for prediction, but after training the model is now safe to predict. \n" + "Training on new fragment types that were not part of the training of the original weights (**New** feature). \n", + "- Since the architecture to support the requested fragment types will be different lets try the most obvious solution first -- training from scratch. \n" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training the model with the requested fragment types from scratch\n", + "2025-02-27 18:19:23> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, Mean Loss=0.20166994051800835\n", + "[Training] Epoch=2, Mean Loss=0.15216446336772707\n", + "[Training] Epoch=3, Mean Loss=0.12939151955975425\n", + "[Training] Epoch=4, Mean Loss=0.1190311929417981\n", + "[Training] Epoch=5, Mean Loss=0.10276750516560343\n", + "[Training] Epoch=6, Mean Loss=0.0983661309712463\n", + "[Training] Epoch=7, Mean Loss=0.09396917579902543\n", + "[Training] Epoch=8, Mean Loss=0.08694049964348476\n", + "[Training] Epoch=9, Mean Loss=0.08316284211145507\n", + "[Training] Epoch=10, Mean Loss=0.08015417887104882\n" + ] + } + ], + "source": [ + "# Notice we are not loading any weights but training the model from scratch\n", + "target_frag_types = ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] \n", + "model_interface = pDeepModel(charged_frag_types=target_frag_types)\n", + "print(\"Training the model with the requested fragment types from scratch\")\n", + "model_interface.train(precursor_df=speclib.precursor_df, 
fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=10, verbose=1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Using transfer learning by loading the pre-trained backbone and only the prediction heads are initialized from scratch which results in a much faster convergence and reduce the risk of overfiting.\n", + "- Notice 2 things:\n", + " - The final training loss compared to training from scratch for the same number of epochs.\n", + " - When the requested frag types are not a subset of the supported, the model is not safe to use for prediction. After training, the model's state is automatically changed to _safe_to_predict." + ] + }, + { + "cell_type": "code", + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -1283,10 +1420,66 @@ "output_type": "stream", "text": [ "Trying to predict when the requested fragment types are not supported by the pretrained model\n", - "Error: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. Please retrain the model or use a pretrained model with the correct charged_frag_types.\n", + "Requested fragment types: ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2']\n", + "Supported fragment types in the loaded weights: ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']\n", + "Error: The model is not safe to use for prediction. This might mean that the requested charged_frag_types ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] are not a subset of the charged_frag_types used to train the loaded pretrained model ['b_z1', 'b_z2', 'y_z1', 'y_z2', 'b_modloss_z1', 'b_modloss_z2', 'y_modloss_z1', 'y_modloss_z2']. 
Please choose a subset of the supported charged_frag_types or retrain the model with the requested charged_frag_types.\n" + ] + } + ], + "source": [ + "target_frag_types = ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] \n", + "model_interface = pDeepModel(charged_frag_types=target_frag_types)\n", + "model_interface.load(new_path)\n", + "print(\"Trying to predict when the requested fragment types are not supported by the pretrained model\")\n", + "print(f\"Requested fragment types: {target_frag_types}\")\n", + "print(f\"Supported fragment types in the loaded weights: {model_interface.model.supported_charged_frag_types}\")\n", + "try: \n", + " # try to predict with the new model\n", + " model_interface.predict(get_prediction_dataset())\n", + "except Exception as e:\n", + " print(f\"Error: {e}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Training the model with the requested fragment types\n", - "2025-02-12 19:43:01> Training with fixed sequence length: 0\n", - "[Training] Epoch=1, Mean Loss=0.08573874365538359\n", + "2025-02-27 18:30:05> Training with fixed sequence length: 0\n", + "[Training] Epoch=1, Mean Loss=0.08342412755721146\n", + "[Training] Epoch=2, Mean Loss=0.07767527177929878\n", + "[Training] Epoch=3, Mean Loss=0.07237229889465703\n", + "[Training] Epoch=4, Mean Loss=0.06727897334429953\n", + "[Training] Epoch=5, Mean Loss=0.06287239574723774\n", + "[Training] Epoch=6, Mean Loss=0.05764101083493895\n", + "[Training] Epoch=7, Mean Loss=0.05389009426451392\n", + "[Training] Epoch=8, Mean Loss=0.05020008898443646\n", + "[Training] Epoch=9, Mean Loss=0.04623174532834026\n", + "[Training] Epoch=10, Mean Loss=0.041998228368659817\n" + ] + } + ], + "source": [ + "print(\"Training the model with the requested fragment types\")\n", + "\n", + "model_interface.train(precursor_df=speclib.precursor_df, 
fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=10, verbose=1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "Trying to predict after training with the requested fragment types\n" ] }, @@ -1323,35 +1516,35 @@ " 0.000000\n", " 0.0\n", " 1.000000\n", - " 0.000000\n", + " 0.0\n", " \n", " \n", " 1\n", - " 0.156433\n", + " 0.116859\n", + " 0.0\n", + " 0.316274\n", " 0.0\n", - " 0.352000\n", - " 0.000000\n", " \n", " \n", " 2\n", - " 0.042533\n", + " 0.011386\n", + " 0.0\n", + " 0.044198\n", " 0.0\n", - " 0.101634\n", - " 0.001894\n", " \n", " \n", " 3\n", - " 0.015229\n", - " 0.0\n", - " 0.188399\n", " 0.000000\n", + " 0.0\n", + " 0.092805\n", + " 0.0\n", " \n", " \n", " 4\n", - " 0.009935\n", - " 0.0\n", - " 0.250530\n", " 0.000000\n", + " 0.0\n", + " 0.125801\n", + " 0.0\n", " \n", " \n", "\n", @@ -1359,36 +1552,23 @@ ], "text/plain": [ " a_z1 a_z2 b_H2O_z1 b_H2O_z2\n", - "0 0.000000 0.0 1.000000 0.000000\n", - "1 0.156433 0.0 0.352000 0.000000\n", - "2 0.042533 0.0 0.101634 0.001894\n", - "3 0.015229 0.0 0.188399 0.000000\n", - "4 0.009935 0.0 0.250530 0.000000" + "0 0.000000 0.0 1.000000 0.0\n", + "1 0.116859 0.0 0.316274 0.0\n", + "2 0.011386 0.0 0.044198 0.0\n", + "3 0.000000 0.0 0.092805 0.0\n", + "4 0.000000 0.0 0.125801 0.0" ] }, - "execution_count": 55, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "target_frag_types = ['a_z1', 'a_z2', 'b_H2O_z1', 'b_H2O_z2'] \n", - "model = pDeepModel(charged_frag_types=target_frag_types)\n", - "model.load(new_path)\n", - "print(\"Trying to predict when the requested fragment types are not supported by the pretrained model\")\n", - "try: \n", - " # try to predict with the new model\n", - " model.predict(get_prediction_dataset())\n", - "except Exception as e:\n", - " print(f\"Error: {e}\")\n", - "\n", - "print(\"Training the model 
with the requested fragment types\")\n", - "model.train(precursor_df=speclib.precursor_df, fragment_intensity_df=speclib.fragment_intensity_df.loc[:,target_frag_types], epoch=1, verbose=1)\n", - "\n", "print(\"Trying to predict after training with the requested fragment types\")\n", "try: \n", " # try to predict with the new model\n", - " preds = model.predict(get_prediction_dataset())\n", + " preds = model_interface.predict(get_prediction_dataset())\n", "except Exception as e:\n", " print(f\"Error: {e}\")\n", "\n", @@ -1399,12 +1579,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After training the the underlying supported fragment types is aligned with teh requested frag types which can then be saved." + "After training the new underlying model has requested frag types as the supported frag types. Which can then be saved and used for future prediction." ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -1417,8 +1597,8 @@ } ], "source": [ - "print(f\"Model Interface has requested charged_frag_types {model.charged_frag_types}\")\n", - "print(f\"Supported charged_frag_types in the loaded weights {model.model.supported_charged_frag_types}\")" + "print(f\"Model Interface has requested charged_frag_types {model_interface.charged_frag_types}\")\n", + "print(f\"Supported charged_frag_types in the loaded weights {model_interface.model.supported_charged_frag_types}\")" ] } ], diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index 4eedba05..1e1f5a86 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -413,7 +413,7 @@ def __init__( warnings.warn("mask_modloss is deprecated and will be removed in the future. 
To mask the modloss fragments, the charged_frag_types should not include the modloss fragments.") self.override_from_weights = override_from_weights - self.charged_frag_types = charged_frag_types + self.charged_frag_types = sort_charged_frag_types(charged_frag_types) self.charge_factor = 0.1 self.NCE_factor = 0.01 From d470f9f804a356bf63ddd6d4b408570c1a43b117 Mon Sep 17 00:00:00 2001 From: Mohamed Sameh Date: Thu, 27 Feb 2025 18:35:32 +0100 Subject: [PATCH 8/8] add test case --- tests/unit/test_ms2_adaptive_frag_types.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/test_ms2_adaptive_frag_types.py b/tests/unit/test_ms2_adaptive_frag_types.py index d4ccf54a..450c527d 100644 --- a/tests/unit/test_ms2_adaptive_frag_types.py +++ b/tests/unit/test_ms2_adaptive_frag_types.py @@ -286,3 +286,13 @@ def test_prediction_after_alignment(): assert model._safe_to_predict, "Model was not safe to predict after alignment" # And the prediction should have only the supported charged frag types assert set(pred_df.columns) == set(model.model.supported_charged_frag_types), "Prediction did not have all supported charged frag types" + +def test_charged_frag_types_order(): + # Given user requests supported charged frag types in a different order + requested_charged_frag_types = ["y_z2", "b_z1", "b_z2", "y_z1", "b_modloss_z1", "b_modloss_z2", "y_modloss_z1", "y_modloss_z2"] + # When the user loads the model from new weights + model = pDeepModel(requested_charged_frag_types) + model.load(transform_weights_to_new_format()) + # Then the model should automatically order them to match the alphabase.sort_charged_frag_types + expected_charged_frag_types = ["b_z1", "b_z2", "y_z1", "y_z2", "b_modloss_z1", "b_modloss_z2", "y_modloss_z1", "y_modloss_z2"] + assert model.charged_frag_types == expected_charged_frag_types, "Charged frag types were not ordered as expected"