rhlvora · Jul 2, 2020
diff --git a/‎generate_annotations_with_malignancy.ipynb
+433 b/‎generate_annotations_with_malignancy.ipynb
+433
@@ -0,0 +1,433 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Malignancy Annotations\n",
+    "\n",
+    "This notebook compiles the `annotations_with_malignancy.csv` and also drops annotations for CTs it cannot find.\n",
+    "\n",
+    "In addition to the usual suspects, you need to have the `pylidc` Python package (use `pip install pylidc` or [check out the source](https://pylidc.github.io/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import SimpleITK as sitk\n",
+    "import pandas\n",
+    "import glob, os\n",
+    "import numpy\n",
+    "import tqdm\n",
+    "import pylidc\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We first load the annotations from the LUNA challenge."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "annotations = pandas.read_csv('data/part2/luna/annotations.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "scrolled": false
+   },
+   "source": [
+    "For the CTs where we have a `.mhd` file, we collect the malignancy_data from PyLIDC.\n",
+    "\n",
+    "It is a bit tedious as we need to convert the pixel locations provided by PyLIDC to physical points.\n",
+    "We will see some warnings about annotations to be too close too each other (PyLIDC expects to have 4 annotations per site, see Chapter 14 for some details, including when we consider a nodule to be malignant).\n",
+    "\n",
+    "This takes quite a while (~1-2 seconds per scan on one of the author's computer)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 11%|█▏        | 69/601 [01:52<13:05,  1.48s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 15%|█▌        | 93/601 [02:31<14:46,  1.75s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 18%|█▊        | 107/601 [02:53<14:35,  1.77s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 37%|███▋      | 225/601 [06:16<11:28,  1.83s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 44%|████▍     | 267/601 [07:24<07:51,  1.41s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 47%|████▋     | 281/601 [07:46<09:37,  1.80s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 61%|██████    | 368/601 [10:16<06:19,  1.63s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 72%|███████▏  | 434/601 [11:57<03:41,  1.32s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 74%|███████▍  | 446/601 [12:20<03:09,  1.22s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 75%|███████▍  | 450/601 [12:26<03:49,  1.52s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 88%|████████▊ | 527/601 [14:15<01:35,  1.29s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 96%|█████████▌| 577/601 [15:17<00:38,  1.59s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 99%|█████████▉| 597/601 [15:44<00:06,  1.66s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Failed to reduce all groups to <= 4 Annotations.\n",
+      "Some nodules may be close and must be grouped manually.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 601/601 [15:48<00:00,  1.58s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "malignancy_data = []\n",
+    "missing = []\n",
+    "spacing_dict = {}\n",
+    "scans = {s.series_instance_uid:s for s in pylidc.query(pylidc.Scan).all()}\n",
+    "suids = annotations.seriesuid.unique()\n",
+    "for suid in tqdm.tqdm(suids):\n",
+    "    fn = glob.glob('./data-unversioned/part2/luna/subset*/{}.mhd'.format(suid))\n",
+    "    if len(fn) == 0 or '*' in fn[0]:\n",
+    "        missing.append(suid)\n",
+    "        continue\n",
+    "    fn = fn[0]\n",
+    "    x = sitk.ReadImage(fn)\n",
+    "    spacing_dict[suid] = x.GetSpacing()\n",
+    "    s = scans[suid]\n",
+    "    for ann_cluster in s.cluster_annotations():\n",
+    "        # this is our malignancy criteron described in Chapter 14\n",
+    "        is_malignant = len([a.malignancy for a in ann_cluster if a.malignancy >= 4])>=2\n",
+    "        centroid = numpy.mean([a.centroid for a in ann_cluster], 0)\n",
+    "        bbox = numpy.mean([a.bbox_matrix() for a in ann_cluster], 0).T\n",
+    "        coord = x.TransformIndexToPhysicalPoint([int(numpy.round(i)) for i in centroid[[1, 0, 2]]])\n",
+    "        bbox_low = x.TransformIndexToPhysicalPoint([int(numpy.round(i)) for i in bbox[0, [1, 0, 2]]])\n",
+    "        bbox_high = x.TransformIndexToPhysicalPoint([int(numpy.round(i)) for i in bbox[1, [1, 0, 2]]])\n",
+    "        malignancy_data.append((suid, coord[0], coord[1], coord[2], bbox_low[0], bbox_low[1], bbox_low[2], bbox_high[0], bbox_high[1], bbox_high[2], is_malignant, [a.malignancy for a in ann_cluster]))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can check how many `mhd`s you are missing. It seems that the LUNA data has dropped a couple(?). Don't worry if there are <10 missing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MISSING []\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"MISSING\", missing)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We stick the data we got from PyLIDC into a DataFrame."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_mal = pandas.DataFrame(malignancy_data, columns=['seriesuid', 'coordX', 'coordY', 'coordZ', 'bboxLowX', 'bboxLowY', 'bboxLowZ', 'bboxHighX', 'bboxHighY', 'bboxHighZ', 'mal_bool', 'mal_details'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And now we match the malignancy data to the annotations. This is a lot faster..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 601/601 [00:01<00:00, 316.12it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "processed_annot = []\n",
+    "annotations['mal_bool'] = float('nan')\n",
+    "annotations['mal_details'] = [[] for _ in annotations.iterrows()]\n",
+    "bbox_keys = ['bboxLowX', 'bboxLowY', 'bboxLowZ', 'bboxHighX', 'bboxHighY', 'bboxHighZ']\n",
+    "for k in bbox_keys:\n",
+    "    annotations[k] = float('nan')\n",
+    "for series_id in tqdm.tqdm(annotations.seriesuid.unique()):\n",
+    "    # series_id = '1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860'\n",
+    "    # c = candidates[candidates.seriesuid == series_id]\n",
+    "    a = annotations[annotations.seriesuid == series_id]\n",
+    "    m = df_mal[df_mal.seriesuid == series_id]\n",
+    "    if len(m) > 0:\n",
+    "        m_ctrs = m[['coordX', 'coordY', 'coordZ']].values\n",
+    "        a_ctrs = a[['coordX', 'coordY', 'coordZ']].values\n",
+    "        #print(m_ctrs.shape, a_ctrs.shape)\n",
+    "        matches = (numpy.linalg.norm(a_ctrs[:, None] - m_ctrs[None], ord=2, axis=-1) / a.diameter_mm.values[:, None] < 0.5)\n",
+    "        has_match = matches.max(-1)\n",
+    "        match_idx = matches.argmax(-1)[has_match]\n",
+    "        a_matched = a[has_match].copy()\n",
+    "        # c_matched['diameter_mm'] = a.diameter_mm.values[match_idx]\n",
+    "        a_matched['mal_bool'] = m.mal_bool.values[match_idx]\n",
+    "        a_matched['mal_details'] = m.mal_details.values[match_idx]\n",
+    "        for k in bbox_keys:\n",
+    "            a_matched[k] = m[k].values[match_idx]\n",
+    "        processed_annot.append(a_matched)\n",
+    "        processed_annot.append(a[~has_match])\n",
+    "    else:\n",
+    "        processed_annot.append(c)\n",
+    "processed_annot = pandas.concat(processed_annot)\n",
+    "processed_annot.sort_values('mal_bool', ascending=False, inplace=True)\n",
+    "processed_annot['len_mal_details'] = processed_annot.mal_details.apply(len)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we drop NAs (where we didn't find a match) and save it in the right place."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_nona = processed_annot.dropna()\n",
+    "df_nona.to_csv('./data/part2/luna/annotations_with_malignancy.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}