From 0f25a4d0bbd25f1f0f9dd11d51d40e5f38bf63fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wiewi=C3=B3rka?= Date: Sat, 21 Dec 2024 08:51:05 +0100 Subject: [PATCH] feat: Custom column names and suffixes for overlap and nearest operations (#43) * doc: Installation instructions * chore: Readme refactor * feat: Add support for custom column names and suffixes * Fixing needless borrow * Removing assertion and adding test case for non-default suffixes * Creating release 0.3.0 --- .github/workflows/publish_to_pypi.yml | 1 + .pre-commit-config.yaml | 11 ++ Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 63 +------ docs/notebooks/tutorial.ipynb | 146 ++++++++++++-- polars_bio/range_op.py | 34 ++-- polars_bio/range_op_helpers.py | 27 +-- polars_bio/range_op_io.py | 6 +- polars_bio/range_viz.py | 4 +- pyproject.toml | 2 +- src/lib.rs | 261 ++++++++++++++++++++------ tests/test_bioframe.py | 9 +- tests/test_native.py | 4 + tests/test_pandas.py | 4 + tests/test_polars.py | 26 ++- 16 files changed, 421 insertions(+), 181 deletions(-) diff --git a/.github/workflows/publish_to_pypi.yml b/.github/workflows/publish_to_pypi.yml index 29287e4..0437ce3 100644 --- a/.github/workflows/publish_to_pypi.yml +++ b/.github/workflows/publish_to_pypi.yml @@ -11,6 +11,7 @@ on: - 'docs/**' - 'benchmark/**' - 'mkdocs.yml' + - 'README.md' pull_request: workflow_dispatch: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6352607..3fabbaa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,3 +29,14 @@ repos: hooks: - id: fmt - id: cargo-check + +### FIXME +# - repo: https://github.com/ddkasa/check-mkdocs.git +# rev: 65e819a4c62ee22c38f244b51b63f2f9b89a66d0 +# hooks: +# - id: check-mkdocs +# name: check-mkdocs +# args: ["--config", "mkdocs.yml"] # Optional, mkdocs.yml is the default +# # If you have additional plugins or libraries that are not included in +# # check-mkdocs, add them here +# additional_dependencies: ['mkdocs-material', 'mkdocs-jupyter', 'mkdocstrings-python'] \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index a23c351..760ef6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2734,7 +2734,7 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "polars_bio" -version = "0.2.11" +version = "0.2.12" dependencies = [ "arrow", "datafusion", diff --git a/Cargo.toml b/Cargo.toml index 37951f6..bbfbc7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "polars_bio" -version = "0.2.11" +version = "0.3.0" edition = "2021" [lib] diff --git a/README.md b/README.md index e0f1347..4f906e3 100644 --- a/README.md +++ b/README.md @@ -1,59 +1,10 @@ -# polars_bio +# polars-bio - Next-gen Python DataFrame operations for genomics! +![CI](https://github.com/biodatageeks/polars-bio/actions/workflows/publish_to_pypi.yml/badge.svg?branch=master) +![Docs](https://github.com/biodatageeks/polars-bio/actions/workflows/publish_documentation.yml/badge.svg?branch=master) +![logo](docs/assets/logo-large.png) -## Features +[polars-bio](https://pypi.org/project/polars-bio/) is a Python library for genomics built on top of [polars](https://pola.rs/), [Apache Arrow](https://arrow.apache.org/) and [Apache DataFusion](https://datafusion.apache.org/). +It provides a DataFrame API for genomics data and is designed to be blazing fast, memory efficient and easy to use. -## Genomic ranges operations - -| Features | Bioframe | polars-bio | PyRanges | Pybedtools | PyGenomics | GenomicRanges | -|--------------|--------------------|---------------------|--------------------|--------------------|--------------------|--------------------| -| overlap | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -| nearest | :white_check_mark: | :white_check_mark: | :white_check_mark: | | | | -| cluster | :white_check_mark: | | | | | | -| merge | :white_check_mark: | | | | | | -| complement | :white_check_mark: | | | | | | -| select/slice | :white_check_mark: | | | | | | -| | | | | | | | -| coverage | :white_check_mark: | | | | | | -| expand | :white_check_mark: | | | | | | -| sort | :white_check_mark: | | | | | | - - -## Input/Output -| I/O | Bioframe | polars-bio | PyRanges | Pybedtools | PyGenomics | GenomicRanges | -|------------------|--------------------|------------------------|--------------------|------------|------------|---------------| -| Pandas DataFrame | :white_check_mark: | :white_check_mark: | :white_check_mark: | | | | -| Polars DataFrame | | :white_check_mark: | | | | | -| Polars LazyFrame | | :white_check_mark: | | | | | -| Native readers | | :white_check_mark: | | | | | - - -## Genomic file format -| I/O | Bioframe | polars-bio | PyRanges | Pybedtools | PyGenomics | GenomicRanges | -|----------------|--------------------|------------|--------------------|------------|------------|---------------| -| BED | :white_check_mark: | | :white_check_mark: | | | | -| BAM | | | | | | | -| VCF | | | | | | | - - -## Performance -![img.png](benchmark/results-overlap-0.1.1.png) - -![img.png](benchmark/results-overlap-df-0.1.1.png) - -![img.png](benchmark/results-nearest-0.1.1.png) - -## Remarks - -Pyranges is multithreaded, but : - -* Requires Ray backend plus -```bash - nb_cpu: int, default 1 - - How many cpus to use. Can at most use 1 per chromosome or chromosome/strand tuple. - Will only lead to speedups on large datasets. -``` - -* for nearest returns no empty rows if there is no overlap (we follow Bioframe where nulls are returned) -# \ No newline at end of file +Read the [documentation](https://biodatageeks.github.io/polars-bio/) \ No newline at end of file diff --git a/docs/notebooks/tutorial.ipynb b/docs/notebooks/tutorial.ipynb index 1ad2000..f7c8335 100644 --- a/docs/notebooks/tutorial.ipynb +++ b/docs/notebooks/tutorial.ipynb @@ -9,14 +9,19 @@ { "cell_type": "code", "id": "7b173024d3e8f76", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-20T19:12:04.948380Z", + "start_time": "2024-12-20T19:12:04.544324Z" + } + }, "source": [ "import polars_bio as pb\n", "import pandas as pd\n", "from polars_bio.range_viz import visualize_intervals" ], "outputs": [], - "execution_count": null + "execution_count": 1 }, { "cell_type": "markdown", @@ -27,19 +32,24 @@ { "cell_type": "code", "id": "86fe039c3780140e", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-20T19:12:06.150356Z", + "start_time": "2024-12-20T19:12:06.145095Z" + } + }, "source": [ "df1 = pd.DataFrame(\n", " [[\"chr1\", 1, 5], [\"chr1\", 3, 8], [\"chr1\", 8, 10], [\"chr1\", 12, 14]],\n", - " columns=[\"contig\", \"pos_start\", \"pos_end\"],\n", + " columns=[\"chrom\", \"start\", \"end\"],\n", ")\n", "\n", "df2 = pd.DataFrame(\n", - " [[\"chr1\", 4, 8], [\"chr1\", 10, 11]], columns=[\"contig\", \"pos_start\", \"pos_end\"]\n", + " [[\"chr1\", 4, 8], [\"chr1\", 10, 11]], columns=[\"chrom\", \"start\", \"end\"]\n", ")" ], "outputs": [], - "execution_count": null + "execution_count": 2 }, { "cell_type": "markdown", @@ -50,32 +60,138 @@ { "cell_type": "code", "id": "304f3aa6fcdc9650", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-20T19:12:08.303754Z", + "start_time": "2024-12-20T19:12:08.294952Z" + } + }, "source": [ "overlapping_intervals = pb.overlap(df1, df2, output_type=\"pandas.DataFrame\")" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:polars_bio:Running Overlap operation with algorithm Coitrees and 1 thread(s)...\n" + ] + } + ], + "execution_count": 3 }, { "cell_type": "code", "id": "61c9254622598622", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-20T19:12:12.727046Z", + "start_time": "2024-12-20T19:12:12.719803Z" + } + }, "source": [ "display(overlapping_intervals)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + " contig_1 pos_start_1 pos_end_1 contig_2 pos_start_2 pos_end_2\n", + "0 chr1 1 5 chr1 4 8\n", + "1 chr1 3 8 chr1 4 8" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contig_1pos_start_1pos_end_1contig_2pos_start_2pos_end_2
0chr115chr148
1chr138chr148
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 4 }, { "cell_type": "code", "id": "e640901ec6e6ce11", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-20T19:12:14.857698Z", + "start_time": "2024-12-20T19:12:14.791600Z" + } + }, "source": [ "visualize_intervals(overlapping_intervals)" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAygAAADTCAYAAABqSTe2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAdCElEQVR4nO3de1hUdeLH8c8IDBAXFUKENgy8RIK3JCmldJMNy7XoYmSakD1dvGTYhm67KWqladvFtMfLrtvqthWWl8q8YaGrpZIildV6yUumq8gPlfCCCOf3hw/zNIIpRp7vMu/X8/g8znfOnPnMOeLhM2e+ZxyWZVkCAAAAAAM0sjsAAAAAAFSjoAAAAAAwBgUFAAAAgDEoKAAAAACMQUEBAAAAYAwKCgAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAIBBVq1aJYfDoVWrVtkdRZI0duxYORwOu2PUG4fDobFjx9odAwDwMygoAABcpCNHjqhRo0Zavny5JGnhwoXy9fVVeXl5jWXLy8s1atQoRUZGyt/fX4mJicrNzb3UkQHAeBQUAMA5PfPMMzpx4oTdMerNiRMn9Mwzz9Tb+vLz8yVJiYmJkqR169apU6dO8vX1rbFsRkaGXn75ZfXv319TpkyRl5eXbrvtNq1du7be8gBAQ+BtdwAAgHTy5Ek5nU67Y9Tg7e0tb++Gc6jw8/M77zLHjh1TQEDABa0vPz9fV199tZo0aSLpTEGpLitnL/fOO+/oxRdf1FNPPSVJGjhwoOLj4zVy5Eh99tlnF/4iAKCB4wwKAI+2efNm3XrrrQoODlZgYKB69uyp9evXu+7fuHGjHA6H5syZU+Oxy5cvl8Ph0OLFi11j+/bt06BBgxQeHi5fX1/FxcXp73//u9vjqueZvPPOO3rmmWd0xRVX6LLLLlNpaWmtGdesWaO+ffsqKipKvr6+uvLKKzVixIgaZzYyMjIUGBionTt3KiUlRQEBAYqMjNT48eNlWZZrud27d8vhcOgvf/mLXnnlFbVo0UL+/v7q3r27tmzZ4rbO2uagOBwODRs2TIsWLVJ8fLzrdS5btqxG9lWrVikhIUF+fn5q2bKlZs6cecHzWnr06KH4+Hht2rRJXbt2lb+/v6KjozVjxgy35U6dOqUxY8aoc+fOaty4sQICAnTjjTcqLy+vxjrPnoNSneWbb77R/fffr6ZNmyopKelncx09elTFxcUqLi7WunXr1KFDBxUXF+vgwYPatGmTYmNjVVxcrKNHj7oe895778nLy0uPPPKIa8zPz08PPfSQ1q1bp7179553ewCAp2g4b4sBQB19/fXXuvHGGxUcHKyRI0fKx8dHM2fOVI8ePbR69WolJiYqISFBMTExmjdvntLT090en5OTo6ZNmyolJUWSdPDgQV1//fWuX+DDwsK0dOlSPfTQQyotLVVmZqbb45999lk5nU499dRTKi8vP+cZlHfffVfHjx/X4MGDFRoaqvz8fE2dOlU//PCD3n33XbdlKysr1atXL11//fWaPHmyli1bpuzsbJ0+fVrjx493W3bu3Ln68ccfNXToUJ08eVJTpkzRzTffrK+++krh4eE/u+3Wrl2rBQsWaMiQIQoKCtJrr72mu+++W99//71CQ0MlnSl/vXr1UkREhMaNG6fKykqNHz9eYWFh59031Q4fPqzbbrtN9957r/r166d58+Zp8ODBcjqdGjRokCSptLRUf/vb39SvXz89/PDD+vHHHzV79mylpKQoPz9fHTt2PO/z9O3bV61bt9aECRPcylxt7rjjDq1evdptLCcnx/X3IUOGaMiQIerevbvrYgebN29WmzZtFBwc7Pa4Ll26SJIKCwt15ZVXnjcnAHgECwA8VGpqquV0Oq3vvvvONbZ//34rKCjIuummm1xjTz/9tOXj42OVlJS4xsrLy60mTZpYgwYNco099NBDVkREhFVcXOz2PPfdd5/VuHFj6/jx45ZlWVZeXp4lyYqJiXGNVau+Ly8vzzV29jKWZVkTJ060HA6HtWfPHtdYenq6Jcl6/PHHXWNVVVVW7969LafTaR06dMiyLMvatWuXJcny9/e3fvjhB9eyGzZssCRZI0aMcI1lZ2dbZx8qJFlOp9PasWOHa+yLL76wJFlTp051jfXp08e67LLLrH379rnGtm/fbnl7e9dYZ226d+9uSbJeeukl11h5ebnVsWNHq1mzZtapU6csy7Ks06dPW+Xl5W6PPXz4sBUeHu62f6qzZ2dn13h9/fr1O2+eahs3brRyc3Ot119/3ZJkvfnmm1Zubq6VkZFhXXnllVZubq6Vm5trbdy40fWYuLg46+abb66xrq+//tqSZM2YMeOCnx8AGjo+4gXAI1VWVmrFihVKTU1VTEyMazwiIkL333+/1q5d6/rIVVpamioqKrRgwQLXcitWrNCRI0eUlpYmSbIsS/Pnz1efPn1kWZbrI0DFxcVKSUnR0aNHVVBQ4JYhPT1d/v7+583602WOHTum4uJide3aVZZlafPmzTWWHzZsmOvv1WdzTp06pZUrV7otl5qaqiuuuMJ1u0uXLkpMTNSSJUvOmyk5OVktW7Z03W7fvr2Cg4O1c+dOSWe278qVK5WamqrIyEjXcq1atdKtt9563vVX8/b21qOPPuq67XQ69eijj6qoqEibNm2SJHl5ebnOPlVVVamkpESnT59WQkJCjW1+Lo899tgFZ+rcubOSk5N1+vRpRUZGqn///kpOTtahQ4fUs2dPJScnKzk5WZ07d3Y95sSJE7VOnK+eE9OQLkQAAL8UBQWARzp06JCOHz+uq6++usZ911xzjaqqqlzzAjp06KDY2Fi3j/Hk5OTo8ssv18033+xa35EjRzRr1iyFhYW5/XnwwQclSUVFRW7PEx0dfUFZv//+e2VkZCgkJESBgYEKCwtT9+7dJcltnoMkNWrUyK1wSVKbNm0knZl78lOtW7eu8Vxt2rSpsVxtoqKiaow1bdpUhw8flnTmtZ44cUKtWrWqsVxtY+cSGRlZY8J6ba9nzpw5at++vfz8/BQaGqqwsDB99NFHNbbPuVzovigrK3MVz9zcXF1//fUqLi5WUVGR1qxZo2uvvVbFxcWu7VDN39+/1ksPnzx50nU/AOAM5qAAwAVIS0vT888/r+LiYgUFBemDDz5Qv379XFe4qqqqkiQNGDCgxlyVau3bt3e7fSG/lFZWVup3v/udSkpKNGrUKMXGxiogIED79u1TRkaG63kvNS8vr1rHrfPM3/g1vPnmm8rIyFBqaqqysrLUrFkzeXl5aeLEifruu+8uaB0XWhCGDRtW44IJPz2zNnz4cA0fPlwtWrRwK1ARERHat29fjfX997//lSS3s0wA4OkoKAA8UlhYmC677DJt3bq1xn3/+c9/1KhRI7dJy2lpaRo3bpzmz5+v8PBwlZaW6r777nNbX1BQkCorK5WcnFxvOb/66itt27ZNc+bM0cCBA13j5/qCv6qqKu3cudN1lkGStm3bJkm66qqr3Jbdvn17jcdv27atxnIXo1mzZvLz89OOHTtq3Ffb2Lns37+/xmV/z3497733nmJiYrRgwQK3q4NlZ2dfZPpzGzlypAYMGKBdu3bpkUce0dy5cxUREaF58+ZpyZIl+sc//iGpZuHp2LGj8vLyVFpa6jZRfsOGDa77AQBn8BEvAB7Jy8tLt9xyi95//323d7oPHjyot956S0lJSW6/SF5zzTVq166dcnJylJOTo4iICN10001u67v77rs1f/78Gpfqlc58BOxic0ruZyYsy9KUKVPO+Zhp06a5LTtt2jT5+PioZ8+ebsstWrTI7V39/Px8bdiwoU5zRH4ud3JyshYtWqT9+/e7xnfs2KGlS5de8HpOnz6tmTNnum6fOnVKM2fOVFhYmGuOR23baMOGDVq3bt0vfRk1tG3bVsnJyfL29lbTpk01YMAAJScnq7S0VElJSa75J926dXN73D333KPKykrNmjXLNVZeXq433nhDiYmJXMELAH6CMygAPNZzzz2n3NxcJSUlaciQIfL29tbMmTNVXl6uyZMn11g+LS1NY8aMcX1/RaNG7u/xvPDCC8rLy1NiYqIefvhhtW3bViUlJSooKNDKlStVUlJS54yxsbFq2bKlnnrqKe3bt0/BwcGaP39+jTkO1fz8/LRs2TKlp6crMTFRS5cu1UcffaQ//elPNS7v26pVKyUlJWnw4MEqLy/Xq6++qtDQUI0cObLOOWszduxYrVixQt26ddPgwYNVWVmpadOmKT4+XoWFhRe0jsjISE2aNEm7d+9WmzZtlJOTo8LCQs2aNUs+Pj6SpN///vdasGCB7rzzTvXu3Vu7du3SjBkz1LZtW5WVldXLaznbp59+6rqktCR99tlnri9grE1iYqL69u2rp59+WkVFRWrVqpXmzJmj3bt3a/bs2b9KRgD4X8UZFAAeKy4uTmvWrFF8fLwmTpyocePGqUWLFq6Scba0tDRVVVXp+PHjrqt3/VR4eLjy8/P14IMPasGCBRo2bJimTJmikpISTZo06aIy+vj46MMPP1THjh1dGVu3bq25c+fWuryXl5eWLVumAwcOKCsrS59//rmys7P17LPP1lh24MCBevzxxzVt2jQ9//zziouL0yeffKKIiIiLynq2zp07a+nSpWratKlGjx6t2bNna/z48erZs+cFfaO7dGbi/ZIlS7Rx40ZlZWVp7969mjZtmh5++GHXMhkZGZowYYK++OILDR8+XMuXL9ebb76phISEenkdtfn000/VtWtXSWe+nHPv3r2u2+cyd+5cZWZm6p///KeGDx+uiooKLV682O1MHABAclh2zGgEANS7jIwMvffee+c9a7B7925FR0frxRdf/Nl3/X8tqamp+vrrr2udA/NTPXr0UHFxca0fmQMANFycQQEA/GrO/n6P7du3a8mSJerRo4c9gQAAxmMOCgDgVxMTE6OMjAzFxMRoz549mj59upxOZ73NcwEANDwUFADAr6ZXr156++23deDAAfn6+uqGG27QhAkTav2SSAAAJOagAAAAADAIc1AAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMIa33QHsVFJSorKyMrtjALYKDAxUSEiI3TEAAAAkeXBBKSkp0bhx41VRccruKICtfHycys4eQ0kBAABG8NiCUlZWpoqKUwr97d3yaRJmdxzAFhVHDun/8uarrKyMggIAAIzgsQWlmk+TMDkvj7Q7BgAAAAAxSR4AAACAQSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGAMCgoAAAAAY1BQAAAAABiDggIAAADAGBQUAAAAAMagoAAAAAAwBgUFAAAAgDEoKAAAAACMQUEBAAAAYAwKCgAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMbztDmC3iiOH7I4A2IZ//wAAwDQeW1ACAwPl4+PU/+XNtzsKYCsfH6cCAwPtjgEAACBJcliWZdkdwi4lJSUqKyuzOwZgq8DAQIWEhNgdAwAAQJKHFxQAAAAAZmGSPAAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjOFtdwDgf1lJSYnKysrsjgHYKjAwUCEhIXbHAAA0EBQU4CKVlJRo/LhxOlVRYXcUwFZOHx+Nyc6mpAAA6gUFBbhIZWVlOlVRoYxO16t5YLDdcQBbHCgr1T82r1dZWRkFBQBQLygowC/UPDBYUU34xQwAAKA+MEkeAAAAgDEoKAAAAACMQUEBAAAAYAwKCgAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGAMCgoAAAAAY1BQAAAAABiDggIAAADAGBQUAAAAAMagoAAAAAAwBgUFAAAAgDG87Q4A/K87UFZqdwTANvz7BwDUNwoKcJECAwPl9PHRPzavtzsKYCunj48CAwPtjgEAaCAclmVZdocA/leVlJSorKzM7hiArQIDAxUSEmJ3DABAA0FBAQAAAGAMJskDAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGAMCgoAAAAAY1BQAAAAABiDggIAAADAGBQUAAAAAMagoAAAAAAwBgUFAAAAgDEoKAAAAACMQUEBAAAAYAwKCgAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGAMCgoAAAAAY1BQAAAAABiDggIAAADAGBQUAAAAAMagoAAAAAAwBgUFAAAAgDEoKAAAAACMQUEBAAAAYAwKCgAAAABjUFAAAAAAGIOCAgAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGAMCgoAAAAAY1BQAAAAABiDggIAAADAGBdVUMrLyzV27FiVl5fXdx5cIPaB/dgH9mMf2I99YD/2gf3YB/ZjH9irvre/w7Isq64PKi0tVePGjXX06FEFBwfXSxDUDfvAfuwD+7EP7Mc+sB/7wH7sA/uxD+xV39ufj3gBAAAAMAYFBQAAAIAxKCgAAAAAjHFRBcXX11fZ2dny9fWt7zy4QOwD+7EP7Mc+sB/7wH7sA/uxD+zHPrBXfW//i5okDwAAAAC/Bj7iBQAAAMAYFBQAAAAAxqCgAAAAADAGBQUAAACAMS6qoLz++uu66qqr5Ofnp8TEROXn59d3LpzDxIkTdd111ykoKEjNmjVTamqqtm7dancsj/XCCy/I4XAoMzPT7igeZ9++fRowYIBCQ0Pl7++vdu3aaePGjXbH8giVlZUaPXq0oqOj5e/vr5YtW+rZZ58V11z5df373/9Wnz59FBkZKYfDoUWLFrndb1mWxowZo4iICPn7+ys5OVnbt2+3J2wD9HPbv6KiQqNGjVK7du0UEBCgyMhIDRw4UPv377cvcAN0vp+Bn3rsscfkcDj06quvXrJ8nuBC9sG3336r22+/XY0bN1ZAQICuu+46ff/993V6njoXlJycHD355JPKzs5WQUGBOnTooJSUFBUVFdV1VbgIq1ev1tChQ7V+/Xrl5uaqoqJCt9xyi44dO2Z3NI/z+eefa+bMmWrfvr3dUTzO4cOH1a1bN/n4+Gjp0qX65ptv9NJLL6lp06Z2R/MIkyZN0vTp0zVt2jR9++23mjRpkiZPnqypU6faHa1BO3bsmDp06KDXX3+91vsnT56s1157TTNmzNCGDRsUEBCglJQUnTx58hInbZh+bvsfP35cBQUFGj16tAoKCrRgwQJt3bpVt99+uw1JG67z/QxUW7hwodavX6/IyMhLlMxznG8ffPfdd0pKSlJsbKxWrVqlL7/8UqNHj5afn1/dnsiqoy5dulhDhw513a6srLQiIyOtiRMn1nVVqAdFRUWWJGv16tV2R/EoP/74o9W6dWsrNzfX6t69u/XEE0/YHcmjjBo1ykpKSrI7hsfq3bu3NWjQILexu+66y+rfv79NiTyPJGvhwoWu21VVVVbz5s2tF1980TV25MgRy9fX13r77bdtSNiwnb39a5Ofn29Jsvbs2XNpQnmYc+2DH374wbriiiusLVu2WC1atLBeeeWVS57NU9S2D9LS0qwBAwb84nXX6QzKqVOntGnTJiUnJ7vGGjVqpOTkZK1bt65uzQj14ujRo5KkkJAQm5N4lqFDh6p3795uPwu4dD744AMlJCSob9++atasmTp16qS//vWvdsfyGF27dtXHH3+sbdu2SZK++OILrV27VrfeeqvNyTzXrl27dODAAbf/kxo3bqzExESOzzY5evSoHA6HmjRpYncUj1FVVaUHHnhAWVlZiouLszuOx6mqqtJHH32kNm3aKCUlRc2aNVNiYuLPfhTvXOpUUIqLi1VZWanw8HC38fDwcB04cKDOT45fpqqqSpmZmerWrZvi4+PtjuMx3nnnHRUUFGjixIl2R/FYO3fu1PTp09W6dWstX75cgwcP1vDhwzVnzhy7o3mEP/7xj7rvvvsUGxsrHx8fderUSZmZmerfv7/d0TxW9TGY47MZTp48qVGjRqlfv34KDg62O47HmDRpkry9vTV8+HC7o3ikoqIilZWV6YUXXlCvXr20YsUK3Xnnnbrrrru0evXqOq3L+1fKiEtg6NCh2rJli9auXWt3FI+xd+9ePfHEE8rNza375ylRb6qqqpSQkKAJEyZIkjp16qQtW7ZoxowZSk9Ptzldwzdv3jz961//0ltvvaW4uDgVFhYqMzNTkZGRbH94vIqKCt17772yLEvTp0+3O47H2LRpk6ZMmaKCggI5HA6743ikqqoqSdIdd9yhESNGSJI6duyozz77TDNmzFD37t0veF11OoNy+eWXy8vLSwcPHnQbP3jwoJo3b16XVeEXGjZsmBYvXqy8vDz95je/sTuOx9i0aZOKiop07bXXytvbW97e3lq9erVee+01eXt7q7Ky0u6IHiEiIkJt27Z1G7vmmmvqfJUQXJysrCzXWZR27drpgQce0IgRIziraKPqYzDHZ3tVl5M9e/YoNzeXsyeX0Jo1a1RUVKSoqCjX8XnPnj36wx/+oKuuusrueB7h8ssvl7e3d70cn+tUUJxOpzp37qyPP/7YNVZVVaWPP/5YN9xwQ52eGBfHsiwNGzZMCxcu1CeffKLo6Gi7I3mUnj176quvvlJhYaHrT0JCgvr376/CwkJ5eXnZHdEjdOvWrcbltbdt26YWLVrYlMizHD9+XI0auR8+vLy8XO+e4dKLjo5W8+bN3Y7PpaWl2rBhA8fnS6S6nGzfvl0rV65UaGio3ZE8ygMPPKAvv/zS7fgcGRmprKwsLV++3O54HsHpdOq6666rl+NznT/i9eSTTyo9PV0JCQnq0qWLXn31VR07dkwPPvhgXVeFizB06FC99dZbev/99xUUFOT6bHHjxo3l7+9vc7qGLygoqMZ8n4CAAIWGhjIP6BIaMWKEunbtqgkTJujee+9Vfn6+Zs2apVmzZtkdzSP06dNHzz//vKKiohQXF6fNmzfr5Zdf1qBBg+yO1qCVlZVpx44drtu7du1SYWGhQkJCFBUVpczMTD333HNq3bq1oqOjNXr0aEVGRio1NdW+0A3Iz23/iIgI3XPPPSooKNDixYtVWVnpOj6HhITI6XTaFbtBOd/PwNml0MfHR82bN9fVV199qaM2WOfbB1lZWUpLS9NNN92k3/72t1q2bJk+/PBDrVq1qm5PdDGX/po6daoVFRVlOZ1Oq0uXLtb69et/8eXEcGEk1frnjTfesDuax+Iyw/b48MMPrfj4eMvX19eKjY21Zs2aZXckj1FaWmo98cQTVlRUlOXn52fFxMRYf/7zn63y8nK7ozVoeXl5tf7/n56eblnWmUsNjx492goPD7d8fX2tnj17Wlu3brU3dAPyc9t/165d5zw+5+Xl2R29wTjfz8DZuMxw/buQfTB79myrVatWlp+fn9WhQwdr0aJFdX4eh2Xx1b8AAAAAzFDnb5IHAAAAgF8LBQUAAACAMSgoAAAAAIxBQQEAAABgDAoKAAAAAGNQUAAAAAAYg4ICAAAAwBgUFAAAAADGoKAAAAAAMAYFBQAAAIAxKCgAAAAAjEFBAQAAAGCM/wc8ExM4N5QQIwAAAABJRU5ErkJggg==" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAygAAADTCAYAAABqSTe2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAckElEQVR4nO3de1RVZeL/8c8ROIDcFEQujRDeIq+ZF0otLZmwcSyqKTMtyFZTiplW5lxSzErTpumiLS9T09hqKiwvleUFHXRyUknRymq8kqWOIl9UwgsiPL8/Wp5fJ1DByP3Eeb/WYi3Pc/bZ+3P2I24/7LM3LmOMEQAAAABYoJHTAQAAAADgNAoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAWGTVqlVyuVxatWqV01EkSRMnTpTL5XI6Rr1xuVyaOHGi0zEAAGdBQQEA4DwdPnxYjRo10rJlyyRJCxcuVGBgoMrLy72WKysrU3Z2tvr376/IyEi5XC794x//cCAxANiPggIAOKPHHntMx48fdzpGvTl+/Lgee+yxeltffn6+JCklJUWStHbtWnXp0kWBgYFeyxUXF2vSpEn66quv1Llz53rbPgA0RP5OBwAASCdOnJDb7XY6RjX+/v7y9284h4qgoKBzLnP06FGFhITUan35+fm65JJL1KRJE0nfF5TTZeWH4uLi9L///U+xsbHasGGDunfvXqfcAOBLOIMCwKdt2rRJ119/vcLDwxUaGqp+/fpp3bp1nuc3bNggl8uluXPnVnvtsmXL5HK5tHjxYs/Y3r17NWzYMMXExCgwMFDt27fX3//+d6/Xnb7O5K233tJjjz2miy66SI0bN1ZpaWmNGT/66CPdeuutSkhIUGBgoFq0aKExY8ZUO7ORmZmp0NBQ7dq1S2lpaQoJCVF8fLwmTZokY4xnua+//loul0t/+ctf9NxzzykxMVHBwcHq06ePtmzZ4rXOmq5BcblcGjlypBYtWqQOHTp43ufSpUurZV+1apW6deumoKAgtWrVSrNnz671dS19+/ZVhw4dtHHjRvXs2VPBwcFKSkrSrFmzvJY7efKkJkyYoK5duyoiIkIhISG66qqrlJeXV22dP74G5XSWL7/8UnfccYeaNm2q3r17nzXXkSNHVFxcrOLiYq1du1adO3dWcXGxDhw4oI0bNyo5OVnFxcU6cuSI5zWBgYGKjY0953sGAHAGBYAP++KLL3TVVVcpPDxcjz76qAICAjR79mz17dtXq1evVkpKirp166aWLVtq3rx5ysjI8Hp9Tk6OmjZtqrS0NEnSgQMHdMUVV3j+Ax8dHa0lS5bonnvuUWlpqUaPHu31+ieeeEJut1uPPPKIysvLz3gG5e2339axY8c0fPhwRUVFKT8/X9OnT9eePXv09ttvey1bWVmp/v3764orrtC0adO0dOlSZWdn69SpU5o0aZLXsq+99pq+++47ZWVl6cSJE3rhhRd07bXX6vPPP1dMTMxZ992aNWu0YMECjRgxQmFhYXrxxRd1yy236JtvvlFUVJSk78tf//79FRcXp8cff1yVlZWaNGmSoqOjzzk3px06dEi/+c1vdNttt2nw4MGaN2+ehg8fLrfbrWHDhkmSSktL9fLLL2vw4MG699579d133+mVV15RWlqa8vPzddlll51zO7feeqvatGmjyZMne5W5mtx4441avXq111hOTo7nzyNGjNCIESPUp08fa252AAC/KAYAfFR6erpxu91m586dnrF9+/aZsLAwc/XVV3vG/vjHP5qAgABTUlLiGSsvLzdNmjQxw4YN84zdc889Ji4uzhQXF3tt5/bbbzcRERHm2LFjxhhj8vLyjCTTsmVLz9hpp5/Ly8vzjP14GWOMmTJlinG5XGb37t2esYyMDCPJPPDAA56xqqoqM2DAAON2u83BgweNMcYUFhYaSSY4ONjs2bPHs+z69euNJDNmzBjPWHZ2tvnxoUKScbvdZseOHZ6xTz/91Egy06dP94wNHDjQNG7c2Ozdu9cztn37duPv719tnTXp06ePkWSeffZZz1h5ebm57LLLTPPmzc3JkyeNMcacOnXKlJeXe7320KFDJiYmxmt+TmfPzs6u9v4GDx58zjynbdiwweTm5pqXXnrJSDKvv/66yc3NNZmZmaZFixYmNzfX5Obmmg0bNtT4+k8++cRIMq+++mqttwkAvoSPeAHwSZWVlVq+fLnS09PVsmVLz3hcXJzuuOMOrVmzxvORq0GDBqmiokILFizwLLd8+XIdPnxYgwYNkiQZYzR//nwNHDhQxhjPR4CKi4uVlpamI0eOqKCgwCtDRkaGgoODz5n1h8scPXpUxcXF6tmzp4wx2rRpU7XlR44c6fnz6bM5J0+e1IoVK7yWS09P10UXXeR53KNHD6WkpOjDDz88Z6bU1FS1atXK87hTp04KDw/Xrl27JH2/f1esWKH09HTFx8d7lmvdurWuv/76c67/NH9/f913332ex263W/fdd5+Kioq0ceNGSZKfn5/n7FNVVZVKSkp06tQpdevWrdo+P5P777+/1pm6du2q1NRUnTp1SvHx8RoyZIhSU1N18OBB9evXT6mpqUpNTVXXrl1rvU4AwP9HQQHgkw4ePKhjx47pkksuqfbcpZdeqqqqKn377beSpM6dOys5OdnrYzw5OTlq1qyZrr32Ws/6Dh8+rDlz5ig6Otrr6+6775YkFRUVeW0nKSmpVlm/+eYbZWZmKjIyUqGhoYqOjlafPn0kyes6B0lq1KiRV+GSpLZt20r6/tqTH2rTpk21bbVt27bacjVJSEioNta0aVMdOnRI0vfv9fjx42rdunW15WoaO5P4+PhqF6zX9H7mzp2rTp06KSgoSFFRUYqOjtYHH3xQbf+cSW3noqyszFM8c3NzdcUVV6i4uFhFRUX66KOPdPnll6u4uNizHwAAdcc1KABQC4MGDdJTTz2l4uJihYWF6b333tPgwYM9d7iqqqqSJA0dOrTatSqnderUyetxbc6eVFZW6te//rVKSko0btw4JScnKyQkRHv37lVmZqZnuxean59fjePmHNdv/Bxef/11ZWZmKj09XWPHjlXz5s3l5+enKVOmaOfOnbVaR23mQvr+7NSPb5jwwzNro0aN0qhRo5SYmFirogcAqI6CAsAnRUdHq3Hjxtq6dWu15/773/+qUaNGatGihWds0KBBevzxxzV//nzFxMSotLRUt99+u9f6wsLCVFlZqdTU1HrL+fnnn2vbtm2aO3eu7rrrLs94bm5ujctXVVVp165dnrMMkrRt2zZJ0sUXX+y17Pbt26u9ftu2bdWWOx/NmzdXUFCQduzYUe25msbOZN++fdVu+/vj9/POO++oZcuWWrBggdfdwbKzs88z/Zk9+uijGjp0qAoLC/X73/9er732muLi4jRv3jx9+OGHnl++WNvCAwCojo94AfBJfn5+uu666/Tuu+96/aT7wIEDeuONN9S7d2+Fh4d7xi+99FJ17NhROTk5ysnJUVxcnK6++mqv9d1yyy2aP39+tVv1St9/BOx8c0reZyaMMXrhhRfO+JoZM2Z4LTtjxgwFBASoX79+XsstWrRIe/fu9TzOz8/X+vXr63SNyNlyp6amatGiRdq3b59nfMeOHVqyZEmt13Pq1CnNnj3b8/jkyZOaPXu2oqOjPdd41LSP1q9fr7Vr1/7Ut1FNu3btlJqaKn9/fzVt2lRDhw5VamqqSktL1bt3b8/1J7169ar3bQOAr+AMCgCf9eSTTyo3N1e9e/fWiBEj5O/vr9mzZ6u8vFzTpk2rtvygQYM0YcIEBQUF6Z577lGjRt4/43n66aeVl5enlJQU3XvvvWrXrp1KSkpUUFCgFStWqKSkpM4Zk5OT1apVKz3yyCPau3evwsPDNX/+/DNe4xAUFKSlS5cqIyNDKSkpWrJkiT744AP96U9/qnZ739atW6t3794aPny4ysvL9fzzzysqKkqPPvponXPWZOLEiVq+fLl69eql4cOHq7KyUjNmzFCHDh20efPmWq0jPj5eU6dO1ddff622bdsqJydHmzdv1pw5cxQQECBJ+u1vf6sFCxbopptu0oABA1RYWKhZs2apXbt2Kisrq5f38mP/+c9/PLeUlqSPP/5YjzzyyFlfM2PGDB0+fNhT2N5//33t2bNHkvTAAw8oIiLiZ8kKAL84zt1ADACcV1BQYNLS0kxoaKhp3Lixueaaa8zHH39c47Lbt283kowks2bNmhqXOXDggMnKyjItWrQwAQEBJjY21vTr18/MmTPHs8zpWwm//fbb1V5f022Gv/zyS5OammpCQ0NNs2bNzL333uu5re8Pb1WbkZFhQkJCzM6dO811111nGjdubGJiYkx2draprKz0LHf6NsPPPPOMefbZZ02LFi1MYGCgueqqq8ynn37qledMtxnOysqqlj0xMdFkZGR4ja1cudJ06dLFuN1u06pVK/Pyyy+bhx9+2AQFBdW4/36oT58+pn379mbDhg3myiuvNEFBQSYxMdHMmDHDa7mqqiozefJkk5iYaAIDA02XLl3M4sWLTUZGhklMTKyWvabbDJ++BXNtJScnmyeeeMIYY8yePXuMJPPJJ5+c9TWJiYmevz8//iosLKzT9gGgIXMZ48AVjQCAepeZmal33nnnnGcNvv76ayUlJemZZ54550/9fw7p6en64osvarwG5of69u2r4uLiGj8yBwBouLgGBQDwszl+/LjX4+3bt+vDDz9U3759nQkEALAe16AAAH42LVu2VGZmplq2bKndu3dr5syZcrvd9XadCwCg4aGgAAB+Nv3799ebb76p/fv3KzAwUFdeeaUmT55c4y+JBABAkrgGBQAAAIA1uAYFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWIOCAgAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGv4Ox0Avq2kpERlZWVOxwDwE4SGhioyMtLpGACABoKCAseUlJTo8ccnqaLipNNRAPwEAQFuZWdPoKQAAOoFBQWOKSsrU0XFSUVdc4sCmkQ7HQfAeag4fFD/lzdfZWVlFBQAQL2goMBxAU2i5W4W73QMAAAAWICL5AEAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAAAAAa1BQAAAAAFiDggIAAADAGhQUAAAAANagoAAAAACwBgUFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWIOCAgAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYw9/pAEDF4YNORwBwnvj+BQDUNwoKHBMaGqqAALf+L2++01EA/AQBAW6FhoY6HQMA0EC4jDHG6RDwXSUlJSorK3M6BoCfIDQ0VJGRkU7HAAA0EBQUAAAAANbgInkAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAAAAAa1BQAAAAAFjD3+kAwC9ZSUmJysrKnI4BOCo0NFSRkZFOxwAANBAUFOA8lZSUaNLjj+tkRYXTUQBHuQMCNCE7m5ICAKgXFBTgPJWVlelkRYUyu1yh2NBwp+MAjthfVqp/bFqnsrIyCgoAoF5QUICfKDY0XAlN+I8ZAABAfeAieQAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAAAAAa1BQAAAAAFiDggIAAADAGhQUAAAAANagoAAAAACwBgUFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWIOCAgAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADW8Hc6APBLt7+s1OkIgGP4+w8AqG8UFOA8hYaGyh0QoH9sWud0FMBR7oAAhYaGOh0DANBAuIwxxukQwC9VSUmJysrKnI4BOCo0NFSRkZFOxwAANBAUFAAAAADW4CJ5AAAAANagoAAAAACwBgUFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWIOCAgAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAAAAAa1BQAAAAAFiDggIAAADAGhQUAAAAANagoAAAAACwBgUFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWIOCAgAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1KCgAAAAArEFBAQAAAGANCgoAAAAAa1BQAAAAAFiDggIAAADAGhQUAAAAANagoAAAAACwBgUFAAAAgDUoKAAAAACsQUEBAAAAYA0KCgAAAABrUFAAAAAAWOO8Ckp5ebkmTpyo8vLy+s6DWmIOnMccOI85cB5z4DzmwHnMgfOYA2fV9/53GWNMXV9UWlqqiIgIHTlyROHh4fUSBHXDHDiPOXAec+A85sB5zIHzmAPnMQfOqu/9z0e8AAAAAFiDggIAAADAGhQUAAAAANY4r4ISGBio7OxsBQYG1nce1BJz4DzmwHnMgfOYA+cxB85jDpzHHDirvvf/eV0kDwAAAAA/Bz7iBQAAAMAaFBQAAAAA1qCgAAAAALAGBQUAAACANc6roLz00ku6+OKLFRQUpJSUFOXn59d3LpzBlClT1L17d4WFhal58+ZKT0/X1q1bnY7ls55++mm5XC6NHj3a6Sg+Z+/evRo6dKiioqIUHBysjh07asOGDU7H8gmVlZUaP368kpKSFBwcrFatWumJJ54Q91z5ef373//WwIEDFR8fL5fLpUWLFnk9b4zRhAkTFBcXp+DgYKWmpmr79u3OhG2Azrb/KyoqNG7cOHXs2FEhISGKj4/XXXfdpX379jkXuAE61/fAD91///1yuVx6/vnnL1g+X1CbOfjqq690ww03KCIiQiEhIerevbu++eabOm2nzgUlJydHDz30kLKzs1VQUKDOnTsrLS1NRUVFdV0VzsPq1auVlZWldevWKTc3VxUVFbruuut09OhRp6P5nE8++USzZ89Wp06dnI7icw4dOqRevXopICBAS5Ys0Zdffqlnn31WTZs2dTqaT5g6dapmzpypGTNm6KuvvtLUqVM1bdo0TZ8+3eloDdrRo0fVuXNnvfTSSzU+P23aNL344ouaNWuW1q9fr5CQEKWlpenEiRMXOGnDdLb9f+zYMRUUFGj8+PEqKCjQggULtHXrVt1www0OJG24zvU9cNrChQu1bt06xcfHX6BkvuNcc7Bz50717t1bycnJWrVqlT777DONHz9eQUFBdduQqaMePXqYrKwsz+PKykoTHx9vpkyZUtdVoR4UFRUZSWb16tVOR/Ep3333nWnTpo3Jzc01ffr0MQ8++KDTkXzKuHHjTO/evZ2O4bMGDBhghg0b5jV28803myFDhjiUyPdIMgsXLvQ8rqqqMrGxseaZZ57xjB0+fNgEBgaaN99804GEDduP939N8vPzjSSze/fuCxPKx5xpDvbs2WMuuugis2XLFpOYmGiee+65C57NV9Q0B4MGDTJDhw79yeuu0xmUkydPauPGjUpNTfWMNWrUSKmpqVq7dm3dmhHqxZEjRyRJkZGRDifxLVlZWRowYIDX9wIunPfee0/dunXTrbfequbNm6tLly7629/+5nQsn9GzZ0+tXLlS27ZtkyR9+umnWrNmja6//nqHk/muwsJC7d+/3+vfpIiICKWkpHB8dsiRI0fkcrnUpEkTp6P4jKqqKt15550aO3as2rdv73Qcn1NVVaUPPvhAbdu2VVpampo3b66UlJSzfhTvTOpUUIqLi1VZWamYmBiv8ZiYGO3fv7/OG8dPU1VVpdGjR6tXr17q0KGD03F8xltvvaWCggJNmTLF6Sg+a9euXZo5c6batGmjZcuWafjw4Ro1apTmzp3rdDSf8Ic//EG33367kpOTFRAQoC5dumj06NEaMmSI09F81uljMMdnO5w4cULjxo3T4MGDFR4e7nQcnzF16lT5+/tr1KhRTkfxSUVFRSorK9PTTz+t/v37a/ny5brpppt08803a/Xq1XVal//PlBEXQFZWlrZs2aI1a9Y4HcVnfPvtt3rwwQeVm5tb989Tot5UVVWpW7dumjx5siSpS5cu2rJli2bNmqWMjAyH0zV88+bN0z//+U+98cYbat++vTZv3qzRo0crPj6e/Q+fV1FRodtuu03GGM2cOdPpOD5j48aNeuGFF1RQUCCXy+V0HJ9UVVUlSbrxxhs1ZswYSdJll12mjz/+WLNmzVKfPn1qva46nUFp1qyZ/Pz8dODAAa/xAwcOKDY2ti6rwk80cuRILV68WHl5efrVr37ldByfsXHjRhUVFenyyy+Xv7+//P39tXr1ar344ovy9/dXZWWl0xF9QlxcnNq1a+c1dumll9b5LiE4P2PHjvWcRenYsaPuvPNOjRkzhrOKDjp9DOb47KzT5WT37t3Kzc3l7MkF9NFHH6moqEgJCQme4/Pu3bv18MMP6+KLL3Y6nk9o1qyZ/P396+X4XKeC4na71bVrV61cudIzVlVVpZUrV+rKK6+s04ZxfowxGjlypBYuXKh//etfSkpKcjqST+nXr58+//xzbd682fPVrVs3DRkyRJs3b5afn5/TEX1Cr169qt1ee9u2bUpMTHQokW85duyYGjXyPnz4+fl5fnqGCy8pKUmxsbFex+fS0lKtX7+e4/MFcrqcbN++XStWrFBUVJTTkXzKnXfeqc8++8zr+BwfH6+xY8dq2bJlTsfzCW63W927d6+X43OdP+L10EMPKSMjQ926dVOPHj30/PPP6+jRo7r77rvruiqch6ysLL3xxht69913FRYW5vlscUREhIKDgx1O1/CFhYVVu94nJCREUVFRXAd0AY0ZM0Y9e/bU5MmTddtttyk/P19z5szRnDlznI7mEwYOHKinnnpKCQkJat++vTZt2qS//vWvGjZsmNPRGrSysjLt2LHD87iwsFCbN29WZGSkEhISNHr0aD355JNq06aNkpKSNH78eMXHxys9Pd250A3I2fZ/XFycfve736mgoECLFy9WZWWl5/gcGRkpt9vtVOwG5VzfAz8uhQEBAYqNjdUll1xyoaM2WOeag7Fjx2rQoEG6+uqrdc0112jp0qV6//33tWrVqrpt6Hxu/TV9+nSTkJBg3G636dGjh1m3bt1Pvp0YakdSjV+vvvqq09F8FrcZdsb7779vOnToYAIDA01ycrKZM2eO05F8RmlpqXnwwQdNQkKCCQoKMi1btjR//vOfTXl5udPRGrS8vLwa//3PyMgwxnx/q+Hx48ebmJgYExgYaPr162e2bt3qbOgG5Gz7v7Cw8IzH57y8PKejNxjn+h74MW4zXP9qMwevvPKKad26tQkKCjKdO3c2ixYtqvN2XMbwq38BAAAA2KHOv0keAAAAAH4uFBQAAAAA1qCgAAAAALAGBQUAAACANSgoAAAAAKxBQQEAAABgDQoKAAAAAGtQUAAAAABYg4ICAAAAwBoUFAAAAADWoKAAAAAAsAYFBQAAAIA1/h8OW0rnHR6yZgAAAABJRU5ErkJggg==" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 5 }, { "cell_type": "markdown", diff --git a/polars_bio/range_op.py b/polars_bio/range_op.py index 1676957..6c14c53 100644 --- a/polars_bio/range_op.py +++ b/polars_bio/range_op.py @@ -10,7 +10,7 @@ pass from polars_bio.polars_bio import FilterOp, RangeOp, RangeOptions -DEFAULT_INTERVAL_COLUMNS = ["contig", "pos_start", "pos_end"] +DEFAULT_INTERVAL_COLUMNS = ["chrom", "start", "end"] ctx = Context().ctx @@ -20,7 +20,7 @@ def overlap( df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], how: str = "inner", overlap_filter: FilterOp = FilterOp.Strict, - suffixes: tuple[str] = ("_1", "_2"), + suffixes: tuple[str, str] = ("_1", "_2"), on_cols=None, col1: Union[list[str], None] = None, col2: Union[list[str], None] = None, @@ -85,19 +85,23 @@ def overlap( _validate_overlap_input(col1, col2, on_cols, suffixes, output_type, how) - col1 = ["contig", "pos_start", "pos_end"] if col1 is None else col1 - col2 = ["contig", "pos_start", "pos_end"] if col2 is None else col2 - range_options = RangeOptions(range_op=RangeOp.Overlap, filter_op=overlap_filter) - return range_operation( - df1, df2, suffixes, range_options, col1, col2, output_type, ctx + col1 = DEFAULT_INTERVAL_COLUMNS if col1 is None else col1 + col2 = DEFAULT_INTERVAL_COLUMNS if col2 is None else col2 + range_options = RangeOptions( + range_op=RangeOp.Overlap, + filter_op=overlap_filter, + suffixes=suffixes, + columns_1=col1, + columns_2=col2, ) + return range_operation(df1, df2, range_options, output_type, ctx) def nearest( df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], overlap_filter: FilterOp = FilterOp.Strict, - suffixes: (str, str) = ("_1", "_2"), + suffixes: tuple[str, str] = ("_1", "_2"), on_cols: Union[list[str], None] = None, col1: Union[list[str], None] = None, col2: Union[list[str], None] = None, @@ -136,9 +140,13 @@ def nearest( _validate_overlap_input(col1, col2, on_cols, suffixes, output_type, how="inner") - col1 = ["contig", "pos_start", "pos_end"] if col1 is None else col1 - col2 = ["contig", "pos_start", "pos_end"] if col2 is None else col2 - range_options = RangeOptions(range_op=RangeOp.Nearest, filter_op=overlap_filter) - return range_operation( - df1, df2, suffixes, range_options, col1, col2, output_type, ctx + col1 = DEFAULT_INTERVAL_COLUMNS if col1 is None else col1 + col2 = DEFAULT_INTERVAL_COLUMNS if col2 is None else col2 + range_options = RangeOptions( + range_op=RangeOp.Nearest, + filter_op=overlap_filter, + suffixes=suffixes, + columns_1=col1, + columns_2=col2, ) + return range_operation(df1, df2, range_options, output_type, ctx) diff --git a/polars_bio/range_op_helpers.py b/polars_bio/range_op_helpers.py index 519206e..caaa1cd 100644 --- a/polars_bio/range_op_helpers.py +++ b/polars_bio/range_op_helpers.py @@ -39,10 +39,7 @@ def __init__(self): def range_operation( df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], - suffixes: tuple[str, str], range_options: RangeOptions, - col1: list[str], - col2: list[str], output_type: str, ctx: BioSessionContext, ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]: @@ -56,8 +53,8 @@ def range_operation( ext2 == ".parquet" or ext2 == ".csv" ), "Dataframe1 must be a Parquet or CSV file" # use suffixes to avoid column name conflicts - df_schema1 = _get_schema(df1, suffixes[0]) - df_schema2 = _get_schema(df2, suffixes[1]) + df_schema1 = _get_schema(df1, range_options.suffixes[0]) + df_schema2 = _get_schema(df2, range_options.suffixes[1]) merged_schema = pl.Schema({**df_schema1, **df_schema2}) if output_type == "polars.LazyFrame": return range_lazy_scan( @@ -66,8 +63,6 @@ def range_operation( merged_schema, range_options=range_options, ctx=ctx, - col1=col1, - col2=col2, ) elif output_type == "polars.DataFrame": return range_operation_scan(ctx, df1, df2, range_options).to_polars() @@ -88,13 +83,11 @@ def range_operation( if output_type == "polars.LazyFrame": merged_schema = pl.Schema( { - **_rename_columns(df1, suffixes[0]).schema, - **_rename_columns(df2, suffixes[1]).schema, + **_rename_columns(df1, range_options.suffixes[0]).schema, + **_rename_columns(df2, range_options.suffixes[1]).schema, } ) - return range_lazy_scan( - df1, df2, merged_schema, col1, col2, range_options, ctx - ) + return range_lazy_scan(df1, df2, merged_schema, range_options, ctx) elif output_type == "polars.DataFrame": if isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame): df1 = df1.to_arrow().to_reader() @@ -106,8 +99,8 @@ def range_operation( return range_operation_frame(ctx, df1, df2, range_options).to_polars() elif output_type == "pandas.DataFrame": if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame): - df1 = _df_to_arrow(df1, col1[0]).to_reader() - df2 = _df_to_arrow(df2, col2[0]).to_reader() + df1 = _df_to_arrow(df1, range_options.columns_1[0]).to_reader() + df2 = _df_to_arrow(df2, range_options.columns_2[0]).to_reader() else: raise ValueError( "Input and output dataframes must be of the same type: either polars or pandas" @@ -120,14 +113,8 @@ def range_operation( def _validate_overlap_input(col1, col2, on_cols, suffixes, output_type, how): - # TODO: Add support for col1 and col2 - assert col1 is None, "col1 is not supported yet" - assert col2 is None, "col2 is not supported yet" - # TODO: Add support for on_cols () assert on_cols is None, "on_cols is not supported yet" - - assert suffixes == ("_1", "_2"), "Only default suffixes are supported" assert output_type in [ "polars.LazyFrame", "polars.DataFrame", diff --git a/polars_bio/range_op_io.py b/polars_bio/range_op_io.py index 5b5c035..e9fecbf 100644 --- a/polars_bio/range_op_io.py +++ b/polars_bio/range_op_io.py @@ -20,8 +20,6 @@ def range_lazy_scan( df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame], schema: pl.Schema, - col1: list[str], - col2: list[str], range_options: RangeOptions, ctx: BioSessionContext, ) -> pl.LazyFrame: @@ -34,8 +32,8 @@ def range_lazy_scan( df_2 = df_2.to_arrow().to_reader() elif isinstance(df_1, pd.DataFrame) and isinstance(df_2, pd.DataFrame): range_function = range_operation_frame - df_1 = _df_to_arrow(df_1, col1[0]).to_reader() - df_2 = _df_to_arrow(df_2, col2[0]).to_reader() + df_1 = _df_to_arrow(df_1, range_options.columns_1[0]).to_reader() + df_2 = _df_to_arrow(df_2, range_options.columns_2[0]).to_reader() else: raise ValueError("Only polars and pandas dataframes are supported") diff --git a/polars_bio/range_viz.py b/polars_bio/range_viz.py index d08adb7..c146492 100644 --- a/polars_bio/range_viz.py +++ b/polars_bio/range_viz.py @@ -22,8 +22,8 @@ def visualize_intervals( df = df if isinstance(df, pd.DataFrame) else df.to_pandas() for i, reg_pair in df.iterrows(): bf.vis.plot_intervals_arr( - starts=[reg_pair.pos_start_1, reg_pair.pos_start_2], - ends=[reg_pair.pos_end_1, reg_pair.pos_end_2], + starts=[reg_pair.start_1, reg_pair.start_2], + ends=[reg_pair.end_1, reg_pair.end_2], colors=["skyblue", "lightpink"], levels=[2, 1], xlim=(0, 16), diff --git a/pyproject.toml b/pyproject.toml index 8378f16..e02e19a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "polars-bio" -version = "0.2.11" +version = "0.3.0" description = "Blazing fast genomic operations on large Python dataframes" authors = [] requires-python = ">=3.9" diff --git a/src/lib.rs b/src/lib.rs index 9950ce8..affe2e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::sync::Arc; use datafusion::arrow::array::RecordBatch; @@ -21,13 +22,20 @@ const RIGHT_TABLE: &str = "s2"; #[pyclass(name = "RangeOptions")] #[derive(Clone)] pub struct RangeOptions { - pub range_op: RangeOp, - pub filter_op: Option, - pub suffixes: Option>, - pub columns_1: Option>, - pub columns_2: Option>, - pub on_cols: Option>, - pub overlap_alg: Option, + #[pyo3(get, set)] + range_op: RangeOp, + #[pyo3(get, set)] + filter_op: Option, + #[pyo3(get, set)] + suffixes: Option<(String, String)>, + #[pyo3(get, set)] + columns_1: Option>, + #[pyo3(get, set)] + columns_2: Option>, + #[pyo3(get, set)] + on_cols: Option>, + #[pyo3(get, set)] + overlap_alg: Option, } #[pymethods] @@ -37,7 +45,7 @@ impl RangeOptions { pub fn new( range_op: RangeOp, filter_op: Option, - suffixes: Option>, + suffixes: Option<(String, String)>, columns_1: Option>, columns_2: Option>, on_cols: Option>, @@ -69,6 +77,19 @@ pub enum RangeOp { Complement = 1, Cluster = 2, Nearest = 3, + Coverage = 4, +} + +impl fmt::Display for RangeOp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + RangeOp::Overlap => write!(f, "Overlap"), + RangeOp::Nearest => write!(f, "Nearest"), + RangeOp::Complement => write!(f, "Complement"), + RangeOp::Cluster => write!(f, "Cluster"), + RangeOp::Coverage => write!(f, "Coverage"), + } + } } pub enum InputFormat { @@ -169,85 +190,189 @@ async fn register_table(ctx: &SessionContext, path: &str, table_name: &str, form } } -async fn do_nearest(ctx: &SessionContext, filter: FilterOp) -> datafusion::dataframe::DataFrame { - info!( - "Running nearest: algorithm {} with {} thread(s)", - ctx.state() - .config() - .options() - .extensions - .get::() - .unwrap() - .interval_join_algorithm, - ctx.state().config().options().execution.target_partitions - ); - let sign = match filter { +async fn do_nearest( + ctx: &SessionContext, + range_opts: RangeOptions, +) -> datafusion::dataframe::DataFrame { + let sign = match range_opts.filter_op.unwrap() { FilterOp::Weak => "=".to_string(), _ => "".to_string(), }; + let suffixes = match range_opts.suffixes { + Some((s1, s2)) => (s1, s2), + _ => ("_1".to_string(), "_2".to_string()), + }; + let columns_1 = match range_opts.columns_1 { + Some(cols) => cols, + _ => vec![ + "contig".to_string(), + "pos_start".to_string(), + "pos_end".to_string(), + ], + }; + let columns_2 = match range_opts.columns_2 { + Some(cols) => cols, + _ => vec![ + "contig".to_string(), + "pos_start".to_string(), + "pos_end".to_string(), + ], + }; + let query = format!( r#" SELECT - a.contig AS contig_1, - a.pos_start AS pos_start_1, - a.pos_end AS pos_end_1, - b.contig AS contig_2, - b.pos_start AS pos_start_2, - b.pos_end AS pos_end_2, + a.{} AS {}{}, -- contig + a.{} AS {}{}, -- pos_start + a.{} AS {}{}, -- pos_end + b.{} AS {}{}, -- contig + b.{} AS {}{}, -- pos_start + b.{} AS {}{}, -- pos_end + a.* except({}, {}, {}), -- all join columns from left table + b.* except({}, {}, {}), -- all join columns from right table CAST( - CASE WHEN b.pos_start >= a.pos_end + CASE WHEN b.{} >= a.{} THEN - abs(b.pos_start-a.pos_end) - WHEN b.pos_end <= a.pos_start + abs(b.{}-a.{}) + WHEN b.{} <= a.{} THEN - abs(a.pos_start-b.pos_end) + abs(a.{}-b.{}) ELSE 0 END AS BIGINT) AS distance FROM {} AS b, {} AS a - WHERE b.contig = a.contig - AND cast(b.pos_end AS INT) >{} cast(a.pos_start AS INT ) - AND cast(b.pos_start AS INT) <{} cast(a.pos_end AS INT) + WHERE b.{} = a.{} + AND cast(b.{} AS INT) >{} cast(a.{} AS INT ) + AND cast(b.{} AS INT) <{} cast(a.{} AS INT) "#, - RIGHT_TABLE, LEFT_TABLE, sign, sign + columns_1[0], + columns_1[0], + suffixes.0, // contig + columns_1[1], + columns_1[1], + suffixes.0, // pos_start + columns_1[2], + columns_1[2], + suffixes.0, // pos_end + columns_2[0], + columns_2[0], + suffixes.1, // contig + columns_2[1], + columns_2[1], + suffixes.1, // pos_start + columns_2[2], + columns_2[2], + suffixes.1, // pos_end + columns_1[0], + columns_1[1], + columns_1[2], // all join columns from right table + columns_2[0], + columns_2[1], + columns_2[2], // all join columns from left table + columns_2[1], + columns_1[2], // b.pos_start >= a.pos_end + columns_2[1], + columns_1[2], // b.pos_start-a.pos_end + columns_2[2], + columns_1[1], // b.pos_end <= a.pos_start + columns_2[2], + columns_1[1], // a.pos_start-b.pos_end + RIGHT_TABLE, + LEFT_TABLE, + columns_1[0], + columns_2[0], // contig + columns_1[2], + sign, + columns_2[1], // pos_start + columns_1[1], + sign, + columns_2[2], // pos_end ); ctx.sql(&query).await.unwrap() } -async fn do_overlap(ctx: &SessionContext, filter: FilterOp) -> datafusion::dataframe::DataFrame { - let sign = match filter { + +async fn do_overlap( + ctx: &SessionContext, + range_opts: RangeOptions, +) -> datafusion::dataframe::DataFrame { + let sign = match range_opts.clone().filter_op.unwrap() { FilterOp::Weak => "=".to_string(), _ => "".to_string(), }; - info!( - "Running overlap: algorithm {} with {} thread(s)", - ctx.state() - .config() - .options() - .extensions - .get::() - .unwrap() - .interval_join_algorithm, - ctx.state().config().options().execution.target_partitions - ); + let suffixes = match range_opts.suffixes { + Some((s1, s2)) => (s1, s2), + _ => ("_1".to_string(), "_2".to_string()), + }; + let columns_1 = match range_opts.columns_1 { + Some(cols) => cols, + _ => vec![ + "contig".to_string(), + "pos_start".to_string(), + "pos_end".to_string(), + ], + }; + let columns_2 = match range_opts.columns_2 { + Some(cols) => cols, + _ => vec![ + "contig".to_string(), + "pos_start".to_string(), + "pos_end".to_string(), + ], + }; let query = format!( r#" SELECT - a.contig as contig_1, - a.pos_start as pos_start_1, - a.pos_end as pos_end_1, - b.contig as contig_2, - b.pos_start as pos_start_2, - b.pos_end as pos_end_2 + a.{} as {}{}, -- contig + a.{} as {}{}, -- pos_start + a.{} as {}{}, -- pos_end + b.{} as {}{}, -- contig + b.{} as {}{}, -- pos_start + b.{} as {}{}, -- pos_end + a.* except({}, {}, {}), -- all join columns from left table + b.* except({}, {}, {}) -- all join columns from right table FROM {} a, {} b WHERE - a.contig=b.contig + a.{}=b.{} AND - cast(a.pos_end AS INT) >{} cast(b.pos_start AS INT) + cast(a.{} AS INT) >{} cast(b.{} AS INT) AND - cast(a.pos_start AS INT) <{} cast(b.pos_end AS INT) + cast(a.{} AS INT) <{} cast(b.{} AS INT) "#, - LEFT_TABLE, RIGHT_TABLE, sign, sign, + columns_1[0], + columns_1[0], + suffixes.0, // contig + columns_1[1], + columns_1[1], + suffixes.0, // pos_start + columns_1[2], + columns_1[2], + suffixes.0, // pos_end + columns_2[0], + columns_2[0], + suffixes.1, // contig + columns_2[1], + columns_2[1], + suffixes.1, // pos_start + columns_2[2], + columns_2[2], + suffixes.1, // pos_end + columns_1[0], + columns_1[1], + columns_1[2], // all join columns from right table + columns_2[0], + columns_2[1], + columns_2[2], // all join columns from left table + LEFT_TABLE, + RIGHT_TABLE, + columns_1[0], + columns_2[0], // contig + columns_1[2], + sign, + columns_2[1], // pos_start + columns_1[1], + sign, + columns_2[2], // pos_end ); ctx.sql(&query).await.unwrap() } @@ -306,12 +431,12 @@ fn do_range_operation( range_options: RangeOptions, ) -> datafusion::dataframe::DataFrame { // defaults - match range_options.overlap_alg { + match &range_options.overlap_alg { Some(alg) if alg == "coitreesnearest" => { panic!("CoitreesNearest is an internal algorithm for nearest operation. Can't be set explicitly."); }, Some(alg) => { - set_option_internal(ctx, "sequila.interval_join_algorithm", &alg); + set_option_internal(ctx, "sequila.interval_join_algorithm", alg); }, _ => { set_option_internal( @@ -321,11 +446,23 @@ fn do_range_operation( ); }, } + info!( + "Running {} operation with algorithm {} and {} thread(s)...", + range_options.range_op, + ctx.state() + .config() + .options() + .extensions + .get::() + .unwrap() + .interval_join_algorithm, + ctx.state().config().options().execution.target_partitions + ); match range_options.range_op { - RangeOp::Overlap => rt.block_on(do_overlap(ctx, range_options.filter_op.unwrap())), + RangeOp::Overlap => rt.block_on(do_overlap(ctx, range_options)), RangeOp::Nearest => { set_option_internal(ctx, "sequila.interval_join_algorithm", "coitreesnearest"); - rt.block_on(do_nearest(ctx, range_options.filter_op.unwrap())) + rt.block_on(do_nearest(ctx, range_options)) }, _ => panic!("Unsupported operation"), } diff --git a/tests/test_bioframe.py b/tests/test_bioframe.py index 752f97d..41c248a 100644 --- a/tests/test_bioframe.py +++ b/tests/test_bioframe.py @@ -6,11 +6,14 @@ from polars_bio.polars_bio import FilterOp -class TestOverlapBioframe: +class TestBioframe: result_overlap = pb.overlap( BIO_PD_DF1, BIO_PD_DF2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), output_type="pandas.DataFrame", + suffixes=("_1", "_3"), overlap_filter=FilterOp.Strict, ) result_bio_overlap = bf.overlap( @@ -18,13 +21,15 @@ class TestOverlapBioframe: BIO_PD_DF2, cols1=("contig", "pos_start", "pos_end"), cols2=("contig", "pos_start", "pos_end"), - suffixes=("_1", "_2"), + suffixes=("_1", "_3"), how="inner", ) resust_nearest = pb.nearest( BIO_PD_DF1, BIO_PD_DF2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), overlap_filter=FilterOp.Strict, output_type="pandas.DataFrame", ) diff --git a/tests/test_native.py b/tests/test_native.py index 82d20f5..b053545 100644 --- a/tests/test_native.py +++ b/tests/test_native.py @@ -16,6 +16,8 @@ class TestOverlapNative: result_csv = pb.overlap( DF_OVER_PATH1, DF_OVER_PATH2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), output_type="pandas.DataFrame", overlap_filter=FilterOp.Weak, ) @@ -35,6 +37,8 @@ class TestNearestNative: result = pb.nearest( DF_NEAREST_PATH1, DF_NEAREST_PATH2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), output_type="pandas.DataFrame", overlap_filter=FilterOp.Weak, ) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 71695a9..f5629d1 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -16,6 +16,8 @@ class TestOverlapPandas: result = pb.overlap( PD_OVERLAP_DF1, PD_OVERLAP_DF2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), output_type="pandas.DataFrame", overlap_filter=FilterOp.Weak, ) @@ -35,6 +37,8 @@ class TestNearestPandas: result = pb.nearest( PD_NEAREST_DF1, PD_NEAREST_DF2, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), output_type="pandas.DataFrame", overlap_filter=FilterOp.Weak, ) diff --git a/tests/test_polars.py b/tests/test_polars.py index a559c4c..e3b0fd3 100644 --- a/tests/test_polars.py +++ b/tests/test_polars.py @@ -13,10 +13,20 @@ class TestOverlapPolars: result_frame = pb.overlap( - PL_DF1, PL_DF2, output_type="polars.DataFrame", overlap_filter=FilterOp.Weak + PL_DF1, + PL_DF2, + output_type="polars.DataFrame", + overlap_filter=FilterOp.Weak, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), ) result_lazy = pb.overlap( - PL_DF1, PL_DF2, output_type="polars.LazyFrame", overlap_filter=FilterOp.Weak + PL_DF1, + PL_DF2, + output_type="polars.LazyFrame", + overlap_filter=FilterOp.Weak, + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), ).collect() expected = PL_DF_OVERLAP @@ -35,10 +45,18 @@ def test_overlap_schema_rows_lazy(self): class TestNearestPolars: result_frame = pb.nearest( - PL_NEAREST_DF1, PL_NEAREST_DF2, output_type="polars.DataFrame" + PL_NEAREST_DF1, + PL_NEAREST_DF2, + output_type="polars.DataFrame", + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), ) result_lazy = pb.nearest( - PL_NEAREST_DF1, PL_NEAREST_DF2, output_type="polars.LazyFrame" + PL_NEAREST_DF1, + PL_NEAREST_DF2, + output_type="polars.LazyFrame", + col1=("contig", "pos_start", "pos_end"), + col2=("contig", "pos_start", "pos_end"), ).collect() expected = PL_DF_NEAREST