feat: Unary operations (#62)

* feat: Adding BED support for range operations * chore: Streaming documentation update * feat: Adding read_table method * feat: Added sort_bedframe operation * feat: Expand
biodatageeks · Jan 11, 2025 · 0a3f247 · 0a3f247
1 parent 2e0b154
commit 0a3f247
Show file tree

Hide file tree

Showing 16 changed files with 408 additions and 103 deletions.
diff --git a/docs/api.md b/docs/api.md
@@ -1,3 +1,35 @@
+
+## API structure
+
+There are two ways to use the polars-bio API:
+
+* directly on a Polars LazyFrame under a registered `pb` [namespace](https://docs.pola.rs/api/python/stable/reference/api/polars.api.register_lazyframe_namespace.html#polars.api.register_lazyframe_namespace)
+
+!!! example
+
+       ```plaintext
+        >>> type(df)
+        <class 'polars.lazyframe.frame.LazyFrame'>
+
+       ```
+       ```python
+          import polars_bio as pb
+          df.pb.sort().limit(5).collect()
+       ```
+
+ * using `polars_bio` module
+
+!!! example
+
+       ```python
+          import polars_bio as pb
+          df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
+       ```
+
+!!! tip
+    1. Not all operations are available in both ways.
+    2. You can of course use both ways in the same script.
+
 ::: polars_bio
     handler: python
     options:

diff --git a/docs/features.md b/docs/features.md
diff --git a/docs/performance.md b/docs/performance.md
@@ -1278,13 +1278,13 @@ pyranges1    | 35,940           | 2.45x    |
 
 #### Calculate overlaps and export to a CSV file 7-8
 
-| Library           | Time (s)               | Speedup  | Peak Memory (MB) | Factor   |
-|-------------------|------------------------|----------|------------------|----------|
-| polars-bio        | **23.765**             | 0.77x    | 14,660           | 26.07x   |
-| polars-bio-stream | **18.221**<sup>1</sup> | **1.0x** | **562.22**       | **1.0x** |
-| bioframe          | 370.010                | 0.05x    | 33,352           | 59.32x   |
-| pyranges0         | 275.237                | 0.07x    | 30.052           | 53.45x   |
-| pyranges1         | 351.041                | 0.05x    | 36,530           | 0.06x    |
+| Library               | Time (s)               | Speedup  | Peak Memory (MB) | Factor   |
+|-----------------------|------------------------|----------|------------------|----------|
+| polars-bio            | **23.765**             | 0.77x    | 14,660           | 26.07x   |
+| **polars-bio-stream** | **18.221**<sup>1</sup> | **1.0x** | **562.22**       | **1.0x** |
+| bioframe              | 370.010                | 0.05x    | 33,352           | 59.32x   |
+| pyranges0             | 275.237                | 0.07x    | 30,052           | 53.45x   |
+| pyranges1             | 351.041                | 0.05x    | 36,530           | 64.97x   |
 
 <sup>1</sup> Despite limiting the number of threads in DataFusion (`datafusion.execution.target_partitions=1`) and in Polars (`POLARS_MAX_THREADS=1`) cpu utilization was constant and approx.160%.
 ###### polars-bio

diff --git a/polars_bio/__init__.py b/polars_bio/__init__.py
@@ -3,7 +3,8 @@
 from polars_bio.polars_bio import InputFormat
 
 from .context import ctx
-from .io import read_bam, read_bed, read_fasta, read_fastq, read_vcf
+from .io import read_bam, read_fasta, read_fastq, read_table, read_vcf
+from .polars_ext import PolarsRangesOperations as LazyFrame
 from .range_op import FilterOp, nearest, overlap
 from .range_viz import visualize_intervals
 
@@ -12,6 +13,8 @@
 logger = logging.getLogger("polars_bio")
 logger.setLevel(logging.INFO)
 
+POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
+
 
 __version__ = "0.5.0"
 __all__ = [
@@ -22,8 +25,9 @@
     "visualize_intervals",
     "read_bam",
     "read_vcf",
-    "read_bed",
     "read_fasta",
     "read_fastq",
+    "read_table",
     "InputFormat",
+    "LazyFrame",
 ]
diff --git a/polars_bio/constants.py b/polars_bio/constants.py
@@ -0,0 +1 @@
+DEFAULT_INTERVAL_COLUMNS = ["chrom", "start", "end"]
diff --git a/polars_bio/io.py b/polars_bio/io.py
@@ -1,6 +1,7 @@
-from typing import Iterator, Union
+from typing import Dict, Iterator, Union
 
 import polars as pl
+from bioframe import SCHEMAS
 from datafusion import DataFrame
 from polars.io.plugins import register_io_source
 
@@ -49,29 +50,16 @@ def read_vcf(path: str) -> pl.LazyFrame:
     Read a VCF file into a LazyFrame.
 
     Parameters:
-        Parameters:
         path: The path to the VCF file.
     """
     return file_lazy_scan(path, InputFormat.Vcf)
 
 
-def read_bed(path: str) -> pl.LazyFrame:
-    """
-    Read a BED file into a LazyFrame.
-
-    Parameters:
-        Parameters:
-        path: The path to the BED file.
-    """
-    return file_lazy_scan(path, InputFormat.Bed)
-
-
 def read_fasta(path: str) -> pl.LazyFrame:
     """
     Read a FASTA file into a LazyFrame.
 
     Parameters:
-        Parameters:
         path: The path to the FASTA file.
     """
     return file_lazy_scan(path, InputFormat.Fasta)
@@ -82,7 +70,6 @@ def read_fastq(path: str) -> pl.LazyFrame:
     Read a FASTQ file into a LazyFrame.
 
     Parameters:
-        Parameters:
         path: The path to the FASTQ file.
     """
     return file_lazy_scan(path, InputFormat.Fastq)
@@ -156,3 +143,27 @@ def read_file(path: str, input_format: InputFormat) -> pl.DataFrame:
     """
     table = py_register_table(ctx, path, input_format)
     return py_scan_table(ctx, table.name)
+
+
+def read_table(
+    path: str, schema: Union[str, None] = None, **kwargs
+) -> pl.LazyFrame:
+    """
+    Read a headerless tab-delimited (e.g. BED) file into a Polars LazyFrame.
+
+    Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
+    but faster and lazy.
+
+    Parameters:
+        path: The path to the file.
+        schema: Name of a Bioframe schema, i.e. a key of `bioframe.SCHEMAS` such as `"bed9"`
+            (see the Bioframe schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174)).
+            If None, the columns keep the names assigned by `pl.scan_csv`.
+        **kwargs: Additional keyword arguments forwarded to `pl.scan_csv`.
+
+    Raises:
+        ValueError: If the number of columns in the schema differs from the
+            number of columns found in the input file.
+
+    An unknown schema name fails at the `SCHEMAS[schema]` lookup.
+    """
+    df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
+    if schema is None:
+        return df
+
+    columns = SCHEMAS[schema]
+    # Resolve the input schema only once; it is needed both for the check
+    # and for the error message.
+    n_input_columns = len(df.collect_schema())
+    if len(columns) != n_input_columns:
+        raise ValueError(
+            f"Schema incompatible with the input. Expected {len(columns)} columns in a schema, got {n_input_columns} in the input data file. Please provide a valid schema."
+        )
+    # The code relies on headerless input columns being named column_1,
+    # column_2, ...; rename them all in a single step instead of chaining
+    # one rename per column.
+    return df.rename(
+        {f"column_{i}": name for i, name in enumerate(columns, start=1)}
+    )
diff --git a/polars_bio/polars_ext.py b/polars_bio/polars_ext.py
@@ -0,0 +1,138 @@
+from typing import Union
+
+import polars as pl
+
+
+@pl.api.register_lazyframe_namespace("pb")
+class PolarsRangesOperations:
+    """Genomic interval operations registered on Polars LazyFrames under the `pb` namespace."""
+
+    def __init__(self, ldf: pl.LazyFrame) -> None:
+        # The LazyFrame the namespace was accessed on; every operation starts from it.
+        self._ldf = ldf
+
+    def sort(
+        self, cols: Union[list[str], None] = ["chrom", "start", "end"]
+    ) -> pl.LazyFrame:
+        """
+        Sort a bedframe.
+        !!! note
+            Adapted to Polars API from [bioframe.sort_bedframe](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L1698)
+
+        Parameters:
+            cols: The names of columns containing the chromosome, start and end of the genomic intervals.
+                If None, 'chrom', 'start', 'end' are used.
+
+        Returns:
+            The LazyFrame sorted by the given columns.
+
+        !!! Example
+              ```python
+              import polars_bio as pb
+              df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
+              df.pb.sort().limit(5).collect()
+              ```
+              ```plaintext
+              shape: (5, 9)
+              ┌───────┬─────────┬─────────┬──────┬───┬────────┬────────────┬──────────┬──────────┐
+              │ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  │
+              │ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      │
+              │ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      │
+              ╞═══════╪═════════╪═════════╪══════╪═══╪════════╪════════════╪══════════╪══════════╡
+              │ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
+              │ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
+              │ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
+              │ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
+              │ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
+              └───────┴─────────┴─────────┴──────┴───┴────────┴────────────┴──────────┴──────────┘
+
+              ```
+
+        """
+        # Fall back to the default interval columns when None is passed,
+        # the same way `expand` does, instead of forwarding by=None.
+        by = ["chrom", "start", "end"] if cols is None else cols
+        return self._ldf.sort(by=by)
+
+    def expand(
+        self,
+        pad: Union[int, None] = None,
+        scale: Union[float, None] = None,
+        side: str = "both",
+        cols: Union[list[str], None] = ["chrom", "start", "end"],
+    ) -> pl.LazyFrame:
+        """
+        Expand each interval by an amount specified with `pad` or `scale`.
+        !!! Note
+            Adapted to Polars API from [bioframe.expand](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L150)
+
+        Negative values for pad shrink the interval, up to the midpoint.
+        Multiplicative rescaling of intervals is enabled with scale. Exactly one
+        of pad or scale must be provided.
+
+        Parameters:
+            pad :
+                The amount by which the intervals are additively expanded *on each side*.
+                Negative values for pad shrink intervals, but not beyond the interval
+                midpoint. Either `pad` or `scale` must be supplied.
+
+            scale :
+                The factor by which to scale intervals multiplicatively on each side, e.g
+                ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
+                ``scale=1`` returns original intervals. Default None.
+                Either `pad` or `scale` must be supplied.
+
+            side :
+                Which side to expand, possible values are 'left', 'right' and 'both'.
+                Default 'both'. Any other value leaves both coordinates unshifted.
+
+            cols :
+                The names of columns containing the chromosome, start and end of the
+                genomic intervals. Default values are 'chrom', 'start', 'end'.
+
+        Returns:
+            A LazyFrame with the start and/or end columns replaced by the
+            expanded coordinates; all other columns are kept.
+
+        Raises:
+            ValueError: If both or neither of `pad` and `scale` are supplied,
+                if `scale` is negative, or if `pad` is not an integer.
+
+        """
+        df = self._ldf
+        _, sk, ek = ["chrom", "start", "end"] if cols is None else cols
+        # Helper columns get namespaced names so that user columns (for example
+        # ones called "pads" or "mids") are not overwritten and later dropped.
+        padsk = "__pb_expand_pads"
+        midsk = "__pb_expand_mids"
+        # Track exactly which helpers were added, so they can be dropped at the
+        # end without resolving the schema of the lazy frame.
+        helper_cols = [padsk]
+
+        if scale is not None and pad is not None:
+            raise ValueError("only one of pad or scale can be supplied")
+        elif scale is not None:
+            if scale < 0:
+                raise ValueError("multiplicative scale must be >=0")
+            # Per-side padding that turns a length L into scale * L.
+            df = df.with_columns(
+                [(0.5 * (scale - 1) * (pl.col(ek) - pl.col(sk))).alias(padsk)]
+            )
+        elif pad is not None:
+            if not isinstance(pad, int):
+                raise ValueError("additive pad must be integer")
+            df = df.with_columns([pl.lit(pad).alias(padsk)])
+        else:
+            raise ValueError("either pad or scale must be supplied")
+
+        if side in ("both", "left"):
+            df = df.with_columns([(pl.col(sk) - pl.col(padsk)).alias(sk)])
+        if side in ("both", "right"):
+            df = df.with_columns([(pl.col(ek) + pl.col(padsk)).alias(ek)])
+
+        if pad is not None and pad < 0:
+            # Shrinking may push start past end; clamp both to the midpoint of
+            # the shifted interval so that it never inverts.
+            df = df.with_columns(
+                [(pl.col(sk) + 0.5 * (pl.col(ek) - pl.col(sk))).alias(midsk)]
+            )
+            helper_cols.append(midsk)
+            df = df.with_columns(
+                [
+                    pl.min_horizontal(pl.col(sk), pl.col(midsk))
+                    .cast(pl.Int64)
+                    .alias(sk),
+                    pl.max_horizontal(pl.col(ek), pl.col(midsk))
+                    .cast(pl.Int64)
+                    .alias(ek),
+                ]
+            )
+        if scale is not None:
+            # Scaling produces fractional coordinates; round back to integers.
+            df = df.with_columns(
+                [
+                    pl.col(sk).round(0).cast(pl.Int64).alias(sk),
+                    pl.col(ek).round(0).cast(pl.Int64).alias(ek),
+                ]
+            )
+        return df.drop(helper_cols)
diff --git a/polars_bio/range_op.py b/polars_bio/range_op.py
@@ -4,15 +4,17 @@
 import polars as pl
 from typing_extensions import TYPE_CHECKING, Union
 
+from .constants import DEFAULT_INTERVAL_COLUMNS
 from .context import ctx
 from .range_op_helpers import _validate_overlap_input, range_operation
 
+__all__ = ["overlap", "nearest"]
+
+
 if TYPE_CHECKING:
     pass
 from polars_bio.polars_bio import FilterOp, RangeOp, RangeOptions
 
-DEFAULT_INTERVAL_COLUMNS = ["chrom", "start", "end"]
-
 
 def overlap(
     df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
@@ -21,8 +23,8 @@ def overlap(
     overlap_filter: FilterOp = FilterOp.Strict,
     suffixes: tuple[str, str] = ("_1", "_2"),
     on_cols: Union[list[str], None] = None,
-    cols1: Union[list[str], None] = None,
-    cols2: Union[list[str], None] = None,
+    cols1: Union[list[str], None] = ["chrom", "start", "end"],
+    cols2: Union[list[str], None] = ["chrom", "start", "end"],
     algorithm: str = "Coitrees",
     output_type: str = "polars.LazyFrame",
     streaming: bool = False,
@@ -32,19 +34,17 @@ def overlap(
     Bioframe inspired API.
 
     Parameters:
-        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
-        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
+        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
+        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
         how: How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional.
-        overlap_filter: FilterOp, optional. The type of overlap to consider(Weak or Strict). default is FilterOp.Weak.
+        overlap_filter: FilterOp, optional. The type of overlap to consider(Weak or Strict).
         cols1: The names of columns containing the chromosome, start and end of the
-            genomic intervals, provided separately for each set. The default
-            values are 'chrom', 'start', 'end'.
+            genomic intervals, provided separately for each set.
         cols2:  The names of columns containing the chromosome, start and end of the
-            genomic intervals, provided separately for each set. The default
-            values are 'chrom', 'start', 'end'.
+            genomic intervals, provided separately for each set.
         suffixes: Suffixes for the columns of the two overlapped sets.
         on_cols: List of additional column names to join on. default is None.
-        algorithm: The algorithm to use for the overlap operation. default is "Coitrees".
+        algorithm: The algorithm to use for the overlap operation.
         output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame" and "pandas.DataFrame" are also supported.
         streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming-out-of-core-processing) engine.
 
@@ -109,8 +109,8 @@ def nearest(
     overlap_filter: FilterOp = FilterOp.Strict,
     suffixes: tuple[str, str] = ("_1", "_2"),
     on_cols: Union[list[str], None] = None,
-    cols1: Union[list[str], None] = None,
-    cols2: Union[list[str], None] = None,
+    cols1: Union[list[str], None] = ["chrom", "start", "end"],
+    cols2: Union[list[str], None] = ["chrom", "start", "end"],
     output_type: str = "polars.LazyFrame",
     streaming: bool = False,
 ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
@@ -119,15 +119,13 @@ def nearest(
     Bioframe inspired API.
 
     Parameters:
-        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
-        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header and Parquet are supported.
-        overlap_filter: FilterOp, optional. The type of overlap to consider(Weak or Strict). default is FilterOp.Weak.
+        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
+        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
+        overlap_filter: FilterOp, optional. The type of overlap to consider(Weak or Strict).
         cols1: The names of columns containing the chromosome, start and end of the
-            genomic intervals, provided separately for each set. The default
-            values are 'chrom', 'start', 'end'.
+            genomic intervals, provided separately for each set.
         cols2:  The names of columns containing the chromosome, start and end of the
-            genomic intervals, provided separately for each set. The default
-            values are 'chrom', 'start', 'end'.
+            genomic intervals, provided separately for each set.
         suffixes: Suffixes for the columns of the two overlapped sets.
         on_cols: List of additional column names to join on. default is None.
         output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame" and "pandas.DataFrame" are also supported.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		DEFAULT_INTERVAL_COLUMNS = ["chrom", "start", "end"]