diff --git a/dev-requirements.txt b/dev-requirements.txt index 653e3c3b..7ae13033 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,6 +2,7 @@ cython matplotlib numpydoc pandas +polars pyyaml sphinx pysam diff --git a/docker/pbt-test-py2/Dockerfile b/docker/pbt-test-py2/Dockerfile index c0c6d381..210a37d3 100644 --- a/docker/pbt-test-py2/Dockerfile +++ b/docker/pbt-test-py2/Dockerfile @@ -30,6 +30,7 @@ RUN conda install -c daler \ numpydoc \ pip \ pandas \ + polars \ pyyaml \ sphinx \ pysam diff --git a/docker/pbt-test-py3/Dockerfile b/docker/pbt-test-py3/Dockerfile index ae449995..1b595f55 100644 --- a/docker/pbt-test-py3/Dockerfile +++ b/docker/pbt-test-py3/Dockerfile @@ -30,6 +30,7 @@ RUN conda install -c daler \ numpydoc \ pip \ pandas \ + polars \ pyyaml \ sphinx \ pysam diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst new file mode 100755 index 00000000..6d07d316 --- /dev/null +++ b/docs/source/dataframe.rst @@ -0,0 +1,42 @@ +.. include:: includeme.rst + +.. _saveresults: + +Exporting to a dataframe +======================== + +If you want to export the results as a dataframe for more analysis, use +the :meth:`BedTool.to_dataframe` method to export to a pandas dataframe or the :meth:`BedTool.to_polars_dataframe` method to export to a polars dataframe. These methods also let you optionally specify column names for the dataframes instead of the default column names that pybedtools uses. You can use the same arguments you would normally use while reading a file into a pandas (`names=`) or polars (`new_columns=`) dataframe. By default, pybedtools assumes that there is no header line in the bed file. If your bed file already has names in the first row, you can set the `disable_auto_names` argument to `True`. + +.. 
@classmethod
def from_polars_dataframe(
    self,
    polars_df,
    outfile=None,
    separator="\t",
    has_header=False,
    **kwargs
):
    """
    Creates a BedTool from a polars.DataFrame.

    If `outfile` is None, a temporary file will be used. Otherwise it can
    be a specific filename or an open file handle. Additional kwargs will
    be passed to `polars.DataFrame.write_csv` and take precedence over the
    defaults built from `separator` and `has_header`.

    The fields of the resulting BedTool will match the order of columns in
    the dataframe.

    Parameters
    ----------
    polars_df : polars.DataFrame
        Dataframe to write out; one row per feature.
    outfile : str or file-like, optional
        Destination filename, or an open handle with a `name` attribute.
    separator : str
        Field separator written to the file (tab by default).
    has_header : bool
        Whether to write the column names as the first line. Forwarded to
        `write_csv` as `include_header` on polars versions that use that
        name, and as `has_header` otherwise.

    Raises
    ------
    ImportError
        If polars is not installed.
    ValueError
        If `outfile` is neither a string nor an object with a `name`.
    """
    try:
        import polars  # noqa: F401 -- imported only to verify availability
    except ImportError:
        raise ImportError("polars must be installed to use dataframes")
    import inspect

    if outfile is None:
        # `self` is the class here (classmethod); _tmp() provides a temp path.
        outfile = self._tmp()

    # polars renamed write_csv's `has_header` keyword to `include_header`.
    # Pick whichever name the installed version accepts so this method works
    # on both sides of the rename; fall back to the original keyword if the
    # signature cannot be inspected.
    try:
        accepted = inspect.signature(polars_df.write_csv).parameters
    except (AttributeError, TypeError, ValueError):
        accepted = {}
    header_kwarg = "include_header" if "include_header" in accepted else "has_header"

    write_kwargs = {"separator": separator, header_kwarg: has_header}
    write_kwargs.update(kwargs)
    polars_df.write_csv(outfile, **write_kwargs)

    if isinstance(outfile, six.string_types):
        fn = outfile
    else:
        # The BedTool re-opens the file by name, so make sure anything still
        # sitting in the handle's buffer has reached disk first.
        flush = getattr(outfile, "flush", None)
        if callable(flush):
            flush()
        try:
            fn = outfile.name
        except AttributeError:
            raise ValueError(
                "`outfile` is not a string and doesn't have a `name` attribute. "
                "Unable to determine filename."
            )
    return BedTool(fn)
def to_polars_dataframe(self, disable_auto_names=False, *args, **kwargs):
    """
    Create a polars.DataFrame, passing args and kwargs to polars.read_csv.

    The file is read as tab-separated (`\\t`) unless a `separator` kwarg is
    given.

    Parameters
    ----------
    disable_auto_names : bool
        By default, the created dataframe fills in column names
        automatically according to the detected filetype (e.g., "chrom",
        "start", "end" for a BED3 file). Set this argument to True to
        disable this behavior; unless `has_header` is passed explicitly,
        the first line of the file is then read as the header.
    *args, **kwargs
        Passed through to `polars.read_csv`. Explicit `new_columns`,
        `has_header` and `separator` values take precedence over the
        defaults chosen here.

    Returns
    -------
    polars.DataFrame
        The file contents, or an empty dataframe if the file does not
        exist or is empty.

    Raises
    ------
    ValueError
        If this BedTool is BAM-backed or `self.fn` is not a filename.
    ImportError
        If polars is not installed.
    """
    # Complain if BAM or if not a file
    if self._isbam:
        raise ValueError("BAM not supported for converting to DataFrame")
    if not isinstance(self.fn, six.string_types):
        raise ValueError("use .saveas() to make sure self.fn is a file")

    try:
        import polars
    except ImportError:
        raise ImportError("polars must be installed to convert to polars.DataFrame")

    # Otherwise we're good. Only guess column names when the caller neither
    # supplied their own nor opted out of auto-naming.
    if kwargs.get("new_columns") is None and not disable_auto_names:
        try:
            default_names = settings._column_names[self.file_type]
        except KeyError:
            # No default names registered for this file type.
            auto_names = None
        else:
            n_fields = self.field_count()
            auto_names = default_names[:n_fields]
            if len(auto_names) < n_fields:
                # More fields than known names: let polars name the columns
                # rather than passing a list of the wrong length.
                warn(
                    "Default names for filetype %s are:\n%s\nbut file has "
                    "%s fields; you can supply custom names with the "
                    "`new_columns` kwarg" % (self.file_type, auto_names, n_fields)
                )
                auto_names = None
        kwargs["new_columns"] = auto_names

    # With auto-naming disabled the first line is treated as the header,
    # but an explicit `has_header` from the caller always wins.
    kwargs.setdefault("has_header", bool(disable_auto_names))
    # Tab is only a default, so a caller-supplied separator no longer
    # collides with a hard-coded keyword argument.
    kwargs.setdefault("separator", "\t")

    if os.path.isfile(self.fn) and os.path.getsize(self.fn) > 0:
        return polars.read_csv(self.fn, *args, **kwargs)
    return polars.DataFrame()
def test_from_polars_dataframe():
    """Round-trip between BedTool and polars.DataFrame in both directions."""
    # polars is optional: skip (rather than xfail) when it is unavailable,
    # so the report does not claim an expected failure.
    polars = pytest.importorskip("polars")

    a = pybedtools.example_bedtool("a.bed")

    df = a.to_polars_dataframe()
    assert df[0, "name"] == "feature1"
    assert list(df.columns) == ["chrom", "start", "end", "name", "score", "strand"]
    assert df[3, "strand"] == "+"

    # reverse should work, too:
    a2 = pybedtools.BedTool.from_polars_dataframe(df)
    assert a2 == a

    # try converting only part of the dataframe to a BedTool
    a3 = pybedtools.BedTool.from_polars_dataframe(
        df.filter(polars.col("start") < 100).select(["chrom", "start", "end", "name"])
    )
    assert a3 == fix(
        """
        chr1 1 100 feature1
        """
    ), str(a3)

    gff_names = [
        "seqname",
        "source",
        "feature",
        "start",
        "end",
        "score",
        "strand",
        "frame",
        "attributes",
    ]

    d = pybedtools.example_bedtool("d.gff")
    results = d.to_polars_dataframe()
    assert list(results.columns) == gff_names
    assert results[0, "seqname"] == "chr1"
    assert results[4, "attributes"] == "ID=rRNA1;"

    # get a gff file with too many fields...
    x = pybedtools.example_bedtool("c.gff")
    x = x.intersect(x, c=True)
    with warnings.catch_warnings(record=True) as caught:
        # make sure the warning is recorded even if it was raised before
        warnings.simplefilter("always")
        # trigger the warning
        x.to_polars_dataframe()

    # Only count the warning under test so unrelated warnings (e.g. library
    # deprecations) cannot break the assertion.
    name_warnings = [
        w
        for w in caught
        if issubclass(w.category, UserWarning)
        and str(w.message).startswith("Default names for filetype")
    ]
    assert len(name_warnings) == 1

    names = gff_names + ["count"]
    results = x.to_polars_dataframe(new_columns=names)
    assert list(results.columns) == names
    assert results[0, "seqname"] == "chr1"
    assert results[13, "count"] == 3