diff --git a/docs/_templates/custom_method_summary.rst b/docs/_templates/custom_method_summary.rst new file mode 100644 index 00000000..500838b5 --- /dev/null +++ b/docs/_templates/custom_method_summary.rst @@ -0,0 +1,5 @@ +{{ fullname | escape | underline }} + +.. currentmodule:: {{ module }} + +.. automethod:: {{ objname }} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index bd2b8882..67877446 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,7 +60,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +exclude_patterns = ['_generated_hidden*'] master_doc = "index" diff --git a/docs/developer_guide.rst b/docs/developer_guide.rst index c1eace51..b50f4473 100644 --- a/docs/developer_guide.rst +++ b/docs/developer_guide.rst @@ -99,7 +99,7 @@ Task sequence ~~~~~~~~~~~~~ 1. Create and download your own Pyranges fork ------------------------------------------- +--------------------------------------------- The easiest way to do this is through github. Login into the github website if you aren't already, then visit the Pyranges page on github, click "Fork" on the top right. @@ -295,7 +295,7 @@ case, google how to set up a github token that you can use. 10. Open a pull request ----------------------- +----------------------- The easiest way to open a pull request is through the github website. Go to **your** Pyranges fork on github, then find the "Contribute" button (near the **<> Code** button). Click @@ -315,7 +315,7 @@ Pyranges administrators will inspect the pull request, comment it if necessary, 11. Core team only: upload to PyPI ---------------------------------- +---------------------------------- Every now and then, the core development team considers that a new pyranges version should be released. To do so: @@ -347,5 +347,7 @@ property-based tests): pytest -n 4 tests/property_based +Other useful tools: + * [rg](https://github.com/BurntSushi/ripgrep): ripgrep recursively searches directories for a regex pattern while respecting your gitignore * [fd](https://github.com/sharkdp/fd): A simple, fast and user-friendly alternative to 'find' diff --git a/docs/how_to_overlap.rst b/docs/how_to_overlap.rst new file mode 100644 index 00000000..6d384a64 --- /dev/null +++ b/docs/how_to_overlap.rst @@ -0,0 +1,911 @@ +Overlap-related operations +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. contents:: + :local: + :depth: 2 + + + +Pyranges offers many efficient methods to detect / process overlaps. We present them here split in two groups: + +1. Methods accepting a **pair of PyRanges** as input: + +.. autosummary:: + :toctree: _generated_hidden + :template: custom_method_summary.rst + + pyranges.PyRanges.overlap + pyranges.PyRanges.join_ranges + pyranges.PyRanges.set_intersect + pyranges.PyRanges.set_union + pyranges.PyRanges.intersect + pyranges.PyRanges.subtract_ranges + pyranges.PyRanges.count_overlaps + pyranges.PyRanges.nearest + +2. Methods accepting a **single PyRanges** as input: + +.. autosummary:: + :toctree: _generated_hidden2 + :template: custom_method_summary.rst + + pyranges.PyRanges.cluster + pyranges.PyRanges.merge_overlaps + pyranges.PyRanges.split + pyranges.PyRanges.max_disjoint + + +Methods for pairs of PyRanges +============================= +These methods accept two PyRanges objects as input (i.e. self and other), +and return a new PyRanges object with the results of the operation. + +We will briefly showcase them in this document. First, let's create two PyRanges objects to work with: + + >>> import pyranges as pr + >>> a = pr.PyRanges(dict(Chromosome="chr1", + ... Start=[3, 13, 18, 23, 28, 32, 33], + ... End=[6, 15, 21, 27, 29, 37, 36], + ... Strand=["+", "+", "-", "-", "-", "+", "+"])) + >>> a + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + 1 | chr1 13 15 + + 2 | chr1 18 21 - + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> b = pr.PyRanges(dict(Chromosome="chr1", + ... Start=[6, 12, 19, 25, 34], + ... End=[8, 14, 20, 29, 36], + ... Strand=["+", "+", "+", "-", "+"])) + >>> b + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 6 8 + + 1 | chr1 12 14 + + 2 | chr1 19 20 + + 3 | chr1 25 29 - + 4 | chr1 34 36 + + PyRanges with 5 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + +Filter rows: overlap +-------------------- + +The most intuitive overlap-related method is :func:`overlap `. +This is simply a filter of the intervals in self, so that only those overlapping any interval in other are returned: + + >>> a.overlap(b) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 1 | chr1 13 15 + + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 5 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Let's now filter the rows of the other PyRanges object: + + >>> b.overlap(a) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 1 | chr1 12 14 + + 3 | chr1 25 29 - + 4 | chr1 34 36 + + PyRanges with 3 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Note above how intervals overlapping with more than one interval in the other PyRanges are reported just once. + +To get the self intervals **without overlap** in other, use ``invert=True``: + + >>> a.overlap(b, invert=True) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + 2 | chr1 18 21 - + PyRanges with 2 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +A reminder that intervals are encoded in pythonic convention: +0-based coordinates, with start included and end excluded. +Thus, the closest possible intervals that are not overlapping are two intervals wherein +the end of the first is equal of the start of the second. These are called **"bookended"** intervals, e.g.: + + >>> a.head(1) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + PyRanges with 1 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 1 strands. + + >>> b.head(1) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 6 8 + + PyRanges with 1 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 1 strands. + + +Common arguments: slack, strand_behavior, match_by +-------------------------------------------------- + +We will now use :func:`overlap ` to showcase arguments +that are available in many overlap-related methods. + +``slack`` (default: 0) is used to relax the criteria of overlap. +A value of 1 will report bookended intervals, previously not considered overlapping: + + >>> a.overlap(b, slack=1) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + 1 | chr1 13 15 + + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 6 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + +Analogously, higher values will report increasingly distant intervals. +In practice, the self intervals are temporarily extended by the slack amount on both ends before the overlap operation. + +``strand_behavior`` determines how strand is treated. +The value 'same' results in the intuitive behavior, i.e. two intervals overlap only if on the same strand: + + >>> a.overlap(b, strand_behavior="same") # the result here is the same as: a.overlap(b) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 1 | chr1 13 15 + + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 5 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +On the other hand, value 'ignore' will define overlaps regardless of strands: + + >>> a.overlap(b, strand_behavior="ignore") + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 1 | chr1 13 15 + + 2 | chr1 18 21 - + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 6 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Value 'opposite' will require two intervals to be on the opposite strands to be considered overlapping: + + >>> a.overlap(b, strand_behavior="opposite") + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 2 | chr1 18 21 - + PyRanges with 1 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 1 strands. + +Naturally, values 'same' and 'opposite' can only be used when the PyRanges objects have +valid strand information, i.e. the Strand column is present and all its values are either '+' or '-': + + >>> b_unstranded = b.remove_strand() + >>> b_unstranded + index | Chromosome Start End + int64 | object int64 int64 + ------- --- ------------ ------- ------- + 0 | chr1 6 8 + 1 | chr1 12 14 + 2 | chr1 19 20 + 3 | chr1 25 29 + 4 | chr1 34 36 + PyRanges with 5 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. + + >>> a.overlap(b_unstranded, strand_behavior="same") + Traceback (most recent call last): + ... + ValueError: Can only do same strand operations when both PyRanges contain valid strand info. + +See function :func:`strand_valid ` for details, and +:func:`make_strand_valid ` to convert non-standard strand values to standard ones. + +The default value of ``strand_behavior`` is 'auto'. +This is transformed to 'same' if both PyRanges have valid strands, and to 'ignore' otherwise: + + >>> a.overlap(b).equals( + ... a.overlap(b, strand_behavior='same') ) + True + + >>> a.overlap(b_unstranded).equals( + ... a.overlap(b, strand_behavior='ignore') ) + True + +Above, we leveraged method ``equals`` inherited from pandas Dataframe to compare table contents. + + +Note that the presence of any non-standard Strand value will result in strand being ignored for all rows. +When leading to potentially non-intuitive behavior, a warning is printed: + + >>> a_invalid = a.copy() + >>> a_invalid.loc[2, 'Strand'] = "." + >>> a_invalid + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + 1 | chr1 13 15 + + 2 | chr1 18 21 . + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 3 strands (including non-genomic strands: .). + + >>> a_invalid.overlap(b) # doctest: +SKIP + :1: UserWarning: overlap: 'auto' strand_behavior treated as ignore due to invalid Strand values. Please use strand_behavior=ignore + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 1 | chr1 13 15 + + 2 | chr1 18 21 . + 3 | chr1 23 27 - + 4 | chr1 28 29 - + 5 | chr1 32 37 + + 6 | chr1 33 36 + + PyRanges with 6 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 3 strands (including non-genomic strands: .). + + +Finally, argument ``match_by`` can be used to specify additional columns whose values must match for two intervals +to be considered overlapping. +For example, let's add a column to both objects to mark intervals whose Start is an odd number, then +use this column to filter the overlaps: + + >>> a2 = a.assign(odd = lambda x:x.Start % 2 ) + >>> a2 + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 0 | chr1 3 6 + 1 + 1 | chr1 13 15 + 1 + 2 | chr1 18 21 - 0 + 3 | chr1 23 27 - 1 + 4 | chr1 28 29 - 0 + 5 | chr1 32 37 + 0 + 6 | chr1 33 36 + 1 + PyRanges with 7 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> b2 = b.assign(odd = lambda x:x.Start % 2 ) + >>> b2 + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 0 | chr1 6 8 + 0 + 1 | chr1 12 14 + 0 + 2 | chr1 19 20 + 1 + 3 | chr1 25 29 - 1 + 4 | chr1 34 36 + 0 + PyRanges with 5 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + + >>> a2.overlap(b2, match_by='odd') + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 3 | chr1 23 27 - 1 + 5 | chr1 32 37 + 0 + PyRanges with 2 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + +The most versatile: join_ranges +-------------------------------- + +The most versatile overlap-related method for pairs of PyRanges is :func:`join_ranges `. +This method is analogous to a SQL join operation, but rather than matching rows in two tables through a common key, +they are matched in virtue of their overlap. + +This function searches for overlaps between the intervals in self and other, and reports in output the full +information related to the input intervals. The returned PyRanges object will have a number of rows equal to the +number of overlaps found, and the columns will be the union of the columns of self and other, using a suffix +to differentiate columns in other which are present with the same name in the self PyRanges, like Start and End: + + >>> a.join_ranges(b) + index | Chromosome Start End Strand Start_b End_b + int64 | object int64 int64 object int64 int64 + ------- --- ------------ ------- ------- -------- --------- ------- + 0 | chr1 13 15 + 12 14 + 1 | chr1 23 27 - 25 29 + 2 | chr1 28 29 - 25 29 + 3 | chr1 32 37 + 34 36 + 4 | chr1 33 36 + 34 36 + PyRanges with 5 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +In contrast to :func:`overlap `, a row is returned per overlap, so +if an interval in self overlaps with more than one interval in other, it will be reported multiple times: + + >>> b.join_ranges(a) + index | Chromosome Start End Strand Start_b End_b + int64 | object int64 int64 object int64 int64 + ------- --- ------------ ------- ------- -------- --------- ------- + 0 | chr1 12 14 + 13 15 + 1 | chr1 25 29 - 23 27 + 2 | chr1 25 29 - 28 29 + 3 | chr1 34 36 + 32 37 + 4 | chr1 34 36 + 33 36 + PyRanges with 5 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Like all overlap-related methods accepting two PyRanges as input, +:func:`join_ranges ` +accepts the ``strand_behavior`` argument. If Strand is not used to determine overlaps, it will +be returned for both PyRanges: + + >>> a.join_ranges(b, strand_behavior="ignore") + index | Chromosome Start End Strand Start_b End_b Strand_b + int64 | object int64 int64 object int64 int64 object + ------- --- ------------ ------- ------- -------- --------- ------- ---------- + 0 | chr1 13 15 + 12 14 + + 1 | chr1 18 21 - 19 20 + + 2 | chr1 23 27 - 25 29 - + 3 | chr1 28 29 - 25 29 - + 4 | chr1 32 37 + 34 36 + + 5 | chr1 33 36 + 34 36 + + PyRanges with 6 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +``slack`` and ``match_by`` arguments are also available, e.g.: + + >>> a2.join_ranges(b2, match_by='odd') + index | Chromosome Start End Strand odd Start_b End_b + int64 | object int64 int64 object int64 int64 int64 + ------- --- ------------ ------- ------- -------- ------- --------- ------- + 0 | chr1 23 27 - 1 25 29 + 1 | chr1 32 37 + 0 34 36 + PyRanges with 2 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> a2.join_ranges(b2, match_by='odd', slack=5) + index | Chromosome Start End Strand odd Start_b End_b + int64 | object int64 int64 object int64 int64 int64 + ------- --- ------------ ------- ------- -------- ------- --------- ------- + 0 | chr1 13 15 + 1 19 20 + 1 | chr1 23 27 - 1 25 29 + 2 | chr1 32 37 + 0 34 36 + PyRanges with 3 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Pyranges provides method :func:`combine_interval_columns ` +to post-process the output of :func:`join_ranges ` +and aggregate the coordinates of the overlapping intervals in Start and End columns. +For example, this allows to obtain the union of the overlapping intervals: + + >>> a2.join_ranges(b2, match_by='odd', slack=5).combine_interval_columns('union') + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 0 | chr1 13 20 + 1 + 1 | chr1 23 29 - 1 + 2 | chr1 32 37 + 0 + PyRanges with 3 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +:func:`join_ranges ` is one of most versatile methods in PyRanges, since it +retains the full information of the two input PyRanges objects. +Nevertheless, more efficient alternative methods are available for specific use cases, detailed below. + +Set operations: set_intersect, set_union +---------------------------------------- + +Pyranges offers efficient methods based on the concept of set operations in mathematics. These are useful +when the user is interested in the intervals themselves, rather than in the full information (i.e. metadata) +of the input intervals. + + +Method :func:`set_intersect `, allows to obtain the genomic regions +present in both PyRanges: + + >>> a.set_intersect(b) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 13 14 + + 1 | chr1 34 36 + + 2 | chr1 25 27 - + 3 | chr1 28 29 - + PyRanges with 4 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> a.set_intersect(b, strand_behavior="ignore") + index | Chromosome Start End + int64 | object int64 int64 + ------- --- ------------ ------- ------- + 0 | chr1 13 14 + 1 | chr1 19 20 + 2 | chr1 25 27 + 3 | chr1 28 29 + 4 | chr1 34 36 + PyRanges with 5 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. + +The regions reported may be part of any interval in the two PyRanges. All metadata columns are dropped: + + >>> a2.set_intersect(b2).columns # see above: a2 and b2 had the 'odd' column + Index(['Chromosome', 'Start', 'End', 'Strand'], dtype='object') + +Analogously, method :func:`set_union ` allows to obtain the genomic regions that +are present in at least one of the PyRanges: + + >>> a.set_union(b) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 3 6 + + 1 | chr1 6 8 + + 2 | chr1 12 15 + + 3 | chr1 19 20 + + 4 | chr1 32 37 + + 5 | chr1 18 21 - + 6 | chr1 23 29 - + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> a2.set_union(b2, strand_behavior='ignore') + index | Chromosome Start End + int64 | object int64 int64 + ------- --- ------------ ------- ------- + 0 | chr1 3 6 + 1 | chr1 6 8 + 2 | chr1 12 15 + 3 | chr1 18 21 + 4 | chr1 23 29 + 5 | chr1 32 37 + PyRanges with 6 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. + +Interval manipulation operations: intersect, subtract +----------------------------------------------------- +Set operations do not preserve input metadata. +:func:`join_ranges ` preserve metadata of both PyRanges, but is less efficient. +Pyranges also offers methods that preserve the metadata in self, but not in other. +Specifically, method :func:`intersect ` allows to obtain the intervals in self that overlap +with any interval in other. It is similar to :func:`overlap `, but here coordinates +are modified to return only the actual overlaps: + + >>> a2.intersect(b) + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 1 | chr1 13 14 + 1 + 3 | chr1 25 27 - 1 + 4 | chr1 28 29 - 0 + 5 | chr1 34 36 + 0 + 6 | chr1 34 36 + 1 + PyRanges with 5 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> a2.intersect(b2, strand_behavior='ignore', match_by='odd') + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 3 | chr1 25 27 - 1 + 5 | chr1 34 36 + 0 + PyRanges with 2 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Method :func:`subtract_ranges ` allows to obtain the portions of intervals in self +that do not overlap any interval in other: + + >>> a2.subtract_ranges(b) + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 0 | chr1 3 6 + 1 + 1 | chr1 14 15 + 1 + 2 | chr1 18 21 - 0 + 3 | chr1 23 25 - 1 + 5 | chr1 32 34 + 0 + 5 | chr1 36 37 + 0 + 6 | chr1 33 34 + 1 + PyRanges with 7 rows, 5 columns, and 1 index columns (with 1 index duplicates). + Contains 1 chromosomes and 2 strands. + + + >>> a2.subtract_ranges(b, strand_behavior='ignore') + index | Chromosome Start End Strand odd + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ------- + 0 | chr1 3 6 + 1 + 1 | chr1 14 15 + 1 + 2 | chr1 18 19 - 0 + 2 | chr1 20 21 - 0 + 3 | chr1 23 25 - 1 + 5 | chr1 32 34 + 0 + 5 | chr1 36 37 + 0 + 6 | chr1 33 34 + 1 + PyRanges with 8 rows, 5 columns, and 1 index columns (with 2 index duplicates). + Contains 1 chromosomes and 2 strands. + +Fast counting: count_overlaps +----------------------------- + +Method :func:`count_overlaps ` allows to count, for each interval in self, +the number of intervals in other that overlaps with it. +Input coordinates are not modified, and a new column is added: + + >>> a2.count_overlaps(b) # using a2 to show the 'odd' column is preserved + index | Chromosome Start End Strand odd NumberOverlaps + int64 | object int64 int64 object int64 int64 + ------- --- ------------ ------- ------- -------- ------- ---------------- + 0 | chr1 3 6 + 1 0 + 1 | chr1 13 15 + 1 1 + 2 | chr1 18 21 - 0 0 + 3 | chr1 23 27 - 1 1 + 4 | chr1 28 29 - 0 1 + 5 | chr1 32 37 + 0 1 + 6 | chr1 33 36 + 1 1 + PyRanges with 7 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Arguments ``strand_behavior`` and ``match_by`` are available: + + >>> a.count_overlaps(b, strand_behavior='ignore') + index | Chromosome Start End Strand NumberOverlaps + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- ---------------- + 0 | chr1 3 6 + 0 + 1 | chr1 13 15 + 1 + 2 | chr1 18 21 - 1 + 3 | chr1 23 27 - 1 + 4 | chr1 28 29 - 1 + 5 | chr1 32 37 + 1 + 6 | chr1 33 36 + 1 + PyRanges with 7 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + + >>> a2.count_overlaps(b2, strand_behavior='ignore', match_by='odd') + index | Chromosome Start End Strand odd NumberOverlaps + int64 | object int64 int64 object int64 int64 + ------- --- ------------ ------- ------- -------- ------- ---------------- + 0 | chr1 3 6 + 1 0 + 1 | chr1 13 15 + 1 0 + 2 | chr1 18 21 - 0 0 + 3 | chr1 23 27 - 1 1 + 4 | chr1 28 29 - 0 0 + 5 | chr1 32 37 + 0 1 + 6 | chr1 33 36 + 1 0 + PyRanges with 7 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Optionally, argument ``calculate_coverage`` can be set to True to calculate the fraction of the self interval covered +by intervals in other: + + >>> a.count_overlaps(b, strand_behavior='ignore', calculate_coverage=True) + index | Chromosome Start End Strand NumberOverlaps CoverageOverlaps + int64 | object int64 int64 object int64 float64 + ------- --- ------------ ------- ------- -------- ---------------- ------------------ + 0 | chr1 3 6 + 0 0 + 1 | chr1 13 15 + 1 0.5 + 2 | chr1 18 21 - 1 0.333333 + 3 | chr1 23 27 - 1 0.5 + 4 | chr1 28 29 - 1 1 + 5 | chr1 32 37 + 1 0.4 + 6 | chr1 33 36 + 1 0.666667 + PyRanges with 7 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Find the closest interval: nearest +---------------------------------- + +Method :func:`nearest ` allows to find the closest interval in other for each interval +in self: + + >>> a.nearest(b) + index | Chromosome Start End Strand Start_b End_b Distance + int64 | object int64 int64 object int64 int64 int64 + ------- --- ------------ ------- ------- -------- --------- ------- ---------- + 0 | chr1 3 6 + 6 8 1 + 1 | chr1 13 15 + 12 14 0 + 2 | chr1 18 21 - 25 29 5 + 3 | chr1 23 27 - 25 29 0 + 4 | chr1 28 29 - 25 29 0 + 5 | chr1 32 37 + 34 36 0 + 6 | chr1 33 36 + 34 36 0 + PyRanges with 7 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +The output format is similar to :func:`join_ranges `. +Note the "Distance" column, which reports the distance between the intervals in self and other. + +In case you want to find the nearest interval which does not overlap with each self interval, use +``exclude_overlaps=True``: + + >>> a.nearest(b, exclude_overlaps=True) + index | Chromosome Start End Strand Start_b End_b Distance + int64 | object int64 int64 object float64 float64 int64 + ------- --- ------------ ------- ------- -------- --------- --------- ---------- + 0 | chr1 3 6 + 6 8 1 + 1 | chr1 13 15 + 19 20 5 + 2 | chr1 18 21 - 25 29 5 + 5 | chr1 32 37 + 19 20 13 + 6 | chr1 33 36 + 19 20 14 + PyRanges with 5 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +The :func:`nearest ` method also accepts the ``strand_behavior`` argument: + + >>> a.nearest(b, strand_behavior='ignore', exclude_overlaps=True) + index | Chromosome Start End Strand Start_b End_b Strand_b Distance + int64 | object int64 int64 object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- --------- ------- ---------- ---------- + 0 | chr1 3 6 + 6 8 + 1 + 1 | chr1 13 15 + 19 20 + 5 + 2 | chr1 18 21 - 12 14 + 5 + 3 | chr1 23 27 - 19 20 + 4 + 4 | chr1 28 29 - 34 36 + 6 + 5 | chr1 32 37 + 25 29 - 4 + 6 | chr1 33 36 + 25 29 - 5 + PyRanges with 7 rows, 8 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Methods for single PyRanges +=========================== + +These overlap-related methods return a modified version of the input PyRanges object (self). +We will showcase them with this data: + + >>> c = pr.PyRanges(dict(Chromosome="chr1", + ... Start=[1, 4, 10, 12, 19, 20, 24, 28], + ... End=[5, 7, 14, 16, 27, 22, 25, 30], + ... Strand=["+", "+", "+", "-", "+", "+", "+", "+"])) + >>> c + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 1 5 + + 1 | chr1 4 7 + + 2 | chr1 10 14 + + 3 | chr1 12 16 - + 4 | chr1 19 27 + + 5 | chr1 20 22 + + 6 | chr1 24 25 + + 7 | chr1 28 30 + + PyRanges with 8 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + +Grouping overlapping intervals: cluster +--------------------------------------- +The most flexible method in this category is :func:`cluster `. +This function will detect overlaps among intervals in self, and assign a cluster identifier +to each group of overlapping intervals. The object returned is identical to the input, +with an additional column "Cluster" containing the cluster identifier: + + >>> c.cluster() + index | Chromosome Start End Strand Cluster + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- --------- + 0 | chr1 1 5 + 0 + 1 | chr1 4 7 + 0 + 2 | chr1 10 14 + 1 + 3 | chr1 12 16 - 2 + 4 | chr1 19 27 + 3 + 5 | chr1 20 22 + 3 + 6 | chr1 24 25 + 3 + 7 | chr1 28 30 + 4 + PyRanges with 8 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Note that clusters 1 and 2 are kept separated only because of strand. +We introduce argument ``use_strand``, accepted by all overlap-related methods for single PyRanges. +When set to False, strand is ignored for overlap detection: + + >>> c.cluster(use_strand=False) + index | Chromosome Start End Strand Cluster + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- --------- + 0 | chr1 1 5 + 0 + 1 | chr1 4 7 + 0 + 2 | chr1 10 14 + 1 + 3 | chr1 12 16 - 1 + 4 | chr1 19 27 + 2 + 5 | chr1 20 22 + 2 + 6 | chr1 24 25 + 2 + 7 | chr1 28 30 + 3 + PyRanges with 8 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +The default value of argument ``use_strand`` is 'auto', which is interpreted as True if the PyRanges object +has valid strand information, and False otherwise. + +Note that cluster 2 contain intervals with indices 5 and 6 which do not directly overlap, +but they both overlap with the interval with index 4. (More generally, a cluster is a connected +component of the overlap graph, where two intervals are connected if they overlap directly.) + +Argument ``cluster_column`` controls the name of the column containing the cluster identifier. +Also, argument ``slack`` is also available here, and it is analogous to its use in methods for pairs of PyRanges. +Its default value is 0. With ``slack=1``, bookended intervals are placed in the same cluster. +With ``slack=2``, intervals that are distant at the most 1 bp are placed in the same cluster, +like those with index 4 and 7 above; and so on. + + >>> c2 = c.cluster(slack=2, use_strand=False, cluster_column='myClust') + >>> c2 + index | Chromosome Start End Strand myClust + int64 | object int64 int64 object int64 + ------- --- ------------ ------- ------- -------- --------- + 0 | chr1 1 5 + 0 + 1 | chr1 4 7 + 0 + 2 | chr1 10 14 + 1 + 3 | chr1 12 16 - 1 + 4 | chr1 19 27 + 2 + 5 | chr1 20 22 + 2 + 6 | chr1 24 25 + 2 + 7 | chr1 28 30 + 2 + PyRanges with 8 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Argument ``match_by`` is also available. +Only intervals with the same value in the specified column will be considered for overlap detection. +Let's add a gene column to the PyRanges object and compare "Cluster" results with the previous column "myClust": + + >>> c2['gene'] = ['abc'[s % 3] for s in c2.Start] # arbitrary gene assignment + >>> c2.cluster(slack=2, use_strand=False, match_by='gene') + index | Chromosome Start End Strand myClust gene Cluster + int64 | object int64 int64 object int64 object int64 + ------- --- ------------ ------- ------- -------- --------- -------- --------- + 0 | chr1 1 5 + 0 b 0 + 1 | chr1 4 7 + 0 b 0 + 2 | chr1 10 14 + 1 b 1 + 3 | chr1 12 16 - 1 a 2 + 4 | chr1 19 27 + 2 b 3 + 5 | chr1 20 22 + 2 c 4 + 6 | chr1 24 25 + 2 a 5 + 7 | chr1 28 30 + 2 b 3 + PyRanges with 8 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Resolve overlaps: merge_overlaps, split, max_disjoint +----------------------------------------------------- +Various methods exists to obtain a PyRanges object without internal overlaps. + +Method :func:`merge_overlaps ` +allows to merge overlapping intervals in self. +In practice, this function returns the union of all intervals in self. +All metadata columns are dropped: + + >>> c.merge_overlaps() + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 1 7 + + 1 | chr1 10 14 + + 2 | chr1 19 27 + + 3 | chr1 28 30 + + 4 | chr1 12 16 - + PyRanges with 5 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +As before, both ``slack`` and ``use_strand`` are supported: + + >>> c.merge_overlaps(slack=2) + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 1 7 + + 1 | chr1 10 14 + + 2 | chr1 19 30 + + 3 | chr1 12 16 - + PyRanges with 4 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Note that Strand is not reported when ``use_strand`` is set to False: + + >>> c.merge_overlaps(slack=2, use_strand=False) + index | Chromosome Start End + int64 | object int64 int64 + ------- --- ------------ ------- ------- + 0 | chr1 1 7 + 1 | chr1 10 16 + 2 | chr1 19 30 + PyRanges with 3 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. + +On the other hand, method :func:`split ` allows to split intervals in self +at the position of overlaps, leaving as many bookended intervals as necessary to avoid overlaps. +This function drops metadata, too: + + >>> pr.options.set_option('max_rows_to_show', 12) # to see all rows + >>> c.split() + index | Chromosome Start End Strand + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 1 4 + + 1 | chr1 4 5 + + 2 | chr1 5 7 + + 4 | chr1 10 14 + + 6 | chr1 19 20 + + 7 | chr1 20 22 + + 8 | chr1 22 24 + + 9 | chr1 24 25 + + 10 | chr1 25 27 + + 12 | chr1 28 30 + + 13 | chr1 12 16 - + PyRanges with 11 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +Function :func:`max_disjoint ` also returns a set of non-overlapping intervals. +In this case, however, input intervals are not modified, just filtered. +The intervals to return are chosen to maximize the number of intervals in the output. + + >>> pr.options.reset_options() + >>> c2.max_disjoint() # using c2 to show that metadata is retained + index | Chromosome Start End Strand myClust gene + int64 | object int64 int64 object int64 object + ------- --- ------------ ------- ------- -------- --------- -------- + 0 | chr1 1 5 + 0 b + 2 | chr1 10 14 + 1 b + 3 | chr1 12 16 - 1 a + 5 | chr1 20 22 + 2 c + 6 | chr1 24 25 + 2 a + 7 | chr1 28 30 + 2 b + PyRanges with 6 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + +``slack`` and ``use_strand`` are also available: + + >>> c2.max_disjoint(slack=2, use_strand=False) + index | Chromosome Start End Strand myClust gene + int64 | object int64 int64 object int64 object + ------- --- ------------ ------- ------- -------- --------- -------- + 0 | chr1 1 5 + 0 b + 2 | chr1 10 14 + 1 b + 5 | chr1 20 22 + 2 c + 6 | chr1 24 25 + 2 a + 7 | chr1 28 30 + 2 b + PyRanges with 5 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 1 strands. + +As well as ``match_by``: + + >>> c2.max_disjoint(slack=2, use_strand=False, match_by='gene') + index | Chromosome Start End Strand myClust gene + int64 | object int64 int64 object int64 object + ------- --- ------------ ------- ------- -------- --------- -------- + 0 | chr1 1 5 + 0 b + 2 | chr1 10 14 + 1 b + 3 | chr1 12 16 - 1 a + 4 | chr1 19 27 + 2 b + 5 | chr1 20 22 + 2 c + 6 | chr1 24 25 + 2 a + PyRanges with 6 rows, 6 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. \ No newline at end of file diff --git a/docs/how_to_pages.rst b/docs/how_to_pages.rst index 3ba0656b..73e6669b 100644 --- a/docs/how_to_pages.rst +++ b/docs/how_to_pages.rst @@ -4,7 +4,7 @@ How-to pages These pages explain pyranges functionalities grouped by topic: .. toctree:: - :maxdepth: 2 + :maxdepth: 3 how_to_create how_to_write @@ -12,4 +12,5 @@ These pages explain pyranges functionalities grouped by topic: how_to_rows how_to_columns how_to_sequences - how_to_genomic_ops \ No newline at end of file + how_to_genomic_ops + how_to_overlap \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 5705e70a..77eefcf9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -50,9 +50,8 @@ Documentation outline developer_guide -Indices and tables -================== +Indices +======= * :ref:`genindex` * :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst index a9fd7df7..68f316ff 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -12,10 +12,10 @@ The preferred way to install pyranges is via pip:: pip install pyranges1 -The command above will install a minimal version of pyranges. +The command above will install a **minimal version** of pyranges. Pyranges has several optional dependencies, required for certain functionalities. -To install all optional dependencies, use:: +To **install all optional dependencies**, use:: pip install pyranges1[all] diff --git a/pyproject.toml b/pyproject.toml index 41a699f4..c3f625e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyranges1" -version = "1.0.0" +version = "1.0.1" description = "GenomicRanges for Python." requires-python = ">=3.12.0" readme = "README.md" diff --git a/pyranges/core/pyranges_helpers.py b/pyranges/core/pyranges_helpers.py index 5be8d361..9d699f6d 100644 --- a/pyranges/core/pyranges_helpers.py +++ b/pyranges/core/pyranges_helpers.py @@ -11,7 +11,6 @@ STRAND_BEHAVIOR_SAME, STRAND_COL, STRICT_STRAND_BEHAVIOR_TYPE, - TEMP_STRAND_COL, USE_STRAND_AUTO, VALID_BY_OPTIONS, VALID_STRAND_BEHAVIOR_OPTIONS, @@ -119,17 +118,17 @@ def group_keys_from_validated_strand_behavior( """Return group keys based on strand behavior. If strand_behavior is 'same', return [CHROM_COL, STRAND_COL]. - If strand_behavior is 'opposite', return [CHROM_COL, TEMP_STRAND_COL]. + If strand_behavior is 'opposite', return [CHROM_COL, STRAND_COL]. If strand_behavior is 'ignore', return [CHROM_COL]. In each case, if a by argument is provided, it is appended/extended to the list of group keys. """ - strand_cols = ( - [STRAND_COL] - if strand_behavior == STRAND_BEHAVIOR_SAME - else ([TEMP_STRAND_COL] if strand_behavior == STRAND_BEHAVIOR_OPPOSITE else []) - ) + if strand_behavior == STRAND_BEHAVIOR_AUTO: + msg = "this function must be called with a validated strand_behavior" + raise ValueError(msg) + + strand_cols = [STRAND_COL] if strand_behavior in [STRAND_BEHAVIOR_SAME, STRAND_BEHAVIOR_OPPOSITE] else [] return [CHROM_COL, *strand_cols, *arg_to_list(by)] diff --git a/pyranges/core/pyranges_main.py b/pyranges/core/pyranges_main.py index ed361ab1..fd69a9ca 100644 --- a/pyranges/core/pyranges_main.py +++ b/pyranges/core/pyranges_main.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Optional, cast +import numpy as np import pandas as pd from natsort import natsort, natsorted # type: ignore[import] @@ -25,11 +26,14 @@ OVERLAP_FIRST, PANDAS_COMPRESSION_TYPE, RANGE_COLS, + REVERSE_STRAND, SKIP_IF_EMPTY_LEFT, START_COL, + STRAND_BEHAVIOR_IGNORE, STRAND_BEHAVIOR_OPPOSITE, STRAND_COL, TEMP_END_SLACK_COL, + TEMP_ID_COL, TEMP_NAME_COL, TEMP_NUM_COL, TEMP_START_SLACK_COL, @@ -590,6 +594,8 @@ def apply_pair( # type: ignore[override] function: BinaryOperation, strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "auto", by: VALID_BY_TYPES = None, + *, + preserve_order: bool = False, **kwargs, ) -> "pr.PyRanges": """Apply function to pairs of overlapping intervals, by chromosome and optionally strand. @@ -601,7 +607,7 @@ def apply_pair( # type: ignore[override] function : Callable Function that takes two PyRanges and returns a PyRanges. - The function shouldb accept a **kwargs argument. It may be used to extract useful information: + The function should accept a **kwargs argument, which may be used to extract useful information: group = kwargs.get("__by__", {}) # e.g. chromosome = group.get("Chromosome", None) # e.g. strand = group.get("Strand", "+") @@ -614,6 +620,12 @@ def apply_pair( # type: ignore[override] by : str or list of str or None Additional columns - in addition to chromosome and strand - to group by. + copy_self : bool, default False + Whether to copy the first PyRanges before operations. Use False if copied beforehand. + + preserve_order : bool, default False + Preserve the order of the intervals in the first PyRanges in output. + kwargs : dict Other arguments passed along to the function. @@ -622,17 +634,30 @@ def apply_pair( # type: ignore[override] by = arg_to_list(by) if strand_behavior == STRAND_BEHAVIOR_OPPOSITE: - self[TEMP_STRAND_COL] = self[STRAND_COL].replace({"+": "-", "-": "+"}) - other[TEMP_STRAND_COL] = other[STRAND_COL] + # swap strands in STRAND_COL, but keep original values in TEMP_STRAND_COL + self = mypy_ensure_pyranges( + self.assign(**{TEMP_STRAND_COL: self[STRAND_COL]}).assign( + **{ + STRAND_COL: self[STRAND_COL].replace( + {FORWARD_STRAND: REVERSE_STRAND, REVERSE_STRAND: FORWARD_STRAND}, + ), + }, + ), + ) grpby_ks = group_keys_from_validated_strand_behavior(strand_behavior, by=by) res = mypy_ensure_pyranges(super().apply_pair(other, function, by=grpby_ks, **kwargs)) if strand_behavior == STRAND_BEHAVIOR_OPPOSITE: - res = res.drop_and_return(TEMP_STRAND_COL, axis="columns") + # swap back strands in STRAND_COL + res = res.drop_and_return(STRAND_COL, axis="columns").rename(columns={TEMP_STRAND_COL: STRAND_COL}) - return res + if preserve_order: + common_index = self.index.intersection(res.index) + res = res.reindex(common_index) + + return mypy_ensure_pyranges(res) def boundaries( self, @@ -767,9 +792,9 @@ def cluster( If provided, only intervals with an equal value in column(s) `match_by` may be considered as overlapping. slack : int, default 0 - Consider intervals separated by less than `slack` to be in the same cluster. If `slack` - is negative, intervals overlapping less than `slack` are not considered to be in the - same cluster. + Length by which the criteria of overlap are loosened. + A value of 1 clusters also bookended intervals. + Higher slack values cluster more distant intervals (with a maximum distance of slack-1 between them). cluster_column: Name the cluster column added in output. Default: "Cluster" @@ -782,83 +807,116 @@ def cluster( PyRanges PyRanges with an ID-column "Cluster" added. - Warning - ------- - - Bookended intervals (i.e. the End of a PyRanges interval is the Start of - another one) are by default considered to overlap. - Avoid this with slack=-1. - See Also -------- PyRanges.merge: combine overlapping intervals into one Examples -------- - >>> gr = pr.PyRanges({"Chromosome": [1, 1, 2, 1, 1], "Start": [1, 2, 0, 3, 9], - ... "End": [3, 3, 4, 10, 12], "Gene": [1, 2, 6, 3, 3]}) + >>> gr = pr.PyRanges(dict(Chromosome=1, Start=[5, 6, 12, 16, 20, 22, 24], End=[9, 8, 16, 18, 23, 25, 27])) >>> gr - index | Chromosome Start End Gene - int64 | int64 int64 int64 int64 - ------- --- ------------ ------- ------- ------- - 0 | 1 1 3 1 - 1 | 1 2 3 2 - 2 | 2 0 4 6 - 3 | 1 3 10 3 - 4 | 1 9 12 3 - PyRanges with 5 rows, 4 columns, and 1 index columns. - Contains 2 chromosomes. + index | Chromosome Start End + int64 | int64 int64 int64 + ------- --- ------------ ------- ------- + 0 | 1 5 9 + 1 | 1 6 8 + 2 | 1 12 16 + 3 | 1 16 18 + 4 | 1 20 23 + 5 | 1 22 25 + 6 | 1 24 27 + PyRanges with 7 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. - >>> gr.cluster(cluster_column="ClusterId").sort_ranges() - index | Chromosome Start End Gene ClusterId - int64 | int64 int64 int64 int64 int64 - ------- --- ------------ ------- ------- ------- ----------- - 0 | 1 1 3 1 0 - 1 | 1 2 3 2 0 - 3 | 1 3 10 3 0 - 4 | 1 9 12 3 0 - 2 | 2 0 4 6 1 - PyRanges with 5 rows, 5 columns, and 1 index columns. - Contains 2 chromosomes. + >>> gr.cluster() + index | Chromosome Start End Cluster + int64 | int64 int64 int64 int64 + ------- --- ------------ ------- ------- --------- + 0 | 1 5 9 0 + 1 | 1 6 8 0 + 2 | 1 12 16 1 + 3 | 1 16 18 2 + 4 | 1 20 23 3 + 5 | 1 22 25 3 + 6 | 1 24 27 3 + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes. - >>> gr.cluster(match_by=["Gene"], count_column="Counts") - index | Chromosome Start End Gene Cluster Counts - int64 | int64 int64 int64 int64 int64 int64 - ------- --- ------------ ------- ------- ------- --------- -------- - 0 | 1 1 3 1 0 1 - 1 | 1 2 3 2 1 1 - 2 | 2 0 4 6 2 1 - 3 | 1 3 10 3 3 2 - 4 | 1 9 12 3 3 2 - PyRanges with 5 rows, 6 columns, and 1 index columns. - Contains 2 chromosomes. + Slack=1 will cluster also bookended intervals: + + >>> gr.cluster(slack=1) + index | Chromosome Start End Cluster + int64 | int64 int64 int64 int64 + ------- --- ------------ ------- ------- --------- + 0 | 1 5 9 0 + 1 | 1 6 8 0 + 2 | 1 12 16 1 + 3 | 1 16 18 1 + 4 | 1 20 23 2 + 5 | 1 22 25 2 + 6 | 1 24 27 2 + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes. + + Higher values of slack will cluster more distant intervals: + + >>> gr.cluster(slack=3) + index | Chromosome Start End Cluster + int64 | int64 int64 int64 int64 + ------- --- ------------ ------- ------- --------- + 0 | 1 5 9 0 + 1 | 1 6 8 0 + 2 | 1 12 16 1 + 3 | 1 16 18 1 + 4 | 1 20 23 1 + 5 | 1 22 25 1 + 6 | 1 24 27 1 + PyRanges with 7 rows, 4 columns, and 1 index columns. + Contains 1 chromosomes. + + >>> gr.cluster(slack=3, count_column="Count") + index | Chromosome Start End Cluster Count + int64 | int64 int64 int64 int64 int64 + ------- --- ------------ ------- ------- --------- ------- + 0 | 1 5 9 0 2 + 1 | 1 6 8 0 2 + 2 | 1 12 16 1 5 + 3 | 1 16 18 1 5 + 4 | 1 20 23 1 5 + 5 | 1 22 25 1 5 + 6 | 1 24 27 1 5 + PyRanges with 7 rows, 5 columns, and 1 index columns. + Contains 1 chromosomes. """ from pyranges.methods.cluster import _cluster if not len(self): # returning empty PyRanges with consistent columns - cols_to_add = {cluster_column: None} + cols_to_add = {cluster_column: 0} if count_column: - cols_to_add[count_column] = None + cols_to_add[count_column] = 0 return mypy_ensure_pyranges(self.copy().assign(**cols_to_add)) - strand = validate_and_convert_use_strand(self, use_strand=use_strand) _self = mypy_ensure_pyranges(self.sort_values(START_COL)) - _by = [match_by] if isinstance(match_by, str) else ([*match_by] if match_by is not None else []) + use_strand = validate_and_convert_use_strand(_self, use_strand) + gr = _self.apply_single( _cluster, by=match_by, - use_strand=strand, + use_strand=use_strand, slack=slack, count_column=count_column, cluster_column=cluster_column, preserve_index=True, ) + by_split_groups = ( + (CHROM_AND_STRAND_COLS if use_strand else [CHROM_COL]) + arg_to_list(match_by) + [cluster_column] + ) gr = gr.reindex(self.index) - gr[cluster_column] = gr.groupby(self.loc_columns + _by + [cluster_column], sort=False).ngroup() + gr[cluster_column] = gr.groupby(by_split_groups, sort=False).ngroup() return mypy_ensure_pyranges(gr) def copy(self, *args, **kwargs) -> "pr.PyRanges": @@ -994,7 +1052,7 @@ def count_overlaps( strand_behavior = validate_and_convert_strand_behavior(self, other, strand_behavior) if coverage_col != "CoverageOverlaps" and not calculate_coverage: - msg = "coverage_col can only be provided if calculate_coverage is True." + msg = "coverage_col can only be provided with calculate_coverage=True." raise ValueError(msg) if slack and calculate_coverage: @@ -1036,7 +1094,6 @@ def count_overlaps( overlap_col=overlap_col, skip_if_empty=not keep_nonoverlapping, ) - if slack and len(result) > 0: result[START_COL] = result[TEMP_START_SLACK_COL] result[END_COL] = result[TEMP_END_SLACK_COL] @@ -1304,14 +1361,11 @@ def join_ranges( Report amount of overlap in base pairs. slack : int, default 0 - Temporarily lengthen intervals in self before joining. + Before joining, temporarily extend intervals in self by this much on both ends. suffix : str or tuple, default "_b" Suffix to give overlapping columns in other. - apply_strand_suffix : bool, default None - If first pyranges is unstranded, but the second is not, the first will be given a strand column. - apply_strand_suffix makes the added strand column a regular data column instead by adding a suffix. Returns ------- @@ -1321,11 +1375,10 @@ def join_ranges( Note ---- - The chromosome from other will never be reported as it is always the same as in self. + The indices of the two input PyRanges are not preserved in output. + The chromosome column from other will never be reported as it is always the same as in self. + Whether the strand column from other is reported depends on the strand_behavior. - As pandas did not have NaN for non-float datatypes until recently, "left" and "right" join - give non-overlapping rows the value -1 to avoid promoting columns to object. This will - change to NaN in a future version as general NaN becomes stable in pandas. See Also -------- @@ -1360,7 +1413,7 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b int64 | object int64 int64 object int64 int64 object ------- --- ------------ ------- ------- --------- --------- ------- -------- - 2 | chr1 5 7 interval2 6 7 b + 0 | chr1 5 7 interval2 6 7 b PyRanges with 1 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. @@ -1370,9 +1423,9 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b int64 | object int64 int64 object float64 float64 object ------- --- ------------ ------- ------- --------- --------- --------- -------- - 2 | chr1 5 7 interval2 6 7 b 0 | chr1 3 6 interval1 nan nan nan 1 | chr1 8 9 interval3 nan nan nan + 2 | chr1 5 7 interval2 6 7 b PyRanges with 3 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. @@ -1380,14 +1433,14 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b int64 | object float64 float64 object float64 float64 object ------- --- ------------ --------- --------- --------- --------- --------- -------- - 1 | chr1 5 7 interval2 6 7 b 0 | chr1 3 6 interval1 nan nan nan 1 | chr1 8 9 interval3 nan nan nan - 0 | nan nan nan nan 1 2 a - PyRanges with 4 rows, 7 columns, and 1 index columns (with 2 index duplicates). + 2 | chr1 5 7 interval2 6 7 b + 3 | nan nan nan nan 1 2 a + PyRanges with 4 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. Invalid ranges: - * 1 starts or ends are nan. See indexes: 0 + * 1 starts or ends are nan. See indexes: 3 >>> gr = pr.PyRanges({"Chromosome": ["chr1", "chr2", "chr1", "chr3"], "Start": [1, 4, 10, 0], @@ -1420,9 +1473,9 @@ def join_ranges( int64 | object int64 int64 object int64 int64 object ------- --- ------------ ------- ------- -------- --------- ------- -------- 0 | chr1 1 3 a 1 10 c - 0 | chr1 1 3 a 2 9 b - 0 | chr1 1 3 a 2 3 a - PyRanges with 3 rows, 7 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 1 3 a 2 9 b + 2 | chr1 1 3 a 2 3 a + PyRanges with 3 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. >>> gr.join_ranges(gr2, match_by="ID") @@ -1438,12 +1491,12 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b int64 | object float64 float64 object int64 int64 object ------- --- ------------ --------- --------- --------- --------- ------- -------- - 1 | chr1 5 7 interval2 6 7 b - 0 | nan nan nan nan 1 2 a + 0 | chr1 5 7 interval2 6 7 b + 1 | nan nan nan nan 1 2 a PyRanges with 2 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. Invalid ranges: - * 1 starts or ends are nan. See indexes: 0 + * 1 starts or ends are nan. See indexes: 1 >>> f2.join_ranges(bad) # bad.join_ranges(f2) would not work either. Traceback (most recent call last): @@ -1457,7 +1510,7 @@ def join_ranges( int64 | object int64 int64 object int64 int64 object ------- --- ------------ ------- ------- --------- --------- ------- -------- 0 | chr1 3 6 interval1 6 7 b - 2 | chr1 5 7 interval2 6 7 b + 1 | chr1 5 7 interval2 6 7 b PyRanges with 2 rows, 7 columns, and 1 index columns. Contains 1 chromosomes. @@ -1465,7 +1518,7 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b Overlap int64 | object int64 int64 object int64 int64 object int64 ------- --- ------------ ------- ------- --------- --------- ------- -------- --------- - 2 | chr1 5 7 interval2 6 7 b 1 + 0 | chr1 5 7 interval2 6 7 b 1 PyRanges with 1 rows, 8 columns, and 1 index columns. Contains 1 chromosomes. @@ -1474,7 +1527,7 @@ def join_ranges( index | Chromosome Start End Name Start_b End_b Name_b Overlap int64 | object int64 int64 object int64 int64 object int64 ------- --- ------------ ------- ------- --------- --------- ------- -------- --------- - 2 | chr1 5 7 interval2 6 7 b 1 + 0 | chr1 5 7 interval2 6 7 b 1 PyRanges with 1 rows, 8 columns, and 1 index columns. Contains 1 chromosomes. @@ -1485,10 +1538,10 @@ def join_ranges( int64 | object int64 int64 object int64 int64 object int64 ------- --- ------------ ------- ------- --------- --------- ------- -------- --------- 0 | chr1 3 6 interval1 1 2 a -1 - 0 | chr1 3 6 interval1 6 7 b 0 - 1 | chr1 8 9 interval3 6 7 b -1 - 2 | chr1 5 7 interval2 6 7 b 1 - PyRanges with 4 rows, 8 columns, and 1 index columns (with 1 index duplicates). + 1 | chr1 3 6 interval1 6 7 b 0 + 2 | chr1 8 9 interval3 6 7 b -1 + 3 | chr1 5 7 interval2 6 7 b 1 + PyRanges with 4 rows, 8 columns, and 1 index columns. Contains 1 chromosomes. """ @@ -1498,10 +1551,10 @@ def join_ranges( if slack: _self[TEMP_START_SLACK_COL] = _self.Start _self[TEMP_END_SLACK_COL] = _self.End - _self = _self.extend(slack, use_strand=False) + _self[TEMP_ID_COL] = np.arange(len(_self)) - gr: pd.DataFrame | PyRanges = _self.apply_pair( + gr: PyRanges = _self.apply_pair( other, _both_dfs, strand_behavior=strand_behavior, @@ -1514,13 +1567,14 @@ def join_ranges( # even if empty, make sure the object returned has consistent columns if gr.empty: use_strand = use_strand_from_validated_strand_behavior(self, other, strand_behavior) - empty_with_cols = self.head(0).merge( - other.head(0), - how="inner", - suffixes=("", suffix), - on=(([CHROM_COL, STRAND_COL] if use_strand else [CHROM_COL]) + arg_to_list(match_by)), + gr = mypy_ensure_pyranges( + self.head(0).merge( + other.head(0), + how="inner", + suffixes=("", suffix), + on=(([CHROM_COL, STRAND_COL] if use_strand else [CHROM_COL]) + arg_to_list(match_by)), + ), ) - return mypy_ensure_pyranges(empty_with_cols) if slack and len(gr) > 0: gr[START_COL] = gr[TEMP_START_SLACK_COL] @@ -1530,7 +1584,10 @@ def join_ranges( if report_overlap: gr["Overlap"] = gr[["End", "End" + suffix]].min(axis=1) - gr[["Start", "Start" + suffix]].max(axis=1) - return gr + # preserving order. I can't use the reindex syntax as in other functions because of potential duplicates + gr = mypy_ensure_pyranges(gr.sort_values(TEMP_ID_COL).reset_index(drop=True)) + + return gr.drop_and_return(TEMP_ID_COL, axis=1) @property def length(self) -> int: @@ -1629,7 +1686,9 @@ def max_disjoint( The default "auto" means True if PyRanges has valid strands (see .strand_valid). slack : int, default 0 - Consider intervals within a distance of slack to be overlapping. + Length by which the criteria of overlap are loosened. + A value of 1 implies that bookended intervals are considered overlapping. + Higher slack values allow more distant intervals (with a maximum distance of slack-1 between them). match_by : str or list, default None If provided, only intervals with an equal value in column(s) `match_by` may be considered as overlapping. @@ -1637,13 +1696,13 @@ def max_disjoint( Returns ------- PyRanges - PyRanges with maximal disjoint set of intervals. See Also -------- PyRanges.merge_overlaps : merge intervals into non-overlapping superintervals PyRanges.split : split intervals into non-overlapping subintervals + PyRanges.cluster : annotate overlapping intervals with common ID Examples -------- @@ -1689,6 +1748,9 @@ def merge_overlaps( ) -> "PyRanges": """Merge overlapping intervals into one. + Returns a PyRanges with superintervals that are the union of overlapping intervals. + Bookended intervals are merged by default. + Parameters ---------- use_strand: {"auto", True, False}, default: "auto" @@ -1710,8 +1772,7 @@ def merge_overlaps( Returns ------- PyRanges - - PyRanges with superintervals. + PyRanges with superintervals. Metadata columns, index, and order are not preserved. Note ---- @@ -1721,6 +1782,8 @@ def merge_overlaps( See Also -------- PyRanges.cluster : annotate overlapping intervals with common ID + PyRanges.max_disjoint : find the maximal disjoint set of intervals + PyRanges.split : split intervals into non-overlapping subintervals Examples -------- @@ -1771,11 +1834,12 @@ def merge_overlaps( def nearest( self, other: "PyRanges", - strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "ignore", + strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "auto", direction: VALID_NEAREST_TYPE = "any", *, + match_by: VALID_BY_TYPES = None, suffix: str = JOIN_SUFFIX, - overlap: bool = True, + exclude_overlaps: bool = False, ) -> "PyRanges": """Find closest interval. @@ -1791,12 +1855,14 @@ def nearest( information. The default, "auto", means use "same" if both PyRanges are stranded (see .strand_valid) otherwise ignore the strand information. - overlap : bool, default True - Whether to include overlaps. + exclude_overlaps : bool, default True + Whether to not report intervals of others that overlap with self as the nearest ones. direction : {"any", "upstream", "downstream"}, default "any", i.e. both directions - Whether to only look for nearest in one direction. Always with respect to the PyRanges - it is called on. + Whether to only look for nearest in one direction. + + match_by : str or list, default None + If provided, only intervals with an equal value in column(s) `match_by` may be matched. suffix : str, default "_b" Suffix to give columns with shared name in other. @@ -1824,56 +1890,117 @@ def nearest( PyRanges with 3 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. - >>> f2 = pr.example_data.f2.remove_nonloc_columns() + >>> f2 = pr.PyRanges(dict(Chromosome="chr1", Start=[1, 6, 20], End=[2, 7, 22], Strand=["+", "-", "+"])) >>> f2 index | Chromosome Start End Strand - int64 | category int64 int64 category - ------- --- ------------ ------- ------- ---------- + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- 0 | chr1 1 2 + 1 | chr1 6 7 - - PyRanges with 2 rows, 4 columns, and 1 index columns. + 2 | chr1 20 22 + + PyRanges with 3 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. >>> f1.nearest(f2) + index | Chromosome Start End Strand Start_b End_b Distance + int64 | category int64 int64 category int64 int64 int64 + ------- --- ------------ ------- ------- ---------- --------- ------- ---------- + 0 | chr1 3 6 + 1 2 2 + 1 | chr1 5 7 - 6 7 0 + 2 | chr1 8 9 + 1 2 7 + PyRanges with 3 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + >>> f1.nearest(f2, strand_behavior='ignore') index | Chromosome Start End Strand Start_b End_b Strand_b Distance - int64 | category int64 int64 category int64 int64 category int64 + int64 | category int64 int64 category int64 int64 object int64 ------- --- ------------ ------- ------- ---------- --------- ------- ---------- ---------- - 1 | chr1 5 7 - 6 7 - 0 0 | chr1 3 6 + 6 7 - 1 - 1 | chr1 8 9 + 6 7 - 2 - PyRanges with 3 rows, 8 columns, and 1 index columns (with 1 index duplicates). + 1 | chr1 5 7 - 6 7 - 0 + 2 | chr1 8 9 + 6 7 - 2 + PyRanges with 3 rows, 8 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. - >>> f1.nearest(f2, direction="upstream") + >>> f1.nearest(f2, strand_behavior='ignore', exclude_overlaps=True) index | Chromosome Start End Strand Start_b End_b Strand_b Distance - int64 | category int64 int64 category int64 int64 category int64 + int64 | category int64 int64 category int64 int64 object int64 ------- --- ------------ ------- ------- ---------- --------- ------- ---------- ---------- - 1 | chr1 5 7 - 6 7 - 0 - 0 | chr1 3 6 + 1 2 + 2 - 1 | chr1 8 9 + 6 7 - 2 - PyRanges with 3 rows, 8 columns, and 1 index columns (with 1 index duplicates). + 0 | chr1 3 6 + 6 7 - 1 + 1 | chr1 5 7 - 1 2 + 4 + 2 | chr1 8 9 + 6 7 - 2 + PyRanges with 3 rows, 8 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. + >>> f1.nearest(f2, direction='downstream') + index | Chromosome Start End Strand Start_b End_b Distance + int64 | category int64 int64 category int64 int64 int64 + ------- --- ------------ ------- ------- ---------- --------- ------- ---------- + 0 | chr1 3 6 + 20 22 15 + 1 | chr1 5 7 - 6 7 0 + 2 | chr1 8 9 + 20 22 12 + PyRanges with 3 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 2 strands. + + If an input interval has no suitable nearest interval, these rows are dropped: + + >>> f1.nearest(f2, direction='upstream', exclude_overlaps=True) + index | Chromosome Start End Strand Start_b End_b Distance + int64 | category int64 int64 category float64 float64 int64 + ------- --- ------------ ------- ------- ---------- --------- --------- ---------- + 0 | chr1 3 6 + 1 2 2 + 2 | chr1 8 9 + 1 2 7 + PyRanges with 2 rows, 7 columns, and 1 index columns. + Contains 1 chromosomes and 1 strands. + """ from pyranges.methods.nearest import _nearest - if direction in {NEAREST_UPSTREAM, NEAREST_DOWNSTREAM} and not other.strand_valid: - msg = "If doing upstream or downstream nearest, other pyranges must be stranded" + strand_behavior = validate_and_convert_strand_behavior(self, other, strand_behavior) + if direction in {NEAREST_UPSTREAM, NEAREST_DOWNSTREAM} and strand_behavior == STRAND_BEHAVIOR_IGNORE: + msg = "For upstream or downstream nearest, strands must be valid and strand_behavior cannot be 'ignore'" raise AssertionError(msg) - return self.apply_pair( - other, - _nearest, - strand_behavior=strand_behavior, - how=direction, - overlap=overlap, - suffix=suffix, + if strand_behavior == STRAND_BEHAVIOR_OPPOSITE: + # swap strands in STRAND_COL, but keep original values in TEMP_STRAND_COL + self = mypy_ensure_pyranges( + self.assign(**{TEMP_STRAND_COL: self[STRAND_COL]}).assign( + **{ + STRAND_COL: self[STRAND_COL].replace( + {FORWARD_STRAND: REVERSE_STRAND, REVERSE_STRAND: FORWARD_STRAND}, + ), + }, + ), + ) + # switch direction + if direction == NEAREST_UPSTREAM: + direction = NEAREST_DOWNSTREAM + elif direction == NEAREST_DOWNSTREAM: + direction = NEAREST_UPSTREAM + + res = mypy_ensure_pyranges( + self.apply_pair( + other, + _nearest, + strand_behavior=strand_behavior, + how=direction, + by=match_by, + overlap=not exclude_overlaps, + suffix=suffix, + preserve_order=True, + ), ) + if strand_behavior == STRAND_BEHAVIOR_OPPOSITE: + # swap back strands in STRAND_COL + res = res.drop_and_return(STRAND_COL, axis="columns").rename(columns={TEMP_STRAND_COL: STRAND_COL}) + + return mypy_ensure_pyranges(res) + def overlap( # type: ignore[override] self, other: "PyRanges", strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "auto", + slack: int = 0, *, match_by: VALID_BY_TYPES = None, invert: bool = False, @@ -1893,6 +2020,11 @@ def overlap( # type: ignore[override] information. The default, "auto", means use "same" if both PyRanges are stranded (see .strand_valid) otherwise ignore the strand information. + slack : int, default 0 + Intervals in self are temporarily extended by slack on both ends before overlap is calculated, so that + we allow non-overlapping intervals to be considered overlapping if they are within less than slack distance + e.g. slack=1 reports bookended intervals. + invert : bool, default False Whether to return the intervals without overlaps. @@ -1928,6 +2060,18 @@ def overlap( # type: ignore[override] PyRanges with 3 rows, 4 columns, and 1 index columns. Contains 2 chromosomes. + >>> gr.overlap(gr2, slack=2) + index | Chromosome Start End ID + int64 | object int64 int64 object + ------- --- ------------ ------- ------- -------- + 0 | chr1 1 3 A + 1 | chr1 1 3 a + 2 | chr1 10 11 c + 3 | chr2 4 9 b + PyRanges with 4 rows, 4 columns, and 1 index columns. + Contains 2 chromosomes. + + >>> gr.overlap(gr2, contained=True) index | Chromosome Start End ID int64 | object int64 int64 object @@ -1980,16 +2124,30 @@ def overlap( # type: ignore[override] how = OVERLAP_CONTAINMENT if contained else OVERLAP_FIRST - return self.apply_pair( + _self = self.copy() + if slack: + _self[TEMP_START_SLACK_COL] = _self.Start + _self[TEMP_END_SLACK_COL] = _self.End + _self = _self.extend(slack, use_strand=False) + + gr = _self.apply_pair( other, _overlap, strand_behavior=strand_behavior, by=match_by, + preserve_order=True, invert=invert, skip_if_empty=False if invert else SKIP_IF_EMPTY_LEFT, how=how, ) + if slack and len(gr) > 0: + gr[START_COL] = gr[TEMP_START_SLACK_COL] + gr[END_COL] = gr[TEMP_END_SLACK_COL] + gr = gr.drop_and_return([TEMP_START_SLACK_COL, TEMP_END_SLACK_COL], axis=1) + + return mypy_ensure_pyranges(gr) + def set_intersect( self, other: "PyRanges", @@ -2012,20 +2170,22 @@ def set_intersect( multiple : {"all", "first", "last"}, default "all" What to report when multiple merged intervals in 'other' overlap with the same merged interval in self. - The default "all" reports all overlapping subintervals, which will have duplicate indices. + The default "all" reports all overlapping subintervals. "first" reports only, for each merged self interval, the overlapping 'other' subinterval with smallest Start "last" reports only the overlapping subinterval with the biggest End in 'other' Returns ------- PyRanges - - A PyRanges with overlapping subintervals. + A PyRanges with overlapping subintervals. Input index is not preserved. + No columns other than Chromosome, Start, End, and optionally Strand are returned. See Also -------- + PyRanges.set_union : set-theoretical union PyRanges.intersect : find overlapping subintervals PyRanges.overlap : report overlapping intervals + PyRanges.merge_overlaps : merge overlapping intervals Examples -------- @@ -2091,6 +2251,9 @@ def set_intersect( def set_union(self, other: "PyRanges", strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "auto") -> "PyRanges": """Return set-theoretical union. + Returns the regions present in either self or other. + Both PyRanges are merged first. + Parameters ---------- other : PyRanges @@ -2104,13 +2267,14 @@ def set_union(self, other: "PyRanges", strand_behavior: VALID_STRAND_BEHAVIOR_TY Returns ------- PyRanges - - A PyRanges with the union of intervals. + A PyRanges with the union of intervals. Input index is not preserved. + No columns other than Chromosome, Start, End, and optionally Strand are returned. See Also -------- PyRanges.set_intersect : set-theoretical intersection PyRanges.overlap : report overlapping intervals + PyRanges.merge_overlaps : merge overlapping intervals Examples -------- @@ -2140,11 +2304,24 @@ def set_union(self, other: "PyRanges", strand_behavior: VALID_STRAND_BEHAVIOR_TY >>> gr.set_union(gr2) index | Chromosome Start End int64 | object int64 int64 + ------- --- ------------ ------- ------- + 0 | chr1 1 9 + 1 | chr1 9 10 + 2 | chr1 10 11 + PyRanges with 3 rows, 3 columns, and 1 index columns. + Contains 1 chromosomes. + + Merging bookended intervals: + + >>> gr.set_union(gr2).merge_overlaps(slack=1) + index | Chromosome Start End + int64 | object int64 int64 ------- --- ------------ ------- ------- 0 | chr1 1 11 PyRanges with 1 rows, 3 columns, and 1 index columns. Contains 1 chromosomes. + """ if self.empty and other.empty: return mypy_ensure_pyranges(self.copy()) @@ -2155,6 +2332,16 @@ def set_union(self, other: "PyRanges", strand_behavior: VALID_STRAND_BEHAVIOR_TY if not use_strand: self = self.remove_strand() other = other.remove_strand() + elif strand_behavior == STRAND_BEHAVIOR_OPPOSITE: + other = mypy_ensure_pyranges( + other.assign( + **{ + STRAND_COL: other[STRAND_COL].replace( + {FORWARD_STRAND: REVERSE_STRAND, REVERSE_STRAND: FORWARD_STRAND}, + ), + }, + ), + ) gr = pr.concat([self, other]) @@ -2574,7 +2761,10 @@ def split( See Also -------- - pyranges.multioverlap : find overlaps with multiple PyRanges + PyRanges.merge_overlaps : merge overlapping intervals + PyRanges.max_disjoint : find the maximal disjoint set of intervals + PyRanges.split : split intervals into non-overlapping subintervals + Examples -------- @@ -2975,6 +3165,12 @@ def subtract_ranges( match_by : str or list, default None If provided, only intervals with an equal value in column(s) `match_by` may be considered as overlapping. + Returns + ------- + PyRanges + PyRanges with subintervals from self that do not overlap with any interval in other. + Columns and index are preserved. + Warning ------- The returned Pyranges may have index duplicates. Call .reset_index(drop=True) to fix it. @@ -3052,6 +3248,7 @@ def subtract_ranges( """ from pyranges.methods.subtraction import _subtraction + strand_behavior = validate_and_convert_strand_behavior(self, other, strand_behavior) use_strand = use_strand_from_validated_strand_behavior(self, other, strand_behavior) other_clusters = other.merge_overlaps(use_strand=use_strand, match_by=match_by) @@ -3064,6 +3261,7 @@ def subtract_ranges( overlap_col=TEMP_NUM_COL, match_by=grpby_ks, ) + gr[TEMP_ID_COL] = np.arange(len(gr)) result = gr.apply_pair( other_clusters, @@ -3073,7 +3271,10 @@ def subtract_ranges( skip_if_empty=False, ) - return result.drop_and_return(TEMP_NUM_COL, axis=1) + return mypy_ensure_pyranges(result.sort_values(TEMP_ID_COL)).drop_and_return( + [TEMP_NUM_COL, TEMP_ID_COL], + axis=1, + ) def summary( self, @@ -4205,8 +4406,7 @@ def intersect( # type: ignore[override] Returns ------- PyRanges - - A PyRanges with overlapping intervals. + A PyRanges with overlapping intervals. Input index is preserved, but may contain duplicates. See Also -------- @@ -4275,7 +4475,10 @@ def intersect( # type: ignore[override] # note: argument multiple = 'containment' is formally accepted but omitted in docstring since the result # will be always the same as self.overlap(other, contained=True), no intersect is done in that case - return self.apply_pair( + _self = self.copy() + _self[TEMP_ID_COL] = np.arange(len(_self)) + + gr = _self.apply_pair( other, _intersect, strand_behavior=strand_behavior, @@ -4283,6 +4486,10 @@ def intersect( # type: ignore[override] how=multiple, ) + # preserving order. I can't use the reindex syntax as in other functions because of potential duplicates + gr = mypy_ensure_pyranges(gr.sort_values(TEMP_ID_COL)) + return gr.drop_and_return(TEMP_ID_COL, axis=1) + def combine_interval_columns( self, function: VALID_COMBINE_OPTIONS | CombineIntervalColumnsOperation = "intersect", @@ -4324,12 +4531,12 @@ def combine_interval_columns( index | Chromosome Start End Strand Start_b End_b int64 | category int64 int64 category int64 int64 ------- --- ------------ ------- ------- ---------- --------- ------- - 1 | chr1 9939 10138 + 10073 10272 0 | chr1 9916 10115 - 9988 10187 - 0 | chr1 9916 10115 - 10079 10278 - 2 | chr1 9951 10150 - 9988 10187 - 2 | chr1 9951 10150 - 10079 10278 - PyRanges with 5 rows, 6 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 9916 10115 - 10079 10278 + 2 | chr1 9939 10138 + 10073 10272 + 3 | chr1 9951 10150 - 9988 10187 + 4 | chr1 9951 10150 - 10079 10278 + PyRanges with 5 rows, 6 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. The default operation is to intersect the intervals: @@ -4338,12 +4545,12 @@ def combine_interval_columns( index | Chromosome Start End Strand int64 | category int64 int64 category ------- --- ------------ ------- ------- ---------- - 1 | chr1 10073 10138 + 0 | chr1 9988 10115 - - 0 | chr1 10079 10115 - - 2 | chr1 9988 10150 - - 2 | chr1 10079 10150 - - PyRanges with 5 rows, 4 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 10079 10115 - + 2 | chr1 10073 10138 + + 3 | chr1 9988 10150 - + 4 | chr1 10079 10150 - + PyRanges with 5 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. Take the union instead: @@ -4352,24 +4559,24 @@ def combine_interval_columns( index | Chromosome Start End Strand int64 | category int64 int64 category ------- --- ------------ ------- ------- ---------- - 1 | chr1 9939 10272 + 0 | chr1 9916 10187 - - 0 | chr1 9916 10278 - - 2 | chr1 9951 10187 - - 2 | chr1 9951 10278 - - PyRanges with 5 rows, 4 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 9916 10278 - + 2 | chr1 9939 10272 + + 3 | chr1 9951 10187 - + 4 | chr1 9951 10278 - + PyRanges with 5 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. >>> j.combine_interval_columns('swap') index | Chromosome Start End Strand int64 | category int64 int64 category ------- --- ------------ ------- ------- ---------- - 1 | chr1 10073 10272 + 0 | chr1 9988 10187 - - 0 | chr1 10079 10278 - - 2 | chr1 9988 10187 - - 2 | chr1 10079 10278 - - PyRanges with 5 rows, 4 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 10079 10278 - + 2 | chr1 10073 10272 + + 3 | chr1 9988 10187 - + 4 | chr1 10079 10278 - + PyRanges with 5 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. @@ -4380,12 +4587,12 @@ def combine_interval_columns( index | Chromosome Start End Strand int64 | category int64 int64 category ------- --- ------------ ------- ------- ---------- - 1 | chr1 9939 10272 + 0 | chr1 9916 10187 - - 0 | chr1 9916 10278 - - 2 | chr1 9951 10187 - - 2 | chr1 9951 10278 - - PyRanges with 5 rows, 4 columns, and 1 index columns (with 2 index duplicates). + 1 | chr1 9916 10278 - + 2 | chr1 9939 10272 + + 3 | chr1 9951 10187 - + 4 | chr1 9951 10278 - + PyRanges with 5 rows, 4 columns, and 1 index columns. Contains 1 chromosomes and 2 strands. """ diff --git a/pyranges/methods/cluster.py b/pyranges/methods/cluster.py index 1c290bd1..414c5359 100644 --- a/pyranges/methods/cluster.py +++ b/pyranges/methods/cluster.py @@ -14,7 +14,10 @@ def _cluster( if df.empty: return df - ids = annotate_clusters(df[START_COL].to_numpy(), df[END_COL].values, slack=slack) + # important: sorted_nearest interprets slack differently than pyranges + # 0 slack in sorted_nearest means that bookended intervals are clustered + # together, while in pyranges it means that they are not. + ids = annotate_clusters(df[START_COL].to_numpy(), df[END_COL].to_numpy(), slack=slack - 1) df.insert(df.shape[1], cluster_column, ids) diff --git a/pyranges/methods/max_disjoint.py b/pyranges/methods/max_disjoint.py index ba57cd96..e16ae037 100644 --- a/pyranges/methods/max_disjoint.py +++ b/pyranges/methods/max_disjoint.py @@ -9,7 +9,9 @@ def _max_disjoint(df: pd.DataFrame, **kwargs) -> pd.DataFrame: slack = kwargs.get("slack", 0) cdf = df.sort_values("End") - - idx = max_disjoint(cdf.index.values, cdf.Start.values, cdf.End.values, slack) + # important: sorted_nearest interprets slack differently than pyranges + # 0 slack in sorted_nearest means that bookended intervals are clustered + # together, while in pyranges it means that they are not. + idx = max_disjoint(cdf.index.values, cdf.Start.values, cdf.End.values, slack - 1) return cdf.reindex(idx) diff --git a/pyranges/methods/merge.py b/pyranges/methods/merge.py index 95ea247d..f1748839 100644 --- a/pyranges/methods/merge.py +++ b/pyranges/methods/merge.py @@ -13,7 +13,10 @@ def _merge(df: pd.DataFrame, **kwargs) -> pd.DataFrame: cdf = df.sort_values(START_COL) - starts, ends, number = find_clusters(cdf.Start.values, cdf.End.values, slack) + # important: sorted_nearest interprets slack differently than pyranges + # 0 slack in sorted_nearest means that bookended intervals are considered overlapping + # together, while in pyranges it means that they are not. + starts, ends, number = find_clusters(cdf.Start.values, cdf.End.values, slack - 1) by_values = df.head(1).squeeze()[by].to_dict() diff --git a/pyranges/methods/nearest.py b/pyranges/methods/nearest.py index 0f0279b7..0c8e5a17 100644 --- a/pyranges/methods/nearest.py +++ b/pyranges/methods/nearest.py @@ -10,7 +10,15 @@ import pyranges.core.empty from pyranges import PyRanges -from pyranges.core.names import END_COL, START_COL +from pyranges.core.names import ( + BY_ENTRY_IN_KWARGS, + END_COL, + FORWARD_STRAND, + GENOME_LOC_COLS_WITH_STRAND, + REVERSE_STRAND, + START_COL, + STRAND_COL, +) from pyranges.methods.sort import sort_one_by_one if TYPE_CHECKING: @@ -39,7 +47,7 @@ def _insert_distance(df2: pd.DataFrame, dist: "NDArray | int", suffix: str) -> p def _overlapping_for_nearest(df: pd.DataFrame, df2: pd.DataFrame, suffix: str) -> tuple[pd.DataFrame, pd.DataFrame]: - nearest_df = pd.DataFrame(columns="Chromosome Start End Strand".split()) + nearest_df = pd.DataFrame(columns=GENOME_LOC_COLS_WITH_STRAND) it = NCLS(df2.Start.to_numpy(), df2.End.to_numpy(), df2.index.to_numpy()) @@ -103,18 +111,18 @@ def _nearest(df: "DataFrame", df2: "DataFrame", **kwargs) -> pd.DataFrame: if df.empty or df2.empty: return PyRanges() + columns_used_for_by = kwargs.get(BY_ENTRY_IN_KWARGS, {}).keys() overlap = kwargs["overlap"] how = kwargs["how"] suffix = kwargs["suffix"] + strand = kwargs.get(BY_ENTRY_IN_KWARGS, {}).get(STRAND_COL) if how == "upstream": - strand = df.Strand.iloc[0] - how = {"+": "previous", "-": "next"}[strand] + how = {FORWARD_STRAND: "previous", REVERSE_STRAND: "next"}[strand] elif how == "downstream": - strand = df.Strand.iloc[0] - how = {"+": "next", "-": "previous"}[strand] + how = {FORWARD_STRAND: "next", REVERSE_STRAND: "previous"}[strand] - df2 = df2.reset_index(drop=True) + df2 = df2.drop(columns=columns_used_for_by).reset_index(drop=True) if overlap: nearest_df, df_to_find_nearest_in = _overlapping_for_nearest(df, df2, suffix) @@ -124,9 +132,13 @@ def _nearest(df: "DataFrame", df2: "DataFrame", **kwargs) -> pd.DataFrame: df = pyranges.core.empty.empty_df() if not df_to_find_nearest_in.empty: - df_to_find_nearest_in = sort_one_by_one(df_to_find_nearest_in, "Start", "End") - df2 = sort_one_by_one(df2, "Start", "End") - df_to_find_nearest_in.index = pd.Index(range(len(df_to_find_nearest_in))) + df_to_find_nearest_in = sort_one_by_one(df_to_find_nearest_in, START_COL, END_COL) + df2 = sort_one_by_one(df2, START_COL, END_COL) + + # preserving index name + original_index_names = df_to_find_nearest_in.index.names + index_names = [name if name is not None else f"__index{i}" for i, name in enumerate(original_index_names)] + df_to_find_nearest_in = df_to_find_nearest_in.reset_index(names=index_names) if how == "next": r_idx, dist = _next_nonoverlapping(df_to_find_nearest_in.End, df2.Start, df2.index.to_numpy()) @@ -150,9 +162,13 @@ def _nearest(df: "DataFrame", df2: "DataFrame", **kwargs) -> pd.DataFrame: df = df_to_find_nearest_in.join(df2, rsuffix=suffix) - if overlap and "df" in locals() and not df.empty and not nearest_df.empty: + # restoring index and index names + df = df.set_index(index_names) + df.index.names = original_index_names + + if overlap and not df.empty and not nearest_df.empty: df = pd.concat([nearest_df, df], sort=False) elif overlap and not nearest_df.empty: df = nearest_df - return PyRanges(df.drop("Chromosome" + suffix, axis=1)) + return df diff --git a/pyranges/methods/subsequence.py b/pyranges/methods/subsequence.py index 0e61ab1d..d542812e 100644 --- a/pyranges/methods/subsequence.py +++ b/pyranges/methods/subsequence.py @@ -41,7 +41,7 @@ def _subseq( by_argument_given = bool(by) by = by or [TEMP_INDEX_COL] - strand = kwargs.get(BY_ENTRY_IN_KWARGS, {}).get("Strand") + strand = kwargs.get(BY_ENTRY_IN_KWARGS, {}).get(STRAND_COL) # at this point, strand is False if 1. spliced_subsequence was called with use_strand=False or # 2. it was called with use_strand='auto' and not self.valid_strand diff --git a/pyranges/readers.py b/pyranges/readers.py index a14d576b..cd7205ec 100644 --- a/pyranges/readers.py +++ b/pyranges/readers.py @@ -636,6 +636,10 @@ def read_bigwig(f: str | Path) -> "PyRanges": ------- PyRanges + Note + ---- + This function requires the library pyBigWig, it can be installed with pip install pyBigWig + Examples -------- >>> import pyranges as pr diff --git a/tests/unit/test_join.py b/tests/unit/test_join.py index 1be97fa7..5de71f34 100644 --- a/tests/unit/test_join.py +++ b/tests/unit/test_join.py @@ -40,7 +40,7 @@ def test_join_issue_8(): "Start_b": [731, 821], "End_b": [831, 921], }, - index=[3, 3], + index=[0, 1], ) assert j.equals(expected_result) @@ -74,6 +74,6 @@ def test_join_issue_8_right(): "Start_b": [731, 821, 157, 584], "End_b": [831, 921, 257, 684], }, - index=[2, 3, 0, 1], + index=[0, 1, 2, 3], ) assert j.equals(expected_result)