diff --git a/pyranges/core/names.py b/pyranges/core/names.py index 33332099..87a11f32 100644 --- a/pyranges/core/names.py +++ b/pyranges/core/names.py @@ -84,7 +84,7 @@ def wrapper(*args, **kwargs) -> "PyRanges | pd.DataFrame | pd.Series": VALID_JOIN_OPTIONS = [JOIN_INNER, JOIN_LEFT, JOIN_OUTER, JOIN_RIGHT] JOIN_SUFFIX = "_b" -VALID_COMBINE_OPTIONS = Literal["intersect", "union"] +VALID_COMBINE_OPTIONS = Literal["intersect", "union", "swap"] NEAREST_ANY_DIRECTION: Final = "any" NEAREST_UPSTREAM: Final = "upstream" diff --git a/pyranges/core/pyranges_main.py b/pyranges/core/pyranges_main.py index 71cbb459..67c0bb1d 100644 --- a/pyranges/core/pyranges_main.py +++ b/pyranges/core/pyranges_main.py @@ -4261,12 +4261,13 @@ def combine_interval_columns( ) -> "pr.PyRanges": """Use two pairs of columns representing intervals to create a new start and end column. + The function is designed as post-processing after join_ranges to aggregate the coordinates of the two intervals. By default, the new start and end columns will be the intersection of the intervals. Parameters ---------- - function : {"intersect", "union"} or Callable, default "intersect" - How to combine the intervals: "intersect" or "union". + function : {"intersect", "union", "swap"} or Callable, default "intersect" + How to combine the self and other intervals: "intersect", "union", or "swap" If a callable is passed, it should take four Series arguments: start1, end1, start2, end2; and return a tuple of two integers: (new_starts, new_ends). @@ -4325,6 +4326,19 @@ def combine_interval_columns( PyRanges with 5 rows, 4 columns, and 1 index columns (with 2 index duplicates). Contains 1 chromosomes and 2 strands. 
def _swap_interval_columns(
    starts: pd.Series,  # noqa: ARG001
    ends: pd.Series,  # noqa: ARG001
    starts2: pd.Series,
    ends2: pd.Series,
) -> tuple[pd.Series, pd.Series]:
    """Return the second interval's coordinates, discarding the first interval's.

    The first pair of columns is accepted but never read, so that this helper
    can be called with the same four positional arguments as the other
    interval-combining helpers in this module.

    Parameters
    ----------
    starts, ends : pd.Series
        Coordinates of the first interval. Ignored.
    starts2, ends2 : pd.Series
        Coordinates of the second interval.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        ``(starts2, ends2)``: the same objects that were passed in, not copies.
    """
    second_interval = (starts2, ends2)
    return second_interval