Skip to content

Commit

Permalink
Merge pull request #27 from marco-mariotti/migration_guide
Browse files Browse the repository at this point in the history
count overlaps
  • Loading branch information
marco-mariotti authored May 7, 2024
2 parents bedf5df + aa1e677 commit e46731a
Showing 1 changed file with 46 additions and 12 deletions.
58 changes: 46 additions & 12 deletions pyranges/core/pyranges_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,7 @@ def count_overlaps(
strand_behavior: VALID_STRAND_BEHAVIOR_TYPE = "auto",
*,
match_by: str | list[str] | None = None,
slack: int = 0,
overlap_col: str = "NumberOverlaps",
keep_nonoverlapping: bool = True,
calculate_coverage: bool = False,
Expand All @@ -893,6 +894,9 @@ def count_overlaps(
information. The default, "auto", means use "same" if both PyRanges are stranded (see .strand_valid);
otherwise, ignore the strand information.
slack : int, default 0
Temporarily lengthen intervals in self before searching for overlaps.
keep_nonoverlapping : bool, default True
Keep intervals without overlaps.
Expand Down Expand Up @@ -942,21 +946,31 @@ def count_overlaps(
int64 | category int64 int64 category int64
------- --- ------------ ------- ------- ---------- -------
0 | chr1 3 6 + 0
2 | chr1 8 9 + 0
1 | chr1 5 7 - 1
2 | chr1 8 9 + 0
PyRanges with 3 rows, 5 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
>>> f1.count_overlaps(f2, overlap_col="C", calculate_coverage=True, coverage_col="F")
index | Chromosome Start End Strand F
int64 | category int64 int64 category float64
------- --- ------------ ------- ------- ---------- ---------
0 | chr1 3 6 + 0
>>> f1.count_overlaps(f2, overlap_col="Count", slack=1, strand_behavior="ignore")
index | Chromosome Start End Strand Count
int64 | category int64 int64 category int64
------- --- ------------ ------- ------- ---------- -------
0 | chr1 3 6 + 1
1 | chr1 5 7 - 1
2 | chr1 8 9 + 0
1 | chr1 5 7 - 0.5
PyRanges with 3 rows, 5 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
>>> f1.count_overlaps(f2, overlap_col="C", calculate_coverage=True, coverage_col="F")
index | Chromosome Start End Strand C F
int64 | category int64 int64 category int64 float64
------- --- ------------ ------- ------- ---------- ------- ---------
0 | chr1 3 6 + 0 0
1 | chr1 5 7 - 1 0.5
2 | chr1 8 9 + 0 0
PyRanges with 3 rows, 6 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
>>> annotation = pr.example_data.ensembl_gtf.get_with_loc_columns(['transcript_id', 'Feature'])
>>> reads = pr.random(1000, chromsizes={'1':150000}, strand=False, seed=123)
>>> annotation.count_overlaps(reads)
Expand All @@ -983,7 +997,20 @@ def count_overlaps(
msg = "coverage_col can only be provided if calculate_coverage is True."
raise ValueError(msg)

result = self.apply_pair(
if slack and calculate_coverage:
msg = "calculate_coverage can only be computed with slack=0."
raise ValueError(msg)

if slack:
_self = self.copy()
_self[TEMP_START_SLACK_COL] = _self.Start
_self[TEMP_END_SLACK_COL] = _self.End

_self = _self.extend(slack, use_strand=False)
else:
_self = self

result = _self.apply_pair(
other,
_number_overlapping,
strand_behavior=strand_behavior,
Expand All @@ -999,7 +1026,7 @@ def count_overlaps(
use_strand = use_strand_from_validated_strand_behavior(self, other, strand_behavior)
other = other.merge_overlaps(use_strand=use_strand, match_by=match_by, count_col="Count")

result = self.copy().apply_pair(
result = result.apply_pair(
other,
_coverage,
strand_behavior=strand_behavior,
Expand All @@ -1009,7 +1036,14 @@ def count_overlaps(
overlap_col=overlap_col,
skip_if_empty=not keep_nonoverlapping,
)
return result

if slack and len(result) > 0:
result[START_COL] = result[TEMP_START_SLACK_COL]
result[END_COL] = result[TEMP_END_SLACK_COL]
result = result.drop_and_return([TEMP_START_SLACK_COL, TEMP_END_SLACK_COL], axis=1)

# reindex to original order
return mypy_ensure_pyranges(result.reindex(self.index))

# TODO: optimize; there is no need to split by chromosome, only by strand, and only when ext_3/ext_5 is used
def extend(
Expand Down Expand Up @@ -1270,7 +1304,7 @@ def join_ranges(
Report amount of overlap in base pairs.
slack : int, default 0
Lengthen intervals in self before joining.
Temporarily lengthen intervals in self before joining.
suffix : str or tuple, default "_b"
Suffix to give overlapping columns in other.
Expand Down Expand Up @@ -1465,7 +1499,7 @@ def join_ranges(
_self[TEMP_START_SLACK_COL] = _self.Start
_self[TEMP_END_SLACK_COL] = _self.End

_self = _self.extend(slack)
_self = _self.extend(slack, use_strand=False)

gr: pd.DataFrame | PyRanges = _self.apply_pair(
other,
Expand Down

0 comments on commit e46731a

Please sign in to comment.