Skip to content

Commit

Permalink
added complement function, useful to get introns
Browse files Browse the repository at this point in the history
  • Loading branch information
marco-mariotti committed May 30, 2024
1 parent 7aea65a commit 4640ae7
Showing 1 changed file with 240 additions and 6 deletions.
246 changes: 240 additions & 6 deletions pyranges/core/pyranges_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ def apply_pair( # type: ignore[override]

def boundaries(
self,
transcript_id: str | list[str] | None = None,
transcript_id: VALID_BY_TYPES = None,
agg: dict[str, str | Callable] | None = None,
) -> "PyRanges":
"""Return the boundaries of groups of intervals (e.g. transcripts).
Expand Down Expand Up @@ -1108,7 +1108,7 @@ def extend(
ext: int | None = None,
ext_3: int | None = None,
ext_5: int | None = None,
transcript_id: str | list[str] | None = None,
transcript_id: VALID_BY_TYPES = None,
use_strand: VALID_USE_STRAND_TYPE = "auto",
) -> "PyRanges":
"""Extend the intervals from the 5' and/or 3' ends.
Expand Down Expand Up @@ -1236,7 +1236,7 @@ def extend(

def five_end(
self,
transcript_id: str | list[str] | None = None,
transcript_id: VALID_BY_TYPES = None,
) -> "PyRanges":
"""Return the five prime end of intervals.
Expand Down Expand Up @@ -1749,7 +1749,6 @@ def merge_overlaps(
"""Merge overlapping intervals into one.
Returns a PyRanges with superintervals that are the union of overlapping intervals.
Bookended intervals are merged by default.
Parameters
----------
Expand Down Expand Up @@ -3470,7 +3469,7 @@ def tile(

def three_end(
self,
transcript_id: str | list[str] | None = None,
transcript_id: VALID_BY_TYPES = None,
) -> "PyRanges":
"""Return the 3'-end.
Expand Down Expand Up @@ -4619,6 +4618,240 @@ def combine_interval_columns(

return z.drop_and_return(labels=cols_to_drop, axis="columns")

def complement(
self: "PyRanges",
transcript_id: VALID_BY_TYPES = None,
*,
use_strand: VALID_USE_STRAND_TYPE = USE_STRAND_DEFAULT,
chromsizes: "dict[str | int, int] | PyRanges | None" = None,
) -> "PyRanges":
"""Return the internal complement of the intervals, i.e. its introns.
The complement of an interval is the set of intervals that are not covered by the original interval.
This function is useful for obtaining the introns of a set of exons, corresponding to the
"internal" complement, i.e. excluding the first and last portion of each chromosome not covered by intervals.
Parameters
----------
transcript_id : str or list, default None
Column(s) to group by intervals (i.e. exons). If provided, the complement will be calculated for each group.
use_strand: {"auto", True, False}, default "auto"
Whether to return complement separately for intervals on the positive and negative strand.
The default "auto" means use strand information if present and valid (see .strand_valid)
chromsizes : dict or PyRanges or pyfaidx.Fasta
If provided, external complement intervals will also be returned, i.e. the intervals corresponding to the
beginning of the chromosome up to the first interval and from the last interval to the end of the
chromosome. If transcript_id is provided, these are returned for each group.
Format of chromsizes: dict or PyRanges describing the lengths of the chromosomes.
pyfaidx.Fasta object is also accepted since it conveniently loads chromosome length
Returns
-------
PyRanges
Complement intervals, i.e. introns in the typical use case.
Notes
-----
* To ensure non-overlap among the input intervals, merge_overlaps is run before the complement is calculated.
* Bookended intervals will result in complement intervals returned since they would be of length 0.
Examples
--------
>>> a = pr.PyRanges(dict(Chromosome="chr1", Start=[2, 10, 20, 40], End=[5, 18, 30, 46], ID=['a', 'a', 'b', 'b']))
>>> a
index | Chromosome Start End ID
int64 | object int64 int64 object
------- --- ------------ ------- ------- --------
0 | chr1 2 5 a
1 | chr1 10 18 a
2 | chr1 20 30 b
3 | chr1 40 46 b
PyRanges with 4 rows, 4 columns, and 1 index columns.
Contains 1 chromosomes.
Using complement to get introns:
>>> a.complement('ID')
index | Chromosome Start End ID
int64 | object int64 int64 object
------- --- ------------ ------- ------- --------
0 | chr1 5 10 a
1 | chr1 30 40 b
PyRanges with 2 rows, 4 columns, and 1 index columns.
Contains 1 chromosomes.
Get complement of the whole set of intervals, without grouping:
>>> a.complement()
index | Chromosome Start End
int64 | object int64 int64
------- --- ------------ ------- -------
0 | chr1 5 10
1 | chr1 18 20
2 | chr1 30 40
PyRanges with 3 rows, 3 columns, and 1 index columns.
Contains 1 chromosomes.
Include external intervals:
>>> a.complement(chromsizes={'chr1': 10000})
index | Chromosome Start End
int64 | object int64 int64
------- --- ------------ ------- -------
0 | chr1 0 2
1 | chr1 5 10
2 | chr1 18 20
3 | chr1 30 40
4 | chr1 46 10000
PyRanges with 5 rows, 3 columns, and 1 index columns.
Contains 1 chromosomes.
>>> a.complement('ID', chromsizes={'chr1': 10000})
index | Chromosome Start End ID
int64 | object int64 int64 object
------- --- ------------ ------- ------- --------
0 | chr1 0 2 a
1 | chr1 5 10 a
2 | chr1 18 10000 a
3 | chr1 0 20 b
4 | chr1 30 40 b
5 | chr1 46 10000 b
PyRanges with 6 rows, 4 columns, and 1 index columns.
Contains 1 chromosomes.
For complement of whole sets of intervals, you can explicitly use_strand or not:
>>> b = pr.PyRanges(dict(Chromosome="chr1", Start=[1, 10, 20, 40], End=[5, 18, 30, 46],
... Strand=['+', '+', '-', '-']))
>>> b
index | Chromosome Start End Strand
int64 | object int64 int64 object
------- --- ------------ ------- ------- --------
0 | chr1 1 5 +
1 | chr1 10 18 +
2 | chr1 20 30 -
3 | chr1 40 46 -
PyRanges with 4 rows, 4 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
>>> b.complement(use_strand=True) # same as b.complement() because b.strand_valid == True
index | Chromosome Start End Strand
int64 | object int64 int64 object
------- --- ------------ ------- ------- --------
0 | chr1 5 10 +
1 | chr1 30 40 -
PyRanges with 2 rows, 4 columns, and 1 index columns.
Contains 1 chromosomes and 2 strands.
>>> b.complement(use_strand=False)
index | Chromosome Start End
int64 | object int64 int64
------- --- ------------ ------- -------
0 | chr1 5 10
1 | chr1 18 20
2 | chr1 30 40
PyRanges with 3 rows, 3 columns, and 1 index columns.
Contains 1 chromosomes.
>>> b.complement(use_strand=False, chromsizes={'chr1': 10000})
index | Chromosome Start End
int64 | object int64 int64
------- --- ------------ ------- -------
0 | chr1 0 1
1 | chr1 5 10
2 | chr1 18 20
3 | chr1 30 40
4 | chr1 46 10000
PyRanges with 5 rows, 3 columns, and 1 index columns.
Contains 1 chromosomes.
Bookended intervals (indices 0-1 below) and overlapping intervals (2-3) won't return any in-between intervals:
>>> c = pr.PyRanges(dict(Chromosome="chr1", Start=[1, 5, 8, 10], End=[5, 7, 14, 16]))
>>> c.complement()
index | Chromosome Start End
int64 | object int64 int64
------- --- ------------ ------- -------
0 | chr1 7 8
PyRanges with 1 rows, 3 columns, and 1 index columns.
Contains 1 chromosomes.
"""
use_strand = validate_and_convert_use_strand(self, use_strand)
transcript_id = (
transcript_id if transcript_id is not None else [CHROM_COL] + ([STRAND_COL] if use_strand else [])
)

include_external = chromsizes is not None
if include_external:
if isinstance(chromsizes, pd.DataFrame):
chromsizes = dict(*zip(chromsizes[CHROM_COL], chromsizes[END_COL], strict=True))
elif isinstance(chromsizes, dict):
pass
else: # A hack because pyfaidx might not be installed, but we want type checking anyway
pyfaidx_chromsizes = cast(dict[str | int, list], chromsizes)
chromsizes = {k: len(pyfaidx_chromsizes[k]) for k in pyfaidx_chromsizes.keys()} # noqa: SIM118

## if include external, include a bunch of rows with the Start == chromosome size, End = same +1; for each chromosome

unique_chroms = self[CHROM_COL].drop_duplicates().to_numpy()
# keeping only chromosomes present in self
chromsizes = {k: chromsizes[k] for k in chromsizes if k in unique_chroms}

missing_chroms = {k for k in chromsizes if k not in unique_chroms}
# some chromosomes are not found in chromsize, raise error
if missing_chroms:
msg = (
f"ERROR chromosomes missing from chromsizes provided : {",".join([str(c) for c in missing_chroms])}"
)
raise ValueError(msg)

# adding extra interval which are just out of bounds on the right
end_bits = pr.PyRanges(
{
"Chromosome": list(chromsizes.keys()),
"Start": list(chromsizes.values()),
"End": [x + 1 for x in chromsizes.values()],
},
)

# duplicating them just enough times
cols = list(set(arg_to_list(transcript_id) + [CHROM_COL] + ([STRAND_COL] if use_strand else [])))
end_bits = mypy_ensure_pyranges(end_bits.merge(self[cols].drop_duplicates(), on=[CHROM_COL]))

_self = pr.concat([self, end_bits])

else:
_self = self

introns = _self.merge_overlaps(match_by=transcript_id, use_strand=use_strand).sort_ranges(
by=transcript_id,
use_strand=False,
)

# intron start is exon end shifted
fill_value = 0
introns[END_COL] = introns.groupby(transcript_id, group_keys=False, observed=True)[END_COL].shift(
fill_value=fill_value,
)

# swapping start and end
introns[[START_COL, END_COL]] = introns[[END_COL, START_COL]]

# removing introns resulting from bookended intervals
selector = introns.lengths() != 0

# removing first line of each group
if not include_external:
selector = selector & (introns.groupby(transcript_id).cumcount() != 0)
introns = introns.loc[selector].reset_index(drop=True)

return mypy_ensure_pyranges(introns)

def get_sequence(
self: "PyRanges",
path: Path | None = None,
Expand Down Expand Up @@ -4741,8 +4974,9 @@ def get_sequence(

def get_transcript_sequence(
self: "PyRanges",
transcript_id: str,
transcript_id: str | list[str],
path: Path | None = None,
*,
pyfaidx_fasta: Optional["pyfaidx.Fasta"] = None,
use_strand: VALID_USE_STRAND_TYPE = USE_STRAND_DEFAULT,
) -> pd.DataFrame:
Expand Down

0 comments on commit 4640ae7

Please sign in to comment.