From 8ad1d115265133de2a84dad342956813cbc228f0 Mon Sep 17 00:00:00 2001 From: marco-mariotti Date: Wed, 24 Jul 2024 20:38:40 +0200 Subject: [PATCH 1/2] (temporarily?) removed fisher from add-ons dependencies, and minor fixes due to numpy types --- docs/tutorial.rst | 4 ++-- pyproject.toml | 6 +++--- pyranges/ext/stats.py | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index b8a62fd6..5e9e40ca 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -297,7 +297,7 @@ The function :func:`get_sequence ` returns one s The ``Sequence`` column is a pandas Series containing strings. We see that the starting codon is ATG in most cases, as expected. When we check the length of the sequences, we notice that some are not 3-letter long: - >>> (first.Sequence.str.len() == 3 ).all() + >>> bool( (first.Sequence.str.len() == 3 ).all() ) False Let's look at those sequences, using a row selector as before: @@ -345,7 +345,7 @@ i.e. joining exons together. The sequence is given 5' to 3'. ``seq_first`` is not a PyRanges object, but a pandas DataFrame. It has a column for the group (ID) and one for Sequence. Here we confirm the sequence length is always 3: - >>> (seq_first.Sequence.str.len()==3).all() + >>> bool( (seq_first.Sequence.str.len()==3).all() ) True diff --git a/pyproject.toml b/pyproject.toml index f57bc4d9..e134b52d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyranges1" -version = "1.0.2" +version = "1.0.3" description = "GenomicRanges for Python." 
requires-python = ">=3.12.0" readme = "README.md" @@ -27,7 +27,7 @@ keywords = ["bioinformatics", "genomicranges", "genomics"] dependencies = ["pandas", "ncls>=0.0.63", "tabulate", "sorted_nearest>=0.0.33", "natsort"] [project.optional-dependencies] -add-ons = ["pyrle >= 0.0.39", "bamread", "fisher", "pyfaidx", "pyBigWig", "joblib"] +add-ons = ["pyrle >= 0.0.39", "bamread", "pyfaidx", "pyBigWig", "joblib"] #"fisher", dev = ["tox", "ruff == 0.3.0", "pyright", "pandas-stubs", "types-tabulate", "pytest-watcher", "pytest-xdist", "hypothesis>=6.92.1"] docs = ["sphinx", "sphinx_rtd_theme", "sphinx-autoapi", "sphinxcontrib-napoleon"] all = ["pyranges1[add-ons]", "pyranges1[dev]", "pyranges1[docs]"] @@ -50,7 +50,6 @@ envlist = py312 deps = pyrle >= 0.0.39 bamread - fisher pyBigWig pyfaidx tox @@ -61,6 +60,7 @@ deps = pyright joblib hypothesis==6.92.1 + #fisher commands = python tests/run_doctest_tutorial_howto.py pytest --doctest-modules pyranges diff --git a/pyranges/ext/stats.py b/pyranges/ext/stats.py index 6a6dfc01..d7c7ca82 100644 --- a/pyranges/ext/stats.py +++ b/pyranges/ext/stats.py @@ -93,9 +93,9 @@ def _find_chromosome_max_end_positions(grs: list["PyRanges"]) -> pd.DataFrame: Examples -------- >>> f1, f2 = pr.example_data.f1, pr.example_data.f2 # both only have chr1 - >>> f1["End"].max() + >>> int(f1["End"].max()) 9 - >>> f2["End"].max() + >>> int(f2["End"].max()) 7 """ @@ -220,7 +220,7 @@ def fisher_exact(tp: Series, fp: Series, fn: Series, tn: Series, pseudocount: in 0 12 5 29 2 1 0 12 10 2 - >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN) + >>> pr.stats.fisher_exact(df.TP, df.FP, df.TN, df.FN) # doctest: +SKIP odds_ratio P PLeft PRight 0 0.165517 0.080269 0.044555 0.994525 1 0.000000 0.000067 0.000034 1.000000 @@ -726,7 +726,7 @@ def forbes( Examples -------- >>> gr, gr2 = pr.example_data.f1, pr.example_data.f2 - >>> pr.stats.forbes(gr, gr2, chromsizes={"chr1": 10}) + >>> float(pr.stats.forbes(gr, gr2, chromsizes={"chr1": 10})) 0.8333333333333334 """ @@ 
-778,7 +778,7 @@ def jaccard( -------- >>> gr, gr2 = pr.example_data.f1, pr.example_data.f2 >>> chromsizes = pr.example_data.chromsizes - >>> pr.stats.jaccard(gr, gr2) + >>> float(pr.stats.jaccard(gr, gr2)) 0.14285714285714285 """ From 9195353dccdc91a252f02bf4c0b94f762e941bbc Mon Sep 17 00:00:00 2001 From: marco-mariotti Date: Thu, 25 Jul 2024 09:47:12 +0200 Subject: [PATCH 2/2] fix ambiguous DF order in documentation examples --- docs/how_to_overlap.rst | 8 +-- docs/how_to_rows.rst | 119 ++++++++++++++++++++-------------------- 2 files changed, 65 insertions(+), 62 deletions(-) diff --git a/docs/how_to_overlap.rst b/docs/how_to_overlap.rst index 3f7c08a0..382949e6 100644 --- a/docs/how_to_overlap.rst +++ b/docs/how_to_overlap.rst @@ -529,7 +529,7 @@ are modified to return only the actual overlaps: Method :func:`subtract_ranges ` allows to obtain the portions of intervals in self that do not overlap any interval in other: - >>> a2.subtract_ranges(b) + >>> a2.subtract_ranges(b).sort_values('Start') index | Chromosome Start End Strand odd int64 | object int64 int64 object int64 ------- --- ------------ ------- ------- -------- ------- @@ -538,13 +538,13 @@ that do not overlap any interval in other: 2 | chr1 18 21 - 0 3 | chr1 23 25 - 1 5 | chr1 32 34 + 0 - 5 | chr1 36 37 + 0 6 | chr1 33 34 + 1 + 5 | chr1 36 37 + 0 PyRanges with 7 rows, 5 columns, and 1 index columns (with 1 index duplicates). Contains 1 chromosomes and 2 strands. - >>> a2.subtract_ranges(b, strand_behavior='ignore') + >>> a2.subtract_ranges(b, strand_behavior='ignore').sort_values('Start') index | Chromosome Start End Strand odd int64 | object int64 int64 object int64 ------- --- ------------ ------- ------- -------- ------- @@ -554,8 +554,8 @@ that do not overlap any interval in other: 2 | chr1 20 21 - 0 3 | chr1 23 25 - 1 5 | chr1 32 34 + 0 - 5 | chr1 36 37 + 0 6 | chr1 33 34 + 1 + 5 | chr1 36 37 + 0 PyRanges with 8 rows, 5 columns, and 1 index columns (with 2 index duplicates).
Contains 1 chromosomes and 2 strands. diff --git a/docs/how_to_rows.rst b/docs/how_to_rows.rst index 6fa0d96e..38f7b266 100644 --- a/docs/how_to_rows.rst +++ b/docs/how_to_rows.rst @@ -262,22 +262,22 @@ Sorting PyRanges PyRanges objects can be sorted (i.e. altering the order of rows) by calling the pandas dataframe method ``sort_values``, or the PyRanges method :func:`sort_ranges `. - >>> import random; random.seed(123) + >>> import random; random.seed(1) >>> c = pr.example_data.chipseq.remove_nonloc_columns() - >>> c['peak'] = [random.randint(0, 100) for _ in range(len(c))] # add a column with random values + >>> c['peak'] = [random.randint(0, 1000) for _ in range(len(c))] # add a column with random values >>> c index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 0 | chr8 28510032 28510057 - 6 - 1 | chr7 107153363 107153388 - 34 - 2 | chr5 135821802 135821827 - 11 - 3 | chr14 19418999 19419024 - 98 + 0 | chr8 28510032 28510057 - 137 + 1 | chr7 107153363 107153388 - 582 + 2 | chr5 135821802 135821827 - 867 + 3 | chr14 19418999 19419024 - 821 ... | ... ... ... ... ... - 16 | chr9 120803448 120803473 + 43 - 17 | chr6 89296757 89296782 - 71 - 18 | chr1 194245558 194245583 + 42 - 19 | chr8 57916061 57916086 + 89 + 16 | chr9 120803448 120803473 + 96 + 17 | chr6 89296757 89296782 - 499 + 18 | chr1 194245558 194245583 + 29 + 19 | chr8 57916061 57916086 + 914 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands. 
@@ -289,18 +289,19 @@ For example, let's sort by column ``peak``: index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 3 | chr14 19418999 19419024 - 98 - 19 | chr8 57916061 57916086 + 89 - 17 | chr6 89296757 89296782 - 71 - 10 | chr4 98488749 98488774 + 71 + 19 | chr8 57916061 57916086 + 914 + 2 | chr5 135821802 135821827 - 867 + 3 | chr14 19418999 19419024 - 821 + 14 | chr2 152562484 152562509 - 807 ... | ... ... ... ... ... - 2 | chr5 135821802 135821827 - 11 - 13 | chr1 80668132 80668157 - 6 - 0 | chr8 28510032 28510057 - 6 - 7 | chr19 19571102 19571127 + 4 + 7 | chr19 19571102 19571127 + 120 + 16 | chr9 120803448 120803473 + 96 + 5 | chr21 40099618 40099643 + 64 + 18 | chr1 194245558 194245583 + 29 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands. + PyRanges :func:`sort_ranges ` is designed for genomic ranges. By default, it sorts by Chromosome, Strand, then interval coordinates. If Strands are valid ( see :func:`strand_valid `), then intervals on the reverse strand are @@ -310,33 +311,34 @@ sorted in reverse order: index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 12 | chr1 38457520 38457545 + 43 - 18 | chr1 194245558 194245583 + 42 - 13 | chr1 80668132 80668157 - 6 - 9 | chr10 35419784 35419809 - 68 + 12 | chr1 38457520 38457545 + 667 + 18 | chr1 194245558 194245583 + 29 + 13 | chr1 80668132 80668157 - 388 + 9 | chr10 35419784 35419809 - 779 ... | ... ... ... ... ... - 19 | chr8 57916061 57916086 + 89 - 0 | chr8 28510032 28510057 - 6 - 6 | chr8 22714402 22714427 - 13 - 16 | chr9 120803448 120803473 + 43 + 19 | chr8 57916061 57916086 + 914 + 0 | chr8 28510032 28510057 - 137 + 6 | chr8 22714402 22714427 - 261 + 16 | chr9 120803448 120803473 + 96 PyRanges with 20 rows, 5 columns, and 1 index columns. 
Contains 15 chromosomes and 2 strands. + Above, ``chr10`` appears before ``chr8`` because that what string sorting does. We can force 'natural sorting': >>> c.sort_ranges(natsorting=True) index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 12 | chr1 38457520 38457545 + 43 - 18 | chr1 194245558 194245583 + 42 - 13 | chr1 80668132 80668157 - 6 - 14 | chr2 152562484 152562509 - 20 + 12 | chr1 38457520 38457545 + 667 + 18 | chr1 194245558 194245583 + 29 + 13 | chr1 80668132 80668157 - 388 + 14 | chr2 152562484 152562509 - 807 ... | ... ... ... ... ... - 4 | chr12 106679761 106679786 - 52 - 3 | chr14 19418999 19419024 - 98 - 7 | chr19 19571102 19571127 + 4 - 5 | chr21 40099618 40099643 + 34 + 4 | chr12 106679761 106679786 - 782 + 3 | chr14 19418999 19419024 - 821 + 7 | chr19 19571102 19571127 + 120 + 5 | chr21 40099618 40099643 + 64 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands. @@ -347,18 +349,19 @@ coordinates: index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 18 | chr1 194245558 194245583 + 42 - 12 | chr1 38457520 38457545 + 43 - 13 | chr1 80668132 80668157 - 6 - 9 | chr10 35419784 35419809 - 68 + 18 | chr1 194245558 194245583 + 29 + 12 | chr1 38457520 38457545 + 667 + 13 | chr1 80668132 80668157 - 388 + 9 | chr10 35419784 35419809 - 779 ... | ... ... ... ... ... - 19 | chr8 57916061 57916086 + 89 - 0 | chr8 28510032 28510057 - 6 - 6 | chr8 22714402 22714427 - 13 - 16 | chr9 120803448 120803473 + 43 + 19 | chr8 57916061 57916086 + 914 + 0 | chr8 28510032 28510057 - 137 + 6 | chr8 22714402 22714427 - 261 + 16 | chr9 120803448 120803473 + 96 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands. 
+ Note that above each block defined by Chromosome and Strand is sorted by ``peak`` in ascending order. Let's sort by descending order: @@ -366,15 +369,15 @@ Let's sort by descending order: index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 12 | chr1 38457520 38457545 + 43 - 18 | chr1 194245558 194245583 + 42 - 13 | chr1 80668132 80668157 - 6 - 9 | chr10 35419784 35419809 - 68 + 12 | chr1 38457520 38457545 + 667 + 18 | chr1 194245558 194245583 + 29 + 13 | chr1 80668132 80668157 - 388 + 9 | chr10 35419784 35419809 - 779 ... | ... ... ... ... ... - 19 | chr8 57916061 57916086 + 89 - 6 | chr8 22714402 22714427 - 13 - 0 | chr8 28510032 28510057 - 6 - 16 | chr9 120803448 120803473 + 43 + 19 | chr8 57916061 57916086 + 914 + 6 | chr8 22714402 22714427 - 261 + 0 | chr8 28510032 28510057 - 137 + 16 | chr9 120803448 120803473 + 96 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands. @@ -384,15 +387,15 @@ To use a different priorization of genomic location columns, specify them in the index | Chromosome Start End Strand peak int64 | category int64 int64 category int64 ------- --- ------------ --------- --------- ---------- ------- - 3 | chr14 19418999 19419024 - 98 - 19 | chr8 57916061 57916086 + 89 - 10 | chr4 98488749 98488774 + 71 - 17 | chr6 89296757 89296782 - 71 + 19 | chr8 57916061 57916086 + 914 + 2 | chr5 135821802 135821827 - 867 + 3 | chr14 19418999 19419024 - 821 + 14 | chr2 152562484 152562509 - 807 ... | ... ... ... ... ... - 2 | chr5 135821802 135821827 - 11 - 13 | chr1 80668132 80668157 - 6 - 0 | chr8 28510032 28510057 - 6 - 7 | chr19 19571102 19571127 + 4 + 7 | chr19 19571102 19571127 + 120 + 16 | chr9 120803448 120803473 + 96 + 5 | chr21 40099618 40099643 + 64 + 18 | chr1 194245558 194245583 + 29 PyRanges with 20 rows, 5 columns, and 1 index columns. Contains 15 chromosomes and 2 strands.