feat: read_source (#72)

Co-authored-by: CJ Herrmann <[email protected]>
Co-authored-by: Alex Kanitz <[email protected]>
zavolanlab · May 18, 2022 · 6e6dc8c · 6e6dc8c
1 parent 35f6e2b
commit 6e6dc8c
Show file tree

Hide file tree

Showing 24 changed files with 5,993 additions and 239 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -11,32 +11,32 @@ jobs:
         shell: bash -l {0}
     steps:
 
-      - name: Checkout the repository
+      - name: check out repository
         uses: actions/checkout@v2
 
-      - name: Setup miniconda & htsinfer root env
+      - name: set up miniconda and env
         uses: conda-incubator/setup-miniconda@v2
         with:
           auto-update-conda: true
           environment-file: environment.yml
           activate-environment: htsinfer
           auto-activate-base: false
 
-      - name: Update htsinfer env with dev. packages
+      - name: update env with dev packages
         run: conda env update --file environment-dev.yml
 
-      - name: Display all miniconda & env info
+      - name: display env info
         run: |
           conda info -a
           conda list
 
-      - name: Run Flake8
+      - name: flake8
         run: flake8
 
-      - name: Run Pylint
+      - name: pylint
         run: pylint --rcfile pylint.cfg setup.py htsinfer/
 
-      - name: Run Mypy
+      - name: mypy
         run: mypy htsinfer
 
   unit-testing:
@@ -54,10 +54,10 @@ jobs:
 
     steps:
 
-      - name: Checkout the repository
+      - name: check out repository
         uses: actions/checkout@v2
 
-      - name: Setup miniconda & htsinfer root env
+      - name: set up miniconda and env
         uses: conda-incubator/setup-miniconda@v2
         with:
           python-version: ${{ matrix.python-version }}
@@ -66,20 +66,20 @@ jobs:
           activate-environment: htsinfer
           auto-activate-base: false
 
-      - name: Update htsinfer env with dev. packages
+      - name: update env with dev packages
         run: conda env update --file environment-dev.yml
 
-      - name: Display all miniconda & env info
+      - name: display env info
         run: |
           conda info -a
           conda list
 
-      - name: Test code coverage
+      - name: run unit tests
         run: |
           coverage run --source htsinfer -m pytest
           coverage xml
 
-      - name: Submit Report to Codecov
+      - name: submit coverage report
         uses: codecov/codecov-action@v2
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
@@ -102,10 +102,10 @@ jobs:
 
     steps:
 
-      - name: Checkout the repository
+      - name: check out repository
         uses: actions/checkout@v2
 
-      - name: Setup miniconda & htsinfer root env
+      - name: set up miniconda and env
         uses: conda-incubator/setup-miniconda@v2
         with:
           python-version: ${{ matrix.python-version }}
@@ -114,60 +114,19 @@ jobs:
           activate-environment: htsinfer
           auto-activate-base: false
 
-      - name: Update htsinfer env with dev. packages
+      - name: update env with dev packages
         run: conda env update --file environment-dev.yml
 
-      - name: Display all miniconda & env info
+      - name: display env info
         run: |
           conda info -a
           conda list
 
-      - name: Run integration test
+      - name: integration test - help screen
         run: htsinfer --help
 
-      - name: Cleanup
-        run: |
-          htsinfer --cleanup-regime=KEEP_NONE --records=3 tests/files/single.fastq
-          htsinfer --cleanup-regime=KEEP_NONE tests/files/{first,second}_mate.fastq
-
-#  semantic-release:
-#
-#    needs:
-#      - static-code-analysis
-#      - unit-testing
-#      - integration-testing
-#
-#    runs-on: ubuntu-latest
-#
-#    defaults:
-#      run:
-#        shell: bash -l {0}
-#    
-#    steps:
-#
-#      - name: Checkout the repository
-#        uses: actions/checkout@v3
-#        with:
-#          fetch-depth: '0'
-#
-#      - name: Setup miniconda & htsinfer root env
-#        uses: conda-incubator/setup-miniconda@v2
-#        with:
-#          auto-update-conda: true
-#          environment-file: environment.yml
-#          activate-environment: htsinfer
-#          auto-activate-base: false
-#
-#      - name: Update htsinfer env with dev. packages
-#        run: conda env update --file environment-dev.yml
-#
-#      - name: Display all miniconda & env info
-#        run: |
-#          conda info -a
-#          conda list
-#
-#      - name: Publish the package (DEBUG mode)
-#        run: |
-#          git config --global user.name "semantic-release (via GH Actions)"
-#          git config --global user.email "semantic-release@actions"
-#          semantic-release publish -v DEBUG
+      - name: integration test - single-ended library
+        run: htsinfer --cleanup-regime=KEEP_NONE --verbosity=DEBUG tests/files/adapter_single.fastq
+
+      - name: integration test - paired-ended library
+        run: htsinfer --cleanup-regime=KEEP_NONE --verbosity=DEBUG tests/files/adapter_{1,2}.fastq
diff --git a/.gitignore b/.gitignore
@@ -109,4 +109,8 @@ dmypy.json
 # End of https://www.gitignore.io/api/python
 
 # Custom additions
+.vscode
+results
+.DS_Store
+tests/.DS_Store
 results_htsinfer
diff --git a/README.md b/README.md
@@ -14,10 +14,12 @@ htsinfer [--output-directory PATH] [--temporary-directory PATH]
          [--cleanup-regime {DEFAULT,KEEP_ALL,KEEP_NONE,KEEP_RESULTS}]
          [--records INT]
          [--threads INT]
-         [--organism STR] [--transcripts FASTA]
+         [--transcripts FASTA]
          [--read-layout-adapters PATH]
          [--read-layout-min-match-percentage FLOAT]
          [--read-layout-min-frequency-ratio FLOAT]
+         [--library-source-min-match-percentage FLOAT]
+         [--library-source-min-frequency-ratio FLOAT]
          [--read-orientation-min-mapped-reads INT]
          [--read-orientation-min-fraction FLOAT]
          [--verbosity {DEBUG,INFO,WARN,ERROR,CRITICAL}]
@@ -50,17 +52,15 @@ optional arguments:
                         specified value equals or exceeds the number of
                         available records, all records will be processed
                         (default: 0)
+  --threads INT         number of threads to run STAR with
   --transcripts FASTA   FASTA file containing transcripts to be used for
-                        mapping files `--file-1` and `--file-2` against for
-                        inferring organism and read orientation. Requires that
+                        mapping files `--file-1` and `--file-2` for inferring
+                        library source and read orientation. Requires that
                         sequence identifier lines are separated by the pipe
                         (`|`) character and that the 4th and 5th columns
-                        contain a short organism name and taxon identifier,
+                        contain a short source name and taxon identifier,
                         respectively. Example sequence identifier:
                         `rpl-13|ACYPI006272|ACYPI006272-RA|apisum|7029`
-  --threads INT         number of threads to run STAR with
-  --organism STR        source organism of the sequencing library, if provided, 
-                        will not be inferred by the application
   --read-layout-adapters PATH
                         path to text file containing 3' adapter sequences to
                         scan for (one sequence per line; default:
@@ -74,6 +74,14 @@ optional arguments:
                         minimum frequency ratio between the first and second
                         most frequent adapter in order for the former to be
                         considered as the library's 3'-end adapter (default: 2)
+  --library-source-min-match-percentage FLOAT
+                        minimum percentage of reads that are consistent with a
+                        given source in order for it to be considered the
+                        library's source (default: 5)
+  --library-source-min-frequency-ratio FLOAT
+                        minimum frequency ratio between the first and second
+                        most frequent source in order for the former to be
+                        considered the library's source (default: 2)
   --read-orientation-min-mapped-reads INT
                         minimum number of mapped reads for deeming the read
                         orientation result reliable (default: 20)

diff --git a/environment.yml b/environment.yml
@@ -5,10 +5,11 @@ channels:
   - conda-forge
 dependencies:
   - biopython>=1.78
-  - pip>=20.2.3
+  - kallisto>=0.46.1
   - pandas>=1.0.5
-  - pydantic>=1.8.1
+  - pip>=20.2.3
   - pyahocorasick>=1.4.0
+  - pydantic>=1.8.1
   - pysam>=0.16.0
   - python>=3.6, <=3.10
   - star>=2.7.6

diff --git a/htsinfer/__init__.py b/htsinfer/__init__.py
@@ -2,4 +2,4 @@
 
 from htsinfer.htsinfer import HtsInfer  # noqa:F401
 
-__version__ = "0.8.0"
+__version__ = "0.9.0"
diff --git a/htsinfer/cli.py b/htsinfer/cli.py
@@ -115,23 +115,6 @@ def __call__(
             "records, all records will be processed"
         )
     )
-    parser.add_argument(
-        "--transcripts",
-        metavar="FASTA",
-        type=str,
-        default=(
-            Path(__file__).parents[1].absolute() / "data/transcripts.fasta.gz"
-        ),
-        help=(
-            "FASTA file containing transcripts to be used for mapping files "
-            "`--file-1` and `--file-2` against for inferring organism and "
-            "read orientation. Requires that sequence identifier lines are "
-            "separated by the pipe (`|`) character and that the 4th and 5th "
-            "columns contain a short organism name and taxon identifier, "
-            "respectively. Example sequence identifier: "
-            "`rpl-13|ACYPI006272|ACYPI006272-RA|apisum|7029`"
-        )
-    )
     parser.add_argument(
         "--threads",
         metavar="INT",
@@ -142,13 +125,20 @@ def __call__(
         )
     )
     parser.add_argument(
-        "--organism",
-        metavar="STR",
+        "--transcripts",
+        metavar="FASTA",
         type=str,
-        default="hsapiens",
+        default=(
+            Path(__file__).parents[1].absolute() / "data/transcripts.fasta.gz"
+        ),
         help=(
-            "source organism of the sequencing library, if provided: "
-            "will not be inferred by the application"
+            "FASTA file containing transcripts to be used for mapping files "
+            "`--file-1` and `--file-2` for inferring library source and read "
+            "orientation. Requires that sequence identifier lines are "
+            "separated by the pipe (`|`) character and that the 4th and 5th "
+            "columns contain a short source name and taxon identifier, "
+            "respectively. Example sequence identifier: "
+            "`rpl-13|ACYPI006272|ACYPI006272-RA|apisum|7029`"
         )
     )
     parser.add_argument(
@@ -186,6 +176,27 @@ def __call__(
             "library's 3'-end adapter"
         )
     )
+    parser.add_argument(
+        "--library-source-min-match-percentage",
+        metavar="FLOAT",
+        type=float,
+        default=5,
+        help=(
+            "minimum percentage of reads that are consistent with a given "
+            "source in order for it to be considered the library's source"
+        )
+    )
+    parser.add_argument(
+        "--library-source-min-frequency-ratio",
+        metavar="FLOAT",
+        type=float,
+        default=2,
+        help=(
+            "minimum frequency ratio between the first and second "
+            "most frequent source in order for the former to be considered "
+            "the library's source"
+        )
+    )
     parser.add_argument(
         "--read-orientation-min-mapped-reads",
         metavar="INT",
@@ -268,11 +279,12 @@ def main() -> None:
             cleanup_regime=CleanupRegimes[args.cleanup_regime],
             records=args.records,
             threads=args.threads,
-            organism=args.organism,
             transcripts_file=args.transcripts,
             read_layout_adapter_file=args.read_layout_adapters,
             read_layout_min_match_pct=args.read_layout_min_match_percentage,
             read_layout_min_freq_ratio=args.read_layout_min_frequency_ratio,
+            lib_source_min_match_pct=args.library_source_min_match_percentage,
+            lib_source_min_freq_ratio=args.library_source_min_frequency_ratio,
             read_orientation_min_mapped_reads=(
                 args.read_orientation_min_mapped_reads
             ),

diff --git a/htsinfer/data/transcript.fasta.zip b/htsinfer/data/transcript.fasta.zip
diff --git a/htsinfer/exceptions.py b/htsinfer/exceptions.py
@@ -11,10 +11,18 @@ class InconsistentFastqIdentifiers(Exception):
     """
 
 
+class KallistoProblem(Exception):
+    """Exception raised when running kallisto index and quant commands."""
+
+
 class MetadataWarning(Exception):
     """Exception raised when metadata could not be determined."""
 
 
+class StarProblem(Exception):
+    """Exception raised when running STAR index and quant commands."""
+
+
 class UnknownFastqIdentifier(Exception):
     """Exception raised when a FASTQ sequence identifier of unknown format was
     encountered.
@@ -24,7 +32,3 @@ class UnknownFastqIdentifier(Exception):
 class WorkEnvProblem(Exception):
     """Exception raised when the work environment could not be set up or
     cleaned."""
-
-
-class StarProblem(Exception):
-    """Exception raised when running STAR index and quant commands."""
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,4 @@

		from htsinfer.htsinfer import HtsInfer # noqa:F401

		-__version__ = "0.8.0"
		+__version__ = "0.9.0"