Merge pull request #3 from Sentieon/dev

Merge Dev changes
Sentieon · Jul 9, 2024 · d6382d1 · d6382d1
2 parents 097a74e + 1f9639e
commit d6382d1
Show file tree

Hide file tree

Showing 9 changed files with 82 additions and 22 deletions.
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,32 @@
+name: Release
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: write
+
+jobs:
+  build-release:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.12"
+      - name: Run image
+        uses: abatilo/actions-poetry@v2
+        with:
+          poetry-version: "1.5.1"
+      - name: Test build
+        run: |
+          poetry config virtualenvs.in-project true
+          poetry install
+          . .venv/bin/activate
+          sentieon-cli -h
+      - name: build
+        run: |
+          poetry build -f sdist
+          gh release upload ${{github.event.release.tag_name}} dist/*.tar.gz
+        env:
+          GITHUB_TOKEN: ${{ github.TOKEN }}
diff --git a/README.md b/README.md
@@ -4,15 +4,23 @@
 
 A command-line interface for the Sentieon software
 
-## Setup
+## Installation from sdist (recommended)
+
+Download the latest tar.gz file from the GitHub releases page (https://github.com/sentieon/sentieon-cli/releases/) and install the package with pip:
+```sh
+curl -LO https://github.com/sentieon/sentieon-cli/releases/download/v1.0.0/sentieon_cli-1.0.0.tar.gz
+pip install sentieon_cli-1.0.0.tar.gz
+```
+
+## Installation with Poetry
 
 Create a new python virtual environment for the project, if needed:
 ```
 # Create a new venv, if needed
-python3 -m venv /path/to/new/virtal/environment/sentieon_cli
+python3 -m venv /path/to/new/virtual/environment/sentieon_cli
 
 # Activate the venv
-source /path/to/new/virtal/environment/sentieon_cli/bin/activate
+source /path/to/new/virtual/environment/sentieon_cli/bin/activate
 ```
 
 `sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. Initially, you will need to install poetry:

diff --git a/docs/dnascope.md b/docs/dnascope.md
@@ -1,6 +1,6 @@
 # DNAscope
 
-Sentieon DNAscope is a pipeline for alignment and germline variant calling (SNVs, SVs and indels) from short-read DNA sequence data. The DNAscope pipeline uses a combination of traditional statistical approaches and machine learning to achieve high variant calling accuracy.
+Sentieon DNAscope is a pipeline for alignment and germline variant calling (SNVs, SVs and indels) from short-read DNA sequence data. The DNAscope pipeline uses a combination of traditional statistical approaches and machine learning to achieve high variant calling accuracy. The DNAscope pipeline supports samples sequenced using whole-genome or targeted (hybrid-capture) enrichment library preps.
 
 The pipeline accepts as input aligned reads in BAM or CRAM format, or un-aligned reads in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats.
 
@@ -40,7 +40,7 @@ sentieon-cli dnascope [-h] \
   [-t NUMBER_THREADS] \
   [--pcr-free] \
   [-g] \
-  [--duplicate-marking MARKDUP] \
+  [--duplicate-marking DUP_MARKING] \
   [--assay ASSAY] \
   [--consensus] \
   [--dry-run] \
@@ -50,21 +50,21 @@ sentieon-cli dnascope [-h] \
 
 With FASTQ input, the DNAscope pipeline requires the following arguments:
 - `-r REFERENCE`: the location of the reference FASTA file. A reference fasta index, ".fai" file, and bwa index files, are also required.
-- `--r1-fastq R1_FASTQ`: the R1 input FASTQ. Can be used multiple times. `--r1-fastq` files without a corresponding `--r2-fastq` are assumed to be single-ended.
+- `--r1-fastq R1_FASTQ`: the R1 input FASTQ. Can be used multiple times. `--r1-fastq` files without a corresponding `--r2-fastq` are assumed to be single-ended. Be aware that the pipeline performs single-sample processing, and all FASTQ files are expected to be from the same sample.
 - `--r2-fastq R2_FASTQ`: the R2 input FASTQ. Can be used multiple times.
 - `--readgroups READGROUPS`: readgroup information for each FASTQ. The pipeline will expect the same number of arguments to `--r1-fastq` and `--readgroups`. An example argument is, `--readgroups "@RG\tID:HG002-1\tSM:HG002\tLB:HG002-LB-1\tPL:ILLUMINA"`
 - `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository.
 - `sample.vcf.gz`: the location of the output VCF file for SNVs and indels. The pipeline requires the output file end with the suffix, ".vcf.gz". The file path without the suffix will be used as the basename for other output files.
 
 The DNAscope pipeline accepts the following optional arguments:
 - `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required.
-- `-b BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file.
+- `-b BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file. If a BED file is not supplied, the software will process the whole genome.
 - `--interval_padding INTERVAL_PADDING`: adds INTERVAL_PADDING bases padding to the edges of the input intervals. The default value is 0.
 - `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has.
-- `--pcr-free`: Use variant calling settings appropriate for a PCR-free library prep.
+- `--pcr-free`: Call variants using `--pcr_indel_model NONE`, which is appropriate for libraries prepared with a PCR-free protocol. Deduplication is still performed to identify optical duplicates.
 - `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file.
-- `--duplicate-marking MARKDUP`: setting for duplicate marking. `markdup` will mark duplicate reads. `rmdup` will remove duplicate reads. `none` will skip duplicate marking.
-- `--assay ASSAY`: assay setting for metrics collection `WGS` or `WES`.
+- `--duplicate-marking DUP_MARKING`: setting for duplicate marking. `markdup` will mark duplicate reads. `rmdup` will remove duplicate reads. `none` will skip duplicate marking. The default setting is `markdup`.
+- `--assay ASSAY`: assay setting for metrics collection, either `WGS` or `WES`. The default setting is `WGS`.
 - `--consensus`: generate consensus reads during duplicate marking.
 - `-h`: print the command-line help and exit.
 - `--dry-run`: print the pipeline commands, but do not actually execute them.
@@ -85,7 +85,7 @@ sentieon-cli dnascope [-h] \
   [-t NUMBER_THREADS] \
   [--pcr-free] \
   [-g] \
-  [--duplicate-marking MARKDUP] \
+  [--duplicate-marking DUP_MARKING] \
   [--assay ASSAY] \
   [--consensus] \
   [--dry-run] \
@@ -115,7 +115,7 @@ sentieon-cli dnascope [-h] \
   [-t NUMBER_THREADS] \
   [--pcr-free] \
   [-g] \
-  [--duplicate-marking MARKDUP] \
+  [--duplicate-marking DUP_MARKING] \
   [--assay ASSAY] \
   [--consensus] \
   [--dry-run] \
@@ -139,7 +139,7 @@ sentieon-cli dnascope [-h] \
   [-t NUMBER_THREADS] \
   [--pcr-free] \
   [-g] \
-  [--duplicate-marking MARKDUP] \
+  [--duplicate-marking DUP_MARKING] \
   [--assay ASSAY] \
   [--consensus] \
   [--dry-run] \
@@ -155,10 +155,19 @@ Not supplying the `--align` and `--collate-align` arguments will direct the pipe
 
 The following files are output when processing WGS FASTQ with default arguments:
 - `sample.vcf.gz`: SNV and indel variant calls across the regions of the genome as defined in the `-b BED` file.
-- `sample_deduped.cram`: aligned, coordinate-sorted and duplicate-marked read data from the input FASTQ files.
+- `sample_deduped.cram` or `sample_deduped.bam`: aligned, coordinate-sorted and duplicate-marked read data from the input FASTQ files.
 - `sample_svs.vcf.gz`: structural variant calls from DNAscope and SVSolver.
-- `sample_metrics`: a directory containing QC metrics for the analyzed sample.
-  - `sample_metrics/coverage*`: coverage metrics for the processed sample. Only available for WGS samples. Replaced by HS metrics for WES samples.
+- `sample_metrics`: a directory containing QC metrics for the analyzed sample. 
+  - `sample_metrics/coverage*`: coverage metrics for the processed sample. Only available for WGS samples.
+  - `sample_metrics/{sample}.txt.alignment_stat.txt`: Metrics from the AlignmentStat algo.
+  - `sample_metrics/{sample}.txt.base_distribution_by_cycle.txt`: Metrics from the BaseDistributionByCycle algo.
+  - `sample_metrics/{sample}.txt.dedup_metrics.txt`: Metrics from the Dedup algo.
+  - `sample_metrics/{sample}.txt.gc_bias*`: Metrics from the GCBias algo. Only available for WGS samples.
+  - `sample_metrics/{sample}.txt.insert_size.txt`: Metrics from the InsertSizeMetricAlgo algo.
+  - `sample_metrics/{sample}.txt.mean_qual_by_cycle.txt`: Metrics from the MeanQualityByCycle algo.
+  - `sample_metrics/{sample}.txt.qual_distribution.txt`: Metrics from the QualDistribution algo.
+  - `sample_metrics/{sample}.txt.wgs.txt`: Metrics from the WgsMetricsAlgo algo. Only available for WGS samples.
+  - `sample_metrics/{sample}.txt.hybrid-selection.txt`: Metrics from the HsMetricAlgo algo.
   - `sample_metrics/multiqc_report.html`: collected QC metrics aggregated by MultiQC.
 
 [samtools]: https://www.htslib.org/

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 #https://stackoverflow.com/questions/75408641/whats-difference-between-tool-poetry-and-project-in-pyproject-toml
 [tool.poetry]
 name = "sentieon_cli"
-version = "0.3.0"
+version = "1.0.0"
 description = "entry point for sentieon command-line tools"
 authors = ["Don Freed <[email protected]>", "Brent <[email protected]>"]
 readme = "README.md"

diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py
@@ -231,6 +231,7 @@ def cmd_samtools_fastq_minimap2(
     model_bundle: pathlib.Path,
     cores: int,
     rg_lines: List[str],
+    sample_name: str,
     input_ref: Optional[pathlib.Path] = None,
     fastq_taglist: str = "*",
     util_sort_args: str = "--cram_write_options version=3.0,compressor=rans",
@@ -284,6 +285,9 @@ def cmd_samtools_fastq_minimap2(
     # Commands to replace the @RG lines in the header
     rg_cmds: List[List[str]] = []
     for rg_line in rg_lines:
+        # Add a SM value, if missing
+        if "\tSM:" not in rg_line:
+            rg_line += f"\tSM:{sample_name}"
         rg_cmds.append(
             [
                 "samtools",

diff --git a/sentieon_cli/dnascope_longread.py b/sentieon_cli/dnascope_longread.py
@@ -79,6 +79,7 @@ def align_inputs(
 
     res: List[pathlib.Path] = []
     suffix = "bam" if bam_format else "cram"
+    sample_name = output_vcf.name.replace(".vcf.gz", "")
     for i, input_aln in enumerate(sample_input):
         out_aln = pathlib.Path(
             str(output_vcf).replace(".vcf.gz", f"_mm2_sorted_{i}.{suffix}")
@@ -96,6 +97,7 @@ def align_inputs(
                 model_bundle,
                 cores,
                 rg_lines,
+                sample_name,
                 input_ref,
                 fastq_taglist,
                 util_sort_args,

diff --git a/sentieon_cli/runner.py b/sentieon_cli/runner.py
@@ -9,7 +9,7 @@
 
 def run(cmd: str):
     """Run a command."""
-    logger.debug("running: %s", cmd)
+    logger.info("running: %s", cmd)
     t0 = time.time()
     sp.run(
         cmd,
@@ -19,4 +19,4 @@ def run(cmd: str):
         stderr=sys.stderr,
         executable="/bin/bash",
     )
-    logger.debug("finished in: %s seconds", f"{time.time() - t0:.1f}")
+    logger.info("finished in: %s seconds", f"{time.time() - t0:.1f}")
diff --git a/sentieon_cli/scripts/vcf_mod.py b/sentieon_cli/scripts/vcf_mod.py
@@ -4,7 +4,7 @@
 Functionality for manipulating DNAscope-LR VCFs
 """
 
-# Copyright (c) 2023 Sentieon Inc. All rights reserved
+# Copyright (c) 2023-2024 Sentieon Inc. All rights reserved
 
 from __future__ import print_function
 import argparse
@@ -551,7 +551,7 @@ def join2(f0, f1, f2, v0, v1, v2, pos, bed):
 
     ps = bed and bed.get(v.chrom, pos, pos) or None
     if ps:
-        v.samples[0]['PS'] = ps[0][0]
+        v.samples[0]['PS'] = ps[0][0]+1
 
     if v1:
         i1 = int(v1.samples[0].get('GT'))
@@ -686,6 +686,11 @@ def merge2(vcfi1, vcfi2, vcfi3, vcfi0, vcfo, bed=None, **kwargs):
                 v.line = None
         else:
             v = v0
+            ps = (bed and v and v.samples[0].get('PS') and
+                  bed.get(v.chrom, pos, pos) or None)
+            if ps:
+                v.samples[0]['PS'] = ps[0][0]+1
+                v.line = None
 
         if v:
             vcfo.emit(v)

diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py
@@ -15,7 +15,7 @@
 
 from .logging import get_logger
 
-__version__ = "0.3.0"
+__version__ = "1.0.0"
 
 logger = get_logger(__name__)