diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..9eb17ab --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,32 @@ +name: Release +on: + release: + types: [published] + +permissions: + contents: write + +jobs: + build-release: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + - name: Run image + uses: abatilo/actions-poetry@v2 + with: + poetry-version: "1.5.1" + - name: Test build + run: | + poetry config virtualenvs.in-project true + poetry install + . .venv/bin/activate + sentieon-cli -h + - name: build + run: | + poetry build -f sdist + gh release upload ${{github.event.release.tag_name}} dist/*.tar.gz + env: + GITHUB_TOKEN: ${{ github.TOKEN }} \ No newline at end of file diff --git a/README.md b/README.md index ef38e6e..3e20995 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,23 @@ A command-line interface for the Sentieon software -## Setup +## Installation from sdist (recommended) + +Download the latest tar.gz file from the GitHub release page, https://github.com/sentieon/sentieon-cli/releases/, and install the package with pip: +```sh +curl -LO https://github.com/sentieon/sentieon-cli/releases/download/v1.0.0/sentieon_cli-1.0.0.tar.gz +pip install sentieon_cli-1.0.0.tar.gz +``` + +## Installation with Poetry Create a new python virtual environment for the project, if needed: ``` # Create a new venv, if needed -python3 -m venv /path/to/new/virtal/environment/sentieon_cli +python3 -m venv /path/to/new/virtual/environment/sentieon_cli # Activate the venv -source /path/to/new/virtal/environment/sentieon_cli/bin/activate +source /path/to/new/virtual/environment/sentieon_cli/bin/activate ``` `sentieon-cli` uses [poetry](https://pypi.org/project/poetry/) for packaging and dependency management. 
Initially, you will need to install poetry: diff --git a/docs/dnascope.md b/docs/dnascope.md index d9a64b7..3025d0e 100644 --- a/docs/dnascope.md +++ b/docs/dnascope.md @@ -1,6 +1,6 @@ # DNAscope -Sentieon DNAscope is a pipeline for alignment and germline variant calling (SNVs, SVs and indels) from short-read DNA sequence data. The DNAscope pipeline uses a combination of traditional statistical approaches and machine learning to achieve high variant calling accuracy. +Sentieon DNAscope is a pipeline for alignment and germline variant calling (SNVs, SVs and indels) from short-read DNA sequence data. The DNAscope pipeline uses a combination of traditional statistical approaches and machine learning to achieve high variant calling accuracy. The DNAscope pipeline supports samples sequenced using whole-genome or targeted (hybrid-capture) enrichment library preps. The pipeline accepts as input aligned reads in BAM or CRAM format, or un-aligned reads in FASTQ, uBAM, or uCRAM format. The pipeline will output variants in the VCF (or gVCF) formats and aligned reads in BAM or CRAM formats. @@ -40,7 +40,7 @@ sentieon-cli dnascope [-h] \ [-t NUMBER_THREADS] \ [--pcr-free] \ [-g] \ - [--duplicate-marking MARKDUP] \ + [--duplicate-marking DUP_MARKING] \ [--assay ASSAY] \ [--consensus] \ [--dry-run] \ @@ -50,7 +50,7 @@ sentieon-cli dnascope [-h] \ With FASTQ input, the DNAscope pipeline requires the following arguments: - `-r REFERENCE`: the location of the reference FASTA file. A reference fasta index, ".fai" file, and bwa index files, are also required. -- `--r1-fastq R1_FASTQ`: the R1 input FASTQ. Can be used multiple times. `--r1-fastq` files without a corresponding `--r2-fastq` are assumed to be single-ended. +- `--r1-fastq R1_FASTQ`: the R1 input FASTQ. Can be used multiple times. `--r1-fastq` files without a corresponding `--r2-fastq` are assumed to be single-ended. 
Be aware that the pipeline performs single-sample processing, and all FASTQ files are expected to be from the same sample. - `--r2-fastq R2_FASTQ`: the R2 input FASTQ. Can be used multiple times. - `--readgroups READGROUPS`: readgroup information for each FASTQ. The pipeline will expect the same number of arguments to `--r1-fastq` and `--readgroups`. An example argument is, `--readgroups "@RG\tID:HG002-1\tSM:HG002\tLB:HG002-LB-1\tPL:ILLUMINA"` - `-m MODEL_BUNDLE`: the location of the model bundle. Model bundle files can be found in the [sentieon-models] repository. @@ -58,13 +58,13 @@ With FASTQ input, the DNAscope pipeline requires the following arguments: The DNAscope pipeline accepts the following optional arguments: - `-d DBSNP`: the location of the Single Nucleotide Polymorphism database (dbSNP) used to label known variants in VCF (`.vcf`) or bgzip compressed VCF (`.vcf.gz`) format. Only one file is supported. Supplying this file will annotate variants with their dbSNP refSNP ID numbers. A VCF index file is required. -- `-b BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file. +- `-b BED`: interval in the reference to restrict variant calling, in BED file format. Supplying this file will limit variant calling to the intervals inside the BED file. If a BED file is not supplied, the software will process the whole genome. - `--interval_padding INTERVAL_PADDING`: adds INTERVAL_PADDING bases padding to the edges of the input intervals. The default value is 0. - `-t NUMBER_THREADS`: number of computing threads that will be used by the software to run parallel processes. The argument is optional; if omitted, the pipeline will use as many threads as the server has. -- `--pcr-free`: Use variant calling settings appropriate for a PCR-free library prep. 
+- `--pcr-free`: Call variants using `--pcr_indel_model NONE`, which is appropriate for libraries prepared with a PCR-free library prep. Deduplication is still performed to identify optical duplicates. - `-g`: output variants in the gVCF format, in addition to the VCF output file. The tool will output a bgzip compressed gVCF file with a corresponding index file. -- `--duplicate-marking MARKDUP`: setting for duplicate marking. `markdup` will mark duplicate reads. `rmdup` will remove duplicate reads. `none` will skip duplicate marking. -- `--assay ASSAY`: assay setting for metrics collection `WGS` or `WES`. +- `--duplicate-marking DUP_MARKING`: setting for duplicate marking. `markdup` will mark duplicate reads. `rmdup` will remove duplicate reads. `none` will skip duplicate marking. The default setting is `markdup`. +- `--assay ASSAY`: assay setting for metrics collection `WGS` or `WES`. The default setting is `WGS`. - `--consensus`: generate consensus reads during duplicate marking. - `-h`: print the command-line help and exit. - `--dry-run`: print the pipeline commands, but do not actually execute them. 
@@ -85,7 +85,7 @@ sentieon-cli dnascope [-h] \ [-t NUMBER_THREADS] \ [--pcr-free] \ [-g] \ - [--duplicate-marking MARKDUP] \ + [--duplicate-marking DUP_MARKING] \ [--assay ASSAY] \ [--consensus] \ [--dry-run] \ @@ -115,7 +115,7 @@ sentieon-cli dnascope [-h] \ [-t NUMBER_THREADS] \ [--pcr-free] \ [-g] \ - [--duplicate-marking MARKDUP] \ + [--duplicate-marking DUP_MARKING] \ [--assay ASSAY] \ [--consensus] \ [--dry-run] \ @@ -139,7 +139,7 @@ sentieon-cli dnascope [-h] \ [-t NUMBER_THREADS] \ [--pcr-free] \ [-g] \ - [--duplicate-marking MARKDUP] \ + [--duplicate-marking DUP_MARKING] \ [--assay ASSAY] \ [--consensus] \ [--dry-run] \ @@ -155,10 +155,19 @@ Not supplying the `--align` and `--collate-align` arguments will direct the pipe The following files are output when processing WGS FASTQ with default arguments: - `sample.vcf.gz`: SNV and indel variant calls across the regions of the genome as defined in the `-b BED` file. -- `sample_deduped.cram`: aligned, coordinate-sorted and duplicate-marked read data from the input FASTQ files. +- `sample_deduped.cram` or `sample_deduped.bam`: aligned, coordinate-sorted and duplicate-marked read data from the input FASTQ files. - `sample_svs.vcf.gz`: structural variant calls from DNAscope and SVSolver. -- `sample_metrics`: a directory containing QC metrics for the analyzed sample. - - `sample_metrics/coverage*`: coverage metrics for the processed sample. Only available for WGS samples. Replaced by HS metrics for WES samples. +- `sample_metrics`: a directory containing QC metrics for the analyzed sample. + - `sample_metrics/coverage*`: coverage metrics for the processed sample. Only available for WGS samples. + - `sample_metrics/{sample}.txt.alignment_stat.txt`: Metrics from the AlignmentStat algo. + - `sample_metrics/{sample}.txt.base_distribution_by_cycle.txt`: Metrics from the BaseDistributionByCycle algo. + - `sample_metrics/{sample}.txt.dedup_metrics.txt`: Metrics from the Dedup algo. 
+ - `sample_metrics/{sample}.txt.gc_bias*`: Metrics from the GCBias algo. Only available for WGS samples. + - `sample_metrics/{sample}.txt.insert_size.txt`: Metrics from the InsertSizeMetricAlgo algo. + - `sample_metrics/{sample}.txt.mean_qual_by_cycle.txt`: Metrics from the MeanQualityByCycle algo. + - `sample_metrics/{sample}.txt.qual_distribution.txt`: Metrics from the QualDistribution algo. + - `sample_metrics/{sample}.txt.wgs.txt`: Metrics from the WgsMetricsAlgo algo. Only available for WGS samples. + - `sample_metrics/{sample}.txt.hybrid-selection.txt`: Metrics from the HsMetricAlgo algo. Only available for WES samples. - `sample_metrics/multiqc_report.html`: collected QC metrics aggregated by MultiQC. [samtools]: https://www.htslib.org/ diff --git a/pyproject.toml b/pyproject.toml index 810ef6b..c671584 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ #https://stackoverflow.com/questions/75408641/whats-difference-between-tool-poetry-and-project-in-pyproject-toml [tool.poetry] name = "sentieon_cli" -version = "0.3.0" +version = "1.0.0" description = "entry point for sentieon command-line tools" authors = ["Don Freed ", "Brent "] readme = "README.md" diff --git a/sentieon_cli/command_strings.py b/sentieon_cli/command_strings.py index b6a649f..3893d5a 100644 --- a/sentieon_cli/command_strings.py +++ b/sentieon_cli/command_strings.py @@ -231,6 +231,7 @@ def cmd_samtools_fastq_minimap2( model_bundle: pathlib.Path, cores: int, rg_lines: List[str], + sample_name: str, input_ref: Optional[pathlib.Path] = None, fastq_taglist: str = "*", util_sort_args: str = "--cram_write_options version=3.0,compressor=rans", @@ -284,6 +285,9 @@ def cmd_samtools_fastq_minimap2( # Commands to replace the @RG lines in the header rg_cmds: List[List[str]] = [] for rg_line in rg_lines: + # Add a SM value, if missing + if "\tSM:" not in rg_line: + rg_line += f"\tSM:{sample_name}" rg_cmds.append( [ "samtools", diff --git a/sentieon_cli/dnascope_longread.py b/sentieon_cli/dnascope_longread.py index 
7143129..74e7b4a 100644 --- a/sentieon_cli/dnascope_longread.py +++ b/sentieon_cli/dnascope_longread.py @@ -79,6 +79,7 @@ def align_inputs( res: List[pathlib.Path] = [] suffix = "bam" if bam_format else "cram" + sample_name = output_vcf.name.replace(".vcf.gz", "") for i, input_aln in enumerate(sample_input): out_aln = pathlib.Path( str(output_vcf).replace(".vcf.gz", f"_mm2_sorted_{i}.{suffix}") @@ -96,6 +97,7 @@ def align_inputs( model_bundle, cores, rg_lines, + sample_name, input_ref, fastq_taglist, util_sort_args, diff --git a/sentieon_cli/runner.py b/sentieon_cli/runner.py index ad79cf3..2596664 100644 --- a/sentieon_cli/runner.py +++ b/sentieon_cli/runner.py @@ -9,7 +9,7 @@ def run(cmd: str): """Run a command.""" - logger.debug("running: %s", cmd) + logger.info("running: %s", cmd) t0 = time.time() sp.run( cmd, @@ -19,4 +19,4 @@ def run(cmd: str): stderr=sys.stderr, executable="/bin/bash", ) - logger.debug("finished in: %s seconds", f"{time.time() - t0:.1f}") + logger.info("finished in: %s seconds", f"{time.time() - t0:.1f}") diff --git a/sentieon_cli/scripts/vcf_mod.py b/sentieon_cli/scripts/vcf_mod.py index 14041aa..540bf52 100644 --- a/sentieon_cli/scripts/vcf_mod.py +++ b/sentieon_cli/scripts/vcf_mod.py @@ -4,7 +4,7 @@ Functionality for manipulating DNAscope-LR VCFs """ -# Copyright (c) 2023 Sentieon Inc. All rights reserved +# Copyright (c) 2023-2024 Sentieon Inc. 
All rights reserved from __future__ import print_function import argparse @@ -551,7 +551,7 @@ def join2(f0, f1, f2, v0, v1, v2, pos, bed): ps = bed and bed.get(v.chrom, pos, pos) or None if ps: - v.samples[0]['PS'] = ps[0][0] + v.samples[0]['PS'] = ps[0][0]+1 if v1: i1 = int(v1.samples[0].get('GT')) @@ -686,6 +686,11 @@ def merge2(vcfi1, vcfi2, vcfi3, vcfi0, vcfo, bed=None, **kwargs): v.line = None else: v = v0 + ps = (bed and v and v.samples[0].get('PS') and + bed.get(v.chrom, pos, pos) or None) + if ps: + v.samples[0]['PS'] = ps[0][0]+1 + v.line = None if v: vcfo.emit(v) diff --git a/sentieon_cli/util.py b/sentieon_cli/util.py index 1e7af94..2565994 100644 --- a/sentieon_cli/util.py +++ b/sentieon_cli/util.py @@ -15,7 +15,7 @@ from .logging import get_logger -__version__ = "0.3.0" +__version__ = "1.0.0" logger = get_logger(__name__)