From 07c0ba412dc43c03f5474d80657f612abe276487 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Mon, 6 Dec 2021 17:48:27 +0900 Subject: [PATCH 01/26] Bump up version number --- CHANGELOG.rst | 3 +++ fuc/version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6201458..a16fdb9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,9 @@ Changelog ********* +0.29.0 (in development) +----------------------- + 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/version.py b/fuc/version.py index 1bf3675..9093e4e 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.28.0' +__version__ = '0.29.0' From 770a159bc97e2811c116529c8d7cac25eaeb4bee Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Mon, 6 Dec 2021 20:36:48 +0900 Subject: [PATCH 02/26] Add `pyvcf.VcfFrame.phased` --- CHANGELOG.rst | 2 ++ fuc/api/pyvcf.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a16fdb9..fa90e17 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Changelog 0.29.0 (in development) ----------------------- +* Add new property ``pyvcf.VcfFrame.phased``. + 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 650f5ff..f50b2e0 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1127,11 +1127,80 @@ def sites_only(self): """bool : Whether the VCF is sites-only.""" return not self.samples or 'FORMAT' not in self.df.columns + @property + def phased(self): + """ + Return True if every genotype in VcfFrame is haplotype phased. + + Returns + ------- + bool + If VcfFrame is fully phased, return True, if not return False. + + Examples + -------- + + >>> from fuc import pyvcf + >>> data1 = { + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'A'], + ... 'ALT': ['A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT'], + ... 'A': ['1|1', '0|0', '1|0'], + ... 'B': ['1|0', '0|1', '1|0'], + ... } + >>> vf1 = pyvcf.VcfFrame.from_dict([], data1) + >>> vf1.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 1|1 1|0 + 1 chr1 101 . T C . . . GT 0|0 0|1 + 2 chr1 102 . A T . . . GT 1|0 1|0 + >>> vf1.phased + True + >>> data2 = { + ... 'CHROM': ['chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102], + ... 'ID': ['.', '.', '.'], + ... 'REF': ['G', 'T', 'A'], + ... 'ALT': ['A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.'], + ... 'FILTER': ['.', '.', '.'], + ... 'INFO': ['.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT'], + ... 'C': ['1|1', '0/0', '1|0'], + ... 'D': ['1|0', '0/1', '1|0'], + ... } + >>> vf2 = pyvcf.VcfFrame.from_dict([], data2) + >>> vf2.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C D + 0 chr1 100 . G A . . . GT 1|1 1|0 + 1 chr1 101 . T C . . . GT 0/0 0/1 + 2 chr1 102 . A T . . . GT 1|0 1|0 + >>> vf2.phased + False + """ + def one_row(r): + def one_gt(g): + return '|' in g.split(':')[0] + return r[9:].apply(one_gt).all() + s = self.df.apply(one_row, axis=1) + return s.all() + @property def empty(self): """ Indicator whether VcfFrame is empty. + Returns + ------- + bool + If VcfFrame is empty, return True, if not return False. + Examples -------- From 22c965fac48877d26a58815bb394da74456ba2fc Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 8 Dec 2021 15:23:29 +0900 Subject: [PATCH 03/26] Update `pyvcf.VcfFrame.phased` --- fuc/api/pyvcf.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index f50b2e0..13e45e3 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1132,6 +1132,8 @@ def phased(self): """ Return True if every genotype in VcfFrame is haplotype phased. + The method will return False if VcfFrame is empty. + Returns ------- bool @@ -1184,6 +1186,8 @@ def phased(self): >>> vf2.phased False """ + if self.empty: + return False def one_row(r): def one_gt(g): return '|' in g.split(':')[0] From f8e29bf8240a46c821218733efe8ffbd34c62dca Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 8 Dec 2021 15:40:23 +0900 Subject: [PATCH 04/26] Update docs --- fuc/api/pyvcf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 13e45e3..cd8e9c6 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1132,12 +1132,11 @@ def phased(self): """ Return True if every genotype in VcfFrame is haplotype phased. - The method will return False if VcfFrame is empty. - Returns ------- bool If VcfFrame is fully phased, return True, if not return False. + Also return False if VcfFrame is empty. Examples -------- From 432639d5749d0be1d6db858e4d9c36e517881ff2 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 8 Dec 2021 15:59:53 +0900 Subject: [PATCH 05/26] Update `pyvcf.VcfFrame.slice` --- CHANGELOG.rst | 1 + fuc/api/common.py | 19 +++++++++++++++---- fuc/api/pyvcf.py | 4 ++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fa90e17..c1723a9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ Changelog ----------------------- * Add new property ``pyvcf.VcfFrame.phased``. +* Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/api/common.py b/fuc/api/common.py index 33b6ea4..164b314 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -1333,6 +1333,9 @@ def update_chr_prefix(regions, mode='remove'): """ Add or remove the (annoying) 'chr' string from specified regions. + The method will automatically detect regions that don't need to be + updated and will return them unchanged. + Parameters ---------- regions : str or list @@ -1349,10 +1352,18 @@ def update_chr_prefix(regions, mode='remove'): ------- >>> from fuc import common - >>> common.update_chr_prefix(['chr1:100-200', '1:300-400'], mode='remove') - ['1:100-200', '1:300-400'] - >>> common.update_chr_prefix(['chr1:100-200', '1:300-400'], mode='add') - ['chr1:100-200', 'chr1:300-400'] + >>> common.update_chr_prefix(['chr1:100-200', '2:300-400'], mode='remove') + ['1:100-200', '2:300-400'] + >>> common.update_chr_prefix(['chr1:100-200', '2:300-400'], mode='add') + ['chr1:100-200', 'chr2:300-400'] + >>> common.update_chr_prefix('chr1:100-200', mode='remove') + '1:100-200' + >>> common.update_chr_prefix('chr1:100-200', mode='add') + 'chr1:100-200' + >>> common.update_chr_prefix('2:300-400', mode='add') + 'chr2:300-400' + >>> common.update_chr_prefix('2:300-400', mode='remove') + '2:300-400' """ def remove(x): return x.replace('chr', '') diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index cd8e9c6..be634a7 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -4703,6 +4703,10 @@ def slice(self, region): 0 chr1 205 . T C . . . GT 1/1 1 chr1 297 . A T . . . GT 0/1 """ + if self.has_chr_prefix: + region = common.update_chr_prefix(region, mode='add') + else: + region = common.update_chr_prefix(region, mode='remove') chrom, start, end = common.parse_region(region) df = self.df[self.df.CHROM == chrom] if not pd.isna(start): From 0bbbcabc413794a8bcf11999367ea76d3195c40d Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 8 Dec 2021 16:27:16 +0900 Subject: [PATCH 06/26] Update docs --- fuc/api/pyvcf.py | 75 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 24 deletions(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index be634a7..294d8bd 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -4646,12 +4646,14 @@ def func(r): def slice(self, region): """ - Slice the VcfFrame for the region. + Slice VcfFrame for specified region. Parameters ---------- region : str - Region ('chrom:start-end'). + Region to slice ('chrom:start-end'). The 'chr' string will be + handled automatically. For example, 'chr1' and '1' will have the + same effect in slicing. Returns ------- @@ -4661,6 +4663,8 @@ def slice(self, region): Examples -------- + Assume we have following data: + >>> from fuc import pyvcf >>> data = { ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr2'], @@ -4672,36 +4676,59 @@ def slice(self, region): ... 'FILTER': ['.', '.', '.', '.'], ... 'INFO': ['.', '.', '.', '.'], ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '1/1', '0/1', '0/1'] + ... 'A': ['0/1', '1/1', '0/0', '0/0'], + ... 'B': ['0/0', '0/0', '0/1', '0/1'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) >>> vf.df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 205 . T C . . . GT 1/1 - 2 chr1 297 . A T . . . GT 0/1 - 3 chr2 101 . C A . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/1 0/0 + 1 chr1 205 . T C . . . GT 1/1 0/0 + 2 chr1 297 . A T . . . GT 0/0 0/1 + 3 chr2 101 . C A . . . GT 0/0 0/1 + + We can specify a full region: + >>> vf.slice('chr1:101-300').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 205 . T C . . . GT 1/1 - 1 chr1 297 . A T . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 205 . T C . . . GT 1/1 0/0 + 1 chr1 297 . A T . . . GT 0/0 0/1 + + We can specify a region without the 'chr' string: + + >>> vf.slice('1:101-300').df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 205 . T C . . . GT 1/1 0/0 + 1 chr1 297 . A T . . . GT 0/0 0/1 + + We can specify the contig only: + >>> vf.slice('chr1').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 205 . T C . . . GT 1/1 - 2 chr1 297 . A T . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/1 0/0 + 1 chr1 205 . T C . . . GT 1/1 0/0 + 2 chr1 297 . A T . . . GT 0/0 0/1 + + We can omit the start position: + >>> vf.slice('chr1:-296').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G A . . . GT 0/1 - 1 chr1 205 . T C . . . GT 1/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 100 . G A . . . GT 0/1 0/0 + 1 chr1 205 . T C . . . GT 1/1 0/0 + + We can omit the end position as well: + >>> vf.slice('chr1:101').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 205 . T C . . . GT 1/1 - 1 chr1 297 . A T . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 205 . T C . . . GT 1/1 0/0 + 1 chr1 297 . A T . . . GT 0/0 0/1 + + You can also omit the end position this way: + >>> vf.slice('chr1:101-').df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 205 . T C . . . GT 1/1 - 1 chr1 297 . A T . . . GT 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B + 0 chr1 205 . T C . . . GT 1/1 0/0 + 1 chr1 297 . A T . . . GT 0/0 0/1 """ if self.has_chr_prefix: region = common.update_chr_prefix(region, mode='add') From aefc0cda23bd753ebb0a1774b8ed1f1dede5f8e7 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 9 Dec 2021 09:31:02 +0900 Subject: [PATCH 07/26] Fix typo --- fuc/api/pyvcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 294d8bd..41511a6 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -1806,7 +1806,7 @@ def from_file( except AttributeError: pass else: - TypeError('Incorrect input type') + raise TypeError(f'Incorrect input type: {type(fn).__name__}') vf = cls.from_string(s) return vf From ce34f36ee69b4ea40c43e89a98fa0a81f2e44598 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 10 Dec 2021 12:45:07 +0900 Subject: [PATCH 08/26] Update `ngs-hc`: * Update :command:`ngs-hc` to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. --- CHANGELOG.rst | 1 + fuc/cli/ngs_hc.py | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c1723a9..896a9c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. +* Update :command:`ngs-hc` to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 0e931d0..33930d2 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -144,6 +144,7 @@ def main(args): command += f' --QUIET' command += f' --java-options "{args.java1}"' command += f' -R {args.fasta}' + command += f' --native-pair-hmm-threads 1' command += f' --emit-ref-confidence GVCF' command += f' -I {r.BAM}' command += f' -O {args.output}/temp/{basename}.g.vcf' From f626dde4f3d790d294af0d5f1c56d4402eedadef Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 10 Dec 2021 12:45:49 +0900 Subject: [PATCH 09/26] Update CHANGELOG.rst --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 896a9c1..f68d635 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. -* Update :command:`ngs-hc` to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. +* Update :command:`ngs-hc` command to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. 0.28.0 (2021-12-05) ------------------- From 5b4f81a57f3508198128f5b18b37c397fd071d22 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 10 Dec 2021 15:20:28 +0900 Subject: [PATCH 10/26] Update `ngs-hc`: * Update :command:`ngs-hc` command to set ``-XX:ParallelGCThreads=1`` and ``-XX:ConcGCThreads=1`` for Java. --- CHANGELOG.rst | 1 + fuc/cli/ngs_hc.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f68d635..5662d11 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. * Update :command:`ngs-hc` command to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. +* Update :command:`ngs-hc` command to set ``-XX:ParallelGCThreads=1`` and ``-XX:ConcGCThreads=1`` for Java. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 33930d2..3a155e9 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -128,6 +128,8 @@ def main(args): else: remove = 'rm' + java_shared = '-XX:ParallelGCThreads=1 -XX:ConcGCThreads=1' + basenames = [] for i, r in df.iterrows(): @@ -142,7 +144,7 @@ def main(args): command = 'gatk HaplotypeCaller' command += f' --QUIET' - command += f' --java-options "{args.java1}"' + command += f' --java-options "{java_shared} {args.java1}"' command += f' -R {args.fasta}' command += f' --native-pair-hmm-threads 1' command += f' --emit-ref-confidence GVCF' @@ -188,7 +190,7 @@ def main(args): command1 = 'gatk GenomicsDBImport' command1 += f' --QUIET' - command1 += f' --java-options "{args.java2}"' + command1 += f' --java-options "{java_shared} {args.java2}"' command1 += f' -L {chrom}' command1 += f' --genomicsdb-workspace-path {args.output}/temp/db-{chrom}' command1 += ' ' + ' '.join([f'-V {args.output}/temp/{x}.g.vcf' for x in basenames]) @@ -199,7 +201,7 @@ def main(args): command2 = 'gatk GenotypeGVCFs' command2 += f' --QUIET' - command2 += f' --java-options "{args.java2}"' + command2 += f' --java-options "{java_shared} {args.java2}"' command2 += f' -R {args.fasta}' command2 += f' -V gendb://{args.output}/temp/db-{chrom}' command2 += f' -O {args.output}/temp/{chrom}.joint.vcf' @@ -213,7 +215,7 @@ def main(args): command3 = 'gatk VariantFiltration' command3 += f' --QUIET' - command3 += f' --java-options "{args.java2}"' + command3 += f' --java-options "{java_shared} {args.java2}"' command3 += f' -R {args.fasta}' command3 += f' -O {args.output}/temp/{chrom}.joint.filtered.vcf' command3 += f' --variant {args.output}/temp/{chrom}.joint.vcf' @@ -247,7 +249,7 @@ def main(args): command = 'gatk GatherVcfs' command += f' --QUIET' - command += f' --java-options "{args.java2}"' + command += f' --java-options "{java_shared} {args.java2}"' command += f' -O {args.output}/merged.joint.filtered.vcf' command += ' ' + ' '.join([f'-I {args.output}/temp/{x}.joint.filtered.vcf' for x in chroms]) From 3d094e2e33b274ca76e522716f40fd47bde16c62 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 13 Dec 2021 10:38:18 +0900 Subject: [PATCH 11/26] Update `ngs-hc`: * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for HaplotypeCaller and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. --- CHANGELOG.rst | 3 +-- docs/cli.rst | 5 +++-- fuc/cli/ngs_hc.py | 11 +++++++++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5662d11..c60b161 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,8 +6,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. -* Update :command:`ngs-hc` command to set ``--native-pair-hmm-threads 1`` for HaplotypeCaller. -* Update :command:`ngs-hc` command to set ``-XX:ParallelGCThreads=1`` and ``-XX:ConcGCThreads=1`` for Java. +* Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for HaplotypeCaller and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. 0.28.0 (2021-12-05) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index 848b2bf..f252853 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -844,8 +844,8 @@ ngs-hc .. code-block:: text $ fuc ngs-hc -h - usage: fuc ngs-hc [-h] [--bed PATH] [--dbsnp PATH] [--job TEXT] [--force] - [--keep] [--posix] + usage: fuc ngs-hc [-h] [--bed PATH] [--dbsnp PATH] [--thread INT] [--job TEXT] + [--force] [--keep] [--posix] manifest fasta output qsub java1 java2 Pipeline for germline short variant discovery. @@ -869,6 +869,7 @@ ngs-hc -h, --help Show this help message and exit. --bed PATH BED file. --dbsnp PATH VCF file from dbSNP. + --thread INT Number of threads to use (default: 1). --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 3a155e9..668810b 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -87,6 +87,13 @@ def create_parser(subparsers): type=str, help='VCF file from dbSNP.' ) + parser.add_argument( + '--thread', + metavar='INT', + type=int, + default=1, + help='Number of threads to use (default: 1).' + ) parser.add_argument( '--job', metavar='TEXT', @@ -128,7 +135,7 @@ def main(args): else: remove = 'rm' - java_shared = '-XX:ParallelGCThreads=1 -XX:ConcGCThreads=1' + java_shared = f'-XX:ParallelGCThreads={args.thread} -XX:ConcGCThreads={args.thread}' basenames = [] @@ -146,7 +153,7 @@ def main(args): command += f' --QUIET' command += f' --java-options "{java_shared} {args.java1}"' command += f' -R {args.fasta}' - command += f' --native-pair-hmm-threads 1' + command += f' --native-pair-hmm-threads {args.thread}' command += f' --emit-ref-confidence GVCF' command += f' -I {r.BAM}' command += f' -O {args.output}/temp/{basename}.g.vcf' From 560eb1e2d6185fb44ad1fe228c6d2905fe9de93d Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 13 Dec 2021 17:08:07 +0900 Subject: [PATCH 12/26] Update `ngs-hc` --- CHANGELOG.rst | 2 +- fuc/cli/ngs_hc.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c60b161..2fa926c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,7 +6,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. -* Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for HaplotypeCaller and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. +* Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 668810b..9c581b9 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -199,6 +199,7 @@ def main(args): command1 += f' --QUIET' command1 += f' --java-options "{java_shared} {args.java2}"' command1 += f' -L {chrom}' + command1 += f' --reader-threads {args.thread}' command1 += f' --genomicsdb-workspace-path {args.output}/temp/db-{chrom}' command1 += ' ' + ' '.join([f'-V {args.output}/temp/{x}.g.vcf' for x in basenames]) From 596c6223e0515151689d520a582d7eab6162ac3a Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 10:04:30 +0900 Subject: [PATCH 13/26] Fix typo in docs --- docs/cli.rst | 2 +- fuc/cli/bam_slice.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index f252853..34cb41f 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -210,7 +210,7 @@ bam-slice provide a BED file (compressed or uncompressed) to specify regions. Note that the 'chr' prefix in contig names (e.g. 'chr1' vs. '1') will be automatically added or removed as - necessary to match the input VCF's contig names. + necessary to match the input BED's contig names. Optional arguments: -h, --help Show this help message and exit. diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index cbbabff..91324df 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -43,7 +43,7 @@ def create_parser(subparsers): "provide a BED file (compressed or uncompressed) to specify \n" "regions. Note that the 'chr' prefix in contig names (e.g. \n" "'chr1' vs. '1') will be automatically added or removed as \n" - "necessary to match the input VCF's contig names." + "necessary to match the input BED's contig names." ) parser.add_argument( '--format', From eab410b0630e1f2e1f86d4a72109d20d7336a357 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 11:29:43 +0900 Subject: [PATCH 14/26] Update `ngs-hc`: * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. --- CHANGELOG.rst | 1 + docs/cli.rst | 10 ++++++++-- fuc/cli/ngs_hc.py | 13 +++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2fa926c..5ab3497 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. +* Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. 0.28.0 (2021-12-05) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index 34cb41f..d3c021d 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -844,8 +844,8 @@ ngs-hc .. code-block:: text $ fuc ngs-hc -h - usage: fuc ngs-hc [-h] [--bed PATH] [--dbsnp PATH] [--thread INT] [--job TEXT] - [--force] [--keep] [--posix] + usage: fuc ngs-hc [-h] [--bed PATH] [--dbsnp PATH] [--thread INT] + [--batch INT] [--job TEXT] [--force] [--keep] [--posix] manifest fasta output qsub java1 java2 Pipeline for germline short variant discovery. @@ -870,6 +870,12 @@ ngs-hc --bed PATH BED file. --dbsnp PATH VCF file from dbSNP. --thread INT Number of threads to use (default: 1). + --batch INT Batch size used for the GenomicsDBImport command. This + argument controls the number of samples for which readers + are open at once and therefore provides a way to minimize + memory consumption. The size of 0 (default) means no + batching (i.e. readers for all samples will be opened at + once). --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 9c581b9..2081d3e 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -94,6 +94,18 @@ def create_parser(subparsers): default=1, help='Number of threads to use (default: 1).' ) + parser.add_argument( + '--batch', + metavar='INT', + type=int, + default=0, + help='Batch size used for the GenomicsDBImport command. This \n' + 'argument controls the number of samples for which readers \n' + 'are open at once and therefore provides a way to minimize \n' + 'memory consumption. The size of 0 (default) means no \n' + 'batching (i.e. readers for all samples will be opened at \n' + 'once).' + ) parser.add_argument( '--job', metavar='TEXT', @@ -200,6 +212,7 @@ def main(args): command1 += f' --java-options "{java_shared} {args.java2}"' command1 += f' -L {chrom}' command1 += f' --reader-threads {args.thread}' + command1 += f' --batch-size {args.batch}' command1 += f' --genomicsdb-workspace-path {args.output}/temp/db-{chrom}' command1 += ' ' + ' '.join([f'-V {args.output}/temp/{x}.g.vcf' for x in basenames]) From e5e0315d4e4ca96e4b55ecfa57e289a51e2cb697 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 12:45:59 +0900 Subject: [PATCH 15/26] Update docs --- docs/cli.rst | 11 +++++------ fuc/cli/ngs_hc.py | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index d3c021d..b75499c 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -870,12 +870,11 @@ ngs-hc --bed PATH BED file. --dbsnp PATH VCF file from dbSNP. --thread INT Number of threads to use (default: 1). - --batch INT Batch size used for the GenomicsDBImport command. This - argument controls the number of samples for which readers - are open at once and therefore provides a way to minimize - memory consumption. The size of 0 (default) means no - batching (i.e. readers for all samples will be opened at - once). + --batch INT Batch size used for GenomicsDBImport (default: 0). This + controls the number of samples for which readers are + open at once and therefore provides a way to minimize + memory consumption. The size of 0 means no batching (i.e. + readers for all samples will be opened at once). --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 2081d3e..d0e10e4 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -99,12 +99,11 @@ def create_parser(subparsers): metavar='INT', type=int, default=0, - help='Batch size used for the GenomicsDBImport command. This \n' - 'argument controls the number of samples for which readers \n' - 'are open at once and therefore provides a way to minimize \n' - 'memory consumption. The size of 0 (default) means no \n' - 'batching (i.e. readers for all samples will be opened at \n' - 'once).' + help='Batch size used for GenomicsDBImport (default: 0). This \n' + 'controls the number of samples for which readers are \n' + 'open at once and therefore provides a way to minimize \n' + 'memory consumption. The size of 0 means no batching (i.e. \n' + 'readers for all samples will be opened at once).' ) parser.add_argument( '--job', From 9e2b2430d8be2c7e5dfe03bd259c43a2f9ea0eb8 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 12:51:39 +0900 Subject: [PATCH 16/26] Update `ngs-bam2fq`: * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. --- CHANGELOG.rst | 1 + fuc/cli/ngs_bam2fq.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5ab3497..74265f0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,7 @@ Changelog * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. +* Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/cli/ngs_bam2fq.py b/fuc/cli/ngs_bam2fq.py index da74e58..7561c20 100644 --- a/fuc/cli/ngs_bam2fq.py +++ b/fuc/cli/ngs_bam2fq.py @@ -115,6 +115,6 @@ def main(args): for name in ${{names[@]}} do - qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N $name $p/shell/$name.sh + qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N S1-$name $p/shell/$name.sh done """) From 40d952e72196a668e04aa3c0484db4ee00a05ae7 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 16:00:15 +0900 Subject: [PATCH 17/26] Update `ngs-hc`: * Update :command:`ngs-hc` command so that when ``--posix`` is set it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from :command:`GenomicsDBImport` command instead of doing ``export TILEDB_DISABLE_FILE_LOCKING=1``. --- CHANGELOG.rst | 1 + docs/cli.rst | 8 +++++++- fuc/cli/ngs_hc.py | 22 +++++++++------------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 74265f0..d4dc878 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ Changelog * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. +* Update :command:`ngs-hc` command so that when ``--posix`` is set it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from :command:`GenomicsDBImport` command instead of doing ``export TILEDB_DISABLE_FILE_LOCKING=1``. 0.28.0 (2021-12-05) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index b75499c..f702704 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -878,7 +878,13 @@ ngs-hc --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. - --posix Optimize for a POSIX filesystem. + --posix Set GenomicsDBImport to allow for optimizations to improve + the usability and performance for shared Posix Filesystems + (e.g. NFS, Lustre). If set, file level locking is disabled + and file system writes are minimized by keeping a higher + number of file descriptors open for longer periods of time. + Use with --batch if keeping a large number of file + descriptors open is an issue. [Example] Specify queue: $ fuc ngs-hc \ diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index d0e10e4..504a49e 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -124,7 +124,13 @@ def create_parser(subparsers): parser.add_argument( '--posix', action='store_true', - help='Optimize for a POSIX filesystem.' + help='Set GenomicsDBImport to allow for optimizations to improve \n' + 'the usability and performance for shared Posix Filesystems \n' + '(e.g. NFS, Lustre). If set, file level locking is disabled \n' + 'and file system writes are minimized by keeping a higher \n' + 'number of file descriptors open for longer periods of time. \n' + 'Use with --batch if keeping a large number of file \n' + 'descriptors open is an issue.' ) def main(args): @@ -193,15 +199,6 @@ def main(args): for chrom in chroms: with open(f'{args.output}/shell/S2-{chrom}.sh', 'w') as f: - #################### - # POSIX filesystem # - #################### - - if args.posix: - export = 'export TILEDB_DISABLE_FILE_LOCKING=1' - else: - export = '# export TILEDB_DISABLE_FILE_LOCKING=1' - #################### # GenomicsDBImport # #################### @@ -211,6 +208,8 @@ def main(args): command1 += f' --java-options "{java_shared} {args.java2}"' command1 += f' -L {chrom}' command1 += f' --reader-threads {args.thread}' + if args.posix: + command1 += ' --genomicsdb-shared-posixfs-optimizations' command1 += f' --batch-size {args.batch}' command1 += f' --genomicsdb-workspace-path {args.output}/temp/db-{chrom}' command1 += ' ' + ' '.join([f'-V {args.output}/temp/{x}.g.vcf' for x in basenames]) @@ -245,9 +244,6 @@ def main(args): f.write( f"""#!/bin/bash -# Optimize for POSIX filesystem. -{export} - # Activate conda environment. source activate {api.common.conda_env()} From 963015dca95c9c96ba391e8e9d4ca0bcdd4a64d4 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 16:30:22 +0900 Subject: [PATCH 18/26] Update `ngs-fq2bam`: * Add new argument ``--job`` argument to :command:`ngs-fq2bam` command. * Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. --- CHANGELOG.rst | 2 + docs/cli.rst | 3 +- fuc/cli/ngs_fq2bam.py | 95 +++++++++++++++++++++++-------------------- 3 files changed, 54 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d4dc878..4e9789b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,8 @@ Changelog * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. * Update :command:`ngs-hc` command so that when ``--posix`` is set it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from :command:`GenomicsDBImport` command instead of doing ``export TILEDB_DISABLE_FILE_LOCKING=1``. +* Add new argument ``--job`` argument to :command:`ngs-fq2bam` command. +* Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. 0.28.0 (2021-12-05) ------------------- diff --git a/docs/cli.rst b/docs/cli.rst index f702704..7c8eb4b 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -773,7 +773,7 @@ ngs-fq2bam $ fuc ngs-fq2bam -h usage: fuc ngs-fq2bam [-h] [--bed PATH] [--thread INT] [--platform TEXT] - [--force] [--keep] + [--job TEXT] [--force] [--keep] manifest fasta output qsub1 qsub2 java vcf [vcf ...] Pipeline for converting FASTQ files to analysis-ready BAM files. @@ -813,6 +813,7 @@ ngs-fq2bam --bed PATH BED file. --thread INT Number of threads to use (default: 1). --platform TEXT Sequencing platform (default: 'Illumina'). + --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index c39217b..299e9d6 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -116,6 +116,12 @@ def create_parser(subparsers): default='Illumina', help="Sequencing platform (default: 'Illumina')." ) + parser.add_argument( + '--job', + metavar='TEXT', + type=str, + help='Job submission ID for SGE.' + ) parser.add_argument( '--force', action='store_true', @@ -153,63 +159,49 @@ def main(args): # BWA-MEM # ########### - f.write( -f"""#!/bin/bash - -# Activate conda environment. -source activate {api.common.conda_env()} - -# Get read group information. -first=`zcat {r.Read1} | head -1` -flowcell=`echo "$first" | awk -F " " '{{print $1}}' | awk -F ":" '{{print $3}}'` -barcode=`echo "$first" | awk -F " " '{{print $2}}' | awk -F ":" '{{print $4}}'` -group="@RG\\tID:$flowcell\\tPU:$flowcell.$barcode\\tSM:{r.Name}\\tPL:{args.platform}\\tLB:{r.Name}" - -# Align and sort seuqnece reads. Assign read group as well. -bwa mem -M -R $group -t {args.thread} {args.fasta} {r.Read1} {r.Read2} | samtools sort -@ {args.thread} -o {args.output}/temp/{r.Name}.sorted.bam - -""") - - with open(f'{args.output}/shell/S2-{r.Name}.sh', 'w') as f: + command1 = 'bwa mem' + command1 += f' -M -R $group -t {args.thread} ' + command1 += f' {args.fasta} {r.Read1} {r.Read2} |' + command1 += f' samtools sort -@ {args.thread}' + command1 += f' -o {args.output}/temp/{r.Name}.sorted.bam -' ################## # MarkDuplicates # ################## - command1 = 'gatk MarkDuplicates' - command1 += f' --QUIET' - command1 += f' --java-options "{args.java}"' - command1 += f' -I {args.output}/temp/{r.Name}.sorted.bam' - command1 += f' -O {args.output}/temp/{r.Name}.sorted.markdup.bam' - command1 += f' -M {args.output}/temp/{r.Name}.metrics' + command2 = 'gatk MarkDuplicates' + command2 += f' --QUIET' + command2 += f' --java-options "{args.java}"' + command2 += f' -I {args.output}/temp/{r.Name}.sorted.bam' + command2 += f' -O {args.output}/temp/{r.Name}.sorted.markdup.bam' + command2 += f' -M {args.output}/temp/{r.Name}.metrics' #################### # BaseRecalibrator # #################### - command2 = 'gatk BaseRecalibrator' - command2 += f' --QUIET' - command2 += f' --java-options "{args.java}"' - command2 += f' -R {args.fasta}' - command2 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' - command2 += f' -O {args.output}/temp/{r.Name}.table' - command2 += ' ' + ' '.join([f'--known-sites {x}' for x in args.vcf]) - + command3 = 'gatk BaseRecalibrator' + command3 += f' --QUIET' + command3 += f' --java-options "{args.java}"' + command3 += f' -R {args.fasta}' + command3 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' + command3 += f' -O {args.output}/temp/{r.Name}.table' + command3 += ' ' + ' '.join([f'--known-sites {x}' for x in args.vcf]) if args.bed is not None: - command2 += f' -L {args.bed}' + command3 += f' -L {args.bed}' ############# # ApplyBQSR # ############# - command3 = 'gatk ApplyBQSR' - command3 += f' --QUIET' - command3 += f' --java-options "{args.java}"' - command3 += f' -bqsr {args.output}/temp/{r.Name}.table' - command3 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' - command3 += f' -O {args.output}/{r.Name}.sorted.markdup.recal.bam' - + command4 = 'gatk ApplyBQSR' + command4 += f' --QUIET' + command4 += f' --java-options "{args.java}"' + command4 += f' -bqsr {args.output}/temp/{r.Name}.table' + command4 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' + command4 += f' -O {args.output}/{r.Name}.sorted.markdup.recal.bam' if args.bed is not None: - command3 += f' -L {args.bed}' + command4 += f' -L {args.bed}' f.write( f"""#!/bin/bash @@ -217,17 +209,26 @@ def main(args): # Activate conda environment. source activate {api.common.conda_env()} -# Mark duplicate reads. +# Get read group information. +first=`zcat {r.Read1} | head -1` +flowcell=`echo "$first" | awk -F " " '{{print $1}}' | awk -F ":" '{{print $3}}'` +barcode=`echo "$first" | awk -F " " '{{print $2}}' | awk -F ":" '{{print $4}}'` +group="@RG\\tID:$flowcell\\tPU:$flowcell.$barcode\\tSM:{r.Name}\\tPL:{args.platform}\\tLB:{r.Name}" + +# Align and sort seuqnece reads. Assign read group as well. {command1} +# Mark duplicate reads. +{command2} + # Index BAM file. samtools index {args.output}/temp/{r.Name}.sorted.markdup.bam # Build BQSR model. -{command2} +{command3} # Apply BQSR model. -{command3} +{command4} # Remove temporary files. {remove} {args.output}/temp/{r.Name}.metrics @@ -237,6 +238,11 @@ def main(args): {remove} {args.output}/temp/{r.Name}.sorted.markdup.bam.bai """) + if args.job is None: + jid = '' + else: + jid = '-' + args.job + with open(f'{args.output}/shell/qsubme.sh', 'w') as f: f.write( f"""#!/bin/bash @@ -247,7 +253,6 @@ def main(args): for sample in ${{samples[@]}} do - qsub {args.qsub1} -S /bin/bash -e $p/log -o $p/log -N S1-$sample $p/shell/S1-$sample.sh - qsub {args.qsub2} -S /bin/bash -e $p/log -o $p/log -N S2-$sample -hold_jid S1-$sample $p/shell/S2-$sample.sh + qsub {args.qsub1} -S /bin/bash -e $p/log -o $p/log -N S1-$sample{jid} $p/shell/S1-$sample.sh done """) From 492de555c4863576f20a3a35097863dff570b47e Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 16:42:55 +0900 Subject: [PATCH 19/26] Update `ngs-fq2bam` --- docs/cli.rst | 11 ++--------- fuc/cli/ngs_fq2bam.py | 17 +++-------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 7c8eb4b..dc747a9 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -774,7 +774,7 @@ ngs-fq2bam $ fuc ngs-fq2bam -h usage: fuc ngs-fq2bam [-h] [--bed PATH] [--thread INT] [--platform TEXT] [--job TEXT] [--force] [--keep] - manifest fasta output qsub1 qsub2 java vcf [vcf ...] + manifest fasta output qsub java vcf [vcf ...] Pipeline for converting FASTQ files to analysis-ready BAM files. @@ -798,12 +798,7 @@ ngs-fq2bam manifest Sample manifest CSV file. fasta Reference FASTA file. output Output directory. - qsub1 SGE resoruce to request with qsub for read alignment - and sorting. Since both tasks support multithreading, - it is recommended to speicfy a parallel environment (PE) - to speed up the process (also see --thread). - qsub2 SGE resoruce to request with qsub for the rest of the - tasks, which do not support multithreading. + qsub SGE resoruce to request for qsub. java Java resoruce to request for GATK. vcf One or more reference VCF files containing known variant sites (e.g. 1000 Genomes Project). @@ -823,7 +818,6 @@ ngs-fq2bam ref.fa \ output_dir \ "-q queue_name -pe pe_name 10" \ - "-q queue_name" \ "-Xmx15g -Xms15g" \ 1.vcf 2.vcf 3.vcf \ --thread 10 @@ -834,7 +828,6 @@ ngs-fq2bam ref.fa \ output_dir \ "-l h='node_A|node_B' -pe pe_name 10" \ - "-l h='node_A|node_B'" \ "-Xmx15g -Xms15g" \ 1.vcf 2.vcf 3.vcf \ --thread 10 diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index 299e9d6..979da5f 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -33,7 +33,6 @@ ref.fa \\ output_dir \\ "-q queue_name -pe pe_name 10" \\ - "-q queue_name" \\ "-Xmx15g -Xms15g" \\ 1.vcf 2.vcf 3.vcf \\ --thread 10 @@ -44,7 +43,6 @@ ref.fa \\ output_dir \\ "-l h='node_A|node_B' -pe pe_name 10" \\ - "-l h='node_A|node_B'" \\ "-Xmx15g -Xms15g" \\ 1.vcf 2.vcf 3.vcf \\ --thread 10 @@ -72,18 +70,9 @@ def create_parser(subparsers): help='Output directory.' ) parser.add_argument( - 'qsub1', + 'qsub', type=str, - help="SGE resoruce to request with qsub for read alignment \n" - "and sorting. Since both tasks support multithreading, \n" - "it is recommended to speicfy a parallel environment (PE) \n" - "to speed up the process (also see --thread)." - ) - parser.add_argument( - 'qsub2', - type=str, - help="SGE resoruce to request with qsub for the rest of the \n" - "tasks, which do not support multithreading." + help='SGE resoruce to request for qsub.' ) parser.add_argument( 'java', @@ -253,6 +242,6 @@ def main(args): for sample in ${{samples[@]}} do - qsub {args.qsub1} -S /bin/bash -e $p/log -o $p/log -N S1-$sample{jid} $p/shell/S1-$sample.sh + qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N S1-$sample{jid} $p/shell/S1-$sample.sh done """) From e71b613c4bfd59dd481d96c133a6dc15f1c9ffaa Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 16:44:08 +0900 Subject: [PATCH 20/26] Update docs --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4e9789b..0b4deb1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,7 +10,7 @@ Changelog * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. * Update :command:`ngs-hc` command so that when ``--posix`` is set it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from :command:`GenomicsDBImport` command instead of doing ``export TILEDB_DISABLE_FILE_LOCKING=1``. -* Add new argument ``--job`` argument to :command:`ngs-fq2bam` command. +* Add new argument ``--job`` to :command:`ngs-fq2bam` command. * Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. 0.28.0 (2021-12-05) From 300706e9e6e39a9ab335e6fe45e89e0c2867958c Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 17:11:31 +0900 Subject: [PATCH 21/26] Update `ngs-fq2bam`: * Update :command:`ngs-fq2bam` command so that ``--thread`` is now also used to set ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. --- CHANGELOG.rst | 5 +++-- fuc/cli/ngs_fq2bam.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0b4deb1..0ff768e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,11 +7,12 @@ Changelog * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string. * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. -* Add new argument ``--batch`` to :command:`ngs-hc` command. This argument sets the batch size used for :command:`GenomicsDBImport` command. +* Add new argument ``--batch`` to :command:`ngs-hc` command. This argument will be used to set ``--batch-size`` for GATK's :command:`GenomicsDBImport` command. * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. -* Update :command:`ngs-hc` command so that when ``--posix`` is set it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from :command:`GenomicsDBImport` command instead of doing ``export TILEDB_DISABLE_FILE_LOCKING=1``. +* Update :command:`ngs-hc` command so that when ``--posix`` is set, it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from GATK's :command:`GenomicsDBImport` command instead of exporting shell variable (i.e. ``export TILEDB_DISABLE_FILE_LOCKING=1``). * Add new argument ``--job`` to :command:`ngs-fq2bam` command. * Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. +* Update :command:`ngs-fq2bam` command so that ``--thread`` is now also used to set ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index 979da5f..0aacd75 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -141,6 +141,8 @@ def main(args): else: remove = 'rm' + java_shared = f'-XX:ParallelGCThreads={args.thread} -XX:ConcGCThreads={args.thread}' + for i, r in df.iterrows(): with open(f'{args.output}/shell/S1-{r.Name}.sh', 'w') as f: @@ -160,7 +162,7 @@ def main(args): command2 = 'gatk MarkDuplicates' command2 += f' --QUIET' - command2 += f' --java-options "{args.java}"' + command2 += f' --java-options "{java_shared} {args.java}"' command2 += f' -I {args.output}/temp/{r.Name}.sorted.bam' command2 += f' -O {args.output}/temp/{r.Name}.sorted.markdup.bam' command2 += f' -M {args.output}/temp/{r.Name}.metrics' @@ -171,7 +173,7 @@ def main(args): command3 = 'gatk BaseRecalibrator' command3 += f' --QUIET' - command3 += f' --java-options "{args.java}"' + command3 += f' --java-options "{java_shared} {args.java}"' command3 += f' -R {args.fasta}' command3 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' command3 += f' -O {args.output}/temp/{r.Name}.table' @@ -185,7 +187,7 @@ def main(args): command4 = 'gatk ApplyBQSR' command4 += f' --QUIET' - command4 += f' --java-options "{args.java}"' + command4 += f' --java-options "{java_shared} {args.java}"' command4 += f' -bqsr {args.output}/temp/{r.Name}.table' command4 += f' -I {args.output}/temp/{r.Name}.sorted.markdup.bam' command4 += f' -O {args.output}/{r.Name}.sorted.markdup.recal.bam' From 5efb28ba503ac8b5d041573982f3b84c7c122b32 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 15 Dec 2021 17:23:04 +0900 Subject: [PATCH 22/26] Update `ngs-hc` --- CHANGELOG.rst | 2 +- fuc/cli/ngs_hc.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0ff768e..a38016b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,7 +9,7 @@ Changelog * Add new argument ``--thread`` to :command:`ngs-hc` command. This argument will be used to set ``--native-pair-hmm-threads`` for GATK's :command:`HaplotypeCaller` command, ``--reader-threads`` for GATK's :command:`GenomicsDBImport` command, and ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. * Add new argument ``--batch`` to :command:`ngs-hc` command. This argument will be used to set ``--batch-size`` for GATK's :command:`GenomicsDBImport` command. * Update :command:`ngs-bam2fq` command to fix the SGE issue that outputs an error like ``Unable to run job: denied: "XXXXX" is not a valid object name (cannot start with a digit)``. -* Update :command:`ngs-hc` command so that when ``--posix`` is set, it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from GATK's :command:`GenomicsDBImport` command instead of exporting shell variable (i.e. ``export TILEDB_DISABLE_FILE_LOCKING=1``). +* Update :command:`ngs-hc` command so that when ``--posix`` is set, it will use ``--genomicsdb-shared-posixfs-optimizations`` argument from GATK's :command:`GenomicsDBImport` command in addition to exporting relevant shell variable (i.e. ``export TILEDB_DISABLE_FILE_LOCKING=1``). * Add new argument ``--job`` to :command:`ngs-fq2bam` command. * Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. * Update :command:`ngs-fq2bam` command so that ``--thread`` is now also used to set ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 504a49e..b39e29d 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -199,6 +199,15 @@ def main(args): for chrom in chroms: with open(f'{args.output}/shell/S2-{chrom}.sh', 'w') as f: + #################### + # POSIX filesystem # + #################### + + if args.posix: + export = 'export TILEDB_DISABLE_FILE_LOCKING=1' + else: + export = '# export TILEDB_DISABLE_FILE_LOCKING=1' + #################### # GenomicsDBImport # #################### @@ -244,6 +253,9 @@ def main(args): f.write( f"""#!/bin/bash +# Optimize for POSIX filesystem. +{export} + # Activate conda environment. source activate {api.common.conda_env()} From a01fe8f76baaf9f5251324d100b47a412f586ef6 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 16 Dec 2021 10:22:57 +0900 Subject: [PATCH 23/26] Update `ngs-fq2bam` --- fuc/cli/ngs_fq2bam.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index 0aacd75..239385e 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -151,7 +151,7 @@ def main(args): ########### command1 = 'bwa mem' - command1 += f' -M -R $group -t {args.thread} ' + command1 += f' -M -R $group -t {args.thread}' command1 += f' {args.fasta} {r.Read1} {r.Read2} |' command1 += f' samtools sort -@ {args.thread}' command1 += f' -o {args.output}/temp/{r.Name}.sorted.bam -' @@ -209,6 +209,13 @@ def main(args): # Align and sort seuqnece reads. Assign read group as well. {command1} +if [ $? -eq 0 ]; then + echo "Successfully finished aligning and sorting seuqnece reads" +else + echo "Failed to align and sort seuqnece reads, aborting" + exit 1 +fi + # Mark duplicate reads. {command2} From 36aee4c672259f614790651036388df3674947ae Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Thu, 16 Dec 2021 11:25:34 +0900 Subject: [PATCH 24/26] Update `ngs-fq2bam` --- fuc/cli/ngs_fq2bam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fuc/cli/ngs_fq2bam.py b/fuc/cli/ngs_fq2bam.py index 239385e..c207b0a 100644 --- a/fuc/cli/ngs_fq2bam.py +++ b/fuc/cli/ngs_fq2bam.py @@ -239,7 +239,7 @@ def main(args): if args.job is None: jid = '' else: - jid = '-' + args.job + jid = args.job + '-' with open(f'{args.output}/shell/qsubme.sh', 'w') as f: f.write( @@ -251,6 +251,6 @@ def main(args): for sample in ${{samples[@]}} do - qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N S1-$sample{jid} $p/shell/S1-$sample.sh + qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N {jid}S1-$sample $p/shell/S1-$sample.sh done """) From d5250d17d124aacd103edc799001dda1db4b545f Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Fri, 17 Dec 2021 16:45:25 +0900 Subject: [PATCH 25/26] Add `common.parse_list_or_file` --- CHANGELOG.rst | 1 + fuc/api/common.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a38016b..9977e01 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Changelog * Add new argument ``--job`` to :command:`ngs-fq2bam` command. * Update :command:`ngs-fq2bam` command so that BAM creation step and BAM processing step are now in one step. * Update :command:`ngs-fq2bam` command so that ``--thread`` is now also used to set ``-XX:ParallelGCThreads`` and ``-XX:ConcGCThreads`` for Java. +* Add new method :meth:`common.parse_list_or_file`. 0.28.0 (2021-12-05) ------------------- diff --git a/fuc/api/common.py b/fuc/api/common.py index 164b314..b352265 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -1379,3 +1379,54 @@ def add(x): return modes[mode](regions) return [modes[mode](x) for x in regions] + +def parse_list_or_file(obj, extensions=['txt', 'tsv', 'csv', 'list']): + """ + Parse the input variable and then return a list of items. + + This method is useful when parsing a command line argument that accepts + either a list of items or a text file containing one item per line. + + Parameters + ---------- + obj : str or list + Object to be tested. Must be non-empty. + extensions : list, default: ['txt', 'tsv', 'csv', 'list'] + Recognized file extensions. + + Returns + ------- + list + List of items. + + Examples + -------- + + >>> from fuc import common + >>> common.parse_list_or_file(['A', 'B', 'C']) + ['A', 'B', 'C'] + >>> common.parse_list_or_file('A') + ['A'] + >>> common.parse_list_or_file('example.txt') + ['A', 'B', 'C'] + >>> common.parse_list_or_file(['example.txt']) + ['A', 'B', 'C'] + """ + if not isinstance(obj, str) and not isinstance(obj, list): + raise TypeError( + f'Input must be str or list, not {type(obj).__name__}') + + if not obj: + raise ValueError('Input is empty') + + if isinstance(obj, str): + obj = [obj] + + if len(obj) > 1: + return obj + + for extension in extensions: + if obj[0].endswith(f'.{extension}'): + return convert_file2list(obj[0]) + + return obj From ab764b0e092df36594422901118b60deb00e9409 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Sun, 19 Dec 2021 16:07:37 +0900 Subject: [PATCH 26/26] Update CHANGELOG.rst --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9977e01..e8fc786 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ********* -0.29.0 (in development) ------------------------ +0.29.0 (2021-12-19) +------------------- * Add new property ``pyvcf.VcfFrame.phased``. * Update :meth:`pyvcf.VcfFrame.slice` method to automatically handle the 'chr' string.