diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1c1d07b..6201458 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,24 @@ Changelog ********* +0.28.0 (2021-12-05) +------------------- + +* Update :meth:`pyvcf.VcfFrame.filter_empty` method so that users can choose a varying number of missing genotypes as threshold. +* Add new method :meth:`pyvcf.plot_af_correlation`. +* Update :command:`bam-slice` command to support BED file as input for specifying regions. Additionally, from now on, the command will automatically handle the annoying 'chr' prefix. +* Add new method :meth:`pycov.CovFrame.matrix_uniformity`. +* Fix bug in :meth:`pyvcf.slice` method when the input region is missing start or end. +* Add new command :command:`ngs-bam2fq`. +* Add new command :command:`fa-filter`. +* Update :meth:`pycov.CovFrame.plot_region` and :meth:`pyvcf.VcfFrame.plot_region` methods to raise an error if the CovFrame/VcfFrame is empty. +* Update :meth:`pyvcf.VcfFrame.filter_*` methods so that they don't raise an error when the VcfFrame is empty (i.e. will return the empty VcfFrame). +* Update :meth:`common.plot_exons` method to not italicize text by default (use ``name='$text$'`` to italicize). +* Add new argument ``--posix`` to :command:`ngs-hc` command. +* Add new method :meth:`common.AnnFrame.subset`. +* Update :meth:`common.AnnFrame.plot_annot` method to raise an error if user provides an invalid group in ``group_order``. +* Add new method :meth:`pymaf.MafFrame.get_gene_concordance`. + 0.27.0 (2021-11-20) ------------------- diff --git a/README.rst b/README.rst index f9125e8..47f0180 100644 --- a/README.rst +++ b/README.rst @@ -121,6 +121,7 @@ For getting help on the fuc CLI: bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. + fa-filter Filter sequence records in a FASTA file fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. @@ -133,6 +134,7 @@ For getting help on the fuc CLI: maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. + ngs-bam2fq Pipeline for converting BAM files to FASTQ files. ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. diff --git a/docs/cli.rst b/docs/cli.rst index 6699a79..848b2bf 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -28,6 +28,7 @@ For getting help on the fuc CLI: bed-sum Summarize a BED file. cov-concat Concatenate depth of coverage files. cov-rename Rename the samples in a depth of coverage file. + fa-filter Filter sequence records in a FASTA file fq-count Count sequence reads in FASTQ files. fq-sum Summarize a FASTQ file. fuc-bgzip Write a BGZF compressed file. @@ -40,6 +41,7 @@ For getting help on the fuc CLI: maf-oncoplt Create an oncoplot with a MAF file. maf-sumplt Create a summary plot with a MAF file. maf-vcf2maf Convert a VCF file to a MAF file. + ngs-bam2fq Pipeline for converting BAM files to FASTQ files. ngs-fq2bam Pipeline for converting FASTQ files to analysis-ready BAM files. ngs-hc Pipeline for germline short variant discovery. ngs-m2 Pipeline for somatic short variant discovery. @@ -195,11 +197,20 @@ bam-slice usage: fuc bam-slice [-h] [--format TEXT] [--fasta PATH] bam regions [regions ...] - Slice a SAM/BAM/CRAM file. + Slice an alignment file (SAM/BAM/CRAM). Positional arguments: - bam Alignment file. - regions List of regions to be sliced ('chrom:start-end'). + bam Input alignment file must be already indexed (.bai) to allow + random access. You can index an alignment file with the + bam-index command. + regions One or more regions to be sliced. Each region must have the + format chrom:start-end and be a half-open interval with + (start, end]. This means, for example, chr1:100-103 will + extract positions 101, 102, and 103. Alternatively, you can + provide a BED file (compressed or uncompressed) to specify + regions. Note that the 'chr' prefix in contig names (e.g. + 'chr1' vs. '1') will be automatically added or removed as + necessary to match the input VCF's contig names. Optional arguments: -h, --help Show this help message and exit. @@ -207,11 +218,14 @@ bam-slice 'CRAM'). --fasta PATH FASTA file. Required when --format is 'CRAM'. - [Example] Slice a BAM file: - $ fuc bam-slice in.bam chr1:100-200 chr2:100-200 > out.bam + [Example] Specify regions manually: + $ fuc bam-slice in.bam 1:100-300 2:400-700 > out.bam + + [Example] Speicfy regions with a BED file: + $ fuc bam-slice in.bam regions.bed > out.bam [Example] Slice a CRAM file: - $ fuc bam-slice in.bam chr1:100-200 --format CRAM --fasta ref.fa > out.cram + $ fuc bam-slice in.bam regions.bed --format CRAM --fasta ref.fa > out.cram bed-intxn ========= @@ -322,6 +336,32 @@ cov-rename [Example] Using the 'RANGE' mode: $ fuc cov-rename in.tsv new_only.tsv --mode RANGE --range 2 5 > out.tsv +fa-filter +========= + +.. code-block:: text + + $ fuc fa-filter -h + usage: fuc fa-filter [-h] [--contigs TEXT [TEXT ...]] [--exclude] fasta + + Filter sequence records in a FASTA file. + + Positional arguments: + fasta FASTA file (compressed or uncompressed). + + Optional arguments: + -h, --help Show this help message and exit. + --contigs TEXT [TEXT ...] + One or more contigs to be selected. Alternatively, you can + provide a file containing one contig per line. + --exclude Exclude specified contigs. + + [Example] Select certain contigs: + $ fuc fa-filter in.fasta --contigs chr1 chr2 > out.fasta + + [Example] Select certain contigs: + $ fuc fa-filter in.fasta --contigs contigs.list --exclude > out.fasta + fq-count ======== @@ -677,6 +717,55 @@ maf-vcf2maf [Example] Convert VCF to MAF: $ fuc maf-vcf2maf in.vcf > out.maf +ngs-bam2fq +========== + +.. code-block:: text + + $ fuc ngs-bam2fq -h + usage: fuc ngs-bam2fq [-h] [--thread INT] [--force] manifest output qsub + + Pipeline for converting BAM files to FASTQ files. + + This pipeline will assume input BAM files consist of paired-end reads + and output two zipped FASTQ files for each sample (forward and reverse + reads). That is, SAMPLE.bam will produce SAMPLE_R1.fastq.gz and + SAMPLE_R2.fastq.gz. + + External dependencies: + - SGE: Required for job submission (i.e. qsub). + - SAMtools: Required for BAM to FASTQ conversion. + + Manifest columns: + - BAM: BAM file. + + Positional arguments: + manifest Sample manifest CSV file. + output Output directory. + qsub SGE resoruce to request with qsub for BAM to FASTQ + conversion. Since this oppoeration supports multithreading, + it is recommended to speicfy a parallel environment (PE) + to speed up the process (also see --thread). + + Optional arguments: + -h, --help Show this help message and exit. + --thread INT Number of threads to use (default: 1). + --force Overwrite the output directory if it already exists. + + [Example] Specify queue: + $ fuc ngs-bam2fq \ + manifest.csv \ + output_dir \ + "-q queue_name -pe pe_name 10" \ + --thread 10 + + [Example] Specify nodes: + $ fuc ngs-bam2fq \ + manifest.csv \ + output_dir \ + "-l h='node_A|node_B' -pe pe_name 10" \ + --thread 10 + ngs-fq2bam ========== @@ -756,7 +845,7 @@ ngs-hc $ fuc ngs-hc -h usage: fuc ngs-hc [-h] [--bed PATH] [--dbsnp PATH] [--job TEXT] [--force] - [--keep] + [--keep] [--posix] manifest fasta output qsub java1 java2 Pipeline for germline short variant discovery. @@ -783,6 +872,7 @@ ngs-hc --job TEXT Job submission ID for SGE. --force Overwrite the output directory if it already exists. --keep Keep temporary files. + --posix Optimize for a POSIX filesystem. [Example] Specify queue: $ fuc ngs-hc \ @@ -1104,10 +1194,10 @@ vcf-index -h, --help Show this help message and exit. --force Force to overwrite the index file if it is already present. - [Example] Index a compressed VCF file. + [Example] Index a compressed VCF file: $ fuc vcf-index in.vcf.gz - [Example] Index an uncompressed VCF file. Will create a compressed file first. + [Example] Index an uncompressed VCF file (will create a compressed VCF first): $ fuc vcf-index in.vcf vcf-merge @@ -1194,7 +1284,9 @@ vcf-slice Positional arguments: vcf Input VCF file must be already BGZF compressed (.gz) and - indexed (.tbi) to allow random access. + indexed (.tbi) to allow random access. A VCF file can be + compressed with the fuc-bgzip command and indexed with the + vcf-index command. regions One or more regions to be sliced. Each region must have the format chrom:start-end and be a half-open interval with (start, end]. This means, for example, chr1:100-103 will @@ -1207,11 +1299,14 @@ vcf-slice Optional arguments: -h, --help Show this help message and exit. - [Example] Specify regions manually. - $ fuc vcf-slice in.vcf.gz 1:100-300 2:400-700 > out.vcf + [Example] Specify regions manually: + $ fuc vcf-slice in.vcf.gz 1:100-300 2:400-700 > out.vcf + + [Example] Speicfy regions with a BED file: + $ fuc vcf-slice in.vcf.gz regions.bed > out.vcf - [Example] Speicfy regions with a BED file. - $ fuc vcf-slice in.vcf.gz regions.bed > out.vcf + [Example] Output a compressed file: + $ fuc vcf-slice in.vcf.gz regions.bed | fuc fuc-bgzip > out.vcf.gz vcf-vcf2bed =========== diff --git a/fuc/api/common.py b/fuc/api/common.py index 62de084..33b6ea4 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -233,9 +233,12 @@ def plot_annot( Parameters ---------- group_col : str - AnnFrame column containing sample group information. + AnnFrame column containing sample group information. If the + column has NaN values, they will be converted to 'N/A' string. group_order : list, optional - List of sample group names. + List of sample group names (in that order too). You can use this + to subset samples belonging to specified groups only. You must + include all relevant groups when also using ``samples``. samples : list, optional Display only specified samples (in that order too). colors : str or list, default: 'tab10' @@ -298,6 +301,7 @@ def plot_annot( """ # Get the selected column. s = self.df[group_col] + s = s.fillna('N/A') # Subset the samples, if necessary. if samples is not None: @@ -307,9 +311,26 @@ def plot_annot( if group_order is None: group_order = sorted([x for x in s.unique() if x == x]) else: - s = s[s.isin(group_order)] + # Make sure all specified groups are valid. + for group in group_order: + groups = ', '.join([f"'{x}'" for x in sorted(s.unique())]) + if group not in s.unique(): + raise ValueError(f"The group '{group}' does not exist. " + f"The following groups are available: {groups}.") + + if len(group_order) < len(s.unique()): + if samples is None: + s = s[s.isin(group_order)] + else: + missing = ', '.join([f"'{x}'" for x in s.unique() + if x not in group_order]) + raise ValueError("The 'group_order' argumnet must " + "include all groups when used with the 'samples' " + "argument. Following groups are currently missing: " + f"{missing}.") + d = {k: v for v, k in enumerate(group_order)} - df = s.to_frame().applymap(lambda x: x if pd.isna(x) else d[x]) + df = s.to_frame().applymap(lambda x: d[x]) # Determine the colors to use. if isinstance(colors, str): @@ -445,6 +466,63 @@ def sorted_samples(self, by, mf=None, keep_empty=False, nonsyn=False): return df.index.to_list() + def subset(self, samples, exclude=False): + """ + Subset AnnFrame for specified samples. + + Parameters + ---------- + samples : str or list + Sample name or list of names (the order matters). + exclude : bool, default: False + If True, exclude specified samples. + + Returns + ------- + AnnFrame + Subsetted AnnFrame. + + Examples + -------- + + >>> from fuc import common + >>> data = { + ... 'SampleID': ['A', 'B', 'C', 'D'], + ... 'PatientID': ['P1', 'P1', 'P2', 'P2'], + ... 'Tissue': ['Normal', 'Tumor', 'Normal', 'Tumor'], + ... 'Age': [30, 30, 57, 57] + ... } + >>> af = common.AnnFrame.from_dict(data, sample_col='SampleID') # or sample_col=0 + >>> af.df + PatientID Tissue Age + SampleID + A P1 Normal 30 + B P1 Tumor 30 + C P2 Normal 57 + D P2 Tumor 57 + + We can subset the AnnFrame for the normal samples A and C: + + >>> af.subset(['A', 'C']).df + PatientID Tissue Age + SampleID + A P1 Normal 30 + C P2 Normal 57 + + Alternatively, we can exclude those samples: + + >>> af.subset(['A', 'C'], exclude=True).df + PatientID Tissue Age + SampleID + B P1 Tumor 30 + D P2 Tumor 57 + """ + if isinstance(samples, str): + samples = [samples] + if exclude: + samples = [x for x in self.samples if x not in samples] + return self.__class__(self.df.loc[samples]) + def _script_name(): """Return the current script's filename.""" fn = inspect.stack()[1].filename @@ -1034,7 +1112,7 @@ def plot_exons( ends : list List of exon end positions. name : str, optional - Gene name. + Gene name. Use ``name='$text$'`` to italicize the text. offset : float, default: 1 How far gene name should be plotted from the gene model. color : str, default: 'black' @@ -1086,7 +1164,7 @@ def plot_exons( ax.text( x=(starts[0]+ends[-1]) / 2, y=y-offset, - s=f'${name}$', + s=name, horizontalalignment='center', fontsize=fontsize, ) diff --git a/fuc/api/pycov.py b/fuc/api/pycov.py index 49c1801..dd210ea 100644 --- a/fuc/api/pycov.py +++ b/fuc/api/pycov.py @@ -194,7 +194,7 @@ def from_bam( zero : bool, default: False If True, output all positions (including those with zero depth). map_qual: int, optional - Only count reads with mapping quality greater than orequal to + Only count reads with mapping quality greater than or equal to this number. names : list, optional Use these as sample names instead of the SM tags. @@ -437,6 +437,9 @@ def plot_region( >>> ax.legend() >>> plt.tight_layout() """ + if self.df.empty: + raise ValueError('CovFrame is empty') + if region is None: if len(self.contigs) == 1: cf = self.copy() @@ -523,6 +526,91 @@ def to_string(self): """ return self.df.to_csv(index=False, sep='\t') + def matrix_uniformity( + self, frac=0.1, n=20, m=None + ): + """ + Compute a matrix of fraction of sampled bases >= coverage with a + shape of (coverages, samples). + + Parameters + ---------- + frac : float, default: 0.1 + Fraction of data to be sampled (to speed up the process). + n : int or list, default: 20 + Number of evenly spaced points to generate for the x-axis. + Alternatively, positions can be manually specified by providing + a list. + m : float, optional + Maximum point in the x-axis. By default, it will be the maximum + depth in the entire dataset. + + Returns + ------- + pandas.DataFrame + Matrix of fraction of sampled bases >= coverage. + + Examples + -------- + + >>> import numpy as np + >>> from fuc import pycov + >>> data = { + ... 'Chromosome': ['chr1'] * 1000, + ... 'Position': np.arange(1000, 2000), + ... 'A': pycov.simulate(loc=35, scale=5), + ... 'B': pycov.simulate(loc=25, scale=7), + ... } + >>> cf = pycov.CovFrame.from_dict(data) + >>> cf.matrix_uniformity() + A B + Coverage + 1.000000 1.00 1.00 + 3.368421 1.00 1.00 + 5.736842 1.00 1.00 + 8.105263 1.00 1.00 + 10.473684 1.00 1.00 + 12.842105 1.00 0.98 + 15.210526 1.00 0.93 + 17.578947 1.00 0.87 + 19.947368 1.00 0.77 + 22.315789 1.00 0.64 + 24.684211 1.00 0.50 + 27.052632 0.97 0.35 + 29.421053 0.84 0.25 + 31.789474 0.70 0.16 + 34.157895 0.51 0.07 + 36.526316 0.37 0.07 + 38.894737 0.21 0.03 + 41.263158 0.09 0.02 + 43.631579 0.04 0.00 + 46.000000 0.02 0.00 + """ + # Sample positions to speed up the process. + df = self.df.sample(frac=frac) + + # Determine x-axis points. + if isinstance(n, list): + coverages = n + else: + if m is None: + m = df.iloc[:, 2:].max().max() + coverages = np.linspace(1, m, n, endpoint=True) + + data = {'Coverage': coverages} + + for sample in self.samples: + fractions = [] + for coverage in coverages: + count = sum(df[sample] >= coverage) + fractions.append(count / df.shape[0]) + data[sample] = fractions + + df = pd.DataFrame(data) + df = df.set_index("Coverage") + + return df + def plot_uniformity( self, mode='aggregated', frac=0.1, n=20, m=None, marker=None, ax=None, figsize=None, **kwargs @@ -592,26 +680,8 @@ def plot_uniformity( >>> cf.plot_uniformity(mode='individual') >>> plt.tight_layout() """ - # Sample positions to speed up the process. - df = self.df.sample(frac=frac) - - # Determine x-axis points. - if isinstance(n, list): - coverages = n - else: - if m is None: - m = df.iloc[:, 2:].max().max() - coverages = np.linspace(1, m, n, endpoint=True) - - data = {'Coverage': coverages} - - for sample in self.samples: - fractions = [] - for coverage in coverages: - count = sum(df[sample] >= coverage) - fractions.append(count / df.shape[0]) - data[sample] = fractions - df = pd.DataFrame(data) + df = self.matrix_uniformity(frac=frac, n=n, m=m) + df = df.reset_index() df = df.melt(id_vars=['Coverage'], var_name='Sample', value_name='Fraction') diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 1d2ba43..4485bab 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -3798,3 +3798,36 @@ def _plot_comparison_three(self, a, b, c, venn_kws): n = [x + y for x, y in zip(n, self.calculate_concordance(a[i], b[i], c[i]))] out = venn3(subsets=n[:-1], **venn_kws) return out + + def get_gene_concordance(self, gene, a, b): + """ + Test whether two samples have the identical mutation profile for + specified gene. + + Parameters + ---------- + gene : str + Name of the gene. + a, b : str + Sample name. + + Returns + ------- + bool + True if the two samples have the same mutation profile. + """ + df = self.df[self.df.Hugo_Symbol == gene] + df = df[df.Tumor_Sample_Barcode.isin([a, b])] + + # Both samples don't have any mutations. + if df.empty: + return True + + # Only one sample has mutation(s). + if df.Tumor_Sample_Barcode.nunique() == 1: + return False + + p1 = set(df[df.Tumor_Sample_Barcode == a].Protein_Change) + p2 = set(df[df.Tumor_Sample_Barcode == b].Protein_Change) + + return p1 == p2 diff --git a/fuc/api/pyvcf.py b/fuc/api/pyvcf.py index 7b7858f..650f5ff 100644 --- a/fuc/api/pyvcf.py +++ b/fuc/api/pyvcf.py @@ -902,6 +902,10 @@ def slice(file, regions, path=None): data += str(vcf.header) for region in regions: chrom, start, end = common.parse_region(region) + if np.isnan(start): + start = None + if np.isnan(end): + end = None for record in vcf.fetch(chrom, start, end): data += str(record) else: @@ -917,6 +921,108 @@ def slice(file, regions, path=None): return data +def plot_af_correlation(vf1, vf2, ax=None, figsize=None): + """ + Create a scatter plot showing the correlation of allele frequency between + two VCF files. + + This method will exclude the following sites: + + - non-onverlapping sites + - multiallelic sites + - sites with one or more missing genotypes + + Parameters + ---------- + vf1, vf2 : VcfFrame + VcfFrame objects to be compared. + ax : matplotlib.axes.Axes, optional + Pre-existing axes for the plot. Otherwise, crete a new one. + figsize : tuple, optional + Width, height in inches. Format: (float, float). + + Returns + ------- + matplotlib.axes.Axes + The matplotlib axes containing the plot. + + Examples + -------- + .. plot:: + :context: close-figs + + >>> from fuc import pyvcf, common + >>> import matplotlib.pyplot as plt + >>> data1 = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 104, 105], + ... 'ID': ['.', '.', '.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'G', 'T', 'A', 'C'], + ... 'ALT': ['A', 'C', 'C', 'G,A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT:DP', 'GT', 'GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1:30', '0/0', '1/1', '0/1', '1/1', '0/1'], + ... 'B': ['0/0:30', '0/0', '0/1', '0/1', '1/1', '0/1'], + ... 'C': ['1/1:30', '0/0', '1/1', '0/1', '1/1', '0/1'], + ... 'D': ['0/0:30', '0/0', '0/0', '0/0', '1/1', '0/1'], + ... 'E': ['0/0:30', '0/0', '0/0', '1/2', '1/1', '0/1'], + ... } + >>> vf1 = pyvcf.VcfFrame.from_dict([], data1) + >>> data2 = { + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [101, 102, 103, 104, 105], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['T', 'G', 'T', 'A', 'C'], + ... 'ALT': ['C', 'C', 'G,A', 'C', 'T'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'F': ['0/0', '0/1', '0/1', '1/1', '0/0'], + ... 'G': ['0/0', '0/1', '0/1', '1/1', './.'], + ... 'H': ['0/0', '0/1', '0/1', '1/1', '1/1'], + ... 'I': ['0/0', '0/1', '0/0', '1/1', '1/1'], + ... 'J': ['0/0', '0/1', '1/2', '1/1', '0/1'], + ... } + >>> vf2 = pyvcf.VcfFrame.from_dict([], data2) + >>> pyvcf.plot_af_correlation(vf1, vf2) + >>> plt.tight_layout() + """ + def one_gt(g): + alleles = g.split(':')[0].split('/') + alleles = [x for x in alleles if x != '0'] + return len(alleles) + + def one_row(r): + locus = f'{r.CHROM}-{r.POS}-{r.REF}-{r.ALT}' + ac = r[9:].apply(one_gt).sum() + if 'X' in r.CHROM or 'Y' in r.CHROM: + total = len(r[9:]) + else: + total = len(r[9:]) * 2 + af = ac / total + return pd.Series([locus, af]) + + s1 = vf1.filter_multialt().filter_empty(threshold=1).df.apply(one_row, axis=1) + s2 = vf2.filter_multialt().filter_empty(threshold=1).df.apply(one_row, axis=1) + + s1.columns = ['Locus', 'First'] + s2.columns = ['Locus', 'Second'] + + s1 = s1.set_index('Locus') + s2 = s2.set_index('Locus') + + df = pd.concat([s1, s2], axis=1).dropna() + + if ax is None: + fig, ax = plt.subplots(figsize=figsize) + + sns.scatterplot(data=df, x='First', y='Second', ax=ax) + + return ax + class VcfFrame: """ Class for storing VCF data. @@ -2347,6 +2453,9 @@ def plot_region( >>> vf.plot_region('NA18973', k='#AD_FRAC_ALT', label='ALT', ax=ax) >>> plt.tight_layout() """ + if self.df.empty: + raise ValueError('VcfFrame is empty') + sample = sample if isinstance(sample, str) else self.samples[sample] if region is None: @@ -3120,13 +3229,20 @@ def filter_bed(self, bed, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) - def filter_empty(self, opposite=False, as_index=False): - """Remove rows with no genotype calls at all. + def filter_empty(self, threshold=0, opposite=False, as_index=False): + """ + Remove rows with missing genotype calls. Parameters ---------- + threshold: int, default: 0 + Exclude the row if it has a number of missing genotypes that is + greater than or equal to this number. When 0 (default), exclude + rows where all of the samples have a missing genotype. opposite : bool, default: False If True, return rows that don't meet the said criteria. as_index : bool, default: False @@ -3143,56 +3259,75 @@ def filter_empty(self, opposite=False, as_index=False): >>> from fuc import pyvcf >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'], - ... 'POS': [100, 101, 102, 103], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'T', 'A', 'C'], - ... 'ALT': ['A', 'C', 'T', 'A'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': ['.', '.', '.', '.'], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/0', './.', '0/1', './.'], - ... 'Sara': ['0/0', './.', './.', './.'], - ... 'James': ['0/0', './.', '0/1', './.'], + ... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1', 'chr1'], + ... 'POS': [100, 101, 102, 103, 104], + ... 'ID': ['.', '.', '.', '.', '.'], + ... 'REF': ['G', 'T', 'A', 'C', 'C'], + ... 'ALT': ['A', 'C', 'T', 'A', 'T'], + ... 'QUAL': ['.', '.', '.', '.', '.'], + ... 'FILTER': ['.', '.', '.', '.', '.'], + ... 'INFO': ['.', '.', '.', '.', '.'], + ... 'FORMAT': ['GT', 'GT', 'GT', 'GT', 'GT'], + ... 'A': ['0/1', './.', './.', './.', './.'], + ... 'B': ['0/0', '0/1', './.', './.', './.'], + ... 'C': ['0/0', '0/0', '0/1', './.', './.'], ... } >>> vf = pyvcf.VcfFrame.from_dict([], data) - >>> vf.filter_indel().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara James - 0 chr1 100 . G A . . . GT 0/0 0/0 0/0 - 1 chr1 101 . T C . . . GT ./. ./. ./. - 2 chr1 102 . A T . . . GT 0/1 ./. 0/1 - 3 chr1 103 . C A . . . GT ./. ./. ./. + >>> vf.df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + 2 chr1 102 . A T . . . GT ./. ./. 0/1 + 3 chr1 103 . C A . . . GT ./. ./. ./. + 4 chr1 104 . C T . . . GT ./. ./. ./. - We can remove empty rows: + We can remove rows that are completely empty: >>> vf.filter_empty().df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara James - 0 chr1 100 . G A . . . GT 0/0 0/0 0/0 - 1 chr1 102 . A T . . . GT 0/1 ./. 0/1 + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + 2 chr1 102 . A T . . . GT ./. ./. 0/1 - We can also select those rows: + We can remove rows where at least two samples have missing genotype: + + >>> vf.filter_empty(threshold=2).df + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 100 . G A . . . GT 0/1 0/0 0/0 + 1 chr1 101 . T C . . . GT ./. 0/1 0/0 + + We can show rows that are completely empty: >>> vf.filter_empty(opposite=True).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven Sara James - 0 chr1 101 . T C . . . GT ./. ./. ./. - 1 chr1 103 . C A . . . GT ./. ./. ./. + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C + 0 chr1 103 . C A . . . GT ./. ./. ./. + 1 chr1 104 . C T . . . GT ./. ./. ./. Finally, we can return boolean index array from the filtering: >>> vf.filter_empty(as_index=True) 0 True - 1 False + 1 True 2 True 3 False + 4 False dtype: bool """ - f = lambda r: not all(r.iloc[9:].apply(gt_miss)) - i = self.df.apply(f, axis=1) + if not threshold: + threshold = self.shape[1] + + def one_row(r): + s = r[9:].apply(gt_miss) + return s.sum() < threshold + + i = self.df.apply(one_row, axis=1) + if opposite: i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_indel(self, opposite=False, as_index=False): @@ -3264,6 +3399,8 @@ def filter_indel(self, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_flagall(self, flags, opposite=False, as_index=False): @@ -3345,6 +3482,8 @@ def f(r): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_flagany(self, flags, opposite=False, as_index=False): @@ -3426,6 +3565,8 @@ def f(r): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_multialt(self, opposite=False, as_index=False): @@ -3499,6 +3640,8 @@ def filter_multialt(self, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_pass(self, opposite=False, as_index=False): @@ -3570,10 +3713,13 @@ def filter_pass(self, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_phased(self, opposite=False, as_index=False): - """Remove rows with phased genotypes. + """ + Remove rows with phased genotypes. Parameters ---------- @@ -3641,10 +3787,13 @@ def filter_phased(self, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_polyp(self, opposite=False, as_index=False): - """Remove rows with a polyploid genotype call. + """ + Remove rows with a polyploid genotype call. Parameters ---------- @@ -3712,6 +3861,8 @@ def filter_polyp(self, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_qual(self, threshold, opposite=False, as_index=False): @@ -3790,6 +3941,8 @@ def one_row(r): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_sampall(self, samples=None, opposite=False, as_index=False): @@ -3886,6 +4039,8 @@ def filter_sampall(self, samples=None, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_sampany(self, samples=None, opposite=False, as_index=False): @@ -3982,6 +4137,8 @@ def filter_sampany(self, samples=None, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def filter_sampnum(self, threshold, opposite=False, as_index=False): @@ -4067,8 +4224,9 @@ def f(r): i = ~i if as_index: return i - vf = self.__class__(self.copy_meta(), self.df[i]) - return vf + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) + return self.__class__(self.copy_meta(), self.df[i]) def filter_vcf(self, vcf, opposite=False, as_index=False): """ @@ -4167,6 +4325,8 @@ def filter_vcf(self, vcf, opposite=False, as_index=False): i = ~i if as_index: return i + if i.empty: + return self.__class__(self.copy_meta(), self.copy_df()) return self.__class__(self.copy_meta(), self.df[i]) def subtract(self, a, b): diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index fd5f02e..cbbabff 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -5,15 +5,18 @@ import pysam description = """ -Slice a SAM/BAM/CRAM file. +Slice an alignment file (SAM/BAM/CRAM). """ epilog = f""" -[Example] Slice a BAM file: - $ fuc {api.common._script_name()} in.bam chr1:100-200 chr2:100-200 > out.bam +[Example] Specify regions manually: + $ fuc {api.common._script_name()} in.bam 1:100-300 2:400-700 > out.bam + +[Example] Speicfy regions with a BED file: + $ fuc {api.common._script_name()} in.bam regions.bed > out.bam [Example] Slice a CRAM file: - $ fuc {api.common._script_name()} in.bam chr1:100-200 --format CRAM --fasta ref.fa > out.cram + $ fuc {api.common._script_name()} in.bam regions.bed --format CRAM --fasta ref.fa > out.cram """ def create_parser(subparsers): @@ -26,12 +29,21 @@ def create_parser(subparsers): ) parser.add_argument( 'bam', - help='Alignment file.' + help="Input alignment file must be already indexed (.bai) to allow \n" + "random access. You can index an alignment file with the \n" + "bam-index command." ) parser.add_argument( 'regions', nargs='+', - help="List of regions to be sliced ('chrom:start-end')." + help="One or more regions to be sliced. Each region must have the \n" + "format chrom:start-end and be a half-open interval with \n" + "(start, end]. This means, for example, chr1:100-103 will \n" + "extract positions 101, 102, and 103. Alternatively, you can \n" + "provide a BED file (compressed or uncompressed) to specify \n" + "regions. Note that the 'chr' prefix in contig names (e.g. \n" + "'chr1' vs. '1') will be automatically added or removed as \n" + "necessary to match the input VCF's contig names." ) parser.add_argument( '--format', @@ -65,6 +77,17 @@ def main(args): else: stdout_method = sys.stdout.write + # Parse the regions argument. + if '.bed' in args.regions[0]: + args.regions = api.pybed.BedFrame.from_file(args.regions[0]).to_regions() + else: + args.regions = api.common.sort_regions(args.regions) + + if api.pybam.has_chr_prefix(args.bam): + args.regions = api.common.update_chr_prefix(args.regions, mode='add') + else: + args.regions = api.common.update_chr_prefix(args.regions, mode='remove') + alignments = pysam.view(args.bam, *args.regions, *options) stdout_method(alignments) diff --git a/fuc/cli/fa_filter.py b/fuc/cli/fa_filter.py new file mode 100644 index 0000000..83fcae0 --- /dev/null +++ b/fuc/cli/fa_filter.py @@ -0,0 +1,62 @@ +import sys +import subprocess +import os + +from .. import api + +from Bio import SeqIO + +description = """ +Filter sequence records in a FASTA file. +""" + +epilog = f""" +[Example] Select certain contigs: + $ fuc {api.common._script_name()} in.fasta --contigs chr1 chr2 > out.fasta + +[Example] Select certain contigs: + $ fuc {api.common._script_name()} in.fasta --contigs contigs.list --exclude > out.fasta +""" + +def create_parser(subparsers): + parser = api.common._add_parser( + subparsers, + api.common._script_name(), + description=description, + epilog=epilog, + help='Filter sequence records in a FASTA file', + ) + parser.add_argument( + 'fasta', + help='FASTA file (compressed or uncompressed).' + ) + parser.add_argument( + '--contigs', + metavar='TEXT', + nargs='+', + help='One or more contigs to be selected. Alternatively, you can \n' + 'provide a file containing one contig per line. ' + ) + parser.add_argument( + '--exclude', + action='store_true', + help='Exclude specified contigs.' + ) + +def main(args): + if os.path.exists(args.contigs[0]): + contigs = api.common.convert_file2list(args.contigs[0]) + else: + contigs = args.contigs + + records = [] + + for record in SeqIO.parse(args.fasta, 'fasta'): + if args.exclude: + if record.id not in contigs: + records.append(record) + else: + if record.id in contigs: + records.append(record) + + SeqIO.write(records, sys.stdout, 'fasta') diff --git a/fuc/cli/ngs_bam2fq.py b/fuc/cli/ngs_bam2fq.py new file mode 100644 index 0000000..da74e58 --- /dev/null +++ b/fuc/cli/ngs_bam2fq.py @@ -0,0 +1,120 @@ +import os +import sys +import shutil + +from .. import api + +import pandas as pd + +description = f""" +Pipeline for converting BAM files to FASTQ files. + +This pipeline will assume input BAM files consist of paired-end reads +and output two zipped FASTQ files for each sample (forward and reverse +reads). That is, SAMPLE.bam will produce SAMPLE_R1.fastq.gz and +SAMPLE_R2.fastq.gz. + +External dependencies: + - SGE: Required for job submission (i.e. qsub). + - SAMtools: Required for BAM to FASTQ conversion. + +Manifest columns: + - BAM: BAM file. +""" + +epilog = f""" +[Example] Specify queue: + $ fuc {api.common._script_name()} \\ + manifest.csv \\ + output_dir \\ + "-q queue_name -pe pe_name 10" \\ + --thread 10 + +[Example] Specify nodes: + $ fuc {api.common._script_name()} \\ + manifest.csv \\ + output_dir \\ + "-l h='node_A|node_B' -pe pe_name 10" \\ + --thread 10 +""" + +def create_parser(subparsers): + parser = api.common._add_parser( + subparsers, + api.common._script_name(), + description=description, + epilog=epilog, + help='Pipeline for converting BAM files to FASTQ files.', + ) + parser.add_argument( + 'manifest', + help='Sample manifest CSV file.' + ) + parser.add_argument( + 'output', + type=os.path.abspath, + help='Output directory.' + ) + parser.add_argument( + 'qsub', + type=str, + help="SGE resoruce to request with qsub for BAM to FASTQ \n" + "conversion. Since this oppoeration supports multithreading, \n" + "it is recommended to speicfy a parallel environment (PE) \n" + "to speed up the process (also see --thread)." + ) + parser.add_argument( + '--thread', + metavar='INT', + type=int, + default=1, + help='Number of threads to use (default: 1).' + ) + parser.add_argument( + '--force', + action='store_true', + help='Overwrite the output directory if it already exists.' + ) + +def main(args): + if os.path.exists(args.output) and args.force: + shutil.rmtree(args.output) + + os.mkdir(args.output) + os.mkdir(f'{args.output}/shell') + os.mkdir(f'{args.output}/log') + + with open(f'{args.output}/command.txt', 'w') as f: + f.write(' '.join(sys.argv) + '\n') + + df = pd.read_csv(args.manifest) + + names = [] + + for i, r in df.iterrows(): + name = os.path.basename(r.BAM).replace('.bam', '') + names.append(name) + with open(f'{args.output}/shell/{name}.sh', 'w') as f: + f.write( +f"""#!/bin/bash + +# Activate conda environment. +source activate {api.common.conda_env()} + +# Convert BAM to FASTQ. +samtools collate -O -@ {args.thread} {r.BAM} | samtools fastq -0 /dev/null -1 {args.output}/{name}_R1.fastq.gz -2 {args.output}/{name}_R2.fastq.gz -s /dev/null +""") + + with open(f'{args.output}/shell/qsubme.sh', 'w') as f: + f.write( +f"""#!/bin/bash + +p={args.output} + +names=({" ".join(names)}) + +for name in ${{names[@]}} +do + qsub {args.qsub} -S /bin/bash -e $p/log -o $p/log -N $name $p/shell/$name.sh +done +""") diff --git a/fuc/cli/ngs_hc.py b/fuc/cli/ngs_hc.py index 30979e5..0e931d0 100644 --- a/fuc/cli/ngs_hc.py +++ b/fuc/cli/ngs_hc.py @@ -103,6 +103,11 @@ def create_parser(subparsers): action='store_true', help='Keep temporary files.' ) + parser.add_argument( + '--posix', + action='store_true', + help='Optimize for a POSIX filesystem.' + ) def main(args): if os.path.exists(args.output) and args.force: @@ -167,6 +172,15 @@ def main(args): for chrom in chroms: with open(f'{args.output}/shell/S2-{chrom}.sh', 'w') as f: + #################### + # POSIX filesystem # + #################### + + if args.posix: + export = 'export TILEDB_DISABLE_FILE_LOCKING=1' + else: + export = '# export TILEDB_DISABLE_FILE_LOCKING=1' + #################### # GenomicsDBImport # #################### @@ -208,6 +222,9 @@ def main(args): f.write( f"""#!/bin/bash +# Optimize for POSIX filesystem. +{export} + # Activate conda environment. source activate {api.common.conda_env()} diff --git a/fuc/cli/vcf_index.py b/fuc/cli/vcf_index.py index 7e3ce24..4ecd621 100644 --- a/fuc/cli/vcf_index.py +++ b/fuc/cli/vcf_index.py @@ -11,10 +11,10 @@ """ epilog = f""" -[Example] Index a compressed VCF file. +[Example] Index a compressed VCF file: $ fuc {api.common._script_name()} in.vcf.gz -[Example] Index an uncompressed VCF file. Will create a compressed file first. +[Example] Index an uncompressed VCF file (will create a compressed VCF first): $ fuc {api.common._script_name()} in.vcf """ diff --git a/fuc/cli/vcf_slice.py b/fuc/cli/vcf_slice.py index 4a172db..3acfba2 100644 --- a/fuc/cli/vcf_slice.py +++ b/fuc/cli/vcf_slice.py @@ -7,11 +7,14 @@ """ epilog = f""" -[Example] Specify regions manually. -$ fuc {api.common._script_name()} in.vcf.gz 1:100-300 2:400-700 > out.vcf +[Example] Specify regions manually: + $ fuc {api.common._script_name()} in.vcf.gz 1:100-300 2:400-700 > out.vcf -[Example] Speicfy regions with a BED file. -$ fuc {api.common._script_name()} in.vcf.gz regions.bed > out.vcf +[Example] Speicfy regions with a BED file: + $ fuc {api.common._script_name()} in.vcf.gz regions.bed > out.vcf + +[Example] Output a compressed file: + $ fuc {api.common._script_name()} in.vcf.gz regions.bed | fuc fuc-bgzip > out.vcf.gz """ def create_parser(subparsers): @@ -24,8 +27,10 @@ def create_parser(subparsers): ) parser.add_argument( 'vcf', - help='Input VCF file must be already BGZF compressed (.gz) and \n' - 'indexed (.tbi) to allow random access.' + help="Input VCF file must be already BGZF compressed (.gz) and \n" + "indexed (.tbi) to allow random access. A VCF file can be \n" + "compressed with the fuc-bgzip command and indexed with the \n" + "vcf-index command." ) parser.add_argument( 'regions', diff --git a/fuc/version.py b/fuc/version.py index cf7b6d6..1bf3675 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.27.0' +__version__ = '0.28.0'