From b1a3125386086825a26fbd29aaf3e8478040bd89 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 08:26:39 +0900 Subject: [PATCH 01/29] Bump up version number --- fuc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuc/version.py b/fuc/version.py index ef91994..a842d05 100644 --- a/fuc/version.py +++ b/fuc/version.py @@ -1 +1 @@ -__version__ = '0.14.0' +__version__ = '0.15.0' From b54e10e9271e2fb66bff5d342cad5b9199b14468 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 08:49:03 +0900 Subject: [PATCH 02/29] Update vcf_filter command --- docs/cli.rst | 51 +++++++++++++++++------------ fuc/cli/vcf_filter.py | 75 +++++++++++++++++++++++++++---------------- 2 files changed, 78 insertions(+), 48 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 4980249..23c2e50 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -558,36 +558,45 @@ vcf_filter .. code-block:: text $ fuc vcf_filter -h - usage: fuc vcf_filter [-h] [--greedy] [--opposite] [--samples PATH] + usage: fuc vcf_filter [-h] [--expr TEXT] [--samples PATH] + [--drop_duplicates [TEXT ...]] [--greedy] [--opposite] [--filter_empty] - vcf expr + vcf This command will filter a VCF file (both zipped and unzipped). It essentially - wraps the 'pyvcf.VcfFrame.markmiss' method from the fuc API. + wraps multiple methods from the fuc API. usage examples: - $ fuc vcf_filter in.vcf 'GT == "0/0"' > out.vcf - $ fuc vcf_filter in.vcf 'GT != "0/0"' > out.vcf - $ fuc vcf_filter in.vcf 'DP < 30' > out.vcf - $ fuc vcf_filter in.vcf 'DP < 30' --greedy > out.vcf - $ fuc vcf_filter in.vcf 'AD[1] < 10' --greedy > out.vcf - $ fuc vcf_filter in.vcf 'AD[1] < 10 and DP < 30' --greedy > out.vcf - $ fuc vcf_filter in.vcf 'AD[1] < 10 or DP < 30' --greedy > out.vcf - $ fuc vcf_filter in.vcf 'AD[1] < 10 or DP < 30' --opposite > out.vcf - $ fuc vcf_filter in.vcf 'np.mean(AD) < 10' --greedy --samples sample.list > out.vcf + $ fuc vcf_filter in.vcf --expr 'GT == "0/0"' > out.vcf + $ fuc vcf_filter in.vcf --expr 'GT != "0/0"' > out.vcf + $ fuc vcf_filter in.vcf --expr 'DP < 30' > out.vcf + $ fuc vcf_filter in.vcf --expr 'DP < 30' --greedy > out.vcf + $ fuc vcf_filter in.vcf --expr 'AD[1] < 10' --greedy > out.vcf + $ fuc vcf_filter in.vcf --expr 'AD[1] < 10 and DP < 30' --greedy > out.vcf + $ fuc vcf_filter in.vcf --expr 'AD[1] < 10 or DP < 30' --greedy > out.vcf + $ fuc vcf_filter in.vcf --expr 'AD[1] < 10 or DP < 30' --opposite > out.vcf + $ fuc vcf_filter in.vcf --expr 'np.mean(AD) < 10' --greedy --samples sample.list > out.vcf + $ fuc vcf_filter in.vcf --drop_duplicates CHROM POS REF ALT > out.vcf + $ fuc vcf_filter in.vcf --filter_empty > out.vcf positional arguments: - vcf VCF file - expr expression to evaluate + vcf VCF file optional arguments: - -h, --help show this help message and exit - --greedy use this flag to mark even ambiguous genotypes as missing - --opposite use this flag to mark all genotypes that do not satisfy the - query expression as missing and leave those that do intact - --samples PATH file of sample names to apply the marking (one sample per - line) - --filter_empty use this flag to remove rows with no genotype calls at all + -h, --help show this help message and exit + --expr TEXT expression to evaluate + --samples PATH file of sample names to apply the marking (one sample + per line) + --drop_duplicates [TEXT ...] + only consider certain columns for identifying + duplicates, by default use all of the columns. + --greedy use this flag to mark even ambiguous genotypes as + missing + --opposite use this flag to mark all genotypes that do not + satisfy the query expression as missing and leave + those that do intact + --filter_empty use this flag to remove rows with no genotype calls at + all vcf_merge ========= diff --git a/fuc/cli/vcf_filter.py b/fuc/cli/vcf_filter.py index dc28ffc..9868a50 100644 --- a/fuc/cli/vcf_filter.py +++ b/fuc/cli/vcf_filter.py @@ -3,18 +3,20 @@ description = f""" This command will filter a VCF file (both zipped and unzipped). It essentially -wraps the 'pyvcf.VcfFrame.markmiss' method from the fuc API. +wraps multiple methods from the fuc API. usage examples: - $ fuc {api.common._script_name()} in.vcf 'GT == "0/0"' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'GT != "0/0"' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'DP < 30' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'DP < 30' --greedy > out.vcf - $ fuc {api.common._script_name()} in.vcf 'AD[1] < 10' --greedy > out.vcf - $ fuc {api.common._script_name()} in.vcf 'AD[1] < 10 and DP < 30' --greedy > out.vcf - $ fuc {api.common._script_name()} in.vcf 'AD[1] < 10 or DP < 30' --greedy > out.vcf - $ fuc {api.common._script_name()} in.vcf 'AD[1] < 10 or DP < 30' --opposite > out.vcf - $ fuc {api.common._script_name()} in.vcf 'np.mean(AD) < 10' --greedy --samples sample.list > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'GT == "0/0"' > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'GT != "0/0"' > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'DP < 30' > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'DP < 30' --greedy > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'AD[1] < 10' --greedy > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'AD[1] < 10 and DP < 30' --greedy > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'AD[1] < 10 or DP < 30' --greedy > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'AD[1] < 10 or DP < 30' --opposite > out.vcf + $ fuc {api.common._script_name()} in.vcf --expr 'np.mean(AD) < 10' --greedy --samples sample.list > out.vcf + $ fuc {api.common._script_name()} in.vcf --drop_duplicates CHROM POS REF ALT > out.vcf + $ fuc {api.common._script_name()} in.vcf --filter_empty > out.vcf """ def create_parser(subparsers): @@ -25,7 +27,23 @@ def create_parser(subparsers): description=description, ) parser.add_argument('vcf', help='VCF file') - parser.add_argument('expr', help='expression to evaluate') + parser.add_argument( + '--expr', + metavar='TEXT', + help='expression to evaluate' + ) + parser.add_argument( + '--samples', + metavar='PATH', + help='file of sample names to apply the marking (one sample per line)' + ) + parser.add_argument( + '--drop_duplicates', + metavar='TEXT', + nargs='*', + help='only consider certain columns for identifying duplicates, ' + 'by default use all of the columns.' + ) parser.add_argument( '--greedy', action='store_true', @@ -35,12 +53,7 @@ def create_parser(subparsers): '--opposite', action='store_true', help='use this flag to mark all genotypes that do not satisfy the ' - 'query expression as missing and leave those that do intact' - ) - parser.add_argument( - '--samples', - metavar='PATH', - help='file of sample names to apply the marking (one sample per line)' + 'query expression as missing and leave those that do intact' ) parser.add_argument( '--filter_empty', @@ -50,14 +63,22 @@ def create_parser(subparsers): def main(args): vf = api.pyvcf.VcfFrame.from_file(args.vcf) - if args.samples is None: - samples = None - else: - samples = pd.read_table(args.samples, header=None)[0].to_list() - filtered_vf = vf.markmiss(args.expr, - greedy=args.greedy, - opposite=args.opposite, - samples=samples) + + if args.expr is not None: + if args.samples is None: + samples = None + else: + samples = pd.read_table(args.samples, header=None)[0].to_list() + + vf = vf.markmiss(args.expr, + greedy=args.greedy, + opposite=args.opposite, + samples=samples) + + if args.drop_duplicates is not None: + vf = vf.drop_duplicates(args.drop_duplicates) + if args.filter_empty: - filtered_vf = filtered_vf.filter_empty() - print(filtered_vf.to_string(), end='') + vf = vf.filter_empty() + + print(vf.to_string(), end='') From 69e32adddde1acdb60f16ef3319aa2b2bb0612a9 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 10:53:19 +0900 Subject: [PATCH 03/29] Update tbl_sum command --- CHANGELOG.rst | 6 ++++++ fuc/cli/tbl_sum.py | 49 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e1a958f..309c51a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,12 @@ Changelog ********* +0.15.0 (in development) +----------------------- + +* Update :command:`vcf_filter` command. +* Update :command:`tbl_sum` command. + 0.14.0 (2021-06-20) ------------------- diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index 85bf485..10a4952 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -9,6 +9,7 @@ usage examples: $ fuc {api.common._script_name()} table.tsv + $ fuc {api.common._script_name()} table.csv --sep , """ def create_parser(subparsers): @@ -51,10 +52,10 @@ def create_parser(subparsers): '--keep_default_na', action='store_false', help='whether or not to include the default NaN values when ' - 'parsing the data (see `pandas.read_table` for details)' + "parsing the data (see 'pandas.read_table' for details)" ) parser.add_argument( - '--query', + '--expr', metavar='TEXT', help='query the columns of a pandas.DataFrame with a ' """boolean expression (e.g. `--query "A == 'yes'"`)""" @@ -66,6 +67,13 @@ def create_parser(subparsers): help='columns to be summarized (by default, all columns ' 'will be included)' ) + parser.add_argument( + '--dtypes', + metavar='PATH', + help="file of column names and their data types (etheir " + "'categorical' or 'numeric'); one tab-delimited pair of column " + "name and data type per line" + ) def main(args): if args.skiprows is None: @@ -74,23 +82,44 @@ def main(args): skiprows = [int(x) for x in args.skiprows.split(',') if x] else: skiprows = int(args.skiprows) + df = pd.read_table(args.table_file, sep=args.sep, skiprows=skiprows, na_values=args.na_values, keep_default_na=args.keep_default_na) - if args.query: - df = df.query(args.query) + + if args.dtypes is not None: + with open(args.dtypes) as f: + for line in f: + fields = line.strip().split('\t') + col = fields[0] + dtype = fields[1] + if dtype == 'numeric': + dtype = float + elif dtype == 'categorical': + dtype = 'category' + else: + raise TypeError("Data type must be either numeric or " + "categorical.") + df[col] = df[col].astype(dtype) + + if args.expr: + df = df.query(args.expr) + if args.columns: headers = args.columns else: headers = df.columns + print('='*70) + for header in headers: - column = df[header] - if is_numeric_dtype(column): - print(column.describe()) - print('='*70) + s = df[header] + print(f'Name: {header}') + print('Summary:') + if is_numeric_dtype(s): + print(s.describe().to_string()) else: - print(column.value_counts()) - print('='*70) + print(s.value_counts().sort_index().to_string()) + print('='*70) From 8fc919a352ec9831b40ab6595238d9ab7a6f747b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 10:56:56 +0900 Subject: [PATCH 04/29] Update CLI --- fuc/cli/tbl_sum.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index 10a4952..f984825 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -100,8 +100,8 @@ def main(args): elif dtype == 'categorical': dtype = 'category' else: - raise TypeError("Data type must be either numeric or " - "categorical.") + raise TypeError("Data type must be either 'numeric' or " + "'categorical'.") df[col] = df[col].astype(dtype) if args.expr: From e0624f11ceff89fc4474d815bd8c5486e9165571 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 11:11:21 +0900 Subject: [PATCH 05/29] Update CLI --- fuc/cli/tbl_sum.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index f984825..bfe6fa5 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -117,6 +117,7 @@ def main(args): for header in headers: s = df[header] print(f'Name: {header}') + print(f'Count: {len(s)}') print('Summary:') if is_numeric_dtype(s): print(s.describe().to_string()) From d045a5a62533c56e578da9d9dd900efeed7a0f3a Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 12:21:01 +0900 Subject: [PATCH 06/29] Update docs --- README.rst | 2 +- docs/cli.rst | 10 +++++++--- docs/create.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index e383d93..dbb8106 100644 --- a/README.rst +++ b/README.rst @@ -249,7 +249,7 @@ To create read depth profile of a region from a CRAM file: >>> from fuc import pycov >>> cf = pycov.CovFrame.from_file('HG00525.final.cram', zero=True, ... region='chr12:21161194-21239796', names=['HG00525']) - >>> cf.plot_region('chr12', start=21161194, end=21239796) + >>> cf.plot_region('chr12:21161194-21239796') .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/coverage.png diff --git a/docs/cli.rst b/docs/cli.rst index 23c2e50..95a3133 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -517,7 +517,7 @@ tbl_sum $ fuc tbl_sum -h usage: fuc tbl_sum [-h] [--sep TEXT] [--skiprows TEXT] [--na_values TEXT [TEXT ...]] [--keep_default_na] - [--query TEXT] [--columns TEXT [TEXT ...]] + [--expr TEXT] [--columns TEXT [TEXT ...]] [--dtypes PATH] table_file This command will summarize a table file. It essentially wraps the @@ -526,6 +526,7 @@ tbl_sum usage examples: $ fuc tbl_sum table.tsv + $ fuc tbl_sum table.csv --sep , positional arguments: table_file table file @@ -545,12 +546,15 @@ tbl_sum '-NaN', '-nan', '1.#IND', '1.#QNAN', '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null') --keep_default_na whether or not to include the default NaN values when - parsing the data (see `pandas.read_table` for details) - --query TEXT query the columns of a pandas.DataFrame with a boolean + parsing the data (see 'pandas.read_table' for details) + --expr TEXT query the columns of a pandas.DataFrame with a boolean expression (e.g. `--query "A == 'yes'"`) --columns TEXT [TEXT ...] columns to be summarized (by default, all columns will be included) + --dtypes PATH file of column names and their data types (etheir + 'categorical' or 'numeric'); one tab-delimited pair of + column name and data type per line vcf_filter ========== diff --git a/docs/create.py b/docs/create.py index 0e729e0..2daa2bc 100644 --- a/docs/create.py +++ b/docs/create.py @@ -237,7 +237,7 @@ >>> from fuc import pycov >>> cf = pycov.CovFrame.from_file('HG00525.final.cram', zero=True, ... region='chr12:21161194-21239796', names=['HG00525']) - >>> cf.plot_region('chr12', start=21161194, end=21239796) + >>> cf.plot_region('chr12:21161194-21239796') .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/coverage.png From da67fadc0e15e60aa1da9b4e0a44a9ee1fe66c62 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Mon, 21 Jun 2021 17:20:20 +0900 Subject: [PATCH 07/29] Update API --- CHANGELOG.rst | 3 ++ docs/tutorials.rst | 6 +-- fuc/api/pymaf.py | 116 +++++++++++++++++++++++++++++++++++---------- 3 files changed, 97 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 309c51a..eda066e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,9 @@ Changelog * Update :command:`vcf_filter` command. * Update :command:`tbl_sum` command. +* Add ``samples`` and ``shape`` attributes to :class:`pymaf.AnnFrame` class. +* Rename :meth:`pymaf.MafFrame.compute_genes/tmb/waterfall` methods to :meth:`pymaf.MafFrame.matrix_genes/tmb/waterfall`. +* Add ``keep_empty`` argument to :meth:`pymaf.MafFrame.matrix_waterfall/plot_oncoplot/plot_waterfall` methods. 0.14.0 (2021-06-20) ------------------- diff --git a/docs/tutorials.rst b/docs/tutorials.rst index e9e3032..b671fb7 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -36,14 +36,14 @@ We can (relatively) easily achieve above thanks to the LEGO block-like plotting fig.suptitle('Customized oncoplot', fontsize=20) # Create the TMB plot. - samples = list(mf.compute_waterfall(count).columns) + samples = list(mf.matrix_waterfall(count).columns) mf.plot_tmb(ax=ax1, samples=samples) ax1.set_xlabel('') ax1.spines['right'].set_visible(False) ax1.spines['top'].set_visible(False) ax1.spines['bottom'].set_visible(False) ax1.set_ylabel('TMB', fontsize=label_fontsize) - ax1.set_yticks([0, mf.compute_tmb().sum(axis=1).max()]) + ax1.set_yticks([0, mf.matrix_tmb().sum(axis=1).max()]) ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) ax2.remove() @@ -60,7 +60,7 @@ We can (relatively) easily achieve above thanks to the LEGO block-like plotting ax4.spines['top'].set_visible(False) ax4.set_yticks([]) ax4.set_xlabel('Samples', fontsize=label_fontsize) - ax4.set_xticks([0, mf.compute_genes(10, mode='samples').sum(axis=1).max()]) + ax4.set_xticks([0, mf.matrix_genes(10, mode='samples').sum(axis=1).max()]) ax4.set_ylim(-0.5, count-0.5) ax4.tick_params(axis='x', which='major', labelsize=ticklabels_fontsize) diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 9a2b3f3..d89ac4b 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -283,8 +283,6 @@ class AnnFrame: Sara_N Sara Normal 57 Sara_T Sara Tumor 57 """ - def __init__(self, df): - self._df = self._check_df(df) def _check_df(self, df): if type(df.index) == pd.RangeIndex: @@ -292,6 +290,9 @@ def _check_df(self, df): raise ValueError(m) return df + def __init__(self, df): + self._df = self._check_df(df) + @property def df(self): """pandas.DataFrame : DataFrame containing sample annotation data.""" @@ -301,6 +302,33 @@ def df(self): def df(self, value): self._df = self._check_df(value) + @property + def samples(self): + """list : List of the sample names.""" + return list(self.df.index.to_list()) + + @property + def shape(self): + """tuple : Dimensionality of AnnFrame (samples, annotations).""" + return self.df.shape + + def filter_mf(self, mf): + """ + Filter the AnnFrame for the samples in the MafFrame. + + Parameters + ---------- + mf : MafFrame + MafFrame containing target samples. + + Returns + ------- + AnnFrame + Filtered AnnFrame object. + """ + df = self.df.loc[mf.samples] + return self.__class__(df) + @classmethod def from_dict(cls, data, sample_col='Tumor_Sample_Barcode'): """Construct AnnFrame from dict of array-like or dicts. @@ -765,8 +793,10 @@ def one_row(r): return cls(df) - def compute_genes(self, count=10, mode='variants'): - """Compute a matrix of counts for genes and variant classifications. + def matrix_genes(self, count=10, mode='variants'): + """ + Compute a matrix of variant counts with a shape of (genes, variant + classifications). Parameters ---------- @@ -796,7 +826,7 @@ def compute_genes(self, count=10, mode='variants'): df = df[:count] df = df.rename_axis(None, axis=1) elif mode == 'samples': - df = self.compute_waterfall(count) + df = self.matrix_waterfall(count) df = df.apply(lambda r: r.value_counts(), axis=1) for varcls in NONSYN_NAMES + ['Multi_Hit']: if varcls not in df.columns: @@ -807,8 +837,10 @@ def compute_genes(self, count=10, mode='variants'): raise ValueError(f'Found incorrect mode: {mode}') return df - def compute_tmb(self): - """Compute a matrix of counts for samples and variant classifications. + def matrix_tmb(self): + """ + Compute a matrix of variant counts with a shape of (samples, variant + classifications). Returns ------- @@ -832,13 +864,22 @@ def compute_tmb(self): df = df.rename_axis(None, axis=1) return df - def compute_waterfall(self, count=10): - """Compute a matrix of variant classifications for genes and samples. + def matrix_waterfall(self, count=10, samples=None, keep_empty=False): + """ + Compute a matrix of variant classifications with a shape of + (genes, samples). + + If there are multiple variant classifications available for a given + cell, they will be replaced as 'Multi_Hit'. Parameters ---------- count : int, default: 10 - Number of top mutated genes to display. + Number of top mutated genes to include. + samples : list, optional + List of samples that should be used to compute the matrix. + keep_empty : bool, default: False + If True, keep samples with all ``NaN``'s. Returns ------- @@ -846,6 +887,10 @@ def compute_waterfall(self, count=10): Dataframe containing waterfall data. """ df = self.df[self.df.Variant_Classification.isin(NONSYN_NAMES)] + + if samples is not None: + df = df[df.Tumor_Sample_Barcode.isin(samples)] + f = lambda x: ''.join(x) if len(x) == 1 else 'Multi_Hit' df = df.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode'])[ 'Variant_Classification'].apply(f).to_frame() @@ -860,8 +905,9 @@ def compute_waterfall(self, count=10): # Select the top mutated genes. df = df[:count] - # Remove columns (samples) with all NaN's. - df = df.dropna(axis=1, how='all') + # Drop samples with all NaN's. + if not keep_empty: + df = df.dropna(axis=1, how='all') # Sort the columns (samples). c = df.applymap(lambda x: 0 if pd.isnull(x) else 1).sort_values( @@ -921,7 +967,7 @@ def plot_genes( colors = NONSYN_COLORS + ['k'] else: raise ValueError(f'Found incorrect mode: {mode}') - df = self.compute_genes(count, mode=mode) + df = self.matrix_genes(count, mode=mode) df = df.iloc[::-1] if ax is None: fig, ax = plt.subplots(figsize=figsize) @@ -932,7 +978,7 @@ def plot_genes( return ax def plot_oncoplot( - self, count=10, figsize=(15, 10), label_fontsize=15, + self, count=10, keep_empty=False, figsize=(15, 10), label_fontsize=15, ticklabels_fontsize=15, legend_fontsize=15 ): """Create an oncoplot. @@ -941,6 +987,8 @@ def plot_oncoplot( ---------- count : int, default: 10 Number of top mutated genes to display. + keep_empty : bool, default: False + If True, display samples that do not have any mutations. figsize : tuple, default: (15, 10) Width, height in inches. Format: (float, float). label_fontsize : float, default: 15 @@ -967,14 +1015,14 @@ def plot_oncoplot( [[ax1, ax2], [ax3, ax4], [ax5, ax6]] = axes # Create the TMB plot. - samples = list(self.compute_waterfall(count).columns) + samples = list(self.matrix_waterfall(count=count, keep_empty=keep_empty).columns) self.plot_tmb(ax=ax1, samples=samples) ax1.set_xlabel('') ax1.spines['right'].set_visible(False) ax1.spines['top'].set_visible(False) ax1.spines['bottom'].set_visible(False) ax1.set_ylabel('TMB', fontsize=label_fontsize) - ax1.set_yticks([0, self.compute_tmb().sum(axis=1).max()]) + ax1.set_yticks([0, self.matrix_tmb().sum(axis=1).max()]) ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) @@ -982,7 +1030,7 @@ def plot_oncoplot( ax2.remove() # Create the waterfall plot. - self.plot_waterfall(count=count, ax=ax3, linewidths=1) + self.plot_waterfall(count=count, ax=ax3, linewidths=1, keep_empty=keep_empty) ax3.set_xlabel('') ax3.tick_params(axis='y', which='major', labelrotation=0, labelsize=ticklabels_fontsize) @@ -994,7 +1042,7 @@ def plot_oncoplot( ax4.spines['top'].set_visible(False) ax4.set_yticks([]) ax4.set_xlabel('Samples', fontsize=label_fontsize) - ax4.set_xticks([0, self.compute_genes( + ax4.set_xticks([0, self.matrix_genes( 10, mode='samples').sum(axis=1).max()]) ax4.set_ylim(-0.5, count-0.5) ax4.tick_params(axis='x', which='major', @@ -1262,7 +1310,7 @@ def plot_summary( labelsize=ticklabels_fontsize) # Create the 'Variants per sample' figure. - median = self.compute_tmb().sum(axis=1).median() + median = self.matrix_tmb().sum(axis=1).median() self.plot_tmb(ax=ax4) ax4.set_title(f'Variants per sample (median={median:.1f})', fontsize=title_fontsize) @@ -1333,7 +1381,7 @@ def plot_tmb(self, ax=None, figsize=None, samples=None, **kwargs): >>> mf.plot_tmb() >>> plt.tight_layout() """ - df = self.compute_tmb() + df = self.matrix_tmb() if samples is not None: df = df.loc[samples] if ax is None: @@ -1381,7 +1429,7 @@ def plot_vaf(self, col, count=10, ax=None, figsize=None, **kwargs): >>> mf.plot_vaf('i_TumorVAF_WU') >>> plt.tight_layout() """ - genes = self.compute_genes(count=count).index.to_list() + genes = self.matrix_genes(count=count).index.to_list() s = self.df.groupby('Hugo_Symbol')[col].median() genes = s[genes].sort_values(ascending=False).index.to_list() df = self.df[self.df.Hugo_Symbol.isin(genes)] @@ -1470,7 +1518,7 @@ def plot_varsum(self, flip=False, ax=None, figsize=None): >>> mf.plot_varsum() >>> plt.tight_layout() """ - df = self.compute_tmb() + df = self.matrix_tmb() df = pd.melt(df, value_vars=df.columns) if ax is None: fig, ax = plt.subplots(figsize=figsize) @@ -1527,20 +1575,30 @@ def plot_vartype(self, ax=None, figsize=None, **kwargs): ax.set_ylabel('') return ax - def plot_waterfall(self, count=10, ax=None, figsize=None, **kwargs): + def plot_waterfall( + self, count=10, keep_empty=False, af=None, sort_by=None, + sample_order=None, ax=None, figsize=None, **kwargs + ): """Create a waterfall plot. Parameters ---------- count : int, default: 10 Number of top mutated genes to display. + keep_empty : bool, default: False + If True, display samples that do not have any mutations. + af : AnnFrame, optional + AnnFrame containing sample annoation data for the MafFrame. + sort_by : list, optional + Columns in the AnnFrame to sort the samples by. + sample_order : list, optional + List of samples to display. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. figsize : tuple, optional Width, height in inches. Format: (float, float). kwargs Other keyword arguments will be passed down to - :meth:`matplotlib.axes.Axes.pcolormesh()` and :meth:`seaborn.heatmap`. Returns @@ -1561,7 +1619,15 @@ def plot_waterfall(self, count=10, ax=None, figsize=None, **kwargs): >>> mf.plot_waterfall(linewidths=0.5) >>> plt.tight_layout() """ - df = self.compute_waterfall(count) + df = self.matrix_waterfall(count=count, keep_empty=keep_empty) + + if sort_by is not None: + _ = af.df.loc[df.columns] + _ = _.sort_values(sort_by) + df = df[_.index] + + if sample_order is not None: + df = df[sample_order] # Apply the mapping between items and integers. l = reversed(NONSYN_NAMES + ['Multi_Hit', 'None']) From 8f4bb1cd70576ca2962709b9e12a7ade2273c009 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Mon, 21 Jun 2021 20:29:16 +0900 Subject: [PATCH 08/29] Add pymaf.MafFrame.filter_annot method --- CHANGELOG.rst | 1 + fuc/api/pymaf.py | 63 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index eda066e..8ec40c7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ Changelog * Add ``samples`` and ``shape`` attributes to :class:`pymaf.AnnFrame` class. * Rename :meth:`pymaf.MafFrame.compute_genes/tmb/waterfall` methods to :meth:`pymaf.MafFrame.matrix_genes/tmb/waterfall`. * Add ``keep_empty`` argument to :meth:`pymaf.MafFrame.matrix_waterfall/plot_oncoplot/plot_waterfall` methods. +* Add :meth:`pymaf.MafFrame.filter_annot` method. 0.14.0 (2021-06-20) ------------------- diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index d89ac4b..23289ec 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -380,7 +380,8 @@ def from_dict(cls, data, sample_col='Tumor_Sample_Barcode'): @classmethod def from_file(cls, fn, sample_col='Tumor_Sample_Barcode', sep='\t'): - """Construct AnnFrame from a delimited text file. + """ + Construct an AnnFrame from a delimited text file. The text file must have at least one column that represents sample names which are used as index for pandas.DataFrame. @@ -864,7 +865,7 @@ def matrix_tmb(self): df = df.rename_axis(None, axis=1) return df - def matrix_waterfall(self, count=10, samples=None, keep_empty=False): + def matrix_waterfall(self, count=10, keep_empty=False): """ Compute a matrix of variant classifications with a shape of (genes, samples). @@ -876,8 +877,6 @@ def matrix_waterfall(self, count=10, samples=None, keep_empty=False): ---------- count : int, default: 10 Number of top mutated genes to include. - samples : list, optional - List of samples that should be used to compute the matrix. keep_empty : bool, default: False If True, keep samples with all ``NaN``'s. @@ -888,9 +887,6 @@ def matrix_waterfall(self, count=10, samples=None, keep_empty=False): """ df = self.df[self.df.Variant_Classification.isin(NONSYN_NAMES)] - if samples is not None: - df = df[df.Tumor_Sample_Barcode.isin(samples)] - f = lambda x: ''.join(x) if len(x) == 1 else 'Multi_Hit' df = df.groupby(['Hugo_Symbol', 'Tumor_Sample_Barcode'])[ 'Variant_Classification'].apply(f).to_frame() @@ -1576,8 +1572,7 @@ def plot_vartype(self, ax=None, figsize=None, **kwargs): return ax def plot_waterfall( - self, count=10, keep_empty=False, af=None, sort_by=None, - sample_order=None, ax=None, figsize=None, **kwargs + self, count=10, keep_empty=False, samples=None, ax=None, figsize=None, **kwargs ): """Create a waterfall plot. @@ -1587,11 +1582,7 @@ def plot_waterfall( Number of top mutated genes to display. keep_empty : bool, default: False If True, display samples that do not have any mutations. - af : AnnFrame, optional - AnnFrame containing sample annoation data for the MafFrame. - sort_by : list, optional - Columns in the AnnFrame to sort the samples by. - sample_order : list, optional + samples : list, optional List of samples to display. ax : matplotlib.axes.Axes, optional Pre-existing axes for the plot. Otherwise, crete a new one. @@ -1621,13 +1612,8 @@ def plot_waterfall( """ df = self.matrix_waterfall(count=count, keep_empty=keep_empty) - if sort_by is not None: - _ = af.df.loc[df.columns] - _ = _.sort_values(sort_by) - df = df[_.index] - - if sample_order is not None: - df = df[sample_order] + if samples is not None: + df = df[samples] # Apply the mapping between items and integers. l = reversed(NONSYN_NAMES + ['Multi_Hit', 'None']) @@ -1802,3 +1788,38 @@ def to_string(self): String representation of MafFrame. """ return self.df.to_csv(index=False, sep='\t') + + def filter_annot(self, af, expr): + """ + Filter the MafFrame using sample annotation data. + + Samples are selected by querying the columns of an AnnFrame with a + boolean expression. Samples not present in the MafFrame will be + excluded automatically. + + Parameters + ---------- + af : AnnFrame + AnnFrame with sample annotaton data. + expr : str + Query expression to evaluate. + + Returns + ------- + MafFrame + Filtered MafFrame. + + Examples + -------- + + >>> from fuc import common, pymaf + >>> common.load_dataset('tcga-laml') + >>> mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') + >>> af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') + >>> filtered_mf = mf.filter_annot(af, "FAB_classification == 'M4') + """ + samples = af.df.query(expr).index + i = self.df.Tumor_Sample_Barcode.isin(samples) + df = self.df[i] + mf = self.__class__(df) + return mf From 5232ae8fefa596b8721066c63ac13369b767fd47 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 09:47:06 +0900 Subject: [PATCH 09/29] Fix typo --- fuc/api/pymaf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 23289ec..4d24f5f 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -1816,7 +1816,7 @@ def filter_annot(self, af, expr): >>> common.load_dataset('tcga-laml') >>> mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') >>> af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') - >>> filtered_mf = mf.filter_annot(af, "FAB_classification == 'M4') + >>> filtered_mf = mf.filter_annot(af, "FAB_classification == 'M4'") """ samples = af.df.query(expr).index i = self.df.Tumor_Sample_Barcode.isin(samples) From 6e73ac78efe663854cab9a20b1f35c4ea4e55926 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 10:33:40 +0900 Subject: [PATCH 10/29] Update docs --- docs/examples/customized_oncoplot_1.py | 90 +++++++++++++++++++++++ docs/tutorials.rst | 99 ++------------------------ fuc/api/pymaf.py | 33 +++++++-- 3 files changed, 121 insertions(+), 101 deletions(-) create mode 100644 docs/examples/customized_oncoplot_1.py diff --git a/docs/examples/customized_oncoplot_1.py b/docs/examples/customized_oncoplot_1.py new file mode 100644 index 0000000..42d7bea --- /dev/null +++ b/docs/examples/customized_oncoplot_1.py @@ -0,0 +1,90 @@ +# File: customized_oncoplot_1.py + +import matplotlib.pyplot as plt +from fuc import common, pymaf +common.load_dataset('tcga-laml') +mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') +af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') + +# Define the shared variables. +count=10 +figsize=(15, 10) +label_fontsize=13 +ticklabels_fontsize=12 +legend_fontsize=12 + +# Create the figure. Getting the right height ratios can be tricky and often requires a trial-and-error process. +fig, axes = plt.subplots(6, 2, figsize=figsize, gridspec_kw={'height_ratios': [1, 10, 1, 1, 1, 3.5], 'width_ratios': [10, 1]}) +[[ax1, ax2], [ax3, ax4], [ax5, ax6], [ax7, ax8], [ax9, ax10], [ax11, ax12]] = axes +fig.suptitle('customized_oncoplot_1.py', fontsize=20) + +# Create the TMB plot. +samples = list(mf.matrix_waterfall(count).columns) +mf.plot_tmb(ax=ax1, samples=samples) +ax1.set_xlabel('') +ax1.spines['right'].set_visible(False) +ax1.spines['top'].set_visible(False) +ax1.spines['bottom'].set_visible(False) +ax1.set_ylabel('TMB', fontsize=label_fontsize) +ax1.set_yticks([0, mf.matrix_tmb().sum(axis=1).max()]) +ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) + +ax2.remove() + +# Create the waterfall plot. +mf.plot_waterfall(count=count, ax=ax3, linewidths=1) +ax3.set_xlabel('') +ax3.tick_params(axis='y', which='major', labelrotation=0, labelsize=ticklabels_fontsize) + +# Create the genes plot. +mf.plot_genes(count=count, ax=ax4, mode='samples', width=0.95) +ax4.spines['right'].set_visible(False) +ax4.spines['left'].set_visible(False) +ax4.spines['top'].set_visible(False) +ax4.set_yticks([]) +ax4.set_xlabel('Samples', fontsize=label_fontsize) +ax4.set_xticks([0, mf.matrix_genes(10, mode='samples').sum(axis=1).max()]) +ax4.set_ylim(-0.5, count-0.5) +ax4.tick_params(axis='x', which='major', labelsize=ticklabels_fontsize) + +# Create the annotation plot for 'FAB_classification'. +af.plot_annot('FAB_classification', samples=samples, ax=ax5, linewidths=1, cmap='Dark2') +ax5.set_xlabel('') +ax5.set_ylabel('') + +ax6.remove() + +# Create the annotation plot for 'days_to_last_followup'. +af.plot_annot('days_to_last_followup', samples=samples, numeric=True, ax=ax7, linewidths=1, cmap='viridis') +ax7.set_xlabel('') +ax7.set_ylabel('') + +ax8.remove() + +# Create the annotation plot for 'Overall_Survival_Status'. +af.plot_annot('Overall_Survival_Status', samples=samples, ax=ax9, linewidths=1) +ax9.set_xlabel('Samples', fontsize=label_fontsize) +ax9.set_ylabel('') + +ax10.remove() + +# Create the legends. Getting the right legend locations can be tricky and often requires a trial-and-error process. +handles1 = pymaf.legend_handles(name='waterfall') +handles2 = af.legend_handles('FAB_classification', samples=samples, cmap='Dark2') +handles3 = af.legend_handles('days_to_last_followup', samples=samples, numeric=True, cmap='viridis') +handles4 = af.legend_handles('Overall_Survival_Status', samples=samples) +leg1 = ax11.legend(handles=handles1, loc=(0, 0), title='Variant_Classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg2 = ax11.legend(handles=handles2, loc=(0.43, 0), title='FAB_classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg3 = ax11.legend(handles=handles3, loc=(0.62, 0), title='days_to_last_followup', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg4 = ax11.legend(handles=handles4, loc=(0.82, 0), title='Overall_Survival_Status', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +ax11.add_artist(leg1) +ax11.add_artist(leg2) +ax11.add_artist(leg3) +ax11.add_artist(leg4) +ax11.axis('off') + +ax12.remove() + +plt.tight_layout() +plt.subplots_adjust(wspace=0.01, hspace=0.01) +plt.savefig('customized_oncoplot_1.png') diff --git a/docs/tutorials.rst b/docs/tutorials.rst index b671fb7..ea7a28e 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -1,8 +1,8 @@ Tutorials ********* -Create customized oncoplot -========================== +Create customized oncoplots +=========================== We can use either the :command:`maf_oncoplt` command or the :meth:`pymaf.plot_oncoplot` method to create a "standard" oncoplot like the one shown below. @@ -10,97 +10,6 @@ We can use either the :command:`maf_oncoplt` command or the :meth:`pymaf.plot_on While the plot is pleasing to the eye, one may wish to customize it to add, for example, various annotation data for the samples like this: -.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot.png +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot_1.png -We can (relatively) easily achieve above thanks to the LEGO block-like plotting methods in the ``pymaf`` submodule: - -.. code:: python3 - - # File: customized_oncoplot.py - import matplotlib.pyplot as plt - from fuc import common, pymaf - common.load_dataset('tcga-laml') - mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') - af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') - - # Define the shared variables. - count=10 - figsize=(15, 10) - label_fontsize=13 - ticklabels_fontsize=12 - legend_fontsize=12 - - # Create the figure. Getting the right height ratios can be tricky and often requires a trial-and-error process. - fig, axes = plt.subplots(6, 2, figsize=figsize, gridspec_kw={'height_ratios': [1, 10, 1, 1, 1, 3.5], 'width_ratios': [10, 1]}) - [[ax1, ax2], [ax3, ax4], [ax5, ax6], [ax7, ax8], [ax9, ax10], [ax11, ax12]] = axes - fig.suptitle('Customized oncoplot', fontsize=20) - - # Create the TMB plot. - samples = list(mf.matrix_waterfall(count).columns) - mf.plot_tmb(ax=ax1, samples=samples) - ax1.set_xlabel('') - ax1.spines['right'].set_visible(False) - ax1.spines['top'].set_visible(False) - ax1.spines['bottom'].set_visible(False) - ax1.set_ylabel('TMB', fontsize=label_fontsize) - ax1.set_yticks([0, mf.matrix_tmb().sum(axis=1).max()]) - ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) - - ax2.remove() - - # Create the waterfall plot. - mf.plot_waterfall(count=count, ax=ax3, linewidths=1) - ax3.set_xlabel('') - ax3.tick_params(axis='y', which='major', labelrotation=0, labelsize=ticklabels_fontsize) - - # Create the genes plot. - mf.plot_genes(count=count, ax=ax4, mode='samples', width=0.95) - ax4.spines['right'].set_visible(False) - ax4.spines['left'].set_visible(False) - ax4.spines['top'].set_visible(False) - ax4.set_yticks([]) - ax4.set_xlabel('Samples', fontsize=label_fontsize) - ax4.set_xticks([0, mf.matrix_genes(10, mode='samples').sum(axis=1).max()]) - ax4.set_ylim(-0.5, count-0.5) - ax4.tick_params(axis='x', which='major', labelsize=ticklabels_fontsize) - - # Create the annotation plot for 'FAB_classification'. - af.plot_annot('FAB_classification', samples=samples, ax=ax5, linewidths=1, cmap='Dark2') - ax5.set_xlabel('') - ax5.set_ylabel('') - - ax6.remove() - - # Create the annotation plot for 'days_to_last_followup'. - af.plot_annot('days_to_last_followup', samples=samples, numeric=True, ax=ax7, linewidths=1, cmap='viridis') - ax7.set_xlabel('') - ax7.set_ylabel('') - - ax8.remove() - - # Create the annotation plot for 'Overall_Survival_Status'. - af.plot_annot('Overall_Survival_Status', samples=samples, ax=ax9, linewidths=1) - ax9.set_xlabel('Samples', fontsize=label_fontsize) - ax9.set_ylabel('') - - ax10.remove() - - # Create the legends. Getting the right legend locations can be tricky and often requires a trial-and-error process. - handles1 = pymaf.legend_handles(name='waterfall') - handles2 = af.legend_handles('FAB_classification', samples=samples, cmap='Dark2') - handles3 = af.legend_handles('days_to_last_followup', samples=samples, numeric=True, cmap='viridis') - handles4 = af.legend_handles('Overall_Survival_Status', samples=samples) - leg1 = ax11.legend(handles=handles1, loc=(0, 0), title='Variant_Classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) - leg2 = ax11.legend(handles=handles2, loc=(0.43, 0), title='FAB_classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) - leg3 = ax11.legend(handles=handles3, loc=(0.62, 0), title='days_to_last_followup', fontsize=legend_fontsize, title_fontsize=legend_fontsize) - leg4 = ax11.legend(handles=handles4, loc=(0.79, 0), title='Overall_Survival_Status', fontsize=legend_fontsize, title_fontsize=legend_fontsize) - ax11.add_artist(leg1) - ax11.add_artist(leg2) - ax11.add_artist(leg3) - ax11.add_artist(leg4) - ax11.axis('off') - - ax12.remove() - - plt.tight_layout() - plt.subplots_adjust(wspace=0.01, hspace=0.01) +We can (relatively) easily achieve above thanks to the LEGO block-like plotting methods in the ``pymaf`` submodule (:download:`click `). diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 4d24f5f..e3588c1 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -240,10 +240,11 @@ def legend_handles(name='regular'): return handles class AnnFrame: - """Class for storing sample annotation data. + """ + Class for storing sample annotation data. - This class stores sample annotation data as pandas.DataFrame with sample - names as index. + This class stores sample annotation data as :class:`pandas.DataFrame` + with sample names as index. Note that an AnnFrame can have a different set of samples than its accompanying MafFrame. @@ -520,11 +521,12 @@ def plot_annot( self, col, samples=None, numeric=False, segments=5, decimals=0, cmap='Pastel1', ax=None, figsize=None, **kwargs ): - """Create a 1D categorical heatmap for the given column. + """ + Create a 1D categorical heatmap of the column. - In the case of a numeric column, use ``numeric=True`` which will + In the case of a numeric column, set ``numeric`` as True which will divide the values into equal-sized intervals, with the number of - intervals determined by the `segments` option. + intervals determined by ``segments``. Parameters ---------- @@ -588,6 +590,25 @@ def plot_annot( ax.set_yticks([]) return ax + def sorted_samples(self, by, mf=None, keep_empty=False, nonsyn=False): + """ + Return a sorted list of sample names. + + Parameters + ---------- + df : str or list + Column or list of columns to sort by. + """ + df = self.df.copy() + + if nonsyn: + samples = mf.matrix_waterfall(keep_empty=keep_empty).columns + df = df.loc[samples] + + df = df.sort_values(by=by) + + return df.index.to_list() + class MafFrame: """Class for storing MAF data. From 63aef455a5edb9df86bb26c98ff5295783778bed Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 10:51:23 +0900 Subject: [PATCH 11/29] Update docs --- CHANGELOG.rst | 1 + docs/examples/customized_oncoplot_2.py | 90 ++++++++++++++++++++++++++ docs/tutorials.rst | 6 +- fuc/api/pymaf.py | 15 ++++- 4 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 docs/examples/customized_oncoplot_2.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8ec40c7..2e092c1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,7 @@ Changelog * Rename :meth:`pymaf.MafFrame.compute_genes/tmb/waterfall` methods to :meth:`pymaf.MafFrame.matrix_genes/tmb/waterfall`. * Add ``keep_empty`` argument to :meth:`pymaf.MafFrame.matrix_waterfall/plot_oncoplot/plot_waterfall` methods. * Add :meth:`pymaf.MafFrame.filter_annot` method. +* Add :meth:`pymaf.AnnFrame.sorted_samples` method. 0.14.0 (2021-06-20) ------------------- diff --git a/docs/examples/customized_oncoplot_2.py b/docs/examples/customized_oncoplot_2.py new file mode 100644 index 0000000..9c66e04 --- /dev/null +++ b/docs/examples/customized_oncoplot_2.py @@ -0,0 +1,90 @@ +# File: customized_oncoplot_2.py + +import matplotlib.pyplot as plt +from fuc import common, pymaf +common.load_dataset('tcga-laml') +mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') +af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') + +# Define the shared variables. +count=10 +figsize=(15, 10) +label_fontsize=13 +ticklabels_fontsize=12 +legend_fontsize=12 + +# Create the figure. Getting the right height ratios can be tricky and often requires a trial-and-error process. +fig, axes = plt.subplots(6, 2, figsize=figsize, gridspec_kw={'height_ratios': [1, 10, 1, 1, 1, 3.5], 'width_ratios': [10, 1]}) +[[ax1, ax2], [ax3, ax4], [ax5, ax6], [ax7, ax8], [ax9, ax10], [ax11, ax12]] = axes +fig.suptitle('customized_oncoplot_2.py', fontsize=20) + +# Create the TMB plot. +samples = af.sorted_samples(['FAB_classification', 'Overall_Survival_Status'], mf=mf, nonsyn=True) +mf.plot_tmb(ax=ax1, samples=samples) +ax1.set_xlabel('') +ax1.spines['right'].set_visible(False) +ax1.spines['top'].set_visible(False) +ax1.spines['bottom'].set_visible(False) +ax1.set_ylabel('TMB', fontsize=label_fontsize) +ax1.set_yticks([0, mf.matrix_tmb().sum(axis=1).max()]) +ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) + +ax2.remove() + +# Create the waterfall plot. +mf.plot_waterfall(count=count, ax=ax3, linewidths=1, samples=samples) +ax3.set_xlabel('') +ax3.tick_params(axis='y', which='major', labelrotation=0, labelsize=ticklabels_fontsize) + +# Create the genes plot. +mf.plot_genes(count=count, ax=ax4, mode='samples', width=0.95) +ax4.spines['right'].set_visible(False) +ax4.spines['left'].set_visible(False) +ax4.spines['top'].set_visible(False) +ax4.set_yticks([]) +ax4.set_xlabel('Samples', fontsize=label_fontsize) +ax4.set_xticks([0, mf.matrix_genes(10, mode='samples').sum(axis=1).max()]) +ax4.set_ylim(-0.5, count-0.5) +ax4.tick_params(axis='x', which='major', labelsize=ticklabels_fontsize) + +# Create the annotation plot for 'FAB_classification'. +af.plot_annot('FAB_classification', samples=samples, ax=ax5, linewidths=1, cmap='Dark2') +ax5.set_xlabel('') +ax5.set_ylabel('') + +ax6.remove() + +# Create the annotation plot for 'days_to_last_followup'. +af.plot_annot('days_to_last_followup', samples=samples, numeric=True, ax=ax7, linewidths=1, cmap='viridis') +ax7.set_xlabel('') +ax7.set_ylabel('') + +ax8.remove() + +# Create the annotation plot for 'Overall_Survival_Status'. +af.plot_annot('Overall_Survival_Status', samples=samples, ax=ax9, linewidths=1) +ax9.set_xlabel('Samples', fontsize=label_fontsize) +ax9.set_ylabel('') + +ax10.remove() + +# Create the legends. Getting the right legend locations can be tricky and often requires a trial-and-error process. +handles1 = pymaf.legend_handles(name='waterfall') +handles2 = af.legend_handles('FAB_classification', samples=samples, cmap='Dark2') +handles3 = af.legend_handles('days_to_last_followup', samples=samples, numeric=True, cmap='viridis') +handles4 = af.legend_handles('Overall_Survival_Status', samples=samples) +leg1 = ax11.legend(handles=handles1, loc=(0, 0), title='Variant_Classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg2 = ax11.legend(handles=handles2, loc=(0.43, 0), title='FAB_classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg3 = ax11.legend(handles=handles3, loc=(0.62, 0), title='days_to_last_followup', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg4 = ax11.legend(handles=handles4, loc=(0.82, 0), title='Overall_Survival_Status', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +ax11.add_artist(leg1) +ax11.add_artist(leg2) +ax11.add_artist(leg3) +ax11.add_artist(leg4) +ax11.axis('off') + +ax12.remove() + +plt.tight_layout() +plt.subplots_adjust(wspace=0.01, hspace=0.01) +plt.savefig('customized_oncoplot_2.png') diff --git a/docs/tutorials.rst b/docs/tutorials.rst index ea7a28e..6d1e298 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -12,4 +12,8 @@ While the plot is pleasing to the eye, one may wish to customize it to add, for .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot_1.png -We can (relatively) easily achieve above thanks to the LEGO block-like plotting methods in the ``pymaf`` submodule (:download:`click `). +We can (relatively) easily achieve above thanks to the LEGO block-like plotting methods in the ``pymaf`` submodule (:download:`customized_oncoplot_1.py `). + +We can go one step further and sort the samples by one or more annotations (:download:`customized_oncoplot_2.py `): + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot_2.png diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index e3588c1..91e5ae5 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -528,6 +528,9 @@ def plot_annot( divide the values into equal-sized intervals, with the number of intervals determined by ``segments``. + See the :ref:`tutorials:Create customized oncoplots` tutorial to + learn how to create customized oncoplots. + Parameters ---------- col : str @@ -998,7 +1001,11 @@ def plot_oncoplot( self, count=10, keep_empty=False, figsize=(15, 10), label_fontsize=15, ticklabels_fontsize=15, legend_fontsize=15 ): - """Create an oncoplot. + """ + Create a standard oncoplot. + + See the :ref:`tutorials:Create customized oncoplots` tutorial to + learn how to create customized oncoplots. Parameters ---------- @@ -1595,7 +1602,11 @@ def plot_vartype(self, ax=None, figsize=None, **kwargs): def plot_waterfall( self, count=10, keep_empty=False, samples=None, ax=None, figsize=None, **kwargs ): - """Create a waterfall plot. + """ + Create a waterfall plot. + + See the :ref:`tutorials:Create customized oncoplots` tutorial to + learn how to create customized oncoplots. Parameters ---------- From b1192988f75a7b8cec544ee3af25c6fef113842b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 12:39:01 +0900 Subject: [PATCH 12/29] Update tutorial --- docs/examples/customized_oncoplot_3.py | 93 ++++++++++++++++++++++++++ docs/tutorials.rst | 4 ++ 2 files changed, 97 insertions(+) create mode 100644 docs/examples/customized_oncoplot_3.py diff --git a/docs/examples/customized_oncoplot_3.py b/docs/examples/customized_oncoplot_3.py new file mode 100644 index 0000000..b294096 --- /dev/null +++ b/docs/examples/customized_oncoplot_3.py @@ -0,0 +1,93 @@ +# File: customized_oncoplot_3.py + +import matplotlib.pyplot as plt +from fuc import common, pymaf +common.load_dataset('tcga-laml') +mf = pymaf.MafFrame.from_file('~/fuc-data/tcga-laml/tcga_laml.maf.gz') +af = pymaf.AnnFrame.from_file('~/fuc-data/tcga-laml/tcga_laml_annot.tsv') + +# Filter the MafFrame. +mf = mf.filter_annot(af, "Overall_Survival_Status == 1") + +# Define the shared variables. +count=10 +figsize=(15, 10) +label_fontsize=13 +ticklabels_fontsize=12 +legend_fontsize=12 + +# Create the figure. Getting the right height ratios can be tricky and often requires a trial-and-error process. +fig, axes = plt.subplots(6, 2, figsize=figsize, gridspec_kw={'height_ratios': [1, 10, 1, 1, 1, 3.5], 'width_ratios': [10, 1]}) +[[ax1, ax2], [ax3, ax4], [ax5, ax6], [ax7, ax8], [ax9, ax10], [ax11, ax12]] = axes +fig.suptitle('customized_oncoplot_3.py', fontsize=20) + +# Create the TMB plot. +samples = list(mf.matrix_waterfall(count).columns) +mf.plot_tmb(ax=ax1, samples=samples) +ax1.set_xlabel('') +ax1.spines['right'].set_visible(False) +ax1.spines['top'].set_visible(False) +ax1.spines['bottom'].set_visible(False) +ax1.set_ylabel('TMB', fontsize=label_fontsize) +ax1.set_yticks([0, mf.matrix_tmb().sum(axis=1).max()]) +ax1.tick_params(axis='y', which='major', labelsize=ticklabels_fontsize) + +ax2.remove() + +# Create the waterfall plot. +mf.plot_waterfall(count=count, ax=ax3, linewidths=1, samples=samples) +ax3.set_xlabel('') +ax3.tick_params(axis='y', which='major', labelrotation=0, labelsize=ticklabels_fontsize) + +# Create the genes plot. +mf.plot_genes(count=count, ax=ax4, mode='samples', width=0.95) +ax4.spines['right'].set_visible(False) +ax4.spines['left'].set_visible(False) +ax4.spines['top'].set_visible(False) +ax4.set_yticks([]) +ax4.set_xlabel('Samples', fontsize=label_fontsize) +ax4.set_xticks([0, mf.matrix_genes(10, mode='samples').sum(axis=1).max()]) +ax4.set_ylim(-0.5, count-0.5) +ax4.tick_params(axis='x', which='major', labelsize=ticklabels_fontsize) + +# Create the annotation plot for 'FAB_classification'. +af.plot_annot('FAB_classification', samples=samples, ax=ax5, linewidths=1, cmap='Dark2') +ax5.set_xlabel('') +ax5.set_ylabel('') + +ax6.remove() + +# Create the annotation plot for 'days_to_last_followup'. +af.plot_annot('days_to_last_followup', samples=samples, numeric=True, ax=ax7, linewidths=1, cmap='viridis') +ax7.set_xlabel('') +ax7.set_ylabel('') + +ax8.remove() + +# Create the annotation plot for 'Overall_Survival_Status'. +af.plot_annot('Overall_Survival_Status', samples=samples, ax=ax9, linewidths=1) +ax9.set_xlabel('Samples', fontsize=label_fontsize) +ax9.set_ylabel('') + +ax10.remove() + +# Create the legends. Getting the right legend locations can be tricky and often requires a trial-and-error process. +handles1 = pymaf.legend_handles(name='waterfall') +handles2 = af.legend_handles('FAB_classification', samples=samples, cmap='Dark2') +handles3 = af.legend_handles('days_to_last_followup', samples=samples, numeric=True, cmap='viridis') +handles4 = af.legend_handles('Overall_Survival_Status', samples=samples) +leg1 = ax11.legend(handles=handles1, loc=(0, 0), title='Variant_Classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg2 = ax11.legend(handles=handles2, loc=(0.43, 0), title='FAB_classification', ncol=2, fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg3 = ax11.legend(handles=handles3, loc=(0.62, 0), title='days_to_last_followup', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +leg4 = ax11.legend(handles=handles4, loc=(0.82, 0), title='Overall_Survival_Status', fontsize=legend_fontsize, title_fontsize=legend_fontsize) +ax11.add_artist(leg1) +ax11.add_artist(leg2) +ax11.add_artist(leg3) +ax11.add_artist(leg4) +ax11.axis('off') + +ax12.remove() + +plt.tight_layout() +plt.subplots_adjust(wspace=0.01, hspace=0.01) +plt.savefig('customized_oncoplot_3.png') diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 6d1e298..bd95c08 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -17,3 +17,7 @@ We can (relatively) easily achieve above thanks to the LEGO block-like plotting We can go one step further and sort the samples by one or more annotations (:download:`customized_oncoplot_2.py `): .. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot_2.png + +Finally, we can also subset the samples with annotation data (:download:`customized_oncoplot_3.py `): + +.. image:: https://raw.githubusercontent.com/sbslee/fuc-data/main/images/customized_oncoplot_3.png From 40b15dc66a4514b26ffaafc2e90f5d98b28609a2 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 13:40:13 +0900 Subject: [PATCH 13/29] Updata pymaf.MafFrame.to_vcf --- fuc/api/pymaf.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index 91e5ae5..c66de3c 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -645,22 +645,31 @@ def genes(self): @classmethod def from_file(cls, fn): - """Construct MafFrame from a MAF file. + """ + Construct a MafFrame from a MAF file. Parameters ---------- fn : str - MAF file path (zipped or unzipped). + MAF file (zipped or unzipped). Returns ------- MafFrame - MafFrame. + MafFrame object. See Also -------- MafFrame MafFrame object creation using constructor. + + Examples + -------- + + >>> from fuc import common, pymaf + >>> common.load_dataset('tcga-laml') + >>> fn = '~/fuc-data/tcga-laml/tcga_laml.maf.gz' + >>> mf = pymaf.MafFrame.from_file(fn) """ # Read the MAF file. df = pd.read_table(fn) @@ -684,6 +693,9 @@ def from_file(cls, fn): if case_dict: df = df.rename(columns=case_dict) + # Set the data types. + df.Chromosome = df.Chromosome.astype(str) + return cls(df) @classmethod @@ -1795,8 +1807,14 @@ def one_row(r): return r df = df.apply(one_row, axis=1) + # Create the metadata. + meta = [ + '##fileformat=VCFv4.3', + '##source=fuc.api.pymaf.MafFrame.to_vcf', + ] + # Create the VcfFrame. - vf = pyvcf.VcfFrame(['##fileformat=VCFv4.3'], df) + vf = pyvcf.VcfFrame(meta, df) return vf From 4de7ccbbf10d7d932d214f71cf87a4889c725f31 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 13:52:46 +0900 Subject: [PATCH 14/29] Update common.load_dataset --- fuc/api/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fuc/api/common.py b/fuc/api/common.py index ea4beab..07833a1 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -79,6 +79,8 @@ def load_dataset(name, force=False): 'tcga_cohort.txt.gz', 'tcga_laml.maf.gz', 'tcga_laml_annot.tsv', + 'tcga_laml.vcf', + 'tcga_laml_vep.vcf', ], 'pyvcf': [ 'plot_comparison.vcf', From 7a73014739c2bf5ff7006fcc0054b97320064191 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 14:36:29 +0900 Subject: [PATCH 15/29] Deprecate pyvep.filter_lof method --- CHANGELOG.rst | 2 ++ fuc/api/pyvep.py | 64 +++++++++++++++------------------------------- fuc/cli/vcf_vep.py | 18 ++++++------- 3 files changed, 31 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2e092c1..ec0d447 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,8 @@ Changelog * Add ``keep_empty`` argument to :meth:`pymaf.MafFrame.matrix_waterfall/plot_oncoplot/plot_waterfall` methods. * Add :meth:`pymaf.MafFrame.filter_annot` method. * Add :meth:`pymaf.AnnFrame.sorted_samples` method. +* Fix minor bug in :meth:`pymaf.MafFrame.to_frame` method. +* Deprecate :meth:`pyvep.filter_lof` method. 0.14.0 (2021-06-20) ------------------- diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 5f99eec..8b2fa19 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -505,7 +505,11 @@ def to_frame(vf, as_zero=False): else: df = df.replace('', np.nan) df.columns = annot_names(vf) - df = df.astype(DATA_TYPES) + d = {} + for col in df.columns: + if col in DATA_TYPES: + d[col] = DATA_TYPES[col] + df = df.astype(d) return df def pick_result(vf, mode='mostsevere'): @@ -592,48 +596,6 @@ def annot_names(vf): l = re.search(r'Format: (.*?)">', vf.meta[i]).group(1).split('|') return l -def filter_lof(vf, opposite=None, as_index=False): - """Select rows whose conseuqence annotation is deleterious and/or LoF. - - Parameters - ---------- - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - as_index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - """ - consequence_index = annot_names(vf).index('Consequence') - impact_index = annot_names(vf).index('IMPACT') - polyphen_index = annot_names(vf).index('PolyPhen') - sift_index = annot_names(vf).index('SIFT') - def one_row(r): - l = row_firstann(r).split('|') - consequence = l[consequence_index] - impact = l[impact_index] - polyphen = l[polyphen_index] - sift = l[sift_index] - if impact not in ['HIGH', 'MODERATE']: - return False - if consequence == 'missense_variant': - if 'damaging' in polyphen or 'deleterious' in sift: - return True - else: - return False - return True - i = vf.df.apply(one_row, axis=1) - if opposite: - i = ~i - if as_index: - return i - df = vf.df[i] - vf = vf.__class__(vf.copy_meta(), df) - return vf - def filter_query(vf, expr, opposite=None, as_index=False, as_zero=False): """ Select rows that satisfy the query expression. @@ -655,6 +617,22 @@ def filter_query(vf, expr, opposite=None, as_index=False, as_zero=False): ------- VcfFrame or pandas.Series Filtered VcfFrame or boolean index array. + + Examples + -------- + + >>> from fuc import common, pyvcf, pyvep + >>> common.load_dataset('tcga-laml') + >>> fn = '~/fuc-data/tcga-laml/tcga_laml_vep.vcf' + >>> vf = pyvcf.VcfFrame.from_file(fn) + >>> filtered_vf = pyvep.filter_query(vf, "SYMBOL == 'TP53'") + >>> filtered_vf = pyvep.filter_query(vf, "SYMBOL != 'TP53'") + >>> filtered_vf = pyvep.filter_query(vf, "SYMBOL != 'TP53'", opposite=True) + >>> filtered_vf = pyvep.filter_query(vf, "Consequence in ['splice_donor_variant', 'stop_gained']") + >>> filtered_vf = pyvep.filter_query(vf, "(SYMBOL == 'TP53') and (Consequence.str.contains('stop_gained'))") + >>> filtered_vf = pyvep.filter_query(vf, "gnomAD_AF < 0.001") + >>> filtered_vf = pyvep.filter_query(vf, "gnomAD_AF < 0.001", as_zero=True) + >>> filtered_vf = pyvep.filter_query(vf, "(IMPACT == 'HIGH') or (Consequence.str.contains('missense_variant') and (PolyPhen.str.contains('damaging') or SIFT.str.contains('deleterious')))") """ df = to_frame(vf, as_zero=as_zero) i = vf.df.index.isin(df.query(expr).index) diff --git a/fuc/cli/vcf_vep.py b/fuc/cli/vcf_vep.py index a39ddbf..d4b8c54 100644 --- a/fuc/cli/vcf_vep.py +++ b/fuc/cli/vcf_vep.py @@ -6,15 +6,13 @@ details on query expression, please visit the method's documentation page. usage examples: - $ fuc {api.common._script_name()} in.vcf 'SYMBOL == "TP53"' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'SYMBOL != "TP53"' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf - $ fuc {api.common._script_name()} in.vcf \\ - 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf - $ fuc {api.common._script_name()} in.vcf \\ - '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'gnomAD_AF < 0.001' > out.vcf - $ fuc {api.common._script_name()} in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf + $ fuc {api.common._script_name()} in.vcf "SYMBOL == 'TP53'" > out.vcf + $ fuc {api.common._script_name()} in.vcf "SYMBOL != 'TP53'" > out.vcf + $ fuc {api.common._script_name()} in.vcf "SYMBOL == 'TP53'" --opposite > out.vcf + $ fuc {api.common._script_name()} in.vcf "Consequence in ['splice_donor_variant', 'stop_gained']" > out.vcf + $ fuc {api.common._script_name()} in.vcf "(SYMBOL == 'TP53') and (Consequence.str.contains('stop_gained'))" > out.vcf + $ fuc {api.common._script_name()} in.vcf "gnomAD_AF < 0.001" > out.vcf + $ fuc {api.common._script_name()} in.vcf "gnomAD_AF < 0.001" --as_zero > out.vcf """ def create_parser(subparsers): @@ -24,7 +22,7 @@ def create_parser(subparsers): help='[VCF] filter a VCF file annotated by Ensemble VEP', description=description, ) - parser.add_argument('vcf', help='Ensemble VEP-annotated VCF file') + parser.add_argument('vcf', help='VCF file annotated with Ensemble VEP') parser.add_argument('expr', help='query expression to evaluate') parser.add_argument( '--opposite', From b6af06c29cb85d5abf4e83adb12847f58932076b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 14:39:26 +0900 Subject: [PATCH 16/29] Update docs --- docs/cli.rst | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/cli.rst b/docs/cli.rst index 95a3133..91c4538 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -729,18 +729,16 @@ vcf_vep details on query expression, please visit the method's documentation page. usage examples: - $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' > out.vcf - $ fuc vcf_vep in.vcf 'SYMBOL != "TP53"' > out.vcf - $ fuc vcf_vep in.vcf 'SYMBOL == "TP53"' --opposite > out.vcf - $ fuc vcf_vep in.vcf \ - 'Consequence in ["splice_donor_variant", "stop_gained"]' > out.vcf - $ fuc vcf_vep in.vcf \ - '(SYMBOL == "TP53") and (Consequence.str.contains("stop_gained"))' > out.vcf - $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' > out.vcf - $ fuc vcf_vep in.vcf 'gnomAD_AF < 0.001' --as_zero > out.vcf + $ fuc vcf_vep in.vcf "SYMBOL == 'TP53'" > out.vcf + $ fuc vcf_vep in.vcf "SYMBOL != 'TP53'" > out.vcf + $ fuc vcf_vep in.vcf "SYMBOL == 'TP53'" --opposite > out.vcf + $ fuc vcf_vep in.vcf "Consequence in ['splice_donor_variant', 'stop_gained']" > out.vcf + $ fuc vcf_vep in.vcf "(SYMBOL == 'TP53') and (Consequence.str.contains('stop_gained'))" > out.vcf + $ fuc vcf_vep in.vcf "gnomAD_AF < 0.001" > out.vcf + $ fuc vcf_vep in.vcf "gnomAD_AF < 0.001" --as_zero > out.vcf positional arguments: - vcf Ensemble VEP-annotated VCF file + vcf VCF file annotated with Ensemble VEP expr query expression to evaluate optional arguments: From 12e919b2dfcc90e126670f9097e1afb6ca5320cb Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 14:50:22 +0900 Subject: [PATCH 17/29] Deprecate pyvep.filter_clinsig method --- CHANGELOG.rst | 2 +- fuc/api/pyvep.py | 122 ----------------------------------------------- 2 files changed, 1 insertion(+), 123 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ec0d447..0f4848d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,7 +12,7 @@ Changelog * Add :meth:`pymaf.MafFrame.filter_annot` method. * Add :meth:`pymaf.AnnFrame.sorted_samples` method. * Fix minor bug in :meth:`pymaf.MafFrame.to_frame` method. -* Deprecate :meth:`pyvep.filter_lof` method. +* Deprecate :meth:`pyvep.filter_lof/clinsig` methods. 0.14.0 (2021-06-20) ------------------- diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 8b2fa19..47981b8 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -259,128 +259,6 @@ def row_mostsevere(r): f = lambda x: SEVERITIY.index(x.split('|')[1].split('&')[0]) return sorted(results.split(','), key=f)[0] -def filter_clinsig(vf, whitelist=None, blacklist=None, opposite=False, - index=False): - """Select rows based on the given CLIN_SIG values. - - List of CLIN_SIG values: - - - benign - - likely_benign - - pathogenic - - likely_pathogenic - - drug_response - - risk_factor - - uncertain_significance - - conflicting_interpretations_of_pathogenicity - - not_provided - - other - - benign/likely_benign - - pathogenic/likely_pathogenic - - ... - - Parameters - ---------- - vf : VcfFrame - VcfFrame. - whilelist : list, default: None - If these CLIN_SIG values are present, select the row. - blacklist : list, default: None - If these CLIN_SIG values are present, do not select the row. - opposite : bool, default: False - If True, return rows that don't meet the said criteria. - index : bool, default: False - If True, return boolean index array instead of VcfFrame. - - Returns - ------- - VcfFrame or pandas.Series - Filtered VcfFrame or boolean index array. - - Examples - -------- - Assume we have the following data: - - >>> meta = [ - ... '##fileformat=VCFv4.1', - ... '##VEP="v104" time="2021-05-20 10:50:12" cache="/net/isilonP/public/ro/ensweb-data/latest/tools/grch37/e104/vep/cache/homo_sapiens/104_GRCh37" db="homo_sapiens_core_104_37@hh-mysql-ens-grch37-web" 1000genomes="phase3" COSMIC="92" ClinVar="202012" HGMD-PUBLIC="20204" assembly="GRCh37.p13" dbSNP="154" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"', - ... '##INFO=' - ... ] - >>> data = { - ... 'CHROM': ['chr1', 'chr1', 'chr2', 'chr2'], - ... 'POS': [100, 101, 200, 201], - ... 'ID': ['.', '.', '.', '.'], - ... 'REF': ['G', 'C', 'CAG', 'C'], - ... 'ALT': ['T', 'T', 'C', 'T'], - ... 'QUAL': ['.', '.', '.', '.'], - ... 'FILTER': ['.', '.', '.', '.'], - ... 'INFO': [ - ... 'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|47/58||||6721|6644|2215|S/Y|tCt/tAt|rs587777894&COSV63868278&COSV63868313||-1||HGNC|3942|||||deleterious(0)|possibly_damaging(0.876)||likely_pathogenic&pathogenic|0&1&1|1&1&1|26619011&27159400&24631838&26018084&27830187|||||', - ... 'CSQ=T|synonymous_variant|LOW|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|49/58||||6986|6909|2303|L|ctG/ctA|rs11121691&COSV63870864||-1||HGNC|3942|||||||0.2206|benign|0&1|1&1|24996771|||||', - ... 'CSQ=-|frameshift_variant|HIGH|BRCA2|ENSG00000139618|Transcript|ENST00000380152.3|protein_coding|18/27||||8479-8480|8246-8247|2749|Q/X|cAG/c|rs80359701||1||HGNC|1101||||||||pathogenic||1|26467025&26295337&15340362|||||', - ... 'CSQ=T|missense_variant|MODERATE|MTOR|ENSG00000198793|Transcript|ENST00000361445.4|protein_coding|30/58||||4516|4439|1480|R/H|cGc/cAc|rs780930764&COSV63868373||-1||HGNC|3942|||||tolerated(0.13)|benign(0)||likely_benign|0&1|1&1||||||' - ... ], - ... 'FORMAT': ['GT', 'GT', 'GT', 'GT'], - ... 'Steven': ['0/1', '0/1', '0/1', '0/1'], - ... } - >>> vf = pyvcf.VcfFrame.from_dict(meta, data) - >>> pyvep.parseann(vf, ['CLIN_SIG']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G T . . likely_pathogenic&pathogenic GT 0/1 - 1 chr1 101 . C T . . benign GT 0/1 - 2 chr2 200 . CAG C . . pathogenic GT 0/1 - 3 chr2 201 . C T . . likely_benign GT 0/1 - - We can select rows with pathogenic or likely_pathogenic: - - >>> whitelist=['pathogenic', 'likely_pathogenic'] - >>> temp_vf = pyvep.filter_clinsig(vf, whitelist=whitelist) - >>> pyvep.parseann(temp_vf, ['CLIN_SIG']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 100 . G T . . likely_pathogenic&pathogenic GT 0/1 - 1 chr2 200 . CAG C . . pathogenic GT 0/1 - - We can also remove those rows: - - >>> temp_vf = pyvep.filter_clinsig(vf, whitelist=whitelist, opposite=True) - >>> pyvep.parseann(temp_vf, ['CLIN_SIG']).df - CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Steven - 0 chr1 101 . C T . . benign GT 0/1 - 1 chr2 201 . C T . . likely_benign GT 0/1 - - Finally, we can return boolean index array from the filtering: - - >>> pyvep.filter_clinsig(vf, whitelist=whitelist, index=True) - 0 True - 1 False - 2 True - 3 False - dtype: bool - """ - if whitelist is None and blacklist is None: - raise ValueError('must provide either whitelist or blcklist') - if whitelist is None: - whitelist = [] - if blacklist is None: - blacklist = [] - def func(r): - ann = row_firstann(r) - i = annot_names(vf).index('CLIN_SIG') - values = ann.split('|')[i].split('&') - if not list(set(values) & set(whitelist)): - return False - if list(set(values) & set(blacklist)): - return False - return True - i = vf.df.apply(func, axis=1) - if opposite: - i = ~i - if index: - return i - df = vf.df[i].reset_index(drop=True) - vf = vf.__class__(vf.copy_meta(), df) - return vf - def parseann(vf, targets, sep=' | ', as_series=False): """Parse variant annotations in VcfFrame. From 6b0c77e248bdc997070d10e72fdf831af5bac828 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 15:27:22 +0900 Subject: [PATCH 18/29] Update docs --- fuc/api/pymaf.py | 2 +- fuc/api/pyvep.py | 62 ++++++++++++++++++++++++------------------------ 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index c66de3c..db79247 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -700,7 +700,7 @@ def from_file(cls, fn): @classmethod def from_vcf(cls, vcf): - """Construct MafFrame from a VCF file or VcfFrame. + """Construct a MafFrame from a VCF file or VcfFrame. The input VCF should already contain functional annotation data from a tool such as Ensemble VEP, SnpEff, and ANNOVAR. The diff --git a/fuc/api/pyvep.py b/fuc/api/pyvep.py index 47981b8..88b6720 100644 --- a/fuc/api/pyvep.py +++ b/fuc/api/pyvep.py @@ -13,37 +13,37 @@ symbol to protein change. However, most of the analysis in pyvep uses the following fields: -+-----+--------------------+---------------------+----------------------------------------------+ -| No. | Name | Description | Examples | -+=====+====================+=====================+==============================================+ -| 1 | Allele | Variant allele | 'C', 'T', 'CG' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 2 | Consequence | Consequence type | 'missense_variant', 'stop_gained' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 3 | IMPACT | Consequence impact | 'MODERATE', 'HIGH' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 4 | SYMBOL | Gene symbol | 'TP53', 'APC' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 5 | Gene | Ensembl ID | 7157, 55294 | -+-----+--------------------+---------------------+----------------------------------------------+ -| 6 | BIOTYPE | Transcript biotype | 'protein_coding', 'pseudogene', 'miRNA' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 7 | EXON | Exon number | '8/17', '17/17' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 8 | Protein_position | Amino acid position | 234, 1510 | -+-----+--------------------+---------------------+----------------------------------------------+ -| 9 | Amino_acids | Amino acid changes | 'R/\*', 'A/X', 'V/A' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 10 | Existing_variation | Variant identifier | 'rs17851045', 'rs786201856&COSV57325157' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 11 | STRAND | Genomic strand | 1, -1 | -+-----+--------------------+---------------------+----------------------------------------------+ -| 12 | SIFT | SIFT prediction | 'deleterious(0)', 'tolerated(0.6)' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 13 | PolyPhen | PolyPhen prediction | 'benign(0.121)', 'probably_damaging(0.999)' | -+-----+--------------------+---------------------+----------------------------------------------+ -| 14 | CLIN_SIG | ClinVar prediction | 'pathogenic', 'likely_pathogenic&pathogenic' | -+-----+--------------------+---------------------+----------------------------------------------+ ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| No. | Name | Description | Examples | ++=====+====================+=====================+==========================================================================================================+ +| 1 | Allele | Variant allele | 'C', 'T', 'CG' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 2 | Consequence | Consequence type | 'missense_variant', 'stop_gained' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 3 | IMPACT | Consequence impact | 'MODERATE', 'HIGH' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 4 | SYMBOL | Gene symbol | 'TP53', 'APC' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 5 | Gene | Ensembl ID | 7157, 55294 | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 6 | BIOTYPE | Transcript biotype | 'protein_coding', 'pseudogene', 'miRNA' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 7 | EXON | Exon number | '8/17', '17/17' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 8 | Protein_position | Amino acid position | 234, 1510 | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 9 | Amino_acids | Amino acid changes | 'R/\*', 'A/X', 'V/A' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 10 | Existing_variation | Variant identifier | 'rs17851045', 'rs786201856&COSV57325157' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 11 | STRAND | Genomic strand | 1, -1 | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 12 | SIFT | SIFT prediction | 'tolerated(0.6)', 'tolerated_low_confidence(0.37)', 'deleterious_low_confidence(0.03)', 'deleterious(0)' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 13 | PolyPhen | PolyPhen prediction | 'benign(0.121)', 'possibly_damaging(0.459)', 'probably_damaging(0.999)' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ +| 14 | CLIN_SIG | ClinVar prediction | 'pathogenic', 'likely_pathogenic&pathogenic' | ++-----+--------------------+---------------------+----------------------------------------------------------------------------------------------------------+ """ import re From 89e5489800212f5036c117b9687e74a88d0caff6 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Tue, 22 Jun 2021 16:38:22 +0900 Subject: [PATCH 19/29] Update pymaf.MafFrame.from_vcf method to extract genotype keys --- CHANGELOG.rst | 1 + fuc/api/pymaf.py | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0f4848d..56f9705 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ Changelog * Add :meth:`pymaf.AnnFrame.sorted_samples` method. * Fix minor bug in :meth:`pymaf.MafFrame.to_frame` method. * Deprecate :meth:`pyvep.filter_lof/clinsig` methods. +* Update :meth:`pymaf.MafFrame.from_vcf` method to extract genotype keys (e.g. DP, AD, AF). 0.14.0 (2021-06-20) ------------------- diff --git a/fuc/api/pymaf.py b/fuc/api/pymaf.py index db79247..1c878c3 100644 --- a/fuc/api/pymaf.py +++ b/fuc/api/pymaf.py @@ -699,7 +699,7 @@ def from_file(cls, fn): return cls(df) @classmethod - def from_vcf(cls, vcf): + def from_vcf(cls, vcf, keys=None, names=None): """Construct a MafFrame from a VCF file or VcfFrame. The input VCF should already contain functional annotation data @@ -712,6 +712,10 @@ def from_vcf(cls, vcf): ---------- vcf : str or VcfFrame VCF file or VcfFrame. + keys : str or list + Genotype key or list of genotype keys. + names : str or list + Column name or list of column names to use in the MafFrame. Examples -------- @@ -813,6 +817,10 @@ def one_row(r): Tumor_Seq_Allele2 = tumor_seq_allele2, Tumor_Sample_Barcode = tumor_sample_barcode, Protein_Change = protein_change, + CHROM = r.CHROM, + POS = r.POS, + REF = r.REF, + ALT = r.ALT, ) return pd.Series(d) @@ -828,6 +836,32 @@ def one_row(r): del df['Tumor_Sample_Barcode'] df = df.join(s) + # Append the genotype keys. + if keys is None: + keys = [] + if names is None: + names = [] + if isinstance(keys, str): + keys = [keys] + if isinstance(names, str): + names = [names] + for i, key in enumerate(keys): + temp_df = vf.extract(key) + temp_df = pd.concat([vf.df.iloc[:, :9], temp_df], axis=1) + temp_df = temp_df.drop( + columns=['ID', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) + temp_df = pd.melt( + temp_df, + id_vars=['CHROM', 'POS', 'REF', 'ALT'], + var_name='Tumor_Sample_Barcode', + ) + temp_df = temp_df[temp_df.value != '.'] + df = df.merge(temp_df, + on=['CHROM', 'POS', 'REF', 'ALT', 'Tumor_Sample_Barcode']) + df = df.rename(columns={'value': names[i]}) + + df = df.drop(columns=['CHROM', 'POS', 'REF', 'ALT']) + return cls(df) def matrix_genes(self, count=10, mode='variants'): From 7debd86e81e23409390ef0b77044022a96381a5d Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 13:08:04 +0900 Subject: [PATCH 20/29] Fix typo in README --- README.rst | 2 +- docs/create.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index dbb8106..301684c 100644 --- a/README.rst +++ b/README.rst @@ -177,7 +177,7 @@ To index a CRAM file: .. code-block:: text - $ fuc bam_head in.cram + $ fuc bam_index in.cram To slice a BAM file: diff --git a/docs/create.py b/docs/create.py index 2daa2bc..bb02ba0 100644 --- a/docs/create.py +++ b/docs/create.py @@ -165,7 +165,7 @@ .. code-block:: text - $ fuc bam_head in.cram + $ fuc bam_index in.cram To slice a BAM file: From 280eb1209e64a9b6a2b7cad67c120c1330190419 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 15:40:29 +0900 Subject: [PATCH 21/29] Update CLI --- CHANGELOG.rst | 1 + README.rst | 54 ++++++------- docs/cli.rst | 180 +++++++++++++++++------------------------ fuc/__main__.py | 15 +++- fuc/api/common.py | 17 +++- fuc/cli/bam_head.py | 2 +- fuc/cli/bam_index.py | 2 +- fuc/cli/bam_rename.py | 2 +- fuc/cli/bam_slice.py | 66 ++++++++++----- fuc/cli/bed_intxn.py | 2 +- fuc/cli/bed_sum.py | 2 +- fuc/cli/fq_count.py | 2 +- fuc/cli/fq_sum.py | 2 +- fuc/cli/fuc_compf.py | 2 +- fuc/cli/fuc_demux.py | 2 +- fuc/cli/fuc_exist.py | 4 +- fuc/cli/fuc_find.py | 2 +- fuc/cli/maf_maf2vcf.py | 2 +- fuc/cli/maf_oncoplt.py | 2 +- fuc/cli/maf_sumplt.py | 2 +- fuc/cli/maf_vcf2maf.py | 2 +- fuc/cli/tbl_merge.py | 2 +- fuc/cli/tbl_sum.py | 2 +- fuc/cli/vcf_filter.py | 2 +- fuc/cli/vcf_merge.py | 2 +- fuc/cli/vcf_rename.py | 2 +- fuc/cli/vcf_slice.py | 2 +- fuc/cli/vcf_vcf2bed.py | 2 +- fuc/cli/vcf_vep.py | 2 +- 29 files changed, 200 insertions(+), 181 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 56f9705..23738fa 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,7 @@ Changelog * Fix minor bug in :meth:`pymaf.MafFrame.to_frame` method. * Deprecate :meth:`pyvep.filter_lof/clinsig` methods. * Update :meth:`pymaf.MafFrame.from_vcf` method to extract genotype keys (e.g. DP, AD, AF). +* Update :command:`bam_slice` command. 0.14.0 (2021-06-20) ------------------- diff --git a/README.rst b/README.rst index 301684c..9cfdd45 100644 --- a/README.rst +++ b/README.rst @@ -107,35 +107,35 @@ For getting help on the fuc CLI: usage: fuc [-h] [-v] COMMAND ... positional arguments: - COMMAND name of the command - bam_head [BAM] print the header of a SAM/BAM/CRAM file - bam_index [BAM] index a SAM/BAM/CRAM file - bam_rename [BAM] rename the samples in a SAM/BAM/CRAM file - bam_slice [BAM] slice a SAM/BAM/CRAM file - bed_intxn [BED] find intersection of two or more BED files - bed_sum [BED] summarize a BED file - fq_count [FASTQ] count sequence reads in FASTQ files - fq_sum [FASTQ] summarize a FASTQ file - fuc_compf [FUC] compare contents of two files - fuc_demux [FUC] parse Reports directory from bcl2fastq or bcl2fastq2 - fuc_exist [FUC] check whether files/directories exist - fuc_find [FUC] find files with certain extension recursively - maf_maf2vcf [MAF] convert a MAF file to a VCF file - maf_oncoplt [MAF] create an oncoplot with a MAF file - maf_sumplt [MAF] create a summary plot with a MAF file - maf_vcf2maf [MAF] convert an annotated VCF file to a MAF file - tbl_merge [TABLE] merge two table files - tbl_sum [TABLE] summarize a table file - vcf_filter [VCF] filter a VCF file - vcf_merge [VCF] merge two or more VCF files - vcf_rename [VCF] rename the samples in a VCF file. - vcf_slice [VCF] slice a VCF file - vcf_vcf2bed [VCF] convert a VCF file to a BED file - vcf_vep [VCF] filter a VCF file annotated by Ensemble VEP + COMMAND + bam_head [BAM] Print the header of a SAM/BAM/CRAM file. + bam_index [BAM] Index a SAM/BAM/CRAM file. + bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. + bam_slice [BAM] Slice a SAM/BAM/CRAM file. + bed_intxn [BED] Find the intersection of two or more BED files. + bed_sum [BED] Summarize a BED file. + fq_count [FASTQ] Count sequence reads in FASTQ files. + fq_sum [FASTQ] Summarize a FASTQ file. + fuc_compf [FUC] Compare the contents of two files. + fuc_demux [FUC] Parse the Reports directory from bcl2fastq. + fuc_exist [FUC] Check whether certain files exist. + fuc_find [FUC] Find all files with a certain extension recursively. + maf_maf2vcf [MAF] Convert a MAF file to a VCF file. + maf_oncoplt [MAF] Create an oncoplot with a MAF file. + maf_sumplt [MAF] Create a summary plot with a MAF file. + maf_vcf2maf [MAF] Convert a VCF file to a MAF file. + tbl_merge [TABLE] Merge two table files. + tbl_sum [TABLE] Summarize a table file. + vcf_filter [VCF] Filter a VCF file. + vcf_merge [VCF] Merge two or more VCF files. + vcf_rename [VCF] Rename the samples in a VCF file. + vcf_slice [VCF] Slice a VCF file. + vcf_vcf2bed [VCF] Convert a VCF file to a BED file. + vcf_vep [VCF] Filter a VCF file annotated by Ensemble VEP. optional arguments: - -h, --help show this help message and exit - -v, --version show the version number and exit + -h, --help Show this help message and exit. + -v, --version Show the version number and exit. For getting help on a specific command (e.g. vcf_merge): diff --git a/docs/cli.rst b/docs/cli.rst index 91c4538..d62b703 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -17,35 +17,35 @@ For getting help on the fuc CLI: usage: fuc [-h] [-v] COMMAND ... positional arguments: - COMMAND name of the command - bam_head [BAM] print the header of a SAM/BAM/CRAM file - bam_index [BAM] index a SAM/BAM/CRAM file - bam_rename [BAM] rename the samples in a SAM/BAM/CRAM file - bam_slice [BAM] slice a SAM/BAM/CRAM file - bed_intxn [BED] find intersection of two or more BED files - bed_sum [BED] summarize a BED file - fq_count [FASTQ] count sequence reads in FASTQ files - fq_sum [FASTQ] summarize a FASTQ file - fuc_compf [FUC] compare contents of two files - fuc_demux [FUC] parse Reports directory from bcl2fastq or bcl2fastq2 - fuc_exist [FUC] check whether files/directories exist - fuc_find [FUC] find files with certain extension recursively - maf_maf2vcf [MAF] convert a MAF file to a VCF file - maf_oncoplt [MAF] create an oncoplot with a MAF file - maf_sumplt [MAF] create a summary plot with a MAF file - maf_vcf2maf [MAF] convert an annotated VCF file to a MAF file - tbl_merge [TABLE] merge two table files - tbl_sum [TABLE] summarize a table file - vcf_filter [VCF] filter a VCF file - vcf_merge [VCF] merge two or more VCF files - vcf_rename [VCF] rename the samples in a VCF file. - vcf_slice [VCF] slice a VCF file - vcf_vcf2bed [VCF] convert a VCF file to a BED file - vcf_vep [VCF] filter a VCF file annotated by Ensemble VEP + COMMAND + bam_head [BAM] Print the header of a SAM/BAM/CRAM file. + bam_index [BAM] Index a SAM/BAM/CRAM file. + bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. + bam_slice [BAM] Slice a SAM/BAM/CRAM file. + bed_intxn [BED] Find the intersection of two or more BED files. + bed_sum [BED] Summarize a BED file. + fq_count [FASTQ] Count sequence reads in FASTQ files. + fq_sum [FASTQ] Summarize a FASTQ file. + fuc_compf [FUC] Compare the contents of two files. + fuc_demux [FUC] Parse the Reports directory from bcl2fastq. + fuc_exist [FUC] Check whether certain files exist. + fuc_find [FUC] Find all files with a certain extension recursively. + maf_maf2vcf [MAF] Convert a MAF file to a VCF file. + maf_oncoplt [MAF] Create an oncoplot with a MAF file. + maf_sumplt [MAF] Create a summary plot with a MAF file. + maf_vcf2maf [MAF] Convert a VCF file to a MAF file. + tbl_merge [TABLE] Merge two table files. + tbl_sum [TABLE] Summarize a table file. + vcf_filter [VCF] Filter a VCF file. + vcf_merge [VCF] Merge two or more VCF files. + vcf_rename [VCF] Rename the samples in a VCF file. + vcf_slice [VCF] Slice a VCF file. + vcf_vcf2bed [VCF] Convert a VCF file to a BED file. + vcf_vep [VCF] Filter a VCF file annotated by Ensemble VEP. optional arguments: - -h, --help show this help message and exit - -v, --version show the version number and exit + -h, --help Show this help message and exit. + -v, --version Show the version number and exit. For getting help on a specific command (e.g. vcf_merge): @@ -73,7 +73,7 @@ bam_head bam SAM/BAM/CRAM file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. bam_index ========= @@ -95,7 +95,7 @@ bam_index bam SAM/BAM/CRAM file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. bam_rename ========== @@ -119,7 +119,7 @@ bam_rename out output file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. bam_slice ========= @@ -127,27 +127,21 @@ bam_slice .. code-block:: text $ fuc bam_slice -h - usage: fuc bam_slice [-h] [--no_index] bam region out + usage: fuc bam_slice [-h] [--format TEXT] bam region [region ...] - This command will slice a SAM/BAM/CRAM file. It essentially wraps the - 'pysam.view' method from the pysam package. - - By default, the command will index the output file. Use the '--no_index' flag - to skip indexing. + This command will slice the input SAM/BAM/CRAM file for the specified region(s). usage examples: - $ fuc bam_slice in.sam 4:300-400 out.sam - $ fuc bam_slice in.bam chr1:100-200 out.bam - $ fuc bam_slice in.cram chr1:100-200 out.cram --no_index + $ fuc bam_slice in.bam chr1:100-200 > out.bam + $ fuc bam_slice in.bam chr1:100-200 chr2:100-200 > out.bam positional arguments: - bam SAM/BAM/CRAM file - region region ('chrom:start-end') - out output file + bam SAM/BAM/CRAM file + region space-separated regions ('chrom:start-end') optional arguments: - -h, --help show this help message and exit - --no_index use this flag to skip indexing + -h, --help Show this help message and exit. + --format TEXT output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM') bed_intxn ========= @@ -167,7 +161,7 @@ bed_intxn bed BED files optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. bed_sum ======= @@ -191,7 +185,7 @@ bed_sum bed BED file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --bases INT number to divide covered base pairs (default: 1) --decimals INT number of decimals (default: 0) @@ -214,7 +208,7 @@ fq_count fastq FASTQ files (default: stdin) optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. fq_sum ====== @@ -236,7 +230,7 @@ fq_sum fastq FASTQ file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. fuc_compf ========= @@ -258,7 +252,7 @@ fuc_compf right right file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. fuc_demux ========= @@ -285,7 +279,7 @@ fuc_demux output_dir output directory optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. fuc_exist ========= @@ -295,7 +289,7 @@ fuc_exist $ fuc fuc_exist -h usage: fuc fuc_exist [-h] [files ...] - This command will check whether files/directories exist. It will return + This command will check whether files exist. It will return 'True' if they exist and 'False' otherwise. The command will look for stdin if there are no arguments. @@ -308,7 +302,7 @@ fuc_exist files test files/directories (default: stdin) optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. fuc_find ======== @@ -330,7 +324,7 @@ fuc_find ext file extension optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --dir PATH directory to search in (default: current directory) maf_maf2vcf @@ -374,7 +368,7 @@ maf_maf2vcf maf MAF file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --fasta PATH FASTA file (required to include INDELs in the output) --ignore_indels use this flag to exclude INDELs from the output --cols TEXT [TEXT ...] @@ -409,7 +403,7 @@ maf_oncoplt out image file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --count INT number of top mutated genes to display (default: 10) --figsize FLOAT FLOAT width, height in inches (default: [15, 10]) @@ -446,7 +440,7 @@ maf_sumplt out output image file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --figsize FLOAT FLOAT width, height in inches (default: [15, 10]) --title_fontsize FLOAT @@ -474,7 +468,7 @@ maf_vcf2maf vcf VCF file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. tbl_merge ========= @@ -501,9 +495,8 @@ tbl_merge right right file optional arguments: - -h, --help show this help message and exit - --how TEXT type of merge to be performed ['left', 'right', - 'outer', 'inner', 'cross'] (default: 'inner') + -h, --help Show this help message and exit. + --how TEXT type of merge to be performed ['left', 'right', 'outer', 'inner', 'cross'] (default: 'inner') --on TEXT [TEXT ...] column names to join on --lsep TEXT delimiter to use for the left file (default: '\t') --rsep TEXT delimiter to use for the right file (default: '\t') @@ -532,29 +525,16 @@ tbl_sum table_file table file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --sep TEXT delimiter to use (default: '\t') - --skiprows TEXT comma-separated line numbers to skip (0-indexed) or - number of lines to skip at the start of the file (e.g. - `--skiprows 1,` will skip the second line, `--skiprows - 2,4` will skip the third and fifth lines, and - `--skiprows 10` will skip the first 10 lines) + --skiprows TEXT comma-separated line numbers to skip (0-indexed) or number of lines to skip at the start of the file (e.g. `--skiprows 1,` will skip the second line, `--skiprows 2,4` will skip the third and fifth lines, and `--skiprows 10` will skip the first 10 lines) --na_values TEXT [TEXT ...] - additional strings to recognize as NA/NaN (by default, - the following values are interpreted as NaN: '', - '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', - '-NaN', '-nan', '1.#IND', '1.#QNAN', '', 'N/A', - 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null') - --keep_default_na whether or not to include the default NaN values when - parsing the data (see 'pandas.read_table' for details) - --expr TEXT query the columns of a pandas.DataFrame with a boolean - expression (e.g. `--query "A == 'yes'"`) + additional strings to recognize as NA/NaN (by default, the following values are interpreted as NaN: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null') + --keep_default_na whether or not to include the default NaN values when parsing the data (see 'pandas.read_table' for details) + --expr TEXT query the columns of a pandas.DataFrame with a boolean expression (e.g. `--query "A == 'yes'"`) --columns TEXT [TEXT ...] - columns to be summarized (by default, all columns will - be included) - --dtypes PATH file of column names and their data types (etheir - 'categorical' or 'numeric'); one tab-delimited pair of - column name and data type per line + columns to be summarized (by default, all columns will be included) + --dtypes PATH file of column names and their data types (etheir 'categorical' or 'numeric'); one tab-delimited pair of column name and data type per line vcf_filter ========== @@ -587,20 +567,14 @@ vcf_filter vcf VCF file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. --expr TEXT expression to evaluate - --samples PATH file of sample names to apply the marking (one sample - per line) + --samples PATH file of sample names to apply the marking (one sample per line) --drop_duplicates [TEXT ...] - only consider certain columns for identifying - duplicates, by default use all of the columns. - --greedy use this flag to mark even ambiguous genotypes as - missing - --opposite use this flag to mark all genotypes that do not - satisfy the query expression as missing and leave - those that do intact - --filter_empty use this flag to remove rows with no genotype calls at - all + only consider certain columns for identifying duplicates, by default use all of the columns. + --greedy use this flag to mark even ambiguous genotypes as missing + --opposite use this flag to mark all genotypes that do not satisfy the query expression as missing and leave those that do intact + --filter_empty use this flag to remove rows with no genotype calls at all vcf_merge ========= @@ -625,11 +599,9 @@ vcf_merge vcf_files VCF files optional arguments: - -h, --help show this help message and exit - --how TEXT type of merge as defined in `pandas.DataFrame.merge` - (default: 'inner') - --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') (default: - 'GT') + -h, --help Show this help message and exit. + --how TEXT type of merge as defined in `pandas.DataFrame.merge` (default: 'inner') + --format TEXT FORMAT subfields to be retained (e.g. 'GT:AD:DP') (default: 'GT') --sort use this flag to turn off sorting of records (default: True) --collapse use this flag to collapse duplicate records (default: False) @@ -665,9 +637,8 @@ vcf_rename names delimited text file optional arguments: - -h, --help show this help message and exit - --mode TEXT renaming mode (default: 'MAP') (choices: 'MAP', 'INDICIES', - 'RANGE') + -h, --help Show this help message and exit. + --mode TEXT renaming mode (default: 'MAP') (choices: 'MAP', 'INDICIES', 'RANGE') --range INT INT specify an index range --sep TEXT delimiter to use (default: '\t') @@ -694,7 +665,7 @@ vcf_slice region region ('chrom:start-end') optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. vcf_vcf2bed =========== @@ -714,7 +685,7 @@ vcf_vcf2bed vcf VCF file optional arguments: - -h, --help show this help message and exit + -h, --help Show this help message and exit. vcf_vep ======= @@ -742,8 +713,7 @@ vcf_vep expr query expression to evaluate optional arguments: - -h, --help show this help message and exit - --opposite use this flag to return records that don’t meet the said - criteria + -h, --help Show this help message and exit. + --opposite use this flag to return records that don’t meet the said criteria --as_zero use this flag to treat missing values as zero instead of NaN diff --git a/fuc/__main__.py b/fuc/__main__.py index e2fcd13..492cf1e 100644 --- a/fuc/__main__.py +++ b/fuc/__main__.py @@ -4,18 +4,27 @@ from .cli import commands def main(): - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + add_help=False, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + '-h', + '--help', + action='help', + default=argparse.SUPPRESS, + help='Show this help message and exit.', + ) parser.add_argument( '-v', '--version', action='version', version=f'%(prog)s {__version__}', - help='show the version number and exit' + help='Show the version number and exit.' ) subparsers = parser.add_subparsers( dest='command', metavar='COMMAND', - help='name of the command', required=True, ) for name, command in commands.items(): diff --git a/fuc/api/common.py b/fuc/api/common.py index 07833a1..353eaa1 100644 --- a/fuc/api/common.py +++ b/fuc/api/common.py @@ -13,7 +13,7 @@ import pysam import warnings import inspect -from argparse import RawDescriptionHelpFormatter +from argparse import RawTextHelpFormatter, SUPPRESS FUC_PATH = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -24,8 +24,19 @@ def _script_name(): def _add_parser(subparsers, name, **kwargs): """Return a formatted parser.""" - parser = subparsers.add_parser(name, - formatter_class=RawDescriptionHelpFormatter, **kwargs) + parser = subparsers.add_parser( + name, + add_help=False, + formatter_class=RawTextHelpFormatter, + **kwargs, + ) + parser.add_argument( + '-h', + '--help', + action='help', + default=SUPPRESS, + help='Show this help message and exit.', + ) return parser def get_similarity(a, b): diff --git a/fuc/cli/bam_head.py b/fuc/cli/bam_head.py index ffb1ef6..fb50e5a 100644 --- a/fuc/cli/bam_head.py +++ b/fuc/cli/bam_head.py @@ -15,7 +15,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] print the header of a SAM/BAM/CRAM file', + help='[BAM] Print the header of a SAM/BAM/CRAM file.', description=description, ) parser.add_argument('bam', help='SAM/BAM/CRAM file') diff --git a/fuc/cli/bam_index.py b/fuc/cli/bam_index.py index c836472..4002ae0 100644 --- a/fuc/cli/bam_index.py +++ b/fuc/cli/bam_index.py @@ -15,7 +15,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] index a SAM/BAM/CRAM file', + help='[BAM] Index a SAM/BAM/CRAM file.', description=description, ) parser.add_argument('bam', help='SAM/BAM/CRAM file') diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index c6e047c..6fe5109 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -14,7 +14,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] rename the samples in a SAM/BAM/CRAM file', + help='[BAM] Rename the samples in a SAM/BAM/CRAM file.', description=description, ) parser.add_argument('bam', help='SAM/BAM/CRAM file') diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 0012476..337842b 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -1,35 +1,63 @@ +import sys + from .. import api + import pysam description = f""" -This command will slice a SAM/BAM/CRAM file. It essentially wraps the -'pysam.view' method from the pysam package. - -By default, the command will index the output file. Use the '--no_index' flag -to skip indexing. +This command will slice the input SAM/BAM/CRAM file for the specified region(s). usage examples: - $ fuc {api.common._script_name()} in.sam 4:300-400 out.sam - $ fuc {api.common._script_name()} in.bam chr1:100-200 out.bam - $ fuc {api.common._script_name()} in.cram chr1:100-200 out.cram --no_index + $ fuc {api.common._script_name()} in.bam chr1:100-200 > out.bam + $ fuc {api.common._script_name()} in.bam chr1:100-200 chr2:100-200 > out.bam + $ fuc {api.common._script_name()} in.bam chr1:100-200 --format SAM > out.sam + $ fuc {api.common._script_name()} in.bam chr1:100-200 --format CRAM --fasta ref.fa > out.cram """ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] slice a SAM/BAM/CRAM file', + help='[BAM] Slice a SAM/BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='SAM/BAM/CRAM file') - parser.add_argument('region', help="region ('chrom:start-end')") - parser.add_argument('out', help='output file') - parser.add_argument('--no_index', action='store_true', - help='use this flag to skip indexing') + parser.add_argument( + 'bam', + help='SAM/BAM/CRAM file.' + ) + parser.add_argument( + 'region', + nargs='+', + help="Space-separated regions ('chrom:start-end')." + ) + parser.add_argument( + '--format', + metavar='TEXT', + default='BAM', + choices=['SAM', 'BAM', 'CRAM'], + help= + "Output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM'). " + "A FASTA file ('--fasta') is required for 'CRAM'." + ) + parser.add_argument( + '--fasta', + metavar='PATH', + help="FASTA file. Required when '--format' is 'CRAM'." + ) def main(args): - b = pysam.view('-b', args.bam, args.region) - with open(args.out, 'wb') as f: - f.write(b) - if not args.no_index: - pysam.index(args.out) + options = [] + + # Determine the output format. + if args.format == 'BAM': + stdout_method = sys.stdout.buffer.write + options.append('-b') + elif args.format == 'CRAM': + stdout_method = sys.stdout.buffer.write + options.append('-C') + else: + stdout_method = sys.stdout.write + + alignments = pysam.view(args.bam, *args.region, *options) + + stdout_method(alignments) diff --git a/fuc/cli/bed_intxn.py b/fuc/cli/bed_intxn.py index 947e06d..2a13fe8 100644 --- a/fuc/cli/bed_intxn.py +++ b/fuc/cli/bed_intxn.py @@ -12,7 +12,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BED] find intersection of two or more BED files', + help='[BED] Find the intersection of two or more BED files.', description=description, ) parser.add_argument('bed', help='BED files', nargs='+') diff --git a/fuc/cli/bed_sum.py b/fuc/cli/bed_sum.py index b46ca44..1cd668f 100644 --- a/fuc/cli/bed_sum.py +++ b/fuc/cli/bed_sum.py @@ -16,7 +16,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BED] summarize a BED file', + help='[BED] Summarize a BED file.', description=description, ) parser.add_argument('bed', help='BED file') diff --git a/fuc/cli/fq_count.py b/fuc/cli/fq_count.py index 738c688..0c461b8 100644 --- a/fuc/cli/fq_count.py +++ b/fuc/cli/fq_count.py @@ -14,7 +14,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FASTQ] count sequence reads in FASTQ files', + help='[FASTQ] Count sequence reads in FASTQ files.', description=description, ) parser.add_argument('fastq', nargs='*', diff --git a/fuc/cli/fq_sum.py b/fuc/cli/fq_sum.py index 98698b4..dd9cc7d 100644 --- a/fuc/cli/fq_sum.py +++ b/fuc/cli/fq_sum.py @@ -14,7 +14,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FASTQ] summarize a FASTQ file', + help='[FASTQ] Summarize a FASTQ file.', description=description, ) parser.add_argument('fastq', help='FASTQ file') diff --git a/fuc/cli/fuc_compf.py b/fuc/cli/fuc_compf.py index be1bf03..2f02ad5 100644 --- a/fuc/cli/fuc_compf.py +++ b/fuc/cli/fuc_compf.py @@ -14,7 +14,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FUC] compare contents of two files', + help='[FUC] Compare the contents of two files.', description=description, ) parser.add_argument('left', help='left file') diff --git a/fuc/cli/fuc_demux.py b/fuc/cli/fuc_demux.py index 703dd3d..5c5fe1c 100644 --- a/fuc/cli/fuc_demux.py +++ b/fuc/cli/fuc_demux.py @@ -23,7 +23,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FUC] parse Reports directory from bcl2fastq or bcl2fastq2', + help='[FUC] Parse the Reports directory from bcl2fastq.', description=description, ) parser.add_argument('reports_dir', help='Reports directory') diff --git a/fuc/cli/fuc_exist.py b/fuc/cli/fuc_exist.py index 92ca8fe..0f7c484 100644 --- a/fuc/cli/fuc_exist.py +++ b/fuc/cli/fuc_exist.py @@ -3,7 +3,7 @@ import sys description = f""" -This command will check whether files/directories exist. It will return +This command will check whether files exist. It will return 'True' if they exist and 'False' otherwise. The command will look for stdin if there are no arguments. @@ -17,7 +17,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FUC] check whether files/directories exist', + help='[FUC] Check whether certain files exist.', description=description, ) parser.add_argument('files', nargs='*', diff --git a/fuc/cli/fuc_find.py b/fuc/cli/fuc_find.py index c156fe9..075f9d9 100644 --- a/fuc/cli/fuc_find.py +++ b/fuc/cli/fuc_find.py @@ -17,7 +17,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[FUC] find files with certain extension recursively', + help='[FUC] Find all files with a certain extension recursively.', description=description, ) parser.add_argument('ext', help='file extension') diff --git a/fuc/cli/maf_maf2vcf.py b/fuc/cli/maf_maf2vcf.py index e27e4a3..e707bf3 100644 --- a/fuc/cli/maf_maf2vcf.py +++ b/fuc/cli/maf_maf2vcf.py @@ -33,7 +33,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[MAF] convert a MAF file to a VCF file', + help='[MAF] Convert a MAF file to a VCF file.', description=description, ) parser.add_argument('maf', help='MAF file') diff --git a/fuc/cli/maf_oncoplt.py b/fuc/cli/maf_oncoplt.py index 3ad6723..569f52a 100644 --- a/fuc/cli/maf_oncoplt.py +++ b/fuc/cli/maf_oncoplt.py @@ -18,7 +18,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[MAF] create an oncoplot with a MAF file', + help='[MAF] Create an oncoplot with a MAF file.', description=description, ) parser.add_argument('maf', help='MAF file') diff --git a/fuc/cli/maf_sumplt.py b/fuc/cli/maf_sumplt.py index 69f5fde..b146064 100644 --- a/fuc/cli/maf_sumplt.py +++ b/fuc/cli/maf_sumplt.py @@ -18,7 +18,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[MAF] create a summary plot with a MAF file', + help='[MAF] Create a summary plot with a MAF file.', description=description, ) parser.add_argument('maf', help='MAF file') diff --git a/fuc/cli/maf_vcf2maf.py b/fuc/cli/maf_vcf2maf.py index 1a950e2..3e8f633 100644 --- a/fuc/cli/maf_vcf2maf.py +++ b/fuc/cli/maf_vcf2maf.py @@ -12,7 +12,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[MAF] convert an annotated VCF file to a MAF file', + help='[MAF] Convert a VCF file to a MAF file.', description=description, ) parser.add_argument('vcf', help='VCF file') diff --git a/fuc/cli/tbl_merge.py b/fuc/cli/tbl_merge.py index 244af3b..1e58c14 100644 --- a/fuc/cli/tbl_merge.py +++ b/fuc/cli/tbl_merge.py @@ -19,7 +19,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[TABLE] merge two table files', + help='[TABLE] Merge two table files.', description=description, ) parser.add_argument('left', help='left file') diff --git a/fuc/cli/tbl_sum.py b/fuc/cli/tbl_sum.py index bfe6fa5..5819a9e 100644 --- a/fuc/cli/tbl_sum.py +++ b/fuc/cli/tbl_sum.py @@ -16,7 +16,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[TABLE] summarize a table file', + help='[TABLE] Summarize a table file.', description=description, ) parser.add_argument( diff --git a/fuc/cli/vcf_filter.py b/fuc/cli/vcf_filter.py index 9868a50..f133e9c 100644 --- a/fuc/cli/vcf_filter.py +++ b/fuc/cli/vcf_filter.py @@ -23,7 +23,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] filter a VCF file', + help='[VCF] Filter a VCF file.', description=description, ) parser.add_argument('vcf', help='VCF file') diff --git a/fuc/cli/vcf_merge.py b/fuc/cli/vcf_merge.py index 65a148b..3dd8c34 100644 --- a/fuc/cli/vcf_merge.py +++ b/fuc/cli/vcf_merge.py @@ -16,7 +16,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] merge two or more VCF files', + help='[VCF] Merge two or more VCF files.', description=description, ) parser.add_argument('vcf_files', help='VCF files', nargs='+') diff --git a/fuc/cli/vcf_rename.py b/fuc/cli/vcf_rename.py index ea3dbb1..070fc04 100644 --- a/fuc/cli/vcf_rename.py +++ b/fuc/cli/vcf_rename.py @@ -25,7 +25,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] rename the samples in a VCF file.', + help='[VCF] Rename the samples in a VCF file.', description=description, ) parser.add_argument('vcf', help='VCF file') diff --git a/fuc/cli/vcf_slice.py b/fuc/cli/vcf_slice.py index 821d5d0..8fe2c08 100644 --- a/fuc/cli/vcf_slice.py +++ b/fuc/cli/vcf_slice.py @@ -16,7 +16,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] slice a VCF file', + help='[VCF] Slice a VCF file.', description=description, ) parser.add_argument('vcf', help='VCF file') diff --git a/fuc/cli/vcf_vcf2bed.py b/fuc/cli/vcf_vcf2bed.py index 1ab5d7f..e98caab 100644 --- a/fuc/cli/vcf_vcf2bed.py +++ b/fuc/cli/vcf_vcf2bed.py @@ -12,7 +12,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] convert a VCF file to a BED file', + help='[VCF] Convert a VCF file to a BED file.', description=description, ) parser.add_argument('vcf', help='VCF file') diff --git a/fuc/cli/vcf_vep.py b/fuc/cli/vcf_vep.py index d4b8c54..083aa8b 100644 --- a/fuc/cli/vcf_vep.py +++ b/fuc/cli/vcf_vep.py @@ -19,7 +19,7 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[VCF] filter a VCF file annotated by Ensemble VEP', + help='[VCF] Filter a VCF file annotated by Ensemble VEP.', description=description, ) parser.add_argument('vcf', help='VCF file annotated with Ensemble VEP') From c103595d9087fc2375801c7e158aa578af262af1 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 15:58:55 +0900 Subject: [PATCH 22/29] Update CLI --- fuc/cli/bam_head.py | 10 ++++++---- fuc/cli/bam_index.py | 6 +++--- fuc/cli/bam_rename.py | 9 ++++----- fuc/cli/bam_slice.py | 11 ++++++++--- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fuc/cli/bam_head.py b/fuc/cli/bam_head.py index fb50e5a..150d297 100644 --- a/fuc/cli/bam_head.py +++ b/fuc/cli/bam_head.py @@ -1,9 +1,11 @@ +import sys + from .. import api + import pysam description = f""" -This command will print the header of a SAM/BAM/CRAM file. It essentially -wraps the 'pysam.view' method from the pysam package. +This command will print the header of the input SAM/BAM/CRAM file. usage examples: $ fuc {api.common._script_name()} in.sam @@ -18,8 +20,8 @@ def create_parser(subparsers): help='[BAM] Print the header of a SAM/BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='SAM/BAM/CRAM file') + parser.add_argument('bam', help='SAM/BAM/CRAM file.') def main(args): header = pysam.view('-H', args.bam) - print(header, end='') + sys.stdout.write(header) diff --git a/fuc/cli/bam_index.py b/fuc/cli/bam_index.py index 4002ae0..4a4358f 100644 --- a/fuc/cli/bam_index.py +++ b/fuc/cli/bam_index.py @@ -1,9 +1,9 @@ from .. import api + import pysam description = f""" -This command will index a SAM/BAM/CRAM file. It essentially wraps the -'pysam.index' method from the pysam package. +This command will index the input SAM/BAM/CRAM file. usage examples: $ fuc {api.common._script_name()} in.sam @@ -18,7 +18,7 @@ def create_parser(subparsers): help='[BAM] Index a SAM/BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='SAM/BAM/CRAM file') + parser.add_argument('bam', help='SAM/BAM/CRAM file.') def main(args): pysam.index(args.bam) diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 6fe5109..5814ef7 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -1,8 +1,7 @@ from .. import api description = f""" -This command will rename the samples in a SAM/BAM/CRAM file. It essentially -wraps the 'pybam.rename' method from the fuc API. +This command will rename the samples in a SAM/BAM/CRAM file. usage examples: $ fuc {api.common._script_name()} in.sam new_name out.sam @@ -17,9 +16,9 @@ def create_parser(subparsers): help='[BAM] Rename the samples in a SAM/BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='SAM/BAM/CRAM file') - parser.add_argument('name', help='sample name') - parser.add_argument('out', help='output file') + parser.add_argument('bam', help='SAM/BAM/CRAM file.') + parser.add_argument('name', help='Sample name.') + parser.add_argument('out', help='Output file.') def main(args): api.pybam.rename(args.bam, args.name, args.out) diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 337842b..6e70ead 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -37,7 +37,7 @@ def create_parser(subparsers): choices=['SAM', 'BAM', 'CRAM'], help= "Output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM'). " - "A FASTA file ('--fasta') is required for 'CRAM'." + "A FASTA file must be specified with '--fasta' for 'CRAM'." ) parser.add_argument( '--fasta', @@ -51,10 +51,15 @@ def main(args): # Determine the output format. if args.format == 'BAM': stdout_method = sys.stdout.buffer.write - options.append('-b') + options += ['-b'] elif args.format == 'CRAM': stdout_method = sys.stdout.buffer.write - options.append('-C') + if args.fasta is None: + raise ValueError( + "A FASTA file must be specified with '--fasta' " + "when '--format' is 'CRAM'." + ) + options += ['-C', '-T', args.fasta] else: stdout_method = sys.stdout.write From c9a4749f1c019a602d24319a2fe2bbd29acea3e6 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 16:50:10 +0900 Subject: [PATCH 23/29] Update `bam_rename` command --- CHANGELOG.rst | 2 +- README.rst | 2 +- docs/cli.rst | 39 ++++++++++++++++++------------------ fuc/api/pybam.py | 45 ------------------------------------------ fuc/cli/bam_rename.py | 46 ++++++++++++++++++++++++++++++++++--------- 5 files changed, 58 insertions(+), 76 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 23738fa..9de88bf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,7 +14,7 @@ Changelog * Fix minor bug in :meth:`pymaf.MafFrame.to_frame` method. * Deprecate :meth:`pyvep.filter_lof/clinsig` methods. * Update :meth:`pymaf.MafFrame.from_vcf` method to extract genotype keys (e.g. DP, AD, AF). -* Update :command:`bam_slice` command. +* Update :command:`bam_slice` and :command:`bam_rename` commands. 0.14.0 (2021-06-20) ------------------- diff --git a/README.rst b/README.rst index 9cfdd45..0238e38 100644 --- a/README.rst +++ b/README.rst @@ -110,7 +110,7 @@ For getting help on the fuc CLI: COMMAND bam_head [BAM] Print the header of a SAM/BAM/CRAM file. bam_index [BAM] Index a SAM/BAM/CRAM file. - bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. + bam_rename [BAM] Rename the samples in a BAM/CRAM file. bam_slice [BAM] Slice a SAM/BAM/CRAM file. bed_intxn [BED] Find the intersection of two or more BED files. bed_sum [BED] Summarize a BED file. diff --git a/docs/cli.rst b/docs/cli.rst index d62b703..4400d22 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -20,7 +20,7 @@ For getting help on the fuc CLI: COMMAND bam_head [BAM] Print the header of a SAM/BAM/CRAM file. bam_index [BAM] Index a SAM/BAM/CRAM file. - bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. + bam_rename [BAM] Rename the samples in a BAM/CRAM file. bam_slice [BAM] Slice a SAM/BAM/CRAM file. bed_intxn [BED] Find the intersection of two or more BED files. bed_sum [BED] Summarize a BED file. @@ -61,8 +61,7 @@ bam_head $ fuc bam_head -h usage: fuc bam_head [-h] bam - This command will print the header of a SAM/BAM/CRAM file. It essentially - wraps the 'pysam.view' method from the pysam package. + This command will print the header of the input SAM/BAM/CRAM file. usage examples: $ fuc bam_head in.sam @@ -70,7 +69,7 @@ bam_head $ fuc bam_head in.cram positional arguments: - bam SAM/BAM/CRAM file + bam SAM/BAM/CRAM file. optional arguments: -h, --help Show this help message and exit. @@ -83,8 +82,7 @@ bam_index $ fuc bam_index -h usage: fuc bam_index [-h] bam - This command will index a SAM/BAM/CRAM file. It essentially wraps the - 'pysam.index' method from the pysam package. + This command will index the input SAM/BAM/CRAM file. usage examples: $ fuc bam_index in.sam @@ -92,7 +90,7 @@ bam_index $ fuc bam_index in.cram positional arguments: - bam SAM/BAM/CRAM file + bam SAM/BAM/CRAM file. optional arguments: -h, --help Show this help message and exit. @@ -103,20 +101,17 @@ bam_rename .. code-block:: text $ fuc bam_rename -h - usage: fuc bam_rename [-h] bam name out + usage: fuc bam_rename [-h] bam name - This command will rename the samples in a SAM/BAM/CRAM file. It essentially - wraps the 'pybam.rename' method from the fuc API. + This command will rename the sample(s) in the input BAM/CRAM file. usage examples: - $ fuc bam_rename in.sam new_name out.sam - $ fuc bam_rename in.bam new_name out.bam - $ fuc bam_rename in.cram new_name out.cram + $ fuc bam_rename in.bam NA12878 > out.bam + $ fuc bam_rename in.cram NA12878 > out.cram positional arguments: - bam SAM/BAM/CRAM file - name sample name - out output file + bam BAM/CRAM file. + name New sample name. optional arguments: -h, --help Show this help message and exit. @@ -127,21 +122,25 @@ bam_slice .. code-block:: text $ fuc bam_slice -h - usage: fuc bam_slice [-h] [--format TEXT] bam region [region ...] + usage: fuc bam_slice [-h] [--format TEXT] [--fasta PATH] + bam region [region ...] This command will slice the input SAM/BAM/CRAM file for the specified region(s). usage examples: $ fuc bam_slice in.bam chr1:100-200 > out.bam $ fuc bam_slice in.bam chr1:100-200 chr2:100-200 > out.bam + $ fuc bam_slice in.bam chr1:100-200 --format SAM > out.sam + $ fuc bam_slice in.bam chr1:100-200 --format CRAM --fasta ref.fa > out.cram positional arguments: - bam SAM/BAM/CRAM file - region space-separated regions ('chrom:start-end') + bam SAM/BAM/CRAM file. + region Space-separated regions ('chrom:start-end'). optional arguments: -h, --help Show this help message and exit. - --format TEXT output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM') + --format TEXT Output format (default: 'BAM') (choices: 'SAM', 'BAM', 'CRAM'). A FASTA file must be specified with '--fasta' for 'CRAM'. + --fasta PATH FASTA file. Required when '--format' is 'CRAM'. bed_intxn ========= diff --git a/fuc/api/pybam.py b/fuc/api/pybam.py index b646cdd..6866f6d 100644 --- a/fuc/api/pybam.py +++ b/fuc/api/pybam.py @@ -6,56 +6,11 @@ """ import pysam -import tempfile def header(fn): """Return the header of a BAM file.""" return pysam.view('-H', fn).strip() -def rename(fn, name, out, index=True): - """Update the SM tags in a BAM file. - - This method will write a new BAM file with updated SM tags. By default, - the method will also create an index file. - - .. warning:: - This method will only update the SM tags and nothing else; - therefore, the old sample name(s) may still be present in other - tags. - - Parameters - ---------- - fn : str - Path to the input BAM file. - name : str - New sample name. - out : str - Path to the output BAM file. - index : bool, default: True - If False, don't index the output BAM file. - """ - lines1 = pysam.view('-H', fn).strip().split('\n') - lines2 = [] - for line in lines1: - fields = line.split('\t') - if fields[0] != '@RG': - lines2.append(line) - continue - t = [] - for field in fields: - if 'SM:' not in field: - t.append(field) - continue - t.append(f'SM:{name}') - lines2.append('\t'.join(t)) - with tempfile.TemporaryDirectory() as t: - with open(f'{t}/header.sam', 'w') as f1: - f1.write('\n'.join(lines2)) - with open(out, 'wb') as f2: - f2.write(pysam.reheader(f'{t}/header.sam', fn)) - if index: - pysam.index(out) - def tag_sm(fn): """Extract the SM tags (sample names) from a BAM file. diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 5814ef7..70fb80d 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -1,24 +1,52 @@ +import sys +import tempfile + from .. import api +import pysam + description = f""" -This command will rename the samples in a SAM/BAM/CRAM file. +This command will rename the sample(s) in the input BAM/CRAM file. usage examples: - $ fuc {api.common._script_name()} in.sam new_name out.sam - $ fuc {api.common._script_name()} in.bam new_name out.bam - $ fuc {api.common._script_name()} in.cram new_name out.cram + $ fuc {api.common._script_name()} in.bam NA12878 > out.bam + $ fuc {api.common._script_name()} in.cram NA12878 > out.cram """ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] Rename the samples in a SAM/BAM/CRAM file.', + help='[BAM] Rename the samples in a BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='SAM/BAM/CRAM file.') - parser.add_argument('name', help='Sample name.') - parser.add_argument('out', help='Output file.') + parser.add_argument('bam', help='BAM/CRAM file.') + parser.add_argument('name', help='New sample name.') def main(args): - api.pybam.rename(args.bam, args.name, args.out) + old_lines = pysam.view('-H', args.bam).strip().split('\n') + new_lines = [] + + for old_line in old_lines: + old_fields = old_line.split('\t') + + if old_fields[0] != '@RG': + new_lines.append(old_line) + continue + + new_fields = [] + + for old_field in old_fields: + if 'SM:' not in old_field: + new_fields.append(old_field) + continue + + new_fields.append(f'SM:{args.name}') + + new_lines.append('\t'.join(new_fields)) + + with tempfile.TemporaryDirectory() as t: + with open(f'{t}/header.sam', 'w') as f: + f.write('\n'.join(new_lines)) + + sys.stdout.buffer.write(pysam.reheader(f'{t}/header.sam', args.bam)) From f26afb08360efc0a881090701940c3fedaaedb5b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 16:58:51 +0900 Subject: [PATCH 24/29] Update CHANGELOG.rst --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9de88bf..4331392 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,6 +15,7 @@ Changelog * Deprecate :meth:`pyvep.filter_lof/clinsig` methods. * Update :meth:`pymaf.MafFrame.from_vcf` method to extract genotype keys (e.g. DP, AD, AF). * Update :command:`bam_slice` and :command:`bam_rename` commands. +* Deprecate :meth:`pybam.rename`. 0.14.0 (2021-06-20) ------------------- From e4365a91432b2ffb3b29cbc6f2e4c50aaacafa0b Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 16:59:27 +0900 Subject: [PATCH 25/29] Update CHANGELOG.rst --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4331392..47b5d26 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -15,7 +15,7 @@ Changelog * Deprecate :meth:`pyvep.filter_lof/clinsig` methods. * Update :meth:`pymaf.MafFrame.from_vcf` method to extract genotype keys (e.g. DP, AD, AF). * Update :command:`bam_slice` and :command:`bam_rename` commands. -* Deprecate :meth:`pybam.rename`. +* Deprecate :meth:`pybam.rename` method. 0.14.0 (2021-06-20) ------------------- From b2fc7cbde72ec69c7abecb5799fed24c38388db2 Mon Sep 17 00:00:00 2001 From: "Seung-been \"Steven\" Lee" Date: Wed, 23 Jun 2021 17:10:45 +0900 Subject: [PATCH 26/29] Update CLI --- fuc/cli/bam_head.py | 2 +- fuc/cli/bam_rename.py | 5 +++-- fuc/cli/bam_slice.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fuc/cli/bam_head.py b/fuc/cli/bam_head.py index 150d297..287f94f 100644 --- a/fuc/cli/bam_head.py +++ b/fuc/cli/bam_head.py @@ -23,5 +23,5 @@ def create_parser(subparsers): parser.add_argument('bam', help='SAM/BAM/CRAM file.') def main(args): - header = pysam.view('-H', args.bam) + header = pysam.view('-H', args.bam, '--no-PG') sys.stdout.write(header) diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 70fb80d..6f563aa 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -24,7 +24,7 @@ def create_parser(subparsers): parser.add_argument('name', help='New sample name.') def main(args): - old_lines = pysam.view('-H', args.bam).strip().split('\n') + old_lines = pysam.view('-H', args.bam, '--no-PG').strip().split('\n') new_lines = [] for old_line in old_lines: @@ -49,4 +49,5 @@ def main(args): with open(f'{t}/header.sam', 'w') as f: f.write('\n'.join(new_lines)) - sys.stdout.buffer.write(pysam.reheader(f'{t}/header.sam', args.bam)) + alignments = pysam.reheader(f'{t}/header.sam', args.bam, '--no-PG') + sys.stdout.buffer.write(alignments) diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 6e70ead..2c5536d 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -46,7 +46,7 @@ def create_parser(subparsers): ) def main(args): - options = [] + options = ['--no-PG'] # Determine the output format. if args.format == 'BAM': From 73384f6f90c201a89ed515e2edf509a6e881036e Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Wed, 23 Jun 2021 20:11:29 +0900 Subject: [PATCH 27/29] Update `bam_slice` command --- fuc/cli/bam_rename.py | 43 ++++++++++++++++++++++++++++--------------- fuc/cli/bam_slice.py | 2 +- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/fuc/cli/bam_rename.py b/fuc/cli/bam_rename.py index 6f563aa..44e37c1 100644 --- a/fuc/cli/bam_rename.py +++ b/fuc/cli/bam_rename.py @@ -6,9 +6,10 @@ import pysam description = f""" -This command will rename the sample(s) in the input BAM/CRAM file. +This command will rename the sample(s) in the input SAM/BAM/CRAM file. usage examples: + $ fuc {api.common._script_name()} in.sam NA12878 > out.sam $ fuc {api.common._script_name()} in.bam NA12878 > out.bam $ fuc {api.common._script_name()} in.cram NA12878 > out.cram """ @@ -17,37 +18,49 @@ def create_parser(subparsers): parser = api.common._add_parser( subparsers, api.common._script_name(), - help='[BAM] Rename the samples in a BAM/CRAM file.', + help='[BAM] Rename the samples in a SAM/BAM/CRAM file.', description=description, ) - parser.add_argument('bam', help='BAM/CRAM file.') + parser.add_argument('bam', help='SAM/BAM/CRAM file.') parser.add_argument('name', help='New sample name.') def main(args): + # Detect the input format. + try: + with open(args.bam, 'r') as f: + for line in f: + is_sam = True + break + except UnicodeDecodeError: + is_sam = False + + # Rename the sample(s). old_lines = pysam.view('-H', args.bam, '--no-PG').strip().split('\n') new_lines = [] - for old_line in old_lines: old_fields = old_line.split('\t') - if old_fields[0] != '@RG': new_lines.append(old_line) continue - new_fields = [] - for old_field in old_fields: if 'SM:' not in old_field: new_fields.append(old_field) continue - new_fields.append(f'SM:{args.name}') - new_lines.append('\t'.join(new_fields)) - with tempfile.TemporaryDirectory() as t: - with open(f'{t}/header.sam', 'w') as f: - f.write('\n'.join(new_lines)) - - alignments = pysam.reheader(f'{t}/header.sam', args.bam, '--no-PG') - sys.stdout.buffer.write(alignments) + # Write the output file. + if is_sam: + for new_line in new_lines: + sys.stdout.write(new_line + '\n') + alignments = pysam.view(args.bam, '--no-PG') + sys.stdout.write(alignments) + else: + with tempfile.TemporaryDirectory() as t: + with open(f'{t}/header.sam', 'w') as f: + f.write('\n'.join(new_lines)) + alignments = pysam.reheader(f'{t}/header.sam', + args.bam, + '--no-PG') + sys.stdout.buffer.write(alignments) diff --git a/fuc/cli/bam_slice.py b/fuc/cli/bam_slice.py index 2c5536d..e99f79f 100644 --- a/fuc/cli/bam_slice.py +++ b/fuc/cli/bam_slice.py @@ -46,7 +46,7 @@ def create_parser(subparsers): ) def main(args): - options = ['--no-PG'] + options = ['-h', '--no-PG'] # Determine the output format. if args.format == 'BAM': From 7604274aa5f8c64d4c7bccd7dfac73180336d104 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Wed, 23 Jun 2021 20:28:06 +0900 Subject: [PATCH 28/29] Update docs --- README.rst | 12 +++++++++--- docs/cli.rst | 7 ++++--- docs/create.py | 10 ++++++++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 0238e38..360538e 100644 --- a/README.rst +++ b/README.rst @@ -110,7 +110,7 @@ For getting help on the fuc CLI: COMMAND bam_head [BAM] Print the header of a SAM/BAM/CRAM file. bam_index [BAM] Index a SAM/BAM/CRAM file. - bam_rename [BAM] Rename the samples in a BAM/CRAM file. + bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. bam_slice [BAM] Slice a SAM/BAM/CRAM file. bed_intxn [BED] Find the intersection of two or more BED files. bed_sum [BED] Summarize a BED file. @@ -165,7 +165,7 @@ For getting help on a specific submodule (e.g. pyvcf): CLI Examples ============ -**BAM** +**SAM/BAM/CRAM** To print the header of a SAM file: @@ -179,11 +179,17 @@ To index a CRAM file: $ fuc bam_index in.cram +To rename the samples in a SAM file: + +.. code-block:: text + + $ fuc bam_rename in.sam NA12878 > out.sam + To slice a BAM file: .. code-block:: text - $ fuc bam_slice in.bam chr1:100-200 out.bam + $ fuc bam_slice in.bam chr1:100-200 > out.bam **BED** diff --git a/docs/cli.rst b/docs/cli.rst index 4400d22..b041882 100644 --- a/docs/cli.rst +++ b/docs/cli.rst @@ -20,7 +20,7 @@ For getting help on the fuc CLI: COMMAND bam_head [BAM] Print the header of a SAM/BAM/CRAM file. bam_index [BAM] Index a SAM/BAM/CRAM file. - bam_rename [BAM] Rename the samples in a BAM/CRAM file. + bam_rename [BAM] Rename the samples in a SAM/BAM/CRAM file. bam_slice [BAM] Slice a SAM/BAM/CRAM file. bed_intxn [BED] Find the intersection of two or more BED files. bed_sum [BED] Summarize a BED file. @@ -103,14 +103,15 @@ bam_rename $ fuc bam_rename -h usage: fuc bam_rename [-h] bam name - This command will rename the sample(s) in the input BAM/CRAM file. + This command will rename the sample(s) in the input SAM/BAM/CRAM file. usage examples: + $ fuc bam_rename in.sam NA12878 > out.sam $ fuc bam_rename in.bam NA12878 > out.bam $ fuc bam_rename in.cram NA12878 > out.cram positional arguments: - bam BAM/CRAM file. + bam SAM/BAM/CRAM file. name New sample name. optional arguments: diff --git a/docs/create.py b/docs/create.py index bb02ba0..7850b2f 100644 --- a/docs/create.py +++ b/docs/create.py @@ -153,7 +153,7 @@ CLI Examples ============ -**BAM** +**SAM/BAM/CRAM** To print the header of a SAM file: @@ -167,11 +167,17 @@ $ fuc bam_index in.cram +To rename the samples in a SAM file: + +.. code-block:: text + + $ fuc bam_rename in.sam NA12878 > out.sam + To slice a BAM file: .. code-block:: text - $ fuc bam_slice in.bam chr1:100-200 out.bam + $ fuc bam_slice in.bam chr1:100-200 > out.bam **BED** From 4b0c492a69496ac6bad231ac40ebeaf4176e3644 Mon Sep 17 00:00:00 2001 From: Seung-been Lee Date: Thu, 24 Jun 2021 20:03:45 +0900 Subject: [PATCH 29/29] Update CHANGELOG.rst --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 47b5d26..0770efd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,8 @@ Changelog ********* -0.15.0 (in development) ------------------------ +0.15.0 (2021-06-24) +------------------- * Update :command:`vcf_filter` command. * Update :command:`tbl_sum` command.