diff --git a/.gitignore b/.gitignore index 30e6eb51..448e4525 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/freyja.egg-info/PKG-INFO b/freyja.egg-info/PKG-INFO index 7503c46c..5f4e339b 100644 --- a/freyja.egg-info/PKG-INFO +++ b/freyja.egg-info/PKG-INFO @@ -52,7 +52,7 @@ We can then run Freyja on the output files using the commmand: ``` freyja demix [variants-file] [depth-file] --output [output-file] ``` -This outputs to a tsv file that includes the lineages present, their corresponding abundances, and summarization by constellation. This method also includes a `--eps` option, which enables the user to define the minimum lineage abundance returned to the user (e.g. `--eps 0.0001`). A custom barcode file can be provided using the `--barcodes [path-to-barcode-file]` option. As the UShER tree now included proposed lineages, we now offer the ```--confirmedonly``` flag which removes unconfirmed lineages from the analysis. For additional flexibility and reproducibility of analyses, a custom lineage-to-contellation mapping metadata file can be provided using the `--meta` option. An example output should have the format +This outputs to a tsv file that includes the lineages present, their corresponding abundances, and summarization by constellation. This method also includes a `--eps` option, which enables the user to define the minimum lineage abundance returned to the user (e.g. `--eps 0.0001`). A custom barcode file can be provided using the `--barcodes [path-to-barcode-file]` option. As the UShER tree now included proposed lineages, we now offer the ```--confirmedonly``` flag which removes unconfirmed lineages from the analysis. For additional flexibility and reproducibility of analyses, a custom lineage-to-contellation mapping metadata file can be provided using the `--meta` option. A coverage depth minimum can be specified using the `--depthcutoff` option, which excludes sites with coverage less than the specified value. An example output should have the format | | filename | @@ -63,7 +63,7 @@ This outputs to a tsv file that includes the lineages present, their correspondi | resid | 3.14159 | | coverage | 95.8 | -Where ```summarized``` denotes a sum of all lineage abundances in a particular WHO designation (i.e. B.1.617.2 and AY.6 abundances are summed in the above example), otherwise they are grouped into "Other". The ```lineage``` array lists the identified lineages in descending order, and ```abundances``` contains the corresponding abundances estimates. The value of ```resid``` corresponds to the residual of the weighted least absolute devation problem used to estimate lineage abundances. The ```coverage``` value provides the 10x coverage estimate (percent of sites with 10 or greater reads- 10 is the default but can be modfied using the ```--covcut``` option in ```demix```). +Where ```summarized``` denotes a sum of all lineage abundances in a particular WHO designation (i.e. B.1.617.2 and AY.6 abundances are summed in the above example), otherwise they are grouped into "Other". The ```lineage``` array lists the identified lineages in descending order, and ```abundances``` contains the corresponding abundances estimates. Using the `--depthcutoff` option may result in some distinct lineages now having identical barcodes, which are grouped into the format `[lineage]-like(num)` (based on their shared phylogeny) in the output. A summary of this lineage grouping is outputted to `[output-file]_collapsed_lineages.yml`. The value of ```resid``` corresponds to the residual of the weighted least absolute devation problem used to estimate lineage abundances. The ```coverage``` value provides the 10x coverage estimate (percent of sites with 10 or greater reads- 10 is the default but can be modfied using the ```--covcut``` option in ```demix```) . NOTE: The ```freyja variants``` output is stable in time, and does not need to be re-run to incorporate updated lineage designations/corresponding mutational barcodes, whereas the outputs of ```freyja demix``` will change as barcodes are updated (and thus ```demix``` should be re-run as new information is made available). @@ -81,7 +81,7 @@ We now provide a fast bootstrapping method for freyja, which can be run using th ``` freyja boot [variants-file] [depth-file] --nt [number-of-cpus] --nb [number-of-bootstraps] --output_basename [base-name] ``` -which results in two output files `base-name_lineages.csv` and `base-name_summarized.csv`, which contain the 0.025, 0.05,0.25,0.5 (median),0.75, 0.95, and 0.975 percentiles for each lineage and WHO designated VOI/VOC, respectively, as obtained via the bootstrap. We also provide the `--eps`, `--barcodes`, and `--meta` options as in `freyja demix`. We now also provide a `--boxplot` option, which should be specified in the form `--boxplot pdf` if you want the boxplot in pdf format. +which results in two output files: `base-name_lineages.csv` and `base-name_summarized.csv`, which contain the 0.025, 0.05, 0.25, 0.5 (median),0.75, 0.95, and 0.975 percentiles for each lineage and WHO designated VOI/VOC, respectively, as obtained via the bootstrap. If the `--rawboots` option is used, it will return two additional output files `base-name_lineages_boot.csv` and `base-name_summarized_boot.csv`, which contain the bootstrap estimates (rather than summary statistics). We also provide the `--eps`, `--barcodes`, and `--meta` options as in `freyja demix`. We now also provide a `--boxplot` option, which should be specified in the form `--boxplot pdf` if you want the boxplot in pdf format. For rapid visualization of results, we also offer two utility methods for manipulating the "demixed" output files. The first is an aggregation method @@ -142,7 +142,7 @@ We now provide tools for the analysis of indexed bam files given a set of mutati ``` freyja extract [query-mutations.csv] [input-bam] --output [outputname.bam] --same_read ``` -The above command will extract reads containing one or more mutations of interest and save them to `[input-bam]_extracted.bam`. Additionally, the `--same_read` flag can be included to specify that all query mutations must occur on the same read. In order to exclude reads containing one or more mutations, use the following: +The above command will extract reads containing one or more mutations of interest and save them to `[input-bam]_extracted.bam`. Additionally, the `--same_read` flag can be included to specify that all query mutations must occur on the same set of paired reads (same UMI). In order to **exclude** reads containing one or more mutations, use the following: ``` freyja filter [query-mutations.csv] [input-bam] [min-site] [max-site] --output [outputname.bam] ``` @@ -152,15 +152,15 @@ C75T,G230A,A543C (732:'TT'),(1349:'A'),(12333:'A') (1443:32),(1599:2),(2036:3) ``` -In many cases, it can be useful to study covariant mutations (i.e. mutations co-occurring on the same read). This can be performed with the command: +In many cases, it can be useful to study covariant mutations (i.e. mutations co-occurring on the same read pair). This can be performed with the command: ``` -freyja covariants [input_bam] [min_site] [max_site] --output [outputname.tsv] +freyja covariants [input_bam] [min_site] [max_site] --ref-genome [ref-genome.fasta (defaults to NC_045512.2 Hu-1)] --gff-file [gene-annotations.gff] --output [outputname.tsv] --sort_by [count|freq|site] ``` -This outputs to a tsv file that includes the mutations present in each set of covariants, their frequency (with respect to the total number of reads between `min_site` and `max_site`, inclusively), as well as the coverage ranges. Should the user wish to only consider reads that span the entire genomic region defined by (min_site, max_site), they may include the `--spans_region` flag. Inclusion thresholds for read-mapping quality and the number of observed instances of a set of covariants can be set using `--min_quality` and `--min_count` respectively. The output can then be passed into: +This outputs to a tsv file that includes the mutations present in each set of covariants, their absolute counts (the number of read pairs with the mutations), their coverage ranges (the minimum and maximum position for read-pairs with the mutations), their "maximum" counts (the number of read pairs that span the positions in the mutations), and their frequencies (the absolute count divided by the maximum count). Should the user wish to only consider read pairs that span the entire genomic region defined by (min_site, max_site), they may include the `--spans_region` flag. By default, the covariant patterns are sorted in descending order by count, however they can also be sorted in descending order by frequency by setting the `--sort_by` option to "freq", or sorted sequentially by mutation site by setting the `--sort_by` option to "site". The `--ref-genome` argument defaults to `freyja/data/NC_045512_Hu-1.fasta`. If you are using a different build to perfrom alignment, it is important to pass that file in to `--ref-genome` instead. Optionally, a gff file (e.g. `freyja/data/NC_045512_Hu-1.gff`) may be included via the `--gff-file` option to output amino acid mutations alongside nucleotide mutations. Inclusion thresholds for read-mapping quality and the number of observed instances of a set of covariants can be set using `--min_quality` and `--min_count` respectively. The output can then be passed into: ``` freyja plot-covariants [covariant_file.tsv] --output [plot-filename(.pdf,.png,etc.)] ``` -This generates an informative heatmap visualization, showing variants as well as coverage across different reads. The threshold, `--min_mutations`, sets a minimum number of mutations per set of covariants for inclusion in the plot. Example output: -| **Covariants Plot**| -| :---: | -|![](freyja/data/example_covariants_plot.png)| +This generates an informative heatmap visualization showing the frequency and coverage ranges of each observed covariant pattern. Each pattern is listed in the format CP(n). `--num_clusters` sets the number of patterns (i.e. rows in the plot) to display, while the threshold `--min_mutations` sets a minimum number of mutations per set of covariants for inclusion in the plot. The flag `--nt_muts` can be included to provide nucleotide-specific information in the plot (e.g. C22995A(S:T478K) as opposed to S:T478K), making it possible to view multi-nucleotide variants. + +Example output: +![](freyja/data/example_covariants_plot.png) diff --git a/freyja.egg-info/SOURCES.txt b/freyja.egg-info/SOURCES.txt index 48e19bd4..f43dfd1c 100644 --- a/freyja.egg-info/SOURCES.txt +++ b/freyja.egg-info/SOURCES.txt @@ -5,6 +5,7 @@ freyja/__init__.py freyja/_cli.py freyja/convert_paths2barcodes.py freyja/read_analysis_tools.py +freyja/read_analysis_utils.py freyja/sample_deconv.py freyja/updates.py freyja/utils.py @@ -37,8 +38,10 @@ freyja/data/sweep_metadata.csv freyja/data/sweep_metadata_missing.csv freyja/data/test.bam freyja/data/test.bam.bai +freyja/data/test.depth freyja/data/test.pdf freyja/data/test.png +freyja/data/test.tsv freyja/data/test.vcf freyja/data/test0.html freyja/data/test0.png diff --git a/freyja/_cli.py b/freyja/_cli.py index 0b6aab78..d9091dc1 100644 --- a/freyja/_cli.py +++ b/freyja/_cli.py @@ -25,7 +25,7 @@ @click.group() -@click.version_option('1.4.7') +@click.version_option('1.4.8') def cli(): pass @@ -57,8 +57,13 @@ def print_barcode_version(ctx, param, value): @click.option('--depthcutoff', default=0, help='exclude sites with coverage depth below this value and' 'group identical barcodes') +@click.option('--adapt', default=0., + help='adaptive lasso penalty parameter') +@click.option('--a_eps', default=1E-8, + help='adaptive lasso parameter, hard threshold') def demix(variants, depths, output, eps, barcodes, meta, - covcut, confirmedonly, depthcutoff): + covcut, confirmedonly, depthcutoff, + adapt, a_eps): locDir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir)) # option for custom barcodes @@ -92,7 +97,8 @@ def demix(variants, depths, output, eps, barcodes, meta, sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths_, - eps) + eps, adapt, + a_eps) # merge intra-lineage diversity if multiple hits. if len(set(sample_strains)) < len(sample_strains): localDict = {} diff --git a/freyja/sample_deconv.py b/freyja/sample_deconv.py index eee33302..df17fa8e 100644 --- a/freyja/sample_deconv.py +++ b/freyja/sample_deconv.py @@ -149,7 +149,7 @@ def map_to_constellation(sample_strains, vals, mapDict): return localDict -def solve_demixing_problem(df_barcodes, mix, depths, eps): +def solve_demixing_problem(df_barcodes, mix, depths, eps, adapt, a_eps): # single file problem setup, solving dep = np.log2(depths+1) @@ -169,6 +169,28 @@ def solve_demixing_problem(df_barcodes, mix, depths, eps): 'likely due to insufficient sequencing depth.') sys.exit(1) sol = x.value + rnorm0 = cp.norm(A @ x - b, 1).value + if adapt > 0.: + # if specified, run the adaptive lasso based method + # thresholding step + solNz = [j for j, s in enumerate(sol) if s > a_eps] + solFlip = [1./sol[j] for j in solNz] + A = A[:, solNz] + x = cp.Variable(A.shape[1]) + cost = cp.norm(A @ x - b, 1) +\ + (rnorm0*adapt)*cp.norm(cp.multiply(solFlip, x), 1) + constraints = [sum(x) == 1, x >= 0] + prob = cp.Problem(cp.Minimize(cost), constraints) + try: + prob.solve(verbose=False) + except cp.error.SolverError: + print('demix: Solver error encountered, most ' + 'likely due to insufficient sequencing depth. ' + 'Try increasing the --depthcutoff parameter.') + sys.exit(1) + sol = np.zeros(len(sol)) + sol[solNz] = x.value + rnorm = cp.norm(A @ x - b, 1).value # extract lineages with non-negligible abundance sol[sol < eps] = 0 @@ -181,7 +203,8 @@ def solve_demixing_problem(df_barcodes, mix, depths, eps): def bootstrap_parallel(jj, samplesDefining, fracDepths_adj, mix_grp, - mix, df_barcodes, eps0, muts, mapDict): + mix, df_barcodes, eps0, muts, mapDict, + adapt, a_eps): # helper function for fast bootstrap and solve # get sequencing depth at the position of all defining mutations mix_boot = mix.copy() @@ -220,14 +243,16 @@ def bootstrap_parallel(jj, samplesDefining, fracDepths_adj, mix_grp, mix_boot, dps_) sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix_boot_, - dps_, eps0) + dps_, eps0, + adapt, a_eps) localDict = map_to_constellation(sample_strains, abundances, mapDict) return sample_strains, abundances, localDict def perform_bootstrap(df_barcodes, mix, depths_, numBootstraps, eps0, n_jobs, - mapDict, muts, boxplot, basename): + mapDict, muts, boxplot, basename, + adapt=0., a_eps=1E-8): depths_.index = depths_.index.to_series().apply(lambda x: int(x[1:len(x)-1])) depths_ = depths_[~depths_.index.duplicated(keep='first')] @@ -257,7 +282,9 @@ def perform_bootstrap(df_barcodes, mix, depths_, df_barcodes, eps0, muts, - mapDict) + mapDict, + adapt, + a_eps) for jj0 in tqdm(range(numBootstraps))) for i in range(len(out)): sample_lins, abundances, localDict = out[i] diff --git a/freyja/tests/test_deconv.py b/freyja/tests/test_deconv.py index d46b8851..1eaa97c3 100644 --- a/freyja/tests/test_deconv.py +++ b/freyja/tests/test_deconv.py @@ -1,8 +1,8 @@ import unittest import pandas as pd -from freyja.sample_deconv import buildLineageMap, build_mix_and_depth_arrays,\ - reindex_dfs, map_to_constellation, solve_demixing_problem,\ - perform_bootstrap +from freyja.sample_deconv import buildLineageMap, build_mix_and_depth_arrays, \ + reindex_dfs, map_to_constellation, solve_demixing_problem, \ + perform_bootstrap import pandas.testing as pdt import pandas.api.types as ptypes from numpy.random import negative_binomial @@ -56,7 +56,8 @@ def test_demixing(self): eps = 0.001 sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, depths, - eps) + eps, 0., + 1.E-8) self.assertAlmostEqual( abundances[sample_strains.tolist().index(strain1)], mixFracs[0]) self.assertAlmostEqual( diff --git a/freyja/utils.py b/freyja/utils.py index a155189d..8ea2b538 100644 --- a/freyja/utils.py +++ b/freyja/utils.py @@ -183,6 +183,10 @@ def get_color_scheme(df, default_color_scheme, config=None): def prepLineageDict(agg_d0, thresh=0.001, config=None, lineage_info=None): + + if len(agg_d0.index[agg_d0.index.duplicated(keep=False)]) > 0: + print('WARNING: multiple samples have the same ID/filename.') + print(agg_d0.index[agg_d0.index.duplicated(keep=False)]) agg_d0.loc[:, 'lineages'] = agg_d0['lineages']\ .apply(lambda x: x.replace("'", "")