From 1dda6c1960a6a6c4bd248751c169ee0ff0178715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=BCller-B=C3=B6tticher?= Date: Fri, 28 Jul 2023 09:29:07 +0200 Subject: [PATCH] allow setting the count prior for empty droplets on cli --- cellbender/remove_background/argparse.py | 5 ++++ cellbender/remove_background/cli.py | 6 +++++ cellbender/remove_background/data/dataset.py | 25 ++++++++++++-------- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/cellbender/remove_background/argparse.py b/cellbender/remove_background/argparse.py index a52a3eb..73aa716 100644 --- a/cellbender/remove_background/argparse.py +++ b/cellbender/remove_background/argparse.py @@ -45,6 +45,11 @@ def add_subparser_args(subparsers: argparse) -> argparse: help="Number of cells expected in the dataset " "(a rough estimate within a factor of 2 " "is sufficient).") + subparser.add_argument("--expected-ambient-size", nargs=None, type=int, + default=None, + dest="expected_ambient_size", + help="Prior for the number of counts expected in empty " + "droplets.") subparser.add_argument("--total-droplets-included", nargs=None, type=int, default=consts.TOTAL_DROPLET_DEFAULT, diff --git a/cellbender/remove_background/cli.py b/cellbender/remove_background/cli.py index 16e93e8..2ca0eb2 100644 --- a/cellbender/remove_background/cli.py +++ b/cellbender/remove_background/cli.py @@ -51,6 +51,11 @@ def validate_args(self, args): assert args.total_droplets > args.expected_cell_count, \ f"total_droplets must be an integer greater than the input " \ f"expected_cell_count, which is {args.expected_cell_count}." + + # If expected counts for empty drolets are specified, it should be positive + if (args.expected_ambient_size is not None): + assert args.expected_ambient_size > 0, \ + f"expected_ambient_size must be an integer greater than 0." assert (args.fraction_empties > 0) and (args.fraction_empties < 1), \ "fraction_empties must be between 0 and 1, exclusive. This is " \ @@ -150,6 +155,7 @@ def run_remove_background(args): dataset_obj = \ SingleCellRNACountsDataset(input_file=args.input_file, expected_cell_count=args.expected_cell_count, + expected_ambient_size=args.expected_ambient_size, total_droplet_barcodes=args.total_droplets, fraction_empties=args.fraction_empties, model_name=args.model, diff --git a/cellbender/remove_background/data/dataset.py b/cellbender/remove_background/data/dataset.py index a4eb84b..0e147ab 100644 --- a/cellbender/remove_background/data/dataset.py +++ b/cellbender/remove_background/data/dataset.py @@ -68,6 +68,7 @@ def __init__(self, low_count_threshold: int, fpr: List[float], expected_cell_count: Optional[int] = None, + expected_ambient_size: Optional[int] = None, total_droplet_barcodes: int = consts.TOTAL_DROPLET_DEFAULT, fraction_empties: Optional[float] = None, gene_blacklist: List[int] = []): @@ -83,7 +84,8 @@ def __init__(self, self.fraction_empties = fraction_empties self.is_trimmed = False self.low_count_threshold = low_count_threshold - self.priors = {'n_cells': expected_cell_count} # Expected cells could be None. + self.priors = {'n_cells': expected_cell_count, # Expected cells could be None. + 'empty_counts': expected_ambient_size} self.posterior = None self.fpr = fpr self.random = np.random.RandomState(seed=1234) @@ -1425,17 +1427,20 @@ def get_d_priors_from_dataset(dataset: SingleCellRNACountsDataset) \ # Models that include both cells and empty droplets. else: + + if dataset.priors['empty_counts'] is None: + # Cutoff for original data. Empirical. + cut = dataset.low_count_threshold - # Cutoff for original data. Empirical. - cut = dataset.low_count_threshold - - # Estimate the number of UMI counts in empty droplets. + # Estimate the number of UMI counts in empty droplets. - # Mode of (rounded) log counts (for counts > cut) is a robust - # empty estimator. - empty_log_counts = mode(np.round(np.log1p(counts[counts > cut]), - decimals=1))[0] - empty_counts = int(np.expm1(empty_log_counts).item()) + # Mode of (rounded) log counts (for counts > cut) is a robust + # empty estimator. + empty_log_counts = mode(np.round(np.log1p(counts[counts > cut]), + decimals=1))[0] + empty_counts = int(np.expm1(empty_log_counts).item()) + else: + empty_counts = dataset.priors['empty_counts'] # Estimate the number of UMI counts in cells.