From 1dda6c1960a6a6c4bd248751c169ee0ff0178715 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=BCller-B=C3=B6tticher?= <niklasmueboe@github.com>
Date: Fri, 28 Jul 2023 09:29:07 +0200
Subject: [PATCH] allow setting the count prior for empty droplets on cli

---
 cellbender/remove_background/argparse.py     |  5 ++++
 cellbender/remove_background/cli.py          |  6 +++++
 cellbender/remove_background/data/dataset.py | 25 ++++++++++++--------
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/cellbender/remove_background/argparse.py b/cellbender/remove_background/argparse.py
index a52a3eb..73aa716 100644
--- a/cellbender/remove_background/argparse.py
+++ b/cellbender/remove_background/argparse.py
@@ -45,6 +45,11 @@ def add_subparser_args(subparsers: argparse) -> argparse:
                            help="Number of cells expected in the dataset "
                                 "(a rough estimate within a factor of 2 "
                                 "is sufficient).")
+    subparser.add_argument("--expected-ambient-size", nargs=None, type=int,
+                           default=None,
+                           dest="expected_ambient_size",
+                           help="Prior for the number of counts expected in empty "
+                                "droplets.")
     subparser.add_argument("--total-droplets-included",
                            nargs=None, type=int,
                            default=consts.TOTAL_DROPLET_DEFAULT,
diff --git a/cellbender/remove_background/cli.py b/cellbender/remove_background/cli.py
index 16e93e8..2ca0eb2 100644
--- a/cellbender/remove_background/cli.py
+++ b/cellbender/remove_background/cli.py
@@ -51,6 +51,11 @@ def validate_args(self, args):
             assert args.total_droplets > args.expected_cell_count, \
                 f"total_droplets must be an integer greater than the input " \
                 f"expected_cell_count, which is {args.expected_cell_count}."
+            
+        # If expected counts for empty drolets are specified, it should be positive
+        if (args.expected_ambient_size is not None):
+            assert args.expected_ambient_size > 0, \
+                f"expected_ambient_size must be an integer greater than 0."
 
         assert (args.fraction_empties > 0) and (args.fraction_empties < 1), \
             "fraction_empties must be between 0 and 1, exclusive.  This is " \
@@ -150,6 +155,7 @@ def run_remove_background(args):
         dataset_obj = \
             SingleCellRNACountsDataset(input_file=args.input_file,
                                        expected_cell_count=args.expected_cell_count,
+                                       expected_ambient_size=args.expected_ambient_size,
                                        total_droplet_barcodes=args.total_droplets,
                                        fraction_empties=args.fraction_empties,
                                        model_name=args.model,
diff --git a/cellbender/remove_background/data/dataset.py b/cellbender/remove_background/data/dataset.py
index a4eb84b..0e147ab 100644
--- a/cellbender/remove_background/data/dataset.py
+++ b/cellbender/remove_background/data/dataset.py
@@ -68,6 +68,7 @@ def __init__(self,
                  low_count_threshold: int,
                  fpr: List[float],
                  expected_cell_count: Optional[int] = None,
+                 expected_ambient_size: Optional[int] = None,
                  total_droplet_barcodes: int = consts.TOTAL_DROPLET_DEFAULT,
                  fraction_empties: Optional[float] = None,
                  gene_blacklist: List[int] = []):
@@ -83,7 +84,8 @@ def __init__(self,
         self.fraction_empties = fraction_empties
         self.is_trimmed = False
         self.low_count_threshold = low_count_threshold
-        self.priors = {'n_cells': expected_cell_count}  # Expected cells could be None.
+        self.priors = {'n_cells': expected_cell_count, # Expected cells could be None.
+                       'empty_counts': expected_ambient_size}  
         self.posterior = None
         self.fpr = fpr
         self.random = np.random.RandomState(seed=1234)
@@ -1425,17 +1427,20 @@ def get_d_priors_from_dataset(dataset: SingleCellRNACountsDataset) \
 
     # Models that include both cells and empty droplets.
     else:
+        
+        if dataset.priors['empty_counts'] is None:
+            # Cutoff for original data.  Empirical.
+            cut = dataset.low_count_threshold
 
-        # Cutoff for original data.  Empirical.
-        cut = dataset.low_count_threshold
-
-        # Estimate the number of UMI counts in empty droplets.
+            # Estimate the number of UMI counts in empty droplets.
 
-        # Mode of (rounded) log counts (for counts > cut) is a robust
-        # empty estimator.
-        empty_log_counts = mode(np.round(np.log1p(counts[counts > cut]),
-                                         decimals=1))[0]
-        empty_counts = int(np.expm1(empty_log_counts).item())
+            # Mode of (rounded) log counts (for counts > cut) is a robust
+            # empty estimator.
+            empty_log_counts = mode(np.round(np.log1p(counts[counts > cut]),
+                                            decimals=1))[0]
+            empty_counts = int(np.expm1(empty_log_counts).item())
+        else:
+            empty_counts = dataset.priors['empty_counts']
 
         # Estimate the number of UMI counts in cells.