merenlab · ivagljiva · Jul 5, 2022 · Jul 5, 2022 · Jul 5, 2022 · Jul 6, 2022
diff --git a/anvio/__init__.py b/anvio/__init__.py
@@ -626,6 +626,14 @@ def TABULATE(table, header, numalign="right", max_width=0):
                      "A file name must be provided. To learn more about how the functional occurrence is computed, please "
                      "refer to the tutorial."}
                 ),
+    'qlambda': (
+            ['--qlambda'],
+            {'default': None,
+             'metavar': 'NUM',
+             'type': float,
+             'help': "If you want to set the maximum lambda value for q-value estimation, provide a fraction between "
+                     "0.05 and 1."}
+                ),
     'table': (
             ['--table'],
             {'metavar': 'TABLE_NAME',

diff --git a/anvio/genomedescriptions.py b/anvio/genomedescriptions.py
@@ -922,6 +922,7 @@ def __init__(self, args, layer_groups=None, skip_sanity_check=False, skip_init=F
         self.print_genome_names_and_quit = A('print_genome_names_and_quit') or False
         self.functional_occurrence_table_output_path = A('functional_occurrence_table_output')
         self.functional_enrichment_output_path = A('output_file')
+        self.qlambda = A('qlambda')
 
         # -----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----
         # these are some primary data structures this class reports
@@ -1357,6 +1358,7 @@ def do_functional_enrichment_analysis(self):
         # run the enrichment analysis
         self.functional_enrichment_stats_dict = utils.run_functional_enrichment_stats(functional_occurrence_stats_input_file_path=self.functional_occurrence_table_output_path,
                                                                                       enrichment_output_file_path=self.functional_enrichment_output_path,
+                                                                                      qlambda=self.qlambda,
                                                                                       run=self.run,
                                                                                       progress=self.progress)
 

diff --git a/anvio/kegg.py b/anvio/kegg.py
@@ -7829,6 +7829,7 @@ def __init__(self, args, run=run, progress=progress):
         self.output_file_path = A('output_file')
         self.include_missing = True if A('include_samples_missing_from_groups_txt') else False
         self.use_stepwise_completeness = A('use_stepwise_completeness')
+        self.qlambda = A('qlambda')
 
         # init the base class
         KeggContext.__init__(self, self.args)
@@ -8088,6 +8089,7 @@ def run_enrichment_stats(self):
         # run the enrichment analysis
         enrichment_stats = utils.run_functional_enrichment_stats(enrichment_input_path,
                                                                  self.output_file_path,
+                                                                 qlambda=self.qlambda,
                                                                  run=self.run,
                                                                  progress=self.progress)
 

diff --git a/anvio/summarizer.py b/anvio/summarizer.py
@@ -323,6 +323,8 @@ def functional_enrichment_stats(self):
         '''
         A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None
         output_file_path = A('output_file')
+        qlambda = A('qlambda')
+
         tmp_functional_occurrence_file = filesnpaths.get_temp_file_path()
 
         enrichment_file_path = output_file_path
@@ -340,7 +342,8 @@ def functional_enrichment_stats(self):
         self.functional_occurrence_stats()
 
         # run enrichment script. this uses the output saved from the previous step.
-        enrichment_stats = utils.run_functional_enrichment_stats(tmp_functional_occurrence_file, output_file_path, run=self.run, progress=self.progress)
+        enrichment_stats = utils.run_functional_enrichment_stats(tmp_functional_occurrence_file, output_file_path,
+                                                                qlambda=qlambda, run=self.run, progress=self.progress)
 
         return enrichment_stats
 

diff --git a/anvio/utils.py b/anvio/utils.py
@@ -1178,7 +1178,8 @@ def apply_and_concat(df, fields, func, column_names, func_args=tuple([])):
     return pd.concat((df, df2), axis=1, sort=True)
 
 
-def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path, enrichment_output_file_path=None, run=run, progress=progress):
+def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path, enrichment_output_file_path=None,
+                                    qlambda=None, run=run, progress=progress):
     """This function runs the enrichment analysis implemented by Amy Willis.
 
     Since the enrichment analysis is an R script, we interface with that program by
@@ -1193,6 +1194,8 @@ def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path,
         script.
     enrichment_output_file_path, str file path
         An optional output file path for the enrichment analysis.
+    qlambda : float
+        An optional fraction that sets the maximum lambda for q-value
 
     Returns
     =======
@@ -1224,14 +1227,19 @@ def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path,
     run.warning(None, header="AMY's ENRICHMENT ANALYSIS 🚀", lc="green")
     run.info("Functional occurrence stats input file path: ", functional_occurrence_stats_input_file_path)
     run.info("Functional enrichment output file path: ", enrichment_output_file_path)
+    if anvio.DEBUG:
+        run.info("User set max lambda for q-values: ", qlambda)
     run.info("Temporary log file (use `--debug` to keep): ", log_file_path, nl_after=2)
 
     # run enrichment script
     progress.new('Functional enrichment analysis')
     progress.update("Running Amy's enrichment")
-    run_command(['anvi-script-enrichment-stats',
+    enrichment_command = ['anvi-script-enrichment-stats',
                  '--input', f'{functional_occurrence_stats_input_file_path}',
-                 '--output', f'{enrichment_output_file_path}'], log_file_path)
+                 '--output', f'{enrichment_output_file_path}']
+    if qlambda:
+        enrichment_command += ['--qlambda', f'{qlambda}']
+    run_command(enrichment_command, log_file_path)
     progress.end()
 
     if not filesnpaths.is_file_exists(enrichment_output_file_path, dont_raise=True):

diff --git a/bin/anvi-compute-functional-enrichment-across-genomes b/bin/anvi-compute-functional-enrichment-across-genomes
@@ -70,6 +70,7 @@ if __name__ == '__main__':
     groupD.add_argument(*anvio.A('functional-occurrence-table-output'), **anvio.K('functional-occurrence-table-output'))
 
     groupE = parser.add_argument_group('OPTIONAL THINGIES', "If you want it, here it is, come and get it.")
+    groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
     groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 
     args = parser.get_args(parser)

diff --git a/bin/anvi-compute-functional-enrichment-in-pan b/bin/anvi-compute-functional-enrichment-in-pan
@@ -69,6 +69,7 @@ if __name__ == '__main__':
     groupD.add_argument(*anvio.A('functional-occurrence-table-output'), **anvio.K('functional-occurrence-table-output'))
 
     groupE = parser.add_argument_group('OPTIONAL THINGIES', "If you want it, here it is, come and get it.")
+    groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
     groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 
     args = parser.get_args(parser)

diff --git a/bin/anvi-compute-metabolic-enrichment b/bin/anvi-compute-metabolic-enrichment
@@ -67,6 +67,7 @@ if __name__ == '__main__':
                                          "if you wish. The default threshold is %(default)g."}))
     groupE.add_argument(*anvio.A('use-stepwise-completeness'), **anvio.K('use-stepwise-completeness'))
     groupE.add_argument(*anvio.A('include-samples-missing-from-groups-txt'), **anvio.K('include-samples-missing-from-groups-txt'))
+    groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
     groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))
 
     args = parser.get_args(parser)

diff --git a/sandbox/anvi-script-enrichment-stats b/sandbox/anvi-script-enrichment-stats
@@ -99,7 +99,9 @@ option_list <- list(
   make_option(c("--output"),
               help = "Output file name"),
   make_option(c("--input"),
-              help = "Input file name")
+              help = "Input file name"),
+  make_option(c("--qlambda"),
+              help = "Optional parameter. The maximum lambda for q-value estimation.")
 )
 parser <- OptionParser(option_list = option_list,
                        description = "Hypothesis testing for 'enrichment' claims. Fits a GLM and computes the Rao Test statistic as an enrichment score.")
@@ -207,7 +209,21 @@ pvalues_df <- df_unique %>%
 
 # find the maximum lambda for qvalue
 # reasons for this change documented at https://github.com/merenlab/anvio/issues/1383
-max_lambda <- min(0.95, max(pvalues_df$unadjusted_p_value, na.rm=T))
+if (length(opts$qlambda) > 0) {
+    max_lambda = opts$qlambda
+    sprintf("Using user-defined max lambda value of %s for q-value estimation", opts$qlambda)
+} else {
+    max_lambda <- min(0.95, max(pvalues_df$unadjusted_p_value, na.rm=T))
+}
+if (max_lambda < 0.05) {
+    stop(sprintf("Unfortunately, the maximum lambda for q-value estimation is < 0.05. This \
+         value is coming from the p-values in the enrichment test, so something in \
+         your data is making those p-values low. This sometimes happens when you \
+         have a very short input file (your input file has %s rows). Regardless, \
+         a possible solution is for you to pick your own max lambda that is >= 0.05, \
+         and provide it to the program calling this script by using the \
+         --qlambda parameter.", nrow(df_in)))
+}
 lambdas <- seq(0.05, max_lambda, 0.05)
 
 # obtain an estimate of the proportion of features that are truly null