Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lambda parameter for q-value estimation in enrichment script #1953

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
8 changes: 8 additions & 0 deletions anvio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,14 @@ def TABULATE(table, header, numalign="right", max_width=0):
"A file name must be provided. To learn more about how the functional occurrence is computed, please "
"refer to the tutorial."}
),
'qlambda': (
['--qlambda'],
{'default': None,
'metavar': 'NUM',
'type': float,
'help': "If you want to set the maximum lambda value for q-value estimation, provide a fraction between "
"0.05 and 1."}
),
'table': (
['--table'],
{'metavar': 'TABLE_NAME',
Expand Down
2 changes: 2 additions & 0 deletions anvio/genomedescriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,7 @@ def __init__(self, args, layer_groups=None, skip_sanity_check=False, skip_init=F
self.print_genome_names_and_quit = A('print_genome_names_and_quit') or False
self.functional_occurrence_table_output_path = A('functional_occurrence_table_output')
self.functional_enrichment_output_path = A('output_file')
self.qlambda = A('qlambda')

# -----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----8<-----
# these are some primary data structures this class reports
Expand Down Expand Up @@ -1357,6 +1358,7 @@ def do_functional_enrichment_analysis(self):
# run the enrichment analysis
self.functional_enrichment_stats_dict = utils.run_functional_enrichment_stats(functional_occurrence_stats_input_file_path=self.functional_occurrence_table_output_path,
enrichment_output_file_path=self.functional_enrichment_output_path,
qlambda=self.qlambda,
run=self.run,
progress=self.progress)

Expand Down
2 changes: 2 additions & 0 deletions anvio/kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -7829,6 +7829,7 @@ def __init__(self, args, run=run, progress=progress):
self.output_file_path = A('output_file')
self.include_missing = True if A('include_samples_missing_from_groups_txt') else False
self.use_stepwise_completeness = A('use_stepwise_completeness')
self.qlambda = A('qlambda')

# init the base class
KeggContext.__init__(self, self.args)
Expand Down Expand Up @@ -8088,6 +8089,7 @@ def run_enrichment_stats(self):
# run the enrichment analysis
enrichment_stats = utils.run_functional_enrichment_stats(enrichment_input_path,
self.output_file_path,
qlambda=self.qlambda,
run=self.run,
progress=self.progress)

Expand Down
5 changes: 4 additions & 1 deletion anvio/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ def functional_enrichment_stats(self):
'''
A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None
output_file_path = A('output_file')
qlambda = A('qlambda')

tmp_functional_occurrence_file = filesnpaths.get_temp_file_path()

enrichment_file_path = output_file_path
Expand All @@ -340,7 +342,8 @@ def functional_enrichment_stats(self):
self.functional_occurrence_stats()

# run enrichment script. this uses the output saved from the previous step.
enrichment_stats = utils.run_functional_enrichment_stats(tmp_functional_occurrence_file, output_file_path, run=self.run, progress=self.progress)
enrichment_stats = utils.run_functional_enrichment_stats(tmp_functional_occurrence_file, output_file_path,
qlambda=qlambda, run=self.run, progress=self.progress)

return enrichment_stats

Expand Down
14 changes: 11 additions & 3 deletions anvio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1178,7 +1178,8 @@ def apply_and_concat(df, fields, func, column_names, func_args=tuple([])):
return pd.concat((df, df2), axis=1, sort=True)


def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path, enrichment_output_file_path=None, run=run, progress=progress):
def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path, enrichment_output_file_path=None,
qlambda=None, run=run, progress=progress):
"""This function runs the enrichment analysis implemented by Amy Willis.

Since the enrichment analysis is an R script, we interface with that program by
Expand All @@ -1193,6 +1194,8 @@ def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path,
script.
enrichment_output_file_path, str file path
An optional output file path for the enrichment analysis.
qlambda : float
An optional fraction that sets the maximum lambda for q-value

Returns
=======
Expand Down Expand Up @@ -1224,14 +1227,19 @@ def run_functional_enrichment_stats(functional_occurrence_stats_input_file_path,
run.warning(None, header="AMY's ENRICHMENT ANALYSIS 🚀", lc="green")
run.info("Functional occurrence stats input file path: ", functional_occurrence_stats_input_file_path)
run.info("Functional enrichment output file path: ", enrichment_output_file_path)
if anvio.DEBUG:
run.info("User set max lambda for q-values: ", qlambda)
run.info("Temporary log file (use `--debug` to keep): ", log_file_path, nl_after=2)

# run enrichment script
progress.new('Functional enrichment analysis')
progress.update("Running Amy's enrichment")
run_command(['anvi-script-enrichment-stats',
enrichment_command = ['anvi-script-enrichment-stats',
'--input', f'{functional_occurrence_stats_input_file_path}',
'--output', f'{enrichment_output_file_path}'], log_file_path)
'--output', f'{enrichment_output_file_path}']
if qlambda:
enrichment_command += ['--qlambda', f'{qlambda}']
run_command(enrichment_command, log_file_path)
progress.end()

if not filesnpaths.is_file_exists(enrichment_output_file_path, dont_raise=True):
Expand Down
1 change: 1 addition & 0 deletions bin/anvi-compute-functional-enrichment-across-genomes
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ if __name__ == '__main__':
groupD.add_argument(*anvio.A('functional-occurrence-table-output'), **anvio.K('functional-occurrence-table-output'))

groupE = parser.add_argument_group('OPTIONAL THINGIES', "If you want it, here it is, come and get it.")
groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))

args = parser.get_args(parser)
Expand Down
1 change: 1 addition & 0 deletions bin/anvi-compute-functional-enrichment-in-pan
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ if __name__ == '__main__':
groupD.add_argument(*anvio.A('functional-occurrence-table-output'), **anvio.K('functional-occurrence-table-output'))

groupE = parser.add_argument_group('OPTIONAL THINGIES', "If you want it, here it is, come and get it.")
groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))

args = parser.get_args(parser)
Expand Down
1 change: 1 addition & 0 deletions bin/anvi-compute-metabolic-enrichment
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ if __name__ == '__main__':
"if you wish. The default threshold is %(default)g."}))
groupE.add_argument(*anvio.A('use-stepwise-completeness'), **anvio.K('use-stepwise-completeness'))
groupE.add_argument(*anvio.A('include-samples-missing-from-groups-txt'), **anvio.K('include-samples-missing-from-groups-txt'))
groupE.add_argument(*anvio.A('qlambda'), **anvio.K('qlambda'))
groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it'))

args = parser.get_args(parser)
Expand Down
20 changes: 18 additions & 2 deletions sandbox/anvi-script-enrichment-stats
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ option_list <- list(
make_option(c("--output"),
help = "Output file name"),
make_option(c("--input"),
help = "Input file name")
help = "Input file name"),
make_option(c("--qlambda"),
help = "Optional parameter. The maximum lambda for q-value estimation.")
)
parser <- OptionParser(option_list = option_list,
description = "Hypothesis testing for 'enrichment' claims. Fits a GLM and computes the Rao Test statistic as an enrichment score.")
Expand Down Expand Up @@ -207,7 +209,21 @@ pvalues_df <- df_unique %>%

# find the maximum lambda for qvalue
# reasons for this change documented at https://github.com/merenlab/anvio/issues/1383
max_lambda <- min(0.95, max(pvalues_df$unadjusted_p_value, na.rm=T))
if (length(opts$qlambda) > 0) {
max_lambda = opts$qlambda
sprintf("Using user-defined max lambda value of %s for q-value estimation", opts$qlambda)
} else {
max_lambda <- min(0.95, max(pvalues_df$unadjusted_p_value, na.rm=T))
}
if (max_lambda < 0.05) {
stop(sprintf("Unfortunately, the maximum lambda for q-value estimation is < 0.05. This \
value is coming from the p-values in the enrichment test, so something in \
your data is making those p-values low. This sometimes happens when you \
have a very short input file (your input file has %s rows). Regardless, \
a possible solution is for you to pick your own max lambda that is >= 0.05, \
and provide it to the program calling this script by using the \
--qlambda parameter.", nrow(df_in)))
}
lambdas <- seq(0.05, max_lambda, 0.05)

# obtain an estimate of the proportion of features that are truly null
Expand Down