Skip to content

Commit

Permalink
Test for overlapping gene names between bed and genes_of_interest
Browse files Browse the repository at this point in the history
  • Loading branch information
Redmar-van-den-Berg committed Dec 4, 2024
1 parent 7a1bc6d commit b22a34f
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
2 changes: 1 addition & 1 deletion includes/expression/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ rule normalized_coverage:
src=srcdir("scripts/coverage.py"),
params:
housekeeping=config["housekeeping"],
genes_of_interest=config.get("genes_of_interest", ""),
genes_of_interest=config["genes_of_interest"],
output:
normalized="{sample}/expression/coverage.normalized.csv",
raw="{sample}/expression/coverage.csv",
Expand Down
45 changes: 45 additions & 0 deletions includes/expression/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,30 @@ for s in samples:
raise RuntimeError(f'Spaces in samples are not supported ("{s.sample}")')


def str_to_list(value):
    """Convert a whitespace separated str to a list of str.

    Args:
        value: None, a str with whitespace separated items, or a list.

    Returns:
        An empty list when value is None, the non-empty items when value is
        a str, or value itself (not a copy) when it already is a list.

    Raises:
        RuntimeError: if value is of any other type.
    """
    if value is None:
        return []
    if isinstance(value, str):
        # split() without an explicit separator discards empty items, so an
        # empty string yields [] and repeated, leading or trailing spaces do
        # not produce "" entries that would later be treated as gene names
        return value.split()
    if isinstance(value, list):
        return value
    raise RuntimeError(
        f"Expected a str or a list, got {type(value).__name__}: {value!r}"
    )


def set_genes_of_interest():
    """Store config["genes_of_interest"] as a list.

    The setting can be absent, a list of str, or a single str in which the
    genes are separated by spaces. The conversion itself is delegated to
    str_to_list and the result is written back into config.
    """
    config["genes_of_interest"] = str_to_list(config.get("genes_of_interest"))


def set_housekeeping_genes():
    """Store config["housekeeping"] as a list.

    Reads the current setting (None when it is absent), passes it through
    str_to_list and writes the result back into config.
    """
    config["housekeeping"] = str_to_list(config.get("housekeeping"))


## Input functions ##
def get_bam(wildcards):
    """Return the "bam" entry of the sample table for wildcards.sample."""
    sample_table = pep.sample_table
    return sample_table.loc[wildcards.sample, "bam"]
Expand All @@ -44,6 +68,24 @@ def check_housekeeping():
raise RuntimeError(msg)


def check_bed_genes_of_interest():
    """Check that no gene is in both the bed file and genes_of_interest.

    The name (fourth) column of config["bed"] is compared with the entries
    of config["genes_of_interest"]. Nothing is checked when no bed file is
    configured.

    Raises:
        RuntimeError: for the first gene of interest that is also used as a
            name in the bed file.
    """
    if "bed" not in config:
        return

    # Collect the names from the bed file; a set gives constant time lookups
    names = set()
    with open(config["bed"]) as fin:
        for line in fin:
            # Also strip "\r", otherwise a name in the last column of a file
            # with Windows line endings never matches a gene of interest
            spline = line.rstrip("\r\n").split("\t")
            # Blank lines, comment/track/browser lines and records without
            # a name column have nothing to compare
            if line.startswith("#") or len(spline) < 4:
                continue
            names.add(spline[3])

    for gene in config["genes_of_interest"]:
        if gene in names:
            msg = f"{gene} is specified twice: in the bed file and genes_of_interest"
            raise RuntimeError(msg)


## Functions for module outputs ##
def coverage(wildcards):
    """Return the path of the raw coverage csv for wildcards.sample."""
    sample = wildcards.sample
    return f"{sample}/expression/coverage.csv"
Expand All @@ -61,7 +103,10 @@ def multiqc_files():
return unstranded + forward + reverse_


# Convert both settings to lists before running the sanity checks:
# check_bed_genes_of_interest iterates over config["genes_of_interest"], so
# it has to run after set_genes_of_interest.
set_genes_of_interest()
set_housekeeping_genes()
check_housekeeping()
check_bed_genes_of_interest()

module_output = SimpleNamespace(
coverage=coverage, normalized_expression=normalized, multiqc_files=multiqc_files()
Expand Down
20 changes: 20 additions & 0 deletions test/test_expression.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,26 @@
contains:
- "Unknown housekeeping gene: Z"

- name: Test error on gene in .bed and genes_of_interest
tags:
- sanity
- expression
- current
command: >
snakemake
--snakefile includes/expression/Snakefile
--workflow-profile test
--dry-run
--configfile test/data/config/expression.json
--config pepfile=test/pep/expression.csv
housekeeping="MT-CO2"
bed=test/data/reference/transcripts_chrM.bed
genes_of_interest="MT-ND3 MT-ND2"
exit_code: 1
stdout:
contains:
- "MT-ND3 is specified twice: in the bed file and genes_of_interest"

- name: Test running the expression module with a bed file
tags:
- functional
Expand Down

0 comments on commit b22a34f

Please sign in to comment.