From 71e2cd101fa81d23e1b974a53bc2a38dcb03fcb7 Mon Sep 17 00:00:00 2001 From: iquasere Date: Mon, 1 Feb 2021 12:56:19 +0000 Subject: [PATCH] Run MT without MG * MOSCA is now capable of running MT without MG * Combined with approach to not use assembly * In the future might implement the use of assembly to analyze MT Also added rejected and accepted files from rRNA removal to list of files to delete at preprocessing closes #3 --- .idea/rSettings.xml | 6 ---- workflow/Snakefile | 51 ++++++++++++++++++++++++++++------ workflow/envs/meta.yaml | 2 +- workflow/mosca.py | 2 +- workflow/scripts/preprocess.py | 6 +++- 5 files changed, 49 insertions(+), 18 deletions(-) delete mode 100644 .idea/rSettings.xml diff --git a/.idea/rSettings.xml b/.idea/rSettings.xml deleted file mode 100644 index 56885e2..0000000 --- a/.idea/rSettings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 5bd2c2d..5e7b445 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,6 +1,7 @@ import pandas as pd import shutil import sys +from mosca_tools import parse_blast, multi_sheet_excel scripts_dir = sys.path[0] @@ -56,8 +57,7 @@ def all_input(wildcards): ) else: return ( - expand("{output}/Annotation/{sample}/aligned.blast", output=config["output"], - sample=set(experiments['Sample'])) + "{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"]) ) def preprocess_input(wildcards): @@ -83,6 +83,13 @@ def annotation_input(wildcards): return expand("{output}/Preprocess/piled_{name}.fasta", output = config["output"], name = wildcards.sample) +def upimapi_input(wildcards): + if config['do_assembly']: + return expand("{output}/Annotation/{sample}/aligned.blast",output=config["output"], + sample = set(experiments['Sample'])) + return expand("{output}/Annotation/{name}/aligned.blast",output=config["output"], + name = set(experiments['Name'])) + rule all: input: all_input @@ -184,16 +191,18 @@ rule annotation: rule upimapi: input: - expand("{output}/Annotation/{sample}/aligned.blast", output = config["output"], - sample = set(experiments["Sample"])) + upimapi_input output: - expand("{output}/Annotation/uniprotinfo.tsv", output = config["output"]) + expand("{output}/Annotation/uniprotinfo.tsv", output = config["output"]), threads: 1 run: - shell("upimapi.py -i {input} -o {output}/Annotation/uniprotinfo --blast --full-id", - output = config["output"]) - shell("echo 'done' > {output}/Annotation/{{sample}}.txt", output = config["output"]) + if config["do_assembly"]: + shell("upimapi.py -i {input} -o {output}/Annotation/uniprotinfo --blast --full-id", output = config["output"]) + else: + for blast in input: + shell("upimapi.py -i {blast} -o {output}/Annotation/uniprotinfo --blast --full-id", + output=config["output"]) rule recognizer: input: @@ -259,7 +268,7 @@ rule join_information: expand("{output}/Metatranscriptomics/{name}.readcounts", output = config["output"], name = set(mt_experiments['Name'])), expand("{output}/Annotation/{name}.readcounts", output = config["output"], - name = set(mg_experiments['Name'])), + name = set(mg_experiments['Name'])) output: expand("{output}/MOSCA_Protein_Report.xlsx", output = config["output"]), expand("{output}/MOSCA_Entry_Report.xlsx", output = config["output"]), @@ -272,6 +281,30 @@ rule join_information: experiments = config["experiments"], output = config["output"], normalization_method = config["normalization_method"]) +rule entry_count: + input: + uniprotinfo=expand("{output}/Annotation/uniprotinfo.tsv",output=config["output"]), + blasts=expand("{output}/Annotation/{name}/aligned.blast",output=config["output"], name=set(experiments['Name'])) + output: + "{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"]) + threads: + 1 + run: + uniprotinfo=pd.read_csv(input.uniprotinfo[0], sep='\t') + result = pd.DataFrame(columns=['sseqid']) + i = 1 + for blast in input.blasts: + print('[{}/{}] Quantifying entries for: {}'.format(i, len(input.blasts), blast)) + data = parse_blast(blast).groupby('sseqid').size().reset_index(name=blast.split('/')[-2]) + data['sseqid'] = [ide.split('|')[1] if ide != '*' else ide for ide in data['sseqid']] + result = pd.merge(result, data, on='sseqid', how='outer') + i += 1 + result.columns = ['Entry'] + result.columns.to_list()[1:] + print('Merging entry counts with info at {}'.format(input.uniprotinfo[0])) + result = pd.merge(result, uniprotinfo, on='Entry', how='left') + multi_sheet_excel("{}/MOSCA_Entry_Counts_Report.xlsx".format(config["output"]), result, sheet_name='Sheet') + result.to_csv("{}/MOSCA_Entry_Counts_Report.tsv".format(config["output"]), index=False, sep='\t') + rule differential_expression: input: expand("{output}/Metatranscriptomics/expression_matrix.tsv", output = config["output"]) diff --git a/workflow/envs/meta.yaml b/workflow/envs/meta.yaml index b42fc62..4b86c36 100644 --- a/workflow/envs/meta.yaml +++ b/workflow/envs/meta.yaml @@ -1,5 +1,5 @@ {% set name = "mosca" %} -{% set version = "1.2.3" %} +{% set version = "1.3.0" %} {% set sha256 = "87cbca039ea9b9c85f417543f2426b2b2acffebe58179878ee6872a32ae949ba" %} package: diff --git a/workflow/mosca.py b/workflow/mosca.py index 3bdb4f6..b19722a 100644 --- a/workflow/mosca.py +++ b/workflow/mosca.py @@ -6,7 +6,7 @@ import multiprocessing import sys -__version__ = '1.2.3' +__version__ = '1.3.0' parser = argparse.ArgumentParser(description="MOSCA's main script") parser.add_argument("-s", "--snakefile", type=str, default="{}/Snakefile".format(sys.path[0]), help="Snakefile file") diff --git a/workflow/scripts/preprocess.py b/workflow/scripts/preprocess.py index 2e0024d..40f0d3b 100644 --- a/workflow/scripts/preprocess.py +++ b/workflow/scripts/preprocess.py @@ -229,7 +229,6 @@ def rrna_removal(self, files, out_dir, name, databases, threads='12', original_f self.remove_orphans(basename + '_forward.fastq', basename + '_reverse.fastq') ''' - for file in files_to_delete: os.remove(file) print('Removed: {}'.format(file)) @@ -356,9 +355,14 @@ def run(self): if self.paired: args.input = ['{}/SortMeRNA/{}_{}.fastq'.format( args.output, name, fr) for fr in ['forward', 'reverse']] + files_to_delete = ['{}/SortMeRNA/{}_{}.fastq'.format( + args.output, name, rejection) for rejection in ['accepted', 'rejected']] else: args.input = ['{}/SortMeRNA/{}_rejected.fq'.format( args.output, name)] + files_to_delete = ['{}/SortMeRNA/{}_{}.fastq'.format(args.output, name, 'accepted')] + for file in files_to_delete: + os.remove(file) self.run_fastqc(args.input, '{}/FastQC'.format(args.output), threads=args.threads)