From 7c61a707132fbeb8646df58487b95f08b4015ae6 Mon Sep 17 00:00:00 2001 From: iquasere Date: Fri, 22 Jan 2021 22:31:01 +0000 Subject: [PATCH] Several fixes on reporting --- resources/reporter_columns.txt | 7 ++- workflow/Snakefile | 12 +++--- workflow/scripts/assembly.py | 6 +-- workflow/scripts/report.py | 79 ++++++++++++++-------------------- 4 files changed, 45 insertions(+), 59 deletions(-) diff --git a/resources/reporter_columns.txt b/resources/reporter_columns.txt index 399703b..1dbf148 100644 --- a/resources/reporter_columns.txt +++ b/resources/reporter_columns.txt @@ -1,4 +1,4 @@ -[Initial quality assessment] # of initial reads +[Initial quality assessment] # of reads [Initial quality assessment] Per base sequence quality [Initial quality assessment] Per base sequence quality [Initial quality assessment] Per tile sequence quality @@ -11,8 +11,7 @@ [Initial quality assessment] Overrepresented sequences [Initial quality assessment] Adapter Content [Adapter removal] adapter files -[Adapter removal] # of reads remaining -[rRNA removal] # of reads remaining +[Adapter removal] # of reads [Before quality trimming] Per base sequence quality [Before quality trimming] Per base sequence quality [Before quality trimming] Per tile sequence quality @@ -24,7 +23,7 @@ [Before quality trimming] Sequence Duplication Levels [Before quality trimming] Overrepresented sequences [Before quality trimming] Adapter Content -[Quality trimming] # of reads remaining +[Quality trimming] # of reads [Quality trimming] Parameters [After quality trimming] Per base sequence quality [After quality trimming] Per base sequence quality diff --git a/workflow/Snakefile b/workflow/Snakefile index d6fda1a..6d15d30 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -60,7 +60,7 @@ rule all: expand("{output}/Binning/{sample}/checkm.tsv",output=config["output"],sample=set(experiments['Sample'])), expand("{output}/MOSCA_Protein_Report.xlsx", output = config["output"]), expand("{output}/MOSCA_Entry_Report.xlsx", output = config["output"]), - expand("{output}/technical_report.txt", output = config["output"]), + expand("{output}/technical_report.tsv", output = config["output"]), expand("{output}/MOSCA_General_Report.xlsx", output = config["output"]), expand("{output}/MOSCA_results.zip", output = config["output"]) @@ -236,12 +236,13 @@ rule differential_expression: expand("{output}/Metatranscriptomics/expression_matrix.tsv", output = config["output"]) output: expand("{output}/Metatranscriptomics/gene_expression.jpeg", output = config["output"]), - expand("{output}/Metatranscriptomics/sample_distances.jpeg", output = config["output"]) + expand("{output}/Metatranscriptomics/sample_distances.jpeg", output = config["output"]), + expand("{output}/Metatranscriptomics/condition_treated_results.csv", output = config["output"]) threads: 1 run: conditions = ",".join(map(str, mt_experiments['Condition'].tolist())) - shell("Rscript {scripts_dir}/de_analysis.R --readcounts {input} --conditions {conditions} --output {output}/Metatranscriptomics", # problems with libreadline.so.6 might be solved with cd /lib/x86_64-linux-gnu/; sudo ln -s libreadline.so.7.0 libreadline.so.6 + shell("{scripts_dir}/../../../bin/Rscript {scripts_dir}/de_analysis.R --readcounts {input} --conditions {conditions} --output {output}/Metatranscriptomics", # problems with libreadline.so.6 might be solved with cd /lib/x86_64-linux-gnu/; sudo ln -s libreadline.so.7.0 libreadline.so.6 conditions = conditions, output = config["output"]) rule keggcharter: @@ -263,9 +264,10 @@ rule keggcharter: rule report: input: - expand("{output}/MOSCA_Protein_Report.xlsx", output = config["output"]) + expand("{output}/MOSCA_Protein_Report.xlsx", output = config["output"]), + expand("{output}/Metatranscriptomics/condition_treated_results.csv", output = config["output"]) output: - expand("{output}/technical_report.txt", output = config["output"]), + expand("{output}/technical_report.tsv", output = config["output"]), expand("{output}/MOSCA_General_Report.xlsx", output = config["output"]), expand("{output}/MOSCA_results.zip", output = config["output"]) threads: diff --git a/workflow/scripts/assembly.py b/workflow/scripts/assembly.py index 8fa6a29..ee0082a 100644 --- a/workflow/scripts/assembly.py +++ b/workflow/scripts/assembly.py @@ -79,7 +79,7 @@ def run(self): # Quality control pathlib.Path('{}/quality_control'.format(args.output)).mkdir(parents=True, exist_ok=True) - self.run_metaquast('{}/contigs.fasta'.format(args.output), args.output) + self.run_metaquast('{}/contigs.fasta'.format(args.output), '{}/quality_control'.format(args.output)) perform_alignment('{}/contigs.fasta'.format(args.output), args.reads, '{}/quality_control/alignment'.format(args.output), threads=args.threads) @@ -87,8 +87,8 @@ def run(self): if os.path.isfile('{}/quality_control/combined_reference/report.tsv'.format( args.output)): # if metaquast finds references to the contigs, it will output results to different folders - os.rename('{}/quality_control/combined_reference/report.tsv'.format(args.output), - '{}/quality_control/report.tsv'.format(args.output)) + shutil.copyfile('{}/quality_control/combined_reference/report.tsv'.format(args.output), + '{}/quality_control/report.tsv'.format(args.output)) with open(args.output + '/quality_control/report.tsv', 'a') as f: f.write('Reads aligned (%)\t{}\n'.format(percentage_of_reads)) diff --git a/workflow/scripts/report.py b/workflow/scripts/report.py index d6ccf6e..b4662e9 100644 --- a/workflow/scripts/report.py +++ b/workflow/scripts/report.py @@ -38,16 +38,12 @@ def get_arguments(self): the softwares used by MOSCA and respective versions ''' - def write_technical_report(self, tools_list_file, - output): # TODO - add proteomics software that cannot be installed with conda + def write_technical_report(self, output): # TODO - add proteomics software that cannot be installed with conda conda_list = run_pipe_command('conda list', output='PIPE').split('\n')[2:] lines = [line.split() for line in conda_list] lines[0] = lines[0][1:] df = pd.DataFrame(lines, columns=lines.pop(0)).set_index('Name') - tools = open(tools_list_file).read().split('\n') - present_tools = [tool for tool in tools if tool in df.index] - open(output, 'w').write(df.loc[present_tools][['Version']].to_string( - justify='left')) + df[['Version']].to_csv(output, sep='\t') def initialize_report(self, reporter_columns): print('Initializing Report') @@ -65,13 +61,19 @@ def initialize_report(self, reporter_columns): named 'name', and the columns that start with 'prefix' ''' - def info_from_fastqc(self, output_dir, name, prefix, prefix2terms, fastq_columns): - reports = [parse_fastqc_report('{}/Preprocess/FastQC/{}{}_{}_fastqc/fastqc_data.txt'.format(output_dir, - prefix2terms[ - prefix][0], - name, prefix2terms[ - prefix][i])) for - i in [1, 2]] + def info_from_fastqc(self, output_dir, name, prefix, prefix2terms, fastq_columns, initial_file=None): + if initial_file is not None: + if '_R' in initial_file: + original_name = initial_file.split('/')[-1].split('_R')[0] + else: + original_name = initial_file.split('/')[-1].split('.f')[0] + else: + original_name = name + reports = [parse_fastqc_report('{}/Preprocess/FastQC/{}{}_{}_fastqc/fastqc_data.txt'.format( + output_dir, prefix2terms[prefix][0], original_name, prefix2terms[prefix][i])) for i in [1, 2]] + self.report.loc[name, '{} # of reads'.format(prefix)] = reports[0]['Basic Statistics'][1].loc[ + 'Total Sequences']['Value'] + for column in fastq_columns: for i in range(2): if column not in reports[i].keys(): @@ -85,7 +87,7 @@ def info_from_fastqc(self, output_dir, name, prefix, prefix2terms, fastq_columns reports[0][column][0], reports[1][column][0])) def info_from_preprocessing(self, output_dir, name, input_file, fastq_columns, performed_rrna_removal=False): - print('Retrieving preprocessing information for sample: ' + name) + print('Retrieving preprocessing information for dataset: ' + name) if name not in self.report.index: self.report = self.report.append(pd.Series(name=name)) self.report.loc[name] = self.report.loc[name].fillna(value='') @@ -94,7 +96,7 @@ def info_from_preprocessing(self, output_dir, name, input_file, fastq_columns, p if len(adapter_files[0]) > 0 and not adapter_files[0] == 'None': adapter = adapter_files[0].split('/')[-1].split('.fa')[0] else: - adapter_files = list(); + adapter_files = list() adapter = None # For each preprocessing step, a tuple of (prefix, suffix for forward, suffix for reverse) @@ -106,45 +108,27 @@ def info_from_preprocessing(self, output_dir, name, input_file, fastq_columns, p 'quality_trimmed_', 'forward_paired', 'reverse_paired')} # Initial assessment - self.report.loc[name, '[Initial quality assessment] # of initial reads'] = count_on_file( - '@', input_file, compressed=True if input_file.endswith('.gz') else False) - self.info_from_fastqc(output_dir, name, '[Initial quality assessment]', prefix2terms, fastq_columns) + self.info_from_fastqc(output_dir, name, '[Initial quality assessment]', prefix2terms, fastq_columns, + initial_file=input_file) # After adapter removal try: if len(adapter_files) > 0: self.report.loc[name, '[Adapter removal] adapter files'] = ', '.join(set(adapter_files)) - self.report.loc[name, '[Adapter removal] # of reads remaining'] = ( - count_on_file('@', '{}/Preprocess/Trimmomatic/{}_{}_forward_paired.fq'.format( - output_dir, name, adapter))) else: self.report.loc[name, '[Adapter removal] adapter files'] = 'None' - self.report.loc[name, '[Adapter removal] # of reads remaining'] = ( - self.report.loc[name]['[Initial quality assessment] # of initial reads']) except: print('Failed at adapter removal!') self.report.to_csv('{}/report.tsv'.format(output_dir), sep='\t') - # rRNA removal - if performed_rrna_removal: - self.report.loc[name, '[rRNA removal] # of reads remaining'] = ( - count_on_file('@', '{}/Preprocess/SortMeRNA/{}_forward.fastq'.format(output_dir, name))) - else: - self.report.loc[name, '[rRNA removal] # of reads remaining'] = ( - self.report.loc[name]['[Adapter removal] # of reads remaining']) - # Quality trimming - self.info_from_fastqc(output_dir, name, '[Before quality trimming]', prefix2terms, fastq_columns) - - self.report.loc[name, '[Quality trimming] Parameters'] = '; '.join([file for file in - set(open( - '{}/Preprocess/Trimmomatic/{}_quality_params.txt'.format( - output_dir, name)).read().split( - '\n')) if len( - file) > 2]) # TODO - because '' must be interfering, try to cut the problem at the root before troubles - self.report.loc[name, '[Quality trimming] # of reads remaining'] = ( - count_on_file('@', '{}/Preprocess/Trimmomatic/quality_trimmed_{}_forward_paired.fq'.format( - output_dir, name))) + self.info_from_fastqc(output_dir, name, '[Before quality trimming]', prefix2terms, fastq_columns, + initial_file=input_file) + + self.report.loc[name, '[Quality trimming] Parameters'] = '; '.join([ + file for file in set(open('{}/Preprocess/Trimmomatic/{}_quality_params.txt'.format(output_dir, name)).read( + ).split('\n')) if len(file) > 2]) # TODO - because '' must be interfering, try to cut the problem at the root before troubles + self.info_from_fastqc(output_dir, name, '[After quality trimming]', prefix2terms, fastq_columns) def set_samples(self, experiments): @@ -156,6 +140,7 @@ def info_from_assembly(self, output_dir, sample): qc_report = pd.read_csv('{}/Assembly/{}/quality_control/report.tsv'.format( output_dir, sample), sep='\t', index_col=0).transpose() qc_report.index = [sample] + for col in qc_report.columns.tolist(): self.report.loc[self.report['Sample'] == sample, '[Assembly] ' + col] = qc_report.loc[sample][col] @@ -172,7 +157,7 @@ def info_from_annotation(self, output_dir, sample): sample))) sample_report['# of proteins annotated (reCOGnizer)'] = ( - len(set(parse_blast('{}/Annotation/{}/cdd_aligned.blast'.format( + len(set(parse_blast('{}/Annotation/{}/COG_aligned.blast'.format( output_dir, sample))['qseqid']))) sample_report = pd.DataFrame.from_dict(sample_report, orient='index').transpose() @@ -205,7 +190,7 @@ def info_from_binning(self, output_dir, sample): def info_from_alignment(self, output_dir, mt_name): self.report.set_index('Name', inplace=True) self.report.loc[mt_name, '[Metatranscriptomics] # of reads aligned'] = ( - run_pipe_command("""samtools view {}/Metatranscriptomics/{}.sam | + run_pipe_command("""cat {}/Metatranscriptomics/{}.sam | cut -f 3 | sort | uniq -c | awk \'{{printf(\"%s\\t%s\\n\", $2, $1)}}\' | awk '{{sum+=$2}} END {{print sum}}\'""".format(output_dir, mt_name), output='PIPE').rstrip('\n')) @@ -228,8 +213,7 @@ def run(self): fastq_columns = open('{}/fastqc_columns.txt'.format(args.lists_directory)).read().split('\n') reporter_columns = open('{}/reporter_columns.txt'.format(args.lists_directory)).read().split('\n') - self.write_technical_report('{}/tools_for_versions.txt'.format(args.lists_directory), - '{}/technical_report.txt'.format(args.output)) + self.write_technical_report('{}/technical_report.tsv'.format(args.output)) exps = ( pd.read_csv(args.experiments, sep='\t') if args.input_format == 'tsv' else pd.read_excel(args.experiments)) @@ -243,6 +227,7 @@ def run(self): self.set_samples(exps) + self.report.to_csv('report.tsv', sep='\t') for sample in set(exps['Sample']): self.info_from_assembly(args.output, sample) @@ -262,7 +247,7 @@ def run(self): 'MOSCA_Protein_Report.xlsx', 'MOSCA_Entry_Report.xlsx', 'MOSCA_General_Report.xlsx', - 'technical_report.txt' + 'technical_report.tsv' ]], '{}/{}'.format(args.output, 'MOSCA_results.zip'))