New config option: download cdd
If CDD has already been downloaded, it won't be downloaded again, nor will reCOGnizer's other resources.
iquasere committed Feb 25, 2021
1 parent daba00d commit 861f7dd
Showing 6 changed files with 28 additions and 93 deletions.
1 change: 1 addition & 0 deletions config/config.json
@@ -13,6 +13,7 @@
     "diamond_database": "resources_directory/uniprot.dmnd",
     "diamond_max_target_seqs": 1,
     "download_uniprot": true,
+    "download_cdd": false,
     "uniprot_columns": [
       "id",
       "entry name",
5 changes: 3 additions & 2 deletions workflow/Snakefile
@@ -214,9 +214,10 @@ rule recognizer:
         config["threads"] - 1
     run:
         shell("recognizer.py -f {input} -t {threads} -o {output}/Annotation/{sample} -rd {resources_directory} "
-              "--remove-spaces --download-resources",
+              "--remove-spaces{download_resources}",
               output = config["output"], sample = set(experiments["Sample"]),
-              resources_directory = config["resources_directory"])
+              resources_directory = config["resources_directory"],
+              download_resources = '' if not config['download_cdd'] else ' --download-resources')
 
 rule quantification_analysis:
     input:
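
For clarity, a minimal standalone sketch of the substitution this change performs (the config dict and FASTA filename here are hypothetical; in the Snakefile, config is the parsed config/config.json):

    # Sketch of the flag expansion: with download_cdd set to false,
    # recognizer.py runs without --download-resources, so an existing CDD
    # (and reCOGnizer's other resources) is not downloaded again.
    config = {"download_cdd": False}  # hypothetical parsed config

    download_resources = '' if not config['download_cdd'] else ' --download-resources'
    command = "recognizer.py -f proteins.fasta --remove-spaces" + download_resources
    print(command)  # -> recognizer.py -f proteins.fasta --remove-spaces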
2 changes: 1 addition & 1 deletion workflow/envs/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "mosca" %}
-{% set version = "1.3.3" %}
+{% set version = "1.3.4" %}
 {% set sha256 = "87cbca039ea9b9c85f417543f2426b2b2acffebe58179878ee6872a32ae949ba" %}
 
 package:
2 changes: 1 addition & 1 deletion workflow/mosca.py
@@ -6,7 +6,7 @@
 import multiprocessing
 import sys
 
-__version__ = '1.3.3'
+__version__ = '1.3.4'
 
 parser = argparse.ArgumentParser(description="MOSCA's main script")
 parser.add_argument("-s", "--snakefile", type=str, default="{}/Snakefile".format(sys.path[0]), help="Snakefile file")
1 change: 1 addition & 0 deletions workflow/scripts/metaproteomics_analyser.py
@@ -25,6 +25,7 @@
 
 # TODO - integrate apt-get install -y libpwiz-tools poppler-utils
 
+
 class MetaproteomicsAnalyser:
 
     def __init__ (self, **kwargs):
110 changes: 21 additions & 89 deletions workflow/scripts/mosca_tools.py
@@ -40,41 +40,6 @@ def run_pipe_command(bashCommand, output='', mode='w', sep=' ', print_message=Tr
     subprocess.Popen(bashCommand, stdin=subprocess.PIPE, shell=True, stdout=output_file).communicate()
 
 
-'''
-Input:
-    filename: str - filename of FastQC report
-Output:
-    returns dict{module:(value, pd.DataFrame)} with data from FastQC report
-'''
-
-
-def parse_fastqc(filename):
-    data = dict()
-    file = open(filename).read().split('\n')
-    i = 1
-    while i < len(file):
-        if file[i].startswith('>>') and file[i] != '>>END_MODULE':
-            name, flag = file[i][2:].split('\t')[0], file[i][2:].split('\t')[1]
-            if name == 'Sequence Duplication Levels':
-                i += 1
-            i += 1
-            if file[i] == '>>END_MODULE':
-                data[name] = (flag, pd.DataFrame())
-            else:
-                labels = file[i][1:].split('\t')
-                i += 1
-                partial_data = np.array(labels)
-                while i < len(file) and not file[i].startswith('>>'):
-                    partial_data = np.append(partial_data, file[i].split('\t'))
-                    i += 1
-                partial_data = np.reshape(partial_data, (int(partial_data.size / len(labels)), len(labels)))
-                data[name] = (flag, pd.DataFrame(data=partial_data[1:, 1:],
-                                                 index=partial_data[1:, 0],
-                                                 columns=partial_data[0, 1:]))
-        i += 1
-    return data
 
 
 def parse_blast(blast):
     result = pd.read_csv(blast, sep='\t', header=None)
     result.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
@@ -204,11 +169,10 @@ def perform_alignment(reference, reads, basename, threads=1, blast=None,
     else:
         print('GFF file was located at ' + blast.replace('.blast', '.gff'))
 
-    run_command('htseq-count -i {0} -c {1}.readcounts -n {2} {1}.sam {3}{4}'.format(attribute, basename, threads,
-                                                                                    (reference.replace('.fasta',
-                                                                                                       '.gff') if blast is None else blast.replace(
-                                                                                        '.blast', '.gff')),
-                                                                                    ('' if blast is not None else ' --stranded=no')))
+    run_command('htseq-count -i {0} -c {1}.readcounts -n {2} {1}.sam {3}{4}'.format(
+        attribute, basename, threads, (reference.replace('.fasta', '.gff') if blast is None else
+                                       blast.replace('.blast', '.gff')),
+        ('' if blast is not None else ' --stranded=no')))
 
 
 def fastq2fasta(fastq, output):
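
For reference, a sketch of the command the reformatted format string renders (illustrative values only; this hunk changes formatting, not behavior):

    # Hypothetical values for the blast-is-None branch of perform_alignment.
    attribute, basename, threads = 'ID', 'sample', 4
    command = 'htseq-count -i {0} -c {1}.readcounts -n {2} {1}.sam {3}{4}'.format(
        attribute, basename, threads, 'sample.gff', ' --stranded=no')
    print(command)
    # htseq-count -i ID -c sample.readcounts -n 4 sample.sam sample.gff --stranded=no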
@@ -259,25 +223,14 @@ def add_abundance(data, readcounts, name, origin_of_data='metagenomics', readcou
         pass
 
 
-'''
-Input:
-    output: str - filename of output
-    data: pd.DataFrame - data to write on several sheets
-    lines: int - number of lines per sheet (Excel's maximum is 1048575)
-    index: bool - write index or not
-Output:
-    data will be outputed through several sheets
-'''
-
-
 def multi_sheet_excel(output, data, sheet_name='Sheet', lines=1000000, index=False):
     writer = pd.ExcelWriter(output, engine='xlsxwriter')
-    i = 0;
-    j = 1
-    while i + lines < len(data):
-        data.iloc[i:(i + lines)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
-        j += 1
-    data.iloc[i:len(data)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, str(j)), index=index)
+    if len(data) < lines:
+        data.to_excel(writer, sheet_name='{}'.format(sheet_name), index=index)
+    else:
+        for i in range(0, len(data), lines):
+            j = min(i + lines, len(data))
+            data.iloc[i:(i + lines)].to_excel(writer, sheet_name='{} ({})'.format(sheet_name, j), index=index)
     writer.save()
 
 
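A usage sketch for the rewritten multi_sheet_excel (hypothetical data; note the sheet suffix is now each chunk's end-row index instead of a 1-based counter):

    import pandas as pd

    # A 2.5M-row frame with the default lines=1000000 yields three sheets:
    # 'Counts (1000000)', 'Counts (2000000)' and 'Counts (2500000)'.
    df = pd.DataFrame({'value': range(2500000)})
    multi_sheet_excel('readcounts.xlsx', df, sheet_name='Counts')
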
@@ -310,29 +263,10 @@ def normalize_readcounts(joined, columns, method='TMM', rscript_folder=''):
     return info
 
 
-'''
-Input:
-    message: a message to be printed
-Output:
-    will print the message with the time in human readable format
-'''
-
-
 def timed_message(message=None):
     print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + ': ' + message)
 
 
-'''
-Input:
-    df: pandas.DataFrame to manipulate
-    column: column composed of lists from where to expand the dataframe
-Output:
-    Returns the DataFrame expanded through one column by repeating all the
-    values of the row where in 'column' there is a list with more than one
-    element
-'''
-
-
 def expand_by_list_column(self, df, column='Pathway'):
     lens = [len(item) for item in df[column]]
     dictionary = dict()
@@ -341,12 +275,7 @@ def expand_by_list_column(self, df, column='Pathway'):
     dictionary[column] = np.concatenate(df[column].values)
     return pd.DataFrame(dictionary)
 
-'''
-Input:
-    filename: str - filename of FastQC report
-Output:
-    returns dict{module:(value, pd.DataFrame)} with data from FastQC report
-'''
+
 def parse_fastqc_report(filename):
     data = dict()
     file = open(filename).read().split('\n')
@@ -366,24 +295,27 @@ def parse_fastqc_report(filename):
                 while i < len(file) and not file[i].startswith('>>'):
                     partial_data = np.append(partial_data, file[i].split('\t'))
                     i += 1
-                partial_data = np.reshape(partial_data,(int(partial_data.size/len(labels)),len(labels)))
-                data[name] = (flag, pd.DataFrame(data = partial_data[1:,1:],
-                                                 index = partial_data[1:,0],
-                                                 columns = partial_data[0,1:]))
+                partial_data = np.reshape(partial_data, (int(partial_data.size / len(labels)), len(labels)))
+                data[name] = (flag, pd.DataFrame(data=partial_data[1:, 1:], index=partial_data[1:, 0],
+                                                 columns=partial_data[0, 1:]))
         i += 1
     return data
 
 
-'''
-Input:
-    bashCommand: str - the command to retrieve the output from
-    shell: bool - True if using some shell tool like grep or awk
-Output:
-    Number of occurrences of character on file
-'''
-def count_on_file(expression, file, compressed = False):
+
+
+def count_on_file(expression, file, compressed=False):
     return int(subprocess.check_output("{} -c '{}' {}".format(
-        'zgrep' if compressed else 'grep', expression, file), shell = True))
+        'zgrep' if compressed else 'grep', expression, file), shell=True))
 
 
 def sort_alphanumeric(alphanumeric_list):
     return sorted(alphanumeric_list, key=lambda item: (int(item.partition(' ')[0])
-                                            if item[0].isdigit() else float('inf'), item))
+                                                       if item[0].isdigit() else float('inf'), item))
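
A usage sketch for the cleaned-up count_on_file, which shells out to grep (or zgrep for compressed input); filenames are hypothetical, and grep -c counts matching lines, not total matches:

    # Count FASTA records by their '>' header lines.
    n_contigs = count_on_file('>', 'contigs.fasta')  # runs: grep -c '>' contigs.fasta
    n_proteins = count_on_file('^>', 'proteins.fasta.gz', compressed=True)  # uses zgrep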
