Skip to content

Commit

Permalink
Merge pull request #54 from tcezard/EVA3263_shallow
Browse files Browse the repository at this point in the history
EVA-3263 - Shallow validation
  • Loading branch information
tcezard authored Sep 9, 2024
2 parents 3ed9462 + f9a891e commit d14d9fb
Show file tree
Hide file tree
Showing 22 changed files with 650 additions and 378 deletions.
29 changes: 1 addition & 28 deletions eva_sub_cli/executables/check_fasta_insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from requests import HTTPError
from retry import retry

from eva_sub_cli.file_utils import fasta_iter
from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis

Expand All @@ -19,13 +20,6 @@
logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
    """Open *input_file* for text reading, transparently decompressing when it ends in .gz."""
    if not input_file.endswith('.gz'):
        return open(input_file, 'r')
    return gzip.open(input_file, 'rt')


def write_result_yaml(output_yaml, results):
    """Serialise *results* to *output_yaml* as a YAML document."""
    with open(output_yaml, 'w') as yaml_out:
        yaml.safe_dump(data=results, stream=yaml_out)
Expand All @@ -34,27 +28,6 @@ def write_result_yaml(output_yaml, results):
def refget_md5_digest(sequence):
    """Return the refget-style MD5 digest of *sequence* (case-insensitive, UTF-8)."""
    normalised = sequence.upper().encode('utf-8')
    return hashlib.md5(normalised).hexdigest()


def fasta_iter(input_fasta):
    """
    Iterate over a fasta file, yielding one (header, sequence) tuple per record.
    The leading '>' is stripped from the header and sequence lines are concatenated.
    """
    with open(input_fasta, 'r') as open_file:
        # groupby alternates between runs of header lines and runs of sequence lines
        runs = (run for _, run in groupby(open_file, lambda line: line[0] == ">"))
        for header_run in runs:
            # take the (first) header line of the run, without the leading '>'
            header_str = next(header_run)[1:].strip()
            # the next run holds this record's sequence lines; join them into one string
            sequence = "".join(seq_line.strip() for seq_line in next(runs))
            yield (header_str, sequence)


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
Expand Down
19 changes: 13 additions & 6 deletions eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ def validate_command_line_arguments(args, argparser):
print(f"'{args.submission_dir}' does not have write permissions or is not a directory.")
sys.exit(1)


def main():
def parse_args(cmd_line_args):
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
Expand Down Expand Up @@ -67,18 +66,26 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')
argparser.add_argument('--shallow', action='store_true', default=False,
help='Set the validation to be performed on the first 10000 records of the VCF. '
'Only applies if the number of record exceed 10000')
argparser.add_argument('--debug', action='store_true', default=False,
help='Set the script to output debug messages')
args = argparser.parse_args(cmd_line_args)
validate_command_line_arguments(args, argparser)
return args

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)
def main():

args = parse_args(sys.argv[1:])

args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_stdout_handler(logging.INFO)
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
Expand Down
8 changes: 1 addition & 7 deletions eva_sub_cli/executables/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,12 @@

import yaml

from eva_sub_cli.file_utils import open_gzip_if_required
from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
    """Open a (possibly gzipped) file for text reading based on its extension."""
    opener, mode = (gzip.open, 'rt') if input_file.endswith('.gz') else (open, 'r')
    return opener(input_file, mode)


def get_samples_from_vcf(vcf_file):
"""
Get the list of samples present in a single VCF file
Expand Down
78 changes: 78 additions & 0 deletions eva_sub_cli/executables/trim_down.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import argparse
import os

import yaml
from ebi_eva_common_pyutils.logger import logging_config
from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter

logger = logging_config.get_logger(__name__)


# Maximum number of variant records kept in the trimmed-down VCF
max_nb_lines = 10000


def trim_down_vcf(vcf_file, output_vcf, max_lines=None):
    """
    Write a copy of vcf_file to output_vcf containing at most max_lines variant records.
    Header lines (starting with '#') are always copied and do not count towards the limit.

    :param vcf_file: path to the source VCF (plain or gzipped)
    :param output_vcf: path of the trimmed-down VCF to create
    :param max_lines: maximum number of variant records to keep; defaults to the
                      module-level max_nb_lines when None
    :return: tuple (number of records written, set of reference sequence names seen)
    """
    if max_lines is None:
        max_lines = max_nb_lines
    with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
        line_count = 0
        ref_seq_names = set()
        for line in vcf_in:
            if line.startswith('#') or line_count < max_lines:
                vcf_out.write(line)
                if not line.startswith('#'):
                    line_count += 1
                    # first column of a VCF record is the reference sequence name (CHROM)
                    ref_seq_names.add(line.split('\t')[0])
            else:
                # limit reached: stop reading the rest of the file
                break
        if line_count != max_lines:
            logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
    return line_count, ref_seq_names


def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
    """
    Write to output_fasta only the sequences of fasta_file whose name appears in
    ref_seq_names, wrapping sequence lines at 80 characters.
    Return the set of sequence names actually found.
    """
    kept_names = set()
    with open(output_fasta, 'w') as fasta_out:
        for header, sequence in fasta_iter(fasta_file):
            # the sequence name is the first whitespace-separated token of the header
            seq_name = header.split()[0]
            if seq_name not in ref_seq_names:
                continue
            kept_names.add(seq_name)
            fasta_out.write(f'>{header}\n')
            # re-wrap the sequence at 80 characters per line
            for offset in range(0, len(sequence), 80):
                fasta_out.write(sequence[offset:offset + 80] + '\n')
    return kept_names


def main():
    """Command-line entry point: trim a VCF down to max_nb_lines records, subset its
    reference fasta accordingly, and write the trim-down metrics to a YAML file."""
    arg_parser = argparse.ArgumentParser(
        description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the '
                    f'associated reference genome')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
                            help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
                            help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
                            help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
                            help='Path to the output fasta file')
    arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
                            help='Path to the yaml file containing the trim down metrics')

    args = arg_parser.parse_args()
    logging_config.add_stdout_handler()

    line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
    trim_down_metrics = {'trim_down_vcf_record': line_count,
                         # store the count: a set is not YAML-serialisable with safe_dump
                         'number_sequence_found': len(sequence_found),
                         'trim_down_required': line_count == max_nb_lines}
    if len(sequence_found) != len(ref_sequence):
        logger.warning(f'Not all sequences were found in the fasta file. Cancelling trimming down of fasta file')
        # remove the partial fasta written by trim_down_fasta, otherwise os.link
        # raises FileExistsError; then hard-link the original in its place
        os.remove(args.output_fasta_file)
        os.link(args.fasta_file, args.output_fasta_file)
        trim_down_metrics.pop('number_sequence_found')
    # the yaml file must be opened for writing before dumping into it
    with open(args.output_yaml_file, 'w') as open_file:
        yaml.safe_dump(trim_down_metrics, open_file)

38 changes: 38 additions & 0 deletions eva_sub_cli/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
import glob
import gzip
import os
import shutil
from itertools import groupby


def resolve_single_file_path(file_path):
    """Resolve a glob pattern to the first matching path, or None when nothing matches."""
    matches = glob.glob(file_path)
    if not matches:
        return None
    return matches[0]


def is_submission_dir_writable(submission_dir):
Expand Down Expand Up @@ -32,3 +43,30 @@ def backup_file_or_directory(file_name, max_backups=None):
else:
os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}')
os.rename(file_name, file_name + '.1')


def open_gzip_if_required(input_file):
    """Open a file for text reading, decompressing transparently when it is gzipped (.gz)."""
    is_gzipped = input_file.endswith('.gz')
    return gzip.open(input_file, 'rt') if is_gzipped else open(input_file, 'r')


def fasta_iter(input_fasta):
    """
    Given a fasta file (plain or gzipped), yield tuples of (header, sequence).
    The leading '>' is stripped from each header and sequence lines are joined.
    """
    with open_gzip_if_required(input_fasta) as open_file:
        # groupby yields alternating runs of header lines and sequence lines;
        # keep only the line iterators, not the boolean keys
        runs = (run for is_header, run in groupby(open_file, lambda line: line[0] == ">"))
        for header_run in runs:
            # first line of the header run, minus the leading '>'
            header_str = next(header_run)[1:].strip()
            # the following run holds this record's sequence lines
            sequence = "".join(seq_line.strip() for seq_line in next(runs))
            yield (header_str, sequence)
3 changes: 3 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
{% from 'shallow_validation.html' import optional_shallow_validation_report %}

<html lang="EN">
<head>
Expand Down Expand Up @@ -46,6 +47,8 @@ <h6>eva-sub-cli v{{cli_version}}</h6>
</div>
</header>

{{ optional_shallow_validation_report(validation_results) }}

<section>
<h2>Project Summary</h2>
<div class="description">
Expand Down
2 changes: 2 additions & 0 deletions eva_sub_cli/jinja_templates/project_details.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,6 @@
</div>
{% endif %}



{%- endmacro %}
29 changes: 29 additions & 0 deletions eva_sub_cli/jinja_templates/shallow_validation.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

{% macro optional_shallow_validation_report(validation_results) -%}
{# Renders a warning banner plus a per-VCF trim-down metrics table when
   validation_results['shallow_validation']['required'] is truthy;
   renders nothing otherwise. 'metrics' maps each VCF file name to a dict
   with 'trim_down_vcf_record' and 'number_sequence_found' entries. #}
{% set results = validation_results.get('shallow_validation', {}) %}

{% if results.get('required') %}
<section>
    <div class="report-section fail collapsible"> <span class="expand_icon">&#9654;</span>
        &#10060; <b>You requested to run the shallow validation, please run full validation before submitting the data</b>
    </div>
    <div class="no-show">
        <table>
            <tr>
                <th><strong>VCF File</strong></th>
                <th><strong>Variant lines validated in VCF</strong></th>
                <th><strong>Entries used in Fasta</strong></th>
            </tr>
            {% for vcf_file in results.get('metrics') %}
            <tr>
                <td>{{ vcf_file }}</td>
                <td>{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}</td>
                <td>{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>
</section>
{% endif %}

{%- endmacro %}
47 changes: 35 additions & 12 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"semantic_checker": "check_metadata_semantics.py"
"semantic_checker": "check_metadata_semantics.py",
"trim_down": "trim_down.py"
]
// prefix to prepend to all provided path
params.base_dir = ""
// help
params.help = null
params.shallow_validation = false

// Show help message
if (params.help) exit 0, helpMessage()
Expand Down Expand Up @@ -63,20 +65,23 @@ output_dir = joinBasePath(params.output_dir)

workflow {
// Prepare the file path
vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
vcf_and_ref_ch = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(
file(joinBasePath(row.vcf)),
file(joinBasePath(row.fasta)),
file(joinBasePath(row.report))
)}
vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> file(joinBasePath(row.vcf))}

if (params.shallow_validation){
// create a smaller vcf and fasta then replace the channel
trim_down_vcf(vcf_and_ref_ch)
vcf_and_ref_ch = trim_down_vcf.out.vcf_and_ref
}
vcf_files = vcf_and_ref_ch.map{row -> row[0]}
fasta_to_vcfs = vcf_and_ref_ch.map{row -> tuple(row[1], row[0])}.groupTuple(by:0)
// VCF checks
check_vcf_valid(vcf_channel)
check_vcf_reference(vcf_channel)
check_vcf_valid(vcf_and_ref_ch)
check_vcf_reference(vcf_and_ref_ch)

generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
Expand All @@ -94,14 +99,32 @@ workflow {
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(file(joinBasePath(row.fasta)), file(joinBasePath(row.vcf)))}
.groupTuple(by:0)
insdc_checker(metadata_json, fasta_to_vcfs)
}
}


/*
 * Produce a trimmed-down copy of a VCF (under output/, same file name) and of its
 * reference fasta (same name), passing the report path through unchanged.
 * Also emits the trim-down log and metrics yaml.
 */
process trim_down_vcf {
    // publish the trim-down log and metrics yaml to the validation output directory
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"

    input:
    tuple path(vcf), path(fasta), path(report)

    output:
    // trimmed vcf/fasta keep their original names, nested under output/
    tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
    path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
    path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric

    // NOTE(review): assumes params.python_scripts.trim_down resolves to a directly
    // executable command — confirm it matches how the other python_scripts entries
    // are invoked elsewhere in this pipeline
    """
    mkdir output
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
    # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
    touch $report
    """

}

/*
* Validate the VCF file format
*/
Expand Down
Loading

0 comments on commit d14d9fb

Please sign in to comment.