Skip to content

Commit

Permalink
Merge pull request #54 from tcezard/EVA3263_shallow
Browse files Browse the repository at this point in the history
EVA-3263 - Shallow validation
  • Loading branch information
tcezard authored Sep 9, 2024
2 parents 3ed9462 + f9a891e commit d14d9fb
Show file tree
Hide file tree
Showing 22 changed files with 650 additions and 378 deletions.
29 changes: 1 addition & 28 deletions eva_sub_cli/executables/check_fasta_insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from requests import HTTPError
from retry import retry

from eva_sub_cli.file_utils import fasta_iter
from eva_sub_cli.metadata_utils import get_files_per_analysis, get_analysis_for_vcf_file, \
get_reference_assembly_for_analysis

Expand All @@ -19,13 +20,6 @@
logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
    """Open *input_file* for text reading, transparently decompressing when it ends in .gz."""
    if not input_file.endswith('.gz'):
        return open(input_file, 'r')
    return gzip.open(input_file, 'rt')


def write_result_yaml(output_yaml, results):
    """Serialise *results* to *output_yaml* as a YAML document."""
    with open(output_yaml, 'w') as yaml_out:
        yaml.safe_dump(data=results, stream=yaml_out)
Expand All @@ -34,27 +28,6 @@ def write_result_yaml(output_yaml, results):
def refget_md5_digest(sequence):
    """Return the refget-style MD5 digest of *sequence* (case-insensitive, UTF-8)."""
    normalised = sequence.upper().encode('utf-8')
    return hashlib.md5(normalised).hexdigest()


def fasta_iter(input_fasta):
    """
    Iterate over a fasta file, yielding one (header, sequence) tuple per record.
    The leading '>' is stripped from the header and sequence lines are concatenated.
    """
    with open(input_fasta, 'r') as open_file:
        # groupby alternates between runs of header lines and runs of sequence lines
        runs = (run for _, run in groupby(open_file, lambda line: line[0] == ">"))
        for header_run in runs:
            # take the (first) header line of the run, without the leading '>'
            header_str = next(header_run)[1:].strip()
            # the next run holds this record's sequence lines; join them into one string
            sequence = "".join(seq_line.strip() for seq_line in next(runs))
            yield (header_str, sequence)


@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_refget_metadata(md5_digest):
response = requests.get(f'{REFGET_SERVER}/sequence/{md5_digest}/metadata')
Expand Down
19 changes: 13 additions & 6 deletions eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ def validate_command_line_arguments(args, argparser):
print(f"'{args.submission_dir}' does not have write permissions or is not a directory.")
sys.exit(1)


def main():
def parse_args(cmd_line_args):
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
Expand Down Expand Up @@ -67,18 +66,26 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')
argparser.add_argument('--shallow', action='store_true', default=False,
help='Set the validation to be performed on the first 10000 records of the VCF. '
'Only applies if the number of record exceed 10000')
argparser.add_argument('--debug', action='store_true', default=False,
help='Set the script to output debug messages')
args = argparser.parse_args(cmd_line_args)
validate_command_line_arguments(args, argparser)
return args

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)
def main():

args = parse_args(sys.argv[1:])

args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_stdout_handler(logging.INFO)
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
Expand Down
8 changes: 1 addition & 7 deletions eva_sub_cli/executables/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,12 @@

import yaml

from eva_sub_cli.file_utils import open_gzip_if_required
from eva_sub_cli.metadata_utils import get_samples_per_analysis, get_files_per_analysis, get_analysis_for_vcf_file

logger = logging_config.get_logger(__name__)


def open_gzip_if_required(input_file):
    """Open a (possibly gzipped) file for text reading based on its extension."""
    opener, mode = (gzip.open, 'rt') if input_file.endswith('.gz') else (open, 'r')
    return opener(input_file, mode)


def get_samples_from_vcf(vcf_file):
"""
Get the list of samples present in a single VCF file
Expand Down
78 changes: 78 additions & 0 deletions eva_sub_cli/executables/trim_down.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import argparse
import os

import yaml
from ebi_eva_common_pyutils.logger import logging_config
from eva_sub_cli.file_utils import open_gzip_if_required, fasta_iter

logger = logging_config.get_logger(__name__)


# Maximum number of variant records kept in the trimmed-down VCF
max_nb_lines = 10000


def trim_down_vcf(vcf_file, output_vcf, max_lines=None):
    """
    Write a copy of vcf_file to output_vcf containing at most max_lines variant records.
    Header lines (starting with '#') are always copied and do not count towards the limit.

    :param vcf_file: path to the source VCF (plain or gzipped)
    :param output_vcf: path of the trimmed-down VCF to create
    :param max_lines: maximum number of variant records to keep; defaults to the
                      module-level max_nb_lines when None
    :return: tuple (number of records written, set of reference sequence names seen)
    """
    if max_lines is None:
        max_lines = max_nb_lines
    with open_gzip_if_required(vcf_file) as vcf_in, open(output_vcf, 'w') as vcf_out:
        line_count = 0
        ref_seq_names = set()
        for line in vcf_in:
            if line.startswith('#') or line_count < max_lines:
                vcf_out.write(line)
                if not line.startswith('#'):
                    line_count += 1
                    # first column of a VCF record is the reference sequence name (CHROM)
                    ref_seq_names.add(line.split('\t')[0])
            else:
                # limit reached: stop reading the rest of the file
                break
        if line_count != max_lines:
            logger.warning(f'Only {line_count} records found in the source VCF {vcf_file}')
    return line_count, ref_seq_names


def trim_down_fasta(fasta_file, output_fasta, ref_seq_names):
    """
    Write to output_fasta only the sequences of fasta_file whose name appears in
    ref_seq_names, wrapping sequence lines at 80 characters.
    Return the set of sequence names actually found.
    """
    kept_names = set()
    with open(output_fasta, 'w') as fasta_out:
        for header, sequence in fasta_iter(fasta_file):
            # the sequence name is the first whitespace-separated token of the header
            seq_name = header.split()[0]
            if seq_name not in ref_seq_names:
                continue
            kept_names.add(seq_name)
            fasta_out.write(f'>{header}\n')
            # re-wrap the sequence at 80 characters per line
            for offset in range(0, len(sequence), 80):
                fasta_out.write(sequence[offset:offset + 80] + '\n')
    return kept_names


def main():
    """Command-line entry point: trim a VCF down to max_nb_lines records, subset its
    reference fasta accordingly, and write the trim-down metrics to a YAML file."""
    arg_parser = argparse.ArgumentParser(
        description=f'Take a VCF file and only keep {max_nb_lines} lines and remove unused fasta sequence from the '
                    f'associated reference genome')
    arg_parser.add_argument('--vcf_file', dest='vcf_file', required=True,
                            help='Path to the vcf file to be trimmed down')
    arg_parser.add_argument('--output_vcf_file', dest='output_vcf_file', required=True,
                            help='Path to the output vcf file')
    arg_parser.add_argument('--fasta_file', dest='fasta_file', required=True,
                            help='Path to the fasta file to be trimmed down')
    arg_parser.add_argument('--output_fasta_file', dest='output_fasta_file', required=True,
                            help='Path to the output fasta file')
    arg_parser.add_argument('--output_yaml_file', dest='output_yaml_file', required=True,
                            help='Path to the yaml file containing the trim down metrics')

    args = arg_parser.parse_args()
    logging_config.add_stdout_handler()

    line_count, ref_sequence = trim_down_vcf(args.vcf_file, args.output_vcf_file)
    sequence_found = trim_down_fasta(args.fasta_file, args.output_fasta_file, ref_sequence)
    trim_down_metrics = {'trim_down_vcf_record': line_count,
                         # store the count: a set is not YAML-serialisable with safe_dump
                         'number_sequence_found': len(sequence_found),
                         'trim_down_required': line_count == max_nb_lines}
    if len(sequence_found) != len(ref_sequence):
        logger.warning(f'Not all sequences were found in the fasta file. Cancelling trimming down of fasta file')
        # remove the partial fasta written by trim_down_fasta, otherwise os.link
        # raises FileExistsError; then hard-link the original in its place
        os.remove(args.output_fasta_file)
        os.link(args.fasta_file, args.output_fasta_file)
        trim_down_metrics.pop('number_sequence_found')
    # the yaml file must be opened for writing before dumping into it
    with open(args.output_yaml_file, 'w') as open_file:
        yaml.safe_dump(trim_down_metrics, open_file)

38 changes: 38 additions & 0 deletions eva_sub_cli/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
import glob
import gzip
import os
import shutil
from itertools import groupby


def resolve_single_file_path(file_path):
    """Resolve a glob pattern to the first matching path, or None when nothing matches."""
    matches = glob.glob(file_path)
    if not matches:
        return None
    return matches[0]


def is_submission_dir_writable(submission_dir):
Expand Down Expand Up @@ -32,3 +43,30 @@ def backup_file_or_directory(file_name, max_backups=None):
else:
os.rename(f'{file_name}.{i - 1}', f'{file_name}.{i}')
os.rename(file_name, file_name + '.1')


def open_gzip_if_required(input_file):
    """Open a file for text reading, decompressing transparently when it is gzipped (.gz)."""
    is_gzipped = input_file.endswith('.gz')
    return gzip.open(input_file, 'rt') if is_gzipped else open(input_file, 'r')


def fasta_iter(input_fasta):
    """
    Given a fasta file (plain or gzipped), yield tuples of (header, sequence).
    The leading '>' is stripped from each header and sequence lines are joined.
    """
    with open_gzip_if_required(input_fasta) as open_file:
        # groupby yields alternating runs of header lines and sequence lines;
        # keep only the line iterators, not the boolean keys
        runs = (run for is_header, run in groupby(open_file, lambda line: line[0] == ">"))
        for header_run in runs:
            # first line of the header run, minus the leading '>'
            header_str = next(header_run)[1:].strip()
            # the following run holds this record's sequence lines
            sequence = "".join(seq_line.strip() for seq_line in next(runs))
            yield (header_str, sequence)
3 changes: 3 additions & 0 deletions eva_sub_cli/jinja_templates/html_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
{% from 'sample_name_check.html' import sample_name_check_report %}
{% from 'fasta_check.html' import fasta_check_report %}
{% from 'metadata_validation.html' import metadata_validation_report %}
{% from 'shallow_validation.html' import optional_shallow_validation_report %}

<html lang="EN">
<head>
Expand Down Expand Up @@ -46,6 +47,8 @@ <h6>eva-sub-cli v{{cli_version}}</h6>
</div>
</header>

{{ optional_shallow_validation_report(validation_results) }}

<section>
<h2>Project Summary</h2>
<div class="description">
Expand Down
2 changes: 2 additions & 0 deletions eva_sub_cli/jinja_templates/project_details.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,6 @@
</div>
{% endif %}



{%- endmacro %}
29 changes: 29 additions & 0 deletions eva_sub_cli/jinja_templates/shallow_validation.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

{% macro optional_shallow_validation_report(validation_results) -%}
{# Renders a warning banner plus a per-VCF trim-down metrics table when
   validation_results['shallow_validation']['required'] is truthy;
   renders nothing otherwise. 'metrics' maps each VCF file name to a dict
   with 'trim_down_vcf_record' and 'number_sequence_found' entries. #}
{% set results = validation_results.get('shallow_validation', {}) %}

{% if results.get('required') %}
<section>
    <div class="report-section fail collapsible"> <span class="expand_icon">&#9654;</span>
        &#10060; <b>You requested to run the shallow validation, please run full validation before submitting the data</b>
    </div>
    <div class="no-show">
        <table>
            <tr>
                <th><strong>VCF File</strong></th>
                <th><strong>Variant lines validated in VCF</strong></th>
                <th><strong>Entries used in Fasta</strong></th>
            </tr>
            {% for vcf_file in results.get('metrics') %}
            <tr>
                <td>{{ vcf_file }}</td>
                <td>{{ results.get('metrics').get(vcf_file).get('trim_down_vcf_record') }}</td>
                <td>{{ results.get('metrics').get(vcf_file).get('number_sequence_found') }}</td>
            </tr>
            {% endfor %}
        </table>
    </div>
</section>
{% endif %}

{%- endmacro %}
47 changes: 35 additions & 12 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ params.python_scripts = [
"samples_checker": "samples_checker.py",
"fasta_checker": "check_fasta_insdc.py",
"xlsx2json": "xlsx2json.py",
"semantic_checker": "check_metadata_semantics.py"
"semantic_checker": "check_metadata_semantics.py",
"trim_down": "trim_down.py"
]
// prefix to prepend to all provided path
params.base_dir = ""
// help
params.help = null
params.shallow_validation = false

// Show help message
if (params.help) exit 0, helpMessage()
Expand Down Expand Up @@ -63,20 +65,23 @@ output_dir = joinBasePath(params.output_dir)

workflow {
// Prepare the file path
vcf_channel = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
vcf_and_ref_ch = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(
file(joinBasePath(row.vcf)),
file(joinBasePath(row.fasta)),
file(joinBasePath(row.report))
)}
vcf_files = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> file(joinBasePath(row.vcf))}

if (params.shallow_validation){
// create a smaller vcf and fasta then replace the channel
trim_down_vcf(vcf_and_ref_ch)
vcf_and_ref_ch = trim_down_vcf.out.vcf_and_ref
}
vcf_files = vcf_and_ref_ch.map{row -> row[0]}
fasta_to_vcfs = vcf_and_ref_ch.map{row -> tuple(row[1], row[0])}.groupTuple(by:0)
// VCF checks
check_vcf_valid(vcf_channel)
check_vcf_reference(vcf_channel)
check_vcf_valid(vcf_and_ref_ch)
check_vcf_reference(vcf_and_ref_ch)

generate_file_size_and_md5_digests(vcf_files)
collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect())
Expand All @@ -94,14 +99,32 @@ workflow {
metadata_json_validation(metadata_json)
metadata_semantic_check(metadata_json)
sample_name_concordance(metadata_json, vcf_files.collect())
fasta_to_vcfs = Channel.fromPath(joinBasePath(params.vcf_files_mapping))
.splitCsv(header:true)
.map{row -> tuple(file(joinBasePath(row.fasta)), file(joinBasePath(row.vcf)))}
.groupTuple(by:0)
insdc_checker(metadata_json, fasta_to_vcfs)
}
}


/*
 * Produce a trimmed-down copy of a VCF (under output/, same file name) and of its
 * reference fasta (same name), passing the report path through unchanged.
 * Also emits the trim-down log and metrics yaml.
 */
process trim_down_vcf {
    // publish the trim-down log and metrics yaml to the validation output directory
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.log"
    publishDir output_dir, overwrite: false, mode: "copy", pattern: "*.yml"

    input:
    tuple path(vcf), path(fasta), path(report)

    output:
    // trimmed vcf/fasta keep their original names, nested under output/
    tuple path("output/$vcf"), path("output/$fasta"), path(report), emit: vcf_and_ref
    path "${vcf.getBaseName()}_trim_down.log", emit: trim_down_log
    path "${vcf.getBaseName()}_trim_down.yml", emit: trim_down_metric

    // NOTE(review): assumes params.python_scripts.trim_down resolves to a directly
    // executable command — confirm it matches how the other python_scripts entries
    // are invoked elsewhere in this pipeline
    """
    mkdir output
    $params.python_scripts.trim_down --vcf_file $vcf --output_vcf_file output/$vcf --fasta_file $fasta --output_fasta_file output/$fasta --output_yaml_file ${vcf.getBaseName()}_trim_down.yml > ${vcf.getBaseName()}_trim_down.log
    # This is needed to ensure that a missing (NO_FILE) report can still be passed down to subsequent steps
    touch $report
    """

}

/*
* Validate the VCF file format
*/
Expand Down
Loading

0 comments on commit d14d9fb

Please sign in to comment.