Skip to content

Commit

Permalink
Merge pull request #53 from tcezard/EVA3635_fix_from_test
Browse files Browse the repository at this point in the history
EVA-3635 - Fixes from Testing
  • Loading branch information
tcezard authored Aug 30, 2024
2 parents 5879d11 + 4732580 commit 3ed9462
Show file tree
Hide file tree
Showing 13 changed files with 93 additions and 52 deletions.
12 changes: 9 additions & 3 deletions eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main():
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Full path to the directory where all processing will be done '
help='Path to the directory where all processing will be done '
'and submission info is/will be stored')
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
Expand All @@ -57,7 +57,7 @@ def main():
help="Json file that describe the project, analysis, samples and files")
metadata_group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
argparser.add_argument('--tasks', nargs='*', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the'
' outcome of previous runs. Selecting SUBMIT will run validate only if the validation'
' was not performed successfully before and then run the submission.')
Expand All @@ -67,12 +67,18 @@ def main():
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
argparser.add_argument('--debug', action='store_true', default=False, help='Set the script to output debug messages')

args = argparser.parse_args()

validate_command_line_arguments(args, argparser)

logging_config.add_stdout_handler()
args.submission_dir = os.path.abspath(args.submission_dir)

if args.debug:
logging_config.add_stdout_handler(logging.DEBUG)
else:
logging_config.add_stdout_handler()
logging_config.add_file_handler(os.path.join(args.submission_dir, 'eva_submission.log'), logging.DEBUG)

try:
Expand Down
2 changes: 1 addition & 1 deletion eva_sub_cli/executables/xlsx2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,6 @@ def main():
try:
parser.json(args.metadata_json)
except Exception as e:
parser.add_error(e)
parser.add_error(f'An Error was raised while converting the spreadsheet to JSON: {repr(e)}')
finally:
parser.save_errors(args.errors_yaml)
3 changes: 1 addition & 2 deletions eva_sub_cli/metadata_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ def get_files_per_analysis(metadata):
"""Returns mapping of analysis alias to filenames, based on metadata."""
files_per_analysis = defaultdict(list)
for file_info in metadata.get('files', []):
if file_info.get('fileType') == 'vcf':
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
files_per_analysis[file_info.get('analysisAlias')].append(file_info.get('fileName'))
return {
analysis_alias: set(filepaths)
for analysis_alias, filepaths in files_per_analysis.items()
Expand Down
14 changes: 9 additions & 5 deletions eva_sub_cli/nextflow/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,13 @@ process convert_xlsx_2_json {
output:
path "metadata.json", emit: metadata_json, optional: true
path "metadata_conversion_errors.yml", emit: errors_yaml
path "xlsx2json.log", emit: xlsx2json_log

script:
metadata_json = metadata_xlsx.getBaseName() + '.json'

"""
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration
$params.python_scripts.xlsx2json --metadata_xlsx $metadata_xlsx --metadata_json metadata.json --errors_yaml metadata_conversion_errors.yml --conversion_configuration $conversion_configuration > xlsx2json.log 2>&1
"""
}

Expand All @@ -217,7 +218,7 @@ process metadata_json_validation {

script:
"""
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt
$params.executable.biovalidator --schema $schema_dir/eva_schema.json --ref $schema_dir/eva-biosamples.json --data $metadata_json > metadata_validation.txt 2>&1
"""
}

Expand All @@ -232,10 +233,11 @@ process sample_name_concordance {

output:
path "sample_checker.yml", emit: sample_checker_yml
path "sample_checker.log", emit: sample_checker_log

script:
"""
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml
$params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1
"""
}

Expand All @@ -251,10 +253,11 @@ process insdc_checker {

output:
path "${fasta_file}_check.yml", emit: fasta_checker_yml
path "fasta_checker.log", emit: fasta_checker_log

script:
"""
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml
$params.python_scripts.fasta_checker --metadata_json $metadata_json --vcf_files $vcf_files --input_fasta $fasta_file --output_yaml ${fasta_file}_check.yml > fasta_checker.log 2>&1
"""
}

Expand All @@ -269,9 +272,10 @@ process metadata_semantic_check {

output:
path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml
path "semantic_checker.log", emit: semantic_checker_log

script:
"""
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml
$params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1
"""
}
28 changes: 20 additions & 8 deletions eva_sub_cli/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, re
project_title, vcf_files_mapping = get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, True)

for mapping in vcf_files_mapping:
writer.writerow(mapping);
writer.writerow(mapping)

return project_title, mapping_file

Expand All @@ -80,6 +80,7 @@ def get_project_and_vcf_fasta_mapping_from_metadata_json(metadata_json, mapping_

return project_title, vcf_fasta_report_mapping


def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_req=False):
workbook = load_workbook(metadata_xlsx)

Expand Down Expand Up @@ -108,23 +109,27 @@ def get_project_and_vcf_fasta_mapping_from_metadata_xlsx(metadata_xlsx, mapping_
files_headers[cell.value] = cell.column - 1

for row in files_sheet.iter_rows(min_row=2, values_only=True):
file_name = row[files_headers['File Name']]
file_name = os.path.abspath(row[files_headers['File Name']])
analysis_alias = row[files_headers['Analysis Alias']]
reference_fasta = analysis_alias_dict[analysis_alias]
reference_fasta = os.path.abspath(analysis_alias_dict[analysis_alias])
if not (file_name and os.path.isfile(file_name)):
raise FileNotFoundError(f'The variant file {file_name} provided in spreadsheet {metadata_xlsx} does not exist')
if not (reference_fasta and os.path.isfile(reference_fasta)):
raise FileNotFoundError(f'The reference fasta {reference_fasta} in spreadsheet {metadata_xlsx} does not exist')
vcf_fasta_report_mapping.append([os.path.abspath(file_name), os.path.abspath(reference_fasta), ''])

return project_title, vcf_fasta_report_mapping


def check_validation_required(tasks, sub_config):
def check_validation_required(tasks, sub_config, username=None, password=None):
# Validation is mandatory so if submit is requested then VALIDATE must have run before or be requested as well
if SUBMIT in tasks:
if not sub_config.get(READY_FOR_SUBMISSION_TO_EVA, False):
return True
submission_id = sub_config.get(SUB_CLI_CONFIG_KEY_SUBMISSION_ID, None)
if submission_id:
try:
submission_status = SubmissionWSClient().get_submission_status(submission_id)
submission_status = SubmissionWSClient(username, password).get_submission_status(submission_id)
if submission_status == 'FAILED':
return True
else:
Expand All @@ -150,13 +155,20 @@ def orchestrate_process(submission_dir, vcf_files, reference_fasta, metadata_jso

metadata_file = metadata_json or metadata_xlsx
if not os.path.exists(os.path.abspath(metadata_file)):
raise FileNotFoundError(f'The provided metadata file {metadata_file} does not exist')
raise FileNotFoundError(f'The provided metadata file {os.path.abspath(metadata_file)} does not exist')

if metadata_json:
metadata_json = os.path.abspath(metadata_json)
if metadata_xlsx:
metadata_xlsx = os.path.abspath(metadata_xlsx)

# Get the provided Project Title and VCF files mapping (VCF, Fasta and Report)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx)
project_title, vcf_files_mapping = get_project_title_and_create_vcf_files_mapping(
submission_dir, vcf_files, reference_fasta, metadata_json, metadata_xlsx
)
vcf_files = get_vcf_files(vcf_files_mapping)

if VALIDATE not in tasks and check_validation_required(tasks, sub_config):
if VALIDATE not in tasks and check_validation_required(tasks, sub_config, username, password):
tasks.append(VALIDATE)

if VALIDATE in tasks:
Expand Down
8 changes: 4 additions & 4 deletions eva_sub_cli/submission_ws.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,19 @@ def _submission_status_url(self, submission_id):

def mark_submission_uploaded(self, submission_id, metadata_json):
response = requests.put(self._submission_uploaded_url(submission_id),
headers={'Accept': 'application/hal+json', 'Authorization': 'Bearer ' + self.auth.token},
data=metadata_json)
headers={'Accept': 'application/json', 'Authorization': 'Bearer ' + self.auth.token},
json=metadata_json)
response.raise_for_status()
return response.json()

def initiate_submission(self):
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/hal+json',
response = requests.post(self._submission_initiate_url(), headers={'Accept': 'application/json',
'Authorization': 'Bearer ' + self.auth.token})
response.raise_for_status()
return response.json()

@retry(exceptions=(HTTPError,), tries=3, delay=2, backoff=1.2, jitter=(1, 3))
def get_submission_status(self, submission_id):
response = requests.get(self.get_submission_status_url(submission_id))
response = requests.get(self._submission_status_url(submission_id))
response.raise_for_status()
return response.text
3 changes: 2 additions & 1 deletion eva_sub_cli/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def _upload_submission(self):
def _upload_file(self, submission_upload_url, input_file):
base_name = os.path.basename(input_file)
self.debug(f'Transfer {base_name} to EVA FTP')
r = requests.put(os.path.join(submission_upload_url, base_name), data=open(input_file, 'rb'))
with open(input_file, 'rb') as f:
r = requests.put(os.path.join(submission_upload_url, base_name), data=f)
r.raise_for_status()
self.debug(f'Upload of {base_name} completed')

Expand Down
18 changes: 9 additions & 9 deletions eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev14'
container_tag = 'v0.0.1.dev15'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down Expand Up @@ -100,10 +100,10 @@ def verify_container_is_running(self):
try:
container_run_cmd_output = self._run_quiet_command("check if container is running", f"{self.docker_path} ps", return_process_output=True)
if container_run_cmd_output is not None and self.container_name in container_run_cmd_output:
logger.info(f"Container ({self.container_name}) is running")
logger.debug(f"Container ({self.container_name}) is running")
return True
else:
logger.info(f"Container ({self.container_name}) is not running")
logger.debug(f"Container ({self.container_name}) is not running")
return False
except subprocess.CalledProcessError as ex:
logger.error(ex)
Expand All @@ -114,10 +114,10 @@ def verify_container_is_stopped(self):
"check if container is stopped", f"{self.docker_path} ps -a", return_process_output=True
)
if container_stop_cmd_output is not None and self.container_name in container_stop_cmd_output:
logger.info(f"Container ({self.container_name}) is in stop state")
logger.debug(f"Container ({self.container_name}) is in stop state")
return True
else:
logger.info(f"Container ({self.container_name}) is not in stop state")
logger.debug(f"Container ({self.container_name}) is not in stop state")
return False

def try_restarting_container(self):
Expand All @@ -137,14 +137,14 @@ def verify_image_available_locally(self):
return_process_output=True
)
if container_images_cmd_ouptut is not None and re.search(container_image + r'\s+' + container_tag, container_images_cmd_ouptut):
logger.info(f"Container ({container_image}) image is available locally")
logger.debug(f"Container ({container_image}) image is available locally")
return True
else:
logger.info(f"Container ({container_image}) image is not available locally")
logger.debug(f"Container ({container_image}) image is not available locally")
return False

def run_container(self):
logger.info(f"Trying to run container {self.container_name}")
logger.debug(f"Trying to run container {self.container_name}")
try:
self._run_quiet_command(
"Try running container",
Expand All @@ -166,7 +166,7 @@ def stop_running_container(self):
)

def download_container_image(self):
logger.info(f"Pulling container ({container_image}) image")
logger.debug(f"Pulling container ({container_image}) image")
try:
self._run_quiet_command("pull container image", f"{self.docker_path} pull {container_image}:{container_tag}")
except subprocess.CalledProcessError as ex:
Expand Down
4 changes: 4 additions & 0 deletions eva_sub_cli/validators/native_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ def _validate(self):

def run_validator(self):
self.verify_executables_installed()
curr_wd = os.getcwd()
try:
command = self.get_validation_cmd()
os.chdir(self.submission_dir)
self._run_quiet_command("Run Validation using Nextflow", command)
except subprocess.CalledProcessError as ex:
logger.error(ex)
finally:
os.chdir(curr_wd)

def get_validation_cmd(self):
if self.metadata_xlsx and not self.metadata_json:
Expand Down
17 changes: 11 additions & 6 deletions eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,10 @@ def _collect_file_info_to_metadata(self):
file_name_2_md5[os.path.basename(vcf_file)] = md5sum
file_path_2_file_size[vcf_file] = file_size
file_name_2_file_size[os.path.basename(vcf_file)] = file_size
else:
self.error(
f"Cannot locate file_info.txt at {os.path.join(self.output_dir, 'other_validations', 'file_info.txt')}"
)
if self.metadata_json_post_validation:
with open(self.metadata_json_post_validation) as open_file:
try:
Expand All @@ -553,12 +557,11 @@ def _collect_file_info_to_metadata(self):
files_from_metadata = json_data.get('files', [])
if files_from_metadata:
for file_dict in json_data.get('files', []):
if file_dict.get('fileType') == 'vcf':
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_path = self._validation_file_path_for(file_dict.get('fileName'))
file_dict['md5'] = file_path_2_md5.get(file_path) or \
file_name_2_md5.get(file_dict.get('fileName')) or ''
file_dict['fileSize'] = file_path_2_file_size.get(file_path) or \
file_name_2_file_size.get(file_dict.get('fileName')) or ''
file_rows.append(file_dict)
else:
self.error('No file found in metadata and multiple analysis alias exist: '
Expand All @@ -570,6 +573,8 @@ def _collect_file_info_to_metadata(self):
if json_data:
with open(self.metadata_json_post_validation, 'w') as open_file:
json.dump(json_data, open_file)
else:
self.error(f'Cannot locate the metadata in JSON format in {os.path.join(self.output_dir, "metadata.json")}')

def get_vcf_fasta_analysis_mapping(self):
vcf_fasta_analysis_mapping = []
Expand Down
Loading

0 comments on commit 3ed9462

Please sign in to comment.