Skip to content

Commit

Permalink
Merge pull request #56 from apriltuesday/updates-for-testing
Browse files Browse the repository at this point in the history
EVA-3659 - Updates for testing session
  • Loading branch information
apriltuesday authored Sep 17, 2024
2 parents 9a18cc1 + 4058a1e commit 5222976
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 26 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@ The paths to the VCF files are provided in the Files section of the metadata and
This allows us to support different assemblies for each VCF file.
Please check the below sections `The metadata spreadsheet` and `The metadata JSON` for the format and options available in metadata files.

### The metadata spreadsheet
#### The metadata spreadsheet

The metadata template can be found within the etc folder at `eva_sub_cli/etc/EVA_Submission_template.xlsx`
It should be populated following the instructions provided within the template.

### The metadata JSON
#### The metadata JSON

The metadata can also be provided via a JSON file which should conform to the schema located at
`eva_sub_cli/etc/eva_schema.json`
Expand Down Expand Up @@ -114,3 +114,9 @@ or
eva-sub-cli.py --metadata_xlsx metadata_spreadsheet.xlsx --submission_dir submission_dir --tasks SUBMIT
```
Will only submit the data and not validate.

### Shallow validation

If you are working with large VCF files and find that validation takes a very long time, you can add the
argument `--shallow` to the command, which will validate only the first 10,000 lines in each VCF. Note that running
shallow validation will **not** be sufficient for actual submission.
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM python:3.10

ENV vcf_validator_version=0.9.4
ENV vcf_validator_version=0.9.7
ENV NXF_VER=22.10.6

WORKDIR /opt
Expand Down
39 changes: 20 additions & 19 deletions eva_sub_cli/executables/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,40 +35,42 @@ def validate_command_line_arguments(args, argparser):
print(f"'{args.submission_dir}' does not have write permissions or is not a directory.")
sys.exit(1)


def parse_args(cmd_line_args):
argparser = ArgumentParser(prog='eva-sub-cli', description='EVA Submission CLI - validate and submit data to EVA')
argparser = ArgumentParser(prog='eva-sub-cli',
description='EVA Submission CLI - validate and submit data to EVA. '
'For full details, please see https://github.com/EBIvariation/eva-sub-cli')
argparser.add_argument('--version', action='version', version=f'%(prog)s {eva_sub_cli.__version__}')
argparser.add_argument('--submission_dir', required=True, type=str,
help='Path to the directory where all processing will be done '
'and submission info is/will be stored')
help='Path to the directory where all processing is done and submission info is stored')
vcf_group = argparser.add_argument_group(
'Input VCF and assembly',
"Specify the VCF files and associated assembly with the following options. If you used different assemblies "
"for different VCF files then include these in the metadata file."
"for different VCF files, then you must include these in the metadata file rather than specifying them here."
)
vcf_group.add_argument('--vcf_files', nargs='+', help="One or several vcf files to validate")
vcf_group.add_argument('--vcf_files', nargs='+', help="One or more VCF files to validate")
vcf_group.add_argument('--reference_fasta',
help="The fasta file containing the reference genome from which the variants were derived")
help="The FASTA file containing the reference genome from which the variants were derived")

metadata_group = argparser.add_argument_group('Metadata', 'Specify the metadata in a spreadsheet or in a JSON file')
metadata_group = metadata_group.add_mutually_exclusive_group(required=True)
metadata_group.add_argument("--metadata_json",
help="Json file that describe the project, analysis, samples and files")
help="JSON file that describes the project, analysis, samples and files")
metadata_group.add_argument("--metadata_xlsx",
help="Excel spreadsheet that describe the project, analysis, samples and files")
help="Excel spreadsheet that describes the project, analysis, samples and files")
argparser.add_argument('--tasks', nargs='+', choices=[VALIDATE, SUBMIT], default=[SUBMIT], type=str.lower,
help='Select a task to perform. Selecting VALIDATE will run the validation regardless of the'
' outcome of previous runs. Selecting SUBMIT will run validate only if the validation'
' was not performed successfully before and then run the submission.')
help='Select a task to perform (default SUBMIT). VALIDATE will run the validation'
' regardless of the outcome of previous runs. SUBMIT will run validate only if'
' the validation was not performed successfully before and then run the submission.')
argparser.add_argument('--executor', choices=[DOCKER, NATIVE], default=NATIVE, type=str.lower,
help='Select an execution type for running validation (default native)')
credential_group = argparser.add_argument_group('Credential', 'Specify the Webin credential you want to use to '
'upload to the EVA')
credential_group.add_argument("--username", help="Username used for connecting to the ENA webin account")
credential_group.add_argument("--password", help="Password used for connecting to the ENA webin account")
help='Select the execution type for running validation (default native)')
credential_group = argparser.add_argument_group('Credentials', 'Specify the ENA Webin credentials you want to use '
'to submit to the EVA')
credential_group.add_argument("--username", help="Username for your ENA Webin account")
credential_group.add_argument("--password", help="Password for your ENA Webin account")
argparser.add_argument('--shallow', action='store_true', default=False,
help='Set the validation to be performed on the first 10000 records of the VCF. '
'Only applies if the number of record exceed 10000')
                                'Only applies if the number of records exceeds 10,000')
argparser.add_argument('--debug', action='store_true', default=False,
help='Set the script to output debug messages')
args = argparser.parse_args(cmd_line_args)
Expand All @@ -77,7 +79,6 @@ def parse_args(cmd_line_args):


def main():

args = parse_args(sys.argv[1:])

args.submission_dir = os.path.abspath(args.submission_dir)
Expand All @@ -96,4 +97,4 @@ def main():
except SubmissionNotFoundException as snfe:
print(f'{snfe}. Please contact EVA Helpdesk')
except SubmissionStatusException as sse:
print(f'{sse}. Please try again later. If the problem persists, please contact EVA Helpdesk')
print(f'{sse}. Please try again later. If the problem persists, please contact EVA Helpdesk')
2 changes: 1 addition & 1 deletion eva_sub_cli/validators/docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
logger = logging_config.get_logger(__name__)

container_image = 'ebivariation/eva-sub-cli'
container_tag = 'v0.0.1.dev17'
container_tag = 'v0.0.1'
container_validation_dir = '/opt/vcf_validation'
container_validation_output_dir = 'vcf_validation_output'

Expand Down
3 changes: 2 additions & 1 deletion eva_sub_cli/validators/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,5 +478,6 @@ def create_reports(self):
file_path = os.path.join(self.output_dir, 'report.html')
with open(file_path, "w") as f:
f.write(report_html)
self.info(f'View the validation report in your browser: {file_path}')
self.info(f'Validation result: {"SUCCESS" if self.verify_ready_for_submission_to_eva() else "FAILURE"}')
self.info(f'View the full report in your browser: {file_path}')
return file_path
4 changes: 2 additions & 2 deletions tests/test_docker_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ def test_validate(self):

with open(assembly_check_log_file) as assembly_check_log_file:
assembly_check_logs = assembly_check_log_file.readlines()
self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[5])
self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[6])
self.assertEqual('[info] Number of matches: 247/247\n', assembly_check_logs[4])
self.assertEqual('[info] Percentage of matches: 100%\n', assembly_check_logs[5])

# Assert Samples concordance
expected_checker = {
Expand Down

0 comments on commit 5222976

Please sign in to comment.