diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt index 7c2041b6d..7af62c7e6 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt @@ -749,7 +749,8 @@ open class SubmissionDatabaseService( SequenceEntriesTable.versionColumn, SequenceEntriesTable.versionCommentColumn, SequenceEntriesTable.submissionIdColumn, - SequenceEntriesTable.submitterColumn, SequenceEntriesTable.groupIdColumn, + SequenceEntriesTable.submitterColumn, + SequenceEntriesTable.groupIdColumn, SequenceEntriesTable.submittedAtTimestampColumn, SequenceEntriesTable.isRevocationColumn, SequenceEntriesTable.organismColumn, diff --git a/docs/public/images/team/theoSanderson-square.jpg b/docs/public/images/team/theoSanderson-square.jpg index 3b0a82f2f..711f83758 100644 Binary files a/docs/public/images/team/theoSanderson-square.jpg and b/docs/public/images/team/theoSanderson-square.jpg differ diff --git a/ena-submission/config/config.yaml b/ena-submission/config/config.yaml index 1f3c4b45d..2d783b972 100644 --- a/ena-submission/config/config.yaml +++ b/ena-submission/config/config.yaml @@ -2,7 +2,7 @@ backend_url: http://localhost:8079/ keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token db_username: postgres db_password: unsecure -db_url: "jdbc:postgresql://127.0.0.1:5432/loculus?options=-c%20search_path%3Dena_deposition_schema" +db_url: "jdbc:postgresql://127.0.0.1:5432/loculus" organisms: cchf: ingest: diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index 292873fdb..eb31f786d 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -8,12 +8,12 @@ import pytz import yaml from ena_submission_helper import ( - CreationResults, - check_ena, + CreationResult, create_chromosome_list, create_ena_assembly, create_fasta, create_manifest, + get_ena_analysis_process, get_ena_config, ) from ena_types import ( @@ -100,14 +100,12 @@ def create_chromosome_list_object( return AssemblyChromosomeListFile(chromosomes=entries) -def get_segment_order(unaligned_sequences) -> list[str]: +def get_segment_order(unaligned_sequences: dict[str, str]) -> list[str]: + """Order in which we put the segments in the chromosome list file""" segment_order = [] - if len(unaligned_sequences.keys()) > 1: - for segment_name, item in unaligned_sequences.items(): - if item: # Only list sequenced segments - segment_order.append(segment_name) - else: - segment_order.append("main") + for segment_name, item in unaligned_sequences.items(): + if item: # Only list sequenced segments + segment_order.append(segment_name) return sorted(segment_order) @@ -371,14 +369,14 @@ def assembly_table_create( segment_order = get_segment_order( sample_data_in_submission_table[0]["unaligned_nucleotide_sequences"] ) - assembly_creation_results: CreationResults = create_ena_assembly( + assembly_creation_results: CreationResult = create_ena_assembly( ena_config, manifest_file, center_name=center_name, test=test ) - if assembly_creation_results.results: - assembly_creation_results.results["segment_order"] = segment_order + if assembly_creation_results.result: + assembly_creation_results.result["segment_order"] = segment_order update_values = { "status": Status.WAITING, - "result": json.dumps(assembly_creation_results.results), + "result": json.dumps(assembly_creation_results.result), } number_rows_updated = 0 tries = 0 @@ -448,17 +446,52 @@ def assembly_table_update( logger.debug("Checking state in ENA") for row in waiting: seq_key = {"accession": row["accession"], "version": row["version"]} - segment_order = row["result"]["segment_order"] - check_results: CreationResults = check_ena( - ena_config, row["result"]["erz_accession"], segment_order + # Previous means from the last time the entry was checked, from db + previous_result = row["result"] + segment_order = previous_result["segment_order"] + new_result: CreationResult = get_ena_analysis_process( + ena_config, previous_result["erz_accession"], segment_order ) _last_ena_check = time - if not check_results.results: + + if not new_result.result: continue + result_contains_gca_accession = "gca_accession" in new_result.result + result_contains_insdc_accession = any( + key.startswith("insdc_accession_full") for key in new_result.result + ) + + if not (result_contains_gca_accession and result_contains_insdc_accession): + if previous_result == new_result.result: + continue + update_values = { + "status": Status.WAITING, + "result": json.dumps(new_result.result), + "finished_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + logger.warning( + f"Assembly partially in ENA but DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="assembly_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info( + f"Partial results of assembly submission for accession {row["accession"]} returned!" + ) + continue update_values = { "status": Status.SUBMITTED, - "result": json.dumps(check_results.results), + "result": json.dumps(new_result.result), "finished_at": datetime.now(tz=pytz.utc), } number_rows_updated = 0 @@ -522,7 +555,10 @@ def assembly_table_handle_errors( f" status WAITING for over {time_threshold_waiting}h" ) send_slack_notification( - config, error_msg, time=datetime.now(tz=pytz.utc), time_threshold=slack_time_threshold + error_msg, + slack_config, + time=datetime.now(tz=pytz.utc), + time_threshold=slack_time_threshold, ) diff --git a/ena-submission/scripts/create_project.py b/ena-submission/scripts/create_project.py index 92ba15d87..3a6bec28d 100644 --- a/ena-submission/scripts/create_project.py +++ b/ena-submission/scripts/create_project.py @@ -8,7 +8,7 @@ import pytz import yaml from call_loculus import get_group_info -from ena_submission_helper import CreationResults, create_ena_project, get_ena_config +from ena_submission_helper import CreationResult, create_ena_project, get_ena_config from ena_types import ( OrganismType, ProjectLink, @@ -275,11 +275,11 @@ def project_table_create( logger.info( f"Starting Project creation for group_id {row["group_id"]} organism {row["organism"]}" ) - project_creation_results: CreationResults = create_ena_project(ena_config, project_set) - if project_creation_results.results: + project_creation_results: CreationResult = create_ena_project(ena_config, project_set) + if project_creation_results.result: update_values = { "status": Status.SUBMITTED, - "result": json.dumps(project_creation_results.results), + "result": json.dumps(project_creation_results.result), "finished_at": datetime.now(tz=pytz.utc), } number_rows_updated = 0 diff --git a/ena-submission/scripts/create_sample.py b/ena-submission/scripts/create_sample.py index 3b1ea001f..ece6f9a55 100644 --- a/ena-submission/scripts/create_sample.py +++ b/ena-submission/scripts/create_sample.py @@ -8,7 +8,7 @@ import click import pytz import yaml -from ena_submission_helper import CreationResults, create_ena_sample, get_ena_config +from ena_submission_helper import CreationResult, create_ena_sample, get_ena_config from ena_types import ( ProjectLink, SampleAttribute, @@ -320,11 +320,11 @@ def sample_table_create( ) continue logger.info(f"Starting sample creation for accession {row["accession"]}") - sample_creation_results: CreationResults = create_ena_sample(ena_config, sample_set) - if sample_creation_results.results: + sample_creation_results: CreationResult = create_ena_sample(ena_config, sample_set) + if sample_creation_results.result: update_values = { "status": Status.SUBMITTED, - "result": json.dumps(sample_creation_results.results), + "result": json.dumps(sample_creation_results.result), "finished_at": datetime.now(tz=pytz.utc), } number_rows_updated = 0 diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index eeccf42a5..418a5e1c7 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -1,3 +1,4 @@ +import datetime import gzip import json import logging @@ -9,13 +10,18 @@ from dataclasses import dataclass from typing import Any +import pytz import requests import xmltodict from ena_types import ( + Action, + Actions, AssemblyChromosomeListFile, AssemblyManifest, + Hold, ProjectSet, SampleSetType, + Submission, XmlAttribute, ) from requests.auth import HTTPBasicAuth @@ -65,10 +71,10 @@ def get_ena_config( @dataclass -class CreationResults: +class CreationResult: errors: list[str] warnings: list[str] - results: dict[str, str] | None = None + result: dict[str, str] | None = None def recursive_defaultdict(): @@ -102,13 +108,15 @@ def dataclass_to_xml(dataclass_instance, root_name="root"): return xmltodict.unparse({root_name: dataclass_dict}, pretty=True) -def get_submission_dict(): - submission = recursive_defaultdict() - submission["SUBMISSION"]["ACTIONS"]["ACTION"]["ADD"] = None - return submission +def get_submission_dict(hold_until_date: str | None = None): + if not hold_until_date: + hold_until_date = datetime.datetime.now(tz=pytz.utc).strftime("%Y-%m-%d") + return Submission( + actions=Actions(action=[Action(add=""), Action(hold=Hold(XmlAttribute(hold_until_date)))]) + ) -def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResults: +def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult: """ The project creation request should be equivalent to curl -u {params.ena_submission_username}:{params.ena_submission_password} \ @@ -123,7 +131,7 @@ def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationRe def get_project_xml(project_set): submission_set = get_submission_dict() return { - "SUBMISSION": xmltodict.unparse(submission_set, pretty=True), + "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), "PROJECT": dataclass_to_xml(project_set, root_name="PROJECT_SET"), } @@ -135,7 +143,7 @@ def get_project_xml(project_set): ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) try: parsed_response = xmltodict.parse(response.text) valid = ( @@ -149,15 +157,15 @@ def get_project_xml(project_set): error_message = f"Response is in unexpected format: {e}. " f"Response: {response.text}." logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) project_results = { "bioproject_accession": parsed_response["RECEIPT"]["PROJECT"]["@accession"], "ena_submission_accession": parsed_response["RECEIPT"]["SUBMISSION"]["@accession"], } - return CreationResults(results=project_results, errors=errors, warnings=warnings) + return CreationResult(result=project_results, errors=errors, warnings=warnings) -def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationResults: +def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationResult: """ The sample creation request should be equivalent to curl -u {params.ena_submission_username}:{params.ena_submission_password} \ @@ -172,7 +180,7 @@ def create_ena_sample(config: ENAConfig, sample_set: SampleSetType) -> CreationR def get_sample_xml(sample_set): submission_set = get_submission_dict() files = { - "SUBMISSION": xmltodict.unparse(submission_set, pretty=True), + "SUBMISSION": dataclass_to_xml(submission_set, root_name="SUBMISSION"), "SAMPLE": dataclass_to_xml(sample_set, root_name="SAMPLE_SET"), } return files @@ -186,7 +194,7 @@ def get_sample_xml(sample_set): ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) try: parsed_response = xmltodict.parse(response.text) valid = ( @@ -205,13 +213,13 @@ def get_sample_xml(sample_set): ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) sample_results = { "ena_sample_accession": parsed_response["RECEIPT"]["SAMPLE"]["@accession"], "biosample_accession": parsed_response["RECEIPT"]["SAMPLE"]["EXT_ID"]["@accession"], "ena_submission_accession": parsed_response["RECEIPT"]["SUBMISSION"]["@accession"], } - return CreationResults(results=sample_results, errors=errors, warnings=warnings) + return CreationResult(result=sample_results, errors=errors, warnings=warnings) def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response: @@ -320,7 +328,7 @@ def post_webin_cli( def create_ena_assembly( config: ENAConfig, manifest_filename: str, center_name=None, test=True -) -> CreationResults: +) -> CreationResult: """ This is equivalent to running: webin-cli -username {params.ena_submission_username} -password {params.ena_submission_password} @@ -338,7 +346,7 @@ def create_ena_assembly( ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) lines = response.stdout.splitlines() erz_accession = None @@ -355,15 +363,18 @@ def create_ena_assembly( ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) assembly_results = { "erz_accession": erz_accession, } - return CreationResults(results=assembly_results, errors=errors, warnings=warnings) + return CreationResult(result=assembly_results, errors=errors, warnings=warnings) -def check_ena(config: ENAConfig, erz_accession: str, segment_order: list[str]) -> CreationResults: +def get_ena_analysis_process( + config: ENAConfig, erz_accession: str, segment_order: list[str] +) -> CreationResult: """ + Weird name "process" instead of "processing_result" is to match the ENA API. This is equivalent to running: curl -X 'GET' \ '{config.ena_reports_service_url}/analysis-process/{erz_accession}?format=json&max-results=100' \ @@ -374,7 +385,7 @@ def check_ena(config: ENAConfig, erz_accession: str, segment_order: list[str]) - errors = [] warnings = [] - assembly_results = {"segment_order": segment_order} + assembly_results = {"segment_order": segment_order, "erz_accession": erz_accession} response = requests.get( url, @@ -389,12 +400,12 @@ def check_ena(config: ENAConfig, erz_accession: str, segment_order: list[str]) - ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) if response.text == "[]": # For some minutes the response will be empty, requests to # f"{config.ena_reports_service_url}/analysis-files/{erz_accession}?format=json" # should still succeed - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) try: parsed_response = json.loads(response.text) entry = parsed_response[0]["report"] @@ -403,48 +414,21 @@ def check_ena(config: ENAConfig, erz_accession: str, segment_order: list[str]) - if entry["processingStatus"] == "COMPLETED": acc_list = entry["acc"].split(",") acc_dict = {a.split(":")[0]: a.split(":")[-1] for a in acc_list} - if "genome" not in acc_dict: - logger.error("Unexpected response format: genome not in acc_dict") - raise requests.exceptions.RequestException - gca_accession = acc_dict["genome"] - if "chromosomes" not in acc_dict: - logger.error("Unexpected response format: chromosome not in acc_dict") - raise requests.exceptions.RequestException - insdc_accession_range = acc_dict["chromosomes"] - if len(segment_order) == 1 and len(insdc_accession_range.split("-")) == 0: - assembly_results["insdc_accession"] = insdc_accession_range - else: - start_letters = insdc_accession_range.split("-")[0][:2] - start_digit = 10 ** ( - len(insdc_accession_range.split("-")[0]) - 2 - ) # after letters accession can start with 0 - insdc_accession_start_int = start_digit + int( - insdc_accession_range.split("-")[0][2:] + gca_accession = acc_dict.get("genome") + if gca_accession: + assembly_results.update( + { + "gca_accession": gca_accession, + } ) - insdc_accession_end_int = start_digit + int( - insdc_accession_range.split("-")[-1][2:] + insdc_accession_range = acc_dict.get("chromosomes") + if insdc_accession_range: + chromosome_accessions_dict = get_chromsome_accessions( + insdc_accession_range, segment_order ) - if insdc_accession_end_int - insdc_accession_start_int != len(segment_order) - 1: - logger.error( - "Unexpected response format: chromosome does not have expected number of segments" - ) - raise requests.exceptions.RequestException - insdc_accession_base_dict = { - ("insdc_accession_" + segment): ( - start_letters + str(insdc_accession_start_int + i)[1:] - ) - for i, segment in enumerate(segment_order) - } - insdc_accession_full_dict = { - ("insdc_accession_full_" + segment): ( - start_letters + str(insdc_accession_start_int + i)[1:] + ".1" - ) - for i, segment in enumerate(segment_order) - } # set version to 1 by default - assembly_results.update(insdc_accession_base_dict) - assembly_results.update(insdc_accession_full_dict) + assembly_results.update(chromosome_accessions_dict) else: - return CreationResults(results=None, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) except: error_message = ( f"ENA Check returned errors or is in unexpected format. " @@ -452,11 +436,62 @@ def check_ena(config: ENAConfig, erz_accession: str, segment_order: list[str]) - ) logger.warning(error_message) errors.append(error_message) - return CreationResults(results=None, errors=errors, warnings=warnings) - assembly_results.update( - { - "erz_accession": erz_accession, - "gca_accession": gca_accession, - } - ) - return CreationResults(results=assembly_results, errors=errors, warnings=warnings) + return CreationResult(result=None, errors=errors, warnings=warnings) + return CreationResult(result=assembly_results, errors=errors, warnings=warnings) + + +# TODO: Also pass the full segment list from config so we can handle someone submitting +# a multi-segmented virus that has a main segment. This will require having one pipeline +# per organism, not one pipeline for all. Wider changes, thus. +def get_chromsome_accessions( + insdc_accession_range: str, segment_order: list[str] +) -> dict[str, str]: + """ + ENA doesn't actually give us the version, we assume it's 1. + ### Example inputs + insdc_accession_range: "OZ189935-OZ189936" + segment_order: ["segment1", "segment2"] + ### Example output + { + "insdc_accession_segment1": "OZ189935", + "insdc_accession_full_segment1": "OZ189935.1", + "insdc_accession_segment2": "OZ189936", + "insdc_accession_full_segment2": "OZ189936.1", + } + """ + try: + start, end = insdc_accession_range.split("-") + start_letters = start[:2] + end_letters = end[:2] + + if start_letters != end_letters: + raise ValueError("Prefixes in the accession range do not match") + + num_digits = len(start) - 2 + start_num = int(start[2:]) + end_num = int(end[2:]) + + if end_num - start_num != len(segment_order) - 1: + logger.error( + "Unexpected response format: chromosome does not have expected number of segments" + ) + raise ValueError("Unexpected number of segments") + + match segment_order: + case ["main"]: + accession = f"{start_letters}{start_num:0{num_digits}d}" + return { + "insdc_accession": accession, + "insdc_accession_full": f"{accession}.1", + } + case _: + results = {} + for i, segment in enumerate(segment_order): + accession = f"{start_letters}{(start_num + i):0{num_digits}d}" + results[f"insdc_accession_{segment}"] = accession + results[f"insdc_accession_full_{segment}"] = f"{accession}.1" + return results + + except Exception as e: + logger.error(f"Error processing chromosome accessions: {str(e)}") + raise ValueError("Failed to process chromosome accessions") from e diff --git a/ena-submission/scripts/ena_types.py b/ena-submission/scripts/ena_types.py index 2f8043649..e08ae5e7f 100644 --- a/ena-submission/scripts/ena_types.py +++ b/ena-submission/scripts/ena_types.py @@ -245,3 +245,23 @@ class AssemblyChromosomeListFileObject: @dataclass class AssemblyChromosomeListFile: chromosomes: list[AssemblyChromosomeListFileObject] + + +@dataclass +class Hold: + HoldUntilDate: XmlAttribute | None = None + + +@dataclass +class Action: + add: str | None = None + hold: Hold | None = None + + +@dataclass +class Actions: + action: list[Action] + +@dataclass +class Submission: + actions: Actions diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index 70667ae4e..8b277ac87 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -21,9 +21,10 @@ create_fasta, create_manifest, dataclass_to_xml, + get_chromsome_accessions, + get_ena_analysis_process, ) from ena_types import default_project_type, default_sample_type -from requests import exceptions # Default configs with open("config/defaults.yaml", encoding="utf-8") as f: @@ -62,6 +63,9 @@ def mock_config(): test_sample_xml_request = Path("test/test_sample_request.xml").read_text(encoding="utf-8") test_sample_xml_response = Path("test/test_sample_response.xml").read_text(encoding="utf-8") +process_response_text = Path("test/get_ena_analysis_process_response.json").read_text( + encoding="utf-8" +) # Test sample @@ -108,7 +112,7 @@ def test_create_project_success(self, mock_post) -> None: "bioproject_accession": "PRJEB20767", "ena_submission_accession": "ERA912529", } - self.assertEqual(response.results, desired_response) + self.assertEqual(response.result, desired_response) @mock.patch("requests.post") def test_create_project_xml_failure(self, mock_post): @@ -151,7 +155,7 @@ def test_create_sample_success(self, mock_post): "biosample_accession": "SAMEA104174130", "ena_submission_accession": "ERA979927", } - self.assertEqual(response.results, desired_response) + self.assertEqual(response.result, desired_response) def test_sample_set_construction(self): config = mock_config() @@ -266,6 +270,66 @@ def test_create_manifest(self): self.assertEqual(data, expected_data) + def test_get_chromsome_accessions(self): + insdc_accession_range = "OZ189935-OZ189936" + segment_order = ["seg2", "seg3"] + result_multi = get_chromsome_accessions(insdc_accession_range, segment_order) + self.assertEqual( + result_multi, + { + "insdc_accession_seg2": "OZ189935", + "insdc_accession_seg3": "OZ189936", + "insdc_accession_full_seg2": "OZ189935.1", + "insdc_accession_full_seg3": "OZ189936.1", + }, + ) + + insdc_accession_range = "OZ189935-OZ189935" + segment_order = ["main"] + result_single = get_chromsome_accessions(insdc_accession_range, segment_order) + self.assertEqual( + result_single, + { + "insdc_accession": "OZ189935", + "insdc_accession_full": "OZ189935.1", + }, + ) + + insdc_accession_range = "OZ189935-OZ189935" + segment_order = ["seg3"] + result_single = get_chromsome_accessions(insdc_accession_range, segment_order) + self.assertEqual( + result_single, + { + "insdc_accession_seg3": "OZ189935", + "insdc_accession_full_seg3": "OZ189935.1", + }, + ) + + insdc_accession_range = "OZ189935-OZ189936" + segment_order = ["main"] + with self.assertRaises(ValueError): + get_chromsome_accessions(insdc_accession_range, segment_order) + + insdc_accession_range = "OZ189935-TK189936" + segment_order = ["A", "B"] + with self.assertRaises(ValueError): + get_chromsome_accessions(insdc_accession_range, segment_order) + + @mock.patch("requests.get") + def test_get_ena_analysis_process(self, mock_post): + mock_post.return_value = mock_requests_post(200, process_response_text) + response = get_ena_analysis_process( + test_ena_config, erz_accession="ERZ000001", segment_order=["main"] + ) + desired_response = { + "erz_accession": "ERZ000001", + "insdc_accession": "OZ189999", + "insdc_accession_full": "OZ189999.1", + "segment_order": ["main"], + } + self.assertEqual(response.result, desired_response) + if __name__ == "__main__": unittest.main() diff --git a/ena-submission/test/get_ena_analysis_process_response.json b/ena-submission/test/get_ena_analysis_process_response.json new file mode 100644 index 000000000..719270b9b --- /dev/null +++ b/ena-submission/test/get_ena_analysis_process_response.json @@ -0,0 +1 @@ +[{"report":{"id":"ERZ24879999","analysisType":"SEQUENCE_ASSEMBLY","acc":"chromosomes:OZ189999-OZ189999","processingStatus":"COMPLETED","processingStart":"27-09-2024 06:06:40","processingEnd":"27-09-2024 06:07:06","processingError":null},"links":[]}] \ No newline at end of file diff --git a/ingest/README.md b/ingest/README.md index 6669f8ad0..94f6380e7 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -140,7 +140,14 @@ The ingest pipeline requires config files, found in the directory `config`. The ## Testing -Currently, there is not automated testing other than running the pipeline manually and in preview deployments. +Tests can be found in the `tests` folder, they can be run using + +```sh +micromamba activate loculus-ingest +pytest tests/ +``` + +`.github/workflows/ingest-tests.yaml` runs these tests, ## Roadmap diff --git a/ingest/Snakefile b/ingest/Snakefile index e9e788927..ca8da1ba4 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -26,6 +26,7 @@ LOG_LEVEL = config.get("log_level", "INFO") NCBI_API_KEY = os.getenv("NCBI_API_KEY") FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None) APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes +CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False) def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING): @@ -95,6 +96,49 @@ rule rename_columns: ) +if CHECK_ENA_DEPOSITION: + + rule get_loculus_depositions: + input: + script="scripts/get_loculus_depositions.py", + config="results/config.yaml", + output: + exclude_insdc_accessions="results/insdc_accessions_to_exclude.tsv", + exclude_biosample_accessions="results/biosample_accessions_to_exclude.tsv", + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --output-insdc-accessions {output.exclude_insdc_accessions} \ + --output-biosample-accessions {output.exclude_biosample_accessions} \ + --log-level {params.log_level} \ + --config-file {input.config} \ + """ + + rule filter_out_loculus_depositions: + input: + ncbi_dataset_tsv="results/metadata_post_rename.tsv", + exclude_biosample_accessions="results/biosample_accessions_to_exclude.tsv", + exclude_insdc_accessions="results/insdc_accessions_to_exclude.tsv", + script="scripts/filter_out_depositions.py", + config="results/config.yaml", + output: + metadata_tsv="results/filtered_metadata.tsv", + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --input-metadata-tsv {input.ncbi_dataset_tsv} \ + --exclude-insdc-accessions {input.exclude_insdc_accessions} \ + --exclude-biosample-accessions {input.exclude_biosample_accessions} \ + --log-level {params.log_level} \ + --config-file {input.config} \ + --output-metadata-tsv {output.metadata_tsv} \ + """ + + rule extract_ncbi_dataset_sequences: """ For unsegmented sequences, we only keep the sequence ID in the header. @@ -210,7 +254,11 @@ rule prepare_metadata: # Transform Genbank metadata keys and values to Loculus format input: script="scripts/prepare_metadata.py", - metadata="results/metadata_post_rename.tsv", + metadata=( + "results/filtered_metadata.tsv" + if CHECK_ENA_DEPOSITION + else "results/metadata_post_rename.tsv" + ), segments="results/nextclade_merged.tsv" if SEGMENTED else "results/config.yaml", # else is just a dummy sequence_hashes="results/sequence_hashes.ndjson", config="results/config.yaml", diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index c4ed19c4a..9a7ca754f 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -76,3 +76,6 @@ password: insdc_ingest_user keycloak_client_id: backend-client subsample_fraction: 1.0 approve_timeout_min: "25" # Cronjobs run every 30min, make approve stop before it is forced to stop by argocd +db_username: postgres +db_password: unsecure +db_url: "jdbc:postgresql://127.0.0.1:5432/loculus" diff --git a/ingest/environment.yml b/ingest/environment.yml index 9d98dcf5b..505fcddd2 100644 --- a/ingest/environment.yml +++ b/ingest/environment.yml @@ -14,6 +14,7 @@ dependencies: - nextclade >=3.7.0 - orjsonl - pandas + - psycopg2 - PyYAML - requests - seqkit diff --git a/ingest/scripts/filter_out_depositions.py b/ingest/scripts/filter_out_depositions.py new file mode 100644 index 000000000..04e288152 --- /dev/null +++ b/ingest/scripts/filter_out_depositions.py @@ -0,0 +1,87 @@ +import logging +from dataclasses import dataclass + +import click +import pandas as pd +import yaml + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.DEBUG, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + segmented: bool + db_password: str + db_username: str + db_host: str + + +@click.command() +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +@click.option( + "--config-file", + required=True, + type=click.Path(exists=True), +) +@click.option( + "--input-metadata-tsv", + required=True, + type=click.Path(exists=True), +) +@click.option( + "--output-metadata-tsv", + required=True, + type=click.Path(), +) +@click.option( + "--exclude-insdc-accessions", + required=True, + type=click.Path(), +) +@click.option( + "--exclude-biosample-accessions", + required=True, + type=click.Path(), +) +def filter_out_depositions( + log_level, + config_file, + input_metadata_tsv, + output_metadata_tsv, + exclude_insdc_accessions, + exclude_biosample_accessions, +): + logger.setLevel(log_level) + logging.getLogger("requests").setLevel(logging.INFO) + + with open(config_file, encoding="utf-8") as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + logger.info(f"Config: {config}") + df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False) + original_count = len(df) + with open(exclude_insdc_accessions, encoding="utf-8") as f: + loculus_insdc_accessions = [line.strip() for line in f] + + with open(exclude_biosample_accessions, encoding="utf-8") as f: + loculus_biosample_accessions = [line.strip() for line in f] + + filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)] + filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)] + logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.") + filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False) + + +if __name__ == "__main__": + filter_out_depositions() diff --git a/ingest/scripts/get_loculus_depositions.py b/ingest/scripts/get_loculus_depositions.py new file mode 100644 index 000000000..b83e0cedf --- /dev/null +++ b/ingest/scripts/get_loculus_depositions.py @@ -0,0 +1,161 @@ +import logging +import os +import re +from dataclasses import dataclass + +import click +import yaml +from psycopg2.extras import RealDictCursor +from psycopg2.pool import SimpleConnectionPool + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.DEBUG, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + segmented: bool + db_password: str + db_username: str + db_host: str + + +def convert_jdbc_to_psycopg2(jdbc_url): + jdbc_pattern = r"jdbc:postgresql://(?P[^:/]+)(?::(?P\d+))?/(?P[^?]+)" + + match = re.match(jdbc_pattern, jdbc_url) + + if not match: + msg = "Invalid JDBC URL format." + raise ValueError(msg) + + host = match.group("host") + port = match.group("port") or "5432" # Default to 5432 if no port is provided + dbname = match.group("dbname") + + return f"postgresql://{host}:{port}/{dbname}" + + +def db_init( + db_password_default: str, db_username_default: str, db_url_default: str +) -> SimpleConnectionPool: + db_password = os.getenv("DB_PASSWORD") + if not db_password: + db_password = db_password_default + + db_username = os.getenv("DB_USERNAME") + if not db_username: + db_username = db_username_default + + db_url = os.getenv("DB_URL") + if not db_url: + db_url = db_url_default + + db_dsn = convert_jdbc_to_psycopg2(db_url) + "?options=-c%20search_path%3Dena_deposition_schema" + return SimpleConnectionPool( + minconn=1, + maxconn=2, # max 7*2 connections to db allowed + user=db_username, + password=db_password, + dsn=db_dsn, + ) + + +def get_bio_sample_accessions(db_conn_pool: SimpleConnectionPool) -> dict[str, str]: + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + # Result is a jsonb column + query = "SELECT accession, result FROM sample_table WHERE STATUS = 'SUBMITTED'" + + cur.execute(query) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return {result["accession"]: result["result"]["biosample_accession"] for result in results} + + +def get_insdc_accessions(db_conn_pool: SimpleConnectionPool) -> dict[str, str]: + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + # Result is a jsonb column + query = "SELECT accession, result FROM assembly_table WHERE STATUS IN ('SUBMITTED', 'WAITING')" + + cur.execute(query) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return { + result["accession"]: [ + result["result"][key] + for key in result["result"] + if key.startswith("insdc_accession_full") + ] + for result in results + } + + +@click.command() +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +@click.option( + "--config-file", + required=True, + type=click.Path(exists=True), +) +@click.option( + "--output-insdc-accessions", + required=True, + type=click.Path(), +) +@click.option( + "--output-biosample-accessions", + required=True, + type=click.Path(), +) +def get_loculus_depositions( + log_level, config_file, output_insdc_accessions, output_biosample_accessions +): + logger.setLevel(log_level) + logging.getLogger("requests").setLevel(logging.INFO) + + with open(config_file, encoding="utf-8") as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + logger.info(f"Config: {config}") + + db_config = db_init(config.db_password, config.db_username, config.db_host) + insdc_accessions_submitted_by_loculus = get_insdc_accessions(db_config) + all_insdc_accessions_submitted_by_loculus = [ + item for sublist in insdc_accessions_submitted_by_loculus.values() for item in sublist + ] + logger.debug(f"Assembly accessions to filter out: {all_insdc_accessions_submitted_by_loculus}") + biosample_accessions_submitted_by_loculus = get_bio_sample_accessions(db_config) + logger.debug( + f"Biosample accessions to filter out: {biosample_accessions_submitted_by_loculus.values()}" + ) + + with open(output_insdc_accessions, "w", encoding="utf-8") as f: + for item in all_insdc_accessions_submitted_by_loculus: + f.write(f"{item}\n") + with open(output_biosample_accessions, "w", encoding="utf-8") as f: + for item in biosample_accessions_submitted_by_loculus.values(): + f.write(f"{item}\n") + + +if __name__ == "__main__": + get_loculus_depositions() diff --git a/ingest/tests/config_cchf/config.yaml b/ingest/tests/config_cchf/config.yaml index bd7fdb63f..2ae38b880 100644 --- a/ingest/tests/config_cchf/config.yaml +++ b/ingest/tests/config_cchf/config.yaml @@ -27,3 +27,4 @@ rename: ncbiSubmitterAffiliation: authorAffiliations ncbiSubmitterNames: authors taxon_id: 3052518 +check_ena_deposition: true diff --git a/ingest/tests/test_data_cchf/biosample_accessions_to_exclude.tsv b/ingest/tests/test_data_cchf/biosample_accessions_to_exclude.tsv new file mode 100644 index 000000000..a6bf3ded4 --- /dev/null +++ b/ingest/tests/test_data_cchf/biosample_accessions_to_exclude.tsv @@ -0,0 +1 @@ +SAMEA131392145 diff --git a/ingest/tests/test_data_cchf/insdc_accessions_to_exclude.tsv b/ingest/tests/test_data_cchf/insdc_accessions_to_exclude.tsv new file mode 100644 index 000000000..81277edc9 --- /dev/null +++ b/ingest/tests/test_data_cchf/insdc_accessions_to_exclude.tsv @@ -0,0 +1 @@ +INSDC001.1 diff --git a/ingest/tests/test_data_cchf/metadata_post_extract.tsv b/ingest/tests/test_data_cchf/metadata_post_extract.tsv index 5077fac22..5681090d4 100644 --- a/ingest/tests/test_data_cchf/metadata_post_extract.tsv +++ b/ingest/tests/test_data_cchf/metadata_post_extract.tsv @@ -5,3 +5,4 @@ KX013464.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true KX096703.1 PARTIAL 1 Kazakhstan: Sairam district Asia Hyalomma anatolicum 176092 true 2015 tick pool #134 513 1 2016-04-30T00:00:00Z GenBank Public Health England, Research Deryabin,P.,Atshabar,B.,Sansyzbaev,Y.,Berezin,V.,Nurmakhanov,T.,Yeskhojayev,O.,Vilkova,A.,Shevtsov,A.,Hewson,R.,Atkinson,B. 2016-04-30T00:00:00Z Orthonairovirus haemorrhagiae 3052518 KX013483.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 12098 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 KX013485.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 +INSDC001.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index d011b78f6..d469efb79 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -62,6 +62,7 @@ def test_snakemake(): destination_directory = CONFIG_DIR source_directory = TEST_DATA_DIR / "config_cchf" copy_files(source_directory, destination_directory) + run_snakemake("get_loculus_depositions", touch=True) # Do not call_loculus run_snakemake("group_segments") run_snakemake("get_previous_submissions", touch=True) # Do not call_loculus run_snakemake("compare_hashes") diff --git a/kubernetes/loculus/templates/_common-metadata.tpl b/kubernetes/loculus/templates/_common-metadata.tpl index 213d7ef5e..4703e873f 100644 --- a/kubernetes/loculus/templates/_common-metadata.tpl +++ b/kubernetes/loculus/templates/_common-metadata.tpl @@ -39,21 +39,21 @@ fields: generateIndex: true autocomplete: true header: Submission details - - name: groupId - displayName: Group ID - type: int + - name: groupName + type: string + generateIndex: true autocomplete: true header: Submission details displayName: Submitting group customDisplay: type: submittingGroup displayGroup: group - - name: groupName - type: string - generateIndex: true + - name: groupId + displayName: Group ID + type: int autocomplete: true header: Submission details - displayName: Submitting group + displayName: Submitting group (numeric ID) customDisplay: type: submittingGroup displayGroup: group @@ -95,6 +95,10 @@ fields: type: string notSearchable: true hideOnSequenceDetailsPage: true + - name: versionComment + type: string + displayName: Version comment + header: Submission details {{- if $.Values.dataUseTermsUrls }} - name: dataUseTermsUrl displayName: Data use terms URL @@ -132,6 +136,9 @@ fields: {{- define "loculus.generateWebsiteConfig" }} name: {{ quote $.Values.name }} logo: {{ $.Values.logo | toYaml | nindent 6 }} +{{ if $.Values.gitHubMainUrl }} +gitHubMainUrl: {{ quote $.Values.gitHubMainUrl }} +{{ end }} {{ if $.Values.bannerMessage }} bannerMessage: {{ quote $.Values.bannerMessage }} {{ else if or $.Values.runDevelopmentMainDatabase $.Values.runDevelopmentKeycloakDatabase }} diff --git a/kubernetes/loculus/templates/ingest-config.yaml b/kubernetes/loculus/templates/ingest-config.yaml index 85786a24c..7118fc6a6 100644 --- a/kubernetes/loculus/templates/ingest-config.yaml +++ b/kubernetes/loculus/templates/ingest-config.yaml @@ -14,6 +14,7 @@ data: config.yaml: | {{- $values.ingest.configFile | toYaml | nindent 4 }} verify_loculus_version_is: {{$dockerTag}} + check_ena_deposition: {{ not $.Values.disableEnaSubmission }} organism: {{ $key }} backend_url: {{ $backendHost }} keycloak_token_url: {{ $keycloakHost -}}/realms/loculus/protocol/openid-connect/token @@ -25,4 +26,4 @@ data: {{- end -}} {{- end }} {{- end }} -{{- end }} \ No newline at end of file +{{- end }} diff --git a/kubernetes/loculus/templates/ingest-deployment.yaml b/kubernetes/loculus/templates/ingest-deployment.yaml index 2545563d3..f0ee7fe44 100644 --- a/kubernetes/loculus/templates/ingest-deployment.yaml +++ b/kubernetes/loculus/templates/ingest-deployment.yaml @@ -56,6 +56,21 @@ spec: secretKeyRef: name: service-accounts key: insdcIngestUserPassword + - name: DB_URL + valueFrom: + secretKeyRef: + name: database + key: url + - name: DB_USERNAME + valueFrom: + secretKeyRef: + name: database + key: username + - name: DB_PASSWORD + valueFrom: + secretKeyRef: + name: database + key: password args: - snakemake - results/submitted diff --git a/kubernetes/loculus/templates/keycloak-config-map.yaml b/kubernetes/loculus/templates/keycloak-config-map.yaml index 959ac42f3..392888486 100644 --- a/kubernetes/loculus/templates/keycloak-config-map.yaml +++ b/kubernetes/loculus/templates/keycloak-config-map.yaml @@ -114,7 +114,7 @@ data: ] } }, - {{ end }} + {{ end }} { "username": "insdc_ingest_user", "enabled": true, @@ -260,8 +260,7 @@ data: "http://localhost:3000/*" ] }, - { - + { "clientId" : "account-console2", "name" : "${client_account-console}", "description" : "", @@ -319,7 +318,7 @@ data: , "userProfileEnabled" : "true" }, "components": { - "org.keycloak.userprofile.UserProfileProvider" : [ { + "org.keycloak.userprofile.UserProfileProvider" : [ { "providerId" : "declarative-user-profile", "subComponents" : { }, "config" : { diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 84bd1242a..195cc9067 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1059,9 +1059,6 @@ defaultOrganismConfig: &defaultOrganismConfig type: percentage preprocessing: inputs: {input: nextclade.coverage} - - name: versionComment - displayName: Version comment - header: Submission details website: &website tableColumns: - sampleCollectionDate @@ -1525,6 +1522,7 @@ replicas: website: 1 backend: 1 gitHubEditLink: "https://github.com/loculus-project/loculus/edit/main/monorepo/website/" +gitHubMainUrl: https://github.com/loculus-project/loculus resources: website: requests: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 2c6474e75..6f01c0e1d 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -31,6 +31,7 @@ ProcessedData, ProcessedEntry, ProcessedMetadata, + ProcessedMetadataValue, ProcessingAnnotation, ProcessingResult, ProcessingSpec, @@ -654,6 +655,31 @@ def process_single( ) +def processed_entry_with_errors(id): + return ProcessedEntry( + accession=accession_from_str(id), + version=version_from_str(id), + data=ProcessedData( + metadata=defaultdict(dict[str, ProcessedMetadataValue]), + unalignedNucleotideSequences=defaultdict(dict[str, Any]), + alignedNucleotideSequences=defaultdict(dict[str, Any]), + nucleotideInsertions=defaultdict(dict[str, Any]), + alignedAminoAcidSequences=defaultdict(dict[str, Any]), + aminoAcidInsertions=defaultdict(dict[str, Any]), + ), + errors=[ + ProcessingAnnotation( + source=[AnnotationSource(name="unknown", type=AnnotationSourceType.METADATA)], + message=( + f"Failed to process submission with id: {id} - please review your submission " + "or reach out to an administrator if this error persists." + ), + ) + ], + warnings=[], + ) + + def process_all( unprocessed: Sequence[UnprocessedEntry], dataset_dir: str, config: Config ) -> Sequence[ProcessedEntry]: @@ -661,11 +687,19 @@ def process_all( if config.nextclade_dataset_name: nextclade_results = enrich_with_nextclade(unprocessed, dataset_dir, config) for id, result in nextclade_results.items(): - processed_single = process_single(id, result, config) + try: + processed_single = process_single(id, result, config) + except Exception as e: + logging.error(f"Processing failed for {id} with error: {e}") + processed_single = processed_entry_with_errors(id) processed_results.append(processed_single) else: for entry in unprocessed: - processed_single = process_single(entry.accessionVersion, entry.data, config) + try: + processed_single = process_single(entry.accessionVersion, entry.data, config) + except Exception as e: + logging.error(f"Processing failed for {id} with error: {e}") + processed_single = processed_entry_with_errors(id) processed_results.append(processed_single) return processed_results diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 780764d66..d961461c7 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -38,6 +38,13 @@ def standardize_option(option): return " ".join(option.lower().split()) +def invalid_value_annotation(input_datum, output_field, value_type) -> ProcessingAnnotation: + return ProcessingAnnotation( + source=[AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)], + message=f"Invalid {value_type} value: {input_datum} for field {output_field}.", + ) + + class ProcessingFunctions: @classmethod def call_function( @@ -57,6 +64,24 @@ def call_function( f"with input {input_data} and args {args}: {e}" ) logger.exception(message) + return ProcessingResult( + datum=None, + warnings=[], + errors=[ + ProcessingAnnotation( + source=[ + AnnotationSource( + name=output_field, type=AnnotationSourceType.METADATA + ) + ], + message=( + f"Internal Error: Function {function_name} did not return " + f"ProcessingResult with input {input_data} and args {args}, " + "please contact the administrator." + ), + ) + ], + ) if isinstance(result, ProcessingResult): return result # Handle unexpected case where a called function does not return a ProcessingResult @@ -138,6 +163,7 @@ def check_date( ], ) + # TODO: This function is specifically for collection date - maybe rename it to reflect that @staticmethod def process_date( input_data: InputMetadata, @@ -170,6 +196,7 @@ def process_date( warnings = [] errors = [] + # TODO: required check is also in process_single - check if can be removed here if len(date_str) == 0: if args and args.get("required"): errors.append( @@ -337,30 +364,30 @@ def concatenate( ) formatted_input_data = [] - for i in range(len(order)): - if type[i] == "date": - processed = ProcessingFunctions.process_date( - {"date": input_data[order[i]]}, output_field - ) - formatted_input_data.append("" if processed.datum is None else processed.datum) - errors += processed.errors - warnings += processed.warnings - elif type[i] == "timestamp": - processed = ProcessingFunctions.parse_timestamp( - {"timestamp": input_data[order[i]]}, output_field - ) - formatted_input_data.append("" if processed.datum is None else processed.datum) - errors += processed.errors - warnings += processed.warnings - elif order[i] in input_data: - formatted_input_data.append( - "" if input_data[order[i]] is None else input_data[order[i]] - ) - else: - formatted_input_data.append(accession_version) - logging.debug(f"formatted input data:{formatted_input_data}") - try: + for i in range(len(order)): + if type[i] == "date": + processed = ProcessingFunctions.process_date( + {"date": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) + errors += processed.errors + warnings += processed.warnings + elif type[i] == "timestamp": + processed = ProcessingFunctions.parse_timestamp( + {"timestamp": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) + errors += processed.errors + warnings += processed.warnings + elif order[i] in input_data: + formatted_input_data.append( + "" if input_data[order[i]] is None else input_data[order[i]] + ) + else: + formatted_input_data.append(accession_version) + logging.debug(f"formatted input data:{formatted_input_data}") + result = "/".join(formatted_input_data) # To avoid downstream issues do not let the result start or end in a "/" # Also replace white space with '_' @@ -374,7 +401,10 @@ def concatenate( source=[ AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) ], - message="Concatenation failed. This is a technical error, please contact the administrator.", + message=( + f"Concatenation failed for {output_field}. This is a technical error, " + "please contact the administrator." + ), ) ) return ProcessingResult( @@ -405,14 +435,24 @@ def identity( if not input_datum: return ProcessingResult(datum=None, warnings=[], errors=[]) - warnings: list[ProcessingAnnotation] = [] + errors: list[ProcessingAnnotation] = [] output_datum: ProcessedMetadataValue if args and "type" in args: match args["type"]: case "int": - output_datum = int(input_datum) + try: + output_datum = int(input_datum) + except ValueError: + output_datum = None + errors.append(invalid_value_annotation(input_datum, output_field, "int")) case "float": - output_datum = float(input_datum) + try: + output_datum = float(input_datum) + except ValueError: + output_datum = None + errors.append( + invalid_value_annotation(input_datum, output_field, "float") + ) case "boolean": if input_datum.lower() == "true": output_datum = True @@ -420,21 +460,14 @@ def identity( output_datum = False else: output_datum = None - warnings.append( - ProcessingAnnotation( - source=[ - AnnotationSource( - name=output_field, type=AnnotationSourceType.METADATA - ) - ], - message=f"Invalid boolean value: {input_datum}. Defaulting to null.", - ) + errors.append( + invalid_value_annotation(input_datum, output_field, "boolean") ) case _: output_datum = input_datum else: output_datum = input_datum - return ProcessingResult(datum=output_datum, warnings=warnings, errors=[]) + return ProcessingResult(datum=output_datum, warnings=[], errors=errors) @staticmethod def process_options( diff --git a/website/src/components/Navigation/Navigation.astro b/website/src/components/Navigation/Navigation.astro index 97e197210..1337ce7f1 100644 --- a/website/src/components/Navigation/Navigation.astro +++ b/website/src/components/Navigation/Navigation.astro @@ -6,9 +6,10 @@ import { getAuthUrl } from '../../utils/getAuthUrl'; interface Props { implicitOrganism?: string; + gitHubMainUrl?: string; } -const { implicitOrganism } = Astro.props; +const { implicitOrganism, gitHubMainUrl } = Astro.props; const { organism, knownOrganisms } = cleanOrganism(Astro.params.organism); const selectedOrganism = implicitOrganism !== undefined ? knownOrganisms.find((it) => it.key === implicitOrganism) : organism; @@ -35,6 +36,12 @@ const loginUrl = await getAuthUrl(Astro.url.toString()); top: '-2rem', }} > - + diff --git a/website/src/components/Navigation/SandwichMenu.tsx b/website/src/components/Navigation/SandwichMenu.tsx index cb0f927ca..800a35ed6 100644 --- a/website/src/components/Navigation/SandwichMenu.tsx +++ b/website/src/components/Navigation/SandwichMenu.tsx @@ -10,9 +10,10 @@ type SandwichMenuProps = { organism: Organism | undefined; isLoggedIn: boolean; loginUrl: string | undefined; + gitHubMainUrl: string | undefined; }; -export const SandwichMenu: FC = ({ organism, isLoggedIn, loginUrl }) => { +export const SandwichMenu: FC = ({ organism, isLoggedIn, loginUrl, gitHubMainUrl }) => { const { isOpen, toggle: toggleMenu, close: closeMenu } = useOffCanvas(); return ( @@ -50,7 +51,11 @@ export const SandwichMenu: FC = ({ organism, isLoggedIn, logi
diff --git a/website/src/layouts/BaseLayout.astro b/website/src/layouts/BaseLayout.astro index f3c60216f..719f5cb10 100644 --- a/website/src/layouts/BaseLayout.astro +++ b/website/src/layouts/BaseLayout.astro @@ -11,7 +11,7 @@ import { navigationItems } from '../routes/navigationItems'; import { BackendClient } from '../services/backendClient'; const websiteConfig = getWebsiteConfig(); -const { name: websiteName, logo, bannerMessage, additionalHeadHTML } = websiteConfig; +const { name: websiteName, logo, bannerMessage, additionalHeadHTML, gitHubMainUrl } = websiteConfig; interface Props { title: string; @@ -72,7 +72,7 @@ const lastTimeBannerWasClosed = Astro.cookies.get('lastTimeBannerWasClosed')?.va
- +
@@ -95,7 +95,10 @@ const lastTimeBannerWasClosed = Astro.cookies.get('lastTimeBannerWasClosed')?.va } - + github-icon diff --git a/website/src/types/config.ts b/website/src/types/config.ts index 12438d9b0..dc674b18c 100644 --- a/website/src/types/config.ts +++ b/website/src/types/config.ts @@ -117,6 +117,7 @@ export const websiteConfig = z.object({ bannerMessage: z.string().optional(), additionalHeadHTML: z.string().optional(), gitHubEditLink: z.string().optional(), + gitHubMainUrl: z.string().optional(), }); export type WebsiteConfig = z.infer;