From a637799fa9bf8c4e4f62d32e37f49b1bd0739c4b Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:08:02 +0200 Subject: [PATCH 01/19] Add concatenate function to preprocessing. --- .../templates/_preprocessingFromValues.tpl | 6 ++ kubernetes/loculus/values.yaml | 12 ++- .../src/loculus_preprocessing/prepro.py | 98 ++++++++++++------- .../processing_functions.py | 67 +++++++++++++ 4 files changed, 143 insertions(+), 40 deletions(-) diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index 7ce408ae0..05d8c8247 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -7,6 +7,12 @@ {{- if .type }} type: {{ .type }} {{- end }} + {{- if .order }} + order: + {{- range .order }} + - {{ . }} + {{- end }} + {{- end }} {{- if .preprocessing }} {{- if hasKey .preprocessing "function" }} function: {{ index .preprocessing "function" }} diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index b12e18720..7c0b4b329 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -64,6 +64,14 @@ defaultOrganismConfig: &defaultOrganismConfig inputs: date: sample_collection_date required: true + - name: display_name + order: [geo_loc_country, accession_version, sample_collection_date] + preprocessing: + function: concatenate + inputs: + string: geo_loc_country + date: sample_collection_date + noInput: true - name: ncbi_release_date displayName: NCBI release date type: date @@ -1193,5 +1201,5 @@ enableCrossRefCredentials: true runDevelopmentKeycloakDatabase: true runDevelopmentMainDatabase: true enforceHTTPS: true -registrationTermsMessage: > - You must agree to the terms of use. \ No newline at end of file +registrationTermsMessage: >- + You must agree to the terms of use. diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 07328e5d2..b40e01722 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -1,3 +1,4 @@ +import copy import csv import json import logging @@ -316,7 +317,53 @@ def null_per_backend(x: Any) -> bool: return False +def add_InputMetadata( + spec: ProcessingSpec, + unprocessed: UnprocessedAfterNextclade, + errors: list[ProcessingAnnotation], + input_data: InputMetadata, + arg_name: str, + input_path: str, +) -> InputMetadata: + input_data[arg_name] = None + # If field starts with "nextclade.", take from nextclade metadata + nextclade_prefix = "nextclade." + if input_path.startswith(nextclade_prefix): + segment = spec.args.get("segment", "main") + if unprocessed.nextcladeMetadata is None: + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource( + name="main", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ) + ], + message="Nucleotide sequence failed to align", + ) + ) + return input_data + sub_path = input_path[len(nextclade_prefix) :] + if segment in unprocessed.nextcladeMetadata: + input_data[arg_name] = str( + dpath.get( + unprocessed.nextcladeMetadata[segment], + sub_path, + separator=".", + default=None, + ) + ) + else: + input_data[arg_name] = None + return input_data + if input_path not in unprocessed.inputMetadata: + return input_data + input_data[arg_name] = unprocessed.inputMetadata[input_path] + return input_data + + def get_metadata( + id: AccessionVersion, spec: ProcessingSpec, output_field: str, unprocessed: UnprocessedAfterNextclade, @@ -324,47 +371,21 @@ def get_metadata( warnings: list[ProcessingAnnotation], ) -> ProcessingResult: input_data: InputMetadata = {} + args = {} if spec.args is None else copy.deepcopy(spec.args) for arg_name, input_path in spec.inputs.items(): - input_data[arg_name] = None - # If field starts with "nextclade.", take from nextclade metadata - nextclade_prefix = "nextclade." - if input_path.startswith(nextclade_prefix): - # Remove "nextclade." prefix - if spec.args is None: - spec.args = {} - segment = spec.args.get("segment", "main") - if unprocessed.nextcladeMetadata is None: - errors.append( - ProcessingAnnotation( - source=[ - AnnotationSource( - name="main", - type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - ) - ], - message="Nucleotide sequence failed to align", - ) - ) - continue - sub_path = input_path[len(nextclade_prefix) :] - if segment in unprocessed.nextcladeMetadata: - input_data[arg_name] = str( - dpath.get( - unprocessed.nextcladeMetadata[segment], - sub_path, - separator=".", - default=None, - ) - ) - else: - input_data[arg_name] = None - continue - if input_path not in unprocessed.inputMetadata: - continue - input_data[arg_name] = unprocessed.inputMetadata[input_path] + input_data = add_InputMetadata(spec, unprocessed, errors, input_data, arg_name, input_path) + if spec.function == "concatenate": + args["accession_version"] = id + filledin_order: InputMetadata = {} + for item in spec.args["order"]: + filledin_order = add_InputMetadata( + copy.deepcopy(spec), unprocessed, errors, filledin_order, item, item + ) + args["order"] = [filledin_order[item] for item in spec.args["order"]] + try: processing_result = ProcessingFunctions.call_function( - spec.function, spec.args, input_data, output_field + spec.function, args, input_data, output_field ) except Exception as e: msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}" @@ -403,6 +424,7 @@ def process_single( ) spec.args = {} if spec.args is None else spec.args processing_result = get_metadata( + id, spec, output_field, unprocessed, diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 7e03977f4..cd0d9fd9e 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -273,6 +273,73 @@ def parse_timestamp( errors=errors, ) + @staticmethod + def concatenate( + input_data: InputMetadata, output_field: str, args: FunctionArgs = None + ) -> ProcessingResult: + """Concatenates input fields with accession_version using the "/" separator in the order + specified by the order argument. + """ + warnings: list[ProcessingAnnotation] = [] + errors: list[ProcessingAnnotation] = [] + + number_fields = len(input_data.keys()) + 1 + + accession_version = args["accession_version"] + order = args["order"] + + # Check accessionVersion only exists once in the list: + if number_fields != len(order): + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message="Concatenation failed.", + ) + ) + return ProcessingResult( + datum=None, + warnings=warnings, + errors=errors, + ) + + formatted_input_data = {} + for key, item in input_data.items(): + if key == "date": + processed = ProcessingFunctions.process_date({key: item}, output_field) + formatted_input_data[item] = processed.datum + errors += processed.errors + warnings += processed.warnings + if key == "timestamp": + processed = ProcessingFunctions.parse_timestamp({key: item}, output_field) + formatted_input_data[item] = processed.datum + errors += processed.errors + warnings += processed.warnings + else: + formatted_input_data[item] = item + logging.debug(f"formatted input data:{formatted_input_data}") + + try: + concatenation_order = [formatted_input_data.get(i, accession_version) for i in order] + result = "/".join(concatenation_order) + + return ProcessingResult(datum=result, warnings=warnings, errors=errors) + except ValueError as e: + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message="Concatenation failed.", + ) + ) + return ProcessingResult( + datum=None, + errors=errors, + warnings=warnings, + ) + @staticmethod def identity( input_data: InputMetadata, output_field: str, args: FunctionArgs = None From 378ba280767f7a056fc0b1d45905d50b814f5453 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:13:07 +0200 Subject: [PATCH 02/19] Update values.yaml Add display_name to INSDC header just to see what it looks like --- kubernetes/loculus/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 7c0b4b329..45969c79c 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -65,6 +65,7 @@ defaultOrganismConfig: &defaultOrganismConfig date: sample_collection_date required: true - name: display_name + header: "INSDC" order: [geo_loc_country, accession_version, sample_collection_date] preprocessing: function: concatenate From 3410d7341bf5778b36fe5a29f74f11bc9d2ee13c Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:25:48 +0200 Subject: [PATCH 03/19] Update processing_functions.py Fix little bug --- .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index cd0d9fd9e..7a6caea8e 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -311,7 +311,7 @@ def concatenate( formatted_input_data[item] = processed.datum errors += processed.errors warnings += processed.warnings - if key == "timestamp": + elif key == "timestamp": processed = ProcessingFunctions.parse_timestamp({key: item}, output_field) formatted_input_data[item] = processed.datum errors += processed.errors From 87df9184f7e55d644962f630d06a6db928cf1bb9 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:39:52 +0200 Subject: [PATCH 04/19] Add displayNames below loculus accession. --- .../src/components/SequenceDetailsPage/DataTable.tsx | 3 +++ .../SequenceDetailsPage/getDataTableData.ts | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx index 825cf5885..948b49bf4 100644 --- a/website/src/components/SequenceDetailsPage/DataTable.tsx +++ b/website/src/components/SequenceDetailsPage/DataTable.tsx @@ -14,6 +14,9 @@ interface Props { const DataTableComponent: React.FC = ({ dataTableData, dataUseTermsHistory }) => { return (
+ {dataTableData.topmatter.displayName !== undefined && ( +
{dataTableData.topmatter.displayName}
+ )} {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts index 4f67cae84..d0726b7c3 100644 --- a/website/src/components/SequenceDetailsPage/getDataTableData.ts +++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts @@ -3,6 +3,7 @@ import type { TableDataEntry } from './types.ts'; export type DataTableData = { topmatter: { authors: string[] | undefined; + displayName: string | undefined; }; table: { header: string; @@ -14,6 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa const result: DataTableData = { topmatter: { authors: undefined, + displayName: undefined, }, table: [], }; @@ -33,6 +35,15 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa continue; } + if ( + result.topmatter.displayName === undefined && + entry.type.kind === 'metadata' && + entry.name === 'display_name' + ) { + result.topmatter.displayName = entry.value.toString(); + continue; + } + if (!tableHeaderMap.has(entry.header)) { tableHeaderMap.set(entry.header, []); } From 8655d9f03e4031b4ff697efd6f2eca6db88ef1f8 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:49:31 +0200 Subject: [PATCH 05/19] If displayName input is None use empty string instead. --- .../src/loculus_preprocessing/processing_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 7a6caea8e..c67d15198 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -308,12 +308,12 @@ def concatenate( for key, item in input_data.items(): if key == "date": processed = ProcessingFunctions.process_date({key: item}, output_field) - formatted_input_data[item] = processed.datum + formatted_input_data[item] = "" if processed.datum is None else processed.datum errors += processed.errors warnings += processed.warnings elif key == "timestamp": processed = ProcessingFunctions.parse_timestamp({key: item}, output_field) - formatted_input_data[item] = processed.datum + formatted_input_data[item] = "" if processed.datum is None else processed.datum errors += processed.errors warnings += processed.warnings else: From 14a3de63aec2aa3900692ad829afb1702b82494f Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Thu, 4 Jul 2024 17:54:36 +0200 Subject: [PATCH 06/19] Make displayName italics --- website/src/components/SequenceDetailsPage/DataTable.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx index 948b49bf4..0f4ced393 100644 --- a/website/src/components/SequenceDetailsPage/DataTable.tsx +++ b/website/src/components/SequenceDetailsPage/DataTable.tsx @@ -15,7 +15,7 @@ const DataTableComponent: React.FC = ({ dataTableData, dataUseTermsHistor return (
{dataTableData.topmatter.displayName !== undefined && ( -
{dataTableData.topmatter.displayName}
+
{dataTableData.topmatter.displayName}
)} {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
From b6b7e44fbbf80ffa63e650d27e02db1dce9f3f08 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 14:46:43 +0200 Subject: [PATCH 07/19] Little config updates --- ingest/config/config.yaml | 2 ++ kubernetes/loculus/values.yaml | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index c1ddc277e..4d48286e1 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -6,6 +6,8 @@ organism: ebola-zaire # taxon_id: 3052518 # backend_url: http://localhost:8079/ # keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token +# nextclade_dataset_name: nextstrain/cchfv/linked +# nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output # organism: cchf # nucleotide_sequences: # - M diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 45969c79c..00ff57967 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -1202,5 +1202,6 @@ enableCrossRefCredentials: true runDevelopmentKeycloakDatabase: true runDevelopmentMainDatabase: true enforceHTTPS: true -registrationTermsMessage: >- +registrationTermsMessage: > You must agree to the terms of use. + From 10ff3733cddb153827a843e9afe3d7da3340ee90 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:08:34 +0200 Subject: [PATCH 08/19] Function clean up --- .../src/loculus_preprocessing/prepro.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index b40e01722..fae0bff7d 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -317,15 +317,13 @@ def null_per_backend(x: Any) -> bool: return False -def add_InputMetadata( +def add_input_metadata( spec: ProcessingSpec, unprocessed: UnprocessedAfterNextclade, errors: list[ProcessingAnnotation], - input_data: InputMetadata, - arg_name: str, input_path: str, ) -> InputMetadata: - input_data[arg_name] = None + """Returns value of input_path in unprocessed metadata""" # If field starts with "nextclade.", take from nextclade metadata nextclade_prefix = "nextclade." if input_path.startswith(nextclade_prefix): @@ -342,10 +340,10 @@ def add_InputMetadata( message="Nucleotide sequence failed to align", ) ) - return input_data + return None sub_path = input_path[len(nextclade_prefix) :] if segment in unprocessed.nextcladeMetadata: - input_data[arg_name] = str( + return str( dpath.get( unprocessed.nextcladeMetadata[segment], sub_path, @@ -353,13 +351,10 @@ def add_InputMetadata( default=None, ) ) - else: - input_data[arg_name] = None - return input_data + return None if input_path not in unprocessed.inputMetadata: - return input_data - input_data[arg_name] = unprocessed.inputMetadata[input_path] - return input_data + return None + return unprocessed.inputMetadata[input_path] def get_metadata( @@ -371,17 +366,20 @@ def get_metadata( warnings: list[ProcessingAnnotation], ) -> ProcessingResult: input_data: InputMetadata = {} - args = {} if spec.args is None else copy.deepcopy(spec.args) for arg_name, input_path in spec.inputs.items(): - input_data = add_InputMetadata(spec, unprocessed, errors, input_data, arg_name, input_path) + input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path) + args = spec.args + if spec.function == "concatenate": - args["accession_version"] = id - filledin_order: InputMetadata = {} + spec_copy = copy.deepcopy(spec) + spec_copy.args["accession_version"] = id + filled_in_order: InputMetadata = {} for item in spec.args["order"]: - filledin_order = add_InputMetadata( - copy.deepcopy(spec), unprocessed, errors, filledin_order, item, item + filled_in_order = add_input_metadata( + spec_copy, unprocessed, errors, filled_in_order, item, item ) - args["order"] = [filledin_order[item] for item in spec.args["order"]] + spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]] + args = spec_copy.args try: processing_result = ProcessingFunctions.call_function( From 3a844aa91fc48d6103274f86d288b3ef2604309a Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:16:21 +0200 Subject: [PATCH 09/19] Make values.yaml more logical: allow setting args in values.yaml to allow specification of concatenation order. --- .../templates/_preprocessingFromValues.tpl | 32 +++++++++++-------- kubernetes/loculus/values.yaml | 3 +- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index 05d8c8247..e2f79743a 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -1,18 +1,5 @@ {{- define "loculus.sharedPreproSpecs" }} {{ .key }}: - args: - {{- if .segment }} - segment: {{ .segment }} - {{- end }} - {{- if .type }} - type: {{ .type }} - {{- end }} - {{- if .order }} - order: - {{- range .order }} - - {{ . }} - {{- end }} - {{- end }} {{- if .preprocessing }} {{- if hasKey .preprocessing "function" }} function: {{ index .preprocessing "function" }} @@ -25,6 +12,18 @@ {{- . | toYaml | nindent 4 }} {{- end }} {{- end }} + args: + {{- if .segment }} + segment: {{ .segment }} + {{- end }} + {{- if .type }} + type: {{ .type }} + {{- end }} + {{- if hasKey .preprocessing "args" }} + {{- with index .preprocessing "args" }} + {{- . | toYaml | nindent 4 }} + {{- end }} + {{- end }} {{- else }} function: identity inputs: @@ -36,6 +35,13 @@ {{- end }} {{- if .required}} required: true + args: + {{- if .segment }} + segment: {{ .segment }} + {{- end }} + {{- if .type }} + type: {{ .type }} + {{- end }} {{- end }} {{- end }} diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 00ff57967..16068f2ed 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -66,12 +66,13 @@ defaultOrganismConfig: &defaultOrganismConfig required: true - name: display_name header: "INSDC" - order: [geo_loc_country, accession_version, sample_collection_date] preprocessing: function: concatenate inputs: string: geo_loc_country date: sample_collection_date + args: + order: [geo_loc_country, accession_version, sample_collection_date] noInput: true - name: ncbi_release_date displayName: NCBI release date From e9f6edfe3974a158b96e8f78025afffcf98ee63e Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:29:35 +0200 Subject: [PATCH 10/19] Add documentation. --- preprocessing/nextclade/README.md | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/preprocessing/nextclade/README.md b/preprocessing/nextclade/README.md index b24b2c8eb..f6576e6ef 100644 --- a/preprocessing/nextclade/README.md +++ b/preprocessing/nextclade/README.md @@ -73,3 +73,44 @@ prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp- ``` Additionally, the `--keep-tmp-dir` is useful for debugging issues. The results of nextclade run will be stored in the temp directory, as well as a file called `submission_requests.json` which contains a log of the full submit requests that are sent to the backend. + +## Preprocessing Checks + +### Type Check + +Preprocessing checks that the type of each metadata field corresponds to the expected `type` value seen in the config. If no type is given we assume the metadata field should be of type string. + +### Required value Check + +Additionally, we check that if a field is required, e.g. `required` is true that that field is not None. + +### Custom Preprocessing Functions + +If no additional `preprocessing` field is specified we assume that field uses the `identity` function, i.e. the output should be the same as the input. If a specific `type` is given the input will be converted to that type. + +However, the `preprocessing` field can be customized to take an arbitrary number of input metadata fields, perform a function on them and then output the desired metadata field. We have defined the following preprocessing functions but more can be added for your own custom instance. + +0. `identity`: Return the input field in the desired type. +1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format +2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format +3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter. + +Using these functions in your `values.yaml` will look like: + +``` +- name: sample_collection_date + type: date + preprocessing: + function: process_date + inputs: + date: sample_collection_date + required: true +- name: display_name + preprocessing: + function: concatenate + inputs: + string: geo_loc_country + date: sample_collection_date + args: + order: [geo_loc_country, accession_version, sample_collection_date] +``` From 1d580bd41850a783e321bc52f42f563e7a3d3291 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 15:34:12 +0200 Subject: [PATCH 11/19] Fix little bug. --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index fae0bff7d..9397d431b 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -375,9 +375,7 @@ def get_metadata( spec_copy.args["accession_version"] = id filled_in_order: InputMetadata = {} for item in spec.args["order"]: - filled_in_order = add_input_metadata( - spec_copy, unprocessed, errors, filled_in_order, item, item - ) + filled_in_order[item] = add_input_metadata(spec_copy, unprocessed, errors, item) spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]] args = spec_copy.args From ce01807940569a657bfb3a7ab1260b160a843eff Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:08:03 +0200 Subject: [PATCH 12/19] Fix little config bug --- kubernetes/loculus/templates/_preprocessingFromValues.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index e2f79743a..6392180d3 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -32,9 +32,9 @@ {{- else }} input: {{ .name }} {{- end }} - {{- end }} {{- if .required}} required: true + {{- end }} args: {{- if .segment }} segment: {{ .segment }} From 1024f12dccb17654726cf28cbc0a6d0c327f5ab1 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:54:26 +0200 Subject: [PATCH 13/19] Fix required issue. --- kubernetes/loculus/templates/_preprocessingFromValues.tpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index 6392180d3..8b242e779 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -32,9 +32,6 @@ {{- else }} input: {{ .name }} {{- end }} - {{- if .required}} - required: true - {{- end }} args: {{- if .segment }} segment: {{ .segment }} @@ -43,6 +40,9 @@ type: {{ .type }} {{- end }} {{- end }} + {{- if .required}} + required: true + {{- end }} {{- end }} {{- define "loculus.preprocessingSpecs" -}} From 61fe7320a4ab2df22a845d64b0ca6090d25fc44b Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:44:26 +0200 Subject: [PATCH 14/19] Add suggestions --- kubernetes/loculus/templates/_preprocessingFromValues.tpl | 6 ++---- kubernetes/loculus/values.yaml | 1 - .../src/loculus_preprocessing/processing_functions.py | 2 ++ website/src/components/SequenceDetailsPage/DataTable.tsx | 4 ++-- .../components/SequenceDetailsPage/getDataTableData.ts | 8 ++++---- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index 8b242e779..82ae2d102 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -19,10 +19,8 @@ {{- if .type }} type: {{ .type }} {{- end }} - {{- if hasKey .preprocessing "args" }} - {{- with index .preprocessing "args" }} - {{- . | toYaml | nindent 4 }} - {{- end }} + {{- with (get .preprocessing "args") }} + {{ toYaml . | nindent 4 }} {{- end }} {{- else }} function: identity diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 16068f2ed..4c9880d5b 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -65,7 +65,6 @@ defaultOrganismConfig: &defaultOrganismConfig date: sample_collection_date required: true - name: display_name - header: "INSDC" preprocessing: function: concatenate inputs: diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index c67d15198..e068da633 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -323,6 +323,8 @@ def concatenate( try: concatenation_order = [formatted_input_data.get(i, accession_version) for i in order] result = "/".join(concatenation_order) + # To avoid downstream issues do not let the result start or end in a "/" + result = result.strip("/") return ProcessingResult(datum=result, warnings=warnings, errors=errors) except ValueError as e: diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx index 0f4ced393..64f187e1a 100644 --- a/website/src/components/SequenceDetailsPage/DataTable.tsx +++ b/website/src/components/SequenceDetailsPage/DataTable.tsx @@ -14,8 +14,8 @@ interface Props { const DataTableComponent: React.FC = ({ dataTableData, dataUseTermsHistory }) => { return (
- {dataTableData.topmatter.displayName !== undefined && ( -
{dataTableData.topmatter.displayName}
+ {dataTableData.topmatter.sequenceDisplayName !== undefined && ( +
{dataTableData.topmatter.sequenceDisplayName}
)} {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts index d0726b7c3..5f9b7a477 100644 --- a/website/src/components/SequenceDetailsPage/getDataTableData.ts +++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts @@ -3,7 +3,7 @@ import type { TableDataEntry } from './types.ts'; export type DataTableData = { topmatter: { authors: string[] | undefined; - displayName: string | undefined; + sequenceDisplayName: string | undefined; }; table: { header: string; @@ -15,7 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa const result: DataTableData = { topmatter: { authors: undefined, - displayName: undefined, + sequenceDisplayName: undefined, }, table: [], }; @@ -36,11 +36,11 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa } if ( - result.topmatter.displayName === undefined && + result.topmatter.sequenceDisplayName === undefined && entry.type.kind === 'metadata' && entry.name === 'display_name' ) { - result.topmatter.displayName = entry.value.toString(); + result.topmatter.sequenceDisplayName = entry.value.toString(); continue; } From 7fb8f54edf7d65ef96120c83e104b91b9a3035f3 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 8 Jul 2024 19:58:15 +0200 Subject: [PATCH 15/19] Let concatenate function take multiple input values of the same type by specifying type in a separate argument. --- kubernetes/loculus/values.yaml | 5 ++-- preprocessing/nextclade/README.md | 7 ++--- .../processing_functions.py | 26 +++++++++++-------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 4c9880d5b..cede07fdd 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -68,10 +68,11 @@ defaultOrganismConfig: &defaultOrganismConfig preprocessing: function: concatenate inputs: - string: geo_loc_country - date: sample_collection_date + geo_loc_country: geo_loc_country + sample_collection_date: sample_collection_date args: order: [geo_loc_country, accession_version, sample_collection_date] + type: [string, string, date] noInput: true - name: ncbi_release_date displayName: NCBI release date diff --git a/preprocessing/nextclade/README.md b/preprocessing/nextclade/README.md index f6576e6ef..773441ed8 100644 --- a/preprocessing/nextclade/README.md +++ b/preprocessing/nextclade/README.md @@ -93,7 +93,7 @@ However, the `preprocessing` field can be customized to take an arbitrary number 0. `identity`: Return the input field in the desired type. 1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format 2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format -3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter. +3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter, fields will first be processed based on their `arg.type` (the order of the types should correspond to the order of fields specified by the order argument). Using these functions in your `values.yaml` will look like: @@ -109,8 +109,9 @@ Using these functions in your `values.yaml` will look like: preprocessing: function: concatenate inputs: - string: geo_loc_country - date: sample_collection_date + geo_loc_country: geo_loc_country + sample_collection_date: sample_collection_date args: order: [geo_loc_country, accession_version, sample_collection_date] + type: [string, string, date] ``` diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index e068da633..9e4c88eb1 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -287,6 +287,7 @@ def concatenate( accession_version = args["accession_version"] order = args["order"] + type = args["type"] # Check accessionVersion only exists once in the list: if number_fields != len(order): @@ -304,25 +305,28 @@ def concatenate( errors=errors, ) - formatted_input_data = {} - for key, item in input_data.items(): - if key == "date": - processed = ProcessingFunctions.process_date({key: item}, output_field) - formatted_input_data[item] = "" if processed.datum is None else processed.datum + formatted_input_data = [] + for i in range(len(order)): + if type[i] == "date": + processed = ProcessingFunctions.process_date( + {"date": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) errors += processed.errors warnings += processed.warnings - elif key == "timestamp": - processed = ProcessingFunctions.parse_timestamp({key: item}, output_field) - formatted_input_data[item] = "" if processed.datum is None else processed.datum + if type[i] == "timestamp": + processed = ProcessingFunctions.parse_timestamp( + {"timestamp": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) errors += processed.errors warnings += processed.warnings else: - formatted_input_data[item] = item + formatted_input_data.append(input_data.get(order[i], accession_version)) logging.debug(f"formatted input data:{formatted_input_data}") try: - concatenation_order = [formatted_input_data.get(i, accession_version) for i in order] - result = "/".join(concatenation_order) + result = "/".join(formatted_input_data) # To avoid downstream issues do not let the result start or end in a "/" result = result.strip("/") From ca6904d3a91bbf91f323398dba0b8e604020e7fe Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 8 Jul 2024 20:22:24 +0200 Subject: [PATCH 16/19] Add changes in prepro I forgot to commit. --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 40dceba0b..87fc96536 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -431,10 +431,6 @@ def get_metadata( if spec.function == "concatenate": spec_copy = copy.deepcopy(spec) spec_copy.args["accession_version"] = id - filled_in_order: InputMetadata = {} - for item in spec.args["order"]: - filled_in_order[item] = add_input_metadata(spec_copy, unprocessed, errors, item) - spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]] args = spec_copy.args try: From 497654b5e75133bd56bae6c59cd643fb9be5d2c8 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 8 Jul 2024 20:34:12 +0200 Subject: [PATCH 17/19] Fix weird else if bug (I thought I fixed this before - odd) --- .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 9e4c88eb1..ccddf4bc8 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -314,7 +314,7 @@ def concatenate( formatted_input_data.append("" if processed.datum is None else processed.datum) errors += processed.errors warnings += processed.warnings - if type[i] == "timestamp": + elif type[i] == "timestamp": processed = ProcessingFunctions.parse_timestamp( {"timestamp": input_data[order[i]]}, output_field ) From 70f9e760a3d21e985350314779384ea05d920c51 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 8 Jul 2024 21:55:49 +0200 Subject: [PATCH 18/19] Add display name to header --- website/src/components/SequenceDetailsPage/DataTable.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx index 2d1fc4b9d..e6651e5ad 100644 --- a/website/src/components/SequenceDetailsPage/DataTable.tsx +++ b/website/src/components/SequenceDetailsPage/DataTable.tsx @@ -19,7 +19,7 @@ const DataTableComponent: React.FC = ({ dataTableData, dataUseTermsHistor return (
{dataTableData.topmatter.sequenceDisplayName !== undefined && ( -
{dataTableData.topmatter.sequenceDisplayName}
+
Display Name: {dataTableData.topmatter.sequenceDisplayName}
)} {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
From cd1c9d8bb36d1f3ab21da10ceb20aaae817a7e2a Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Tue, 9 Jul 2024 09:48:41 +0200 Subject: [PATCH 19/19] Fix None vs not error. --- preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 87fc96536..f8583f459 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -386,7 +386,7 @@ def add_input_metadata( nextclade_prefix = "nextclade." if input_path.startswith(nextclade_prefix): segment = spec.args.get("segment", "main") - if unprocessed.nextcladeMetadata is None: + if not unprocessed.nextcladeMetadata: errors.append( ProcessingAnnotation( source=[