diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index c1ddc277e..4d48286e1 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -6,6 +6,8 @@ organism: ebola-zaire # taxon_id: 3052518 # backend_url: http://localhost:8079/ # keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token +# nextclade_dataset_name: nextstrain/cchfv/linked +# nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output # organism: cchf # nucleotide_sequences: # - M diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl index 7ce408ae0..82ae2d102 100644 --- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl +++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl @@ -1,12 +1,5 @@ {{- define "loculus.sharedPreproSpecs" }} {{ .key }}: - args: - {{- if .segment }} - segment: {{ .segment }} - {{- end }} - {{- if .type }} - type: {{ .type }} - {{- end }} {{- if .preprocessing }} {{- if hasKey .preprocessing "function" }} function: {{ index .preprocessing "function" }} @@ -19,6 +12,16 @@ {{- . | toYaml | nindent 4 }} {{- end }} {{- end }} + args: + {{- if .segment }} + segment: {{ .segment }} + {{- end }} + {{- if .type }} + type: {{ .type }} + {{- end }} + {{- with (get .preprocessing "args") }} + {{ toYaml . 
| nindent 4 }} + {{- end }} {{- else }} function: identity inputs: @@ -27,6 +30,13 @@ {{- else }} input: {{ .name }} {{- end }} + args: + {{- if .segment }} + segment: {{ .segment }} + {{- end }} + {{- if .type }} + type: {{ .type }} + {{- end }} {{- end }} {{- if .required}} required: true diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 4584784a5..48ea758f8 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -64,6 +64,16 @@ defaultOrganismConfig: &defaultOrganismConfig inputs: date: sample_collection_date required: true + - name: display_name + preprocessing: + function: concatenate + inputs: + geo_loc_country: geo_loc_country + sample_collection_date: sample_collection_date + args: + order: [geo_loc_country, accession_version, sample_collection_date] + type: [string, string, date] + noInput: true - name: ncbi_release_date displayName: NCBI release date type: date diff --git a/preprocessing/nextclade/README.md b/preprocessing/nextclade/README.md index b24b2c8eb..773441ed8 100644 --- a/preprocessing/nextclade/README.md +++ b/preprocessing/nextclade/README.md @@ -73,3 +73,45 @@ prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp- ``` Additionally, the `--keep-tmp-dir` is useful for debugging issues. The results of nextclade run will be stored in the temp directory, as well as a file called `submission_requests.json` which contains a log of the full submit requests that are sent to the backend. + +## Preprocessing Checks + +### Type Check + +Preprocessing checks that the type of each metadata field corresponds to the expected `type` value seen in the config. If no type is given we assume the metadata field should be of type string. + +### Required value Check + +Additionally, we check that if a field is required, i.e. `required` is true, that field is not None. 
+ +### Custom Preprocessing Functions + +If no additional `preprocessing` field is specified we assume that field uses the `identity` function, i.e. the output should be the same as the input. If a specific `type` is given the input will be converted to that type. + +However, the `preprocessing` field can be customized to take an arbitrary number of input metadata fields, perform a function on them and then output the desired metadata field. We have defined the following preprocessing functions but more can be added for your own custom instance. + +0. `identity`: Return the input field in the desired type. +1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format +2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format +3. `concatenate`: Take multiple metadata fields (including the `accession_version`) and concatenate them in the order specified by the `args.order` parameter; fields will first be processed based on their `args.type` (the order of the types should correspond to the order of fields specified by the order argument). 
+ +Using these functions in your `values.yaml` will look like: + +``` +- name: sample_collection_date + type: date + preprocessing: + function: process_date + inputs: + date: sample_collection_date + required: true +- name: display_name + preprocessing: + function: concatenate + inputs: + geo_loc_country: geo_loc_country + sample_collection_date: sample_collection_date + args: + order: [geo_loc_country, accession_version, sample_collection_date] + type: [string, string, date] +``` diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py index 25b3c2215..f8583f459 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py @@ -1,3 +1,4 @@ +import copy import csv import json import logging @@ -374,7 +375,48 @@ def null_per_backend(x: Any) -> bool: return False +def add_input_metadata( + spec: ProcessingSpec, + unprocessed: UnprocessedAfterNextclade, + errors: list[ProcessingAnnotation], + input_path: str, +) -> InputMetadata: + """Returns value of input_path in unprocessed metadata""" + # If field starts with "nextclade.", take from nextclade metadata + nextclade_prefix = "nextclade." 
+ if input_path.startswith(nextclade_prefix): + segment = spec.args.get("segment", "main") + if not unprocessed.nextcladeMetadata: + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource( + name="main", + type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, + ) + ], + message="Nucleotide sequence failed to align", + ) + ) + return None + sub_path = input_path[len(nextclade_prefix) :] + if segment in unprocessed.nextcladeMetadata: + return str( + dpath.get( + unprocessed.nextcladeMetadata[segment], + sub_path, + separator=".", + default=None, + ) + ) + return None + if input_path not in unprocessed.inputMetadata: + return None + return unprocessed.inputMetadata[input_path] + + def get_metadata( + id: AccessionVersion, spec: ProcessingSpec, output_field: str, unprocessed: UnprocessedAfterNextclade, @@ -383,46 +425,17 @@ def get_metadata( ) -> ProcessingResult: input_data: InputMetadata = {} for arg_name, input_path in spec.inputs.items(): - input_data[arg_name] = None - # If field starts with "nextclade.", take from nextclade metadata - nextclade_prefix = "nextclade." - if input_path.startswith(nextclade_prefix): - # Remove "nextclade." 
prefix - if spec.args is None: - spec.args = {} - segment = spec.args.get("segment", "main") - if not unprocessed.nextcladeMetadata: - errors.append( - ProcessingAnnotation( - source=[ - AnnotationSource( - name="main", - type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE, - ) - ], - message="Nucleotide sequence failed to align", - ) - ) - continue - sub_path = input_path[len(nextclade_prefix) :] - if segment in unprocessed.nextcladeMetadata: - input_data[arg_name] = str( - dpath.get( - unprocessed.nextcladeMetadata[segment], - sub_path, - separator=".", - default=None, - ) - ) - else: - input_data[arg_name] = None - continue - if input_path not in unprocessed.inputMetadata: - continue - input_data[arg_name] = unprocessed.inputMetadata[input_path] + input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path) + args = spec.args + + if spec.function == "concatenate": + spec_copy = copy.deepcopy(spec) + spec_copy.args["accession_version"] = id + args = spec_copy.args + try: processing_result = ProcessingFunctions.call_function( - spec.function, spec.args, input_data, output_field + spec.function, args, input_data, output_field ) except Exception as e: msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}" @@ -491,6 +504,7 @@ def process_single( ) spec.args = {} if spec.args is None else spec.args processing_result = get_metadata( + id, spec, output_field, unprocessed, diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py index 7e03977f4..ccddf4bc8 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py @@ -273,6 +273,79 @@ def parse_timestamp( errors=errors, ) + @staticmethod + def concatenate( + input_data: InputMetadata, output_field: str, args: FunctionArgs = None + ) -> ProcessingResult: + """Concatenates 
input fields with accession_version using the "/" separator in the order + specified by the order argument. + """ + warnings: list[ProcessingAnnotation] = [] + errors: list[ProcessingAnnotation] = [] + + number_fields = len(input_data.keys()) + 1 + + accession_version = args["accession_version"] + order = args["order"] + type = args["type"] + + # Check accessionVersion only exists once in the list: + if number_fields != len(order): + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message="Concatenation failed.", + ) + ) + return ProcessingResult( + datum=None, + warnings=warnings, + errors=errors, + ) + + formatted_input_data = [] + for i in range(len(order)): + if type[i] == "date": + processed = ProcessingFunctions.process_date( + {"date": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) + errors += processed.errors + warnings += processed.warnings + elif type[i] == "timestamp": + processed = ProcessingFunctions.parse_timestamp( + {"timestamp": input_data[order[i]]}, output_field + ) + formatted_input_data.append("" if processed.datum is None else processed.datum) + errors += processed.errors + warnings += processed.warnings + else: + formatted_input_data.append(input_data.get(order[i], accession_version)) + logging.debug(f"formatted input data:{formatted_input_data}") + + try: + result = "/".join(formatted_input_data) + # To avoid downstream issues do not let the result start or end in a "/" + result = result.strip("/") + + return ProcessingResult(datum=result, warnings=warnings, errors=errors) + except ValueError as e: + errors.append( + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message="Concatenation failed.", + ) + ) + return ProcessingResult( + datum=None, + errors=errors, + warnings=warnings, + ) + @staticmethod def identity( 
input_data: InputMetadata, output_field: str, args: FunctionArgs = None diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx index 76f2a2ba1..e6651e5ad 100644 --- a/website/src/components/SequenceDetailsPage/DataTable.tsx +++ b/website/src/components/SequenceDetailsPage/DataTable.tsx @@ -18,6 +18,9 @@ const DataTableComponent: React.FC = ({ dataTableData, dataUseTermsHistor return (
+ {dataTableData.topmatter.sequenceDisplayName !== undefined && ( +
Display Name: {dataTableData.topmatter.sequenceDisplayName}
+ )} {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts index 4f67cae84..5f9b7a477 100644 --- a/website/src/components/SequenceDetailsPage/getDataTableData.ts +++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts @@ -3,6 +3,7 @@ import type { TableDataEntry } from './types.ts'; export type DataTableData = { topmatter: { authors: string[] | undefined; + sequenceDisplayName: string | undefined; }; table: { header: string; @@ -14,6 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa const result: DataTableData = { topmatter: { authors: undefined, + sequenceDisplayName: undefined, }, table: [], }; @@ -33,6 +35,15 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa continue; } + if ( + result.topmatter.sequenceDisplayName === undefined && + entry.type.kind === 'metadata' && + entry.name === 'display_name' + ) { + result.topmatter.sequenceDisplayName = entry.value.toString(); + continue; + } + if (!tableHeaderMap.has(entry.header)) { tableHeaderMap.set(entry.header, []); }