Skip to content

Commit

Permalink
feat(website): Create a standardized sequence name with the format: {…
Browse files Browse the repository at this point in the history
…country}/{AccessionVersion}/{date} (#2246)

* Add concatenate function to preprocessing.

* Add displayName in italics below loculus accession on webpage.

* Make values.yaml more logical: allow setting args in values.yaml to allow specification of concatenation order and type.

* Add documentation on how preprocessing functions work
  • Loading branch information
anna-parker authored Jul 9, 2024
1 parent a8f78d7 commit 0a96d23
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 45 deletions.
2 changes: 2 additions & 0 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ organism: ebola-zaire
# taxon_id: 3052518
# backend_url: http://localhost:8079/
# keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token
# nextclade_dataset_name: nextstrain/cchfv/linked
# nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
# organism: cchf
# nucleotide_sequences:
# - M
Expand Down
24 changes: 17 additions & 7 deletions kubernetes/loculus/templates/_preprocessingFromValues.tpl
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
{{- define "loculus.sharedPreproSpecs" }}
{{ .key }}:
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- if .preprocessing }}
{{- if hasKey .preprocessing "function" }}
function: {{ index .preprocessing "function" }}
Expand All @@ -19,6 +12,16 @@
{{- . | toYaml | nindent 4 }}
{{- end }}
{{- end }}
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- with (get .preprocessing "args") }}
{{ toYaml . | nindent 4 }}
{{- end }}
{{- else }}
function: identity
inputs:
Expand All @@ -27,6 +30,13 @@
{{- else }}
input: {{ .name }}
{{- end }}
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- end }}
{{- if .required}}
required: true
Expand Down
10 changes: 10 additions & 0 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ defaultOrganismConfig: &defaultOrganismConfig
inputs:
date: sample_collection_date
required: true
- name: display_name
preprocessing:
function: concatenate
inputs:
geo_loc_country: geo_loc_country
sample_collection_date: sample_collection_date
args:
order: [geo_loc_country, accession_version, sample_collection_date]
type: [string, string, date]
noInput: true
- name: ncbi_release_date
displayName: NCBI release date
type: date
Expand Down
42 changes: 42 additions & 0 deletions preprocessing/nextclade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,45 @@ prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp-
```
Additionally, the `--keep-tmp-dir` is useful for debugging issues. The results of nextclade run will be stored in the temp directory, as well as a file called `submission_requests.json` which contains a log of the full submit requests that are sent to the backend.
## Preprocessing Checks
### Type Check
Preprocessing checks that the type of each metadata field corresponds to the expected `type` value seen in the config. If no type is given we assume the metadata field should be of type string.
### Required value Check
Additionally, we check that if a field is required (i.e. `required` is true), then that field is not None.
### Custom Preprocessing Functions
If no additional `preprocessing` field is specified we assume that field uses the `identity` function, i.e. the output should be the same as the input. If a specific `type` is given the input will be converted to that type.
However, the `preprocessing` field can be customized to take an arbitrary number of input metadata fields, perform a function on them and then output the desired metadata field. We have defined the following preprocessing functions but more can be added for your own custom instance.
0. `identity`: Return the input field in the desired type.
1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format
2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format
3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter; fields will first be processed based on their `arg.type` (the order of the types should correspond to the order of fields specified by the `order` argument).
Using these functions in your `values.yaml` will look like:
```
- name: sample_collection_date
type: date
preprocessing:
function: process_date
inputs:
date: sample_collection_date
required: true
- name: display_name
preprocessing:
function: concatenate
inputs:
geo_loc_country: geo_loc_country
sample_collection_date: sample_collection_date
args:
order: [geo_loc_country, accession_version, sample_collection_date]
type: [string, string, date]
```
90 changes: 52 additions & 38 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import csv
import json
import logging
Expand Down Expand Up @@ -374,7 +375,48 @@ def null_per_backend(x: Any) -> bool:
return False


def add_input_metadata(
    spec: ProcessingSpec,
    unprocessed: UnprocessedAfterNextclade,
    errors: list[ProcessingAnnotation],
    input_path: str,
) -> str | None:
    """Return the value of `input_path` in the unprocessed metadata.

    Fields prefixed with "nextclade." are looked up in the Nextclade output
    for the segment named by `spec.args["segment"]` (default "main"); all
    other fields are read from the submitted input metadata. Returns None
    when the value cannot be found, appending a ProcessingAnnotation to
    `errors` if the sequence failed to align (no Nextclade metadata at all).
    """
    # If field starts with "nextclade.", take from nextclade metadata
    nextclade_prefix = "nextclade."
    if input_path.startswith(nextclade_prefix):
        # NOTE(review): assumes spec.args is already a dict — the caller
        # normalizes a None args to {} before invoking this; confirm.
        segment = spec.args.get("segment", "main")
        if not unprocessed.nextcladeMetadata:
            errors.append(
                ProcessingAnnotation(
                    source=[
                        AnnotationSource(
                            name="main",
                            type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
                        )
                    ],
                    message="Nucleotide sequence failed to align",
                )
            )
            return None
        if segment not in unprocessed.nextcladeMetadata:
            return None
        sub_path = input_path[len(nextclade_prefix) :]
        value = dpath.get(
            unprocessed.nextcladeMetadata[segment],
            sub_path,
            separator=".",
            default=None,
        )
        # Bug fix: previously str(...) wrapped the whole lookup, turning a
        # missing value (None) into the literal string "None".
        return None if value is None else str(value)
    return unprocessed.inputMetadata.get(input_path)


def get_metadata(
id: AccessionVersion,
spec: ProcessingSpec,
output_field: str,
unprocessed: UnprocessedAfterNextclade,
Expand All @@ -383,46 +425,17 @@ def get_metadata(
) -> ProcessingResult:
input_data: InputMetadata = {}
for arg_name, input_path in spec.inputs.items():
input_data[arg_name] = None
# If field starts with "nextclade.", take from nextclade metadata
nextclade_prefix = "nextclade."
if input_path.startswith(nextclade_prefix):
# Remove "nextclade." prefix
if spec.args is None:
spec.args = {}
segment = spec.args.get("segment", "main")
if not unprocessed.nextcladeMetadata:
errors.append(
ProcessingAnnotation(
source=[
AnnotationSource(
name="main",
type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
)
],
message="Nucleotide sequence failed to align",
)
)
continue
sub_path = input_path[len(nextclade_prefix) :]
if segment in unprocessed.nextcladeMetadata:
input_data[arg_name] = str(
dpath.get(
unprocessed.nextcladeMetadata[segment],
sub_path,
separator=".",
default=None,
)
)
else:
input_data[arg_name] = None
continue
if input_path not in unprocessed.inputMetadata:
continue
input_data[arg_name] = unprocessed.inputMetadata[input_path]
input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path)
args = spec.args

if spec.function == "concatenate":
spec_copy = copy.deepcopy(spec)
spec_copy.args["accession_version"] = id
args = spec_copy.args

try:
processing_result = ProcessingFunctions.call_function(
spec.function, spec.args, input_data, output_field
spec.function, args, input_data, output_field
)
except Exception as e:
msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}"
Expand Down Expand Up @@ -491,6 +504,7 @@ def process_single(
)
spec.args = {} if spec.args is None else spec.args
processing_result = get_metadata(
id,
spec,
output_field,
unprocessed,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,79 @@ def parse_timestamp(
errors=errors,
)

@staticmethod
def concatenate(
    input_data: InputMetadata, output_field: str, args: FunctionArgs = None
) -> ProcessingResult:
    """Concatenates input fields with accession_version using the "/" separator in the order
    specified by the order argument.

    `args` must provide:
      - "accession_version": the accession version string to splice in,
      - "order": output order of all fields (the input fields plus accession_version),
      - "type": per-field types ("date", "timestamp", or anything else for
        plain strings), aligned index-by-index with "order".

    Returns a ProcessingResult whose datum is the joined string, or a
    ProcessingResult with datum=None and an error annotation on failure.
    """
    warnings: list[ProcessingAnnotation] = []
    errors: list[ProcessingAnnotation] = []

    # +1 accounts for accession_version, which is not part of input_data.
    number_fields = len(input_data.keys()) + 1

    accession_version = args["accession_version"]
    order = args["order"]
    field_types = args["type"]  # local rename: `type` shadowed the builtin

    # `order` must list every input field plus accession_version exactly once;
    # otherwise fields and types cannot be aligned.
    if number_fields != len(order):
        errors.append(
            ProcessingAnnotation(
                source=[
                    AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
                ],
                message="Concatenation failed.",
            )
        )
        return ProcessingResult(
            datum=None,
            warnings=warnings,
            errors=errors,
        )

    formatted_input_data = []
    for i, field_name in enumerate(order):
        field_type = field_types[i]
        if field_type == "date":
            processed = ProcessingFunctions.process_date(
                {"date": input_data[field_name]}, output_field
            )
            formatted_input_data.append("" if processed.datum is None else processed.datum)
            errors += processed.errors
            warnings += processed.warnings
        elif field_type == "timestamp":
            processed = ProcessingFunctions.parse_timestamp(
                {"timestamp": input_data[field_name]}, output_field
            )
            formatted_input_data.append("" if processed.datum is None else processed.datum)
            errors += processed.errors
            warnings += processed.warnings
        else:
            # A name absent from input_data is the accession_version slot —
            # fall back to the accession version string itself.
            formatted_input_data.append(input_data.get(field_name, accession_version))
    logging.debug(f"formatted input data:{formatted_input_data}")

    try:
        result = "/".join(formatted_input_data)
        # To avoid downstream issues do not let the result start or end in a "/"
        result = result.strip("/")

        return ProcessingResult(datum=result, warnings=warnings, errors=errors)
    except (TypeError, ValueError):
        # Bug fix: str.join raises TypeError (not ValueError) when an item is
        # None, so the former `except ValueError` missed the realistic
        # failure mode of a missing (None) input value.
        errors.append(
            ProcessingAnnotation(
                source=[
                    AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
                ],
                message="Concatenation failed.",
            )
        )
        return ProcessingResult(
            datum=None,
            errors=errors,
            warnings=warnings,
        )

@staticmethod
def identity(
input_data: InputMetadata, output_field: str, args: FunctionArgs = None
Expand Down
3 changes: 3 additions & 0 deletions website/src/components/SequenceDetailsPage/DataTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistor

return (
<div>
{dataTableData.topmatter.sequenceDisplayName !== undefined && (
<div className='px-6 mb-4 italic'>Display Name: {dataTableData.topmatter.sequenceDisplayName}</div>
)}
{dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
<div className='px-6 mb-4'>
<AuthorList authors={dataTableData.topmatter.authors} />
Expand Down
11 changes: 11 additions & 0 deletions website/src/components/SequenceDetailsPage/getDataTableData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type { TableDataEntry } from './types.ts';
export type DataTableData = {
topmatter: {
authors: string[] | undefined;
sequenceDisplayName: string | undefined;
};
table: {
header: string;
Expand All @@ -14,6 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
const result: DataTableData = {
topmatter: {
authors: undefined,
sequenceDisplayName: undefined,
},
table: [],
};
Expand All @@ -33,6 +35,15 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
continue;
}

if (
result.topmatter.sequenceDisplayName === undefined &&
entry.type.kind === 'metadata' &&
entry.name === 'display_name'
) {
result.topmatter.sequenceDisplayName = entry.value.toString();
continue;
}

if (!tableHeaderMap.has(entry.header)) {
tableHeaderMap.set(entry.header, []);
}
Expand Down

0 comments on commit 0a96d23

Please sign in to comment.