From a637799fa9bf8c4e4f62d32e37f49b1bd0739c4b Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:08:02 +0200
Subject: [PATCH 01/19] Add concatenate function to preprocessing.

---
 .../templates/_preprocessingFromValues.tpl    |  6 ++
 kubernetes/loculus/values.yaml                | 12 ++-
 .../src/loculus_preprocessing/prepro.py       | 98 ++++++++++++-------
 .../processing_functions.py                   | 67 +++++++++++++
 4 files changed, 143 insertions(+), 40 deletions(-)
diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
index 7ce408ae0..05d8c8247 100644
--- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl
+++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -7,6 +7,12 @@
     {{- if .type }}
     type: {{ .type }}
     {{- end }}
+    {{- if .order }}
+    order: 
+    {{- range .order }}
+      - {{ . }}
+    {{- end }}
+    {{- end }}
   {{- if .preprocessing }}
   {{- if hasKey .preprocessing "function" }}
   function: {{ index .preprocessing "function" }}
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index b12e18720..7c0b4b329 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -64,6 +64,14 @@ defaultOrganismConfig: &defaultOrganismConfig
           inputs:
             date: sample_collection_date
         required: true
+      - name: display_name
+        order: [geo_loc_country, accession_version, sample_collection_date]
+        preprocessing:
+          function: concatenate
+          inputs:
+            string: geo_loc_country
+            date: sample_collection_date
+        noInput: true
       - name: ncbi_release_date
         displayName: NCBI release date
         type: date
@@ -1193,5 +1201,5 @@ enableCrossRefCredentials: true
 runDevelopmentKeycloakDatabase: true
 runDevelopmentMainDatabase: true
 enforceHTTPS: true
-registrationTermsMessage: >
-   You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
\ No newline at end of file
+registrationTermsMessage: >-
+  You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 07328e5d2..b40e01722 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -1,3 +1,4 @@
+import copy
 import csv
 import json
 import logging
@@ -316,7 +317,53 @@ def null_per_backend(x: Any) -> bool:
             return False
 
 
+def add_InputMetadata(
+    spec: ProcessingSpec,
+    unprocessed: UnprocessedAfterNextclade,
+    errors: list[ProcessingAnnotation],
+    input_data: InputMetadata,
+    arg_name: str,
+    input_path: str,
+) -> InputMetadata:
+    input_data[arg_name] = None
+    # If field starts with "nextclade.", take from nextclade metadata
+    nextclade_prefix = "nextclade."
+    if input_path.startswith(nextclade_prefix):
+        segment = spec.args.get("segment", "main")
+        if unprocessed.nextcladeMetadata is None:
+            errors.append(
+                ProcessingAnnotation(
+                    source=[
+                        AnnotationSource(
+                            name="main",
+                            type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
+                        )
+                    ],
+                    message="Nucleotide sequence failed to align",
+                )
+            )
+            return input_data
+        sub_path = input_path[len(nextclade_prefix) :]
+        if segment in unprocessed.nextcladeMetadata:
+            input_data[arg_name] = str(
+                dpath.get(
+                    unprocessed.nextcladeMetadata[segment],
+                    sub_path,
+                    separator=".",
+                    default=None,
+                )
+            )
+        else:
+            input_data[arg_name] = None
+        return input_data
+    if input_path not in unprocessed.inputMetadata:
+        return input_data
+    input_data[arg_name] = unprocessed.inputMetadata[input_path]
+    return input_data
+
+
 def get_metadata(
+    id: AccessionVersion,
     spec: ProcessingSpec,
     output_field: str,
     unprocessed: UnprocessedAfterNextclade,
@@ -324,47 +371,21 @@ def get_metadata(
     warnings: list[ProcessingAnnotation],
 ) -> ProcessingResult:
     input_data: InputMetadata = {}
+    args = {} if spec.args is None else copy.deepcopy(spec.args)
     for arg_name, input_path in spec.inputs.items():
-        input_data[arg_name] = None
-        # If field starts with "nextclade.", take from nextclade metadata
-        nextclade_prefix = "nextclade."
-        if input_path.startswith(nextclade_prefix):
-            # Remove "nextclade." prefix
-            if spec.args is None:
-                spec.args = {}
-            segment = spec.args.get("segment", "main")
-            if unprocessed.nextcladeMetadata is None:
-                errors.append(
-                    ProcessingAnnotation(
-                        source=[
-                            AnnotationSource(
-                                name="main",
-                                type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
-                            )
-                        ],
-                        message="Nucleotide sequence failed to align",
-                    )
-                )
-                continue
-            sub_path = input_path[len(nextclade_prefix) :]
-            if segment in unprocessed.nextcladeMetadata:
-                input_data[arg_name] = str(
-                    dpath.get(
-                        unprocessed.nextcladeMetadata[segment],
-                        sub_path,
-                        separator=".",
-                        default=None,
-                    )
-                )
-            else:
-                input_data[arg_name] = None
-            continue
-        if input_path not in unprocessed.inputMetadata:
-            continue
-        input_data[arg_name] = unprocessed.inputMetadata[input_path]
+        input_data = add_InputMetadata(spec, unprocessed, errors, input_data, arg_name, input_path)
+    if spec.function == "concatenate":
+        args["accession_version"] = id
+        filledin_order: InputMetadata = {}
+        for item in spec.args["order"]:
+            filledin_order = add_InputMetadata(
+                copy.deepcopy(spec), unprocessed, errors, filledin_order, item, item
+            )
+        args["order"] = [filledin_order[item] for item in spec.args["order"]]
+
     try:
         processing_result = ProcessingFunctions.call_function(
-            spec.function, spec.args, input_data, output_field
+            spec.function, args, input_data, output_field
         )
     except Exception as e:
         msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}"
@@ -403,6 +424,7 @@ def process_single(
         )
         spec.args = {} if spec.args is None else spec.args
         processing_result = get_metadata(
+            id,
             spec,
             output_field,
             unprocessed,
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 7e03977f4..cd0d9fd9e 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -273,6 +273,73 @@ def parse_timestamp(
                 errors=errors,
             )
 
+    @staticmethod
+    def concatenate(
+        input_data: InputMetadata, output_field: str, args: FunctionArgs = None
+    ) -> ProcessingResult:
+        """Concatenates input fields with accession_version using the "/" separator in the order
+        specified by the order argument.
+        """
+        warnings: list[ProcessingAnnotation] = []
+        errors: list[ProcessingAnnotation] = []
+
+        number_fields = len(input_data.keys()) + 1
+
+        accession_version = args["accession_version"]
+        order = args["order"]
+
+        # Check accessionVersion only exists once in the list:
+        if number_fields != len(order):
+            errors.append(
+                ProcessingAnnotation(
+                    source=[
+                        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                    ],
+                    message="Concatenation failed.",
+                )
+            )
+            return ProcessingResult(
+                datum=None,
+                warnings=warnings,
+                errors=errors,
+            )
+
+        formatted_input_data = {}
+        for key, item in input_data.items():
+            if key == "date":
+                processed = ProcessingFunctions.process_date({key: item}, output_field)
+                formatted_input_data[item] = processed.datum
+                errors += processed.errors
+                warnings += processed.warnings
+            if key == "timestamp":
+                processed = ProcessingFunctions.parse_timestamp({key: item}, output_field)
+                formatted_input_data[item] = processed.datum
+                errors += processed.errors
+                warnings += processed.warnings
+            else:
+                formatted_input_data[item] = item
+        logging.debug(f"formatted input data:{formatted_input_data}")
+
+        try:
+            concatenation_order = [formatted_input_data.get(i, accession_version) for i in order]
+            result = "/".join(concatenation_order)
+
+            return ProcessingResult(datum=result, warnings=warnings, errors=errors)
+        except ValueError as e:
+            errors.append(
+                ProcessingAnnotation(
+                    source=[
+                        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                    ],
+                    message="Concatenation failed.",
+                )
+            )
+            return ProcessingResult(
+                datum=None,
+                errors=errors,
+                warnings=warnings,
+            )
+
     @staticmethod
     def identity(
         input_data: InputMetadata, output_field: str, args: FunctionArgs = None

From 378ba280767f7a056fc0b1d45905d50b814f5453 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:13:07 +0200
Subject: [PATCH 02/19] Update values.yaml

Add display_name to INSDC header just to see what it looks like
---
 kubernetes/loculus/values.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 7c0b4b329..45969c79c 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -65,6 +65,7 @@ defaultOrganismConfig: &defaultOrganismConfig
             date: sample_collection_date
         required: true
       - name: display_name
+        header: "INSDC"
         order: [geo_loc_country, accession_version, sample_collection_date]
         preprocessing:
           function: concatenate

From 3410d7341bf5778b36fe5a29f74f11bc9d2ee13c Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:25:48 +0200
Subject: [PATCH 03/19] Update processing_functions.py

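Fix little bug: use `elif` for the timestamp check so that a `date` input no longer falls through to the final `else` branch, which overwrote the processed date with the raw input value.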
---
 .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index cd0d9fd9e..7a6caea8e 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -311,7 +311,7 @@ def concatenate(
                 formatted_input_data[item] = processed.datum
                 errors += processed.errors
                 warnings += processed.warnings
-            if key == "timestamp":
+            elif key == "timestamp":
                 processed = ProcessingFunctions.parse_timestamp({key: item}, output_field)
                 formatted_input_data[item] = processed.datum
                 errors += processed.errors

From 87df9184f7e55d644962f630d06a6db928cf1bb9 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:39:52 +0200
Subject: [PATCH 04/19] Add displayNames below loculus accession.

---
 .../src/components/SequenceDetailsPage/DataTable.tsx  |  3 +++
 .../SequenceDetailsPage/getDataTableData.ts           | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx
index 825cf5885..948b49bf4 100644
--- a/website/src/components/SequenceDetailsPage/DataTable.tsx
+++ b/website/src/components/SequenceDetailsPage/DataTable.tsx
@@ -14,6 +14,9 @@ interface Props {
 const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistory }) => {
     return (
         <div>
+            {dataTableData.topmatter.displayName !== undefined && (
+                <div className='px-6 mb-4'>{dataTableData.topmatter.displayName}</div>
+            )}
             {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
                 <div className='px-6 mb-4'>
                     <AuthorList authors={dataTableData.topmatter.authors} />
diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts
index 4f67cae84..d0726b7c3 100644
--- a/website/src/components/SequenceDetailsPage/getDataTableData.ts
+++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts
@@ -3,6 +3,7 @@ import type { TableDataEntry } from './types.ts';
 export type DataTableData = {
     topmatter: {
         authors: string[] | undefined;
+        displayName: string | undefined;
     };
     table: {
         header: string;
@@ -14,6 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
     const result: DataTableData = {
         topmatter: {
             authors: undefined,
+            displayName: undefined,
         },
         table: [],
     };
@@ -33,6 +35,15 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
             continue;
         }
 
+        if (
+            result.topmatter.displayName === undefined &&
+            entry.type.kind === 'metadata' &&
+            entry.name === 'display_name'
+        ) {
+            result.topmatter.displayName = entry.value.toString();
+            continue;
+        }
+
         if (!tableHeaderMap.has(entry.header)) {
             tableHeaderMap.set(entry.header, []);
         }

From 8655d9f03e4031b4ff697efd6f2eca6db88ef1f8 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:49:31 +0200
Subject: [PATCH 05/19] If displayName input is None use empty string instead.

---
 .../src/loculus_preprocessing/processing_functions.py         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 7a6caea8e..c67d15198 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -308,12 +308,12 @@ def concatenate(
         for key, item in input_data.items():
             if key == "date":
                 processed = ProcessingFunctions.process_date({key: item}, output_field)
-                formatted_input_data[item] = processed.datum
+                formatted_input_data[item] = "" if processed.datum is None else processed.datum
                 errors += processed.errors
                 warnings += processed.warnings
             elif key == "timestamp":
                 processed = ProcessingFunctions.parse_timestamp({key: item}, output_field)
-                formatted_input_data[item] = processed.datum
+                formatted_input_data[item] = "" if processed.datum is None else processed.datum
                 errors += processed.errors
                 warnings += processed.warnings
             else:

From 14a3de63aec2aa3900692ad829afb1702b82494f Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Thu, 4 Jul 2024 17:54:36 +0200
Subject: [PATCH 06/19] Make displayName italics

---
 website/src/components/SequenceDetailsPage/DataTable.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx
index 948b49bf4..0f4ced393 100644
--- a/website/src/components/SequenceDetailsPage/DataTable.tsx
+++ b/website/src/components/SequenceDetailsPage/DataTable.tsx
@@ -15,7 +15,7 @@ const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistor
     return (
         <div>
             {dataTableData.topmatter.displayName !== undefined && (
-                <div className='px-6 mb-4'>{dataTableData.topmatter.displayName}</div>
+                <div className='px-6 mb-4 italic'>{dataTableData.topmatter.displayName}</div>
             )}
             {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
                 <div className='px-6 mb-4'>

From b6b7e44fbbf80ffa63e650d27e02db1dce9f3f08 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 14:46:43 +0200
Subject: [PATCH 07/19] Little config updates

---
 ingest/config/config.yaml      | 2 ++
 kubernetes/loculus/values.yaml | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index c1ddc277e..4d48286e1 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -6,6 +6,8 @@ organism: ebola-zaire
 # taxon_id:  3052518
 # backend_url: http://localhost:8079/
 # keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token
+# nextclade_dataset_name: nextstrain/cchfv/linked
+# nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
 # organism: cchf
 # nucleotide_sequences:
 #   - M
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 45969c79c..00ff57967 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -1202,5 +1202,6 @@ enableCrossRefCredentials: true
 runDevelopmentKeycloakDatabase: true
 runDevelopmentMainDatabase: true
 enforceHTTPS: true
-registrationTermsMessage: >-
+registrationTermsMessage: >
   You must agree to the <a href="http://main.loculus.org/terms">terms of use</a>.
+

From 10ff3733cddb153827a843e9afe3d7da3340ee90 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:08:34 +0200
Subject: [PATCH 08/19] Function clean up

---
 .../src/loculus_preprocessing/prepro.py       | 36 +++++++++----------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index b40e01722..fae0bff7d 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -317,15 +317,13 @@ def null_per_backend(x: Any) -> bool:
             return False
 
 
-def add_InputMetadata(
+def add_input_metadata(
     spec: ProcessingSpec,
     unprocessed: UnprocessedAfterNextclade,
     errors: list[ProcessingAnnotation],
-    input_data: InputMetadata,
-    arg_name: str,
     input_path: str,
 ) -> InputMetadata:
-    input_data[arg_name] = None
+    """Returns value of input_path in unprocessed metadata"""
     # If field starts with "nextclade.", take from nextclade metadata
     nextclade_prefix = "nextclade."
     if input_path.startswith(nextclade_prefix):
@@ -342,10 +340,10 @@ def add_InputMetadata(
                     message="Nucleotide sequence failed to align",
                 )
             )
-            return input_data
+            return None
         sub_path = input_path[len(nextclade_prefix) :]
         if segment in unprocessed.nextcladeMetadata:
-            input_data[arg_name] = str(
+            return str(
                 dpath.get(
                     unprocessed.nextcladeMetadata[segment],
                     sub_path,
@@ -353,13 +351,10 @@ def add_InputMetadata(
                     default=None,
                 )
             )
-        else:
-            input_data[arg_name] = None
-        return input_data
+        return None
     if input_path not in unprocessed.inputMetadata:
-        return input_data
-    input_data[arg_name] = unprocessed.inputMetadata[input_path]
-    return input_data
+        return None
+    return unprocessed.inputMetadata[input_path]
 
 
 def get_metadata(
@@ -371,17 +366,20 @@ def get_metadata(
     warnings: list[ProcessingAnnotation],
 ) -> ProcessingResult:
     input_data: InputMetadata = {}
-    args = {} if spec.args is None else copy.deepcopy(spec.args)
     for arg_name, input_path in spec.inputs.items():
-        input_data = add_InputMetadata(spec, unprocessed, errors, input_data, arg_name, input_path)
+        input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path)
+    args = spec.args
+
     if spec.function == "concatenate":
-        args["accession_version"] = id
-        filledin_order: InputMetadata = {}
+        spec_copy = copy.deepcopy(spec)
+        spec_copy.args["accession_version"] = id
+        filled_in_order: InputMetadata = {}
         for item in spec.args["order"]:
-            filledin_order = add_InputMetadata(
-                copy.deepcopy(spec), unprocessed, errors, filledin_order, item, item
+            filled_in_order = add_input_metadata(
+                spec_copy, unprocessed, errors, filled_in_order, item, item
             )
-        args["order"] = [filledin_order[item] for item in spec.args["order"]]
+        spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]]
+        args = spec_copy.args
 
     try:
         processing_result = ProcessingFunctions.call_function(

From 3a844aa91fc48d6103274f86d288b3ef2604309a Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:16:21 +0200
Subject: [PATCH 09/19] Make values.yaml more logical: allow setting args in
 values.yaml to allow specification of concatenation order.

---
 .../templates/_preprocessingFromValues.tpl    | 32 +++++++++++--------
 kubernetes/loculus/values.yaml                |  3 +-
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
index 05d8c8247..e2f79743a 100644
--- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl
+++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -1,18 +1,5 @@
 {{- define "loculus.sharedPreproSpecs" }}
 {{ .key }}:
-  args:
-    {{- if .segment }}
-    segment: {{ .segment }}
-    {{- end }}
-    {{- if .type }}
-    type: {{ .type }}
-    {{- end }}
-    {{- if .order }}
-    order: 
-    {{- range .order }}
-      - {{ . }}
-    {{- end }}
-    {{- end }}
   {{- if .preprocessing }}
   {{- if hasKey .preprocessing "function" }}
   function: {{ index .preprocessing "function" }}
@@ -25,6 +12,18 @@
     {{- . | toYaml | nindent 4 }}
     {{- end }}
   {{- end }}
+  args:
+    {{- if .segment }}
+    segment: {{ .segment }}
+    {{- end }}
+    {{- if .type }}
+    type: {{ .type }}
+    {{- end }}
+    {{- if hasKey .preprocessing "args" }}
+    {{- with index .preprocessing "args" }}
+    {{- . | toYaml | nindent 4 }}
+    {{- end }}
+    {{- end }}
   {{- else }}
   function: identity
   inputs:
@@ -36,6 +35,13 @@
   {{- end }}
   {{- if .required}}
   required: true
+  args:
+    {{- if .segment }}
+    segment: {{ .segment }}
+    {{- end }}
+    {{- if .type }}
+    type: {{ .type }}
+    {{- end }}
   {{- end }}
 {{- end }}
 
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 00ff57967..16068f2ed 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -66,12 +66,13 @@ defaultOrganismConfig: &defaultOrganismConfig
         required: true
       - name: display_name
         header: "INSDC"
-        order: [geo_loc_country, accession_version, sample_collection_date]
         preprocessing:
           function: concatenate
           inputs:
             string: geo_loc_country
             date: sample_collection_date
+          args:
+            order: [geo_loc_country, accession_version, sample_collection_date]
         noInput: true
       - name: ncbi_release_date
         displayName: NCBI release date

From e9f6edfe3974a158b96e8f78025afffcf98ee63e Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:29:35 +0200
Subject: [PATCH 10/19] Add documentation.

---
 preprocessing/nextclade/README.md | 41 +++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/preprocessing/nextclade/README.md b/preprocessing/nextclade/README.md
index b24b2c8eb..f6576e6ef 100644
--- a/preprocessing/nextclade/README.md
+++ b/preprocessing/nextclade/README.md
@@ -73,3 +73,44 @@ prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp-
 ```
 
 Additionally, the `--keep-tmp-dir` is useful for debugging issues. The results of nextclade run will be stored in the temp directory, as well as a file called `submission_requests.json` which contains a log of the full submit requests that are sent to the backend.
+
+## Preprocessing Checks
+
+### Type Check
+
+Preprocessing checks that the type of each metadata field corresponds to the expected `type` value seen in the config. If no type is given we assume the metadata field should be of type string.
+
+### Required value Check
+
+Additionally, we check that if a field is required (i.e. `required` is true), that field is not None.
+
+### Custom Preprocessing Functions
+
+If no additional `preprocessing` field is specified we assume that field uses the `identity` function, i.e. the output should be the same as the input. If a specific `type` is given the input will be converted to that type.
+
+However, the `preprocessing` field can be customized to take an arbitrary number of input metadata fields, perform a function on them and then output the desired metadata field. We have defined the following preprocessing functions but more can be added for your own custom instance.
+
+0. `identity`: Return the input field in the desired type.
+1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format
+2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format
+3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them, separated by "/", in the order specified by the `args.order` parameter.
+
+Using these functions in your `values.yaml` will look like:
+
+```yaml
+- name: sample_collection_date
+  type: date
+  preprocessing:
+    function: process_date
+    inputs:
+      date: sample_collection_date
+  required: true
+- name: display_name
+  preprocessing:
+    function: concatenate
+    inputs:
+      string: geo_loc_country
+      date: sample_collection_date
+    args:
+      order: [geo_loc_country, accession_version, sample_collection_date]
+```

From 1d580bd41850a783e321bc52f42f563e7a3d3291 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 15:34:12 +0200
Subject: [PATCH 11/19] Fix little bug.

---
 preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index fae0bff7d..9397d431b 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -375,9 +375,7 @@ def get_metadata(
         spec_copy.args["accession_version"] = id
         filled_in_order: InputMetadata = {}
         for item in spec.args["order"]:
-            filled_in_order = add_input_metadata(
-                spec_copy, unprocessed, errors, filled_in_order, item, item
-            )
+            filled_in_order[item] = add_input_metadata(spec_copy, unprocessed, errors, item)
         spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]]
         args = spec_copy.args
 

From ce01807940569a657bfb3a7ab1260b160a843eff Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 16:08:03 +0200
Subject: [PATCH 12/19] Fix little config bug

---
 kubernetes/loculus/templates/_preprocessingFromValues.tpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
index e2f79743a..6392180d3 100644
--- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl
+++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -32,9 +32,9 @@
     {{- else }}
     input: {{ .name }}
     {{- end }}
-  {{- end }}
   {{- if .required}}
   required: true
+  {{- end }}
   args:
     {{- if .segment }}
     segment: {{ .segment }}

From 1024f12dccb17654726cf28cbc0a6d0c327f5ab1 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Fri, 5 Jul 2024 16:54:26 +0200
Subject: [PATCH 13/19] Fix required issue.

---
 kubernetes/loculus/templates/_preprocessingFromValues.tpl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
index 6392180d3..8b242e779 100644
--- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl
+++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -32,9 +32,6 @@
     {{- else }}
     input: {{ .name }}
     {{- end }}
-  {{- if .required}}
-  required: true
-  {{- end }}
   args:
     {{- if .segment }}
     segment: {{ .segment }}
@@ -43,6 +40,9 @@
     type: {{ .type }}
     {{- end }}
   {{- end }}
+  {{- if .required}}
+  required: true
+  {{- end }}
 {{- end }}
 
 {{- define "loculus.preprocessingSpecs" -}}

From 61fe7320a4ab2df22a845d64b0ca6090d25fc44b Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:44:26 +0200
Subject: [PATCH 14/19] Add suggestions

---
 kubernetes/loculus/templates/_preprocessingFromValues.tpl | 6 ++----
 kubernetes/loculus/values.yaml                            | 1 -
 .../src/loculus_preprocessing/processing_functions.py     | 2 ++
 website/src/components/SequenceDetailsPage/DataTable.tsx  | 4 ++--
 .../components/SequenceDetailsPage/getDataTableData.ts    | 8 ++++----
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/kubernetes/loculus/templates/_preprocessingFromValues.tpl b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
index 8b242e779..82ae2d102 100644
--- a/kubernetes/loculus/templates/_preprocessingFromValues.tpl
+++ b/kubernetes/loculus/templates/_preprocessingFromValues.tpl
@@ -19,10 +19,8 @@
     {{- if .type }}
     type: {{ .type }}
     {{- end }}
-    {{- if hasKey .preprocessing "args" }}
-    {{- with index .preprocessing "args" }}
-    {{- . | toYaml | nindent 4 }}
-    {{- end }}
+    {{- with (get .preprocessing "args") }}
+    {{ toYaml . | nindent 4 }}
     {{- end }}
   {{- else }}
   function: identity
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 16068f2ed..4c9880d5b 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -65,7 +65,6 @@ defaultOrganismConfig: &defaultOrganismConfig
             date: sample_collection_date
         required: true
       - name: display_name
-        header: "INSDC"
         preprocessing:
           function: concatenate
           inputs:
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index c67d15198..e068da633 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -323,6 +323,8 @@ def concatenate(
         try:
             concatenation_order = [formatted_input_data.get(i, accession_version) for i in order]
             result = "/".join(concatenation_order)
+            # To avoid downstream issues do not let the result start or end in a "/"
+            result = result.strip("/")
 
             return ProcessingResult(datum=result, warnings=warnings, errors=errors)
         except ValueError as e:
diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx
index 0f4ced393..64f187e1a 100644
--- a/website/src/components/SequenceDetailsPage/DataTable.tsx
+++ b/website/src/components/SequenceDetailsPage/DataTable.tsx
@@ -14,8 +14,8 @@ interface Props {
 const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistory }) => {
     return (
         <div>
-            {dataTableData.topmatter.displayName !== undefined && (
-                <div className='px-6 mb-4 italic'>{dataTableData.topmatter.displayName}</div>
+            {dataTableData.topmatter.sequenceDisplayName !== undefined && (
+                <div className='px-6 mb-4 italic'>{dataTableData.topmatter.sequenceDisplayName}</div>
             )}
             {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
                 <div className='px-6 mb-4'>
diff --git a/website/src/components/SequenceDetailsPage/getDataTableData.ts b/website/src/components/SequenceDetailsPage/getDataTableData.ts
index d0726b7c3..5f9b7a477 100644
--- a/website/src/components/SequenceDetailsPage/getDataTableData.ts
+++ b/website/src/components/SequenceDetailsPage/getDataTableData.ts
@@ -3,7 +3,7 @@ import type { TableDataEntry } from './types.ts';
 export type DataTableData = {
     topmatter: {
         authors: string[] | undefined;
-        displayName: string | undefined;
+        sequenceDisplayName: string | undefined;
     };
     table: {
         header: string;
@@ -15,7 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
     const result: DataTableData = {
         topmatter: {
             authors: undefined,
-            displayName: undefined,
+            sequenceDisplayName: undefined,
         },
         table: [],
     };
@@ -36,11 +36,11 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
         }
 
         if (
-            result.topmatter.displayName === undefined &&
+            result.topmatter.sequenceDisplayName === undefined &&
             entry.type.kind === 'metadata' &&
             entry.name === 'display_name'
         ) {
-            result.topmatter.displayName = entry.value.toString();
+            result.topmatter.sequenceDisplayName = entry.value.toString();
             continue;
         }
 

From 7fb8f54edf7d65ef96120c83e104b91b9a3035f3 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Mon, 8 Jul 2024 19:58:15 +0200
Subject: [PATCH 15/19] Let concatenate function take multiple input values of
 the same type by specifying type in a separate argument.

---
 kubernetes/loculus/values.yaml                |  5 ++--
 preprocessing/nextclade/README.md             |  7 ++---
 .../processing_functions.py                   | 26 +++++++++++--------
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
index 4c9880d5b..cede07fdd 100644
--- a/kubernetes/loculus/values.yaml
+++ b/kubernetes/loculus/values.yaml
@@ -68,10 +68,11 @@ defaultOrganismConfig: &defaultOrganismConfig
         preprocessing:
           function: concatenate
           inputs:
-            string: geo_loc_country
-            date: sample_collection_date
+            geo_loc_country: geo_loc_country
+            sample_collection_date: sample_collection_date
           args:
             order: [geo_loc_country, accession_version, sample_collection_date]
+            type: [string, string, date]
         noInput: true
       - name: ncbi_release_date
         displayName: NCBI release date
diff --git a/preprocessing/nextclade/README.md b/preprocessing/nextclade/README.md
index f6576e6ef..773441ed8 100644
--- a/preprocessing/nextclade/README.md
+++ b/preprocessing/nextclade/README.md
@@ -93,7 +93,7 @@ However, the `preprocessing` field can be customized to take an arbitrary number
 0. `identity`: Return the input field in the desired type.
 1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format
 2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format
-3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter.
+3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `args.order` parameter. Each field is first processed according to its entry in `args.type` (the types must be listed in the same order as the fields in `args.order`).
 
 Using these functions in your `values.yaml` will look like:
 
@@ -109,8 +109,9 @@ Using these functions in your `values.yaml` will look like:
    preprocessing:
       function: concatenate
       inputs:
-         string: geo_loc_country
-         date: sample_collection_date
+         geo_loc_country: geo_loc_country
+         sample_collection_date: sample_collection_date
       args:
          order: [geo_loc_country, accession_version, sample_collection_date]
+         type: [string, string, date]
 ```
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index e068da633..9e4c88eb1 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -287,6 +287,7 @@ def concatenate(
 
         accession_version = args["accession_version"]
         order = args["order"]
+        type = args["type"]
 
         # Check accessionVersion only exists once in the list:
         if number_fields != len(order):
@@ -304,25 +305,28 @@ def concatenate(
                 errors=errors,
             )
 
-        formatted_input_data = {}
-        for key, item in input_data.items():
-            if key == "date":
-                processed = ProcessingFunctions.process_date({key: item}, output_field)
-                formatted_input_data[item] = "" if processed.datum is None else processed.datum
+        formatted_input_data = []
+        for i in range(len(order)):
+            if type[i] == "date":
+                processed = ProcessingFunctions.process_date(
+                    {"date": input_data[order[i]]}, output_field
+                )
+                formatted_input_data.append("" if processed.datum is None else processed.datum)
                 errors += processed.errors
                 warnings += processed.warnings
-            elif key == "timestamp":
-                processed = ProcessingFunctions.parse_timestamp({key: item}, output_field)
-                formatted_input_data[item] = "" if processed.datum is None else processed.datum
+            if type[i] == "timestamp":
+                processed = ProcessingFunctions.parse_timestamp(
+                    {"timestamp": input_data[order[i]]}, output_field
+                )
+                formatted_input_data.append("" if processed.datum is None else processed.datum)
                 errors += processed.errors
                 warnings += processed.warnings
             else:
-                formatted_input_data[item] = item
+                formatted_input_data.append(input_data.get(order[i], accession_version))
         logging.debug(f"formatted input data:{formatted_input_data}")
 
         try:
-            concatenation_order = [formatted_input_data.get(i, accession_version) for i in order]
-            result = "/".join(concatenation_order)
+            result = "/".join(formatted_input_data)
             # To avoid downstream issues do not let the result start or end in a "/"
             result = result.strip("/")
 

From ca6904d3a91bbf91f323398dba0b8e604020e7fe Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Mon, 8 Jul 2024 20:22:24 +0200
Subject: [PATCH 16/19] Add changes in prepro I forgot to commit.

---
 preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 40dceba0b..87fc96536 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -431,10 +431,6 @@ def get_metadata(
     if spec.function == "concatenate":
         spec_copy = copy.deepcopy(spec)
         spec_copy.args["accession_version"] = id
-        filled_in_order: InputMetadata = {}
-        for item in spec.args["order"]:
-            filled_in_order[item] = add_input_metadata(spec_copy, unprocessed, errors, item)
-        spec_copy.args["order"] = [filled_in_order[item] for item in spec.args["order"]]
         args = spec_copy.args
 
     try:

From 497654b5e75133bd56bae6c59cd643fb9be5d2c8 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Mon, 8 Jul 2024 20:34:12 +0200
Subject: [PATCH 17/19] Fix if/elif bug: use elif for the timestamp branch so
 date fields do not also fall through to the else branch

---
 .../nextclade/src/loculus_preprocessing/processing_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
index 9e4c88eb1..ccddf4bc8 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -314,7 +314,7 @@ def concatenate(
                 formatted_input_data.append("" if processed.datum is None else processed.datum)
                 errors += processed.errors
                 warnings += processed.warnings
-            if type[i] == "timestamp":
+            elif type[i] == "timestamp":
                 processed = ProcessingFunctions.parse_timestamp(
                     {"timestamp": input_data[order[i]]}, output_field
                 )

From 70f9e760a3d21e985350314779384ea05d920c51 Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Mon, 8 Jul 2024 21:55:49 +0200
Subject: [PATCH 18/19] Add display name to header

---
 website/src/components/SequenceDetailsPage/DataTable.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/src/components/SequenceDetailsPage/DataTable.tsx b/website/src/components/SequenceDetailsPage/DataTable.tsx
index 2d1fc4b9d..e6651e5ad 100644
--- a/website/src/components/SequenceDetailsPage/DataTable.tsx
+++ b/website/src/components/SequenceDetailsPage/DataTable.tsx
@@ -19,7 +19,7 @@ const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistor
     return (
         <div>
             {dataTableData.topmatter.sequenceDisplayName !== undefined && (
-                <div className='px-6 mb-4 italic'>{dataTableData.topmatter.sequenceDisplayName}</div>
+                <div className='px-6 mb-4 italic'>Display Name: {dataTableData.topmatter.sequenceDisplayName}</div>
             )}
             {dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
                 <div className='px-6 mb-4'>

From cd1c9d8bb36d1f3ab21da10ceb20aaae817a7e2a Mon Sep 17 00:00:00 2001
From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com>
Date: Tue, 9 Jul 2024 09:48:41 +0200
Subject: [PATCH 19/19] Treat empty nextcladeMetadata as missing (use `not` instead of `is None`).

---
 preprocessing/nextclade/src/loculus_preprocessing/prepro.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
index 87fc96536..f8583f459 100644
--- a/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
+++ b/preprocessing/nextclade/src/loculus_preprocessing/prepro.py
@@ -386,7 +386,7 @@ def add_input_metadata(
     nextclade_prefix = "nextclade."
     if input_path.startswith(nextclade_prefix):
         segment = spec.args.get("segment", "main")
-        if unprocessed.nextcladeMetadata is None:
+        if not unprocessed.nextcladeMetadata:
             errors.append(
                 ProcessingAnnotation(
                     source=[