Skip to content

Commit

Permalink
feat(website): Create a standardized sequence name with the format: {…
Browse files Browse the repository at this point in the history
…country}/{AccessionVersion}/{date} (#2246)

* Add concatenate function to preprocessing.

* Add displayName in italics below loculus accession on webpage.

* Make values.yaml more logical: allow setting args in values.yaml to allow specification of concatenation order and type.

* Add documentation on how preprocessing functions work
  • Loading branch information
anna-parker authored Jul 9, 2024
1 parent a8f78d7 commit 0a96d23
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 45 deletions.
2 changes: 2 additions & 0 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ organism: ebola-zaire
# taxon_id: 3052518
# backend_url: http://localhost:8079/
# keycloak_token_url: http://localhost:8083/realms/loculus/protocol/openid-connect/token
# nextclade_dataset_name: nextstrain/cchfv/linked
# nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
# organism: cchf
# nucleotide_sequences:
# - M
Expand Down
24 changes: 17 additions & 7 deletions kubernetes/loculus/templates/_preprocessingFromValues.tpl
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
{{- define "loculus.sharedPreproSpecs" }}
{{ .key }}:
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- if .preprocessing }}
{{- if hasKey .preprocessing "function" }}
function: {{ index .preprocessing "function" }}
Expand All @@ -19,6 +12,16 @@
{{- . | toYaml | nindent 4 }}
{{- end }}
{{- end }}
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- with (get .preprocessing "args") }}
{{ toYaml . | nindent 4 }}
{{- end }}
{{- else }}
function: identity
inputs:
Expand All @@ -27,6 +30,13 @@
{{- else }}
input: {{ .name }}
{{- end }}
args:
{{- if .segment }}
segment: {{ .segment }}
{{- end }}
{{- if .type }}
type: {{ .type }}
{{- end }}
{{- end }}
{{- if .required}}
required: true
Expand Down
10 changes: 10 additions & 0 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,16 @@ defaultOrganismConfig: &defaultOrganismConfig
inputs:
date: sample_collection_date
required: true
- name: display_name
preprocessing:
function: concatenate
inputs:
geo_loc_country: geo_loc_country
sample_collection_date: sample_collection_date
args:
order: [geo_loc_country, accession_version, sample_collection_date]
type: [string, string, date]
noInput: true
- name: ncbi_release_date
displayName: NCBI release date
type: date
Expand Down
42 changes: 42 additions & 0 deletions preprocessing/nextclade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,45 @@ prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp-
```
Additionally, the `--keep-tmp-dir` is useful for debugging issues. The results of nextclade run will be stored in the temp directory, as well as a file called `submission_requests.json` which contains a log of the full submit requests that are sent to the backend.
## Preprocessing Checks
### Type Check
Preprocessing checks that the type of each metadata field corresponds to the expected `type` value seen in the config. If no type is given we assume the metadata field should be of type string.
### Required value Check
Additionally, we check that if a field is required (i.e. `required` is true), then that field is not None.
### Custom Preprocessing Functions
If no additional `preprocessing` field is specified we assume that field uses the `identity` function, i.e. the output should be the same as the input. If a specific `type` is given the input will be converted to that type.
However, the `preprocessing` field can be customized to take an arbitrary number of input metadata fields, perform a function on them and then output the desired metadata field. We have defined the following preprocessing functions but more can be added for your own custom instance.
0. `identity`: Return the input field in the desired type.
1. `process_date`: Take a date string and return a date field in the "%Y-%m-%d" format
2. `parse_timestamp`: Take a timestamp e.g. 2022-11-01T00:00:00Z and return that field in the "%Y-%m-%d" format
3. `concatenate`: Take multiple metadata fields (including the accessionVersion) and concatenate them in the order specified by the `arg.order` parameter; fields will first be processed based on their `arg.type` (the order of the types should correspond to the order of fields specified by the `order` argument).
Using these functions in your `values.yaml` will look like:
```
- name: sample_collection_date
type: date
preprocessing:
function: process_date
inputs:
date: sample_collection_date
required: true
- name: display_name
preprocessing:
function: concatenate
inputs:
geo_loc_country: geo_loc_country
sample_collection_date: sample_collection_date
args:
order: [geo_loc_country, accession_version, sample_collection_date]
type: [string, string, date]
```
90 changes: 52 additions & 38 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import csv
import json
import logging
Expand Down Expand Up @@ -374,7 +375,48 @@ def null_per_backend(x: Any) -> bool:
return False


def add_input_metadata(
    spec: ProcessingSpec,
    unprocessed: UnprocessedAfterNextclade,
    errors: list[ProcessingAnnotation],
    input_path: str,
) -> str | None:
    """Return the value of `input_path` in the unprocessed metadata.

    Fields prefixed with "nextclade." are looked up in the Nextclade output
    for the segment named by `spec.args["segment"]` (default "main"); all
    other fields are read from the submitted input metadata. Returns None
    when the value cannot be found, appending a ProcessingAnnotation to
    `errors` if the sequence failed to align (no Nextclade metadata at all).
    """
    # If field starts with "nextclade.", take from nextclade metadata
    nextclade_prefix = "nextclade."
    if input_path.startswith(nextclade_prefix):
        # NOTE(review): assumes spec.args is already a dict — the caller
        # normalizes a None args to {} before invoking this; confirm.
        segment = spec.args.get("segment", "main")
        if not unprocessed.nextcladeMetadata:
            errors.append(
                ProcessingAnnotation(
                    source=[
                        AnnotationSource(
                            name="main",
                            type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
                        )
                    ],
                    message="Nucleotide sequence failed to align",
                )
            )
            return None
        if segment not in unprocessed.nextcladeMetadata:
            return None
        sub_path = input_path[len(nextclade_prefix) :]
        value = dpath.get(
            unprocessed.nextcladeMetadata[segment],
            sub_path,
            separator=".",
            default=None,
        )
        # Bug fix: previously str(...) wrapped the whole lookup, turning a
        # missing value (None) into the literal string "None".
        return None if value is None else str(value)
    return unprocessed.inputMetadata.get(input_path)


def get_metadata(
id: AccessionVersion,
spec: ProcessingSpec,
output_field: str,
unprocessed: UnprocessedAfterNextclade,
Expand All @@ -383,46 +425,17 @@ def get_metadata(
) -> ProcessingResult:
input_data: InputMetadata = {}
for arg_name, input_path in spec.inputs.items():
input_data[arg_name] = None
# If field starts with "nextclade.", take from nextclade metadata
nextclade_prefix = "nextclade."
if input_path.startswith(nextclade_prefix):
# Remove "nextclade." prefix
if spec.args is None:
spec.args = {}
segment = spec.args.get("segment", "main")
if not unprocessed.nextcladeMetadata:
errors.append(
ProcessingAnnotation(
source=[
AnnotationSource(
name="main",
type=AnnotationSourceType.NUCLEOTIDE_SEQUENCE,
)
],
message="Nucleotide sequence failed to align",
)
)
continue
sub_path = input_path[len(nextclade_prefix) :]
if segment in unprocessed.nextcladeMetadata:
input_data[arg_name] = str(
dpath.get(
unprocessed.nextcladeMetadata[segment],
sub_path,
separator=".",
default=None,
)
)
else:
input_data[arg_name] = None
continue
if input_path not in unprocessed.inputMetadata:
continue
input_data[arg_name] = unprocessed.inputMetadata[input_path]
input_data[arg_name] = add_input_metadata(spec, unprocessed, errors, input_path)
args = spec.args

if spec.function == "concatenate":
spec_copy = copy.deepcopy(spec)
spec_copy.args["accession_version"] = id
args = spec_copy.args

try:
processing_result = ProcessingFunctions.call_function(
spec.function, spec.args, input_data, output_field
spec.function, args, input_data, output_field
)
except Exception as e:
msg = f"Processing for spec: {spec} with input data: {input_data} failed with {e}"
Expand Down Expand Up @@ -491,6 +504,7 @@ def process_single(
)
spec.args = {} if spec.args is None else spec.args
processing_result = get_metadata(
id,
spec,
output_field,
unprocessed,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,79 @@ def parse_timestamp(
errors=errors,
)

@staticmethod
def concatenate(
    input_data: InputMetadata, output_field: str, args: FunctionArgs = None
) -> ProcessingResult:
    """Concatenates input fields with accession_version using the "/" separator in the order
    specified by the order argument.

    `args` must provide:
      - "accession_version": the accession version string to splice in,
      - "order": output order of all fields (the input fields plus accession_version),
      - "type": per-field types ("date", "timestamp", or anything else for
        plain strings), aligned index-by-index with "order".

    Returns a ProcessingResult whose datum is the joined string, or a
    ProcessingResult with datum=None and an error annotation on failure.
    """
    warnings: list[ProcessingAnnotation] = []
    errors: list[ProcessingAnnotation] = []

    # +1 accounts for accession_version, which is not part of input_data.
    number_fields = len(input_data.keys()) + 1

    accession_version = args["accession_version"]
    order = args["order"]
    field_types = args["type"]  # local rename: `type` shadowed the builtin

    # `order` must list every input field plus accession_version exactly once;
    # otherwise fields and types cannot be aligned.
    if number_fields != len(order):
        errors.append(
            ProcessingAnnotation(
                source=[
                    AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
                ],
                message="Concatenation failed.",
            )
        )
        return ProcessingResult(
            datum=None,
            warnings=warnings,
            errors=errors,
        )

    formatted_input_data = []
    for i, field_name in enumerate(order):
        field_type = field_types[i]
        if field_type == "date":
            processed = ProcessingFunctions.process_date(
                {"date": input_data[field_name]}, output_field
            )
            formatted_input_data.append("" if processed.datum is None else processed.datum)
            errors += processed.errors
            warnings += processed.warnings
        elif field_type == "timestamp":
            processed = ProcessingFunctions.parse_timestamp(
                {"timestamp": input_data[field_name]}, output_field
            )
            formatted_input_data.append("" if processed.datum is None else processed.datum)
            errors += processed.errors
            warnings += processed.warnings
        else:
            # A name absent from input_data is the accession_version slot —
            # fall back to the accession version string itself.
            formatted_input_data.append(input_data.get(field_name, accession_version))
    logging.debug(f"formatted input data:{formatted_input_data}")

    try:
        result = "/".join(formatted_input_data)
        # To avoid downstream issues do not let the result start or end in a "/"
        result = result.strip("/")

        return ProcessingResult(datum=result, warnings=warnings, errors=errors)
    except (TypeError, ValueError):
        # Bug fix: str.join raises TypeError (not ValueError) when an item is
        # None, so the former `except ValueError` missed the realistic
        # failure mode of a missing (None) input value.
        errors.append(
            ProcessingAnnotation(
                source=[
                    AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
                ],
                message="Concatenation failed.",
            )
        )
        return ProcessingResult(
            datum=None,
            errors=errors,
            warnings=warnings,
        )

@staticmethod
def identity(
input_data: InputMetadata, output_field: str, args: FunctionArgs = None
Expand Down
3 changes: 3 additions & 0 deletions website/src/components/SequenceDetailsPage/DataTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ const DataTableComponent: React.FC<Props> = ({ dataTableData, dataUseTermsHistor

return (
<div>
{dataTableData.topmatter.sequenceDisplayName !== undefined && (
<div className='px-6 mb-4 italic'>Display Name: {dataTableData.topmatter.sequenceDisplayName}</div>
)}
{dataTableData.topmatter.authors !== undefined && dataTableData.topmatter.authors.length > 0 && (
<div className='px-6 mb-4'>
<AuthorList authors={dataTableData.topmatter.authors} />
Expand Down
11 changes: 11 additions & 0 deletions website/src/components/SequenceDetailsPage/getDataTableData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type { TableDataEntry } from './types.ts';
export type DataTableData = {
topmatter: {
authors: string[] | undefined;
sequenceDisplayName: string | undefined;
};
table: {
header: string;
Expand All @@ -14,6 +15,7 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
const result: DataTableData = {
topmatter: {
authors: undefined,
sequenceDisplayName: undefined,
},
table: [],
};
Expand All @@ -33,6 +35,15 @@ export function getDataTableData(listTableDataEntries: TableDataEntry[]): DataTa
continue;
}

if (
result.topmatter.sequenceDisplayName === undefined &&
entry.type.kind === 'metadata' &&
entry.name === 'display_name'
) {
result.topmatter.sequenceDisplayName = entry.value.toString();
continue;
}

if (!tableHeaderMap.has(entry.header)) {
tableHeaderMap.set(entry.header, []);
}
Expand Down

0 comments on commit 0a96d23

Please sign in to comment.