Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(don't merge) test branch to test ingest change effects #2837

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
2 changes: 1 addition & 1 deletion ingest/scripts/call_loculus.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ def record_factory(*args, **kwargs):
if mode == "get-submitted":
logger.info("Getting submitted sequences")
response = get_submitted(config)
Path(output).write_text(json.dumps(response), encoding="utf-8")
Path(output).write_text(json.dumps(response, indent=4), encoding="utf-8")


if __name__ == "__main__":
Expand Down
49 changes: 39 additions & 10 deletions ingest/scripts/group_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@
import orjsonl
import yaml


def sort_authors(authors: str) -> str:
"""Sort authors alphabetically"""
return ", ".join(sorted(authors.split(", ")))


def values_with_sorted_authors(values: dict[str, str]) -> dict[str, str]:
"""Sort authors values and return modified values"""
values_copy = values.copy()
values_copy["authors"] = sort_authors(values_copy["authors"])
return values_copy


logger = logging.getLogger(__name__)
logging.basicConfig(
encoding="utf-8",
Expand All @@ -48,7 +61,8 @@ class Config:
segmented: bool


SPECIAL_FIELDS: Final = {"segment", "submissionId"}
# submissionId is actually NCBI accession
INTRINSICALLY_SEGMENT_SPECIFIC_FIELDS: Final = {"segment", "submissionId"}


@click.command()
Expand Down Expand Up @@ -94,27 +108,38 @@ def main(
# Group segments according to isolate, collection date and isolate specific values
# These are the fields that are expected to be identical across all segments for a given isolate

# Dynamically determine the fields that are present in the metadata
first_row = next(iter(segment_metadata.values()))
if not first_row:
msg = "No data found in metadata file"
raise ValueError(msg)
all_fields = first_row.keys()
all_fields = sorted(first_row.keys())
logger.debug(f"All metadata fields: {all_fields}")

# Metadata fields can vary between segments w/o indicating being from different assemblies
insdc_segment_specific_fields = set(config.insdc_segment_specific_fields)
insdc_segment_specific_fields.add("hash")

shared_fields = set(all_fields) - insdc_segment_specific_fields - SPECIAL_FIELDS
# Fields that in principle should be identical for all segments of the same assembly
shared_fields = sorted(
set(all_fields) - insdc_segment_specific_fields - INTRINSICALLY_SEGMENT_SPECIFIC_FIELDS
)
logger.debug(f"Shared metadata fields: {shared_fields}")

# Build equivalence classes based on shared fields
# Use shared fields as the key to group the data
type SegmentName = str
type Accession = str
type EquivalenceClasses = dict[tuple[str, str], dict[SegmentName, list[Accession]]]

# Creating the nested defaultdict with type hints
equivalence_classes: EquivalenceClasses = defaultdict(lambda: defaultdict(list))
for accession, values in segment_metadata.items():
group_key = tuple((field, values[field]) for field in shared_fields if values[field])
# Author order sometimes varies among segments from same isolate
# Example: JX999734.1 (L) and JX999735.1 (M)
modified_values = values_with_sorted_authors(values)
group_key = str(
tuple((field, value) for field in shared_fields if (value := modified_values[field]))
)
segment = values["segment"]
equivalence_classes[group_key][segment].append(accession)

Expand Down Expand Up @@ -189,11 +214,15 @@ def main(

for field in shared_fields:
values = {segment: segment_metadata[group[segment]][field] for segment in group}
deduplicated_values = set(values.values())
if len(deduplicated_values) != 1:
msg = f"Assertion failed: values for group must be identical: {values}"
raise ValueError(msg)
row[field] = deduplicated_values.pop()
deduplicated_values = sorted(set(values.values()))
if len(deduplicated_values) > 1:
if field == "authors":
# For authors, we accept different orders
logger.info(f"Author orders differ for group {joint_key}: {values}")
else:
msg = f"Assertion failed: values for group must be identical: {values}"
raise ValueError(msg)
row[field] = deduplicated_values[0]

for field in insdc_segment_specific_fields:
for segment in config.nucleotide_sequences:
Expand Down
2 changes: 1 addition & 1 deletion ingest/scripts/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def split_authors(authors: str) -> str:
else:
result.append(single_split[i].strip())

return ", ".join(sorted(result))
return ", ".join(result)


@click.command()
Expand Down
Loading