Skip to content

Commit

Permalink
MAG <- Sample containment mapping (#11)
Browse files Browse the repository at this point in the history
* [wip] adds containment relationship between MAGs and Samples

* adds tests for genome-sample mapping
  • Loading branch information
SandyRogers authored Jul 2, 2024
1 parent 326d2db commit a98c3ce
Show file tree
Hide file tree
Showing 19 changed files with 589 additions and 18 deletions.
15 changes: 14 additions & 1 deletion holofood/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from holofood.models import (
Sample,
SampleMetadataMarker,
SampleStructuredDatum,
AnalysisSummary,
GenomeCatalogue,
Expand All @@ -16,6 +15,7 @@
ViralCatalogue,
Animal,
AnimalStructuredDatum,
GenomeSampleContainment,
)


Expand Down Expand Up @@ -106,6 +106,19 @@ class GenomeCatalogueAdmin(ModelAdmin):
inlines = [GenomeInline]


class GenomeSampleContainmentInline(TabularInlinePaginated):
    """Paginated tabular inline listing GenomeSampleContainment rows."""

    model = GenomeSampleContainment

    # Pagination: five containment rows per inline page, with the full count shown.
    per_page = 5
    show_full_result_count = True

    # Rows may be deleted inline, and each row links to its own change page.
    show_change_link = True
    can_delete = True


@admin.register(Genome)
class GenomeAdmin(ModelAdmin):
    """Admin page for a Genome, with its sample containments shown inline."""

    inlines = [GenomeSampleContainmentInline]


class ViralFragmentInline(TabularInlinePaginated):
model = ViralFragment
fields = ["id", "cluster_representative", "viral_type"]
Expand Down
28 changes: 28 additions & 0 deletions holofood/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
ViralFragment,
Animal,
AnimalStructuredDatum,
GenomeSampleContainment,
)
from holofood.utils import holofood_config

Expand Down Expand Up @@ -194,6 +195,16 @@ class Config:
model_fields = ["accession", "cluster_representative", "taxonomy", "metadata"]


# API schema for one genome-in-sample containment record.
# Only the `sample` and `containment` model fields are exposed.
# (Documented with comments rather than a docstring so that nothing extra
# is picked up by the generated API schema.)
class GenomeSampleContainmentSchema(ModelSchema):
    class Config:
        model = GenomeSampleContainment
        model_fields = ["sample", "containment"]


# Genome detail schema: everything in GenomeSchema, plus the list of
# containment records read from the genome's `samples_containing` attribute.
class GenomeWithContainingSamplesSchema(GenomeSchema):
    samples_containing: List[GenomeSampleContainmentSchema]


class ViralCatalogueSchema(ModelSchema):
related_genome_catalogue: GenomeCatalogueSchema
analysis_summaries: List[RelatedAnalysisSummarySchema]
Expand Down Expand Up @@ -453,6 +464,23 @@ def list_genome_catalogue_genomes(request, catalogue_id: str):
return catalogue.genomes.all()


@api.get(
    "/genome-catalogues/{genome_catalogue_id}/genomes/{genome_id}",
    response=GenomeWithContainingSamplesSchema,
    summary="Fetch the detail of a Genome",
    # Adjacent string literals are concatenated verbatim, so every sentence
    # needs its own trailing space to render correctly in the API docs.
    description="A Genome is a Metagenomic Assembled Genome (MAG). "
    "Each MAG originates from HoloFood samples. "
    "Each MAG has also been clustered with MAGs from other projects. "
    "Each HoloFood MAG references the best representative of these clusters, in MGnify. "
    "Each MAG has also been searched in all of the project samples, "
    "to find samples which contain the kmers of the genome.",
    tags=[GENOMES],
    url_name="get_genome",
)
def get_genome(request, genome_catalogue_id: str, genome_id: str):
    """Return the Genome with accession `genome_id` from the given catalogue.

    Responds with a 404 if no such genome exists in that catalogue.
    """
    # Scope the lookup by the catalogue in the URL, so that a genome is not
    # served under the path of a catalogue it does not belong to.
    genome = get_object_or_404(
        Genome, accession=genome_id, catalogue_id=genome_catalogue_id
    )
    return genome


@api.get(
"/viral-catalogues",
response=List[ViralCatalogueSchema],
Expand Down
20 changes: 19 additions & 1 deletion holofood/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
ViralFragmentSchema,
AnimalSlimSchema,
AnimalStructuredDatumSchema,
GenomeSampleContainmentSchema,
)
from holofood.models import Sample, GenomeCatalogue, ViralCatalogue, Animal
from holofood.models import Sample, GenomeCatalogue, ViralCatalogue, Animal, Genome


class CSVRenderer(BaseRenderer):
Expand Down Expand Up @@ -114,6 +115,23 @@ def list_genome_catalogue_genomes(request, catalogue_id: str):
return catalogue.genomes.all()


@export_api.get(
    "/genome-catalogues/{genome_catalogue_id}/genomes/{genome_id}/samples_containing",
    response=List[GenomeSampleContainmentSchema],
    summary="Fetch the list of Samples containing a Genome, as a TSV",
    # Adjacent string literals are concatenated verbatim, so every sentence
    # needs its own trailing space to render correctly in the API docs.
    description="A Genome is a Metagenomic Assembled Genome (MAG). "
    "Each MAG originates from HoloFood samples. "
    "Each MAG has also been clustered with MAGs from other projects. "
    "Each HoloFood MAG references the best representative of these clusters, in MGnify. "
    "Each species representative MAG has also been searched in all of the project samples, "
    "to find samples which contain the kmers of the genome.",
    url_name="get_samples_containing_genome",
)
def get_genome(request, genome_catalogue_id: str, genome_id: str):
    """Return the containment records for the samples containing a genome.

    Responds with a 404 if the genome does not exist in the given catalogue.
    """
    # Scope the lookup by the catalogue in the URL, so that a genome is not
    # exported under the path of a catalogue it does not belong to.
    genome = get_object_or_404(
        Genome, accession=genome_id, catalogue_id=genome_catalogue_id
    )
    # Return a queryset (not the bare related manager), as the other list
    # endpoints in this module do.
    return genome.samples_containing.all()


@export_api.get(
"/viral-catalogues/{catalogue_id}/fragments",
response=List[ViralFragmentSchema],
Expand Down
32 changes: 31 additions & 1 deletion holofood/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,17 @@
OuterRef,
Func,
)
from django.forms import NumberInput
from django.utils.safestring import mark_safe

from holofood.models import Sample, Genome, ViralFragment, Animal, AnimalStructuredDatum
from holofood.models import (
Sample,
Genome,
ViralFragment,
Animal,
AnimalStructuredDatum,
GenomeSampleContainment,
)
from holofood.utils import holofood_config


Expand Down Expand Up @@ -123,6 +131,28 @@ class Meta:
}


class GenomeSampleContainmentFilter(django_filters.FilterSet):
    """Filters for genome-sample containment records.

    Supports a lower bound on the containment value, plus case-insensitive
    substring matching on the sample accession and the sample's animal accession.
    """

    # Keeps records whose `containment` is >= the submitted value.
    # Validation accepts 0.0-1.0; the range-slider widget itself starts at 0.2.
    minimum_containment = django_filters.NumberFilter(
        label="Minimum containment",
        help_text=mark_safe("Fraction of MAG kmers present in samples"),
        field_name="containment",
        lookup_expr="gte",
        min_value=0.0,
        max_value=1.0,
        widget=NumberInput(
            attrs={"type": "range", "min": "0.2", "max": "1.0", "step": "0.05"}
        ),
    )

    class Meta:
        model = GenomeSampleContainment

        fields = {
            "sample__accession": ["icontains"],
            "sample__animal__accession": ["icontains"],
        }


class ViralFragmentFilter(django_filters.FilterSet):
ALL = "Include species-cluster members"
REPS = "Species-cluster representatives only"
Expand Down
2 changes: 1 addition & 1 deletion holofood/management/commands/import_mag_catalogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def add_arguments(self, parser):
parser.add_argument(
"catalogue_file",
type=argparse.FileType("r"),
help="Path to the TSV file listing viral sequences",
help="Path to the TSV file listing MAGs.",
)
parser.add_argument(
"title",
Expand Down
103 changes: 103 additions & 0 deletions holofood/management/commands/import_mag_sample_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import argparse
import logging
from csv import DictReader

from django.core.management.base import BaseCommand, CommandError

from holofood.models import GenomeSampleContainment, Genome, Sample


class Command(BaseCommand):
    """Import genome-sample containment values from a TSV file.

    Each TSV row names a cluster representative (`mgyg`), a sample
    (`sample_accession`) and a `containment` value. For every Genome whose
    `cluster_representative` matches `mgyg`, a GenomeSampleContainment to the
    sample is created. If one already exists, its containment is raised to the
    new value when the new value is higher.
    """

    help = (
        "Import mappings between MAGs and samples, from a TSV file. "
        "Needs columns of at least `mgyg`, `sample_accession`, `containment`."
    )

    # Columns that must be present in the TSV header.
    REQUIRED_COLUMNS = ("mgyg", "containment", "sample_accession")

    def add_arguments(self, parser):
        """Register the positional TSV file and the optional preclear option."""
        parser.add_argument(
            "mapping_file",
            type=argparse.FileType("r"),
            help="Path to the TSV file listing MAG – Sample pairs.",
        )
        parser.add_argument(
            "--catalogue_id_to_preclear",
            type=str,
            help="(Optional) ID of a MAG catalogue, in slug form, "
            "to clear all sample maps from prior to inserting new ones.",
            default=None,
        )

    def handle(self, *args, **options):
        """Validate the TSV header, optionally preclear, then import every row.

        Raises:
            CommandError: if any required column is missing from the TSV.
        """
        tsv_file = options["mapping_file"]
        catalogue_id_to_preclear = options["catalogue_id_to_preclear"]

        try:
            reader = DictReader(tsv_file, delimiter="\t")

            # `fieldnames` is None for an empty file; treat that as "no columns".
            missing = set(self.REQUIRED_COLUMNS).difference(reader.fieldnames or ())
            if missing:
                raise CommandError(
                    f"Not all expected columns were found in the TSV. {missing=}"
                )

            # Only delete existing data once the input is known to be usable.
            if catalogue_id_to_preclear:
                self._preclear_catalogue(catalogue_id_to_preclear)

            for mapping in reader:
                self._import_mapping(mapping)
        finally:
            # Close the argparse-opened file even if the import fails part-way.
            tsv_file.close()

        self.stdout.write(self.style.SUCCESS("Done"))

    def _preclear_catalogue(self, catalogue_id):
        """Delete all containments belonging to genomes of the given catalogue."""
        existing_containments = GenomeSampleContainment.objects.filter(
            genome__catalogue_id=catalogue_id
        )
        logging.info(
            f"Deleting {existing_containments.count()} existing containments from genomes in {catalogue_id}"
        )
        existing_containments.delete()

    def _import_mapping(self, mapping):
        """Create or update the containments described by a single TSV row.

        Rows naming an unknown sample, or a cluster representative with no
        genomes, are reported as warnings and skipped.
        """
        mgyg = mapping["mgyg"]
        sample_accession = mapping["sample_accession"]
        logging.info(f"Importing mapping for {mgyg} to sample {sample_accession}")

        try:
            sample = Sample.objects.get(accession=sample_accession)
        except Sample.DoesNotExist:
            self.stdout.write(
                self.style.WARNING(f"Sample {sample_accession} does not exist.")
            )
            return

        # Evaluate once: the result is checked for emptiness, counted and iterated.
        genomes = list(Genome.objects.filter(cluster_representative=mgyg))
        if not genomes:
            self.stdout.write(
                self.style.WARNING(f"Genomes with cluster rep {mgyg} do not exist.")
            )
            return
        logging.debug(f"Found {len(genomes)} Genomes")

        # Parse once, so creation and comparison both use a real float.
        containment = float(mapping["containment"])

        for genome in genomes:
            (
                genome_sample_containment,
                created,
            ) = GenomeSampleContainment.objects.get_or_create(
                genome=genome,
                sample=sample,
                defaults={"containment": containment},
            )
            if created:
                logging.debug(
                    f"Created genome-sample-containment {genome_sample_containment}"
                )
            elif containment > genome_sample_containment.containment:
                logging.info(
                    f"Genome-sample-containment {genome_sample_containment} already exists, "
                    f"but updating containment to the higher value {containment}"
                )
                # Persist the higher value; previously it was only logged.
                genome_sample_containment.containment = containment
                genome_sample_containment.save(update_fields=["containment"])
                logging.debug(
                    f"Updated genome-sample-containment {genome_sample_containment}"
                )
            else:
                logging.debug(
                    f"Genome-sample-containment {genome_sample_containment} already exists "
                    f"with an equal or higher containment; left unchanged"
                )
47 changes: 47 additions & 0 deletions holofood/migrations/0037_genomesamplecontainment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Generated by Django 4.2 on 2024-06-20 10:54

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Must be applied after migration 0036 of the holofood app.
    dependencies = [
        ("holofood", "0036_alter_animalstructureddatum_measurement_and_more"),
    ]

    # Creates the GenomeSampleContainment table: an auto primary key, a float
    # `containment` (default 0), and cascading foreign keys to Genome and Sample.
    operations = [
        migrations.CreateModel(
            name="GenomeSampleContainment",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("containment", models.FloatField(default=0)),
                (
                    "genome",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="samples_containing",
                        to="holofood.genome",
                    ),
                ),
                (
                    "sample",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="genomes_contained",
                        to="holofood.sample",
                    ),
                ),
            ],
            options={
                # Default ordering: by genome, then highest containment first.
                "ordering": ("genome", "-containment"),
            },
        ),
    ]
46 changes: 46 additions & 0 deletions holofood/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,52 @@ class Meta:
ordering = ("accession",)


class GenomeSampleContainmentManager(models.Manager):
    """Default manager that eagerly loads the relations of each containment."""

    def get_queryset(self):
        """Return containments with sample, genome and animal joined in.

        The animal's structured metadata is also prefetched, restricted to the
        marker names configured as default columns for the animals list, and
        stored on each animal as `primary_metadata`.
        """
        marker_names = (
            holofood_config.tables.animals_list.default_metadata_marker_columns
        )
        primary_metadata_prefetch = Prefetch(
            "sample__animal__structured_metadata",
            queryset=AnimalStructuredDatum.objects.filter(
                marker__name__in=marker_names
            ),
            to_attr="primary_metadata",
        )

        containments = super().get_queryset()
        # One call naming all three relations is equivalent to chaining them.
        containments = containments.select_related(
            "sample", "genome", "sample__animal"
        )
        return containments.prefetch_related(primary_metadata_prefetch)


class GenomeSampleContainment(models.Model):
    """
    A record of a genome being present ("contained") within a metagenomic sample,
    together with the containment value for that genome-sample pair.
    """

    # Deleting the sample or the genome also deletes the containment record.
    sample = models.ForeignKey(
        Sample, on_delete=models.CASCADE, related_name="genomes_contained"
    )
    genome = models.ForeignKey(
        Genome, on_delete=models.CASCADE, related_name="samples_containing"
    )
    containment = models.FloatField(default=0)

    # Default manager joins in sample, genome and animal data on every query.
    objects = GenomeSampleContainmentManager()

    class Meta:
        # Grouped by genome, highest containment first.
        # NOTE(review): no uniqueness constraint on (genome, sample) is declared
        # here - confirm whether duplicate pairs are meant to be possible.
        ordering = ("genome", "-containment")

    def __str__(self):
        return f"Containment of {self.genome} in {self.sample}"


class ViralCatalogue(models.Model):
"""
A collection of (probable) viral fragments detected in the metagenomic reads.
Expand Down
Loading

0 comments on commit a98c3ce

Please sign in to comment.