Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update individuals with sample qc #4663

Open
wants to merge 9 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 46 additions & 2 deletions seqr/management/commands/check_for_new_samples_from_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import re

from reference_data.models import GENOME_VERSION_LOOKUP
from seqr.models import Family, Sample, SavedVariant, Project
from seqr.models import Family, Sample, SavedVariant, Project, Individual
from seqr.utils.communication_utils import safe_post_to_slack, send_project_email
from seqr.utils.file_utils import file_iter, list_files, is_google_bucket_file_path
from seqr.utils.search.add_data_utils import notify_search_data_loaded, update_airtable_loading_tracking_status
from seqr.utils.search.utils import parse_valid_variant_id
from seqr.utils.search.hail_search_utils import hail_variant_multi_lookup, search_data_type
from seqr.views.apis.data_manager_api import DATA_TYPE_MAP
from seqr.views.utils.airtable_utils import AirtableSession, LOADABLE_PDO_STATUSES, AVAILABLE_PDO_STATUS
from seqr.views.utils.dataset_utils import match_and_update_search_samples
from seqr.views.utils.export_utils import write_multiple_files
Expand Down Expand Up @@ -42,6 +43,12 @@
'CallsetCompletionDate', 'Project', 'Metrics Checked', 'gCNV_SV_CallsetPath', 'DRAGENShortReadCallsetPath',
]

# Maps sample QC filter-flag names (as emitted by the loading pipeline) to the
# column in the QC record that holds the corresponding metric value. Flags not
# listed here are looked up in the record under their own name.
QC_FILTER_FLAG_COL_MAP = {
    'callrate': 'filtered_callrate',
    'contamination': 'contamination_rate',
    'coverage_exome': 'percent_bases_at_20x',
    'coverage_genome': 'mean_coverage'
}

class Command(BaseCommand):
help = 'Check for newly loaded seqr samples'
Expand Down Expand Up @@ -221,7 +228,7 @@ def _load_new_samples(cls, metadata_path, genome_version, dataset_type, run_vers

sample_type = metadata['sample_type']
logger.info(f'Loading {len(sample_project_tuples)} {sample_type} {dataset_type} samples in {len(samples_by_project)} projects')
new_samples, *args = match_and_update_search_samples(
new_samples, updated_samples, *args = match_and_update_search_samples(
projects=samples_by_project.keys(),
sample_project_tuples=sample_project_tuples,
sample_data={'data_source': run_version, 'elasticsearch_index': ';'.join(metadata['callsets'])},
Expand All @@ -242,6 +249,10 @@ def _load_new_samples(cls, metadata_path, genome_version, dataset_type, run_vers
except Exception as e:
logger.error(f'Error reporting loading failure for {run_version}: {e}')

# Update sample qc
if 'sample_qc' in metadata:
cls.update_individuals_sample_qc(sample_type, updated_samples, metadata['sample_qc'])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should wrap this in a try/except block that logs an error because if something goes wrong somehow we would still want the rest of the job to proceed


# Reload saved variant JSON
updated_variants_by_id = update_projects_saved_variant_json([
(project.id, project.name, project.genome_version, families) for project, families in families_by_project.items()
Expand Down Expand Up @@ -350,6 +361,39 @@ def _update_pdos(session, project_guid, sample_ids):

return sorted(pdos_to_create.keys())

@classmethod
def update_individuals_sample_qc(cls, sample_type, updated_samples, sample_qc_map):
    """Update Individual models with sample QC results from a pipeline run.

    Args:
        sample_type (str): loaded sample type (e.g. 'WES'/'WGS'); used to resolve
            the generic 'coverage' filter flag to a sample-type-specific column.
        updated_samples: iterable/queryset of Sample models updated in this run.
        sample_qc_map (dict): maps individual ID to its sample QC record.

    Sets filter_flags, pop_platform_filters, and population on each matched
    Individual and persists them in a single bulk update.
    """
    individuals_by_id = {
        i.individual_id: i for i in Individual.objects.filter(sample__in=updated_samples)
    }

    updated_individuals = []
    for individual_id, record in sample_qc_map.items():
        individual = individuals_by_id.get(individual_id)
        if individual is None:
            # A QC record with no matching loaded individual should not abort
            # the updates for the rest of the samples
            logger.error(f'No loaded individual found for sample QC record {individual_id}')
            continue

        filter_flags = {}
        for flag in record['filter_flags']:
            # The generic 'coverage' flag is stored in a sample-type-specific column
            if flag == 'coverage':
                flag = f'coverage_{DATA_TYPE_MAP[sample_type.lower()]}'
            flag_col = QC_FILTER_FLAG_COL_MAP.get(flag, flag)
            filter_flags[flag] = record[flag_col]

        pop_platform_filters = {
            flag: record[f'sample_qc.{flag}'] for flag in record['qc_metrics_filters']
        }

        individual.filter_flags = filter_flags
        individual.pop_platform_filters = pop_platform_filters
        individual.population = record['qc_gen_anc'].upper()
        updated_individuals.append(individual)

    if updated_individuals:
        Individual.bulk_update_models(
            user=None,
            models=updated_individuals,
            fields=['filter_flags', 'pop_platform_filters', 'population'],
        )


@classmethod
def _reload_shared_variant_annotations(cls, data_type, genome_version, updated_variants_by_id=None, exclude_families=None, chromosomes=None):
dataset_type = data_type.split('_')[0]
Expand Down
71 changes: 71 additions & 0 deletions seqr/management/tests/check_for_new_samples_from_pipeline_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,57 @@
},
},
'relatedness_check_file_path': 'gs://seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/test_callset_hash.tsv',
'sample_qc': {
'NA20885': {
'filtered_callrate': 1.0,
'contamination_rate': 5.0,
'percent_bases_at_20x': 90.0,
'mean_coverage': 28.0,
'filter_flags': ['coverage'],
'pca_scores': [0.1 for _ in range(20)],
'prob_afr': 0.02,
'prob_ami': 0.0,
'prob_amr': 0.02,
'prob_asj': 0.9,
'prob_eas': 0.0,
'prob_fin': 0.0,
'prob_mid': 0.0,
'prob_nfe': 0.05,
'prob_sas': 0.01,
**{f'pop_PC{i + 1}': 0.1 for i in range(20)},
'qc_gen_anc': 'oth',
'sample_qc.call_rate': 1.0,
'sample_qc.n_called': 30,
'sample_qc.n_not_called': 0,
'sample_qc.n_filtered': 0,
'sample_qc.n_hom_ref': 17,
'sample_qc.n_het': 3,
'sample_qc.n_hom_var': 10,
'sample_qc.n_non_ref': 13,
'sample_qc.n_singleton': 0,
'sample_qc.n_snp': 23,
'sample_qc.n_insertion': 0,
'sample_qc.n_deletion': 0,
'sample_qc.n_transition': 13,
'sample_qc.n_transversion': 10,
'sample_qc.n_star': 0,
'sample_qc.r_ti_tv': 1.3,
'sample_qc.r_het_hom_var': 0.3,
'sample_qc.r_insertion_deletion': None,
'sample_qc.f_inbreeding.f_stat': -0.038400752079048056,
'sample_qc.f_inbreeding.n_called': 30,
'sample_qc.f_inbreeding.expected_homs': 27.11094199999999,
'sample_qc.f_inbreeding.observed_homs': 27,
'fail_n_snp': True,
'fail_r_ti_tv': False,
'fail_r_insertion_deletion': None,
'fail_n_insertion': True,
'fail_n_deletion': True,
'fail_r_het_hom_var': False,
'fail_call_rate': False,
'qc_metrics_filters': ['n_deletion', 'n_insertion', 'n_snp'],
}
}
}, {
'callsets': ['invalid_family.vcf'],
'sample_type': 'WGS',
Expand Down Expand Up @@ -401,6 +452,10 @@ def test_command(self, mock_email, mock_temp_dir):
('update 2 Familys', {'dbUpdate': mock.ANY}),
] + self.AIRTABLE_LOGS + [
('update 3 Familys', {'dbUpdate': mock.ANY}),
('update 1 Individuals', {'dbUpdate': {
'dbEntity': 'Individual', 'entityIds': ['I000015_na20885'],
'updateFields': ['filter_flags', 'pop_platform_filters', 'population'], 'updateType': 'bulk_update'}}
),
('Reloading saved variants in 2 projects', None),
('Updated 0 variants in 2 families for project Test Reprocessed Project', None),
('update SavedVariant SV0000006_1248367227_r0004_non', {'dbUpdate': mock.ANY}),
Expand Down Expand Up @@ -481,6 +536,22 @@ def test_command(self, mock_email, mock_temp_dir):
{EXISTING_SV_SAMPLE_GUID, NEW_SAMPLE_GUID_P4}
)

# Test Individual model properly updated with sample qc results
self.assertListEqual(
list(Individual.objects.filter(
guid__in=['I000015_na20885', 'I000016_na20888']).order_by('guid').values('filter_flags', 'pop_platform_filters', 'population')
),
[{
'filter_flags': {'coverage_exome': 90.0},
'pop_platform_filters': {'n_deletion': 0, 'n_insertion': 0, 'n_snp': 23},
'population': 'OTH'
}, {
'filter_flags': None,
'pop_platform_filters': None,
'population': 'SAS'
}]
)

# Test Family models updated
self.assertListEqual(list(Family.objects.filter(
guid__in=['F000002_2', 'F000011_11', 'F000012_12']
Expand Down