-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
update individuals with sample qc #4663
base: dev
Are you sure you want to change the base?
Changes from 1 commit
8a567f8
774d2cb
93cbbb7
eaf7232
ef7bc96
a9fd30f
87534a5
1830984
3aad126
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
import re | ||
|
||
from reference_data.models import GENOME_VERSION_LOOKUP | ||
from seqr.models import Family, Sample, SavedVariant, Project | ||
from seqr.models import Family, Sample, SavedVariant, Project, Individual | ||
from seqr.utils.communication_utils import safe_post_to_slack, send_project_email | ||
from seqr.utils.file_utils import file_iter, list_files, is_google_bucket_file_path | ||
from seqr.utils.search.add_data_utils import notify_search_data_loaded, update_airtable_loading_tracking_status | ||
|
@@ -42,6 +42,12 @@ | |
'CallsetCompletionDate', 'Project', 'Metrics Checked', 'gCNV_SV_CallsetPath', 'DRAGENShortReadCallsetPath', | ||
] | ||
|
||
QC_FILTER_FLAG_COL_MAP = { | ||
'callrate': 'filtered_callrate', | ||
'contamination': 'PCT_CONTAMINATION', | ||
'coverage_exome': 'HS_PCT_TARGET_BASES_20X', | ||
'coverage_genome': 'WGS_MEAN_COVERAGE' | ||
} | ||
|
||
class Command(BaseCommand): | ||
help = 'Check for newly loaded seqr samples' | ||
|
@@ -242,6 +248,9 @@ def _load_new_samples(cls, metadata_path, genome_version, dataset_type, run_vers | |
except Exception as e: | ||
logger.error(f'Error reporting loading failure for {run_version}: {e}') | ||
|
||
# Update sample qc | ||
cls.update_individuals_sample_qc(sample_type, samples_by_project.keys(), metadata['sample_qc']) | ||
|
||
# Reload saved variant JSON | ||
updated_variants_by_id = update_projects_saved_variant_json([ | ||
(project.id, project.name, project.genome_version, families) for project, families in families_by_project.items() | ||
|
@@ -350,6 +359,59 @@ def _update_pdos(session, project_guid, sample_ids): | |
|
||
return sorted(pdos_to_create.keys()) | ||
|
||
@classmethod | ||
def update_individuals_sample_qc(cls, sample_type, projects, sample_qc_map): | ||
sample_individual_map = { | ||
i.sample.sample_id: i for i in Individual.objects.filter( | ||
family__project__in=projects, | ||
sample__sample_id__in=sample_qc_map.keys(), | ||
).select_related('sample') | ||
} | ||
|
||
updated_individuals = [] | ||
unknown_filter_flags = set() | ||
unknown_pop_filter_flags = set() | ||
|
||
for sample_id, individual in sample_individual_map.items(): | ||
record = sample_qc_map[sample_id] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this would be more stable if you iterate through |
||
filter_flags = {} | ||
for flag in json.loads(record['filter_flags']): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm assuming this is essentially copy-paste from existing qc code- let me know if you made any substantive changes? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no substantive changes here ! |
||
flag = '{}_{}'.format(flag, sample_type) if flag == 'coverage' else flag | ||
flag_col = QC_FILTER_FLAG_COL_MAP.get(flag, flag) | ||
if flag_col in record: | ||
filter_flags[flag] = record[flag_col] | ||
else: | ||
unknown_filter_flags.add(flag) | ||
|
||
pop_platform_filters = {} | ||
for flag in json.loads(record['qc_metrics_filters']): | ||
flag_col = 'sample_qc.{}'.format(flag) | ||
if flag_col in record: | ||
pop_platform_filters[flag] = record[flag_col] | ||
else: | ||
unknown_pop_filter_flags.add(flag) | ||
|
||
individual.filter_flags.update(filter_flags) | ||
individual.pop_platform_filters.update(pop_platform_filters) | ||
individual.population.update(record['qc_pop'].upper()) | ||
updated_individuals.append(individual) | ||
|
||
if updated_individuals: | ||
Individual.objects.bulk_update( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its better to use the |
||
updated_individuals, | ||
['filter_flags', 'pop_platform_filters', 'population'], | ||
) | ||
|
||
if unknown_filter_flags: | ||
message = 'The following filter flags have no known corresponding value and were not saved: {}'.format( | ||
', '.join(unknown_filter_flags)) | ||
logger.warning(message) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warnings are useful when something is triggered in the UI and the person who does so will be shown the warnings, but they are essentially useless in a job that runs automatically in the background. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree here, this was a copy paste without critical thinking. There's actually no way we'd have erroneous filter flags, and for the hail qc_metrics_filters, which are set by hail's sample qc, we can trust that the output has the same flags every time, so I'm leaning towards just taking the flag validation out of the new seqr sample qc flow. |
||
|
||
if unknown_pop_filter_flags: | ||
message = 'The following population platform filters have no known corresponding value and were not saved: {}'.format( | ||
', '.join(unknown_pop_filter_flags)) | ||
logger.warning(message) | ||
|
||
@classmethod | ||
def _reload_shared_variant_annotations(cls, data_type, genome_version, updated_variants_by_id=None, exclude_families=None, chromosomes=None): | ||
dataset_type = data_type.split('_')[0] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
select_related
is generally an inefficient way to query things (although there are a lot of these floating around in old code still) so its generally a sign that what we are querying from the database is not efficiently written.Also, while its not obvious in this part of the code, for hail data the Sample
sample_id
is always identical to the corresponding Individual'sindividual_id
Also, we are currently not using it but
match_and_update_search_samples
returnsupdated_samples
which will be a queryset of all the sample models that were included in this round of loading. We should be able to safely assume that the sample QC data will only be included for those samples, so this can be rewritten as followsThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was considering using
updated_samples
! glad to hear I was on the right track, thanks for the suggestion.