Skip to content

Commit

Permalink
Merge pull request #3855 from broadinstitute/rna-seq-manage-cleanup
Browse files Browse the repository at this point in the history
RNA-seq manage command cleanup
  • Loading branch information
hanars authored Feb 6, 2024
2 parents bb9ee23 + 7736faf commit 9877363
Show file tree
Hide file tree
Showing 7 changed files with 205 additions and 244 deletions.
Original file line number Diff line number Diff line change
@@ -1,27 +1,19 @@
import logging
from django.core.management.base import BaseCommand

from seqr.models import RnaSeqTpm, Sample
from seqr.models import Sample
from seqr.views.utils.file_utils import parse_file
from seqr.views.utils.dataset_utils import load_rna_seq_tpm
from seqr.views.utils.dataset_utils import load_rna_seq, RNA_DATA_TYPE_CONFIGS

logger = logging.getLogger(__name__)

TISSUE_TYPE_MAP = {
'whole_blood': 'WB',
'fibroblasts': 'F',
'muscle': 'M',
'lymphocytes': 'L',
}

REVERSE_TISSUE_TYPE = {v: k for k, v in TISSUE_TYPE_MAP.items()}


class Command(BaseCommand):
help = 'Load RNA-Seq TPM data'
help = 'Load RNA-Seq data'

def add_arguments(self, parser):
parser.add_argument('input_file', help='tsv file with TPM data')
parser.add_argument('data_type', help='RNA data type', choices=sorted(RNA_DATA_TYPE_CONFIGS.keys()))
parser.add_argument('input_file', help='tsv file with RNA data')
parser.add_argument('--mapping-file', help='optional file to map sample IDs to seqr individual IDs')
parser.add_argument('--ignore-extra-samples', action='store_true', help='whether to suppress errors about extra samples')

Expand All @@ -31,17 +23,19 @@ def handle(self, *args, **options):
with open(options['mapping_file']) as f:
mapping_file = parse_file(options['mapping_file'], f)

sample_guids, _, _ = load_rna_seq_tpm(
options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True,
data_type = options['data_type']
self.model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class']

sample_guids, _, _ = load_rna_seq(
data_type, options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True,
mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples'])

Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids)

logger.info('DONE')

@staticmethod
def _save_sample_data(sample_guid, data_by_gene):
def _save_sample_data(self, sample_guid, data_by_gene):
sample = Sample.objects.get(guid=sample_guid)
models = RnaSeqTpm.objects.bulk_create(
[RnaSeqTpm(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000)
logger.info(f'create {len(models)} RnaSeqTpm for {sample.sample_id}')
models = self.model_cls.objects.bulk_create(
[self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000)
logger.info(f'create {len(models)} {self.model_cls.__name__} for {sample.sample_id}')
38 changes: 0 additions & 38 deletions seqr/management/commands/load_rna_seq_outlier.py

This file was deleted.

63 changes: 0 additions & 63 deletions seqr/management/tests/load_rna_seq_outlier_tests.py

This file was deleted.

159 changes: 159 additions & 0 deletions seqr/management/tests/load_rna_seq_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
import mock

from django.core.management import call_command
from django.core.management.base import CommandError

from seqr.models import Sample, RnaSeqTpm, RnaSeqOutlier
from seqr.utils.middleware import ErrorsWarningsException
from seqr.views.utils.test_utils import AuthenticationTestCase

RNA_FILE_ID = 'all_tissue_tpms.tsv.gz'
MAPPING_FILE_ID = 'mapping.tsv'
EXISTING_SAMPLE_GUID = 'S000152_na19675_d2'


class LoadRnaSeqTest(AuthenticationTestCase):
fixtures = ['users', '1kg_project', 'reference_data']

def setUp(self):
patcher = mock.patch('seqr.utils.file_utils.gzip.open')
mock_gzip_open = patcher.start()
self.mock_gzip_file_iter = mock_gzip_open.return_value.__enter__.return_value.__iter__
self.addCleanup(patcher.stop)
patcher = mock.patch('seqr.management.commands.load_rna_seq.open')
self.mock_open = patcher.start()
self.addCleanup(patcher.stop)
patcher = mock.patch('seqr.management.commands.load_rna_seq.logger')
self.mock_logger = patcher.start()
self.addCleanup(patcher.stop)

def _test_invalid_calls(self, data_type, expected_columns, file_data, unmatched_samples, additional_errors=None):
self.mock_gzip_file_iter.return_value = ['invalid\theader']

with self.assertRaises(CommandError) as e:
call_command('load_rna_seq', 'not_a_type', RNA_FILE_ID)
self.assertEqual(
str(e.exception),
"Error: argument data_type: invalid choice: 'not_a_type' (choose from 'outlier', 'splice_outlier', 'tpm')")

with self.assertRaises(ValueError) as e:
call_command('load_rna_seq', data_type, RNA_FILE_ID)
self.assertEqual(str(e.exception), f'Invalid file: missing column(s): {expected_columns}')

self.mock_gzip_file_iter.return_value = file_data
with self.assertRaises(ErrorsWarningsException) as e:
call_command('load_rna_seq', data_type, RNA_FILE_ID)
self.assertListEqual(e.exception.errors, (additional_errors or []) + [
f'Unable to find matches for the following samples: {unmatched_samples}',
])

def _assert_expected_existing_sample(self, data_source):
existing_sample = Sample.objects.get(individual_id=1, sample_id='NA19675_D2', sample_type='RNA')
self.assertEqual(existing_sample.guid, EXISTING_SAMPLE_GUID)
self.assertEqual(existing_sample.sample_id, 'NA19675_D2')
self.assertTrue(existing_sample.is_active)
self.assertIsNone(existing_sample.elasticsearch_index)
self.assertEqual(existing_sample.tissue_type, 'M')
self.assertEqual(existing_sample.data_source, data_source)
return existing_sample

@mock.patch('seqr.views.utils.dataset_utils.logger')
def test_tpm(self, mock_utils_logger):
self._test_invalid_calls(
'tpm',
expected_columns='TPM, gene_id, project, sample_id, tissue',
file_data=[
'sample_id\tproject\tindividual_id\tgene_id\tTPM\ttissue\n',
'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n',
'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n',
'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n',
'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n',
'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n',
'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n',
],
unmatched_samples='NA19677, NA19678, NA19678_D1',
additional_errors=['Samples missing required "tissue": NA19675_D2'],
)

self.mock_gzip_file_iter.return_value = [
self.mock_gzip_file_iter.return_value[0],
'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t 6.04\twhole_blood\n',
] + self.mock_gzip_file_iter.return_value[2:]
call_command('load_rna_seq', 'tpm', RNA_FILE_ID, '--ignore-extra-samples')

# Existing outlier data should be unchanged
self.assertEqual(RnaSeqOutlier.objects.count(), 3)

# Test database models
existing_sample = self._assert_expected_existing_sample('muscle_samples.tsv.gz')
existing_rna_samples = Sample.objects.filter(sample_type='RNA', rnaseqtpm__isnull=False)

new_sample = Sample.objects.get(individual_id=2, sample_type='RNA')
self.assertEqual(new_sample.sample_id, 'NA19678_D1')
self.assertTrue(new_sample.is_active)
self.assertIsNone(new_sample.elasticsearch_index)
self.assertEqual(new_sample.data_source, 'all_tissue_tpms.tsv.gz')
self.assertEqual(new_sample.tissue_type, 'WB')

models = RnaSeqTpm.objects.all()
self.assertEqual(models.count(), 5)
self.assertSetEqual({model.sample for model in models}, set(existing_rna_samples))
self.assertEqual(models.filter(sample=existing_sample, gene_id='ENSG00000240361').count(), 0)
self.assertEqual(models.get(sample=new_sample, gene_id='ENSG00000233750').tpm, 6.04)

self.mock_logger.info.assert_has_calls([
mock.call('create 1 RnaSeqTpm for NA19678_D1'),
mock.call('DONE'),
])
mock_utils_logger.warning.assert_has_calls([
mock.call('Skipped loading for the following 2 unmatched samples: NA19677, NA19678', None),
])

# Test a new sample created for a mismatched tissue and a row with 0.0 tpm
self.mock_gzip_file_iter.return_value[1] = 'NA19678_D1\t1kg project nåme with uniçøde\tNA19678\tENSG00000233750\t0.0\tfibroblasts\n'
call_command('load_rna_seq', 'tpm', 'new_file.tsv.gz', '--ignore-extra-samples')
models = RnaSeqTpm.objects.select_related('sample').filter(sample__sample_id='NA19678_D1')
self.assertEqual(models.count(), 2)
self.assertSetEqual(set(models.values_list('sample__tissue_type', flat=True)), {'F', 'WB'})
self.assertEqual(models.get(gene_id='ENSG00000233750', sample__tissue_type='F').tpm, 0.0)
self.assertEqual(models.values('sample').distinct().count(), 2)
self.mock_logger.info.assert_has_calls([
mock.call('create 1 RnaSeqTpm for NA19678_D1'),
mock.call('DONE'),
])

def test_outlier(self):
self._test_invalid_calls(
'outlier',
expected_columns='geneID, pValue, padjust, project, sampleID, tissue, zScore',
file_data=[
'sampleID\tproject\tgeneID\tdetail\tpValue\tpadjust\tzScore\ttissue\n',
'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail1\t0.01\t0.13\t-3.1\tmuscle\n',
'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000240361\tdetail2\t0.01\t0.13\t-3.1\tmuscle\n',
'NA19675_D2\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n',
'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n',
'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n',
],
unmatched_samples='NA19675_D3, NA19675_D4',
)

self.mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678']
with self.assertRaises(ErrorsWarningsException) as e:
call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv')
self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3'])

call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples')

sample = self._assert_expected_existing_sample('all_tissue_tpms.tsv.gz')

models = RnaSeqOutlier.objects.all()
self.assertEqual(models.count(), 2)
self.assertSetEqual({model.sample for model in models}, {sample})
self.assertListEqual(list(models.values_list('gene_id', 'p_adjust', 'p_value', 'z_score')), [
('ENSG00000240361', 0.13, 0.01, -3.1), ('ENSG00000233750', 0.0000057, 0.064, 7.8),
])
self.mock_logger.info.assert_has_calls([
mock.call('create 2 RnaSeqOutlier for NA19675_D2'),
mock.call('DONE'),
])
Loading

0 comments on commit 9877363

Please sign in to comment.