Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a Gene Shet reference data update cmd. #3576

Merged
merged 11 commits into from
Oct 19, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# _seqr_ Changes

## dev
* Add GeneShet model to the reference DB (REQUIRES DB MIGRATION)

## 8/22/23
* Add db indices to optimize RNA data queries (REQUIRES DB MIGRATION)
Expand Down
24 changes: 24 additions & 0 deletions reference_data/management/commands/update_gene_shet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import logging
from reference_data.management.commands.utils.update_utils import GeneCommand, ReferenceDataHandler
from reference_data.models import GeneShet

logger = logging.getLogger(__name__)


class ShetReferenceDataHandler(ReferenceDataHandler):

model_cls = GeneShet
# The .tsv file is generated from the Google Sheet at https://docs.google.com/spreadsheets/d/1enxGBWCAFBHdrRlqCj_ueleiDo9K9GWn/edit#gid=1146995171
# by downloading it in TSV format.
url = 'https://storage.googleapis.com/seqr-reference-data/gene_constraint/shet_Zeng(2023).xlsx%20-%20All%20scores-for%20gene%20page.tsv'

@staticmethod
def parse_record(record):
yield {
'gene_id': record['ensg'],
'shet': float(record['post_mean (Shet)']),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't post_mean the name of the actual statistic, and Shet the name of the whole method? If so, shouldn't the score column be called post_mean?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the column name in the spreadsheet. I don't know which name is more suitable. Maybe we should ask Lynn.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The column name is post_mean (Shet). The name of this method is Shet. Therefore, this column represents the post_mean score for the Shet method. Since the name of the table is Shet, we do not need to capture Shet in the name of the score in the table. Therefore, the name of the score in the table should be post_mean.

}


class Command(GeneCommand):
    """Management command wired to ``ShetReferenceDataHandler``.

    All behavior comes from ``GeneCommand``; this subclass only selects
    which reference-data handler the command uses.
    """

    reference_data_handler = ShetReferenceDataHandler
20 changes: 20 additions & 0 deletions reference_data/management/tests/update_gene_shet_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from reference_data.models import GeneShet
from reference_data.management.tests.test_utils import ReferenceDataCommandTestCase

class UpdateGeneShetTest(ReferenceDataCommandTestCase):
    """Check that the ``update_gene_shet`` command loads Shet scores into ``GeneShet``."""

    # NOTE(review): URL and DATA are presumably consumed by
    # ReferenceDataCommandTestCase to stub the download - confirm in test_utils.
    URL = 'https://storage.googleapis.com/seqr-reference-data/gene_constraint/shet_Zeng(2023).xlsx%20-%20All%20scores-for%20gene%20page.tsv'

    # Columns are written with explicit \t escapes: the last header
    # ("post_mean (Shet)") contains a space, so the fixture must stay
    # unambiguously tab-delimited even if an editor rewrites whitespace.
    DATA = [
        'ensg\thgnc\tpost_mean (Shet)\n',
        'ENSG00000223972\tHGNC:37225\t3.01E-05\n',
        'ENSG00000227233\tHGNC:26441\t4.85E-05\n',
        'ENSG00000243485\tHGNC:4013\t5.08E-05\n',
    ]

    def test_update_gene_shet_command(self):
        """Run the command and verify the stored scores.

        Three rows are supplied but only two records are expected; the
        ENSG00000227233 row is the one that is not asserted on.
        NOTE(review): presumably that gene is absent from the test
        fixtures and is therefore skipped - confirm.
        """
        self._test_update_command('update_gene_shet', 'GeneShet', created_records=2)

        self.assertEqual(GeneShet.objects.count(), 2)
        record = GeneShet.objects.get(gene__gene_id='ENSG00000223972')
        self.assertEqual(record.shet, 3.01E-05)
        record = GeneShet.objects.get(gene__gene_id='ENSG00000243485')
        self.assertEqual(record.shet, 5.08E-05)
22 changes: 22 additions & 0 deletions reference_data/migrations/0022_geneshet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 3.2.20 on 2023-08-25 14:33

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Create the ``GeneShet`` model in the ``reference_data`` app."""

    # Must be applied after the previous reference_data migration.
    dependencies = [('reference_data', '0021_auto_20221031_2049')]

    operations = [
        migrations.CreateModel(
            name='GeneShet',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('shet', models.FloatField()),
                (
                    'gene',
                    # Deleting the referenced geneinfo row cascades to this row.
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='reference_data.geneinfo',
                    ),
                ),
            ],
        ),
    ]
9 changes: 9 additions & 0 deletions reference_data/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,15 @@ class Meta:
json_fields = ['pHI', 'pTS']


class GeneShet(models.Model):
    """A Shet score stored against a ``GeneInfo`` row."""

    # Rows are removed together with the gene they reference.
    gene = models.ForeignKey(to=GeneInfo, on_delete=models.CASCADE)
    shet = models.FloatField()

    class Meta:
        # NOTE(review): json_fields is not a stock Django Meta option;
        # presumably the project registers it elsewhere - confirm.
        json_fields = ['shet']


class Omim(models.Model):
MAP_METHOD_CHOICES = (
('1', 'the disorder is placed on the map based on its association with a gene, but the underlying defect is not known.'),
Expand Down
8 changes: 8 additions & 0 deletions seqr/fixtures/reference_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -1167,6 +1167,14 @@
"pHI": 0.90576,
"pTS": 0.7346
}
},
{
"model": "reference_data.geneshet",
"pk": 1,
"fields": {
"gene": 1,
"shet": 0.90576
}
},
{
"model": "reference_data.dbnsfpgene",
Expand Down
Loading