From 49b16eff01012d687770754cd382f3a7db1073bc Mon Sep 17 00:00:00 2001 From: Mark Koni Wright Date: Tue, 9 Apr 2019 14:54:37 -0700 Subject: [PATCH 1/4] Recognize all MT names/accessions for loop=True in validate_loop() --- seqseek/chromosome.py | 20 +++++++++----------- seqseek/lib.py | 3 +++ seqseek/tests/test_functional.py | 6 +++--- setup.py | 4 ++-- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/seqseek/chromosome.py b/seqseek/chromosome.py index 007c6fa..925c924 100644 --- a/seqseek/chromosome.py +++ b/seqseek/chromosome.py @@ -2,7 +2,8 @@ from .exceptions import TooManyLoops, MissingDataError from .lib import (BUILD37, BUILD38, get_data_directory, sorted_nicely, - BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, RCRS_ACCESSION) + BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, + RCRS_ACCESSION, MITOCHONDRIA_NAMES) class Chromosome(object): @@ -12,7 +13,7 @@ class Chromosome(object): BUILD38: BUILD38_ACCESSIONS } - def __init__(self, chromosome_name, assembly=BUILD37, loop=False): + def __init__(self, chromosome_name, assembly=BUILD37, loop=None): """ Usage: @@ -27,7 +28,10 @@ def __init__(self, chromosome_name, assembly=BUILD37, loop=False): """ self.name = str(chromosome_name) self.assembly = assembly - self.loop = loop + if loop is None: + self.loop = True if self.name in MITOCHONDRIA_NAMES else False + else: + self.loop = loop self.validate_assembly() self.validate_name() @@ -56,7 +60,7 @@ def validate_name(self): name=self.name)) def validate_loop(self): - if self.loop and self.name != 'MT': + if self.loop and self.name not in MITOCHONDRIA_NAMES: raise ValueError('Loop may only be specified for the mitochondria.') def validate_coordinates(self, start, end): @@ -87,7 +91,7 @@ def sorted_chromosome_length_tuples(cls, assembly): ACCESSION_LENGTHS.keys()).index(name_to_accession[pair[0]])) def filename(self): - return '{}.fa'.format(self.accession) + return '{}.fa'.format(self.accession) def path(self): data_dir = get_data_directory() @@ -124,10 +128,4 @@ def sequence(self, start, end): sequence = ''.join([self.read(*read) for read in reads]) - # The rCRS mito contig contains an 'N' base at position 3107 to preserve legacy - # nucleotide numbering. We remove it because it is not part of the observed - # sequence. See http://www.mitomap.org/MITOMAP/HumanMitoSeq - if self.accession == RCRS_ACCESSION: - sequence = sequence.replace('N', '') - return sequence diff --git a/seqseek/lib.py b/seqseek/lib.py index 03ece83..3c8bb8b 100644 --- a/seqseek/lib.py +++ b/seqseek/lib.py @@ -152,6 +152,9 @@ 'NT_167251.1': 1680828, } +MITOCHONDRIA_NAMES = ('MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'], + BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']) + def get_data_directory(): default = os.path.expanduser(DEFAULT_DATA_DIR) diff --git a/seqseek/tests/test_functional.py b/seqseek/tests/test_functional.py index 67d5a75..42daf6b 100644 --- a/seqseek/tests/test_functional.py +++ b/seqseek/tests/test_functional.py @@ -63,16 +63,16 @@ def test_chr1_sequences(self): self.assertEqual(seq, expected_seq) def test_chrMT_sequence(self): - expected_seq = 'GATCACAGGTCTTCACCCT' + expected_seq = 'GATCACAGGTCTNTCACCCT' seq = Chromosome('MT').sequence(0, 20) self.assertEqual(seq, expected_seq) - self.assertEqual(len(seq), 19) # the N base was removed + self.assertEqual(len(seq), 20) # the N base was *not* removed expected_seq = 'CAGGT' seq = Chromosome('MT').sequence(5, 10) self.assertEqual(seq, expected_seq) def test_mito_loop_end(self): - expected_seq = 'CTTCACCCTGATCACAGGT' + expected_seq = 'CTNTCACCCTGATCACAGGT' seq = Chromosome('MT', loop=True).sequence(10, 30) self.assertEqual(seq, expected_seq) diff --git a/setup.py b/setup.py index 485f563..a702de1 100755 --- a/setup.py +++ b/setup.py @@ -15,9 +15,9 @@ setup( name='seqseek', - version='0.3.3', + version='0.3.4', url='https://github.com/23andMe/seqseek', - download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.3', + download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.4', author='23andMe Engineering', author_email=['mstrand@23andme.com'], description='Easy access to human reference genome sequences', From 66be8c51b2e1d11c5c8a8e43f85b20ff8ba2ad72 Mon Sep 17 00:00:00 2001 From: Mark Koni Wright Date: Tue, 9 Apr 2019 17:35:02 -0700 Subject: [PATCH 2/4] Make rCRS N removal optional and default to True Now a parameter to Chromosome.__init__() defaults to True meaning the N in the rCRS MT sequence is removed from the sequence returned. This restores the original behavior by default, but allows a user to request that the N not be removed by setting RCRS_N_remove=False. Tests reverted, new test added to test new parameter. --- seqseek/chromosome.py | 14 +++++++++----- seqseek/lib.py | 4 ++-- seqseek/tests/test_functional.py | 12 +++++++++--- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/seqseek/chromosome.py b/seqseek/chromosome.py index 925c924..34607f9 100644 --- a/seqseek/chromosome.py +++ b/seqseek/chromosome.py @@ -13,7 +13,7 @@ class Chromosome(object): BUILD38: BUILD38_ACCESSIONS } - def __init__(self, chromosome_name, assembly=BUILD37, loop=None): + def __init__(self, chromosome_name, assembly=BUILD37, loop=False, RCRS_N_remove=True): """ Usage: @@ -28,10 +28,8 @@ def __init__(self, chromosome_name, assembly=BUILD37, loop=None): """ self.name = str(chromosome_name) self.assembly = assembly - if loop is None: - self.loop = True if self.name in MITOCHONDRIA_NAMES else False - else: - self.loop = loop + self.loop = loop + self.RCRS_N_remove = RCRS_N_remove self.validate_assembly() self.validate_name() @@ -128,4 +126,10 @@ def sequence(self, start, end): sequence = ''.join([self.read(*read) for read in reads]) + # The rCRS mito contig contains an 'N' base at position 3107 to preserve legacy + # nucleotide numbering. We remove it because it is not part of the observed + # sequence. See http://www.mitomap.org/MITOMAP/HumanMitoSeq + if self.accession == RCRS_ACCESSION and self.RCRS_N_remove is True: + sequence = sequence.replace('N', '') + return sequence diff --git a/seqseek/lib.py b/seqseek/lib.py index 3c8bb8b..93a70cd 100644 --- a/seqseek/lib.py +++ b/seqseek/lib.py @@ -152,8 +152,8 @@ 'NT_167251.1': 1680828, } -MITOCHONDRIA_NAMES = ('MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'], - BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']) +MITOCHONDRIA_NAMES = {'MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'], + BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']} def get_data_directory(): diff --git a/seqseek/tests/test_functional.py b/seqseek/tests/test_functional.py index 42daf6b..d68858b 100644 --- a/seqseek/tests/test_functional.py +++ b/seqseek/tests/test_functional.py @@ -63,16 +63,22 @@ def test_chr1_sequences(self): self.assertEqual(seq, expected_seq) def test_chrMT_sequence(self): - expected_seq = 'GATCACAGGTCTNTCACCCT' + expected_seq = 'GATCACAGGTCTTCACCCT' seq = Chromosome('MT').sequence(0, 20) self.assertEqual(seq, expected_seq) - self.assertEqual(len(seq), 20) # the N base was *not* removed + self.assertEqual(len(seq), 19) # the N base was removed expected_seq = 'CAGGT' seq = Chromosome('MT').sequence(5, 10) self.assertEqual(seq, expected_seq) + def test_rCRS_sequence_retain_N(self): + expected_seq = 'GATCACAGGTCTNTCACCCT' + seq = Chromosome('MT', RCRS_N_remove=False).sequence(0, 20) + self.assertEqual(seq, expected_seq) + self.assertTrue('N' in seq) # the N base was *not* removed + def test_mito_loop_end(self): - expected_seq = 'CTNTCACCCTGATCACAGGT' + expected_seq = 'CTTCACCCTGATCACAGGT' seq = Chromosome('MT', loop=True).sequence(10, 30) self.assertEqual(seq, expected_seq) From a48d213b9974ece827a5a893d747b684f63d4d7f Mon Sep 17 00:00:00 2001 From: Mark Koni Wright Date: Wed, 10 Apr 2019 15:58:12 -0700 Subject: [PATCH 3/4] Adding suggested update to README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0e44218..75b99b2 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,8 @@ backward-compatibility then you may load it directly by accession (`NC_001807.4` The rCRS mitochondria sequence contains an 'N' base at position 3106-3107 to preserve legacy nucleotide numbering. This can be useful for using legacy coordinates but but is impractical when working with sequences that are -expected to align to observed human mitochondrial sequences. SeqSeek removes this `N`: +expected to align to observed human mitochondrial sequences. SeqSeek +removes this `N` unless it is explicitly requested by passing `RCRS_N_remove=False`. ```python Chromosome('MT').sequence(3106, 3107) # => '' From d92eabd6f8d7a6a8048e8ed2ae36e16d89c9c581 Mon Sep 17 00:00:00 2001 From: Mark Koni Wright Date: Tue, 23 Apr 2019 11:33:52 -0700 Subject: [PATCH 4/4] Set version to 0.4.1 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a702de1..6f85489 100755 --- a/setup.py +++ b/setup.py @@ -15,9 +15,9 @@ setup( name='seqseek', - version='0.3.4', + version='0.4.1', url='https://github.com/23andMe/seqseek', - download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.4', + download_url = 'https://github.com/23andMe/seqseek/tarball/0.4.1', author='23andMe Engineering', author_email=['mstrand@23andme.com'], description='Easy access to human reference genome sequences',