Skip to content

Commit

Permalink
Works around the limitation of 10000 acc IDs by NCBI
Browse files Browse the repository at this point in the history
  • Loading branch information
StuntsPT committed May 17, 2018
1 parent da58340 commit 0162c68
Showing 1 changed file with 12 additions and 13 deletions.
25 changes: 12 additions & 13 deletions back_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,28 +34,33 @@ def __init__(self, database, term, outfile, gui):
self.terminated = False
super(Downloader, self).__init__()


def ncbi_search(self, database, term):
    """
    Submit a search to NCBI and return the parsed Entrez record.

    database: NCBI database name passed straight to Entrez.esearch.
    term: Entrez query string.

    Uses usehistory="y" so the result set is stored on NCBI's history
    server; the returned record carries "WebEnv" and "QueryKey" so the
    accession IDs can be fetched later in batches (NCBI caps a single
    esearch ID list at 10000 accessions — the reason this commit keeps
    retmax small instead of asking for everything up front).
    """
    # Only the hit count and the WebEnv/QueryKey are needed from this
    # call; a tiny retmax avoids the 10000-accession esearch limit.
    self.handle = Entrez.esearch(db=database, term=term, usehistory="y",
                                 retmax=10, idtype="acc")
    self.record = Entrez.read(self.handle)
    self.handle.close()

    return self.record


def record_processor(self, record):
def record_processor(self, record, database):
"""
Splits the record returned by Entrez into separate variables and returns
them.
"""
count = int(record["Count"]) # Int
IDs = record["IdList"] # List
webenv = record["WebEnv"] # String
query_key = record["QueryKey"] # String
IDs = []

for i in range(0, count, 10000):
iter_handle = Entrez.efetch(db=database, webenv=webenv,
query_key=query_key, retmax=10000,
rettype="acc", retstart=i)
IDs += [x.rstrip() for x in iter_handle]
iter_handle.close()

assert count == len(IDs)

Expand All @@ -68,7 +73,6 @@ def record_processor(self, record):

return count, IDs, webenv, query_key


def main_organizer(self, count, IDs, webenv, query_key, b_size, Run):
"""
Defines what tasks need to be performed, handles NCBI server errors and
Expand Down Expand Up @@ -132,7 +136,6 @@ def main_organizer(self, count, IDs, webenv, query_key, b_size, Run):
if self.terminated is False:
self.re_downloader(IDs, webenv, query_key, b_size)


def re_downloader(self, IDs, webenv, query_key, b_size):
"""
Checks for missing sequences.
Expand Down Expand Up @@ -162,7 +165,6 @@ def re_downloader(self, IDs, webenv, query_key, b_size):
self.main_organizer(numb_missing, IDs, webenv, query_key,
b_size, 2)


def error_finder(self, target_file):
"""
Looks for errors in the output fasta and returns a list of necessary
Expand All @@ -179,7 +181,6 @@ def error_finder(self, target_file):
target_handle.close()
return verified_ids


def fetch_by_id(self, IDs, b_size):
"""
Fetches NCBI data based on the IDs, rather than a search query. Returns
Expand All @@ -195,7 +196,6 @@ def fetch_by_id(self, IDs, b_size):

return data


def fetch_by_history(self, start, b_size, webenv, query_key):
"""
Fetches NCBI data based on the provided search query. Returns the data
Expand All @@ -213,7 +213,6 @@ def fetch_by_history(self, start, b_size, webenv, query_key):

return data


def translate_genome(self, acclist):
"""
Translates genome query IDs into nucleotide query IDs, since NCBI has
Expand All @@ -236,7 +235,6 @@ def translate_genome(self, acclist):

return nuc_acc_list


def run_everything(self):
"""
Run the functions in order.
Expand All @@ -248,7 +246,8 @@ def run_everything(self):

rec = self.ncbi_search(self.database, self.term)
try:
count, IDs, webenv, query_key = self.record_processor(rec)
count, IDs, webenv, query_key = self.record_processor(rec,
self.database)
except TypeError:
return None
if self.database == "genome":
Expand Down

0 comments on commit 0162c68

Please sign in to comment.