Skip to content

Commit

Permalink
Updating files
Browse files Browse the repository at this point in the history
  • Loading branch information
r78v10a07 committed Mar 30, 2022
1 parent 1bd7508 commit cc5d7da
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 144 deletions.
8 changes: 4 additions & 4 deletions docs/source/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,31 @@ Archaea
.. code-block:: bash
localhost:~> datasets download genome taxon 2157 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated
localhost:~> mv ncbi_meta.zip archaea_meta.zip
localhost:~> mv ncbi_dataset.zip archaea_meta.zip
Bacteria
========

.. code-block:: bash
localhost:~> datasets download genome taxon 2 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated
localhost:~> mv ncbi_meta.zip bacteria_meta.zip
localhost:~> mv ncbi_dataset.zip bacteria_meta.zip
Viruses
=======

.. code-block:: bash
localhost:~> datasets download genome taxon 10239 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated
localhost:~> mv ncbi_meta.zip viruses_meta.zip
localhost:~> mv ncbi_dataset.zip viruses_meta.zip
Eukaryotes
==========

.. code-block:: bash
localhost:~> datasets download genome taxon 2759 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated
localhost:~> mv ncbi_meta.zip eukaryotes_meta.zip
localhost:~> mv ncbi_dataset.zip eukaryotes_meta.zip
Process metadata and create the directories for hydration
----------------------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def readme():
'taxonomy_pickle = gtax.taxonomy_main:taxonomy_pickle',
'gtax_database = gtax.gtax_main:gtax_database',
'filter_metadata_zip = gtax.gtax_main:filter_metadata_zip',
'create_random_short_sequences = gtax.sequence:create_random_short_sequences'
'create_random_short_sequences = gtax.sequence:create_random_short_sequences',
'sequence_binning = gtax.sequence_binning:sequence_binning_main'
],
}
)
81 changes: 41 additions & 40 deletions src/gtax/gtax_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,45 +32,46 @@ def filter_metadata_zip():
superkingdoms = ['archaea', 'bacteria', 'viruses', 'eukaryotes']

for db in superkingdoms:
if not os.path.exists('{}/ncbi_dataset/data'.format(db)):
os.makedirs('{}/ncbi_dataset/data'.format(db))
with ZipFile('{}_meta.zip'.format(db), 'r') as zip:
assemblies = []
assemblies_tmp = {}
with zip.open('ncbi_dataset/data/assembly_data_report.jsonl') as fjson, open(
'{}/ncbi_dataset/data/assembly_data_report.jsonl'.format(db), 'w') as fjson_out:
for line in fjson.readlines():
d = json.loads(line.decode("utf-8"))
v = assemblies_tmp.setdefault(d['taxId'], [])
v.append(d)
for s in assemblies_tmp.keys():
rep_genome = []
for e in assemblies_tmp[s]:
if 'refseqCategory' in e['assemblyInfo']:
rep_genome.append(e)
if len(rep_genome) == 1:
assemblies.append(rep_genome[0]['assemblyInfo']['assemblyAccession'])
fjson_out.write('{}\n'.format(json.dumps(rep_genome[0])))
else:
assemblies.append(assemblies_tmp[s][0]['assemblyInfo']['assemblyAccession'])
fjson_out.write('{}\n'.format(json.dumps(assemblies_tmp[s][0])))
if os.path.exists('{}_meta.zip'.format(db)):
if not os.path.exists('{}/ncbi_dataset/data'.format(db)):
os.makedirs('{}/ncbi_dataset/data'.format(db))
with ZipFile('{}_meta.zip'.format(db), 'r') as zip:
assemblies = []
assemblies_tmp = {}
with zip.open('ncbi_dataset/data/assembly_data_report.jsonl') as fjson, open(
'{}/ncbi_dataset/data/assembly_data_report.jsonl'.format(db), 'w') as fjson_out:
for line in fjson.readlines():
d = json.loads(line.decode("utf-8"))
v = assemblies_tmp.setdefault(d['taxId'], [])
v.append(d)
for s in assemblies_tmp.keys():
rep_genome = []
for e in assemblies_tmp[s]:
if 'refseqCategory' in e['assemblyInfo']:
rep_genome.append(e)
if len(rep_genome) == 1:
assemblies.append(rep_genome[0]['assemblyInfo']['assemblyAccession'])
fjson_out.write('{}\n'.format(json.dumps(rep_genome[0])))
else:
assemblies.append(assemblies_tmp[s][0]['assemblyInfo']['assemblyAccession'])
fjson_out.write('{}\n'.format(json.dumps(assemblies_tmp[s][0])))

print('There are {} assemblies included'.format(len(assemblies)))
with zip.open('ncbi_dataset/data/dataset_catalog.json') as fjson, open(
'{}/ncbi_dataset/data/dataset_catalog.json'.format(db), 'w') as fjson_out:
d = json.loads(fjson.read().decode("utf-8"))
catalog = []
for c in d['assemblies']:
if 'accession' in c:
if c['accession'] in assemblies:
print('There are {} assemblies included'.format(len(assemblies)))
with zip.open('ncbi_dataset/data/dataset_catalog.json') as fjson, open(
'{}/ncbi_dataset/data/dataset_catalog.json'.format(db), 'w') as fjson_out:
d = json.loads(fjson.read().decode("utf-8"))
catalog = []
for c in d['assemblies']:
if 'accession' in c:
if c['accession'] in assemblies:
catalog.append(c)
else:
catalog.append(c)
else:
catalog.append(c)
d['assemblies'] = catalog
fjson_out.write(json.dumps(d, indent=2))
with zip.open('ncbi_dataset/fetch.txt') as fin, open('{}/ncbi_dataset/fetch.txt'.format(db), 'w') as fout:
for line in fin.readlines():
line = line.decode("utf-8")
f = os.path.dirname(line.split('\t')[2].replace('data/', ''))
if f in assemblies:
fout.write(line)
d['assemblies'] = catalog
fjson_out.write(json.dumps(d, indent=2))
with zip.open('ncbi_dataset/fetch.txt') as fin, open('{}/ncbi_dataset/fetch.txt'.format(db), 'w') as fout:
for line in fin.readlines():
line = line.decode("utf-8")
f = os.path.dirname(line.split('\t')[2].replace('data/', ''))
if f in assemblies:
fout.write(line)
Loading

0 comments on commit cc5d7da

Please sign in to comment.