diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 51f407a..4561443 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -19,7 +19,7 @@ Archaea .. code-block:: bash localhost:~> datasets download genome taxon 2157 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated - localhost:~> mv ncbi_meta.zip archaea_meta.zip + localhost:~> mv ncbi_dataset.zip archaea_meta.zip Bacteria ======== @@ -27,7 +27,7 @@ Bacteria .. code-block:: bash localhost:~> datasets download genome taxon 2 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated - localhost:~> mv ncbi_meta.zip bacteria_meta.zip + localhost:~> mv ncbi_dataset.zip bacteria_meta.zip Viruses ======= @@ -35,7 +35,7 @@ Viruses .. code-block:: bash localhost:~> datasets download genome taxon 10239 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated - localhost:~> mv ncbi_meta.zip viruses_meta.zip + localhost:~> mv ncbi_dataset.zip viruses_meta.zip Eukaryotes ========== @@ -43,7 +43,7 @@ Eukaryotes .. code-block:: bash localhost:~> datasets download genome taxon 2759 --assembly-source refseq --exclude-gff3 --exclude-protein --exclude-rna --exclude-gff3 --exclude-rna --exclude-genomic-cds --dehydrated - localhost:~> mv ncbi_meta.zip eukaryotes_meta.zip + localhost:~> mv ncbi_dataset.zip eukaryotes_meta.zip Process metadata and creates the directories for hydration ---------------------------------------------------------- diff --git a/setup.py b/setup.py index c1bc7ba..cc59043 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,8 @@ def readme(): 'taxonomy_pickle = gtax.taxonomy_main:taxonomy_pickle', 'gtax_database = gtax.gtax_main:gtax_database', 'filter_metadata_zip = gtax.gtax_main:filter_metadata_zip', - 'create_random_short_sequences = gtax.sequence:create_random_short_sequences' + 'create_random_short_sequences = gtax.sequence:create_random_short_sequences', + 'sequence_binning = gtax.sequence_binning:sequence_binning_main' ], } ) diff --git a/src/gtax/gtax_main.py b/src/gtax/gtax_main.py index 8471e50..5e9e98e 100644 --- a/src/gtax/gtax_main.py +++ b/src/gtax/gtax_main.py @@ -32,45 +32,46 @@ def filter_metadata_zip(): superkingdoms = ['archaea', 'bacteria', 'viruses', 'eukaryotes'] for db in superkingdoms: - if not os.path.exists('{}/ncbi_dataset/data'.format(db)): - os.makedirs('{}/ncbi_dataset/data'.format(db)) - with ZipFile('{}_meta.zip'.format(db), 'r') as zip: - assemblies = [] - assemblies_tmp = {} - with zip.open('ncbi_dataset/data/assembly_data_report.jsonl') as fjson, open( - '{}/ncbi_dataset/data/assembly_data_report.jsonl'.format(db), 'w') as fjson_out: - for line in fjson.readlines(): - d = json.loads(line.decode("utf-8")) - v = assemblies_tmp.setdefault(d['taxId'], []) - v.append(d) - for s in assemblies_tmp.keys(): - rep_genome = [] - for e in assemblies_tmp[s]: - if 'refseqCategory' in e['assemblyInfo']: - rep_genome.append(e) - if len(rep_genome) == 1: - assemblies.append(rep_genome[0]['assemblyInfo']['assemblyAccession']) - fjson_out.write('{}\n'.format(json.dumps(rep_genome[0]))) - else: - assemblies.append(assemblies_tmp[s][0]['assemblyInfo']['assemblyAccession']) - fjson_out.write('{}\n'.format(json.dumps(assemblies_tmp[s][0]))) + if os.path.exists('{}_meta.zip'.format(db)): + if not os.path.exists('{}/ncbi_dataset/data'.format(db)): + os.makedirs('{}/ncbi_dataset/data'.format(db)) + with ZipFile('{}_meta.zip'.format(db), 'r') as zip: + assemblies = [] + assemblies_tmp = {} + with zip.open('ncbi_dataset/data/assembly_data_report.jsonl') as fjson, open( + '{}/ncbi_dataset/data/assembly_data_report.jsonl'.format(db), 'w') as fjson_out: + for line in fjson.readlines(): + d = json.loads(line.decode("utf-8")) + v = assemblies_tmp.setdefault(d['taxId'], []) + v.append(d) + for s in assemblies_tmp.keys(): + rep_genome = [] + for e in assemblies_tmp[s]: + if 'refseqCategory' in e['assemblyInfo']: + rep_genome.append(e) + if len(rep_genome) == 1: + assemblies.append(rep_genome[0]['assemblyInfo']['assemblyAccession']) + fjson_out.write('{}\n'.format(json.dumps(rep_genome[0]))) + else: + assemblies.append(assemblies_tmp[s][0]['assemblyInfo']['assemblyAccession']) + fjson_out.write('{}\n'.format(json.dumps(assemblies_tmp[s][0]))) - print('There are {} assemblies included'.format(len(assemblies))) - with zip.open('ncbi_dataset/data/dataset_catalog.json') as fjson, open( - '{}/ncbi_dataset/data/dataset_catalog.json'.format(db), 'w') as fjson_out: - d = json.loads(fjson.read().decode("utf-8")) - catalog = [] - for c in d['assemblies']: - if 'accession' in c: - if c['accession'] in assemblies: + print('There are {} assemblies included'.format(len(assemblies))) + with zip.open('ncbi_dataset/data/dataset_catalog.json') as fjson, open( + '{}/ncbi_dataset/data/dataset_catalog.json'.format(db), 'w') as fjson_out: + d = json.loads(fjson.read().decode("utf-8")) + catalog = [] + for c in d['assemblies']: + if 'accession' in c: + if c['accession'] in assemblies: + catalog.append(c) + else: catalog.append(c) - else: - catalog.append(c) - d['assemblies'] = catalog - fjson_out.write(json.dumps(d, indent=2)) - with zip.open('ncbi_dataset/fetch.txt') as fin, open('{}/ncbi_dataset/fetch.txt'.format(db), 'w') as fout: - for line in fin.readlines(): - line = line.decode("utf-8") - f = os.path.dirname(line.split('\t')[2].replace('data/', '')) - if f in assemblies: - fout.write(line) + d['assemblies'] = catalog + fjson_out.write(json.dumps(d, indent=2)) + with zip.open('ncbi_dataset/fetch.txt') as fin, open('{}/ncbi_dataset/fetch.txt'.format(db), 'w') as fout: + for line in fin.readlines(): + line = line.decode("utf-8") + f = os.path.dirname(line.split('\t')[2].replace('data/', '')) + if f in assemblies: + fout.write(line) diff --git a/src/gtax/sequence.py b/src/gtax/sequence.py index eef48d0..eef69c2 100644 --- a/src/gtax/sequence.py +++ b/src/gtax/sequence.py @@ -2,6 +2,7 @@ import gzip import os import random +import time import uuid from functools import partial from multiprocessing import Pool @@ -12,12 +13,16 @@ from Bio.SeqRecord import SeqRecord from gtax.taxonomy import Taxonomy +from gtax.utils import chunks -def chunks(lst, n): - """Yield successive n-sized chunks from lst.""" - for i in range(0, len(lst), n): - yield lst[i:i + n] +def load_reference_sequences(reference_file, total_sequences, short_seq_len): + sequences = [] + with gzip.open(reference_file, 'rt') as trans_stream: + for r in SeqIO.parse(trans_stream, "fasta"): + sequences.append(r) + print('{} transcripts loaded'.format(len(sequences))) + return sequences def load_reference_transcripts(reference_file, total_sequences, short_seq_len): @@ -70,30 +75,6 @@ def fasta_paired_end_generator(records, number_seq_factor, short_seq_len, min_se return pre -def fasta_single_end_generator(records, number_seq_factor, short_seq_len, min_seq_size): - pre = uuid.uuid4() - with gzip.open('temp_{}.fa.gz'.format(pre), 'wt') as fout: - for r in records: - length = len(r.seq) - no_reads = int(number_seq_factor * length / short_seq_len) - i = 0 - iterations = no_reads * 100 - while i < no_reads: - iterations -= 1 - pos = random.randrange(length) - f_start = pos - short_seq_len - if f_start < 0: - f_start = 0 - f_read = r.seq[f_start:pos] - if 'N' not in f_read and len(f_read) > min_seq_size: - f_rec = SeqRecord(Seq(f_read), id='{}_{}'.format(r.id, f_start), description='') - fout.write(f_rec.format("fasta")) - i += 1 - if iterations <= 0: - break - return pre - - def create_random_paired_end_files(reference_file, output_prefix, number_sequences, short_seq_len, min_seq_size, number_files, threads, split_file_by): @@ -108,10 +89,6 @@ def create_random_paired_end_files(reference_file, output_prefix, min_seq_size=min_seq_size), [d for d in list(chunks(transcripts, 1000))]) - # p = Pool(processes=8) - # results_cont = p.map(contamination_read_generator, taxonomy_groups_reads.keys()) - # p.close() - print('Creating file {}_{}'.format(output_prefix, i)) with gzip.open('{}_{}_1.fa.gz'.format(output_prefix, i), 'wt') as fout_1, \ gzip.open('{}_{}_2.fa.gz'.format(output_prefix, i), 'wt') as fout_2, \ @@ -128,25 +105,63 @@ def create_random_paired_end_files(reference_file, output_prefix, fout_2.write(r.format('fasta')) fout2_2.write(r.format('fasta')) os.remove('temp_{}_2.fa.gz'.format(pre)) - # for k in taxonomy_groups_reads: - # with gzip.open('{}_1.fa.gz'.format(k), 'rt') as fin: - # for r in SeqIO.parse(fin, "fasta"): - # fout2_1.write(r.format('fasta')) - # os.remove('{}_1.fa.gz'.format(k)) - # with gzip.open('{}_2.fa.gz'.format(k), 'rt') as fin: - # for r in SeqIO.parse(fin, "fasta"): - # fout2_2.write(r.format('fasta')) - # os.remove('{}_2.fa.gz'.format(k)) -def contamination_fasta_single_end_generator(fasta_file, total_read_seq, short_seq_len, min_seq_size): +def contamination_fasta_single_end_generator(tax_group, gtax_fasta_dir, df, + total_read_seq, short_seq_len, min_seq_size): pre = uuid.uuid4() - with open(fasta_file) as fin, gzip.open('temp_{}.fa.gz'.format(pre), 'wt') as fout: - for r in SeqIO.parse(fin, "fasta"): + fasta_file = os.path.join(gtax_fasta_dir, '{}.fsa'.format(tax_group)) + output_file = 'temp_{}_{}.fa.gz'.format(tax_group, pre) + with open(fasta_file) as fin, \ + gzip.open(output_file, 'wt') as fout: + for idx, row in df.iterrows(): + fin.seek(row[1], 0) + for r in SeqIO.parse(fin, "fasta"): + length = len(r.seq) + i = 0 + iterations = total_read_seq * 1000 + while i < total_read_seq: + iterations -= 1 + pos = random.randrange(length) + f_start = pos - short_seq_len + if f_start < 0: + f_start = 0 + f_read = r.seq[f_start:pos] + if 'N' not in f_read and len(f_read) > min_seq_size: + f_rec = SeqRecord(Seq(f_read), + id='{}_{}'.format(r.id, f_start), description='') + fout.write(f_rec.format("fasta")) + i += 1 + if iterations <= 0: + break + break + return output_file + + +def process_by_taxonomy_group(tax_group, gtax_fasta_dir, total_reads, short_seq_len, min_seq_size): + idx_file = os.path.join(gtax_fasta_dir, '{}.idx'.format(tax_group)) + df = pandas.read_csv(idx_file, sep='\t', header=None) + total_seq = len(df) + total_read_seq = int(total_reads / total_seq) + if total_read_seq < 1000: + total_read_seq = 1000 + total_seq = int(total_reads / total_read_seq) + df = df.sample(total_seq, random_state=random.randint(1, int(time.time()))).sort_values(by=1) + return contamination_fasta_single_end_generator(tax_group, gtax_fasta_dir, + df, + total_read_seq, + short_seq_len, min_seq_size) + + +def fasta_single_end_generator(records, number_seq_factor, short_seq_len, min_seq_size): + pre = uuid.uuid4() + with gzip.open('temp_{}.fa.gz'.format(pre), 'wt') as fout: + for r in records: length = len(r.seq) + no_reads = int(number_seq_factor * length / short_seq_len) i = 0 - iterations = total_read_seq * 100 - while i < total_read_seq: + iterations = no_reads * 1000 + while i < no_reads: iterations -= 1 pos = random.randrange(length) f_start = pos - short_seq_len @@ -162,26 +177,17 @@ def contamination_fasta_single_end_generator(fasta_file, total_read_seq, short_s return pre -def process_by_taxonomy_group(tax_group, gtax_fasta_dir, total_reads, short_seq_len, min_seq_size): - print('Processing tax group: {}'.format(tax_group)) - fasta_file = os.path.join(gtax_fasta_dir, '{}.fsa'.format(tax_group)) - idx_file = os.path.join(gtax_fasta_dir, '{}.idx'.format(tax_group)) - df = pandas.read_csv(idx_file, sep='\t', header=None) - total_seq = len(df) - total_read_seq = int(total_reads / total_seq) - if total_read_seq < 1: - total_read_seq = 1 - return contamination_fasta_single_end_generator(fasta_file, total_read_seq * total_seq, short_seq_len, min_seq_size) - - def create_random_single_end_files(reference_file, output_prefix, number_sequences, short_seq_len, min_seq_size, number_files, threads, split_file_by, taxonomy, foreign_contamination_percent, gtax_fasta_dir, taxonomy_group): print("Creating single-end short sequences") + ref_number_sequences = number_sequences - \ + (number_sequences * foreign_contamination_percent * + (len(taxonomy.taxonomy_groups) - 1)) transcripts, number_seq_factor = load_reference_transcripts(reference_file, - number_sequences, short_seq_len) + ref_number_sequences, short_seq_len) for i in range(1, number_files + 1): print('Creating reference sequences') @@ -202,41 +208,34 @@ def create_random_single_end_files(reference_file, output_prefix, print('Printing final files') if split_file_by == 0: print('Creating file {}_{}'.format(output_prefix, i)) - with gzip.open('{}_{}.fa.gz'.format(output_prefix, i), 'wt') as fout, \ - gzip.open('{}_{}_cont.fa.gz'.format(output_prefix, i), 'wt') as fout2: + with gzip.open('{}_{}.fa.gz'.format(output_prefix, i), 'wt') as fout2: for pre in results: with gzip.open('temp_{}.fa.gz'.format(pre), 'rt') as fin: for r in SeqIO.parse(fin, "fasta"): - fout.write(r.format('fasta')) fout2.write(r.format('fasta')) os.remove('temp_{}.fa.gz'.format(pre)) for pre in results_cont: - with gzip.open('temp_{}.fa.gz'.format(pre), 'rt') as fin: + with gzip.open(pre, 'rt') as fin: for r in SeqIO.parse(fin, "fasta"): fout2.write(r.format('fasta')) - os.remove('temp_{}.fa.gz'.format(pre)) + os.remove(pre) else: count = 0 suffix = 1 - fout = gzip.open('{}_{}_{}.fa.gz'.format(output_prefix, i, suffix), 'wt') - fout2 = gzip.open('{}_{}_{}_cont.fa.gz'.format(output_prefix, i, suffix), 'wt') + fout2 = gzip.open('{}_{}_{}.fa.gz'.format(output_prefix, i, suffix), 'wt') for pre in results: with gzip.open('temp_{}.fa.gz'.format(pre), 'rt') as fin: for r in SeqIO.parse(fin, "fasta"): if count == split_file_by: - fout.close() fout2.close() count = 0 suffix += 1 - fout = gzip.open('{}_{}_{}.fa.gz'.format(output_prefix, i, suffix), 'wt') fout2 = gzip.open('{}_{}_{}_cont.fa.gz'.format(output_prefix, i, suffix), 'wt') count += 1 - fout.write(r.format('fasta')) fout2.write(r.format('fasta')) os.remove('temp_{}.fa.gz'.format(pre)) - fout.close() for pre in results_cont: - with gzip.open('temp_{}.fa.gz'.format(pre), 'rt') as fin: + with gzip.open(pre, 'rt') as fin: for r in SeqIO.parse(fin, "fasta"): if count == split_file_by: fout2.close() @@ -245,16 +244,17 @@ def create_random_single_end_files(reference_file, output_prefix, fout2 = gzip.open('{}_{}_{}_cont.fa.gz'.format(output_prefix, i, suffix), 'wt') count += 1 fout2.write(r.format('fasta')) - os.remove('temp_{}.fa.gz'.format(pre)) + os.remove(pre) fout2.close() def create_random_short_sequences(): parser = argparse.ArgumentParser() - parser.add_argument('--reference_file', help='Reference Transcriptome gzip file', + parser.add_argument('--reference', help='Reference Transcriptome gzip file', required=True) - parser.add_argument('--taxonomy_group', help='Taxonomy group name for the target organism', + parser.add_argument('--taxid', help='TaxID for target organism', + type=int, required=True) parser.add_argument('--gtax_fasta_dir', help='Gtax database directory contaning FASTA and IDX files', required=True) @@ -286,29 +286,31 @@ def create_random_short_sequences(): group_pickle_file=os.path.join(args.gtax_fasta_dir, 'taxonomy_groups.pickle')) - if args.paired_end: - create_random_paired_end_files(args.reference_file, - args.output_prefix, - args.number_sequences, - args.short_seq_len, - min_seq_size, - args.number_files, - args.threads, - args.split_file_by) - else: - create_random_single_end_files(args.reference_file, - args.output_prefix, - args.number_sequences, - args.short_seq_len, - min_seq_size, - args.number_files, - args.threads, - args.split_file_by, - taxonomy, - args.foreign_contamination_percent, - args.gtax_fasta_dir, - args.taxonomy_group) + taxonomy_group = taxonomy.get_taxonomy_group_from_taxid(args.taxid) + if taxonomy_group: + print('TaxID {} is in {} taxonomy group'.format(args.taxid, taxonomy_group)) - -if __name__ == "__main__": - create_random_short_sequences() + if args.paired_end: + create_random_paired_end_files(args.reference, + args.output_prefix, + args.number_sequences, + args.short_seq_len, + min_seq_size, + args.number_files, + args.threads, + args.split_file_by) + else: + create_random_single_end_files(args.reference, + args.output_prefix, + args.number_sequences, + args.short_seq_len, + min_seq_size, + args.number_files, + args.threads, + args.split_file_by, + taxonomy, + args.foreign_contamination_percent, + args.gtax_fasta_dir, + taxonomy_group) + else: + print('Reference organism is not in the database') diff --git a/src/gtax/sequence_binning.py b/src/gtax/sequence_binning.py new file mode 100644 index 0000000..1bc8dea --- /dev/null +++ b/src/gtax/sequence_binning.py @@ -0,0 +1,73 @@ +import argparse +import gzip +import os +import uuid +from functools import partial +from multiprocessing import Pool + +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +from gtax.utils import chunks, sequences_to_list + + +def binning(transcript, bin, step): + seq_len = len(transcript.seq) + for i in range(1, seq_len - bin, step): + yield SeqRecord(Seq(transcript.seq[i: i + bin]), id='{}_{}'.format(transcript.id, i), + description='') + + +def sequence_binning_to_tmpfile(transcripts, bin, step): + pre = uuid.uuid4() + with gzip.open('temp_{}.fa.gz'.format(pre), 'wt') as fout: + sequences = {} + for r in transcripts: + for s in binning(r, bin, step): + if not sequences.setdefault(str(s.seq), False): + sequences[str(s.seq)] = True + fout.write(s.format("fasta")) + return pre + + +def sequence_binning(reference_file, output, seq_len, step, threads): + print("Creating sequences") + with Pool(processes=threads) as p: + results = p.map(partial(sequence_binning_to_tmpfile, + bin=seq_len, + step=step), + [d for d in list(chunks(sequences_to_list(reference_file), 1000))]) + print('Printing final files') + print('Creating file {}'.format(output)) + with gzip.open('{}'.format(output), 'wt') as fout2: + sequences = {} + for pre in results: + with gzip.open('temp_{}.fa.gz'.format(pre), 'rt') as fin: + for r in SeqIO.parse(fin, "fasta"): + if not sequences.setdefault(str(r.seq), False): + sequences[str(r.seq)] = True + fout2.write(r.format('fasta')) + os.remove('temp_{}.fa.gz'.format(pre)) + + +def sequence_binning_main(): + parser = argparse.ArgumentParser() + + parser.add_argument('--reference', help='Reference Transcriptome gzip file', + required=True) + parser.add_argument('--output', help='Output file prefix', required=True) + parser.add_argument('--threads', help='Number of threads', + type=int, required=True) + parser.add_argument('--seq_len', help='Length of the short sequence', + default=100, type=int, required=True) + parser.add_argument('--step', + help='Sequence step to move the bin', + type=int, required=True) + args = parser.parse_args() + + sequence_binning(args.reference, + args.output, + args.seq_len, + args.step, + args.threads) diff --git a/src/gtax/taxonomy.py b/src/gtax/taxonomy.py index 8ae23b0..23b99a6 100644 --- a/src/gtax/taxonomy.py +++ b/src/gtax/taxonomy.py @@ -210,3 +210,9 @@ def print_size(self, node_name, deep=1, step=1, min_size=10.0, min_size_child=50 if t != node[1]['id']: self.print_size(self.nodes[str(t)]['name'], deep, step, min_size, min_size_child) + + def get_taxonomy_group_from_taxid(self, taxid): + for k in self.taxonomy_groups: + if taxid in self.taxonomy_groups[k]['nodes']: + return k + return None \ No newline at end of file diff --git a/src/gtax/utils.py b/src/gtax/utils.py new file mode 100644 index 0000000..fffb835 --- /dev/null +++ b/src/gtax/utils.py @@ -0,0 +1,23 @@ +import gzip + +from Bio import SeqIO + + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + + +def yield_sequences(reference_file): + with gzip.open(reference_file, 'rt') as trans_stream: + for r in SeqIO.parse(trans_stream, "fasta"): + yield r + + +def sequences_to_list(reference_file): + sequences = [] + with gzip.open(reference_file, 'rt') as trans_stream: + for r in SeqIO.parse(trans_stream, "fasta"): + sequences.append(r) + return sequences