-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor the code and update it to reduce memory use
- Loading branch information
1 parent
ebd2ddf
commit 7d4007f
Showing 2 changed files with 98 additions and 213 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,13 +2,9 @@ | |
""" | ||
Split fasta files by accumulated sequence length or number | ||
Copyright@fanjunpeng ([email protected]) | ||
2018/1/12: init by fanjunpeng | ||
2018/1/13: add support for more fasta files | ||
2018/1/13: add support for more multiprocess | ||
""" | ||
|
||
from __future__ import absolute_import | ||
#from __future__ import absolute_import | ||
|
||
import logging | ||
import os | ||
|
@@ -23,128 +19,71 @@ | |
LOG = logging.getLogger(__name__) | ||
|
||
|
||
def write_record(records, out_filename): | ||
""" | ||
write fasta records to file | ||
def split_fasta(filename, mode, number, output_dir="split", max_split=1000): | ||
""" | ||
|
||
# write seq records | ||
fasta_out = open(out_filename, "w") | ||
bed_out = open("%s.bed" % out_filename, 'w') | ||
|
||
for record in records: | ||
fasta_out.write("%s\n" % str(record)) | ||
bed_out.write("%s\t1\t%s\n" % (record.id, record.length)) | ||
|
||
fasta_out.close() | ||
bed_out.close() | ||
|
||
return out_filename | ||
|
||
|
||
def split_by_number(filename, number, output_dir="split", max_split=1000): | ||
""" | ||
split Fasta file by {number} records per file | ||
split Fasta file by {number} length per file | ||
:param filename: Fasta filename | ||
:param number: records per file | ||
:param mode: length or number | ||
:param number: record length per file | ||
:param output_dir: the output directory of sub fasta files | ||
:param max_split: the max number of sub files, avoid too much sub files | ||
:return: list of sub file names | ||
""" | ||
|
||
assert isinstance(number, int) | ||
assert mode in ["length", "number"] | ||
|
||
r = [] | ||
|
||
head = os.path.splitext(os.path.basename(filename.rstrip(".gz")))[0] | ||
prefix = os.path.join(output_dir, head) | ||
|
||
sub_num = 0 | ||
fasta_records = [] | ||
n = 0 | ||
n = 1 | ||
records = open_fasta(filename) | ||
|
||
for record in open_fasta(filename): | ||
while True: | ||
|
||
if n == number: | ||
if n >= max_split: | ||
msg = "file %r cuts more than %s, break" % (filename, max_split) | ||
raise Exception(msg) | ||
|
||
if sub_num > max_split: | ||
msg = "file %r cuts more than %s, break" % (filename, max_split) | ||
raise Exception(msg) | ||
out_filename = "%s%s.fasta" % (prefix, n) | ||
out = open(out_filename, "w") | ||
count = 0 | ||
|
||
out_filename = "%s.%s.fasta" % (prefix, sub_num) | ||
write_record(fasta_records, out_filename) | ||
r.append(out_filename) | ||
sub_num += 1 | ||
while count < number: | ||
record = records.next() | ||
|
||
fasta_records = [] | ||
n = 0 | ||
if record: | ||
out.write(str(record)+"\n") | ||
if mode == "length": | ||
count += record.length | ||
else: | ||
count += 1 | ||
else: | ||
break | ||
|
||
fasta_records.append(record) | ||
n += 1 | ||
|
||
# add the last part if has | ||
if fasta_records: | ||
out_filename = "%s.%s.fasta" % (prefix, sub_num) | ||
write_record(fasta_records, out_filename) | ||
out.close() | ||
r.append(out_filename) | ||
|
||
return r | ||
|
||
if count < number: | ||
break | ||
|
||
def split_by_length(filename, number, output_dir="split", max_split=1000): | ||
""" | ||
split Fasta file by {number} length per file | ||
:param filename: Fasta filename | ||
:param number: record length per file | ||
:param output_dir: the output directory of sub fasta files | ||
:param max_split: the max number of sub files, avoid too much sub files | ||
:return: list of sub file names | ||
""" | ||
r = [] | ||
|
||
head = os.path.splitext(os.path.basename(filename.rstrip(".gz")))[0] | ||
prefix = os.path.join(output_dir, head) | ||
|
||
sub_num = 0 | ||
fasta_records = [] | ||
n = 0 | ||
|
||
for record in open_fasta(filename): | ||
|
||
if n > number: | ||
|
||
if sub_num > max_split: | ||
msg = "file %r cuts more than %s, break" % (filename, max_split) | ||
raise Exception(msg) | ||
|
||
out_filename = "%s.%s.fasta" % (prefix, sub_num) | ||
write_record(fasta_records, out_filename) | ||
r.append(out_filename) | ||
sub_num += 1 | ||
|
||
fasta_records = [] | ||
n = 0 | ||
|
||
fasta_records.append(record) | ||
n += record.length | ||
|
||
# add the last part if has | ||
if fasta_records: | ||
out_filename = "%s.%s.fasta" % (prefix, sub_num) | ||
write_record(fasta_records, out_filename) | ||
r.append(out_filename) | ||
n += 1 | ||
|
||
return r | ||
|
||
|
||
def fastaSplit(filenames, mode, num, output_dir, concurrent=1, max_split=1000): | ||
""" | ||
split fasta files | ||
split fasta files, use multiprocess for parallel | ||
:param filenames: a list of fasta files | ||
:param mode: length or number | ||
:param num: | ||
:param num: | ||
:param output_dir: output directory | ||
:param concurrent: see -h | ||
:param max_split: see -h | ||
:return: | ||
:return: | ||
""" | ||
assert mode in ["number", "length"] | ||
num = int(num) | ||
|
@@ -155,19 +94,18 @@ def fastaSplit(filenames, mode, num, output_dir, concurrent=1, max_split=1000): | |
|
||
# avoid rerun | ||
if os.path.exists(done): | ||
print("%r exists, pass this step; if you want to rerun, delete the file" % done) | ||
LOG.info("%r exists, pass this step; if you want to rerun, delete the file" % done) | ||
return fofn2list(split_list) | ||
|
||
# for multiprocessing | ||
pool = Pool(processes=concurrent) | ||
results = [] | ||
print("Split '{filenames}' by sequence {mode} =~ {num} per file".format(**locals())) | ||
if mode == "number": | ||
for f in filenames: | ||
results.append(pool.apply_async(split_by_number, (f, num, output_dir, max_split))) | ||
if mode == "length": | ||
for f in filenames: | ||
results.append(pool.apply_async(split_by_length, (f, num, output_dir, max_split))) | ||
|
||
LOG.info("Split '{filenames}' by sequence {mode} =~ {num} per file".format(**locals())) | ||
|
||
for f in filenames: | ||
print("processing %s" % f) | ||
results.append(pool.apply_async(split_fasta, (f, mode, num, output_dir, max_split))) | ||
|
||
pool.close() | ||
pool.join() | ||
|
@@ -193,10 +131,10 @@ def set_args(): | |
""") | ||
|
||
args.add_argument("input", metavar="FASTAs", nargs="+", help="fasta files") | ||
args.add_argument("-m", "--mode", choices=["number", "length"], default="length", help="split by number or length") | ||
args.add_argument("-n", "--number", type=int, metavar="INT", help="the value of mode") | ||
args.add_argument("-m", "--mode", choices=["number", "length"], required=True, help="split by number or length") | ||
args.add_argument("-n", "--number", type=int, required=True, metavar="INT", help="the value of mode") | ||
args.add_argument("-o", "--output_dir", default="split", metavar="DIR", help="output directory") | ||
args.add_argument("-ms", "--max_split", type=int, default=1000, metavar="INT", help="the max number of sub files") | ||
args.add_argument("-ms", "--max_split", type=int, default=6666, metavar="INT", help="the max number of sub files") | ||
args.add_argument("-c", "--concurrent", type=int, default=1, metavar="INT", help="number of concurrent process") | ||
return args.parse_args() | ||
|
||
|
Oops, something went wrong.