Skip to content

Commit

Permalink
Refactor the code and update it to reduce memory use
Browse files Browse the repository at this point in the history
  • Loading branch information
FlyPythons committed Apr 17, 2018
1 parent ebd2ddf commit 7d4007f
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 213 deletions.
150 changes: 44 additions & 106 deletions fastaSplit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@
"""
Split fasta files by accumulated sequence length or by number of records
Copyright@fanjunpeng ([email protected])
2018/1/12: init by fanjunpeng
2018/1/13: add support for more fasta files
2018/1/13: add support for more multiprocess
"""

from __future__ import absolute_import
#from __future__ import absolute_import

import logging
import os
Expand All @@ -23,128 +19,71 @@
LOG = logging.getLogger(__name__)


def write_record(records, out_filename):
"""
write fasta records to file
def split_fasta(filename, mode, number, output_dir="split", max_split=1000):
"""

# write seq records
fasta_out = open(out_filename, "w")
bed_out = open("%s.bed" % out_filename, 'w')

for record in records:
fasta_out.write("%s\n" % str(record))
bed_out.write("%s\t1\t%s\n" % (record.id, record.length))

fasta_out.close()
bed_out.close()

return out_filename


def split_by_number(filename, number, output_dir="split", max_split=1000):
"""
split Fasta file by {number} records per file
split Fasta file into sub files, each holding about {number} of accumulated sequence length (mode "length") or {number} records (mode "number")
:param filename: Fasta filename
:param number: records per file
:param mode: length or number
:param number: accumulated record length per file (mode "length") or number of records per file (mode "number")
:param output_dir: the output directory of sub fasta files
:param max_split: the max number of sub files, to avoid creating too many sub files
:return: list of sub file names
"""

assert isinstance(number, int)
assert mode in ["length", "number"]

r = []

head = os.path.splitext(os.path.basename(filename.rstrip(".gz")))[0]
prefix = os.path.join(output_dir, head)

sub_num = 0
fasta_records = []
n = 0
n = 1
records = open_fasta(filename)

for record in open_fasta(filename):
while True:

if n == number:
if n >= max_split:
msg = "file %r cuts more than %s, break" % (filename, max_split)
raise Exception(msg)

if sub_num > max_split:
msg = "file %r cuts more than %s, break" % (filename, max_split)
raise Exception(msg)
out_filename = "%s%s.fasta" % (prefix, n)
out = open(out_filename, "w")
count = 0

out_filename = "%s.%s.fasta" % (prefix, sub_num)
write_record(fasta_records, out_filename)
r.append(out_filename)
sub_num += 1
while count < number:
record = records.next()

fasta_records = []
n = 0
if record:
out.write(str(record)+"\n")
if mode == "length":
count += record.length
else:
count += 1
else:
break

fasta_records.append(record)
n += 1

# add the last part if has
if fasta_records:
out_filename = "%s.%s.fasta" % (prefix, sub_num)
write_record(fasta_records, out_filename)
out.close()
r.append(out_filename)

return r

if count < number:
break

def split_by_length(filename, number, output_dir="split", max_split=1000):
"""
split Fasta file by {number} length per file
:param filename: Fasta filename
:param number: record length per file
:param output_dir: the output directory of sub fasta files
:param max_split: the max number of sub files, to avoid creating too many sub files
:return: list of sub file names
"""
r = []

head = os.path.splitext(os.path.basename(filename.rstrip(".gz")))[0]
prefix = os.path.join(output_dir, head)

sub_num = 0
fasta_records = []
n = 0

for record in open_fasta(filename):

if n > number:

if sub_num > max_split:
msg = "file %r cuts more than %s, break" % (filename, max_split)
raise Exception(msg)

out_filename = "%s.%s.fasta" % (prefix, sub_num)
write_record(fasta_records, out_filename)
r.append(out_filename)
sub_num += 1

fasta_records = []
n = 0

fasta_records.append(record)
n += record.length

# add the last part if has
if fasta_records:
out_filename = "%s.%s.fasta" % (prefix, sub_num)
write_record(fasta_records, out_filename)
r.append(out_filename)
n += 1

return r


def fastaSplit(filenames, mode, num, output_dir, concurrent=1, max_split=1000):
"""
split fasta files
split fasta files, using a multiprocessing pool to process the input files in parallel
:param filenames: a list of fasta files
:param mode: length or number
:param num:
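:param num: the split value per sub file (accumulated sequence length or number of records, according to mode); converted with int()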
:param output_dir: output directory
:param concurrent: number of concurrent processes in the multiprocessing pool (see -h)
:param max_split: the max number of sub files (see -h)
:return:
:return:
"""
assert mode in ["number", "length"]
num = int(num)
Expand All @@ -155,19 +94,18 @@ def fastaSplit(filenames, mode, num, output_dir, concurrent=1, max_split=1000):

# avoid rerun
if os.path.exists(done):
print("%r exists, pass this step; if you want to rerun, delete the file" % done)
LOG.info("%r exists, pass this step; if you want to rerun, delete the file" % done)
return fofn2list(split_list)

# for multiprocessing
pool = Pool(processes=concurrent)
results = []
print("Split '{filenames}' by sequence {mode} =~ {num} per file".format(**locals()))
if mode == "number":
for f in filenames:
results.append(pool.apply_async(split_by_number, (f, num, output_dir, max_split)))
if mode == "length":
for f in filenames:
results.append(pool.apply_async(split_by_length, (f, num, output_dir, max_split)))

LOG.info("Split '{filenames}' by sequence {mode} =~ {num} per file".format(**locals()))

for f in filenames:
print("processing %s" % f)
results.append(pool.apply_async(split_fasta, (f, mode, num, output_dir, max_split)))

pool.close()
pool.join()
Expand All @@ -193,10 +131,10 @@ def set_args():
""")

args.add_argument("input", metavar="FASTAs", nargs="+", help="fasta files")
args.add_argument("-m", "--mode", choices=["number", "length"], default="length", help="split by number or length")
args.add_argument("-n", "--number", type=int, metavar="INT", help="the value of mode")
args.add_argument("-m", "--mode", choices=["number", "length"], required=True, help="split by number or length")
args.add_argument("-n", "--number", type=int, required=True, metavar="INT", help="the value of mode")
args.add_argument("-o", "--output_dir", default="split", metavar="DIR", help="output directory")
args.add_argument("-ms", "--max_split", type=int, default=1000, metavar="INT", help="the max number of sub files")
args.add_argument("-ms", "--max_split", type=int, default=6666, metavar="INT", help="the max number of sub files")
args.add_argument("-c", "--concurrent", type=int, default=1, metavar="INT", help="number of concurrent process")
return args.parse_args()

Expand Down
Loading

0 comments on commit 7d4007f

Please sign in to comment.