Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix build against Python 3.12 #1922

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions khmer/kfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
"""File handling/checking utilities for command-line scripts."""


import contextlib
import os
import sys
import errno
Expand Down Expand Up @@ -246,3 +247,37 @@ def get_file_writer(file_handle, do_gzip, do_bzip):
ofile = file_handle

return ofile


@contextlib.contextmanager
def FileWriter(file_handle, do_gzip, do_bzip, *, steal_ownership=False):
    """Alternative to get_file_writer() that must be used as a context manager.

    get_file_writer() is hard to use safely as a context manager: when both
    do_gzip and do_bzip are False it returns the underlying file handle, so

    > with get_file_writer(sys.stdout, bzip=False, gzip=False) as fh:
    >     pass

    ends up closing sys.stdout when the with block is exited.  Used without
    a context manager it instead leaks the compression wrapper whenever
    do_gzip or do_bzip is True.

    FileWriter always closes any compression wrapper it creates, but leaves
    the caller-supplied file_handle open unless steal_ownership=True, in
    which case file_handle itself is closed on exit -- even if the body of
    the with block raises.

    :param file_handle: binary file object to write to.
    :param do_gzip: wrap file_handle in a gzip compressor.
    :param do_bzip: wrap file_handle in a bzip2 compressor.
    :param steal_ownership: also close file_handle itself on exit.
    :raises ValueError: if both do_gzip and do_bzip are requested.
    """
    if do_gzip and do_bzip:
        raise ValueError("Cannot specify both bzip and gzip compression!")

    try:
        if do_gzip:
            ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
        elif do_bzip:
            ofile = bz2.open(file_handle, mode='w')
        else:
            # nullcontext prevents the with block from closing the
            # caller's handle when no compression wrapper is involved.
            ofile = contextlib.nullcontext(enter_result=file_handle)

        with ofile as out:
            yield out
    finally:
        # Run unconditionally: the previous version skipped this close when
        # the with body raised, leaking file_handle despite steal_ownership.
        if steal_ownership:
            file_handle.close()
4 changes: 4 additions & 0 deletions scripts/abundance-dist-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ def __do_abundance_dist__(read_parser):

log_info('wrote to: {output}', output=args.output_histogram_filename)

# Ensure that the output files are properly written. Python 3.12 seems to
# be less forgiving here ..
hist_fp.close()


if __name__ == '__main__':
main()
Expand Down
33 changes: 18 additions & 15 deletions scripts/abundance-dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
Use '-h' for parameter help.
"""

import contextlib
import sys
import csv
import khmer
Expand Down Expand Up @@ -143,26 +144,28 @@ def main():
sys.exit(1)

if args.output_histogram_filename in ('-', '/dev/stdout'):
countgraph_fp = sys.stdout
countgraph_ctx = contextlib.nullcontext(enter_result=sys.stdout)
else:
countgraph_fp = open(args.output_histogram_filename, 'w')
countgraph_fp_csv = csv.writer(countgraph_fp)
# write headers:
countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])
countgraph_ctx = open(args.output_histogram_filename, 'w')

sofar = 0
for _, i in enumerate(abundances):
if i == 0 and not args.output_zero:
continue
with countgraph_ctx as countgraph_fp:
countgraph_fp_csv = csv.writer(countgraph_fp)
# write headers:
countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])

sofar += i
frac = sofar / float(total)
sofar = 0
for _, i in enumerate(abundances):
if i == 0 and not args.output_zero:
continue

countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])
sofar += i
frac = sofar / float(total)

if sofar == total:
break
countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

if sofar == total:
break


if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions scripts/do-partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
worker_q.put((nodegraph, _, start, end))

print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
% (n_subsets))
with open('%s.info' % args.graphbase, 'w') as info_fp:
info_fp.write('%d subsets total\n' % (n_subsets))

if n_subsets < args.threads:
args.threads = n_subsets
Expand Down
14 changes: 7 additions & 7 deletions scripts/extract-long-sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
import sys
from khmer import __version__
from khmer.utils import write_record
from khmer.kfile import add_output_compression_type, get_file_writer
from khmer.kfile import add_output_compression_type, FileWriter
from khmer.khmer_args import sanitize_help, KhmerArgumentParser


Expand Down Expand Up @@ -81,12 +81,12 @@ def get_parser():

def main():
args = sanitize_help(get_parser()).parse_args()
outfp = get_file_writer(args.output, args.gzip, args.bzip)
for filename in args.input_filenames:
for record in screed.open(filename):
if len(record['sequence']) >= args.length:
write_record(record, outfp)
print('wrote to: ' + args.output.name, file=sys.stderr)
with FileWriter(args.output, args.gzip, args.bzip) as outfp:
for filename in args.input_filenames:
for record in screed.open(filename):
if len(record['sequence']) >= args.length:
write_record(record, outfp)
print('wrote to: ' + args.output.name, file=sys.stderr)


if __name__ == '__main__':
Expand Down
58 changes: 30 additions & 28 deletions scripts/extract-paired-reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@

Reads FASTQ and FASTA input, retains format for output.
"""
from contextlib import nullcontext
import sys
import os.path
import textwrap
Expand All @@ -53,7 +54,7 @@
from khmer.khmer_args import sanitize_help, KhmerArgumentParser
from khmer.khmer_args import FileType as khFileType
from khmer.kfile import add_output_compression_type
from khmer.kfile import get_file_writer
from khmer.kfile import FileWriter

from khmer.utils import broken_paired_reader, write_record, write_record_pair

Expand Down Expand Up @@ -132,39 +133,40 @@ def main():

# OVERRIDE default output file locations with -p, -s
if args.output_paired:
paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
out2 = paired_fp.name
paired_ctx = FileWriter(args.output_paired, args.gzip, args.bzip)
out2 = args.output_paired.name
else:
# Don't override, just open the default filename from above
paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
paired_ctx = FileWriter(open(out2, 'wb'), args.gzip, args.bzip,
steal_ownership=True)
if args.output_single:
single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
single_ctx = FileWriter(args.output_single, args.gzip, args.bzip)
out1 = args.output_single.name
else:
# Don't override, just open the default filename from above
single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

print('reading file "%s"' % infile, file=sys.stderr)
print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
print('outputting orphans to "%s"' % out1, file=sys.stderr)

n_pe = 0
n_se = 0

reads = ReadParser(infile)
for index, is_pair, read1, read2 in broken_paired_reader(reads):
if index % 100000 == 0 and index > 0:
print('...', index, file=sys.stderr)

if is_pair:
write_record_pair(read1, read2, paired_fp)
n_pe += 1
else:
write_record(read1, single_fp)
n_se += 1

single_fp.close()
paired_fp.close()
single_ctx = FileWriter(open(out1, 'wb'), args.gzip, args.bzip,
steal_ownership=True)

with paired_ctx as paired_fp, single_ctx as single_fp:
print('reading file "%s"' % infile, file=sys.stderr)
print('outputting interleaved pairs to "%s"' % out2,
file=sys.stderr)
print('outputting orphans to "%s"' % out1, file=sys.stderr)

n_pe = 0
n_se = 0

reads = ReadParser(infile)
for index, is_pair, read1, read2 in broken_paired_reader(reads):
if index % 100000 == 0 and index > 0:
print('...', index, file=sys.stderr)

if is_pair:
write_record_pair(read1, read2, paired_fp)
n_pe += 1
else:
write_record(read1, single_fp)
n_se += 1

if n_pe == 0:
raise Exception("no paired reads!? check file formats...")
Expand Down
21 changes: 16 additions & 5 deletions scripts/extract-partitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,10 +301,10 @@ def main():
args.max_size)

if args.output_unassigned:
ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
extractor.process_unassigned(unassigned_fp)
unassigned_fp.close()
with open('%s.unassigned.%s' % (args.prefix, suffix), 'wb') as ofile:
unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
extractor.process_unassigned(unassigned_fp)
unassigned_fp.close()
else:
extractor.process_unassigned()

Expand All @@ -320,13 +320,21 @@ def main():
print('nothing to output; exiting!', file=sys.stderr)
return

to_close = []
# open a bunch of output files for the different groups
group_fps = {}
for index in range(extractor.group_n):
fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
back_fp = open(fname, 'wb')
group_fp = get_file_writer(back_fp, args.gzip,
args.bzip)
group_fps[index] = group_fp
# It feels more natural to close the writer before closing the
# underlying file. fp.close() is theoretically idempotent, so it should
# be fine even though sometimes get_file_writer "steals" ownership of
# the underlying stream.
to_close.append(group_fp)
to_close.append(back_fp)

# write 'em all out!
# refresh the generator
Expand All @@ -351,6 +359,9 @@ def main():
args.prefix,
suffix), file=sys.stderr)

for fp in to_close:
fp.close()


if __name__ == '__main__':
main()
26 changes: 13 additions & 13 deletions scripts/fastq-to-fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
import sys
import screed
from khmer import __version__
from khmer.kfile import (add_output_compression_type, get_file_writer,
from khmer.kfile import (add_output_compression_type, FileWriter,
describe_file_handle)
from khmer.utils import write_record
from khmer.khmer_args import sanitize_help, KhmerArgumentParser
Expand Down Expand Up @@ -74,21 +74,21 @@ def main():
args = sanitize_help(get_parser()).parse_args()

print('fastq from ', args.input_sequence, file=sys.stderr)
outfp = get_file_writer(args.output, args.gzip, args.bzip)
n_count = 0
for n, record in enumerate(screed.open(args.input_sequence)):
if n % 10000 == 0:
print('...', n, file=sys.stderr)
with FileWriter(args.output, args.gzip, args.bzip) as outfp:
n_count = 0
for n, record in enumerate(screed.open(args.input_sequence)):
if n % 10000 == 0:
print('...', n, file=sys.stderr)

sequence = record['sequence']
sequence = record['sequence']

if 'N' in sequence:
if not args.n_keep:
n_count += 1
continue
if 'N' in sequence:
if not args.n_keep:
n_count += 1
continue

del record['quality']
write_record(record, outfp)
del record['quality']
write_record(record, outfp)

print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

Expand Down
35 changes: 18 additions & 17 deletions scripts/filter-abund-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from khmer.kfile import (check_input_files, check_space,
check_space_for_graph,
add_output_compression_type,
get_file_writer)
FileWriter)
from khmer.khmer_logger import (configure_logging, log_info, log_error,
log_warn)
from khmer.trimming import (trim_record)
Expand Down Expand Up @@ -160,22 +160,23 @@ def main():
outfile = os.path.basename(args.datafile) + '.abundfilt'
else:
outfile = args.outfile
outfp = open(outfile, 'wb')
outfp = get_file_writer(outfp, args.gzip, args.bzip)

paired_iter = broken_paired_reader(ReadParser(args.datafile),
min_length=graph.ksize(),
force_single=True)

for n, is_pair, read1, read2 in paired_iter:
assert not is_pair
assert read2 is None

trimmed_record, _ = trim_record(graph, read1, args.cutoff,
args.variable_coverage,
args.normalize_to)
if trimmed_record:
write_record(trimmed_record, outfp)

with FileWriter(open(outfile, 'wb'), args.gzip, args.bzip,
steal_ownership=True) as outfp:

paired_iter = broken_paired_reader(ReadParser(args.datafile),
min_length=graph.ksize(),
force_single=True)

for n, is_pair, read1, read2 in paired_iter:
assert not is_pair
assert read2 is None

trimmed_record, _ = trim_record(graph, read1, args.cutoff,
args.variable_coverage,
args.normalize_to)
if trimmed_record:
write_record(trimmed_record, outfp)

log_info('output in {outfile}', outfile=outfile)

Expand Down
Loading