From 10093c6d84bd8341b412c6d290d36c9619894788 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:14:54 +0000 Subject: [PATCH 1/5] drop unused/outdated index formats (indexed_tsv and zpickle) --- ir_datasets/indices/__init__.py | 2 - ir_datasets/indices/indexed_tsv_docstore.py | 386 -------------------- ir_datasets/indices/zpickle_docstore.py | 172 --------- 3 files changed, 560 deletions(-) delete mode 100644 ir_datasets/indices/indexed_tsv_docstore.py delete mode 100644 ir_datasets/indices/zpickle_docstore.py diff --git a/ir_datasets/indices/__init__.py b/ir_datasets/indices/__init__.py index de2fd089..70fc9d71 100644 --- a/ir_datasets/indices/__init__.py +++ b/ir_datasets/indices/__init__.py @@ -1,6 +1,4 @@ from .base import Docstore -from .indexed_tsv_docstore import IndexedTsvDocstore -from .zpickle_docstore import ZPickleDocStore from .numpy_sorted_index import NumpySortedIndex, NumpyPosIndex from .lz4_pickle import Lz4PickleLookup, PickleLz4FullStore from .cache_docstore import CacheDocstore diff --git a/ir_datasets/indices/indexed_tsv_docstore.py b/ir_datasets/indices/indexed_tsv_docstore.py deleted file mode 100644 index e05d9382..00000000 --- a/ir_datasets/indices/indexed_tsv_docstore.py +++ /dev/null @@ -1,386 +0,0 @@ -import os -import shutil -import json -import zlib -import pickle -from contextlib import contextmanager -import ir_datasets - - -_logger = ir_datasets.log.easy() - - - -class ZPickleKeyValueStore: - def __init__(self, path, value_encoder=None): - self._path = path - self._idx = None - self._bin = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 'idx')) - return self._idx - - def bin(self): - if self._bin is None: - self._bin = open(os.path.join(self._path, 'bin'), 'rb') - return self._bin - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._bin: - self._bin.close() - self._bin = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with ZPickleDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key, field = value, Ellipsis - binf = self.bin() - binf.seek(self.idx().get(key)) - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - if content[0] != key: - raise KeyError(f'key={key} not found') - if field is Ellipsis: - return dict(content[1:]) - for f, val in content[1:]: - if field == f: - return val - raise KeyError(f'field={field} not found for key={key}') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - binf = self.bin() - binf.seek(0) - while binf.read(1): # peek - binf.seek(-1) # un-peek - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - yield content[0], dict(content[1:]) - - def __len__(self): - # number of keys - return len(self.idx()) - - - -class IndexedTsvKeyValueStore: - def __init__(self, path, value_encoder=None): - self._path = path - self._value_encoder = value_encoder - self._idx = None - self._tsv = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 
'idx')) - return self._idx - - def tsv(self): - if self._tsv is None: - self._tsv = open(os.path.join(self._path, 'tsv'), 'rt') - return self._tsv - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._tsv: - self._tsv.close() - self._tsv = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with IndexedTsvDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key = value - field = ... - record = {} - tsv = self.tsv() - tsv.seek(self.idx().get(key)) - for line in tsv: - cols = line.rstrip().split('\t') - if len(cols) == 1: - if cols[0] != key: - break # end of doc - else: - continue # key verified - l_field, l_text = cols - if field is Ellipsis: - if self._value_encoder == 'json': - l_text = json.loads(l_text) - record[l_field] = l_text - else: - if l_field == field: - if self._value_encoder == 'json': - l_text = json.loads(l_text) - return l_text - if field is Ellipsis: - if not record: - raise KeyError(f'key={key} not found') - return record - raise KeyError(f'key={key} field={field} not found') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - tsv = self.tsv() - tsv.seek(0) - key = None - doc = None - for line in tsv: - cols = line.rstrip().split('\t') - if len(cols) == 1: - if doc is not None: - yield key, doc - key = cols[0] - doc = {} - else: - if self._value_encoder == 'json': - cols[1] = json.loads(cols[1]) - doc[cols[0]] = cols[1] - if doc is not None: - yield key, doc - - def __len__(self): - # number of keys - return len(self.idx()) - - -class IndexedTsvDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.tsv = open(os.path.join(self.path, 'tsv'), 'wt') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.tsv.flush() - self.tsv.close() - # self.docstore.merge(IndexedTsvDocstore(self.path)) - # shutil.rmtree(self.path) - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.tsv.tell()) - self.tsv.write(f'{key}\n') - for field, value in zip(type(fields)._fields, fields): - if self.docstore._value_encoder == 'json': - value = json.dumps(value) - elif self.docstore._value_encoder is None: - value = value.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ') - self.tsv.write(f'{field}\t{value}\n') - - -class ZPickleDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.bin = open(os.path.join(self.path, 'bin'), 'wb') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.bin.flush() - self.bin.close() - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.bin.tell()) - content = tuple(zip(type(fields)._fields, fields)) - content = pickle.dumps(content) - content = zlib.compress(content) - content_length = len(content) - self.bin.write(content_length.to_bytes(4, 'little')) - 
self.bin.write(content) - - -class NumpyPosIndex: - def __init__(self, path): - self.path = path - self.data = None - self.mmap1 = None - self.mmap2 = None - self.doccount = None - self.didlen = None - self.np = ir_datasets.lazy_libs.numpy() - - def add(self, did, idx): - if self.data is None: - self.data = {} - self.data[did] = idx - - def commit(self): - didlen = max(len(x) for x in self.data) - sorted_data = sorted(self.data.items()) - # Use zero-terminated bytes here (S) rather than unicode type (U) because U includes a ton - # of extra padding (for longer unicode formats), which can inflate the size of the index greatly. - array1 = self.np.array([x[0].encode('utf8') for x in sorted_data], dtype=f'S{didlen}') - array2 = self.np.array([x[1] for x in sorted_data], dtype=f'int64') - m1 = self.np.memmap(f'{self.path}.did', dtype=array1.dtype, mode='w+', shape=array1.shape) - m1[:] = array1[:] - del m1 - m2 = self.np.memmap(f'{self.path}.pos', dtype=array2.dtype, mode='w+', shape=array2.shape) - m2[:] = array2[:] - del m2 - with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f: - f.write(f'{didlen} {len(self.data)}') - self.data = None - - def _lazy_load(self): - if self.mmap1 is None: - with open(f'{self.path}.meta', 'rt') as f: - self.didlen, self.doccount = f.read().split() - self.didlen, self.doccount = int(self.didlen), int(self.doccount) - self.mmap1 = self.np.memmap(f'{self.path}.did', dtype=f'S{self.didlen}', mode='r', shape=(self.doccount,)) - self.mmap2 = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,)) - - def get(self, did): - self._lazy_load() - did = did.encode('utf8') - loc = self.np.searchsorted(self.mmap1, did) - if self.mmap1[loc] == did: - return self.mmap2[loc] - return 0 - - def close(self): - if self.mmap1 is not None: - del self.mmap1 - self.mmap1 = None - if self.mmap2 is not None: - del self.mmap2 - self.mmap2 = None - self.data = None - - def __iter__(self): - # iterates keys - self._lazy_load() - for i in range(len(self)): - yield self.mmap1[i].decode('utf8') - - def __len__(self): - # number of keys - self._lazy_load() - return self.doccount - - -def dir_size(path): - # Adapted from - total = 0 - for entry in os.scandir(path): - if entry.is_file(): - total += entry.stat().st_size - elif entry.is_dir(): - total += dir_size(entry.path) - return total - - - -class IndexedTsvDocstore: - file_ext = 'itsv' - - def __init__(self, path, doc_cls, value_encoder='json', id_field='doc_id', store=IndexedTsvKeyValueStore): - self._path = path - self._doc_cls = doc_cls - self._id_field = id_field - self._id_field_idx = doc_cls._fields.index(id_field) - self._store = store(path, value_encoder=value_encoder) - - def built(self): - return os.path.exists(self._path) - - def purge(self): - self._store.purge() - - def build(self, documents): - with self._store.transaction() as trans: - for doc in documents: - trans.add(doc[self._id_field_idx], doc) - - def get(self, did, field=None): - if field is not None: - return self._store[did, field] - result = self._store[did] - return self._doc_cls(*(result[f] for f in self._doc_cls._fields)) - - def get_many(self, dids, field=None): - result = {} - for did in dids: - try: - result[did] = self.get(did, field) - except ValueError: - pass - return result - - def num_docs(self): - return len(self._store) - - def docids(self): - return iter(self._store.idx()) - - def iter_docs(self): - for did, fields in iter(self._store): - yield self._doc_cls(*(fields[f] for f in self._doc_cls._fields)) - - def 
path(self, force=True): - return self._path - - def file_size(self): - return dir_size(self._path) diff --git a/ir_datasets/indices/zpickle_docstore.py b/ir_datasets/indices/zpickle_docstore.py deleted file mode 100644 index 309db45e..00000000 --- a/ir_datasets/indices/zpickle_docstore.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -import shutil -import json -import zlib -import pickle -from contextlib import contextmanager -from .indexed_tsv_docstore import NumpyPosIndex -import ir_datasets - - -_logger = ir_datasets.log.easy() - - - -class ZPickleKeyValueStore: - def __init__(self, path, id_idx, doc_cls): - self._path = path - self._id_idx = id_idx - self._doc_cls = doc_cls - self._idx = None - self._bin = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 'idx')) - return self._idx - - def bin(self): - if self._bin is None: - self._bin = open(os.path.join(self._path, 'bin'), 'rb') - return self._bin - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._bin: - self._bin.close() - self._bin = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with ZPickleDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key, field = value, Ellipsis - binf = self.bin() - binf.seek(self.idx().get(key)) - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - if content[self._id_idx][1] != key: - raise KeyError(f'key={key} not found') - if field is Ellipsis: - content = dict(content) - return self._doc_cls(*(content.get(f) for f in self._doc_cls._fields)) - for f, val in content: - if field == f: - return val - raise KeyError(f'field={field} not found for key={key}') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - binf = self.bin() - binf.seek(0) - while binf.read(1): # peek - binf.seek(-1, 1) # un-peek - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - content = dict(content) - yield self._doc_cls(*(content.get(f) for f in self._doc_cls._fields)) - - def __len__(self): - # number of keys - return len(self.idx()) - - -class ZPickleDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.bin = open(os.path.join(self.path, 'bin'), 'wb') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.bin.flush() - self.bin.close() - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.bin.tell()) - content = tuple(zip(type(fields)._fields, fields)) - content = pickle.dumps(content) - content = zlib.compress(content) - content_length = len(content) - self.bin.write(content_length.to_bytes(4, 'little')) - self.bin.write(content) - - -class ZPickleDocStore: - file_ext = 'zpkl' - - def __init__(self, path, doc_cls, id_field='doc_id'): - self._path = path - self._doc_cls = doc_cls - self._id_field = id_field - 
self._id_field_idx = doc_cls._fields.index(id_field) - self._store = ZPickleKeyValueStore(path, self._id_field_idx, self._doc_cls) - - def built(self): - return os.path.exists(self._path) - - def purge(self): - self._store.purge() - - def build(self, documents): - with self._store.transaction() as trans: - for doc in documents: - trans.add(doc[self._id_field_idx], doc) - - def get(self, did, field=None): - if field is not None: - return self._store[did, field] - return self._store[did] - - def get_many(self, dids, field=None): - result = {} - for did in dids: - try: - result[did] = self.get(did, field) - except ValueError: - pass - return result - - def num_docs(self): - return len(self._store) - - def docids(self): - return iter(self._store.idx()) - - def __iter__(self): - return iter(self._store) - - def path(self, force=True): - return self._path From d998ca5fcb349ef3531bffadcacde901f772f56b Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:39:01 +0000 Subject: [PATCH 2/5] adding encoding in places that open files/streams for reading --- ir_datasets/commands/build_c4_checkpoints.py | 2 +- ir_datasets/commands/build_download_cache.py | 2 +- ir_datasets/commands/doc_fifos.py | 2 +- ir_datasets/commands/generate_metadata.py | 2 +- ir_datasets/datasets/antique.py | 2 +- ir_datasets/datasets/clueweb09.py | 2 +- ir_datasets/datasets/clueweb12.py | 2 +- ir_datasets/datasets/codesearchnet.py | 10 +++---- ir_datasets/datasets/cord19.py | 2 +- ir_datasets/datasets/cranfield.py | 4 +-- ir_datasets/datasets/gov2.py | 2 +- ir_datasets/datasets/msmarco_qna.py | 30 ++++++++++---------- ir_datasets/datasets/tripclick.py | 2 +- ir_datasets/datasets/tweets2013_ia.py | 4 +-- ir_datasets/datasets/vaswani.py | 6 ++-- ir_datasets/formats/tsv.py | 6 ++-- ir_datasets/indices/lz4_pickle.py | 4 +-- ir_datasets/indices/numpy_sorted_index.py | 2 +- ir_datasets/util/__init__.py | 7 +++-- ir_datasets/util/download.py | 4 +-- setup.py | 4 +-- test/downloads.py | 4 +-- 22 files changed, 53 insertions(+), 52 deletions(-) diff --git a/ir_datasets/commands/build_c4_checkpoints.py b/ir_datasets/commands/build_c4_checkpoints.py index 7187b029..f02c5282 100644 --- a/ir_datasets/commands/build_c4_checkpoints.py +++ b/ir_datasets/commands/build_c4_checkpoints.py @@ -65,7 +65,7 @@ def main(args): }) except Exception as ex: print(file, ex) - with gzip.open(args.sources_file + '.gz', 'wt') as f: + with gzip.open(args.sources_file + '.gz', 'wt', encoding='utf8') as f: json.dump(sources, f) all_source_files = [f.relative_to(source_dir) for f in all_source_files] if args.skip_last: diff --git a/ir_datasets/commands/build_download_cache.py b/ir_datasets/commands/build_download_cache.py index de1e0435..d5c52104 100644 --- a/ir_datasets/commands/build_download_cache.py +++ b/ir_datasets/commands/build_download_cache.py @@ -63,7 +63,7 @@ def main(args): parser.add_argument('--retries', default='10') args = parser.parse_args(args) - with open('ir_datasets/etc/downloads.json') as f: + with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f: data = json.load(f) with tmp_environ(IR_DATASETS_DL_TRIES=args.retries): _build_cache(data, args.dir) diff --git a/ir_datasets/commands/doc_fifos.py b/ir_datasets/commands/doc_fifos.py index ac0e7282..25ed3e2f 100644 --- a/ir_datasets/commands/doc_fifos.py +++ b/ir_datasets/commands/doc_fifos.py @@ -66,7 +66,7 @@ def main(args): print(f'Ready at {d}') print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} 
-index ') - fifos = [stack.enter_context(open(f, 'wt')) for f in fifos] + fifos = [stack.enter_context(open(f, 'wt', encoding='utf8')) for f in fifos] ready = None for doc in docs_iter: diff --git a/ir_datasets/commands/generate_metadata.py b/ir_datasets/commands/generate_metadata.py index 02144291..dd041f68 100644 --- a/ir_datasets/commands/generate_metadata.py +++ b/ir_datasets/commands/generate_metadata.py @@ -36,7 +36,7 @@ def dataset2metadata(args): def write_metadata_file(data, file): - with file.open('wt') as f: + with file.open('wt', encoding='utf8') as f: # partially-formatted data; one dataset per line f.write('{\n') for i, key in enumerate(sorted(data.keys())): diff --git a/ir_datasets/datasets/antique.py b/ir_datasets/datasets/antique.py index 2c4d2932..ebc6c6c5 100644 --- a/ir_datasets/datasets/antique.py +++ b/ir_datasets/datasets/antique.py @@ -56,7 +56,7 @@ def _init(): disallow_list = dlc['disallow_list'] def disllow_qids(): with disallow_list.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') return {l.rstrip() for l in stream} disllow_qids = Lazy(disllow_qids) subsets['test/non-offensive'] = Dataset( diff --git a/ir_datasets/datasets/clueweb09.py b/ir_datasets/datasets/clueweb09.py index 8b71e875..dd8a9c61 100644 --- a/ir_datasets/datasets/clueweb09.py +++ b/ir_datasets/datasets/clueweb09.py @@ -86,7 +86,7 @@ def _docs_warc_file_counts(self): result = {} for d in self.dirs: counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt') - with open(counts_file, 'rt') as f: + with open(counts_file, 'rt', encoding='utf8') as f: for line in f: file, count = line.strip().split() # Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5 diff --git a/ir_datasets/datasets/clueweb12.py b/ir_datasets/datasets/clueweb12.py index 53211775..09381e16 100644 --- a/ir_datasets/datasets/clueweb12.py +++ b/ir_datasets/datasets/clueweb12.py @@ -193,7 +193,7 @@ def _docs_warc_file_counts(self): result = {} for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')): d = os.path.basename(counts_file)[:-len('_counts.txt')] - with open(counts_file, 'rt') as f: + with open(counts_file, 'rt', encoding='utf8') as f: for line in f: file, count = line.strip().split() file = os.path.join(self.docs_dlc.path(), d, file[2:]) diff --git a/ir_datasets/datasets/codesearchnet.py b/ir_datasets/datasets/codesearchnet.py index f02e34a3..0c2ca5c5 100644 --- a/ir_datasets/datasets/codesearchnet.py +++ b/ir_datasets/datasets/codesearchnet.py @@ -55,7 +55,7 @@ def docs_iter(self): for dlc in self.docs_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob('**/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield CodeSearchNetDoc( @@ -101,7 +101,7 @@ def queries_iter(self): for dlc in self.queries_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob(f'**/{self.split}/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield GenericQuery( @@ -129,7 +129,7 @@ def qrels_iter(self): for dlc in self.qrels_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob(f'**/{self.split}/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield TrecQrel( @@ -159,7 +159,7 @@ def queries_path(self): def queries_iter(self): 
with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for i, line in enumerate(stream): if i == 0: continue # skip first (header) line @@ -184,7 +184,7 @@ def qrels_path(self): def qrels_iter(self): query_map = {q.text: q.query_id for q in self._queries_handler.queries_iter()} with self.qrels_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for data in csv.DictReader(stream): yield CodeSearchNetChallengeQrel( query_id=query_map[data['Query']], diff --git a/ir_datasets/datasets/cord19.py b/ir_datasets/datasets/cord19.py index d284a7eb..aab15152 100644 --- a/ir_datasets/datasets/cord19.py +++ b/ir_datasets/datasets/cord19.py @@ -109,7 +109,7 @@ def _docs_iter(self): 'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))), } if self._include_fulltext: - csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt')) + csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt', encoding='utf8')) else: csv_reader = ctxt.enter_context(self._streamer.stream()) csv_reader = codecs.getreader('utf8')(csv_reader) diff --git a/ir_datasets/datasets/cranfield.py b/ir_datasets/datasets/cranfield.py index 4a912112..4a6ffe08 100644 --- a/ir_datasets/datasets/cranfield.py +++ b/ir_datasets/datasets/cranfield.py @@ -51,7 +51,7 @@ def docs_path(self, force=True): @ir_datasets.util.use_docstore def docs_iter(self): with self.docs_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in prefix_sentinel_splitter(stream, sentinel='.I '): record = {'doc_id': '', 'title': '', 'author': '', 'bib': '', 'text': ''} field = 'doc_id' @@ -103,7 +103,7 @@ def queries_path(self): def queries_iter(self): with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in prefix_sentinel_splitter(stream, sentinel='.I '): record = {'query_id': '', 'text': ''} field = 'query_id' diff --git a/ir_datasets/datasets/gov2.py b/ir_datasets/datasets/gov2.py index ad46692f..1a000826 100644 --- a/ir_datasets/datasets/gov2.py +++ b/ir_datasets/datasets/gov2.py @@ -273,7 +273,7 @@ def path(self, force=True): docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz') result = Counter() with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar: - with gzip.open(docs_urls_path, 'rt') as fin: + with gzip.open(docs_urls_path, 'rt', encoding='utf8') as fin: for line in fin: url, doc_id = line.rstrip().split() d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388 diff --git a/ir_datasets/datasets/msmarco_qna.py b/ir_datasets/datasets/msmarco_qna.py index 6122429d..69bfef1a 100644 --- a/ir_datasets/datasets/msmarco_qna.py +++ b/ir_datasets/datasets/msmarco_qna.py @@ -130,16 +130,16 @@ def build(self): with contextlib.ExitStack() as inner_stack: stream = inner_stack.enter_context(dlc.stream()) parser = ijson.parse(stream) - out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt')) - out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt')) - out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt')) + out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt', encoding='utf8')) + out_type = 
inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt', encoding='utf8')) + out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt', encoding='utf8')) if file_str != 'eval': - out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt')) - out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+')) + out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt', encoding='utf8')) + out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+', encoding='utf8')) out_seq = None else: out_qrels, out_answer = None, None - out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt')) + out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt', encoding='utf8')) for prefix, event, data in parser: pbar_postfix['key'] = prefix pbar.set_postfix(pbar_postfix, refresh=False) @@ -221,19 +221,19 @@ def build(self): # Merge files for file_str in ['train', 'dev', 'eval']: with contextlib.ExitStack() as stack: - f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt')) - f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt')) - f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt')) - f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt')) - f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt')) + f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt', encoding='utf8')) + f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt', encoding='utf8')) + f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt', encoding='utf8')) + f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt', encoding='utf8')) + f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt', encoding='utf8')) in_files = [f_qid, f_type, f_text] if file_str != 'eval': - f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt')) - f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt')) - f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt')) + f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt', encoding='utf8')) + f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt', encoding='utf8')) + f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt', encoding='utf8')) in_files += [f_selections, f_answers] else: - f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt')) + f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt', encoding='utf8')) in_files += [f_seq] for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'): columns = [x.strip() for x in columns] diff --git a/ir_datasets/datasets/tripclick.py b/ir_datasets/datasets/tripclick.py index 67d4df2a..73dc018c 100644 --- a/ir_datasets/datasets/tripclick.py +++ b/ir_datasets/datasets/tripclick.py @@ -117,7 +117,7 @@ def __init__(self, dlc): def qlogs_iter(self): for file in sorted(Path(self.dlc.path()).glob('logs/*.json')): - with file.open('rt') as fin: + with file.open('rt', encoding='utf8') as fin: for line in fin: record = json.loads(line) time = re.match(r'^/Date\(([0-9]+)\)/$', 
record['DateCreated']).group(1) diff --git a/ir_datasets/datasets/tweets2013_ia.py b/ir_datasets/datasets/tweets2013_ia.py index a1caa3ba..4e815e14 100644 --- a/ir_datasets/datasets/tweets2013_ia.py +++ b/ir_datasets/datasets/tweets2013_ia.py @@ -332,7 +332,7 @@ def _docs_build(self): # Write out a file that gives the counts for each source file. This is used for fancy slicing # and also avoids globbing to get a list of all source files. - with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt') as f: + with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt', encoding='utf8') as f: for file, count in sorted(file_counts.items()): f.write(f'{file}\t{count}\n') @@ -369,7 +369,7 @@ def _docs_file_counts(self): if self._docs_file_counts_cache is None: self._docs_build() result = {} - with (Path(self.docs_path()) / 'file_counts.tsv').open('rt') as f: + with (Path(self.docs_path()) / 'file_counts.tsv').open('rt', encoding='utf8') as f: for line in f: file, count = line.strip().split('\t') result[file] = int(count) diff --git a/ir_datasets/datasets/vaswani.py b/ir_datasets/datasets/vaswani.py index 52557dff..6ab4b635 100644 --- a/ir_datasets/datasets/vaswani.py +++ b/ir_datasets/datasets/vaswani.py @@ -33,7 +33,7 @@ def docs_path(self, force=True): @ir_datasets.util.use_docstore def docs_iter(self): with self.docs_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel=' /\n'): doc_id = lines[0].rstrip('\n') doc_text = ''.join(lines[1:]) @@ -73,7 +73,7 @@ def queries_path(self): def queries_iter(self): with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel='/\n'): query_id = lines[0].rstrip('\n') query_text = ''.join(lines[1:]) @@ -98,7 +98,7 @@ def qrels_path(self): def qrels_iter(self): with self.qrels_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel=' /\n'): query_id = lines[0].rstrip('\n') for line in lines[1:]: diff --git a/ir_datasets/formats/tsv.py b/ir_datasets/formats/tsv.py index 4059e829..b916bab0 100644 --- a/ir_datasets/formats/tsv.py +++ b/ir_datasets/formats/tsv.py @@ -23,9 +23,9 @@ def __next__(self): raise StopIteration if self.stream is None: if isinstance(self.dlc, list): - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream()), encoding='utf8') else: - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream()), encoding='utf8') while self.pos < self.start: line = self.stream.readline() if line != '\n': @@ -34,7 +34,7 @@ def __next__(self): if isinstance(self.dlc, list): self.stream_idx += 1 if self.stream_idx < len(self.dlc): - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream()), encoding='utf8') line = self.stream.readline() else: raise StopIteration() diff --git a/ir_datasets/indices/lz4_pickle.py b/ir_datasets/indices/lz4_pickle.py index 3f7e3c89..9880ce97 100644 --- a/ir_datasets/indices/lz4_pickle.py +++ b/ir_datasets/indices/lz4_pickle.py @@ -112,7 +112,7 @@ def 
__init__(self, path, doc_cls, key_field, index_fields, key_field_prefix=None # check that the fields match meta_info = ' '.join(doc_cls._fields) if os.path.exists(self._meta_path): - with open(self._meta_path, 'rt') as f: + with open(self._meta_path, 'rt', encoding='utf8') as f: existing_meta = f.read() assert existing_meta == meta_info, f"fields do not match; you may need to re-build this store {path}" @@ -159,7 +159,7 @@ def transaction(self): os.makedirs(self._path, exist_ok=True) if not os.path.exists(self._meta_path): meta_info = ' '.join(self._doc_cls._fields) - with open(self._meta_path, 'wt') as f: + with open(self._meta_path, 'wt', encoding='utf8') as f: f.write(meta_info) with Lz4PickleTransaction(self) as trans: diff --git a/ir_datasets/indices/numpy_sorted_index.py b/ir_datasets/indices/numpy_sorted_index.py index 413894fc..e71d0845 100644 --- a/ir_datasets/indices/numpy_sorted_index.py +++ b/ir_datasets/indices/numpy_sorted_index.py @@ -49,7 +49,7 @@ def _lazy_load(self): if self.np is None: self.np = ir_datasets.lazy_libs.numpy() if self.mmap_keys is None and self._exists(): - with open(f'{self.path}.meta', 'rt') as f: + with open(f'{self.path}.meta', 'rt', encoding='utf8') as f: self.keylen, self.doccount = f.read().split() self.keylen, self.doccount = int(self.keylen), int(self.doccount) self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,)) diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 8127e0b4..2c69e92d 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -35,12 +35,13 @@ def home_path(): @contextmanager def finialized_file(path, mode): + encoding = 'utf8' if 't' in mode else None if path == os.devnull: - with open(path, mode) as f: + with open(path, mode, encoding=encoding) as f: yield f else: try: - with open(f'{path}.tmp', mode) as f: + with open(f'{path}.tmp', mode, encoding=encoding) as f: yield f os.replace(f'{path}.tmp', path) except: @@ -206,7 +207,7 @@ def wrapped(*args, **kwargs): return fn def _read_version(self): - with self._version_file.open('rt') as f: + with self._version_file.open('rt', encoding='utf8') as f: return f.read() diff --git a/ir_datasets/util/download.py b/ir_datasets/util/download.py index 94d0ccce..2fbb1fe7 100644 --- a/ir_datasets/util/download.py +++ b/ir_datasets/util/download.py @@ -90,7 +90,7 @@ def __iter__(self): dlen = int(dlen) fmt = '{desc}: {percentage:3.1f}%{r_bar}' if os.environ.get('IR_DATASETS_DL_DISABLE_PBAR', '').lower() == 'true': - pbar_f = stack.enter_context(open(os.devnull, 'w')) # still maintain the pbar, but write to /dev/null + pbar_f = stack.enter_context(open(os.devnull, 'wt', encoding='utf8')) # still maintain the pbar, but write to /dev/null else: pbar_f = None # defaults to stderr pbar = stack.enter_context(_logger.pbar_raw(desc=self.url, total=dlen, unit='B', unit_scale=True, bar_format=fmt, file=pbar_f)) @@ -159,7 +159,7 @@ def _handle_auth(self, http_args): auth_dir.mkdir(parents=True, exist_ok=True) auth_path = auth_dir / self.auth if auth_path.exists(): - with auth_path.open('rt') as fin: + with auth_path.open('rt', encoding='utf8') as fin: lines = fin.read().split('\n') if len(lines) < 2: raise RuntimeError(f'{str(auth_path)} in incorrect format. 
Set the first line as the username and the second line as the password.') diff --git a/setup.py b/setup.py index 5e3f1e10..48c54946 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from glob import glob import setuptools -with open("README.md", "r") as fh: +with open("README.md", "rt", encoding='utf8') as fh: long_description = fh.read() setuptools.setup( @@ -15,7 +15,7 @@ url="https://github.com/allenai/ir_datasets", include_package_data = True, packages=setuptools.find_packages(include=['ir_datasets', 'ir_datasets.*']), - install_requires=list(open('requirements.txt')), + install_requires=list(open('requirements.txt', 'rt', encoding='utf8')), classifiers=[], python_requires='>=3.6', entry_points={ diff --git a/test/downloads.py b/test/downloads.py index 522741da..1bf972bb 100644 --- a/test/downloads.py +++ b/test/downloads.py @@ -38,7 +38,7 @@ class TestDownloads(unittest.TestCase): output_data = [] def test_downloads(self): - with open('ir_datasets/etc/downloads.json') as f: + with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f: data = json.load(f) try: self._test_download_iter(data) @@ -55,7 +55,7 @@ def test_downloads(self): self.output_data.append(self._test_download(clir_dlc[top_key][sub_key], f'clirmatrix/{top_key}/{sub_key}')) finally: if self.output_path is not None: - with open(self.output_path, 'wt') as f: + with open(self.output_path, 'wt', encoding='utf8') as f: json.dump(self.output_data, f) def _test_download_iter(self, data, prefix=''): From a92835eed47b031061d8e5251881d1cf243321db Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:45:13 +0000 Subject: [PATCH 3/5] fix typo in finalized_file function name --- ir_datasets/commands/build_download_cache.py | 2 +- ir_datasets/datasets/aol_ia.py | 8 ++++---- ir_datasets/datasets/clueweb12.py | 2 +- ir_datasets/datasets/dpr_w100.py | 4 ++-- ir_datasets/datasets/gov2.py | 2 +- ir_datasets/datasets/natural_questions.py | 12 ++++++------ ir_datasets/datasets/tripclick.py | 2 +- ir_datasets/indices/clueweb_warc.py | 2 +- ir_datasets/indices/numpy_sorted_index.py | 2 +- ir_datasets/util/__init__.py | 4 +++- ir_datasets/util/download.py | 2 +- ir_datasets/util/fileio.py | 2 +- 12 files changed, 23 insertions(+), 21 deletions(-) diff --git a/ir_datasets/commands/build_download_cache.py b/ir_datasets/commands/build_download_cache.py index d5c52104..8379be99 100644 --- a/ir_datasets/commands/build_download_cache.py +++ b/ir_datasets/commands/build_download_cache.py @@ -35,7 +35,7 @@ def _build_cache(data, dir, prefix=''): _logger.info(f'skipping {prefix}; already exists') return try: - with ir_datasets.util.finialized_file(cache_path, 'wb') as fout, _logger.duration(prefix): + with ir_datasets.util.finalized_file(cache_path, 'wb') as fout, _logger.duration(prefix): download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'])], expected_md5=data['expected_md5'], stream=True) with download.stream() as stream: inp = stream.read(io.DEFAULT_BUFFER_SIZE) diff --git a/ir_datasets/datasets/aol_ia.py b/ir_datasets/datasets/aol_ia.py index baada6bd..9a56881c 100644 --- a/ir_datasets/datasets/aol_ia.py +++ b/ir_datasets/datasets/aol_ia.py @@ -7,7 +7,7 @@ from hashlib import md5 import ir_datasets from typing import NamedTuple, Tuple -from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file +from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finalized_file from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, 
BaseQlogs from ir_datasets.datasets.base import Dataset, YamlDocumentation @@ -136,9 +136,9 @@ def build(self): lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame encountered_qids = set() - with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \ - finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \ - finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \ + with finalized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \ + finalized_file(self._base_path/'qrels', 'wt') as f_qrels, \ + finalized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \ lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \ _logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar: for dlc in self._log_dlcs: diff --git a/ir_datasets/datasets/clueweb12.py b/ir_datasets/datasets/clueweb12.py index 09381e16..e33ad53b 100644 --- a/ir_datasets/datasets/clueweb12.py +++ b/ir_datasets/datasets/clueweb12.py @@ -239,7 +239,7 @@ def _create_record_counts_if_needed(self, path): with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar: for d in glob(os.path.join(path, 'ClueWeb12_??')): d = os.path.basename(d) - out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt')) + out = stack.enter_context(ir_datasets.util.finalized_file(f'{rc_dir}/{d}_counts.txt', 'wt')) for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))): shortf = file[-24:] with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf: diff --git a/ir_datasets/datasets/dpr_w100.py b/ir_datasets/datasets/dpr_w100.py index a3449916..51fa78ef 100644 --- a/ir_datasets/datasets/dpr_w100.py +++ b/ir_datasets/datasets/dpr_w100.py @@ -45,8 +45,8 @@ def build(self): return # already built with contextlib.ExitStack() as stack: - f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt')) - f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt')) + f_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'queries.tsv', 'wt')) + f_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'qrels', 'wt')) stream = stack.enter_context(self._dlc.stream()) qid_counter = itertools.count() for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'): diff --git a/ir_datasets/datasets/gov2.py b/ir_datasets/datasets/gov2.py index 1a000826..fb1c40c9 100644 --- a/ir_datasets/datasets/gov2.py +++ b/ir_datasets/datasets/gov2.py @@ -280,7 +280,7 @@ def path(self, force=True): file = f'{d}/{f}.gz' result[file] += 1 pbar.update() - with ir_datasets.util.finialized_file(self._path, 'wt') as fout: + with ir_datasets.util.finalized_file(self._path, 'wt') as fout: for file in sorted(result): fout.write(f'{file}\t{result[file]}\n') return self._path diff --git a/ir_datasets/datasets/natural_questions.py b/ir_datasets/datasets/natural_questions.py index d42a457b..45e9494b 100644 --- a/ir_datasets/datasets/natural_questions.py +++ b/ir_datasets/datasets/natural_questions.py @@ -56,12 +56,12 @@ def build(self): with contextlib.ExitStack() as stack: docs_trans = stack.enter_context(docs_store.lookup.transaction()) pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question')) - train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt')) - train_qrels = 
stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt')) - train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt')) - dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt')) - dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt')) - dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt')) + train_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.queries.tsv', 'wt')) + train_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.qrels.jsonl', 'wt')) + train_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.scoreddocs.tsv', 'wt')) + dev_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.queries.tsv', 'wt')) + dev_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.qrels.jsonl', 'wt')) + dev_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.scoreddocs.tsv', 'wt')) for file_name in sorted(self._dlcs.contents().keys()): pbar_postfix['file'] = file_name pbar.set_postfix(pbar_postfix) diff --git a/ir_datasets/datasets/tripclick.py b/ir_datasets/datasets/tripclick.py index 73dc018c..fa025939 100644 --- a/ir_datasets/datasets/tripclick.py +++ b/ir_datasets/datasets/tripclick.py @@ -163,7 +163,7 @@ def path(self, force=True): for query in _logger.pbar(self._queries.queries_iter(), desc='build query lookup', unit='query'): queryhash = hashlib.md5(SPACES.sub(' ', query.text).strip().encode()).digest()[:6] query_map[queryhash] = query.query_id - with ir_datasets.util.finialized_file(self._cache_path, 'wt') as fout, \ + with ir_datasets.util.finalized_file(self._cache_path, 'wt') as fout, \ self._docpair_dlc.stream() as stream, \ _logger.pbar_raw(desc='building docpairs', total=23_222_038, unit='docpair') as pbar: skipped = 0 diff --git a/ir_datasets/indices/clueweb_warc.py b/ir_datasets/indices/clueweb_warc.py index c1ede03f..1a39c3cb 100644 --- a/ir_datasets/indices/clueweb_warc.py +++ b/ir_datasets/indices/clueweb_warc.py @@ -79,7 +79,7 @@ def build(self, checkpoint_freq=8*1024*1024): last_chekpoint_pos = 0 with self.zlib_state.GzipStateFile(self.source_path, keep_last_state=True) as f, \ warc.WARCFile(fileobj=f) as f_warc, \ - ir_datasets.util.finialized_file(self.index_path, 'wb') as f_tmp, \ + ir_datasets.util.finalized_file(self.index_path, 'wb') as f_tmp, \ WarcIndexFile(f_tmp, 'wb') as f_chk: doc_idx = 0 for doc in f_warc: diff --git a/ir_datasets/indices/numpy_sorted_index.py b/ir_datasets/indices/numpy_sorted_index.py index e71d0845..2ddea7c3 100644 --- a/ir_datasets/indices/numpy_sorted_index.py +++ b/ir_datasets/indices/numpy_sorted_index.py @@ -38,7 +38,7 @@ def commit(self): self.mmap_keys[:] = keys[:] self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype=poss.dtype, mode='w+', shape=poss.shape) self.mmap_poss[:] = poss[:] - with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f: + with ir_datasets.util.finalized_file(f'{self.path}.meta', 'wt') as f: f.write(f'{self.keylen} {self.doccount}') self.transaction = None diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 2c69e92d..94b95bca 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -34,7 +34,7 @@ def 
home_path(): @contextmanager -def finialized_file(path, mode): +def finalized_file(path, mode): encoding = 'utf8' if 't' in mode else None if path == os.devnull: with open(path, mode, encoding=encoding) as f: @@ -51,6 +51,8 @@ def finialized_file(path, mode): pass # ignore raise +finialized_file = finalized_file # support old name of function w/ typo + class Lazy: def __init__(self, fn): diff --git a/ir_datasets/util/download.py b/ir_datasets/util/download.py index 2fbb1fe7..fbcb3a94 100644 --- a/ir_datasets/util/download.py +++ b/ir_datasets/util/download.py @@ -246,7 +246,7 @@ def path(self, force=True): for mirror in self.mirrors: try: - with util.finialized_file(download_path, 'wb') as f: + with util.finalized_file(download_path, 'wb') as f: with mirror.stream() as stream: stream = util.HashStream(stream, self.expected_md5, algo='md5') shutil.copyfileobj(stream, f) diff --git a/ir_datasets/util/fileio.py b/ir_datasets/util/fileio.py index 272bfa01..73175107 100644 --- a/ir_datasets/util/fileio.py +++ b/ir_datasets/util/fileio.py @@ -161,7 +161,7 @@ def stream(self): if not self._output_file.exists(): with contextlib.ExitStack() as ctxt, self._streamer.stream() as stream: ctxt.enter_context(_logger.duration('re-taring file')) - outf = ctxt.enter_context(util.finialized_file(self._output_file, 'wb')) + outf = ctxt.enter_context(util.finalized_file(self._output_file, 'wb')) o_tarf = ctxt.enter_context(tarfile.open(fileobj=outf, mode=f'w|{self._compression or ""}')) # IMPORTANT: open this file in streaming mode (| in mode). This means that the # content need not be written to disk or be fully read. From a337796527056ace24c0cc333a56de3e89efc665 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 17:49:58 +0000 Subject: [PATCH 4/5] added a github action to identify unspecified encodings --- .github/workflows/pylint.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 00000000..76d72b1c --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,29 @@ +name: pylint + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: 'ubuntu-latest' + steps: + - name: Checkout source + uses: actions/checkout@v2 + + - name: Set up Python 3.9 + uses: actions/setup-python@v1 + with: + python-version: '3.9' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pylint==2.12.2 + + - name: Run PyLint + run: | + pylint --disable=all --enable=unspecified-encoding ./ir_datasets From a557447c41cfdb80141fd22d772c0d4b435f731a Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 17:52:55 +0000 Subject: [PATCH 5/5] added a missing encoding --- ir_datasets/util/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 94b95bca..043ff879 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -201,7 +201,7 @@ def wrapped(*args, **kwargs): os.unlink(file) else: shutil.rmtree(file) - with self._version_file.open('wt') as f: + with self._version_file.open('wt', encoding='utf8') as f: f.write(self._version) self._state = 'OK' return fn(*args, **kwargs)
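
For context on the convention this series enforces (and that the new pylint workflow checks), a minimal illustrative sketch follows. It is not part of the patches above, and the file path is only an example taken from the repository. In text mode, open() without an explicit encoding falls back to the platform's locale-dependent default, which is exactly what pylint's unspecified-encoding check flags.

import json

# Flagged by `pylint --enable=unspecified-encoding`: the encoding is implicit,
# so it depends on the platform's locale settings.
with open('ir_datasets/etc/downloads.json') as f:
    data = json.load(f)

# The form adopted throughout these patches: encoding stated explicitly.
with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f:
    data = json.load(f)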