From 10093c6d84bd8341b412c6d290d36c9619894788 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:14:54 +0000 Subject: [PATCH 1/5] drop unused/outdated index formats (indexed_tsv and zpickle) --- ir_datasets/indices/__init__.py | 2 - ir_datasets/indices/indexed_tsv_docstore.py | 386 -------------------- ir_datasets/indices/zpickle_docstore.py | 172 --------- 3 files changed, 560 deletions(-) delete mode 100644 ir_datasets/indices/indexed_tsv_docstore.py delete mode 100644 ir_datasets/indices/zpickle_docstore.py diff --git a/ir_datasets/indices/__init__.py b/ir_datasets/indices/__init__.py index de2fd089..70fc9d71 100644 --- a/ir_datasets/indices/__init__.py +++ b/ir_datasets/indices/__init__.py @@ -1,6 +1,4 @@ from .base import Docstore -from .indexed_tsv_docstore import IndexedTsvDocstore -from .zpickle_docstore import ZPickleDocStore from .numpy_sorted_index import NumpySortedIndex, NumpyPosIndex from .lz4_pickle import Lz4PickleLookup, PickleLz4FullStore from .cache_docstore import CacheDocstore diff --git a/ir_datasets/indices/indexed_tsv_docstore.py b/ir_datasets/indices/indexed_tsv_docstore.py deleted file mode 100644 index e05d9382..00000000 --- a/ir_datasets/indices/indexed_tsv_docstore.py +++ /dev/null @@ -1,386 +0,0 @@ -import os -import shutil -import json -import zlib -import pickle -from contextlib import contextmanager -import ir_datasets - - -_logger = ir_datasets.log.easy() - - - -class ZPickleKeyValueStore: - def __init__(self, path, value_encoder=None): - self._path = path - self._idx = None - self._bin = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 'idx')) - return self._idx - - def bin(self): - if self._bin is None: - self._bin = open(os.path.join(self._path, 'bin'), 'rb') - return self._bin - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._bin: - self._bin.close() - self._bin = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with ZPickleDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key, field = value, Ellipsis - binf = self.bin() - binf.seek(self.idx().get(key)) - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - if content[0] != key: - raise KeyError(f'key={key} not found') - if field is Ellipsis: - return dict(content[1:]) - for f, val in content[1:]: - if field == f: - return val - raise KeyError(f'field={field} not found for key={key}') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - binf = self.bin() - binf.seek(0) - while binf.read(1): # peek - binf.seek(-1) # un-peek - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - yield content[0], dict(content[1:]) - - def __len__(self): - # number of keys - return len(self.idx()) - - - -class IndexedTsvKeyValueStore: - def __init__(self, path, value_encoder=None): - self._path = path - self._value_encoder = value_encoder - self._idx = None - self._tsv = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 
'idx')) - return self._idx - - def tsv(self): - if self._tsv is None: - self._tsv = open(os.path.join(self._path, 'tsv'), 'rt') - return self._tsv - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._tsv: - self._tsv.close() - self._tsv = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with IndexedTsvDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key = value - field = ... - record = {} - tsv = self.tsv() - tsv.seek(self.idx().get(key)) - for line in tsv: - cols = line.rstrip().split('\t') - if len(cols) == 1: - if cols[0] != key: - break # end of doc - else: - continue # key verified - l_field, l_text = cols - if field is Ellipsis: - if self._value_encoder == 'json': - l_text = json.loads(l_text) - record[l_field] = l_text - else: - if l_field == field: - if self._value_encoder == 'json': - l_text = json.loads(l_text) - return l_text - if field is Ellipsis: - if not record: - raise KeyError(f'key={key} not found') - return record - raise KeyError(f'key={key} field={field} not found') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - tsv = self.tsv() - tsv.seek(0) - key = None - doc = None - for line in tsv: - cols = line.rstrip().split('\t') - if len(cols) == 1: - if doc is not None: - yield key, doc - key = cols[0] - doc = {} - else: - if self._value_encoder == 'json': - cols[1] = json.loads(cols[1]) - doc[cols[0]] = cols[1] - if doc is not None: - yield key, doc - - def __len__(self): - # number of keys - return len(self.idx()) - - -class IndexedTsvDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.tsv = open(os.path.join(self.path, 'tsv'), 'wt') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.tsv.flush() - self.tsv.close() - # self.docstore.merge(IndexedTsvDocstore(self.path)) - # shutil.rmtree(self.path) - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.tsv.tell()) - self.tsv.write(f'{key}\n') - for field, value in zip(type(fields)._fields, fields): - if self.docstore._value_encoder == 'json': - value = json.dumps(value) - elif self.docstore._value_encoder is None: - value = value.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ') - self.tsv.write(f'{field}\t{value}\n') - - -class ZPickleDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.bin = open(os.path.join(self.path, 'bin'), 'wb') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.bin.flush() - self.bin.close() - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.bin.tell()) - content = tuple(zip(type(fields)._fields, fields)) - content = pickle.dumps(content) - content = zlib.compress(content) - content_length = len(content) - self.bin.write(content_length.to_bytes(4, 'little')) - 
self.bin.write(content) - - -class NumpyPosIndex: - def __init__(self, path): - self.path = path - self.data = None - self.mmap1 = None - self.mmap2 = None - self.doccount = None - self.didlen = None - self.np = ir_datasets.lazy_libs.numpy() - - def add(self, did, idx): - if self.data is None: - self.data = {} - self.data[did] = idx - - def commit(self): - didlen = max(len(x) for x in self.data) - sorted_data = sorted(self.data.items()) - # Use zero-terminated bytes here (S) rather than unicode type (U) because U includes a ton - # of extra padding (for longer unicode formats), which can inflate the size of the index greatly. - array1 = self.np.array([x[0].encode('utf8') for x in sorted_data], dtype=f'S{didlen}') - array2 = self.np.array([x[1] for x in sorted_data], dtype=f'int64') - m1 = self.np.memmap(f'{self.path}.did', dtype=array1.dtype, mode='w+', shape=array1.shape) - m1[:] = array1[:] - del m1 - m2 = self.np.memmap(f'{self.path}.pos', dtype=array2.dtype, mode='w+', shape=array2.shape) - m2[:] = array2[:] - del m2 - with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f: - f.write(f'{didlen} {len(self.data)}') - self.data = None - - def _lazy_load(self): - if self.mmap1 is None: - with open(f'{self.path}.meta', 'rt') as f: - self.didlen, self.doccount = f.read().split() - self.didlen, self.doccount = int(self.didlen), int(self.doccount) - self.mmap1 = self.np.memmap(f'{self.path}.did', dtype=f'S{self.didlen}', mode='r', shape=(self.doccount,)) - self.mmap2 = self.np.memmap(f'{self.path}.pos', dtype='int64', mode='r', shape=(self.doccount,)) - - def get(self, did): - self._lazy_load() - did = did.encode('utf8') - loc = self.np.searchsorted(self.mmap1, did) - if self.mmap1[loc] == did: - return self.mmap2[loc] - return 0 - - def close(self): - if self.mmap1 is not None: - del self.mmap1 - self.mmap1 = None - if self.mmap2 is not None: - del self.mmap2 - self.mmap2 = None - self.data = None - - def __iter__(self): - # iterates keys - self._lazy_load() - for i in range(len(self)): - yield self.mmap1[i].decode('utf8') - - def __len__(self): - # number of keys - self._lazy_load() - return self.doccount - - -def dir_size(path): - # Adapted from - total = 0 - for entry in os.scandir(path): - if entry.is_file(): - total += entry.stat().st_size - elif entry.is_dir(): - total += dir_size(entry.path) - return total - - - -class IndexedTsvDocstore: - file_ext = 'itsv' - - def __init__(self, path, doc_cls, value_encoder='json', id_field='doc_id', store=IndexedTsvKeyValueStore): - self._path = path - self._doc_cls = doc_cls - self._id_field = id_field - self._id_field_idx = doc_cls._fields.index(id_field) - self._store = store(path, value_encoder=value_encoder) - - def built(self): - return os.path.exists(self._path) - - def purge(self): - self._store.purge() - - def build(self, documents): - with self._store.transaction() as trans: - for doc in documents: - trans.add(doc[self._id_field_idx], doc) - - def get(self, did, field=None): - if field is not None: - return self._store[did, field] - result = self._store[did] - return self._doc_cls(*(result[f] for f in self._doc_cls._fields)) - - def get_many(self, dids, field=None): - result = {} - for did in dids: - try: - result[did] = self.get(did, field) - except ValueError: - pass - return result - - def num_docs(self): - return len(self._store) - - def docids(self): - return iter(self._store.idx()) - - def iter_docs(self): - for did, fields in iter(self._store): - yield self._doc_cls(*(fields[f] for f in self._doc_cls._fields)) - - def 
path(self, force=True): - return self._path - - def file_size(self): - return dir_size(self._path) diff --git a/ir_datasets/indices/zpickle_docstore.py b/ir_datasets/indices/zpickle_docstore.py deleted file mode 100644 index 309db45e..00000000 --- a/ir_datasets/indices/zpickle_docstore.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -import shutil -import json -import zlib -import pickle -from contextlib import contextmanager -from .indexed_tsv_docstore import NumpyPosIndex -import ir_datasets - - -_logger = ir_datasets.log.easy() - - - -class ZPickleKeyValueStore: - def __init__(self, path, id_idx, doc_cls): - self._path = path - self._id_idx = id_idx - self._doc_cls = doc_cls - self._idx = None - self._bin = None - - def built(self): - return len(self) > 0 - - def idx(self): - if self._idx is None: - self._idx = NumpyPosIndex(os.path.join(self._path, 'idx')) - return self._idx - - def bin(self): - if self._bin is None: - self._bin = open(os.path.join(self._path, 'bin'), 'rb') - return self._bin - - def purge(self): - if self._idx: - self._idx.close() - self._idx = None - if self._bin: - self._bin.close() - self._bin = None - - @contextmanager - def transaction(self): - os.makedirs(self._path, exist_ok=True) - with ZPickleDocStoreTransaction(self) as trans: - yield trans - - def __getitem__(self, value): - if isinstance(value, tuple) and len(value) == 2: - key, field = value - else: - # assume key and all fields - key, field = value, Ellipsis - binf = self.bin() - binf.seek(self.idx().get(key)) - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - if content[self._id_idx][1] != key: - raise KeyError(f'key={key} not found') - if field is Ellipsis: - content = dict(content) - return self._doc_cls(*(content.get(f) for f in self._doc_cls._fields)) - for f, val in content: - if field == f: - return val - raise KeyError(f'field={field} not found for key={key}') - - def path(self, force=True): - return self._path - - def __iter__(self): - # iterates documents - binf = self.bin() - binf.seek(0) - while binf.read(1): # peek - binf.seek(-1, 1) # un-peek - content_length = int.from_bytes(binf.read(4), 'little') - content = binf.read(content_length) - content = zlib.decompress(content) - content = pickle.loads(content) - content = dict(content) - yield self._doc_cls(*(content.get(f) for f in self._doc_cls._fields)) - - def __len__(self): - # number of keys - return len(self.idx()) - - -class ZPickleDocStoreTransaction: - def __init__(self, docstore): - self.docstore = docstore - self.path = self.docstore.path() - self.idx = NumpyPosIndex(os.path.join(self.path, 'idx')) - self.bin = open(os.path.join(self.path, 'bin'), 'wb') - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if not exc_val: - self.commit() - else: - self.discard() - - def commit(self): - self.idx.commit() - self.bin.flush() - self.bin.close() - - def discard(self): - shutil.rmtree(self.path) - - def add(self, key, fields): - self.idx.add(key, self.bin.tell()) - content = tuple(zip(type(fields)._fields, fields)) - content = pickle.dumps(content) - content = zlib.compress(content) - content_length = len(content) - self.bin.write(content_length.to_bytes(4, 'little')) - self.bin.write(content) - - -class ZPickleDocStore: - file_ext = 'zpkl' - - def __init__(self, path, doc_cls, id_field='doc_id'): - self._path = path - self._doc_cls = doc_cls - self._id_field = id_field - 
self._id_field_idx = doc_cls._fields.index(id_field) - self._store = ZPickleKeyValueStore(path, self._id_field_idx, self._doc_cls) - - def built(self): - return os.path.exists(self._path) - - def purge(self): - self._store.purge() - - def build(self, documents): - with self._store.transaction() as trans: - for doc in documents: - trans.add(doc[self._id_field_idx], doc) - - def get(self, did, field=None): - if field is not None: - return self._store[did, field] - return self._store[did] - - def get_many(self, dids, field=None): - result = {} - for did in dids: - try: - result[did] = self.get(did, field) - except ValueError: - pass - return result - - def num_docs(self): - return len(self._store) - - def docids(self): - return iter(self._store.idx()) - - def __iter__(self): - return iter(self._store) - - def path(self, force=True): - return self._path From d998ca5fcb349ef3531bffadcacde901f772f56b Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:39:01 +0000 Subject: [PATCH 2/5] adding encoding in places that open files/streams for reading --- ir_datasets/commands/build_c4_checkpoints.py | 2 +- ir_datasets/commands/build_download_cache.py | 2 +- ir_datasets/commands/doc_fifos.py | 2 +- ir_datasets/commands/generate_metadata.py | 2 +- ir_datasets/datasets/antique.py | 2 +- ir_datasets/datasets/clueweb09.py | 2 +- ir_datasets/datasets/clueweb12.py | 2 +- ir_datasets/datasets/codesearchnet.py | 10 +++---- ir_datasets/datasets/cord19.py | 2 +- ir_datasets/datasets/cranfield.py | 4 +-- ir_datasets/datasets/gov2.py | 2 +- ir_datasets/datasets/msmarco_qna.py | 30 ++++++++++---------- ir_datasets/datasets/tripclick.py | 2 +- ir_datasets/datasets/tweets2013_ia.py | 4 +-- ir_datasets/datasets/vaswani.py | 6 ++-- ir_datasets/formats/tsv.py | 6 ++-- ir_datasets/indices/lz4_pickle.py | 4 +-- ir_datasets/indices/numpy_sorted_index.py | 2 +- ir_datasets/util/__init__.py | 7 +++-- ir_datasets/util/download.py | 4 +-- setup.py | 4 +-- test/downloads.py | 4 +-- 22 files changed, 53 insertions(+), 52 deletions(-) diff --git a/ir_datasets/commands/build_c4_checkpoints.py b/ir_datasets/commands/build_c4_checkpoints.py index 7187b029..f02c5282 100644 --- a/ir_datasets/commands/build_c4_checkpoints.py +++ b/ir_datasets/commands/build_c4_checkpoints.py @@ -65,7 +65,7 @@ def main(args): }) except Exception as ex: print(file, ex) - with gzip.open(args.sources_file + '.gz', 'wt') as f: + with gzip.open(args.sources_file + '.gz', 'wt', encoding='utf8') as f: json.dump(sources, f) all_source_files = [f.relative_to(source_dir) for f in all_source_files] if args.skip_last: diff --git a/ir_datasets/commands/build_download_cache.py b/ir_datasets/commands/build_download_cache.py index de1e0435..d5c52104 100644 --- a/ir_datasets/commands/build_download_cache.py +++ b/ir_datasets/commands/build_download_cache.py @@ -63,7 +63,7 @@ def main(args): parser.add_argument('--retries', default='10') args = parser.parse_args(args) - with open('ir_datasets/etc/downloads.json') as f: + with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f: data = json.load(f) with tmp_environ(IR_DATASETS_DL_TRIES=args.retries): _build_cache(data, args.dir) diff --git a/ir_datasets/commands/doc_fifos.py b/ir_datasets/commands/doc_fifos.py index ac0e7282..25ed3e2f 100644 --- a/ir_datasets/commands/doc_fifos.py +++ b/ir_datasets/commands/doc_fifos.py @@ -66,7 +66,7 @@ def main(args): print(f'Ready at {d}') print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} 
-index ') - fifos = [stack.enter_context(open(f, 'wt')) for f in fifos] + fifos = [stack.enter_context(open(f, 'wt', encoding='utf8')) for f in fifos] ready = None for doc in docs_iter: diff --git a/ir_datasets/commands/generate_metadata.py b/ir_datasets/commands/generate_metadata.py index 02144291..dd041f68 100644 --- a/ir_datasets/commands/generate_metadata.py +++ b/ir_datasets/commands/generate_metadata.py @@ -36,7 +36,7 @@ def dataset2metadata(args): def write_metadata_file(data, file): - with file.open('wt') as f: + with file.open('wt', encoding='utf8') as f: # partially-formatted data; one dataset per line f.write('{\n') for i, key in enumerate(sorted(data.keys())): diff --git a/ir_datasets/datasets/antique.py b/ir_datasets/datasets/antique.py index 2c4d2932..ebc6c6c5 100644 --- a/ir_datasets/datasets/antique.py +++ b/ir_datasets/datasets/antique.py @@ -56,7 +56,7 @@ def _init(): disallow_list = dlc['disallow_list'] def disllow_qids(): with disallow_list.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') return {l.rstrip() for l in stream} disllow_qids = Lazy(disllow_qids) subsets['test/non-offensive'] = Dataset( diff --git a/ir_datasets/datasets/clueweb09.py b/ir_datasets/datasets/clueweb09.py index 8b71e875..dd8a9c61 100644 --- a/ir_datasets/datasets/clueweb09.py +++ b/ir_datasets/datasets/clueweb09.py @@ -86,7 +86,7 @@ def _docs_warc_file_counts(self): result = {} for d in self.dirs: counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt') - with open(counts_file, 'rt') as f: + with open(counts_file, 'rt', encoding='utf8') as f: for line in f: file, count = line.strip().split() # Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5 diff --git a/ir_datasets/datasets/clueweb12.py b/ir_datasets/datasets/clueweb12.py index 53211775..09381e16 100644 --- a/ir_datasets/datasets/clueweb12.py +++ b/ir_datasets/datasets/clueweb12.py @@ -193,7 +193,7 @@ def _docs_warc_file_counts(self): result = {} for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')): d = os.path.basename(counts_file)[:-len('_counts.txt')] - with open(counts_file, 'rt') as f: + with open(counts_file, 'rt', encoding='utf8') as f: for line in f: file, count = line.strip().split() file = os.path.join(self.docs_dlc.path(), d, file[2:]) diff --git a/ir_datasets/datasets/codesearchnet.py b/ir_datasets/datasets/codesearchnet.py index f02e34a3..0c2ca5c5 100644 --- a/ir_datasets/datasets/codesearchnet.py +++ b/ir_datasets/datasets/codesearchnet.py @@ -55,7 +55,7 @@ def docs_iter(self): for dlc in self.docs_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob('**/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield CodeSearchNetDoc( @@ -101,7 +101,7 @@ def queries_iter(self): for dlc in self.queries_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob(f'**/{self.split}/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield GenericQuery( @@ -129,7 +129,7 @@ def qrels_iter(self): for dlc in self.qrels_dlcs: base_path = Path(dlc.path()) for file in sorted(base_path.glob(f'**/{self.split}/*.gz')): - with gzip.open(file, 'rt') as f: + with gzip.open(file, 'rt', encoding='utf8') as f: for line in f: data = json.loads(line) yield TrecQrel( @@ -159,7 +159,7 @@ def queries_path(self): def queries_iter(self): 
with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for i, line in enumerate(stream): if i == 0: continue # skip first (header) line @@ -184,7 +184,7 @@ def qrels_path(self): def qrels_iter(self): query_map = {q.text: q.query_id for q in self._queries_handler.queries_iter()} with self.qrels_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for data in csv.DictReader(stream): yield CodeSearchNetChallengeQrel( query_id=query_map[data['Query']], diff --git a/ir_datasets/datasets/cord19.py b/ir_datasets/datasets/cord19.py index d284a7eb..aab15152 100644 --- a/ir_datasets/datasets/cord19.py +++ b/ir_datasets/datasets/cord19.py @@ -109,7 +109,7 @@ def _docs_iter(self): 'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))), } if self._include_fulltext: - csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt')) + csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt', encoding='utf8')) else: csv_reader = ctxt.enter_context(self._streamer.stream()) csv_reader = codecs.getreader('utf8')(csv_reader) diff --git a/ir_datasets/datasets/cranfield.py b/ir_datasets/datasets/cranfield.py index 4a912112..4a6ffe08 100644 --- a/ir_datasets/datasets/cranfield.py +++ b/ir_datasets/datasets/cranfield.py @@ -51,7 +51,7 @@ def docs_path(self, force=True): @ir_datasets.util.use_docstore def docs_iter(self): with self.docs_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in prefix_sentinel_splitter(stream, sentinel='.I '): record = {'doc_id': '', 'title': '', 'author': '', 'bib': '', 'text': ''} field = 'doc_id' @@ -103,7 +103,7 @@ def queries_path(self): def queries_iter(self): with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in prefix_sentinel_splitter(stream, sentinel='.I '): record = {'query_id': '', 'text': ''} field = 'query_id' diff --git a/ir_datasets/datasets/gov2.py b/ir_datasets/datasets/gov2.py index ad46692f..1a000826 100644 --- a/ir_datasets/datasets/gov2.py +++ b/ir_datasets/datasets/gov2.py @@ -273,7 +273,7 @@ def path(self, force=True): docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz') result = Counter() with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar: - with gzip.open(docs_urls_path, 'rt') as fin: + with gzip.open(docs_urls_path, 'rt', encoding='utf8') as fin: for line in fin: url, doc_id = line.rstrip().split() d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388 diff --git a/ir_datasets/datasets/msmarco_qna.py b/ir_datasets/datasets/msmarco_qna.py index 6122429d..69bfef1a 100644 --- a/ir_datasets/datasets/msmarco_qna.py +++ b/ir_datasets/datasets/msmarco_qna.py @@ -130,16 +130,16 @@ def build(self): with contextlib.ExitStack() as inner_stack: stream = inner_stack.enter_context(dlc.stream()) parser = ijson.parse(stream) - out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt')) - out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt')) - out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt')) + out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt', encoding='utf8')) + out_type = 
inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt', encoding='utf8')) + out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt', encoding='utf8')) if file_str != 'eval': - out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt')) - out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+')) + out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt', encoding='utf8')) + out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+', encoding='utf8')) out_seq = None else: out_qrels, out_answer = None, None - out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt')) + out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt', encoding='utf8')) for prefix, event, data in parser: pbar_postfix['key'] = prefix pbar.set_postfix(pbar_postfix, refresh=False) @@ -221,19 +221,19 @@ def build(self): # Merge files for file_str in ['train', 'dev', 'eval']: with contextlib.ExitStack() as stack: - f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt')) - f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt')) - f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt')) - f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt')) - f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt')) + f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt', encoding='utf8')) + f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt', encoding='utf8')) + f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt', encoding='utf8')) + f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt', encoding='utf8')) + f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt', encoding='utf8')) in_files = [f_qid, f_type, f_text] if file_str != 'eval': - f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt')) - f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt')) - f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt')) + f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt', encoding='utf8')) + f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt', encoding='utf8')) + f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt', encoding='utf8')) in_files += [f_selections, f_answers] else: - f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt')) + f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt', encoding='utf8')) in_files += [f_seq] for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'): columns = [x.strip() for x in columns] diff --git a/ir_datasets/datasets/tripclick.py b/ir_datasets/datasets/tripclick.py index 67d4df2a..73dc018c 100644 --- a/ir_datasets/datasets/tripclick.py +++ b/ir_datasets/datasets/tripclick.py @@ -117,7 +117,7 @@ def __init__(self, dlc): def qlogs_iter(self): for file in sorted(Path(self.dlc.path()).glob('logs/*.json')): - with file.open('rt') as fin: + with file.open('rt', encoding='utf8') as fin: for line in fin: record = json.loads(line) time = re.match(r'^/Date\(([0-9]+)\)/$', 
record['DateCreated']).group(1) diff --git a/ir_datasets/datasets/tweets2013_ia.py b/ir_datasets/datasets/tweets2013_ia.py index a1caa3ba..4e815e14 100644 --- a/ir_datasets/datasets/tweets2013_ia.py +++ b/ir_datasets/datasets/tweets2013_ia.py @@ -332,7 +332,7 @@ def _docs_build(self): # Write out a file that gives the counts for each source file. This is used for fancy slicing # and also avoids globbing to get a list of all source files. - with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt') as f: + with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt', encoding='utf8') as f: for file, count in sorted(file_counts.items()): f.write(f'{file}\t{count}\n') @@ -369,7 +369,7 @@ def _docs_file_counts(self): if self._docs_file_counts_cache is None: self._docs_build() result = {} - with (Path(self.docs_path()) / 'file_counts.tsv').open('rt') as f: + with (Path(self.docs_path()) / 'file_counts.tsv').open('rt', encoding='utf8') as f: for line in f: file, count = line.strip().split('\t') result[file] = int(count) diff --git a/ir_datasets/datasets/vaswani.py b/ir_datasets/datasets/vaswani.py index 52557dff..6ab4b635 100644 --- a/ir_datasets/datasets/vaswani.py +++ b/ir_datasets/datasets/vaswani.py @@ -33,7 +33,7 @@ def docs_path(self, force=True): @ir_datasets.util.use_docstore def docs_iter(self): with self.docs_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel=' /\n'): doc_id = lines[0].rstrip('\n') doc_text = ''.join(lines[1:]) @@ -73,7 +73,7 @@ def queries_path(self): def queries_iter(self): with self.queries_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel='/\n'): query_id = lines[0].rstrip('\n') query_text = ''.join(lines[1:]) @@ -98,7 +98,7 @@ def qrels_path(self): def qrels_iter(self): with self.qrels_dlc.stream() as stream: - stream = io.TextIOWrapper(stream) + stream = io.TextIOWrapper(stream, encoding='utf8') for lines in sentinel_splitter(stream, sentinel=' /\n'): query_id = lines[0].rstrip('\n') for line in lines[1:]: diff --git a/ir_datasets/formats/tsv.py b/ir_datasets/formats/tsv.py index 4059e829..b916bab0 100644 --- a/ir_datasets/formats/tsv.py +++ b/ir_datasets/formats/tsv.py @@ -23,9 +23,9 @@ def __next__(self): raise StopIteration if self.stream is None: if isinstance(self.dlc, list): - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream()), encoding='utf8') else: - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc.stream()), encoding='utf8') while self.pos < self.start: line = self.stream.readline() if line != '\n': @@ -34,7 +34,7 @@ def __next__(self): if isinstance(self.dlc, list): self.stream_idx += 1 if self.stream_idx < len(self.dlc): - self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream())) + self.stream = io.TextIOWrapper(self.ctxt.enter_context(self.dlc[self.stream_idx].stream()), encoding='utf8') line = self.stream.readline() else: raise StopIteration() diff --git a/ir_datasets/indices/lz4_pickle.py b/ir_datasets/indices/lz4_pickle.py index 3f7e3c89..9880ce97 100644 --- a/ir_datasets/indices/lz4_pickle.py +++ b/ir_datasets/indices/lz4_pickle.py @@ -112,7 +112,7 @@ def 
__init__(self, path, doc_cls, key_field, index_fields, key_field_prefix=None # check that the fields match meta_info = ' '.join(doc_cls._fields) if os.path.exists(self._meta_path): - with open(self._meta_path, 'rt') as f: + with open(self._meta_path, 'rt', encoding='utf8') as f: existing_meta = f.read() assert existing_meta == meta_info, f"fields do not match; you may need to re-build this store {path}" @@ -159,7 +159,7 @@ def transaction(self): os.makedirs(self._path, exist_ok=True) if not os.path.exists(self._meta_path): meta_info = ' '.join(self._doc_cls._fields) - with open(self._meta_path, 'wt') as f: + with open(self._meta_path, 'wt', encoding='utf8') as f: f.write(meta_info) with Lz4PickleTransaction(self) as trans: diff --git a/ir_datasets/indices/numpy_sorted_index.py b/ir_datasets/indices/numpy_sorted_index.py index 413894fc..e71d0845 100644 --- a/ir_datasets/indices/numpy_sorted_index.py +++ b/ir_datasets/indices/numpy_sorted_index.py @@ -49,7 +49,7 @@ def _lazy_load(self): if self.np is None: self.np = ir_datasets.lazy_libs.numpy() if self.mmap_keys is None and self._exists(): - with open(f'{self.path}.meta', 'rt') as f: + with open(f'{self.path}.meta', 'rt', encoding='utf8') as f: self.keylen, self.doccount = f.read().split() self.keylen, self.doccount = int(self.keylen), int(self.doccount) self.mmap_keys = self.np.memmap(f'{self.path}.key', dtype=f'S{self.keylen}', mode='r', shape=(self.doccount,)) diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 8127e0b4..2c69e92d 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -35,12 +35,13 @@ def home_path(): @contextmanager def finialized_file(path, mode): + encoding = 'utf8' if 't' in mode else None if path == os.devnull: - with open(path, mode) as f: + with open(path, mode, encoding=encoding) as f: yield f else: try: - with open(f'{path}.tmp', mode) as f: + with open(f'{path}.tmp', mode, encoding=encoding) as f: yield f os.replace(f'{path}.tmp', path) except: @@ -206,7 +207,7 @@ def wrapped(*args, **kwargs): return fn def _read_version(self): - with self._version_file.open('rt') as f: + with self._version_file.open('rt', encoding='utf8') as f: return f.read() diff --git a/ir_datasets/util/download.py b/ir_datasets/util/download.py index 94d0ccce..2fbb1fe7 100644 --- a/ir_datasets/util/download.py +++ b/ir_datasets/util/download.py @@ -90,7 +90,7 @@ def __iter__(self): dlen = int(dlen) fmt = '{desc}: {percentage:3.1f}%{r_bar}' if os.environ.get('IR_DATASETS_DL_DISABLE_PBAR', '').lower() == 'true': - pbar_f = stack.enter_context(open(os.devnull, 'w')) # still maintain the pbar, but write to /dev/null + pbar_f = stack.enter_context(open(os.devnull, 'wt', encoding='utf8')) # still maintain the pbar, but write to /dev/null else: pbar_f = None # defaults to stderr pbar = stack.enter_context(_logger.pbar_raw(desc=self.url, total=dlen, unit='B', unit_scale=True, bar_format=fmt, file=pbar_f)) @@ -159,7 +159,7 @@ def _handle_auth(self, http_args): auth_dir.mkdir(parents=True, exist_ok=True) auth_path = auth_dir / self.auth if auth_path.exists(): - with auth_path.open('rt') as fin: + with auth_path.open('rt', encoding='utf8') as fin: lines = fin.read().split('\n') if len(lines) < 2: raise RuntimeError(f'{str(auth_path)} in incorrect format. 
Set the first line as the username and the second line as the password.') diff --git a/setup.py b/setup.py index 5e3f1e10..48c54946 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from glob import glob import setuptools -with open("README.md", "r") as fh: +with open("README.md", "rt", encoding='utf8') as fh: long_description = fh.read() setuptools.setup( @@ -15,7 +15,7 @@ url="https://github.com/allenai/ir_datasets", include_package_data = True, packages=setuptools.find_packages(include=['ir_datasets', 'ir_datasets.*']), - install_requires=list(open('requirements.txt')), + install_requires=list(open('requirements.txt', 'rt', encoding='utf8')), classifiers=[], python_requires='>=3.6', entry_points={ diff --git a/test/downloads.py b/test/downloads.py index 522741da..1bf972bb 100644 --- a/test/downloads.py +++ b/test/downloads.py @@ -38,7 +38,7 @@ class TestDownloads(unittest.TestCase): output_data = [] def test_downloads(self): - with open('ir_datasets/etc/downloads.json') as f: + with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f: data = json.load(f) try: self._test_download_iter(data) @@ -55,7 +55,7 @@ def test_downloads(self): self.output_data.append(self._test_download(clir_dlc[top_key][sub_key], f'clirmatrix/{top_key}/{sub_key}')) finally: if self.output_path is not None: - with open(self.output_path, 'wt') as f: + with open(self.output_path, 'wt', encoding='utf8') as f: json.dump(self.output_data, f) def _test_download_iter(self, data, prefix=''): From a92835eed47b031061d8e5251881d1cf243321db Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 12:45:13 +0000 Subject: [PATCH 3/5] fix typo in finalized_file function name --- ir_datasets/commands/build_download_cache.py | 2 +- ir_datasets/datasets/aol_ia.py | 8 ++++---- ir_datasets/datasets/clueweb12.py | 2 +- ir_datasets/datasets/dpr_w100.py | 4 ++-- ir_datasets/datasets/gov2.py | 2 +- ir_datasets/datasets/natural_questions.py | 12 ++++++------ ir_datasets/datasets/tripclick.py | 2 +- ir_datasets/indices/clueweb_warc.py | 2 +- ir_datasets/indices/numpy_sorted_index.py | 2 +- ir_datasets/util/__init__.py | 4 +++- ir_datasets/util/download.py | 2 +- ir_datasets/util/fileio.py | 2 +- 12 files changed, 23 insertions(+), 21 deletions(-) diff --git a/ir_datasets/commands/build_download_cache.py b/ir_datasets/commands/build_download_cache.py index d5c52104..8379be99 100644 --- a/ir_datasets/commands/build_download_cache.py +++ b/ir_datasets/commands/build_download_cache.py @@ -35,7 +35,7 @@ def _build_cache(data, dir, prefix=''): _logger.info(f'skipping {prefix}; already exists') return try: - with ir_datasets.util.finialized_file(cache_path, 'wb') as fout, _logger.duration(prefix): + with ir_datasets.util.finalized_file(cache_path, 'wb') as fout, _logger.duration(prefix): download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'])], expected_md5=data['expected_md5'], stream=True) with download.stream() as stream: inp = stream.read(io.DEFAULT_BUFFER_SIZE) diff --git a/ir_datasets/datasets/aol_ia.py b/ir_datasets/datasets/aol_ia.py index baada6bd..9a56881c 100644 --- a/ir_datasets/datasets/aol_ia.py +++ b/ir_datasets/datasets/aol_ia.py @@ -7,7 +7,7 @@ from hashlib import md5 import ir_datasets from typing import NamedTuple, Tuple -from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file +from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finalized_file from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, 
BaseQlogs from ir_datasets.datasets.base import Dataset, YamlDocumentation @@ -136,9 +136,9 @@ def build(self): lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame encountered_qids = set() - with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \ - finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \ - finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \ + with finalized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \ + finalized_file(self._base_path/'qrels', 'wt') as f_qrels, \ + finalized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \ lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \ _logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar: for dlc in self._log_dlcs: diff --git a/ir_datasets/datasets/clueweb12.py b/ir_datasets/datasets/clueweb12.py index 09381e16..e33ad53b 100644 --- a/ir_datasets/datasets/clueweb12.py +++ b/ir_datasets/datasets/clueweb12.py @@ -239,7 +239,7 @@ def _create_record_counts_if_needed(self, path): with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar: for d in glob(os.path.join(path, 'ClueWeb12_??')): d = os.path.basename(d) - out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt')) + out = stack.enter_context(ir_datasets.util.finalized_file(f'{rc_dir}/{d}_counts.txt', 'wt')) for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))): shortf = file[-24:] with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf: diff --git a/ir_datasets/datasets/dpr_w100.py b/ir_datasets/datasets/dpr_w100.py index a3449916..51fa78ef 100644 --- a/ir_datasets/datasets/dpr_w100.py +++ b/ir_datasets/datasets/dpr_w100.py @@ -45,8 +45,8 @@ def build(self): return # already built with contextlib.ExitStack() as stack: - f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt')) - f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt')) + f_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'queries.tsv', 'wt')) + f_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'qrels', 'wt')) stream = stack.enter_context(self._dlc.stream()) qid_counter = itertools.count() for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'): diff --git a/ir_datasets/datasets/gov2.py b/ir_datasets/datasets/gov2.py index 1a000826..fb1c40c9 100644 --- a/ir_datasets/datasets/gov2.py +++ b/ir_datasets/datasets/gov2.py @@ -280,7 +280,7 @@ def path(self, force=True): file = f'{d}/{f}.gz' result[file] += 1 pbar.update() - with ir_datasets.util.finialized_file(self._path, 'wt') as fout: + with ir_datasets.util.finalized_file(self._path, 'wt') as fout: for file in sorted(result): fout.write(f'{file}\t{result[file]}\n') return self._path diff --git a/ir_datasets/datasets/natural_questions.py b/ir_datasets/datasets/natural_questions.py index d42a457b..45e9494b 100644 --- a/ir_datasets/datasets/natural_questions.py +++ b/ir_datasets/datasets/natural_questions.py @@ -56,12 +56,12 @@ def build(self): with contextlib.ExitStack() as stack: docs_trans = stack.enter_context(docs_store.lookup.transaction()) pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question')) - train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt')) - train_qrels = 
stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt')) - train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt')) - dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt')) - dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt')) - dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt')) + train_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.queries.tsv', 'wt')) + train_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.qrels.jsonl', 'wt')) + train_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.scoreddocs.tsv', 'wt')) + dev_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.queries.tsv', 'wt')) + dev_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.qrels.jsonl', 'wt')) + dev_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.scoreddocs.tsv', 'wt')) for file_name in sorted(self._dlcs.contents().keys()): pbar_postfix['file'] = file_name pbar.set_postfix(pbar_postfix) diff --git a/ir_datasets/datasets/tripclick.py b/ir_datasets/datasets/tripclick.py index 73dc018c..fa025939 100644 --- a/ir_datasets/datasets/tripclick.py +++ b/ir_datasets/datasets/tripclick.py @@ -163,7 +163,7 @@ def path(self, force=True): for query in _logger.pbar(self._queries.queries_iter(), desc='build query lookup', unit='query'): queryhash = hashlib.md5(SPACES.sub(' ', query.text).strip().encode()).digest()[:6] query_map[queryhash] = query.query_id - with ir_datasets.util.finialized_file(self._cache_path, 'wt') as fout, \ + with ir_datasets.util.finalized_file(self._cache_path, 'wt') as fout, \ self._docpair_dlc.stream() as stream, \ _logger.pbar_raw(desc='building docpairs', total=23_222_038, unit='docpair') as pbar: skipped = 0 diff --git a/ir_datasets/indices/clueweb_warc.py b/ir_datasets/indices/clueweb_warc.py index c1ede03f..1a39c3cb 100644 --- a/ir_datasets/indices/clueweb_warc.py +++ b/ir_datasets/indices/clueweb_warc.py @@ -79,7 +79,7 @@ def build(self, checkpoint_freq=8*1024*1024): last_chekpoint_pos = 0 with self.zlib_state.GzipStateFile(self.source_path, keep_last_state=True) as f, \ warc.WARCFile(fileobj=f) as f_warc, \ - ir_datasets.util.finialized_file(self.index_path, 'wb') as f_tmp, \ + ir_datasets.util.finalized_file(self.index_path, 'wb') as f_tmp, \ WarcIndexFile(f_tmp, 'wb') as f_chk: doc_idx = 0 for doc in f_warc: diff --git a/ir_datasets/indices/numpy_sorted_index.py b/ir_datasets/indices/numpy_sorted_index.py index e71d0845..2ddea7c3 100644 --- a/ir_datasets/indices/numpy_sorted_index.py +++ b/ir_datasets/indices/numpy_sorted_index.py @@ -38,7 +38,7 @@ def commit(self): self.mmap_keys[:] = keys[:] self.mmap_poss = self.np.memmap(f'{self.path}.pos', dtype=poss.dtype, mode='w+', shape=poss.shape) self.mmap_poss[:] = poss[:] - with ir_datasets.util.finialized_file(f'{self.path}.meta', 'wt') as f: + with ir_datasets.util.finalized_file(f'{self.path}.meta', 'wt') as f: f.write(f'{self.keylen} {self.doccount}') self.transaction = None diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 2c69e92d..94b95bca 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -34,7 +34,7 @@ def 
home_path(): @contextmanager -def finialized_file(path, mode): +def finalized_file(path, mode): encoding = 'utf8' if 't' in mode else None if path == os.devnull: with open(path, mode, encoding=encoding) as f: @@ -51,6 +51,8 @@ def finialized_file(path, mode): pass # ignore raise +finialized_file = finalized_file # support old name of function w/ typo + class Lazy: def __init__(self, fn): diff --git a/ir_datasets/util/download.py b/ir_datasets/util/download.py index 2fbb1fe7..fbcb3a94 100644 --- a/ir_datasets/util/download.py +++ b/ir_datasets/util/download.py @@ -246,7 +246,7 @@ def path(self, force=True): for mirror in self.mirrors: try: - with util.finialized_file(download_path, 'wb') as f: + with util.finalized_file(download_path, 'wb') as f: with mirror.stream() as stream: stream = util.HashStream(stream, self.expected_md5, algo='md5') shutil.copyfileobj(stream, f) diff --git a/ir_datasets/util/fileio.py b/ir_datasets/util/fileio.py index 272bfa01..73175107 100644 --- a/ir_datasets/util/fileio.py +++ b/ir_datasets/util/fileio.py @@ -161,7 +161,7 @@ def stream(self): if not self._output_file.exists(): with contextlib.ExitStack() as ctxt, self._streamer.stream() as stream: ctxt.enter_context(_logger.duration('re-taring file')) - outf = ctxt.enter_context(util.finialized_file(self._output_file, 'wb')) + outf = ctxt.enter_context(util.finalized_file(self._output_file, 'wb')) o_tarf = ctxt.enter_context(tarfile.open(fileobj=outf, mode=f'w|{self._compression or ""}')) # IMPORTANT: open this file in streaming mode (| in mode). This means that the # content need not be written to disk or be fully read. From a337796527056ace24c0cc333a56de3e89efc665 Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 17:49:58 +0000 Subject: [PATCH 4/5] added a github action to identify unspecified encodings --- .github/workflows/pylint.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/pylint.yml diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 00000000..76d72b1c --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,29 @@ +name: pylint + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: 'ubuntu-latest' + steps: + - name: Checkout source + uses: actions/checkout@v2 + + - name: Set up Python 3.9 + uses: actions/setup-python@v1 + with: + python-version: '3.9' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pylint==2.12.2 + + - name: Run PyLint + run: | + pylint --disable=all --enable=unspecified-encoding ./ir_datasets From a557447c41cfdb80141fd22d772c0d4b435f731a Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Thu, 13 Jan 2022 17:52:55 +0000 Subject: [PATCH 5/5] added a missing encoding --- ir_datasets/util/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ir_datasets/util/__init__.py b/ir_datasets/util/__init__.py index 94b95bca..043ff879 100644 --- a/ir_datasets/util/__init__.py +++ b/ir_datasets/util/__init__.py @@ -201,7 +201,7 @@ def wrapped(*args, **kwargs): os.unlink(file) else: shutil.rmtree(file) - with self._version_file.open('wt') as f: + with self._version_file.open('wt', encoding='utf8') as f: f.write(self._version) self._state = 'OK' return fn(*args, **kwargs)
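
For context on the convention this series enforces (and that the new pylint workflow checks), a minimal illustrative sketch follows. It is not part of the patches above, and the file path is only an example taken from the repository. In text mode, open() without an explicit encoding falls back to the platform's locale-dependent default, which is exactly what pylint's unspecified-encoding check flags.

import json

# Flagged by `pylint --enable=unspecified-encoding`: the encoding is implicit,
# so it depends on the platform's locale settings.
with open('ir_datasets/etc/downloads.json') as f:
    data = json.load(f)

# The form adopted throughout these patches: encoding stated explicitly.
with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f:
    data = json.load(f)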