encoding fixes #152

Open · wants to merge 5 commits into master

29 changes: 29 additions & 0 deletions .github/workflows/pylint.yml
@@ -0,0 +1,29 @@
name: pylint

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: 'ubuntu-latest'
    steps:
    - name: Checkout source
      uses: actions/checkout@v2

    - name: Set up Python 3.9
      uses: actions/setup-python@v1
      with:
        python-version: '3.9'

    - name: Install Python dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install pylint==2.12.2

    - name: Run PyLint
      run: |
        pylint --disable=all --enable=unspecified-encoding ./ir_datasets
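
For context, unspecified-encoding (W1514, added in pylint 2.12) flags text-mode open() calls that rely on the platform's default encoding. A minimal, self-contained sketch of what the check flags and the compliant form (the file path and content here are made up, not taken from this PR):

    import pathlib, tempfile

    tmp = pathlib.Path(tempfile.mkdtemp()) / 'downloads.json'
    tmp.write_text('{}', encoding='utf8')

    # flagged by W1514: the text encoding silently falls back to the locale default
    with open(tmp, 'rt') as f:
        data = f.read()

    # compliant: the encoding is stated explicitly, as in the changes below
    with open(tmp, 'rt', encoding='utf8') as f:
        data = f.read()
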
2 changes: 1 addition & 1 deletion ir_datasets/commands/build_c4_checkpoints.py
@@ -65,7 +65,7 @@ def main(args):
})
except Exception as ex:
print(file, ex)
- with gzip.open(args.sources_file + '.gz', 'wt') as f:
+ with gzip.open(args.sources_file + '.gz', 'wt', encoding='utf8') as f:
json.dump(sources, f)
all_source_files = [f.relative_to(source_dir) for f in all_source_files]
if args.skip_last:
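
(Illustrative note, not part of this diff: without an explicit encoding=, Python's text-mode I/O falls back to the locale's preferred encoding, so the bytes written by the line above could differ between machines. A one-line check:)

    import locale

    # what open()/gzip.open() use in text mode when no encoding is given;
    # typically 'UTF-8' on Linux, but e.g. 'cp1252' on many Windows setups
    print(locale.getpreferredencoding(False))
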
4 changes: 2 additions & 2 deletions ir_datasets/commands/build_download_cache.py
@@ -35,7 +35,7 @@ def _build_cache(data, dir, prefix=''):
_logger.info(f'skipping {prefix}; already exists')
return
try:
- with ir_datasets.util.finialized_file(cache_path, 'wb') as fout, _logger.duration(prefix):
+ with ir_datasets.util.finalized_file(cache_path, 'wb') as fout, _logger.duration(prefix):
download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'])], expected_md5=data['expected_md5'], stream=True)
with download.stream() as stream:
inp = stream.read(io.DEFAULT_BUFFER_SIZE)
@@ -63,7 +63,7 @@ def main(args):
parser.add_argument('--retries', default='10')
args = parser.parse_args(args)

- with open('ir_datasets/etc/downloads.json') as f:
+ with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f:
data = json.load(f)
with tmp_environ(IR_DATASETS_DL_TRIES=args.retries):
_build_cache(data, args.dir)
2 changes: 1 addition & 1 deletion ir_datasets/commands/doc_fifos.py
@@ -66,7 +66,7 @@ def main(args):
print(f'Ready at {d}')
print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} -index <your_index_path> <other_anserini_args>')

- fifos = [stack.enter_context(open(f, 'wt')) for f in fifos]
+ fifos = [stack.enter_context(open(f, 'wt', encoding='utf8')) for f in fifos]

ready = None
for doc in docs_iter:
2 changes: 1 addition & 1 deletion ir_datasets/commands/generate_metadata.py
@@ -36,7 +36,7 @@ def dataset2metadata(args):


def write_metadata_file(data, file):
- with file.open('wt') as f:
+ with file.open('wt', encoding='utf8') as f:
# partially-formatted data; one dataset per line
f.write('{\n')
for i, key in enumerate(sorted(data.keys())):
2 changes: 1 addition & 1 deletion ir_datasets/datasets/antique.py
@@ -56,7 +56,7 @@ def _init():
disallow_list = dlc['disallow_list']
def disllow_qids():
with disallow_list.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
return {l.rstrip() for l in stream}
disllow_qids = Lazy(disllow_qids)
subsets['test/non-offensive'] = Dataset(
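
(Illustrative note: io.TextIOWrapper behaves the same way, decoding with the locale's preferred encoding unless one is given. A small self-contained sketch of the pattern above, with a made-up byte stream standing in for disallow_list.stream():)

    import io

    raw = io.BytesIO(b'q1\nq2\n')                   # stand-in for the downloaded disallow list
    text = io.TextIOWrapper(raw, encoding='utf8')   # decode explicitly rather than via the locale
    print({line.rstrip() for line in text})
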
8 changes: 4 additions & 4 deletions ir_datasets/datasets/aol_ia.py
@@ -7,7 +7,7 @@
from hashlib import md5
import ir_datasets
from typing import NamedTuple, Tuple
- from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file
+ from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finalized_file
from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs
from ir_datasets.datasets.base import Dataset, YamlDocumentation

@@ -136,9 +136,9 @@ def build(self):
lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame

encountered_qids = set()
- with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \
- finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \
- finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \
+ with finalized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \
+ finalized_file(self._base_path/'qrels', 'wt') as f_qrels, \
+ finalized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \
lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \
_logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar:
for dlc in self._log_dlcs:
2 changes: 1 addition & 1 deletion ir_datasets/datasets/clueweb09.py
@@ -86,7 +86,7 @@ def _docs_warc_file_counts(self):
result = {}
for d in self.dirs:
counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt')
- with open(counts_file, 'rt') as f:
+ with open(counts_file, 'rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split()
# Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5
4 changes: 2 additions & 2 deletions ir_datasets/datasets/clueweb12.py
@@ -193,7 +193,7 @@ def _docs_warc_file_counts(self):
result = {}
for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')):
d = os.path.basename(counts_file)[:-len('_counts.txt')]
- with open(counts_file, 'rt') as f:
+ with open(counts_file, 'rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split()
file = os.path.join(self.docs_dlc.path(), d, file[2:])
@@ -239,7 +239,7 @@ def _create_record_counts_if_needed(self, path):
with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar:
for d in glob(os.path.join(path, 'ClueWeb12_??')):
d = os.path.basename(d)
- out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
+ out = stack.enter_context(ir_datasets.util.finalized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))):
shortf = file[-24:]
with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf:
10 changes: 5 additions & 5 deletions ir_datasets/datasets/codesearchnet.py
@@ -55,7 +55,7 @@ def docs_iter(self):
for dlc in self.docs_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob('**/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield CodeSearchNetDoc(
@@ -101,7 +101,7 @@ def queries_iter(self):
for dlc in self.queries_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob(f'**/{self.split}/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield GenericQuery(
@@ -129,7 +129,7 @@ def qrels_iter(self):
for dlc in self.qrels_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob(f'**/{self.split}/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield TrecQrel(
@@ -159,7 +159,7 @@ def queries_path(self):

def queries_iter(self):
with self.queries_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for i, line in enumerate(stream):
if i == 0:
continue # skip first (header) line
@@ -184,7 +184,7 @@ def qrels_path(self):
def qrels_iter(self):
query_map = {q.text: q.query_id for q in self._queries_handler.queries_iter()}
with self.qrels_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for data in csv.DictReader(stream):
yield CodeSearchNetChallengeQrel(
query_id=query_map[data['Query']],
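
(Illustrative note: gzip.open in text mode ('rt'/'wt') accepts the same encoding argument as the built-in open, which is what the changes in this file rely on. A minimal round-trip sketch with a made-up path and record:)

    import gzip, json, pathlib, tempfile

    path = pathlib.Path(tempfile.mkdtemp()) / 'sample.jsonl.gz'
    with gzip.open(path, 'wt', encoding='utf8') as f:    # write text through gzip with an explicit encoding
        f.write(json.dumps({'docstring': 'café'}, ensure_ascii=False) + '\n')
    with gzip.open(path, 'rt', encoding='utf8') as f:    # read it back with the same encoding
        print([json.loads(line) for line in f])
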
2 changes: 1 addition & 1 deletion ir_datasets/datasets/cord19.py
@@ -109,7 +109,7 @@ def _docs_iter(self):
'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))),
}
if self._include_fulltext:
- csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt'))
+ csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt', encoding='utf8'))
else:
csv_reader = ctxt.enter_context(self._streamer.stream())
csv_reader = codecs.getreader('utf8')(csv_reader)
4 changes: 2 additions & 2 deletions ir_datasets/datasets/cranfield.py
@@ -51,7 +51,7 @@ def docs_path(self, force=True):
@ir_datasets.util.use_docstore
def docs_iter(self):
with self.docs_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for lines in prefix_sentinel_splitter(stream, sentinel='.I '):
record = {'doc_id': '', 'title': '', 'author': '', 'bib': '', 'text': ''}
field = 'doc_id'
@@ -103,7 +103,7 @@ def queries_path(self):

def queries_iter(self):
with self.queries_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for lines in prefix_sentinel_splitter(stream, sentinel='.I '):
record = {'query_id': '', 'text': ''}
field = 'query_id'
4 changes: 2 additions & 2 deletions ir_datasets/datasets/dpr_w100.py
@@ -45,8 +45,8 @@ def build(self):
return # already built

with contextlib.ExitStack() as stack:
- f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt'))
- f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt'))
+ f_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'queries.tsv', 'wt'))
+ f_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'qrels', 'wt'))
stream = stack.enter_context(self._dlc.stream())
qid_counter = itertools.count()
for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'):
4 changes: 2 additions & 2 deletions ir_datasets/datasets/gov2.py
@@ -273,14 +273,14 @@ def path(self, force=True):
docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz')
result = Counter()
with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar:
- with gzip.open(docs_urls_path, 'rt') as fin:
+ with gzip.open(docs_urls_path, 'rt', encoding='utf8') as fin:
for line in fin:
url, doc_id = line.rstrip().split()
d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388
file = f'{d}/{f}.gz'
result[file] += 1
pbar.update()
- with ir_datasets.util.finialized_file(self._path, 'wt') as fout:
+ with ir_datasets.util.finalized_file(self._path, 'wt') as fout:
for file in sorted(result):
fout.write(f'{file}\t{result[file]}\n')
return self._path
30 changes: 15 additions & 15 deletions ir_datasets/datasets/msmarco_qna.py
@@ -130,16 +130,16 @@ def build(self):
with contextlib.ExitStack() as inner_stack:
stream = inner_stack.enter_context(dlc.stream())
parser = ijson.parse(stream)
- out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt'))
- out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt'))
- out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt'))
+ out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt', encoding='utf8'))
+ out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt', encoding='utf8'))
+ out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt', encoding='utf8'))
if file_str != 'eval':
- out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt'))
- out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+'))
+ out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt', encoding='utf8'))
+ out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+', encoding='utf8'))
out_seq = None
else:
out_qrels, out_answer = None, None
- out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt'))
+ out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt', encoding='utf8'))
for prefix, event, data in parser:
pbar_postfix['key'] = prefix
pbar.set_postfix(pbar_postfix, refresh=False)
@@ -221,19 +221,19 @@ def build(self):
# Merge files
for file_str in ['train', 'dev', 'eval']:
with contextlib.ExitStack() as stack:
- f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt'))
- f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt'))
- f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt'))
- f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt'))
- f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt'))
+ f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt', encoding='utf8'))
+ f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt', encoding='utf8'))
+ f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt', encoding='utf8'))
+ f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt', encoding='utf8'))
+ f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt', encoding='utf8'))
in_files = [f_qid, f_type, f_text]
if file_str != 'eval':
- f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt'))
- f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt'))
- f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt'))
+ f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt', encoding='utf8'))
+ f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt', encoding='utf8'))
+ f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt', encoding='utf8'))
in_files += [f_selections, f_answers]
else:
- f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt'))
+ f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt', encoding='utf8'))
in_files += [f_seq]
for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'):
columns = [x.strip() for x in columns]
12 changes: 6 additions & 6 deletions ir_datasets/datasets/natural_questions.py
@@ -56,12 +56,12 @@ def build(self):
with contextlib.ExitStack() as stack:
docs_trans = stack.enter_context(docs_store.lookup.transaction())
pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question'))
- train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt'))
- train_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt'))
- train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
- dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt'))
- dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
- dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
+ train_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.queries.tsv', 'wt'))
+ train_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.qrels.jsonl', 'wt'))
+ train_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
+ dev_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.queries.tsv', 'wt'))
+ dev_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
+ dev_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
for file_name in sorted(self._dlcs.contents().keys()):
pbar_postfix['file'] = file_name
pbar.set_postfix(pbar_postfix)
4 changes: 2 additions & 2 deletions ir_datasets/datasets/tripclick.py
@@ -117,7 +117,7 @@ def __init__(self, dlc):

def qlogs_iter(self):
for file in sorted(Path(self.dlc.path()).glob('logs/*.json')):
- with file.open('rt') as fin:
+ with file.open('rt', encoding='utf8') as fin:
for line in fin:
record = json.loads(line)
time = re.match(r'^/Date\(([0-9]+)\)/$', record['DateCreated']).group(1)
@@ -163,7 +163,7 @@ def path(self, force=True):
for query in _logger.pbar(self._queries.queries_iter(), desc='build query lookup', unit='query'):
queryhash = hashlib.md5(SPACES.sub(' ', query.text).strip().encode()).digest()[:6]
query_map[queryhash] = query.query_id
- with ir_datasets.util.finialized_file(self._cache_path, 'wt') as fout, \
+ with ir_datasets.util.finalized_file(self._cache_path, 'wt') as fout, \
self._docpair_dlc.stream() as stream, \
_logger.pbar_raw(desc='building docpairs', total=23_222_038, unit='docpair') as pbar:
skipped = 0
4 changes: 2 additions & 2 deletions ir_datasets/datasets/tweets2013_ia.py
@@ -332,7 +332,7 @@ def _docs_build(self):

# Write out a file that gives the counts for each source file. This is used for fancy slicing
# and also avoids globbing to get a list of all source files.
- with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt') as f:
+ with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt', encoding='utf8') as f:
for file, count in sorted(file_counts.items()):
f.write(f'{file}\t{count}\n')

@@ -369,7 +369,7 @@ def _docs_file_counts(self):
if self._docs_file_counts_cache is None:
self._docs_build()
result = {}
- with (Path(self.docs_path()) / 'file_counts.tsv').open('rt') as f:
+ with (Path(self.docs_path()) / 'file_counts.tsv').open('rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split('\t')
result[file] = int(count)