encoding fixes #152

Open · wants to merge 5 commits into master

29 changes: 29 additions & 0 deletions .github/workflows/pylint.yml
@@ -0,0 +1,29 @@
name: pylint

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: 'ubuntu-latest'
    steps:
    - name: Checkout source
      uses: actions/checkout@v2

    - name: Set up Python 3.9
      uses: actions/setup-python@v1
      with:
        python-version: '3.9'

    - name: Install Python dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install pylint==2.12.2

    - name: Run PyLint
      run: |
        pylint --disable=all --enable=unspecified-encoding ./ir_datasets
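
For context, unspecified-encoding (W1514, added in pylint 2.12) flags text-mode open() calls that rely on the platform's default encoding. A minimal, self-contained sketch of what the check flags and the compliant form (the file path and content here are made up, not taken from this PR):

    import pathlib, tempfile

    tmp = pathlib.Path(tempfile.mkdtemp()) / 'downloads.json'
    tmp.write_text('{}', encoding='utf8')

    # flagged by W1514: the text encoding silently falls back to the locale default
    with open(tmp, 'rt') as f:
        data = f.read()

    # compliant: the encoding is stated explicitly, as in the changes below
    with open(tmp, 'rt', encoding='utf8') as f:
        data = f.read()
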
2 changes: 1 addition & 1 deletion ir_datasets/commands/build_c4_checkpoints.py
@@ -65,7 +65,7 @@ def main(args):
})
except Exception as ex:
print(file, ex)
- with gzip.open(args.sources_file + '.gz', 'wt') as f:
+ with gzip.open(args.sources_file + '.gz', 'wt', encoding='utf8') as f:
json.dump(sources, f)
all_source_files = [f.relative_to(source_dir) for f in all_source_files]
if args.skip_last:
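
(Illustrative note, not part of this diff: without an explicit encoding=, Python's text-mode I/O falls back to the locale's preferred encoding, so the bytes written by the line above could differ between machines. A one-line check:)

    import locale

    # what open()/gzip.open() use in text mode when no encoding is given;
    # typically 'UTF-8' on Linux, but e.g. 'cp1252' on many Windows setups
    print(locale.getpreferredencoding(False))
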
4 changes: 2 additions & 2 deletions ir_datasets/commands/build_download_cache.py
@@ -35,7 +35,7 @@ def _build_cache(data, dir, prefix=''):
_logger.info(f'skipping {prefix}; already exists')
return
try:
- with ir_datasets.util.finialized_file(cache_path, 'wb') as fout, _logger.duration(prefix):
+ with ir_datasets.util.finalized_file(cache_path, 'wb') as fout, _logger.duration(prefix):
download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'])], expected_md5=data['expected_md5'], stream=True)
with download.stream() as stream:
inp = stream.read(io.DEFAULT_BUFFER_SIZE)
@@ -63,7 +63,7 @@ def main(args):
parser.add_argument('--retries', default='10')
args = parser.parse_args(args)

- with open('ir_datasets/etc/downloads.json') as f:
+ with open('ir_datasets/etc/downloads.json', 'rt', encoding='utf8') as f:
data = json.load(f)
with tmp_environ(IR_DATASETS_DL_TRIES=args.retries):
_build_cache(data, args.dir)
2 changes: 1 addition & 1 deletion ir_datasets/commands/doc_fifos.py
@@ -66,7 +66,7 @@ def main(args):
print(f'Ready at {d}')
print(f'To index with Anserini, run:\nIndexCollection -collection JsonCollection -input {d} -threads {args.count} -index <your_index_path> <other_anserini_args>')

- fifos = [stack.enter_context(open(f, 'wt')) for f in fifos]
+ fifos = [stack.enter_context(open(f, 'wt', encoding='utf8')) for f in fifos]

ready = None
for doc in docs_iter:
2 changes: 1 addition & 1 deletion ir_datasets/commands/generate_metadata.py
@@ -36,7 +36,7 @@ def dataset2metadata(args):


def write_metadata_file(data, file):
- with file.open('wt') as f:
+ with file.open('wt', encoding='utf8') as f:
# partially-formatted data; one dataset per line
f.write('{\n')
for i, key in enumerate(sorted(data.keys())):
2 changes: 1 addition & 1 deletion ir_datasets/datasets/antique.py
@@ -56,7 +56,7 @@ def _init():
disallow_list = dlc['disallow_list']
def disllow_qids():
with disallow_list.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
return {l.rstrip() for l in stream}
disllow_qids = Lazy(disllow_qids)
subsets['test/non-offensive'] = Dataset(
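
(Illustrative note: io.TextIOWrapper behaves the same way, decoding with the locale's preferred encoding unless one is given. A small self-contained sketch of the pattern above, with a made-up byte stream standing in for disallow_list.stream():)

    import io

    raw = io.BytesIO(b'q1\nq2\n')                   # stand-in for the downloaded disallow list
    text = io.TextIOWrapper(raw, encoding='utf8')   # decode explicitly rather than via the locale
    print({line.rstrip() for line in text})
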
8 changes: 4 additions & 4 deletions ir_datasets/datasets/aol_ia.py
@@ -7,7 +7,7 @@
from hashlib import md5
import ir_datasets
from typing import NamedTuple, Tuple
- from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finialized_file
+ from ir_datasets.util import DownloadConfig, GzipExtract, TarExtract, finalized_file
from ir_datasets.formats import TrecQrels, TsvQueries, DocstoreBackedDocs, BaseQlogs
from ir_datasets.datasets.base import Dataset, YamlDocumentation

@@ -136,9 +136,9 @@ def build(self):
lz4_frame = ir_datasets.lazy_libs.lz4_frame().frame

encountered_qids = set()
- with finialized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \
- finialized_file(self._base_path/'qrels', 'wt') as f_qrels, \
- finialized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \
+ with finalized_file(self._base_path/'queries.tsv', 'wt') as f_queries, \
+ finalized_file(self._base_path/'qrels', 'wt') as f_qrels, \
+ finalized_file(self._base_path/'log.pkl.lz4', 'wb') as f_log, \
lz4_frame.LZ4FrameFile(f_log, 'wb') as f_log, \
_logger.pbar_raw(desc=f'preparing {NAME} log lines', total=36389567) as pbar:
for dlc in self._log_dlcs:
2 changes: 1 addition & 1 deletion ir_datasets/datasets/clueweb09.py
@@ -86,7 +86,7 @@ def _docs_warc_file_counts(self):
result = {}
for d in self.dirs:
counts_file = os.path.join(self.docs_dlc.path(), f'record_counts/{d}_counts.txt')
- with open(counts_file, 'rt') as f:
+ with open(counts_file, 'rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split()
# Fixing bug in record_counts: en0054 is under ClueWeb09_English_4, not _5
4 changes: 2 additions & 2 deletions ir_datasets/datasets/clueweb12.py
@@ -193,7 +193,7 @@ def _docs_warc_file_counts(self):
result = {}
for counts_file in glob(os.path.join(self.docs_dlc.path(), 'recordcounts', '*.txt')):
d = os.path.basename(counts_file)[:-len('_counts.txt')]
- with open(counts_file, 'rt') as f:
+ with open(counts_file, 'rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split()
file = os.path.join(self.docs_dlc.path(), d, file[2:])
@@ -239,7 +239,7 @@ def _create_record_counts_if_needed(self, path):
with contextlib.ExitStack() as stack, _logger.pbar_raw(desc='building b13 document count cache', unit='file') as pbar:
for d in glob(os.path.join(path, 'ClueWeb12_??')):
d = os.path.basename(d)
- out = stack.enter_context(ir_datasets.util.finialized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
+ out = stack.enter_context(ir_datasets.util.finalized_file(f'{rc_dir}/{d}_counts.txt', 'wt'))
for file in sorted(glob(os.path.join(path, d, '*', '*.warc.gz'))):
shortf = file[-24:]
with gzip.open(file, 'rb') as f, warc.WARCFile(fileobj=f) as warcf:
10 changes: 5 additions & 5 deletions ir_datasets/datasets/codesearchnet.py
@@ -55,7 +55,7 @@ def docs_iter(self):
for dlc in self.docs_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob('**/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield CodeSearchNetDoc(
@@ -101,7 +101,7 @@ def queries_iter(self):
for dlc in self.queries_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob(f'**/{self.split}/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield GenericQuery(
@@ -129,7 +129,7 @@ def qrels_iter(self):
for dlc in self.qrels_dlcs:
base_path = Path(dlc.path())
for file in sorted(base_path.glob(f'**/{self.split}/*.gz')):
- with gzip.open(file, 'rt') as f:
+ with gzip.open(file, 'rt', encoding='utf8') as f:
for line in f:
data = json.loads(line)
yield TrecQrel(
@@ -159,7 +159,7 @@ def queries_path(self):

def queries_iter(self):
with self.queries_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for i, line in enumerate(stream):
if i == 0:
continue # skip first (header) line
@@ -184,7 +184,7 @@ def qrels_path(self):
def qrels_iter(self):
query_map = {q.text: q.query_id for q in self._queries_handler.queries_iter()}
with self.qrels_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for data in csv.DictReader(stream):
yield CodeSearchNetChallengeQrel(
query_id=query_map[data['Query']],
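
(Illustrative note: gzip.open in text mode ('rt'/'wt') accepts the same encoding argument as the built-in open, which is what the changes in this file rely on. A minimal round-trip sketch with a made-up path and record:)

    import gzip, json, pathlib, tempfile

    path = pathlib.Path(tempfile.mkdtemp()) / 'sample.jsonl.gz'
    with gzip.open(path, 'wt', encoding='utf8') as f:    # write text through gzip with an explicit encoding
        f.write(json.dumps({'docstring': 'café'}, ensure_ascii=False) + '\n')
    with gzip.open(path, 'rt', encoding='utf8') as f:    # read it back with the same encoding
        print([json.loads(line) for line in f])
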
2 changes: 1 addition & 1 deletion ir_datasets/datasets/cord19.py
@@ -109,7 +109,7 @@ def _docs_iter(self):
'custom_license': tarfile.open(fileobj=ctxt.push((self._extr_path/self._date/'custom_license.tar.gz').open('rb'))),
}
if self._include_fulltext:
- csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt'))
+ csv_reader = ctxt.push((self._extr_path/self._date/'metadata.csv').open('rt', encoding='utf8'))
else:
csv_reader = ctxt.enter_context(self._streamer.stream())
csv_reader = codecs.getreader('utf8')(csv_reader)
4 changes: 2 additions & 2 deletions ir_datasets/datasets/cranfield.py
@@ -51,7 +51,7 @@ def docs_path(self, force=True):
@ir_datasets.util.use_docstore
def docs_iter(self):
with self.docs_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for lines in prefix_sentinel_splitter(stream, sentinel='.I '):
record = {'doc_id': '', 'title': '', 'author': '', 'bib': '', 'text': ''}
field = 'doc_id'
@@ -103,7 +103,7 @@ def queries_path(self):

def queries_iter(self):
with self.queries_dlc.stream() as stream:
- stream = io.TextIOWrapper(stream)
+ stream = io.TextIOWrapper(stream, encoding='utf8')
for lines in prefix_sentinel_splitter(stream, sentinel='.I '):
record = {'query_id': '', 'text': ''}
field = 'query_id'
4 changes: 2 additions & 2 deletions ir_datasets/datasets/dpr_w100.py
@@ -45,8 +45,8 @@ def build(self):
return # already built

with contextlib.ExitStack() as stack:
- f_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'queries.tsv', 'wt'))
- f_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'qrels', 'wt'))
+ f_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'queries.tsv', 'wt'))
+ f_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'qrels', 'wt'))
stream = stack.enter_context(self._dlc.stream())
qid_counter = itertools.count()
for record in _logger.pbar(ijson.items(stream, 'item'), 'building dpr-w100', unit='record'):
4 changes: 2 additions & 2 deletions ir_datasets/datasets/gov2.py
@@ -273,14 +273,14 @@ def path(self, force=True):
docs_urls_path = os.path.join(self._docs_dlc.path(), 'GOV2_extras/url2id.gz')
result = Counter()
with _logger.pbar_raw(desc='building doccounts file', total=25205179, unit='doc') as pbar:
- with gzip.open(docs_urls_path, 'rt') as fin:
+ with gzip.open(docs_urls_path, 'rt', encoding='utf8') as fin:
for line in fin:
url, doc_id = line.rstrip().split()
d, f, i = doc_id.split('-') # formatted like: GX024-52-0546388
file = f'{d}/{f}.gz'
result[file] += 1
pbar.update()
- with ir_datasets.util.finialized_file(self._path, 'wt') as fout:
+ with ir_datasets.util.finalized_file(self._path, 'wt') as fout:
for file in sorted(result):
fout.write(f'{file}\t{result[file]}\n')
return self._path
30 changes: 15 additions & 15 deletions ir_datasets/datasets/msmarco_qna.py
@@ -130,16 +130,16 @@ def build(self):
with contextlib.ExitStack() as inner_stack:
stream = inner_stack.enter_context(dlc.stream())
parser = ijson.parse(stream)
- out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt'))
- out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt'))
- out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt'))
+ out_text = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'wt', encoding='utf8'))
+ out_type = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'wt', encoding='utf8'))
+ out_id = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'wt', encoding='utf8'))
if file_str != 'eval':
- out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt'))
- out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+'))
+ out_qrels = inner_stack.enter_context(open(self._base_path/f'{file_str}.selections', 'wt', encoding='utf8'))
+ out_answer = inner_stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'wt+', encoding='utf8'))
out_seq = None
else:
out_qrels, out_answer = None, None
- out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt'))
+ out_seq = inner_stack.enter_context(open(self._base_path/f'{file_str}.seq', 'wt', encoding='utf8'))
for prefix, event, data in parser:
pbar_postfix['key'] = prefix
pbar.set_postfix(pbar_postfix, refresh=False)
@@ -221,19 +221,19 @@ def build(self):
# Merge files
for file_str in ['train', 'dev', 'eval']:
with contextlib.ExitStack() as stack:
- f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt'))
- f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt'))
- f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt'))
- f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt'))
- f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt'))
+ f_qid = stack.enter_context(open(self._base_path/f'{file_str}.query_id', 'rt', encoding='utf8'))
+ f_type = stack.enter_context(open(self._base_path/f'{file_str}.query_type', 'rt', encoding='utf8'))
+ f_text = stack.enter_context(open(self._base_path/f'{file_str}.query_text', 'rt', encoding='utf8'))
+ f_queries = stack.enter_context(open(self._base_path/f'{file_str}.queries.tsv', 'wt', encoding='utf8'))
+ f_run = stack.enter_context(open(self._base_path/f'{file_str}.run', 'wt', encoding='utf8'))
in_files = [f_qid, f_type, f_text]
if file_str != 'eval':
- f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt'))
- f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt'))
- f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt'))
+ f_selections = stack.enter_context(open(self._base_path/f'{file_str}.selections', 'rt', encoding='utf8'))
+ f_answers = stack.enter_context(open(self._base_path/f'{file_str}.query_answer', 'rt', encoding='utf8'))
+ f_qrels = stack.enter_context(open(self._base_path/f'{file_str}.qrels', 'wt', encoding='utf8'))
in_files += [f_selections, f_answers]
else:
- f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt'))
+ f_seq = stack.enter_context(open(self._base_path/f'{file_str}.seq', 'rt', encoding='utf8'))
in_files += [f_seq]
for columns in _logger.pbar(zip(*in_files), desc=f'merging {file_str} files', unit='doc'):
columns = [x.strip() for x in columns]
12 changes: 6 additions & 6 deletions ir_datasets/datasets/natural_questions.py
@@ -56,12 +56,12 @@ def build(self):
with contextlib.ExitStack() as stack:
docs_trans = stack.enter_context(docs_store.lookup.transaction())
pbar = stack.enter_context(_logger.pbar_raw(desc='processing nq', postfix=pbar_postfix, unit='question'))
- train_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.queries.tsv', 'wt'))
- train_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.qrels.jsonl', 'wt'))
- train_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
- dev_queries = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.queries.tsv', 'wt'))
- dev_qrels = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
- dev_scoreddocs = stack.enter_context(ir_datasets.util.finialized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
+ train_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.queries.tsv', 'wt'))
+ train_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.qrels.jsonl', 'wt'))
+ train_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'train.scoreddocs.tsv', 'wt'))
+ dev_queries = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.queries.tsv', 'wt'))
+ dev_qrels = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.qrels.jsonl', 'wt'))
+ dev_scoreddocs = stack.enter_context(ir_datasets.util.finalized_file(self._base_path/'dev.scoreddocs.tsv', 'wt'))
for file_name in sorted(self._dlcs.contents().keys()):
pbar_postfix['file'] = file_name
pbar.set_postfix(pbar_postfix)
4 changes: 2 additions & 2 deletions ir_datasets/datasets/tripclick.py
@@ -117,7 +117,7 @@ def __init__(self, dlc):

def qlogs_iter(self):
for file in sorted(Path(self.dlc.path()).glob('logs/*.json')):
- with file.open('rt') as fin:
+ with file.open('rt', encoding='utf8') as fin:
for line in fin:
record = json.loads(line)
time = re.match(r'^/Date\(([0-9]+)\)/$', record['DateCreated']).group(1)
@@ -163,7 +163,7 @@ def path(self, force=True):
for query in _logger.pbar(self._queries.queries_iter(), desc='build query lookup', unit='query'):
queryhash = hashlib.md5(SPACES.sub(' ', query.text).strip().encode()).digest()[:6]
query_map[queryhash] = query.query_id
- with ir_datasets.util.finialized_file(self._cache_path, 'wt') as fout, \
+ with ir_datasets.util.finalized_file(self._cache_path, 'wt') as fout, \
self._docpair_dlc.stream() as stream, \
_logger.pbar_raw(desc='building docpairs', total=23_222_038, unit='docpair') as pbar:
skipped = 0
4 changes: 2 additions & 2 deletions ir_datasets/datasets/tweets2013_ia.py
@@ -332,7 +332,7 @@ def _docs_build(self):

# Write out a file that gives the counts for each source file. This is used for fancy slicing
# and also avoids globbing to get a list of all source files.
- with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt') as f:
+ with (Path(self._docs_base_path) / 'file_counts.tsv').open('wt', encoding='utf8') as f:
for file, count in sorted(file_counts.items()):
f.write(f'{file}\t{count}\n')

@@ -369,7 +369,7 @@ def _docs_file_counts(self):
if self._docs_file_counts_cache is None:
self._docs_build()
result = {}
- with (Path(self.docs_path()) / 'file_counts.tsv').open('rt') as f:
+ with (Path(self.docs_path()) / 'file_counts.tsv').open('rt', encoding='utf8') as f:
for line in f:
file, count = line.strip().split('\t')
result[file] = int(count)