Skip to content

Commit

Permalink
Don't create an Anserini index first
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskamphuis committed Jan 20, 2021
1 parent 2c6ae4b commit 050c161
Showing 1 changed file with 22 additions and 15 deletions.
37 changes: 22 additions & 15 deletions index.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import duckdb

from pyserini.search import SimpleSearcher
from pyserini import collection, index


class Index:
Expand All @@ -18,29 +18,36 @@ def __init__(self, **kwargs):
def get_arguments(kwargs):
    """Pick the supported settings out of *kwargs* and validate them.

    Only the keys ``'database'`` and ``'collection'`` are read; any other
    entries in *kwargs* are ignored. A key that is absent or mapped to
    ``None`` counts as "not provided".

    Args:
        kwargs: Mapping of option names to values (anything with ``.get``).

    Returns:
        A new dict with exactly the keys ``'database'`` and ``'collection'``.

    Raises:
        IOError: If the database path or the collection path is missing.
    """
    arguments = {
        'database': None,
        'collection': None,
    }
    for key in arguments:
        value = kwargs.get(key)
        if value is not None:
            arguments[key] = value
    if arguments['database'] is None:
        raise IOError('Database path needs to be provided.')
    if arguments['collection'] is None:
        raise IOError('Collection path needs to be provided.')
    return arguments

def create_input_table(self):
    """Create the ``documents`` table and load the collection into it.

    Reads the collection at ``self.arguments['collection']`` as a
    ``'TrecCollection'``, converts each raw document with the
    ``'DefaultLuceneDocumentGenerator'`` and inserts its ``"id"`` and
    ``"contents"`` fields as one ``(id, body)`` row.

    Documents the generator cannot convert are skipped. Rows are committed
    in batches of 10000 so that a single transaction does not grow without
    bound; the final partial batch is committed at the end.
    """
    batch_size = 10000
    self.cursor.execute("CREATE TABLE documents(id VARCHAR, body VARCHAR)")
    self.connection.begin()
    c = collection.Collection('TrecCollection', self.arguments['collection'])
    generator = index.Generator('DefaultLuceneDocumentGenerator')
    inserted = 0
    for fs in c:
        for doc in fs:
            try:
                parsed = generator.create_document(doc)
            except Exception:
                # Skip documents that fail to convert. Falling through here
                # would either hit an undefined `parsed` (first document) or
                # re-insert the previously parsed document under a new row.
                # NOTE(review): presumably the failure surfaces as a Java
                # exception wrapped by the bridge -- narrow this if possible.
                continue
            self.cursor.execute(
                "INSERT INTO documents VALUES (?, ?)",
                (parsed.get("id"), parsed.get("contents")),
            )
            inserted += 1
            # Count rows across all file segments so that a commit happens
            # once per full batch, not at the first document of each segment.
            if inserted % batch_size == 0:
                self.connection.commit()
                self.connection.begin()
    self.connection.commit()


Expand All @@ -51,9 +58,9 @@ def create_input_table(self):
required=True,
metavar='[file]',
help='Location of the database.')
parser.add_argument('-i',
'--index',
parser.add_argument('-c',
'--collection',
required=True,
metavar='[directory]',
help='Location of the anserini index.')
help='Location of the collection.')
Index(**vars(parser.parse_args()))

0 comments on commit 050c161

Please sign in to comment.