-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcmd_index.py
127 lines (108 loc) · 5.92 KB
/
cmd_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from preprocessing import create_preprocessor
from indexing import create_index_simple, create_index_spimi, create_index_map_reduce
import os
import glob
import nltk
import click
@click.group()
@click.option('--document_folder', required=True,
              type=click.Path(exists=True, file_okay=False),
              help='Path to the folder which contains the documents to be indexed')
@click.option('--index_file', required=True,
              help='Output filename for index file')
@click.option('--stats_file', required=True,
              help='Output filename for document stats file')
@click.option('--enable_case_folding/--disable_case_folding',
              default=True, show_default=True,
              help='Enable/Disable case folding during preprocessing')
@click.option('--enable_stemmer/--disable_stemmer',
              default=True, show_default=True,
              help='Enable/Disable stemmer during preprocessing')
@click.option('--enable_lemmatizer/--disable_lemmatizer',
              default=False, show_default=True,
              help='Enable/Disable lemmatizer during preprocessing')
@click.option('--enable_remove_stop_words/--disable_remove_stop_words',
              default=True, show_default=True,
              help='Enable/Disable removal of stop words during preprocessing')
@click.option('--min_word_length',
              default=2, show_default=True,
              help='Minimum word length. Words shorter than the given length are ignored')
@click.option('--enable_strip_html_tags/--disable_strip_html_tags',
              default=True, show_default=True,
              help='Enable/Disable removal of html tags')
@click.option('--enable_strip_html_entities/--disable_strip_html_entities',
              default=True, show_default=True,
              help='Enable/Disable removal of html entities, like "&amp;"')
@click.option('--enable_strip_square_bracket_tags/--disable_strip_square_bracket_tags',
              default=True, show_default=True,
              help='Enable/Disable removal of tags in square brackets, like "[BR]"')
@click.pass_context
def cli(ctx, document_folder, index_file, stats_file,
        enable_case_folding, enable_stemmer, enable_lemmatizer,
        enable_remove_stop_words, min_word_length,
        enable_strip_html_tags, enable_strip_html_entities,
        enable_strip_square_bracket_tags):
    """Index the documents in a folder using one of the sub-command methods."""
    # NOTE: the docstring above is deliberately short -- click prints it as
    # the group's --help text.
    #
    # This callback runs before the chosen sub-command. It builds the
    # preprocessor, collects every regular file below ``document_folder``
    # and publishes both, together with the output paths and the strip_*
    # switches, through ``ctx.obj`` for the sub-command to read.

    # click leaves ctx.obj as None unless the caller passes ``obj=...``;
    # make sure a dict exists so the item assignments below cannot fail.
    ctx.ensure_object(dict)

    # Fetch the WordNet corpus through NLTK's downloader.
    # NOTE(review): presumably required by the lemmatizer -- confirm whether
    # it is needed when the lemmatizer is disabled.
    nltk.download('wordnet')

    preprocessor = create_preprocessor(enable_case_folding=enable_case_folding,
                                       enable_remove_stop_words=enable_remove_stop_words,
                                       enable_stemmer=enable_stemmer,
                                       enable_lemmatizer=enable_lemmatizer,
                                       min_length=min_word_length)

    # Escape glob metacharacters in the user-supplied folder name so that a
    # path such as "docs[2020]" is matched literally. The '/**' suffix is
    # kept as plain concatenation so the resulting file paths are spelled
    # exactly as before for ordinary folder names.
    glob_pattern = glob.escape(document_folder) + '/**'
    document_files = [fname
                      for fname in glob.glob(glob_pattern, recursive=True)
                      if os.path.isfile(fname)]

    click.echo()
    click.echo('Processing {} file(s)'.format(len(document_files)))
    click.echo()

    ctx.obj['INDEX_FILE'] = index_file
    ctx.obj['STATS_FILE'] = stats_file
    ctx.obj['DOCUMENT_FILES'] = document_files
    ctx.obj['PREPROCESSOR'] = preprocessor
    ctx.obj['STRIP_HTML_TAGS'] = enable_strip_html_tags
    ctx.obj['STRIP_HTML_ENTITIES'] = enable_strip_html_entities
    ctx.obj['STRIP_SQUARE_BRACKET_TAGS'] = enable_strip_square_bracket_tags
# Sub-command that builds the index with ``create_index_simple``, using the
# file list, preprocessor, output paths and strip_* switches that the ``cli``
# group callback stored in ``ctx.obj``.
# (Documented with '#' comments on purpose: click would publish a docstring
# as this command's --help text.)
@cli.command()
@click.pass_context
def simple(ctx):
    shared = ctx.obj
    preprocessor = shared['PREPROCESSOR']
    index_path = shared['INDEX_FILE']
    stats_path = shared['STATS_FILE']

    click.echo(f'Writing index using simple method to {index_path} and document stats to {stats_path}')
    click.echo('Reading source files')

    create_index_simple(
        shared['DOCUMENT_FILES'],
        preprocessor,
        index_path,
        stats_path,
        strip_html_tags=shared['STRIP_HTML_TAGS'],
        strip_html_entities=shared['STRIP_HTML_ENTITIES'],
        strip_square_bracket_tags=shared['STRIP_SQUARE_BRACKET_TAGS'],
    )
# Sub-command that builds the index with ``create_index_spimi``. Besides the
# shared settings stored in ``ctx.obj`` by the ``cli`` group callback, it
# forwards the --max_tokens_per_block limit to the indexer.
# (Documented with '#' comments on purpose: click would publish a docstring
# as this command's --help text.)
@cli.command()
@click.option('--max_tokens_per_block', default=10000000, show_default=True,
              help='Maximum number of tokens allowed in a single spimi block')
@click.pass_context
def spimi(ctx, max_tokens_per_block):
    shared = ctx.obj
    preprocessor = shared['PREPROCESSOR']
    index_path = shared['INDEX_FILE']
    stats_path = shared['STATS_FILE']

    click.echo(f'Writing index using spimi method to {index_path} and document stats to {stats_path}')
    click.echo('Reading source files')

    create_index_spimi(
        shared['DOCUMENT_FILES'],
        preprocessor,
        index_path,
        stats_path,
        max_tokens_per_block=max_tokens_per_block,
        strip_html_tags=shared['STRIP_HTML_TAGS'],
        strip_html_entities=shared['STRIP_HTML_ENTITIES'],
        strip_square_bracket_tags=shared['STRIP_SQUARE_BRACKET_TAGS'],
    )
@cli.command()
@click.option('--blocksize', default=16, show_default=True,
              help='Size of data the Map Process takes one at a time in Megabyte')
# type=int is required here: with default=None click cannot infer a type and
# would otherwise hand a user-supplied value through as a string.
@click.option('--num_nodes', default=None, type=int, show_default=True,
              help='Number of Processes over which the work load is distributed. Typically defaults to the number of cores')
@click.pass_context
def map_reduce(ctx, blocksize, num_nodes):
    """Build the index using the map_reduce method."""
    # NOTE: the docstring above is deliberately short -- click prints it as
    # this command's --help text.
    #
    # Reads the file list, preprocessor, output paths and strip_* switches
    # stored in ``ctx.obj`` by the ``cli`` group callback and forwards them,
    # together with ``blocksize`` and ``num_nodes``, to
    # ``create_index_map_reduce``. ``num_nodes`` is None when the option is
    # not given.
    preprocessor = ctx.obj['PREPROCESSOR']
    click.echo(f'Writing index using map_reduce method to {ctx.obj["INDEX_FILE"]} and document stats to {ctx.obj["STATS_FILE"]}')
    click.echo('Reading source files')
    create_index_map_reduce(ctx.obj['DOCUMENT_FILES'], preprocessor,
                            ctx.obj['INDEX_FILE'],
                            ctx.obj['STATS_FILE'],
                            strip_html_tags=ctx.obj['STRIP_HTML_TAGS'],
                            strip_html_entities=ctx.obj['STRIP_HTML_ENTITIES'],
                            strip_square_bracket_tags=ctx.obj['STRIP_SQUARE_BRACKET_TAGS'],
                            blocksize=blocksize,
                            num_nodes=num_nodes)
# Script entry point: give click an empty dict as the context object so the
# group callback and the sub-commands can share state through ``ctx.obj``.
if __name__ == '__main__':
    cli(obj=dict())