-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
- Loading branch information
Showing 5 changed files with 175 additions and 20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
from __future__ import print_function, unicode_literals, division | ||
|
||
import logging | ||
from pathlib import Path | ||
from collections import defaultdict | ||
from gensim.models import Word2Vec | ||
from preshed.counter import PreshCounter | ||
import plac | ||
import spacy | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Corpus(object):
    """Iterable over the raw text of every file found under a directory.

    Besides iteration, the object accumulates token frequencies, keyed by
    each token's ``orth`` attribute, through ``count_doc``.
    """

    def __init__(self, directory, min_freq=10):
        """Remember the corpus location and create empty bookkeeping state.

        directory: Location passed to ``iter_dir`` when iterating.
        min_freq: Stored on the instance; this class never reads it back.
        """
        self.min_freq = min_freq
        self.directory = directory
        # NOTE(review): nothing in this class ever adds entries to `strings`.
        self.strings = {}
        self.counts = PreshCounter()

    def count_doc(self, doc):
        """Add one to the count of every token's ``orth`` in ``doc``.

        Returns ``len(doc)``.
        """
        for token in doc:
            self.counts.inc(token.orth, 1)
        return len(doc)

    def __iter__(self):
        """Yield the complete UTF-8 decoded contents of each file in turn."""
        # NOTE(review): each item is one whole file as a single string, not a
        # list of tokens -- confirm that is what the consumer expects.
        for path in iter_dir(self.directory):
            with path.open("r", encoding="utf-8") as handle:
                contents = handle.read()
            yield contents
|
||
|
||
def iter_dir(loc):
    """Yield paths found under ``loc``, descending exactly one level.

    Entries directly inside ``loc`` that are not directories are yielded as
    they are. For an entry that is a directory, each of its immediate
    children is yielded instead, without checking whether that child is
    itself a directory.
    """
    for entry in Path(loc).iterdir():
        if not entry.is_dir():
            yield entry
            continue
        for child in entry.iterdir():
            yield child
|
||
|
||
@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    """Count tokens in ``in_dir``, train a Word2Vec model and save it.

    Every file returned by ``iter_dir`` is read and tokenized with a blank
    spaCy pipeline for ``lang``, and its token frequencies are accumulated
    in a ``Corpus``. Tokens seen at least ``min_count`` times seed the
    model's raw vocabulary. The model is then trained on the corpus and
    written to ``out_loc``.
    """
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s"
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)

    # First pass over the files: gather token counts and a newline tally.
    word_total = 0
    newline_total = 0
    for batch_no, path in enumerate(iter_dir(corpus.directory)):
        with path.open("r", encoding="utf-8") as handle:
            raw_text = handle.read()
        newline_total += raw_text.count("\n")
        word_total += corpus.count_doc(nlp(raw_text))
        # NOTE(review): nothing visible here ever fills corpus.strings, so the
        # "word types" figure looks like it will always be 0 -- confirm.
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            batch_no,
            word_total,
            len(corpus.strings),
        )

    # The newline tally is what gets handed to the model as its corpus count.
    model.corpus_count = newline_total
    model.raw_vocab = defaultdict(int)
    for orth_id, count in corpus.counts:
        if count < min_count:
            continue
        # Map the orth key back to its string via the pipeline's string store.
        model.raw_vocab[nlp.vocab.strings[orth_id]] = count
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
|
||
|
||
if __name__ == "__main__":
    # Run main through plac only when this file is executed as a script.
    plac.call(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters