Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue #8, convert hunspell output to spellchecker JSON format #9

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions backend/spellchecker/proto/spellchecker.proto
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ message Text {

message Suggestions {
message Suggestion {
string key = 1;
repeated string values = 2;
repeated string values = 1;
}
map<string, Suggestion> suggestions = 1;
}
54 changes: 47 additions & 7 deletions backend/spellchecker/server/server.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Author: Sergey Sokolov
import time
import argparse

import grpc
from concurrent import futures
Expand All @@ -15,8 +16,8 @@ class SpellcheckerServicer(spellchecker_pb2.SpellcheckServicer):
"""
gRPC service to check text that comes from clients.
"""
def __init__(self):
self.checker = Spellchecker("/usr/share/hunspell", "en_US")
def __init__(self, libparser, dict_path, language):
self.checker = Spellchecker(libparser, dict_path, language)

def CheckText(self, request, context):
"""
Expand All @@ -33,23 +34,26 @@ def CheckText(self, request, context):
return self.checker.check_text(text, languages)


def serve(port: str, max_workers: int = 2):
def serve(port: str, max_workers: int, libparser: str, dict_path: str, language: str):
"""
Initialize and run the gRPC server.

:param port: Port on which the server would be listening.
:param max_workers: Size of thread pool to serve clients.
:param libparser: Path to C parser shared library.
:param dict_path: Path to .dic and .aff files
:param language: Language to add to spellchecker.
"""
server = grpc.server(
futures.ThreadPoolExecutor(max_workers=max_workers)
)

spellchecker_pb2.add_SpellcheckServicer_to_server(
SpellcheckerServicer(),
SpellcheckerServicer(libparser, dict_path, language),
server
)

server.add_insecure_port('[::]:{}'.format(port))
server.add_insecure_port("[::]:{}".format(port))
server.start()

try:
Expand All @@ -58,5 +62,41 @@ def serve(port: str, max_workers: int = 2):
except KeyboardInterrupt:
server.stop(0)

if __name__ == '__main__':
serve(50051)
if __name__ == "__main__":
args_parser = argparse.ArgumentParser(description="Spellchecker gRPC server.")
args_parser.add_argument("-w", "--workers",
action="store",
type=int,
default=2,
metavar="NUM",
help="size of thread pool")

required = args_parser.add_argument_group("required")
required.add_argument("-p", "--port",
required=True,
action="store",
type=int,
metavar="NUM",
help="port which server would be listening")
required.add_argument("-l", "--libparser",
required=True,
action="store",
type=str,
metavar="PATH",
help="path to libparser shared library")
required.add_argument("-d", "--dir",
required=True,
action="store",
type=str,
metavar="PATH",
help="path to .dic and .aff files")
required.add_argument("-L", "--language",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

language is defined in the request, no?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A request from the client? Nope: in the request we ask the spellchecker to use those languages, and that demand is satisfied only if the spellchecker already has a hunspell instance initialized with the dictionaries for those languages.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which means that we're restricted to one language, since this argument is single-valued.

Why don't you initialize a hunspell instance on demand, with the language requested by the client?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alrighty then, I'll work that out.

required=True,
action="store",
type=str,
metavar="LANG",
help="dictionary language")

args = args_parser.parse_args()

serve(args.port, args.workers, args.libparser, args.dir, args.language)
25 changes: 12 additions & 13 deletions backend/spellchecker/server/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ class Spellchecker:
Class contains multiple hunspell instances (one per language) and provides
text spellchecking function using all chosen dictionaries.
"""
def __init__(self, path: str = '', language: str ='', log: object = None):
self._hunspell_instances = {}
self._parser = Parser("../libparser/build/libparser.so")
self._log = log
def __init__(self, libparser: str, path: str = '', language: str ='', log: object = None):
self.hunspell_instances = {}
self.parser = Parser(libparser)
self.log = log

if path is not '':
self.add_dictionary(path, language)
Expand All @@ -28,21 +28,21 @@ def add_dictionary(self, path: str, language: str) -> None:
:param path: Path to .dic and .aff files.
:param language: Language that corresponds .dic and .aff files.
"""
if language in self._hunspell_instances:
if language in self.hunspell_instances:
return

dictionary = "{}/{}.dic".format(path, language)
if not os.path.isfile(dictionary):
if self._log:
self._log.write("File not found: {}".format(dictionary))
if self.log:
self.log.write("File not found: {}".format(dictionary))
return
affix = "{}/{}.aff".format(path, language)
if not os.path.isfile(affix):
if self._log:
self._log.write("File not found: {}".format(affix))
if self.log:
self.log.write("File not found: {}".format(affix))
return

self._hunspell_instances[language] = hunspell.HunSpell(dictionary, affix)
self.hunspell_instances[language] = hunspell.HunSpell(dictionary, affix)

def check_text(self, text: str, languages) -> str:
"""
Expand All @@ -53,13 +53,12 @@ def check_text(self, text: str, languages) -> str:
:return: JSON with suggestions for misspelled words.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is no longer accurate, and neither is the "return type".

"""
suggestions_map = Suggestions()
hunspells = (self._hunspell_instances[lang] for lang in languages if lang in self._hunspell_instances)
tokens = self._parser.tokenize(text)
hunspells = (self.hunspell_instances[lang] for lang in languages if lang in self.hunspell_instances)
tokens = self.parser.tokenize(text)

for h in hunspells:
for token in tokens:
if not h.spell(token):
suggestions_map.suggestions[token].values.extend(h.suggest(token))

# return ujson.dumps(suggestions)
return suggestions_map