From 794a444055613082250f6a673ebc0b169aefa873 Mon Sep 17 00:00:00 2001 From: Vol Ky Date: Sat, 15 Oct 2022 16:56:12 +0200 Subject: [PATCH] main: run segmenter on all text at once, add arguments: - --line-by-line is now opt-in - options to choose iterators and their hyperparams for benchmarking --- choppa/__main__.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/choppa/__main__.py b/choppa/__main__.py index ab10d78..7e0bad9 100644 --- a/choppa/__main__.py +++ b/choppa/__main__.py @@ -1,18 +1,42 @@ +import argparse import sys from pathlib import Path from choppa.srx_parser import SrxDocument -from choppa.iterators import SrxTextIterator +import choppa.iterators +def make_iterator(name: str): + return getattr(choppa.iterators, name) + + +parser = argparse.ArgumentParser("choppa") +parser.add_argument("-i", "--iterator", type=str, + choices=["AccurateSrxTextIterator", "SrxTextIterator"], + default="AccurateSrxTextIterator") +parser.add_argument("--max-lookbehind-construct-length", type=int, default=100, + help="Maximum length of a regular expression construct that occurs in lookbehind.") +parser.add_argument("-l", "--line-by-line", action="store_true", + help="Run a separate segmenter on each line of input. 
" + + "Faster if your sentences definitely do not span multiple lines.") +args = parser.parse_args() +args.iterator = make_iterator(args.iterator) + ruleset = Path(__file__).parent / "data/srx/languagetool_segment.srx" SRX_2_XSD = Path(__file__).parent / "data/xsd/srx20.xsd" document = SrxDocument(ruleset=ruleset, validate_ruleset=SRX_2_XSD) if sys.stdin.isatty(): - print('reading from stdin...', file=sys.stderr) + print("reading from stdin...", file=sys.stderr) -for line in sys.stdin: - for text in SrxTextIterator(document, "uk_two", line.strip()): - print(text) +if args.line_by_line: + for line in sys.stdin: + for sentence in args.iterator(document, "uk_two", line.strip(), + max_lookbehind_construct_length=args.max_lookbehind_construct_length): + print(sentence) +else: + whole_input = sys.stdin.read().replace("\n", " ").strip() + for sentence in args.iterator(document, "uk_two", whole_input, + max_lookbehind_construct_length=args.max_lookbehind_construct_length): + print(sentence)