Skip to content

Commit

Permalink
Update submodule
Browse files Browse the repository at this point in the history
  • Loading branch information
p-goulart committed Jan 12, 2024
1 parent b108082 commit 2629a7a
Show file tree
Hide file tree
Showing 16 changed files with 20 additions and 576 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
path: ${{ env.PT_DICT_HOME }}
repository: ${{ env.GITHUB_ACTION_REPOSITORY }}
fetch-depth: 0
submodules: recursive
- name: Get number of CPU cores
uses: SimenB/github-actions-cpu-cores@v1
id: cpu-cores
Expand Down Expand Up @@ -88,11 +89,11 @@ jobs:
- name: Build POS tagging dictionary
working-directory: ${{ env.PT_DICT_HOME }}
run: |
poetry run python "./pt_dict/scripts/build_tagger_dicts.py"
poetry run python "./dictionary-tools/scripts/build_tagger_dicts.py"
- name: Build spelling dictionaries
working-directory: ${{ env.PT_DICT_HOME }}
run: |
mkdir -p "${{ env.LT_TMP_DIR }}/compounds"
poetry run python "./pt_dict/scripts/build_spelling_dicts.py" \
poetry run python "./dictionary-tools/scripts/build_spelling_dicts.py" \
--tmp-dir "${{ env.LT_TMP_DIR }}" \
--max-threads "${{ steps.cpu-cores.outputs.count }}"
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "dict_tools"]
path = dict_tools
url = https://git@github.com/languagetool-org/dictionary-tools.git
1 change: 1 addition & 0 deletions dict_tools
Submodule dict_tools added at 53a0b7
10 changes: 3 additions & 7 deletions pt_dict/console_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from spylls.hunspell import Dictionary

from pt_dict.constants import HUNSPELL_DIR
from pt_dict.variants.variant import PT_BR, Variant, PT_PT_90, PT_PT_45
from dict_tools.lib.variant import Variant, PT_BR, PT_PT_90, PT_PT_45


class ConsoleUtils:
Expand Down Expand Up @@ -36,14 +36,10 @@ def normal_mode(self):
self.mode = "main"

def load_dictionary(self, variant: Variant) -> Dictionary:
if variant.country == 'BR':
variant_code = variant.underscored
else:
variant_code = variant.underscored_with_agreement
if self.mode == 'compounds':
dict_path = path.join(HUNSPELL_DIR, "compounds", variant_code)
dict_path = path.join(HUNSPELL_DIR, "compounds", variant.underscored)
else:
dict_path = path.join(HUNSPELL_DIR, variant_code)
dict_path = path.join(HUNSPELL_DIR, variant.underscored)
return Dictionary.from_files(dict_path)

def load_dictionaries(self):
Expand Down
48 changes: 3 additions & 45 deletions pt_dict/constants.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,13 @@
import logging
from os import path, environ
from os import path
import pathlib

from pt_dict.logger import Logger

LT_VER = "6.4-SNAPSHOT"
LATIN_1_ENCODING = 'ISO-8859-1'
from dict_tools.lib.constants import DATA_DIR

# Paths
REPO_DIR = pathlib.Path(path.dirname(path.abspath(__file__))).parent
LT_HOME = environ.get('LT_HOME')
LT_DIR = path.join(pathlib.Path(REPO_DIR).parent, "languagetool") if LT_HOME is None else LT_HOME
RESOURCE_DIR = path.join(LT_DIR, "languagetool-language-modules/pt/src/main/resources/org/languagetool/resource/pt")
RULES_DIR = path.join(LT_DIR, "languagetool-language-modules/pt/src/main/resources/org/languagetool/rules/pt")
DATA_DIR = path.join(REPO_DIR, 'data')
SPELLING_DICT_DIR = path.join(DATA_DIR, "spelling-dict")
HUNSPELL_DIR = path.join(SPELLING_DICT_DIR, "hunspell")
TAGGER_DICT_DIR = path.join(DATA_DIR, "src-dict")
TAGGER_SCRIPTS_DIR = path.join(REPO_DIR, "pos_tagger_scripts")
TAGGER_BUILD_SCRIPT_PATH = path.join(TAGGER_SCRIPTS_DIR, "build-lt.sh")
PT_REPO_DIR = pathlib.Path(path.dirname(path.abspath(__file__))).parent
SYLLABLES_FILEPATH = path.join(DATA_DIR, 'misc', 'syllables.tsv')
ALTERNATIONS_DIR = path.join(DATA_DIR, 'alternations')
PT_BR_ALTERNATIONS_FILEPATH = path.join(ALTERNATIONS_DIR, 'pt_br.txt')
SILENT_LETTER_ALTERNATIONS_FILEPATH = path.join(ALTERNATIONS_DIR, 'silent_letters.tsv')
PT_45_90_ALTERNATIONS_FILEPATH = path.join(ALTERNATIONS_DIR, 'pt_45_90.tsv')
COMPOUNDS_FILEPATH = path.join(RESOURCE_DIR, "post-reform-compounds.txt")
COMPOUNDS_DIR = path.join(HUNSPELL_DIR, 'compounds')
TO_ADD_DIR = path.join(DATA_DIR, "to_add")

RESULTS_DIR = path.join(REPO_DIR, 'results')
JAVA_RESULTS_DIR = path.join(RESULTS_DIR, 'java-lt')
LT_RESULTS_DIR = path.join(RESULTS_DIR, 'lt')
FDIC_DIR = path.join(TAGGER_SCRIPTS_DIR, "fdic-to-lt")
RESULT_POS_DICT_FILEPATH = path.join(LT_RESULTS_DIR, "dict.txt")
SORTED_POS_DICT_FILEPATH = path.join(LT_RESULTS_DIR, "dict_sorted.txt")
POS_DICT_DIFF_FILEPATH = path.join(LT_RESULTS_DIR, "dict.diff")
OLD_POS_DICT_FILEPATH = path.join(LT_RESULTS_DIR, "dict.old")
COMPILED_POS_DICT_FILEPATH = path.join(JAVA_RESULTS_DIR, )
JAVA_OUTPUT_DIR = path.join(JAVA_RESULTS_DIR, "src/main/resources/org/languagetool/resource/pt")
SPELLING_OUTPUT_DIR = path.join(JAVA_OUTPUT_DIR, "spelling")
POS_DICT_JAVA_OUTPUT_PATH = path.join(JAVA_OUTPUT_DIR, "portuguese.dict")
POS_INFO_JAVA_INPUT_PATH = path.join(TAGGER_DICT_DIR, "portuguese.info")
POS_INFO_JAVA_OUTPUT_PATH = path.join(JAVA_OUTPUT_DIR, "portuguese.info")
SYNTH_DICT_JAVA_OUTPUT_PATH = path.join(JAVA_OUTPUT_DIR, "portuguese_synth.dict")
SYNTH_INFO_JAVA_OUTPUT_PATH = path.join(JAVA_OUTPUT_DIR, "portuguese_synth.info")
SYNTH_INFO_JAVA_INPUT_PATH = path.join(TAGGER_DICT_DIR, "portuguese_synth.info")
LT_JAR_PATH = path.join(LT_DIR, 'languagetool-standalone', 'target', f"LanguageTool-{LT_VER}", f"LanguageTool-{LT_VER}",
'languagetool.jar')
LT_JAR_WITH_DEPS_PATH = path.join(LT_DIR, "languagetool-dev", "target",
f"languagetool-dev-{LT_VER}-jar-with-dependencies.jar")

logging.setLoggerClass(Logger)
LOGGER = logging.getLogger('build_spelling_dicts')
LOGGER.setLevel(logging.DEBUG)
4 changes: 2 additions & 2 deletions pt_dict/dicts/hunspell.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

from pt_dict.constants import LATIN_1_ENCODING
from pt_dict.dicts.dictionary import Dictionary
from pt_dict.variants.variant import VARIANTS
from dict_tools.lib.variant import VARIANT_MAPPING


class HunspellDict(Dictionary):
pattern = re.compile('^([^/\\t#]+)(/|$|\\t)')

def collect_lemmata(self, split_compounds=False):
for variant in VARIANTS:
for variant in VARIANT_MAPPING.get('pt'):
self.collect_lemmata_from_file(variant.dic(), self.pattern, split_compounds, encoding=LATIN_1_ENCODING,
offset=1)
return self.lemmata
28 changes: 0 additions & 28 deletions pt_dict/logger.py

This file was deleted.

Loading

0 comments on commit 2629a7a

Please sign in to comment.