Skip to content

Commit

Permalink
Add validation for synset definitions and examples (#230)
Browse files Browse the repository at this point in the history
- W305 blank definition
- W306 blank example
- W307 repeated definition

Resolves #151
  • Loading branch information
goodmami authored Dec 11, 2024
1 parent 2535f4b commit 60266f2
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 11 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
* `Relation` class ([#216])
* `Sense.relation_map()` method ([#216])
* `Synset.relation_map()` method ([#167], [#216])
* `W305` blank definition on synset validation ([#151])
* `W306` blank example on synset validation ([#151])
* `W307` repeated definition on synset validation ([#151])

## Fixed

Expand Down Expand Up @@ -687,6 +690,7 @@ abandoned, but this is an entirely new codebase.
[#146]: https://github.com/goodmami/wn/issues/146
[#147]: https://github.com/goodmami/wn/issues/147
[#148]: https://github.com/goodmami/wn/issues/148
[#151]: https://github.com/goodmami/wn/issues/151
[#152]: https://github.com/goodmami/wn/issues/152
[#154]: https://github.com/goodmami/wn/issues/154
[#155]: https://github.com/goodmami/wn/issues/155
Expand Down
27 changes: 27 additions & 0 deletions tests/data/W305-0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- blank definition in synset -->

<Lexicon id="test-w305"
label="Testing W305"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-w305-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-w305-foo-n" synset="test-w305-01-n" />
</LexicalEntry>

<Synset id="test-w305-01-n" ili="i12345" partOfSpeech="n">
<Definition>

</Definition>
</Synset>

</Lexicon>

</LexicalResource>
27 changes: 27 additions & 0 deletions tests/data/W306-0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- blank example in synset -->

<Lexicon id="test-w306"
label="Testing W306"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-w306-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-w306-foo-n" synset="test-w306-01-n" />
</LexicalEntry>

<Synset id="test-w306-01-n" ili="i12345" partOfSpeech="n">
<Example>

</Example>
</Synset>

</Lexicon>

</LexicalResource>
30 changes: 30 additions & 0 deletions tests/data/W307-0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- repeated definition in synset -->

<Lexicon id="test-w307"
label="Testing W307"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-w307-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-w307-foo-1-n" synset="test-w307-01-n" />
<Sense id="test-w307-foo-2-n" synset="test-w307-02-n" />
</LexicalEntry>

<Synset id="test-w307-01-n" ili="i12345" partOfSpeech="n">
<Definition>foo</Definition>
</Synset>

<Synset id="test-w307-02-n" ili="i12346" partOfSpeech="n">
<Definition>foo</Definition>
</Synset>

</Lexicon>

</LexicalResource>
28 changes: 17 additions & 11 deletions tests/validate_test.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
from pathlib import Path
import pytest

from wn import lmf
from wn.validate import validate

# Parametrization table for the validation tests.
# Each entry is (check code, fixture index); the test loads the fixture
# named "{code}-{i}.xml" from the data directory and runs only that check.
tests = [
("E101", 0),
("E101", 1),
("E101", 2),
("E101", 3),
("W305", 0),
("W306", 0),
("W307", 0),
]
# Human-readable pytest ids mirroring the fixture file stems, e.g. "E101-0".
test_ids = [f"{code}-{i}" for code, i in tests]

def _assert_invalid(select: str, path: Path) -> None:

@pytest.mark.parametrize("code,i", tests, ids=test_ids)
def test_validate(datadir, code: str, i: int) -> None:
path = datadir / f"{code}-{i}.xml"
lex = lmf.load(path, progress_handler=None)["lexicons"][0]
report = validate(lex, select=[select], progress_handler=None)
report = validate(lex, select=[code], progress_handler=None)
print(report)
assert len(report[select]["items"]) > 0


def test_E101(datadir):
_assert_invalid("E101", datadir / "E101-0.xml")
_assert_invalid("E101", datadir / "E101-1.xml")
_assert_invalid("E101", datadir / "E101-2.xml")
_assert_invalid("E101", datadir / "E101-3.xml")
assert len(report[code]["items"]) > 0
34 changes: 34 additions & 0 deletions wn/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
W302 ILI is repeated across synsets.
W303 Proposed ILI is missing a definition.
W304 Existing ILI has a spurious definition.
W305 Synset has a blank definition.
W306 Synset has a blank example.
W307 Synset repeats an existing definition.
E401 Relation target is missing or invalid.
W402 Relation type is invalid for the source and target.
W403 Redundant relation between source and target.
Expand Down Expand Up @@ -125,6 +128,34 @@ def _spurious_ili_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
if ss['ili'] and ss['ili'] != 'in' and ss.get('ili_definition')}


def _blank_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset has a blank definition"""
    # NOTE(review): docstring kept verbatim -- it may be read at runtime
    # (e.g. as the check's report message); confirm before rewording.
    #
    # Collect the id of every synset with at least one definition whose
    # text is empty or whitespace-only. `ids` is unused by this check.
    flagged = {}
    for synset in _synsets(lex):
        for definition in synset.get("definitions", []):
            if not definition["text"].strip():
                flagged[synset["id"]] = {}
                break  # one blank definition is enough to flag the synset
    return flagged

def _blank_synset_example(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset has a blank example"""
    # NOTE(review): docstring kept verbatim -- it may be read at runtime
    # (e.g. as the check's report message); confirm before rewording.
    #
    # Collect the id of every synset with at least one example whose text
    # is empty or whitespace-only. `ids` is unused by this check.
    flagged = {}
    for synset in _synsets(lex):
        for example in synset.get("examples", []):
            if not example["text"].strip():
                flagged[synset["id"]] = {}
                break  # one blank example is enough to flag the synset
    return flagged


def _repeated_synset_definition(lex: lmf.Lexicon, ids: _Ids) -> _Result:
    """synset repeats an existing definition"""
    # NOTE(review): docstring kept verbatim -- it may be read at runtime
    # (e.g. as the check's report message); confirm before rewording.
    #
    # Pass 1: feed every definition text in the lexicon to `_multiples`;
    # its result is only used below for membership tests.
    all_texts = (
        definition["text"]
        for synset in _synsets(lex)
        for definition in synset.get("definitions", [])
    )
    duplicated = _multiples(all_texts)

    # Pass 2: flag each synset that carries at least one of those texts.
    # Texts are compared exactly as stored (no stripping or normalization).
    flagged = {}
    for synset in _synsets(lex):
        for definition in synset.get("definitions", []):
            if definition["text"] in duplicated:
                flagged[synset["id"]] = {}
                break  # one repeated definition is enough to flag the synset
    return flagged


def _missing_relation_target(lex: lmf.Lexicon, ids: _Ids) -> _Result:
"""relation target is missing or invalid"""
result = {s['id']: {'type': r['relType'], 'target': r['target']}
Expand Down Expand Up @@ -253,6 +284,9 @@ def _get_dc_type(r: lmf.Relation) -> Optional[str]:
'W302': _repeated_ili,
'W303': _missing_ili_definition,
'W304': _spurious_ili_definition,
'W305': _blank_synset_definition,
'W306': _blank_synset_example,
'W307': _repeated_synset_definition,
# 400 - relations
'E401': _missing_relation_target,
'W402': _invalid_relation_type,
Expand Down

0 comments on commit 60266f2

Please sign in to comment.