Skip to content

Commit

Permalink
Enumerate entry/sense/synset ids for validation
Browse files Browse the repository at this point in the history
Fixes #228
  • Loading branch information
goodmami committed Dec 11, 2024
1 parent c994588 commit 7921233
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 3 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
* `Sense.relation_map()` method ([#216])
* `Synset.relation_map()` method ([#167], [#216])

## Fixed

* Enumerate repeated entry, sense, synset IDs for validation ([#228])


## [v0.10.1]

Expand Down Expand Up @@ -706,3 +710,4 @@ abandoned, but this is an entirely new codebase.
[#215]: https://github.com/goodmami/wn/issues/215
[#216]: https://github.com/goodmami/wn/issues/216
[#221]: https://github.com/goodmami/wn/issues/221
[#228]: https://github.com/goodmami/wn/issues/228
28 changes: 28 additions & 0 deletions tests/data/E101-0.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- duplicate ID in lexical entries -->

<Lexicon id="test-e101"
label="Testing E101"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-e101-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-e101-foo" synset="test-e101-01-n" />
</LexicalEntry>

<LexicalEntry id="test-e101-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo2" />
<Sense id="test-e101-foo2" synset="test-e101-01-n" />
</LexicalEntry>

<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />

</Lexicon>

</LexicalResource>
25 changes: 25 additions & 0 deletions tests/data/E101-1.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- duplicate ID in senses -->

<Lexicon id="test-e101"
label="Testing E101"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-e101-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-e101-foo" synset="test-e101-01-n" />
<Sense id="test-e101-foo" synset="test-e101-02-n" />
</LexicalEntry>

<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
<Synset id="test-e101-02-n" ili="i12346" partOfSpeech="n" />

</Lexicon>

</LexicalResource>
24 changes: 24 additions & 0 deletions tests/data/E101-2.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

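<!-- duplicate ID in synsets (NOTE: the sense ID below also repeats the lexical entry ID, so this file does not isolate the synset case) -->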

<Lexicon id="test-e101"
label="Testing E101"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-e101-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-e101-foo-n" synset="test-e101-01-n" />
</LexicalEntry>

<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
<Synset id="test-e101-01-n" ili="i12346" partOfSpeech="n" />

</Lexicon>

</LexicalResource>
23 changes: 23 additions & 0 deletions tests/data/E101-3.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">

<!-- duplicate ID in different entity types -->

<Lexicon id="test-e101"
label="Testing E101"
language="en"
email="[email protected]"
license="https://creativecommons.org/licenses/by/4.0/"
version="1">

<LexicalEntry id="test-e101-foo-n">
<Lemma partOfSpeech="n" writtenForm="foo" />
<Sense id="test-e101-foo-n" synset="test-e101-01-n" />
</LexicalEntry>

<Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />

</Lexicon>

</LexicalResource>
19 changes: 19 additions & 0 deletions tests/validate_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from collections.abc import Sequence
from pathlib import Path

from wn import lmf
from wn.validate import validate


def _assert_invalid(select: str, path: Path) -> None:
    """Assert that validating the lexicon at *path* flags check *select*.

    The first lexicon of the file loaded from *path* is validated with
    only the check code *select* enabled, and the resulting report must
    list at least one item under that code.
    """
    resource = lmf.load(path, progress_handler=None)
    lexicon = resource["lexicons"][0]
    report = validate(lexicon, select=[select], progress_handler=None)
    # Echo the full report to stdout before asserting on it.
    print(report)
    flagged = report[select]["items"]
    assert len(flagged) > 0


def test_E101(datadir):
    """Check that every E101 fixture file is reported under code E101."""
    fixture_names = (
        "E101-0.xml",
        "E101-1.xml",
        "E101-2.xml",
        "E101-3.xml",
    )
    # Fixtures are checked in order; the first failing one stops the test.
    for fixture_name in fixture_names:
        _assert_invalid("E101", datadir / fixture_name)
6 changes: 3 additions & 3 deletions wn/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ def _non_unique_id(lex: lmf.Lexicon, ids: _Ids) -> _Result:
[lex['id']],
(f['id'] for e in _entries(lex) for f in _forms(e) if f.get('id')),
(sb['id'] for sb in lex.get('frames', []) if sb.get('id')),
ids['entry'],
ids['sense'],
ids['synset'],
ids['entry'].elements(),
ids['sense'].elements(),
ids['synset'].elements(),
))


Expand Down

0 comments on commit 7921233

Please sign in to comment.