Enumerate entry/sense/synset ids for validation

Fixes #228
goodmami · Dec 11, 2024 · 7921233 · 7921233
1 parent c994588
commit 7921233
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,10 @@
 * `Sense.relation_map()` method ([#216])
 * `Synset.relation_map()` method ([#167], [#216])
 
+### Fixed
+
+* Enumerate repeated entry, sense, synset IDs for validation ([#228])
+
 
 ## [v0.10.1]
 
@@ -706,3 +710,4 @@ abandoned, but this is an entirely new codebase.
 [#215]: https://github.com/goodmami/wn/issues/215
 [#216]: https://github.com/goodmami/wn/issues/216
 [#221]: https://github.com/goodmami/wn/issues/221
+[#228]: https://github.com/goodmami/wn/issues/228
diff --git a/tests/data/E101-0.xml b/tests/data/E101-0.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
+<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+<!-- duplicate ID in lexical entries -->
+
+  <Lexicon id="test-e101"
+           label="Testing E101"
+           language="en"
+           email="[email protected]"
+           license="https://creativecommons.org/licenses/by/4.0/"
+           version="1">
+
+    <LexicalEntry id="test-e101-foo-n">
+      <Lemma partOfSpeech="n" writtenForm="foo" />
+      <Sense id="test-e101-foo" synset="test-e101-01-n" />
+    </LexicalEntry>
+
+    <LexicalEntry id="test-e101-foo-n">
+      <Lemma partOfSpeech="n" writtenForm="foo2" />
+      <Sense id="test-e101-foo2" synset="test-e101-01-n" />
+    </LexicalEntry>
+
+    <Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
+
+  </Lexicon>
+
+</LexicalResource>
diff --git a/tests/data/E101-1.xml b/tests/data/E101-1.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
+<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+<!-- duplicate ID in senses -->
+
+  <Lexicon id="test-e101"
+           label="Testing E101"
+           language="en"
+           email="[email protected]"
+           license="https://creativecommons.org/licenses/by/4.0/"
+           version="1">
+
+    <LexicalEntry id="test-e101-foo-n">
+      <Lemma partOfSpeech="n" writtenForm="foo" />
+      <Sense id="test-e101-foo" synset="test-e101-01-n" />
+      <Sense id="test-e101-foo" synset="test-e101-02-n" />
+    </LexicalEntry>
+
+    <Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
+    <Synset id="test-e101-02-n" ili="i12346" partOfSpeech="n" />
+
+  </Lexicon>
+
+</LexicalResource>
diff --git a/tests/data/E101-2.xml b/tests/data/E101-2.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
+<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+<!-- duplicate ID in synsets -->
+
+  <Lexicon id="test-e101"
+           label="Testing E101"
+           language="en"
+           email="[email protected]"
+           license="https://creativecommons.org/licenses/by/4.0/"
+           version="1">
+
+    <LexicalEntry id="test-e101-foo-n">
+      <Lemma partOfSpeech="n" writtenForm="foo" />
+      <Sense id="test-e101-foo" synset="test-e101-01-n" />
+    </LexicalEntry>
+
+    <Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
+    <Synset id="test-e101-01-n" ili="i12346" partOfSpeech="n" />
+
+  </Lexicon>
+
+</LexicalResource>
diff --git a/tests/data/E101-3.xml b/tests/data/E101-3.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE LexicalResource SYSTEM "http://globalwordnet.github.io/schemas/WN-LMF-1.0.dtd">
+<LexicalResource xmlns:dc="http://purl.org/dc/elements/1.1/">
+
+<!-- duplicate ID in different entity types -->
+
+  <Lexicon id="test-e101"
+           label="Testing E101"
+           language="en"
+           email="[email protected]"
+           license="https://creativecommons.org/licenses/by/4.0/"
+           version="1">
+
+    <LexicalEntry id="test-e101-foo-n">
+      <Lemma partOfSpeech="n" writtenForm="foo" />
+      <Sense id="test-e101-foo-n" synset="test-e101-01-n" />
+    </LexicalEntry>
+
+    <Synset id="test-e101-01-n" ili="i12345" partOfSpeech="n" />
+
+  </Lexicon>
+
+</LexicalResource>
diff --git a/tests/validate_test.py b/tests/validate_test.py
@@ -0,0 +1,19 @@
+from collections.abc import Sequence
+from pathlib import Path
+
+from wn import lmf
+from wn.validate import validate
+
+
+def _assert_invalid(select: str, path: Path) -> None:
+    lex = lmf.load(path, progress_handler=None)["lexicons"][0]
+    report = validate(lex, select=[select], progress_handler=None)
+    print(report)
+    assert len(report[select]["items"]) > 0
+
+
+def test_E101(datadir):
+    _assert_invalid("E101", datadir / "E101-0.xml")
+    _assert_invalid("E101", datadir / "E101-1.xml")
+    _assert_invalid("E101", datadir / "E101-2.xml")
+    _assert_invalid("E101", datadir / "E101-3.xml")
diff --git a/wn/validate.py b/wn/validate.py
@@ -57,9 +57,9 @@ def _non_unique_id(lex: lmf.Lexicon, ids: _Ids) -> _Result:
         [lex['id']],
         (f['id'] for e in _entries(lex) for f in _forms(e) if f.get('id')),
         (sb['id'] for sb in lex.get('frames', []) if sb.get('id')),
-        ids['entry'],
-        ids['sense'],
-        ids['synset'],
+        ids['entry'].elements(),
+        ids['sense'].elements(),
+        ids['synset'].elements(),
     ))