From ff3fa040befbc7ae9205fbd2b73cdebaee662460 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 22 Feb 2024 21:38:32 +0100 Subject: [PATCH 1/5] feat: new test --- test/empty-speech.py | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 test/empty-speech.py diff --git a/test/empty-speech.py b/test/empty-speech.py new file mode 100644 index 0000000000..6160f17a58 --- /dev/null +++ b/test/empty-speech.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +""" +Test there are no empty speeches. +""" +from .pytestconfig import fetch_config +from datetime import datetime +from lxml import etree +from pyriksdagen.utils import ( + parse_protocol, + protocol_iterators, +) +from tqdm import tqdm +import pandas as pd +import unittest +import warnings + + +class EmptyElement(Warning): + + def __init__(self, m): + self.message = m + + def __str__(self): + return self.message + + +class Test(unittest.TestCase): + + def protocol_iter(self): + """ + Get protocols. + """ + return sorted(list(protocol_iterators("corpus/protocols/", start=1867, end=2022))) + + + def test_no_empty_speech(self): + """ + Test protocol has no empty `u` or `seg` elements + """ + rows = [] + protocols = self.protocol_iter() + for p in tqdm(protocols, total=len(protocols)): + root, ns = parse_protocol(p, get_ns=True) + for elem in root.iter(f'{ns["tei_ns"]}u'): + if len(elem) == 0: + if f'{ns["xml_ns"]}id' in elem.attrib: + u_id = elem.attrib[f'{ns["xml_ns"]}id'] + rows.append([p, "u", u_id]) + warnings.warn(f"Empty u-elem: {p}, {u_id}", EmptyElement) + else: + print("oh no, U") + else: + for seg in elem: + if not seg.text or seg.text.strip() == '': + if f'{ns["xml_ns"]}id' in seg.attrib: + seg_id = seg.attrib[f'{ns["xml_ns"]}id'] + rows.append([p, "seg", seg_id]) + warnings.warn(f"Empty seg-elem: {p}, {seg_id}", EmptyElement) + else: + print("oh no, SEG") + if len(rows) > 0: + config = fetch_config("empty-speech") + if config and config["write_empty_speeches"]: + now = datetime.now().strftime('%Y%m%d-%H%M%S') + cols = ["protocol", "elem", "elem_id"] + df = pd.DataFrame(rows, columns=cols) + df.to_csv( + f"{config['test_out_path']}empty-speech_{now}.csv", + sep=';', + index=False) + + self.assertEqual(len(rows), 0) + +if __name__ == '__main__': + unittest.main() From ed5c349133173d06888a5ede51e84a7ca62afe3c Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 22 Feb 2024 21:39:18 +0100 Subject: [PATCH 2/5] feat: get outpath for all test results --- test/pytestconfig.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/pytestconfig.py b/test/pytestconfig.py index 34c057b9e2..dd819af52a 100644 --- a/test/pytestconfig.py +++ b/test/pytestconfig.py @@ -18,6 +18,8 @@ def fetch_config(test): try: with open("test/_test_config/test.json", 'r') as j: d = json.load(j) + config = d[test] + config['test_out_path'] = d['test_out_path'] return d[test] except: return None From fa4e149f55a85ad39079ae362319fda528134d08 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 22 Feb 2024 21:40:04 +0100 Subject: [PATCH 3/5] feat: run empty speeches test on push --- .github/workflows/push.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index f6e8358865..dfad8e0c32 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -68,6 +68,26 @@ jobs: run: | python -m unittest test.chairs + empty-speeches: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install . + pip install PyPDF2 + - name: Test there are no empty u or seg elements + run: | + python -m unittest test.empty-speeches + mp: runs-on: ubuntu-latest strategy: From fac4e4b4518e25d0af4b0aca1ef46a83d571ac17 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 22 Feb 2024 21:53:12 +0100 Subject: [PATCH 4/5] style: cleaning up formatting --- test/empty-speech.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/test/empty-speech.py b/test/empty-speech.py index 6160f17a58..193b6c04d5 100644 --- a/test/empty-speech.py +++ b/test/empty-speech.py @@ -15,6 +15,8 @@ import warnings + + class EmptyElement(Warning): def __init__(self, m): @@ -24,21 +26,18 @@ def __str__(self): return self.message -class Test(unittest.TestCase): - def protocol_iter(self): - """ - Get protocols. - """ - return sorted(list(protocol_iterators("corpus/protocols/", start=1867, end=2022))) +class Test(unittest.TestCase): def test_no_empty_speech(self): """ Test protocol has no empty `u` or `seg` elements """ rows = [] - protocols = self.protocol_iter() + protocols = sorted(list(protocol_iterators("corpus/protocols/", + start=1867, + end=2022))) for p in tqdm(protocols, total=len(protocols)): root, ns = parse_protocol(p, get_ns=True) for elem in root.iter(f'{ns["tei_ns"]}u'): @@ -47,8 +46,6 @@ def test_no_empty_speech(self): u_id = elem.attrib[f'{ns["xml_ns"]}id'] rows.append([p, "u", u_id]) warnings.warn(f"Empty u-elem: {p}, {u_id}", EmptyElement) - else: - print("oh no, U") else: for seg in elem: if not seg.text or seg.text.strip() == '': @@ -56,8 +53,6 @@ def test_no_empty_speech(self): seg_id = seg.attrib[f'{ns["xml_ns"]}id'] rows.append([p, "seg", seg_id]) warnings.warn(f"Empty seg-elem: {p}, {seg_id}", EmptyElement) - else: - print("oh no, SEG") if len(rows) > 0: config = fetch_config("empty-speech") if config and config["write_empty_speeches"]: @@ -71,5 +66,8 @@ def test_no_empty_speech(self): self.assertEqual(len(rows), 0) + + + if __name__ == '__main__': unittest.main() From dd8998629df06a034c6ae509471877e7eac7dcf9 Mon Sep 17 00:00:00 2001 From: Bob Borges Date: Thu, 22 Feb 2024 22:24:15 +0100 Subject: [PATCH 5/5] fix: filename --- .github/workflows/push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index dfad8e0c32..1df672eaad 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -68,7 +68,7 @@ jobs: run: | python -m unittest test.chairs - empty-speeches: + empty-speech: runs-on: ubuntu-latest strategy: matrix: @@ -86,7 +86,7 @@ jobs: pip install PyPDF2 - name: Test there are no empty u or seg elements run: | - python -m unittest test.empty-speeches + python -m unittest test.empty-speech mp: runs-on: ubuntu-latest