diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index f6e8358865..1df672eaad 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -68,6 +68,26 @@ jobs:
       run: |
         python -m unittest test.chairs
 
+  empty-speech:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.8]
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install .
+        pip install PyPDF2
+    - name: Test there are no empty u or seg elements
+      run: |
+        python -m unittest test.empty-speech
+
   mp:
     runs-on: ubuntu-latest
     strategy:
diff --git a/test/empty-speech.py b/test/empty-speech.py
new file mode 100644
index 0000000000..193b6c04d5
--- /dev/null
+++ b/test/empty-speech.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""
+Test there are no empty speeches.
+"""
+from .pytestconfig import fetch_config
+from datetime import datetime
+from lxml import etree
+from pyriksdagen.utils import (
+    parse_protocol,
+    protocol_iterators,
+)
+from tqdm import tqdm
+import pandas as pd
+import unittest
+import warnings
+
+
+class EmptyElement(Warning):
+    """Warning category issued for every empty `u` or `seg` element."""
+
+
+class Test(unittest.TestCase):
+
+    def test_no_empty_speech(self):
+        """
+        Test protocols have no empty `u` or `seg` elements.
+
+        Each offending element is reported with an `EmptyElement` warning.
+        Elements without an `xml:id` are reported too, under a placeholder
+        id, so that they cannot pass the test unnoticed.
+        """
+        no_id = "(no xml:id)"
+        rows = []
+        protocols = sorted(protocol_iterators("corpus/protocols/",
+                                              start=1867,
+                                              end=2022))
+        for p in tqdm(protocols):
+            root, ns = parse_protocol(p, get_ns=True)
+            id_attr = f'{ns["xml_ns"]}id'
+            for u in root.iter(f'{ns["tei_ns"]}u'):
+                # lxml elements are tested with len(), not truthiness.
+                if len(u) == 0:
+                    empties = [("u", u)]
+                else:
+                    # Comments and processing instructions have a
+                    # non-string tag; only child elements are checked.
+                    empties = [
+                        ("seg", child) for child in u
+                        if isinstance(child.tag, str)
+                        and (not child.text or child.text.strip() == '')
+                    ]
+                for kind, elem in empties:
+                    elem_id = elem.attrib.get(id_attr, no_id)
+                    rows.append([p, kind, elem_id])
+                    warnings.warn(f"Empty {kind}-elem: {p}, {elem_id}",
+                                  EmptyElement)
+        if rows:
+            config = fetch_config("empty-speech")
+            # .get(): a config lacking the flag must not mask the failure.
+            if config and config.get("write_empty_speeches"):
+                now = datetime.now().strftime('%Y%m%d-%H%M%S')
+                cols = ["protocol", "elem", "elem_id"]
+                df = pd.DataFrame(rows, columns=cols)
+                df.to_csv(
+                    f"{config['test_out_path']}empty-speech_{now}.csv",
+                    sep=';',
+                    index=False)
+
+        self.assertEqual(
+            len(rows), 0,
+            f"{len(rows)} empty u/seg element(s) found")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/pytestconfig.py b/test/pytestconfig.py
index 34c057b9e2..dd819af52a 100644
--- a/test/pytestconfig.py
+++ b/test/pytestconfig.py
@@ -18,6 +18,8 @@ def fetch_config(test):
     try:
         with open("test/_test_config/test.json", 'r') as j:
             d = json.load(j)
-            return d[test]
-    except:
+            config = d[test]
+            config['test_out_path'] = d['test_out_path']
+            return config
+    except Exception:
         return None