Skip to content
This repository has been archived by the owner on May 8, 2024. It is now read-only.

Commit

Permalink
chore: merge pull request #480 from welfare-state-analytics/empty-speech-test
Browse files Browse the repository at this point in the history

No empty speeches test
  • Loading branch information
ninpnin authored Feb 23, 2024
2 parents 3ecb915 + dd89986 commit bde9c99
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 0 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,26 @@ jobs:
run: |
python -m unittest test.chairs
# Job: run the unit test in test/empty-speech.py, which checks that no
# protocol contains an empty `u` or `seg` element.
empty-speech:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: [3.8]
  steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    # Installs the package from the checked-out repository root, then PyPDF2.
    # NOTE(review): the test module shown in this commit does not import
    # PyPDF2 directly -- presumably needed by the installed package; confirm.
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install .
        pip install PyPDF2
    # The module is addressed by its dotted path because it uses a relative
    # import (`from .pytestconfig import fetch_config`).
    - name: Test there are no empty u or seg elements
      run: |
        python -m unittest test.empty-speech
mp:
runs-on: ubuntu-latest
strategy:
Expand Down
73 changes: 73 additions & 0 deletions test/empty-speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Test there are no empty speeches.
"""
from .pytestconfig import fetch_config
from datetime import datetime
from lxml import etree
from pyriksdagen.utils import (
parse_protocol,
protocol_iterators,
)
from tqdm import tqdm
import pandas as pd
import unittest
import warnings




class EmptyElement(Warning):
    """
    Warning category emitted for every empty `u` or `seg` element found.

    The message is kept on the `message` attribute and is what `str()`
    returns.
    """

    def __init__(self, m):
        # Hand the message to the base class as well, so that `args`,
        # `repr()` and pickling carry it like any other exception instance.
        super().__init__(m)
        self.message = m

    def __str__(self):
        return self.message




class Test(unittest.TestCase):
    """
    Corpus-wide check that no protocol contains empty speech elements.
    """

    @staticmethod
    def _empty_elements(root, ns):
        """
        Yield `(kind, elem_id)` for every empty element under `root`.

        A `u` element with no children is reported as kind `"u"`. For a `u`
        element that has children, each child whose text is missing or
        whitespace-only is reported as kind `"seg"`.

        `elem_id` is the element's `xml:id`, or `None` when the attribute is
        absent. Elements without an id are still reported: skipping them
        would let the test pass while empty elements remain in the corpus.

        Args:
            root: parsed protocol root element.
            ns: mapping holding the `"tei_ns"` and `"xml_ns"` prefixes.
        """
        id_attr = f'{ns["xml_ns"]}id'
        for u_elem in root.iter(f'{ns["tei_ns"]}u'):
            if len(u_elem) == 0:
                yield "u", u_elem.attrib.get(id_attr)
                continue
            for seg in u_elem:
                if not seg.text or seg.text.strip() == '':
                    yield "seg", seg.attrib.get(id_attr)

    def test_no_empty_speech(self):
        """
        Test protocol has no empty `u` or `seg` elements

        Every finding is emitted as an `EmptyElement` warning. When the
        "empty-speech" test config is available and enables
        `write_empty_speeches`, the findings are also written to a
        timestamped, semicolon-separated CSV under `test_out_path`.
        """
        rows = []
        protocols = sorted(protocol_iterators("corpus/protocols/",
                                              start=1867,
                                              end=2022))
        for p in tqdm(protocols, total=len(protocols)):
            root, ns = parse_protocol(p, get_ns=True)
            for kind, elem_id in self._empty_elements(root, ns):
                rows.append([p, kind, elem_id])
                warnings.warn(f"Empty {kind}-elem: {p}, {elem_id}", EmptyElement)

        if rows:
            config = fetch_config("empty-speech")
            # `.get` keeps a config without the flag from raising KeyError
            # and hiding the assertion result below.
            if config and config.get("write_empty_speeches"):
                now = datetime.now().strftime('%Y%m%d-%H%M%S')
                cols = ["protocol", "elem", "elem_id"]
                df = pd.DataFrame(rows, columns=cols)
                df.to_csv(
                    f"{config['test_out_path']}empty-speech_{now}.csv",
                    sep=';',
                    index=False)

        self.assertEqual(len(rows), 0)




# Script entry point: hand control to unittest's command-line runner.
# NOTE(review): this module uses a relative import
# (`from .pytestconfig import fetch_config`), so it presumably has to be run
# as `python -m unittest test.empty-speech` rather than directly -- confirm.
if __name__ == '__main__':
    unittest.main()
2 changes: 2 additions & 0 deletions test/pytestconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def fetch_config(test):
try:
with open("test/_test_config/test.json", 'r') as j:
d = json.load(j)
config = d[test]
config['test_out_path'] = d['test_out_path']
return d[test]
except:
return None

0 comments on commit bde9c99

Please sign in to comment.