Skip to content
This repository has been archived by the owner on May 8, 2024. It is now read-only.

Commit

Permalink
test: extend testing to each year in the data
Browse files Browse the repository at this point in the history
  • Loading branch information
ninpnin committed Oct 10, 2023
1 parent 9cd3db1 commit 78b3841
Showing 1 changed file with 20 additions and 21 deletions.
41 changes: 20 additions & 21 deletions test/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,40 @@
from pyriksdagen.download import get_blocks
from pyriksdagen.export import create_tei, create_parlaclarin
from pyriksdagen.db import load_patterns
from pathlib import Path
import random
random.seed(429)

class Test(unittest.TestCase):
    """Validate ParlaClarin XML files against the ParlaClarin XSD schema.

    All paths are relative, so the tests must be run from a working
    directory that contains ``schemas/``, ``input/`` and ``corpus/``.
    """

    # Official example parla-clarin
    def test_official_example(self):
        """
        Validate the official ParlaClarin example provided by ParlaClarin.
        """
        schema_path = "schemas/parla-clarin.xsd"
        parlaclarin_path = "input/parla-clarin/official-example.xml"

        valid = validate_xml_schema(parlaclarin_path, schema_path)
        self.assertTrue(valid)

    # Parla-clarin generated from example OCR XML
    def test_protocols(self):
        """
        Check corpus protocols against the ParlaClarin schema.

        Two groups of files are validated:

        1. A fixed set of protocols, one per labelled chamber / source format.
        2. For each year folder, one randomly chosen XML file.

        Every check runs inside its own ``subTest`` so that one invalid file
        does not hide failures in the remaining protocols or years.
        """
        schema_path = "schemas/parla-clarin.xsd"
        folder = "corpus/protocols/"

        # (protocol id relative to ``folder``, label used in the failure message)
        fixed_protocols = [
            ("1955/prot-1955--ak--22", "Andra kammaren"),
            ("1933/prot-1933--fk--5", "Första kammaren"),
            ("197879/prot-197879--14", "Enkammarsriksdagen"),
            ("199596/prot-199596--35", "Digital original, format 1"),
            ("201011/prot-201011--19", "Digital original, format 2"),
            ("201819/prot-201819--45", "Digital original, format 3"),
        ]
        for protocol_id, msg in fixed_protocols:
            with self.subTest(protocol=protocol_id):
                valid = validate_xml_schema(f"{folder}{protocol_id}.xml", schema_path)
                self.assertTrue(valid, f"{msg}: {protocol_id}")

        # Use ``.name`` rather than ``.stem``: a directory name is not a file
        # name with a suffix, and ``.stem`` would truncate any name with a dot.
        year_dirs = sorted(p for p in Path(folder).glob("*") if p.is_dir())

        self.assertGreaterEqual(len(year_dirs), 1, "We should have a nonempty set of data folders")

        for year_dir in year_dirs:
            year = year_dir.name
            with self.subTest(year=year):
                # glob() order depends on the filesystem; sort so that the
                # seeded random choice picks the same file on every machine.
                files_year = sorted(year_dir.glob("*.xml"))
                self.assertGreaterEqual(len(files_year), 1, f"For year(s) {year}, we should have a nonempty set of XML files")
                file = random.choice(files_year)
                print(year, file.stem)
                # Pass a str, like every other validate_xml_schema call here.
                valid = validate_xml_schema(str(file.absolute()), schema_path)
                self.assertTrue(valid, f"{year}s: {file.stem}")

if __name__ == '__main__':
# begin the unittest.main()
Expand Down

0 comments on commit 78b3841

Please sign in to comment.