Skip to content
This repository has been archived by the owner on May 8, 2024. It is now read-only.

Adjust pipeline to new corpus structure #468

Merged
merged 22 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
a5b60b2
refactor: rewrite pipeline from scratch
ninpnin Feb 2, 2024
c592ce2
refactor: use external library for ALTO XML
ninpnin Feb 2, 2024
5bcec68
refactor: add XML_NS as a shared variable for the package
ninpnin Feb 2, 2024
f01f516
fix: correct amount of padding when printing XML
ninpnin Feb 2, 2024
6bb0190
refactor: rewrite pipeline from scratch
ninpnin Feb 2, 2024
68a3853
fix: digital originals pipeline
ninpnin Feb 2, 2024
e57b77c
fix: missing dependency
ninpnin Feb 2, 2024
1fcf4e3
refactor: remove unnecessary code
ninpnin Feb 2, 2024
8d6277b
fix: make digital originals first ID unique
ninpnin Feb 2, 2024
b466c4f
fix: archive login bug
ninpnin Feb 2, 2024
f1bb9c5
feat: check what protocols exist and run pipeline with the same script
ninpnin Feb 2, 2024
75e5f5e
fix: remove debugging 'break' statement
ninpnin Feb 2, 2024
c2ab1e2
fix: remove unnecessary imports
ninpnin Feb 2, 2024
5355f73
refactor: split ALTO processing into three functions for generalizability
ninpnin Feb 5, 2024
560160f
fix: add docstring and fix wrong-variable bug
BobBorges Feb 5, 2024
3a0f185
feat: pipe local alto files
BobBorges Feb 5, 2024
4525787
fix: paragraph ID seed
ninpnin Feb 6, 2024
ae88210
chore: merge branch 'pipeline-refactor' of github.com:welfare-state-a…
ninpnin Feb 6, 2024
34a116d
fix: rm edition statement
BobBorges Feb 9, 2024
6ac68db
Merge branch 'pipeline-refactor' of github.com:welfare-state-analytic…
BobBorges Feb 9, 2024
44297b0
fix: sort local input filenames
BobBorges Feb 9, 2024
2e9a6ab
fix: remove default version number
ninpnin Feb 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ importlib_resources = "*"
nltk = "*"
textdistance = "*"
py-markdown-table = "*"
alto-xml = "*"

[tool.poetry.dev-dependencies]
devtools = "^0.5.1"
Expand Down
137 changes: 47 additions & 90 deletions pyriksdagen/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from lxml import etree
from .utils import clean_html
import warnings
import alto

class LazyArchive:
"""
Expand Down Expand Up @@ -102,112 +103,68 @@ def oppna_data_to_dict(input_dict):
data["paragraphs"].append(paragraph)
return data

def dl_kb_blocks(package_id, archive):
def _alto_extract_paragraphs(altofile):
"""
Download protocol from betalab, convert it to the simple XML 'blocks' schema
Extract text from ALTO XML on paragraph / textBlock level
"""
paragraphs = []
text_blocks = altofile.extract_text_blocks()
for tb_ix, tb in enumerate(text_blocks):
lines = tb.extract_string_lines()
paragraph = "\n".join(lines)

# Remove line breaks when next line starts with a small letter
paragraph = re.sub("([a-zß-ÿ,])- ?\n ?([a-zß-ÿ])", "\\1\\2", paragraph)
paragraph = re.sub("([a-zß-ÿ,]) ?\n ?([a-zß-ÿ])", "\\1 \\2", paragraph)

paragraph = " ".join(paragraph.split())
if paragraph != "":
paragraphs.append(paragraph)
return paragraphs

def convert_alto(filenames, files):
    """
    Convert a document from ALTO to a list of paragraphs.

    Args:
        filenames: the names of the ALTO files of one document, as a list of str.
            The script assumes zero-padded numbering right before the .xml extension.
        files: ALTO XML file contents as an iterable of str, in corresponding
            order to the filenames.

    Returns:
        A flat list containing, per page, the page number (int) followed by
        that page's paragraph strings.
    """
    paragraphs = []
    in_sync = True
    for ix, pair in progressbar.progressbar(enumerate(zip(filenames, files))):
        fname, s = pair
        altofile = alto.parse(s)
        # Page number comes from the zero-padded 3-digit suffix before ".xml"
        page_number = int(re.findall(r"([0-9]{3,3}).xml", fname)[0])
        # NOTE(review): page numbers are interleaved as ints among the str
        # paragraphs — presumably so downstream code can detect page
        # beginnings by type; confirm against the consumers of this list.
        paragraphs.append(page_number)
        # Warn only once per document if file numbering and iteration
        # order diverge
        if in_sync and page_number != ix:
            not_in_sync_warning = f"ALTO page number and page count not in sync ({fname})"
            warnings.warn(not_in_sync_warning)
            in_sync = False
        paragraphs += _alto_extract_paragraphs(altofile)
    return paragraphs

for cb_ix, content_block in enumerate(content_blocks):
content_block_e = etree.SubElement(
root, "contentBlock", page=str(page_number), ix=str(cb_ix)
)
text_blocks = content_block.findall(
".//{http://www.loc.gov/standards/alto/ns-v3#}TextBlock"
)
for tb_ix, text_block in enumerate(text_blocks):
tblock = []
text_lines = text_block.findall(
".//{http://www.loc.gov/standards/alto/ns-v3#}TextLine"
)

for text_line in text_lines:
# tblock.append("\n")
strings = text_line.findall(
".//{http://www.loc.gov/standards/alto/ns-v3#}String"
)
for string in strings:
content = string.attrib["CONTENT"]
tblock.append(content)

tblock = "\n".join(tblock)
# Remove line breaks when next line starts with a small letter
tblock = re.sub("([a-zß-ÿ,])- ?\n ?([a-zß-ÿ])", "\\1\\2", tblock)
tblock = re.sub("([a-zß-ÿ,]) ?\n ?([a-zß-ÿ])", "\\1 \\2", tblock)
text_block_e = etree.SubElement(
content_block_e, "textBlock", ix=str(tb_ix)
)
text_block_e.text = tblock

return root


def dl_kb_blocks(package_id, archive):
    """
    Download protocol from betalab, convert it to the simple XML 'blocks' schema.

    Args:
        package_id: KB (betalab) package id of the protocol, as a str.
        archive: a KBlab archive (e.g. LazyArchive) used to fetch the package.

    Returns:
        The result of convert_alto for the package's ALTO files: a flat list
        of page numbers and paragraph strings.
    """
    print(f"Get package {package_id}...")
    package = archive.get(package_id)
    filenames = fetch_files(package)

    # Generator so each file is downloaded lazily, one at a time,
    # instead of reading the whole package into memory up front
    def files():
        for fname in filenames:
            yield package.get_raw(fname).read()

    return convert_alto(filenames, files())


def count_pages(start, end):
"""
Generate a dataframe of pages between provided start and end years. Fetches information from KB's API.
"""
years = range(start, end)
archive = login_to_archive()
archive = LazyArchive()
rows = []

for year in progressbar.progressbar(years):
Expand Down
Loading
Loading