-
Notifications
You must be signed in to change notification settings - Fork 0
/
enrich_from_pmb.py
50 lines (45 loc) · 1.68 KB
/
enrich_from_pmb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import lxml.etree as ET
from acdh_tei_pyutils.tei import TeiReader
from config import MASTER_FILE, NAME_SPACES, PMB_LISTPLACE_DUMP, MASTER_ENRICHED
# Build ``data``: a mapping from each place's PMB URI (always stored with a
# trailing "/") to a dict of {idno subtype: idno text} collected from that
# place's other <idno> elements in the PMB dump.
print(f"fetching PMB Places from {PMB_LISTPLACE_DUMP}")
doc = TeiReader(PMB_LISTPLACE_DUMP)
# Serialises the freshly loaded tree to a local file; this looks like a
# debugging copy -- TODO confirm it is still needed.
doc.tree_to_file("hansi.xml")
data = {}
for place in doc.any_xpath(".//tei:place[@xml:id]"):
    pmb_uris = place.xpath(
        './tei:idno[@subtype="pmb"]/text()', namespaces=NAME_SPACES
    )
    for pmb_uri in pmb_uris:
        # Keys are normalised so that they always end with a slash.
        pmb_key = pmb_uri if pmb_uri.endswith("/") else f"{pmb_uri}/"
        data[pmb_key] = {}
        for node in place.xpath("./tei:idno", namespaces=NAME_SPACES):
            if "subtype" not in node.attrib:
                print(f"no idno type subtype for {pmb_uri}")
                continue
            domain = node.attrib["subtype"]
            # The PMB URI is the key itself and geonames entries are
            # deliberately left out of the collected identifiers.
            if domain in ("pmb", "geonames"):
                continue
            data[pmb_key][domain] = node.text
# Enrich the master place list: for every <place>, look up its PMB URI in
# ``data`` and append one <idno type="website" subtype="..."> per identifier
# found there, then write the result to MASTER_ENRICHED.
doc = TeiReader(MASTER_FILE)
# Remove every existing idno of type "website" before new ones are appended
# below (presumably so that re-running the script does not pile up duplicates).
for bad in doc.any_xpath('.//tei:idno[@type="website"]'):
    bad.getparent().remove(bad)

print(f"adding IDNOS into {MASTER_FILE} and save it into {MASTER_ENRICHED}")
for x in doc.any_xpath(".//tei:place"):
    place_id = x.get("{http://www.w3.org/XML/1998/namespace}id")
    pmb_uris = x.xpath('./tei:idno[@type="pmb"]/text()', namespaces=NAME_SPACES)
    if not pmb_uris:
        # Previously this raised IndexError and aborted the whole run.
        print(f"no PMB idno found for place {place_id}, skipping")
        continue
    pmb = pmb_uris[0]
    # Keys in ``data`` always end with "/", so normalise the lookup key the
    # same way; otherwise a URI without the trailing slash is never found.
    pmb_key = pmb if pmb.endswith("/") else f"{pmb}/"
    match = data.get(pmb_key)
    if match is None:
        # Previously a KeyError; report the gap and keep enriching the rest.
        print(f"no PMB entry found for {pmb} (place {place_id}), skipping")
        continue
    for key, value in match.items():
        idno = ET.Element("{http://www.tei-c.org/ns/1.0}idno")
        idno.attrib["type"] = "website"
        # Hyphens in the source subtype are replaced with underscores.
        idno.attrib["subtype"] = key.replace("-", "_")
        idno.text = value
        x.append(idno)
doc.tree_to_file(MASTER_ENRICHED)
print("done")