forked from Loke-git/I2CMIF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathI2CMIF.py
483 lines (430 loc) · 23.2 KB
/
I2CMIF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
#!/usr/bin/env python
# coding: utf-8
# version: 1.1.3.5
# by Loke Sjølie
# project uses code from Munch XML Muncher with permission
print("Initializing...")
import sys
import subprocess
import pkg_resources
print("Checking requirements...")
# Package installation borrowed from:
# https://stackoverflow.com/questions/12332975/installing-python-module-within-code/58040520#58040520
#
# bs4 and pandas are imported further down. lxml is listed as well because
# every BeautifulSoup call in this script requests the "xml" parser, which
# Beautiful Soup only provides through lxml.
required = {'bs4', 'lxml', 'pandas'}
installed = {pkg.key for pkg in pkg_resources.working_set}
missing = required - installed
if missing:
    # Run pip as a subprocess of the current interpreter so the packages end
    # up in the environment that is executing this script.
    print("\tInstalling requirements...")
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing])
else:
    print("\tRequirements met.")
# Ibsen XML Muncher 1
# Much of this code has been appropriated from the Munch XML Muncher (MXMLM) tool.
print("Importing libraries...")
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
from collections import defaultdict
import json
# File and folder handling
import glob # The yeast of thought and mind
import os # File system
# Metadata and configuration
import configparser # Used to easily get statements from the config file
# Time and date
from datetime import date
# Today's date as a YYYY-MM-DD string.
today = date.today().strftime("%Y-%m-%d")
# Build a lookup from normalised dispatch place name to GeoNames ID.
# Place names lose their surrounding square brackets and are upper-cased;
# the first row that carries an ID for a given place wins.
df = pd.read_csv("Compiled_Letter_Data.csv", sep=",")
df = df[['Dispatch_Location', "GeoName_ID"]].fillna("N/A")
placeIDdict = defaultdict(dict)
places = []
for idx, row in df.iterrows():
    place = str(row['Dispatch_Location']).lstrip('[').rstrip("]").upper()
    geoname = str(row['GeoName_ID'])
    # Skip places that are already registered and rows without an ID.
    if place in places or geoname == "N/A":
        continue
    places.append(place)
    # Keep only the text before the first "." of the stringified ID.
    placeid = geoname.split(".")[0]
    placeIDdict[place] = placeid
# Read the publication metadata from the [statements] section of config.ini.
print("\nGetting metadata from config.ini...")
config = configparser.ConfigParser()
config.read("config.ini", encoding="utf-8")
# The options are fetched in this exact order and unpacked into the
# module-level names used by the rest of the script.
(
    cmifTitle,
    editorName,
    editorMail,
    cmifUid,
    publisherURL,
    publisherName,
    cmifURL,
    typeOfBibl,
    publicationStatementFull,
    outputFileName,
    outputFileNameVaria,
) = (
    config.get("statements", option)
    for option in (
        "cmifTitle",
        "editorName",
        "editorMail",
        "cmifUid",
        "publisherURL",
        "publisherName",
        "cmifURL",
        "typeOfBibl",
        "publicationStatementFull",
        "outputFileName",
        "outputFileNameVaria",
    )
)
print(f"{cmifTitle}\nUID: {cmifUid}\nEditor: {editorName} ({editorMail}) at {publisherName} ({publisherURL})\n{publicationStatementFull}\nOutput: {outputFileName}.xml (main texts only) and {outputFileNameVaria}.xml (main texts and varia)")
# Collect the letter XML files to process and echo them as one
# comma-separated line.
listXMLfiles = glob.glob("letters/*.xml", recursive=True)
t = "Targeting these files: " + ", ".join(str(path) for path in listXMLfiles)
print(t)
# Harvest type, date, sender, recipient and place metadata from every letter
# XML file. `main` maps each document ID to one metadata record.
main = defaultdict(dict)
i = 0  # running count of harvested documents
for xml_file in listXMLfiles:
    # Take the file stem via os.path so this works with both "/" and "\\"
    # separators (splitting on "\\" alone fails where glob returns "/").
    fileName = os.path.basename(xml_file).split(".")[0]
    # The first character of the stem is dropped when building the public URL.
    fileName = "https://www.ibsen.uio.no/BREV_" + fileName[1:]
    print("Melting", xml_file)
    with open(xml_file, "r", encoding="utf-8") as file:
        content = file.read()
    soup = bs(content, "xml")
    for document in soup.find_all('HIS:hisMsDesc', {"xml:id": True}):
        theAuthors, theAuthorsTypes, theAuthorsRefs = [], [], []
        theRecipients, theRecipientsTypes, theRecipientsRefs = [], [], []
        # The element's first two attributes are read positionally as the
        # document type and the document ID.
        docAttrs = list(document.attrs.values())
        docType = docAttrs[0]
        docID = docAttrs[1]
        # One-line summary of the document, assembled as the fields are read.
        printString = str(docID)

        # Place of origin. Any missing piece (no origPlace, no place
        # reference, empty contents, fewer than two attributes) falls back to
        # the "unknown place" sentinels.
        try:
            place = document.find("origPlace").find("HIS:hisRef", {"type": "place"})
            docLoc = place.contents[0]
            placeID = list(place.attrs.values())[1]
            placeID = placeID.replace("Navneregister_HISe.xml#", "")
        except (AttributeError, IndexError):
            docLoc = "UKJENT OPPRINNELSESSTED"
            placeID = "plNN"
        printString += ", " + docType + " from " + docLoc

        # Date: a notBefore/notAfter range is stored as "from%to"; otherwise
        # the first attribute of origDate is used as the date.
        rangeStart = document.find("origDate", {"notBefore": True})
        if rangeStart is not None:
            rangeEnd = document.find("origDate", {"notAfter": True})
            if rangeEnd is not None:
                # Read each bound from the element that actually carries it.
                docDate = str(rangeStart['notBefore']) + "%" + str(rangeEnd['notAfter'])
            else:
                # Only a lower bound is present: "not before this date".
                docDate = rangeStart['notBefore']
        else:
            dateObj = document.find("origDate")
            docDate = list(dateObj.attrs.values())[0]
        printString += " dated: " + docDate
        printString += "\n"

        # Senders: every element below <name role="sender"> contributes its
        # text nodes; its first attribute is the type, its second the
        # register reference.
        senders = document.find("name", {"role": "sender"}).find_all(True, recursive=True)
        printString += "Senders: "
        for sender in senders:
            senderAttrs = list(sender.attrs.values())
            senderType = senderAttrs[0]
            senderRef = senderAttrs[1].replace("Navneregister_HISe.xml#", "")
            for senderName in sender.contents:
                printString += senderName + " (" + senderType + ")"
                # The three lists are kept parallel: one entry per name.
                theAuthors.append(senderName)
                theAuthorsTypes.append(senderType)
                theAuthorsRefs.append(senderRef)

        # Recipients: same layout as the senders.
        recips = document.find("name", {"role": "recipient"}).find_all(True, recursive=True)
        printString += " | Recipients: "
        for recip in recips:
            recipAttrs = list(recip.attrs.values())
            recipType = recipAttrs[0]
            recipRef = recipAttrs[1].replace("Navneregister_HISe.xml#", "")
            for recipName in recip.contents:
                printString += recipName + " (" + recipType + ")"
                theRecipients.append(recipName)
                theRecipientsTypes.append(recipType)
                theRecipientsRefs.append(recipRef)

        # Normalise the place name the same way the GeoNames lookup keys are
        # normalised. The register ID read above is then replaced by the
        # GeoNames ID, or by "N/A" when the place is not in the lookup.
        docLoc = docLoc.lstrip('[').rstrip("]").upper()
        placeID = placeIDdict.get(docLoc, "N/A")

        main[docID] = {
            'type': docType,
            'date': docDate,
            'from': theAuthors,
            'fromRef': theAuthorsRefs,
            'fromType': theAuthorsTypes,
            'to': theRecipients,
            'toRef': theRecipientsRefs,
            'toType': theRecipientsTypes,
            'place': docLoc,
            'placeRef': placeID,
            'source': fileName + "|" + docID + ".xhtml",
        }
        i += 1
# Show which places received a GeoNames ID, then turn the harvested records
# into a table with one row per document.
print("Acquired GeoNames IDs for these places:")
print(list(placeIDdict))
df1 = pd.DataFrame.from_dict(main).transpose().reset_index()
df1.columns = [
    "document", "type", "date",
    "fromX", "fromRef", "fromType",
    "to", "toRef", "toType",
    "place", "placeRef", "source",
]
# Varia (miscellany) metadata harvesting
# When varia_file.csv exists, every row with a recipient ID becomes an extra
# record (sender fixed to Henrik Ibsen), and df3/df4_json hold the letters and
# varia combined.
print("Checking for varia_file.csv...")
if os.path.exists("varia_file.csv"):
    print("Processing varia...")
    old_links = [
        "https://www.ibsen.uio.no/VAR_V18901219HeG.xhtml",
        "https://www.ibsen.uio.no/VAR_V1858kongO2.xhtml",
        "https://www.ibsen.uio.no/VAR_V18930718EPh.xhtml",
        "https://www.ibsen.uio.no/VAR_V18690926HSTp.xhtml",
        "https://www.ibsen.uio.no/VAR_V1860Skand.xhtml",
        "https://www.ibsen.uio.no/VAR_V1861Skand.xhtml",
        "https://www.ibsen.uio.no/VAR_1862Skand.xhtml",
    ]
    i = 0  # number of varia items harvested
    warned_about_old_links = False
    supplement = defaultdict(dict)
    varia = pd.read_csv("varia_file.csv", sep=",").set_index("index")
    varia = varia.fillna("N/A")
    for idx, row in varia.iterrows():
        recipientID = row['fullID']
        if recipientID == "N/A":
            continue
        i += 1
        title, docDate, recipient, docType = row['title'], row['date'], row['clearname'], row['type']
        link = "https://www.ibsen.uio.no/VAR_" + str(idx) + ".xhtml"
        recipRef = "https://www.ibsen.uio.no/REGINFO_" + str(recipientID) + ".xhtml"
        # Classify the recipient. An "org" prefix is checked first so that an
        # organisation ID which happens to contain "pe" is not taken for a
        # person. The final default keeps recipType from carrying over from a
        # previous row; the CMIF export treats everything that is not "org"
        # as a person.
        if recipientID.startswith("org"):
            recipType = "org"
        elif "pe" in recipientID:
            recipType = "person"
        elif "org" in recipientID:
            recipType = "org"
        else:
            recipType = "person"
        supplement[idx] = {
            'type': docType,
            'date': docDate,
            'from': ["HENRIK IBSEN"],
            'fromRef': ["peHI"],
            'fromType': ["person"],
            'to': [recipient],
            'toRef': [recipientID],
            'toType': [recipType],
            'place': "N/A",
            'placeRef': "N/A",
            'source': link,
        }
        if link in old_links:
            # The explanation is printed once; every affected link is listed.
            if not warned_about_old_links:
                print("\n>> Warning: the CMIF will use links valid in 2022, meaning that the correct global person/institution ID (e.g. orgSF) is referred to as the old varia-specific ID (e.g. Skand) in the document IDs. Modify the source CSV with new links if applicable, or change them after the fact in the CMIF.")
                warned_about_old_links = True
            print("\t" + link + " (" + recipRef + ")")
    if warned_about_old_links:
        print("This warning will cease once the script does not detect the above links.")
    print(f"\nAcquired {i} items from varia.\nRemember! These links will only work as long as they're in the ibsen.uio.no/VAR_ domain.\n")
    df2 = pd.DataFrame.from_dict(supplement).T.reset_index(drop=False).fillna("N/A")
    df2.columns = "document", "type", "date", "fromX", "fromRef", "fromType", "to", "toRef", "toType", "place", "placeRef", "source"
    # DataFrame.append no longer exists in pandas 2.x; pd.concat gives the
    # same stacked table on every pandas version.
    df3 = pd.concat([df1, df2], ignore_index=True).reset_index(drop=True)
    df4 = df3.copy().set_index("document")
    df4_json = df4.to_json(orient="index")
# End varia metadata harvesting
# Standard CMIF (all normal letter correspondence)
# Append to UID..
cmifUid = cmifUid + "HT"
# Catch documents with weird/combined placenames in these
letters_with_weird_placenames, weird_placenames_in_letters = [], []
print("Creating standard CMIF...")
# Create CMIF boilerplate object. The closing tags are spelled exactly like
# their opening tags (</teiHeader>, </TEI>); XML names are case-sensitive, so
# any other casing makes the document malformed.
CMIFstring = "".join([
    '<?xml-model href="https://raw.githubusercontent.com/TEI-Correspondence-SIG/CMIF/master/schema/cmi-customization.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
    '<TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc>',
    '<titleStmt><title>', str(cmifTitle), '</title>',
    '<editor>', str(editorName), '<email>', str(editorMail), '</email></editor></titleStmt>',
    '<publicationStmt><publisher><ref target="', str(publisherURL), '">', str(publisherName), '</ref></publisher>',
    '<idno type="url">', str(cmifURL), '</idno>',
    '<date when="', str(today), '"/>',
    '<availability><licence target="https://creativecommons.org/licenses/by/4.0/">This file is licensed under the terms of the Creative-Commons-License CC-BY 4.0</licence></availability></publicationStmt>',
    '<sourceDesc><bibl type="', str(typeOfBibl), '" xml:id="', str(cmifUid), '">', str(publicationStatementFull), '</bibl></sourceDesc></fileDesc>',
    # <dummy/> is a placeholder inside profileDesc; it is removed before saving.
    '<profileDesc><dummy/></profileDesc></teiHeader>',
    '<text><body><p/></body></text></TEI>',
])
CMIF = bs(CMIFstring, "xml")  # Read as XML, not HTML
profileDescElement = CMIF.find('profileDesc')  # Target correspondence wrapper
## PATCH
# Build a lookup from register XML ID to VIAF URL for everyone in the person
# register who has a VIAF ID.
print("Applying February 2023 patch...")
# Viaf_ID is read as text: a purely numeric column that also has empty cells
# would otherwise be parsed as floats, which cannot be concatenated to the URL
# prefix and would not keep long identifiers digit-for-digit.
df_People = pd.read_csv("Person_Register_Info.csv", sep=",", dtype={"Viaf_ID": str})
df_People = df_People.fillna("N/A")
peopleVIAFdict = defaultdict(dict)
for idx, row in df_People.iterrows():
    viafID = row['Viaf_ID']
    xmlID = row['XML_ID']
    if viafID != "N/A":
        viafURL = "https://viaf.org/viaf/" + str(viafID)
        peopleVIAFdict[xmlID] = viafURL
# Tags whose text must not be re-indented by prettify(). The same list is used
# for both output files so that orgName text is protected in each of them.
_PRESERVED_TAGS = ["orgName", "placeName", "bibl", "corresp", "title", "persName",
                   "editor", "email", "publisher", "ref", "idno", "licence"]


def _new_cmif(uid):
    """Return (document, profileDesc element) for an empty CMIF file.

    The header is filled from the module-level metadata read from config.ini
    and from `today`; `uid` becomes the xml:id of the <bibl> element.
    """
    boilerplate = "".join([
        '<?xml-model href="https://raw.githubusercontent.com/TEI-Correspondence-SIG/CMIF/master/schema/cmi-customization.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
        '<TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader><fileDesc>',
        '<titleStmt><title>', str(cmifTitle), '</title>',
        '<editor>', str(editorName), '<email>', str(editorMail), '</email></editor></titleStmt>',
        '<publicationStmt><publisher><ref target="', str(publisherURL), '">', str(publisherName), '</ref></publisher>',
        '<idno type="url">', str(cmifURL), '</idno>',
        '<date when="', str(today), '"/>',
        '<availability><licence target="https://creativecommons.org/licenses/by/4.0/">This file is licensed under the terms of the Creative-Commons-License CC-BY 4.0</licence></availability></publicationStmt>',
        '<sourceDesc><bibl type="', str(typeOfBibl), '" xml:id="', str(uid), '">', str(publicationStatementFull), '</bibl></sourceDesc></fileDesc>',
        # Closing tags match their opening tags' case (XML is case-sensitive).
        '<profileDesc><dummy/></profileDesc></teiHeader>',
        '<text><body><p/></body></text></TEI>',
    ])
    doc = bs(boilerplate, "xml")  # Read as XML, not HTML
    return doc, doc.find('profileDesc')


def _party_element(doc, name, category, ref_id, viaf_lookup):
    """Return an <orgName> or <persName> tag for one correspondent.

    The ref attribute is the VIAF URL when `ref_id` is in `viaf_lookup`,
    otherwise the REGINFO page built from `ref_id`.
    """
    if ref_id in viaf_lookup:
        ref = viaf_lookup[ref_id]
    else:
        ref = "https://www.ibsen.uio.no/REGINFO_" + str(ref_id) + ".xhtml"
    element = doc.new_tag("orgName" if category == "org" else "persName", attrs={"ref": ref})
    element.string = str(name)
    return element


def _add_correspondence(doc, wrapper, frame, uid, viaf_lookup, weird_docs, weird_places):
    """Append one <correspDesc> per row of `frame` to `wrapper`.

    Documents whose place has no usable GeoNames ID are recorded by appending
    the document ID to `weird_docs` and the place name to `weird_places`.
    """
    for _, row in frame.iterrows():
        document, docDate, place, placeRef = row['document'], row['date'], row['place'], row['placeRef']
        description = doc.new_tag("correspDesc", attrs={"key": str(document), "ref": row['source'], "source": "#" + uid})
        wrapper.append(description)

        ## Author (sender) encoding: one "sent" action per sender.
        # NOTE(review): the indentation of this script was lost in the copy
        # under review, so it is not certain whether place and date belong to
        # every "sent" action or only to the last one. They are attached to
        # every one here -- confirm against the upstream layout.
        for name, category, ref_id in zip(row['fromX'], row['fromType'], row['fromRef']):
            action = doc.new_tag("correspAction", attrs={'type': 'sent'})
            description.append(action)
            action.append(_party_element(doc, name, category, ref_id, viaf_lookup))

            # Place encoding
            if place != "N/A" and place != "UKJENT OPPRINNELSESSTED":
                if placeRef != "N/A" and placeRef != "plNN":
                    locationElement = doc.new_tag("placeName", attrs={"ref": "http://www.geonames.org/" + placeRef})
                else:
                    # No GeoNames ID: write the name without a ref and report it.
                    locationElement = doc.new_tag("placeName")
                    weird_docs.append(document)
                    weird_places.append(place)
                locationElement.string = str(place)
                action.append(locationElement)

            # Date encoding
            if docDate != "N/A":
                if "%" in str(docDate):
                    # "from%to" marks a date range.
                    bounds = str(docDate).split("%")
                    dateElement = doc.new_tag("date", attrs={"notBefore": bounds[0], "notAfter": bounds[1]})
                else:
                    dateElement = doc.new_tag("date", attrs={"when": docDate})
                action.append(dateElement)

        # Recipient encoding: one "received" action per recipient. Each
        # recipient is looked up by its own reference (toRef), not the
        # sender's.
        for name, category, ref_id in zip(row['to'], row['toType'], row['toRef']):
            action = doc.new_tag("correspAction", attrs={'type': 'received'})
            description.append(action)
            if name == "UKJENT MOTTAGER":
                name = "UNKNOWN RECIPIENT"
            action.append(_party_element(doc, name, category, ref_id, viaf_lookup))


def _save_cmif(doc, file_stem):
    """Drop the <dummy/> placeholder and write `doc` to `file_stem`.xml.

    Returns the re-parsed document that was written.
    """
    doc.find("dummy").decompose()  # This will destroy the <dummy/> element.
    print("Saving output...")
    # Re-parse so the whitespace-preserving tag list applies when prettifying.
    reparsed = bs(str(doc), "xml", preserve_whitespace_tags=_PRESERVED_TAGS)
    with open(file_stem + ".xml", "w", encoding="utf-8") as outfile:
        outfile.write(reparsed.prettify())
    return reparsed


# Standard CMIF: fill the prepared document from the letters table and save it.
_add_correspondence(CMIF, profileDescElement, df1, cmifUid, peopleVIAFdict,
                    letters_with_weird_placenames, weird_placenames_in_letters)
CMIF = _save_cmif(CMIF, outputFileName)
print("Done exporting CMIF as", outputFileName)
with open("ibsen-correspondence-metadata_ht.json", "w") as outfile:
    json.dump(main, outfile, indent=4)
print("Saved metadata in ibsen-correspondence-metadata_ht.json")
if weird_placenames_in_letters:
    print(f"\nThese documents have strange placenames:\n{letters_with_weird_placenames}\n{weird_placenames_in_letters}")
# End standard CMIF

if os.path.exists("varia_file.csv"):
    # Experimental standard + varia CMIF
    # Catch documents with weird/combined placenames in these
    letters_with_weird_placenames, weird_placenames_in_letters = [], []
    print("\n\nCreating varia-augmented CMIF...")
    # Change the UID..
    cmifUid = cmifUid + "V"
    CMIF, profileDescElement = _new_cmif(cmifUid)
    _add_correspondence(CMIF, profileDescElement, df3, cmifUid, peopleVIAFdict,
                        letters_with_weird_placenames, weird_placenames_in_letters)
    CMIF = _save_cmif(CMIF, outputFileNameVaria)
    print("Done exporting CMIF as", outputFileNameVaria)
    parse_json = json.loads(df4_json)
    with open("ibsen-correspondence-metadata_htv.json", "w") as outfile:
        json.dump(parse_json, outfile, indent=4)
    print("Saved metadata in ibsen-correspondence-metadata_htv.json")
    if weird_placenames_in_letters:
        print(f"\nThese documents have strange placenames:\n{letters_with_weird_placenames}\n{weird_placenames_in_letters}")
    # End experimental CMIF
print("All done! Have a nice day.")