# uri-def.py (forked from FamilySearch/GEDCOM)
import re
from sys import argv, stderr
from os.path import isfile, isdir, exists, dirname, join, basename
from os import makedirs
from subprocess import run
from shutil import copyfile
from glob import glob

def get_paths():
    """Parses command-line arguments: Markdown input files and one destination directory"""
    spec = [_ for _ in argv[1:] if isfile(_)]
    dest = [_ for _ in argv[1:] if isdir(_)]
    if len(dest) != 1: raise Exception("One destination expected, but got "+str(dest)+"\nUSAGE: "+argv[0]+" input1.md input2.md destination/")
    if len(spec) < 1: raise Exception("Expected input files, none given"+"\nUSAGE: "+argv[0]+" input1.md input2.md destination/")
    return spec, dest[0]
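
# Illustrative invocation (the paths below are hypothetical, not taken from this file):
#
#   python uri-def.py specification/*.md extracted-files/tags/
#
# get_paths() then returns the matching .md files and the single existing
# destination directory; anything else raises with the USAGE message above.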

def get_text(spec):
    """Reads and concatenates the contents of the given files"""
    return '\n\n'.join(open(s).read() for s in spec)

def get_prefixes(txt):
    """Find and parse prefix definition tables"""
    pfx = {}
    for pfxtable in re.finditer(r'([^\n]*)Short Prefix *\| *URI Prefix *\|(\s*\|[^\n]*)*', txt):
        for abbr, lng in re.findall(r'`([^`]*)` *\| *`([^`]*)`', pfxtable.group(0)):
            pfx[abbr] = lng
    return pfx
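
# Minimal sketch of the input this expects (the real tables live in the spec
# Markdown, not in this file): a table such as
#
#   | Short Prefix | URI Prefix                    |
#   | ------------ | ----------------------------- |
#   | `g7`         | `https://gedcom.io/terms/v7/` |
#
# yields {'g7': 'https://gedcom.io/terms/v7/'}.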

def find_data_types(txt, g7):
    """Returns datatype:uri and adds URI suffixes to g7"""
    dturi = {}
    for section in re.finditer(r'^#+ *([^\n]*)\n+((?:[^\n]|\n+[^\n#])*[^\n]*URI for[^\n]*data types? is(?:[^\n]|\n+[^\n#])*)', txt, re.M):
        for dt, uri in re.findall(r'URI[^\n]*`([^\n`]*)` data type[^\n]*`([^`\n:]*:[^\n`]*)`', section.group(0)):
            dturi[dt] = uri
            if uri.startswith('g7:'):
                if '#' in uri: uri = uri[:uri.find('#')]
                if uri[3:] not in g7:
                    g7[uri[3:]] = ('data type', [section.group(2).strip()])
    return dturi
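
# Sketch of the kind of sentence this matches (wording paraphrased, not quoted
# from the spec): a section containing
#
#   The URI for the `Text` data type is `g7:type-Text`.
#
# records dturi['Text'] = 'g7:type-Text' and, because the URI starts with
# 'g7:', also adds g7['type-Text'] = ('data type', [section text]).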

def find_cat_tables(txt, g7, tagsets):
    """Looks for tables of tags preceded by a concatenation-based URI.

    Raises an exception if any URI is repeated with distinct definitions.
    This code contains a hard-coded fix for BIRTH, which has the same
    unifying concept but distinct text in the spec.

    Returns a {structure:[list,of,allowed,enums]} mapping
    and a {calendar:[list,of,months], month:[list,of,calendars]} mapping.
    """
    hard_code = {  ## <- hack for enum-BIRTH
        "g7:enum-BIRTH": 'Associated with birth, such as a birth name or birth parents.',
    }
    cats = {}
    enums = {}
    calendars = {}
    for bit in re.finditer(r'by\s+concatenating\s+`([^`]*)`', txt):
        i = txt.rfind('\n#', 0, bit.start())
        j = txt.find(' ', i)
        j = txt.find(txt[i:j+1], j)
        sect = txt[i:j].replace('(Latter-Day Saint Ordinance)', '`ord`')  ## <- hack for ord-STAT
        for entry in re.finditer(r'`([A-Z0-9_]+)` *\| *(.*?) *[|\n]', sect):
            enum, meaning = entry.groups()
            label = None
            h1 = sect.find('\n|', sect.rfind('\n#', 0, entry.start()))+1
            h2 = sect.find('\n', h1)
            header = [_.strip() for _ in sect[h1:h2].strip('| ').split('|')]
            pfx = bit.group(1)+enum
            if 'The URI of this' in meaning:
                meaning, tail = meaning.split('The URI of this')
                if meaning.endswith('<br/>'): meaning = meaning[:-5]
                pfx = tail.split('`')[1]
            meaning = hard_code.get(pfx, meaning)
            if len(header) > 2:
                h1 = sect.rfind('\n', 0, entry.start())
                h2 = sect.find('\n', h1+1)
                row = [_.strip() for _ in sect[h1:h2].strip('| \n').split('|')]
                assert len(row) == len(header)
                meaning = [(header[i]+': '+row[i]
                            if header[i] not in ('Meaning', 'Name')
                            else row[i]) for i in range(1, len(header))]
            else:
                meaning = [meaning]
            if pfx in cats and meaning != cats[pfx]:
                raise Exception('Concatenated URI '+pfx+' has multiple definitions:'
                                + '\n '+str(cats[pfx])
                                + '\n '+str(meaning)
                                )
            if 'enum-' in pfx:
                yamltype = 'enumeration'
                k1 = sect.find('`', sect.rfind('\n#', 0, entry.start()))
                k2 = sect.rfind('`', 0, sect.find('\n', k1))
                key = sect[k1:k2].replace('`', '').replace('.', '-')
                if pfx not in enums.get(key, []): enums.setdefault(key, []).append(pfx)
            elif 'month-' in pfx:
                yamltype = 'month'
                label = re.sub(r'(,| \().*', '', meaning[0])
                k1 = sect.find('`', sect.find('URI for this calendar is', entry.start()))+1
                k2 = sect.find('`', k1)
                cal = sect[k1:k2]
                calendars.setdefault(cal, []).append(pfx)
                calendars.setdefault(pfx, []).append(cal)
                if 'GREGORIAN' in cal:  ## <- hack for JULIAN
                    c2 = cal.replace('GREGORIAN', 'JULIAN')
                    calendars.setdefault(c2, []).append(pfx)
                    calendars.setdefault(pfx, []).append(c2)
            else:
                raise Exception("unexpected enumeration URI prefix "+repr(pfx))
            if pfx not in cats:
                cats[pfx] = meaning
                if pfx.startswith('g7:'):
                    if pfx[3:] in g7:
                        raise Exception(pfx+' defined as an enumeration and a '+g7[pfx[3:]][0])
                    if label:
                        g7[pfx[3:]] = (yamltype, meaning, None, label)
                    else:
                        g7[pfx[3:]] = (yamltype, meaning)
    return enums, calendars
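
# Hedged sketch of the two return values (the URIs shown are plausible
# examples, not extracted here):
#   enums:     {'g7:enumset-EVEN': ['g7:enum-CENS', ...], ...}
#   calendars: {'g7:cal-FRENCH_R': ['g7:month-VEND', ...],
#               'g7:month-VEND': ['g7:cal-FRENCH_R'], ...}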

def find_calendars(txt, g7):
    """Looks for sections defining a `g7:cal-` URI"""
    for bit in re.finditer(r'#+ `[^`]*`[^\n]*\n+((?:\n+(?!#)|[^\n])*is `g7:(cal-[^`]*)`(?:\n+(?!#)|[^\n#])*)', txt):
        m = re.search('The epoch markers? ([`_A-Z0-9, and]+) (is|are) permitted', bit.group(1))
        marker = [] if not m else re.findall(r'[A-Z0-9_]+', m[1])
        m = re.match(r'^The ([A-Z][A-Za-z]* )+calendar', bit.group(1))
        if m:
            g7[bit.group(2)] = ('calendar', [bit.group(1)], marker, m.group(0)[4:-9])
        else:
            g7[bit.group(2)] = ('calendar', [bit.group(1)], marker)
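
# Each matching section adds a g7 entry keyed by the 'cal-*' suffix with value
# ('calendar', [section text], [epoch markers]) plus, when the section opens
# with "The <Name> calendar", a fourth element holding <Name> as a label.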

def joint_card(c1, c2):
    """Given two cardinalities, combine them."""
    return '{' + ('1' if c1[1] == c2[1] == '1' else '0') + ':' + ('1' if c1[3] == c2[3] == '1' else 'M') + '}'
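
# Worked examples (cardinalities are '{min:max}' strings as used in gedstruct):
#   joint_card('{1:1}', '{1:1}') == '{1:1}'   # required only if required on both sides
#   joint_card('{1:1}', '{0:M}') == '{0:M}'   # an optional or repeatable side loosens the result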

def parse_rules(txt):
    """Returns {rule:[(card,uri),(card,uri),...]} for each level-n
    production of the rule, even if indirect (via another rule),
    regardless of whether the rule is an alternation or a set."""
    # Find gedstruct context
    rule_becomes = {}
    rule_becomes_rule = {}
    for rule, block, notes in re.findall(r'# *`([A-Z_0-9]+)` *:=\s+```+[^\n]*\n([^`]*)``+[^\n]+((?:[^\n]|\n(?!#))*)', txt):
        for card, uri in re.findall(r'^n [A-Z@][^\n]*(\{.:.\}) *(\S+:\S+)', block, re.M):
            rule_becomes.setdefault(rule, []).append((card, uri))
        for r2, card in re.findall(r'^n <<([^>]*)>>[^\n]*(\{.:.\})', block, re.M):
            rule_becomes_rule.setdefault(rule, []).append((card, r2))
    # Fixed-point rule-to-rule resolution
    again = True
    while again:
        again = False
        for r1, rset in tuple(rule_becomes_rule.items()):
            flat = True
            for c, r2 in rset:
                if r2 in rule_becomes_rule:
                    flat = False
            if flat:
                for c, r2 in rset:
                    rule_becomes.setdefault(r1, []).extend((joint_card(c, c2), uri) for (c2, uri) in rule_becomes[r2])
                del rule_becomes_rule[r1]
            else:
                again = True
    return rule_becomes
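
# Hedged sketch of the fixed-point step (rule and URI names are hypothetical):
# if rule A produces g7:X at {1:1} and also rule B at {0:M}, and B produces
# g7:Y at {1:1}, then A flattens to
#   [('{1:1}', 'g7:X'), ('{0:M}', 'g7:Y')]
# because joint_card('{0:M}', '{1:1}') == '{0:M}'.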

def new_key(val, d, *keys, msg=''):
    """Helper method to add to a (nested) dict, raising if the key is already present with a different value"""
    for k in keys[:-1]:
        d = d.setdefault(k, {})
    if keys[-1] in d:
        if d[keys[-1]] != val:
            raise Exception(msg+'Duplicate key: '+str(keys))
    else: d[keys[-1]] = val
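
# Example (keys and values are illustrative, not from the spec):
#   d = {}
#   new_key('{0:1}', d, 'g7:INDI', 'g7:NAME')   # d == {'g7:INDI': {'g7:NAME': '{0:1}'}}
#   new_key('{0:1}', d, 'g7:INDI', 'g7:NAME')   # same value again: no-op
#   new_key('{1:1}', d, 'g7:INDI', 'g7:NAME')   # different value: raises Exception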

def parse_gedstruct(txt, rules, dtypes):
    """Reads through all gedstruct blocks to find payloads, substructures, and superstructures"""
    sup, sub, payload = {'g7:CONT': []}, {}, {}
    for block in re.findall(r'```[^\n]*gedstruct[^\n]*\n([^`]*)\n```', txt):
        stack = []
        for line in block.split('\n'):
            parts = line.strip().split()
            if len(parts) < 3:
                if line not in ('[', '|', ']'):
                    raise Exception('Invalid gedstruct line: '+repr(line))
                continue
            if parts[1].startswith('@'): del parts[1]
            if parts[0] == 'n': stack = []
            else:
                n = int(parts[0])
                while n < len(stack): stack.pop()
            if parts[1].startswith('<'):
                card = parts[2]
                if len(stack):
                    for c, u in rules[parts[1][2:-2]]:
                        new_key(joint_card(card, c), sup, u, stack[-1], msg='rule sup: ')
                        new_key(joint_card(card, c), sub, stack[-1], u, msg='rule sub: ')
            else:
                uri = parts[-1]
                card = parts[-2]
                if len(parts) > 4:
                    p = ' '.join(parts[2:-2])[1:-1]
                    if p.startswith('<XREF:'): p = '@<https://gedcom.io/terms/v7/record-'+p[6:]+'@'
                    elif p == 'Y|<NULL>': pass
                    else: p = dtypes[p]
                else: p = None
                new_key(p, payload, uri, msg='payload: ')
                if len(stack):
                    new_key(card, sup, uri, stack[-1], msg='line sup: ')
                    new_key(card, sub, stack[-1], uri, msg='line sub: ')
                stack.append(uri)
    return {k: {'sub': sub.get(k, []), 'sup': sup.get(k, []), 'pay': payload.get(k)} for k in sub.keys() | sup.keys() | payload.keys()}
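
# Hedged sketch (the tags, payloads, and URIs below are illustrative, not
# quoted from the spec): a gedstruct block such as
#
#   n NOTE <Text>                     {1:1}  g7:NOTE
#     +1 MIME <MediaType>             {0:1}  g7:NOTE-MIME
#
# contributes entries like
#   result['g7:NOTE']      == {'sub': {'g7:NOTE-MIME': '{0:1}'}, 'sup': [],
#                              'pay': dtypes['Text']}
#   result['g7:NOTE-MIME'] == {'sub': [], 'sup': {'g7:NOTE': '{0:1}'},
#                              'pay': dtypes['MediaType']}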

def find_descriptions(txt, g7, ssp):
    """Collects structure definitions as follows:

    - Sections '#+ TAG (Name) `g7:FULL.TAG`'
    - Sections '#+ `RULE` :=' with only one level-n struct
    - Rows in tables 'Tag | Name<br/>URI | Description'

    Returns a {section header:[list,of,uris]} mapping
    """
    # structure sections
    for name, uri, desc in re.findall(r'#+ `[^`]*`[^\n]*\(([^)]*)\)[^\n]*`([^:`\n]*:[^`\n]*)`[^\n]*\n+((?:\n+(?!#)|[^\n])*)', txt):
        if uri not in ssp:
            raise Exception('Found section for '+uri+' but no gedstruct')
        if uri.startswith('g7:'):
            g7.setdefault(uri[3:], ('structure', [], ssp[uri], name.strip()))[1].extend((
                name.strip(),
                desc.strip()
            ))
            for other in re.findall(r'[Aa] type of `(\S*)`', desc):
                m = re.search('^#+ +`'+other+r'`[^\n`]*\n((?:[^\n]+|\n+(?!#))*)', txt, re.M)
                if m:
                    g7[uri[3:]][1].append(m.group(1).strip())
    # error check that gedstruct and sections align
    for uri in ssp:
        if uri.startswith('g7:') and uri[3:] not in g7:
            raise Exception('Found gedstruct for '+uri+' but no section')
    # gedstruct sections
    for uri, desc in re.findall(r'#+ *`[^`]*` *:=[^\n]*\n+`+[^\n]*\n+n [^\n]*\} *(\S+:\S+) *(?:\n [^\n]*)*\n`+[^\n]*\n+((?:[^\n]|\n(?!#))*)', txt):
        g7[uri[3:]][1].append(desc.strip())
    tagsets = {}
    # tag tables
    for table in re.finditer(r'\n#+ (\S[-A-Za-z0-9 ]*[a-z0-9])[^#]*?Tag *\| *Name[^|\n]*\| *Description[^\n]*((?:\n[^\n|]*\|[^\n|]*\|[^\n]*)*)', txt):
        pfx = ''
        header = table.group(1)
        if header.startswith('Fam'): pfx = 'FAM-'
        if header.startswith('Indi'): pfx = 'INDI-'
        for tag, name, desc in re.findall(r'`([A-Z_0-9]+)` *\| *([^|\n]*?) *\| *([^|\n]*[^ |\n]) *', table.group(2)):
            if '<br' in name: name = name[:name.find('<br')]
            if tag not in g7: tag = pfx+tag
            if tag not in g7:
                raise Exception('Found table for '+tag+' but no section or structure')
            if g7[tag][0] != 'structure':
                raise Exception('Found table for '+tag+' but that\'s a '+g7[tag][0]+' not a structure')
            tagsets.setdefault(header, []).append(tag)
            g7[tag][1].append(name.strip())
            g7[tag][1].append(desc.strip())
    return tagsets
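
# The returned mapping groups structure tags by their table's section header,
# e.g. (header and tags illustrative): {'Individual Events': ['BIRT', 'DEAT', ...]},
# with 'FAM-'/'INDI-' prefixes added where the bare tag is not itself a g7 key.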

def find_enum_by_link(txt, enums, tagsets):
    """Extend enums with the tagsets suggested by any section with #enum- in the header that lacks a table and links to Events or Attributes"""
    # enums.setdefault('g7:enumset-EVENATTR',[]).extend((  ## <- hack because of table omission
    #     'g7:INDI-EVEN',
    #     'g7:FAM-EVEN',
    #     'g7:INDI-FACT',
    #     'g7:FAM-FACT',
    # ))  ## do not do for enumset-EVEN
    for sect in re.finditer(r'# *`(g7:enumset-[^`]*)`[\s\S]*?\n#', txt):
        if '[Events]' in sect.group(0):
            key = sect.group(1).replace('`', '').replace('.', '-')
            for k in tagsets:
                if 'Event' in k:
                    enums.setdefault(key, [])
                    for tag in tagsets[k]:
                        tag = tag.replace('INDI-', 'enum-').replace('FAM-', 'enum-')
                        tag = 'g7:'+tag
                        if tag in enums[key]: continue
                        enums[key].append(tag)
        if '[Attributes]' in sect.group(0):
            key = sect.group(1).replace('`', '').replace('.', '-')
            for k in tagsets:
                if 'Attribute' in k:
                    enums.setdefault(key, [])
                    for tag in tagsets[k]:
                        tag = tag.replace('INDI-', 'enum-').replace('FAM-', 'enum-')
                        tag = 'g7:'+tag
                        if tag in enums[key]: continue
                        enums[key].append(tag)
    # enums.setdefault(key, []).extend(_ for _ in ['g7:'+_2.replace('INDI-','enum-').replace('FAM-','enum-') for _2 in tagsets[k]] if _ not in enums.get(key,[]))

def find_enumsets(txt):
    """Maps each g7 URI whose section says its values come "from set `g7:enumset-...`" to that enumeration set URI"""
    res = {}
    for sect in re.finditer(r'# *[^\n]*?`(g7:[^`]*)`([\s\S]*?)\n#', txt):
        if 'from set `g7:enumset-' in sect.group(2):
            key = sect.group(1)
            val = re.search(r'from set `(g7:enumset-[^`]*)`', sect.group(2)).group(1)
            res[key] = val
    return res

def tidy_markdown(md, indent, width=79):
    r"""Normalizes a Markdown fragment for embedding in YAML: expands short
    prefixes, drops link targets and inline classes, reflows to `width`, and
    indents every line after the first by `indent` spaces.

    The markdown files in the specification directory use the following Markdown dialect:

    Part of GFM:

    - setext headers with classes like `{.unnumbered}`, unlisting marks like `{-}`, and anchors like `{#container}`
    - language-specific code blocks with both <code>\`\`\`gedcom</code> and <code>\`\`\` {.gedstruct .long}</code> headers
    - markdown inside HTML between lines `<div class="example">` and `</div>` (only inside lists) and between definition list tags `<dl>`, `<dt>`, and `<dd>` (only in acknowledgements)
    - code blocks with no leading blank line
    - tables with `|---|:--|` format
    - tables with `--- | --- | ---` format

    Not part of GFM:

    - YAML front matter
    - divs with `:::class` headers and `:::` footers
    - automatic links with `[name of section to link to]`
    - inline code with class `1 NO MARR`{.gedcom} (used only once)

    Requires the GFM plugin for mdformat: pip install mdformat-gfm
    """
    global prefixes
    for k, v in prefixes.items():
        md = re.sub(r'\b'+k+':', v, md)
    # for now ignoring YAML frontmatter
    md = re.sub(r':::(\S+)', r'<div class="\1">\n', md)  # convert ::: div openers to <div>s
    md = re.sub(r':::', '\n</div>', md)  # convert ::: div closers to </div>s
    md = re.sub(r'\]\([^\)]*\)({[^}]*})?', ']', md)  # remove links
    md = re.sub(r'`{\.\S+\}', '`', md)  # remove inline code classes
    import mdformat
    out = mdformat.text(md, extensions={"gfm"}, options={"number": True, "wrap": width})
    return out.rstrip().replace('\n', '\n'+' '*indent).replace('\\[', '[').replace('\\]', ']')

def yaml_str_helper(pfx, md, width=79):
    """Emits `pfx` plus tidied markdown, switching to a YAML block scalar or quoted string when the plain form would be misparsed"""
    txt = tidy_markdown(md, len(pfx), width)
    if ('\n'+' '*len(pfx)+'\n') in txt: return pfx + '|\n' + ' '*len(pfx) + txt
    if ': ' in txt or txt.startswith('@'):
        if '"' in txt: return pfx + '|\n' + ' '*len(pfx) + txt
        else: return pfx + '"' + txt + '"'
    return pfx + txt
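
# Behavior sketch (inputs illustrative; tidy_markdown may reflow the text):
#   yaml_str_helper(' - ', 'Plain text')        -> ' - Plain text'
#   yaml_str_helper(' - ', 'Looks like: a map') -> ' - "Looks like: a map"'
# A result containing a blank line is instead emitted as a block scalar:
# ' - |' followed by indented lines.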

def expand_prefix(txt, prefixes):
    """Replaces a leading short prefix (longest match first) with its full URI prefix"""
    for key in sorted(prefixes.keys(), key=lambda x: -len(x)):
        k = key+':'
        if txt.startswith(k):
            return prefixes[key] + txt[len(k):]
    return txt
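
# Example (the prefix table is illustrative):
#   expand_prefix('g7:record-INDI', {'g7': 'https://gedcom.io/terms/v7/'})
#   == 'https://gedcom.io/terms/v7/record-INDI'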

if __name__ == '__main__':
    # URI definitions
    g7 = {}
    specs, dest = get_paths()
    txt = get_text(specs)
    prefixes = get_prefixes(txt)
    dtypes = find_data_types(txt, g7)
    rules = parse_rules(txt)
    ssp = parse_gedstruct(txt, rules, dtypes)
    tagsets = find_descriptions(txt, g7, ssp)
    enums, calendars = find_cat_tables(txt, g7, tagsets)
    find_enum_by_link(txt, enums, tagsets)
    for k in enums:
        g7[k[3:]] = ('enumeration set', [])
    enumsets = find_enumsets(txt)
    find_calendars(txt, g7)

    struct_lookup = []
    enum_lookup = []
    enumset_lookup = []
    payload_lookup = []
    cardinality_lookup = []

    for tag in g7:
        print('outputting', tag, '...', end=' ')
        maybe = join(dirname(specs[0]), 'terms', tag)
        if exists(maybe):
            copyfile(maybe, join(dest, tag))
            print('by copying', maybe, '...', end=' ')
            continue
        with open(join(dest, tag), 'w') as fh:
            fh.write('%YAML 1.2\n---\n')
            print('lang: en-US', file=fh)
            print('\ntype:', g7[tag][0], file=fh)
            # error: type-DATE# type-List#
            uri = expand_prefix('g7:'+tag, prefixes)
            print('\nuri:', uri, file=fh)
            if g7[tag][0] in ('structure', 'enumeration', 'calendar', 'month'):
                ptag = re.sub(r'.*-', '', re.sub(r'-[A-Z]?[a-z].*', '', tag))
                print('\nstandard tag: '+(repr(ptag) if ptag in ('YES', 'NO', 'TRUE', 'FALSE', '0', '1', '2', '3') else ptag), file=fh)
            if len(g7[tag][1]) > 0:
                print('\nspecification:', file=fh)
                for desc in g7[tag][1]:
                    print(yaml_str_helper(' - ', desc), file=fh)
            if len(g7[tag]) > 3:
                print('\nlabel:', repr(g7[tag][3]), file=fh)
            if g7[tag][0] == 'structure':
                d = g7[tag][2]
                payload = expand_prefix(d['pay'], prefixes) if d['pay'] is not None else 'null'
                if payload[0] == '@' or ': ' in payload:
                    print('\npayload: "'+payload+'"', file=fh)
                else:
                    print('\npayload:', payload, file=fh)
                payload_lookup.append([uri, payload if payload != 'null' else ''])
                if d['pay'] and 'Enum' in d['pay']:
                    setname = expand_prefix(enumsets['g7:'+tag], prefixes)
                    print('\nenumeration set: "'+setname+'"', file=fh)
                    enum_lookup.append([uri, setname])
                    # print('\nenumeration values:', file=fh)
                    # for k in sorted(enums[tag]):
                    #     penum = re.sub(r'.*[-:/]', '', k)
                    #     puri = expand_prefix(k, prefixes)
                    #     print(' -', expand_prefix(k, prefixes), file=fh)
                    #     enum_lookup.append([uri, penum, puri])
                if d['sub']:
                    print('\nsubstructures:', file=fh)
                    for k, v in sorted(d['sub'].items()):
                        print(' "'+expand_prefix(k, prefixes)+'": "'+v+'"', file=fh)
                else: print('\nsubstructures: {}', file=fh)
                if d['sup']:
                    print('\nsuperstructures:', file=fh)
                    for k, v in sorted(d['sup'].items()):
                        suri = expand_prefix(k, prefixes)
                        print(' "'+suri+'": "'+v+'"', file=fh)
                        struct_lookup.append([suri, ptag, uri])
                        cardinality_lookup.append([suri, uri, v])
                else:
                    print('\nsuperstructures: {}', file=fh)
                    struct_lookup.append(['', ptag, uri])
            elif g7[tag][0] == 'calendar':
                print('\nmonths:', file=fh)
                for k in calendars['g7:'+tag]:
                    print(' - "'+expand_prefix(k, prefixes)+'"', file=fh)
                if len(g7[tag][2]) == 0:
                    print('\nepochs: []', file=fh)
                else:
                    print('\nepochs:', file=fh)
                    for epoch in g7[tag][2]:
                        print(' -', epoch, file=fh)
            elif g7[tag][0] == 'month':
                print('\ncalendars:', file=fh)
                for k in calendars['g7:'+tag]:
                    print(' - "'+expand_prefix(k, prefixes)+'"', file=fh)
            elif g7[tag][0] == 'enumeration set':
                print('\nenumeration values:', file=fh)
                for k in enums['g7:'+tag]:
                    valname = expand_prefix(k, prefixes)
                    print(' - "'+valname+'"', file=fh)
                    enumset_lookup.append([uri, valname])
            # handle use in enumerations (which can include any tag type)
            is_used_by = False
            for tag2 in sorted(enums):
                if ('g7:'+tag) in enums[tag2]:
                    if not is_used_by:
                        print('\nvalue of:', file=fh)
                        is_used_by = True
                    print(' - "'+expand_prefix(tag2, prefixes)+'"', file=fh)
            print('\ncontact: "https://gedcom.io/community/"', file=fh)
            fh.write('...\n')
        print('done')

    for path in glob(join(dirname(specs[0]), 'terms', '*')):
        tag = basename(path)
        if tag not in g7:
            print('copying', tag, '...', end=' ')
            copyfile(path, join(dest, tag))
            print('done')

    if dest.endswith('/'): dest = dest[:-1]
    base = dirname(dest)
    for data, name in [
        (struct_lookup, join(base, 'substructures.tsv')),
        (enum_lookup, join(base, 'enumerations.tsv')),
        (enumset_lookup, join(base, 'enumerationsets.tsv')),
        (payload_lookup, join(base, 'payloads.tsv')),
        (cardinality_lookup, join(base, 'cardinalities.tsv')),
    ]:
        print('outputting', name, '...', end=' ')
        with open(name, 'w') as f:
            for row in data:
                print('\t'.join(row), file=f)
        print('done')