-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate.py
executable file
·1393 lines (1299 loc) · 70.2 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/python
# Original code (2015) by Filip Ginter and Sampo Pyysalo.
# DZ 2018-11-04: Porting the validator to Python 3.
import fileinput
import sys
import io
import os.path
import logging
# According to https://stackoverflow.com/questions/1832893/python-regex-matching-unicode-properties,
# the regex module has the same API as re but it can check Unicode character properties using \p{}
# as in Perl.
#import re
import regex as re
import traceback
import argparse
THISDIR=os.path.dirname(os.path.abspath(__file__)) # The folder where this script resides.
# Constants for the column indices
COLCOUNT=10
ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC=range(COLCOUNT)
COLNAMES='ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC'.split(',')
TOKENSWSPACE=MISC+1 #one extra constant
# Global variables:
curr_line=0 # Current line in the input file
sentence_line=0 # The line in the input file on which the current sentence starts
sentence_id=None # The most recently read sentence id
line_of_first_empty_node=None
line_of_first_enhanced_orphan=None
error_counter={} # key: error type value: error count
warn_on_missing_files=set() # langspec files which you should warn about in case they are missing (can be deprel, edeprel, feat_val, tokens_w_space)
def warn(msg, error_type, lineno=True, nodelineno=0):
    """
    Print a warning to STDERR and count it in the global error_counter.

    msg ........ text of the warning
    error_type . category string (e.g. 'Format', 'Metadata', 'Morpho',
                 'Syntax'); used as the key in error_counter and for the
                 per-category --max-err suppression threshold
    lineno ..... if True, print the number of the line last read from input.
                 Note that once we have read a sentence, this is the number
                 of the empty line after the sentence, hence we probably do
                 not want to print it. If False, print the tree number and
                 the starting line of the current tree instead.
    nodelineno . if we have an error that pertains to an individual node and
                 we know the number of the line where the node appears, it
                 can be supplied here. Nonzero nodelineno means that the
                 lineno value is ignored.
    """
    global curr_fname, curr_line, sentence_line, sentence_id, error_counter, tree_counter, args
    error_counter[error_type] = error_counter.get(error_type, 0)+1
    if not args.quiet:
        # Keep counting but stop printing once the per-category limit is hit.
        if args.max_err>0 and error_counter[error_type]==args.max_err:
            print(('...suppressing further errors regarding ' + error_type), file=sys.stderr)
        elif args.max_err>0 and error_counter[error_type]>args.max_err:
            pass #suppressed
        else:
            if len(args.input)>1: #several files, should report which one
                if curr_fname=="-":
                    fn="(in STDIN) "
                else:
                    fn="(in "+os.path.basename(curr_fname)+") "
            else:
                fn=""
            sent = ''
            # Global variable (last read sentence id): sentence_id
            # Originally we used a parameter sid but we probably do not need to override the global value.
            if sentence_id:
                sent = ' Sent ' + sentence_id
            if nodelineno:
                print("[%sLine %d%s]: %s" % (fn, nodelineno, sent, msg), file=sys.stderr)
            elif lineno:
                print("[%sLine %d%s]: %s" % (fn, curr_line, sent, msg), file=sys.stderr)
            else:
                print("[%sTree number %d on line %d%s]: %s" % (fn, tree_counter, sentence_line, sent, msg), file=sys.stderr)
###### Support functions
def is_whitespace(line):
    """Match object (truthy) iff the line consists entirely of whitespace characters."""
    return re.fullmatch(r"\s+", line)
def is_word(cols):
    """Match object (truthy) iff the ID column is a plain positive integer, i.e. a regular word line."""
    return re.fullmatch(r"[1-9][0-9]*", cols[ID])
def is_multiword_token(cols):
    """Match object (truthy) iff the ID column is an integer range like '4-5', i.e. a multiword token line."""
    return re.fullmatch(r"[1-9][0-9]*-[1-9][0-9]*", cols[ID])
def is_empty_node(cols):
    """Match object (truthy) iff the ID column is a decimal id like '3.1', i.e. an empty node line."""
    return re.fullmatch(r"[0-9]+\.[1-9][0-9]*", cols[ID])
def parse_empty_node_id(cols):
    """Split an empty-node id such as '5.2' into its string parts ('5', '2')."""
    parts = re.fullmatch(r"([0-9]+)\.([0-9]+)", cols[ID])
    assert parts, 'parse_empty_node_id with non-empty node'
    return parts.groups()
def shorten(string):
    """Abbreviate a long string for error messages: keep the first 20 chars plus '[...]'."""
    if len(string) < 25:
        return string
    return string[:20] + '[...]'
def lspec2ud(deprel):
    """Strip any language-specific subtype, keeping only the universal part of a relation."""
    universal, _, _ = deprel.partition(':')
    return universal
#==============================================================================
# Level 1 tests. Only CoNLL-U backbone. Values can be empty or non-UD.
#==============================================================================
sentid_re=re.compile('^# sent_id\s*=\s*(\S+)$')
def trees(inp, tag_sets, args):
    """
    Read the input CoNLL-U stream and yield one sentence at a time.

    `inp` a file-like object yielding lines as unicode
    `tag_sets` and `args` are needed for choosing the tests

    This function does elementary checking of the input and yields one
    sentence at a time from the input stream, as a (comments, lines) tuple
    where `comments` is a list of comment strings and `lines` is a list of
    column lists (one per token/word line).
    """
    global curr_line, sentence_line, sentence_id
    comments=[] # List of comment lines to go with the current sentence
    lines=[] # List of token/word lines of the current sentence
    for line_counter, line in enumerate(inp):
        curr_line=line_counter+1
        line=line.rstrip(u"\n")
        if is_whitespace(line):
            warn('Spurious line that appears empty but is not; there are whitespace characters.', 'Format')
            # We will pretend that the line terminates a sentence in order to avoid subsequent misleading error messages.
            if lines:
                yield comments, lines
                comments=[]
                lines=[]
        elif not line: # empty line
            if lines: # sentence done
                yield comments, lines
                comments=[]
                lines=[]
            else:
                warn('Spurious empty line. Only one empty line is expected after every sentence.', 'Format')
        elif line[0]=='#':
            # We will really validate sentence ids later. But now we want to remember
            # everything that looks like a sentence id and use it in the error messages.
            # Line numbers themselves may not be sufficient if we are reading multiple
            # files from a pipe.
            match = sentid_re.match(line)
            if match:
                sentence_id = match.group(1)
            if not lines: # before sentence
                comments.append(line)
            else:
                warn('Spurious comment line. Comments are only allowed before a sentence.', 'Format')
        elif line[0].isdigit():
            if not lines: # new sentence
                sentence_line=curr_line
            cols=line.split(u"\t")
            if len(cols)!=COLCOUNT:
                warn('The line has %d columns but %d are expected.'%(len(cols), COLCOUNT), 'Format')
            lines.append(cols)
            validate_cols_level1(cols)
            if args.level > 1:
                validate_cols(cols,tag_sets,args)
        else: # A line which is neither a comment nor a token/word, nor empty. That's bad!
            warn("Spurious line: '%s'. All non-empty lines should start with a digit or the # character."%(line), 'Format')
    else: # end of file
        if comments or lines: # These should have been yielded on an empty line!
            warn('Missing empty line after the last tree.', 'Format')
            yield comments, lines
###### Tests applicable to a single row indpendently of the others
# Matches a value containing any whitespace character at all.
whitespace_re=re.compile('.*\s',re.U)
# Matches a value containing two adjacent whitespace characters.
whitespace2_re=re.compile('.*\s\s', re.U)
def validate_cols_level1(cols):
    """
    Tests that can run on a single line and pertain only to the CoNLL-U file
    format, not to predefined sets of UD tags.
    """
    # Some whitespace may be permitted in FORM, LEMMA and MISC, but no column
    # may be empty, begin/end with whitespace, or contain doubled whitespace.
    for col_idx in range(MISC+1):
        if col_idx >= len(cols):
            break # this has been already reported in trees()
        value = cols[col_idx]
        if not value:
            warn('Empty value in column %s'%(COLNAMES[col_idx]), 'Format')
            continue
        if value[0].isspace():
            warn('Initial whitespace not allowed in column %s'%(COLNAMES[col_idx]), 'Format')
        if value[-1].isspace():
            warn('Trailing whitespace not allowed in column %s'%(COLNAMES[col_idx]), 'Format')
        if whitespace2_re.match(value):
            warn('Two or more consecutive whitespace characters not allowed in column %s'%(COLNAMES[col_idx]), 'Format')
    # These columns must not contain any whitespace whatsoever.
    for col_idx in (ID,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS):
        if col_idx >= len(cols):
            break # this has been already reported in trees()
        if whitespace_re.match(cols[col_idx]):
            warn("White space not allowed in the %s column: '%s'"%(COLNAMES[col_idx], cols[col_idx]), 'Format')
    # The ID must be a word id, an empty node id, or a multiword token range.
    if not (is_word(cols) or is_empty_node(cols) or is_multiword_token(cols)):
        warn("Unexpected ID format '%s'" % cols[ID], 'Format')
##### Tests applicable to the whole tree
interval_re=re.compile('^([0-9]+)-([0-9]+)$',re.U)
def validate_ID_sequence(tree):
    """
    Validates that the ID sequence is correctly formed.

    Words must be numbered 1,2,3,... without gaps; a multiword token interval
    must start immediately before its first word and must not reach outside
    the word range; empty node ids must continue the x.1, x.2, ... sequence
    after word x (or after 0 at the start of the sentence).
    """
    words=[]
    tokens=[] # list of (begin, end) intervals, one per surface token
    current_word_id, next_empty_id = 0, 1
    for cols in tree:
        if not is_empty_node(cols):
            next_empty_id = 1 # reset sequence
        if is_word(cols):
            t_id=int(cols[ID])
            current_word_id = t_id
            words.append(t_id)
            # Not covered by the previous interval?
            if not (tokens and tokens[-1][0]<=t_id and tokens[-1][1]>=t_id):
                tokens.append((t_id,t_id)) # nope - let's make a default interval for it
        elif is_multiword_token(cols):
            match=interval_re.match(cols[ID]) # Check the interval against the regex
            if not match:
                warn("Spurious token interval definition: '%s'."%cols[ID], 'Format', lineno=False)
                continue
            beg,end=int(match.group(1)),int(match.group(2))
            # The interval must begin right after the last word seen so far
            # (or at 1 if no word has been seen yet).
            if not ((not words and beg == 1) or (words and beg == words[-1]+1)):
                warn('Multiword range not before its first word', 'Format')
                continue
            tokens.append((beg,end))
        elif is_empty_node(cols):
            word_id, empty_id = (int(i) for i in parse_empty_node_id(cols))
            if word_id != current_word_id or empty_id != next_empty_id:
                warn('Empty node id %s, expected %d.%d' %
                     (cols[ID], current_word_id, next_empty_id), 'Format')
            next_empty_id += 1
    # Now let's do some basic sanity checks on the sequences
    wrdstrseq = ','.join(str(x) for x in words)
    expstrseq = ','.join(str(x) for x in range(1, len(words)+1)) # Words should form a sequence 1,2,...
    if wrdstrseq != expstrseq:
        warn("Words do not form a sequence. Got '%s'. Expected '%s'."%(wrdstrseq, expstrseq), 'Format', lineno=False)
    # Check elementary sanity of word intervals
    for (b,e) in tokens:
        if e<b: # end before beginning
            warn('Spurious token interval %d-%d'%(b,e), 'Format')
            continue
        if b<1 or e>len(words): # out of range
            warn('Spurious token interval %d-%d (out of range)'%(b,e), 'Format')
            continue
def validate_token_ranges(tree):
    """
    Checks that the word ranges for multiword tokens are valid:
    each range must have start < end and must not overlap another range.
    """
    covered = set()
    for cols in tree:
        if not is_multiword_token(cols):
            continue
        m = interval_re.match(cols[ID])
        if not m:
            warn('Failed to parse ID %s' % cols[ID], 'Format')
            continue
        # The regex guarantees both groups are decimal integers.
        start, end = (int(g) for g in m.groups())
        if start >= end:
            warn('Invalid range: %s' % cols[ID], 'Format')
            continue
        span = set(range(start, end+1))
        if covered & span:
            warn('Range overlaps with others: %s' % cols[ID], 'Format')
        covered |= span
def validate_newlines(inp):
    """Complain unless the input stream has seen only unix-style LF line terminators."""
    seen = inp.newlines
    if seen and seen != '\n':
        warn('Only the unix-style LF line terminator is allowed', 'Format')
#==============================================================================
# Level 2 tests. Tree structure, universal tags and deprels. Note that any
# well-formed Feature=Valid pair is allowed (because it could be language-
# specific) and any word form or lemma can contain spaces (because language-
# specific guidelines may permit it).
#==============================================================================
###### Metadata tests #########
def validate_sent_id(comments,known_ids,lcode):
    """
    Check the sent_id sentence-level comment: it must occur exactly once, be
    well-formed, unique within this file, and use '/' only where permitted.
    """
    matched=[]
    for c in comments:
        m = sentid_re.match(c)
        if m:
            matched.append(m)
        elif c.startswith('# sent_id') or c.startswith('#sent_id'):
            # Meant to be a sent_id line but does not have the required shape.
            warn("Spurious sent_id line: '%s' Should look like '# sent_id = xxxxx' where xxxxx is not whitespace. Forward slash reserved for special purposes." %c, 'Metadata')
    if not matched:
        warn('Missing the sent_id attribute.', 'Metadata')
    elif len(matched)>1:
        warn('Multiple sent_id attributes.', 'Metadata')
    else:
        # Uniqueness of sentence ids should be tested treebank-wide, not just file-wide.
        # For that to happen, all three files should be tested at once.
        sid=matched[0].group(1)
        if sid in known_ids:
            warn('Non-unique sent_id the sent_id attribute: '+sid, 'Metadata')
        if sid.count(u"/")>1 or (sid.count(u"/")==1 and lcode!=u"ud" and lcode!=u"shopen"):
            warn('The forward slash is reserved for special use in parallel treebanks: '+sid, 'Metadata')
        known_ids.add(sid)
text_re=re.compile('^# text\s*=\s*(.+)$')
def validate_text_meta(comments,tree):
    """
    Check the '# text = ...' comment: it must occur exactly once, and its
    value must match the concatenation of the FORM fields, guided by the
    SpaceAfter=No attribute in MISC.
    """
    matched=[]
    for c in comments:
        match=text_re.match(c)
        if match:
            matched.append(match)
    if not matched:
        warn('Missing the text attribute.', 'Metadata')
    elif len(matched)>1:
        warn('Multiple text attributes.', 'Metadata')
    else:
        stext=matched[0].group(1)
        if stext[-1].isspace():
            warn('The text attribute must not end with whitespace', 'Metadata')
        # Validate the text against the SpaceAfter attribute in MISC.
        # We walk the tree and repeatedly "eat" the matching prefix of stext.
        skip_words=set()
        mismatch_reported=0 # do not report multiple mismatches in the same sentence; they usually have the same cause
        for cols in tree:
            if MISC >= len(cols):
                # This error has been reported elsewhere but we cannot check MISC now.
                continue
            if 'NoSpaceAfter=Yes' in cols[MISC]: # I leave this without the split("|") to catch all
                warn('NoSpaceAfter=Yes should be replaced with SpaceAfter=No', 'Metadata')
            if '.' in cols[ID]: # empty word
                if 'SpaceAfter=No' in cols[MISC]: # I leave this without the split("|") to catch all
                    warn('There should not be a SpaceAfter=No entry for empty words', 'Metadata')
                continue
            elif '-' in cols[ID]: # multi-word token
                beg,end=cols[ID].split('-')
                try:
                    begi,endi = int(beg),int(end)
                except ValueError as e:
                    warn('Non-integer range %s-%s (%s)'%(beg,end,e), 'Format')
                    begi,endi=1,0
                # If we see a MWtoken, add its words to an ignore-set - these will be skipped, and also checked for absence of SpaceAfter=No
                for i in range(begi, endi+1):
                    skip_words.add(str(i))
            elif cols[ID] in skip_words:
                if 'SpaceAfter=No' in cols[MISC]:
                    warn('There should not be a SpaceAfter=No entry for words which are a part of a multi-word token', 'Metadata')
                continue
            else:
                # Err, I guess we have nothing to do here. :)
                pass
            # So now we have either a MWtoken or a word which is also a token in its entirety
            if not stext.startswith(cols[FORM]):
                if not mismatch_reported:
                    warn("Mismatch between the text attribute and the FORM field. Form[%s] is '%s' but text is '%s...'" %(cols[ID], cols[FORM], stext[:len(cols[FORM])+20]), 'Metadata', False)
                    mismatch_reported=1
            else:
                stext=stext[len(cols[FORM]):] # eat the form
                if 'SpaceAfter=No' not in cols[MISC].split("|"):
                    if args.check_space_after and (stext) and not stext[0].isspace():
                        warn("SpaceAfter=No is missing in the MISC field of node #%s because the text is '%s'" %(cols[ID], shorten(cols[FORM]+stext)), 'Metadata')
                    stext=stext.lstrip()
        if stext:
            warn("Extra characters at the end of the text attribute, not accounted for in the FORM fields: '%s'"%stext, 'Metadata')
##### Tests applicable to a single row indpendently of the others
def validate_cols(cols, tag_sets, args):
    """
    All tests that can run on a single line. Done as soon as the line is read,
    called from trees() if level>1.
    """
    word = is_word(cols)
    empty = is_empty_node(cols)
    if word or empty:
        validate_character_constraints(cols) # level 2
        validate_features(cols, tag_sets, args) # level 2 and up (relevant code checks whether higher level is required)
        validate_pos(cols,tag_sets) # level 2
    elif is_multiword_token(cols):
        validate_token_empty_vals(cols)
    # else do nothing; we have already reported wrong ID format at level 1
    if word:
        validate_deprels(cols, tag_sets, args) # level 2 and up
    elif empty:
        validate_empty_node_empty_vals(cols) # level 2
    # TODO check also the following:
    # - DEPS are connected and non-acyclic
    # (more, what?)
    if args.level > 3:
        validate_whitespace(cols, tag_sets) # level 4 (it is language-specific; to disallow everywhere, use --lang ud)
def validate_token_empty_vals(cols):
    """
    Checks that a multi-word token has _ empty values in all fields except MISC.
    This is required by UD guidelines although it is not a problem in general,
    therefore a level 2 test.
    """
    assert is_multiword_token(cols), 'internal error'
    # Everything from LEMMA through DEPS (i.e. all but ID, FORM and MISC) must be '_'.
    for col_idx in range(LEMMA, MISC):
        value = cols[col_idx]
        if value != u"_":
            warn("A multi-word token line must have '_' in the column %s. Now: '%s'."%(COLNAMES[col_idx], value), 'Format')
def validate_empty_node_empty_vals(cols):
    """
    Checks that an empty node has _ empty values in HEAD and DEPREL. This is
    required by UD guidelines but not necessarily by CoNLL-U, therefore
    a level 2 test.
    """
    assert is_empty_node(cols), 'internal error'
    for col_idx, value in ((HEAD, cols[HEAD]), (DEPREL, cols[DEPREL])):
        if value != u"_":
            warn("An empty node must have '_' in the column %s. Now: '%s'."%(COLNAMES[col_idx], value), 'Format')
# Ll ... lowercase Unicode letters
# Lm ... modifier Unicode letters (e.g., superscript h)
# Lo ... other Unicode letters (all caseless scripts, e.g., Arabic)
# M .... combining diacritical marks
# Underscore is allowed between letters but not at beginning, end, or next to another underscore.
# Raw strings keep Python from trying to interpret the \p escapes itself
# (non-raw '\p' triggers a DeprecationWarning and is a SyntaxError in future
# Python versions); the stray trailing semicolon is also gone.
edeprelpart_resrc = r'[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*'
# There must be always the universal part, consisting only of ASCII letters.
# There can be up to three additional, colon-separated parts: subtype, preposition and case.
# One of them, the preposition, may contain Unicode letters. We do not know which one it is
# (only if there are all four parts, we know it is the third one).
# ^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$
edeprel_resrc = '^[a-z]+(:[a-z]+)?(:' + edeprelpart_resrc + ')?(:[a-z]+)?$'
edeprel_re = re.compile(edeprel_resrc, re.U)
def validate_character_constraints(cols):
    """
    Checks general constraints on valid characters, e.g. that UPOS
    only contains [A-Z] and that DEPS relations match edeprel_re.
    """
    if is_multiword_token(cols):
        return
    if UPOS >= len(cols):
        return # this has been already reported in trees()
    if not (re.match(r"^[A-Z]+$", cols[UPOS]) or
            (is_empty_node(cols) and cols[UPOS] == u"_")):
        warn('Invalid UPOS value %s' % cols[UPOS], 'Morpho')
    if not (re.match(r"^[a-z]+(:[a-z]+)?$", cols[DEPREL]) or
            (is_empty_node(cols) and cols[DEPREL] == u"_")):
        warn('Invalid DEPREL value %s' % cols[DEPREL], 'Syntax')
    try:
        deps = deps_list(cols)
    except ValueError:
        # Wording made consistent with the other DEPS parse errors
        # (was the typo 'Failed for parse DEPS').
        warn('Failed to parse DEPS: %s' % cols[DEPS], 'Syntax')
        return
    # Reuse the already parsed list instead of calling deps_list() a second time.
    if any(deprel for head, deprel in deps
           if not edeprel_re.match(deprel)):
        warn('Invalid value in DEPS: %s' % cols[DEPS], 'Syntax')
# Matches one feature, e.g. 'Case=Nom' or 'Number[psor]=Sing,Plur'.
# Group 1 is the attribute name (with optional layer in brackets),
# group 2 the (possibly comma-separated) list of values.
attr_val_re=re.compile('^([A-Z0-9][A-Z0-9a-z]*(?:\[[a-z0-9]+\])?)=(([A-Z0-9][A-Z0-9a-z]*)(,([A-Z0-9][A-Z0-9a-z]*))*)$',re.U)
# Matches one well-formed feature value.
val_re=re.compile('^[A-Z0-9][A-Z0-9a-z]*',re.U)
def validate_features(cols, tag_sets, args):
    """
    Checks general constraints on feature-value format. On level 4 and higher,
    also checks that a feature-value pair is listed as approved. (Every pair
    must be allowed on level 2 because it could be defined as language-specific.
    To disallow non-universal features, test on level 4 with language 'ud'.)
    """
    if FEATS >= len(cols):
        return # this has been already reported in trees()
    feats=cols[FEATS]
    if feats==u"_":
        return True
    feat_list=feats.split(u"|")
    # Feature=Value pairs must be sorted case-insensitively by the whole pair.
    if [f.lower() for f in feat_list]!=sorted(f.lower() for f in feat_list):
        warn("Morphological features must be sorted: '%s'"%feats, 'Morpho')
    attr_set=set() # I'll gather the set of attributes here to check later that none is repeated
    for f in feat_list:
        match=attr_val_re.match(f)
        if match is None:
            warn("Spurious morphological feature: '%s'. Should be of the form attribute=value and must start with [A-Z0-9] and only contain [A-Za-z0-9]."%f, 'Morpho')
            attr_set.add(f) # to prevent misleading error "Repeated features are disallowed"
        else:
            # Check that the values are sorted as well
            attr=match.group(1)
            attr_set.add(attr)
            values=match.group(2).split(u",")
            if len(values)!=len(set(values)):
                warn("Repeated feature values are disallowed: %s"%feats, 'Morpho')
            if [v.lower() for v in values]!=sorted(v.lower() for v in values):
                warn("If an attribute has multiple values, these must be sorted as well: '%s'"%f, 'Morpho')
            for v in values:
                if not val_re.match(v):
                    warn("Incorrect value '%s' in '%s'. Must start with [A-Z0-9] and only contain [A-Za-z0-9]."%(v,f), 'Morpho')
                # Level 2 tests character properties and canonical order but not that the f-v pair is known.
                # Level 4 also checks whether the feature value is on the list.
                # If only universal feature-value pairs are allowed, test on level 4 with lang='ud'.
                if args.level > 3 and tag_sets[FEATS] is not None and attr+'='+v not in tag_sets[FEATS]:
                    warn_on_missing_files.add("feat_val")
                    warn('Unknown attribute-value pair %s=%s'%(attr,v), 'Morpho')
    if len(attr_set)!=len(feat_list):
        warn('Repeated features are disallowed: %s'%feats, 'Morpho')
def validate_upos(cols,tag_sets):
    """Warn if the UPOS tag is not in the list of known universal POS tags."""
    if UPOS >= len(cols):
        return # this has been already reported in trees()
    known = tag_sets[UPOS]
    if known is not None and cols[UPOS] not in known:
        warn('Unknown UPOS tag: %s'%cols[UPOS], 'Morpho')
def validate_xpos(cols,tag_sets):
    """Warn if the XPOS tag is not in the list of known language-specific tags."""
    if XPOS >= len(cols):
        return # this has been already reported in trees()
    # We currently do not have any list of known XPOS tags, hence tag_sets[XPOS] is None.
    known = tag_sets[XPOS]
    if known is not None and cols[XPOS] not in known:
        warn('Unknown XPOS tag: %s'%cols[XPOS], 'Morpho')
def validate_pos(cols,tag_sets):
    """Run the UPOS/XPOS list checks; empty nodes are exempt where the tag is '_'."""
    empty = is_empty_node(cols)
    if not (empty and cols[UPOS] == '_'):
        validate_upos(cols, tag_sets)
    if not (empty and cols[XPOS] == '_'):
        validate_xpos(cols, tag_sets)
def validate_deprels(cols, tag_sets, args):
    """
    Check DEPREL against the list of known relations and every relation in
    DEPS against the list of known enhanced relations.
    """
    if DEPREL >= len(cols):
        return # this has been already reported in trees()
    # Test only the universal part if testing at universal level.
    deprel = cols[DEPREL] if args.level >= 4 else lspec2ud(cols[DEPREL])
    if tag_sets[DEPREL] is not None and deprel not in tag_sets[DEPREL]:
        warn_on_missing_files.add("deprel")
        warn('Unknown UD DEPREL: %s'%cols[DEPREL], 'Syntax')
    if tag_sets[DEPS] is not None and cols[DEPS]!='_':
        for head_deprel in cols[DEPS].split(u"|"):
            try:
                head,deprel=head_deprel.split(u":",1)
            except ValueError:
                warn("Malformed head:deprel pair '%s'"%head_deprel, 'Syntax')
                continue
            if args.level < 4:
                deprel = lspec2ud(deprel)
            if deprel not in tag_sets[DEPS]:
                warn_on_missing_files.add("edeprel")
                warn("Unknown enhanced dependency relation '%s' in '%s'"%(deprel,head_deprel), 'Syntax')
##### Tests applicable to the whole sentence
def subset_to_words_and_empty_nodes(tree):
    """
    Only picks word and empty node lines, skips multiword token lines.
    """
    picked = []
    for cols in tree:
        if is_word(cols) or is_empty_node(cols):
            picked.append(cols)
    return picked
def deps_list(cols):
    """
    Parse the DEPS column into a list of [head, deprel] string pairs.

    Returns [] both for '_' (no enhanced dependencies) and for rows that are
    too short to have a DEPS column at all. The latter case used to return
    None (bare return), which crashed callers that iterate the result
    without first guarding the column count (e.g. validate_ID_references);
    the short row itself has already been reported in trees().

    Raises ValueError if some pair lacks the ':' head/deprel separator.
    """
    if DEPS >= len(cols):
        return [] # too few columns; this has been already reported in trees()
    if cols[DEPS] == '_':
        deps = []
    else:
        deps = [hd.split(':',1) for hd in cols[DEPS].split('|')]
    if any(hd for hd in deps if len(hd) != 2):
        raise ValueError('malformed DEPS: %s' % cols[DEPS])
    return deps
def validate_ID_references(tree):
    """
    Validates that HEAD and DEPS reference existing IDs.
    """
    word_tree = subset_to_words_and_empty_nodes(tree)
    known = {cols[ID] for cols in word_tree}
    known.add('0') # the artificial root is always a legal head
    for cols in word_tree:
        if HEAD >= len(cols):
            return # this has been already reported in trees()
        # An empty node may have '_' in HEAD; anything else must be a known id.
        head_ok = cols[HEAD] in known or (cols[HEAD] == '_' and is_empty_node(cols))
        if not head_ok:
            warn('Undefined ID in HEAD: %s' % cols[HEAD], 'Format')
        try:
            deps = deps_list(cols)
        except ValueError:
            warn("Failed to parse DEPS: '%s'" % cols[DEPS], 'Format')
            continue
        for head, deprel in deps:
            if head not in known:
                warn("Undefined ID in DEPS: '%s'" % head, 'Format')
def validate_root(tree):
    """
    Validates that DEPREL is "root" iff HEAD is 0.
    """
    for cols in tree:
        if not (is_word(cols) or is_empty_node(cols)):
            continue
        if HEAD >= len(cols):
            continue # this has been already reported in trees()
        head_is_zero = cols[HEAD] == '0'
        rel_is_root = cols[DEPREL] == 'root'
        if head_is_zero and not rel_is_root:
            warn("DEPREL must be 'root' if HEAD is 0", 'Syntax')
        elif rel_is_root and not head_is_zero:
            warn("DEPREL cannot be 'root' if HEAD is not 0", 'Syntax')
def validate_deps(tree):
    """
    Validates that DEPS is correctly formatted and that there are no
    self-loops in DEPS.
    """
    # Track the physical input line of each node so errors can point at it.
    node_line = sentence_line - 1
    for cols in tree:
        node_line += 1
        if not (is_word(cols) or is_empty_node(cols)):
            continue
        if DEPS >= len(cols):
            continue # this has been already reported in trees()
        try:
            deps = deps_list(cols)
            # float() accommodates empty-node heads such as '3.1'.
            heads = [float(h) for h, d in deps]
        except ValueError:
            warn("Failed to parse DEPS: '%s'" % cols[DEPS], 'Format', nodelineno=node_line)
            return
        if heads != sorted(heads):
            warn("DEPS not sorted by head index: '%s'" % cols[DEPS], 'Format', nodelineno=node_line)
        else:
            # Heads are sorted; now check that relations of the same head are
            # sorted too, and that there are no duplicate head:deprel pairs.
            lasth = None
            lastd = None
            for h, d in deps:
                if h == lasth:
                    if d < lastd:
                        warn("DEPS pointing to head '%s' not sorted by relation type: '%s'" % (h, cols[DEPS]), 'Format', nodelineno=node_line)
                    elif d == lastd:
                        warn("DEPS contain multiple instances of the same relation '%s:%s'" % (h, d), 'Format', nodelineno=node_line)
                lasth = h
                lastd = d
                # Like in the basic representation, head 0 implies relation root and vice versa.
                # Note that the enhanced graph may have multiple roots (coordination of predicates).
                ud = lspec2ud(d)
                if h == '0' and ud != 'root':
                    warn("Illegal relation '%s:%s' in DEPS: must be 'root' if head is 0" % (h, d), 'Format', nodelineno=node_line)
                if ud == 'root' and h != '0':
                    warn("Illegal relation '%s:%s' in DEPS: cannot be 'root' if head is not 0" % (h, d), 'Format', nodelineno=node_line)
        try:
            id_ = float(cols[ID])
        except ValueError:
            warn("Non-numeric ID: '%s'" % cols[ID], 'Format', nodelineno=node_line)
            return
        if id_ in heads:
            warn("Self-loop in DEPS for '%s'" % cols[ID], 'Format', nodelineno=node_line)
def build_tree(sentence):
    """
    Takes the list of non-comment lines (line = list of columns) describing
    a sentence. Returns a dictionary with items providing easier access to the
    tree structure. In case of fatal problems (missing HEAD etc.) returns None
    but does not report the error (presumably it has already been reported).
    tree ... dictionary:
      nodes ... array of word lines, i.e., lists of columns; mwt and empty nodes are skipped, indices equal to ids (nodes[0] is empty)
      children ... array of sets of children indices (numbers, not strings); indices to this array equal to ids (children[0] are the children of the root)
      linenos ... array of line numbers in the file, corresponding to nodes (needed in error messages)
    """
    global sentence_line # the line of the first token/word of the current tree (skipping comments!)
    node_line = sentence_line - 1
    children = {} # node id (string, as in HEAD) -> set of integer children ids
    tree = {
        'nodes': [['0', '_', '_', '_', '_', '_', '_', '_', '_', '_']], # add artificial node 0
        'children': [],
        'linenos': [sentence_line] # for node 0
    }
    for cols in sentence:
        node_line += 1
        if not is_word(cols):
            continue
        # Even MISC may be needed when checking the annotation guidelines
        # (for instance, SpaceAfter=No must not occur inside a goeswith span).
        if MISC >= len(cols):
            # This error has been reported on lower levels, do not report it here.
            # Do not continue to check annotation if there are elementary flaws.
            return None
        try:
            id_ = int(cols[ID])
        except ValueError:
            # This error has been reported on lower levels, do not report it here.
            # Do not continue to check annotation if there are elementary flaws.
            return None
        try:
            head = int(cols[HEAD])
        except ValueError:
            # This error has been reported on lower levels, do not report it here.
            # Do not continue to check annotation if there are elementary flaws.
            return None
        if head == id_:
            warn('HEAD == ID for %s' % cols[ID], 'Syntax', nodelineno=node_line)
            continue
        tree['nodes'].append(cols)
        tree['linenos'].append(node_line)
        # Incrementally build the set of children of every node.
        children.setdefault(cols[HEAD], set()).add(id_)
    for cols in tree['nodes']:
        tree['children'].append(sorted(children.get(cols[ID], [])))
    # Check that there is just one node with the root relation.
    if len(tree['children'][0]) > 1 and args.single_root:
        warn('Multiple root words: %s' % tree['children'][0], 'Syntax', lineno=False)
    # Return None if there are any cycles. Avoid surprises when working with the graph.
    # Presence of cycles is equivalent to presence of unreachable nodes.
    projection = set()
    get_projection(0, tree, projection)
    # Word ids run from 1 to len(nodes)-1 because nodes[0] is the artificial
    # root. The previous 'range(1, len(tree['nodes']) - 1)' was off by one
    # and never flagged the last word as unreachable.
    unreachable = set(range(1, len(tree['nodes']))) - projection
    if unreachable:
        warn('Non-tree structure. Words %s are not reachable from the root 0.'%(','.join(str(w) for w in sorted(unreachable))), 'Syntax', lineno=False)
        return None
    return tree
def get_projection(id, tree, projection):
    """
    Collect into the set `projection` the ids of all nodes in the subtree of
    node `id` (the node itself is not added). Works on the tree dictionary
    built by build_tree(). Returns the same set for convenience.
    """
    for kid in tree['children'][id]:
        # A kid already collected indicates a cycle; it is (or will be)
        # reported elsewhere, so just avoid infinite recursion here.
        if kid not in projection:
            projection.add(kid)
            get_projection(kid, tree, projection)
    return projection
def build_egraph(sentence):
    """
    Takes the list of non-comment lines (line = list of columns) describing
    a sentence. Returns a dictionary with items providing easier access to the
    enhanced graph structure. In case of fatal problems returns None
    but does not report the error (presumably it has already been reported).
    However, once the graph has been found and built, this function verifies
    that the graph is connected and generates an error if it is not.

    egraph ... dictionary indexed by node ids (strings that look like integers
        or decimal numbers; key '0' is the artificial root node); multiword
        token lines are skipped. Each value is a dictionary-record:
        cols ... array of column values from the input line corresponding to the node
        deps ... list of (head, deprel) pairs parsed from the DEPS column
        parents ... set of parent ids (strings)
        children ... set of children ids (strings)
        lineno ... line number in the file (needed in error messages)
    """
    global sentence_line # the line of the first token/word of the current tree (skipping comments!)
    node_line = sentence_line - 1
    egraph_exists = False # enhanced deps are optional
    rootnode = {
        'cols': ['0', '_', '_', '_', '_', '_', '_', '_', '_', '_'],
        'deps': [],
        'parents': set(),
        'children': set(),
        'lineno': sentence_line
    }
    egraph = {
        '0': rootnode
    } # structure described above
    nodeids = set()
    for cols in sentence:
        node_line += 1
        if is_multiword_token(cols):
            continue
        if MISC >= len(cols):
            # This error has been reported on lower levels, do not report it here.
            # Do not continue to check annotation if there are elementary flaws.
            return None
        try:
            deps = deps_list(cols)
            heads = [h for h, d in deps]
        except ValueError:
            # This error has been reported on lower levels, do not report it here.
            # Do not continue to check annotation if there are elementary flaws.
            return None
        if is_empty_node(cols):
            egraph_exists = True
        nodeids.add(cols[ID])
        # The graph may already contain a record for the current node if one of
        # the previous nodes is its child. If it doesn't, we will create it now.
        egraph.setdefault(cols[ID], {})
        egraph[cols[ID]]['cols'] = cols
        # Reuse the already parsed DEPS; no need to call deps_list() a second time.
        egraph[cols[ID]]['deps'] = deps
        egraph[cols[ID]]['parents'] = set(heads)
        egraph[cols[ID]].setdefault('children', set())
        egraph[cols[ID]]['lineno'] = node_line
        # Incrementally build the set of children of every node.
        for h in heads:
            egraph_exists = True
            egraph.setdefault(h, {})
            egraph[h].setdefault('children', set()).add(cols[ID])
    # We are currently testing the existence of enhanced graphs separately for each sentence.
    # It is thus possible to have one sentence with connected egraph and another without enhanced dependencies.
    if not egraph_exists:
        return None
    # Check that the graph is connected. The UD v2 guidelines do not license unconnected graphs.
    # Compute projection of every node. Beware of cycles.
    projection = set()
    get_graph_projection('0', egraph, projection)
    unreachable = nodeids - projection
    if unreachable:
        sur = sorted(unreachable)
        warn("Enhanced graph is not connected. Nodes %s are not reachable from any root" % sur, 'Syntax', lineno=False)
        return None
    return egraph
def get_graph_projection(id, graph, projection):
    """
    Collects into the set 'projection' the ids of all nodes reachable from the
    node 'id' via enhanced children links. The set is modified in place and
    also returned. Nodes already in the set are not expanded again, which
    makes the traversal safe in the presence of cycles.
    """
    for child in graph[id]['children']:
        if child in projection:
            continue # skip cycles (reported elsewhere)
        projection.add(child)
        get_graph_projection(child, graph, projection)
    return projection
#==============================================================================
# Level 3 tests. Annotation content vs. the guidelines (only universal tests).
#==============================================================================
def validate_upos_vs_deprel(id, tree):
    """
    For certain relations checks that the dependent word belongs to an expected
    part-of-speech category. Occasionally we may have to check the children of
    the node, too.

    id ... integer id of the node to check (index into tree['nodes'])
    tree ... dictionary with 'nodes', 'children' and 'linenos' arrays indexed by id
    Emits 'Syntax' warnings via warn(); returns nothing.
    """
    cols = tree['nodes'][id]
    # This is a level 3 test, we will check only the universal part of the relation.
    deprel = lspec2ud(cols[DEPREL])
    # Universal relations of this node's children; a 'fixed' child means the node
    # heads a multiword expression, which relaxes several of the checks below.
    childrels = set([lspec2ud(tree['nodes'][x][DEPREL]) for x in tree['children'][id]])
    # Certain relations are reserved for nominals and cannot be used for verbs.
    # Nevertheless, they can appear with adjectives or adpositions if they are promoted due to ellipsis.
    # Unfortunately, we cannot enforce this test because a word can be cited
    # rather than used, and then it can take a nominal function even if it is
    # a verb, as in this Upper Sorbian sentence where infinitives are appositions:
    # [hsb] Z werba danci "rejować" móže substantiw nastać danco "reja", adjektiw danca "rejowanski" a adwerb dance "rejowansce", ale tež z substantiwa martelo "hamor" móže nastać werb marteli "klepać z hamorom", adjektiw martela "hamorowy" a adwerb martele "z hamorom".
    #if re.match(r"^(nsubj|obj|iobj|obl|vocative|expl|dislocated|nmod|appos)", deprel) and re.match(r"^(VERB|AUX|ADV|SCONJ|CCONJ)", cols[UPOS]):
    #    warn("Node %s: '%s' should be a nominal but it is '%s'" % (cols[ID], deprel, cols[UPOS]), 'Syntax', lineno=False)
    # Determiner can alternate with a pronoun.
    if deprel == 'det' and not re.match(r"^(DET|PRON)", cols[UPOS]):
        warn("'det' should be 'DET' or 'PRON' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Nummod is for numerals only.
    if deprel == 'nummod' and not re.match(r"^(NUM)", cols[UPOS]):
        warn("'nummod' should be 'NUM' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Advmod is for adverbs, perhaps particles but not for prepositional phrases or clauses.
    if deprel == 'advmod' and not re.match(r"^(ADV|CCONJ|PART|SYM)", cols[UPOS]) and not 'fixed' in childrels:
        warn("'advmod' should be 'ADV' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Known expletives are pronouns. Determiners and particles are probably acceptable, too.
    if deprel == 'expl' and not re.match(r"^(PRON|DET|PART)$", cols[UPOS]):
        warn("'expl' should normally be 'PRON' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Auxiliary verb/particle must be AUX.
    if deprel == 'aux' and not re.match(r"^(AUX)", cols[UPOS]):
        warn("'aux' should be 'AUX' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Copula is an auxiliary verb/particle (AUX) or a pronoun (PRON|DET).
    if deprel == 'cop' and not re.match(r"^(AUX|PRON|DET|SYM)", cols[UPOS]):
        warn("'cop' should be 'AUX' or 'PRON'/'DET' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Case is normally an adposition, maybe particle.
    # However, there are also secondary adpositions and they may have the original POS tag:
    # NOUN: [cs] pomocí, prostřednictvím
    # VERB: [en] including
    # Interjection can also act as case marker for vocative, as in Sanskrit: भोः भगवन् / bhoḥ bhagavan / oh sir.
    # Note: this check (and the two below) lists the disallowed tags, not the allowed ones.
    if deprel == 'case' and re.match(r"^(PROPN|ADJ|PRON|DET|NUM|AUX)", cols[UPOS]) and not 'fixed' in childrels:
        warn("'case' should not be '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Mark is normally a conjunction or adposition, maybe particle but definitely not a pronoun.
    if deprel == 'mark' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|VERB|AUX|INTJ)", cols[UPOS]) and not 'fixed' in childrels:
        warn("'mark' should not be '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Cc is a conjunction, possibly an adverb or particle.
    if deprel == 'cc' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|VERB|AUX|INTJ)", cols[UPOS]) and not 'fixed' in childrels:
        warn("'cc' should not be '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    # Here the full (possibly language-specific) DEPREL is deliberately tested:
    # no subtype of punct may attach a non-PUNCT word.
    if cols[DEPREL] == 'punct' and cols[UPOS] != 'PUNCT':
        warn("DEPREL can be 'punct' only if UPOS is 'PUNCT' but it is '%s'" % (cols[UPOS]), 'Syntax', nodelineno=tree['linenos'][id])
    if cols[UPOS] == 'PUNCT' and not re.match(r"^(punct|root)", deprel):
        warn("if UPOS is 'PUNCT', DEPREL must be 'punct' but is '%s'" % (cols[DEPREL]), 'Syntax', nodelineno=tree['linenos'][id])
def validate_left_to_right_relations(id, tree):
    """
    Certain UD relations must always go left-to-right.
    Here we currently check the rule for the basic dependencies.
    The same should also be tested for the enhanced dependencies!
    """
    cols = tree['nodes'][id]
    # Multiword token ranges carry no relation; short lines were reported in trees().
    if is_multiword_token(cols) or DEPREL >= len(cols):
        return
    # According to the v2 guidelines, apposition should also be left-headed, although the definition of apposition may need to be improved.
    if not re.match(r"^(conj|fixed|flat|goeswith|appos)", cols[DEPREL]):
        return
    dependent = int(cols[ID])
    governor = int(cols[HEAD])
    # Left-to-right means the governor precedes the dependent in the sentence.
    if dependent < governor:
        warn("Violation of guidelines: relation '%s' must go left-to-right" % cols[DEPREL], 'Syntax', nodelineno=tree['linenos'][id])
def validate_single_subject(id, tree):
    """
    No predicate should have more than one subject.
    An xcomp dependent normally has no subject, but in some languages the
    requirement may be weaker: it could have an overt subject if it is
    correferential with a particular argument of the matrix verb. Hence we do
    not check zero subjects of xcomp dependents at present.
    """
    # Collect children whose universal relation contains "subj" (nsubj, csubj
    # and their subtypes), sorted so the warning output is deterministic.
    subjectlike = [c for c in tree['children'][id] if re.search(r"subj", lspec2ud(tree['nodes'][c][DEPREL]))]
    subjects = sorted(subjectlike)
    if len(subjects) > 1:
        warn("Violation of guidelines: node has more than one subject: %s" % str(subjects), 'Syntax', nodelineno=tree['linenos'][id])
def validate_functional_leaves(id, tree):
    """
    Most of the time, function-word nodes should be leaves. This function
    checks for known exceptions and warns in the other cases.
    """
    # This is a level 3 test, we will check only the universal part of the relation.
    deprel = lspec2ud(tree['nodes'][id][DEPREL])
    childrels = set([lspec2ud(tree['nodes'][x][DEPREL]) for x in tree['children'][id]])
    # Which child relations are tolerated under each function-word relation:
    # * case|mark|cc|aux|cop may have goeswith, fixed and conj children.
    # * Fixed expressions should not be nested, i.e., no chains of fixed
    #   relations; as they represent functional elements, they should not have
    #   other dependents either, with the possible exception of conj.
    #   ###!!! We also allow a punct child, at least temporarily, because of
    #   ###!!! fixed expressions that have a hyphen in the middle (e.g. Russian
    #   ###!!! "вперед-назад"). It would be better to keep these expressions as
    #   ###!!! one token, but sometimes the tokenizer is out of control of the
    #   ###!!! UD data providers and it is not practical to retokenize.
    # * Goeswith cannot have any children, not even another goeswith.
    # * Punctuation can exceptionally have other punct children if an
    #   exclamation mark is in brackets or quotes; nothing else is allowed.
    allowed = None
    if deprel in ('case', 'mark', 'cc', 'aux', 'cop'):
        allowed = set(['goeswith', 'fixed', 'conj'])
    elif deprel == 'fixed':
        allowed = set(['goeswith', 'conj', 'punct'])
    elif deprel == 'goeswith':
        allowed = set()
    elif deprel == 'punct':
        allowed = set(['punct'])
    if allowed is not None:
        disallowed_childrels = childrels - allowed
        if disallowed_childrels:
            warn("'%s' not expected to have children (%s)" % (deprel, disallowed_childrels), 'Syntax', nodelineno=tree['linenos'][id])
def collect_ancestors(id, tree, ancestors):
    """
    Usage: ancestors = collect_ancestors(nodeid, nodes, [])
    Walks the HEAD chain upward from the given node, appending each ancestor id
    to the list until the root (head 0) is reached. Returns the list.
    """
    current = int(id)
    while True:
        parent = int(tree['nodes'][current][HEAD])
        if parent == 0:
            return ancestors
        if parent in ancestors:
            # Cycle has been reported on level 2. But we must jump out of it now.
            return ancestors
        ancestors.append(parent)
        current = parent
def get_caused_nonprojectivities(id, tree):
    """
    Checks whether a node is in a gap of a nonprojective edge. Report true only
    if the node's parent is not in the same gap. (We use this function to check
    that a punctuation node does not cause nonprojectivity. But if it has been
    dragged to the gap with a larger subtree, then we do not blame it.)

    tree ... dictionary:
      nodes ... array of word lines, i.e., lists of columns; mwt and empty nodes are skipped, indices equal to ids (nodes[0] is empty)
      children ... array of sets of children indices (numbers, not strings); indices to this array equal to ids (children[0] are the children of the root)
      linenos ... array of line numbers in the file, corresponding to nodes (needed in error messages)

    Returns the sorted list of ids of the nodes whose parent edges cross this
    node; empty if the node causes no nonprojectivity.
    """
    iid = int(id) # just to be sure
    # We need to find all nodes that are not ancestors of this node and lie
    # on other side of this node than their parent. First get the set of
    # ancestors.
    ancestors = collect_ancestors(iid, tree, [])
    maxid = len(tree['nodes']) - 1
    # Get the lists of nodes to either side of id.
    # Do not look beyond the parent (if it is in the same gap, it is the parent's responsibility).
    pid = int(tree['nodes'][iid][HEAD])
    # range() is half-open on the right, so the bounds below cover the nodes
    # strictly between the parent and id (left) and from id+1 up to and
    # including maxid / up to but excluding the parent (right).
    # (The previous bounds dropped iid-1, maxid and included pid — off by one.)
    if pid < iid:
        left = range(pid + 1, iid)
        right = range(iid + 1, maxid + 1)
    else:
        left = range(1, iid)
        right = range(iid + 1, pid)
    # Exclude ancestors of id from the ranges.
    sancestors = set(ancestors)
    leftna = set(left) - sancestors
    rightna = set(right) - sancestors
    # A node in the gap "crosses" id if its parent lies on the other side of id.
    leftcross = [x for x in leftna if int(tree['nodes'][x][HEAD]) > iid]
    rightcross = [x for x in rightna if int(tree['nodes'][x][HEAD]) < iid]
    # Once again, exclude nonprojectivities that are caused by ancestors of id.
    if pid < iid:
        rightcross = [x for x in rightcross if int(tree['nodes'][x][HEAD]) > pid]
    else:
        leftcross = [x for x in leftcross if int(tree['nodes'][x][HEAD]) < pid]
    # Do not return just a boolean value. Return the nonprojective nodes so we can report them.
    return sorted(leftcross + rightcross)
def get_gap(id, tree):
    """
    Returns the set of ids of the nodes that lie between the given node and
    its parent but are not in the parent's projection, i.e., the gap of a
    nonprojective edge. Empty set if the edge is projective.
    """
    iid = int(id) # just to be sure
    pid = int(tree['nodes'][iid][HEAD])
    # range() is half-open on the right, so these bounds yield exactly the ids
    # strictly between the node and its parent. (The previous upper bounds
    # pid-1 / iid-1 wrongly excluded the node adjacent to the right endpoint.)
    if iid < pid:
        rangebetween = range(iid + 1, pid)
    else:
        rangebetween = range(pid + 1, iid)
    gap = set()
    if rangebetween:
        projection = set()
        get_projection(pid, tree, projection)
        gap = set(rangebetween) - projection
    return gap
def validate_goeswith_span(id, tree):
"""
The relation 'goeswith' is used to connect word parts that are separated
by whitespace and should be one word instead. We assume that the relation
goes left-to-right, which is checked elsewhere. Here we check that the
nodes really were separated by whitespace. If there is another node in the
middle, it must be also attached via 'goeswith'. The parameter id refers to
the node whose goeswith children we test.