-
Notifications
You must be signed in to change notification settings - Fork 67
/
normalize-punctuation.perl
90 lines (82 loc) · 1.87 KB
/
normalize-punctuation.perl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Parse the command-line arguments.
#
# Recognised arguments:
#   -b        do not buffer output (sets $| so every printed line is flushed)
#   -l LANG   language code used to pick the quotation/number conventions
#   LANG      any argument not starting with '-' is also taken as the language
#   -penn     set the Penn flag (skips the backtick / '' rewriting later on)
#
# Takes the argument list and returns ($language, $penn); the defaults are
# ("en", 0).  Dies when -l is given without a value, because an undefined
# language would otherwise trigger an "uninitialized value" warning on every
# input line.  Unrecognised dash options are still ignored, but no longer
# silently: a warning is printed to STDERR.
sub parse_options {
    my @args = @_;
    my ($lang, $penn) = ("en", 0);

    while (@args) {
        my $arg = shift @args;
        if ($arg =~ /^-b$/) {
            $| = 1;    # flush each line
        }
        elsif ($arg =~ /^-l$/) {
            die "$0: option -l requires a language code\n" unless @args;
            $lang = shift @args;
        }
        elsif ($arg =~ /^[^\-]/) {
            $lang = $arg;
        }
        elsif ($arg =~ /^-penn$/) {
            $penn = 1;
        }
        else {
            warn "$0: ignoring unrecognised option '$arg'\n";
        }
    }
    return ($lang, $penn);
}

# splice drains @ARGV, as the original shift-based loop did.
my ($language, $PENN) = parse_options(splice @ARGV);
# Normalize punctuation and spacing in a single line of text.
#
# Parameters:
#   $text  the input line; it is treated as raw bytes, so the non-ASCII
#          literals below match the UTF-8 encoding of the input byte-wise
#          (the script does not enable `use utf8` or decode STDIN)
#   $lang  language code; selects the quotation-mark/comma ordering and the
#          separator inserted between digit groups
#   $penn  when 0, backticks and '' are rewritten to ' and "; otherwise
#          those two rewrites are skipped
#
# Returns the normalized line.
#
# The no-break space (U+00A0) is written as the explicit byte sequence
# \xC2\xA0 rather than as an invisible literal: a literal NBSP is
# indistinguishable from an ordinary space in most editors and is easily
# flattened to one, which turns the "pseudo-space" rules into no-ops and
# makes the digit rule merge any two space-separated numbers.
sub normalize_punctuation {
    my ($text, $lang, $penn) = @_;

    for ($text) {    # alias $_ to $text so the substitutions stay terse
        s/\r//g;

        # remove extra spaces
        s/\(/ \(/g;
        s/\)/\) /g;
        s/ +/ /g;
        s/\) ([\.\!\:\?\;\,])/\)$1/g;
        s/\( /\(/g;
        s/ \)/\)/g;
        s/(\d) \%/$1\%/g;
        s/ :/:/g;
        s/ ;/;/g;

        # normalize unicode punctuation
        if ($penn == 0) {
            s/\`/\'/g;
            s/\'\'/ \" /g;
        }
        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;
        s/´/\'/g;
        # curly quotes between two letters are apostrophes
        s/([a-z])‘([a-z])/$1\'$2/gi;
        s/([a-z])’([a-z])/$1\'$2/gi;
        s/‘/\'/g;
        s/‚/\'/g;
        # NOTE(review): a remaining right single quote becomes a DOUBLE quote
        # while the left one above becomes a single quote; kept as-is because
        # existing output may depend on it -- confirm whether intended.
        s/’/\"/g;
        s/''/\"/g;
        s/´´/\"/g;
        s/…/.../g;

        # French quotes
        s/ « / \"/g;
        s/« /\"/g;
        s/«/\"/g;
        s/ » /\" /g;
        s/ »/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces (no-break space, UTF-8 bytes C2 A0)
        s/\xC2\xA0\%/\%/g;
        s/nº\xC2\xA0/nº /g;
        s/\xC2\xA0:/:/g;
        s/\xC2\xA0ºC/ ºC/g;
        s/\xC2\xA0cm/ cm/g;
        s/\xC2\xA0\?/\?/g;
        s/\xC2\xA0\!/\!/g;
        s/\xC2\xA0;/;/g;
        s/,\xC2\xA0/, /g;
        s/ +/ /g;

        if ($lang eq "en") {
            # English "quotation," followed by comma, style
            s/\"([,\.]+)/$1\"/g;
        }
        elsif ($lang eq "cs" || $lang eq "cz") {
            # Czech is confused: deliberately left untouched
        }
        else {
            # German/Spanish/French "quotation", followed by comma, style
            s/,\"/\",/g;
            s/(\.+)\"(\s*[^<])/\"$1$2/g;    # don't fix period at end of sentence
        }

        # Digits separated by a no-break space are joined with a comma for
        # the languages listed here and with a period for all others.
        if (   $lang eq "de"
            || $lang eq "es"
            || $lang eq "cz"
            || $lang eq "cs"
            || $lang eq "fr")
        {
            s/(\d)\xC2\xA0(\d)/$1,$2/g;
        }
        else {
            s/(\d)\xC2\xA0(\d)/$1.$2/g;
        }
    }
    return $text;
}

# Filter STDIN to STDOUT, one line at a time.
while (my $line = <STDIN>) {
    print normalize_punctuation($line, $language, $PENN);
}