forked from decal/werdlists
-
Notifications
You must be signed in to change notification settings - Fork 4
/
non-split-patterns.js
111 lines (103 loc) · 1.47 KB
/
non-split-patterns.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
//These are patterns for text that might look like a sentence split but
//must not be treated as one.
//
// Valentin Tablan, 24 Aug 2007
//
//
// Lines starting with // are comments; empty lines are ignored
//The Java RegEx matching machine is eager to return the first match.
//Because of this, the explicit abbreviations need to appear before the
//generic patterns so that, for example, "a.m." is matched in preference to
//"a.m" (which would match under the Internet address rule).
//known abbreviations
\b\.net
\b\.NET
\b\.Net
\bAG\.
\bA\.M\.
\bAPR\.
\bAUG\.
\bAdm\.
\bBrig\.
\bCO\.
\bCORP\.
\bCapt\.
\bCmdr\.
\bCo\.
\bCol\.
\bComdr\.
\bDEC\.
\bDR\.
\bDr\.
\bFEB\.
\bFig\.
\bFRI\.
\bGMBH\.
\bGen\.
\bGov\.
\bINC\.
\bJAN\.
\bJUL\.
\bJUN\.
\bLTD\.
\bLt\.
\bLtd\.
\bMAR\.
\bMON\.
\bMP\.
\bMaj\.
\bMr\.
\bMrs\.
\bMs\.
\bNA\.
\bNOV\.
\bNV\.
\bOCT\.
\bOy\.
\bPLC\.
\bP\.M\.
\bProf\.
\bRep\.
\bSA\.
\bSAT\.
\bSEP\.
\bSIR\.
\bSR\.
\bSUN\.
\bSen\.
\bSgt\.
\bSpA\.
\bSt\.
\bTHU\.
\bTHUR\.
\bTUE\.
\bVP\.
\bWED\.
\ba\.m\.
\bad\.
\bal\.
\bed\.
\beds\.
\beg\.
\be\.g\.
\bet\.
\betc\.(?!\s+\p{Upper})
\bfig\.
\bie\.
\bi\.e\.
\bp\.
\bp\.m\.
\busu\.
\bvs\.
\byr\.
\byrs\.
//four or more dots are ignored
\.{4,}
//runs of five or more '?' or '!' characters are ignored
(?:!|\?){5,}
//a sequence of single upper case letters, each followed by a dot (e.g. U.S.A.)
\b(?:\p{javaUpperCase}\.)+
//numbers with decimal part or IP addresses, or Internet addresses
\p{Alnum}+(?:\.\p{Alnum}+)+
//Java dotted names or Internet addresses
\p{Alpha}+(?:\.\p{Alpha}+)+