Skip to content

Commit

Permalink
Merge pull request #195 from mailgun/brendan/unicode-in-custom-grammer
Browse files Browse the repository at this point in the history
Allow Unicode characters in custom grammar checks
  • Loading branch information
b0d0nne11 authored May 16, 2018
2 parents 3409e48 + b4323ce commit b8802fb
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 190 deletions.
84 changes: 0 additions & 84 deletions flanker/addresslib/plugins/_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,71 +8,8 @@
"""

import re

import six

# Single-character punctuation tokens.
LBRACKET = '<'
AT_SYMBOL = '@'
RBRACKET = '>'
DQUOTE = '"'

# Matches a hyphen at the start or at the end of a line (re.MULTILINE makes
# ^ and $ anchor at line boundaries as well as at the string boundaries).
BAD_DOMAIN = re.compile(r''' # start or end
^-|-$ # with -
''', re.MULTILINE | re.VERBOSE)

# A comma or semicolon followed by any run of commas, semicolons and
# whitespace.
DELIMITER = re.compile(r'''
[,;][,;\s]* # delimiter
''', re.MULTILINE | re.VERBOSE)

# One or more spaces or tabs.
WHITESPACE = re.compile(r'''
(\ |\t)+ # whitespace
''', re.MULTILINE | re.VERBOSE)

# Zero or more characters from a set of Unicode space code points (U+0020,
# U+00A0, U+1680, U+180E, U+2000-U+200A, U+2028, U+202F, U+205F, U+3000).
# NOTE: re.VERBOSE does not strip whitespace inside a character class, so the
# line breaks between the brackets also belong to the set. Because of the
# trailing `*` this pattern can match the empty string.
UNI_WHITE = re.compile(u'''
[
\u0020\u00a0\u1680\u180e
\u2000-\u200a
\u2028\u202f\u205f\u3000
]*
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

# One or more characters that are not whitespace, '<', '>', ';', ',' or '"';
# the run is captured as group 1.
RELAX_ATOM = re.compile(r'''
([^\s<>;,"]+)
''', re.MULTILINE | re.VERBOSE)

# One or more characters from the ASCII set labelled "atext" in the pattern:
# letters, digits and !#$%&'*+-/=?^_`{|}~
ATOM = re.compile(r'''
[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext
''', re.MULTILINE | re.VERBOSE)

# An atext run followed by zero or more ".atext" groups, so a dot may only
# appear between two non-empty atext runs.
DOT_ATOM = re.compile(r'''
[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+ # atext
(\.[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~]+)* # (dot atext)*
''', re.MULTILINE | re.VERBOSE)

# Same pattern text as RELAX_ATOM, compiled with re.UNICODE in addition.
UNI_ATOM = re.compile(r'''
([^\s<>;,"]+)
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

# A double-quoted string holding at least one non-quote character; the text
# between the quotes is available as the named group "qstr".
UNI_QSTR = re.compile(r'''
"
(?P<qstr>([^"]+))
"
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

# A double-quoted string whose content is any number of items, each being
# optional whitespace followed by either one character from the qtext class
# or a backslash-escaped character (\x21-\x7e, tab or space); optional
# whitespace may also precede the closing quote.
QSTRING = re.compile(r'''
" # dquote
(\s* # whitespace
([\x21\x23-\x5b\x5d-\x7e] # qtext
| # or
\\[\x21-\x7e\t\ ]))* # quoted-pair
\s* # whitespace
" # dquote
''', re.MULTILINE | re.VERBOSE)

# "http://" or "https://" followed by one or more characters that are not
# whitespace and not one of < > { } | ^ ~ [ ] ` ; ,
URL = re.compile(r'''
(?:http|https)://
[^\s<>{}|\^~\[\]`;,]+
''', re.MULTILINE | re.VERBOSE | re.UNICODE)

class TokenStream(object):
"""
Expand Down Expand Up @@ -122,27 +59,6 @@ def end_of_stream(self):
return True
return False

def synchronize(self):
    """
    Skip ahead to the next delimiter token, or to the end of the stream
    when no delimiter remains. Used primarily in relaxed mode parsing.

    Returns the text that was skipped over, or None when that text is
    empty or consists only of whitespace.
    """
    origin = self.position

    found = DELIMITER.search(self.stream, origin)
    # Stop right at the delimiter (it is not consumed); without one,
    # run to the end of the stream.
    target = found.start() if found else len(self.stream)
    self.position = target

    skipped = self.stream[origin:target]
    return skipped if skipped.strip() != '' else None

def peek(self, token=None):
"""
Peek at the stream to see what the next token is or peek for a
Expand Down
32 changes: 18 additions & 14 deletions flanker/addresslib/plugins/aol.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,30 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

ALPHA = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

NUMERIC = re.compile(r'''
[0-9]+
''', re.MULTILINE | re.VERBOSE)
( [0-9]
)+
''',
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)

DOT = re.compile(r'''
\.
''', re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
\_
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

DOT = '.'
UNDERSCORE = '_'

AOL_UNMANAGED = ['verizon.net']

Expand Down
24 changes: 10 additions & 14 deletions flanker/addresslib/plugins/gmail.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,19 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib.plugins._tokenizer import ATOM
from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR


GMAIL_BASE = re.compile(r'''
[A-Za-z0-9\.]+
''', re.MULTILINE | re.VERBOSE)
ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
[\+]
''', re.MULTILINE | re.VERBOSE)
DOT = re.compile(r'''
[\.]
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

PLUS = '+'
DOT = '.'


def validate(email_addr):
Expand Down
54 changes: 26 additions & 28 deletions flanker/addresslib/plugins/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,36 +29,34 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib.plugins._tokenizer import ATOM
from flanker.addresslib._parser.lexer import t_ATOM, _UNICODE_CHAR

ATOM = re.compile(t_ATOM, re.MULTILINE | re.VERBOSE)

GOOGLE_BASE = re.compile(r'''
[A-Za-z0-9_\-'\.]+
''', re.MULTILINE | re.VERBOSE)
GOOGLE_BASE = re.compile(r'''
( [A-Za-z0-9_\-'\.]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
[_]+
''', re.MULTILINE | re.VERBOSE)
APOSTROPHE = re.compile(r'''
\'
''',
re.MULTILINE | re.VERBOSE)

APOSTROPHES = re.compile(r'''
[']+
''', re.MULTILINE | re.VERBOSE)
UNDERSCORE = re.compile(r'\_', re.MULTILINE | re.VERBOSE)
DASH = re.compile(r'\-', re.MULTILINE | re.VERBOSE)

DASH = re.compile(r'''
[-]+
''', re.MULTILINE | re.VERBOSE)

DOTS = re.compile(r'''
[.]+
''', re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
[\+]+
''', re.MULTILINE | re.VERBOSE)
DOTS = '.'
PLUS = '+'


def validate(email_addr):
Expand All @@ -80,21 +78,21 @@ def validate(email_addr):
# if only one character, must be alphanum, underscore (_), or apostrophe (')
if len(localpart) == 1 or l == 1:
if ALPHANUM.match(localpart) or UNDERSCORE.match(localpart) or \
APOSTROPHES.match(localpart):
APOSTROPHE.match(localpart):
return True
return False

# must start with: alphanum, underscore (_), dash (-), or apostrophes(')
# must start with: alphanum, underscore (_), dash (-), or apostrophe(')
if len(real_localpart) > 0:
if not ALPHANUM.match(real_localpart[0]) and not UNDERSCORE.match(real_localpart[0]) \
and not DASH.match(real_localpart[0]) and not APOSTROPHES.match(real_localpart[0]):
and not DASH.match(real_localpart[0]) and not APOSTROPHE.match(real_localpart[0]):
return False
else:
return False

# must end with: alphanum, underscore(_), dash(-), or apostrophes(')
# must end with: alphanum, underscore(_), dash(-), or apostrophe(')
if not ALPHANUM.match(real_localpart[-1]) and not UNDERSCORE.match(real_localpart[-1]) \
and not DASH.match(real_localpart[-1]) and not APOSTROPHES.match(real_localpart[-1]):
and not DASH.match(real_localpart[-1]) and not APOSTROPHE.match(real_localpart[-1]):
return False

# grammar check
Expand Down
34 changes: 17 additions & 17 deletions flanker/addresslib/plugins/hotmail.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,26 +31,30 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

HOTMAIL_PREFIX = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

HOTMAIL_BASE = re.compile(r'''
[A-Za-z0-9\.\-\_]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\.\-\_]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

HOTMAIL_SUFFIX = re.compile(r'''
[A-Za-z0-9\-\_]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\-\_]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

PLUS = re.compile(r'''
\+
''', re.MULTILINE | re.VERBOSE)

PERIODS = re.compile(r'''
\.{2,}
''', re.MULTILINE | re.VERBOSE)
PLUS = '+'


def validate(email_addr):
Expand Down Expand Up @@ -82,10 +86,6 @@ def validate(email_addr):
if localpart.count('+') > 1:
return False

# no consecutive periods (..)
if PERIODS.search(localpart):
return False

# grammar check
retval = _validate(real_localpart)
return retval
Expand Down
39 changes: 20 additions & 19 deletions flanker/addresslib/plugins/icloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,32 @@
'''
import re
from flanker.addresslib.plugins._tokenizer import TokenStream
from flanker.addresslib._parser.lexer import _UNICODE_CHAR

ALPHA = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)
ALPHA = re.compile(r'''
( [A-Za-z]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

ALPHANUM = re.compile(r'''
[A-Za-z0-9]+
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)


ICLOUD_PREFIX = re.compile(r'''
[A-Za-z]+
''', re.MULTILINE | re.VERBOSE)

ICLOUD_BASE = re.compile(r'''
[A-Za-z0-9\+]+
''', re.MULTILINE | re.VERBOSE)

DOT = re.compile(r'''
\.
''', re.MULTILINE | re.VERBOSE)
( [A-Za-z0-9\+]
| {unicode_char}
)+
'''.format(unicode_char=_UNICODE_CHAR),
re.MULTILINE | re.VERBOSE)

UNDERSCORE = re.compile(r'''
\_
''', re.MULTILINE | re.VERBOSE)
DOT = '.'
UNDERSCORE = '_'


def validate(email_addr):
Expand Down Expand Up @@ -97,7 +98,7 @@ def _validate(localpart):
stream = TokenStream(localpart)

# localpart must start with alpha
alpa = stream.get_token(ICLOUD_PREFIX)
alpa = stream.get_token(ALPHA)
if alpa is None:
return False

Expand Down
Loading

0 comments on commit b8802fb

Please sign in to comment.