diff --git a/.gitignore b/.gitignore index a6834947..e718e4e0 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,9 @@ nosetests.xml # PLY parser.out + +# OSX +.DS_Store + +# Goland +.idea/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b1b2af7..77294e96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [0.9.9] - 2019-09-25 +### Changed +- Replace the leading '.' in a quoted-printable encoded mime part to avoid + an obscure SMTP bug + ## [0.9.0] - 2018-05-16 ### Changed - Support for Python 3 was added with preserving the Python 2 behavior in mind. diff --git a/README.rst b/README.rst index 44ebd093..9a36a57b 100644 --- a/README.rst +++ b/README.rst @@ -89,7 +89,7 @@ To parse an address list: >>> from flanker.addresslib import address >>> - >>> address.parse_list('foo@example.com, bar@example.com, @example.com') + >>> address.parse_list(['foo@example.com, bar@example.com, @example.com']) [foo@example.com, bar@example.com] To parse an address list as well as return a tuple containing the parsed @@ -99,7 +99,7 @@ addresses and the unparsable portions >>> from flanker.addresslib import address >>> - >>> address.parse_list('foo@example.com, bar@example.com, @example.com', as_tuple=True) + >>> address.parse_list(['foo@example.com, bar@example.com, @example.com'], as_tuple=True) [foo@example.com, bar@example.com], ['@example.com'] To parse an address list in strict mode: @@ -108,7 +108,7 @@ To parse an address list in strict mode: >>> from flanker.addresslib import address >>> - >>> address.parse_list('foo@example.com, bar@example.com, @example.com', strict=True) + >>> address.parse_list(['foo@example.com, bar@example.com, @example.com'], strict=True) [foo@example.com, bar@example.com] To validate an email address (parse as 
well as DNS, MX existence, and ESP grammar checks): @@ -126,7 +126,7 @@ To validate an address list: >>> from flanker.addresslib import address >>> - >>> address.validate_list('foo@mailgun.com, bar@mailgun.com, @mailgun.com', as_tuple=True) + >>> address.validate_list(['foo@mailgun.com, bar@mailgun.com, @mailgun.com'], as_tuple=True) ([foo@mailgun.com, bar@mailgun.com], ['@mailgun.com']) MIME Parsing diff --git a/flanker/addresslib/_parser/parser.py b/flanker/addresslib/_parser/parser.py index d1a6593e..b503959e 100644 --- a/flanker/addresslib/_parser/parser.py +++ b/flanker/addresslib/_parser/parser.py @@ -158,27 +158,32 @@ def p_error(p): log.debug('building mailbox parser') mailbox_parser = yacc.yacc(start='mailbox', errorlog=log, - tabmodule='mailbox_parsetab') + tabmodule='mailbox_parsetab', + debug=False) log.debug('building addr_spec parser') addr_spec_parser = yacc.yacc(start='addr_spec', errorlog=log, - tabmodule='addr_spec_parsetab') + tabmodule='addr_spec_parsetab', + debug=False) log.debug('building url parser') url_parser = yacc.yacc(start='url', errorlog=log, - tabmodule='url_parsetab') + tabmodule='url_parsetab', + debug=False) log.debug('building mailbox_or_url parser') mailbox_or_url_parser = yacc.yacc(start='mailbox_or_url', errorlog=log, - tabmodule='mailbox_or_url_parsetab') + tabmodule='mailbox_or_url_parsetab', + debug=False) log.debug('building mailbox_or_url_list parser') mailbox_or_url_list_parser = yacc.yacc(start='mailbox_or_url_list', errorlog=log, - tabmodule='mailbox_or_url_list_parsetab') + tabmodule='mailbox_or_url_list_parsetab', + debug=False) # Interactive prompt for easy debugging diff --git a/flanker/mime/message/part.py b/flanker/mime/message/part.py index 3a795ebf..c69b1da3 100644 --- a/flanker/mime/message/part.py +++ b/flanker/mime/message/part.py @@ -632,6 +632,7 @@ def _encode_transfer_encoding(encoding, body): if six.PY3: if encoding == 'quoted-printable': body = quopri.encodestring(body, quotetabs=False) + body = 
fix_leading_dot(body) return body.decode('utf-8') if encoding == 'base64': @@ -647,13 +648,120 @@ def _encode_transfer_encoding(encoding, body): return body if encoding == 'quoted-printable': - return quopri.encodestring(body, quotetabs=False) + body = quopri.encodestring(body, quotetabs=False) + return fix_leading_dot(body) elif encoding == 'base64': return _email.encode_base64(body) else: return body +def fix_leading_dot(s): + """ + From SMTP RFC: https://tools.ietf.org/html/rfc5321#section-4.5.2 + + ----- + When a line of mail text is received by the SMTP server, it checks + the line. If the line is composed of a single period, it is + treated as the end of mail indicator. If the first character is a + period and there are other characters on the line, the first + character is deleted. + ----- + + We have observed some remote SMTP servers have an intermittent obscure bug + where the leading '.' is removed according to the above spec. Even when the '.' + is obviously within the bounds of a mime part, and with our sending SMTP + clients dot stuffing the line. To combat this we convert any leading '.' + to a '=2E'. + + ``s`` is the quoted-printable encoded body as bytes. Returns the same body, + as bytes, with each line-leading '.' written as '=2E'; a line that grows + past the length limit as a result is split with a soft line break. + """ + infp = six.BytesIO(s) + outfp = six.BytesIO() + + # TODO(thrawn01): We could scan the entire string looking for leading '.' + # If none found return the original string. This would save memory at the + # expense of some additional processing + + dot = b"." + if six.PY3: + dot = ord('.') + + while 1: + line = infp.readline() + if not line: + break + + if line[0] == dot: + line = _quote_and_cut(line) + + outfp.write(line) + + return outfp.getvalue() + + +def _quote_and_cut(ln): + """ + Quotes the leading '.', if the resulting line is longer than 76 characters + cut the line in half without dividing any quoted characters and + conforming to the quoted-printable RFC in regards to ending characters. + + ``ln`` is one encoded line, as bytes, including its trailing newline when + it has one. Returns the rewritten line or lines as bytes.
+ """ + ln = quopri.quote(ln[0:1]) + ln[1:] + + # The 76 character limit does not count the line terminator, and the last + # line of a body may have no terminator at all, so measure the content + # instead of assuming one trailing '\n'. + if len(ln.rstrip(b'\n')) <= 76: + return ln + + # Find a suitable cut point that doesn't divide a quoted character + in_quote, pos = 0, -1 + for pos, c in enumerate(ln): + + # Skip quoted (=XX) characters + if in_quote != 0: + in_quote += 1 + if in_quote <= 3: + continue + in_quote = 0 + + # If we are past the half way mark, make our cut here + if pos > len(ln)/2: + break + + if six.PY3: + c = bytes((c,)) + + # Should be a quoted character + if c == b'=': + # Peek ahead, does the next char appear to be a hex value? + if quopri.ishex(ln[pos+1:pos+2]): + in_quote = 1 + continue + + new_line = ln[:pos] + next_line = ln[pos:] + + # If new line ends with a :space or :tab + if new_line[-1:] in b' \t': + new_line = new_line[:-1] + quopri.quote(new_line[-1:]) + + dot = b'.' + if six.PY3: + dot = ord('.') + + # If the next line starts with a '.' + if next_line[0] == dot: + next_line = quopri.quote(next_line[0:1]) + next_line[1:] + + return new_line + b"=\n" + next_line + + def _choose_text_encoding(charset, preferred_encoding, body): if charset in ('ascii', 'iso-8859-1', 'us-ascii'): if has_long_lines(body): diff --git a/setup.py b/setup.py index 1d885770..fdc27a4e 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ ], setup(name='flanker', - version='0.9.8', + version='0.10.0', description='Mailgun Parsing Tools', long_description=open('README.rst').read(), classifiers=[ diff --git a/tests/mime/message/headers/part_test.py b/tests/mime/message/headers/part_test.py new file mode 100644 index 00000000..c375e373 --- /dev/null +++ b/tests/mime/message/headers/part_test.py @@ -0,0 +1,154 @@ +# coding:utf-8 + +import flanker.mime.message.part as part +from nose.tools import eq_ + +STRINGS = ( + # Some normal strings + (b'', ''), + (b'hello', 'hello'), + (b'''hello + there + world''', '''hello + there + world'''), + (b'''hello + there + world +''', '''hello + there + world +'''), 
(b'\201\202\203', '=81=82=83'), + # Add some trailing MUST QUOTE strings + (b'hello ', 'hello=20'), + (b'hello\t', 'hello=09'), + + # Some long lines. First, a single line of 108 characters + (b'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xd8\xd9\xda\xdb\xdc\xdd\xde\xdfxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', + '''xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx=D8=D9=DA=DB=DC=DD=DE=DFx= +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'''), + + # A line of exactly 76 characters, no soft line break should be needed + (b'yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy', + 'yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy'), + + # A line of 77 characters, forcing a soft line break at position 75, + # and a second line of exactly 2 characters (because the soft line + # break `=' sign counts against the line length limit). + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', + '''zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz= +zz'''), + + # A line of 151 characters, forcing a soft line break at position 75, + # with a second line of exactly 76 characters and no trailing = + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', + '''zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz= +zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'''), + + # A string containing a hard line break, but which the first line is + # 151 characters and the second line is exactly 76 characters. This + # should leave us with three lines, the first which has a soft line + # break, and which the second and third do not. 
+ (b'''yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz''', + '''yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy= +yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy +zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'''), + + # Lines that end with space or tab should be quoted + (b'yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy ', + '''yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy= +=20'''), + + # Lines that end with a partial quoted character + (b'yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy=y', + '''yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy= +=3Dy'''), + + # Lines that lead with a dot '.' should have the dot quoted + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.z', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Ez'), + + # Lines that end with a dot '.' are not quoted + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.zz', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.=\n' + + 'zz'), + + # Lines that lead with a dot '.' should have the dot quoted and cut + # if the quoted line is longer than 76 characters. + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Ezzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\nzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + 'zz'), + + # Respect quoted characters when considering leading '.' 
+ (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f\x7f', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2E=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=7F=\n' + + '=7F=7F=7F'), + + # Should cut somewhere near the middle of the line + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat, quick hot dog, quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + '=2Equick brown fox, quick brown cat, qui=\n' + + 'ck hot dog, quick read dog, quick whi=\n' + + 'te bird'), + + # Respect quoted character when considering where to cut + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat\x7f\x7f\x7f\x7f\x7f, quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Equick brown fox, quick brown cat=7F=7F=\n' + + '=7F=7F=7F, quick read dog, quick whi=\n' + + 'te bird'), + + # Avoid considering non quoted characters when cutting + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat=20=================, quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Equick brown fox, quick brown cat=3D20=\n' + + '=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=\n' + + '=3D=3D=3D=3D=3D, quick read dog, quick white bird'), + + # Should quote leading '.' if the cut results in a '.' on the next line + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat..................... 
quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Equick brown fox, quick brown cat.....=\n' + + '=2E............... quick read dog, quic=\n' + + 'k white bird'), + + # Should quote :space if the cut results in a :space at the end of the next line + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Equick brown fox, quick brown cat =20=\n' + + ' quick read dog, quic=\n' + + 'k white bird'), + # Should quote :tab if the cut results in a :tab at the end of the next line + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.quick brown fox, quick brown cat \t quick read dog, quick white bird', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Equick brown fox, quick brown cat =09=\n' + + ' quick read dog, quic=\n' + + 'k white bird'), + # Should avoid cutting in the middle of multiple quoted characters near the cut point + (b'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + + b'.foo. \xF0\x9F\x99\x82 also there is \xF0\x9F\x99\x82 more in \xF0\x9F\x99\x82 ' + + b'this \xF0\x9F\x99\x82 message', + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz=\n' + + '=2Efoo. =F0=9F=99=82 also there is =F0=9F=\n' + + '=99=82 more in =F0=9F=99=82 this =F0=\n' + '=9F=99=82 message'), +) + + +def test_encode(): + for p, e in STRINGS: + enc = part._encode_transfer_encoding('quoted-printable', p) + eq_(enc, e) +