Skip to content

Commit

Permalink
Use @charset to decode CSS files if available
Browse files: browse the repository at this point in the history
  • Loading branch information
Prinzhorn committed Aug 6, 2020
1 parent 2f33d6c commit 2fe8f3a
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Unreleased: mitmproxy next
** Full Changelog **

* Add MsgPack content viewer (@tasn)
* Use `@charset` to decode CSS files if available (@prinzhorn)

* --- TODO: add new PRs above this line ---

Expand Down
6 changes: 6 additions & 0 deletions mitmproxy/net/http/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ def _guess_encoding(self, content: bytes = b"") -> str:
meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
if not enc:
if "text/css" in self.headers.get("content-type", ""):
# @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")
if not enc:
enc = "latin-1"
# Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
Expand Down
21 changes: 21 additions & 0 deletions test/mitmproxy/net/http/test_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,27 @@ def test_guess_meta_charset(self):
# "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
assert u"鏄庝集" in r.text

def test_guess_css_charset(self):
    """Exercise @charset-based decoding of response text for CSS bodies.

    Three cases are asserted: an @charset rule without a text/css content
    type, an @charset rule that is not at the start of a text/css body, and
    a leading @charset rule in a text/css body. Only the last one is
    expected to produce the gb18030 decoding of the payload.
    """
    # CSS rule whose string content is the byte sequence under test.
    css_rule = b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}'
    # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
    gb18030_form = u"鏄庝集"

    # Case 1: @charset present, but the content type is not set to text/css.
    resp = tutils.tresp(content=b'@charset "gb2312";' + css_rule)
    assert gb18030_form not in resp.text

    # Case 2: text/css, but @charset is preceded by other bytes.
    resp = tutils.tresp(content=b'foo@charset "gb2312";' + css_rule)
    resp.headers["content-type"] = "text/css"
    assert gb18030_form not in resp.text

    # Case 3: text/css with @charset as the very first thing in the body.
    resp = tutils.tresp(content=b'@charset "gb2312";' + css_rule)
    resp.headers["content-type"] = "text/css"
    assert gb18030_form in resp.text

def test_guess_latin_1(self):
r = tutils.tresp(content=b"\xF0\xE2")
assert r.text == u"ðâ"
Expand Down

0 comments on commit 2fe8f3a

Please sign in to comment.