From 2fe8f3a21818afd6c83ebe6759df065f37ec9a3a Mon Sep 17 00:00:00 2001
From: Alexander Prinzhorn
Date: Thu, 6 Aug 2020 15:01:23 +0200
Subject: [PATCH] Use @charset to decode CSS files if available

---
 CHANGELOG                               |  1 +
 mitmproxy/net/http/message.py           |  6 ++++++
 test/mitmproxy/net/http/test_message.py | 21 +++++++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 74a9a0420a..9cf7f086fe 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,7 @@ Unreleased: mitmproxy next
     ** Full Changelog **
 
     * Add MsgPack content viewer (@tasn)
+    * Use `@charset` to decode CSS files if available (@prinzhorn)
 
     * --- TODO: add new PRs above this line ---
 
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index aea0d91b76..ba3269aa08 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -193,6 +193,12 @@ def _guess_encoding(self, content: bytes = b"") -> str:
             meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
             if meta_charset:
                 enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            if "text/css" in self.headers.get("content-type", ""):
+                # @charset rule must be the very first thing.
+                css_charset = re.match(rb"""@charset "([^"]+)";""", content)
+                if css_charset:
+                    enc = css_charset.group(1).decode("ascii", "ignore")
         if not enc:
             enc = "latin-1"
         # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
index 7cfbfa6c6b..bd42c30c5b 100644
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -225,6 +225,27 @@ def test_guess_meta_charset(self):
         # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
         assert u"鏄庝集" in r.text
 
+    def test_guess_css_charset(self):
+        # @charset but not text/css
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" not in r.text
+
+        # @charset not at the beginning
+        r = tutils.tresp(content=b'foo@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" not in r.text
+
+        # @charset and text/css
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" in r.text
+
     def test_guess_latin_1(self):
         r = tutils.tresp(content=b"\xF0\xE2")
         assert r.text == u"ðâ"