Use @charset to decode CSS files if available

snemes · Aug 6, 2020 · 2fe8f3a
1 parent 2f33d6c
commit 2fe8f3a
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 0 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -3,6 +3,7 @@ Unreleased: mitmproxy next
 ** Full Changelog **
 
     * Add MsgPack content viewer (@tasn)
+    * Use `@charset` to decode CSS files if available (@prinzhorn)
 
     * --- TODO: add new PRs above this line ---
 

diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
@@ -193,6 +193,12 @@ def _guess_encoding(self, content: bytes = b"") -> str:
             meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
             if meta_charset:
                 enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            if "text/css" in self.headers.get("content-type", ""):
+                # @charset rule must be the very first thing.
+                css_charset = re.match(rb"""@charset "([^"]+)";""", content)
+                if css_charset:
+                    enc = css_charset.group(1).decode("ascii", "ignore")
         if not enc:
             enc = "latin-1"
         # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.

diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
@@ -225,6 +225,27 @@ def test_guess_meta_charset(self):
         # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
         assert u"鏄庝集" in r.text
 
+    def test_guess_css_charset(self):
+        # @charset but not text/css
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" not in r.text
+
+        # @charset not at the beginning
+        r = tutils.tresp(content=b'foo@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" not in r.text
+
+        # @charset and text/css
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
+        assert u"鏄庝集" in r.text
+
     def test_guess_latin_1(self):
         r = tutils.tresp(content=b"\xF0\xE2")
         assert r.text == u"ðâ"