From 2fe8f3a21818afd6c83ebe6759df065f37ec9a3a Mon Sep 17 00:00:00 2001
From: Alexander Prinzhorn <alexander@prinzhorn.it>
Date: Thu, 6 Aug 2020 15:01:23 +0200
Subject: [PATCH] Use @charset to decode CSS files if available

---
 CHANGELOG                               |  1 +
 mitmproxy/net/http/message.py           |  6 ++++++
 test/mitmproxy/net/http/test_message.py | 21 +++++++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 74a9a0420a..9cf7f086fe 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,7 @@ Unreleased: mitmproxy next
 ** Full Changelog **
 
     * Add MsgPack content viewer (@tasn)
+    * Use `@charset` to decode CSS files if available (@prinzhorn)
 
     * --- TODO: add new PRs above this line ---
 
diff --git a/mitmproxy/net/http/message.py b/mitmproxy/net/http/message.py
index aea0d91b76..ba3269aa08 100644
--- a/mitmproxy/net/http/message.py
+++ b/mitmproxy/net/http/message.py
@@ -193,6 +193,12 @@ def _guess_encoding(self, content: bytes = b"") -> str:
             meta_charset = re.search(rb"""<meta[^>]+charset=['"]?([^'">]+)""", content)
             if meta_charset:
                 enc = meta_charset.group(1).decode("ascii", "ignore")
+        if not enc:
+            if "text/css" in self.headers.get("content-type", ""):
+                # @charset rule must be the very first thing.
+                css_charset = re.match(rb"""@charset "([^"]+)";""", content)
+                if css_charset:
+                    enc = css_charset.group(1).decode("ascii", "ignore")
         if not enc:
             enc = "latin-1"
         # Use GB 18030 as the superset of GB2312 and GBK to fix common encoding problems on Chinese websites.
diff --git a/test/mitmproxy/net/http/test_message.py b/test/mitmproxy/net/http/test_message.py
index 7cfbfa6c6b..bd42c30c5b 100644
--- a/test/mitmproxy/net/http/test_message.py
+++ b/test/mitmproxy/net/http/test_message.py
@@ -225,6 +225,27 @@ def test_guess_meta_charset(self):
         # "鏄庝集" is decoded form of \xe6\x98\x8e\xe4\xbc\xaf in gb18030
         assert u"鏄庝集" in r.text
 
+    def test_guess_css_charset(self):
+        # Case 1: body opens with @charset, but the response is not text/css -> the rule is ignored.
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        # "鏄庝集" is what \xe6\x98\x8e\xe4\xbc\xaf decodes to in gb18030, so it must be absent here.
+        assert u"鏄庝集" not in r.text
+
+        # Case 2: text/css, but @charset is preceded by other bytes ("foo") -> the rule is ignored.
+        r = tutils.tresp(content=b'foo@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # The gb18030 rendering "鏄庝集" must be absent again: the declared charset was not picked up.
+        assert u"鏄庝集" not in r.text
+
+        # Case 3: text/css and @charset is the very first thing in the body -> the rule is honoured.
+        r = tutils.tresp(content=b'@charset "gb2312";'
+                                 b'#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}')
+        r.headers["content-type"] = "text/css"
+        # gb2312 is decoded via its superset gb18030, so the gb18030 rendering "鏄庝集" must be present.
+        assert u"鏄庝集" in r.text
+
     def test_guess_latin_1(self):
         r = tutils.tresp(content=b"\xF0\xE2")
         assert r.text == u"ðâ"