Skip to content

Commit

Permalink
Improve sanitize_text() to keep selected CSI escape sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
dmach committed Mar 7, 2024
1 parent 2d53994 commit f9b1734
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 12 deletions.
84 changes: 72 additions & 12 deletions osc/output/output.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import re
import sys
from typing import Dict
from typing import Optional
from typing import TextIO
from typing import Union
Expand Down Expand Up @@ -44,24 +46,82 @@ def print_msg(*args, print_to: Optional[str] = "debug"):
raise ValueError(f"Invalid value of the 'print_to' option: {print_to}")


# Control characters (code points 0-31) that are stripped from text.
# Three whitespace characters are deliberately left out of the forbidden set:
#   0x09 - horizontal tab (\t)
#   0x0A - line feed (\n)
#   0x0D - carriage return (\r)
# (related to CVE-2012-1095)
#
# NOTE: 0x1B (escape) is part of the forbidden set. It would be good to
# selectively allow it together with safe & trusted escape sequences.
FORBIDDEN_BYTES = bytes(code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D))
# Deletion table in the form accepted by ``str.translate()``:
# every forbidden code point maps to None.
FORBIDDEN_CHARS = {code: None for code in FORBIDDEN_BYTES}
# cached compiled regular expressions; they are created on the first use
SANITIZE_TEXT_RE: Optional[Dict] = None


def sanitize_text(text: Union[bytes, str]) -> Union[bytes, str]:
"""
Remove forbidden characters from ``text``.
Remove forbidden characters and escape sequences from ``text``.
This must be run on lines or the whole text to work correctly.
Processing blocks of constant size might lead to splitting escape sequences
and leaving garbage characters after sanitizing.
"""
global SANITIZE_TEXT_RE

if not SANITIZE_TEXT_RE:
SANITIZE_TEXT_RE = {}

# CONTROL CHARACTERS
# remove all control characters with the exception of:
# 0x09 - horizontal tab (\t)
# 0x0A - line feed (\n)
# 0x0D - carriage return (\r)
# 0x1B - escape - is selectively handled later as part of sanitizing escape sequences

regex = r"[\x00-\x08\x0B\x0C\x0E-\x1A\x1C-\x1F]"
SANITIZE_TEXT_RE["str_control"] = re.compile(regex)
SANITIZE_TEXT_RE["bytes_control"] = re.compile(regex.encode("ascii"))

# CSI ESCAPE SEQUENCES
# https://en.wikipedia.org/wiki/ANSI_escape_code#CSI_codes
# remove all but allowed CSI escape sequences

# negative lookahead assertion that allows safe color escape sequences
neg_allowed_csi_sequences = r"(?!\[([0-5]|[34][0-7]|;)+m)"

# range 0x30–0x3F (OCT \040-\077) (ASCII 0–9:;<=>?); zero or more characters
csi_parameter_bytes = r"[\x30-\x3F]*"

# range 0x20–0x2F (OCT \040-\057) (ASCII space and !"#$%&'()*+,-./); zero or more characters
csi_itermediate_bytes = r"[\x20-\x2F]*"

# range 0x40–0x7E (OCT \100-\176) (ASCII @A–Z[\]^_`a–z{|}~); 1 character
csi_final_byte = r"[\x40-\x7E]"

regex = rf"\033{neg_allowed_csi_sequences}\[{csi_parameter_bytes}{csi_itermediate_bytes}{csi_final_byte}"
SANITIZE_TEXT_RE["str_csi_sequences"] = re.compile(regex)
SANITIZE_TEXT_RE["bytes_csi_sequences"] = re.compile(regex.encode("ascii"))

# FE ESCAPE SEQUENCES
# https://en.wikipedia.org/wiki/ANSI_escape_code#Fe_Escape_sequences
# remove all Fe escape sequences

# range 0x40 to 0x5F (ASCII @A–Z[\]^_); 1 character
fe = r"[\x40-x5F]"
regex = rf"\033{neg_allowed_csi_sequences}{fe}"
SANITIZE_TEXT_RE["str_fe_sequences"] = re.compile(regex)
SANITIZE_TEXT_RE["bytes_fe_sequences"] = re.compile(regex.encode("ascii"))

# REMAINING ESCAPE CHARACTERS
# remove all remaining escape characters that are not followed with the allowed CSI escape sequences

regex = rf"\033{neg_allowed_csi_sequences}"
SANITIZE_TEXT_RE["str_esc"] = re.compile(regex)
SANITIZE_TEXT_RE["bytes_esc"] = re.compile(regex.encode("ascii"))

if isinstance(text, bytes):
return text.translate(None, FORBIDDEN_BYTES)
return text.translate(FORBIDDEN_CHARS)
text = SANITIZE_TEXT_RE["bytes_control"].sub(b"", text)
text = SANITIZE_TEXT_RE["bytes_csi_sequences"].sub(b"", text)
text = SANITIZE_TEXT_RE["bytes_fe_sequences"].sub(b"", text)
text = SANITIZE_TEXT_RE["bytes_esc"].sub(b"", text)
else:
text = SANITIZE_TEXT_RE["str_control"].sub("", text)
text = SANITIZE_TEXT_RE["str_csi_sequences"].sub("", text)
text = SANITIZE_TEXT_RE["str_fe_sequences"].sub("", text)
text = SANITIZE_TEXT_RE["str_esc"].sub("", text)
return text


def safe_print(*args, **kwargs):
Expand Down
78 changes: 78 additions & 0 deletions tests/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import osc.conf
from osc.output import KeyValueTable
from osc.output import print_msg
from osc.output import sanitize_text
from osc.output import tty


Expand Down Expand Up @@ -160,5 +161,82 @@ def test_stderr(self):
self.assertEqual("foo bar\n", stderr.getvalue())


class TestSanitization(unittest.TestCase):
    """Checks of ``sanitize_text()`` for both ``str`` and ``bytes`` input."""

    def test_control_chars_bytes(self):
        # all bytes from the 0-31 range; only tab, line feed and carriage return are expected back
        every_control_byte = bytes(range(32))
        self.assertEqual(sanitize_text(every_control_byte), b"\t\n\r")

    def test_control_chars_str(self):
        # all characters from the 0-31 range; only tab, line feed and carriage return are expected back
        every_control_char = "".join(map(chr, range(32)))
        self.assertEqual(sanitize_text(every_control_char), "\t\n\r")

    def test_csi_escape_sequences_str(self):
        # allowed CSI escape sequences come back unchanged
        for allowed in (">\033[0m<", ">\033[1;31;47m]<"):
            self.assertEqual(sanitize_text(allowed), allowed)

        # CSI escape sequences that are not allowed are stripped
        for rejected in (">\033[8m<",):
            self.assertEqual(sanitize_text(rejected), "><")

    def test_csi_escape_sequences_bytes(self):
        # allowed CSI escape sequences come back unchanged
        for allowed in (b">\033[0m<", b">\033[1;31;47m]<"):
            self.assertEqual(sanitize_text(allowed), allowed)

        # CSI escape sequences that are not allowed are stripped
        for rejected in (b">\033[8m<",):
            self.assertEqual(sanitize_text(rejected), b"><")

    def test_standalone_escape_str(self):
        # an escape character that starts no sequence
        self.assertEqual(sanitize_text(">\033<"), "><")

    def test_standalone_escape_bytes(self):
        # an escape character that starts no sequence
        self.assertEqual(sanitize_text(b">\033<"), b"><")

    def test_fe_escape_sequences_str(self):
        # escape followed by any character from the 0x40-0x5F range
        for code in range(0x40, 0x60):
            wrapped = ">\033" + chr(code) + "<"
            self.assertEqual(sanitize_text(wrapped), "><")

    def test_fe_escape_sequences_bytes(self):
        # escape followed by any byte from the 0x40-0x5F range
        for code in range(0x40, 0x60):
            wrapped = b">\033" + bytes([code]) + b"<"
            self.assertEqual(sanitize_text(wrapped), b"><")

    def test_osc_escape_sequences_str(self):
        # OSC (Operating System Command) sequences
        # \033] is removed with the Fe sequences
        result = sanitize_text("\033]0;this is the window title\007")
        self.assertEqual(result, "0;this is the window title")

    def test_osc_escape_sequences_bytes(self):
        # OSC (Operating System Command) sequences
        # \033] is removed with the Fe sequences
        result = sanitize_text(b"\033]0;this is the window title\007")
        self.assertEqual(result, b"0;this is the window title")


if __name__ == "__main__":
unittest.main()

0 comments on commit f9b1734

Please sign in to comment.