From 4719c74f6522a5d87599c634c406c704e1c0a988 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alicia=20Boya=20Garc=C3=ADa?= <ntrrgc@gmail.com>
Date: Tue, 27 Aug 2024 11:50:31 +0200
Subject: [PATCH] New TOC CD-TEXT string decoding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch replaces the previous broken approach to TOC string decoding
that used `.encode().decode('unicode_escape')` with proper parsing of
the escape sequences cdrdao is known to generate.

The new parser is also lenient with invalid escape sequences, that can
occur due to improper escaping in cdrdao. See:
https://github.com/cdrdao/cdrdao/issues/32

Latin-1:

This new parsing method should work for Latin-1 strings for both old and
new versions of cdrdao, as long as those strings don't trigger the
improper escaping issues in upstream cdrdao.

This has been verified with the album Diorama from the Danish black
metal band MØL.

MS-JIS:

This new parsing method should also work for MS-JIS strings as long as
the .toc file was generated by cdrdao 1.2.5+ and the strings don't
trigger improper escaping issues in upstream cdrdao.

Unfortunately, I don't have any CD with CD-Text in MS-JIS, so I could
not verify this.

cdrdao versions before 1.2.5 will still cause whipper to produce
mojibake (garbled characters) when reading MS-JIS CD-Text, as those
versions do not encode strings in UTF-8.

Other encodings:

As far as I know, CD-Text only supports officially ASCII, Latin-1 and
MS-JIS, but I wouldn't be surprised if there are unofficial encodings
out there, given the strange strings I've seen in some bug reports.

If you have a CD with garbled CD-Text, please submit a bug report
indicating the performer, album name, language and attach the .toc file
so that the produced strings can be compared to the expected text.

Fixes https://github.com/whipper-team/whipper/issues/169

Signed-off-by: Alicia Boya García <ntrrgc@gmail.com>
---
 whipper/image/toc.py                      |  62 +++++++++-
 whipper/test/diorama.cue                  |  45 ++++++++
 whipper/test/diorama_noutf8.toc           | 134 ++++++++++++++++++++++
 whipper/test/diorama_utf8.toc             | 134 ++++++++++++++++++++++
 whipper/test/test_image_toc.py            |  67 +++++++++++
 whipper/test/toc_cdtext_string_parsing.py |  36 ++++++
 6 files changed, 472 insertions(+), 6 deletions(-)
 create mode 100644 whipper/test/diorama.cue
 create mode 100644 whipper/test/diorama_noutf8.toc
 create mode 100644 whipper/test/diorama_utf8.toc
 create mode 100644 whipper/test/toc_cdtext_string_parsing.py
diff --git a/whipper/image/toc.py b/whipper/image/toc.py
index b4c7656e..d4efb966 100644
--- a/whipper/image/toc.py
+++ b/whipper/image/toc.py
@@ -33,7 +33,61 @@
 logger = logging.getLogger(__name__)
 
 # shared
-_CDTEXT_CANDIDATE_RE = re.compile(r'(?P<key>\w+) "(?P<value>.+)"')
+_CDTEXT_CANDIDATE_RE = re.compile(r'''
+    (?P<key>\w+)   # CD-TEXT key.
+    \s+
+    "(?P<value>    # CD-TEXT value.
+        (?:
+            \\\\   # escaped backslash.
+        |   \\"    # escaped double-quote.
+        |   [^"]   # not a double-quote.
+        )+?        # the value must not be empty.
+    )"
+''', flags=re.VERBOSE)
+
+_STRING_SUBSTITUTIONS_RE = re.compile(r'''
+      \\(?P<octal>[0-8][0-8][0-8])
+    | \\"
+    | \\\\
+''', flags=re.VERBOSE)
+
+
+def _string_contents_repl(match: 're.Match[str]') -> str:
+    group_octal = match.group('octal')
+    if group_octal is not None:
+        code_point = int(group_octal, base=8)
+        return chr(code_point)
+
+    entire_match = match.group(0)
+    if entire_match == '\\"':
+        return '"'
+    elif entire_match == '\\\\':
+        return '\\'
+    else:
+        raise RuntimeError("unexpected match: ", entire_match)
+
+
+def parse_toc_string(str_within_quotes: str) -> str:
+    """
+    Given the a quoted string obtained from a TOC file using
+    _CDTEXT_CANDIDATE_RE, compute the unescaped string contained inside.
+
+    Backslash substitutions fail gracefully, which is important since cdrdao
+    string encoding has been found to be flawed as recently as cdrdao 1.2.5
+    (2023):
+    https://github.com/cdrdao/cdrdao/issues/32
+    https://github.com/whipper-team/whipper/issues/169
+
+    This function assumes cdrdao 1.2.5+ (2023) was used, which unless --no-utf8
+    is passed, provides UTF-8 strings. It also works with older versions as long
+    as the encoding was ASCII or Latin-1.
+
+    Note: CD-Text in MS-JIS produced by cdrdao <1.2.5 will produce mojibake
+    (garbled characters), just like the older code this function replaced.
+    """
+    return _STRING_SUBSTITUTIONS_RE.sub(_string_contents_repl,
+                                        str_within_quotes)
+
 
 # header
 _CATALOG_RE = re.compile(r'^CATALOG "(?P<catalog>\d+)"$')
@@ -208,11 +262,7 @@ def parse(self):
             m = _CDTEXT_CANDIDATE_RE.search(line)
             if m:
                 key = m.group('key')
-                value = m.group('value')
-                # usually, value is encoded with octal escapes and in latin-1
-                # FIXME: other encodings are possible, does cdrdao handle
-                # them ?
-                value = value.encode().decode('unicode_escape')
+                value = parse_toc_string(m.group('value'))
                 if key in table.CDTEXT_FIELDS:
                     # FIXME: consider ISRC separate for now, but this
                     # is a limitation of our parser approach
diff --git a/whipper/test/diorama.cue b/whipper/test/diorama.cue
new file mode 100644
index 00000000..6d6a9c2f
--- /dev/null
+++ b/whipper/test/diorama.cue
@@ -0,0 +1,45 @@
+REM DISCID 700AC908
+REM COMMENT "whipper 0.10.1.dev27+ga4b9742.d20240827"
+PERFORMER "MØL"
+TITLE "Diorama"
+FILE "data.wav" WAVE
+  TRACK 01 AUDIO
+    PERFORMER "MØL"
+    TITLE "Fraktur"
+    ISRC DED832100085
+    INDEX 01 00:00:00
+  TRACK 02 AUDIO
+    PERFORMER "MØL"
+    TITLE "Photophobic"
+    ISRC DED832100086
+    INDEX 01 04:19:00
+  TRACK 03 AUDIO
+    PERFORMER "MØL"
+    TITLE "Serf"
+    ISRC DED832100087
+    INDEX 01 09:37:00
+  TRACK 04 AUDIO
+    PERFORMER "MØL"
+    TITLE "Vestige"
+    ISRC DED832100088
+    INDEX 01 14:59:12
+  TRACK 05 AUDIO
+    PERFORMER "MØL"
+    TITLE "Redacted"
+    ISRC DED832100089
+    INDEX 01 20:37:68
+  TRACK 06 AUDIO
+    PERFORMER "MØL"
+    TITLE "Itinerari"
+    ISRC DED832100090
+    INDEX 01 25:54:18
+  TRACK 07 AUDIO
+    PERFORMER "MØL"
+    TITLE "Tvesind"
+    ISRC DED832100091
+    INDEX 01 30:57:58
+  TRACK 08 AUDIO
+    PERFORMER "MØL"
+    TITLE "Diorama"
+    ISRC DED832100092
+    INDEX 01 38:46:57
diff --git a/whipper/test/diorama_noutf8.toc b/whipper/test/diorama_noutf8.toc
new file mode 100644
index 00000000..9b038845
--- /dev/null
+++ b/whipper/test/diorama_noutf8.toc
@@ -0,0 +1,134 @@
+CD_DA
+
+CD_TEXT {
+  LANGUAGE_MAP {
+    0: 9
+  }
+  LANGUAGE 0 {
+    TITLE "Diorama"
+    PERFORMER "M\330L"
+    SIZE_INFO { 0,  1,  8,  0,  7,  3,  0,  0,  0,  0,  0,  0,
+                0,  0,  0,  0,  0,  0,  0,  3, 12,  0,  0,  0,
+                0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0}
+  }
+}
+
+// Track 1
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100085"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Fraktur"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 0 04:19:00
+
+
+// Track 2
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100086"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Photophobic"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 04:19:00 05:18:00
+
+
+// Track 3
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100087"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Serf"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 09:37:00 05:22:12
+
+
+// Track 4
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100088"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Vestige"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 14:59:12 05:38:56
+
+
+// Track 5
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100089"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Redacted"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 20:37:68 05:16:25
+
+
+// Track 6
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100090"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Itinerari"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 25:54:18 05:03:40
+
+
+// Track 7
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100091"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Tvesind"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 30:57:58 07:48:74
+
+
+// Track 10
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100092"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Diorama"
+    PERFORMER "M\330L"
+  }
+}
+FILE "data.wav" 38:46:57 07:14:36
+
diff --git a/whipper/test/diorama_utf8.toc b/whipper/test/diorama_utf8.toc
new file mode 100644
index 00000000..45125952
--- /dev/null
+++ b/whipper/test/diorama_utf8.toc
@@ -0,0 +1,134 @@
+CD_DA
+
+CD_TEXT {
+  LANGUAGE_MAP {
+    0: 9
+  }
+  LANGUAGE 0 {
+    TITLE "Diorama"
+    PERFORMER "MØL"
+    SIZE_INFO { 0,  1,  8,  0,  7,  3,  0,  0,  0,  0,  0,  0,
+                0,  0,  0,  0,  0,  0,  0,  3, 12,  0,  0,  0,
+                0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0}
+  }
+}
+
+// Track 1
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100085"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Fraktur"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 0 04:19:00
+
+
+// Track 2
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100086"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Photophobic"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 04:19:00 05:18:00
+
+
+// Track 3
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100087"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Serf"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 09:37:00 05:22:12
+
+
+// Track 4
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100088"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Vestige"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 14:59:12 05:38:56
+
+
+// Track 5
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100089"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Redacted"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 20:37:68 05:16:25
+
+
+// Track 6
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100090"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Itinerari"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 25:54:18 05:03:40
+
+
+// Track 7
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100091"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Tvesind"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 30:57:58 07:48:74
+
+
+// Track 8
+TRACK AUDIO
+NO COPY
+NO PRE_EMPHASIS
+TWO_CHANNEL_AUDIO
+ISRC "DED832100092"
+CD_TEXT {
+  LANGUAGE 0 {
+    TITLE "Diorama"
+    PERFORMER "MØL"
+  }
+}
+FILE "data.wav" 38:46:57 07:14:36
+
diff --git a/whipper/test/test_image_toc.py b/whipper/test/test_image_toc.py
index 177622b3..76cb82f9 100644
--- a/whipper/test/test_image_toc.py
+++ b/whipper/test/test_image_toc.py
@@ -5,6 +5,7 @@
 import copy
 import shutil
 import tempfile
+from abc import abstractmethod
 
 from whipper.image import toc
 
@@ -194,6 +195,72 @@ def testConvertCue(self):
         ref = self.readCue('breeders.cue')
         self.assertEqual(cue, ref)
 
+
+class DioramaTOCMixin:
+    # TODO figure out how to make this class abstract
+    """
+    MØL - Diorama contains CD-Text.
+
+    Two .toc files are provided:
+     - diorama_utf8.toc (UTF-8 mode, cdrdao 1.2.5)
+     - diorama_noutf8.toc (--no-utf8 mode, cdrdao 1.2.5, but representative of
+       any older version of cdrdao)
+
+    Regardless of the version chosen for the toc file, all the same tests should
+    pass, including generating the same .cue file as output.
+    """
+
+    @property
+    @abstractmethod
+    def tocFileName(self) -> str:
+        raise NotImplementedError
+
+    def setUp(self):
+        self.path = os.path.join(os.path.dirname(__file__), self.tocFileName)
+        self.toc = toc.TocFile(self.path)
+        self.toc.parse()
+        self.assertEqual(len(self.toc.table.tracks), 8)
+
+    def testCDText(self):
+        cdt = self.toc.table.cdtext
+        self.assertEqual(cdt['PERFORMER'], 'MØL')
+        self.assertEqual(cdt['TITLE'], 'Diorama')
+
+        t = self.toc.table.tracks[0]
+        cdt = t.cdtext
+        self.assertEqual(cdt['PERFORMER'], 'MØL')
+        self.assertEqual(cdt['TITLE'], 'Fraktur')
+
+    def testConvertCue(self):
+        self.assertTrue(self.toc.table.hasTOC())
+        cue = self.toc.table.cue()
+        with open("/tmp/miau.txt", "w") as f:
+            f.write(cue)
+        ref = self.readCue('diorama.cue')
+        self.maxDiff = None
+        self.assertEqual(cue, ref)
+
+
+class CDTextLatin1TOCTestCase(common.TestCase, common.UnicodeTestMixin,
+                              DioramaTOCMixin):
+    @property
+    def tocFileName(self) -> str:
+        return 'diorama_noutf8.toc'
+
+    def setUp(self):
+        DioramaTOCMixin.setUp(self)
+
+
+class CDTextUTF8TOCTestCase(common.TestCase, common.UnicodeTestMixin,
+                            DioramaTOCMixin):
+    @property
+    def tocFileName(self) -> str:
+        return 'diorama_utf8.toc'
+
+    def setUp(self):
+        DioramaTOCMixin.setUp(self)
+
+
 # Ladyhawke has a data track
 
 
diff --git a/whipper/test/toc_cdtext_string_parsing.py b/whipper/test/toc_cdtext_string_parsing.py
new file mode 100644
index 00000000..ccfc4a11
--- /dev/null
+++ b/whipper/test/toc_cdtext_string_parsing.py
@@ -0,0 +1,36 @@
+from whipper.image.toc import _CDTEXT_CANDIDATE_RE, parse_toc_string
+
+from whipper.test import common
+
+class TestTOCStringParsing(common.TestCase):
+    def check_string(self, str_with_quotes: str, str_parsed_expected: str):
+        text = f"PERFORMER {str_with_quotes}"
+        match = _CDTEXT_CANDIDATE_RE.match(text)
+        if not match:
+            self.fail(f"String wasn't matched: {text}")
+        self.assertEquals(match.start(), 0)
+        self.assertEquals(match.end(), len(text))
+
+        str_parsed_actual = parse_toc_string(match.group("value"))
+        self.assertEquals(str_parsed_actual, str_parsed_expected)
+
+    def test_simple(self):
+        self.check_string('"foo bar"', 'foo bar')
+
+    def test_escaped_quotes(self):
+        self.check_string(r'"the \"foos\""', r'the "foos"')
+
+    def test_escaped_backslash(self):
+        self.check_string(r'"foo\\bar"', r'foo\bar')
+
+    def test_escaped_latin1(self):
+        self.check_string(r'"M\330L"', r'MØL')
+
+    def test_incomplete_escape(self):
+        self.check_string(r'"M\33a"', r'M\33a')
+
+    def test_trailing_backslash(self):
+        self.check_string(r'"foo\\"', 'foo\\')
+
+    def test_unicode(self):
+        self.check_string(r'"MØL"', 'MØL')
\ No newline at end of file