From 05208f94440c581248fa301dbf6a052ea5fcea15 Mon Sep 17 00:00:00 2001 From: KG Date: Wed, 18 Dec 2024 19:09:11 -0500 Subject: [PATCH] Add token accessors and rework name converters --- README.md | 2 +- tivars/tokenizer/__init__.py | 22 ++++++++++++--- tivars/tokenizer/decoder.py | 49 ++++++++++++--------------------- tivars/types/gdb.py | 2 +- tivars/types/list.py | 2 +- tivars/types/picture.py | 4 +-- tivars/types/tokenized.py | 52 +++++++++++++++++++++++++++--------- tivars/var.py | 4 +-- 8 files changed, 81 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 7498e4e..333ed91 100644 --- a/README.md +++ b/README.md @@ -261,7 +261,7 @@ All entry types support string formatting using Python's f-strings. - `width`: how many digits to group together *(default: no groups)* - Tokenized entries support formatting of their tokens into readable lines: `{line_spec}{sep}{type}{lang}` - `line_spec`: format specifier for line numbers *(default: no line numbers)* - - `sep`: a string to separate lines and line numbers *(default: none)* + - `sep`: a string to separate lines and line numbers *(required for line numbering)* - `type`: how to format each token - `a`: use accessible names - `d`: use display names *(default)* diff --git a/tivars/tokenizer/__init__.py b/tivars/tokenizer/__init__.py index b573b78..196434d 100644 --- a/tivars/tokenizer/__init__.py +++ b/tivars/tokenizer/__init__.py @@ -16,14 +16,28 @@ class TokenizedString(String): """ Converter for data sections best interpreted as strings of tokens - Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens. + Tokenization uses the TI-84+CE token sheet. """ _T = str @classmethod def get(cls, data: bytes, **kwargs) -> _T: - return decode(data.ljust(8, b'\x00'))[0] + return "".join(token.langs["en"].display for token in decode(data.ljust(8, b'\x00'))[0]) + + @classmethod + def set(cls, value: _T, *, instance=None, **kwargs) -> bytes: + return encode(value)[0].rstrip(b'\x00') + + +class Name(TokenizedString): + """ + Converter for names of vars + + Tokenization uses the TI-84+CE token sheet, which is backwards compatible for all var name tokens. + """ + + _T = str @classmethod def set(cls, value: _T, *, instance=None, **kwargs) -> bytes: @@ -37,5 +51,5 @@ def set(cls, value: _T, *, instance=None, **kwargs) -> bytes: return data -__all__ = ["decode", "encode", "normalize", "TokenizedString", - "Tokens", "OsVersion", "OsVersions"] +__all__ = ["decode", "encode", "normalize", "Name", "TokenizedString", + "Token", "Tokens", "OsVersion", "OsVersions"] diff --git a/tivars/tokenizer/decoder.py b/tivars/tokenizer/decoder.py index 51b480c..f1dba3d 100644 --- a/tivars/tokenizer/decoder.py +++ b/tivars/tokenizer/decoder.py @@ -9,11 +9,15 @@ from tivars.tokens.scripts import * -def decode(bytestream: bytes, *, - tokens: Tokens = None, lang: str = "en", - mode: str = "display") -> tuple[str | bytes, OsVersion]: +def invalid_token(bits: bytes) -> Token: + name = rf"\x{bits.hex()}" if len(bits) == 1 else rf"\u{bits.hex()}" + return Token(bits, {"en": Translation(b'?', "?", name, [])}) + + + +def decode(bytestream: bytes, *, tokens: Tokens = None) -> tuple[list[Token], OsVersion]: """ - Decodes a byte stream into a string of tokens and its minimum supported OS version + Decodes a byte stream into a list of `Token` objects and its minimum supported OS version Each token is represented using one of three different representations formats, dictated by ``mode``: - ``display``: Represents the tokens with Unicode characters matching the calculator's display @@ -22,9 +26,7 @@ def decode(bytestream: bytes, *, :param bytestream: The token bytes to decode :param tokens: The `Tokens` object to use for decoding (defaults to the TI-84+CE tokens) - :param lang: The language used in ``string`` (defaults to English, ``en``) - :param mode: The form of token representation to use for output (defaults to ``display``) - :return: A tuple of a string of token representations and a minimum `OsVersion` + :return: A tuple of a list of `Token` objects and a minimum `OsVersion` """ tokens = tokens or TI_84PCE.tokens @@ -32,8 +34,6 @@ def decode(bytestream: bytes, *, out = [] since = OsVersions.INITIAL - byte_attr = mode == "ti_ascii" - index = 0 curr_bytes = b'' while index < len(bytestream): @@ -42,50 +42,35 @@ def decode(bytestream: bytes, *, if curr_bytes[0]: if curr_bytes in tokens.bytes: - try: - out.append(getattr(tokens.bytes[curr_bytes].langs[lang], mode)) - - except AttributeError: - raise ValueError(f"'{mode}' is not a recognized token representation") - - except KeyError: - raise ValueError(f"'{lang}' is not a recognized language") - + out.append(tokens.bytes[curr_bytes]) since = max(tokens.bytes[curr_bytes].since, since) curr_bytes = b'' elif len(curr_bytes) >= 2: - if not any(key.startswith(curr_bytes[:1]) for key in tokens.bytes): - warn(f"Unrecognized byte '0x{curr_hex}' at position {index}.", - BytesWarning) - - out.append(b'?' if byte_attr else rf"\x{curr_hex}") - - else: - warn(f"Unrecognized bytes '0x{curr_hex}' at position {index}.", - BytesWarning) - - out.append(b'?' if byte_attr else rf"\u{curr_hex}") + warn(f"Unrecognized byte(s) '0x{curr_hex}' at position {index}.", + BytesWarning) + out.append(invalid_token(curr_bytes)) curr_bytes = b'' - elif any(curr_bytes): + elif curr_bytes[-1]: count = 0 while not curr_bytes[0]: curr_bytes = curr_bytes[1:] count += 1 - out.append(b'?' if byte_attr else r"\x00") + out.append(invalid_token(b'\x00')) warn(f"There are {count} unexpected null bytes at position {index}." if count > 1 else f"There is an unexpected null byte at position {index}.", BytesWarning) + curr_bytes = b'' index -= 1 index += 1 - return b''.join(out) if byte_attr else "".join(out), since + return out, since __all__ = ["decode"] diff --git a/tivars/types/gdb.py b/tivars/types/gdb.py index 05aee6f..ece235b 100644 --- a/tivars/types/gdb.py +++ b/tivars/types/gdb.py @@ -283,7 +283,7 @@ def json_name(self) -> str: :return: The name of this equation used in the GDB JSON format """ - return decode(self.raw.name, mode="accessible")[0].strip("{}|") + return self.decode(self.raw.name, mode="accessible").strip("{}|") def load_data_section(self, data: BytesIO): flag_byte = data.read(1) diff --git a/tivars/types/list.py b/tivars/types/list.py index d70cf41..78ea29b 100644 --- a/tivars/types/list.py +++ b/tivars/types/list.py @@ -15,7 +15,7 @@ from .real import RealEntry -class ListName(TokenizedString): +class ListName(Name): """ Converter for the name section of lists diff --git a/tivars/types/picture.py b/tivars/types/picture.py index b432c04..d71cae6 100644 --- a/tivars/types/picture.py +++ b/tivars/types/picture.py @@ -11,7 +11,7 @@ from tivars.data import * from tivars.models import * -from tivars.tokenizer import TokenizedString +from tivars.tokenizer import Name from tivars.var import SizedEntry RGB = tuple[int, int, int] @@ -331,7 +331,7 @@ def array(self) -> list[list[pixel_type]]: # Workaround until the token sheets are updated -class ImageName(TokenizedString): +class ImageName(Name): """ Converter for the name section of images diff --git a/tivars/types/tokenized.py b/tivars/types/tokenized.py index 9b5518b..42bd57e 100644 --- a/tivars/types/tokenized.py +++ b/tivars/types/tokenized.py @@ -6,6 +6,7 @@ import re from io import BytesIO +from typing import Sequence from warnings import catch_warnings, simplefilter, warn from tivars.data import * @@ -43,9 +44,8 @@ class TokenizedEntry(SizedEntry): def __format__(self, format_spec: str) -> str: try: - lines, sep, spec, lang = re.match(r"(.*?[a-z%#])?(\W*)(\w?)\.?(\w+)?", format_spec).groups() - line_number = f"{{index:{lines}}}{sep}" if lines else sep - lang = lang or "en" + lines, sep, spec, lang = re.match(r"(?:(.*?[a-z%#])(\W+))?(\w?)(\.\w+)?$", format_spec).groups() + lang = (lang or ".en")[1:] match spec: case "" | "d": @@ -57,25 +57,31 @@ def __format__(self, format_spec: str) -> str: case _: raise KeyError - return "\n".join(line_number.format(index=index) + line for index, line in enumerate(string.split("\n"))) + if lines: + return "\n".join(f"{index:{lines}}{sep}" + line for index, line in enumerate(string.split("\n"))) + + else: + return string except (AttributeError, KeyError, TypeError, ValueError): return super().__format__(format_spec) @staticmethod - def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str | bytes: + def decode(data: bytes, *, lang: str = "en", mode: str = "display") -> str: """ Decodes a byte stream into a string of tokens - For detailed information on tokenization modes, see `tivars.tokenizer.decode`. - :param data: The token bytes to decode :param lang: The language used in ``string`` (defaults to English, ``en``) :param mode: The form of token representation to use for output (defaults to ``display``) :return: A string of token representations """ - return decode(data, lang=lang, mode=mode)[0] + try: + return "".join(getattr(token.langs[lang], mode) for token in decode(data)[0]) + + except (AttributeError, TypeError): + raise ValueError(f"unrecognized tokenization mode: '{mode}'") @staticmethod def encode(string: str, *, model: TIModel = None, lang: str = None, mode: str = None) -> bytes: @@ -165,6 +171,23 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m self.data = self.encode(string, model=model, lang=lang, mode=mode) + @Loader[Sequence[Token]] + def load_tokens(self, tokens: Sequence[Token]): + """ + Loads this entry from a sequence of `Token` objects + + :param tokens: The sequence of tokens to load + """ + + self.data = b''.join(token.bits for token in tokens) + + def tokens(self) -> list[Token]: + """ + :return: The tokens comprising this entry as a list of `Token` objects + """ + + return decode(self.data)[0] + class TIEquation(TokenizedEntry, register=True): """ @@ -191,7 +214,7 @@ def __init__(self, init=None, *, super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data) - @Section(8, TokenizedString) + @Section(8, Name) def name(self, value) -> str: """ The name of the entry @@ -245,7 +268,7 @@ def __init__(self, init=None, *, super().__init__(init, for_flash=for_flash, name=name, version=version, archived=archived, data=data) - @Section(8, TokenizedString) + @Section(8, Name) def name(self, value) -> str: """ The name of the entry @@ -335,13 +358,16 @@ def load_string(self, string: str, *, model: TIModel = None, lang: str = None, m super().load_string(string, model=model, lang=lang, mode=mode) def string(self) -> str: - string = super().string() - if not self.is_tokenized: warn("ASM programs may not have tokenized data.", UserWarning) - return string + with catch_warnings(): + simplefilter("ignore") + return super().string() + + else: + return super().string() def coerce(self): with catch_warnings(): diff --git a/tivars/var.py b/tivars/var.py index dc92fec..2ad5942 100644 --- a/tivars/var.py +++ b/tivars/var.py @@ -13,7 +13,7 @@ from .data import * from .models import * -from .tokenizer import TokenizedString +from .tokenizer import Name match version_info[:2]: @@ -515,7 +515,7 @@ def type_id(self) -> int: The type determines how the contents of the data section of the entry are interpreted. """ - @Section(8, TokenizedString) + @Section(8, Name) def name(self) -> str: """ The name of the entry