From f92c4e49c13681d909a2abd08bb5a42741842c07 Mon Sep 17 00:00:00 2001 From: Petr Knap <8299754+petrknap@users.noreply.github.com> Date: Fri, 4 Aug 2023 09:00:36 +0200 Subject: [PATCH 1/3] refactor: rewritten as functional --- jsonl_formatter.py | 260 +++++++++++++++++++--------------- tests/test_jsonl_formatter.py | 9 +- 2 files changed, 151 insertions(+), 118 deletions(-) diff --git a/jsonl_formatter.py b/jsonl_formatter.py index cba92d9..1193100 100755 --- a/jsonl_formatter.py +++ b/jsonl_formatter.py @@ -7,127 +7,161 @@ import sys -class JSONLFormatter: - def __init__(self): - self.json_serialize = json.dumps - self.json_deserialize = lambda json_line: json.loads(json_line, object_pairs_hook=OrderedDict) - self.key_not_found = KeyError() - - def format(self, path: str) -> None: - try: - deserialized = self._load_and_deserialize(path) - logging.debug('File `%s` has been loaded and deserialized', path) - except IOError as load_error: - raise Exception(f'Failed to load file `{path}`\n\t{load_error}') - except Exception as deserialize_exception: - raise Exception(f'Failed to deserialize file `{path}`\n\t{deserialize_exception}') - serialized = self._serialize(deserialized) - logging.debug('Deserialized file `%s` has been serialized', path) - try: - self._write(path, serialized) - logging.debug('Serialized file `%s` has been written', path) - except IOError as write_error: - raise Exception(f'Failed to write file `{path}`\n\t{write_error}') - - def _load_and_deserialize(self, path: str) -> list: - with open(path, 'r') as jsonl_file: - return [self.json_deserialize(json_line) for json_line in jsonl_file.readlines()] - - @staticmethod - def _write(path: str, serialized: list) -> None: - with open(path, 'w') as jsonl_file: - jsonl_file.write('\n'.join(serialized) + ('\n' if serialized else '')) - - def _serialize(self, data_objects: list, **kwargs) -> list: - if not data_objects: - return [] - keys = kwargs.get('keys', None) - is_array = kwargs.get('is_array', None) - if keys is None: - keys = self._get_keys(data_objects) - if is_array is None: - is_array = isinstance(data_objects[0], list) - serialized = self._open_lines(data_objects, is_array) - for key in keys: - serialized = self._serialize_key(serialized, data_objects, key, is_array) - return self._close_lines(serialized, is_array) - - @staticmethod - def _open_lines(data_objects: list, is_array: bool) -> list: - return ['[' if is_array else '{' for _ in data_objects] - - def _serialize_key(self, lines: list, data_objects: list, key: str, is_array: bool) -> list: - values = [self._get_value(data_object, key) for data_object in data_objects] - serialized = [] - max_length = 0 - cache = [] - for index, value in enumerate(values): - if value is self.key_not_found: - key_value = '' - else: - if isinstance(value, OrderedDict) or isinstance(value, list): - key_value = self._serialize_key_object(key, value, values, index, is_array, cache) - else: - key_value = self._serialize_key_value(key, value, is_array) - key_value += ', ' - max_length = max(max_length, len(key_value)) - serialized.append(key_value) - return [ - lines[index] + key_value + ''.rjust(max_length - len(key_value), ' ') - for index, key_value in enumerate(serialized) - ] - - def _serialize_key_object(self, key: str, data_object, data_objects: list, index: int, is_array: bool, cache: list) -> str: - if not cache: - cache.extend(self._serialize( - data_objects, - keys=self._get_keys(data_objects), - is_array=isinstance(data_object, list), - )) - serialized = cache[index] - if is_array: - return serialized +# region JSONL Formatter +KEY_NOT_FOUND = KeyError() + + +def json_serialize(object) -> str: + return json.dumps(object) + + +def json_deserialize(object: str) -> OrderedDict: + return json.loads(object, object_pairs_hook=OrderedDict) + + +# region format +def format(path: str) -> None: + try: + deserialized = load_and_deserialize(path) + logging.debug('File `%s` has been loaded and deserialized', path) + except IOError as load_error: + raise Exception(f'Failed to load file `{path}`\n\t{load_error}') + except Exception as deserialize_exception: + raise Exception(f'Failed to deserialize file `{path}`\n\t{deserialize_exception}') + serialized = serialize(deserialized) + logging.debug('Deserialized file `%s` has been serialized', path) + try: + write(path, serialized) + logging.debug('Serialized file `%s` has been written', path) + except IOError as write_error: + raise Exception(f'Failed to write file `{path}`\n\t{write_error}') + + +# region load_and_deserialize +def load_and_deserialize(path: str) -> list: + with open(path, 'r') as jsonl_file: + return [json_deserialize(json_line) for json_line in jsonl_file.readlines()] +# endregion + + +# region serialize +def serialize(data_objects: list, **kwargs) -> list: + if not data_objects: + return [] + keys = kwargs.get('keys', None) + is_array = kwargs.get('is_array', None) + if keys is None: + keys = get_keys(data_objects) + if is_array is None: + is_array = isinstance(data_objects[0], list) + serialized = open_lines(data_objects, is_array) + for key in keys: + serialized = serialize_key(serialized, data_objects, key, is_array) + return close_lines(serialized, is_array) + + +# region get_keys +def get_keys(data_objects: list) -> list: + keys = [] + for data_object in data_objects: + data_object_keys = [] + if isinstance(data_object, list): + data_object_keys = range(len(data_object)) + elif hasattr(data_object, 'keys'): + data_object_keys = data_object.keys() + # keep order of keys + for key in data_object_keys: + if key not in keys: + keys.append(key) + return keys +# endregion + + +# region open_lines +def open_lines(data_objects: list, is_array: bool) -> list: + return ['[' if is_array else '{' for _ in data_objects] +# endregion + + +# region serialize_key +def serialize_key(lines: list, data_objects: list, key: str, is_array: bool) -> list: + values = [get_value(data_object, key) for data_object in data_objects] + serialized = [] + max_length = 0 + cache = [] + for index, value in enumerate(values): + if value is KEY_NOT_FOUND: + key_value = '' else: - return self.json_serialize({key: '?'})[1:-1].replace('"?"', serialized) + if isinstance(value, OrderedDict) or isinstance(value, list): + key_value = serialize_key_object(key, value, values, index, is_array, cache) + else: + key_value = serialize_key_value(key, value, is_array) + key_value += ', ' + max_length = max(max_length, len(key_value)) + serialized.append(key_value) + return [ + lines[index] + key_value + ''.rjust(max_length - len(key_value), ' ') + for index, key_value in enumerate(serialized) + ] + + +# region get_value +def get_value(data_object, key: str): + try: + return data_object[key] + except (KeyError, IndexError, TypeError, AttributeError): + return KEY_NOT_FOUND +# endregion + + +# region serialize_key_object +def serialize_key_object(key: str, data_object, data_objects: list, index: int, is_array: bool, cache: list) -> str: + if not cache: + cache.extend(serialize( + data_objects, + keys=get_keys(data_objects), + is_array=isinstance(data_object, list), + )) + serialized = cache[index] + if is_array: + return serialized + else: + return json_serialize({key: '?'})[1:-1].replace('"?"', serialized) +# endregion - def _serialize_key_value(self, key: str, value, is_array: bool) -> str: - if is_array: - return self.json_serialize(value) - else: - return self.json_serialize({key: value})[1:-1] - - @staticmethod - def _close_lines(lines: list, is_array: bool) -> list: - return [line.rstrip(' ').rstrip(',') + (']' if is_array else '}') for line in lines] - - @staticmethod - def _get_keys(data_objects: list) -> list: - keys = [] - for data_object in data_objects: - data_object_keys = [] - if isinstance(data_object, list): - data_object_keys = range(len(data_object)) - elif hasattr(data_object, 'keys'): - data_object_keys = data_object.keys() - # keep order of keys - for key in data_object_keys: - if key not in keys: - keys.append(key) - return keys - - def _get_value(self, data_object, key: str): - try: - return data_object[key] - except (KeyError, IndexError, TypeError, AttributeError): - return self.key_not_found + +# region serialize_key_value +def serialize_key_value(key: str, value, is_array: bool) -> str: + if is_array: + return json_serialize(value) + else: + return json_serialize({key: value})[1:-1] +# endregion +# endregion + + +# region close_lines +def close_lines(lines: list, is_array: bool) -> list: + return [line.rstrip(' ').rstrip(',') + (']' if is_array else '}') for line in lines] +# endregion +# endregion + + +# region write +def write(path: str, serialized: list) -> None: + with open(path, 'w') as jsonl_file: + jsonl_file.write('\n'.join(serialized) + ('\n' if serialized else '')) +# endregion +# endregion +# endregion def format_jsonl_files(jsonl_files: list) -> None: is_ok = True - jsonl_formatter = JSONLFormatter() for jsonl_file in jsonl_files: try: - jsonl_formatter.format(jsonl_file) + format(jsonl_file) logging.info('File `%s` has been formatted', jsonl_file) except Exception as exception: is_ok = False diff --git a/tests/test_jsonl_formatter.py b/tests/test_jsonl_formatter.py index 20a0244..d0a7c5f 100644 --- a/tests/test_jsonl_formatter.py +++ b/tests/test_jsonl_formatter.py @@ -1,12 +1,11 @@ import os import unittest -from jsonl_formatter import JSONLFormatter +from jsonl_formatter import load_and_deserialize, serialize, write, format from collections import OrderedDict class TestJSONLFormatter(unittest.TestCase): def setUp(self): - self.jsonl_formatter = JSONLFormatter() self.jsonl_input_file = os.path.join(os.path.dirname(__file__), 'data.jsonl') self.jsonl_output_file = self.jsonl_input_file + '.tmp' self.expected_loaded_jsonl = [ @@ -22,17 +21,17 @@ def setUp(self): def test_load_and_deserialize(self): self.assertEqual( - self.jsonl_formatter._load_and_deserialize(self.jsonl_input_file), + load_and_deserialize(self.jsonl_input_file), self.expected_loaded_jsonl ) def test_serialize(self): self.assertEqual( - self.jsonl_formatter._serialize(self.expected_loaded_jsonl), + serialize(self.expected_loaded_jsonl), self.expected_serialized_jsonl ) def test_write(self): - self.jsonl_formatter._write(self.jsonl_output_file, self.expected_serialized_jsonl) + write(self.jsonl_output_file, self.expected_serialized_jsonl) with open(self.jsonl_output_file, 'r') as file: self.assertEqual(file.read(), '\n'.join(self.expected_serialized_jsonl) + '\n') From 7529d9a3e0b65e4c3bc2afae6260e0830b516486 Mon Sep 17 00:00:00 2001 From: Petr Knap <8299754+petrknap@users.noreply.github.com> Date: Wed, 18 Oct 2023 10:53:35 +0200 Subject: [PATCH 2/3] chore: Added test of `make docker` --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7c6bfba..18b239e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,7 @@ name: Test on: pull_request: paths: + - 'Dockerfile' - '**.py' jobs: @@ -15,3 +16,4 @@ jobs: python-version: '3.10' - run: make init - run: make test + - run: make docker From 8c8fada65a8ada904636e4d8119a97d318aaec79 Mon Sep 17 00:00:00 2001 From: Petr Knap <8299754+petrknap@users.noreply.github.com> Date: Wed, 18 Oct 2023 11:01:13 +0200 Subject: [PATCH 3/3] chore: Implemented flake8 hints --- jsonl_formatter.py | 4 ++-- tests/test_jsonl_formatter.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/jsonl_formatter.py b/jsonl_formatter.py index 1193100..d2b8919 100755 --- a/jsonl_formatter.py +++ b/jsonl_formatter.py @@ -177,7 +177,7 @@ def format_jsonl_files(jsonl_files: list) -> None: sys.exit(os.EX_IOERR) -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser(epilog='For more information visit https://github.com/seznam/jsonl-formatter') parser.add_argument('jsonl_files', metavar='jsonl_file', type=str, nargs='+', help='JSON Lines files') parser.add_argument('-v', '--verbose', action='count', default=0, help='Make it more talkative') @@ -185,7 +185,7 @@ def format_jsonl_files(jsonl_files: list) -> None: logging.basicConfig( level=(logging.WARNING if args.verbose == 0 else logging.INFO if args.verbose == 1 else logging.DEBUG), - format="%(message)s" + format='%(message)s' ) format_jsonl_files(args.jsonl_files) diff --git a/tests/test_jsonl_formatter.py b/tests/test_jsonl_formatter.py index d0a7c5f..11e910c 100644 --- a/tests/test_jsonl_formatter.py +++ b/tests/test_jsonl_formatter.py @@ -1,6 +1,6 @@ import os import unittest -from jsonl_formatter import load_and_deserialize, serialize, write, format +from jsonl_formatter import load_and_deserialize, serialize, write from collections import OrderedDict @@ -9,14 +9,14 @@ def setUp(self): self.jsonl_input_file = os.path.join(os.path.dirname(__file__), 'data.jsonl') self.jsonl_output_file = self.jsonl_input_file + '.tmp' self.expected_loaded_jsonl = [ - OrderedDict([("boolean", True), ("integer", 1), ("float", 1), ("string", "1"), ("object", OrderedDict([("b", "b1"), ("a", "a1"), ("c", "c1")])), ("array", [])]), - OrderedDict([("boolean", False), ("integer", 11), ("float", 1.1), ("string", "11"), ("object", OrderedDict([("b", "b11"), ("a", "a11")])), ("optional", 11), ("array", [OrderedDict([("b", "b11.1"), ("a", "a11.1"), ("c", "c11.1")])])]), - OrderedDict([("boolean", None), ("integer", 111), ("float", 1.11), ("string", "111"), ("object", OrderedDict([("b", "b111"), ("a", "a111")])), ("optional", 111), ("array", [OrderedDict([("b", "b111.1"), ("a", "a111.1")]), OrderedDict([("b", "b111.2"), ("a", "a111.2"), ("c", "c111.2")])])]), + OrderedDict([('boolean', True), ('integer', 1), ('float', 1), ('string', '1'), ('object', OrderedDict([('b', 'b1'), ('a', 'a1'), ('c', 'c1')])), ('array', [])]), # noqa: E501 + OrderedDict([('boolean', False), ('integer', 11), ('float', 1.1), ('string', '11'), ('object', OrderedDict([('b', 'b11'), ('a', 'a11')])), ('optional', 11), ('array', [OrderedDict([('b', 'b11.1'), ('a', 'a11.1'), ('c', 'c11.1')])])]), # noqa: E501 + OrderedDict([('boolean', None), ('integer', 111), ('float', 1.11), ('string', '111'), ('object', OrderedDict([('b', 'b111'), ('a', 'a111')])), ('optional', 111), ('array', [OrderedDict([('b', 'b111.1'), ('a', 'a111.1')]), OrderedDict([('b', 'b111.2'), ('a', 'a111.2'), ('c', 'c111.2')])])]), # noqa: E501 ] self.expected_serialized_jsonl = [ - '{"boolean": true, "integer": 1, "float": 1, "string": "1", "object": {"b": "b1", "a": "a1", "c": "c1"}, "array": []}', - '{"boolean": false, "integer": 11, "float": 1.1, "string": "11", "object": {"b": "b11", "a": "a11"}, "array": [{"b": "b11.1", "a": "a11.1", "c": "c11.1"}], "optional": 11}', - '{"boolean": null, "integer": 111, "float": 1.11, "string": "111", "object": {"b": "b111", "a": "a111"}, "array": [{"b": "b111.1", "a": "a111.1"}, {"b": "b111.2", "a": "a111.2", "c": "c111.2"}], "optional": 111}', + '{"boolean": true, "integer": 1, "float": 1, "string": "1", "object": {"b": "b1", "a": "a1", "c": "c1"}, "array": []}', # noqa: E501 + '{"boolean": false, "integer": 11, "float": 1.1, "string": "11", "object": {"b": "b11", "a": "a11"}, "array": [{"b": "b11.1", "a": "a11.1", "c": "c11.1"}], "optional": 11}', # noqa: E501 + '{"boolean": null, "integer": 111, "float": 1.11, "string": "111", "object": {"b": "b111", "a": "a111"}, "array": [{"b": "b111.1", "a": "a111.1"}, {"b": "b111.2", "a": "a111.2", "c": "c111.2"}], "optional": 111}', # noqa: E501 ] def test_load_and_deserialize(self):