Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewritten to functional version #4

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ name: Test
on:
pull_request:
paths:
- 'Dockerfile'
- '**.py'

jobs:
Expand All @@ -15,3 +16,4 @@ jobs:
python-version: '3.10'
- run: make init
- run: make test
- run: make docker
264 changes: 149 additions & 115 deletions jsonl_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,127 +7,161 @@
import sys


class JSONLFormatter:
def __init__(self):
self.json_serialize = json.dumps
self.json_deserialize = lambda json_line: json.loads(json_line, object_pairs_hook=OrderedDict)
self.key_not_found = KeyError()

def format(self, path: str) -> None:
try:
deserialized = self._load_and_deserialize(path)
logging.debug('File `%s` has been loaded and deserialized', path)
except IOError as load_error:
raise Exception(f'Failed to load file `{path}`\n\t{load_error}')
except Exception as deserialize_exception:
raise Exception(f'Failed to deserialize file `{path}`\n\t{deserialize_exception}')
serialized = self._serialize(deserialized)
logging.debug('Deserialized file `%s` has been serialized', path)
try:
self._write(path, serialized)
logging.debug('Serialized file `%s` has been written', path)
except IOError as write_error:
raise Exception(f'Failed to write file `{path}`\n\t{write_error}')

def _load_and_deserialize(self, path: str) -> list:
with open(path, 'r') as jsonl_file:
return [self.json_deserialize(json_line) for json_line in jsonl_file.readlines()]

@staticmethod
def _write(path: str, serialized: list) -> None:
with open(path, 'w') as jsonl_file:
jsonl_file.write('\n'.join(serialized) + ('\n' if serialized else ''))

def _serialize(self, data_objects: list, **kwargs) -> list:
if not data_objects:
return []
keys = kwargs.get('keys', None)
is_array = kwargs.get('is_array', None)
if keys is None:
keys = self._get_keys(data_objects)
if is_array is None:
is_array = isinstance(data_objects[0], list)
serialized = self._open_lines(data_objects, is_array)
for key in keys:
serialized = self._serialize_key(serialized, data_objects, key, is_array)
return self._close_lines(serialized, is_array)

@staticmethod
def _open_lines(data_objects: list, is_array: bool) -> list:
return ['[' if is_array else '{' for _ in data_objects]

def _serialize_key(self, lines: list, data_objects: list, key: str, is_array: bool) -> list:
values = [self._get_value(data_object, key) for data_object in data_objects]
serialized = []
max_length = 0
cache = []
for index, value in enumerate(values):
if value is self.key_not_found:
key_value = ''
else:
if isinstance(value, OrderedDict) or isinstance(value, list):
key_value = self._serialize_key_object(key, value, values, index, is_array, cache)
else:
key_value = self._serialize_key_value(key, value, is_array)
key_value += ', '
max_length = max(max_length, len(key_value))
serialized.append(key_value)
return [
lines[index] + key_value + ''.rjust(max_length - len(key_value), ' ')
for index, key_value in enumerate(serialized)
]

def _serialize_key_object(self, key: str, data_object, data_objects: list, index: int, is_array: bool, cache: list) -> str:
if not cache:
cache.extend(self._serialize(
data_objects,
keys=self._get_keys(data_objects),
is_array=isinstance(data_object, list),
))
serialized = cache[index]
if is_array:
return serialized
# region JSONL Formatter
# Sentinel object returned by get_value when a key/index cannot be read from a
# data object. In the code shown here it is only compared by identity (`is`).
KEY_NOT_FOUND = KeyError()


def json_serialize(object) -> str:
    """Encode ``object`` as JSON text using ``json.dumps`` defaults.

    Args:
        object: Any value accepted by ``json.dumps``.

    Returns:
        The JSON text for ``object``.
    """
    encoded = json.dumps(object)
    return encoded


def json_deserialize(object: str) -> OrderedDict:
    """Decode one JSON document, keeping object members in source order.

    Args:
        object: JSON text to decode.

    Returns:
        The decoded value; every JSON object inside it is built as an
        ``OrderedDict`` (non-object documents decode to their usual Python
        types).

    Raises:
        json.JSONDecodeError: If ``object`` is not valid JSON.
    """
    decoded = json.loads(
        object,
        object_pairs_hook=OrderedDict,
    )
    return decoded


# region format
def format(path: str) -> None:
    """Reformat the JSON Lines file at ``path`` in place.

    The file is loaded and deserialized, re-serialized, and the result is
    written back to the same path.

    Args:
        path: Path of the JSON Lines file to rewrite.

    Raises:
        Exception: If the file cannot be loaded, deserialized or written.
            The original error is attached as ``__cause__``.
    """
    try:
        deserialized = load_and_deserialize(path)
    except IOError as load_error:
        raise Exception(f'Failed to load file `{path}`\n\t{load_error}') from load_error
    except Exception as deserialize_exception:
        raise Exception(
            f'Failed to deserialize file `{path}`\n\t{deserialize_exception}'
        ) from deserialize_exception
    else:
        logging.debug('File `%s` has been loaded and deserialized', path)

    serialized = serialize(deserialized)
    logging.debug('Deserialized file `%s` has been serialized', path)

    try:
        write(path, serialized)
    except IOError as write_error:
        raise Exception(f'Failed to write file `{path}`\n\t{write_error}') from write_error
    else:
        logging.debug('Serialized file `%s` has been written', path)


# region load_and_deserialize
def load_and_deserialize(path: str) -> list:
    """Read a JSON Lines file and decode every non-blank line.

    The file is read as UTF-8 rather than with the platform's default
    encoding. Blank or whitespace-only lines (for example a trailing empty
    line) are skipped instead of failing the whole file.

    Args:
        path: Path of the JSON Lines file to read.

    Returns:
        One decoded value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a line is not valid JSON or the file is not valid
            UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as jsonl_file:
        # Iterate the file lazily instead of materializing readlines() first.
        return [
            json_deserialize(json_line)
            for json_line in jsonl_file
            if json_line.strip()
        ]
# endregion


# region serialize
def serialize(data_objects: list, **kwargs) -> list:
    """Render data objects as column-aligned JSON lines.

    Args:
        data_objects: Decoded values, one per output line.
        **kwargs: Optional overrides:
            ``keys`` - the keys/indices to emit, in order (defaults to
            ``get_keys(data_objects)``);
            ``is_array`` - whether lines are JSON arrays (defaults to
            whether the first data object is a ``list``).

    Returns:
        One string per data object; an empty list for empty input.
    """
    if not data_objects:
        return []

    keys = kwargs.get('keys')
    if keys is None:
        keys = get_keys(data_objects)

    is_array = kwargs.get('is_array')
    if is_array is None:
        # The first object decides the bracket style for every line.
        is_array = isinstance(data_objects[0], list)

    lines = open_lines(data_objects, is_array)
    for column_key in keys:
        lines = serialize_key(lines, data_objects, column_key, is_array)
    return close_lines(lines, is_array)


# region get_keys
def get_keys(data_objects: list) -> list:
    """Collect the distinct keys of all data objects in first-seen order.

    Lists contribute their indices, objects exposing ``keys()`` contribute
    their keys, and any other value contributes nothing.

    Args:
        data_objects: Values to inspect.

    Returns:
        The union of keys/indices, each listed once, ordered by first
        appearance.
    """
    # A dict keeps insertion order and gives O(1) membership checks, avoiding
    # the quadratic `key not in list` scan.
    seen = {}
    for data_object in data_objects:
        if isinstance(data_object, list):
            data_object_keys = range(len(data_object))
        elif hasattr(data_object, 'keys'):
            data_object_keys = data_object.keys()
        else:
            continue
        for key in data_object_keys:
            seen.setdefault(key, None)
    return list(seen)
# endregion


# region open_lines
def open_lines(data_objects: list, is_array: bool) -> list:
    """Start one output line per data object with its opening bracket.

    Args:
        data_objects: Values being serialized; only their count matters.
        is_array: Use ``[`` when true, ``{`` otherwise.

    Returns:
        A list holding the opening bracket once per data object.
    """
    opener = '[' if is_array else '{'
    return [opener for _ in data_objects]
# endregion


# region serialize_key
def serialize_key(lines: list, data_objects: list, key: str, is_array: bool) -> list:
    """Append one aligned column (the value under ``key``) to every line.

    Each data object's value for ``key`` is rendered followed by ``', '``.
    Objects lacking the key contribute an empty cell. All cells are then
    right-padded with spaces to the widest cell so the next column starts at
    the same offset on every line.

    Nested ``OrderedDict`` and ``list`` values are themselves aligned across
    lines. Objects and arrays are aligned in separate groups so an array is
    never rendered with object syntax (or vice versa) when a column mixes
    both kinds.

    Args:
        lines: Partially built output lines, one per data object.
        data_objects: The values being serialized.
        key: Key (or list index) of the column to append.
        is_array: Whether the lines being built are JSON arrays.

    Returns:
        New list of lines with the column appended.
    """
    values = [get_value(data_object, key) for data_object in data_objects]
    # Per container kind: (values restricted to that kind, lazily filled cache
    # of their aligned serializations).
    nested_groups = {}
    cells = []
    for index, value in enumerate(values):
        if value is KEY_NOT_FOUND:
            cells.append('')
            continue
        if isinstance(value, (OrderedDict, list)):
            kind = list if isinstance(value, list) else OrderedDict
            if kind not in nested_groups:
                same_kind_values = [
                    candidate if isinstance(candidate, kind) else KEY_NOT_FOUND
                    for candidate in values
                ]
                nested_groups[kind] = (same_kind_values, [])
            same_kind_values, cache = nested_groups[kind]
            cell = serialize_key_object(key, value, same_kind_values, index, is_array, cache)
        else:
            cell = serialize_key_value(key, value, is_array)
        cells.append(cell + ', ')
    width = max(map(len, cells), default=0)
    return [line + cell.ljust(width) for line, cell in zip(lines, cells)]


# region get_value
def get_value(data_object, key: str):
    """Look up ``data_object[key]`` without raising on a miss.

    Args:
        data_object: Container (or arbitrary value) to index.
        key: Key or index to read.

    Returns:
        The stored value, or the ``KEY_NOT_FOUND`` sentinel when subscripting
        fails with ``KeyError``, ``IndexError``, ``TypeError`` or
        ``AttributeError``.
    """
    try:
        found = data_object[key]
    except (KeyError, IndexError, TypeError, AttributeError):
        found = KEY_NOT_FOUND
    return found
# endregion


# region serialize_key_object
def serialize_key_object(key: str, data_object, data_objects: list, index: int, is_array: bool, cache: list) -> str:
    """Render a nested object/array cell, aligned with its sibling cells.

    On first use the whole column (``data_objects``) is serialized together
    and stored in ``cache``; later calls for the same column reuse it.

    Args:
        key: Key of the column in the parent line.
        data_object: The nested value for this line; its type decides
            whether the column is serialized as arrays or objects.
        data_objects: The values of this column for every line.
        index: Position of this line within ``data_objects``.
        is_array: Whether the parent line is a JSON array (then the cell has
            no ``"key": `` prefix).
        cache: Mutable list shared across calls for one column; filled in
            place when empty.

    Returns:
        The serialized cell, prefixed with the JSON-encoded key unless the
        parent is an array.
    """
    if not cache:
        cache.extend(serialize(
            data_objects,
            keys=get_keys(data_objects),
            is_array=isinstance(data_object, list),
        ))
    serialized = cache[index]
    if is_array:
        return serialized
    # Build `"key": ` by encoding {key: null} and dropping the leading `{` and
    # the trailing `null}`. A textual replace of a placeholder value would
    # also clobber the key itself when the key happens to be "?".
    key_prefix = json_serialize({key: None})[1:-len('null}')]
    return key_prefix + serialized
# endregion

def _serialize_key_value(self, key: str, value, is_array: bool) -> str:
if is_array:
return self.json_serialize(value)
else:
return self.json_serialize({key: value})[1:-1]

@staticmethod
def _close_lines(lines: list, is_array: bool) -> list:
return [line.rstrip(' ').rstrip(',') + (']' if is_array else '}') for line in lines]

@staticmethod
def _get_keys(data_objects: list) -> list:
keys = []
for data_object in data_objects:
data_object_keys = []
if isinstance(data_object, list):
data_object_keys = range(len(data_object))
elif hasattr(data_object, 'keys'):
data_object_keys = data_object.keys()
# keep order of keys
for key in data_object_keys:
if key not in keys:
keys.append(key)
return keys

def _get_value(self, data_object, key: str):
try:
return data_object[key]
except (KeyError, IndexError, TypeError, AttributeError):
return self.key_not_found

# region serialize_key_value
def serialize_key_value(key: str, value, is_array: bool) -> str:
    """Render a single scalar cell.

    Args:
        key: Key of the value in the parent object (unused for arrays).
        value: The value to encode.
        is_array: Whether the parent line is a JSON array.

    Returns:
        The bare JSON value for arrays, otherwise the ``"key": value`` pair
        without surrounding braces.
    """
    if not is_array:
        # Encode a one-member object and peel off the enclosing `{` and `}`.
        pair = json_serialize({key: value})
        return pair[1:-1]
    return json_serialize(value)
# endregion
# endregion


# region close_lines
def close_lines(lines: list, is_array: bool) -> list:
    """Finish every line: drop trailing padding/commas, add the closer.

    Args:
        lines: Lines built so far, each possibly ending in ``', '`` plus
            alignment spaces.
        is_array: Close with ``]`` when true, ``}`` otherwise.

    Returns:
        The completed lines.
    """
    closer = ']' if is_array else '}'
    closed = []
    for line in lines:
        trimmed = line.rstrip(' ')
        trimmed = trimmed.rstrip(',')
        closed.append(trimmed + closer)
    return closed
# endregion
# endregion


# region write
def write(path: str, serialized: list) -> None:
    """Overwrite ``path`` with the given lines, one per row.

    Every line is terminated by a newline; an empty list produces an empty
    file.

    Args:
        path: Destination file path.
        serialized: Lines to write, without line terminators.
    """
    with open(path, 'w') as jsonl_file:
        content = '\n'.join(serialized)
        if serialized:
            content += '\n'
        jsonl_file.write(content)
# endregion
# endregion
# endregion


def format_jsonl_files(jsonl_files: list) -> None:
is_ok = True
jsonl_formatter = JSONLFormatter()
for jsonl_file in jsonl_files:
try:
jsonl_formatter.format(jsonl_file)
format(jsonl_file)
logging.info('File `%s` has been formatted', jsonl_file)
except Exception as exception:
is_ok = False
Expand All @@ -143,15 +177,15 @@ def format_jsonl_files(jsonl_files: list) -> None:
sys.exit(os.EX_IOERR)


if __name__ == '__main__':
    # Command-line entry point: parse arguments, configure logging, format files.
    parser = argparse.ArgumentParser(epilog='For more information visit https://github.com/seznam/jsonl-formatter')
    parser.add_argument('jsonl_files', metavar='jsonl_file', type=str, nargs='+', help='JSON Lines files')
    parser.add_argument('-v', '--verbose', action='count', default=0, help='Make it more talkative')
    args = parser.parse_args()

    # No -v: warnings only; -v: info; -vv (or more): debug.
    logging.basicConfig(
        level=(logging.WARNING if args.verbose == 0 else logging.INFO if args.verbose == 1 else logging.DEBUG),
        format='%(message)s'
    )

    format_jsonl_files(args.jsonl_files)
21 changes: 10 additions & 11 deletions tests/test_jsonl_formatter.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,37 @@
import os
import unittest
from jsonl_formatter import JSONLFormatter
from jsonl_formatter import load_and_deserialize, serialize, write
from collections import OrderedDict


class TestJSONLFormatter(unittest.TestCase):
def setUp(self):
self.jsonl_formatter = JSONLFormatter()
self.jsonl_input_file = os.path.join(os.path.dirname(__file__), 'data.jsonl')
self.jsonl_output_file = self.jsonl_input_file + '.tmp'
self.expected_loaded_jsonl = [
OrderedDict([("boolean", True), ("integer", 1), ("float", 1), ("string", "1"), ("object", OrderedDict([("b", "b1"), ("a", "a1"), ("c", "c1")])), ("array", [])]),
OrderedDict([("boolean", False), ("integer", 11), ("float", 1.1), ("string", "11"), ("object", OrderedDict([("b", "b11"), ("a", "a11")])), ("optional", 11), ("array", [OrderedDict([("b", "b11.1"), ("a", "a11.1"), ("c", "c11.1")])])]),
OrderedDict([("boolean", None), ("integer", 111), ("float", 1.11), ("string", "111"), ("object", OrderedDict([("b", "b111"), ("a", "a111")])), ("optional", 111), ("array", [OrderedDict([("b", "b111.1"), ("a", "a111.1")]), OrderedDict([("b", "b111.2"), ("a", "a111.2"), ("c", "c111.2")])])]),
OrderedDict([('boolean', True), ('integer', 1), ('float', 1), ('string', '1'), ('object', OrderedDict([('b', 'b1'), ('a', 'a1'), ('c', 'c1')])), ('array', [])]), # noqa: E501
OrderedDict([('boolean', False), ('integer', 11), ('float', 1.1), ('string', '11'), ('object', OrderedDict([('b', 'b11'), ('a', 'a11')])), ('optional', 11), ('array', [OrderedDict([('b', 'b11.1'), ('a', 'a11.1'), ('c', 'c11.1')])])]), # noqa: E501
OrderedDict([('boolean', None), ('integer', 111), ('float', 1.11), ('string', '111'), ('object', OrderedDict([('b', 'b111'), ('a', 'a111')])), ('optional', 111), ('array', [OrderedDict([('b', 'b111.1'), ('a', 'a111.1')]), OrderedDict([('b', 'b111.2'), ('a', 'a111.2'), ('c', 'c111.2')])])]), # noqa: E501
]
self.expected_serialized_jsonl = [
'{"boolean": true, "integer": 1, "float": 1, "string": "1", "object": {"b": "b1", "a": "a1", "c": "c1"}, "array": []}',
'{"boolean": false, "integer": 11, "float": 1.1, "string": "11", "object": {"b": "b11", "a": "a11"}, "array": [{"b": "b11.1", "a": "a11.1", "c": "c11.1"}], "optional": 11}',
'{"boolean": null, "integer": 111, "float": 1.11, "string": "111", "object": {"b": "b111", "a": "a111"}, "array": [{"b": "b111.1", "a": "a111.1"}, {"b": "b111.2", "a": "a111.2", "c": "c111.2"}], "optional": 111}',
'{"boolean": true, "integer": 1, "float": 1, "string": "1", "object": {"b": "b1", "a": "a1", "c": "c1"}, "array": []}', # noqa: E501
'{"boolean": false, "integer": 11, "float": 1.1, "string": "11", "object": {"b": "b11", "a": "a11"}, "array": [{"b": "b11.1", "a": "a11.1", "c": "c11.1"}], "optional": 11}', # noqa: E501
'{"boolean": null, "integer": 111, "float": 1.11, "string": "111", "object": {"b": "b111", "a": "a111"}, "array": [{"b": "b111.1", "a": "a111.1"}, {"b": "b111.2", "a": "a111.2", "c": "c111.2"}], "optional": 111}', # noqa: E501
]

def test_load_and_deserialize(self):
self.assertEqual(
self.jsonl_formatter._load_and_deserialize(self.jsonl_input_file),
load_and_deserialize(self.jsonl_input_file),
self.expected_loaded_jsonl
)

def test_serialize(self):
self.assertEqual(
self.jsonl_formatter._serialize(self.expected_loaded_jsonl),
serialize(self.expected_loaded_jsonl),
self.expected_serialized_jsonl
)

def test_write(self):
self.jsonl_formatter._write(self.jsonl_output_file, self.expected_serialized_jsonl)
write(self.jsonl_output_file, self.expected_serialized_jsonl)
with open(self.jsonl_output_file, 'r') as file:
self.assertEqual(file.read(), '\n'.join(self.expected_serialized_jsonl) + '\n')