diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py index 4b8aadd5e..2d81c28de 100644 --- a/data_juicer/ops/filter/__init__.py +++ b/data_juicer/ops/filter/__init__.py @@ -1,9 +1,8 @@ from . import (alphanumeric_filter, average_line_length_filter, character_repetition_filter, flagged_words_filter, image_aspect_ratio_filter, image_size_filter, - language_id_score_filter, - maximum_line_length_filter, perplexity_filter, - special_characters_filter, specified_field_filter, - specified_numeric_field_filter, stopwords_filter, suffix_filter, - text_length_filter, token_num_filter, word_num_filter, - word_repetition_filter) + language_id_score_filter, maximum_line_length_filter, + perplexity_filter, special_characters_filter, + specified_field_filter, specified_numeric_field_filter, + stopwords_filter, suffix_filter, text_length_filter, + token_num_filter, word_num_filter, word_repetition_filter) diff --git a/data_juicer/ops/filter/image_size_filter.py b/data_juicer/ops/filter/image_size_filter.py index e857b510f..e5fd9455e 100644 --- a/data_juicer/ops/filter/image_size_filter.py +++ b/data_juicer/ops/filter/image_size_filter.py @@ -1,11 +1,10 @@ - import numpy as np from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.mm_utils import get_image_size, size_to_bytes from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -from data_juicer.utils.mm_utils import get_image_size, size_to_bytes @OPERATORS.register_module('image_size_filter') @@ -16,8 +15,8 @@ class ImageSizeFilter(Filter): """ def __init__(self, - min_size: str = "0", - max_size: str = "1Tb", + min_size: str = '0', + max_size: str = '1Tb', any_or_all: str = 'any', *args, **kwargs): @@ -56,8 +55,7 @@ def compute_stats(self, sample, context=False): # for size calculation, no need to load images into memory sample[Fields.stats][StatsKeys.image_sizes] = [ - get_image_size(img_path) - for img_path in sample[self.image_key] + get_image_size(img_path) for img_path in sample[self.image_key] ] return sample @@ -65,10 +63,9 @@ def compute_stats(self, sample, context=False): def process(self, sample): image_sizes = sample[Fields.stats][StatsKeys.image_sizes] keep_bools = np.array([ - size_to_bytes(self.min_size) - <= image_size <= - size_to_bytes(self.max_size) - for image_size in image_sizes]) + size_to_bytes(self.min_size) <= image_size <= size_to_bytes( + self.max_size) for image_size in image_sizes + ]) if len(keep_bools) <= 0: return True @@ -77,4 +74,3 @@ def process(self, sample): return keep_bools.any() else: return keep_bools.all() - diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index f451730ea..0bcf8590e 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -33,7 +33,7 @@ def size_to_bytes(size): numbers_list = [char for char in size if char.isdigit()] if len(numbers_list) == 0: - raise ValueError(f"Your input `size` does not contain numbers: {size}") + raise ValueError(f'Your input `size` does not contain numbers: {size}') size_numbers = int(float(''.join(numbers_list))) @@ -61,7 +61,6 @@ def size_to_bytes(size): elif suffix == 'yb' or suffix == 'yib': return size_numbers << 80 else: - raise ValueError(f"You specified unidentifiable unit: {suffix}, " - f"expected in [KB, MB, GB, TB, PB, EB, ZB, YB, " - f"KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB]") - + raise ValueError(f'You specified unidentifiable unit: {suffix}, ' + f'expected in [KB, MB, GB, TB, PB, EB, ZB, YB, ' + f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB]')