Skip to content

Commit

Permalink
naive reverse grouper
Browse files Browse the repository at this point in the history
  • Loading branch information
BeachWang committed Dec 19, 2024
1 parent 09b1599 commit 203bc64
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 3 deletions.
2 changes: 0 additions & 2 deletions data_juicer/ops/aggregator/meta_tags_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,8 @@ def __init__(self,
self.try_num = try_num

def parse_output(self, response):
    """Extract a tag mapping from a raw response string.

    ``self.output_pattern`` is compiled with ``re.VERBOSE | re.DOTALL`` and
    applied to ``response`` with ``findall``. Each match must be a pair
    (i.e. the pattern is expected to define exactly two capture groups);
    the first element becomes the key and the second the value.

    :param response: text to search for tag pairs
    :return: dict mapping the first captured item of every match to the
        second one; empty when nothing matches. For a repeated key the
        last match wins.
    """
    # NOTE: the former debug ``print`` calls were dropped on purpose so that
    # parsing no longer dumps raw responses to stdout.
    pattern = re.compile(self.output_pattern, re.VERBOSE | re.DOTALL)
    return dict(pattern.findall(response))

Expand Down
3 changes: 2 additions & 1 deletion data_juicer/ops/grouper/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .key_value_grouper import KeyValueGrouper
from .naive_grouper import NaiveGrouper
from .naive_reverse_grouper import NaiveReverseGrouper

# Public API of the grouper package (kept in alphabetical order).
__all__ = ['KeyValueGrouper', 'NaiveGrouper', 'NaiveReverseGrouper']
26 changes: 26 additions & 0 deletions data_juicer/ops/grouper/naive_reverse_grouper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from ..base_op import OPERATORS, Grouper, convert_dict_list_to_list_dict


@OPERATORS.register_module('naive_reverse_grouper')
class NaiveReverseGrouper(Grouper):
    """Split batched samples back into individual samples.

    Registered under the operator name ``naive_reverse_grouper``.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialization method.

        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)

    def process(self, dataset):
        """Flatten every batched sample of ``dataset`` into single samples.

        :param dataset: iterable of batched samples that supports ``len()``
        :return: ``dataset`` itself when it is empty, otherwise a flat list
            holding the samples of all batches in their original order
        """
        if len(dataset) == 0:
            return dataset

        samples = []
        for sample in dataset:
            # convert_dict_list_to_list_dict is expected (per its name) to
            # turn one dict-of-lists batch into a list of per-sample dicts.
            # ``extend`` keeps the result flat; ``append`` would nest one
            # list per batch instead of emitting individual samples.
            samples.extend(convert_dict_list_to_list_dict(sample))

        return samples
83 changes: 83 additions & 0 deletions tests/ops/grouper/test_naive_reverse_grouper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import unittest

from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.grouper.naive_reverse_grouper import NaiveReverseGrouper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase


class NaiveReverseGrouperTest(DataJuicerTestCaseBase):
    """Unit tests for ``NaiveReverseGrouper``."""

    def _run_helper(self, op, samples, target):
        """Run ``op`` on ``samples`` and compare ``text`` fields to ``target``.

        :param op: operator under test
        :param samples: list of batched input samples
        :param target: list of expected un-batched samples
        """
        dataset = Dataset.from_list(samples)
        new_dataset = op.run(dataset)

        # zip() stops at the shorter input, so without this check a result
        # with missing (or extra) samples would pass silently.
        self.assertEqual(len(new_dataset), len(target))
        for d, t in zip(new_dataset, target):
            self.assertEqual(d['text'], t['text'])

    def test_one_batched_sample(self):
        """A single batch of three texts is split into three samples."""
        source = [
            {
                'text': [
                    "Today is Sunday and it's a happy day!",
                    "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
                    'ces fonctionnalités sont conçues simultanément.',
                    '欢迎来到阿里巴巴!'
                ]
            }
        ]

        target = [
            {
                'text': "Today is Sunday and it's a happy day!"
            },
            {
                'text':
                "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
                'ces fonctionnalités sont conçues simultanément.'
            },
            {
                'text': '欢迎来到阿里巴巴!'
            },
        ]

        op = NaiveReverseGrouper()
        self._run_helper(op, source, target)

    def test_two_batch_sample(self):
        """Two batches (two texts + one text) flatten to three samples."""
        source = [
            {
                'text': [
                    "Today is Sunday and it's a happy day!",
                    "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
                    'ces fonctionnalités sont conçues simultanément.'
                ]
            },
            {
                'text': [
                    '欢迎来到阿里巴巴!'
                ]
            }
        ]

        target = [
            {
                'text': "Today is Sunday and it's a happy day!"
            },
            {
                'text':
                "Sur la plateforme MT4, plusieurs manières d'accéder à \n"
                'ces fonctionnalités sont conçues simultanément.'
            },
            {
                'text': '欢迎来到阿里巴巴!'
            },
        ]

        op = NaiveReverseGrouper()
        self._run_helper(op, source, target)

# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 203bc64

Please sign in to comment.