forked from modelscope/data-juicer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig_all.yaml
519 lines (509 loc) · 77.7 KB
/
config_all.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
# Process config example including:
# - all global arguments
# - all ops and their arguments
# global parameters
project_name: 'all' # project name for distinguish your configs
dataset_path: '/path/to/your/dataset' # path to your dataset directory or file with weights(0.0-1.0), 1.0 as default.
# accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
export_path: '/path/to/result/dataset.jsonl' # path to processed result dataset. Supported suffixes include ['jsonl', 'json', 'parquet']
export_shard_size: 0 # shard size of exported dataset in Byte. In default, it's 0, which means export the whole dataset into only one file. If it's set a positive number, the exported dataset will be split into several dataset shards, and the max size of each shard won't larger than the export_shard_size
export_in_parallel: false # whether to export the result dataset in parallel to a single file, which usually takes less time. It only works when export_shard_size is 0, and its default number of processes is the same as the argument np. **Notice**: If it's True, sometimes exporting in parallel might require much more time due to the IO blocking, especially for very large datasets. When this happens, False is a better choice, although it takes more time.
np: 4 # number of subprocess to process your dataset
text_keys: 'text' # the key name of field where the sample texts to be processed, e.g., `text`, `instruction`, `output`, ...
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
suffixes: [] # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
use_cache: true # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
ds_cache_dir: null # cache dir for Hugging Face datasets. In default, it\'s the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir
use_checkpoint: false # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerun the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
temp_dir: null # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be caution when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
open_tracer: false # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
op_list_to_trace: [] # only ops in this list will be traced by tracer. If it's empty, all ops will be traced. Only available when tracer is opened.
trace_num: 10 # number of samples to show the differences between datasets before and after each op. Only available when tracer is opened.
op_fusion: false # whether to fuse operators that share the same intermediate variables automatically. Op fusion might reduce the memory requirements slightly but speed up the whole process.
cache_compress: null # the compression method of the cache file, which can be specified in ['gzip', 'zstd', 'lz4']. If this parameter is None, the cache file will not be compressed. We recommend you turn on this argument when your input dataset is larger than tens of GB and your disk space is not enough.
keep_stats_in_res_ds: false # whether to keep the computed stats in the result dataset. The intermediate fields to store the stats computed by Filters will be removed if it's False. It's False in default.
keep_hashes_in_res_ds: false # whether to keep the computed hashes in the result dataset. The intermediate fields to store the hashes computed by Deduplicators will be removed if it's False. It's False in default.
# for multimodal data processing
image_key: 'images' # key name of field to store the list of sample image paths.
image_special_token: '<__dj__image>' # the special token that represents an image in the text. In default, it's "<__dj__image>". You can specify your own special token according to your input dataset.
audio_key: 'audios' # key name of field to store the list of sample audio paths.
audio_special_token: '<__dj__audio>' # the special token that represents an audio in the text. In default, it's "<__dj__audio>". You can specify your own special token according to your input dataset.
video_key: 'videos' # key name of field to store the list of sample video paths.
video_special_token: '<__dj__video>' # the special token that represents a video in the text. In default, it's "<__dj__video>". You can specify your own special token according to your input dataset.
eoc_special_token: '<|__dj__eoc|>' # the special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
# for distributed processing
executor_type: default # type of executor, support "default" or "ray" for now.
ray_address: auto # the address of the Ray cluster.
# only for data analysis
percentiles: [0.25, 0.5, 0.75] # percentiles to analyze the dataset distribution
export_original_dataset: false # whether to export the original dataset with stats. If you only need the stats of the dataset, setting it to false could speed up the exporting.
save_stats_in_one_file: false # whether to store all stats result into one file
# for sandbox or hpo
data_probe_algo: 'uniform' # sampling algorithm for dataset. Should be one of ["uniform", "frequency_specified_field_selector", "topk_specified_field_selector"]. It's "uniform" in default. Only used for dataset sampling.
data_probe_ratio: 1.0 # the sampling ratio to the original dataset size. It's 1.0 in default. Only used for dataset sampling.
hpo_config: null # path to a configuration file when using auto-HPO tool.
# process schedule: a list of several process operators with their arguments
process:
# Mapper ops. Most of these ops need no arguments.
- audio_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg audio filters
- chinese_convert_mapper: # convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.
mode: 's2t' # choose the mode to convert Chinese: ['s2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t']
- clean_email_mapper: # remove emails from text.
- clean_html_mapper: # remove html formats form text.
- clean_ip_mapper: # remove ip addresses from text.
- clean_links_mapper: # remove web links from text.
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in Latex text.
- extract_qa_mapper: # mapper to extract question and answer pair from text.
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
- fix_unicode_mapper: # fix unicode errors in text.
- image_blur_mapper: # mapper to blur images.
p: 0.2 # probability of the image being blured
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- image_captioning_from_gpt4v_mapper: # generate samples whose texts are generated based on gpt-4-visison and the image
mode: 'description' # mode of text generated from images, can be one of ['resoning', 'description', 'conversation', 'custom']
api_key: '' # the API key to authenticate the request
max_token: 500 # the maximum number of tokens to generate. Default is 500.
temperature: 1.0 # controls the randomness of the output (range from 0 to 1). Default is 0.
system_prompt: '' # a string prompt used to set the context of a conversation and provide global guidance or rules for the gpt4-vision so that it can generate responses in the expected way. If `mode` set to `custom`, the parameter will be used
user_prompt: '' # a string prompt to guide the generation of gpt4-vision for each samples. It's "" in default, which means no prompt provided
user_prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated text in the final datasets and the original text will be removed. It's True in default
any_or_all: 'any' # keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition
- image_captioning_mapper: # generate captions for images to augment datasets
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # model name on huggingface to generate caption
caption_num: 1 # how many candidate captions to generate for each image
keep_candidate_mode: 'random_any' # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
prompt: null # a string prompt to guide the generation of blip2 model for all samples globally. It's None in default, which means no prompt provided.
prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
mem_required: '16GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_diffusion_mapper: # generate images by diffusion model
hf_diffusion: 'CompVis/stable-diffusion-v1-4' # stable diffusion model name on huggingface to generate image
torch_dtype: 'fp32' # the floating point type used to load the diffusion model. Can be one of ['fp32', 'fp16', 'bf16']
revision: 'main' # The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git.
strength: 0.8 # parameter of stable diffusion model, indicates extent to transform the reference image. will ignore the input image if it equals to 1
guidance_scale: 7.5 # parameter of stable diffusion model, a higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality
aug_num: 1 # the number of images to generate
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated images in the final datasets and the original images will be removed. It's True in default.
caption_key: null # the key name of fields in samples to store captions for each images, the caption guide the diffusion model to produce what the image is
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # model name on huggingface to generate caption if caption_key is null
mem_required: '8GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_face_blur_mapper: # blur faces detected in images
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
delete_random_word: false # whether to open the augmentation method of deleting random words from the original texts. e.g. "I love LLM" --> "I LLM"
swap_random_word: false # whether to open the augmentation method of swapping random contiguous words in the original texts. e.g. "I love LLM" --> "Love I LLM"
spelling_error_word: false # whether to open the augmentation method of simulating the spelling error for words in the original texts. e.g. "I love LLM" --> "Ai love LLM"
split_random_word: false # whether to open the augmentation method of splitting words randomly with whitespaces in the original texts. e.g. "I love LLM" --> "I love LL M"
keyboard_error_char: false # whether to open the augmentation method of simulating the keyboard error for characters in the original texts. e.g. "I love LLM" --> "I ;ov4 LLM"
ocr_error_char: false # whether to open the augmentation method of simulating the OCR error for characters in the original texts. e.g. "I love LLM" --> "I 10ve LLM"
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "I love LLM" --> "I oe LLM"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "I love LLM" --> "I ovle LLM"
insert_random_char: false # whether to open the augmentation method of inserting random characters into the original texts. e.g. "I love LLM" --> "I ^lKove LLM"
- nlpcda_zh_mapper: # simply augment texts in Chinese based on the nlpaug library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated texts in the final datasets and the original texts will be removed. It's True in default.
replace_similar_word: false # whether to open the augmentation method of replacing random words with their similar words in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
replace_homophone_char: false # whether to open the augmentation method of replacing random characters with their homophones in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
- remove_bibliography_mapper: # remove bibliography from Latex text.
- remove_comments_mapper: # remove comments from Latex text, code, etc.
doc_type: tex # comment type you want to remove. Only support 'tex' for now.
inline: true # whether to remove inline comments
multiline: true # whether to remove multiline comments
- remove_header_mapper: # remove header texts from Latex text.
drop_no_head: true # whether to drop sample texts without headers
- remove_long_words_mapper: # remove much too long words from text.
min_len: 1 # the min word length to keep words.
max_len: 128 # the max word length to keep words.
- remove_non_chinese_character_mapper: # remove non-Chinese character in text samples.
keep_alphabet: true # whether to keep alphabet
keep_number: true # whether to keep number
keep_punc: true # whether to keep punctuation
- remove_repeat_sentences_mapper: # remove repeat sentences in text samples.
lowercase: false # whether to convert sample text to lower case
ignore_special_character: true # whether to ignore special characters when judging repeated sentences. Special characters are all characters except Chinese characters, letters and numbers
min_repeat_sentence_length: 2 # sentences shorter than this length will not be deduplicated. If ignore_special_character is set to True, then special characters are not included in this length
- remove_specific_chars_mapper: # remove characters specified by users
chars_to_remove: '◆●■►▼▲▴∆▻▷❖♡□' # a string or a list including those characters that need to be removed
- remove_table_text_mapper: # remove possible table texts from text.
min_col: 2 # the min num of columns in tables to remove
max_col: 20 # the max num of columns in tables to remove
- remove_words_with_incorrect_substrings_mapper: # remove words with incorrect substrings from text.
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
substrings: ['http', 'www', '.com', 'href', '//'] # incorrect substrings to remove
- sentence_split_mapper: # split text to multiple sentences and join them with '\n'
lang: 'en' # split text in what language
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only captioned sample in the final datasets and the original sample will be removed. It's True in default.
mem_required: '30GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_captioning_from_frames_mapper: # generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string.
hf_img2seq: 'Salesforce/blip2-opt-2.7b' # image-to-text model name on huggingface to generate caption
caption_num: 1 # how many candidate captions to generate for each video
keep_candidate_mode: 'random_any' # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
prompt: null # a string prompt to guide the generation of image-to-text model for all samples globally. It's None in default, which means no prompt provided.
prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
mem_required: '20GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_captioning_from_summarizer_mapper: # generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, ...)
hf_summarizer: 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback' # the summarizer model used to summarize texts generated by other methods.
consider_video_caption_from_video: true # whether to consider the video caption generated from video directly in the summarization process. Default: True.
consider_video_caption_from_audio: true # whether to consider the video caption generated from audio streams in the video in the summarization process. Default: True.
consider_video_caption_from_frames: true # whether to consider the video caption generated from sampled frames from the video in the summarization process. Default: True.
consider_video_tags_from_audio: true # whether to consider the video tags generated from audio streams in the video in the summarization process. Default: True.
consider_video_tags_from_frames: true # whether to consider the video tags generated from sampled frames from the video in the summarization process. Default: True.
vid_cap_from_vid_args: null # the arg dict for video captioning from video directly with keys are the arg names and values are the arg values. Default: None.
vid_cap_from_frm_args: null # the arg dict for video captioning from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
vid_tag_from_aud_args: null # the arg dict for video tagging from audio streams in the video with keys are the arg names and values are the arg values. Default: None.
vid_tag_from_frm_args: null # the arg dict for video tagging from sampled frames from the video with keys are the arg names and values are the arg values. Default: None.
keep_tag_num: 5 # max number N of tags from sampled frames to keep. Too many tags might bring negative influence to summarized text, so we consider to only keep the N most frequent tags. Default: 5.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only summarized captions in the final datasets and the original captions will be removed. It's True in default.
mem_required: '40GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_captioning_from_video_mapper: # generate captions by frame images extracted from video to augment datasets
hf_video_blip: 'kpyu/video-blip-opt-2.7b-ego4d' # video-blip model name on huggingface to generate caption
caption_num: 1 # how many candidate captions to generate for each video
keep_candidate_mode: 'random_any' # retain strategy for the generated $caption_num$ candidates. should be in ["random_any", "similar_one_simhash", "all"].
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only generated captions in the final datasets and the original captions will be removed. It's True in default.
prompt: null # a string prompt to guide the generation of video-blip model for all samples globally. It's None in default, which means no prompt provided.
prompt_key: null # the key name of fields in samples to store prompts for each sample. It's used for set different prompts for different samples. If it's none, use prompt in parameter "prompt". It's None in default.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
mem_required: '20GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_face_blur_mapper: # blur faces detected in videos
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_remove_watermark_mapper: # Remove the watermarks in videos given regions
roi_strings: ['0,0,0.1,0.1'] # a given list of regions the watermarks locate. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
roi_type: ratio # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by wights and heights.
roi_key: null # the key name of fields in samples to store roi_strings for each sample. It's used for set different rois for different samples.
frame_num: 10 # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
min_frame_threshold: 7 # a coodination is considered as the location of a watermark pixel when it is a watermark pixel in no less min_frame_threshold frames.
detection_method: pixel_value # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
- video_resize_aspect_ratio_mapper: # resize videos aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
min_ratio: 9/21 # the minimum aspect ratio to enforce videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
max_ratio: 21/9 # the maximum aspect ratio to enforce videos with an aspect ratio above `max_ratio` will be resized to match this maximum ratio. The ratio should be provided as a string in the format "21:9" or "21/9".
strategy: increase # the resizing strategy to apply when adjusting the video dimensions. It can be either 'decrease' to reduce the dimension or 'increase' to enlarge it. Accepted values are ['decrease', 'increase'].
- video_resize_resolution_mapper: # map videos to ones with given resolution range
min_width: 640 # the min horizontal resolution (unit p), videos with width less than 'min_width' will be mapped to videos with equal or bigger width
max_width: 1280 # the max horizontal resolution (unit p), videos with width more than 'max_width' will be mapped to videos with equal of smaller width
min_height: 480 # the min vertical resolution (unit p), videos with height less than 'min_height' will be mapped to videos with equal or bigger height
max_height: 1080 # the max vertical resolution (unit p), videos with height more than 'max_height' will be mapped to videos with equal or smaller height
force_original_aspect_ratio: 'increase' # Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio
force_divisible_by: 4 # Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio
- video_split_by_duration_mapper: # Mapper to split video by duration.
split_duration: 10 # duration of each video split in seconds.
min_last_split_duration: 0.1 # the minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_key_frame_mapper: # Mapper to split video by key frame.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector`]
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
# Filter ops
- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.9 # the max ratio of filter range
- audio_duration_filter: # keep data samples whose audios' durations are within a specified range.
min_duration: 0 # the min audio duration of filter range (in seconds)
max_duration: 3600 # the max audio duration of filter range (in seconds)
any_or_all: any # keep this sample when any/all audios meet the filter condition
- audio_nmf_snr_filter: # keep data samples whose audios' SNRs (computed based on NMF) are within a specified range.
min_snr: 0 # the min audio SNR to keep samples in dB. It's 0 by default.
max_snr: 1000 # the max audio SNR to keep samples in dB. It's sys.maxsize by default.
nmf_iter_num: 500 # the max number of iterations to run NMF. It's 500 in default.
any_or_all: any # keep this sample when any/all audios meet the filter condition
- audio_size_filter: # keep data samples whose audios' sizes are within a specified range.
min_duration: "0" # the min audio size of filter range
max_duration: "1TB" # the max audio size of filter range
any_or_all: any # keep this sample when any/all audios meet the filter condition
- average_line_length_filter: # filter text with the average length of lines out of specific range.
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- character_repetition_filter: # filter text with the character repetition ratio out of specific range
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
max_ratio: 0.0045 # the max ratio to filter text
flagged_words_dir: ./assets # directory to store flagged words dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- image_aesthetics_filter: # filter samples according to the aesthetics score of images.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
min_score: 0.3 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_aspect_ratio_filter: # filter samples according to the aspect ratios of images (a fraction of width by height, r=w/h) in them
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_face_ratio_filter: # filter samples according to the face area ratios in images (r=face_area/image_area). If multiple faces are available, we use the largest one.
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
min_ratio: 0.0 # the min face area ratio of filter range
max_ratio: 0.4 # the max face area ratio of filter range
- image_nsfw_filter: # filter samples according to the nsfw scores of images in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
min_height: 200 # the min height of height filter range
max_height: 5000 # the max height of height filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_text_matching_filter: # filter samples according to the matching score between image and text.
hf_blip: Salesforce/blip-itm-base-coco # name of used Hugging Face blip
min_score: 0.003 # the min matching score of filter range
max_score: 1.0 # the max matching score of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_text_similarity_filter: # filter samples according to the similarity between image and text.
hf_clip: openai/clip-vit-base-patch32 # name of used Hugging Face clip
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- image_watermark_filter: # filter samples according to the predicted watermark probabilities of images in them
hf_watermark_model: amrul-hzz/watermark_detector # Huggingface model name for watermark classification
prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
lang: en # keep text in what language
min_score: 0.8 # the min language scores to filter text
- maximum_line_length_filter: # filter text with the maximum length of lines out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- perplexity_filter: # filter text with perplexity score out of specific range
lang: en # compute perplexity in what language
max_ppl: 1500 # the max perplexity score to filter text
- phrase_grounding_recall_filter: # filter samples according to the locating recall of phrases extracted from text in the images.
hf_owlvit: google/owlvit-base-patch32 # name of used Hugging Face Owl-ViT
min_recall: 0.1 # the min phrase grounding recall of filter range
max_recall: 1.0 # the max phrase grounding recall of filter range
horizontal_flip: false # flip image horizontally (left to right).
vertical_flip: false # flip image vertically (top to bottom).
iou_thr: 0.5 # the IoU threshold for NMS-like post-process
large_area_ratio_thr: 0.95 # the area ratio threshold for filtering out large predicted bboxes
conf_thr: 0.0 # the confidence score threshold for removing low-confidence bboxes
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- special_characters_filter: # filter text with special-char ratio out of specific range
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.25 # the max ratio of filter range
- specified_field_filter: # filter text with the specified field info out of specific range
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
target_value: [] # the range of specified field information corresponding to the samples that need to be retained
- specified_numeric_field_filter: # filter text with the specified numeric field info out of specific range
field_key: '' # the target key corresponding to multi-level field information need to be separated by '.'
min_value: 0 # the min filter value in SpecifiedNumericField op
max_value: 10000 # the max filter value in SpecifiedNumericField op
- stopwords_filter: # filter text with stopword ratio smaller than a specific min value
lang: en # consider stopwords in what language
tokenization: false # whether to use model to tokenize documents
min_ratio: 0.3 # the min ratio to filter text
stopwords_dir: ./assets # directory to store stopwords dictionaries
use_words_aug: false # whether to augment words, especially for Chinese and Vietnamese
words_aug_group_sizes: [2] # the group size of words to augment
words_aug_join_char: "" # the join char between words to augment
- suffix_filter: # filter to keep samples with specified suffix.
suffixes: [] # the suffix of text that will be keep. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
- text_action_filter: # filter text according the number of action verb
lang: en # consider the words in what language
min_action_num: 1 # text will be filtered whose verbs less the min action number
- text_entity_dependency_filter: # filter text without non-independent entity nouns
lang: en # consider the words in what language
min_dependency_num: 1 # the min number of adjacent edges of a non-independent noun in dependency tree
any_or_all: any # keep this sample when any/all entity nouns are non-independent
- text_length_filter: # filter text with length out of specific range
min_len: 10 # the min length of filter range
max_len: 10000 # the max length of filter range
- token_num_filter: # filter text with total token number out of specific range
hf_tokenizer: EleutherAI/pythia-6.9b-deduped # name of used Hugging Face tokenizer
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- video_aesthetics_filter: # filter samples according to the aesthetics score of frame images extracted from videos.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
min_score: 0.3 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
frame_sampling_method: 'uniform' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframe", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics.
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode to the all frames extracted from videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_aspect_ratio_filter: # filter samples according to the aspect ratios of videos (a fraction of width by height, r=w/h) in them
min_ratio: 9/21 # the minimum aspect ratio to keep samples, supported format is a string, such as "9:21" or "9/21".
max_ratio: 21/9 # the maximum aspect ratio to keep samples, supported format is a string, such as "21:9" or "21/9".
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_duration_filter: # Keep data samples whose videos' durations are within a specified range.
min_duration: 0 # the min video duration of filter range (in seconds)
max_duration: 10 # the max video duration of filter range (in seconds)
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_frames_text_similarity_filter: # keep samples those similarities between sampled video frame images and text within a specific range.
hf_clip: openai/clip-vit-base-patch32 # clip model name on huggingface to compute the similarity between frame image and text. It's kind of language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
min_score: 0.1 # the min similarity to keep samples.
max_score: 1.0 # the max similarity to keep samples.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
size: null # resize frames along the smaller edge before computing optical flow, or a sequence like (h, w)
max_size: null # maximum allowed for the longer edge of resized frames
relative: false # whether to normalize the optical flow magnitude to [0, 1], relative to the frame's diagonal length
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.5 # the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_ocr_area_ratio_filter: # Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.
min_area_ratio: 0 # the min ocr area ratio to keep samples. It's 0 by default.
max_area_ratio: 1.0 # the max ocr area ratio to keep samples. It's 1.0 by default.
frame_sample_num: 3 # the number of sampled frames to calculate the ocr area ratio. If it's 1, only middle frame will be selected. If it's 2, only the first and the last frames will be selected. If it's larger than 2, in addition to the first and the last frames, other frames will be sampled evenly within the video duration.
languages_to_detect: ['ch_sim', 'en'] # texts in which languages should be detected. Default: ['ch_sim', 'en']. Full language list can be found here: https://www.jaided.ai/easyocr/.
any_or_all: any # keep this sample with 'any' or 'all' strategy of all videos. 'any': keep this sample if any videos meet the condition. 'all': keep this sample only if all videos meet the condition.
- video_resolution_filter: # filter samples according to the resolution of videos in them
min_width: 1280 # the min resolution of horizontal resolution filter range (unit p)
max_width: 4096 # the max resolution of horizontal resolution filter range (unit p)
min_height: 480 # the min resolution of vertical resolution filter range (unit p)
max_height: 1080 # the max resolution of vertical resolution filter range (unit p)
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_watermark_filter: # filter samples according to the predicted watermark probabilities of videos in them
hf_watermark_model: amrul-hzz/watermark_detector # Huggingface model name for watermark classification
prob_threshold: 0.8 # the predicted watermark probability threshold for samples, range from 0 to 1. Samples with watermark probability less than this threshold will be kept.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrains the maximum number of processes that can be launched
- video_tagging_from_frames_filter: # filter samples according to the tags of videos in them
tags: ['people'] # a tag list to shift the videos, total tags can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
contain: any # require the videos containing 'any' or 'all' given tags. When tags equal to [], 'all' keeps all samples, 'any' keeps no sample.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
any_or_all: any # keep this sample when any/all videos meet the filter condition
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
min_num: 10 # the min number of filter range
max_num: 10000 # the max number of filter range
- word_repetition_filter: # filter text with the word repetition ratio out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
rep_len: 10 # repetition length for word-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
# Deduplicator ops
- document_deduplicator: # deduplicate text samples using md5 hashing exact matching method
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
- document_minhash_deduplicator: # deduplicate text samples using MinHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character, sentencepiece]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
jaccard_threshold: 0.7 # the min jaccard similarity threshold in near-duplicate detection. When the jaccard similarity of two sample texts is >= this threshold, they are regarded as similar samples and this op will only keep one of them after deduplication
num_bands: null # number of bands in LSH. Default it's None, and it will be determined by an optimal params computation algorithm by minimize the weighted sum of probs of False Positives and False Negatives
num_rows_per_band: null # number of rows in each band in LSH. Default it's None, and it will be determined by an optimal params computation algorithm
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
tokenizer_model: null # path for the sentencepiece model, used for sentencepiece tokenization.
- document_simhash_deduplicator: # deduplicate text samples using SimHash-LSH method
tokenization: space # tokenization method for text. One of [space, punctuation, character]
window_size: 6 # window size of shingling
num_blocks: 6 # number of blocks in SimHash computing
hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
lowercase: true # whether to convert text to lower case
ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
- image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
consider_text: false # whether to consider text hash together with image hash when applying deduplication.
- video_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of videos between documents.
consider_text: false # whether to consider text hash together with video hash when applying deduplication.
- ray_video_deduplicator: # the simple video deduplicator that can run on multi-nodes using md5 hashing exact matching method
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
- ray_image_deduplicator: # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents.
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
- ray_document_deduplicator: # the simple document deduplicator that can run on multi-nodes using md5 hashing exact matching method
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of redis instance, please note that the default port of redis is 6379 which is the same as default port for ray, so we need to modify the default redis config to use it in other port
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
# Selector ops
- frequency_specified_field_selector: # selector to select samples based on the sorted frequency of specified field value
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top specified field value
topk: # number of selected top specified field value
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order
- random_selector: # selector to random select samples
select_ratio: # the ratio to be sampled
select_num: # the number to be sampled
- range_specified_field_selector: # selector to select a range of samples based on the sorted specified field value from smallest to largest.
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
lower_percentile: # the lower bound of the percentile to be sampled
upper_percentile: # the upper bound of the percentile to be sampled
lower_rank: # the lower rank of the percentile to be sampled
upper_rank: # the upper rank of the percentile to be sampled
- topk_specified_field_selector: # selector to select top samples based on the sorted specified field
field_key: '' # the target keys corresponding to multi-level field information need to be separated by '.'
top_ratio: # ratio of selected top samples
topk: # number of selected top sample
reverse: True # determine the sorting rule, if reverse=True, then sort in descending order