Skip to content

Commit

Permalink
Patch for Perf Bench (#506)
Browse files Browse the repository at this point in the history
* * refine perf bench workflow
* fix wrong var in sphinx docs

* * refine perf bench workflow

* * fix wrong var in sphinx docs

* * set python version matrix to include only 3.9 and 3.10

* * hide unnecessary logs

* * update mem_required for image tagging models

* * enable unittests for 3 OPs due to dependency

* + add two dependencies required by librosa
  • Loading branch information
HYLcool authored Dec 9, 2024
1 parent 9f1b0c8 commit 4b8b436
Show file tree
Hide file tree
Showing 9 changed files with 19 additions and 17 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/deploy_sphinx_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ on:
jobs:
pages:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [ "3.9", "3.10" ]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@master
with:
python_version: ${{ matrix.python-version }}
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/perf-bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
unittest-single:
runs-on: [self-hosted, linux]
perf_bench:
runs-on: [GPU, unittest]
environment: Testing
steps:
- uses: actions/checkout@v3
Expand All @@ -42,7 +42,7 @@ jobs:
- name: Run performance benchmark standalone
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
run: |
docker compose exec ray-head python tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
- name: Remove docker compose
working-directory: dj-${{ github.run_id }}/.github/workflows/docker
Expand Down
3 changes: 3 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ process:
radius: 2 # radius of blur kernel
- image_tagging_mapper: # Mapper to generate image tags.
tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
mem_required: '9GB'
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
Expand Down Expand Up @@ -382,6 +383,7 @@ process:
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
mem_required: '9GB'
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.

# Filter ops
Expand Down Expand Up @@ -614,6 +616,7 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
any_or_all: any # keep this sample when any/all videos meet the filter condition
mem_required: '9GB'
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
Expand Down
4 changes: 4 additions & 0 deletions environments/minimal_requires.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ pandas
numpy
av==13.1.0
soundfile
# librosa needs two extra dependencies (samplerate, resampy) installed explicitly to avoid a lazy_loader error
librosa>=0.10
samplerate
resampy
# the two packages above (samplerate, resampy) are the dependencies librosa needs to avoid a lazy_loader error
loguru
tabulate
tqdm
Expand Down
1 change: 1 addition & 0 deletions tests/benchmark_performance/configs/video.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ process:
score_threshold: 1.0
mem_required: '1GB'
- video_tagging_from_frames_mapper:
mem_required: '9GB'
- video_duration_filter:
- video_split_by_key_frame_mapper:
keep_original_sample: false
Expand Down
2 changes: 1 addition & 1 deletion tests/benchmark_performance/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ MODALITIES=("text" "image" "video" "audio")
cd $BENCH_PATH

# 1. prepare dataset
wget http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxvf perf_bench_data.tar.gz
wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz

# 2. run the benchmark
for modality in ${MODALITIES[@]}
Expand Down
5 changes: 1 addition & 4 deletions tests/ops/filter/test_audio_duration_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@

from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG, SKIPPED_TESTS
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG

# skip due to conflicts when run lazy_load in multiprocessing in librosa
# tests passed locally.
@SKIPPED_TESTS.register_module()
class AudioDurationFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
5 changes: 1 addition & 4 deletions tests/ops/filter/test_audio_nmf_snr_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,8 @@

from data_juicer.ops.filter.audio_nmf_snr_filter import AudioNMFSNRFilter
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase

# skip due to conflicts when run lazy_load in multiprocessing in librosa
# tests passed locally.
@SKIPPED_TESTS.register_module()
class AudioNMFSNRFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
5 changes: 1 addition & 4 deletions tests/ops/mapper/test_video_tagging_from_audio_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,8 @@
VideoTaggingFromAudioMapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase

# skip due to conflicts when run lazy_load in multiprocessing in librosa
# tests passed locally.
@SKIPPED_TESTS.register_module()
class VideoTaggingFromAudioMapperTest(DataJuicerTestCaseBase):
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
'data')
Expand Down

0 comments on commit 4b8b436

Please sign in to comment.