Skip to content

Commit

Permalink
Fix new code filters (#304)
Browse files Browse the repository at this point in the history
Signed-off-by: Igor Gitman <[email protected]>
  • Loading branch information
Kipok authored Dec 20, 2024
1 parent b11b4e0 commit 786cac3
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
20 changes: 20 additions & 0 deletions nemo_skills/training/data_preparation_utils/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,26 @@ def _chunk_manifest(self):
yield manifest_chunk


class DropIfRegexMatch(BaseFilter):
    """Drops a data entry if its text contains any of the given patterns.

    Every pattern is passed through ``re.escape`` before searching, so the
    entries of ``regex_patterns`` are matched as literal substrings: regex
    metacharacters such as ``(`` or ``.`` in a pattern are NOT interpreted.

    Args:
        regex_patterns: patterns to look for in the text. An entry is dropped
            as soon as the first one is found.
        text_key: key of the data entry that holds the text to search.
        **kwargs: forwarded unchanged to ``BaseFilter.__init__``.
    """

    def __init__(
        self,
        regex_patterns: List[str],
        text_key: str = "text",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.regex_patterns = regex_patterns
        self.text_key = text_key

    def process_dataset_entry(self, data_entry) -> List:
        """Return a single ``DataEntry``: dropped (``data=None``) on a match, kept otherwise.

        Both outcomes report the same ``num_removed`` metric key (1 when the
        entry is dropped, 0 when it is kept).
        """
        for regex_pattern in self.regex_patterns:
            # re.escape turns the pattern into a literal, so this is a substring check.
            if re.search(re.escape(regex_pattern), data_entry[self.text_key]):
                return [DataEntry(data=None, metrics=dict(num_removed=1))]
        # Same metric key as the dropped branch (was misspelled "num_reomoved").
        return [DataEntry(data=data_entry, metrics=dict(num_removed=0))]


class DropMultiBoxed(BaseFilter):
def __init__(self, solution_key: str = "generation", **kwargs):
super().__init__(**kwargs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ processors:
should_run: ${filters.remove_contaminated}
contamination_file: ${contamination_file}

- _target_: sdp.processors.DropIfRegexMatch
- _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexMatch
should_run: ${filters.remove_code_errors}
text_key: ${output_key}
regex_patterns:
Expand All @@ -109,7 +109,7 @@ processors:
- {input: {generation: "My solution:\nTimed out\nSomething else"}, output: null}
- {input: {generation: "My solution, no errors"}, output: {generation: "My solution, no errors"}}

- _target_: sdp.processors.DropIfRegexMatch
- _target_: nemo_skills.training.data_preparation_utils.filters.DropIfRegexMatch
should_run: ${filters.remove_verification_code}
text_key: ${output_key}
regex_patterns:
Expand Down

0 comments on commit 786cac3

Please sign in to comment.