diff --git a/src/hatespeech/clean_data.py b/src/hatespeech/clean_data.py index aef398e..46a5647 100644 --- a/src/hatespeech/clean_data.py +++ b/src/hatespeech/clean_data.py @@ -180,7 +180,6 @@ def clean_text(text: str) -> Union[str, None]: # Replace 8 digits with " [CVR] " if "cvr" is in the text, else replace with # " [PHONE] " Check if an 8 digit number is present in text if re.search(r"(? List[str]: # resulting dataframe in the annotated directory, and split up the dataframe into a # validation and test split, and store those too if config.testing: - # Create test annotated data labels = [np.random.choice(["Offensive", "Not Offensive"]) for _ in texts] df_test = pd.DataFrame({"text": texts, "label": labels})