diff --git a/zeeguu/core/content_cleaning/content_cleaner.py b/zeeguu/core/content_cleaning/content_cleaner.py
index ae348bcc..efbb57eb 100644
--- a/zeeguu/core/content_cleaning/content_cleaner.py
+++ b/zeeguu/core/content_cleaning/content_cleaner.py
@@ -292,7 +292,7 @@
     "fortsætter herefter til 99 kr.",
     "fortrydelsesret jf.",
     "forbrugeraftaleloven.",
-    "\læs også",
+    r"\læs også",
 ] + [
     # German Patterns
     "folgen ich folge",
@@ -1124,7 +1124,9 @@ def normalize_sent(text: str):
     return text.lower().strip()


-def filter_noise_patterns(article, sent_filter_set, crawl_report=None, feed=None, url=None):
+def filter_noise_patterns(
+    article, sent_filter_set, crawl_report=None, feed=None, url=None
+):
     clean_artcile = ""
     for paragraph in article.split("\n\n"):
         clean_paragraph = ""
diff --git a/zeeguu/core/model/article.py b/zeeguu/core/model/article.py
index ec0e3380..6edb4e47 100644
--- a/zeeguu/core/model/article.py
+++ b/zeeguu/core/model/article.py
@@ -25,7 +25,7 @@

 HTML_TAG_CLEANR = re.compile("<[^>]*>")

-MULTIPLE_NEWLINES = re.compile("\n\s*\n")
+MULTIPLE_NEWLINES = re.compile(r"\n\s*\n")
 # \n matches a line-feed (newline) character (ASCII 10)
 # \s matches any whitespace character (equivalent to [\r\n\t\f\v ])
 # \n matches a line-feed (newline) character (ASCII 10)
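
Note: the substantive change here is switching string literals that contain backslash sequences (`"\læs også"`, `"\n\s*\n"`) to raw strings; the `filter_noise_patterns` signature change is purely a line-length reformat. The sketch below is not part of the patch, just an illustration (assuming CPython 3.8+) of why the raw-string prefix is safe: the regex engine interprets `\n` and `\s` itself, so the compiled pattern behaves the same, and the only difference is that Python stops flagging `\s` (and `\l`) as invalid string escapes.

```python
import re
import warnings

# Raw-string version from the patch: matches a newline, optional whitespace,
# and another newline, exactly like the non-raw spelling did.
MULTIPLE_NEWLINES = re.compile(r"\n\s*\n")
print(MULTIPLE_NEWLINES.sub("\n\n", "one\n \t \ntwo"))  # collapses to "one\n\ntwo"

# The warning the patch silences: compiling source that still uses the non-raw
# literal emits DeprecationWarning (3.6-3.11) / SyntaxWarning (3.12+).
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    compile(r'PATTERN = "\n\s*\n"', "<demo>", "exec")
print([str(w.message) for w in caught])  # e.g. ["invalid escape sequence '\\s'"]
```

The same reasoning applies to `r"\læs også"` in the Danish pattern list: `\l` is not a recognized escape, so the backslash was already kept literally, and the raw prefix only removes the warning.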