Skip to content

Commit

Permalink
raw strings in regex and filter sentences
Browse files (browse the repository at this point in the history)
  • Loading branch information
mircealungu committed Jul 10, 2024
1 parent b415994 commit 9c23d3b
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
6 changes: 4 additions & 2 deletions zeeguu/core/content_cleaning/content_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@
"fortsætter herefter til 99 kr.",
"fortrydelsesret jf.",
"forbrugeraftaleloven.",
"\læs også",
r"\læs også",
]
+ [ # German Patterns
"folgen ich folge",
Expand Down Expand Up @@ -1124,7 +1124,9 @@ def normalize_sent(text: str):
return text.lower().strip()


def filter_noise_patterns(article, sent_filter_set, crawl_report=None, feed=None, url=None):
def filter_noise_patterns(
article, sent_filter_set, crawl_report=None, feed=None, url=None
):
clean_artcile = ""
for paragraph in article.split("\n\n"):
clean_paragraph = ""
Expand Down
2 changes: 1 addition & 1 deletion zeeguu/core/model/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

HTML_TAG_CLEANR = re.compile("<[^>]*>")

MULTIPLE_NEWLINES = re.compile("\n\s*\n")
MULTIPLE_NEWLINES = re.compile(r"\n\s*\n")
# \n  matches a line-feed (newline) character (ASCII 10)
# \s* matches zero or more whitespace characters (for str patterns, Python's \s
#     matches Unicode whitespace, which includes [\r\n\t\f\v ])
# \n  matches a second line-feed (newline) character (ASCII 10)
Expand Down

0 comments on commit 9c23d3b

Please sign in to comment.