Skip to content

Commit

Permalink
Update reports to exclude broken articles.
Browse files: browse the repository at this point in the history
  • Loading branch information
tfnribeiro committed Jul 1, 2024
1 parent 87473ee commit fc8d8ca
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 3 deletions.
34 changes: 33 additions & 1 deletion tools/crawl_summary/crawl_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,39 @@ def load_crawl_report_data(self, day_period: int, report_dir_path=None):
with open(
os.path.join(report_dir_path, lang, file), "r", encoding="utf-8"
) as f:
self.data["lang"][lang] = json.load(f)[lang]
loaded_data = json.load(f)
if lang not in self.data["lang"]:
self.add_language(lang)

for feed in loaded_data["feeds"]:
if feed not in self.data["lang"][lang]["feeds"]:
# This feed has not been loaded yet for this language:
self.data["lang"][lang]["feeds"][feed] = loaded_data[
"feeds"
][feed]
else:
self.data["lang"][lang]["feeds"][feed][
"article_report"
]["sents_removed"] = Counter(
self.data["lang"][lang]["feeds"][feed][
"article_report"
]["sents_removed"]
) + Counter(
loaded_data["feeds"][feed]["article_report"][
"sents_removed"
]
)
self.data["lang"][lang]["feeds"][feed][
"article_report"
]["quality_error"] = Counter(
self.data["lang"][lang]["feeds"][feed][
"article_report"
]["quality_error"]
) + Counter(
loaded_data["feeds"][feed]["article_report"][
"quality_error"
]
)
print(f"LOADED File (d:{date}, l:{lang}): {file}")
except Exception as e:
print(f"Failed to load: '{file}', with: '{e} ({type(e)})'")
Expand Down
6 changes: 4 additions & 2 deletions tools/report_generator/data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def get_article_topics_df(self, feed_df):
INNER JOIN article_topic_map atm on a.id = atm.article_id
INNER JOIN topic t ON atm.topic_id = t.id
INNER JOIN language l ON l.id = a.language_id
WHERE DATEDIFF(CURDATE(), a.published_time) <= {self.DAYS_FOR_REPORT}"""
WHERE DATEDIFF(CURDATE(), a.published_time) <= {self.DAYS_FOR_REPORT}
AND a.broken = 0"""
df = pd.read_sql(query, con=self.db_connection)
self.__add_feed_name(df, feed_df)
return df
Expand All @@ -36,7 +37,8 @@ def get_article_df(self, feed_df):
query = f"""SELECT a.*, l.name Language
FROM article a
INNER JOIN language l ON l.id = a.language_id
WHERE DATEDIFF(CURDATE(), published_time) <= {self.DAYS_FOR_REPORT}"""
WHERE DATEDIFF(CURDATE(), published_time) <= {self.DAYS_FOR_REPORT}
AND a.broken = 0"""
df = pd.read_sql(query, con=self.db_connection)
self.__add_feed_name(df, feed_df)
return df
Expand Down

0 comments on commit fc8d8ca

Please sign in to comment.