Simplified CrawlReport, which can take a feed and extract the code + id from the feed object.
tfnribeiro committed Jun 27, 2024
1 parent 922cbf4 commit 87473ee
Showing 6 changed files with 65 additions and 58 deletions.
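In short: every CrawlReport method that previously took a separate lang_code and feed_id now takes the feed object and unpacks feed.language.code and feed.id itself. A minimal sketch of the new caller side, assuming the import path implied by this commit and using a SimpleNamespace as a stand-in for the ORM Feed model (both are illustrative, not part of the change):

    from types import SimpleNamespace

    from tools.crawl_summary.crawl_report import CrawlReport  # assumed import path

    # stand-in exposing only the two attributes CrawlReport now reads itself
    feed = SimpleNamespace(id=42, language=SimpleNamespace(code="da"))

    report = CrawlReport()
    report.add_feed(feed)                     # was: report.add_feed("da", 42)
    report.set_feed_total_articles(feed, 30)  # was: report.set_feed_total_articles("da", 42, 30)
    report.add_feed_error(feed, "timeout")    # was: report.add_feed_error("da", 42, "timeout")
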
69 changes: 44 additions & 25 deletions tools/crawl_summary/crawl_report.py
@@ -12,10 +12,10 @@ def __init__(self) -> None:
         path_to_dir = os.sep.join(inspect.getfile(self.__class__).split(os.sep)[:-1])
         self.default_save_dir = os.path.join(path_to_dir, "crawl_data")
         self.data = {"lang": {}}
-        self.crawl_date = datetime.datetime.now()
+        self.crawl_report_date = datetime.datetime.now()

-    def get_days_from_crawl_date(self):
-        return (datetime.datetime.now() - self.crawl_date).days
+    def get_days_from_crawl_report_date(self):
+        return (datetime.datetime.now() - self.crawl_report_date).days

     def __convert_str_to_dt(self, str_datetime):
         dt_parsed = datetime.datetime.strptime(str_datetime, STR_DATETIME_FORMAT)
@@ -27,7 +27,9 @@ def __convert_dt_to_str(self, datetime):
     def add_language(self, lang_code: str):
         self.data["lang"][lang_code] = {"feeds": {}, "total_time": None}

-    def add_feed(self, lang_code: str, feed_id: int):
+    def add_feed(self, feed):
+        lang_code = feed.language.code
+        feed_id = feed.id
         if lang_code not in self.data["lang"]:
             self.add_language(lang_code)
         self.data["lang"][lang_code]["feeds"][feed_id] = {
@@ -47,52 +49,66 @@ def add_feed(self, lang_code: str, feed_id: int):
     def set_total_time(self, lang_code: str, total_time):
         self.data["lang"][lang_code]["total_time"] = total_time

-    def add_feed_error(self, lang_code: str, feed_id: int, error: str):
+    def add_feed_error(self, feed, error: str):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["feed_errors"].append(error)

-    def set_feed_crawl_time(self, lang_code: str, feed_id: int, crawl_time):
+    def set_feed_crawl_time(self, feed, crawl_time):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["crawl_time"] = crawl_time

-    def set_feed_last_article_date(
-        self, lang_code: str, feed_id: int, last_article_date
-    ):
+    def set_feed_last_article_date(self, feed, last_article_date):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["last_article_date"] = (
             self.__convert_dt_to_str(last_article_date)
         )

-    def set_feed_total_articles(self, lang_code: str, feed_id: int, total_articles):
+    def set_feed_total_articles(self, feed, total_articles):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id][
             "total_articles"
         ] = total_articles

-    def set_feed_total_downloaded(self, lang_code: str, feed_id: int, total_downloaded):
+    def set_feed_total_downloaded(self, feed, total_downloaded):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id][
             "total_downloaded"
         ] = total_downloaded

-    def set_feed_total_low_quality(
-        self, lang_code: str, feed_id: int, total_low_quality
-    ):
+    def set_feed_total_low_quality(self, feed, total_low_quality):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id][
             "total_low_quality"
         ] = total_low_quality

-    def set_feed_total_in_db(self, lang_code: str, feed_id: int, total_in_db):
+    def set_feed_total_in_db(self, feed, total_in_db):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["total_in_db"] = total_in_db

-    def set_non_quality_reason(
-        self, lang_code: str, feed_id: int, non_quality_reason_counts: dict
-    ):
+    def set_non_quality_reason(self, feed, non_quality_reason_counts: dict):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
             "quality_error"
         ] = Counter(non_quality_reason_counts)

-    def set_sent_removed(self, lang_code: str, feed_id: int, sent_removed_count: dict):
+    def set_sent_removed(self, feed, sent_removed_count: dict):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
             "sents_removed"
         ] = Counter(sent_removed_count)

-    def add_non_quality_reason(self, lang_code: str, feed_id: int, non_quality_reason):
+    def add_non_quality_reason(self, feed, non_quality_reason):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
             "quality_error"
         ][non_quality_reason] = (
@@ -102,7 +118,9 @@ def add_non_quality_reason(self, lang_code: str, feed_id: int, non_quality_reason):
             + 1
         )

-    def add_sent_removed(self, lang_code: str, feed_id: int, sent_removed):
+    def add_sent_removed(self, feed, sent_removed):
+        lang_code = feed.language.code
+        feed_id = feed.id
         self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
             "sents_removed"
         ] = (
@@ -113,14 +131,14 @@ def add_sent_removed(self, lang_code: str, feed_id: int, sent_removed):
         )

     def save_crawl_report(self):
-        timestamp_str = self.__convert_dt_to_str(datetime.datetime.now())
+        timestamp_str = self.__convert_dt_to_str(self.crawl_report_date)
         for lang in self.data["lang"]:
             filename = f"{lang}-crawl-{timestamp_str}.json"
             output_dir = os.path.join(self.default_save_dir, lang)
             if not os.path.exists(output_dir):
                 os.mkdir(output_dir)
             with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
-                json.dump(self.data["lang"], f)
+                json.dump(self.data["lang"][lang], f)

     def load_crawl_report_data(self, day_period: int, report_dir_path=None):
         if report_dir_path is None:
@@ -129,7 +147,7 @@ def load_crawl_report_data(self, day_period: int, report_dir_path=None):
             for file in os.listdir(os.path.join(report_dir_path, lang)):
                 lang, _, date = file.split(".")[0].split("-")
                 date = self.__convert_str_to_dt(date)
-                self.crawl_date = min(self.crawl_date, date)
+                self.crawl_report_date = min(self.crawl_report_date, date)
                 day_diff = (date.now() - date).days
                 if day_diff > day_period:
                     print(
@@ -141,8 +159,9 @@ def load_crawl_report_data(self, day_period: int, report_dir_path=None):
                         os.path.join(report_dir_path, lang, file), "r", encoding="utf-8"
                     ) as f:
                         self.data["lang"][lang] = json.load(f)[lang]
+                    print(f"LOADED File (d:{date}, l:{lang}): {file}")
                 except Exception as e:
-                    print(f"Failed to load: '{file}', with: '{e}'")
+                    print(f"Failed to load: '{file}', with: '{e} ({type(e)})'")

     def __validate_lang(self, lang: str):
         langs_available = set(self.data["lang"].keys())
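Two smaller fixes ride along in this file: save_crawl_report now stamps filenames with self.crawl_report_date instead of a fresh now() and dumps only the per-language subtree (json.dump(self.data["lang"][lang], f)), while load_crawl_report_data tracks the oldest report date it encounters in crawl_report_date, which is what get_days_from_crawl_report_date measures. A round-trip sketch using the same kind of stand-in feed as above (directory layout as in the code: crawl_data/<lang>/<lang>-crawl-<timestamp>.json; import path assumed):

    from types import SimpleNamespace

    from tools.crawl_summary.crawl_report import CrawlReport  # assumed import path

    feed = SimpleNamespace(id=42, language=SimpleNamespace(code="da"))

    report = CrawlReport()
    report.add_feed(feed)
    report.set_feed_total_articles(feed, 30)
    report.save_crawl_report()  # writes crawl_data/da/da-crawl-<timestamp>.json

    fresh = CrawlReport()
    fresh.load_crawl_report_data(day_period=7)      # skips files older than 7 days
    print(fresh.get_days_from_crawl_report_date())  # days since the oldest report seen
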
6 changes: 3 additions & 3 deletions tools/feed_retrieval.py
@@ -40,7 +40,7 @@ def download_for_feeds(list_of_feeds, crawl_report):
     all_feeds_count = len(list_of_feeds)

     for feed in list_of_feeds:
-        crawl_report.add_feed(feed.language.code, feed.id)
+        crawl_report.add_feed(feed)
         if feed.deactivated:
             continue

@@ -65,11 +65,11 @@ def download_for_feeds(list_of_feeds, crawl_report):
                 "Something went wrong and we had to rollback a transaction; following is the full stack trace:"
             )
             traceback.print_exc()
-            crawl_report.add_feed_error(feed.language.code, feed.id, str(e))
+            crawl_report.add_feed_error(feed, str(e))

         except Exception as e:
             traceback.print_exc()
-            crawl_report.add_feed_error(feed.language.code, feed.id, str(e))
+            crawl_report.add_feed_error(feed, str(e))

     logp(f"Successfully finished processing {counter} feeds.")
     return summary_stream
2 changes: 1 addition & 1 deletion tools/report_generator/generate_report.py
@@ -381,7 +381,7 @@ def generate_html_page():
     exercise_activity_df = data_extractor.get_exercise_type_activity()
     crawl_report = CrawlReport()
     crawl_report.load_crawl_report_data(DAYS_FOR_REPORT)
-    total_days_from_crawl_report = crawl_report.get_days_from_crawl_date()
+    total_days_from_crawl_report = crawl_report.get_days_from_crawl_report_date()
     warning_crawl_range = (
         ""
         if total_days_from_crawl_report == DAYS_FOR_REPORT
36 changes: 13 additions & 23 deletions zeeguu/core/content_retriever/article_downloader.py
@@ -113,14 +113,10 @@ def download_from_feed(
             feed_item_timestamp > last_retrieval_time_seen_this_crawl
         ):
             last_retrieval_time_seen_this_crawl = feed_item_timestamp
-            crawl_report.set_feed_last_article_date(
-                feed.language.code, feed.id, feed_item_timestamp
-            )
+            crawl_report.set_feed_last_article_date(feed, feed_item_timestamp)

     if last_retrieval_time_seen_this_crawl > feed.last_crawled_time:
-        crawl_report.set_feed_last_article_date(
-            feed.language.code, feed.id, feed_item_timestamp
-        )
+        crawl_report.set_feed_last_article_date(feed, feed_item_timestamp)
         feed.last_crawled_time = last_retrieval_time_seen_this_crawl
         session.add(feed)
         session.commit()
@@ -213,17 +209,11 @@ def download_from_feed(
             else:
                 logp(e)
                 continue
-    crawl_report.set_feed_total_articles(feed.language.code, feed.id, len(items))
-    crawl_report.set_feed_total_downloaded(feed.language.code, feed.id, downloaded)
-    crawl_report.set_feed_total_low_quality(
-        feed.language.code, feed.id, skipped_due_to_low_quality
-    )
-    crawl_report.set_feed_total_in_db(
-        feed.language.code, feed.id, skipped_already_in_db
-    )
-    crawl_report.set_feed_crawl_time(
-        feed.language.code, feed.id, round(time() - start_feed_time, 2)
-    )
+    crawl_report.set_feed_total_articles(feed, len(items))
+    crawl_report.set_feed_total_downloaded(feed, downloaded)
+    crawl_report.set_feed_total_low_quality(feed, skipped_due_to_low_quality)
+    crawl_report.set_feed_total_in_db(feed, skipped_already_in_db)
+    crawl_report.set_feed_crawl_time(feed, round(time() - start_feed_time, 2))
     summary_stream += (
         f"{downloaded} new articles from {feed.title} ({len(items)} items)\n"
     )
@@ -250,14 +240,10 @@ def download_feed_item(session, feed, feed_item, url, crawl_report):

     np_article, sents_removed = download_and_parse_with_remove_sents(url)
     print("Counted sents!", sents_removed)
-    crawl_report.set_sent_removed(feed.language.code, feed.id, sents_removed)
+    crawl_report.set_sent_removed(feed, sents_removed)

     is_quality_article, reason, code = sufficient_quality(np_article)

-    if not is_quality_article:
-        crawl_report.add_non_quality_reason(feed.language.code, feed.id, code)
-        raise SkippedForLowQuality(reason)
-
     summary = feed_item["summary"]
     # however, this is not so easy... there have been cases where
     # the summary is just malformed HTML... thus we try to extract
@@ -285,10 +271,14 @@ def download_feed_item(session, feed, feed_item, url, crawl_report):
         feed.language,
         htmlContent=np_article.htmlContent,
     )
+    session.add(new_article)
+    if not is_quality_article:
+        crawl_report.add_non_quality_reason(feed, code)
+        new_article.broken = True
+        raise SkippedForLowQuality(reason)

     if np_article.top_image != "":
         new_article.img_url = Url.find_or_create(session, np_article.top_image)
-    session.add(new_article)

     topics = add_topics(new_article, session)
     logp(f" Topics ({topics})")
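One behavioural consequence of the last hunk is easy to miss: the quality check now runs after the Article has been constructed and session.add-ed, so a low-quality item is kept and flagged with broken = True before SkippedForLowQuality propagates, instead of never being created at all. The toy sketch below mirrors only that ordering; the class and function here are stand-ins, not the codebase's own ORM or session handling.

    # Toy illustration of the new "create, flag as broken, then skip" ordering.
    class SkippedForLowQuality(Exception):
        pass

    class FakeArticle:
        def __init__(self):
            self.broken = False

    def store_item(article, is_quality_article, reason):
        # the article object already exists (in the real code it is already session.add-ed)
        if not is_quality_article:
            article.broken = True
            raise SkippedForLowQuality(reason)
        return article

    article = FakeArticle()
    try:
        store_item(article, is_quality_article=False, reason="too short")
    except SkippedForLowQuality as e:
        print(article.broken, e)  # -> True too short
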
6 changes: 2 additions & 4 deletions zeeguu/core/test/test_feed.py
@@ -16,10 +16,8 @@ def setUp(self):
         self.crawl_report = CrawlReport()
         self.spiegel = FeedRule().feed1
         self.newspaper_da = FeedRule().feed_newspaper_da
-        self.crawl_report.add_feed(self.spiegel.language.code, self.spiegel.id)
-        self.crawl_report.add_feed(
-            self.newspaper_da.language.code, self.newspaper_da.id
-        )
+        self.crawl_report.add_feed(self.spiegel)
+        self.crawl_report.add_feed(self.newspaper_da)
         download_from_feed(self.spiegel, db.session, self.crawl_report, 3, False)
         download_from_feed(self.newspaper_da, db.session, self.crawl_report, 3, False)

4 changes: 2 additions & 2 deletions zeeguu/core/test/test_retrieve_and_compute.py
@@ -27,7 +27,7 @@ def setUp(self):
     def testDifficultyOfFeedItems(self):
         feed = FeedRule().feed1
         crawl_report = CrawlReport()
-        crawl_report.add_feed(feed.language.code, feed.id)
+        crawl_report.add_feed(feed)
         download_from_feed(feed, zeeguu.core.model.db.session, crawl_report, 3, False)

         articles = feed.get_articles(limit=2)
@@ -44,7 +44,7 @@ def testDownloadWithTopic(self):
         zeeguu.core.model.db.session.add(loc_topic)
         zeeguu.core.model.db.session.commit()
         crawl_report = CrawlReport()
-        crawl_report.add_feed(feed.language.code, feed.id)
+        crawl_report.add_feed(feed)
         download_from_feed(feed, zeeguu.core.model.db.session, crawl_report, 3, False)

         article = feed.get_articles(limit=2)[0]
