Added Report Generator Functionality (#166)
* Added report generator
* Added necessary changes to support the article crawler.
* Updated to not allow broken articles.
* Store URLs based on patterns + reduce duplication
* Added a table to track broken_codes
  - Allows the DB to keep track of why articles were broken

* Fixed access when loading a dictionary.
* Fixed tests; these need a call that allows not adding to the crawl report.
* Do not plot graphs for inactive languages

- Some languages have no active users, so many plots would be empty. Instead, the report states that there are no active users.
- To be considered active, a user must read for at least one minute or do exercises for at least one minute (see the sketch below).
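
A minimal sketch of that activity threshold, assuming the reading and exercise durations have already been computed elsewhere (the function and names are illustrative, not the actual Zeeguu API):

from datetime import timedelta

ACTIVITY_THRESHOLD = timedelta(minutes=1)

def is_active_user(reading_time: timedelta, exercise_time: timedelta) -> bool:
    # A user counts as active if either activity reaches one minute.
    return reading_time >= ACTIVITY_THRESHOLD or exercise_time >= ACTIVITY_THRESHOLD

# 90 seconds of reading and no exercises: active.
assert is_active_user(timedelta(seconds=90), timedelta(0))
# 30 seconds of each: not active.
assert not is_active_user(timedelta(seconds=30), timedelta(seconds=30))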
tfnribeiro authored Jul 10, 2024
1 parent 2494d02 commit b415994
Showing 21 changed files with 6,639 additions and 58 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@ dev_data_folder*
.idea/
.cache/
data/
tools/report_generator/reports/
tools/crawl_summary/crawl_data/

*.pyc
*.egg-info
5 changes: 5 additions & 0 deletions requirements.txt
@@ -48,6 +48,11 @@ git+https://github.com/zeeguu/confusionwords.git@main#egg=confusionwords
scikit-learn==1.4.0
flask_monitoringdashboard

# For the report generator
matplotlib
seaborn
pandas

# the following two were required when deploying the API on Mac OS with Python 3.12.1
cryptography
lxml_html_clean
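
These three libraries cover the report generator's plotting and data handling. A minimal sketch of the kind of chart they enable (the data frame here is made up for illustration; the real generator reads crawl statistics from the saved reports):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Illustrative numbers only.
df = pd.DataFrame({"language": ["da", "de", "es"], "articles": [120, 340, 210]})
sns.barplot(data=df, x="language", y="articles")
plt.title("Articles crawled per language")
plt.savefig("articles_per_language.png")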
6 changes: 3 additions & 3 deletions tools/add_feed.py
@@ -28,8 +28,8 @@ def main():
print(f"= {icon_name}")

description = (
input(f"Description (Enter for: {test_feed.description}): ")
or test_feed.description
input(f"Description (Enter for: {test_feed.description}): ")
or test_feed.description
)
print(f"= {description}")

@@ -46,7 +46,7 @@ def main():
        description,
        icon_name=icon_name,
        language=language,
        feed_type=feed_type,
    )

    print("Done: ")
Empty file added tools/crawl_summary/__init__.py
226 changes: 226 additions & 0 deletions tools/crawl_summary/crawl_report.py
@@ -0,0 +1,226 @@
from collections import Counter
import datetime
import os
import inspect
import json
import pathlib

STR_DATETIME_FORMAT = "%d_%m_%y_%H_%M_%S"
CRAWL_REPORT_DATA = os.environ.get(
    "CRAWL_REPORT_DATA",
    os.path.join(pathlib.Path(__file__).parent.resolve(), "crawl_data"),
)


class CrawlReport:
    def __init__(self) -> None:
        self.save_dir = CRAWL_REPORT_DATA
        self.data = {"lang": {}}
        self.crawl_report_date = datetime.datetime.now()

    def get_days_from_crawl_report_date(self):
        return (datetime.datetime.now() - self.crawl_report_date).days

    def __convert_str_to_dt(self, str_datetime):
        dt_parsed = datetime.datetime.strptime(str_datetime, STR_DATETIME_FORMAT)
        return dt_parsed

    def __convert_dt_to_str(self, dt):
        return dt.strftime(STR_DATETIME_FORMAT)

    def _get_feed_dict(self, feed):
        lang_code = feed.language.code
        feed_id = feed.id
        return self.data["lang"][lang_code]["feeds"][feed_id]

    def add_language(self, lang_code: str):
        self.data["lang"][lang_code] = {"feeds": {}, "total_time": None}

    def add_feed(self, feed):
        lang_code = feed.language.code
        feed_id = feed.id
        if lang_code not in self.data["lang"]:
            self.add_language(lang_code)
        self.data["lang"][lang_code]["feeds"][feed_id] = {
            "article_report": {
                "sents_removed": {},
                "quality_error": {},
                "quality_to_url": {},
                "sents_to_url": {},
            },
            "last_article_date": None,
            "feed_errors": [],
            "crawl_time": None,
            "total_articles": None,
            "total_downloaded": None,
            "total_low_quality": None,
            "total_in_db": None,
        }

    def set_total_time(self, lang_code: str, total_time):
        self.data["lang"][lang_code]["total_time"] = total_time

    def add_feed_error(self, feed, error: str):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["feed_errors"].append(error)

    def set_feed_crawl_time(self, feed, crawl_time):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["crawl_time"] = crawl_time

    def set_feed_last_article_date(self, feed, last_article_date):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["last_article_date"] = self.__convert_dt_to_str(last_article_date)

    def set_feed_total_articles(self, feed, total_articles):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_articles"] = total_articles

    def set_feed_total_downloaded(self, feed, total_downloaded):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_downloaded"] = total_downloaded

    def set_feed_total_low_quality(self, feed, total_low_quality):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_low_quality"] = total_low_quality

    def set_feed_total_in_db(self, feed, total_in_db):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_in_db"] = total_in_db

    def set_non_quality_reason(self, feed, non_quality_reason_counts: dict):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["quality_error"] = Counter(
            non_quality_reason_counts
        )

    def set_sent_removed(self, feed, sent_removed_count: dict):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["sents_removed"] = Counter(sent_removed_count)

    def add_non_quality_reason(self, feed, non_quality_reason, url=None):
        # Counts each rejection reason; optionally remembers which URLs triggered it.
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["quality_error"][non_quality_reason] = (
            feed_dict["article_report"]["quality_error"].get(non_quality_reason, 0) + 1
        )
        if url is not None:
            feed_dict["article_report"]["quality_to_url"][non_quality_reason] = (
                feed_dict["article_report"]["quality_to_url"].get(
                    non_quality_reason, []
                )
                + [url]
            )

    def add_sent_removed(self, feed, sent_removed, url=None):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["sents_removed"][sent_removed] = (
            feed_dict["article_report"]["sents_removed"].get(sent_removed, 0) + 1
        )
        if url is not None:
            feed_dict["article_report"]["sents_to_url"][sent_removed] = feed_dict[
                "article_report"
            ]["sents_to_url"].get(sent_removed, []) + [url]

    def save_crawl_report(self):
        # Writes one JSON file per language under save_dir/<lang>/.
        timestamp_str = self.__convert_dt_to_str(self.crawl_report_date)
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        for lang in self.data["lang"]:
            filename = f"{lang}-crawl-{timestamp_str}.json"
            output_dir = os.path.join(self.save_dir, lang)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                json.dump(self.data["lang"][lang], f)

    def load_crawl_report_data(self, day_period: int, report_dir_path=None):
        if report_dir_path is None:
            report_dir_path = self.save_dir
        for lang in os.listdir(report_dir_path):
            for file in os.listdir(os.path.join(report_dir_path, lang)):
                lang, _, date = file.split(".")[0].split("-")
                date = self.__convert_str_to_dt(date)

                day_diff = (datetime.datetime.now() - date).days
                if day_diff > day_period:
                    print(
                        f"File '{file}' outside of day range of '{day_period}', was: '{day_diff}'"
                    )
                    continue
                try:
                    self.crawl_report_date = min(self.crawl_report_date, date)
                    with open(
                        os.path.join(report_dir_path, lang, file), "r", encoding="utf-8"
                    ) as f:
                        loaded_data = json.load(f)
                    if lang not in self.data["lang"]:
                        self.add_language(lang)
                    lang_dict = self.data["lang"][lang]
                    for feed in loaded_data["feeds"]:
                        if feed not in lang_dict["feeds"]:
                            # We have not loaded any feeds yet:
                            lang_dict["feeds"][feed] = loaded_data["feeds"][feed]
                        else:
                            # Merge counts from this file into the feed already loaded.
                            feed_dict = lang_dict["feeds"][feed]
                            feed_dict["article_report"]["sents_removed"] = Counter(
                                feed_dict["article_report"]["sents_removed"]
                            ) + Counter(
                                loaded_data["feeds"][feed]["article_report"][
                                    "sents_removed"
                                ]
                            )
                            feed_dict["article_report"]["quality_error"] = Counter(
                                feed_dict["article_report"]["quality_error"]
                            ) + Counter(
                                loaded_data["feeds"][feed]["article_report"][
                                    "quality_error"
                                ]
                            )
                    print(f"LOADED File (d:{date}, l:{lang}): {file}")
                except Exception as e:
                    print(f"Failed to load: '{file}', with: '{e} ({type(e)})'")

    def __validate_lang(self, lang: str):
        langs_available = set(self.data["lang"].keys())
        if lang not in langs_available:
            raise ValueError(
                f"'{lang}' is not found in current loaded data. Available langs: '{list(langs_available)}'"
            )
        return True

    def get_total_non_quality_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)

        total_counts = Counter()
        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                feed_dict = self.data["lang"][lang]["feeds"][feed]
                total_counts += Counter(feed_dict["article_report"]["quality_error"])
        return total_counts

    def get_total_removed_sents_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)
        total_counts = Counter()

        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                feed_dict = self.data["lang"][lang]["feeds"][feed]
                try:
                    total_counts += Counter(
                        feed_dict["article_report"]["sents_removed"]
                    )
                except Exception as e:
                    from pprint import pprint

                    pprint(feed_dict)
                    print(e, type(e))
                    input("Continue?")
        return total_counts
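
Taken together, a minimal usage sketch of CrawlReport; the feed object here is a stand-in exposing only the two attributes the class reads (feed.id and feed.language.code), and the "too_short" reason string is an illustrative value, not a confirmed crawler code:

from types import SimpleNamespace
from crawl_summary.crawl_report import CrawlReport

# Stand-in for a Feed ORM object.
feed = SimpleNamespace(id=42, language=SimpleNamespace(code="da"))

report = CrawlReport()
report.add_feed(feed)  # also registers the "da" language on first use
report.set_feed_total_articles(feed, 10)
report.add_non_quality_reason(feed, "too_short", url="https://example.com/a1")
report.set_total_time("da", 12.3)
report.save_crawl_report()  # writes da-crawl-<timestamp>.json under crawl_data/da/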
43 changes: 37 additions & 6 deletions tools/feed_retrieval.py
@@ -19,6 +19,7 @@
import traceback

from sqlalchemy.exc import PendingRollbackError
from time import time

import zeeguu.core

@@ -27,17 +28,19 @@

from zeeguu.core.content_retriever.article_downloader import download_from_feed
from zeeguu.core.model import Feed, Language
from crawl_summary.crawl_report import CrawlReport

db_session = zeeguu.core.model.db.session


def download_for_feeds(list_of_feeds, crawl_report):

    summary_stream = ""
    counter = 0
    all_feeds_count = len(list_of_feeds)

    for feed in list_of_feeds:
        crawl_report.add_feed(feed)
        if feed.deactivated:
            continue
@@ -48,7 +51,12 @@ def download_for_feeds(list_of_feeds):
log(f"{msg}")

summary_stream += (
download_from_feed(feed, zeeguu.core.model.db.session) + "\n\n"
download_from_feed(
feed,
zeeguu.core.model.db.session,
crawl_report,
)
+ "\n\n"
)

except PendingRollbackError as e:
@@ -57,27 +65,50 @@ def download_for_feeds(list_of_feeds):
"Something went wrong and we had to rollback a transaction; following is the full stack trace:"
)
traceback.print_exc()
crawl_report.add_feed_error(feed, str(e))

except:
except Exception as e:
traceback.print_exc()
crawl_report.add_feed_error(feed, str(e))

logp(f"Successfully finished processing {counter} feeds.")
return summary_stream


def retrieve_articles_for_language(language_code, send_email=False):

start_time = time()
language = Language.find(language_code)
all_language_feeds = (
Feed.query.filter_by(language_id=language.id).filter_by(deactivated=False).all()
)
crawl_report = CrawlReport()
crawl_report.add_language(language_code)

summary_stream = download_for_feeds(all_language_feeds, crawl_report)
if send_email:

logp("sending summary email")

summary_stream = download_for_feeds(all_language_feeds)
import datetime

mailer = ZeeguuMailer(
f"{language.name} Crawl Summary "
+ datetime.datetime.now().strftime("%H:%M"),
summary_stream,
"[email protected]",
)
mailer.send()
crawl_report.set_total_time(language.code, round(time() - start_time, 2))
crawl_report.save_crawl_report()
return crawl_report


def retrieve_articles_from_all_feeds():
counter = 0
all_feeds = Feed.query.all()
download_for_feeds(all_feeds)
crawl_report = CrawlReport()
download_for_feeds(all_feeds, crawl_report)
crawl_report.save_crawl_report()


if __name__ == "__main__":
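
With these changes, a crawl run both returns and persists its report. A minimal invocation sketch, assuming the script is run from the tools/ directory so the crawl_summary import resolves:

from feed_retrieval import retrieve_articles_for_language

# Crawl all active Danish feeds; the report is saved to disk as a side effect.
report = retrieve_articles_for_language("da", send_email=False)
print(report.get_total_non_quality_counts(["da"]))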
6 changes: 6 additions & 0 deletions tools/migrations/24-07-01--add_article_broken_code_map.sql
@@ -0,0 +1,6 @@
CREATE TABLE `zeeguu_test`.`article_broken_code_map` (
  `article_id` INT NOT NULL,
  `broken_code` VARCHAR(45) NULL,
  INDEX `article_broken_code_map_ibfk_1_idx` (`article_id` ASC) VISIBLE,
  CONSTRAINT `article_broken_code_map_ibfk_1` FOREIGN KEY (`article_id`) REFERENCES `zeeguu_test`.`article` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
);
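
A hedged sketch of how a rejection might be recorded in this new table via SQLAlchemy; the article id and broken-code value are illustrative, and the actual writes happen inside the crawler, not in user code:

from sqlalchemy import text
import zeeguu.core

db_session = zeeguu.core.model.db.session
db_session.execute(
    text(
        "INSERT INTO article_broken_code_map (article_id, broken_code) "
        "VALUES (:article_id, :broken_code)"
    ),
    {"article_id": 1234, "broken_code": "LOW_QUALITY"},
)
db_session.commit()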
(The remaining changed files are not shown in this view.)
