Added Report Generator Functionality (#166)
* Added report generator.
* Added the changes necessary to support the article crawler.
* Updated to not allow broken articles.
* Store URLs based on patterns, reducing duplication.
* Added a table to track broken_codes, letting the DB record why articles were broken. Fixed access when loading a dictionary.
* Fixed tests; this needed a call that allows not adding to the crawl report.
* Do not plot graphs for inactive languages. Some languages have no active users, so many plots would be empty; instead, the report states there are no active users. To count as active, a user must read or do exercises for at least a minute.
1 parent 2494d02 · commit b415994 · 21 changed files with 6,639 additions and 58 deletions.
Empty file.
crawl_summary/crawl_report.py (new file):
@@ -0,0 +1,226 @@
from collections import Counter
import datetime
import os
import inspect
import json
import pathlib

STR_DATETIME_FORMAT = "%d_%m_%y_%H_%M_%S"
CRAWL_REPORT_DATA = os.environ.get(
    "CRAWL_REPORT_DATA",
    os.path.join(pathlib.Path(__file__).parent.resolve(), "crawl_data"),
)

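# CrawlReport aggregates per-feed crawl statistics per language
# (download counts, quality errors, removed sentences, timings) and
# persists them as one JSON file per language under CRAWL_REPORT_DATA.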
class CrawlReport:
    def __init__(self) -> None:
        self.save_dir = CRAWL_REPORT_DATA
        self.data = {"lang": {}}
        self.crawl_report_date = datetime.datetime.now()

    def get_days_from_crawl_report_date(self):
        return (datetime.datetime.now() - self.crawl_report_date).days

    def __convert_str_to_dt(self, str_datetime):
        dt_parsed = datetime.datetime.strptime(str_datetime, STR_DATETIME_FORMAT)
        return dt_parsed

    def __convert_dt_to_str(self, datetime):
        return datetime.strftime(STR_DATETIME_FORMAT)

    def _get_feed_dict(self, feed):
        lang_code = feed.language.code
        feed_id = feed.id
        return self.data["lang"][lang_code]["feeds"][feed_id]

    def add_language(self, lang_code: str):
        self.data["lang"][lang_code] = {"feeds": {}, "total_time": None}

    def add_feed(self, feed):
        lang_code = feed.language.code
        feed_id = feed.id
        if lang_code not in self.data["lang"]:
            self.add_language(lang_code)
        self.data["lang"][lang_code]["feeds"][feed_id] = {
            "article_report": {
                "sents_removed": {},
                "quality_error": {},
                "quality_to_url": {},
                "sents_to_url": {},
            },
            "last_article_date": None,
            "feed_errors": [],
            "crawl_time": None,
            "total_articles": None,
            "total_downloaded": None,
            "total_low_quality": None,
            "total_in_db": None,
        }

    def set_total_time(self, lang_code: str, total_time):
        self.data["lang"][lang_code]["total_time"] = total_time

    def add_feed_error(self, feed, error: str):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["feed_errors"].append(error)

    def set_feed_crawl_time(self, feed, crawl_time):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["crawl_time"] = crawl_time

    def set_feed_last_article_date(self, feed, last_article_date):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["last_article_date"] = self.__convert_dt_to_str(last_article_date)

    def set_feed_total_articles(self, feed, total_articles):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_articles"] = total_articles

    def set_feed_total_downloaded(self, feed, total_downloaded):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_downloaded"] = total_downloaded

    def set_feed_total_low_quality(self, feed, total_low_quality):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_low_quality"] = total_low_quality

    def set_feed_total_in_db(self, feed, total_in_db):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["total_in_db"] = total_in_db

    def set_non_quality_reason(self, feed, non_quality_reason_counts: dict):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["quality_error"] = Counter(
            non_quality_reason_counts
        )

    def set_sent_removed(self, feed, sent_removed_count: dict):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["sents_removed"] = Counter(sent_removed_count)

    def add_non_quality_reason(self, feed, non_quality_reason, url=None):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["quality_error"][non_quality_reason] = (
            feed_dict["article_report"]["quality_error"].get(non_quality_reason, 0) + 1
        )
        if url is not None:
            feed_dict["article_report"]["quality_to_url"][non_quality_reason] = (
                feed_dict["article_report"]["quality_to_url"].get(
                    non_quality_reason, []
                )
                + [url]
            )

    def add_sent_removed(self, feed, sent_removed, url=None):
        feed_dict = self._get_feed_dict(feed)
        feed_dict["article_report"]["sents_removed"][sent_removed] = (
            feed_dict["article_report"]["sents_removed"].get(sent_removed, 0) + 1
        )
        if url is not None:
            feed_dict["article_report"]["sents_to_url"][sent_removed] = feed_dict[
                "article_report"
            ]["sents_to_url"].get(sent_removed, []) + [url]

    def save_crawl_report(self):
        timestamp_str = self.__convert_dt_to_str(self.crawl_report_date)
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        for lang in self.data["lang"]:
            filename = f"{lang}-crawl-{timestamp_str}.json"
            output_dir = os.path.join(self.save_dir, lang)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                json.dump(self.data["lang"][lang], f)

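    # Load every saved report from the last `day_period` days and merge the
    # per-feed counters into the in-memory data.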
    def load_crawl_report_data(self, day_period: int, report_dir_path=None):
        if report_dir_path is None:
            report_dir_path = self.save_dir
        for lang in os.listdir(report_dir_path):
            for file in os.listdir(os.path.join(report_dir_path, lang)):
                lang, _, date = file.split(".")[0].split("-")
                date = self.__convert_str_to_dt(date)

                day_diff = (datetime.datetime.now() - date).days
                if day_diff > day_period:
                    print(
                        f"File '{file}' outside of day range of '{day_period}', was: '{day_diff}'"
                    )
                    continue
                try:
                    self.crawl_report_date = min(self.crawl_report_date, date)
                    with open(
                        os.path.join(report_dir_path, lang, file), "r", encoding="utf-8"
                    ) as f:
                        loaded_data = json.load(f)
                    if lang not in self.data["lang"]:
                        self.add_language(lang)
                    lang_dict = self.data["lang"][lang]
                    for feed in loaded_data["feeds"]:
                        if feed not in lang_dict["feeds"]:
                            # We have not loaded any feeds yet:
                            lang_dict["feeds"][feed] = loaded_data["feeds"][feed]
                        else:
                            feed_dict = lang_dict["feeds"][feed]
                            feed_dict["article_report"]["sents_removed"] = Counter(
                                feed_dict["article_report"]["sents_removed"]
                            ) + Counter(
                                loaded_data["feeds"][feed]["article_report"][
                                    "sents_removed"
                                ]
                            )
                            feed_dict["article_report"]["quality_error"] = Counter(
                                feed_dict["article_report"]["quality_error"]
                            ) + Counter(
                                loaded_data["feeds"][feed]["article_report"][
                                    "quality_error"
                                ]
                            )
                    print(f"LOADED File (d:{date}, l:{lang}): {file}")
                except Exception as e:
                    print(f"Failed to load: '{file}', with: '{e} ({type(e)})'")

    def __validate_lang(self, lang: str):
        langs_available = set(self.data["lang"].keys())
        if lang not in langs_available:
            raise ValueError(
                f"'{lang}' is not found in current loaded data. Available langs: '{list(langs_available)}'"
            )
        return True

    def get_total_non_quality_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)

        total_counts = Counter()
        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                feed_dict = self.data["lang"][lang]["feeds"][feed]
                total_counts += Counter(feed_dict["article_report"]["quality_error"])
        return total_counts

    def get_total_removed_sents_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)
        total_counts = Counter()

        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                feed_dict = self.data["lang"][lang]["feeds"][feed]
                try:
                    total_counts += Counter(
                        feed_dict["article_report"]["sents_removed"]
                    )
                except Exception as e:
                    from pprint import pprint

                    pprint(feed_dict)
                    print(e, type(e))
                    input("Continue?")
        return total_counts
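To illustrate the intended use of the class, here is a minimal sketch; the SimpleNamespace feed and the "too_short" reason label are illustrative stand-ins, not the project's real objects or codes:

from types import SimpleNamespace

report = CrawlReport()
# Anything exposing .id and .language.code works where a Feed is expected.
feed = SimpleNamespace(id=1, language=SimpleNamespace(code="da"))

report.add_feed(feed)  # registers the language and the feed's counters
report.set_feed_total_articles(feed, 40)
report.set_feed_total_downloaded(feed, 31)
report.add_non_quality_reason(feed, "too_short", url="https://example.com/a1")
report.save_crawl_report()  # writes crawl_data/da/da-crawl-<timestamp>.json

print(report.get_total_non_quality_counts(["da"]))  # Counter({'too_short': 1})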
Changes to the article retrieval script:
@@ -19,6 +19,7 @@
 import traceback

 from sqlalchemy.exc import PendingRollbackError
+from time import time

 import zeeguu.core
@@ -27,17 +28,19 @@
 from zeeguu.core.content_retriever.article_downloader import download_from_feed
 from zeeguu.core.model import Feed, Language
+from crawl_summary.crawl_report import CrawlReport

 db_session = zeeguu.core.model.db.session


-def download_for_feeds(list_of_feeds):
+def download_for_feeds(list_of_feeds, crawl_report):

     summary_stream = ""
     counter = 0
     all_feeds_count = len(list_of_feeds)

     for feed in list_of_feeds:
+        crawl_report.add_feed(feed)
         if feed.deactivated:
             continue
@@ -48,7 +51,12 @@ def download_for_feeds(list_of_feeds):
             log(f"{msg}")

             summary_stream += (
-                download_from_feed(feed, zeeguu.core.model.db.session) + "\n\n"
+                download_from_feed(
+                    feed,
+                    zeeguu.core.model.db.session,
+                    crawl_report,
+                )
+                + "\n\n"
             )

         except PendingRollbackError as e:
@@ -57,27 +65,50 @@ def download_for_feeds(list_of_feeds):
                 "Something went wrong and we had to rollback a transaction; following is the full stack trace:"
             )
             traceback.print_exc()
+            crawl_report.add_feed_error(feed, str(e))

-        except:
+        except Exception as e:
             traceback.print_exc()
+            crawl_report.add_feed_error(feed, str(e))

     logp(f"Successfully finished processing {counter} feeds.")
     return summary_stream


 def retrieve_articles_for_language(language_code, send_email=False):

+    start_time = time()
     language = Language.find(language_code)
     all_language_feeds = (
         Feed.query.filter_by(language_id=language.id).filter_by(deactivated=False).all()
     )
+    crawl_report = CrawlReport()
+    crawl_report.add_language(language_code)

+    summary_stream = download_for_feeds(all_language_feeds, crawl_report)
     if send_email:

         logp("sending summary email")

-        summary_stream = download_for_feeds(all_language_feeds)
         import datetime

         mailer = ZeeguuMailer(
             f"{language.name} Crawl Summary "
             + datetime.datetime.now().strftime("%H:%M"),
             summary_stream,
             "[email protected]",
         )
         mailer.send()
+    crawl_report.set_total_time(language.code, round(time() - start_time, 2))
+    crawl_report.save_crawl_report()
+    return crawl_report


 def retrieve_articles_from_all_feeds():
     counter = 0
     all_feeds = Feed.query.all()
-    download_for_feeds(all_feeds)
+    crawl_report = CrawlReport()
+    download_for_feeds(all_feeds, crawl_report)
+    crawl_report.save_crawl_report()


 if __name__ == "__main__":
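As a usage sketch (assuming a configured Zeeguu database with feeds for the given language code; the "da" value is illustrative, and this is not the file's actual __main__ body), the new reporting path can be driven like this:

report = retrieve_articles_for_language("da", send_email=False)
print(report.get_total_non_quality_counts(["da"]))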
New SQL migration creating the broken-code tracking table:
@@ -0,0 +1,6 @@
CREATE TABLE `zeeguu_test`.`article_broken_code_map` (
    `article_id` INT NOT NULL,
    `broken_code` VARCHAR(45) NULL,
    INDEX `article_broken_code_map_ibfk_1_idx` (`article_id` ASC) VISIBLE,
    CONSTRAINT `article_broken_code_map_ibfk_1` FOREIGN KEY (`article_id`) REFERENCES `zeeguu_test`.`article` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
);
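Once articles are tagged, this table can be aggregated to see why articles were marked broken. A minimal sketch using the db_session defined in the retrieval script above (the raw-SQL approach is illustrative, not necessarily how the project queries it):

from sqlalchemy import text

# Count broken articles grouped by the reason code.
rows = db_session.execute(
    text(
        "SELECT broken_code, COUNT(*) AS n "
        "FROM article_broken_code_map "
        "GROUP BY broken_code ORDER BY n DESC"
    )
)
for broken_code, n in rows:
    print(broken_code, n)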
Empty file.