Added Report Generator Functionality #166
Changes from 4 commits
@@ -0,0 +1,186 @@
from collections import Counter
import datetime
import os
import inspect
import json

STR_DATETIME_FORMAT = "%d_%m_%y_%H_%M_%S"


class CrawlReport:
    def __init__(self) -> None:
        path_to_dir = os.sep.join(inspect.getfile(self.__class__).split(os.sep)[:-1])
        self.default_save_dir = os.path.join(path_to_dir, "crawl_data")
        self.data = {"lang": {}}
        self.crawl_date = datetime.datetime.now()
Reviewer: this confused me.

Author: So the intention is to use this as the key for saving the files; I actually forgot to update it, and it was still using datetime.now(). The idea is: you start an article_crawler, the object stores the time it started, and that time acts as the key for that crawl, i.e. "the crawl done for Danish at that starting time". I don't think this should be a value that is manipulated from outside; it works as an almost-private property. It also allows us to load files based on their date: if I want the last 7 days, I can use this date to work out, relative to the current date, when the crawl happened. Does that make sense?
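For illustration (not part of the diff), a small usage sketch of that idea: the date is fixed when the CrawlReport is constructed and later only read back, for example to decide whether a crawl falls inside a 7-day window; the language code is just an example.

from crawl_summary.crawl_report import CrawlReport

# crawl_date is set once, at construction time, and never assigned from outside,
# so it effectively identifies "the crawl done for Danish at that starting time".
report = CrawlReport()
report.add_language("da")

# ... feeds are crawled and the report is filled in ...

# Later the stored date tells how old this crawl is, e.g. for a weekly summary.
if report.get_days_from_crawl_date() <= 7:
    print("this crawl falls inside the last 7 days")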

    def get_days_from_crawl_date(self):
        return (datetime.datetime.now() - self.crawl_date).days

    def __convert_str_to_dt(self, str_datetime):
        dt_parsed = datetime.datetime.strptime(str_datetime, STR_DATETIME_FORMAT)
        return dt_parsed

    def __convert_dt_to_str(self, datetime):
Reviewer: move this as a sub-method where it's used?

Author: I decided to make it a private method of the class, but maybe I will make it a class method to reflect that. It is just there to ensure the date is converted in the format expected for the file output. Let me know if you agree; otherwise I can fold it into the methods that save and load. In my mind __convert_dt_to_str and __convert_str_to_dt form a pair.
        return datetime.strftime(STR_DATETIME_FORMAT)
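As a small round-trip illustration of that pairing (again not part of the diff), the two helpers simply mirror strftime and strptime with the module-level format string:

import datetime

from crawl_summary.crawl_report import STR_DATETIME_FORMAT

now = datetime.datetime.now()
as_str = now.strftime(STR_DATETIME_FORMAT)  # what __convert_dt_to_str produces, e.g. "05_03_24_14_30_02"
back = datetime.datetime.strptime(as_str, STR_DATETIME_FORMAT)  # what __convert_str_to_dt parses
assert back == now.replace(microsecond=0)  # round-trips with one-second precision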

    def add_language(self, lang_code: str):
        self.data["lang"][lang_code] = {"feeds": {}, "total_time": None}

    def add_feed(self, lang_code: str, feed_id: int):
        if lang_code not in self.data["lang"]:
            self.add_language(lang_code)
        self.data["lang"][lang_code]["feeds"][feed_id] = {
            "article_report": {
                "sents_removed": {},
                "quality_error": {},
            },
            "last_article_date": None,
            "feed_errors": [],
            "crawl_time": None,
            "total_articles": None,
            "total_downloaded": None,
            "total_low_quality": None,
            "total_in_db": None,
        }

    def set_total_time(self, lang_code: str, total_time):
        self.data["lang"][lang_code]["total_time"] = total_time

    def add_feed_error(self, lang_code: str, feed_id: int, error: str):
        self.data["lang"][lang_code]["feeds"][feed_id]["feed_errors"].append(error)

    def set_feed_crawl_time(self, lang_code: str, feed_id: int, crawl_time):
        self.data["lang"][lang_code]["feeds"][feed_id]["crawl_time"] = crawl_time

    def set_feed_last_article_date(
        self, lang_code: str, feed_id: int, last_article_date
    ):
        self.data["lang"][lang_code]["feeds"][feed_id]["last_article_date"] = (
            self.__convert_dt_to_str(last_article_date)
        )

    def set_feed_total_articles(self, lang_code: str, feed_id: int, total_articles):
        self.data["lang"][lang_code]["feeds"][feed_id][
            "total_articles"
        ] = total_articles

    def set_feed_total_downloaded(self, lang_code: str, feed_id: int, total_downloaded):
        self.data["lang"][lang_code]["feeds"][feed_id][
            "total_downloaded"
        ] = total_downloaded

    def set_feed_total_low_quality(
        self, lang_code: str, feed_id: int, total_low_quality
    ):
        self.data["lang"][lang_code]["feeds"][feed_id][
            "total_low_quality"
        ] = total_low_quality

    def set_feed_total_in_db(self, lang_code: str, feed_id: int, total_in_db):
        self.data["lang"][lang_code]["feeds"][feed_id]["total_in_db"] = total_in_db

    def set_non_quality_reason(
        self, lang_code: str, feed_id: int, non_quality_reason_counts: dict
    ):
        self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
            "quality_error"
        ] = Counter(non_quality_reason_counts)

    def set_sent_removed(self, lang_code: str, feed_id: int, sent_removed_count: dict):
        self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
            "sents_removed"
        ] = Counter(sent_removed_count)

    def add_non_quality_reason(self, lang_code: str, feed_id: int, non_quality_reason):
        self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
            "quality_error"
        ][non_quality_reason] = (
            self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
                "quality_error"
            ].get(non_quality_reason, 0)
            + 1
        )

    def add_sent_removed(self, lang_code: str, feed_id: int, sent_removed):
        self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
            "sents_removed"
        ][sent_removed] = (
            self.data["lang"][lang_code]["feeds"][feed_id]["article_report"][
                "sents_removed"
            ].get(sent_removed, 0)
            + 1
        )

    def save_crawl_report(self):
        timestamp_str = self.__convert_dt_to_str(datetime.datetime.now())
        for lang in self.data["lang"]:
            filename = f"{lang}-crawl-{timestamp_str}.json"
            output_dir = os.path.join(self.default_save_dir, lang)
            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                json.dump(self.data["lang"], f)

    def load_crawl_report_data(self, day_period: int, report_dir_path=None):
        if report_dir_path is None:
            report_dir_path = self.default_save_dir
        for lang in os.listdir(report_dir_path):
            for file in os.listdir(os.path.join(report_dir_path, lang)):
                lang, _, date = file.split(".")[0].split("-")
                date = self.__convert_str_to_dt(date)
                self.crawl_date = min(self.crawl_date, date)
                day_diff = (date.now() - date).days
                if day_diff > day_period:
                    print(
                        f"File '{file}' outside of day range of '{day_period}', was: '{day_diff}'"
                    )
                    continue
                try:
                    with open(
                        os.path.join(report_dir_path, lang, file), "r", encoding="utf-8"
                    ) as f:
                        self.data["lang"][lang] = json.load(f)[lang]
                except Exception as e:
                    print(f"Failed to load: '{file}', with: '{e}'")

    def __validate_lang(self, lang: str):
        langs_available = set(self.data["lang"].keys())
        if lang not in langs_available:
            raise ValueError(
                f"'{lang}' is not found in current loaded data. Available langs: '{list(langs_available)}'"
            )
        return True

    def get_total_non_quality_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)

        total_counts = Counter()
        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                total_counts += Counter(
                    self.data["lang"][lang]["feeds"][feed]["article_report"][
                        "quality_error"
                    ]
                )
        return total_counts

    def get_total_removed_sents_counts(self, langs_to_load: list[str] = None):
        if langs_to_load is None:
            langs_to_load = self.data["lang"].keys()
        else:
            for lang in langs_to_load:
                self.__validate_lang(lang)
        total_counts = Counter()
        for lang in langs_to_load:
            for feed in self.data["lang"][lang]["feeds"]:
                total_counts += Counter(
                    self.data["lang"][lang]["feeds"][feed]["article_report"][
                        "sents_removed"
                    ]
                )
        return total_counts
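Read together with the crawl_date discussion above, the intended lifecycle appears to be: fill one CrawlReport per crawl run, save one JSON file per language, and later reload the files that fall inside a day window and aggregate the counters. A rough usage sketch under those assumptions (feed id, language code and quality reason are made up for the example, and the default crawl_data directory next to crawl_report.py is assumed to exist):

from crawl_summary.crawl_report import CrawlReport

# During a crawl: record per-feed results and write one file per language,
# named {lang}-crawl-{timestamp}.json under the default save directory.
report = CrawlReport()
report.add_feed("da", 42)  # feed id 42 is hypothetical
report.add_non_quality_reason("da", 42, "TOO_SHORT")  # reason string is hypothetical
report.set_feed_total_articles("da", 42, 100)
report.save_crawl_report()

# Later, in a reporting job: load everything from the last 7 days and aggregate.
weekly = CrawlReport()
weekly.load_crawl_report_data(day_period=7)  # files older than 7 days are skipped
print(weekly.get_total_non_quality_counts(["da"]))
print(weekly.get_total_removed_sents_counts())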
@@ -19,6 +19,7 @@
import traceback

from sqlalchemy.exc import PendingRollbackError
from time import time

import zeeguu.core

@@ -27,17 +28,19 @@

from zeeguu.core.content_retriever.article_downloader import download_from_feed
from zeeguu.core.model import Feed, Language
from crawl_summary.crawl_report import CrawlReport

db_session = zeeguu.core.model.db.session


def download_for_feeds(list_of_feeds):
def download_for_feeds(list_of_feeds, crawl_report):

    summary_stream = ""
    counter = 0
    all_feeds_count = len(list_of_feeds)

    for feed in list_of_feeds:
        crawl_report.add_feed(feed.language.code, feed.id)
        if feed.deactivated:
            continue

@@ -48,7 +51,12 @@ def download_for_feeds(list_of_feeds):
            log(f"{msg}")

            summary_stream += (
                download_from_feed(feed, zeeguu.core.model.db.session) + "\n\n"
                download_from_feed(
                    feed,
                    zeeguu.core.model.db.session,
                    crawl_report,
                )
                + "\n\n"
            )

        except PendingRollbackError as e:

@@ -57,27 +65,50 @@ def download_for_feeds(list_of_feeds):
                "Something went wrong and we had to rollback a transaction; following is the full stack trace:"
            )
            traceback.print_exc()
            crawl_report.add_feed_error(feed.language.code, feed.id, str(e))

        except:
        except Exception as e:
            traceback.print_exc()
            crawl_report.add_feed_error(feed.language.code, feed.id, str(e))
Reviewer: this is elegant.

    logp(f"Successfully finished processing {counter} feeds.")
    return summary_stream


def retrieve_articles_for_language(language_code, send_email=False):

    start_time = time()
    language = Language.find(language_code)
    all_language_feeds = (
        Feed.query.filter_by(language_id=language.id).filter_by(deactivated=False).all()
    )
    crawl_report = CrawlReport()
    crawl_report.add_language(language_code)

    summary_stream = download_for_feeds(all_language_feeds, crawl_report)
    if send_email:

        logp("sending summary email")

        summary_stream = download_for_feeds(all_language_feeds)
        import datetime

        mailer = ZeeguuMailer(
            f"{language.name} Crawl Summary "
            + datetime.datetime.now().strftime("%H:%M"),
            summary_stream,
            "[email protected]",
        )
        mailer.send()
    crawl_report.set_total_time(language.code, round(time() - start_time, 2))
    crawl_report.save_crawl_report()
    return crawl_report


def retrieve_articles_from_all_feeds():
    counter = 0
    all_feeds = Feed.query.all()
    download_for_feeds(all_feeds)
    crawl_report = CrawlReport()
    download_for_feeds(all_feeds, crawl_report)
    crawl_report.save_crawl_report()


if __name__ == "__main__":
Reviewer: it would be good if we parameterized this with two envvars, maybe FOLDER_WITH_CRAWL_SUMMARIES and FOLDER_FOR_REPORT_OUTPUT? Alternatively, take them as script parameters in the main script and pass them here? That way we have more control at deployment time.

Author: Yes, I am not sure. This is initialized in article_crawler.py, so I think the parameter would be passed there rather than in this file specifically. Maybe I can add a setter for the dir path, though generate_report also expects this path to be in a specific place. I could make both article_crawler and generate_report take a CrawlReportPath in case we want to change it.

Author: I decided to go with the environment variables, and I think it's working!
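For reference, a rough sketch of what the environment-variable approach could look like; the variable names come from the suggestion above, and the exact wiring into article_crawler and generate_report is an assumption, since it is not part of this diff:

import os

from crawl_summary.crawl_report import CrawlReport

# Reviewer-proposed environment variables; fall back to the class default
# (a crawl_data folder next to crawl_report.py) when they are not set.
crawl_summaries_dir = os.environ.get("FOLDER_WITH_CRAWL_SUMMARIES")
report_output_dir = os.environ.get("FOLDER_FOR_REPORT_OUTPUT")

crawl_report = CrawlReport()
if crawl_summaries_dir:
    crawl_report.default_save_dir = crawl_summaries_dir  # or via a dedicated setter

# A report generator would then read the saved crawl files from the same folder
# and write its own output under FOLDER_FOR_REPORT_OUTPUT.
crawl_report.load_crawl_report_data(day_period=7, report_dir_path=crawl_summaries_dir)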