From c2b9c79dad9c98d2db26fbc1893251d037685fbf Mon Sep 17 00:00:00 2001 From: Richard Jones Date: Fri, 13 Oct 2023 13:54:13 +0100 Subject: [PATCH] add significant logging to journal csv process --- portality/bll/services/journal.py | 31 ++++++++++++++++++++++++------- portality/models/background.py | 9 ++++++--- portality/scripts/journalcsv.py | 9 ++++++++- portality/store.py | 7 +++++-- portality/tasks/journal_csv.py | 10 +++++++--- 5 files changed, 50 insertions(+), 16 deletions(-) diff --git a/portality/bll/services/journal.py b/portality/bll/services/journal.py index f9b6eefbab..c1d005300d 100644 --- a/portality/bll/services/journal.py +++ b/portality/bll/services/journal.py @@ -8,7 +8,7 @@ from portality import lock from portality.bll.doaj import DOAJ from portality.lib.dates import FMT_DATETIME_SHORT -from portality.store import StoreFactory, prune_container +from portality.store import StoreFactory, prune_container, StoreException from portality.crosswalks.journal_questions import Journal2QuestionXwalk from datetime import datetime @@ -115,7 +115,7 @@ def journal(self, journal_id, lock_journal=False, lock_account=None, lock_timeou return journal, the_lock - def csv(self, prune=True): + def csv(self, prune=True, logger=None): """ Generate the Journal CSV @@ -127,39 +127,51 @@ def csv(self, prune=True): """ # first validate the incoming arguments to ensure that we've got the right thing argvalidate("csv", [ - {"arg": prune, "allow_none" : False, "arg_name" : "prune"} + {"arg": prune, "allow_none" : False, "arg_name" : "prune"}, + {"arg": logger, "allow_none": True, "arg_name": "logger"} ], exceptions.ArgumentException) # ~~->FileStoreTemp:Feature~~ filename = 'journalcsv__doaj_' + dates.now_str(FMT_DATETIME_SHORT) + '_utf8.csv' container_id = app.config.get("STORE_CACHE_CONTAINER") tmpStore = StoreFactory.tmp() - out = tmpStore.path(container_id, filename, create_container=True, must_exist=False) + try: + out = tmpStore.path(container_id, filename, create_container=True, must_exist=False) + logger("Temporary CSV will be written to {x}".format(x=out)) + except StoreException as e: + logger("Could not create temporary CSV file: {x}".format(x=e)) + raise e with open(out, 'w', encoding='utf-8') as csvfile: - self._make_journals_csv(csvfile) + self._make_journals_csv(csvfile, logger=logger) + logger("Wrote CSV to output file {x}".format(x=out)) # ~~->FileStore:Feature~~ mainStore = StoreFactory.get("cache") try: mainStore.store(container_id, filename, source_path=out) url = mainStore.url(container_id, filename) + logger("Stored CSV in main cache store at {x}".format(x=url)) finally: tmpStore.delete_file(container_id, filename) # don't delete the container, just in case someone else is writing to it + logger("Deleted file from tmp store") action_register = [] if prune: + logger("Pruning old CSVs from store") def sort(filelist): rx = "journalcsv__doaj_(.+?)_utf8.csv" return sorted(filelist, key=lambda x: datetime.strptime(re.match(rx, x).groups(1)[0], FMT_DATETIME_SHORT), reverse=True) def _filter(f_name): return f_name.startswith("journalcsv__") - action_register = prune_container(mainStore, container_id, sort, filter=_filter, keep=2) + action_register = prune_container(mainStore, container_id, sort, filter=_filter, keep=2, logger=logger) + logger("Pruned old CSVs from store") # update the ES record to point to the new file # ~~-> Cache:Model~~ models.Cache.cache_csv(url) + logger("Stored CSV URL in ES Cache") return url, action_register def admin_csv(self, file_path, account_sub_length=8, obscure_accounts=True, add_sensitive_account_info=False): @@ -207,11 +219,12 @@ def acc_email(j): self._make_journals_csv(f, extra_cols) @staticmethod - def _make_journals_csv(file_object, additional_columns=None): + def _make_journals_csv(file_object, additional_columns=None, logger=None): """ Make a CSV file of information for all journals. :param file_object: a utf8 encoded file object. """ + logger = logger if logger is not None else lambda x: x YES_NO = {True: 'Yes', False: 'No', None: '', '': ''} def _get_doaj_meta_kvs(journal): @@ -243,6 +256,8 @@ def _get_article_kvs(journal): # ~~!JournalCSV:Feature->Journal:Model~~ cols = {} for j in models.Journal.all_in_doaj(page_size=1000): #Fixme: limited by ES, this may not be sufficient + logger("Exporting journal {x}".format(x=j.id)) + bj = j.bibjson() issn = bj.get_one_identifier(idtype=bj.P_ISSN) if issn is None: @@ -265,6 +280,7 @@ def _get_article_kvs(journal): toc_kv = _get_doaj_toc_kv(j) cols[issn].insert(2, toc_kv) + logger("All journals exported") issns = cols.keys() csvwriter = csv.writer(file_object) @@ -275,4 +291,5 @@ def _get_article_kvs(journal): csvwriter.writerow(qs) vs = [v for _, v in cols[i]] csvwriter.writerow(vs) + logger("CSV Written") diff --git a/portality/models/background.py b/portality/models/background.py index ac3d3bfc65..604eccc95d 100644 --- a/portality/models/background.py +++ b/portality/models/background.py @@ -152,13 +152,16 @@ def pretty_audit(self): class StdOutBackgroundJob(BackgroundJob): - def __init__(self, inner): + def __init__(self, inner, force_logging=False): super(StdOutBackgroundJob, self).__init__(**inner.data) + self._force_logging = force_logging def add_audit_message(self, msg, timestamp=None): super(StdOutBackgroundJob, self).add_audit_message(msg, timestamp) - if app.config.get("DOAJENV") == 'dev': - print(msg) + if app.config.get("DOAJENV") == 'dev' or self._force_logging: + if timestamp is None: + timestamp = dates.now_str_with_microseconds() + print("[" + timestamp + "] " + msg) # ~~-> DataObj:Library~~ diff --git a/portality/scripts/journalcsv.py b/portality/scripts/journalcsv.py index 7c00cbdf41..dedfb51c9b 100644 --- a/portality/scripts/journalcsv.py +++ b/portality/scripts/journalcsv.py @@ -9,10 +9,17 @@ exit() user = app.config.get("SYSTEM_USERNAME") + print("Running journal CSV export for user {}".format(user)) + job = journal_csv.JournalCSVBackgroundTask.prepare(user) - job = StdOutBackgroundJob(job) + job = StdOutBackgroundJob(job, force_logging=True) + print("Background Job prepared with id {}".format(job.id)) + task = journal_csv.JournalCSVBackgroundTask(job) + print("Background task created") + BackgroundApi.execute(task) + print("Finished journal CSV export for user {}".format(user)) diff --git a/portality/store.py b/portality/store.py index 90300fb1aa..2d0935ee19 100644 --- a/portality/store.py +++ b/portality/store.py @@ -292,7 +292,8 @@ def list_container_ids(self): return [x for x in os.listdir(self.dir) if os.path.isdir(os.path.join(self.dir, x))] -def prune_container(storage, container_id, sort, filter=None, keep=1): +def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None): + logger = logger if logger is not None else lambda x: x action_register = [] filelist = storage.list(container_id) @@ -316,7 +317,9 @@ def prune_container(storage, container_id, sort, filter=None, keep=1): #action_register.append("Considering files for retention in the following order: " + ", ".join(filtered_sorted)) remove = filtered_sorted[keep:] - action_register.append("Removed old files: " + ", ".join(remove)) + msg = "Removed old files: " + ", ".join(remove) + action_register.append(msg) + logger(msg) for fn in remove: storage.delete_file(container_id, fn) diff --git a/portality/tasks/journal_csv.py b/portality/tasks/journal_csv.py index e863aeb9c4..9b5b74269d 100644 --- a/portality/tasks/journal_csv.py +++ b/portality/tasks/journal_csv.py @@ -15,12 +15,16 @@ def run(self): Execute the task as specified by the background_job :return: """ + + def logger(msg): + self.background_job.add_audit_message(msg) + job = self.background_job journalService = DOAJ.journalService() - url, action_register = journalService.csv() - for ar in action_register: - job.add_audit_message(ar) + url, action_register = journalService.csv(logger=logger) + # for ar in action_register: + # job.add_audit_message(ar) job.add_audit_message("CSV generated; will be served from {y}".format(y=url)) def cleanup(self):