From 50a9a314b0e6ed2dc181ec107a1fbddd4e73b1b2 Mon Sep 17 00:00:00 2001 From: Amanda Goldani Date: Thu, 11 Nov 2021 02:00:25 +0100 Subject: [PATCH 01/17] Set custom logger in McMClient --- src/python/Services/McM/McMClient.py | 3 +-- src/python/Utilities/DataTools.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/python/Services/McM/McMClient.py b/src/python/Services/McM/McMClient.py index 879d6be69..ed2bb4c2b 100644 --- a/src/python/Services/McM/McMClient.py +++ b/src/python/Services/McM/McMClient.py @@ -5,7 +5,7 @@ from logging import Logger import logging -from logging import getLogger # from Utilities.Logging import getLogger +from Utilities.Logging import getLogger from typing import Optional @@ -19,7 +19,6 @@ class McMClient(object): def __init__(self, logger: Optional[Logger] = None, **kwargs) -> None: try: super().__init__() - logging.basicConfig(level=logging.INFO) self.logger = logger or getLogger(self.__class__.__name__) self.devMode = os.getenv("UNIFIED_MCM") == "dev" or kwargs.get("dev") or False diff --git a/src/python/Utilities/DataTools.py b/src/python/Utilities/DataTools.py index 826391f96..68c65d608 100644 --- a/src/python/Utilities/DataTools.py +++ b/src/python/Utilities/DataTools.py @@ -254,3 +254,17 @@ def flattenTaskTree(task: str, **selectParam) -> list: allTasks.extend(flattenTaskTree(childSpec, **selectParam)) return allTasks + +def unnestList(lst: list) -> list: + """ + The function to unnest a list of lists + :lst: list of lists + :return: unnested list + """ + allItems = [] + for item in lst: + if isinstance(item, lst): + allItems.extend(unnestList(item)) + elif item is not None: + allItems.extend(item) + return allItems \ No newline at end of file From 6a796415b7ca9743c257f1046f125d4f589bbfc9 Mon Sep 17 00:00:00 2001 From: Amanda Goldani Date: Mon, 6 Dec 2021 04:32:56 +0100 Subject: [PATCH 02/17] Migrate checkor --- src/python/Utilities/DataTools.py | 17 +- src/python/Utilities/Logging.py | 32 +- 
src/python/WorkflowMgmt/WorkflowController.py | 19 + unified/Checkor.py | 1466 +++++++++++++++++ 4 files changed, 1518 insertions(+), 16 deletions(-) create mode 100644 unified/Checkor.py diff --git a/src/python/Utilities/DataTools.py b/src/python/Utilities/DataTools.py index 68c65d608..1be3842f8 100644 --- a/src/python/Utilities/DataTools.py +++ b/src/python/Utilities/DataTools.py @@ -255,6 +255,21 @@ def flattenTaskTree(task: str, **selectParam) -> list: return allTasks + +def flattenDictKeys(dct: dict, values: set) -> set: + """ + The function to flatten dict keys + :param dct: a dict + :param values: values + :return: set of keys + """ + allValues = set(values) + for value in allValues: + if value in dct: + allValues.update(flattenDictKeys(dct, dct[value])) + return allValues + + def unnestList(lst: list) -> list: """ The function to unnest a list of lists @@ -267,4 +282,4 @@ def unnestList(lst: list) -> list: allItems.extend(unnestList(item)) elif item is not None: allItems.extend(item) - return allItems \ No newline at end of file + return allItems diff --git a/src/python/Utilities/Logging.py b/src/python/Utilities/Logging.py index 54f1ccbf1..accd56eb2 100644 --- a/src/python/Utilities/Logging.py +++ b/src/python/Utilities/Logging.py @@ -240,25 +240,27 @@ def getLogger(name: str, level: str = "INFO", flushEveryLog: bool = True, **kwar :return: a logger """ logger = logging.getLogger(name) - logger.setLevel(level) - streamHandler = logging.StreamHandler() - streamHandler.setLevel(level=level) - streamHandler.setFormatter(logging.Formatter("[%(asctime)s:%(name)s:%(module)s] %(levelname)s: %(message)s")) - logger.addHandler(streamHandler) + if not len(logger.handlers): + logger.setLevel(level) - if kwargs.get("addWfLevel"): - logging.addLevelName(100, "WORKFLOW") - logging.workflow = getWorkflowLogLevel - logging.Logger.workflow = getWorkflowLogLevel + streamHandler = logging.StreamHandler() + streamHandler.setLevel(level=level) + 
streamHandler.setFormatter(logging.Formatter("[%(asctime)s:%(name)s:%(module)s] %(levelname)s: %(message)s")) + logger.addHandler(streamHandler) - if kwargs.get("elasticSearch"): - esHandler = ElasticSearchHandler(level=kwargs.get("elasticSearch") or level, flushEveryLog=flushEveryLog) - logger.addHandler(esHandler) + if kwargs.get("addWfLevel"): + logging.addLevelName(100, "WORKFLOW") + logging.workflow = getWorkflowLogLevel + logging.Logger.workflow = getWorkflowLogLevel - if kwargs.get("email"): - emailHandler = EmailHandler(level=kwargs.get("email") or level, flushEveryLog=flushEveryLog) - logger.addHandler(emailHandler) + if kwargs.get("elasticSearch"): + esHandler = ElasticSearchHandler(level=kwargs.get("elasticSearch") or level, flushEveryLog=flushEveryLog) + logger.addHandler(esHandler) + + if kwargs.get("email"): + emailHandler = EmailHandler(level=kwargs.get("email") or level, flushEveryLog=flushEveryLog) + logger.addHandler(emailHandler) return logger diff --git a/src/python/WorkflowMgmt/WorkflowController.py b/src/python/WorkflowMgmt/WorkflowController.py index 74a92d5a8..d392eb9a6 100644 --- a/src/python/WorkflowMgmt/WorkflowController.py +++ b/src/python/WorkflowMgmt/WorkflowController.py @@ -458,6 +458,25 @@ def getOutputDatasetsPerTask(self) -> dict: """ return self.request.getOutputDatasetsPerTask(self.getWorkTasks()) + def getCampaignsFromOutputDatasets(self) -> dict: + """ + The function to get the campaigns based on the output datasets + :return: a dict of campaigns by dataset names + """ + campaigns = {} + for dataset in self.request.get("OutputDatasets", []): + campaign = self.request.get("Campaign") + if not self.isRelVal(): + try: + era = dataset.split('/')[2].split('-')[0] + except: + era = None + campaign = era if era else campaign + + campaigns[dataset] = campaign + + return campaigns + def getCampaignByTask(self, task: str) -> str: """ The function to get the campaign for a given task diff --git a/unified/Checkor.py b/unified/Checkor.py 
new file mode 100644 index 000000000..800b8870e --- /dev/null +++ b/unified/Checkor.py @@ -0,0 +1,1466 @@ +import os +import re +import optparse +import random +import json +import math +from logging import Logger +from collections import defaultdict +from time import mktime, asctime, gmtime, struct_time +from jinja2.runtime import F + +from sqlalchemy.sql.expression import true + +from Utilities.DataTools import unnestList, flattenDictKeys +from Utilities.Decorators import runWithMultiThreading, runWithRetries +from Utilities.Logging import getLogger +from Utilities.ConfigurationHandler import ConfigurationHandler +from Cache.DataCacheLoader import DataCacheLoader +from Databases.Oracle.OracleClient import OracleClient +from Databases.Oracle.OracleDB import Workflow +from MongoControllers.CampaignController import CampaignController +from MongoControllers.CloseoutController import CloseoutController +from MongoControllers.SiteController import SiteController +from MongoControllers.WTCController import WTCController +from Services.ServicesChecker import ServicesChecker +from Services.McM.McMClient import McMClient +from Services.ReqMgr.ReqMgrReader import ReqMgrReader +from Services.ACDC.ACDCReader import ACDCReader +from Services.DBS.DBSReader import DBSReader +from Services.DBS.DBSWriter import DBSWriter +from Services.Rucio.RucioReader import RucioReader +from WorkflowMgmt.UserLockChecker import UserLockChecker +from WorkflowMgmt.WorkflowController import WorkflowController +from WorkflowMgmt.WorkflowStatusEnforcer import WorkflowStatusEnforcer + +from typing import Optional, Tuple, Union + + +class Checkor(OracleClient): + """ + __Checkor__ + General API for checking workflows in completed status + """ + + def __init__(self, logger: Optional[Logger] = None, **kwargs) -> None: + try: + super().__init__(self) + self.logger = logger or getLogger(self.__class__.__name__) + + self.options, self.specificWf = kwargs.get("options"), kwargs.get("specificWf") + if 
self.options is None: + self.options, self.specificWf = self.parseOptions() + + now = mktime(gmtime()) + self.timePoint = {"lap": now, "subLap": now, "start": now} + + self.unifiedConfiguration = ConfigurationHandler("config/unifiedConfiguration.json") + + self.reqMgrReader = ReqMgrReader() + self.acdcReader = ACDCReader() + self.dbs = {"reader": DBSReader(), "writer": DBSWriter()} + + self.dataCacheLoader = DataCacheLoader() + self.mcmClient = McMClient() if self.useMcM else None + + self.campaignController = CampaignController() + self.closeoutController = CloseoutController() + self.siteController = SiteController() + self.wtcController = WTCController() + + self.logMsg = { + "strict": "Strict option is on: checking workflows that freshly completed", + "update": "Update option is on: checking workflows that have not completed yet", + "clear": "Clear option is on: checking workflows that are ready to toggle closed-out", + "reviewRecover": "Review-recovering option is on: checking only the workflows that had been already acted on", + "reviewManual": "Review-manual option is on: checking the workflows to be acted on", + "timeCheck": "Time check (%s) point at: %s", + "sinceStart": "Since start: %s [s]", + "lap": "%s: %s [s]", + "notAllowed": "%s is not allowed to %s", + "allowed": "%s allowed to %s %s", + "nWfs": "Considering %s workflows (before any limitation)", + "limitOption": "Command line to limit workflows to %s", + "limit": "Limiting workflows to %s this round", + "startCheck": "Starting checkor with {}", + "setClose": "%s is already %s, setting as close", + "setForget": "%s is %s, but will not be set in trouble to find a replacement", + "setTrouble": "%s is %s, setting as trouble", + "setOnHold": "%s is %s, setting as on hold", + "notRunning": "%s is %s, it is not running yet", + "skipCheck": "No need to check %s in status %s", + "bypassChecks": "Bypassing checks on %s because of keyword %s", + "bypassForceComplete": "Bypassing checks and force completing 
%s because of %s %s", + "failedCheck": "Failed on checking %s", + "onHoldSince": "On hold since %s, timeout at %s", + "bypassOnHold": "%s is on hold and stopped for %.2f days, letting this through with current statistics", + "onHold": "%s is on hold", + "campaigns": "Campaigns: %s", + "checkSummary": "Initial outputs: %s\nWill check on: %s\nTiers out: %s\n, Tiers no custodials: %s", + "inconsistentAcdc": "Inconsistent ACDC %s", + "acdc": "%s still has an ACDC running", + "failedAcdc": "For %s, ACDC %s failed", + "badAcdc": "For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.", + "forceRecovering": "%s is being force completed while recovering", + "forceComplete": "The workflow %s was force completed", + "missingTotalEvents": "TotalInputEvents is missing from the workload of %s", + "sinceCompleted": "%s days since completed", + "earlyAnnounce": "Allowed to announce the output %s over %.2f by campaign requirement", + "overrideFraction": "Overriding fraction to %s for %s by %s", + "truncateRecovery": "Allowed to truncate recovery of %s over %.2f by campaign requirement", + "fractionDamping": "The passing fraction could be reduced to %s given it has been in for long", + "reducePass": "Reducing pass threshold by %.3f%% for long lasting workflow %s", + "truncatePass": "Truncating at pass threshold because of ACDC at rank %d", + } + + self.useMcM = False + self.exceptions = set() + self._setWfs() + # TODO: update JIRA tickets + + except Exception as error: + raise Exception(f"Error initializing Checkor\n{str(error)}") + + @staticmethod + def parseOptions() -> Tuple[dict, Optional[str]]: + """ + The function to parse the Checkor's options and specific workflow + :return: options and the specific workflow, if any + """ + parser = optparse.OptionParser() + + parser.add_option("--go", help="Does not check on duplicate process", action="store_true", default=False) + parser.add_option( + "--update", + help="Running workflows that have not yet reached 
completed", + action="store_true", + default=False, + ) + parser.add_option( + "--strict", help="Only running workflows that reached completed", action="store_true", default=False + ) + parser.add_option( + "--clear", help="Only running workflows that have reached custodial", action="store_true", default=False + ) + parser.add_option( + "--review", + help="Look at the workflows that have already completed and had required actions", + action="store_true", + default=False, + ) + parser.add_option( + "--recovering", + help="Look at the workflows that already have on-going acdc", + action="store_true", + default=False, + ) + parser.add_option("--manual", help='Look at the workflows in "manual"', action="store_true", default=False) + parser.add_option("--limit", help="The number of workflows to consider for checking", default=0, type=int) + parser.add_option("--threads", help="The number of threads for processing workflows", default=10, type=int) + parser.add_option( + "--fractionPass", help="The completion fraction that is permitted", default=0.0, type="float" + ) + parser.add_option("--lumiSize", help="Force the upper limit on lumisection", default=0, type="float") + parser.add_option( + "--ignoreFiles", help="Force ignoring dbs/rucio differences", action="store_true", default=False + ) + parser.add_option( + "--ignoreInvalid", help="Force ignoring high level of invalid files", action="store_true", default=False + ) + parser.add_option( + "--ignoreDuplicates", help="Force ignoring lumi duplicates", action="store_true", default=False + ) + parser.add_option("--tapeSizeLimit", help="The limit in size of all outputs", default=0, type=int) + parser.add_option("--html", help="Build the the monitor page", action="store_true", default=False) + parser.add_option( + "--noReport", help="Prevent from making the error report", action="store_true", default=False + ) + parser.add_option( + "--backfill", help="To run in test mode (only with backfill workflows)", action="store_true", 
default=False + ) + + options, args = parser.parse_args() + options = vars(options) + + options["manual"] = not options.get("recovering") + actions = ["strict", "update", "clear", "review"] + if all(not options.get(option) for option in actions): + for option in actions + ["recovering", "manual"]: + options[option] = True + + return options, args[0] if args else None + + def _setWfs(self) -> None: + """ + The function to set workflows to override, on hold, bypass and force complete. + """ + self.overrideWfs = self._getWorkflowsByAction("force-complete", details=True) + self.onHoldWfs = self._getWorkflowsByAction("hold") + self.bypassWfs = self._getWorkflowsByAction("bypass") + unnestList(self.overrideWfs) + self.forceCompleteWfs = self.mcmClient.get("/restapi/requests/forcecomplete") if self.useMcM else {} + + def _checkPoint(self, label: str = "", subLap: bool = False, now: struct_time = gmtime()) -> None: + """ + The function to get the check points + :label: log message label + :subLap: True if for sub lap, False o/w + :now: time now + """ + self.logger.info(self.logMsg["timeCheck"], label, asctime(now)) + self.logger.info(self.logMsg["sinceStart"], now - self.timePoint.get("start", now)) + + self.logger.info( + self.logMsg["lap"], + "Sub lap" if subLap else "Lap", + now - self.timePoint.get("subLap" if subLap else "lap", now), + ) + self.timePoint["subLap"] = now + if not subLap: + self.timePoint["lap"] = now + + def _filterBackfills(self, workflows: list) -> list: + """ + The function to filter only backfill workflows + :workflows: workflows + :return: backfill workflows + """ + if self.options.get("backfill"): + self.logger.info(self.logMsg["backfill"]) + return [wf for wf in workflows if "backfill" in wf.name.lower()] + return workflows + + def _getWorkflowsToCheck(self) -> list: + """ + The function to get the workflows to check + :return: workflows + """ + workflows = set() + + awayWfs = self.session.query(Workflow).filter(Workflow.status == "away").all() 
+ assistanceWfs = self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all() + completedWfs = self.reqMgrReader.getWorkflowsByStatus("completed") + + if self.options.get("strict"): + self.logger.info(self.logMsg["strict"]) + workflows.update(filter(lambda wf: wf.name in completedWfs, awayWfs)) + + if self.options.get("update"): + self.logger.info(self.logMsg["update"]) + workflows.update(filter(lambda wf: wf.name not in completedWfs, awayWfs)) + + if self.options.get("clear"): + self.logger.info(self.logMsg["clear"]) + workflows.update(filter(lambda wf: "custodial" in wf.status, assistanceWfs)) + + if self.options.get("review"): + nonCustodialWfs = [*filter(lambda wf: "custodial" not in wf.status, assistanceWfs)] + if self.options.get("recovering"): + self.logger.info(self.logMsg["reviewRecover"]) + workflows.update(filter(lambda wf: "manual" not in wf.status, nonCustodialWfs)) + if self.options.get("manual"): + self.logger.info(self.logMsg["reviewManual"]) + workflows.update(filter(lambda wf: "manual" in wf.status, nonCustodialWfs)) + + return list(workflows) + + def _getWorkflowsByAction(self, action: str, details: bool = False) -> Union[list, dict]: + """ + The function to get workflows for a given user action + :action: user action + :details: if True return dict of workflows by user. 
O/w return workflows names
+        :return: workflows
+        """
+        workflows = defaultdict(set)
+
+        allowedUsers = [*self.unifiedConfiguration.get("allowed_bypass", {}).keys()]
+
+        userWfs = (
+            self.wtcController.getHold()
+            if action == "hold"
+            else self.wtcController.getBypass()
+            if action == "bypass"
+            else self.wtcController.getForce()
+            if action == "force-complete"
+            else {}
+        )
+        for user, wf in userWfs.items():
+            if user not in allowedUsers:
+                self.logger.info(self.logMsg["notAllowed"], user, action)
+                continue
+
+            self.logger.info(self.logMsg["allowed"], user, action, wf)
+            workflows[user].add(wf)
+
+        if details:
+            return workflows
+        return unnestList(workflows)
+
+    def _filterMaxNumberOfWorkflows(self, workflows) -> list:
+        maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor")
+        if self.options.get("limit"):
+            self.logger.info(self.logMsg["limitOption"], self.options.get("limit"))
+            maxPerRound = self.options.get("limit")
+
+        if maxPerRound and not self.specificWf:
+            self.logger.info(self.logMsg["limit"], maxPerRound)
+
+            workflows = self._rankWorkflows(workflows)
+            if self.options.get("update"):
+                random.shuffle(workflows)
+            return workflows[:maxPerRound]
+
+        return workflows
+
+    def _rankWorkflows(self, workflows: list) -> list:
+        """
+        The function to rank the workflows by their priority
+        :param workflows: workflows
+        :return: sorted workflows
+        """
+        completedWfs = self.reqMgrReader.getWorkflowsByStatus("completed", details=True)
+        completedWfs = sorted(completedWfs, key=lambda wf: wf.get("RequestPriority", 0))
+        completedWfs = [wf.get("RequestName") for wf in completedWfs]
+
+        return sorted(workflows, key=lambda wf: completedWfs.index(wf) if wf in completedWfs else 0, reverse=True)
+
+    def _updateWorkflowsRecords(self, wfsRecords: dict) -> None:
+        for wf in wfsRecords:
+            if wf.get("record"):
+                self.closeoutController.set(wf.get("wf"), wf.get("record"))
+
+    def _updateWorkflowsStatus(self, wfsStatus) -> None:
+        for wf in wfsStatus:
+
if wf.get("newStatus"): + newStatus = wf.get("newStatus") + wf["workflow"].status = newStatus + self.session.commit() + + if newStatus == "close": + self.closeoutController.clean(wf.get("wf")) + if self.useMcM and wf.get("mcmForceComplete"): + for prepId in wf.get("prepIds"): + self.mcmClient.clean(f"/restapi/requests/forcecomplete/{prepId}") + + def _checkExecutionTime(self, nWfs: int, now: struct_time = mktime(gmtime())) -> None: + if nWfs: + avgExecutionTime = (now - self.timePoint.get("start")) / nWfs + self.logger.info("Average time spend per workflow: %s s", avgExecutionTime) + + if avgExecutionTime > 120: + self.logger.critical("Checkor took %s [s] per workflow", avgExecutionTime) + + def _countAssistanceWorkflowsByStatus(self) -> dict: + status = defaultdict(int) + for wf in self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all(): + status[wf.status] += 1 + + return status + + def _writeSummary(self) -> str: + if not self.specificWf: + msg = "" + if self.options.get("strict"): + msg += "Workflows which just got in completed were looked at. Look in manual.\n" + if self.options.get("update"): + msg += "Workflows that are still running (and not completed) got looked at.\n" + if self.options.get("clear"): + msg += "Workflows that just need to close-out were verified. 
Nothing too new a-priori.\n" + if self.options.get("review"): + msg += "Workflows under intervention got review.\n" + + msg += "\n".join([f"{count} in status {status}" for status, count in self._countAssistanceWorkflowsByStatus()]) + + self.logger.info(msg) + + def go(self) -> bool: + """ + The function to check if the checkor can go + :return: True if it can go, False o/w + """ + try: + userLockChecker = UserLockChecker() + servicesChecker = ServicesChecker(softServices=["mcm", "wtc"]) + + if not userLockChecker.isLocked() and servicesChecker.check(): + self.useMcM = servicesChecker.status.get("mcm") + return True + + return False + + except Exception as error: + self.logger.error("Failed to check if Checkor can go") + self.logger.error(str(error)) + + def run(self) -> None: + """ + The function to run checkor + """ + try: + wfsToCheck = self._filterBackfills(self._getWorkflowsToCheck()) + random.shuffle(wfsToCheck) + + self.logger.info(self.logMsg["nWfs"], len(wfsToCheck)) + + wfsToCheck = self._filterMaxNumberOfWorkflows(wfsToCheck) + self._check(wfsToCheck) + + self._writeSummary() + + except Exception as error: + self.logger.error("Failed to run checkor") + self.logger.error(str(error)) + + def _check(self, wfsToCheck: list) -> None: + """ + The wrapper function to check the workflows + :param wfsToCheck: workflows to check + """ + + @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=len(wfsToCheck)) + def _checkWorkflow(self, wfsToCheck: list) -> dict: + return WorkflowCheckor(wfsToCheck, checkor=self).check() + + checkResponses = _checkWorkflow(wfsToCheck) + self._updateWorkflowsRecords(checkResponses) + self._updateWorkflowsStatus(checkResponses) + + self._checkExecutionTime(len(wfsToCheck)) + +class WorkflowCheckor: + """ + __WorkflowCheckor__ + General API for checking a fiven workflow + """ + + def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwargs) -> None: + try: + super().__init__(self) + self.logger = logger or 
getLogger(self.__class__.__name__)
+            self.now = mktime(gmtime())
+
+            self.wf = wfToCheck.name
+            self.wfToCheck = wfToCheck
+            self.wfController = WorkflowController(wfToCheck.name)
+            self.wfToCheck.wm_status = self.wfController.request.get("RequestStatus")
+
+            self.checkor = kwargs.get("checkor")
+            self.rucioReader = RucioReader()
+
+            self.existingAssistaceTags = set(wfToCheck.status.split("-")[1:])
+            self.assistanceTags = set()
+            self.outputDatasetsToCheck = list()
+            self.acdcWfs = dict()
+            self.fractions = dict()
+            self.campaigns = dict()
+            self.eventsPerLumi = dict()
+            self.percentCompletions, self.percentAvgCompletions = dict(), dict()
+            self.expectedLumis, self.expectedEvents = dict(), dict()
+            self.producedLumis, self.producedEvents = dict(), dict()
+            self.passStatsCheck, self.passStatsCheckToAnnounce, self.passStatsCheckOverCompletion = dict(), dict(), dict()
+            self.lumiUpperLimit = dict()
+            self.rucioPresence = dict()
+            self.dbsPresence, self.dbsInvalid = dict(), dict()
+
+            self.failed = False
+            self.isClosing = False
+            self.newStatus = None
+            self._setBypassChecks()
+            self._setBypassChecksByMcMForceComplete(self.wfController.getPrepIDs())
+            self._setBypassChecksByUserForceComplete()
+
+            self.record = {}
+
+        except Exception as error:
+            raise Exception(f"Error initializing WorkflowCheckor\n{str(error)}")
+
+    def _skipWorkflow(self) -> bool:
+        """
+        The function to check if a given workflow should be skipped or not
+        :return: True if workflow should be skipped, False o/w
+        """
+        if (
+            (self.checkor.specificWf and self.checkor.specificWf not in self.wf)
+            or os.path.isfile(".checkor_stop")
+            or self.wfToCheck.wm_status in ["assigned", "acquired"]
+            or (self.wfToCheck.wm_status != "completed" and self.wf not in self.checkor.exceptions)
+        ):
+            self.logger.info("Skipping workflow %s", self.wf)
+            return True
+        return False
+
+    def _setWorkflowToClose(self) -> bool:
+        """
+        The function to check if a given workflow should be closed
+        :return: True if workflow should be closed,
False o/w
+        """
+        if self.wfToCheck.wm_status in ["closed-out", "announced"] and self.wf not in self.checkor.exceptions:
+            self.logger.info(self.checkor.logMsg["setClose"], self.wf, self.wfToCheck.wm_status)
+            self.newStatus = "close"
+            return True
+        return False
+
+    def _setWorkflowToForget(self) -> bool:
+        """
+        The function to check if a given workflow should be forgotten
+        :return: True if workflow should be forgotten, False o/w
+        """
+        if self.wfController.request.isRelVal() and self.wfToCheck.wm_status in [
+            "failed",
+            "aborted",
+            "aborted-archived",
+            "rejected",
+            "rejected-archived",
+            "aborted-completed",
+        ]:
+            self.logger.info(self.checkor.logMsg["setForget"], self.wf, self.wfToCheck.wm_status)
+            self.newStatus = "forget"
+            return True
+        return False
+
+    def _setWorkflowToTrouble(self) -> bool:
+        """
+        The function to check if a given workflow should be set as trouble
+        :return: True if workflow should be set as trouble, False o/w
+        """
+        if not self.wfController.request.isRelVal() and self.wfToCheck.wm_status in [
+            "failed",
+            "aborted",
+            "aborted-archived",
+            "rejected",
+            "rejected-archived",
+            "aborted-completed",
+        ]:
+            self.logger.info(self.checkor.logMsg["setTrouble"], self.wf, self.wfToCheck.wm_status)
+            self.newStatus = "trouble"
+            return True
+        return False
+
+    def _setBypassChecks(self) -> None:
+        """
+        The function to check if bypass checks for a given workflow.
+        """
+        self.bypassChecks = False
+        for bypassWf in self.checkor.bypassWfs:
+            if bypassWf == self.wf:
+                self.logger.info(self.checkor.logMsg["bypassChecks"], self.wf, bypassWf)
+                self.bypassChecks = True
+
+    def _setBypassChecksByMcMForceComplete(self, prepIds: list) -> None:
+        """
+        The function to check if bypass checks for a given workflow because of McM force-complete status.
+        :param prepIds: list of prep ids
+        """
+        self.bypassChecksByMcMForceComplete = False
+        for forceCompleteWf in self.checkor.forceCompleteWfs:
+            if forceCompleteWf in prepIds:
+                self.logger.info(self.checkor.logMsg["bypassForceComplete"], self.wf, "prepid", forceCompleteWf)
+                self.bypassChecksByMcMForceComplete = True
+
+    def _setBypassChecksByUserForceComplete(self) -> None:
+        """
+        The function to check if bypass checks for a given workflow because of user force-complete status.
+        """
+        self.bypassChecksByUserForceComplete = False
+        for user, userOverrideWfs in self.checkor.overrideWfs.items():
+            forceCompleteWfs = [overrideWf for overrideWf in userOverrideWfs if overrideWf in self.wf]
+            if forceCompleteWfs:
+                self.logger.info(
+                    self.checkor.logMsg["bypassForceComplete"], self.wf, "user/keyword", f"{user}/{forceCompleteWfs[0]}"
+                )
+                self.bypassChecksByUserForceComplete = True
+
+    def _isWorkflowOnHold(self) -> bool:
+        """
+        The function to check if a given workflow is on hold
+        :return: True if workflow is on hold, False o/w
+        """
+        return "-onhold" in self.wfToCheck.wm_status or self.wf in self.checkor.onHoldWfs
+
+    def _getMinFamilyCompletedDelay(self, family: list) -> float:
+        """
+        The function to get how long a given workflow in the family has been requested to complete.
:param family: workflow family
+        :return: how long a given family workflow has been requested to complete
+        """
+        completedFamily = [
+            [*filter(lambda wf: wf.get("Status") == "completed", member["RequestTransition"])] for member in family
+        ]
+        return min(
+            [
+                (self.now - completed[-1].get("UpdateTime", self.now)) / 86400.0 if completed else 0
+                for completed in completedFamily
+            ]
+        )
+
+
+    def _setNewOnHoldStatus(self, family: list) -> bool:
+        """
+        The function to get on hold status for a given workflow
+        :param family: workflow family
+        :return: if set new status
+        """
+        onHoldTimeout = self.checkor.unifiedConfiguration.get("onhold_timeout")
+        minFamilyCompletedDelay = self._getMinFamilyCompletedDelay(family)
+
+        self.logger.info(self.checkor.logMsg["onHoldSince"], minFamilyCompletedDelay, onHoldTimeout)
+
+        if onHoldTimeout > 0 and onHoldTimeout < minFamilyCompletedDelay:
+            self.logger.info(self.checkor.logMsg["bypassOnHold"], self.wf, minFamilyCompletedDelay)
+            self.bypassChecks = True
+            return False
+
+        if "-onhold" in self.wfToCheck.wm_status and self.wf in self.checkor.onHoldWfs and not self.bypassChecks:
+            self.logger.info(self.checkor.logMsg["onHold"], self.wf)
+            return True
+
+        if self.wf in self.checkor.onHoldWfs and not self.bypassChecks:
+            self.logger.info(self.checkor.logMsg["setOnHold"], self.wf, self.wfToCheck.wm_status)
+            self.newStatus = "assistance-onhold"
+            return True
+
+        return False
+
+    def _updateWorkflowStatus(self) -> bool:
+        """
+        The function to check if the status of a given workflow should be updated
+        :return: True if update status, False o/w
+        """
+        if self._setWorkflowToClose() or self._setWorkflowToForget() or self._setWorkflowToTrouble():
+            return True
+        if self._isWorkflowOnHold() and self._setNewOnHoldStatus(self.wfController.getFamily(includeItself=True)):
+            return True
+
+        return False
+
+    def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]:
+        tiersWithNoCheck =
set(self.checkor.unifiedConfiguration.get("tiers_with_no_check"))
+        tiersWithNoCustodial = set(
+            self.checkor.unifiedConfiguration.get("tiers_with_no_custodial") if not self.wfController.request.isRelVal() else []
+        )
+
+        for campaign in self.wfController.getCampaignsFromOutputDatasets().values():
+            if (
+                campaign in self.checkor.campaignController.campaigns
+                and "custodial_override" in self.checkor.campaignController.campaigns[campaign]
+            ):
+                if isinstance(self.checkor.campaignController.campaigns[campaign]["custodial_override"], list):
+                    tiersWithNoCheck.difference_update(self.checkor.campaignController.campaigns[campaign]["custodial_override"])
+                elif self.checkor.campaignController.campaigns[campaign]["custodial_override"] == "notape":
+                    tiersWithNoCustodial = set(
+                        sorted([dataset.split("/")[-1] for dataset in self.wfController.request.get("OutputDatasets")])
+                    )
+
+        return tiersWithNoCheck, tiersWithNoCustodial
+
+    def _setOutputDatasetsToCheck(self) -> None:
+        tiersWithNoCheck, tiersWithNoCustodial = self._getTiersWithNoCheckAndCustodial()
+
+        expectedOutputsDatasets = self.wfController.request.get("OutputDatasets")
+        self.outputDatasetsToCheck = [
+            dataset
+            for dataset in expectedOutputsDatasets
+            if all([dataset.split("/")[-1] != tier for tier in tiersWithNoCheck])
+        ]
+
+        self.wfController.logger.info(
+            self.checkor.logMsg["checkSummary"],
+            ", ".join(sorted(expectedOutputsDatasets)),
+            ", ".join(sorted(self.outputDatasetsToCheck)),
+            ", ".join(sorted(tiersWithNoCheck)),
+            ", ".join(sorted(tiersWithNoCustodial)),
+        )
+
+    def _skipFamilyWorkflow(self, wfSchema: dict) -> bool:
+        return (
+            wfSchema.get("RequestType") != "Resubmission"
+            or wfSchema.get("RequestDate") < self.wfController.request.get("RequestDate")
+            or wfSchema.get("PrepID") != self.wfController.request.get("PrepID")
+            or wfSchema.get("RequestStatus") is None
+        )
+
+
+    def _getWorkflowFamily(self) -> list:
+        family = []
+        for member in self.checkor.reqMgrReader.getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True):
+            if (
+                member.get("RequestName") ==
self.wf + or self._skipFamilyWorkflow(member) + or not set(member.get("OutputDatasets", [])).issubset(set(self.wfController.request.get("OutputDatasets"))) + ): + continue + + family.append(member) + + return family + + def _setBadAcdcs(self) -> None: + badAcdcs = [] + for member in self.reqmgrReader.getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True): + if member.get("RequestName") == self.wf or self._skipFamilyWorkflow(member): + continue + + if not set(member.get("OutputDatasets", [])).issubset(set(self.wfController.request.get("OutputDatasets"))): + if member.get("RequestStatus") not in [ + "rejected-archived", + "rejected", + "aborted", + "aborted-archived", + ]: + badAcdcs.append(member.get("RequestName")) + self.wfController.logger.info(self.logMsg["inconsistentAcdc"], member.get("RequestName")) + + self.acdcWfs["bad"] = badAcdcs + if badAcdcs: + self.assistanceTags.update(["manual", "assistance"]) + + def _setHealthyAcdcs(self, family: list) -> None: + healthyAcdcs = [] + for member in family: + if member.get("RequestStatus") in [ + "running-open", + "running-closed", + "assigned", + "acquired", + "staging", + "staged", + ]: + self.logger.info(self.logMsg["acdc"], member.get("RequestName")) + healthyAcdcs.append(member.get("RequestName")) + + self.acdcWfs["healthy"] = healthyAcdcs + if healthyAcdcs: + self.assistanceTags.add("recovering") + + def _setInactiveAcdcs(self, family: list) -> None: + inactiveAcdcs = [] + for member in family: + if member.get("RequestStatus") not in [ + "running-open", + "running-closed", + "assigned", + "acquired", + "staging", + "staged", + "failed", + ]: + inactiveAcdcs.append(member.get("RequestName")) + + self.acdcWfs["inactive"] = inactiveAcdcs + if inactiveAcdcs: + self.assistanceTags.add("recovered") + + def _setFailedAcdcs(self, family) -> None: + failedAcdcs = [] + for member in family: + if member.get("RequestStatus") in ["failed"]: + failedAcdcs.append(member.get("RequestName")) + + 
self.acdcWfs["failed"] = failedAcdcs + + def _setAcdcsOrder(self, family: list) -> None: + order = -1 + for member in family: + memberOrder = sorted(filter(re.compile(f"^ACDC\d+$").search, member.get("RequestName"))) + if memberOrder: + order = max(order, int(memberOrder[-1].split("ACDC")[1])) + + self.acdcWfs["order"] = order + + def _checkWorkflowFamily(self) -> None: + family = self._getWorkflowFamily() + + self._setBadAcdcs() + self._setHealthyAcdcs(family) + self._setInactiveAcdcs(family) + self._setFailedAcdcs(family) + self._setAcdcsOrder(family) + + if (self.bypassChecksByMcMForceComplete or self.bypassChecksByUserForceComplete) and len(self.acdcWfs.get("healthy")): + self.wfController.logger.info(self.logMsg["forceRecovering"], self.wf) + self.wfController.logger.critical(self.logMsg["forceComplete"], self.wf) + WorkflowStatusEnforcer(self.wf).forceComplete() + + if self.acdcWfs.get("failed"): + self.logger.critical(self.logMsg["failedAcdc"], self.wf, ", ".join(self.acdcWfs.get("failed"))) + if self.acdcWfs.get("bad"): + self.logger.critical(self.logMsg["badAcdc"], self.wf, ", ".join(self.acdcWfs.get("bad"))) + + def _getWorkflowCompletedDelay(self) -> float: + """ + The function to get how long a given workflow has been requested to complete + :return: how long a given workflow has been requested to complete + """ + completed = [*filter(lambda wf: wf.get("Status") == "completed", self.wfController.request.get("RequestTransition"))] + delay = (self.now - completed[-1].get("UpdateTime", self.now)) / 86400.0 if completed else 0 + + self.logger.info(self.logMsg["sinceCompleted"], delay) + + return delay + + def _getFractionDumping(self) -> float: + wfCompletedDelay = self._getWorkflowCompletedDelay() + fractionDamping = min( + 0.01 + * ( + max(wfCompletedDelay - self.unifiedConfiguration.get("damping_fraction_pass"), 0) + / self.unifiedConfiguration.get("damping_fraction_pass_rate") + ), + self.unifiedConfiguration.get("damping_fraction_pass_max") / 100.0, + 
) + + self.logger.info(self.logMsg["fractionDamping"], fractionDamping) + + return fractionDamping + + def _setWorkflowCampaigns(self) -> None: + campaigns = {} + wfCampaigns = self.wfController.request.getCampaigns(details=False) + if len(wfCampaigns) == 1: + for dataset in self.wfController.request.get("OutputDatasets"): + campaigns[dataset] = wfCampaigns[0] + else: + campaigns = self.wfController.getCampaignsFromOutputDatasets() + + self.logger.info(self.logMsg["campaigns"], campaigns) + self.campaigns = campaigns + + def _setDatasetsFractionsToAnnounce(self) -> None: + fractionsAnnounce = {} + for dataset in self.outputDatasetsToCheck: + fractionsAnnounce[dataset] = 1.0 + + self.fractions["announce"] = fractionsAnnounce + + def _setDatasetsFractionsToPass(self) -> None: + fractionsPass = {} + defaultPass = self.unifiedConfiguration.get("default_fraction_pass") + + campaigns = self.campaigns or self._setWorkflowCampaigns() + + for dataset in self.outputDatasetsToCheck: + campaign = campaigns.get(dataset) + + if self.options.get("fractionPass"): + fractionsPass[dataset] = self.options.get("fractionPass") + self.wfController.logger.info( + self.logMsg["overrideFraction"], fractionsPass[dataset], dataset, "command line" + ) + elif self.campaignController.campaigns.get(campaign, {}).get("fractionpass"): + fractionPass = self.campaignController.campaigns.get(campaign).get("fractionpass") + if isinstance(fractionPass, dict): + tier = dataset.split("/")[-1] + priority = str(self.wfController.request.get("RequestPriority")) + fractionsPass[dataset] = fractionPass.get("all", defaultPass) + if fractionPass.get(tier): + tierFractionPass = fractionPass.get(tier) + if isinstance(tierFractionPass, dict): + fractionsPass[dataset] = tierFractionPass.get("all", defaultPass) + for key, passValue in tierFractionPass.items(): + if dataset.startswith(key): + fractionsPass[dataset] = passValue + else: + fractionsPass[dataset] = tierFractionPass + if fractionPass.get(priority): + 
fractionsPass[dataset] = fractionPass.get("priority") + else: + fractionsPass[dataset] = fractionPass + self.wfController.logger.info( + self.logMsg["overrideFraction"], fractionsPass[dataset], dataset, "campaign requirement" + ) + else: + fractionsPass[dataset] = defaultPass + + for key, passValue in self.unifiedConfiguration.get("pattern_fraction_pass").items(): + if key in dataset: + fractionsPass[dataset] = passValue + self.wfController.logger.info(self.logMsg["overrideFraction"], passValue, dataset, "dataset key") + + self.fractions["pass"] = fractionsPass + + def _setDatasetsFractionsToTruncateRecovery(self) -> None: + fractionsTruncateRecovery = {} + + weightFull = 7.0 + weightUnderPass = 0.0 + weightPass = self._getWorkflowCompletedDelay() + + campaigns = self.campaigns or self._setWorkflowCampaigns() + + for dataset in self.outputDatasetsToCheck: + passPercentBelow = self.fractions["pass"][dataset] - 0.02 + fractionsTruncateRecovery[dataset] = ( + self.fractions["pass"][dataset] * weightPass + weightFull + passPercentBelow * weightUnderPass + ) / (weightPass + weightFull * weightUnderPass) + + campaign = campaigns.get(dataset) + if self.campaignController.campaigns.get(campaign, {}).get("truncaterecovery"): + fractionsTruncateRecovery[dataset] = self.campaignController.campaigns.get(campaign).get( + "truncaterecovery" + ) + self.wfController.logger.info( + self.logMsg["truncateRecovery"], + dataset, + self.campaignController.campaigns.get(campaign).get("truncaterecovery"), + ) + + if fractionsTruncateRecovery[dataset] < self.fractions["pass"][dataset]: + fractionsTruncateRecovery[dataset] = self.fractions["pass"][dataset] + + self.fractions["truncate"] = fractionsTruncateRecovery + + def _setStatisticsThresholds(self) -> None: + self._setDatasetsFractionsToAnnounce() + self._setDatasetsFractionsToPass() + self._setDatasetsFractionsToTruncateRecovery() + + fractionDamping = self._getFractionDumping() + for dataset, value in self.fractions["pass"].items(): + 
if value != 1.0 and fractionDamping and self.unifiedConfiguration.get("timeout_for_damping_fraction"):
+                self.fractions["pass"][dataset] -= fractionDamping
+                self.fractions["truncate"][dataset] -= fractionDamping
+
+            if self.acdcWfs.get("order") > self.unifiedConfiguration.get("acdc_rank_for_truncate"):
+                self.wfController.logger.info(self.logMsg["truncatePass"], self.acdcWfs.get("order"))
+                self.fractions["truncate"][dataset] = self.fractions["pass"][dataset]
+
+        self._updateFractionsToPassAndToTruncateRecovery()
+
+    def _updateFractionsToPassAndToTruncateRecovery(self) -> None:
+        family = dict([(dataset, self.dbs["reader"].getDatasetParent(dataset)) for dataset in self.fractions["pass"]])
+
+        for dataset, value in self.fractions["pass"].items():
+            ancestors = flattenDictKeys(family, family.get(dataset, []))
+
+            descendingTruncate = self.fractions["truncate"][dataset]
+            descendingPass = value
+            for ancestor in ancestors:
+                descendingPass *= self.fractions["pass"].get(ancestor, 1.0)
+                descendingTruncate *= self.fractions["truncate"].get(ancestor, 1.0)
+
+            if self.unifiedConfiguration.get("cumulative_fraction_pass"):
+                self.fractions["pass"][dataset] = descendingPass
+                self.fractions["truncate"][dataset] = descendingTruncate
+                self.logger.info(
+                    "For %s, previously passing at %s, is now passing at %s", dataset, value, descendingPass
+                )
+            else:
+                self.logger.info(
+                    "For %s, instead of passing at %s, could be passing at %s", dataset, value, descendingPass
+                )
+
+    def _getExpectedEvents(self) -> float:
+        if self.wfController.request.get("RequestType") in ["TaskChain", "StepChain"]:
+            return self.wfController.getRequestNumEvents()
+
+        expectedEvents = self.wfController.request.get("TotalInputEvents")
+        if expectedEvents is None:
+            self.wfController.logger.critical(self.logMsg["missingTotalEvents"], self.wf)
+            return 0
+        return expectedEvents
+
+    def _getTaskOutputDatasets(self) -> dict:
+        taskOutputDatasets = {}
+        for task, outputs in 
self.wfController.getOutputDatasetsPerTask().items(): + for output in outputs: + taskOutputDatasets[output] = self.wfController.request.get(task, {}).get("TaskName", task) + return taskOutputDatasets + + def _checkCompletionStatistics(self) -> None: + lumisExpected = self.wfController.request.get("TotalInputLumis") + eventsExpected = self._getExpectedEvents() + eventsExpectedPerTask = self.wfController.request.getExpectedEventsPerTask() + taskOutputs = self._getTaskOutputDatasets() + + for dataset in self.outputDatasetsToCheck: + events, lumis = self.dbs["reader"].getDatasetEventsAndLumis(dataset) + self.producedEvents[dataset] = events + self.producedLumis[dataset] = lumis + self.eventsPerLumi[dataset] = events / float(lumis) if lumis else 100 + self.percentCompletions[dataset] = 0.0 + + if lumisExpected: + self.wfController.logger.info("Lumi completion %s expected for %s", lumis, lumisExpected, dataset) + self.percentCompletions[dataset] = lumis / float(lumisExpected) + self.expectedLumis[dataset] = lumisExpected + + outputEventsExpected = eventsExpectedPerTask.get(taskOutputs.get(dataset, "NoTaskFound"), eventsExpected) + if outputEventsExpected: + self.expectedEvents[dataset] = outputEventsExpected + eventsFraction = float(events) / float(outputEventsExpected) + if eventsFraction > self.percentCompletions[dataset]: + self.percentCompletions[dataset] = eventsFraction + self.wfController.logger.info( + "Overriding: event completion real %s expected %s for %s", + events, + outputEventsExpected, + dataset, + ) + + def _setPassStatisticsCheck(self) -> None: + self.passStatsCheck = dict( + [ + (dataset, self.bypassChecks or self.percentCompletions[dataset] >= passValue) + for dataset, passValue in self.fractions["pass"].items() + ] + ) + + def _setPassStatisticsCheckToAnnounce(self) -> None: + self.passStatsCheckToAnnounce = dict( + [ + (dataset, self.percentAvgCompletions[dataset] >= passValue) for dataset, passValue in self.fractions["pass"].items() + ] + ) + + def 
_setPassStatisticsCheckOverCompletion(self) -> None: + defaultFractionOverdoing = self.unifiedConfiguration.get("default_fraction_overdoing") + self.passStatsCheckOverCompletion = dict([(dataset, value >= defaultFractionOverdoing) for dataset, value in self.percentCompletions.items()]) + + + def _checkAvgCompletionStatistics(self) -> None: + percentAvgCompletions = {} + + _, primaries, _, _ = self.wfController.request.getIO() + runWhiteList = self.wfController.getRunWhiteList() + lumiWhiteList = self.wfController.getLumiWhiteList() + + lumisPerRun = {} + if not all(self.passStatsCheck.values()): + nRuns = 1 + for primary in primaries: + if len(self.dbs["reader"].getDatasetRuns(primary)) > 1: + self.logger.info("Fetching input lumis and files for %s", primary) + lumisPerRun[primary], _ = self.dbs["reader"].getDatasetLumisAndFiles( + primary, runs=runWhiteList, lumiList=lumiWhiteList + ) + nRuns = len(set(lumisPerRun[primary].keys())) + + for dataset in self.passStatsCheck: + if primaries and nRuns > 1: + lumisPerRun[dataset], _ = self.dbs["reader"].getDatasetLumisAndFiles(dataset) + + fractionPerRun = {} + primary = primaries[0] + allRuns = sorted(set(lumisPerRun[primary].keys() + lumisPerRun[dataset].keys())) + for run in allRuns: + if lumisPerRun[primary].get(run, []): + fractionPerRun[run] = float(len(lumisPerRun[dataset].get(run, []))) / lumisPerRun[ + primary + ].get(run, []) + + if fractionPerRun: + avgFraction = sum(fractionPerRun.values()) / len(fractionPerRun.values()) + percentAvgCompletions[dataset] = avgFraction + + self.logger.info("The average completion fraction per run for %s is %s", dataset, avgFraction) + + self.percentAvgCompletions = percentAvgCompletions + + def _getAnnounceAssistanceTags(self) -> list: + if self.passStatsCheckToAnnounce and all(self.passStatsCheckToAnnounce.values()): + self.wfController.logger.info( + "The output of this workflow are essentially good to be announced while we work on the rest" + ) + return ["announced" if 
"announced" in self.wfToCheck.status else "announce"] + return [] + + def _getRecoveryAssistanceTags(self) -> list: + if not all(self.passStatsCheck.values()): + possibleRecoveries = self.acdcReader.getRecoveryDocs() + if possibleRecoveries: + self.wfController.logger.info( + "The workflow has missing statistics, but nothing is recoverable. Passing through to announcement" + ) + bypassChecks = True + else: + self.wfController.logger.info( + "The workflow is not completed, but nothing is recoverable. Passing through to announcement" + ) + + if not bypassChecks: + return ["recovery" if self.unifiedConfiguration.get("use_recoveror") else "manual"] + + return [] + + def _passOver100(self) -> bool: + lhe, primaries, _, _ = self.wfController.request.getIO() + return False if (lhe or primaries) else True + + def _forceCompleteWorkflow(self) -> bool: + if self.acdcWfs.get("healthy") and all(self.passStatsCheck.values()) and all(self.passStatsCheckToAnnounce.values()): + self.logger.info("This is essentially good to truncate, setting to force-complete") + return True + + return False + + + def _checkOutputSize(self,) -> None: + self.assistanceTags += set(self._getAnnounceAssistanceTags()) + + recoveryAssistanceTags = self._getRecoveryAssistanceTags() + if recoveryAssistanceTags: + self.assistanceTags += set(recoveryAssistanceTags) + self.bypassChecks, self.isClosing = False, False + + if self._passOver100() and all(self.passStatsCheckOverCompletion.values()): + self.assistanceTags.add("over100") + + if self._forceCompleteWorkflow(): + WorkflowStatusEnforcer(self.wf).forceComplete() + + def _hasSmallLumis(self) -> bool: + lumiLowerLimit = self.unifiedConfiguration.get("min_events_per_lumi_output") + _, primaries, _, _ = self.wfController.request.getIO() + + if ( + not self.wfController.isRelVal() + and not primaries + and any( + [ + self.eventsPerLumi[dataset] <= lumiLowerLimit + for dataset in self.eventsPerLumi + if not dataset.endswith(("DQMIO", "ALCARECO")) + ] + ) + ): 
+ self.wfController.logger.info("The workflow has very small lumisections") + return True + + return False + + def _hasBigLumis(self) -> bool: + if any( + [ + self.lumiUpperLimit[dataset] > 0 and self.eventsPerLumi[dataset] >= self.lumiUpperLimit[dataset] + for dataset in self.eventsPerLumi + ] + ): + self.wfController.logger.info("The has large lumisections") + return True + + return False + + + def _setLumiUpperLimit(self) -> None: + lumiUpperLimit = {} + campaigns = self.campaigns or self._getWorkflowCampaigns() + + for dataset in self.outputDatasetsToCheck: + campaign = campaigns[dataset] + + if self.wfController.request.get("RequestType") in ["ReDigi", "ReReco"]: + upperLimit = -1 + elif self.options.get("lumisize"): + upperLimit = self.options.get("lumisize") + self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) + elif self.closeoutController.campaigns.get(campaign, {}).get("lumisize"): + upperLimit = self.closeoutController.campaigns.get(campaign, {}).get("lumisize") + self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) + else: + upperLimit = 1001 + + lumiUpperLimit[dataset] = upperLimit + + self.lumiUpperLimit = lumiUpperLimit + + + + def _checkLumiSize(self) -> None: + if self._hasSmallLumis(): + self.assistanceTags.add("smalllumi") + self.isClosing = False + + self._setLumiUpperLimit() + if self._hasBigLumis(): + self.assistanceTags.add("biglumi") + self.isClosing = False + + def _checkRucioCounts(self) -> None: + rucioPresence = {} + + for dataset in self.wfController.request.get("OutputDatasets"): + filesPerBlock = set(self.rucioReader.countDatasetFilesPerBlock(dataset)) + allBlocks = set([*map(lambda x: x[0], filesPerBlock)]) + if len(allBlocks) == len(set(filesPerBlock)): + rucioPresence[dataset] = sum(map(lambda x: x[1], filesPerBlock)) + else: + self.wfController.logger.info( + "There are inconsistences of number of files per block for dataset: %s", dataset + ) + 
rucioPresence[dataset] = 0 + + if any([nFiles == 0 for nFiles in rucioPresence.values()]) and "announce" in self.assistanceTags: + self.wfController.logger.info("No files in rucio yet, no good to announce") + self.assistanceTags.remove("announce") + + self.rucioPresence = rucioPresence + + def _checkDBSCounts(self) -> None: + dbsPresence, dbsInvalid = {}, {} + for dataset in self.wfController.request.get("OutputDatasets"): + #TODO: migrate dbs3Client.getFileCountDataset + dbsPresence[dataset] = self.checkor.dbs["reader"].countDatasetFiles(dataset) + dbsInvalid[dataset] = self.checkor.dbs["reader"].countDatasetFiles(dataset, onlyInvalid=True) + + self.dbsPresence, self.dbsInvalid = dbsPresence, dbsInvalid + + def _hasFileMismatch(self) -> bool: + if not self.options.get("ignoreFiles") and not all( + [self.dbsPresence[dataset] == self.dbsInvalid[dataset] + self.rucioPresence[dataset] for dataset in self.outputDatasetsToCheck] + ): + self.logger.info("The workflow has a dbs/rucio mismatch") + return True + + return False + + def _checkFileCounts(self) -> list: + showOnlyN = 10 + + for dataset in self.dbsPresence: + dbsFilenames = set( + [ + file.get("logical_file_name") + for file in self.dbs["reader"].getDatasetFiles(dataset, validFileOnly=True, details=True) + ] + ) + rucioReader = RucioReader() + rucioFilenames = set(rucioReader.getDatasetFileNames(dataset)) + + missingRucioFiles = dbsFilenames - rucioFilenames + missingDBSFiles = rucioFilenames - dbsFilenames + + if missingRucioFiles: + self.wfController.logger.info( + "These %d files are missing in Rucio, or extra in DBS, showing %s only.\n %s", + len(missingRucioFiles), + showOnlyN, + "\n".join(missingRucioFiles[:showOnlyN]), + ) + wereInvalidated = sorted(missingRucioFiles & set(self.dataCacheLoader.load("file_invalidation"))) + if wereInvalidated: + self.wfController.logger.info( + "These %d files were invalidated globally, showing %d only.\n %s", + len(wereInvalidated), + showOnlyN, + 
"\n".join(wereInvalidated[:showOnlyN]), + ) + #TODO: migrate dbs3Client.setFileStatus + self.dbs["writer"].setFileStatus(wereInvalidated, validate=False) + + if missingDBSFiles: + self.wfController.logger.info( + "These %d files are missing in DBS, or extra in Rucio, showing %s only.\n %s", + len(missingDBSFiles), + showOnlyN, + "\n".join(missingDBSFiles[:showOnlyN]), + ) + wereInvalidated = sorted(missingDBSFiles & set(self.dataCacheLoader.load("file_invalidation"))) + if wereInvalidated: + self.wfController.logger.info( + "These %d files were invalidated globally, showing %d only.\n %s", + len(wereInvalidated), + showOnlyN, + "\n".join(wereInvalidated[:showOnlyN]), + ) + + minFamilyCompletedDelay = self._getMinFamilyCompletedDelay(self.wfController.getFamily(includeItself=True)) + self.assistanceTags.add("agentfilemismatch" if minFamilyCompletedDelay < 2 else "filemismatch") + + self.isClosing = False + + def _checkInvalidations(self) -> list: + fractionInvalid = 0.2 + if not self.options.get("ignoreinvalid") and not all( + [ + self.dbsInvalid[dataset] <= int(fractionInvalid * self.dbsPresence[dataset]) + for dataset in self.wfController.request.get("OutputDatasets") + ] + ): + self.wfController.logger.info("The workflow has a DBS invalid file level too high") + self.assistanceTags.add("invalidfiles") + + def _setRecord(self) -> None: + putRecord = { + "datasets": {}, + "name": self.wf, + "closeOutWorkflow": self.isClosing, + "priority": self.wfController.request.get("RequestPriority"), + "prepid": self.wfController.request.get("PrepId"), + } + for dataset in self.outputDatasetsToCheck: + record = putRecord["datasets"].get(dataset, {}) + record["expectedL"] = self.expectedLumis[dataset] + record["expectedN"] = self.expectedEvents[dataset] + record["producedL"] = self.producedLumis[dataset] + record["producedN"] = self.producedEvents[dataset] + record["percentage"] = math.round(self.percentCompletions[dataset], 2) + record["fractionpass"] = 
math.round(self.fractions["pass"][dataset], 2) + record["duplicate"] = "N/A" + record["closeOutDataset"] = self.isClosing + record["correctLumis"] = ( + int(self.eventsPerLumi[dataset]) if self.eventsPerLumi[dataset] > self.lumiUpperLimit[dataset] else True + ) + record["dbsFiles"] = self.dbsPresence[dataset] + record["dbsInvFiles"] = self.dbsInvalid[dataset] + record["rucioFiles"] = set(RucioReader().getDatasetFileNames(dataset)) + record[ + "acdc" + ] = f"{len(self.acdcWfs.get('healthy', []))} / {len(self.acdcWfs.get('healthy', []) + self.acdcWfs.get('inactive', []))}" + record["family"] = self._getWorkflowFamily(self.wf, self.wfController) + + now = gmtime() + record["timestamp"] = mktime(now) + record["updated"] = asctime(now) + " (GMT)" + + putRecord["datasets"][dataset] = record + + self.record = record + + def _closeWorkflow(self) -> None: + self.wfController.logger.info("Setting %s as closed-out", self.wf) + + if self.wfToCheck.status in ["closed-out", "announced", "normal-archived"]: + self.logger.info( + "%s is already %s, not trying to close-out as assuming it does", + self.wf, + self.wfToCheck.status, + ) + self.newStatus = "close" + return + + # TODO: migrate reqMgrClient.closeOutWorkflowCascade + response = self.checkor.ReqMgrReader.closeoutWorkflow(self.wf) + if response is None: + self.newStatus = "close" + return + + self.logger.info("Could not close-out, will try again next time") + + + def _checkAssistanceTags(self) -> None: + self.logger.info("%s was tagged with: %s", self.wf, self.assistanceTags) + if "recovering" in self.assistanceTags: + self.assistanceTags -= set(["recovery", "filemismatch", "manual"]) + if "recovery" in self.assistanceTags and "recovered" in self.assistanceTags: + self.assistanceTags -= set(["recovery", "recovered"]) + self.assistanceTags.add("manual") + if "recovery" in self.assistanceTags and "manual" in self.assistanceTags: + self.assistanceTags -= set(["recovery"]) + if "custodial" in assistanceTags: + 
self.assistanceTags -= set(["announce", "announced"]) + if any([tag in self.assistanceTags for tag in ["duplicates", "filemismatch", "agentfilemismatch"]]): + self.assistanceTags -= set(["announce"]) + + self.logger.info("%s needs assistance with: %s", self.wf, self.assistanceTags) + self.logger.info("%s has existing conditions: %s", self.wf, self.existingAssistaceTags) + + def _warnRequestor(self) -> bool: + if self.assistanceTags and "manual" not in self.existingAssistaceTags and self.existingAssistaceTags != self.assistanceTags and any(tag in self.assistanceTags for tag in ["recovery", "biglumi"]): + + msg = "The request PREPID (WORKFLOW) is facing issue in production.\n" + + if "recovery" in self.assistanceTags: + msg += f"Samples completed with missing statistics\n{'\n'.join([f'{round(self.percentCompletions[dataset]*100, 2)}%% complete for {dataset}' for dataset in self.outputDatasetsToCheck ])}\nhttps://cmsweb.cern.ch/report/{wf}\n" + if "biglumi" in self.assistanceTags: + msg += f"Samples completed with large luminosity blocks:\n{'\n'.join([f'{self.eventsPerLumi[dataset]} > {self.lumiUpperLimit[dataset]} for {dataset}' for dataset in self.outputDatasetsToCheck])}\nhttps://cmsweb.cern.ch/reqmgr/view/splitting/{wf}\n" + + msg += "You are invited to check, while this is being taken care of by Comp-Ops.\n" + msg += "This is an automated message from Comp-Ops.\n" + + self.wfController.logger.critical(msg) + + def _getAssistanceStatus(self) -> str: + if self.assistanceTags: + return "assistance-" + "-".join(sorted(self.assistanceTags)) + return "assistance" + + def _setWorkflowToAssistance(self) -> None: + assistanceStatus = self._getAssistanceStatus() + if "manual" not in self.wfToCheck.status or assistanceStatus != "assistance-recovery": + self.newStatus = assistanceStatus + + def check(self) -> dict: + try: + self.checkor._checkPoint(self.checkor.logMsg["startCheck"].format(self.wf)) + if self._skipWorkflow() or self._updateWorkflowStatus(): + return 
self._writeResponse()
+            self.checkor._checkPoint("Checked workflow status", subLap=True)
+
+            self._setOutputDatasetsToCheck()
+            self._checkWorkflowFamily()
+            self.checkor._checkPoint("Checked workflow family", subLap=True)
+
+            self._setStatisticsThresholds()
+            self.checkor._checkPoint("Checked statistics threshold", subLap=True)
+
+            self._checkCompletionStatistics()
+            self.checkor._checkPoint("Checked observed statistics", subLap=True)
+
+            self._setPassStatisticsCheck()
+            self._checkAvgCompletionStatistics()
+            self.checkor._checkPoint("Checked more detailed observed statistics", subLap=True)
+
+            self._setPassStatisticsCheckToAnnounce()
+            self._setPassStatisticsCheckOverCompletion()
+            self._checkOutputSize()
+            self.checkor._checkPoint("Checked output size", subLap=True)
+
+            self._checkLumiSize()
+            self.checkor._checkPoint("Checked lumi size", subLap=True)
+
+            self._checkRucioCounts()
+            self.checkor._checkPoint("Checked Rucio count", subLap=True)
+
+            self._checkDBSCounts()
+            self.checkor._checkPoint("DBS file count", subLap=True)
+
+            if self._hasFileMismatch() and "recovering" not in self.assistanceTags:
+                self._checkFileCounts()
+            self.checkor._checkPoint("Checked file count", subLap=True)
+
+            self._checkInvalidations()
+            self.checkor._checkPoint("Checked invalidation", subLap=True)
+
+            self.checkor._checkPoint(f"Done with {self.wf}")
+            self._setRecord()
+
+            ## TODO: lumi summary if re reco
+
+            if self.isClosing:
+                self._closeWorkflow()
+            else:
+                self._checkAssistanceTags()
+                self._warnRequestor()
+                self._setWorkflowToAssistance()
+
+            ## TODO: update jira
+
+            return self._writeResponse()
+
+        except Exception as error:
+            self.logger.error(self.logMsg["failedCheck"], self.wf)
+            self.logger.error(str(error))
+            self.failed = True
+            return self._writeResponse()
+
+    def _writeResponse(self) -> dict:
+        response = {
+            "workflow": self.wfToCheck,
+            "wf": self.wf,
+            "failed": self.failed,
+            "isClosing": self.isClosing,
+            "newStatus": self.newStatus,
+            "prepIds": 
self.wfController.getPrepIDs(), + "mcmForceComplete": self.bypassChecksByMcMForceComplete, + "record": self.record + } + return response + + +if __name__ == "__main__": + options, specificWf = Checkor.parseOptions() + checkor = Checkor(options=options, specificWf=specificWf) + if checkor.go(): + checkor.run() From 7f6cc9b111e3372213ab85bc151cc1e7c098b728 Mon Sep 17 00:00:00 2001 From: Amanda Goldani Date: Thu, 16 Dec 2021 23:48:38 +0100 Subject: [PATCH 03/17] Add fix to handle both old and new dbs server --- src/python/Services/DBS/DBSReader.py | 43 +++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/python/Services/DBS/DBSReader.py b/src/python/Services/DBS/DBSReader.py index f5a5d6975..623551578 100644 --- a/src/python/Services/DBS/DBSReader.py +++ b/src/python/Services/DBS/DBSReader.py @@ -93,7 +93,48 @@ def _getBlockFileLumis(self, blocks: List[str], validFileOnly: bool = True) -> L This function runs by default with multithreading on the param blocks, which is a list of block names. 
""" - return self.dbs.listFileLumis(block_name=blocks, validFileOnly=int(validFileOnly)) + response = self.dbs.listFileLumis(block_name=blocks, validFileOnly=int(validFileOnly)) + + if isinstance(response[0]["lumi_section_num"], list): + self.logger.debug("Handling dbsapi.listFileLumis response from NEW DBS server") + return self._aggregateListFileLumis(response) + + self.logger.debug("Handling dbsapi.listFileLumis response from OLD DBS server") + return response + + def _aggregateListFileLumis(self, response: dict) -> List[dict]: + """ + The function to aggregate the list of file lumis + :param response: raw response of listFileLumis + :return: lumi section files + """ + aggregatedResponse = [] + tmpDict = {} + + for entry in response: + runNum = entry["run_num"] + lumiSectionNum = entry["lumi_section_num"] + logicalFileName = entry["logical_file_name"] + eventCount = entry["event_count"] + + key = (runNum, logicalFileName) + + if key in tmpDict: + tmpDict[key]["event_count"].append(eventCount) + else: + tmpDict[key] = {"event_count": [eventCount], "lumi_section_num": [lumiSectionNum]} + + for key, value in tmpDict.iteritems(): + aggregatedResponse.append( + { + "run_num": key[0], + "lumi_section_num": value["lumi_section_num"], + "logical_file_name": key[1], + "event_count": value["event_count"], + } + ) + + return aggregatedResponse def getDBSStatus(self, dataset: str) -> str: """ From dbfa67fae5cd65705c671fb82e165740f5438f02 Mon Sep 17 00:00:00 2001 From: Amanda Goldani Date: Fri, 17 Dec 2021 04:30:38 +0100 Subject: [PATCH 04/17] Add missing functions and comments --- src/python/Services/DBS/DBSReader.py | 26 ++ src/python/Services/ReqMgr/ReqMgrWriter.py | 23 +- unified/Checkor.py | 489 +++++++++++++-------- 3 files changed, 365 insertions(+), 173 deletions(-) diff --git a/src/python/Services/DBS/DBSReader.py b/src/python/Services/DBS/DBSReader.py index 623551578..d320f4455 100644 --- a/src/python/Services/DBS/DBSReader.py +++ 
b/src/python/Services/DBS/DBSReader.py @@ -518,3 +518,29 @@ def getBlocksLumisAndFilesForCaching(self, blocks: List[dict], validFileOnly: bo except Exception as error: self.logger.error("Failed to get lumi sections and files from DBS for given blocks") self.logger.error(str(error)) + + def countDatasetFiles(self, dataset: str, skipInvalid: bool = False, onlyInvalid: bool = False) -> int: + """ + The function to count the number of files in a given dataset + :param dataset: dataset name + :param skipInvalid: if True skip invalid files, o/w include them + :param onlyInvalid: if True include only invalid files, o/w include all + :return: number of files + """ + try: + files = self.dbs.listFiles(dataset=dataset, detail=(skipInvalid or onlyInvalid)) + + mainLFN = "/".join(files[0]["logical_file_name"].split("/")[:3]) if files else "" + + if skipInvalid: + files = [*filter(lambda file: file["is_file_valid"] == 1, files)] + elif onlyInvalid: + files = [*filter(lambda file: file["is_file_valid"] == 0, files)] + + if (skipInvalid or onlyInvalid) and mainLFN: + return [*filter(lambda file: file["logical_file_name"].startswith(mainLFN), files)] + return files + + except Exception as error: + self.logger.error("Failed to count files in dataset") + self.logger.error(str(error)) diff --git a/src/python/Services/ReqMgr/ReqMgrWriter.py b/src/python/Services/ReqMgr/ReqMgrWriter.py index b9408ee9b..b5c29b889 100644 --- a/src/python/Services/ReqMgr/ReqMgrWriter.py +++ b/src/python/Services/ReqMgr/ReqMgrWriter.py @@ -115,10 +115,31 @@ def approveWorkflow(self, wf: str) -> bool: """ try: result = sendResponse( - url=self.reqmgrUrl, endpoint=f"{self.reqmgrEndpoint['request']}/{wf}", param={"RequestStatus": "assignment-approved"} + url=self.reqmgrUrl, + endpoint=f"{self.reqmgrEndpoint['request']}/{wf}", + param={"RequestStatus": "assignment-approved"}, ) return result["result"][0]["ok"] except Exception as error: self.logger.error("Failed to approve workflow in reqmgr") 
self.logger.error(str(error)) + + def closeoutWorkflow(self, wf: str, cascade: bool = False) -> bool: + """ + The function to close out a given workflow + :param wf: workflow name + :param cascade: if cascade or not + :return: True if succeeded, False o/w + """ + try: + result = sendResponse( + url=self.reqmgrUrl, + endpoint=f"{self.reqmgrEndpoint['request']}/{wf}", + param={"RequestStatus": "closed-out", "cascade": cascade}, + ) + return result["result"][0][wf] == "OK" + + except Exception as error: + self.logger.error("Failed to close out workflow in reqmgr") + self.logger.error(str(error)) diff --git a/unified/Checkor.py b/unified/Checkor.py index 800b8870e..3c434cfa6 100644 --- a/unified/Checkor.py +++ b/unified/Checkor.py @@ -2,17 +2,13 @@ import re import optparse import random -import json import math from logging import Logger from collections import defaultdict from time import mktime, asctime, gmtime, struct_time -from jinja2.runtime import F - -from sqlalchemy.sql.expression import true from Utilities.DataTools import unnestList, flattenDictKeys -from Utilities.Decorators import runWithMultiThreading, runWithRetries +from Utilities.Decorators import runWithMultiThreading from Utilities.Logging import getLogger from Utilities.ConfigurationHandler import ConfigurationHandler from Cache.DataCacheLoader import DataCacheLoader @@ -20,15 +16,16 @@ from Databases.Oracle.OracleDB import Workflow from MongoControllers.CampaignController import CampaignController from MongoControllers.CloseoutController import CloseoutController -from MongoControllers.SiteController import SiteController from MongoControllers.WTCController import WTCController from Services.ServicesChecker import ServicesChecker from Services.McM.McMClient import McMClient from Services.ReqMgr.ReqMgrReader import ReqMgrReader +from Services.ReqMgr.ReqMgrWriter import ReqMgrWriter from Services.ACDC.ACDCReader import ACDCReader from Services.DBS.DBSReader import DBSReader from 
Services.DBS.DBSWriter import DBSWriter from Services.Rucio.RucioReader import RucioReader +from WorkflowMgmt.SiteController import SiteController from WorkflowMgmt.UserLockChecker import UserLockChecker from WorkflowMgmt.WorkflowController import WorkflowController from WorkflowMgmt.WorkflowStatusEnforcer import WorkflowStatusEnforcer @@ -56,67 +53,24 @@ def __init__(self, logger: Optional[Logger] = None, **kwargs) -> None: self.unifiedConfiguration = ConfigurationHandler("config/unifiedConfiguration.json") - self.reqMgrReader = ReqMgrReader() self.acdcReader = ACDCReader() self.dbs = {"reader": DBSReader(), "writer": DBSWriter()} + self.reqMgr = {"reader": ReqMgrReader(), "writer": ReqMgrWriter()} self.dataCacheLoader = DataCacheLoader() - self.mcmClient = McMClient() if self.useMcM else None self.campaignController = CampaignController() self.closeoutController = CloseoutController() self.siteController = SiteController() self.wtcController = WTCController() - self.logMsg = { - "strict": "Strict option is on: checking workflows that freshly completed", - "update": "Update option is on: checking workflows that have not completed yet", - "clear": "Clear option is on: checking workflows that are ready to toggle closed-out", - "reviewRecover": "Review-recovering option is on: checking only the workflows that had been already acted on", - "reviewManual": "Review-manual option is on: checking the workflows to be acted on", - "timeCheck": "Time check (%s) point at: %s", - "sinceStart": "Since start: %s [s]", - "lap": "%s: %s [s]", - "notAllowed": "%s is not allowed to %s", - "allowed": "%s allowed to %s %s", - "nWfs": "Considering %s workflows (before any limitation)", - "limitOption": "Command line to limit workflows to %s", - "limit": "Limiting workflows to %s this round", - "startCheck": "Starting checkor with {}", - "setClose": "%s is already %s, setting as close", - "setForget": "%s is %s, but will not be set in trouble to find a replacement", - "setTrouble": "%s 
is %s, setting as trouble", - "setOnHold": "%s is %s, setting as on hold", - "notRunning": "%s is %s, it is not running yet", - "skipCheck": "No need to check %s in status %s", - "bypassChecks": "Bypassing checks on %s because of keyword %s", - "bypassForceComplete": "Bypassing checks and force completing %s because of %s %s", - "failedCheck": "Failed on checking %s", - "onHoldSince": "On hold since %s, timeout at %s", - "bypassOnHold": "%s is on hold and stopped for %.2f days, letting this through with current statistics", - "onHold": "%s is on hold", - "campaigns": "Campaigns: %s", - "checkSummary": "Initial outputs: %s\nWill check on: %s\nTiers out: %s\n, Tiers no custodials: %s", - "inconsistentAcdc": "Inconsistent ACDC %s", - "acdc": "%s still has an ACDC running", - "failedAcdc": "For %s, ACDC %s failed", - "badAcdc": "For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.", - "forceRecovering": "%s is being force completed while recovering", - "forceComplete": "The workflow %s was force completed", - "missingTotalEvents": "TotalInputEvents is missing from the workload of %s", - "sinceCompleted": "%s days since completed", - "earlyAnnounce": "Allowed to announce the output %s over %.2f by campaign requirement", - "overrideFraction": "Overriding fraction to %s for %s by %s", - "truncateRecovery": "Allowed to truncate recovery of %s over %.2f by campaign requirement", - "fractionDamping": "The passing fraction could be reduced to %s given it has been in for long", - "reducePass": "Reducing pass threshold by %.3f%% for long lasting workflow %s", - "truncatePass": "Truncating at pass threshold because of ACDC at rank %d", - } - - self.useMcM = False - self.exceptions = set() - self._setWfs() - # TODO: update JIRA tickets + self.overrideWfs = None + self.onHoldWfs = None + self.bypassWfs = None + self.forceCompleteWfs = None + + self.jiraClient = None + self.mcmClient = None except Exception as error: raise Exception(f"Error initializing 
Checkor\n{str(error)}") @@ -197,7 +151,11 @@ def _setWfs(self) -> None: self.overrideWfs = self._getWorkflowsByAction("force-complete", details=True) self.onHoldWfs = self._getWorkflowsByAction("hold") self.bypassWfs = self._getWorkflowsByAction("bypass") + unnestList(self.overrideWfs) - self.forceCompleteWfs = self.mcmClient.get("/restapi/requests/forcecomplete") if self.useMcM else {} + self.forceCompleteWfs = self.mcmClient.get("/restapi/requests/forcecomplete") if self.mcmClient is not None else {} + + if self.jiraClient is not None: + # TODO: update wfs by JIRA tickets / migrate JiraClient + pass def _checkPoint(self, label: str = "", subLap: bool = False, now: struct_time = gmtime()) -> None: """ @@ -206,11 +164,11 @@ def _checkPoint(self, label: str = "", subLap: bool = False, now: struct_time = :subLap: True if for sub lap, False o/w :now: time now """ - self.logger.info(self.logMsg["timeCheck"], label, asctime(now)) - self.logger.info(self.logMsg["sinceStart"], now - self.timePoint.get("start", now)) + self.logger.info("Time check (%s) point at: %s", label, asctime(now)) + self.logger.info("Since start: %s [s]", now - self.timePoint.get("start", now)) self.logger.info( - self.logMsg["lap"], + "%s: %s [s]", "Sub lap" if subLap else "Lap", now - self.timePoint.get("subLap" if subLap else "lap", now), ) @@ -238,27 +196,27 @@ def _getWorkflowsToCheck(self) -> list: awayWfs = self.session.query(Workflow).filter(Workflow.status == "away").all() assistanceWfs = self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all() - completedWfs = self.reqMgrReader.getWorkflowsByStatus("completed") + completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed") if self.options.get("strict"): - self.logger.info(self.logMsg["strict"]) + self.logger.info("Strict option is on: checking workflows that freshly completed") workflows.update(filter(lambda wf: wf.name in completedWfs, awayWfs)) if self.options.get("update"): - 
self.logger.info(self.logMsg["update"]) + self.logger.info("Update option is on: checking workflows that have not completed yet") workflows.update(filter(lambda wf: wf.name not in completedWfs, awayWfs)) if self.options.get("clear"): - self.logger.info(self.logMsg["clear"]) + self.logger.info("Clear option is on: checking workflows that are ready to toggle closed-out") workflows.update(filter(lambda wf: "custodial" in wf.status, assistanceWfs)) if self.options.get("review"): nonCustodialWfs = [*filter(lambda wf: "custodial" not in wf.status, assistanceWfs)] if self.options.get("recovering"): - self.logger.info(self.logMsg["reviewRecover"]) + self.logger.info("Review-recovering option is on: checking only the workflows that had been already acted on") workflows.update(filter(lambda wf: "manual" not in wf.status, nonCustodialWfs)) if self.options.get("manual"): - self.logger.info(self.logMsg["reviewManual"]) + self.logger.info("Review-manual option is on: checking the workflows to be acted on") workflows.update(filter(lambda wf: "manual" in wf.status, nonCustodialWfs)) return list(workflows) @@ -285,24 +243,29 @@ def _getWorkflowsByAction(self, action: str, details: bool = False) -> Union[lis ) for user, wf in userWfs.items(): if user not in allowedUsers: - self.logger.info(self.logMsg["notAllowed"], user, action) + self.logger.info("%s is not allowed to %s", user, action) continue - self.logger.info(self.logMsg["allowed"], user, action, wf) + self.logger.info("%s allowed to %s %s", user, action, wf) workflows[user].add(wf) if details: return workflows return unnestList(workflows) - def _filterMaxNumberOfWorkflows(self, workflows) -> list: + def _filterMaxNumberOfWorkflows(self, workflows: list) -> list: + """ + The function to filter the max number of workflows per round + :param workflows: list of workflows + :return: filtered list of workflows + """ maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor") if self.options.get("limit"): - 
self.logger.info(self.logMsg["limitOption"], self.options.get("limit")) + self.logger.info("Command line to limit workflows to %s", self.options.get("limit")) maxPerRound = self.options.get("limit") if maxPerRound and not self.specific: - self.logger.info(self.logMsg["limit"], maxPerRound) + self.logger.info("Limiting workflows to %s this round", maxPerRound) workflows = self._rankWorkflows(workflows) if self.option.get("update"): @@ -317,18 +280,26 @@ def _rankWorkflows(self, workflows: list) -> list: :param workflows: workflows :return: sorted workflows """ - completedWfs = self.reqMgrReader.getWorkflowsByStatus("completed", details=True) + completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed", details=True) completedWfs = sorted(completedWfs, key=lambda wf: wf.get("RequestPriority", 0)) completedWfs = [wf.get("RequestName") for wf in completedWfs] return sorted(workflows, key=lambda wf: completedWfs.index(wf) if wf in completedWfs else 0, reverse=True) def _updateWorkflowsRecords(self, wfsRecords: dict) -> None: + """ + The function to update the workflows records + :param wfsRecords: records + """ for wf in wfsRecords: if wf.get("record"): self.closeoutController.set(wf.get("wf"), wf.get("record")) - def _updateWorkflowsStatus(self, wfsStatus) -> None: + def _updateWorkflowsStatus(self, wfsStatus: list) -> None: + """ + The function to update the workflows status + :param wfsStatus: status + """ for wf in wfsStatus: if wf.get("newStatus"): newStatus = wf.get("newStatus") @@ -337,11 +308,16 @@ def _updateWorkflowsStatus(self, wfsStatus) -> None: if newStatus == "close": self.closeoutController.clean(wf.get("wf")) - if self.useMcM and wf.get("mcmForceComplete"): + if self.mcmClient is not None and wf.get("mcmForceComplete"): for prepId in wf.get("prepIds"): self.mcmClient.clean(f"/restapi/requests/forcecomplete/{prepId}") def _checkExecutionTime(self, nWfs: int, now: struct_time = mktime(gmtime())) -> None: + """ + The function to check the 
execution time of the module + :param nWfs: number of workflows + :param now: time now + """ if nWfs: avgExecutionTime = (now - self.timePoint.get("start")) / nWfs self.logger.info("Average time spend per workflow: %s s", avgExecutionTime) @@ -350,13 +326,20 @@ def _checkExecutionTime(self, nWfs: int, now: struct_time = mktime(gmtime())) -> self.logger.critical("Checkor took %s [s] per workflow", avgExecutionTime) def _countAssistanceWorkflowsByStatus(self) -> dict: + """ + The function to count the number of assistance workflows by status + :return: count of workflows by status + """ status = defaultdict(int) for wf in self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all(): status[wf.status] += 1 return status - def _writeSummary(self) -> str: + def _writeSummary(self) -> None: + """ + The function to write a summary of checkor + """ if not self.specificWf: msg = "" if self.options.get("strict"): @@ -382,7 +365,8 @@ def go(self) -> bool: servicesChecker = ServicesChecker(softServices=["mcm", "wtc"]) if not userLockChecker.isLocked() and servicesChecker.check(): - self.useMcM = servicesChecker.status.get("mcm") + self.mcmClient = McMClient() if servicesChecker.status.get("mcm") else None + self.jiraClient = None #JiraClient() if servicesChecker.status.get("jira") else None TODO: migrate JiraClient return True return False @@ -396,10 +380,12 @@ def run(self) -> None: The function to run checkor """ try: + self._setWfs() + wfsToCheck = self._filterBackfills(self._getWorkflowsToCheck()) random.shuffle(wfsToCheck) - self.logger.info(self.logMsg["nWfs"], len(wfsToCheck)) + self.logger.info("Considering %s workflows (before any limitation)", len(wfsToCheck)) wfsToCheck = self._filterMaxNumberOfWorkflows(wfsToCheck) self._check(wfsToCheck) @@ -426,7 +412,7 @@ def _checkWorkflow(self, wfsToCheck: list) -> dict: self._checkExecutionTime(len(wfsToCheck)) -class WorkflowCheckor: +class WorkflowCheckor(object): """ __WorkflowCheckor__ General API 
for checking a fiven workflow @@ -439,25 +425,29 @@ def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwarg self.now = mktime(gmtime()) self.wf = wfToCheck.name + self.wfController = WorkflowController(self.wf) + self.wfToCheck = wfToCheck - self.wfController = WorkflowController(wfToCheck.name) - self.wfsToCheck.wm_status = self.wfController.request.get("RequestStatus") + self.wfToCheck.wm_status = self.wfController.request.get("RequestStatus") self.checkor = kwargs.get("checkor") self.rucioReader = RucioReader() - self.existingAssistaceTags = set(wfToCheck.status.split("-")[1:]) - self.assistanceTags = set() - self.outputDatasetsToCheck = list() - self.acdcWfs = dict() - self.fractions = dict() + self.assistanceTags, self.existingAssistaceTags = set(), set(wfToCheck.status.split("-")[1:]) + + self.acdcs = dict() self.campaigns = dict() - self.eventsPerLumi = dict() + self.fractions = dict() self.percentCompletions, self.percentAvgCompletions = dict(),dict() + + self.eventsPerLumi = dict() + self.lumiUpperLimit = dict() self.expectedLumis, self.expectedEvents =dict(),dict() self.producedLumis, self.producedEvents = dict(),dict() + + self.outputDatasetsToCheck = list() self.passStatsCheck, self.passStatsCheckToAnnounce, self.passStatsCheckOverCompletion = dict(), dict(), dict() - self.lumiUpperLimit = dict() + self.rucioPresence = dict() self.dbsPresence, self.dbsInvalid = dict(), dict() @@ -486,6 +476,7 @@ def _skipWorkflow(self) -> bool: ): self.logger.info("Skipping workflow %s", self.wf) return True + return False def _setWorkflowToClose(self) -> bool: @@ -494,9 +485,10 @@ def _setWorkflowToClose(self) -> bool: :return: True if workflow should be closed, False o/w """ if self.wfToCheck.wm_status in ["closed-out", "announced"] and self.wf not in self.checkor.exceptions: - self.logger.info(self.logMsg["setClose"], self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is already %s, setting as close", self.wf, self.wfToCheck.wm_status) 
self.newStatus = "close" return True + return False def _setWorkflowToForget(self) -> bool: @@ -512,9 +504,10 @@ def _setWorkflowToForget(self) -> bool: "rejected-archived", "aborted-completed", ]: - self.logger.info(self.logMsg["setForget"], self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, but will not be set in trouble to find a replacement", self.wf, self.wfToCheck.wm_status) self.newStatus = "forget" return True + return False def _setWorkflowToTrouble(self) -> bool: @@ -530,21 +523,21 @@ def _setWorkflowToTrouble(self) -> bool: "rejected-archived", "aborted-completed", ]: - self.logger.info(self.logMsg["setTrouble"], self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting as trouble", self.wf, self.wfToCheck.wm_status) self.newStatus = "trouble" return True + return False def _setBypassChecks(self) -> None: """ - The function to check if bypass checks for a given workflow. - :param wf: workflow name - :return: True if bypass checks, False o/w + The function to check if bypass checks for a given workflow """ for bypassWf in self.checkor.bypassWfs: if bypassWf == self.wf: self.logger.info(self.logMsg["bypassCheck"], self.wf, bypassWf) self.bypassChecks = True + self.bypassChecks = False def _setBypassChecksByMcMForceComplete(self, prepIds: list) -> None: @@ -554,8 +547,9 @@ def _setBypassChecksByMcMForceComplete(self, prepIds: list) -> None: """ for forceCompleteWf in self.checkor.forceCompleteWfs: if forceCompleteWf in prepIds: - self.logger.info(self.logMsg["bypassForceComplete"], self.wf, "prepid", forceCompleteWf) + self.logger.info("Bypassing checks and force completing %s because of prep id %s", self.wf, forceCompleteWf) self.bypassChecksByMcMForceComplete = True + self.bypassChecksByMcMForceComplete = False def _setBypassChecksByUserForceComplete(self) -> None: @@ -566,9 +560,10 @@ def _setBypassChecksByUserForceComplete(self) -> None: forceCompleteWfs = [overrideWf for overrideWf in userOverrideWfs if overrideWf in 
self.wf] if forceCompleteWfs: self.logger.info( - self.logMsg["bypassForceComplete"], self.wf, "user/keyword", f"{user}/{forceCompleteWfs[0]}" + "Bypassing checks and force completing %s because of user/keyword %s/%s", self.wf, user, forceCompleteWfs[0] ) self.bypassChecksByUserForceComplete = True + self.bypassChecksByUserForceComplete = False def _isWorkflowOnHold(self) -> bool: @@ -605,19 +600,19 @@ def _setNewOnHoldStatus(self, family: list) -> bool: onHoldTimeout = self.unifiedConfiguration.get("onhold_timeout") minFamilyCompletedDelay = self._getMinFamilyCompletedDelay(family, self.now) - self.logger.info(self.logMsg["onHoldSince"], minFamilyCompletedDelay, onHoldTimeout) + self.logger.info("On hold since %s, timeout at %s", minFamilyCompletedDelay, onHoldTimeout) if onHoldTimeout > 0 and onHoldTimeout < minFamilyCompletedDelay: - self.logger.info(self.logMsg["bypassOnHold"], self.wf, minFamilyCompletedDelay) + self.logger.info("%s is on hold and stopped for %.2f days, letting this through with current statistics", self.wf, minFamilyCompletedDelay) self.bypassChecks = True return False if "-onhold" in self.wfToCheck.wm_status and self.wf in self.checkor.onHoldWfs and not self.bypassChecks: - self.logger.info(self.logMsg["onHold"], self.wf) + self.logger.info("%s is on hold", self.wf) return True if self.wf in self.checkor.onHoldWfs and not self.bypassChecks: - self.logger.info(self.logMsg["setOnHold"], self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting as on hold", self.wf, self.wfToCheck.wm_status) self.newStatus = "assistance-onhold" return True @@ -636,6 +631,10 @@ def _updateWorkflowStatus(self) -> bool: return False def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]: + """ + The function yo get the tiers with no check and custodial + :return: list of tiers with no check, and list of tiers with no custodial + """ tiersWithNoCheck = set(self.unifiedConfiguration.get("tiers_with_no_check")) tiersWithNoCustodial = set( 
self.unifiedConfiguration.get("tiers_with_no_custodial") if not self.wfController.request.isRelVal() else [] @@ -656,6 +655,9 @@ def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]: return tiersWithNoCheck, tiersWithNoCustodial def _setOutputDatasetsToCheck(self) -> None: + """ + The function to set the datasets to check + """ tiersWithNoCheck, tiersWithNoCustodial = self._getTiersWithNoCheckAndCustodial(self.wfController) expectedOutputsDatasets = self.wfController.request.get("OutputDatasets") @@ -665,8 +667,8 @@ def _setOutputDatasetsToCheck(self) -> None: if all([dataset.split("/")[-1] != tier for tier in tiersWithNoCheck]) ] - self.wfController.logger.info( - self.logMsg["checkSummary"], + self.wfController.logger.debug( + "Initial outputs: %s\nWill check on: %s\nTiers out: %s\nTiers no custodials: %s", ", ".join(sorted(expectedOutputsDatasets)), ", ".join(sorted(self.outputDatasetsToCheck)), ", ".join(sorted(tiersWithNoCheck)), @@ -674,6 +676,11 @@ def _setOutputDatasetsToCheck(self) -> None: ) def _skipFamilyWorkflow(self, wfSchema: dict) -> bool: + """ + The function to check if the workflow family should be skipped or not + :param wfSchema: workflow schema + :return: True if to skip, False o/w + """ return ( wfSchema.get("RequestType") != "Ressubmission" or wfSchema.get("RequestDate") < wfSchema.request.get("RequestDate") @@ -683,8 +690,12 @@ def _skipFamilyWorkflow(self, wfSchema: dict) -> bool: def _getWorkflowFamily(self) -> list: + """ + The function to get the workflow family + :return: workflow family + """ family = [] - for member in self.reqmgrReader.getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True): + for member in self.reqMgr["reader"].getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True): if ( member.get("RequestName") == self.wf or self._skipFamilyWorkflow(member) @@ -697,8 +708,11 @@ def _getWorkflowFamily(self) -> list: return family def _setBadAcdcs(self) -> None: + """ + The 
function to set the bad acdcs + """ badAcdcs = [] - for member in self.reqmgrReader.getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True): + for member in self.reqMgr["reader"].getWorkflowsByPrepId(self.wfController.request.get("PrepID"), details=True): if member.get("RequestName") == self.wf or self._skipFamilyWorkflow(member): continue @@ -710,13 +724,17 @@ def _setBadAcdcs(self) -> None: "aborted-archived", ]: badAcdcs.append(member.get("RequestName")) - self.wfController.logger.info(self.logMsg["inconsistentAcdc"], member.get("RequestName")) + self.wfController.logger.info("Inconsistent ACDC %s", member.get("RequestName")) - self.acdcWfs["bad"] = badAcdcs + self.acdcs["bad"] = badAcdcs if badAcdcs: self.assistanceTags.update(["manual", "assistance"]) def _setHealthyAcdcs(self, family: list) -> None: + """ + The function to set healthy acdcs + :param family: workflow family + """ healthyAcdcs = [] for member in family: if member.get("RequestStatus") in [ @@ -727,14 +745,18 @@ def _setHealthyAcdcs(self, family: list) -> None: "staging", "staged", ]: - self.logger.info(self.logMsg["acdc"], member.get("RequestName")) + self.logger.info("%s still has an ACDC running", member.get("RequestName")) healthyAcdcs.append(member.get("RequestName")) - self.acdcWfs["healthy"] = healthyAcdcs + self.acdcs["healthy"] = healthyAcdcs if healthyAcdcs: self.assistanceTags.add("recovering") def _setInactiveAcdcs(self, family: list) -> None: + """ + The function to set the inactive acdcs + :param family: workflow family + """ inactiveAcdcs = [] for member in family: if member.get("RequestStatus") not in [ @@ -748,28 +770,39 @@ def _setInactiveAcdcs(self, family: list) -> None: ]: inactiveAcdcs.append(member.get("RequestName")) - self.acdcWfs["inactive"] = inactiveAcdcs + self.acdcs["inactive"] = inactiveAcdcs if inactiveAcdcs: self.assistanceTags.add("recovered") - def _setFailedAcdcs(self, family) -> None: + def _setFailedAcdcs(self, family: list) -> None: + """ 
+ The function to set the failed acdcs + :param family: workflow family + """ failedAcdcs = [] for member in family: if member.get("RequestStatus") in ["failed"]: failedAcdcs.append(member.get("RequestName")) - self.acdcWfs["failed"] = failedAcdcs + self.acdcs["failed"] = failedAcdcs def _setAcdcsOrder(self, family: list) -> None: + """ + The function to set the acdcs order + :param family: workflow family + """ order = -1 for member in family: memberOrder = sorted(filter(re.compile(f"^ACDC\d+$").search, member.get("RequestName"))) if memberOrder: order = max(order, int(memberOrder[-1].split("ACDC")[1])) - self.acdcWfs["order"] = order + self.acdcs["order"] = order def _checkWorkflowFamily(self) -> None: + """ + The function to check the workflow family + """ family = self._getWorkflowFamily() self._setBadAcdcs() @@ -778,15 +811,15 @@ def _checkWorkflowFamily(self) -> None: self._setFailedAcdcs(family) self._setAcdcsOrder(family) - if (self.bypassChecksByMcMForceComplete or self.bypassChecksByUserForceComplete) and len(self.acdcWfs.get("healthy")): - self.wfController.logger.info(self.logMsg["forceRecovering"], self.wf) - self.wfController.logger.critical(self.logMsg["forceComplete"], self.wf) + if (self.bypassChecksByMcMForceComplete or self.bypassChecksByUserForceComplete) and len(self.acdcs.get("healthy")): + self.wfController.logger.info("%s is being force completed while recovering", self.wf) + self.wfController.logger.critical("The workflow %s was force completed", self.wf) WorkflowStatusEnforcer(self.wf).forceComplete() - if self.acdcWfs.get("failed"): - self.logger.critical(self.logMsg["failedAcdc"], self.wf, ", ".join(self.acdcWfs.get("failed"))) - if self.acdcWfs.get("bad"): - self.logger.critical(self.logMsg["badAcdc"], self.wf, ", ".join(self.acdcWfs.get("bad"))) + if self.acdcs.get("failed"): + self.logger.critical("For %s, ACDC %s failed", self.wf, ", ".join(self.acdcs.get("failed"))) + if self.acdcs.get("bad"): + self.logger.critical("For %s, ACDC %s 
is inconsistent, preventing from closing or will create a mess.", self.wf, ", ".join(self.acdcs.get("bad"))) def _getWorkflowCompletedDelay(self) -> float: """ @@ -796,11 +829,15 @@ def _getWorkflowCompletedDelay(self) -> float: completed = [*filter(lambda wf: wf.get("Status") == "completed", self.wfController.request.get("RequestTransition"))] delay = (self.now - completed[-1].get("UpdateTime", self.now)) / 86400.0 if completed else 0 - self.logger.info(self.logMsg["sinceCompleted"], delay) + self.logger.info("%s days since completed", delay) return delay def _getFractionDumping(self) -> float: + """ + The function to compute the fraction dumping + :return: fraction dumping + """ wfCompletedDelay = self._getWorkflowCompletedDelay() fractionDamping = min( 0.01 @@ -811,11 +848,14 @@ def _getFractionDumping(self) -> float: self.unifiedConfiguration.get("damping_fraction_pass_max") / 100.0, ) - self.logger.info(self.logMsg["fractionDamping"], fractionDamping) + self.logger.info("The passing fraction could be reduced to %s given it has been in for long", fractionDamping) return fractionDamping def _setWorkflowCampaigns(self) -> None: + """ + The function to set the campaigns + """ campaigns = {} wfCampaigns = self.wfController.request.getCampaigns(details=False) if len(wfCampaigns) == 1: @@ -824,10 +864,13 @@ def _setWorkflowCampaigns(self) -> None: else: campaigns = self.wfController.getCampaignsFromOutputDatasets() - self.logger.info(self.logMsg["campaigns"], campaigns) + self.logger.info("Campaigns: %s", campaigns) self.campaigns = campaigns def _setDatasetsFractionsToAnnounce(self) -> None: + """ + The function to set the dataset fractions to announce + """ fractionsAnnounce = {} for dataset in self.outputDatasetsToCheck: fractionsAnnounce[dataset] = 1.0 @@ -835,6 +878,9 @@ def _setDatasetsFractionsToAnnounce(self) -> None: self.fractions["announce"] = fractionsAnnounce def _setDatasetsFractionsToPass(self) -> None: + """ + The function to set the dataset fractions 
to pass + """ fractionsPass = {} defaultPass = self.unifiedConfiguration.get("default_fraction_pass") @@ -846,7 +892,7 @@ def _setDatasetsFractionsToPass(self) -> None: if self.options.get("fractionPass"): fractionsPass[dataset] = self.options.get("fractionPass") self.wfController.logger.info( - self.logMsg["overrideFraction"], fractionsPass[dataset], dataset, "command line" + "Overriding fraction to %s for %s by command line", fractionsPass[dataset], dataset ) elif self.campaignController.campaigns.get(campaign, {}).get("fractionpass"): fractionPass = self.campaignController.campaigns.get(campaign).get("fractionpass") @@ -868,7 +914,7 @@ def _setDatasetsFractionsToPass(self) -> None: else: fractionsPass[dataset] = fractionPass self.wfController.logger.info( - self.logMsg["overrideFraction"], fractionsPass[dataset], dataset, "campaign requirement" + "Overriding fraction to %s for %s by campaign requirement", fractionsPass[dataset], dataset ) else: fractionsPass[dataset] = defaultPass @@ -876,11 +922,14 @@ def _setDatasetsFractionsToPass(self) -> None: for key, passValue in self.unifiedConfiguration.get("pattern_fraction_pass").items(): if key in dataset: fractionsPass[dataset] = passValue - self.wfController.logger.info(self.logMsg["overrideFraction"], passValue, dataset, "dataset key") + self.wfController.logger.info("Overriding fraction to %s for %s by dataset key", passValue, dataset) self.fractions["pass"] = fractionsPass def _setDatasetsFractionsToTruncateRecovery(self) -> None: + """ + The function to set the dataset fractions to truncate recovery + """ fractionsTruncateRecovery = {} weightFull = 7.0 @@ -901,7 +950,7 @@ def _setDatasetsFractionsToTruncateRecovery(self) -> None: "truncaterecovery" ) self.wfController.logger.info( - self.logMsg["truncateRecovery"], + "Allowed to truncate recovery of %s over %.2f by campaign requirement", dataset, self.campaignController.campaigns.get(campaign).get("truncaterecovery"), ) @@ -912,6 +961,9 @@ def 
_setDatasetsFractionsToTruncateRecovery(self) -> None: self.fractions["truncate"] = fractionsTruncateRecovery def _setStatisticsThresholds(self) -> None: + """ + The function to set the statistics thresholds + """ self._setDatasetsFractionsToAnnounce() self._setDatasetsFractionsToPass() self._setDatasetsFractionsToTruncateRecovery() @@ -923,12 +975,15 @@ def _setStatisticsThresholds(self) -> None: self.fractions["truncate"][dataset] -= fractionDamping if self.acdsWfs.get("order") > self.unifiedConfiguration.get("acdc_rank_for_truncate"): - self.wfController.logger.info(self.logMsg["truncatePass"], self.acdsWfs.get("order")) + self.wfController.logger.info("Truncating at pass threshold because of ACDC at rank %d", self.acdcs.get("order")) self.fractions["truncate"][dataset] = self.fractions["pass"][dataset] self._updateFractionsToPassAndToTruncateRecovery() def _updateFractionsToPassAndToTruncateRecovery(self) -> None: + """ + The function to update the fractions to pass and to truncate recovery + """ family = dict([(dataset, self.dbs["reader"].getDatasetParent(dataset)) for dataset in self.fractions["pass"]]) for dataset, value in self.fractions["pass"].items(): @@ -952,27 +1007,40 @@ def _updateFractionsToPassAndToTruncateRecovery(self) -> None: ) def _getExpectedEvents(self) -> float: + """ + The function to get the expected events + :return: number of expected events + """ if self.wfController.request.get("RequestType") in ["TaskChain", "StepChain"]: return self.wfController.getRequestNumEvents() expectedEvents = self.wfController.request.get("TotalInputEvents") if expectedEvents is None: - self.wfController.logger.critical(self.logMsg["missingTotalEvents"], wf) + self.wfController.logger.critical("TotalInputEvents is missing from the workload of %s", self.wf) return 0 + return expectedEvents - def _getTaskOutputDatasets(self) -> dict: - taskOutputDatasets = {} + def _getTasksByOutputDataset(self) -> dict: + """ + The function to get the task by output 
datasets + :return: tasks by output datasets + """ + tasksByDataset = {} for task, outputs in self.wfController.getOutputDatasetsPerTask().items(): for output in outputs: - taskOutputDatasets[output] = self.wfController.request.get(task, {}).get("TaskName", task) - return taskOutputDatasets + tasksByDataset[output] = self.wfController.request.get(task, {}).get("TaskName", task) + + return tasksByDataset def _checkCompletionStatistics(self) -> None: + """ + The function to check the completion statistics + """ lumisExpected = self.wfController.request.get("TotalInputLumis") eventsExpected = self._getExpectedEvents() eventsExpectedPerTask = self.wfController.request.getExpectedEventsPerTask() - taskOutputs = self._getTaskOutputDatasets() + taskOutputs = self._getTasksByOutputDataset() for dataset in self.outputDatasetsToCheck: events, lumis = self.dbs["reader"].getDatasetEventsAndLumis(dataset) @@ -1000,6 +1068,9 @@ def _checkCompletionStatistics(self) -> None: ) def _setPassStatisticsCheck(self) -> None: + """ + The function to set pass statistics + """ self.passStatsCheck = dict( [ (dataset, self.bypassChecks or self.percentCompletions[dataset] >= passValue) @@ -1008,6 +1079,9 @@ def _setPassStatisticsCheck(self) -> None: ) def _setPassStatisticsCheckToAnnounce(self) -> None: + """ + The function to set the pass statistics to announce + """ self.passStatsCheckToAnnounce = dict( [ (dataset, self.percentAvgCompletions[dataset] >= passValue) for dataset, passValue in self.fractions["pass"].items() @@ -1015,11 +1089,17 @@ def _setPassStatisticsCheckToAnnounce(self) -> None: ) def _setPassStatisticsCheckOverCompletion(self) -> None: + """ + The function to set the pass statistics to over complete + """ defaultFractionOverdoing = self.unifiedConfiguration.get("default_fraction_overdoing") self.passStatsCheckOverCompletion = dict([(dataset, value >= defaultFractionOverdoing) for dataset, value in self.percentCompletions.items()]) def _checkAvgCompletionStatistics(self) -> 
None: + """ + The function to check the average completion statistics + """ percentAvgCompletions = {} _, primaries, _, _ = self.wfController.request.getIO() @@ -1059,37 +1139,50 @@ def _checkAvgCompletionStatistics(self) -> None: self.percentAvgCompletions = percentAvgCompletions def _getAnnounceAssistanceTags(self) -> list: + """ + The function to get announce assistance tags + :return: announce assistance tags + """ if self.passStatsCheckToAnnounce and all(self.passStatsCheckToAnnounce.values()): self.wfController.logger.info( "The output of this workflow are essentially good to be announced while we work on the rest" ) return ["announced" if "announced" in self.wfToCheck.status else "announce"] + return [] def _getRecoveryAssistanceTags(self) -> list: + """ + The function to get recovery assistance tags + :return: recovery assistance tags + """ if not all(self.passStatsCheck.values()): possibleRecoveries = self.acdcReader.getRecoveryDocs() - if possibleRecoveries: - self.wfController.logger.info( - "The workflow has missing statistics, but nothing is recoverable. Passing through to announcement" - ) - bypassChecks = True - else: - self.wfController.logger.info( - "The workflow is not completed, but nothing is recoverable. Passing through to announcement" + if not possibleRecoveries: + self.logger.info( + "The workflow is not completed/has missing statistics, but nothing is recoverable. 
Passing through to announcement" ) + self.bypassChecks = True - if not bypassChecks: + if not self.bypassChecks: return ["recovery" if self.unifiedConfiguration.get("use_recoveror") else "manual"] return [] def _passOver100(self) -> bool: + """ + The function to check if passing over 100 + :return: True if pass over 100, False o/w + """ lhe, primaries, _, _ = self.wfController.request.getIO() return False if (lhe or primaries) else True def _forceCompleteWorkflow(self) -> bool: - if self.acdcWfs.get("healthy") and all(self.passStatsCheck.values()) and all(self.passStatsCheckToAnnounce.values()): + """ + The function to check if workflow should be force completed or not + :return: True to force complete, False o/w + """ + if self.acdcs.get("healthy") and all(self.passStatsCheck.values()) and all(self.passStatsCheckToAnnounce.values()): self.logger.info("This is essentially good to truncate, setting to force-complete") return True @@ -1097,6 +1190,9 @@ def _forceCompleteWorkflow(self) -> bool: def _checkOutputSize(self,) -> None: + """ + The function check the output size + """ self.assistanceTags += set(self._getAnnounceAssistanceTags()) recoveryAssistanceTags = self._getRecoveryAssistanceTags() @@ -1111,6 +1207,10 @@ def _checkOutputSize(self,) -> None: WorkflowStatusEnforcer(self.wf).forceComplete() def _hasSmallLumis(self) -> bool: + """ + The function to check if the workflow has small lumi sections + :return: True if it has small lumis, False o/w + """ lumiLowerLimit = self.unifiedConfiguration.get("min_events_per_lumi_output") _, primaries, _, _ = self.wfController.request.getIO() @@ -1131,6 +1231,10 @@ def _hasSmallLumis(self) -> bool: return False def _hasBigLumis(self) -> bool: + """ + The function to check if the workflow has big lumi sections + :return: True if it has big lumis, False o/w + """ if any( [ self.lumiUpperLimit[dataset] > 0 and self.eventsPerLumi[dataset] >= self.lumiUpperLimit[dataset] @@ -1144,6 +1248,9 @@ def _hasBigLumis(self) -> bool: 
def _setLumiUpperLimit(self) -> None: + """ + The function to set the lumi sections upper limit + """ lumiUpperLimit = {} campaigns = self.campaigns or self._getWorkflowCampaigns() @@ -1155,8 +1262,8 @@ def _setLumiUpperLimit(self) -> None: elif self.options.get("lumisize"): upperLimit = self.options.get("lumisize") self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) - elif self.closeoutController.campaigns.get(campaign, {}).get("lumisize"): - upperLimit = self.closeoutController.campaigns.get(campaign, {}).get("lumisize") + elif self.campaignController.campaigns.get(campaign, {}).get("lumisize"): + upperLimit = self.campaignController.campaigns.get(campaign, {}).get("lumisize") self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) else: upperLimit = 1001 @@ -1168,6 +1275,9 @@ def _setLumiUpperLimit(self) -> None: def _checkLumiSize(self) -> None: + """ + The function to check the lumi sections sizes + """ if self._hasSmallLumis(): self.assistanceTags.add("smalllumi") self.isClosing = False @@ -1177,7 +1287,10 @@ def _checkLumiSize(self) -> None: self.assistanceTags.add("biglumi") self.isClosing = False - def _checkRucioCounts(self) -> None: + def _checkRucioFileCounts(self) -> None: + """ + The function to check the number of files in Rucio + """ rucioPresence = {} for dataset in self.wfController.request.get("OutputDatasets"): @@ -1197,16 +1310,22 @@ def _checkRucioCounts(self) -> None: self.rucioPresence = rucioPresence - def _checkDBSCounts(self) -> None: + def _checkDBSFileCounts(self) -> None: + """ + The function to check the number of files in DBS + """ dbsPresence, dbsInvalid = {}, {} for dataset in self.wfController.request.get("OutputDatasets"): - #TODO: migrate dbs3Client.getFileCountDataset dbsPresence[dataset] = self.checkor.dbs["reader"].countDatasetFiles(dataset) dbsInvalid[dataset] = self.checkor.dbs["reader"].countDatasetFiles(dataset, onlyInvalid=True) self.dbsPresence, 
self.dbsInvalid = dbsPresence, dbsInvalid def _hasFileMismatch(self) -> bool: + """ + The function to check if there is any file mismatch + :return: if number of files in DBS in different from Rucio + """ if not self.options.get("ignoreFiles") and not all( [self.dbsPresence[dataset] == self.dbsInvalid[dataset] + self.rucioPresence[dataset] for dataset in self.outputDatasetsToCheck] ): @@ -1215,7 +1334,10 @@ def _hasFileMismatch(self) -> bool: return False - def _checkFileCounts(self) -> list: + def _checkFileCounts(self) -> None: + """ + The function to check the number of files + """ showOnlyN = 10 for dataset in self.dbsPresence: @@ -1246,7 +1368,6 @@ def _checkFileCounts(self) -> list: showOnlyN, "\n".join(wereInvalidated[:showOnlyN]), ) - #TODO: migrate dbs3Client.setFileStatus self.dbs["writer"].setFileStatus(wereInvalidated, validate=False) if missingDBSFiles: @@ -1270,7 +1391,10 @@ def _checkFileCounts(self) -> list: self.isClosing = False - def _checkInvalidations(self) -> list: + def _checkInvalidations(self) -> None: + """ + The function to check the invalidations + """ fractionInvalid = 0.2 if not self.options.get("ignoreinvalid") and not all( [ @@ -1282,7 +1406,10 @@ def _checkInvalidations(self) -> list: self.assistanceTags.add("invalidfiles") def _setRecord(self) -> None: - putRecord = { + """ + The function to check set the record + """ + wfRecord = { "datasets": {}, "name": self.wf, "closeOutWorkflow": self.isClosing, @@ -1290,13 +1417,13 @@ def _setRecord(self) -> None: "prepid": self.wfController.request.get("PrepId"), } for dataset in self.outputDatasetsToCheck: - record = putRecord["datasets"].get(dataset, {}) + record = wfRecord["datasets"].get(dataset, {}) record["expectedL"] = self.expectedLumis[dataset] record["expectedN"] = self.expectedEvents[dataset] record["producedL"] = self.producedLumis[dataset] record["producedN"] = self.producedEvents[dataset] - record["percentage"] = math.round(self.percentCompletions[dataset], 2) - 
record["fractionpass"] = math.round(self.fractions["pass"][dataset], 2) + record["percentage"] = round(self.percentCompletions[dataset], 2) + record["fractionpass"] = round(self.fractions["pass"][dataset], 2) record["duplicate"] = "N/A" record["closeOutDataset"] = self.isClosing record["correctLumis"] = ( @@ -1307,18 +1434,21 @@ def _setRecord(self) -> None: record["rucioFiles"] = set(RucioReader().getDatasetFileNames(dataset)) record[ "acdc" - ] = f"{len(self.acdcWfs.get('healthy', []))} / {len(self.acdcWfs.get('healthy', []) + self.acdcWfs.get('inactive', []))}" + ] = f"{len(self.acdcs.get('healthy', []))} / {len(self.acdcs.get('healthy', []) + self.acdcs.get('inactive', []))}" record["family"] = self._getWorkflowFamily(self.wf, self.wfController) now = gmtime() record["timestamp"] = mktime(now) record["updated"] = asctime(now) + " (GMT)" - putRecord["datasets"][dataset] = record + wfRecord["datasets"][dataset] = record - self.record = record + self.record = wfRecord def _closeWorkflow(self) -> None: + """ + The function to close the workflow + """ self.wfController.logger.info("Setting %s as closed-out", self.wf) if self.wfToCheck.status in ["closed-out", "announced", "normal-archived"]: @@ -1330,9 +1460,8 @@ def _closeWorkflow(self) -> None: self.newStatus = "close" return - # TODO: migrate reqMgrClient.closeOutWorkflowCascade - response = self.checkor.ReqMgrReader.closeoutWorkflow(self.wf) - if response is None: + response = self.checkor.reqMgr["writer"].closeoutWorkflow(self.wf, cascade=True) + if response: self.newStatus = "close" return @@ -1340,6 +1469,9 @@ def _closeWorkflow(self) -> None: def _checkAssistanceTags(self) -> None: + """ + The function to check the assistance tags + """ self.logger.info("%s was tagged with: %s", self.wf, self.assistanceTags) if "recovering" in self.assistanceTags: self.assistanceTags -= set(["recovery", "filemismatch", "manual"]) @@ -1348,7 +1480,7 @@ def _checkAssistanceTags(self) -> None: self.assistanceTags.add("manual") 
if "recovery" in self.assistanceTags and "manual" in self.assistanceTags: self.assistanceTags -= set(["recovery"]) - if "custodial" in assistanceTags: + if "custodial" in self.assistanceTags: self.assistanceTags -= set(["announce", "announced"]) if any([tag in self.assistanceTags for tag in ["duplicates", "filemismatch", "agentfilemismatch"]]): self.assistanceTags -= set(["announce"]) @@ -1356,15 +1488,18 @@ def _checkAssistanceTags(self) -> None: self.logger.info("%s needs assistance with: %s", self.wf, self.assistanceTags) self.logger.info("%s has existing conditions: %s", self.wf, self.existingAssistaceTags) - def _warnRequestor(self) -> bool: + def _warnRequestor(self) -> None: + """ + The function to warn the requestor about the workflow progress + """ if self.assistanceTags and "manual" not in self.existingAssistaceTags and self.existingAssistaceTags != self.assistanceTags and any(tag in self.assistanceTags for tag in ["recovery", "biglumi"]): msg = "The request PREPID (WORKFLOW) is facing issue in production.\n" if "recovery" in self.assistanceTags: - msg += f"Samples completed with missing statistics\n{'\n'.join([f'{round(self.percentCompletions[dataset]*100, 2)}%% complete for {dataset}' for dataset in self.outputDatasetsToCheck ])}\nhttps://cmsweb.cern.ch/report/{wf}\n" + msg += f"Samples completed with missing statistics\n{'\n'.join([f'{round(self.percentCompletions[dataset]*100, 2)}%% complete for {dataset}' for dataset in self.outputDatasetsToCheck ])}\nhttps://cmsweb.cern.ch/report/{self.wf}\n" if "biglumi" in self.assistanceTags: - msg += f"Samples completed with large luminosity blocks:\n{'\n'.join([f'{self.eventsPerLumi[dataset]} > {self.lumiUpperLimit[dataset]} for {dataset}' for dataset in self.outputDatasetsToCheck])}\nhttps://cmsweb.cern.ch/reqmgr/view/splitting/{wf}\n" + msg += f"Samples completed with large luminosity blocks:\n{'\n'.join([f'{self.eventsPerLumi[dataset]} > {self.lumiUpperLimit[dataset]} for {dataset}' for dataset in 
self.outputDatasetsToCheck])}\nhttps://cmsweb.cern.ch/reqmgr/view/splitting/{self.wf}\n" msg += "You are invited to check, while this is being taken care of by Comp-Ops.\n" msg += "This is an automated message from Comp-Ops.\n" @@ -1372,18 +1507,27 @@ def _warnRequestor(self) -> bool: self.wfController.logger.critical(msg) def _getAssistanceStatus(self) -> str: + """ + The function to get the assistance status from the tags + """ if self.assistanceTags: return "assistance-" + "-".join(sorted(self.assistanceTags)) return "assistance" def _setWorkflowToAssistance(self) -> None: + """ + The function to set the workflow to assistance + """ assistanceStatus = self._getAssistanceStatus() if "manual" not in self.wfToCheck.status or assistanceStatus != "assistance-recovery": self.newStatus = assistanceStatus def check(self) -> dict: + """ + The function to check the workflow + """ try: - self.checkor._checkPoint(self.checkor.logMsg["startCheck"].format(self.wf)) + self.checkor._checkPoint(f"Starting checkor with {self.wf}") if self._skipWorkflow() or self._updateWorkflowStatus(): return self._writeResponse() self.checkor._checkPoint("Checked workflow status", subLap=True) @@ -1410,10 +1554,10 @@ def check(self) -> dict: self._checkLumiSize() self.checkor._checkPoint("Checked lumi size", subLap=True) - self._checkRucioCounts() + self._checkRucioFileCounts() self.checkor_checkPoint("Checked Rucio count", subLap=True) - self._checkDBSCounts() + self._checkDBSFileCounts() self.checkor._checkPoint("DBS file count", subLap=True) if self._hasFileMismatch() and "recovering" not in self.assistanceTags: @@ -1425,8 +1569,6 @@ def check(self) -> dict: self.checkor._checkPoint(f"Done with {self.wf}") self._setRecord() - - ## TODO: lumi summary if re reco if self.isClosing: self._closeWorkflow() @@ -1435,17 +1577,20 @@ def check(self) -> dict: self._warnRequestor() self._setWorkflowToAssistance() - ## TODO: update jira + ## TODO: update JIRA tickets / migrate JiraClient return 
self._writeResponse() except Exception as error: - self.logger.error(self.logMsg["failedCheck"], self.wf) + self.logger.error("Failed on checking %s", self.wf) self.logger.error(str(error)) self.failed = True return self._writeResponse() def _writeResponse(self) -> dict: + """ + The function to write the check response + """ response = { "workflow": self.wfToCheck, "wf": self.wf, From 5fa27823c400efae1ce17f00bea3e27a20b643fe Mon Sep 17 00:00:00 2001 From: Amanda Goldani Date: Wed, 5 Jan 2022 04:42:20 +0100 Subject: [PATCH 05/17] Add dry-run flag and fix some issues --- src/python/Unified/Checkor.py | 446 +++++++++++++ .../python/Unified/Helpers/WorkflowCheckor.py | 628 ++++-------------- .../python/Unified/Helpers}/__init__.py | 0 {unified => src/python/Unified}/Injector.py | 0 .../python/Unified}/Invalidator.py | 0 {unified => src/python/Unified}/Rejector.py | 0 .../python/Unified/__init__.py | 0 7 files changed, 579 insertions(+), 495 deletions(-) create mode 100644 src/python/Unified/Checkor.py rename unified/Checkor.py => src/python/Unified/Helpers/WorkflowCheckor.py (65%) rename {unified => src/python/Unified/Helpers}/__init__.py (100%) rename {unified => src/python/Unified}/Injector.py (100%) rename {unified => src/python/Unified}/Invalidator.py (100%) rename {unified => src/python/Unified}/Rejector.py (100%) rename unified/.gitkeep => src/python/Unified/__init__.py (100%) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py new file mode 100644 index 000000000..eefc5a5e7 --- /dev/null +++ b/src/python/Unified/Checkor.py @@ -0,0 +1,446 @@ +import optparse +import random +from logging import Logger +from collections import defaultdict +from time import mktime, asctime, gmtime, struct_time + +from Utilities.DataTools import unnestList +from Utilities.Decorators import runWithMultiThreading +from Utilities.Logging import getLogger +from Utilities.ConfigurationHandler import ConfigurationHandler +from Cache.DataCacheLoader import 
DataCacheLoader +from Databases.Oracle.OracleClient import OracleClient +from Databases.Oracle.OracleDB import Workflow +from MongoControllers.CampaignController import CampaignController +from MongoControllers.CloseoutController import CloseoutController +from MongoControllers.WTCController import WTCController +from Services.ServicesChecker import ServicesChecker +from Services.McM.McMClient import McMClient +from Services.ReqMgr.ReqMgrReader import ReqMgrReader +from Services.ReqMgr.ReqMgrWriter import ReqMgrWriter +from Services.ACDC.ACDCReader import ACDCReader +from Services.DBS.DBSReader import DBSReader +from Services.DBS.DBSWriter import DBSWriter +from WorkflowMgmt.SiteController import SiteController +from WorkflowMgmt.UserLockChecker import UserLockChecker + +from Unified.Helpers.WorkflowCheckor import WorkflowCheckor + +from typing import Optional, Tuple, Union + + +class Checkor(OracleClient): + """ + __Checkor__ + General API for checking workflows in completed status + """ + + def __init__(self, logger: Optional[Logger] = None, **kwargs) -> None: + try: + super().__init__(self) + self.logger = logger or getLogger(self.__class__.__name__) + + self.options, self.specificWf = kwargs.get("options"), kwargs.get("specificWf") + if self.options is None: + self.options, self.specificWf = self.parseOptions() + + now = mktime(gmtime()) + self.timePoint = {"lap": now, "subLap": now, "start": now} + + self.unifiedConfiguration = ConfigurationHandler("config/unifiedConfiguration.json") + + self.acdcReader = ACDCReader() + self.dbs = {"reader": DBSReader(), "writer": DBSWriter()} + self.reqMgr = {"reader": ReqMgrReader(), "writer": ReqMgrWriter()} + + self.dataCacheLoader = DataCacheLoader() + + self.campaignController = CampaignController() + self.closeoutController = CloseoutController() + self.siteController = SiteController() + self.wtcController = WTCController() + + self.overrideWfs = None + self.onHoldWfs = None + self.bypassWfs = None + 
self.forceCompleteWfs = None + + self.jiraClient = None + self.mcmClient = None + + except Exception as error: + raise Exception(f"Error initializing Checkor\n{str(error)}") + + @staticmethod + def parseOptions() -> Tuple[dict, Optional[str]]: + """ + The function to parse the Checkor's options and specific workflow + :return: options and the specific workflow, if any + """ + parser = optparse.OptionParser() + + parser.add_option("--go", help="Does not check on duplicate process", action="store_true", default=False) + parser.add_option( + "--update", + help="Running workflows that have not yet reached completed", + action="store_true", + default=False, + ) + parser.add_option( + "--strict", help="Only running workflows that reached completed", action="store_true", default=False + ) + parser.add_option( + "--clear", help="Only running workflows that have reached custodial", action="store_true", default=False + ) + parser.add_option( + "--review", + help="Look at the workflows that have already completed and had required actions", + action="store_true", + default=False, + ) + parser.add_option( + "--recovering", + help="Look at the workflows that already have on-going acdc", + action="store_true", + default=False, + ) + parser.add_option("--manual", help='Look at the workflows in "manual"', action="store_true", default=False) + parser.add_option("--limit", help="The number of workflows to consider for checking", default=0, type=int) + parser.add_option("--threads", help="The number of threads for processing workflows", default=10, type=int) + parser.add_option( + "--fractionPass", help="The completion fraction that is permitted", default=0.0, type="float" + ) + parser.add_option("--lumiSize", help="Force the upper limit on lumisection", default=0, type="float") + parser.add_option( + "--ignoreFiles", help="Force ignoring dbs/rucio differences", action="store_true", default=False + ) + parser.add_option( + "--ignoreInvalid", help="Force ignoring high level of invalid 
files", action="store_true", default=False
+        )
+        parser.add_option(
+            "--ignoreDuplicates", help="Force ignoring lumi duplicates", action="store_true", default=False
+        )
+        parser.add_option("--tapeSizeLimit", help="The limit in size of all outputs", default=0, type=int)
+        parser.add_option("--html", help="Build the monitor page", action="store_true", default=False)
+        parser.add_option(
+            "--noReport", help="Prevent from making the error report", action="store_true", default=False
+        )
+        parser.add_option(
+            "--backfill", help="To run in test mode (only with backfill workflows)", action="store_true", default=False
+        )
+        parser.add_option(
+            "--dryRun",
+            help="To run in test mode (with production but without making any changes)",
+            action="store_true",
+            default=False,
+        )
+
+        options, args = parser.parse_args()
+        options = vars(options)
+
+        options["manual"] = not options.get("recovering")
+        actions = ["strict", "update", "clear", "review"]
+        if all(not options.get(option) for option in actions):
+            for option in actions + ["recovering", "manual"]:
+                options[option] = True
+
+        return options, args[0] if args else None
+
+    def _setWfs(self) -> None:
+        """
+        The function to set workflows to override, on hold, bypass and force complete.
+ """ + self.overrideWfs = self._getWorkflowsByAction("force-complete", details=True) + self.onHoldWfs = self._getWorkflowsByAction("hold") + self.bypassWfs = self._getWorkflowsByAction("bypass") + unnestList(self.overrideWfs) + self.forceCompleteWfs = ( + self.mcmClient.get("/restapi/requests/forcecomplete") if self.mcmClient is not None else {} + ) + + if self.jiraClient is not None: + # TODO: update wfs by JIRA tickets / migrate JiraClient + pass + + def _checkPoint(self, label: str = "", subLap: bool = False, now: struct_time = gmtime()) -> None: + """ + The function to get the check points + :label: log message label + :subLap: True if for sub lap, False o/w + :now: time now + """ + self.logger.info("Time check (%s) point at: %s", label, asctime(now)) + self.logger.info("Since start: %s [s]", now - self.timePoint.get("start", now)) + + self.logger.info( + "%s: %s [s]", + "Sub lap" if subLap else "Lap", + now - self.timePoint.get("subLap" if subLap else "lap", now), + ) + self.timePoint["subLap"] = now + if not subLap: + self.timePoint["lap"] = now + + def _filterBackfills(self, workflows: list) -> list: + """ + The function to filter only backfill workflows + :workflows: workflows + :return: backfill workflows + """ + if self.options.get("backfill"): + self.logger.info(self.logMsg["backfill"]) + return [wf for wf in workflows if "backfill" in wf.name.lower()] + return workflows + + def _getWorkflowsToCheck(self) -> list: + """ + The function to get the workflows to check + :return: workflows + """ + workflows = set() + + awayWfs = self.session.query(Workflow).filter(Workflow.status == "away").all() + assistanceWfs = self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all() + completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed") + + if self.options.get("strict"): + self.logger.info("Strict option is on: checking workflows that freshly completed") + workflows.update(filter(lambda wf: wf.name in completedWfs, awayWfs)) + 
+ if self.options.get("update"): + self.logger.info("Update option is on: checking workflows that have not completed yet") + workflows.update(filter(lambda wf: wf.name not in completedWfs, awayWfs)) + + if self.options.get("clear"): + self.logger.info("Clear option is on: checking workflows that are ready to toggle closed-out") + workflows.update(filter(lambda wf: "custodial" in wf.status, assistanceWfs)) + + if self.options.get("review"): + nonCustodialWfs = [*filter(lambda wf: "custodial" not in wf.status, assistanceWfs)] + if self.options.get("recovering"): + self.logger.info( + "Review-recovering option is on: checking only the workflows that had been already acted on" + ) + workflows.update(filter(lambda wf: "manual" not in wf.status, nonCustodialWfs)) + if self.options.get("manual"): + self.logger.info("Review-manual option is on: checking the workflows to be acted on") + workflows.update(filter(lambda wf: "manual" in wf.status, nonCustodialWfs)) + + return list(workflows) + + def _getWorkflowsByAction(self, action: str, details: bool = False) -> Union[list, dict]: + """ + The function to get workflows for a given user action + :action: user action + :details: if True return dict of workflows by user. 
O/w return workflows names
+        :return: workflows
+        """
+        workflows = defaultdict(set)
+
+        allowedUsers = [*self.unifiedConfiguration.get("allowed_bypass", {}).keys()]
+
+        userWfs = (
+            self.wtcController.getHold()
+            if action == "hold"
+            else self.wtcController.getBypass()
+            if action == "bypass"
+            else self.wtcController.getForce()
+            if action == "force-complete"
+            else {}
+        )
+        for user, wf in userWfs.items():
+            if user not in allowedUsers:
+                self.logger.info("%s is not allowed to %s", user, action)
+                continue
+
+            self.logger.info("%s allowed to %s %s", user, action, wf)
+            workflows[user].add(wf)
+
+        if details:
+            return workflows
+        return unnestList(workflows)
+
+    def _filterMaxNumberOfWorkflows(self, workflows: list) -> list:
+        """
+        The function to filter the max number of workflows per round
+        :param workflows: list of workflows
+        :return: filtered list of workflows
+        """
+        maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor")
+        if self.options.get("limit"):
+            self.logger.info("Command line to limit workflows to %s", self.options.get("limit"))
+            maxPerRound = self.options.get("limit")
+
+        if maxPerRound and not self.specificWf:
+            self.logger.info("Limiting workflows to %s this round", maxPerRound)
+
+            workflows = self._rankWorkflows(workflows)
+            if self.options.get("update"):
+                random.shuffle(workflows)
+            return workflows[:maxPerRound]
+
+        return workflows
+
+    def _rankWorkflows(self, workflows: list) -> list:
+        """
+        The function to rank the workflows by their priority
+        :param workflows: workflows
+        :return: sorted workflows
+        """
+        completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed", details=True)
+        completedWfs = sorted(completedWfs, key=lambda wf: wf.get("RequestPriority", 0))
+        completedWfs = [wf.get("RequestName") for wf in completedWfs]
+
+        return sorted(workflows, key=lambda wf: completedWfs.index(wf) if wf in completedWfs else 0, reverse=True)
+
+    def _updateWorkflowsRecords(self, wfsRecords: list) -> None:
+        """
+        The function to update the workflows records
+        :param wfsRecords: records
+        """
+        for wf in wfsRecords:
+            if wf.get("record"):
+                if self.options.get("dryRun"):
+                    self.logger.debug("Dry run: updated workflow %s records to %s", wf.get("wf"), wf.get("record"))
+                else:
+                    self.closeoutController.set(wf.get("wf"), wf.get("record"))
+
+    def _updateWorkflowsStatus(self, wfsStatus: list) -> None:
+        """
+        The function to update the workflows status
+        :param wfsStatus: status
+        """
+        for wf in wfsStatus:
+            if wf.get("newStatus"):
+                newStatus = wf.get("newStatus")
+                wf["workflow"].status = newStatus
+
+                if self.options.get("dryRun"):
+                    self.logger.debug("Dry run: updated workflow %s status to %s", wf.get("wf"), newStatus)
+                else:
+                    self.session.commit()
+
+                if newStatus == "close":
+                    if self.options.get("dryRun"):
+                        self.logger.debug("Dry run: cleaned workflow %s because set as close", wf.get("wf"))
+                    else:
+                        self.closeoutController.clean(wf.get("wf"))
+
+                if self.mcmClient is not None and wf.get("mcmForceComplete"):
+                    if self.options.get("dryRun"):
+                        self.logger.debug(
+                            "Dry run: cleaned workflow %s prep ids because of force-complete", wf.get("wf")
+                        )
+                    else:
+                        for prepId in wf.get("prepIds"):
+                            self.mcmClient.clean(f"/restapi/requests/forcecomplete/{prepId}")
+
+    def _checkExecutionTime(self, nWfs: int, now: Optional[float] = None) -> None:
+        """
+        The function to check the execution time of the module
+        :param nWfs: number of workflows
+        :param now: time now as a Unix timestamp (defaults to the current time at call, not at import)
+        """
+        if nWfs:
+            avgExecutionTime = ((now if now is not None else mktime(gmtime())) - self.timePoint.get("start")) / nWfs
+            self.logger.info("Average time spend per workflow: %s s", avgExecutionTime)
+
+            if avgExecutionTime > 120:
+                self.logger.critical("Checkor took %s [s] per workflow", avgExecutionTime)
+
+    def _countAssistanceWorkflowsByStatus(self) -> dict:
+        """
+        The function to count the number of assistance workflows by status
+        :return: count of workflows by status
+        """
+        status = defaultdict(int)
+        for wf in
self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all():
+            status[wf.status] += 1
+
+        return status
+
+    def _writeSummary(self) -> None:
+        """
+        The function to write a summary of checkor
+        """
+        if not self.specificWf:
+            msg = ""
+            if self.options.get("strict"):
+                msg += "Workflows which just got in completed were looked at. Look in manual.\n"
+            if self.options.get("update"):
+                msg += "Workflows that are still running (and not completed) got looked at.\n"
+            if self.options.get("clear"):
+                msg += "Workflows that just need to close-out were verified. Nothing too new a-priori.\n"
+            if self.options.get("review"):
+                msg += "Workflows under intervention got review.\n"
+
+            msg += "\n".join(
+                [f"{count} in status {status}" for status, count in self._countAssistanceWorkflowsByStatus().items()]
+            )
+
+            self.logger.info(msg)
+
+    def go(self) -> bool:
+        """
+        The function to check if the checkor can go
+        :return: True if it can go, False o/w
+        """
+        try:
+            userLockChecker = UserLockChecker()
+            servicesChecker = ServicesChecker(softServices=["mcm", "wtc"])
+
+            if not userLockChecker.isLocked() and servicesChecker.check():
+                self.mcmClient = McMClient() if servicesChecker.status.get("mcm") else None
+                self.jiraClient = (
+                    None  # JiraClient() if servicesChecker.status.get("jira") else None TODO: migrate JiraClient
+                )
+                return True
+
+            return False
+
+        except Exception as error:
+            self.logger.error("Failed to check if Checkor can go")
+            self.logger.error(str(error))
+ + def _check(self, wfsToCheck: list) -> None: + """ + The wrapper function to check the workflows + :param wfsToCheck: workflows to check + """ + + @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=len(wfsToCheck)) + def _checkWorkflow(self, wfsToCheck: list) -> dict: + return WorkflowCheckor(wfsToCheck, checkor=self).check() + + checkResponses = _checkWorkflow(wfsToCheck) + self._updateWorkflowsRecords(checkResponses) + self._updateWorkflowsStatus(checkResponses) + + self._checkExecutionTime(len(wfsToCheck)) + + +if __name__ == "__main__": + options, specificWf = Checkor.parseOptions() + checkor = Checkor(options=options, specificWf=specificWf) + if checkor.go(): + checkor.run() diff --git a/unified/Checkor.py b/src/python/Unified/Helpers/WorkflowCheckor.py similarity index 65% rename from unified/Checkor.py rename to src/python/Unified/Helpers/WorkflowCheckor.py index 3c434cfa6..a72e46814 100644 --- a/unified/Checkor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -1,421 +1,21 @@ import os import re -import optparse -import random -import math from logging import Logger -from collections import defaultdict -from time import mktime, asctime, gmtime, struct_time +from time import mktime, asctime, gmtime -from Utilities.DataTools import unnestList, flattenDictKeys -from Utilities.Decorators import runWithMultiThreading +from Utilities.DataTools import flattenDictKeys from Utilities.Logging import getLogger -from Utilities.ConfigurationHandler import ConfigurationHandler -from Cache.DataCacheLoader import DataCacheLoader -from Databases.Oracle.OracleClient import OracleClient from Databases.Oracle.OracleDB import Workflow -from MongoControllers.CampaignController import CampaignController -from MongoControllers.CloseoutController import CloseoutController -from MongoControllers.WTCController import WTCController -from Services.ServicesChecker import ServicesChecker -from Services.McM.McMClient import McMClient -from Services.ReqMgr.ReqMgrReader 
import ReqMgrReader -from Services.ReqMgr.ReqMgrWriter import ReqMgrWriter -from Services.ACDC.ACDCReader import ACDCReader -from Services.DBS.DBSReader import DBSReader -from Services.DBS.DBSWriter import DBSWriter from Services.Rucio.RucioReader import RucioReader -from WorkflowMgmt.SiteController import SiteController -from WorkflowMgmt.UserLockChecker import UserLockChecker from WorkflowMgmt.WorkflowController import WorkflowController from WorkflowMgmt.WorkflowStatusEnforcer import WorkflowStatusEnforcer -from typing import Optional, Tuple, Union - - -class Checkor(OracleClient): - """ - __Checkor__ - General API for checking workflows in completed status - """ - - def __init__(self, logger: Optional[Logger] = None, **kwargs) -> None: - try: - super().__init__(self) - self.logger = logger or getLogger(self.__class__.__name__) - - self.options, self.specificWf = kwargs.get("options"), kwargs.get("specificWf") - if self.options is None: - self.options, self.specificWf = self.parseOptions() - - now = mktime(gmtime()) - self.timePoint = {"lap": now, "subLap": now, "start": now} - - self.unifiedConfiguration = ConfigurationHandler("config/unifiedConfiguration.json") - - self.acdcReader = ACDCReader() - self.dbs = {"reader": DBSReader(), "writer": DBSWriter()} - self.reqMgr = {"reader": ReqMgrReader(), "writer": ReqMgrWriter()} - - self.dataCacheLoader = DataCacheLoader() - - self.campaignController = CampaignController() - self.closeoutController = CloseoutController() - self.siteController = SiteController() - self.wtcController = WTCController() - - self.overrideWfs = None - self.onHoldWfs = None - self.bypassWfs = None - self.forceCompleteWfs = None - - self.jiraClient = None - self.mcmClient = None - - except Exception as error: - raise Exception(f"Error initializing Checkor\n{str(error)}") - - @staticmethod - def parseOptions() -> Tuple[dict, Optional[str]]: - """ - The function to parse the Checkor's options and specific workflow - :return: options and the 
specific workflow, if any - """ - parser = optparse.OptionParser() - - parser.add_option("--go", help="Does not check on duplicate process", action="store_true", default=False) - parser.add_option( - "--update", - help="Running workflows that have not yet reached completed", - action="store_true", - default=False, - ) - parser.add_option( - "--strict", help="Only running workflows that reached completed", action="store_true", default=False - ) - parser.add_option( - "--clear", help="Only running workflows that have reached custodial", action="store_true", default=False - ) - parser.add_option( - "--review", - help="Look at the workflows that have already completed and had required actions", - action="store_true", - default=False, - ) - parser.add_option( - "--recovering", - help="Look at the workflows that already have on-going acdc", - action="store_true", - default=False, - ) - parser.add_option("--manual", help='Look at the workflows in "manual"', action="store_true", default=False) - parser.add_option("--limit", help="The number of workflows to consider for checking", default=0, type=int) - parser.add_option("--threads", help="The number of threads for processing workflows", default=10, type=int) - parser.add_option( - "--fractionPass", help="The completion fraction that is permitted", default=0.0, type="float" - ) - parser.add_option("--lumiSize", help="Force the upper limit on lumisection", default=0, type="float") - parser.add_option( - "--ignoreFiles", help="Force ignoring dbs/rucio differences", action="store_true", default=False - ) - parser.add_option( - "--ignoreInvalid", help="Force ignoring high level of invalid files", action="store_true", default=False - ) - parser.add_option( - "--ignoreDuplicates", help="Force ignoring lumi duplicates", action="store_true", default=False - ) - parser.add_option("--tapeSizeLimit", help="The limit in size of all outputs", default=0, type=int) - parser.add_option("--html", help="Build the the monitor page", 
action="store_true", default=False) - parser.add_option( - "--noReport", help="Prevent from making the error report", action="store_true", default=False - ) - parser.add_option( - "--backfill", help="To run in test mode (only with backfill workflows)", action="store_true", default=False - ) - - options, args = parser.parse_args() - options = vars(options) - - options["manual"] = not options.get("recovering") - actions = ["strict", "update", "clear", "review"] - if all(not options.get(option) for option in actions): - for option in actions + ["recovering", "manual"]: - options[option] = True - - return options, args[0] if args else None - - def _setWfs(self) -> None: - """ - The function to set workflows to override, on hold, bypass and force complete. - """ - self.overrideWfs = self._getWorkflowsByAction("force-complete", details=True) - self.onHoldWfs = self._getWorkflowsByAction("hold") - self.bypassWfs = self._getWorkflowsByAction("bypass") + unnestList(self.overrideWfs) - self.forceCompleteWfs = self.mcmClient.get("/restapi/requests/forcecomplete") if self.mcmClient is not None else {} - - if self.jiraClient is not None: - # TODO: update wfs by JIRA tickets / migrate JiraClient - pass - - def _checkPoint(self, label: str = "", subLap: bool = False, now: struct_time = gmtime()) -> None: - """ - The function to get the check points - :label: log message label - :subLap: True if for sub lap, False o/w - :now: time now - """ - self.logger.info("Time check (%s) point at: %s", label, asctime(now)) - self.logger.info("Since start: %s [s]", now - self.timePoint.get("start", now)) - - self.logger.info( - "%s: %s [s]", - "Sub lap" if subLap else "Lap", - now - self.timePoint.get("subLap" if subLap else "lap", now), - ) - self.timePoint["subLap"] = now - if not subLap: - self.timePoint["lap"] = now - - def _filterBackfills(self, workflows: list) -> list: - """ - The function to filter only backfill workflows - :workflows: workflows - :return: backfill workflows - """ - if 
self.options.get("backfill"): - self.logger.info(self.logMsg["backfill"]) - return [wf for wf in workflows if "backfill" in wf.name.lower()] - return workflows - - def _getWorkflowsToCheck(self) -> list: - """ - The function to get the workflows to check - :return: workflows - """ - workflows = set() - - awayWfs = self.session.query(Workflow).filter(Workflow.status == "away").all() - assistanceWfs = self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all() - completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed") - - if self.options.get("strict"): - self.logger.info("Strict option is on: checking workflows that freshly completed") - workflows.update(filter(lambda wf: wf.name in completedWfs, awayWfs)) - - if self.options.get("update"): - self.logger.info("Update option is on: checking workflows that have not completed yet") - workflows.update(filter(lambda wf: wf.name not in completedWfs, awayWfs)) - - if self.options.get("clear"): - self.logger.info("Clear option is on: checking workflows that are ready to toggle closed-out") - workflows.update(filter(lambda wf: "custodial" in wf.status, assistanceWfs)) - - if self.options.get("review"): - nonCustodialWfs = [*filter(lambda wf: "custodial" not in wf.status, assistanceWfs)] - if self.options.get("recovering"): - self.logger.info("Review-recovering option is on: checking only the workflows that had been already acted on") - workflows.update(filter(lambda wf: "manual" not in wf.status, nonCustodialWfs)) - if self.options.get("manual"): - self.logger.info("Review-manual option is on: checking the workflows to be acted on") - workflows.update(filter(lambda wf: "manual" in wf.status, nonCustodialWfs)) - - return list(workflows) - - def _getWorkflowsByAction(self, action: str, details: bool = False) -> Union[list, dict]: - """ - The function to get workflows for a given user action - :action: user action - :details: if True return dict of workflows by user. 
O/w return workflows names - :return: workflows - """ - workflows = defaultdict(set) - - allowedUsers = [*self.unifiedConfiguration.get("allowed_bypass", {}).keys()] - - userWfs = ( - self.wtcController.getHold() - if action == "hold" - else self.wtcController.getBypass() - if action == "bypass" - else self.wtcController.getForce() - if action == "force-complete" - else {} - ) - for user, wf in userWfs.items(): - if user not in allowedUsers: - self.logger.info("%s is not allowed to %s", user, action) - continue - - self.logger.info("%s allowed to %s %s", user, action, wf) - workflows[user].add(wf) - - if details: - return workflows - return unnestList(workflows) - - def _filterMaxNumberOfWorkflows(self, workflows: list) -> list: - """ - The function to filter the max number of workflows per round - :param workflows: list of workflows - :return: filtered list of workflows - """ - maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor") - if self.options.get("limit"): - self.logger.info("Command line to limit workflows to %s", self.options.get("limit")) - maxPerRound = self.options.get("limit") - - if maxPerRound and not self.specific: - self.logger.info("Limiting workflows to %s this round", maxPerRound) - - workflows = self._rankWorkflows(workflows) - if self.option.get("update"): - random.shuffle(workflows) - return workflows[:maxPerRound] - - return workflows - - def _rankWorkflows(self, workflows: list) -> list: - """ - The function to rank the workflows by their priority - :param workflows: workflows - :return: sorted workflows - """ - completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed", details=True) - completedWfs = sorted(completedWfs, key=lambda wf: wf.get("RequestPriority", 0)) - completedWfs = [wf.get("RequestName") for wf in completedWfs] - - return sorted(workflows, key=lambda wf: completedWfs.index(wf) if wf in completedWfs else 0, reverse=True) - - def _updateWorkflowsRecords(self, wfsRecords: dict) -> None: - 
""" - The function to update the workflows records - :param wfsRecords: records - """ - for wf in wfsRecords: - if wf.get("record"): - self.closeoutController.set(wf.get("wf"), wf.get("record")) - - def _updateWorkflowsStatus(self, wfsStatus: list) -> None: - """ - The function to update the workflows status - :param wfsStatus: status - """ - for wf in wfsStatus: - if wf.get("newStatus"): - newStatus = wf.get("newStatus") - wf["workflow"].status = newStatus - self.session.commit() - - if newStatus == "close": - self.closeoutController.clean(wf.get("wf")) - if self.mcmClient is not None and wf.get("mcmForceComplete"): - for prepId in wf.get("prepIds"): - self.mcmClient.clean(f"/restapi/requests/forcecomplete/{prepId}") - - def _checkExecutionTime(self, nWfs: int, now: struct_time = mktime(gmtime())) -> None: - """ - The function to check the execution time of the module - :param nWfs: number of workflows - :param now: time now - """ - if nWfs: - avgExecutionTime = (now - self.timePoint.get("start")) / nWfs - self.logger.info("Average time spend per workflow: %s s", avgExecutionTime) - - if avgExecutionTime > 120: - self.logger.critical("Checkor took %s [s] per workflow", avgExecutionTime) - - def _countAssistanceWorkflowsByStatus(self) -> dict: - """ - The function to count the number of assistance workflows by status - :return: count of workflows by status - """ - status = defaultdict(int) - for wf in self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all(): - status[wf.status] += 1 - - return status - - def _writeSummary(self) -> None: - """ - The function to write a summary of checkor - """ - if not self.specificWf: - msg = "" - if self.options.get("strict"): - msg += "Workflows which just got in completed were looked at. 
Look in manual.\n" - if self.options.get("update"): - msg += "Workflows that are still running (and not completed) got looked at.\n" - if self.options.get("clear"): - msg += "Workflows that just need to close-out were verified. Nothing too new a-priori.\n" - if self.options.get("review"): - msg += "Workflows under intervention got review.\n" - - msg += "\n".join([f"{count} in status {status}" for status, count in self._countAssistanceWorkflowsByStatus()]) - - self.logger.info(msg) - - def go(self) -> bool: - """ - The function to check if the checkor can go - :return: True if it can go, False o/w - """ - try: - userLockChecker = UserLockChecker() - servicesChecker = ServicesChecker(softServices=["mcm", "wtc"]) - - if not userLockChecker.isLocked() and servicesChecker.check(): - self.mcmClient = McMClient() if servicesChecker.status.get("mcm") else None - self.jiraClient = None #JiraClient() if servicesChecker.status.get("jira") else None TODO: migrate JiraClient - return True - - return False - - except Exception as error: - self.logger.error("Failed to check if Checkor can go") - self.logger.error(str(error)) - - def run(self) -> None: - """ - The function to run checkor - """ - try: - self._setWfs() - - wfsToCheck = self._filterBackfills(self._getWorkflowsToCheck()) - random.shuffle(wfsToCheck) - - self.logger.info("Considering %s workflows (before any limitation)", len(wfsToCheck)) - - wfsToCheck = self._filterMaxNumberOfWorkflows(wfsToCheck) - self._check(wfsToCheck) - - self._writeSummary() - - except Exception as error: - self.logger.error("Failed to run checkor") - self.logger.error(str(error)) - - def _check(self, wfsToCheck: list) -> None: - """ - The wrapper function to check the workflows - :param wfsToCheck: workflows to check - """ - - @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=len(wfsToCheck)) - def _checkWorkflow(self, wfsToCheck: list) -> dict: - return WorkflowCheckor(wfsToCheck, checkor=self).check() - - checkResponses = 
_checkWorkflow(wfsToCheck) - self._updateWorkflowsRecords(checkResponses) - self._updateWorkflowsStatus(checkResponses) - - self._checkExecutionTime(len(wfsToCheck)) +from typing import Optional, Tuple class WorkflowCheckor(object): """ __WorkflowCheckor__ - General API for checking a fiven workflow + General API for checking a given workflow """ def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwargs) -> None: @@ -469,10 +69,10 @@ def _skipWorkflow(self) -> bool: :return: True if workflow should be skipped, False o/w """ if ( - (self.checkor.specific and self.checkor.specific not in self.wf) + (self.checkor.specificWf and self.checkor.specificWf not in self.wf) or os.path.isfile(".checkor_stop") or self.wfToCheck.wm_status in ["assigned", "acquired"] - or (self.wfToCheck.wm_status != "completed" and self.wf not in self.checkor.exceptions) + or self.wfToCheck.wm_status != "completed" ): self.logger.info("Skipping workflow %s", self.wf) return True @@ -484,7 +84,7 @@ def _setWorkflowToClose(self) -> bool: The function to check if a given workflow should be closed :return: True if workflow should be closed, False o/w """ - if self.wfToCheck.wm_status in ["closed-out", "announced"] and self.wf not in self.checkor.exceptions: + if self.wfToCheck.wm_status in ["closed-out", "announced"]: self.logger.info("%s is already %s, setting as close", self.wf, self.wfToCheck.wm_status) self.newStatus = "close" return True @@ -535,7 +135,7 @@ def _setBypassChecks(self) -> None: """ for bypassWf in self.checkor.bypassWfs: if bypassWf == self.wf: - self.logger.info(self.logMsg["bypassCheck"], self.wf, bypassWf) + self.logger.info("Bypassing checks %s because of keyword %s", self.wf, bypassWf) self.bypassChecks = True self.bypassChecks = False @@ -597,7 +197,7 @@ def _setNewOnHoldStatus(self, family: list) -> bool: :param family: workflow family :return: if set new status """ - onHoldTimeout = self.unifiedConfiguration.get("onhold_timeout") + onHoldTimeout = 
self.checkor.unifiedConfiguration.get("onhold_timeout") minFamilyCompletedDelay = self._getMinFamilyCompletedDelay(family, self.now) self.logger.info("On hold since %s, timeout at %s", minFamilyCompletedDelay, onHoldTimeout) @@ -635,19 +235,21 @@ def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]: The function yo get the tiers with no check and custodial :return: list of tiers with no check, and list of tiers with no custodial """ - tiersWithNoCheck = set(self.unifiedConfiguration.get("tiers_with_no_check")) + tiersWithNoCheck = set(self.checkor.unifiedConfiguration.get("tiers_with_no_check")) tiersWithNoCustodial = set( - self.unifiedConfiguration.get("tiers_with_no_custodial") if not self.wfController.request.isRelVal() else [] + self.checkor.unifiedConfiguration.get("tiers_with_no_custodial") if not self.wfController.request.isRelVal() else [] ) - for campaign in self._getWorkflowCampaigns(self.wfController).values(): + campaigns = self.campaigns or self._setWorkflowCampaigns() + + for campaign in campaigns.values(): if ( - campaign in self.campaignController.campaigns - and "custodial_override" in self.campaignController.campaigns[campaign] + campaign in self.checkor.campaignController.campaigns + and "custodial_override" in self.checkor.campaignController.campaigns[campaign] ): - if isinstance(self.campaignController.campaigns[campaign]["custodial_override"], list): - tiersWithNoCheck -= self.campaignController.campaigns[campaign]["custodial_override"] - elif self.campaignController.campaigns[campaign]["custodial_override"] == "notape": + if isinstance(self.checkor.campaignController.campaigns[campaign]["custodial_override"], list): + tiersWithNoCheck -= self.checkor.campaignController.campaigns[campaign]["custodial_override"] + elif self.checkor.campaignController.campaigns[campaign]["custodial_override"] == "notape": tiersWithNoCustodial = set( sorted([dataset.split("/")[-1] for dataset in self.wfController.request.get("OutputDatasets")]) ) @@ 
-682,9 +284,9 @@ def _skipFamilyWorkflow(self, wfSchema: dict) -> bool: :return: True if to skip, False o/w """ return ( - wfSchema.get("RequestType") != "Ressubmission" - or wfSchema.get("RequestDate") < wfSchema.request.get("RequestDate") - or wfSchema.get("PrepID") != wfSchema.request.get("PrepID") + wfSchema.get("RequestType") != "Resubmission" + or wfSchema.get("RequestDate") < self.wfController.request.get("RequestDate") + or wfSchema.get("PrepID") != self.wfController.request.get("PrepID") or wfSchema.get("RequestStatus") == None ) @@ -814,7 +416,11 @@ def _checkWorkflowFamily(self) -> None: if (self.bypassChecksByMcMForceComplete or self.bypassChecksByUserForceComplete) and len(self.acdcs.get("healthy")): self.wfController.logger.info("%s is being force completed while recovering", self.wf) self.wfController.logger.critical("The workflow %s was force completed", self.wf) - WorkflowStatusEnforcer(self.wf).forceComplete() + + if self.checkor.options.get("dryRun"): + self.checkor.logger.debug("Dry run: force completed workflow %s", self.wf) + else: + WorkflowStatusEnforcer(self.wf).forceComplete() if self.acdcs.get("failed"): self.logger.critical("For %s, ACDC %s failed", self.wf, ", ".join(self.acdcs.get("failed"))) @@ -842,19 +448,20 @@ def _getFractionDumping(self) -> float: fractionDamping = min( 0.01 * ( - max(wfCompletedDelay - self.unifiedConfiguration.get("damping_fraction_pass"), 0) - / self.unifiedConfiguration.get("damping_fraction_pass_rate") + max(wfCompletedDelay - self.checkor.unifiedConfiguration.get("damping_fraction_pass"), 0) + / self.checkor.unifiedConfiguration.get("damping_fraction_pass_rate") ), - self.unifiedConfiguration.get("damping_fraction_pass_max") / 100.0, + self.checkor.unifiedConfiguration.get("damping_fraction_pass_max") / 100.0, ) self.logger.info("The passing fraction could be reduced to %s given it has been in for long", fractionDamping) return fractionDamping - def _setWorkflowCampaigns(self) -> None: + def 
_setWorkflowCampaigns(self) -> dict: """ The function to set the campaigns + :return: campaigns """ campaigns = {} wfCampaigns = self.wfController.request.getCampaigns(details=False) @@ -867,6 +474,8 @@ def _setWorkflowCampaigns(self) -> None: self.logger.info("Campaigns: %s", campaigns) self.campaigns = campaigns + return campaigns + def _setDatasetsFractionsToAnnounce(self) -> None: """ The function to set the dataset fractions to announce @@ -877,49 +486,73 @@ def _setDatasetsFractionsToAnnounce(self) -> None: self.fractions["announce"] = fractionsAnnounce + def _getFractionPassByCommandLine(self) -> float: + """ + The function to get the fraction pass by the command line + :return: fractions to pass if any, else None + """ + if self.checkor.options.get("fractionPass"): + fractionPass = self.checkor.options.get("fractionPass") + self.wfController.logger.info("Overriding fraction to pass to %s by command line", fractionPass) + return fractionPass + + def _getFractionPassByCampaignRequirement(self, campaign: str, dataset: str) -> float: + """ + The function to get the fraction pass by the campaign requirement + :param campaign: campaign name + :param dataset: dataset name + :return: fractions to pass if any, else None + """ + fractionPass = None + campaignFractionPass = self.checkor.campaignController.campaigns.get(campaign, {}).get("fractionpass") + + if campaignFractionPass: + if isinstance(campaignFractionPass, dict): + tier = dataset.split("/")[-1] + priority = str(self.wfController.request.get("RequestPriority")) + fractionPass = campaignFractionPass.get("all") + if campaignFractionPass.get(tier): + tierFractionPass = campaignFractionPass.get(tier) + if isinstance(tierFractionPass, dict): + fractionPass = tierFractionPass.get("all") + for key, passValue in tierFractionPass.items(): + if dataset.startswith(key): + fractionPass = passValue + else: + fractionPass = tierFractionPass + + if campaignFractionPass.get(priority): + fractionPass = 
campaignFractionPass.get("priority") + + else: + fractionPass = campaignFractionPass + + if fractionPass is not None: + self.wfController.logger.info( + "Overriding fraction to pass to %s by campaign requirement", fractionPass + ) + return fractionPass + + def _setDatasetsFractionsToPass(self) -> None: """ The function to set the dataset fractions to pass """ fractionsPass = {} - defaultPass = self.unifiedConfiguration.get("default_fraction_pass") + defaultPass = self.checkor.unifiedConfiguration.get("default_fraction_pass") campaigns = self.campaigns or self._setWorkflowCampaigns() for dataset in self.outputDatasetsToCheck: campaign = campaigns.get(dataset) - if self.options.get("fractionPass"): - fractionsPass[dataset] = self.options.get("fractionPass") - self.wfController.logger.info( - "Overriding fraction to %s for %s by command line", fractionsPass[dataset], dataset - ) - elif self.campaignController.campaigns.get(campaign, {}).get("fractionpass"): - fractionPass = self.campaignController.campaigns.get(campaign).get("fractionpass") - if isinstance(fractionPass, dict): - tier = dataset.split("/")[-1] - priority = str(self.wfController.request.get("RequestPriority")) - fractionsPass[dataset] = fractionPass.get("all", defaultPass) - if fractionPass.get(tier): - tierFractionPass = fractionPass.get(tier) - if isinstance(tierFractionPass, dict): - fractionsPass[dataset] = tierFractionPass.get("all", defaultPass) - for key, passValue in tierFractionPass.items(): - if dataset.startswith(key): - fractionsPass[dataset] = passValue - else: - fractionsPass[dataset] = tierFractionPass - if fractionPass.get(priority): - fractionsPass[dataset] = fractionPass.get("priority") - else: - fractionsPass[dataset] = fractionPass - self.wfController.logger.info( - "Overriding fraction to %s for %s by campaign requirement", fractionsPass[dataset], dataset - ) - else: + fractionsPass[dataset] = self._getFractionPassByCommandLine() + if not fractionsPass[dataset]: + 
fractionsPass[dataset] = self._getFractionPassByCampaignRequirement(campaign, dataset) + if not fractionsPass[dataset]: fractionsPass[dataset] = defaultPass - for key, passValue in self.unifiedConfiguration.get("pattern_fraction_pass").items(): + for key, passValue in self.checkor.unifiedConfiguration.get("pattern_fraction_pass").items(): if key in dataset: fractionsPass[dataset] = passValue self.wfController.logger.info("Overriding fraction to %s for %s by dataset key", passValue, dataset) @@ -945,14 +578,14 @@ def _setDatasetsFractionsToTruncateRecovery(self) -> None: ) / (weightPass + weightFull * weightUnderPass) campaign = campaigns.get(dataset) - if self.campaignController.campaigns.get(campaign, {}).get("truncaterecovery"): - fractionsTruncateRecovery[dataset] = self.campaignController.campaigns.get(campaign).get( + if self.checkor.campaignController.campaigns.get(campaign, {}).get("truncaterecovery"): + fractionsTruncateRecovery[dataset] = self.checkor.campaignController.campaigns.get(campaign).get( "truncaterecovery" ) self.wfController.logger.info( "Allowed to truncate recovery of %s over %.2f by campaign requirement", dataset, - self.campaignController.campaigns.get(campaign).get("truncaterecovery"), + self.checkor.campaignController.campaigns.get(campaign).get("truncaterecovery"), ) if fractionsTruncateRecovery[dataset] < self.fractions["pass"][dataset]: @@ -970,12 +603,12 @@ def _setStatisticsThresholds(self) -> None: fractionDamping = self._getFractionDumping() for dataset, value in self.fractions["pass"].items(): - if value != 1.0 and fractionDamping and self.unifiedConfiguration.get("timeout_for_damping_fraction"): + if value != 1.0 and fractionDamping and self.checkor.unifiedConfiguration.get("timeout_for_damping_fraction"): self.fractions["pass"][dataset] -= fractionDamping self.fractions["truncate"][dataset] -= fractionDamping - if self.acdsWfs.get("order") > self.unifiedConfiguration.get("acdc_rank_for_truncate"): - 
self.wfController.logger.info("Truncating at pass threshold because of ACDC at rank %d", self.acdsWfs.get("order")) + if self.checkor.acdsWfs.get("order") > self.checkor.unifiedConfiguration.get("acdc_rank_for_truncate"): + self.wfController.logger.info("Truncating at pass threshold because of ACDC at rank %d", self.checkor.acdsWfs.get("order")) self.fractions["truncate"][dataset] = self.fractions["pass"][dataset] self._updateFractionsToPassAndToTruncateRecovery() @@ -995,7 +628,7 @@ def _updateFractionsToPassAndToTruncateRecovery(self) -> None: descendingPass *= self.fractions["pass"].get(ancestor, 1.0) descendingTruncate *= self.fractions["truncate"].get(ancestor, 1.0) - if self.unifiedConfiguration.get("cumulative_fraction_pass"): + if self.checkor.unifiedConfiguration.get("cumulative_fraction_pass"): self.fractions["pass"][dataset] = descendingPass self.fractions["truncate"][dataset] = descendingTruncate self.logger.info( @@ -1012,7 +645,7 @@ def _getExpectedEvents(self) -> float: :return: number of expected events """ if self.wfController.request.get("RequestType") in ["TaskChain", "StepChain"]: - return self.wfController.getRequestNumEvents() + return self.wfController.request.getRequestNumEvents() expectedEvents = self.wfController.request.get("TotalInputEvents") if expectedEvents is None: @@ -1092,8 +725,10 @@ def _setPassStatisticsCheckOverCompletion(self) -> None: """ The function to set the pass statistics to over complete """ - defaultFractionOverdoing = self.unifiedConfiguration.get("default_fraction_overdoing") - self.passStatsCheckOverCompletion = dict([(dataset, value >= defaultFractionOverdoing) for dataset, value in self.percentCompletions.items()]) + defaultFractionOverdoing = self.checkor.unifiedConfiguration.get("default_fraction_overdoing") + self.passStatsCheckOverCompletion = dict( + [(dataset, value >= defaultFractionOverdoing) for dataset, value in self.percentCompletions.items()] + ) def _checkAvgCompletionStatistics(self) -> None: @@ 
-1104,7 +739,7 @@ def _checkAvgCompletionStatistics(self) -> None: _, primaries, _, _ = self.wfController.request.getIO() runWhiteList = self.wfController.getRunWhiteList() - lumiWhiteList = self.wfController.getLumiWhiteList() + lumiWhiteList = self.wfController.request.getLumiWhiteList() lumisPerRun = {} if not all(self.passStatsCheck.values()): @@ -1157,7 +792,7 @@ def _getRecoveryAssistanceTags(self) -> list: :return: recovery assistance tags """ if not all(self.passStatsCheck.values()): - possibleRecoveries = self.acdcReader.getRecoveryDocs() + possibleRecoveries = self.checkor.acdcReader.getRecoveryDocs(self.wf) if not possibleRecoveries: self.logger.info( "The workflow is not completed/has missing statistics, but nothing is recoverable. Passing through to announcement" @@ -1165,7 +800,7 @@ def _getRecoveryAssistanceTags(self) -> list: self.bypassChecks = True if not self.bypassChecks: - return ["recovery" if self.unifiedConfiguration.get("use_recoveror") else "manual"] + return ["recovery" if self.checkor.unifiedConfiguration.get("use_recoveror") else "manual"] return [] @@ -1204,18 +839,21 @@ def _checkOutputSize(self,) -> None: self.assistanceTags.add("over100") if self._forceCompleteWorkflow(): - WorkflowStatusEnforcer(self.wf).forceComplete() + if self.checkor.options.get("dryRun"): + self.checkor.logger.debug("Dry run: force completed workflow %s", self.wf) + else: + WorkflowStatusEnforcer(self.wf).forceComplete() def _hasSmallLumis(self) -> bool: """ The function to check if the workflow has small lumi sections :return: True if it has small lumis, False o/w """ - lumiLowerLimit = self.unifiedConfiguration.get("min_events_per_lumi_output") + lumiLowerLimit = self.checkor.unifiedConfiguration.get("min_events_per_lumi_output") _, primaries, _, _ = self.wfController.request.getIO() if ( - not self.wfController.isRelVal() + not self.wfController.request.isRelVal() and not primaries and any( [ @@ -1252,18 +890,18 @@ def _setLumiUpperLimit(self) -> None: The 
function to set the lumi sections upper limit """ lumiUpperLimit = {} - campaigns = self.campaigns or self._getWorkflowCampaigns() + campaigns = self.campaigns or self._setWorkflowCampaigns() for dataset in self.outputDatasetsToCheck: campaign = campaigns[dataset] if self.wfController.request.get("RequestType") in ["ReDigi", "ReReco"]: upperLimit = -1 - elif self.options.get("lumisize"): - upperLimit = self.options.get("lumisize") + elif self.checkor.options.get("lumisize"): + upperLimit = self.checkor.options.get("lumisize") self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) - elif self.campaignController.campaigns.get(campaign, {}).get("lumisize"): - upperLimit = self.campaignController.campaigns.get(campaign, {}).get("lumisize") + elif self.checkor.campaignController.campaigns.get(campaign, {}).get("lumisize"): + upperLimit = self.checkor.campaignController.campaigns.get(campaign, {}).get("lumisize") self.logger.info("Overriding the upper lumi size to %s for %s", upperLimit, campaign) else: upperLimit = 1001 @@ -1326,7 +964,7 @@ def _hasFileMismatch(self) -> bool: The function to check if there is any file mismatch :return: if number of files in DBS in different from Rucio """ - if not self.options.get("ignoreFiles") and not all( + if not self.checkor.options.get("ignoreFiles") and not all( [self.dbsPresence[dataset] == self.dbsInvalid[dataset] + self.rucioPresence[dataset] for dataset in self.outputDatasetsToCheck] ): self.logger.info("The workflow has a dbs/rucio mismatch") @@ -1347,8 +985,7 @@ def _checkFileCounts(self) -> None: for file in self.dbs["reader"].getDatasetFiles(dataset, validFileOnly=True, details=True) ] ) - rucioReader = RucioReader() - rucioFilenames = set(rucioReader.getDatasetFileNames(dataset)) + rucioFilenames = set(self.rucioReader.getDatasetFileNames(dataset)) missingRucioFiles = dbsFilenames - rucioFilenames missingDBSFiles = rucioFilenames - dbsFilenames @@ -1360,7 +997,7 @@ def 
_checkFileCounts(self) -> None: showOnlyN, "\n".join(missingRucioFiles[:showOnlyN]), ) - wereInvalidated = sorted(missingRucioFiles & set(self.dataCacheLoader.load("file_invalidation"))) + wereInvalidated = sorted(missingRucioFiles & set(self.checkor.dataCacheLoader.load("file_invalidation"))) if wereInvalidated: self.wfController.logger.info( "These %d files were invalidated globally, showing %d only.\n %s", @@ -1368,7 +1005,11 @@ def _checkFileCounts(self) -> None: showOnlyN, "\n".join(wereInvalidated[:showOnlyN]), ) - self.dbs["writer"].setFileStatus(wereInvalidated, validate=False) + + if self.checkor.options.get("dryRun"): + self.checkor.logger.debug("Dry run: invalidated %s files", len(wereInvalidated)) + else: + self.dbs["writer"].setFileStatus(wereInvalidated, validate=False) if missingDBSFiles: self.wfController.logger.info( @@ -1377,7 +1018,7 @@ def _checkFileCounts(self) -> None: showOnlyN, "\n".join(missingDBSFiles[:showOnlyN]), ) - wereInvalidated = sorted(missingDBSFiles & set(self.dataCacheLoader.load("file_invalidation"))) + wereInvalidated = sorted(missingDBSFiles & set(self.checkor.dataCacheLoader.load("file_invalidation"))) if wereInvalidated: self.wfController.logger.info( "These %d files were invalidated globally, showing %d only.\n %s", @@ -1396,7 +1037,7 @@ def _checkInvalidations(self) -> None: The function to check the invalidations """ fractionInvalid = 0.2 - if not self.options.get("ignoreinvalid") and not all( + if not self.checkor.options.get("ignoreinvalid") and not all( [ self.dbsInvalid[dataset] <= int(fractionInvalid * self.dbsPresence[dataset]) for dataset in self.wfController.request.get("OutputDatasets") @@ -1431,7 +1072,7 @@ def _setRecord(self) -> None: ) record["dbsFiles"] = self.dbsPresence[dataset] record["dbsInvFiles"] = self.dbsInvalid[dataset] - record["rucioFiles"] = set(RucioReader().getDatasetFileNames(dataset)) + record["rucioFiles"] = set(self.rucioReader.getDatasetFileNames(dataset)) record[ "acdc" ] = 
f"{len(self.acdcs.get('healthy', []))} / {len(self.acdcs.get('healthy', []) + self.acdcs.get('inactive', []))}" @@ -1460,12 +1101,15 @@ def _closeWorkflow(self) -> None: self.newStatus = "close" return - response = self.checkor.reqMgr["writer"].closeoutWorkflow(self.wf, cascade=True) - if response: - self.newStatus = "close" - return + if self.checkor.options.get("dryRun"): + self.checkor.logger.debug("Dry run: closed out workflow %s", self.wf) + else: + response = self.checkor.reqMgr["writer"].closeoutWorkflow(self.wf, cascade=True) + if response: + self.newStatus = "close" + return - self.logger.info("Could not close-out, will try again next time") + self.logger.info("Could not close-out, will try again next time") def _checkAssistanceTags(self) -> None: @@ -1603,9 +1247,3 @@ def _writeResponse(self) -> dict: } return response - -if __name__ == "__main__": - options, specificWf = Checkor.parseOptions() - checkor = Checkor(options=options, specificWf=specificWf) - if checkor.go(): - checkor.run() diff --git a/unified/__init__.py b/src/python/Unified/Helpers/__init__.py similarity index 100% rename from unified/__init__.py rename to src/python/Unified/Helpers/__init__.py diff --git a/unified/Injector.py b/src/python/Unified/Injector.py similarity index 100% rename from unified/Injector.py rename to src/python/Unified/Injector.py diff --git a/unified/Invalidator.py b/src/python/Unified/Invalidator.py similarity index 100% rename from unified/Invalidator.py rename to src/python/Unified/Invalidator.py diff --git a/unified/Rejector.py b/src/python/Unified/Rejector.py similarity index 100% rename from unified/Rejector.py rename to src/python/Unified/Rejector.py diff --git a/unified/.gitkeep b/src/python/Unified/__init__.py similarity index 100% rename from unified/.gitkeep rename to src/python/Unified/__init__.py From 384cca05ff57406c51b6ba4212359a82b6687c84 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 25 Jan 2022 13:01:52 +0300 Subject: [PATCH 06/17] Update the 
modes of checkor --- src/python/Unified/Checkor.py | 73 ++++++++++++++++------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index eefc5a5e7..6f97ac1d4 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -81,32 +81,29 @@ def parseOptions() -> Tuple[dict, Optional[str]]: parser.add_option("--go", help="Does not check on duplicate process", action="store_true", default=False) parser.add_option( - "--update", + "--checkRunning", help="Running workflows that have not yet reached completed", action="store_true", default=False, ) parser.add_option( - "--strict", help="Only running workflows that reached completed", action="store_true", default=False + "--checkNewlyCompleted", help="Only running workflows that reached completed", action="store_true", default=False ) parser.add_option( - "--clear", help="Only running workflows that have reached custodial", action="store_true", default=False - ) - parser.add_option( - "--review", + "--checkAssistance", help="Look at the workflows that have already completed and had required actions", action="store_true", default=False, ) parser.add_option( - "--recovering", + "--checkAssistanceRecovering", help="Look at the workflows that already have on-going acdc", action="store_true", default=False, ) - parser.add_option("--manual", help='Look at the workflows in "manual"', action="store_true", default=False) - parser.add_option("--limit", help="The number of workflows to consider for checking", default=0, type=int) - parser.add_option("--threads", help="The number of threads for processing workflows", default=10, type=int) + parser.add_option("--checkAssistanceManual", help='Look at the workflows in "manual"', action="store_true", default=False) + parser.add_option("--maxPerRound", help="The number of workflows to consider for checking", default=0, type=int) + parser.add_option("--nThreads", help="The number of 
threads for processing workflows", default=1, type=int) parser.add_option( "--fractionPass", help="The completion fraction that is permitted", default=0.0, type="float" ) @@ -138,10 +135,10 @@ def parseOptions() -> Tuple[dict, Optional[str]]: options, args = parser.parse_args() options = vars(options) - options["manual"] = not options.get("recovering") - actions = ["strict", "update", "clear", "review"] + options["checkAssistanceManual"] = not options.get("checkAssistanceRecovering") + actions = ["checkNewlyCompleted", "checkRunning", "checkAssistance"] if all(not options.get(option) for option in actions): - for option in actions + ["recovering", "manual"]: + for option in actions + ["checkAssistanceRecovering", "checkAssistanceManual"]: options[option] = True return options, args[0] if args else None @@ -202,27 +199,23 @@ def _getWorkflowsToCheck(self) -> list: assistanceWfs = self.session.query(Workflow).filter(Workflow.status.startswith("assistance")).all() completedWfs = self.reqMgr["reader"].getWorkflowsByStatus("completed") - if self.options.get("strict"): - self.logger.info("Strict option is on: checking workflows that freshly completed") + if self.options.get("checkNewlyCompleted"): + self.logger.info("checkNewlyCompleted option is on: checking workflows that freshly completed") workflows.update(filter(lambda wf: wf.name in completedWfs, awayWfs)) - if self.options.get("update"): - self.logger.info("Update option is on: checking workflows that have not completed yet") + if self.options.get("checkRunning"): + self.logger.info("checkRunning option is on: checking workflows that have not completed yet") workflows.update(filter(lambda wf: wf.name not in completedWfs, awayWfs)) - if self.options.get("clear"): - self.logger.info("Clear option is on: checking workflows that are ready to toggle closed-out") - workflows.update(filter(lambda wf: "custodial" in wf.status, assistanceWfs)) - - if self.options.get("review"): + if self.options.get("checkAssistance"): 
nonCustodialWfs = [*filter(lambda wf: "custodial" not in wf.status, assistanceWfs)] - if self.options.get("recovering"): + if self.options.get("checkAssistanceRecovering"): self.logger.info( - "Review-recovering option is on: checking only the workflows that had been already acted on" + "checkAssistanceRecovering option is on: checking only the workflows that had been already acted on" ) workflows.update(filter(lambda wf: "manual" not in wf.status, nonCustodialWfs)) - if self.options.get("manual"): - self.logger.info("Review-manual option is on: checking the workflows to be acted on") + if self.options.get("checkAssistanceManual"): + self.logger.info("checkAssistanceManual option is on: checking the workflows to be acted on") workflows.update(filter(lambda wf: "manual" in wf.status, nonCustodialWfs)) return list(workflows) @@ -266,15 +259,15 @@ def _filterMaxNumberOfWorkflows(self, workflows: list) -> list: :return: filtered list of workflows """ maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor") - if self.options.get("limit"): - self.logger.info("Command line to limit workflows to %s", self.options.get("limit")) - maxPerRound = self.options.get("limit") + if self.options.get("maxPerRound"): + self.logger.info("Command line to limit workflows to %s", self.options.get("maxPerRound")) + maxPerRound = self.options.get("maxPerRound") if maxPerRound and not self.specificWf: - self.logger.info("Limiting workflows to %s this round", maxPerRound) + self.logger.info("Number of workflows to check after limitation: %s", maxPerRound) workflows = self._rankWorkflows(workflows) - if self.option.get("update"): + if self.option.get("checkRunning"): random.shuffle(workflows) return workflows[:maxPerRound] @@ -364,13 +357,11 @@ def _writeSummary(self) -> None: """ if not self.specificWf: msg = "" - if self.options.get("strict"): + if self.options.get("checkNewlyCompleted"): msg += "Workflows which just got in completed were looked at. 
Look in manual.\n" - if self.options.get("update"): + if self.options.get("checkRunning"): msg += "Workflows that are still running (and not completed) got looked at.\n" - if self.options.get("clear"): - msg += "Workflows that just need to close-out were verified. Nothing too new a-priori.\n" - if self.options.get("review"): + if self.options.get("checkAssistance"): msg += "Workflows under intervention got review.\n" msg += "\n".join( @@ -406,14 +397,18 @@ def run(self) -> None: The function to run checkor """ try: - self._setWfs() + # Review this later. Consider moving JIRA functionalities to a different module. + #self._setWfs() wfsToCheck = self._filterBackfills(self._getWorkflowsToCheck()) random.shuffle(wfsToCheck) - self.logger.info("Considering %s workflows (before any limitation)", len(wfsToCheck)) + self.logger.info("Number of workflows to check before any limitation: %s", len(wfsToCheck)) wfsToCheck = self._filterMaxNumberOfWorkflows(wfsToCheck) + self.logger.info("Workflows to check: ") + for w in wfsToCheck: + self.logger.info(w) self._check(wfsToCheck) self._writeSummary() @@ -428,7 +423,7 @@ def _check(self, wfsToCheck: list) -> None: :param wfsToCheck: workflows to check """ - @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=len(wfsToCheck)) + @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=self.options.get("nThreads")) def _checkWorkflow(self, wfsToCheck: list) -> dict: return WorkflowCheckor(wfsToCheck, checkor=self).check() From 060964514bc84b2fbe0e586c3a1f093089321a81 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 12:55:06 +0300 Subject: [PATCH 07/17] Checkor dry run mode --- src/python/Unified/Checkor.py | 13 +- src/python/Unified/Helpers/WorkflowCheckor.py | 191 ++++++++++++------ 2 files changed, 143 insertions(+), 61 deletions(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index 6f97ac1d4..d92090481 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ 
-302,13 +302,14 @@ def _updateWorkflowsStatus(self, wfsStatus: list) -> None: The function to update the workflows status :param wfsStatus: status """ + # TODO: Do the ReqMgr update here as well: WorkflowCheckor:_closeOutWorkflow for wf in wfsStatus: if wf.get("newStatus"): newStatus = wf.get("newStatus") wf["workflow"].status = newStatus if wf.options.get("dryRun"): - self.logger.debug("Dry run: updated workflow %s status to %s", wf.get("wf"), newStatus) + self.logger.debug("Dry run: Unified status update of %s to %s", wf.get("wf"), newStatus) else: self.session.commit() @@ -427,9 +428,15 @@ def _check(self, wfsToCheck: list) -> None: def _checkWorkflow(self, wfsToCheck: list) -> dict: return WorkflowCheckor(wfsToCheck, checkor=self).check() + # TODO: Workflow statuses have been already updated in WorkflowCheckor???? checkResponses = _checkWorkflow(wfsToCheck) - self._updateWorkflowsRecords(checkResponses) - self._updateWorkflowsStatus(checkResponses) + self.logger.critical("Response to Checkor:") + self.logger.critical(checkResponses) + # TODO: The following function updates closeoutInfo table of MongoDB. + #self._updateWorkflowsRecords(checkResponses) + # TODO: This does unified status update, mongodb record update and McM force-completion. + # TODO: Why not do all the operations here, especially ReqMgr2 status update?? 
+ #self._updateWorkflowsStatus(checkResponses) self._checkExecutionTime(len(wfsToCheck)) diff --git a/src/python/Unified/Helpers/WorkflowCheckor.py b/src/python/Unified/Helpers/WorkflowCheckor.py index a72e46814..84393d100 100644 --- a/src/python/Unified/Helpers/WorkflowCheckor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -29,10 +29,13 @@ def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwarg self.wfToCheck = wfToCheck self.wfToCheck.wm_status = self.wfController.request.get("RequestStatus") + # TODO: Check this + self.unifiedStatus = wfToCheck.status self.checkor = kwargs.get("checkor") self.rucioReader = RucioReader() + # TODO: What's the difference between assistanceTags & existingAssistaceTags? self.assistanceTags, self.existingAssistaceTags = set(), set(wfToCheck.status.split("-")[1:]) self.acdcs = dict() @@ -84,10 +87,13 @@ def _setWorkflowToClose(self) -> bool: The function to check if a given workflow should be closed :return: True if workflow should be closed, False o/w """ - if self.wfToCheck.wm_status in ["closed-out", "announced"]: + self.logger.info("Checking if the unified status should be 'close' already, but not") + if self.wfToCheck.wm_status in ["closed-out", "announced"] and self.wfToCheck.unifiedStatus != "close": self.logger.info("%s is already %s, setting as close", self.wf, self.wfToCheck.wm_status) self.newStatus = "close" return True + else: + self.logger.info("Unified status is already okay, skipping the request.") return False @@ -96,6 +102,7 @@ def _setWorkflowToForget(self) -> bool: The function to check if a given workflow should be forgotten :return: True if workflow should be forgotten, False o/w """ + self.logger.info("Checking if the unified status should be 'forget'") if self.wfController.request.isRelVal() and self.wfToCheck.wm_status in [ "failed", "aborted", @@ -104,9 +111,11 @@ def _setWorkflowToForget(self) -> bool: "rejected-archived", "aborted-completed", ]: - self.logger.info("%s is 
%s, but will not be set in trouble to find a replacement", self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting the unified status as 'forget'", self.wf, self.wfToCheck.wm_status) self.newStatus = "forget" return True + else: + self.logger.info("%s is %s, not setting it as 'forget'", self.wf, self.wfToCheck.wm_status) return False @@ -223,25 +232,29 @@ def _updateWorkflowStatus(self) -> bool: The function to check if the status of a given workflow should be updated :return: True if update status, False o/w """ - if self._setWorkflowToClose() or self._setWorkflowToForget() or self._setWorkflowToTrouble(): + if self._setWorkflowToClose(): return True - if self._isWorkflowOnHold() and self._setNewOnHoldStatus(self.wfController.getFamily(includeItself=True)): + if self._setWorkflowToForget(): return True + # TODO Check this + #if self._setWorkflowToTrouble(): + # return True + + # TODO Check this later + #if self._isWorkflowOnHold() and self._setNewOnHoldStatus(self.wfController.getFamily(includeItself=True)): + # return True return False - def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]: + def _getTiersWithNoCheck(self) -> Tuple[list, list]: """ The function yo get the tiers with no check and custodial :return: list of tiers with no check, and list of tiers with no custodial """ tiersWithNoCheck = set(self.checkor.unifiedConfiguration.get("tiers_with_no_check")) - tiersWithNoCustodial = set( - self.checkor.unifiedConfiguration.get("tiers_with_no_custodial") if not self.wfController.request.isRelVal() else [] - ) campaigns = self.campaigns or self._setWorkflowCampaigns() - + # TODO custodial_override is not good name anymore, change it! 
for campaign in campaigns.values(): if ( campaign in self.checkor.campaignController.campaigns @@ -249,19 +262,15 @@ def _getTiersWithNoCheckAndCustodial(self) -> Tuple[list, list]: ): if isinstance(self.checkor.campaignController.campaigns[campaign]["custodial_override"], list): tiersWithNoCheck -= self.checkor.campaignController.campaigns[campaign]["custodial_override"] - elif self.checkor.campaignController.campaigns[campaign]["custodial_override"] == "notape": - tiersWithNoCustodial = set( - sorted([dataset.split("/")[-1] for dataset in self.wfController.request.get("OutputDatasets")]) - ) - return tiersWithNoCheck, tiersWithNoCustodial + return tiersWithNoCheck def _setOutputDatasetsToCheck(self) -> None: """ The function to set the datasets to check """ - tiersWithNoCheck, tiersWithNoCustodial = self._getTiersWithNoCheckAndCustodial(self.wfController) - + self.logger.info("Checking which output datasets to be handled") + tiersWithNoCheck = self._getTiersWithNoCheck(self.wfController) expectedOutputsDatasets = self.wfController.request.get("OutputDatasets") self.outputDatasetsToCheck = [ dataset @@ -269,12 +278,11 @@ def _setOutputDatasetsToCheck(self) -> None: if all([dataset.split("/")[-1] != tier for tier in tiersWithNoCheck]) ] - self.wfController.logger.debug( + self.wfController.logger.info( "Initial outputs: %s\nWill check on: %s\nTiers out: %s\nTiers no custodials: %s", ", ".join(sorted(expectedOutputsDatasets)), ", ".join(sorted(self.outputDatasetsToCheck)), - ", ".join(sorted(tiersWithNoCheck)), - ", ".join(sorted(tiersWithNoCustodial)), + ", ".join(sorted(tiersWithNoCheck)) ) def _skipFamilyWorkflow(self, wfSchema: dict) -> bool: @@ -534,7 +542,7 @@ def _getFractionPassByCampaignRequirement(self, campaign: str, dataset: str) -> return fractionPass - def _setDatasetsFractionsToPass(self) -> None: + def _setDatasetsFractionPass(self) -> None: """ The function to set the dataset fractions to pass """ @@ -557,6 +565,7 @@ def 
_setDatasetsFractionsToPass(self) -> None: fractionsPass[dataset] = passValue self.wfController.logger.info("Overriding fraction to %s for %s by dataset key", passValue, dataset) + self.logger.info("Expected stats (Fraction pass): %s", str(fractionsPass)) self.fractions["pass"] = fractionsPass def _setDatasetsFractionsToTruncateRecovery(self) -> None: @@ -593,14 +602,18 @@ def _setDatasetsFractionsToTruncateRecovery(self) -> None: self.fractions["truncate"] = fractionsTruncateRecovery - def _setStatisticsThresholds(self) -> None: + def _setExpectedStats(self) -> None: """ The function to set the statistics thresholds """ - self._setDatasetsFractionsToAnnounce() - self._setDatasetsFractionsToPass() - self._setDatasetsFractionsToTruncateRecovery() + # TODO: # Sets it 1 regardless of the dataset and it's not used? + #self._setDatasetsFractionsToAnnounce() + self._setDatasetsFractionPass() + + # TODO: Disabling truncation for now. Check that later. + """ + self._setDatasetsFractionsToTruncateRecovery() fractionDamping = self._getFractionDumping() for dataset, value in self.fractions["pass"].items(): if value != 1.0 and fractionDamping and self.checkor.unifiedConfiguration.get("timeout_for_damping_fraction"): @@ -612,6 +625,7 @@ def _setStatisticsThresholds(self) -> None: self.fractions["truncate"][dataset] = self.fractions["pass"][dataset] self._updateFractionsToPassAndToTruncateRecovery() + """ def _updateFractionsToPassAndToTruncateRecovery(self) -> None: """ @@ -666,7 +680,7 @@ def _getTasksByOutputDataset(self) -> dict: return tasksByDataset - def _checkCompletionStatistics(self) -> None: + def _setActualStats(self) -> None: """ The function to check the completion statistics """ @@ -699,17 +713,20 @@ def _checkCompletionStatistics(self) -> None: outputEventsExpected, dataset, ) + self.logger.info("Actual stats: %s", str(self.percentCompletions)) - def _setPassStatisticsCheck(self) -> None: + def _compareAndSetExpectedAndActualStats(self) -> None: """ The function 
to set pass statistics """ + # TODO: The way bypassChecks is introduced seems problematic, review it. self.passStatsCheck = dict( [ (dataset, self.bypassChecks or self.percentCompletions[dataset] >= passValue) for dataset, passValue in self.fractions["pass"].items() ] ) + self.logger.info("Compared actual and expected stats: %s", str(self.passStatsCheck)) def _setPassStatisticsCheckToAnnounce(self) -> None: """ @@ -916,21 +933,26 @@ def _checkLumiSize(self) -> None: """ The function to check the lumi sections sizes """ + self.logger.info("Checking lumi size") if self._hasSmallLumis(): self.assistanceTags.add("smalllumi") self.isClosing = False + # TODO: Give more details?? + self.logger.info("Output has small lumisections, not closing out.") self._setLumiUpperLimit() if self._hasBigLumis(): self.assistanceTags.add("biglumi") self.isClosing = False + # TODO: Give more details?? + self.logger.info("Output has big lumisections, not closing out.") def _checkRucioFileCounts(self) -> None: """ The function to check the number of files in Rucio """ rucioPresence = {} - + # TODO: Check the algorithm of this function for dataset in self.wfController.request.get("OutputDatasets"): filesPerBlock = set(self.rucioReader.countDatasetFilesPerBlock(dataset)) allBlocks = set([*map(lambda x: x[0], filesPerBlock)]) @@ -947,6 +969,7 @@ def _checkRucioFileCounts(self) -> None: self.assistanceTags.remove("announce") self.rucioPresence = rucioPresence + self.logger.info("Checked the rucio presence: %s", str(self.rucioPresence)) def _checkDBSFileCounts(self) -> None: """ @@ -959,6 +982,9 @@ def _checkDBSFileCounts(self) -> None: self.dbsPresence, self.dbsInvalid = dbsPresence, dbsInvalid + self.logger.info("Checked DBS Presence: %s", str(self.dbsPresence)) + self.logger.info("Checked INVALID DBS Presence: %s", str(self.dbsInvalid)) + def _hasFileMismatch(self) -> bool: """ The function to check if there is any file mismatch @@ -972,12 +998,16 @@ def _hasFileMismatch(self) -> bool: return 
False - def _checkFileCounts(self) -> None: + def _handleFileMismatch(self) -> None: """ The function to check the number of files """ + # TODO: Make this configurable! showOnlyN = 10 + # TODO: Check the algorithm of this function + self.logger.info("There is a RUCIO/DBS filemismatch. Checking the details") + for dataset in self.dbsPresence: dbsFilenames = set( [ @@ -998,6 +1028,7 @@ def _checkFileCounts(self) -> None: "\n".join(missingRucioFiles[:showOnlyN]), ) wereInvalidated = sorted(missingRucioFiles & set(self.checkor.dataCacheLoader.load("file_invalidation"))) + # TODO: Check this invalidation!! if wereInvalidated: self.wfController.logger.info( "These %d files were invalidated globally, showing %d only.\n %s", @@ -1019,6 +1050,7 @@ def _checkFileCounts(self) -> None: "\n".join(missingDBSFiles[:showOnlyN]), ) wereInvalidated = sorted(missingDBSFiles & set(self.checkor.dataCacheLoader.load("file_invalidation"))) + # TODO: Check this invalidation if wereInvalidated: self.wfController.logger.info( "These %d files were invalidated globally, showing %d only.\n %s", @@ -1032,10 +1064,11 @@ def _checkFileCounts(self) -> None: self.isClosing = False - def _checkInvalidations(self) -> None: + def _checkInvalidFiles(self) -> None: """ The function to check the invalidations """ + self.logger.info("Checking if the output(s) has/have a significant amount of invalid files") fractionInvalid = 0.2 if not self.checkor.options.get("ignoreinvalid") and not all( [ @@ -1045,6 +1078,8 @@ def _checkInvalidations(self) -> None: ): self.wfController.logger.info("The workflow has a DBS invalid file level too high") self.assistanceTags.add("invalidfiles") + else: + self.logger.info("The outputs don't have a significant amount of invalid files") def _setRecord(self) -> None: """ @@ -1084,9 +1119,11 @@ def _setRecord(self) -> None: wfRecord["datasets"][dataset] = record + self.logger.info("Following record has been produced for MongoDB update: %s", record) + self.record = wfRecord - def 
_closeWorkflow(self) -> None: + def _closeOutWorkflow(self) -> None: """ The function to close the workflow """ @@ -1112,10 +1149,11 @@ def _closeWorkflow(self) -> None: self.logger.info("Could not close-out, will try again next time") - def _checkAssistanceTags(self) -> None: + def _updateAssistanceStatus(self) -> None: """ The function to check the assistance tags """ + # TODO: Needs dry-run mode update self.logger.info("%s was tagged with: %s", self.wf, self.assistanceTags) if "recovering" in self.assistanceTags: self.assistanceTags -= set(["recovery", "filemismatch", "manual"]) @@ -1141,9 +1179,9 @@ def _warnRequestor(self) -> None: msg = "The request PREPID (WORKFLOW) is facing issue in production.\n" if "recovery" in self.assistanceTags: - msg += f"Samples completed with missing statistics\n{'\n'.join([f'{round(self.percentCompletions[dataset]*100, 2)}%% complete for {dataset}' for dataset in self.outputDatasetsToCheck ])}\nhttps://cmsweb.cern.ch/report/{self.wf}\n" + msg += f"Samples completed with missing statistics\n{''.join([f'{round(self.percentCompletions[dataset]*100, 2)}%% complete for {dataset}' for dataset in self.outputDatasetsToCheck ])}\nhttps://cmsweb.cern.ch/report/{self.wf}\n" if "biglumi" in self.assistanceTags: - msg += f"Samples completed with large luminosity blocks:\n{'\n'.join([f'{self.eventsPerLumi[dataset]} > {self.lumiUpperLimit[dataset]} for {dataset}' for dataset in self.outputDatasetsToCheck])}\nhttps://cmsweb.cern.ch/reqmgr/view/splitting/{self.wf}\n" + msg += f"Samples completed with large luminosity blocks:\n{''.join([f'{self.eventsPerLumi[dataset]} > {self.lumiUpperLimit[dataset]} for {dataset}' for dataset in self.outputDatasetsToCheck])}\nhttps://cmsweb.cern.ch/reqmgr/view/splitting/{self.wf}\n" msg += "You are invited to check, while this is being taken care of by Comp-Ops.\n" msg += "This is an automated message from Comp-Ops.\n" @@ -1158,7 +1196,7 @@ def _getAssistanceStatus(self) -> str: return "assistance-" + 
"-".join(sorted(self.assistanceTags)) return "assistance" - def _setWorkflowToAssistance(self) -> None: + def _setAssistanceStatus(self) -> None: """ The function to set the workflow to assistance """ @@ -1166,63 +1204,96 @@ def _setWorkflowToAssistance(self) -> None: if "manual" not in self.wfToCheck.status or assistanceStatus != "assistance-recovery": self.newStatus = assistanceStatus + self.logger.info("Ultimate assistance status is: %s", str(assistanceStatus)) + def check(self) -> dict: """ The function to check the workflow """ try: self.checkor._checkPoint(f"Starting checkor with {self.wf}") - if self._skipWorkflow() or self._updateWorkflowStatus(): + + # Investigate this later: There should not be a case to skip a workflow + #if self._skipWorkflow(): + # return self._writeResponse() + + # TODO: The following function checks workflows whose unified status should have been updated, but not. + # TODO: Workflows to close & Workflows to forget + # TODO: I think we can keep it for a while and see if it really has a use + # TODO: I also think that it needs a renaming. + if self._updateWorkflowStatus(): + # TODO: Check this function!! return self._writeResponse() self.checkor._checkPoint("Checked workflow status", subLap=True) self._setOutputDatasetsToCheck() - self._checkWorkflowFamily() - self.checkor._checkPoint("Checked workflow family", subLap=True) - self._setStatisticsThresholds() - self.checkor._checkPoint("Checked statistics threshold", subLap=True) + # TODO: Check this function. It does more than checking + #self._checkWorkflowFamily() + #self.checkor._checkPoint("Checked workflow family", subLap=True) - self._checkCompletionStatistics() - self.checkor._checkPoint("Checked observed statistics", subLap=True) + # TODO: I did simplifications. Review it later. 
+ self._setExpectedStats() + self.checkor._checkPoint("Checked expected stats", subLap=True) - self._setPassStatisticsCheck() - self._checkAvgCompletionStatistics() - self.checkor._checkPoint("Checked more detailed observed statistics", subLap=True) + # TODO: "_set" prefix seems confusing. The function gets and sets + # TODO: Perhaps, break it down into 2 function: getActualStats & setActualStats + self._setActualStats() + self.checkor._checkPoint("Checked actual stats", subLap=True) - self._setPassStatisticsCheckToAnnounce() - self._setPassStatisticsCheckOverCompletion() - self._checkOutputSize() - self.checkor._checkPoint("Checked output size", subLap=True) + # TODO: Rename this after understanding the following functions + self._compareAndSetExpectedAndActualStats() + # TODO: I don't understand these two functions. Could be related to over100 and announce tags. Disabling now + #self._checkAvgCompletionStatistics() + #self._setPassStatisticsCheckToAnnounce() + #self.checkor._checkPoint("Checked more detailed observed statistics", subLap=True) + + # TODO: This might be a good feature. Disabling for now. Review later. + #self._setPassStatisticsCheckOverCompletion() + + # TODO: Function does more than output size checking, which I don't understand. Disabling for now. + #self._checkOutputSize() + #self.checkor._checkPoint("Checked output size", subLap=True) + + # TODO: We might disable smalllumi check. Review later. Keep it for now. 
self._checkLumiSize() self.checkor._checkPoint("Checked lumi size", subLap=True) self._checkRucioFileCounts() - self.checkor_checkPoint("Checked Rucio count", subLap=True) + self.checkor_checkPoint("Checked Rucio file count", subLap=True) self._checkDBSFileCounts() - self.checkor._checkPoint("DBS file count", subLap=True) + self.checkor._checkPoint("Checked DBS file count", subLap=True) if self._hasFileMismatch() and "recovering" not in self.assistanceTags: - self._checkFileCounts() + # TODO: Test this function carefully: filemismatch, agentfilemismatch, etc. + self._handleFileMismatch() self.checkor._checkPoint("Checked file count", subLap=True) - self._checkInvalidations() - self.checkor._checkPoint("Checked invalidation", subLap=True) + # TODO: Test this function + self._checkInvalidFiles() + self.checkor._checkPoint("Checked invalid files", subLap=True) self.checkor._checkPoint(f"Done with {self.wf}") + # TODO: Log this properly and understand how it is used self._setRecord() if self.isClosing: - self._closeWorkflow() + self.logger.info("The workflow is okay to be closed-out. Perform the action later.") + #self._closeOutWorkflow() else: - self._checkAssistanceTags() - self._warnRequestor() - self._setWorkflowToAssistance() + self.logger.info("The workflow is not okay to be closed-out.") + # TODO: Rename and update dry run mode + self._updateAssistanceStatus() + # TODO: What's the difference from the function above? Clear here. + self._setAssistanceStatus() + + # TODO: Disabling for now, check later. + # self._warnRequestor() ## TODO: update JIRA tickets / migrate JiraClient - + return self._writeResponse() except Exception as error: @@ -1235,6 +1306,7 @@ def _writeResponse(self) -> dict: """ The function to write the check response """ + # TODO: What's the difference between workflow and wf??? 
response = { "workflow": self.wfToCheck, "wf": self.wf, @@ -1245,5 +1317,8 @@ def _writeResponse(self) -> dict: "mcmForceComplete": self.bypassChecksByMcMForceComplete, "record": self.record } + + self.logger.info("An update is required: ") + self.logger.info(response) return response From 3eb374200993cd8a97d6c46c28dca42b73ddb900 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:00:53 +0300 Subject: [PATCH 08/17] Fix the type in option handling --- src/python/Unified/Checkor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index d92090481..8cba527e3 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -128,7 +128,7 @@ def parseOptions() -> Tuple[dict, Optional[str]]: parser.add_option( "--dryRun", help="To run in test mode (with production but without making any changes)", - actions="store_true", + action="store_true", default=False, ) From 0b12632e325b7c00e869cfd82814d570aecf4b75 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:09:35 +0300 Subject: [PATCH 09/17] Fix configuration retrival for max_per_round --- src/python/Unified/Checkor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index 8cba527e3..e1887b51c 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -258,7 +258,7 @@ def _filterMaxNumberOfWorkflows(self, workflows: list) -> list: :param workflows: list of workflows :return: filtered list of workflows """ - maxPerRound = self.unifiedConfiguration.get("max_per_round", {}).get("checkor") + maxPerRound = self.unifiedConfiguration.get("max_per_round").get("checkor") if self.options.get("maxPerRound"): self.logger.info("Command line to limit workflows to %s", self.options.get("maxPerRound")) maxPerRound = self.options.get("maxPerRound") From fea6fc6233114b63efbf766a45f0bedb209930ce Mon Sep 17 00:00:00 2001 
From: haozturk Date: Tue, 15 Feb 2022 13:10:26 +0300 Subject: [PATCH 10/17] Fix typo --- src/python/Unified/Checkor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index e1887b51c..502d8ab7e 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -267,7 +267,7 @@ def _filterMaxNumberOfWorkflows(self, workflows: list) -> list: self.logger.info("Number of workflows to check after limitation: %s", maxPerRound) workflows = self._rankWorkflows(workflows) - if self.option.get("checkRunning"): + if self.options.get("checkRunning"): random.shuffle(workflows) return workflows[:maxPerRound] From df9963b86f76f0f9cbaa9cbf316bb6512cfc1beb Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:15:21 +0300 Subject: [PATCH 11/17] Fix typo --- src/python/Unified/Checkor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index 502d8ab7e..534c3e65d 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -27,6 +27,7 @@ from Unified.Helpers.WorkflowCheckor import WorkflowCheckor from typing import Optional, Tuple, Union +import traceback class Checkor(OracleClient): @@ -408,8 +409,8 @@ def run(self) -> None: wfsToCheck = self._filterMaxNumberOfWorkflows(wfsToCheck) self.logger.info("Workflows to check: ") - for w in wfsToCheck: - self.logger.info(w) + for workflow in wfsToCheck: + self.logger.info(workflow.name) self._check(wfsToCheck) self._writeSummary() @@ -417,6 +418,7 @@ def run(self) -> None: except Exception as error: self.logger.error("Failed to run checkor") self.logger.error(str(error)) + self.logger.error(traceback.format_exc()) def _check(self, wfsToCheck: list) -> None: """ From f824fdb5f8990874804e121565ef068cd6a832d9 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:37:17 +0300 Subject: [PATCH 12/17] Disable multithreading for 
now --- src/python/Unified/Checkor.py | 30 +++++++++++-------- src/python/Unified/Helpers/WorkflowCheckor.py | 1 + 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index 534c3e65d..c89065dc1 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -426,19 +426,23 @@ def _check(self, wfsToCheck: list) -> None: :param wfsToCheck: workflows to check """ - @runWithMultiThreading(mtParam="wfsToCheck", maxThreads=self.options.get("nThreads")) - def _checkWorkflow(self, wfsToCheck: list) -> dict: - return WorkflowCheckor(wfsToCheck, checkor=self).check() - - # TODO: Workflow statuses have been already updated in WorkflowCheckor???? - checkResponses = _checkWorkflow(wfsToCheck) - self.logger.critical("Response to Checkor:") - self.logger.critical(checkResponses) - # TODO: The following function updates closeoutInfo table of MongoDB. - #self._updateWorkflowsRecords(checkResponses) - # TODO: This does unified status update, mongodb record update and McM force-completion. - # TODO: Why not do all the operations here, especially ReqMgr2 status update?? - #self._updateWorkflowsStatus(checkResponses) + # TODO: Fix the multithreading! + #@runWithMultiThreading(mtParam="wfsToCheck", maxThreads=self.options.get("nThreads")) + #def _checkWorkflow(self, wfsToCheck: list) -> dict: + #return WorkflowCheckor(wfsToCheck, checkor=self).check() + def _checkWorkflow(workflow: list) -> dict: + return WorkflowCheckor(workflow).check() + + for workflow in wfsToCheck: + # TODO: Workflow statuses have been already updated in WorkflowCheckor???? + checkResponses = _checkWorkflow(wfsToCheck) + self.logger.critical("Response to Checkor:") + self.logger.critical(checkResponses) + # TODO: The following function updates closeoutInfo table of MongoDB. + #self._updateWorkflowsRecords(checkResponses) + # TODO: This does unified status update, mongodb record update and McM force-completion. 
+ # TODO: Why not do all the operations here, especially ReqMgr2 status update?? + #self._updateWorkflowsStatus(checkResponses) self._checkExecutionTime(len(wfsToCheck)) diff --git a/src/python/Unified/Helpers/WorkflowCheckor.py b/src/python/Unified/Helpers/WorkflowCheckor.py index 84393d100..d9e6ced19 100644 --- a/src/python/Unified/Helpers/WorkflowCheckor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -22,6 +22,7 @@ def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwarg try: super().__init__(self) self.logger = logger or getLogger(self.__class__.__name__) + self.logger.info("Initializing Workflow Checkor") self.now = mktime(gmtime()) self.wf = wfToCheck.name From 1262237de7f2ebfeb108647a555f4cb336bfd29b Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:41:32 +0300 Subject: [PATCH 13/17] Update arguments of WorkflowCheckor --- src/python/Unified/Helpers/WorkflowCheckor.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/python/Unified/Helpers/WorkflowCheckor.py b/src/python/Unified/Helpers/WorkflowCheckor.py index d9e6ced19..c50c8fdda 100644 --- a/src/python/Unified/Helpers/WorkflowCheckor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -12,32 +12,33 @@ from typing import Optional, Tuple -class WorkflowCheckor(object): +#class WorkflowCheckor(object): +class WorkflowCheckor(): """ __WorkflowCheckor__ General API for checking a given workflow """ - def __init__(self, wfToCheck: Workflow, logger: Optional[Logger] = None, **kwargs) -> None: + def __init__(self, workflow: Workflow, logger: Optional[Logger] = None, **kwargs) -> None: try: - super().__init__(self) + #super().__init__(self) self.logger = logger or getLogger(self.__class__.__name__) self.logger.info("Initializing Workflow Checkor") self.now = mktime(gmtime()) - self.wf = wfToCheck.name + self.wf = workflow.name self.wfController = WorkflowController(self.wf) - self.wfToCheck = wfToCheck - 
self.wfToCheck.wm_status = self.wfController.request.get("RequestStatus") + self.workflow = workflow + self.workflow.wm_status = self.wfController.request.get("RequestStatus") # TODO: Check this - self.unifiedStatus = wfToCheck.status + self.unifiedStatus = workflow.status self.checkor = kwargs.get("checkor") self.rucioReader = RucioReader() # TODO: What's the difference between assistanceTags & existingAssistaceTags? - self.assistanceTags, self.existingAssistaceTags = set(), set(wfToCheck.status.split("-")[1:]) + self.assistanceTags, self.existingAssistaceTags = set(), set(workflow.status.split("-")[1:]) self.acdcs = dict() self.campaigns = dict() @@ -75,8 +76,8 @@ def _skipWorkflow(self) -> bool: if ( (self.checkor.specificWf and self.checkor.specificWf not in self.wf) or os.path.isfile(".checkor_stop") - or self.wfToCheck.wm_status in ["assigned", "acquired"] - or self.wfToCheck.wm_status != "completed" + or self.workflow.wm_status in ["assigned", "acquired"] + or self.workflow.wm_status != "completed" ): self.logger.info("Skipping workflow %s", self.wf) return True @@ -89,8 +90,8 @@ def _setWorkflowToClose(self) -> bool: :return: True if workflow should be closed, False o/w """ self.logger.info("Checking if the unified status should be 'close' already, but not") - if self.wfToCheck.wm_status in ["closed-out", "announced"] and self.wfToCheck.unifiedStatus != "close": - self.logger.info("%s is already %s, setting as close", self.wf, self.wfToCheck.wm_status) + if self.workflow.wm_status in ["closed-out", "announced"] and self.workflow.unifiedStatus != "close": + self.logger.info("%s is already %s, setting as close", self.wf, self.workflow.wm_status) self.newStatus = "close" return True else: @@ -104,7 +105,7 @@ def _setWorkflowToForget(self) -> bool: :return: True if workflow should be forgotten, False o/w """ self.logger.info("Checking if the unified status should be 'forget'") - if self.wfController.request.isRelVal() and self.wfToCheck.wm_status in [ + if 
self.wfController.request.isRelVal() and self.workflow.wm_status in [ "failed", "aborted", "aborted-archived", @@ -112,11 +113,11 @@ def _setWorkflowToForget(self) -> bool: "rejected-archived", "aborted-completed", ]: - self.logger.info("%s is %s, setting the unified status as 'forget'", self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting the unified status as 'forget'", self.wf, self.workflow.wm_status) self.newStatus = "forget" return True else: - self.logger.info("%s is %s, not setting it as 'forget'", self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, not setting it as 'forget'", self.wf, self.workflow.wm_status) return False @@ -125,7 +126,7 @@ def _setWorkflowToTrouble(self) -> bool: The function to check if a given workflow should be set as trouble :return: True if workflow should be set as trouble, False o/w """ - if not self.wfController.request.isRelVal() and self.wfToCheck.wm_status in [ + if not self.wfController.request.isRelVal() and self.workflow.wm_status in [ "failed", "aborted", "aborted-archived", @@ -133,7 +134,7 @@ def _setWorkflowToTrouble(self) -> bool: "rejected-archived", "aborted-completed", ]: - self.logger.info("%s is %s, setting as trouble", self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting as trouble", self.wf, self.workflow.wm_status) self.newStatus = "trouble" return True @@ -181,7 +182,7 @@ def _isWorkflowOnHold(self) -> bool: The function to check if a given workfloe is on hold :return: True if workflow is on hold, False o/w """ - return "-onhold" in self.wfToCheck.wm_status or self.wf in self.checkor.onHoldWfs + return "-onhold" in self.workflow.wm_status or self.wf in self.checkor.onHoldWfs def _getMinFamilyCompletedDelay(self, family: list) -> float: """ @@ -217,12 +218,12 @@ def _setNewOnHoldStatus(self, family: list) -> bool: self.bypassChecks = True return False - if "-onhold" in self.wfToCheck.wm_status and self.wf in self.checkor.onHoldWfs and not self.bypassChecks: 
+ if "-onhold" in self.workflow.wm_status and self.wf in self.checkor.onHoldWfs and not self.bypassChecks: self.logger.info("%s is on hold", self.wf) return True if self.wf in self.checkor.onHoldWfs and not self.bypassChecks: - self.logger.info("%s is %s, setting as on hold", self.wf, self.wfToCheck.wm_status) + self.logger.info("%s is %s, setting as on hold", self.wf, self.workflow.wm_status) self.newStatus = "assistance-onhold" return True @@ -800,7 +801,7 @@ def _getAnnounceAssistanceTags(self) -> list: self.wfController.logger.info( "The output of this workflow are essentially good to be announced while we work on the rest" ) - return ["announced" if "announced" in self.wfToCheck.status else "announce"] + return ["announced" if "announced" in self.workflow.status else "announce"] return [] @@ -1130,11 +1131,11 @@ def _closeOutWorkflow(self) -> None: """ self.wfController.logger.info("Setting %s as closed-out", self.wf) - if self.wfToCheck.status in ["closed-out", "announced", "normal-archived"]: + if self.workflow.status in ["closed-out", "announced", "normal-archived"]: self.logger.info( "%s is already %s, not trying to close-out as assuming it does", self.wf, - self.wfToCheck.status, + self.workflow.status, ) self.newStatus = "close" return @@ -1202,7 +1203,7 @@ def _setAssistanceStatus(self) -> None: The function to set the workflow to assistance """ assistanceStatus = self._getAssistanceStatus() - if "manual" not in self.wfToCheck.status or assistanceStatus != "assistance-recovery": + if "manual" not in self.workflow.status or assistanceStatus != "assistance-recovery": self.newStatus = assistanceStatus self.logger.info("Ultimate assistance status is: %s", str(assistanceStatus)) @@ -1309,7 +1310,7 @@ def _writeResponse(self) -> dict: """ # TODO: What's the difference between workflow and wf??? 
response = { - "workflow": self.wfToCheck, + "workflow": self.workflow, "wf": self.wf, "failed": self.failed, "isClosing": self.isClosing, From b7ff5dee5294f572b47b93e238b0fc8a5386b462 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:43:29 +0300 Subject: [PATCH 14/17] Fix typo --- src/python/Unified/Checkor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index c89065dc1..1ae0bc5fc 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -433,9 +433,13 @@ def _check(self, wfsToCheck: list) -> None: def _checkWorkflow(workflow: list) -> dict: return WorkflowCheckor(workflow).check() + #checkResponses = _checkWorkflow(wfsToCheck) + # self._updateWorkflowsRecords(checkResponses) + # self._updateWorkflowsStatus(checkResponses) + for workflow in wfsToCheck: # TODO: Workflow statuses have been already updated in WorkflowCheckor???? - checkResponses = _checkWorkflow(wfsToCheck) + checkResponses = _checkWorkflow(workflow) self.logger.critical("Response to Checkor:") self.logger.critical(checkResponses) # TODO: The following function updates closeoutInfo table of MongoDB. From c2749c8451eae8ca4a7fc904d7b92c2bee9514ea Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:46:29 +0300 Subject: [PATCH 15/17] Add a new argument --- src/python/Unified/Checkor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index 1ae0bc5fc..bfbb0cb9f 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -439,7 +439,7 @@ def _checkWorkflow(workflow: list) -> dict: for workflow in wfsToCheck: # TODO: Workflow statuses have been already updated in WorkflowCheckor???? 
- checkResponses = _checkWorkflow(workflow) + checkResponses = _checkWorkflow(workflow=workflow, checkor=self) self.logger.critical("Response to Checkor:") self.logger.critical(checkResponses) # TODO: The following function updates closeoutInfo table of MongoDB. From 7396a5ad0612b494cbcbab3537c68db6efc0689e Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 15 Feb 2022 13:50:21 +0300 Subject: [PATCH 16/17] Add a new argument --- src/python/Unified/Helpers/WorkflowCheckor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python/Unified/Helpers/WorkflowCheckor.py b/src/python/Unified/Helpers/WorkflowCheckor.py index c50c8fdda..beccfd595 100644 --- a/src/python/Unified/Helpers/WorkflowCheckor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -9,6 +9,7 @@ from Services.Rucio.RucioReader import RucioReader from WorkflowMgmt.WorkflowController import WorkflowController from WorkflowMgmt.WorkflowStatusEnforcer import WorkflowStatusEnforcer +from Unified.Checkor import Checkor from typing import Optional, Tuple @@ -19,7 +20,7 @@ class WorkflowCheckor(): General API for checking a given workflow """ - def __init__(self, workflow: Workflow, logger: Optional[Logger] = None, **kwargs) -> None: + def __init__(self, workflow: Workflow, checkor: Checkor, logger: Optional[Logger] = None, **kwargs) -> None: try: #super().__init__(self) self.logger = logger or getLogger(self.__class__.__name__) @@ -34,7 +35,8 @@ def __init__(self, workflow: Workflow, logger: Optional[Logger] = None, **kwargs # TODO: Check this self.unifiedStatus = workflow.status - self.checkor = kwargs.get("checkor") + #self.checkor = kwargs.get("checkor") + self.checkor = checkor self.rucioReader = RucioReader() # TODO: What's the difference between assistanceTags & existingAssistaceTags? 
From e2dbd07e5155f56e3a3ac1aa37e221b3ed3c36c5 Mon Sep 17 00:00:00 2001 From: haozturk Date: Tue, 5 Apr 2022 15:27:57 +0200 Subject: [PATCH 17/17] Adjust dry run mode --- src/python/Unified/Checkor.py | 8 +++++--- src/python/Unified/Helpers/WorkflowCheckor.py | 3 +-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/python/Unified/Checkor.py b/src/python/Unified/Checkor.py index bfbb0cb9f..580c71df2 100644 --- a/src/python/Unified/Checkor.py +++ b/src/python/Unified/Checkor.py @@ -430,8 +430,8 @@ def _check(self, wfsToCheck: list) -> None: #@runWithMultiThreading(mtParam="wfsToCheck", maxThreads=self.options.get("nThreads")) #def _checkWorkflow(self, wfsToCheck: list) -> dict: #return WorkflowCheckor(wfsToCheck, checkor=self).check() - def _checkWorkflow(workflow: list) -> dict: - return WorkflowCheckor(workflow).check() + def _checkWorkflow(workflow: Workflow, checkor: Checkor) -> dict: + return WorkflowCheckor(workflow, checkor).check() #checkResponses = _checkWorkflow(wfsToCheck) # self._updateWorkflowsRecords(checkResponses) @@ -454,5 +454,7 @@ def _checkWorkflow(workflow: list) -> dict: if __name__ == "__main__": options, specificWf = Checkor.parseOptions() checkor = Checkor(options=options, specificWf=specificWf) - if checkor.go(): + # TODO: Update this in production + #if checkor.go(): + if True: checkor.run() diff --git a/src/python/Unified/Helpers/WorkflowCheckor.py b/src/python/Unified/Helpers/WorkflowCheckor.py index beccfd595..4f5ab3af7 100644 --- a/src/python/Unified/Helpers/WorkflowCheckor.py +++ b/src/python/Unified/Helpers/WorkflowCheckor.py @@ -9,7 +9,6 @@ from Services.Rucio.RucioReader import RucioReader from WorkflowMgmt.WorkflowController import WorkflowController from WorkflowMgmt.WorkflowStatusEnforcer import WorkflowStatusEnforcer -from Unified.Checkor import Checkor from typing import Optional, Tuple @@ -20,7 +19,7 @@ class WorkflowCheckor(): General API for checking a given workflow """ - def __init__(self, workflow: 
Workflow, checkor: Checkor, logger: Optional[Logger] = None, **kwargs) -> None: + def __init__(self, workflow: Workflow, checkor, logger: Optional[Logger] = None, **kwargs) -> None: try: #super().__init__(self) self.logger = logger or getLogger(self.__class__.__name__)