diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4a8c0413..9b56b611 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,21 +18,3 @@ repos: - id: isort name: isort (python) args: ["--settings-path", "pyproject.toml"] - -# exclude python 2 code which cannot be dealt with black -exclude: | - (?x)( - ^pandaharvester/harvestermonitor/arc_monitor.py| - ^pandaharvester/harvestermisc/arc_utils.py| - ^pandaharvester/harvesterpayload/simple_wrapper_mpi.py| - ^pandaharvester/harvestersubmitter/apfgrid_submitter.py| - ^pandaharvester/harvestertest/dumpTable.py| - ^pandaharvester/harvestertest/getQueuedata.py| - ^pandaharvester/harvestermessenger/arc_messenger.py| - ^pandaharvester/harvestersubmitter/arc_submitter.py| - ^pandaharvester/harvestertest/stageOutTest_globus.py| - ^pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py| - ^pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py| - ^pandaharvester/harvestercloud/google_startup_script.py| - ^& - ) \ No newline at end of file diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 33b954b8..a5dfb2ab 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-01-2024 12:22:14 on flin (by mightqxc)" +timestamp = "13-02-2024 10:21:25 on cleanup (by mightqxc)" diff --git a/pandaharvester/harvestercloud/cernvm_aux.py b/pandaharvester/harvestercloud/cernvm_aux.py deleted file mode 100644 index 6ce1324c..00000000 --- a/pandaharvester/harvestercloud/cernvm_aux.py +++ /dev/null @@ -1,12 +0,0 @@ -import sys -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText - - -def encode_user_data(user_data): - attached_message = MIMEMultipart() - message = MIMEText(user_data, "cloud-config", sys.getdefaultencoding()) - message.add_header("Content-Disposition", f'attachment; filename="cs-cloud-init.yaml"') - attached_message.attach(message) - - return attached_message diff --git a/pandaharvester/harvestercloud/google_startup_script.py b/pandaharvester/harvestercloud/google_startup_script.py deleted file mode 100644 index 3da11abc..00000000 --- a/pandaharvester/harvestercloud/google_startup_script.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python - -""" -This script will be executed at the VM startup time. 
-- It will download the proxy and panda queue from Google instance metadata -- It will download the pilot wrapper from github and execute it -- It will upload the pilot logs to panda cache -""" - -import requests -try: - import subprocess32 as subprocess -except BaseException: - import subprocess -import os -import sys -import logging -import time -import traceback -import zlib -from threading import Thread - -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', - filename='/tmp/vm_script.log', filemode='w') - -METADATA_URL = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/{0}" - -global loop -loop = True - - -def upload_logs(url, log_file_name, destination_name, proxy_path): - try: - - # open and compress the content of the file - with open(log_file_name, 'rb') as log_file_object: - files = {'file': (destination_name, zlib.compress(log_file_object.read()))} - - cert = [proxy_path, proxy_path] - # verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory - - logging.debug('[upload_logs] start') - res = requests.post(url, files=files, timeout=180, verify=False, cert=cert) - logging.debug('[upload_logs] finished with code={0} msg={1}'.format(res.status_code, res.text)) - if res.status_code == 200: - return True - except BaseException: - err_type, err_value = sys.exc_info()[:2] - err_messsage = "failed to put with {0}:{1} ".format(err_type, err_value) - err_messsage += traceback.format_exc() - logging.debug('[upload_logs] excepted with:\n {0}'.format(err_messsage)) - - return False - - -def contact_harvester(harvester_frontend, data, auth_token, proxy_path): - try: - headers = {'Content-Type': 'application/json', - 'Authorization': 'Bearer {0}'.format(auth_token)} - cert = [proxy_path, proxy_path] - # verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory - verify = False - resp = requests.post(harvester_frontend, json=data, headers=headers, cert=cert, verify=verify) - logging.debug('[contact_harvester] harvester returned: {0}'.format(resp.text)) - except Exception as e: - # message could not be sent - logging.debug('[contact_harvester] failed to send message to harvester: {0}'.format(e)) - pass - - -def heartbeat(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'heartbeat', 'workerID': worker_id, 'data': None} - logging.debug('[heartbeat] sending heartbeat to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - -def suicide(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'killWorker', 'workerID': worker_id, 'data': None} - logging.debug('[suicide] sending suicide message to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - -def heartbeat_loop(harvester_frontend, worker_id, auth_token, proxy_path): - while loop: - heartbeat(harvester_frontend, worker_id, auth_token, proxy_path) - time.sleep(300) - - -def get_url(url, headers=None): - """ - get content from specified URL - """ - - reply = requests.get(url, headers=headers) - if reply.status_code != 200: - logging.debug('[get_attribute] Failed to open {0}'.format(url)) - return None - else: - return reply.content - - -def get_configuration(): - - # get the proxy certificate and save it - proxy_path = "/tmp/x509up" - proxy_url = METADATA_URL.format("proxy") - 
proxy_string = get_url(proxy_url, headers={"Metadata-Flavor": "Google"}) - with open(proxy_path, "w") as proxy_file: - proxy_file.write(proxy_string) - os.environ['X509_USER_PROXY'] = proxy_path - logging.debug('[main] initialized proxy') - - # get the panda queue name - pq_url = METADATA_URL.format("panda_queue") - panda_queue = get_url(pq_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got panda queue: {0}'.format(panda_queue)) - - # get the harvester frontend URL, where we'll send heartbeats - harvester_frontend_url = METADATA_URL.format("harvester_frontend") - harvester_frontend = get_url(harvester_frontend_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got harvester frontend: {0}'.format(harvester_frontend)) - - # get the worker id - worker_id_url = METADATA_URL.format("worker_id") - worker_id = get_url(worker_id_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got worker id: {0}'.format(worker_id)) - - # get the authentication token - auth_token_url = METADATA_URL.format("auth_token") - auth_token = get_url(auth_token_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got authentication token') - - # get the URL (e.g. panda cache) to upload logs - logs_frontend_w_url = METADATA_URL.format("logs_url_w") - logs_frontend_w = get_url(logs_frontend_w_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got url to upload logs') - - # get the URL (e.g. panda cache) where the logs can be downloaded afterwards - logs_frontend_r_url = METADATA_URL.format("logs_url_r") - logs_frontend_r = get_url(logs_frontend_r_url, headers={"Metadata-Flavor": "Google"}) - logging.debug('[main] got url to download logs') - - return proxy_path, panda_queue, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r - - -if __name__ == "__main__": - - # get all the configuration from the GCE metadata server - proxy_path, panda_queue, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r = get_configuration() - - # start a separate thread that will send a heartbeat to harvester every 5 minutes - heartbeat_thread = Thread(target=heartbeat_loop, args=(harvester_frontend, worker_id, auth_token, proxy_path)) - heartbeat_thread.start() - - # the pilot should propagate the download link via the pilotId field in the job table - destination_name = '{0}.log'.format(worker_id) - log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name) - os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot - - # get the pilot wrapper - wrapper_path = "/tmp/runpilot3-wrapper.sh" - wrapper_url = "https://raw.githubusercontent.com/fbarreir/adc/master/runpilot3-wrapper.sh" - wrapper_string = get_url(wrapper_url) - with open(wrapper_path, "w") as wrapper_file: - wrapper_file.write(wrapper_string) - os.chmod(wrapper_path, 0544) # make pilot wrapper executable - logging.debug('[main] downloaded pilot wrapper') - - # execute the pilot wrapper - logging.debug('[main] starting pilot wrapper...') - wrapper_params = '-s {0} -h {0}'.format(panda_queue) - if 'ANALY' in panda_queue: - wrapper_params = '{0} -u user'.format(wrapper_params) - command = "/tmp/runpilot3-wrapper.sh {0} -p 25443 -w https://pandaserver.cern.ch >& /tmp/wrapper-wid.log".\ - format(wrapper_params) - subprocess.call(command, shell=True) - logging.debug('[main] pilot wrapper done...') - - # upload logs to e.g. 
panda cache or similar - upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) - - # ask harvester to kill the VM and stop the heartbeat - suicide(harvester_frontend, worker_id, auth_token, proxy_path) - loop = False - heartbeat_thread.join() diff --git a/pandaharvester/harvestercloud/googlecloud.py b/pandaharvester/harvestercloud/googlecloud.py deleted file mode 100644 index ffc5ac8f..00000000 --- a/pandaharvester/harvestercloud/googlecloud.py +++ /dev/null @@ -1,132 +0,0 @@ -import os - -import googleapiclient.discovery -from pandaharvester.harvestercloud import cernvm_aux -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestermisc.frontend_utils import HarvesterToken - -PROXY_PATH = harvester_config.pandacon.cert_file -USER_DATA_PATH = harvester_config.googlecloud.user_data_file -HARVESTER_FRONTEND = harvester_config.googlecloud.harvester_frontend - -IMAGE = harvester_config.googlecloud.image -ZONE = harvester_config.googlecloud.zone -PROJECT = harvester_config.googlecloud.project -SERVICE_ACCOUNT_FILE = harvester_config.googlecloud.service_account_file -os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_FILE - -compute = googleapiclient.discovery.build("compute", "v1") - - -class GoogleVM: - def __init__(self, work_spec, queue_config): - self.harvester_token = HarvesterToken() - self.work_spec = work_spec - self.queue_config = queue_config - harvester_id_clean = harvester_config.master.harvester_id.replace("-", "").replace("_", "").lower() - self.name = f"{harvester_id_clean}-gce-{work_spec.workerID}" - # self.name = self.name.replace('_', '-') # underscores in VM names are not allowed by GCE - self.image = self.resolve_image_url() - self.instance_type = self.resolve_instance_type() - self.config = self.prepare_metadata() - - def resolve_image_url(self): - """ - TODO: implement - :param work_spec: worker specifications - :return: URL pointing to the machine type to use - """ - # Get the latest Debian Jessie image - image_response = compute.images().getFromFamily(project=PROJECT, family="cernvm").execute() - source_disk_image = image_response["selfLink"] - - return source_disk_image - - def resolve_instance_type(self): - """ - Resolves the ideal instance type for the work specifications. An overview on VM types can be found here: https://cloud.google.com/compute/docs/machine-types - - TODO: for the moment we will just assume we need the standard type, but in the future this function can be expanded - TODO: to consider also custom VMs, hi/lo mem, many-to-one mode, etc. - - :param work_spec: worker specifications - :return: instance type name - """ - - # Calculate the number of VCPUs - cores = 8 # default value. TODO: probably should except if we don't find a suitable number - standard_cores = [1, 2, 4, 8, 16, 32, 64, 96] - for standard_core in standard_cores: - if self.work_spec.nCore <= standard_core: - cores = standard_core - break - - # Calculate the memory: 2 GBs per core. 
It needs to be expressed in MB - # https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type - try: - ram_per_core = self.queue_config.submitter["ram_per_core"] - except KeyError: - ram_per_core = 2 - memory = cores * ram_per_core * 1024 - - try: - zone = self.queue_config.zone - except AttributeError: - zone = ZONE - - # instance_type = 'zones/{0}/machineTypes/n1-standard-{1}'.format(zone, cores) - # Use custom machine types to reduce cost - instance_type = f"zones/{zone}/machineTypes/custom-{cores}-{memory}" - - return instance_type - - def prepare_metadata(self): - """ - TODO: prepare any user data and metadata that we want to pass to the VM instance - :return: - """ - - # read the proxy - with open(PROXY_PATH, "r") as proxy_file: - proxy_string = proxy_file.read() - - with open(USER_DATA_PATH, "r") as user_data_file: - user_data = user_data_file.read() - - try: - preemptible = self.queue_config.submitter["preemptible"] - except KeyError: - preemptible = False - - try: - disk_size = self.queue_config.submitter["disk_size"] - except KeyError: - disk_size = 50 - - config = { - "name": self.name, - "machineType": self.instance_type, - "scheduling": {"preemptible": preemptible}, - # Specify the boot disk and the image to use as a source. - "disks": [{"boot": True, "autoDelete": True, "initializeParams": {"sourceImage": IMAGE, "diskSizeGb": 50}}], - # Specify a network interface with NAT to access the public internet - "networkInterfaces": [{"network": "global/networks/default", "accessConfigs": [{"type": "ONE_TO_ONE_NAT", "name": "External NAT"}]}], - # Allow the instance to access cloud storage and logging. - "serviceAccounts": [ - {"email": "default", "scopes": ["https://www.googleapis.com/auth/devstorage.read_write", "https://www.googleapis.com/auth/logging.write"]} - ], - "metadata": { - "items": [ - {"key": "user-data", "value": str(cernvm_aux.encode_user_data(user_data))}, - {"key": "proxy", "value": proxy_string}, - {"key": "panda_queue", "value": self.work_spec.computingSite}, - {"key": "harvester_frontend", "value": HARVESTER_FRONTEND}, - {"key": "worker_id", "value": self.work_spec.workerID}, - {"key": "auth_token", "value": self.harvester_token.generate(payload={"sub": str(self.work_spec.batchID)})}, - {"key": "logs_url_w", "value": f"{harvester_config.pandacon.pandaCacheURL_W}/updateLog"}, - {"key": "logs_url_r", "value": harvester_config.pandacon.pandaCacheURL_R}, - ] - }, - } - - return config diff --git a/pandaharvester/harvesterfifo/redis_fifo.py b/pandaharvester/harvesterfifo/redis_fifo.py deleted file mode 100644 index b51f4036..00000000 --- a/pandaharvester/harvesterfifo/redis_fifo.py +++ /dev/null @@ -1,240 +0,0 @@ -import os -import random -import re -import time - -import redis -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore.plugin_base import PluginBase - - -def random_id(): - return random.randrange(2**30) - - -class RedisFifo(PluginBase): - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - _redis_conn_opt_dict = {} - if hasattr(self, "redisHost"): - _redis_conn_opt_dict["host"] = self.redisHost - elif hasattr(harvester_config.fifo, "redisHost"): - _redis_conn_opt_dict["host"] = harvester_config.fifo.redisHost - if hasattr(self, "redisPort"): - _redis_conn_opt_dict["port"] = self.redisPort - elif hasattr(harvester_config.fifo, "redisPort"): - _redis_conn_opt_dict["port"] = harvester_config.fifo.redisPort - if hasattr(self, "redisDB"): - 
_redis_conn_opt_dict["db"] = self.redisDB - elif hasattr(harvester_config.fifo, "redisDB"): - _redis_conn_opt_dict["db"] = harvester_config.fifo.redisDB - if hasattr(self, "redisPassword"): - _redis_conn_opt_dict["password"] = self.redisPassword - elif hasattr(harvester_config.fifo, "redisPassword"): - _redis_conn_opt_dict["password"] = harvester_config.fifo.redisPassword - self.qconn = redis.StrictRedis(**_redis_conn_opt_dict) - self.id_score = f"{self.titleName}-fifo_id-score" - self.id_item = f"{self.titleName}-fifo_id-item" - self.id_temp = f"{self.titleName}-fifo_id-temp" - - def __len__(self): - return self.qconn.zcard(self.id_score) - - def _peek(self, mode="first", id=None, skip_item=False): - if mode == "first": - try: - id_gotten, score = self.qconn.zrange(self.id_score, 0, 0, withscores=True)[0] - except IndexError: - return None - elif mode == "last": - try: - id_gotten, score = self.qconn.zrevrange(self.id_score, 0, 0, withscores=True)[0] - except IndexError: - return None - else: - resVal = self.qconn.sismember(self.id_temp, id) - if (mode == "id" and not resVal) or (mode == "idtemp" and resVal): - id_gotten = id - score = self.qconn.zscore(self.id_score, id) - else: - id_gotten, score = None, None - if skip_item: - item = None - else: - item = self.qconn.hget(self.id_item, id_gotten) - if id_gotten is None: - return None - else: - return (id_gotten, item, score) - - def _pop(self, timeout=None, protective=False, mode="first"): - keep_polling = True - wait = 0.1 - max_wait = 2 - tries = 1 - last_attempt_timestamp = time.time() - id, item, score = None, None, None - while keep_polling: - peeked_tuple = self._peek(mode=mode) - if peeked_tuple is None: - time.sleep(wait) - wait = min(max_wait, tries / 10.0 + wait) - else: - id, item, score = peeked_tuple - while True: - try: - with self.qconn.pipeline() as pipeline: - pipeline.watch(self.id_score, self.id_item, self.id_temp) - pipeline.multi() - if protective: - pipeline.sadd(self.id_temp, id) - pipeline.zrem(self.id_score, id) - else: - pipeline.srem(self.id_temp, id) - pipeline.hdel(self.id_item, id) - pipeline.zrem(self.id_score, id) - resVal = pipeline.execute() - except redis.WatchError: - continue - else: - break - if resVal[-2] == 1 and resVal[-1] == 1: - break - tries += 1 - now_timestamp = time.time() - if timeout is not None and (now_timestamp - last_attempt_timestamp) >= timeout: - break - return id, item, score - - # number of objects in queue - def size(self): - return len(self) - - # enqueue with priority score - def put(self, item, score): - generate_id_attempt_timestamp = time.time() - while True: - id = random_id() - resVal = None - with self.qconn.pipeline() as pipeline: - while True: - try: - pipeline.watch(self.id_score, self.id_item) - pipeline.multi() - pipeline.execute_command("ZADD", self.id_score, "NX", score, id) - pipeline.hsetnx(self.id_item, id, item) - resVal = pipeline.execute() - except redis.WatchError: - continue - else: - break - if resVal is not None: - if resVal[-2] == 1 and resVal[-1] == 1: - return True - if time.time() > generate_id_attempt_timestamp + 60: - raise Exception("Cannot generate unique id") - return False - time.sleep(0.0001) - return False - - # enqueue by id - def putbyid(self, id, item, score): - with self.qconn.pipeline() as pipeline: - while True: - try: - pipeline.watch(self.id_score, self.id_item) - pipeline.multi() - pipeline.execute_command("ZADD", self.id_score, "NX", score, id) - pipeline.hsetnx(self.id_item, id, item) - resVal = pipeline.execute() - except 
redis.WatchError: - continue - else: - break - if resVal is not None: - if resVal[-2] == 1 and resVal[-1] == 1: - return True - return False - - # dequeue the first object - def get(self, timeout=None, protective=False): - return self._pop(timeout=timeout, protective=protective, mode="first") - - # dequeue the last object - def getlast(self, timeout=None, protective=False): - return self._pop(timeout=timeout, protective=protective, mode="last") - - # get tuple of (id, item, score) of the first object without dequeuing it - def peek(self, skip_item=False): - return self._peek(skip_item=skip_item) - - # get tuple of (id, item, score) of the last object without dequeuing it - def peeklast(self, skip_item=False): - return self._peek(mode="last", skip_item=skip_item) - - # get tuple of (id, item, score) of object by id without dequeuing it - def peekbyid(self, id, temporary=False, skip_item=False): - if temporary: - return self._peek(mode="idtemp", id=id, skip_item=skip_item) - else: - return self._peek(mode="id", id=id, skip_item=skip_item) - - # drop all objects in queue - def clear(self): - with self.qconn.pipeline() as pipeline: - while True: - try: - pipeline.watch(self.id_score, self.id_item, self.id_temp) - pipeline.multi() - pipeline.delete(self.id_score) - pipeline.delete(self.id_item) - pipeline.delete(self.id_temp) - pipeline.execute() - except redis.WatchError: - continue - else: - break - - # delete objects by list of id - def delete(self, ids): - if isinstance(ids, (list, tuple)): - with self.qconn.pipeline() as pipeline: - while True: - try: - pipeline.watch(self.id_score, self.id_item, self.id_temp) - pipeline.multi() - pipeline.srem(self.id_temp, *ids) - pipeline.hdel(self.id_item, *ids) - pipeline.zrem(self.id_score, *ids) - resVal = pipeline.execute() - except redis.WatchError: - continue - else: - n_row = resVal[-1] - return n_row - else: - raise TypeError("ids should be list or tuple") - - # Move objects in temporary space to the queue - def restore(self, ids): - with self.qconn.pipeline() as pipeline: - while True: - now_timestamp = time.time() - try: - pipeline.watch(self.id_score, self.id_item, self.id_temp) - if ids is None: - pipeline.multi() - pipeline.delete(self.id_temp) - pipeline.execute() - elif isinstance(ids, (list, tuple)): - if len(ids) > 0: - pipeline.multi() - pipeline.srem(self.id_temp, *ids) - pipeline.execute() - else: - raise TypeError("ids should be list or tuple or None") - except redis.WatchError: - continue - else: - break diff --git a/pandaharvester/harvestermisc/arc_utils.py b/pandaharvester/harvestermisc/arc_utils.py deleted file mode 100644 index 62db2201..00000000 --- a/pandaharvester/harvestermisc/arc_utils.py +++ /dev/null @@ -1,178 +0,0 @@ -import inspect -import json -import pickle -import re -import arc - -from pandaharvester.harvestercore import core_utils - -class DataPoint: - ''' - Wrapper around arc.datapoint_from_url() which does not clean up DataPoints - when python objects are destroyed, leading to connection leaking when used - with gridftp. This class should be used instead of arc.datapoint_from_url(). - It can be called like dp = DataPoint('gsiftp://...', uc); dp.h.Stat() - where uc is an arc.UserConfig object. - ''' - def __init__(self, u, uc): - self.h = arc.datapoint_from_url(u, uc) - def __del__(self): - arc.DataPoint.__swig_destroy__(self.h) - -class ARCPandaJob: - ''' - Class containing information on a panda job being processed by ARC plugins. 
- Normally at the end of the job this information is encoded in the pickle - file produced by the pilot and downloaded by harvester. Harvester adjusts - certain information before sending it to panda. If the pickle is missing - then default values are used. The fields here are those expected by - JobDispatcher.updateJob() in the Panda server. - ''' - - def __init__(self, jobinfo={}, filehandle=None, filename=''): - ''' - Make a new ARCPandaJob. jobId and state are mandatory. jobinfo is a - dictonary which can contain any of this class' attributes. If filehandle - is given pickled info is loaded from the file object. If filename is - given pickled info is read from a file. - ''' - self.jobId = None - self.state = None - self.timeout = 60 - - if jobinfo: - self.setAttributes(jobinfo) - elif filehandle: - self.setAttributes(pickle.load(filehandle)) - elif filename: - with open(filename) as f: - jobinfo = pickle.load(f) - self.setAttributes(jobinfo) - - def __setattr__(self, name, value): - ''' - Override to allow setting arbitrary key value pairs - ''' - self.__dict__[name] = value - - def setAttributes(self, jobinfo): - ''' - Set attributes in the jobinfo dictionary - ''' - for key, value in jobinfo.iteritems(): - self.__dict__[key] = value - - def dictionary(self): - ''' - Return a dictionary of all the attributes with set values - ''' - return self.__dict__ - - def writeToFile(self, filename): - ''' - Write a pickle of job info to filename. Overwrites an existing file. - ''' - try: - os.makedirs(os.path.dirname(filename), 0755) - except: - pass - - with open(filename, 'w') as f: - pickle.dump(self.dictionary(), f) - -class ARCLogger: - ''' - Wrapper around harvester logger to add ARC logging to same log file - ''' - - def __init__(self, baselogger, workerid): - '''Set up ARC logging to log to the same file as the baselogger''' - - # Set method name to the caller of this method - self.log = core_utils.make_logger(baselogger, - token='workerID={0}'.format(workerid), - method_name=inspect.stack()[1][3]) - - # Get the log file from the baseLogger - loghandler = baselogger.handlers[0] # Assumes one handler - # LogFile must exist for the lifetime of this object - self.logfile = arc.LogFile(str(loghandler.baseFilename)) - self.logfile.setFormat(arc.LongFormat) - arc.Logger_getRootLogger().setThreadContext() - arc.Logger_getRootLogger().addDestination(self.logfile) - arc.Logger_getRootLogger().setThreshold(arc.VERBOSE) # TODO configurable - - def __del__(self): - ''' - Since self.logfile disappears when this object is deleted we have to - remove it from the root logger destinations - ''' - arc.Logger_getRootLogger().removeDestinations() - - -def workspec2arcjob(workspec): - '''Convert WorkSpec.workAttributes to arc.Job object''' - - job = arc.Job() - try: - wsattrs = workspec.workAttributes['arcjob'] - proxyrole = workspec.workAttributes['proxyrole'] - except: - # Job was not submitted yet - return (job, arc.Time(), None) - - for attr in dir(job): - if attr not in wsattrs or attr == 'CreationTime': - continue - - attrtype = type(getattr(job, attr)) - # Some object types need special treatment - if attrtype == arc.StringList: - strlist = arc.StringList() - for item in wsattrs[attr].split('|'): - strlist.append(str(item)) - setattr(job, attr, strlist) - elif attrtype == arc.StringStringMap: - ssm = arc.StringStringMap() - for (k, v) in json.loads(wsattrs[attr]).items(): - ssm[str(k)] = str(v) - setattr(job, attr, ssm) - else: - setattr(job, attr, attrtype(str(wsattrs[attr]))) - return (job, 
arc.Time(str(wsattrs['ModificationTime'])), proxyrole) - -def arcjob2workspec(arcjob, workspec): - '''Fill WorkSpec workAttributes with ARC job attributes''' - - jobattrs = {} - for attr in dir(arcjob): - # Don't store internal python attrs or job description - if re.match('^__', attr) or attr == 'JobDescriptionDocument': - continue - - attrtype = type(getattr(arcjob, attr)) - if attrtype == int or attrtype == str: - jobattrs[attr] = getattr(arcjob, attr) - elif attrtype == arc.JobState: - jobattrs[attr] = getattr(arcjob, attr).GetGeneralState() - elif attrtype == arc.StringList: - jobattrs[attr] = '|'.join(getattr(arcjob, attr)) - elif attrtype == arc.URL: - jobattrs[attr] = getattr(arcjob, attr).str().replace(r'\2f',r'/') - elif attrtype == arc.StringStringMap: - ssm = getattr(arcjob, attr) - tmpdict = dict(zip(ssm.keys(), ssm.values())) - jobattrs[attr] = json.dumps(tmpdict) - elif attrtype == arc.Period: - jobattrs[attr] = getattr(arcjob, attr).GetPeriod() - elif attrtype == arc.Time: - if getattr(arcjob, attr).GetTime() != -1: - jobattrs[attr] = getattr(arcjob, attr).str(arc.UTCTime) - # Other attributes of complex types are not stored - - # Set update time - jobattrs['ModificationTime'] = arc.Time().str(arc.UTCTime) - if workspec.workAttributes: - workspec.workAttributes['arcjob'] = jobattrs - else: - workspec.workAttributes = {'arcjob': jobattrs} diff --git a/pandaharvester/harvestermisc/cloud_openstack_utils.py b/pandaharvester/harvestermisc/cloud_openstack_utils.py deleted file mode 100644 index f1174670..00000000 --- a/pandaharvester/harvestermisc/cloud_openstack_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -import json - -from keystoneauth1 import loading as loading -from keystoneauth1 import session as session -from novaclient import client as nova_cl - -# from cinderclient import client as cinder_cl - - -class OS_SimpleClient(object): - def __init__(self, auth_config_json_file=None): - with open(auth_config_json_file, "r") as _f: - auth_config_dict = json.load(_f) - - # Openstack API version - version = "2.0" # FIXME - if version == "2.0": - loader = loading.get_plugin_loader("v2password") - elif version >= "3.0": - loader = loading.get_plugin_loader("password") - - auth = loader.load_from_options(**auth_config_dict) - # sess = keystoneauth1.session.Session(auth=auth) - sess = session.Session(auth=auth) - - # self.nova = novaclient.client.Client(version, session=sess) - self.nova = nova_cl.Client(version, session=sess) - # self.cinder = cinderclient.client.Client(version, session=sess) - # self.cinder = cinder_cl.client.Client(version, session=sess) diff --git a/pandaharvester/harvestermisc/lancium_utils.py b/pandaharvester/harvestermisc/lancium_utils.py deleted file mode 100644 index 13548fcb..00000000 --- a/pandaharvester/harvestermisc/lancium_utils.py +++ /dev/null @@ -1,406 +0,0 @@ -""" -Lancium python API wrapper functions -""" - -import datetime -import os -import random -import re -import threading -import time -import traceback -from threading import get_ident - -from lancium.api.Data import Data -from lancium.api.Job import Job -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.core_utils import SingletonWithID -from pandaharvester.harvestercore.fifos import SpecialFIFOBase - -try: - api_key = harvester_config.lancium.api_key -except AttributeError: - raise RuntimeError("The configuration is missing the [lancium] section and/or the api_key entry") - -# The key needs to be set 
before importing the lancium API -os.environ["LANCIUM_API_KEY"] = api_key - - -# logger -base_logger = core_utils.setup_logger("lancium_utils") - -SECRETS_PATH = "/voms/" -SCRIPTS_PATH = "/scripts/" - -LANCIUM_JOB_ATTRS_LIST = [ - "id", - "name", - "status", - "created_at", - "updated_at", - "submitted_at", - "completed_at", - "exit_code", -] - - -def fake_callback(total_chunks, current_chunk): - pass - - -def get_job_name_from_workspec(workspec): - job_name = f"{harvester_config.master.harvester_id}:{workspec.workerID}" - return job_name - - -def get_workerid_from_job_name(job_name): - tmp_str_list = job_name.split(":") - harvester_id = None - worker_id = None - try: - harvester_id = str(tmp_str_list[0]) - worker_id = int(tmp_str_list[-1]) - except Exception: - pass - return (harvester_id, worker_id) - - -def get_full_batch_id(submission_host, batch_id): - full_batch_id = f"{submission_host}#{batch_id}" - return full_batch_id - - -def get_full_batch_id_from_workspec(workspec): - full_batch_id = f"{workspec.submissionHost}#{workspec.batchID}" - return full_batch_id - - -def get_host_batch_id_map(workspec_list): - """ - Get a dictionary of submissionHost: list of batchIDs from workspec_list - return {submissionHost_1: {batchID_1_1, ...}, submissionHost_2: {...}, ...} - """ - host_batch_id_map = {} - for workspec in workspec_list: - host = workspec.submissionHost - batch_id = workspec.batchID - if batch_id is None: - continue - try: - host_batch_id_map[host].append(batch_id) - except KeyError: - host_batch_id_map[host] = [batch_id] - return host_batch_id_map - - -def timestamp_to_datetime(timestamp_str): - return datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%fZ") - - -class LanciumClient(object): - def __init__(self, submission_host, queue_name=None): - self.submission_host = submission_host - self.queue_name = queue_name - - def upload_file(self, local_path, lancium_path, force=True): - tmp_log = core_utils.make_logger(base_logger, method_name="upload_file") - try: - tmp_log.debug(f"Uploading file {local_path}") - tmp_log = core_utils.make_logger(base_logger, f"queue_name={self.queue_name}", method_name="upload_file") - - data = Data().create(lancium_path, "file", source=os.path.abspath(local_path), force=force) - data.upload(os.path.abspath(local_path), fake_callback) - ex = data.show(lancium_path)[0] - tmp_log.debug(f"Done: {ex.__dict__}") - - return True, "" - except Exception as _e: - error_message = f"Failed to upload file with {_e}" - tmp_log.error(f"Failed to upload the file with {traceback.format_exc()}") - return False, error_message - - def submit_job(self, **jobparams): - # create and submit a job to lancium - tmp_log = core_utils.make_logger(base_logger, f"queue_name={self.queue_name}", method_name="submit_job") - - try: - tmp_log.debug("Creating and submitting a job") - - job = Job().create(**jobparams) - tmp_log.debug(f"Job created. name: {job.name}, id: {job.id}, status: {job.status}") - - job.submit() - tmp_log.debug(f"Job submitted. 
name: {job.name}, id: {job.id}, status: {job.status}") - batch_id = str(job.id) - return True, batch_id - except Exception as _e: - error_message = f"Failed to create or submit a job with {_e}" - tmp_log.error(f"Failed to create or submit a job with {traceback.format_exc()}") - return False, error_message - - def delete_job(self, job_id): - # delete job by job ID - tmp_log = core_utils.make_logger(base_logger, f"queue_name={self.queue_name} job_id={job_id}", method_name="delete_job") - tmp_log.debug(f"Going to delete job {job_id}") - Job.delete(job_id) - tmp_log.debug(f"Deleted job {job_id}") - - -class LanciumJobsCacheFifo(SpecialFIFOBase, metaclass=SingletonWithID): - """ - Cache FIFO for Lancium jobs - """ - - global_lock_id = -1 - - def __init__(self, target, *args, **kwargs): - name_suffix = target.split(".")[0] - name_suffix = re.sub("-", "_", name_suffix) - self.titleName = f"LanciumJobsCache_{name_suffix}" - SpecialFIFOBase.__init__(self) - - def lock(self, score=None): - lock_key = format(int(random.random() * 2**32), "x") - if score is None: - score = time.time() - retVal = self.putbyid(self.global_lock_id, lock_key, score) - if retVal: - return lock_key - return None - - def unlock(self, key=None, force=False): - peeked_tuple = self.peekbyid(id=self.global_lock_id) - if peeked_tuple.score is None or peeked_tuple.item is None: - return True - elif force or self.decode(peeked_tuple.item) == key: - self.delete([self.global_lock_id]) - return True - else: - return False - - -class LanciumJobQuery(object, metaclass=SingletonWithID): - # class lock - classLock = threading.Lock() - - def __init__(self, cacheEnable=False, cacheRefreshInterval=None, *args, **kwargs): - self.submission_host = str(kwargs.get("id")) - # Make logger - tmpLog = core_utils.make_logger( - base_logger, f"submissionHost={self.submission_host} thrid={get_ident()} oid={id(self)}", method_name="LanciumJobQuery.__init__" - ) - # Initialize - with self.classLock: - tmpLog.debug("Start") - # For cache - self.cacheEnable = cacheEnable - if self.cacheEnable: - self.cache = ([], 0) - self.cacheRefreshInterval = cacheRefreshInterval - tmpLog.debug("Initialize done") - - def query_jobs(self, batchIDs_list=[], all_jobs=False): - # Make logger - tmpLog = core_utils.make_logger(base_logger, f"submissionHost={self.submission_host}", method_name="LanciumJobQuery.query_jobs") - # Start query - tmpLog.debug("Start query") - cache_fifo = None - job_attr_all_dict = {} - # make id sets - batchIDs_set = set(batchIDs_list) - # query from cache - - def cache_query(batch_id_set, timeout=60): - # query from lancium job and update cache to fifo - def update_cache(lockInterval=90): - tmpLog.debug("update_cache") - # acquire lock with score timestamp - score = time.time() - self.cacheRefreshInterval + lockInterval - lock_key = cache_fifo.lock(score=score) - if lock_key is not None: - # acquired lock, update - tmpLog.debug("got lock, updating cache") - all_jobs_light_list = Job().all() - jobs_iter = [] - for job in all_jobs_light_list: - try: - lancium_job_id = job.id - one_job_attr = Job().get(lancium_job_id) - one_job_dict = dict() - for attr in LANCIUM_JOB_ATTRS_LIST: - one_job_dict[attr] = getattr(one_job_attr, attr, None) - jobs_iter.append(one_job_dict) - except Exception as e: - tmpLog.error(f"In update_cache all job; got exception {e.__class__.__name__}: {e} ; {repr(job)}") - timeNow = time.time() - cache_fifo.put(jobs_iter, timeNow) - self.cache = (jobs_iter, timeNow) - # release lock - retVal = cache_fifo.unlock(key=lock_key) - if 
retVal: - tmpLog.debug("done update cache and unlock") - else: - tmpLog.warning("cannot unlock... Maybe something wrong") - return jobs_iter - else: - tmpLog.debug("cache fifo locked by other thread. Skipped") - return None - - # remove invalid or outdated caches from fifo - - def cleanup_cache(timeout=60): - tmpLog.debug("cleanup_cache") - id_list = list() - attempt_timestamp = time.time() - n_cleanup = 0 - while True: - if time.time() > attempt_timestamp + timeout: - tmpLog.debug("time is up when cleanup cache. Skipped") - break - peeked_tuple = cache_fifo.peek(skip_item=True) - if peeked_tuple is None: - tmpLog.debug("empty cache fifo") - break - elif peeked_tuple.score is not None and time.time() <= peeked_tuple.score + self.cacheRefreshInterval: - tmpLog.debug("nothing expired") - break - elif peeked_tuple.id is not None: - retVal = cache_fifo.delete([peeked_tuple.id]) - if isinstance(retVal, int): - n_cleanup += retVal - else: - # problematic - tmpLog.warning("got nothing when cleanup cache, maybe problematic. Skipped") - break - tmpLog.debug(f"cleaned up {n_cleanup} objects in cache fifo") - - # start - jobs_iter = tuple() - try: - attempt_timestamp = time.time() - while True: - if time.time() > attempt_timestamp + timeout: - # skip cache_query if too long - tmpLog.debug(f"cache_query got timeout ({timeout} seconds). Skipped ") - break - # get latest cache - peeked_tuple = cache_fifo.peeklast(skip_item=True) - if peeked_tuple is not None and peeked_tuple.score is not None: - # got something - if peeked_tuple.id == cache_fifo.global_lock_id: - if time.time() <= peeked_tuple.score + self.cacheRefreshInterval: - # lock - tmpLog.debug("got fifo locked. Wait and retry...") - time.sleep(random.uniform(1, 5)) - continue - else: - # expired lock - tmpLog.debug("got lock expired. Clean up and retry...") - cleanup_cache() - continue - elif time.time() <= peeked_tuple.score + self.cacheRefreshInterval: - # got valid cache - _obj, _last_update = self.cache - if _last_update >= peeked_tuple.score: - # valid local cache - tmpLog.debug("valid local cache") - jobs_iter = _obj - else: - # valid fifo cache - tmpLog.debug("update local cache from fifo") - peeked_tuple_with_item = cache_fifo.peeklast() - if ( - peeked_tuple_with_item is not None - and peeked_tuple.id != cache_fifo.global_lock_id - and peeked_tuple_with_item.item is not None - ): - jobs_iter = cache_fifo.decode(peeked_tuple_with_item.item) - self.cache = (jobs_iter, peeked_tuple_with_item.score) - else: - tmpLog.debug("peeked invalid cache fifo object. 
Wait and retry...") - time.sleep(random.uniform(1, 5)) - continue - else: - # cache expired - tmpLog.debug("update cache in fifo") - retVal = update_cache() - if retVal is not None: - jobs_iter = retVal - cleanup_cache() - break - else: - # no cache in fifo, check with size again - if cache_fifo.size() == 0: - if time.time() > attempt_timestamp + random.uniform(10, 30): - # have waited for long enough, update cache - tmpLog.debug("waited enough, update cache in fifo") - retVal = update_cache() - if retVal is not None: - jobs_iter = retVal - break - else: - # still nothing, wait - time.sleep(2) - continue - except Exception as _e: - tb_str = traceback.format_exc() - tmpLog.error(f"Error querying from cache fifo; {_e} ; {tb_str}") - return jobs_iter - - def direct_query(batch_id_set, **kwargs): - jobs_iter = [] - batch_ids = batch_id_set - if all_jobs: - try: - all_jobs_light_list = Job().all() - batch_ids = set() - for job in all_jobs_light_list: - lancium_job_id = job.id - batch_ids.add(lancium_job_id) - except Exception as e: - tmpLog.error(f"In doing Job().all(); got exception {e.__class__.__name__}: {e} ") - for batch_id in batch_id_set: - try: - lancium_job_id = batch_id - one_job_attr = Job().get(lancium_job_id) - one_job_dict = dict() - for attr in LANCIUM_JOB_ATTRS_LIST: - one_job_dict[attr] = getattr(one_job_attr, attr, None) - jobs_iter.append(one_job_dict) - except Exception as e: - tmpLog.error(f"In doing Job().get({batch_id}); got exception {e.__class__.__name__}: {e} ") - return jobs_iter - - # query method options - query_method_list = [direct_query] - if self.cacheEnable: - cache_fifo = LanciumJobsCacheFifo(target=self.submission_host, idlist=f"{self.submission_host},{get_ident()}") - query_method_list.insert(0, cache_query) - # Go - for query_method in query_method_list: - # Query - jobs_iter = query_method(batch_id_set=batchIDs_set) - for job in jobs_iter: - try: - job_attr_dict = dict(job) - batch_id = job_attr_dict["id"] - except Exception as e: - tmpLog.error(f"in querying; got exception {e.__class__.__name__}: {e} ; {repr(job)}") - else: - full_batch_id = get_full_batch_id(self.submission_host, batch_id) - job_attr_all_dict[full_batch_id] = job_attr_dict - # Remove batch jobs already gotten from the list - if not all_jobs: - batchIDs_set.discard(batch_id) - if len(batchIDs_set) == 0 or all_jobs: - break - # Remaining - if not all_jobs and len(batchIDs_set) > 0: - # Job unfound, marked as unknown worker in harvester - for batch_id in batchIDs_set: - full_batch_id = get_full_batch_id(self.submission_host, batch_id) - job_attr_all_dict[full_batch_id] = dict() - tmpLog.info(f"Unfound batch jobs of submissionHost={self.submission_host}: {' '.join(list(batchIDs_set))}") - # Return - return job_attr_all_dict diff --git a/pandaharvester/harvestermonitor/apfgrid_monitor.py b/pandaharvester/harvestermonitor/apfgrid_monitor.py deleted file mode 100644 index 39604ae7..00000000 --- a/pandaharvester/harvestermonitor/apfgrid_monitor.py +++ /dev/null @@ -1,125 +0,0 @@ -import logging -import sys -import threading -import traceback - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.work_spec import WorkSpec -from pandaharvester.harvestersubmitter.apfgrid_submitter import APFGridSubmitter - -try: - from autopyfactory import condorlib -except ImportError: - logging.error(f"Unable to import htcondor/condorlib. 
sys.path={sys.path}") - -# setup base logger -baseLogger = core_utils.setup_logger() - - -class APFGridMonitorSingleton(type): - def __init__(self, *args, **kwargs): - super(APFGridMonitorSingleton, self).__init__(*args, **kwargs) - self.__instance = None - - def __call__(self, *args, **kwargs): - if self.__instance is None: - self.__instance = super(APFGridMonitorSingleton, self).__call__(*args, **kwargs) - return self.__instance - - -class APFGridMonitor(object): - """ - 1 WorkSpec.ST_submitted = 'submitted' - 2 WorkSpec.ST_running = 'running' - 4 WorkSpec.ST_finished = 'finished' - 5 WorkSpec.ST_failed = 'failed' - 6 WorkSpec.ST_ready = 'ready' - 3 WorkSpec.ST_cancelled = 'cancelled ' - - CONDOR_JOBSTATUS - 1 Idle I - 2 Running R - 3 Removed X - 4 Completed C - 5 Held H - 6 Submission_err E - """ - - __metaclass__ = APFGridMonitorSingleton - STATUS_MAP = { - 1: WorkSpec.ST_submitted, - 2: WorkSpec.ST_running, - 3: WorkSpec.ST_cancelled, - 4: WorkSpec.ST_finished, - 5: WorkSpec.ST_failed, - 6: WorkSpec.ST_ready, - } - - JOBQUERYATTRIBUTES = [ - "match_apf_queue", - "jobstatus", - "workerid", - "apf_queue", - "apf_logurl", - "apf_outurl", - "apf_errurl", - ] - - def __init__(self, **kwarg): - self.log = core_utils.make_logger(baseLogger) - self.jobinfo = None - self.historyinfo = None - self.log.debug("APFGridMonitor initialized.") - - def _updateJobInfo(self): - self.log.debug("Getting job info from Condor...") - out = condorlib.condor_q(APFGridMonitor.JOBQUERYATTRIBUTES) - self.log.debug(f"Got jobinfo {out}") - self.jobinfo = out - out = condorlib.condor_history(attributes=APFGridMonitor.JOBQUERYATTRIBUTES, constraints=[]) - self.log.debug(f"Got history info {out}") - self.historyinfo = out - alljobs = self.jobinfo + self.historyinfo - for jobad in alljobs: - try: - workerid = jobad["workerid"] - self.allbyworkerid[workerid] = jobad - except KeyError: - # some non-harvester jobs may not have workerids, ignore them - pass - self.log.debug(f"All jobs indexed by worker_id. {len(self.allbyworkerid)} entries.") - - # check workers - def check_workers(self, workspec_list): - """Check status of workers. This method takes a list of WorkSpecs as input argument - and returns a list of worker's statuses. - Nth element if the return list corresponds to the status of Nth WorkSpec in the given list. Worker's - status is one of WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_running, - WorkSpec.ST_submitted. - - :param workspec_list: a list of work specs instances - :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses. 
- :rtype: (bool, [string,]) - - """ - self.jobinfo = [] - self.historyinfo = [] - self.allbyworkerid = {} - self._updateJobInfo() - - retlist = [] - for workSpec in workspec_list: - self.log.debug( - f"Worker(workerId={workSpec.workerID} queueName={workSpec.queueName} computingSite={workSpec.computingSite} status={workSpec.status} )" - ) - try: - jobad = self.allbyworkerid[workSpec.workerID] - self.log.debug(f"Found matching job: ID {jobad['workerid']}") - jobstatus = int(jobad["jobstatus"]) - retlist.append((APFGridMonitor.STATUS_MAP[jobstatus], "")) - except KeyError: - self.log.error(f"No corresponding job for workspec {workSpec}") - retlist.append((WorkSpec.ST_cancelled, "")) - self.log.debug(f"retlist={retlist}") - return True, retlist diff --git a/pandaharvester/harvestermonitor/arc_monitor.py b/pandaharvester/harvestermonitor/arc_monitor.py deleted file mode 100644 index f3b60ac0..00000000 --- a/pandaharvester/harvestermonitor/arc_monitor.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -import re -try: - import subprocess32 as subprocess -except: - import subprocess -import arc - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.work_spec import WorkSpec -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestermisc import arc_utils - -# logger -baselogger = core_utils.setup_logger() - - -class ARCMonitor(PluginBase): - '''Monitor for ARC CE plugin''' - - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - - # Credential dictionary role: proxy file - self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)], - list(harvester_config.credmanager.outCertFile))) - self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials) - - - # check workers - def check_workers(self, workspec_list): - retList = [] - for workspec in workspec_list: - - # make logger - arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) - tmplog = arclog.log - tmplog.info("checking worker id {0}".format(workspec.workerID)) - (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec) - - # Set certificate - userconfig = arc.UserConfig(self.cred_type) - try: - userconfig.ProxyPath(str(self.certs[proxyrole])) - except: - tmplog.error("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole)) - retList.append((workspec.status, '')) - continue - - job_supervisor = arc.JobSupervisor(userconfig, [job]) - job_supervisor.Update() - - jobsupdated = job_supervisor.GetAllJobs() - jobsnotupdated = job_supervisor.GetIDsNotProcessed() - - for updatedjob in jobsupdated: - if updatedjob.JobID in jobsnotupdated: - tmplog.error("Failed to find information on {0}".format(updatedjob.JobID)) - # If missing for too long (2 days), mark as lost - if arc.Time() - modtime > arc.Period(172800): - tmplog.error("Job {0} missing for more than 2 days, marking as lost".format(updatedjob.JobID)) - retList.append((workspec.ST_failed, '')) - else: - retList.append((workspec.status, '')) - continue - - # Convert arc state to WorkSpec state - arcstatus = updatedjob.State - newstatus = WorkSpec.ST_submitted - if arcstatus == arc.JobState.RUNNING or \ - arcstatus == arc.JobState.FINISHING: - newstatus = WorkSpec.ST_running - elif arcstatus == arc.JobState.FINISHED: - if updatedjob.ExitCode == -1: - # Missing exit code, but assume success - tmplog.warning("Job {0} FINISHED but has missing exit code, setting to 
zero".format(updatedjob.JobID)) - updatedjob.ExitCode = 0 - newstatus = WorkSpec.ST_finished - elif arcstatus == arc.JobState.FAILED: - newstatus = WorkSpec.ST_failed - tmplog.info("Job {0} failed: {1}".format(updatedjob.JobID, ";".join([joberr for joberr in updatedjob.Error]))) - elif arcstatus == arc.JobState.KILLED: - newstatus = WorkSpec.ST_cancelled - elif arcstatus == arc.JobState.DELETED or \ - arcstatus == arc.JobState.OTHER: - # unexpected - newstatus = WorkSpec.ST_failed - # Not covered: arc.JobState.HOLD. Maybe need a post-run state in - # harvester, also to cover FINISHING - - # compare strings here to get around limitations of JobState API - if job.State.GetGeneralState() == updatedjob.State.GetGeneralState(): - tmplog.debug("Job {0} still in state {1}".format(job.JobID, job.State.GetGeneralState())) - retList.append((newstatus, '')) - continue - - tmplog.info("Job {0}: {1} -> {2} ({3})".format(job.JobID, job.State.GetGeneralState(), - updatedjob.State.GetGeneralState(), - updatedjob.State.GetSpecificState())) - - arc_utils.arcjob2workspec(updatedjob, workspec) - # Have to force update to change info in DB - workspec.force_update('workAttributes') - tmplog.debug("batchStatus {0} -> workerStatus {1}".format(arcstatus.GetGeneralState(), newstatus)) - retList.append((newstatus, '')) - - return True, retList - -def test(jobid): - '''Test checking status''' - from pandaharvester.harvestercore.work_spec import WorkSpec - wspec = WorkSpec() - wspec.batchID = jobid #"gsiftp://pikolit.ijs.si:2811/jobs/HtgKDmtCe7qn4J8tmqCBXHLnABFKDmABFKDmBcGKDmABFKDm4NCTCn" - workAttributes = {"arcjob": {}} - workAttributes["arcjob"]["JobID"] = wspec.batchID - workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(urlparse.urlparse(jobid).netloc, jobid) - workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng" - jobmanagementurl = arc.URL(wspec.batchID) - jobmanagementurl.ChangePath("/jobs") - workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str() - workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob" - - wspec.workAttributes = workAttributes - print wspec.workAttributes - - monitor = ARCMonitor() - print monitor.check_workers([wspec]) - -if __name__ == "__main__": - import time, sys, urlparse - if len(sys.argv) != 2: - print "Please give ARC job id" - sys.exit(1) - while True: - test(sys.argv[1]) - time.sleep(2) diff --git a/pandaharvester/harvestermonitor/cloud_google_monitor.py b/pandaharvester/harvestermonitor/cloud_google_monitor.py deleted file mode 100644 index 72bd7d4d..00000000 --- a/pandaharvester/harvestermonitor/cloud_google_monitor.py +++ /dev/null @@ -1,121 +0,0 @@ -from pandaharvester.harvestercloud.googlecloud import PROJECT, ZONE, compute -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestercore.work_spec import WorkSpec - -base_logger = core_utils.setup_logger("google_monitor") - - -class GoogleMonitor(PluginBase): - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - self.queue_config_mapper = QueueConfigMapper() - - # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status - self.vm_to_worker_status = { - "RUNNING": WorkSpec.ST_running, - "TERMINATED": WorkSpec.ST_running, # the VM is stopped, but has to be fully deleted - 
"STOPPING": WorkSpec.ST_finished, - "PROVISIONING": WorkSpec.ST_submitted, - "STAGING": WorkSpec.ST_submitted, - } - - def list_vms(self, zone): - """ - List the status of the running VMs - :return: - """ - - try: - result = compute.instances().list(project=PROJECT, zone=zone).execute() - - try: - vm_instances = result["items"] - except KeyError: - # there are no VMs running - return [], {} - - # make a list with the VM names - vm_names = map(lambda vm_instance: vm_instance["name"], vm_instances) - - # make a dictionary so we can retrieve a VM by its name - vm_name_to_status = {} - for vm_instance in vm_instances: - vm_name_to_status[vm_instance["name"]] = vm_instance["status"] - - return vm_names, vm_name_to_status - - except BaseException: - return None, None - - def kill_worker(self, vm_name, zone): - """ - Sends the command to Google to destroy a VM - """ - - try: - base_logger.debug(f"Going to kill VM {vm_name}") - compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute() - base_logger.debug(f"Killed VM {vm_name}") - except Exception as e: - base_logger.error(f"Problems killing the VM: {e}") - - def check_workers(self, workers): - """ - This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses. - Nth element in the return list corresponds to the status of Nth WorkSpec in the given list. - - :param worker_list: a list of work specs instances - :return: A tuple containing the return code (True for success, False otherwise) and a list of worker's statuses - :rtype: (bool, [string,]) - """ - - if not workers: - return False, "Empty workers list received" - - # it assumes that all workers belong to the same queue, which is currently the case - # we assume all work_specs in the list belong to the same queue - queue_config = self.queue_config_mapper.get_queue(workers[0].computingSite) - try: - zone = queue_config.zone - except AttributeError: - zone = ZONE - - # running instances - vm_names, vm_name_to_status = self.list_vms(zone) - if vm_names is None and vm_name_to_status is None: - error_string = "Could not list the VMs" - base_logger.error(error_string) - return False, error_string - - # extract the list of batch IDs - batch_IDs = map(lambda x: str(x.batchID), workers) - base_logger.debug(f"Batch IDs: {batch_IDs}") - - ret_list = [] - for batch_ID in batch_IDs: - tmp_log = self.make_logger(base_logger, f"batch ID={batch_ID}", method_name="check_workers") - - if batch_ID not in vm_names: - new_status = WorkSpec.ST_finished - message = "VM not found" - else: - try: - new_status = self.vm_to_worker_status[vm_name_to_status[batch_ID]] - message = "VM status returned by GCE API" - - # Preemptible VMs: GCE terminates a VM, but a stopped VM with its disk is left and needs to be - # explicitly deleted - if vm_name_to_status[batch_ID] == "TERMINATED": - self.kill_worker(batch_ID, zone) - - except KeyError: - new_status = WorkSpec.ST_missed - message = f"Unknown status to Harvester: {vm_name_to_status[batch_ID]}" - - tmp_log.debug(f"new_status={new_status}") - ret_list.append((new_status, message)) - - base_logger.debug(f"ret_list: {ret_list}") - return True, ret_list diff --git a/pandaharvester/harvestermonitor/cloud_openstack_monitor.py b/pandaharvester/harvestermonitor/cloud_openstack_monitor.py deleted file mode 100644 index 8c6688a9..00000000 --- a/pandaharvester/harvestermonitor/cloud_openstack_monitor.py +++ /dev/null @@ -1,108 +0,0 @@ -import os -from concurrent.futures import ThreadPoolExecutor - -from 
pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.work_spec import WorkSpec -from pandaharvester.harvestermisc.cloud_openstack_utils import OS_SimpleClient - -# setup base logger -baseLogger = core_utils.setup_logger("cloud_openstack_monitor") - - -# status map -# FIXME -vm_worker_status_map_dict = { - "ACTIVE": WorkSpec.ST_running, - "BUILD": WorkSpec.ST_submitted, - "DELETED": WorkSpec.ST_finished, - "ERROR": WorkSpec.ST_failed, - "HARD_REBOOT": WorkSpec.ST_pending, - "MIGRATING": WorkSpec.ST_pending, - "PASSWORD": WorkSpec.ST_pending, - "PAUSED": WorkSpec.ST_pending, - "REBOOT": WorkSpec.ST_pending, - "REBUILD": WorkSpec.ST_pending, - "RESCUE": WorkSpec.ST_pending, - "RESIZE": WorkSpec.ST_pending, - "REVERT_RESIZE": WorkSpec.ST_pending, - "SHELVED": WorkSpec.ST_pending, - "SHELVED_OFFLOADED": WorkSpec.ST_pending, - "SHUTOFF": WorkSpec.ST_cancelled, - "SOFT_DELETED": WorkSpec.ST_pending, - "SUSPENDED": WorkSpec.ST_pending, - "UNKNOWN": WorkSpec.ST_failed, - "VERIFY_RESIZE": WorkSpec.ST_pending, -} - - -# whether to kill the vm -def _toKillVM(*some_info): - retVal = False - # FIXME - # information should come from harvester messenger or else - return retVal - - -# Cloud Openstack monitor -class CloudOpenstackMonitor(PluginBase): - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - self.nProcesses = 4 - self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - - # kill a vm - - def _kill_a_vm(self, vm_id): - # set logger - tmpLog = self.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="_kill_a_vm") - try: - self.vm_client.nova.delete(vm_id) - except Exception as _e: - errStr = f"Failed to delete a VM with id={vm_id} ; {_e}" - tmpLog.error(errStr) - tmpRetVal = (False, errStr) - else: - tmpLog.info(f"Deleted a VM with id={vm_id}") - tmpRetVal = (True, "") - return tmpRetVal - - # check a vm - - def _check_a_vm(self, workspec): - # set logger - tmpLog = self.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="_check_a_vm") - - # initialization - vm_id = workspec.batchID - newStatus = workspec.status - errStr = "" - - try: - vm_server = self.vm_client.nova.servers.get(vm_id) - vm_status = vm_server.status - except Exception as _e: - errStr = f"Failed to get VM status of id={vm_id} ; {_e}" - tmpLog.error(errStr) - tmpLog.info("Force to cancel the worker due to failure to get VM status") - newStatus = WorkSpec.ST_cancelled - else: - newStatus = vm_worker_status_map_dict.get(vm_status) - tmpLog.info(f"batchID={workspec.batchID}: vm_status {vm_status} -> worker_status {newStatus}") - - if _toKillVM(): # FIXME - self._kill_a_vm(vm_id) - - return (newStatus, errStr) - - # check workers - - def check_workers(self, workspec_list): - # Check for all workers - with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retIterator = thread_pool.map(self._check_a_vm, workspec_list) - - retList = list(retIterator) - - return True, retList diff --git a/pandaharvester/harvestermonitor/lancium_monitor.py b/pandaharvester/harvestermonitor/lancium_monitor.py deleted file mode 100644 index 8cd48035..00000000 --- a/pandaharvester/harvestermonitor/lancium_monitor.py +++ /dev/null @@ -1,262 +0,0 @@ -import time -from concurrent.futures import ThreadPoolExecutor as Pool - -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.pilot_errors 
import PilotErrors -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.work_spec import WorkSpec -from pandaharvester.harvestercore.worker_errors import WorkerErrors -from pandaharvester.harvestermisc.lancium_utils import ( - LanciumJobQuery, - get_full_batch_id_from_workspec, - get_host_batch_id_map, - get_workerid_from_job_name, - timestamp_to_datetime, -) - -# logger -base_logger = core_utils.setup_logger("lancium_monitor") - -# pilot error object -PILOT_ERRORS = PilotErrors() - - -# Check one worker -def _check_one_worker(workspec, job_attr_all_dict, cancel_unknown=False, held_timeout=3600): - # Make logger for one single worker - tmp_log = core_utils.make_logger(base_logger, f"workerID={workspec.workerID}", method_name="_check_one_worker") - # Initialize newStatus - newStatus = workspec.status - errStr = "" - try: - job_attr_dict = job_attr_all_dict[get_full_batch_id_from_workspec(workspec)] - except KeyError: - got_job_attr = False - except Exception as e: - got_job_attr = False - tmp_log.error(f"With error {e}") - else: - got_job_attr = True - # Parse job ads - if got_job_attr: - # Check - try: - # FIXME - new_batch_status = job_attr_dict["status"] - except KeyError: - # Propagate native job status as unknown - workspec.nativeStatus = "unknown" - if cancel_unknown: - newStatus = WorkSpec.ST_cancelled - errStr = f"cannot get job status of submissionHost={workspec.submissionHost} batchID={workspec.batchID}. Regard the worker as canceled" - tmp_log.error(errStr) - else: - newStatus = None - errStr = f"cannot get job status of submissionHost={workspec.submissionHost} batchID={workspec.batchID}. Skipped" - tmp_log.warning(errStr) - else: - # Possible native statuses: "created" "submitted" "queued" "ready" "running" "error" "finished" "delete pending" - last_batch_status = workspec.nativeStatus - batchStatus = new_batch_status - # Set batchStatus if last_batch_status is terminated status - if (last_batch_status in ["error", "finished", "delete pending"] and new_batch_status not in ["error", "finished", "delete pending"]) or ( - last_batch_status in ["error", "finished"] and new_batch_status in ["delete pending"] - ): - batchStatus = last_batch_status - tmp_log.warning( - "refer to last_batch_status={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (new_batch_status={3})".format( - last_batch_status, workspec.submissionHost, workspec.batchID, new_batch_status - ) - ) - # Propagate native job status - workspec.nativeStatus = batchStatus - if batchStatus in ["running"]: - # running - newStatus = WorkSpec.ST_running - elif batchStatus in ["created", "submitted", "queued", "ready"]: - # pre-running - newStatus = WorkSpec.ST_submitted - elif batchStatus in ["error"]: - # failed - errStr += f"job error_string: {job_attr_dict.get('error_string')} " - newStatus = WorkSpec.ST_failed - elif batchStatus in ["delete pending"]: - # cancelled - errStr = f"job error_string: {job_attr_dict.get('error_string')} " - newStatus = WorkSpec.ST_cancelled - # Mark the PanDA job as closed instead of failed - workspec.set_pilot_closed() - tmp_log.debug("Called workspec set_pilot_closed") - elif batchStatus in ["finished"]: - # finished - # try: - # payloadExitCode_str = str(job_attr_dict['exit_code']) - # payloadExitCode = int(payloadExitCode_str) - # except KeyError: - # errStr = 'cannot get exit_code of submissionHost={0} batchID={1}. 
Regard the worker as failed'.format(workspec.submissionHost, workspec.batchID) - # tmp_log.warning(errStr) - # newStatus = WorkSpec.ST_failed - # except ValueError: - # errStr = 'got invalid exit_code {0} of submissionHost={1} batchID={2}. Regard the worker as failed'.format(payloadExitCode_str, workspec.submissionHost, workspec.batchID) - # tmp_log.warning(errStr) - # newStatus = WorkSpec.ST_failed - # else: - # # Propagate exit_code code - # workspec.nativeExitCode = payloadExitCode - # if payloadExitCode == 0: - # # Payload should return 0 after successful run - # newStatus = WorkSpec.ST_finished - # else: - # # Other return codes are considered failed - # newStatus = WorkSpec.ST_failed - # errStr = 'Payload execution error: returned non-zero {0}'.format(payloadExitCode) - # tmp_log.debug(errStr) - # # Map return code to Pilot error code - # reduced_exit_code = payloadExitCode // 256 if (payloadExitCode % 256 == 0) else payloadExitCode - # pilot_error_code, pilot_error_diag = PILOT_ERRORS.convertToPilotErrors(reduced_exit_code) - # if pilot_error_code is not None: - # workspec.set_pilot_error(pilot_error_code, pilot_error_diag) - # tmp_log.info('Payload return code = {0}'.format(payloadExitCode)) - # - # finished - newStatus = WorkSpec.ST_finished - try: - payloadExitCode_str = str(job_attr_dict["exit_code"]) - payloadExitCode = int(payloadExitCode_str) - except KeyError: - errStr = f"cannot get exit_code of submissionHost={workspec.submissionHost} batchID={workspec.batchID}" - tmp_log.warning(errStr) - except ValueError: - errStr = f"got invalid exit_code {payloadExitCode_str} of submissionHost={workspec.submissionHost} batchID={workspec.batchID}" - tmp_log.warning(errStr) - else: - # Propagate exit_code code - workspec.nativeExitCode = payloadExitCode - tmp_log.info(f"Payload return code = {payloadExitCode}") - else: - errStr = "cannot get reasonable job status of submissionHost={0} batchID={1}. Regard the worker as failed by default".format( - workspec.submissionHost, workspec.batchID - ) - tmp_log.error(errStr) - newStatus = WorkSpec.ST_failed - tmp_log.info(f"submissionHost={workspec.submissionHost} batchID={workspec.batchID} : batchStatus {batchStatus} -> workerStatus {newStatus}") - else: - # Propagate native job status as unknown - workspec.nativeStatus = "unknown" - if cancel_unknown: - errStr = f"job submissionHost={workspec.submissionHost} batchID={workspec.batchID} not found. Regard the worker as canceled by default" - tmp_log.error(errStr) - newStatus = WorkSpec.ST_cancelled - tmp_log.info(f"submissionHost={workspec.submissionHost} batchID={workspec.batchID} : batchStatus 3 -> workerStatus {newStatus}") - else: - errStr = f"job submissionHost={workspec.submissionHost} batchID={workspec.batchID} not found. 
Skipped" - tmp_log.warning(errStr) - newStatus = None - # Set supplemental error message - error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") if errStr else WorkerErrors.error_codes.get("SUCCEEDED") - workspec.set_supplemental_error(error_code=error_code, error_diag=errStr) - # Return - return (newStatus, errStr) - - -# monitor for Lancium -class LanciumMonitor(PluginBase): - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - try: - self.nProcesses - except AttributeError: - self.nProcesses = 4 - try: - self.cancelUnknown - except AttributeError: - self.cancelUnknown = False - else: - self.cancelUnknown = bool(self.cancelUnknown) - try: - self.heldTimeout - except AttributeError: - self.heldTimeout = 3600 - try: - self.cacheEnable = harvester_config.monitor.pluginCacheEnable - except AttributeError: - self.cacheEnable = False - try: - self.cacheRefreshInterval = harvester_config.monitor.pluginCacheRefreshInterval - except AttributeError: - self.cacheRefreshInterval = harvester_config.monitor.checkInterval - try: - self.submissionHost_list - except AttributeError: - self.submissionHost_list = [] - - # check workers - def check_workers(self, workspec_list): - # Make logger for batch job query - tmp_log = self.make_logger(base_logger, "batch job query", method_name="check_workers") - tmp_log.debug("start") - # Loop over submissionHost - job_attr_all_dict = {} - for submissionHost, batchIDs_list in get_host_batch_id_map(workspec_list).items(): - # Record batch job query result to this dict, with key = batchID - try: - job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, id=submissionHost) - host_job_attr_dict = job_query.query_jobs(batchIDs_list=batchIDs_list) - except Exception as e: - host_job_attr_dict = {} - ret_err_str = f"Exception {e.__class__.__name__}: {e}" - tmp_log.error(ret_err_str) - job_attr_all_dict.update(host_job_attr_dict) - # Check for all workers - with Pool(self.nProcesses) as _pool: - retIterator = _pool.map( - lambda _x: _check_one_worker(_x, job_attr_all_dict, cancel_unknown=self.cancelUnknown, held_timeout=self.heldTimeout), workspec_list - ) - retList = list(retIterator) - tmp_log.debug("done") - return True, retList - - # report updated workers info to monitor to check - def report_updated_workers(self, time_window): - # Make logger for batch job query - tmp_log = self.make_logger(base_logger, method_name="report_updated_workers") - tmp_log.debug("start") - # Get now timestamp - timeNow = time.time() - # Set of submission hosts - submission_host_set = set() - for submissionHost in self.submissionHost_list: - submission_host_set.add(submissionHost) - # Loop over submissionHost and get all jobs - job_attr_all_dict = {} - for submissionHost in submission_host_set: - try: - job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, id=submissionHost) - job_attr_all_dict.update(job_query.query_jobs(all_jobs=True)) - tmp_log.debug(f"got information of jobs on {submissionHost}") - except Exception as e: - ret_err_str = f"Exception {e.__class__.__name__}: {e}" - tmp_log.error(ret_err_str) - # Choose workers updated within a time window - workers_to_check_list = [] - for full_batch_id, job_attr_dict in job_attr_all_dict.items(): - # put in worker cache fifo, with lock mechanism - job_update_at_str = job_attr_dict.get("updated_at") - try: - job_update_at = timestamp_to_datetime(job_update_at_str) - except Exception as e: - ret_err_str = 
f"Exception {e.__class__.__name__}: {e}" - tmp_log.error(ret_err_str) - job_update_at = None - if job_update_at is not None and not (job_update_at > timeNow - time_window): - continue - job_name = job_attr_dict.get("name") - harvester_id, worker_id = get_workerid_from_job_name(job_name) - if worker_id is None or harvester_id != harvester_config.master.harvester_id: - continue - workers_to_check_list.append((worker_id, job_update_at)) - tmp_log.debug(f"got {len(workers_to_check_list)} workers") - tmp_log.debug("done") - return workers_to_check_list diff --git a/pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py b/pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py deleted file mode 100755 index 7cbc2de6..00000000 --- a/pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py +++ /dev/null @@ -1,705 +0,0 @@ -#!/usr/bin/env python -import json -import logging -import os -import shutil -import sys -import tarfile -import time -from collections import defaultdict -from datetime import datetime -from glob import glob -from socket import gethostname -from subprocess import call - -from mpi4py import MPI - -from pilot.jobdescription import JobDescription # temporary hack -from pilot.util.filehandling import get_json_dictionary as read_json - -# from pilot.util.filehandling import read_json -# from pilot.control.payload import parse_jobreport_data # failed with third party import "import _ssl" - -# TODO Safe local copy, with proper exit on failure - -comm = MPI.COMM_WORLD -rank = comm.Get_rank() -max_rank = comm.Get_size() - -logger = logging.getLogger('Rank {0}'.format(rank)) -logger.setLevel(logging.DEBUG) -debug_h = logging.StreamHandler(stream=sys.stdout) -formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s') -debug_h.setFormatter(formatter) -debug_h.setLevel(logging.DEBUG) -error_h = logging.StreamHandler(stream=sys.stderr) -error_h.setFormatter(formatter) -error_h.setLevel(logging.ERROR) -logger.addHandler(error_h) -logger.addHandler(debug_h) - -logger.info('HPC Pilot ver. 
0.007') - - -# TODO: loglevel as input parameter - -def parse_jobreport_data(job_report): - work_attributes = {} - if job_report is None or not any(job_report): - return work_attributes - - # these are default values for job metrics - core_count = 16 - work_attributes["nEvents"] = 0 - work_attributes["dbTime"] = "" - work_attributes["dbData"] = "" - - class DictQuery(dict): - def get(self, path, dst_dict, dst_key): - keys = path.split("/") - if len(keys) == 0: - return - last_key = keys.pop() - v = self - for key in keys: - if key in v and isinstance(v[key], dict): - v = v[key] - else: - return - if last_key in v: - dst_dict[dst_key] = v[last_key] - - if 'ATHENA_PROC_NUMBER' in os.environ: - work_attributes['core_count'] = os.environ['ATHENA_PROC_NUMBER'] - core_count = int(os.environ['ATHENA_PROC_NUMBER']) - - dq = DictQuery(job_report) - dq.get("resource/transform/processedEvents", work_attributes, "nEvents") - dq.get("resource/transform/cpuTimeTotal", work_attributes, "cpuConsumptionTime") - dq.get("resource/machine/node", work_attributes, "node") - dq.get("resource/machine/model_name", work_attributes, "cpuConsumptionUnit") - dq.get("resource/dbTimeTotal", work_attributes, "dbTime") - dq.get("resource/dbDataTotal", work_attributes, "dbData") - dq.get("exitCode", work_attributes, "transExitCode") - dq.get("exitMsg", work_attributes, "exeErrorDiag") - dq.get("files/output", work_attributes, "outputfiles") - - outputfiles_dict = {} - if 'outputfiles' in work_attributes.keys(): - for ofs in work_attributes['outputfiles']: - for of in ofs['subFiles']: - outputfiles_dict[of['name']] = {'guid': of['file_guid'], - 'nentries': of['nentries'], - 'size': of['file_size']} - work_attributes['outputfiles'] = outputfiles_dict - - if 'resource' in job_report and 'executor' in job_report['resource']: - j = job_report['resource']['executor'] - exc_report = [] - fin_report = defaultdict(int) - for v in filter(lambda d: 'memory' in d and ('Max' or 'Avg' in d['memory']), j.itervalues()): - if 'Avg' in v['memory']: - exc_report.extend(v['memory']['Avg'].items()) - if 'Max' in v['memory']: - exc_report.extend(v['memory']['Max'].items()) - for x in exc_report: - fin_report[x[0]] += x[1] - work_attributes.update(fin_report) - - if 'files' in job_report and 'input' in job_report['files']: - nInputFiles = 0 - for input_file in job_report['files']['input']: - if 'subfiles' in input_file: - nInputFiles += len(job_report['files']['input']['subfiles']) - work_attributes['nInputFiles'] = nInputFiles - - # workdir_size = get_workdir_size() - work_attributes['jobMetrics'] = 'coreCount=%s nEvents=%s dbTime=%s dbData=%s' % \ - (core_count, - work_attributes["nEvents"], - work_attributes["dbTime"], - work_attributes["dbData"]) - del (work_attributes["dbData"]) - del (work_attributes["dbTime"]) - - return work_attributes - - -def get_setup(job): - # special setup preparation. 
- - setup_commands = ['source /ccs/proj/csc108/athena_grid_env/setup.sh', - 'source $MODULESHOME/init/bash', - 'tmp_dirname=/tmp/scratch', - 'tmp_dirname+="/tmp"', - 'export TEMP=$tmp_dirname', - 'export TMPDIR=$TEMP', - 'export TMP=$TEMP', - 'export LD_LIBRARY_PATH=/ccs/proj/csc108/AtlasReleases/ldpatch:$LD_LIBRARY_PATH', - 'export ATHENA_PROC_NUMBER=16', - 'export G4ATLAS_SKIPFILEPEEK=1', - 'export PANDA_RESOURCE=\"ORNL_Titan_MCORE\"', - 'export ROOT_TTREECACHE_SIZE=1', - 'export RUCIO_APPID=\"simul\"', - 'export RUCIO_ACCOUNT=\"pilot\"', - 'export CORAL_DBLOOKUP_PATH=/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files', - 'export CORAL_AUTH_PATH=$SW_INSTALL_AREA/DBRelease/current/XMLConfig', - 'export DATAPATH=$SW_INSTALL_AREA/DBRelease/current:$DATAPATH', - ' '] - - return setup_commands - - -def timestamp(): - """ return ISO-8601 compliant date/time format. Should be migrated to Pilot 2""" - tmptz = time.timezone - sign_str = '+' - if tmptz > 0: - sign_str = '-' - tmptz_hours = int(tmptz / 3600) - - return str("%s%s%02d:%02d" % (time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()), sign_str, abs(tmptz_hours), - int(tmptz / 60 - tmptz_hours * 60))) - - -def main_exit(exit_code, work_report=None, workerAttributesFile="worker_attributes.json"): - if work_report: - publish_work_report(work_report, workerAttributesFile) - sys.exit(exit_code) - - -def publish_work_report(work_report=None, workerAttributesFile="worker_attributes.json"): - """Publishing of work report to file""" - if work_report: - if work_report.has_key("outputfiles"): - del (work_report["outputfiles"]) - with open(workerAttributesFile, 'w') as outputfile: - work_report['timestamp'] = timestamp() - json.dump(work_report, outputfile) - logger.debug("Work report published: {0}".format(work_report)) - return 0 - - -def main(): - workerAttributesFile = "worker_attributes.json" - StageOutnFile = "event_status.dump.json" - payload_report_file = 'jobReport.json' - - start_g = time.time() - start_g_str = time.asctime(time.localtime(start_g)) - hostname = gethostname() - logger.info("Pilot statrted at {0} on {1}".format(start_g_str, hostname)) - starting_point = os.getcwd() - scratch_path = '/tmp/scratch/' - cleanup_pathes() - - work_report = {} - work_report["jobStatus"] = "starting" - work_report["messageLevel"] = logging.getLevelName(logger.getEffectiveLevel()) - work_report['cpuConversionFactor'] = 1.0 - work_report['node'] = hostname - - # Get a file name with job descriptions - if len(sys.argv) > 1: - input_file = sys.argv[1] - else: - input_file = 'worker_pandaids.json' - try: - in_file = open(input_file) - panda_ids = json.load(in_file) - in_file.close() - except IOError as (errno, strerror): - logger.critical("I/O error({0}): {1}".format(errno, strerror)) - logger.critical("Exit from rank") - main_exit(errno) - - logger.debug("Collected list of jobs") - # PandaID of the job for the command - try: - job_id = panda_ids[rank] - except ValueError: - logger.critical("Pilot have no job for rank {0}".format(rank)) - logger.critical("Exit pilot") - main_exit(1) - - logger.debug("Job [{0}] will be processed".format(job_id)) - os.chdir(str(job_id)) - worker_communication_point = os.getcwd() - - work_report['workdir'] = worker_communication_point - workerAttributesFile = os.path.join(worker_communication_point, workerAttributesFile) - trans_job_workdir = os.path.join(scratch_path, str(job_id)) - - jobs_dict = read_json("HPCJobs.json") - job_dict = jobs_dict[str(job_id)] - - job = JobDescription() - job.load(job_dict) - # add path to 
input files in RAM - for inp_file in job.input_files: - job.input_files[inp_file]["scratch_path"] = os.path.join(trans_job_workdir, inp_file) - - job.startTime = "" - job.endTime = "" - setup_str = "; ".join(get_setup(job)) - - job_working_dir = titan_prepare_wd(scratch_path, trans_job_workdir, worker_communication_point, job, - workerAttributesFile) - - my_command = " ".join([job.script, job.script_parameters]) - my_command = titan_command_fix(my_command, job_working_dir) - my_command = setup_str + my_command - logger.debug("Going to launch: {0}".format(my_command)) - logger.debug("Current work directory: {0}".format(job_working_dir)) - payloadstdout = open("athena_stdout.txt", "w") - payloadstderr = open("athena_stderr.txt", "w") - - job.state = 'running' - work_report["jobStatus"] = job.state - start_time = time.asctime(time.localtime(time.time())) - job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - publish_work_report(work_report, workerAttributesFile) - stime = time.time() - t0 = os.times() - exit_code = call(my_command, stdout=payloadstdout, stderr=payloadstderr, shell=True) - t1 = os.times() - exetime = time.time() - stime - end_time = time.asctime(time.localtime(time.time())) - t = map(lambda x, y: x - y, t1, t0) - t_tot = reduce(lambda x, y: x + y, t[2:3]) - job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - payloadstdout.close() - payloadstderr.close() - if exit_code == 0: - job.state = 'finished' - else: - job.state = 'failed' - job.exitcode = exit_code - - work_report["endTime"] = job.endTime - work_report["jobStatus"] = job.state - work_report["cpuConsumptionTime"] = t_tot - work_report["transExitCode"] = job.exitcode - - logger.info("Payload exit code: {0} JobID: {1}".format(exit_code, job_id)) - logger.info("CPU comsumption time: {0} JobID: {1}".format(t_tot, job_id)) - logger.info("Start time: {0} JobID: {1}".format(start_time, job_id)) - logger.info("End time: {0} JobID: {1}".format(end_time, job_id)) - logger.info("Execution time: {0} sec. 
JobID: {1}".format(exetime, job_id)) - logger.debug("Job report start time: {0}".format(job.startTime)) - logger.debug("Job report end time: {0}".format(job.endTime)) - - if os.path.exists(payload_report_file): - payload_report = parse_jobreport_data(read_json(payload_report_file)) - work_report.update(payload_report) - copy_jobreport(job_working_dir, worker_communication_point, payload_report_file, workerAttributesFile) - - titan_postprocess_wd(job_working_dir) - - # log file not produced (yet) - protectedfiles = job.output_files.keys() - if job.log_file in protectedfiles: - protectedfiles.remove(job.log_file) - else: - logger.info("Log files was not declared") - - cleanup_strat = time.time() - logger.info("Cleanup of working directory") - protectedfiles.extend([workerAttributesFile, StageOutnFile]) - removeRedundantFiles(job_working_dir, protectedfiles) - cleanup_time = time.time() - cleanup_strat - logger.info("Cleanup took: {0} sec.".format(cleanup_time)) - res = packlogs(job_working_dir, protectedfiles, job.log_file) - if res > 0: - job.state = 'failed' - work_report['pilotErrorCode'] = 1164 # Let's take this as closed one - work_report['jobStatus'] = job.state - main_exit(0, work_report, workerAttributesFile) - - # Copy of output to shared FS for stageout - if not job_working_dir == worker_communication_point: - cp_start = time.time() - for outfile in job.output_files.keys(): - if os.path.exists(outfile): - shutil.copyfile(os.path.join(job_working_dir, outfile), - os.path.join(worker_communication_point, outfile)) - os.chdir(worker_communication_point) - cp_time = time.time() - cp_start - logger.info("Copy of outputs took: {0} sec.".format(cp_time)) - - logger.info("Declare stage-out") - out_file_report = {} - out_file_report[job.job_id] = [] - - for outfile in job.output_files.keys(): - logger.debug("File {} will be checked and declared for stage out".format(outfile)) - if os.path.exists(outfile): - file_desc = {} - if outfile == job.log_file: - file_desc['type'] = 'log' - else: - file_desc['type'] = 'output' - file_desc['path'] = os.path.abspath(outfile) - file_desc['fsize'] = os.path.getsize(outfile) - if 'guid' in job.output_files[outfile].keys(): - file_desc['guid'] = job.output_files[outfile]['guid'] - elif work_report['outputfiles'] and work_report['outputfiles'][outfile]: - file_desc['guid'] = work_report['outputfiles'][outfile]['guid'] - out_file_report[job.job_id].append(file_desc) - else: - logger.info("Expected output file {0} missed. 
Job {1} will be failed".format(outfile, job.job_id)) - job.state = 'failed' - - if out_file_report[job.job_id]: - with open(StageOutnFile, 'w') as stageoutfile: - json.dump(out_file_report, stageoutfile) - logger.debug('Stagout declared in: {0}'.format(StageOutnFile)) - logger.debug('Report for stageout: {}'.format(out_file_report)) - - logger.info("All done") - logger.debug("Final report: {0}".format(work_report)) - main_exit(0, work_report, workerAttributesFile) - - -def copy_jobreport(job_working_dir, worker_communication_point, payload_report_file, workerattributesfile): - src_file = os.path.join(job_working_dir, payload_report_file) - dst_file = os.path.join(worker_communication_point, payload_report_file) - - try: - logger.info( - "Copy of payload report [{0}] to access point: {1}".format(payload_report_file, worker_communication_point)) - cp_start = time.time() - # shrink jobReport - job_report = read_json(src_file) - if 'executor' in job_report: - for executor in job_report['executor']: - if 'logfileReport' in executor: - executor['logfileReport'] = {} - - with open(dst_file, 'w') as job_report_outfile: - json.dump(job_report, job_report_outfile) - cp_time = time.time() - cp_start - logger.info("Copy of payload report file took: {0} sec.".format(cp_time)) - except: - logger.error("Job report copy failed, execution terminated': \n %s " % (sys.exc_info()[1])) - work_report = dict() - work_report["jobStatus"] = "failed" - work_report["pilotErrorCode"] = 1103 # Should be changed to Pilot2 errors - work_report["exitMsg"] = str(sys.exc_info()[1]) - main_exit(1103, work_report, workerattributesfile) - - -def titan_command_fix(command, job_working_dir): - subs_a = command.split() - for i in range(len(subs_a)): - if i > 0: - if '(' in subs_a[i] and not subs_a[i][0] == '"': - subs_a[i] = '"' + subs_a[i] + '"' - if subs_a[i].startswith("--inputEVNTFile"): - filename = subs_a[i].split("=")[1] - subs_a[i] = subs_a[i].replace(filename, os.path.join(job_working_dir, filename)) - - command = ' '.join(subs_a) - command = command.strip() - command = command.replace('--DBRelease="all:current"', '') # avoid Frontier reading - - return command - - -def titan_prepare_wd(scratch_path, trans_job_workdir, worker_communication_point, job, workerAttributesFile): - # --------- - # Copy Poolcond files to scratch (RAMdisk, ssd, etc) to cope high IO. 
MOve execution to RAM disk - - dst_db_path = 'sqlite200/' - dst_db_filename = 'ALLP200.db' - dst_db_path_2 = 'geomDB/' - dst_db_filename_2 = 'geomDB_sqlite' - tmp_path = 'tmp/' - src_file = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/sqlite200/ALLP200.db' - src_file_2 = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/geomDB/geomDB_sqlite' - copy_start = time.time() - if os.path.exists(scratch_path): - try: - if not os.path.exists(scratch_path + tmp_path): - os.makedirs(scratch_path + tmp_path) - if not os.path.exists(scratch_path + dst_db_path): - os.makedirs(scratch_path + dst_db_path) - shutil.copyfile(src_file, scratch_path + dst_db_path + dst_db_filename) - if not os.path.exists(scratch_path + dst_db_path_2): - os.makedirs(scratch_path + dst_db_path_2) - shutil.copyfile(src_file_2, scratch_path + dst_db_path_2 + dst_db_filename_2) - if not os.path.exists(trans_job_workdir): - os.makedirs(trans_job_workdir) - for inp_file in job.input_files: - shutil.copyfile(os.path.join(worker_communication_point, inp_file), - job.input_files[inp_file]["scratch_path"]) - except IOError as e: - copy_time = time.time() - copy_start - logger.info('Special Titan setup failed after: {0}'.format(copy_time)) - logger.error("Copy to scratch failed, execution terminated': \n %s " % (sys.exc_info()[1])) - work_report = dict() - work_report["jobStatus"] = "failed" - work_report["pilotErrorCode"] = 1103 # Should be changed to Pilot2 errors - work_report["exitMsg"] = str(sys.exc_info()[1]) - main_exit(1103, work_report, workerAttributesFile) - except: - pass - else: - logger.info('Scratch directory (%s) dose not exist' % scratch_path) - return worker_communication_point - - os.chdir(trans_job_workdir) - logger.debug("Current directory: {0}".format(os.getcwd())) - true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files' - pseudo_dir = "./poolcond" - os.symlink(true_dir, pseudo_dir) - copy_time = time.time() - copy_start - logger.info('Special Titan setup took: {0}'.format(copy_time)) - - return trans_job_workdir - - -def titan_postprocess_wd(jobdir): - pseudo_dir = "poolcond" - if os.path.exists(pseudo_dir): - remove(os.path.join(jobdir, pseudo_dir)) - return 0 - - -def removeRedundantFiles(workdir, outputfiles=[]): - """ Remove redundant files and directories. 
Should be migrated to Pilot2 """ - - logger.info("Removing redundant files prior to log creation") - - workdir = os.path.abspath(workdir) - - dir_list = ["AtlasProduction*", - "AtlasPoint1", - "AtlasTier0", - "buildJob*", - "CDRelease*", - "csc*.log", - "DBRelease*", - "EvgenJobOptions", - "external", - "fort.*", - "geant4", - "geomDB", - "geomDB_sqlite", - "home", - "o..pacman..o", - "pacman-*", - "python", - "runAthena*", - "share", - "sources.*", - "sqlite*", - "sw", - "tcf_*", - "triggerDB", - "trusted.caches", - "workdir", - "*.data*", - "*.events", - "*.py", - "*.pyc", - "*.root*", - "JEM", - "tmp*", - "*.tmp", - "*.TMP", - "MC11JobOptions", - "scratch", - "jobState-*-test.pickle", - "*.writing", - "pwg*", - "pwhg*", - "*PROC*", - "madevent", - "HPC", - "objectstore*.json", - "saga", - "radical", - "ckpt*"] - - # remove core and pool.root files from AthenaMP sub directories - try: - cleanupAthenaMP(workdir, outputfiles) - except Exception, e: - print("Failed to execute cleanupAthenaMP(): %s" % (e)) - - # explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command (--dereference option) - matches = [] - import fnmatch - for root, dirnames, filenames in os.walk(workdir): - for filename in fnmatch.filter(filenames, '*.a'): - matches.append(os.path.join(root, filename)) - for root, dirnames, filenames in os.walk(os.path.dirname(workdir)): - for filename in fnmatch.filter(filenames, 'EventService_premerge_*.tar'): - matches.append(os.path.join(root, filename)) - if matches != []: - for f in matches: - remove(f) - # else: - # print("Found no archive files") - - # note: these should be partitial file/dir names, not containing any wildcards - exceptions_list = ["runargs", "runwrapper", "jobReport", "log."] - - to_delete = [] - for _dir in dir_list: - files = glob(os.path.join(workdir, _dir)) - exclude = [] - - if files: - for exc in exceptions_list: - for f in files: - if exc in f: - exclude.append(os.path.abspath(f)) - - _files = [] - for f in files: - if not f in exclude: - _files.append(os.path.abspath(f)) - to_delete += _files - - exclude_files = [] - for of in outputfiles: - exclude_files.append(os.path.join(workdir, of)) - for f in to_delete: - if not f in exclude_files: - remove(f) - - # run a second pass to clean up any broken links - broken = [] - for root, dirs, files in os.walk(workdir): - for filename in files: - path = os.path.join(root, filename) - if os.path.islink(path): - target_path = os.readlink(path) - # Resolve relative symlinks - if not os.path.isabs(target_path): - target_path = os.path.join(os.path.dirname(path), target_path) - if not os.path.exists(target_path): - broken.append(path) - else: - # If it's not a symlink we're not interested. - continue - - if broken: - for p in broken: - remove(p) - - return 0 - - -def cleanupAthenaMP(workdir, outputfiles=[]): - """ Cleanup AthenaMP sud directories prior to log file creation. ATLAS specific """ - - for ampdir in glob('%s/athenaMP-workers-*' % (workdir)): - for (p, d, f) in os.walk(ampdir): - for filename in f: - if 'core' in filename or 'tmp.' in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) - remove(path) - for outfile in outputfiles: - if outfile in filename: - path = os.path.join(p, filename) - path = os.path.abspath(path) - remove(path) - - return 0 - - -def remove(path): - "Common function for removing of file. 
Should migrate to Pilo2" - try: - os.unlink(path) - except OSError as e: - logger.error("Problem with deletion: %s : %s" % (e.errno, e.strerror)) - return -1 - return 0 - - -def packlogs(wkdir, excludedfiles, logfile_name, attempt=0): - # logfile_size = 0 - to_pack = [] - pack_start = time.time() - for path, subdir, files in os.walk(wkdir): - for file in files: - if not file in excludedfiles: - relDir = os.path.relpath(path, wkdir) - file_rel_path = os.path.join(relDir, file) - file_path = os.path.join(path, file) - to_pack.append((file_path, file_rel_path)) - if to_pack: - try: - logfile_name = os.path.join(wkdir, logfile_name) - log_pack = tarfile.open(logfile_name, 'w:gz') - for f in to_pack: - log_pack.add(f[0], arcname=f[1]) - log_pack.close() - # logfile_size = os.path.getsize(logfile_name) - except IOError as e: - if attempt == 0: - safe_delay = 15 - logger.info('I/O error. Will retry in {0} sec.'.format(safe_delay)) - time.sleep(safe_delay) - packlogs(wkdir, excludedfiles, logfile_name, attempt=1) - else: - logger.info("Continues I/O error during packing of logs. Job will be failed") - return 1 - - for f in to_pack: - remove(f[0]) - - del_empty_dirs(wkdir) - pack_time = time.time() - pack_start - logger.debug("Pack of logs took: {0} sec.".format(pack_time)) - return 0 - - -def del_empty_dirs(src_dir): - "Common function for removing of empty directories. Should migrate to Pilo2" - - for dirpath, subdirs, files in os.walk(src_dir, topdown=False): - if dirpath == src_dir: - break - try: - os.rmdir(dirpath) - except OSError as ex: - pass - return 0 - - -def cleanup_pathes(pathprefix="/lustre/"): - """" - Cleanup of PATH, LD_PATH etc from entities, which points to shared file system required to reduce IO from traversing - of python libraries - """ - path = os.environ['PATH'].split(':') - for p in path[:]: - if p.startswith("/lustre/"): - path.remove(p) - ppath = os.environ['PYTHONPATH'].split(':') - for p in ppath[:]: - if p.startswith("/lustre/"): - ppath.remove(p) - ldpath = os.environ['LD_LIBRARY_PATH'].split(':') - for p in ldpath[:]: - if p.startswith("/lustre/"): - ldpath.remove(p) - - os.environ['PATH'] = ':'.join(path) - os.putenv('PATH', ':'.join(path)) - os.environ['PYTHONPATH'] = ':'.join(ppath) - os.putenv('PYTHONPATH', ':'.join(ppath)) - os.environ['LD_LIBRARY_PATH'] = ':'.join(ldpath) - os.putenv('LD_LIBRARY_PATH', ':'.join(ldpath)) - - return 0 - - -if __name__ == "__main__": - main() diff --git a/pandaharvester/harvesterpayload/simple_wrapper_mpi.py b/pandaharvester/harvesterpayload/simple_wrapper_mpi.py deleted file mode 100644 index eba9a55c..00000000 --- a/pandaharvester/harvesterpayload/simple_wrapper_mpi.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python -import sys -import os -import time -import json -from socket import gethostname -from subprocess import call -import logging -from mpi4py import MPI -from jobdescription import JobDescription - -comm = MPI.COMM_WORLD -rank = comm.Get_rank() -max_rank = comm.Get_size() - -logger = logging.getLogger('Rank {0}'.format(rank)) -logger.setLevel(logging.DEBUG) -debug_h = logging.StreamHandler(stream=sys.stdout) -formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s') -debug_h.setFormatter(formatter) -debug_h.setLevel(logging.DEBUG) -error_h = logging.StreamHandler(stream=sys.stderr) -error_h.setFormatter(formatter) -error_h.setLevel(logging.ERROR) -logger.addHandler(error_h) -logger.addHandler(debug_h) - -# TODO: -# Input file processing -# - read file attributes from 
PoolFileCatalog_H.xml -# - check validity of file by checksum -# Get payload run command -# - setup string -# - trf full commandline -# Extract results of execution -# - collect exit code, message -# - extract data from jobreport - - -def main(): - - workerAttributesFile = "worker_attributes.json" - eventStatusDumpJsonFile = "event_status.dump.json" - - start_g = time.time() - start_g_str = time.asctime(time.localtime(start_g)) - hostname = gethostname() - logger.info("Script statrted at {0} on {1}".format(start_g_str, hostname)) - # Get a file name with job descriptions - if len(sys.argv) > 1: - input_file = sys.argv[1] - else: - input_file = 'worker_pandaids.json' - try: - in_file = open(input_file) - panda_ids = json.load(in_file) - in_file.close() - except IOError as (errno, strerror): - logger.critical("I/O error({0}): {1}".format(errno, strerror)) - logger.critical("Exit from rank") - return errno - - logger.debug("Collected list of jobs {0}".format(panda_ids)) - logger.error("Only for test") - # PandaID of the job for the command - try: - job_id = panda_ids[rank] - except ValueError: - logger.critical("Pilot have no job: rank {0}".format(rank)) - logger.critical("Exit pilot") - return 1 - - logger.debug("Job [{0}] will be processed".format(job_id)) - os.chdir(str(job_id)) - - try: - job_file = open("HPCJobs.json") - jobs = json.load(job_file) - job_file.close() - except IOError as (errno, strerror): - logger.critical("I/O error({0}): {1}".format(errno, strerror)) - logger.critical("Unable to open 'HPCJobs.json'") - return errno - - job_dict = jobs[str(job_id)] - - my_command = " ".join([job['transformation'],job['jobPars']]) - my_command = my_command.strip() - logger.debug("Going to launch: {0}".format(my_command)) - payloadstdout = open("stdout.txt", "w") - payloadstderr = open("stderr.txt", "w") - - start_time = time.asctime(time.localtime(time.time())) - t0 = os.times() - exit_code = call(my_command, stdout=payloadstdout, stderr=payloadstderr, shell=True) - t1 = os.times() - end_time = time.asctime(time.localtime(time.time())) - t = map(lambda x, y: x - y, t1, t0) - t_tot = reduce(lambda x, y: x + y, t[2:3]) - - payloadstdout.close() - payloadstderr.close() - logger.info("Payload exit code: {0}".format(exit_code)) - logger.info("CPU comsumption time: {0}".format(t_tot)) - logger.info("Start time: {0}".format(start_time)) - logger.info("End time: {0}".format(end_time)) - - report = open("rank_report.txt", "w") - report.write("cpuConsumptionTime: %s\n" % t_tot) - report.write("exitCode: %s" % exit_code) - report.close() - logger.info("All done") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/pandaharvester/harvestersubmitter/apfgrid_submitter.py b/pandaharvester/harvestersubmitter/apfgrid_submitter.py deleted file mode 100644 index 7f02cd2c..00000000 --- a/pandaharvester/harvestersubmitter/apfgrid_submitter.py +++ /dev/null @@ -1,199 +0,0 @@ - -import logging -import os -import random -import sys -import threading - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.work_spec import WorkSpec - -from autopyfactory.plugins.factory.config.queues.Agis import Agis -from autopyfactory.configloader import Config -from autopyfactory.queueslib import SubmitAPFQueue -from autopyfactory.authmanager import AuthManager -from autopyfactory.logserver import LogServer - -# setup base logger -baseLogger = core_utils.setup_logger() - -class 
APFGridSubmitterSingleton(type): - def __init__(self, *args, **kwargs): - super(APFGridSubmitterSingleton, self).__init__(*args, **kwargs) - self.__instance = None - - def __call__(self, *args, **kwargs): - if self.__instance is None: - self.__instance = super(APFGridSubmitterSingleton, self).__call__(*args, **kwargs) - return self.__instance - - -class APFGridSubmitter(object): - - __metaclass__ = APFGridSubmitterSingleton - - factorymock = None - authman = None - agis = None - logserver = None - cleanlogs = None - - - def __init__(self, **kwarg): - self.log = core_utils.make_logger(baseLogger) - self.config = Config() - factoryconffile = os.path.expanduser('~/harvester/etc/autopyfactory/autopyfactory.conf') - self.log.debug('Reading config: %s' % factoryconffile) - okread = self.config.read(factoryconffile) - self.log.debug('Successfully read %s' % okread) - - # Setup Authmanager - authconfigfile = os.path.expanduser(self.config.get('Factory','authConf')) - - ac = Config() - self.log.debug('Reading config: %s' % authconfigfile) - okread = ac.read(authconfigfile) - self.log.debug('Successfully read %s' % okread) - if APFGridSubmitter.authman is None : - APFGridSubmitter.authman = AuthManager() - APFGridSubmitter.authman.reconfig(ac) - self.authman = APFGridSubmitter.authman - APFGridSubmitter.authman.activate() - - # Setup factory mock object. - from autopyfactory.factory import Factory - if APFGridSubmitter.factorymock is None: - APFGridSubmitter.factorymock = Factory.getFactoryMock(fcl=self.config, am=self.authman) - - # Setup AGIS - if APFGridSubmitter.agis is None: - APFGridSubmitter.agisobj = Agis(None, self.config, None) - self.agisobj = APFGridSubmitter.agisobj - self.log.debug("AGIS object: %s" % self.agisobj) - - # Setup logserver - self.log.debug("Handling LogServer...") - if self.config.generic_get('Factory', 'logserver.enabled', 'getboolean'): - self.log.info("LogServer enabled. Initializing...") - self.logserver = LogServer(self.config) - self.log.info('LogServer initialized. Starting...') - self.logserver.start() - self.log.debug('LogServer thread started.') - else: - self.log.info('LogServer disabled. Not running.') - self.log.debug('APFGridSubmitter initialized.') - - - def _print_config(self, config): - s="" - for section in config.sections(): - s+= "[%s]\n" % section - for opt in config.options(section): - s+="%s = %s\n" % (opt, config.get(section, opt)) - return s - - - def submit_workers(self, workspec_list): - ''' - Submit workers to a scheduling system like batch systems and computing elements. - This method takes a list of WorkSpecs as input argument, and returns a list of tuples. - Each tuple is composed of a return code and a dialog message. - Nth tuple in the returned list corresponds to submission status and dialog message for Nth worker - in the given WorkSpec list. - A unique identifier is set to WorkSpec.batchID when submission is successful, - so that they can be identified in the scheduling system. - - - :param workspec_list: a list of work specs instances - :return: A list of tuples. Each tuple is composed of submission status (True for success, False otherwise) - and dialog message - :rtype: [(bool, string),] - - - For UPQ - PQ name, e.g. HARVESTER_BNL_APF_TEST/MCORE - SCORE, SCORE_HIMEM, MCORE, and MCORE_HIMEM - - Workspec AGIS PQ/CEQ attribute. 
- WorkSpec.minRamCount PQ.memory - WorkSpec.nCore PQ.corecount - WorkSpec.maxDiskCount PQ.maxwdir - WorkSpec.maxWalltime PQ.maxtime - - ''' - self.log.debug('start nWorkers={0}'.format(len(workspec_list))) - self.log.debug("Update AGIS info...") - qc = self.agisobj.getConfig() - #self.log.debug('Agis config output %s' % self._print_config(qc)) - #self.log.debug('Agis config defaults= %s' % qc.defaults() ) - retlist = [] - - # wsmap is indexed by computing site: - # { : [ws1, ws2, ws3 ], - # : [ws4, ws5] - # } - wsmap = {} - jobmap = {} - try: - self.log.debug("Handling workspec_list with %d items..." % len(workspec_list)) - for workSpec in workspec_list: - self.log.debug("Worker(workerId=%s queueName=%s computingSite=%s nCore=%s status=%s " % (workSpec.workerID, - workSpec.queueName, - workSpec.computingSite, - workSpec.nCore, - workSpec.status) ) - try: - wslist = wsmap[workSpec.computingSite] - wslist.append(workSpec) - except KeyError: - wsmap[workSpec.computingSite] = [workSpec] - self.log.debug("wsmap = %s" % wsmap) - - for pq in wsmap.keys(): - found = False - apfqsections = [] - - for s in qc.sections(): - qcq = qc.get(s, 'wmsqueue').strip() - self.log.debug('Checking %s' % qcq) - if qcq == pq: - found = True - apfqsections.append(s) - self.log.debug("Found a queues config for %s" % pq ) - - self.log.info("Found %d sections for PQ %s" % (len(apfqsections), pq)) - if found: - # make apfq and submit - self.log.debug("One or more Agis configs found for PQ. Choosing one...") - section = random.choice(apfqsections) - pqc = qc.getSection(section) - ac = os.path.expanduser('~/harvester/etc/autopyfactory/autopyfactory.conf') - pqc.set(section, 'factoryconf', ac) - self.log.debug("Section config= %s" % pqc) - self.log.debug("Making APF queue for PQ %s with label %s"% (pq, section)) - apfq = SubmitAPFQueue( pqc, self.authman ) - self.log.debug("Successfully made APFQueue") - joblist = [] - for ws in wsmap[pq]: - jobentry = { "+workerid" : ws.workerID } - joblist.append(jobentry) - self.log.debug("joblist made= %s. Submitting..." % joblist) - jobinfo = apfq.submitlist(joblist) - self.log.debug("Got jobinfo %s" % jobinfo) - - wslist = wsmap[pq] - self.log.debug("wslist for pq %s is length %s" % (pq, len(wslist))) - for i in range(0, len(wslist)): - self.log.debug("Setting ws.batchID to %s" % jobinfo[i].jobid ) - wslist[i].batchID = jobinfo[i].jobid - wslist[i].set_status(WorkSpec.ST_submitted) - retlist.append((True, '')) - else: - self.log.info('No AGIS config found for PQ %s skipping.' 
% pq) - - except Exception, e: - self.log.error(traceback.format_exc(None)) - - self.log.debug("return list=%s " % retlist) - return retlist diff --git a/pandaharvester/harvestersubmitter/arc_submitter.py b/pandaharvester/harvestersubmitter/arc_submitter.py deleted file mode 100644 index cdaa2960..00000000 --- a/pandaharvester/harvestersubmitter/arc_submitter.py +++ /dev/null @@ -1,301 +0,0 @@ -import os -import re -from threading import Thread -import time -import traceback -import arc - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestermisc.arc_parser import ARCParser -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestermisc import arc_utils - -# logger -baselogger = core_utils.setup_logger() - -class SubmitThr(Thread): - '''Class to submit jobs in separate thread''' - - def __init__(self, queuelist, jobdescs, userconfig): - Thread.__init__(self) - self.queuelist = queuelist - self.jobdescs = jobdescs - self.userconfig = userconfig - self.job = None - - def run(self): - '''Do brokering and submit''' - - arclog = arc_utils.ARCLogger(baselogger, 0) - tmplog = arclog.log - # Do brokering among the available queues - jobdesc = self.jobdescs[0] - broker = arc.Broker(self.userconfig, jobdesc, "Random") - targetsorter = arc.ExecutionTargetSorter(broker) - for target in self.queuelist: - tmplog.debug("considering target {0}:{1}".format(target.ComputingService.Name, - target.ComputingShare.Name)) - - # Adding an entity performs matchmaking and brokering - targetsorter.addEntity(target) - - if len(targetsorter.getMatchingTargets()) == 0: - tmplog.error("no clusters satisfied job description requirements") - return - - targetsorter.reset() # required to reset iterator, otherwise we get a seg fault - selectedtarget = targetsorter.getCurrentTarget() - # Job object will contain the submitted job - job = arc.Job() - submitter = arc.Submitter(self.userconfig) - if submitter.Submit(selectedtarget, jobdesc, job) != arc.SubmissionStatus.NONE: - tmplog.error("Submission failed") - return - - self.job = job - - -class ARCSubmitter(PluginBase): - '''Submitter for ARC CE''' - - def __init__(self, **kwarg): - '''Set up DB connection and credentials''' - PluginBase.__init__(self, **kwarg) - - self.dbproxy = DBProxy() - self.schedulerid = harvester_config.master.harvester_id - - # Credential dictionary role: proxy file - self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)], - list(harvester_config.credmanager.outCertFile))) - self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials) - - - def _run_submit(self, thr): - '''Run a thread to do the submission''' - - try: - thr.start() - except: - pass - - # Be careful to wait longer than submission timeout - thr.join(thr.userconfig.Timeout() + 60.0) - if thr.isAlive(): - # abort due to timeout and try again - raise Exception("Submission timeout") - if thr.job is None: - raise Exception("Submission failed") - - return thr.job - - - def _arc_submit(self, xrsl, arcces, userconfig, log): - '''Check the available CEs and submit''' - - queuelist = [] - - for arcce in arcces: - (ce_endpoint, ce_queue) = arcce - aris = arc.URL(str(ce_endpoint)) - ce_host = aris.Host() - if aris.Protocol() == 'https': - 
aris.ChangePath('/arex') - infoendpoints = [arc.Endpoint(aris.str(), - arc.Endpoint.COMPUTINGINFO, - 'org.ogf.glue.emies.resourceinfo')] - else: - aris = 'ldap://'+aris.Host()+'/mds-vo-name=local,o=grid' - infoendpoints = [arc.Endpoint(aris, - arc.Endpoint.COMPUTINGINFO, - 'org.nordugrid.ldapng')] - - # retriever contains a list of CE endpoints - retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints) - retriever.wait() - # targets is the list of queues - # parse target.ComputingService.ID for the CE hostname - # target.ComputingShare.Name is the queue name - targets = retriever.GetExecutionTargets() - - # Filter only sites for this process - for target in targets: - if not target.ComputingService.ID: - log.info("Target {0} does not have ComputingService ID defined, skipping".format(target.ComputingService.Name)) - continue - # If EMI-ES infoendpoint, force EMI-ES submission - if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \ - and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation': - log.debug("Rejecting target interface {0} because not EMI-ES".format(target.ComputingEndpoint.InterfaceName)) - continue - # Check for matching host and queue - targethost = re.sub(':arex$', '', re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID)) - targetqueue = target.ComputingShare.Name - if targethost != ce_host: - log.debug('Rejecting target host {0} as it does not match {1}'.format(targethost, ce_host)) - continue - if targetqueue != ce_queue: - log.debug('Rejecting target queue {0} as it does not match {1}'.format(targetqueue, ce_queue)) - continue - - queuelist.append(target) - log.debug("Adding target {0}:{1}".format(targethost, targetqueue)) - - # check if any queues are available, if not leave and try again next time - if not queuelist: - raise Exception("No free queues available") - - log.debug("preparing submission") - jobdescs = arc.JobDescriptionList() - if not arc.JobDescription_Parse(str(xrsl), jobdescs): - raise Exception("Failed to prepare job description") - - # Run the submission in a separate thread - thr = SubmitThr(queuelist, jobdescs, userconfig) - return self._run_submit(thr) - - - def _set_logdir(self, site): - date = time.strftime('%Y-%m-%d') - return os.path.join(date, site) - - - # submit workers - def submit_workers(self, workspec_list): - retlist = [] - - # Get queue info from DB - pandaqueues = self.dbproxy.get_cache("panda_queues.json", None) - if pandaqueues is None: - raise Exception("Failed to get panda queue info from database") - pandaqueues = pandaqueues.data - - osmap = self.dbproxy.get_cache("ddmendpoints_objectstores.json", None) - if osmap is None: - raise Exception("Failed to get Object Store info from database") - osmap = osmap.data - - for workspec in workspec_list: - - arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) - tmplog = arclog.log - - # Assume for aCT that jobs are always pre-fetched (no late-binding) - for jobspec in workspec.get_jobspec_list(): - - tmplog.debug("JobSpec: {0}".format(jobspec.values_map())) - - if jobspec.computingSite not in pandaqueues: - retlist.append((False, "No queue information for {0}".format(jobspec.computingSite))) - continue - - # Get CEs from panda queue info - # List of (endpoint, queue) tuples - arcces = [] - for endpoint in pandaqueues[jobspec.computingSite]['queues']: - ce_endpoint = endpoint['ce_endpoint'] - if not re.search('://', ce_endpoint): - ce_endpoint = 'gsiftp://%s' % ce_endpoint - ce_queue = endpoint['ce_queue_name'] - 
arcces.append((ce_endpoint, ce_queue)) - - if not arcces: - retlist.append((False, "No CEs defined for %{0}".format(jobspec.computingSite))) - continue - - # Set true pilot or not - queueconfigmapper = QueueConfigMapper() - queueconfig = queueconfigmapper.get_queue(jobspec.computingSite) - pandaqueues[jobspec.computingSite]['truepilot'] = queueconfig.truePilot - - # Set log URL for GTAG env in job description - logbaseurl = queueconfig.submitter.get('logBaseURL') - logsubdir = self._set_logdir(jobspec.computingSite) - logfileurl = '/'.join([logbaseurl, logsubdir, '%d.out' % jobspec.PandaID]) if logbaseurl else None - - tmplog.debug("Converting to ARC XRSL format") - arcxrsl = ARCParser(jobspec.jobParams, - jobspec.computingSite, - pandaqueues[jobspec.computingSite], - logfileurl, - self.schedulerid, - osmap, - '/tmp', # tmpdir, TODO common tmp dir - None, #jobSpec.eventranges, # TODO event ranges - tmplog) - arcxrsl.parse() - xrsl = arcxrsl.getXrsl() - tmplog.debug("ARC xrsl: {0}".format(xrsl)) - - # Set the files to be downloaded at the end of the job - downloadfiles = 'gmlog/errors' - if 'logFile' in jobspec.jobParams: - downloadfiles += ';%s' %jobspec.jobParams['logFile'].replace('.tgz', '') - if not pandaqueues[jobspec.computingSite]['truepilot']: - downloadfiles += ';jobSmallFiles.tgz' - - # Set certificate - userconfig = arc.UserConfig(self.cred_type) - proxyrole = '' - if jobspec.jobParams['prodSourceLabel'] == 'user': - userconfig.ProxyPath(str(self.certs['pilot'])) - proxyrole = 'pilot' - else: - userconfig.ProxyPath(str(self.certs['production'])) - proxyrole = 'production' - tmplog.debug("Submitting using {0} proxy at {1}".format(proxyrole, userconfig.ProxyPath())) - - try: - tmplog.debug("Submission targets: {0}".format(arcces)) - arcjob = self._arc_submit(xrsl, arcces, userconfig, tmplog) - tmplog.info("ARC CE job id {0}".format(arcjob.JobID)) - arc_utils.arcjob2workspec(arcjob, workspec) - workspec.workAttributes['arcdownloadfiles'] = downloadfiles - workspec.workAttributes['proxyrole'] = proxyrole - workspec.workAttributes['logsubdir'] = logsubdir - workspec.batchID = arcjob.JobID - tmplog.debug(workspec.workAttributes) - result = (True, '') - except Exception as exc: - tmplog.error(traceback.format_exc()) - result = (False, "Failed to submit ARC job: {0}".format(str(exc))) - - retlist.append(result) - - return retlist - - -def test(): - '''test submission''' - from pandaharvester.harvestercore.job_spec import JobSpec - from pandaharvester.harvestercore.plugin_factory import PluginFactory - - import json - - queuename = 'ARC-TEST' - queueconfmapper = QueueConfigMapper() - queueconf = queueconfmapper.get_queue(queuename) - pluginfactory = PluginFactory() - - pandajob = '{"jobsetID": 11881, "logGUID": "88ee8a52-5c70-490c-a585-5eb6f48e4152", "cmtConfig": "x86_64-slc6-gcc49-opt", "prodDBlocks": "mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "dispatchDBlockTokenForOut": "NULL,NULL", "destinationDBlockToken": "dst:CERN-PROD_DATADISK,dst:NDGF-T1_DATADISK", "destinationSE": "CERN-PROD_PRESERVATION", "realDatasets": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00", "prodUserID": "gingrich", "GUID": "A407D965-B139-A543-8851-A8E134A678D7", "realDatasetsIn": 
"mc16_13TeV:mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.merge.EVNT.e5340_e5984_tid11329621_00", "nSent": 2, "cloud": "WORLD", "StatusCode": 0, "homepackage": "AtlasOffline/21.0.15", "inFiles": "EVNT.11329621._001079.pool.root.1", "processingType": "simul", "currentPriority": 900, "fsize": "129263662", "fileDestinationSE": "CERN-PROD_PRESERVATION,BOINC_MCORE", "scopeOut": "mc16_13TeV", "minRamCount": 1573, "jobDefinitionID": 0, "maxWalltime": 40638, "scopeLog": "mc16_13TeV", "transformation": "Sim_tf.py", "maxDiskCount": 485, "coreCount": 1, "prodDBlockToken": "NULL", "transferType": "NULL", "destinationDblock": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.HITS.e5340_e5984_s3126_tid11364822_00_sub0418634273,mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.log.e5340_e5984_s3126_tid11364822_00_sub0418634276", "dispatchDBlockToken": "NULL", "jobPars": "--inputEVNTFile=EVNT.11329621._001079.pool.root.1 --maxEvents=50 --postInclude \\"default:RecJobTransforms/UseFrontier.py\\" --preExec \\"EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)\\" \\"EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True\\" --preInclude \\"EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py\\" --skipEvents=4550 --firstEvent=5334551 --outputHITSFile=HITS.11364822._128373.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=106692 --DBRelease=\\"all:current\\" --conditionsTag \\"default:OFLCOND-MC16-SDR-14\\" --geometryVersion=\\"default:ATLAS-R2-2016-01-00-01_VALIDATION\\" --runNumber=364168 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus", "attemptNr": 2, "swRelease": "Atlas-21.0.15", "nucleus": "CERN-PROD", "maxCpuCount": 40638, "outFiles": "HITS.11364822._128373.pool.root.11,log.11364822._128373.job.log.tgz.11", "ddmEndPointOut": "CERN-PROD_DATADISK,NDGF-T1_DATADISK", "scopeIn": "mc16_13TeV", "PandaID": 3487584273, "sourceSite": "NULL", "dispatchDblock": "panda.11364822.07.05.GEN.0c9b1d3b-feec-411a-89e4-1cbf7347d70c_dis003487584270", "prodSourceLabel": "managed", "checksum": "ad:cd0bf10b", "jobName": "mc16_13TeV.364168.Sherpa_221_NNPDF30NNLO_Wmunu_MAXHTPTV500_1000.simul.e5340_e5984_s3126.3433643361", "ddmEndPointIn": "NDGF-T1_DATADISK", "taskID": 11364822, "logFile": "log.11364822._128373.job.log.tgz.1"}' - pandajob = json.loads(pandajob) - jspec = JobSpec() - jspec.convert_job_json(pandajob) - jspec.computingSite = queuename - jspeclist = [jspec] - - maker = pluginfactory.get_plugin(queueconf.workerMaker) - wspec = maker.make_worker(jspeclist, queueconf) - - wspec.hasJob = 1 - wspec.set_jobspec_list(jspeclist) - - sub = ARCSubmitter() - print sub.submit_workers([wspec]) - print wspec.batchID - -if __name__ == '__main__': - test() diff --git a/pandaharvester/harvestersubmitter/cloud_google_submitter.py b/pandaharvester/harvestersubmitter/cloud_google_submitter.py deleted file mode 100644 index 22f93255..00000000 --- a/pandaharvester/harvestersubmitter/cloud_google_submitter.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -Based on: https://cloud.google.com/compute/docs/tutorials/python-guide#before-you-begin -""" - -import time - -# from requests.exceptions import SSLError -from pandaharvester.harvestercloud.googlecloud import PROJECT, ZONE, GoogleVM, compute -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from 
pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper - -# setup base logger -base_logger = core_utils.setup_logger("google_submitter") - - -def wait_for_operation(project, zone, operation_name): - """ - Waits for an operation to complete. - TODO: decide whether we want to block or just move on and list the instance status later - :param project: - :param zone: - :param operation_name: - :return: - """ - tmp_log = core_utils.make_logger(base_logger, method_name="wait_for_operation") - tmp_log.debug("Waiting for operation to finish...") - - while True: - result = compute.zoneOperations().get(project=project, zone=zone, operation=operation_name).execute() - - if result["status"] == "DONE": - if "error" in result: - raise Exception(result["error"]) - tmp_log.debug("Operation finished...") - return result - - time.sleep(1) - - -def create_vm(work_spec, queue_config): - """ - Boots up a VM in GCE based on the worker specifications - - :param work_spec: worker specifications - :return: - """ - work_spec.reset_changed_list() - - tmp_log = core_utils.make_logger(base_logger, f"workerID={work_spec.workerID}", method_name="submit_a_worker") - - tmp_log.debug( - "nCore={0} minRamCount={1} maxDiskCount={2} maxWalltime={0}".format( - work_spec.nCore, work_spec.minRamCount, work_spec.maxDiskCount, work_spec.maxWalltime - ) - ) - - try: - vm = GoogleVM(work_spec, queue_config) - - try: - zone = queue_config.zone - except AttributeError: - zone = ZONE - - except Exception as e: - tmp_log.debug(f"VM preparation failed with: {e}") - # there was some problem preparing the VM, usually related to interaction with GCE - # since the VM was not submitted yet, we mark the worker as "missed" - return (False, str(e)), work_spec.get_changed_attributes() - - try: - tmp_log.debug(f"Going to submit VM {vm.name}") - work_spec.batchID = vm.name - operation = compute.instances().insert(project=PROJECT, zone=zone, body=vm.config).execute() - # tmp_log.debug('Submitting VM {0}'.format(vm.name)) - # wait_for_operation(PROJECT, ZONE, operation['name']) - tmp_log.debug(f"Submitted VM {vm.name}") - - return (True, "OK"), work_spec.get_changed_attributes() - except Exception as e: - tmp_log.debug(f"GCE API exception: {e}") - # Despite the exception we will consider the submission successful to set the worker as "submitted". - # This is related to the GCE API reliability. We have observed that despite failures (time outs, SSL errors, etc) - # in many cases the VMs still start and we don't want VMs that are not inventorized. If the VM submission failed - # the harvester monitor will see when listing the running VMs - return (True, str(e)), work_spec.get_changed_attributes() - - -class GoogleSubmitter(PluginBase): - """ - Plug-in for Google Cloud Engine VM submission. 
In this case the worker will abstract a VM running a job - """ - - def __init__(self, **kwarg): - self.logBaseURL = "http://localhost/test" - PluginBase.__init__(self, **kwarg) - - self.queue_config_mapper = QueueConfigMapper() - - def submit_workers(self, work_spec_list): - """ - :param work_spec_list: list of workers to submit - :return: - """ - - tmp_log = self.make_logger(base_logger, method_name="submit_workers") - tmp_log.debug(f"start nWorkers={len(work_spec_list)}") - - ret_list = [] - if not work_spec_list: - tmp_log.debug("empty work_spec_list") - return ret_list - - # we assume all work_specs in the list belong to the same queue - queue_config = self.queue_config_mapper.get_queue(work_spec_list[0].computingSite) - - # Create VMs in parallel - # authentication issues when running the Cloud API in multiprocess - # pool_size = min(len(work_spec_list), 10) - # with Pool(pool_size) as pool: - # ret_val_list = pool.map(create_vm, work_spec_list, lock) - - ret_val_list = [] - for work_spec in work_spec_list: - ret_val_list.append(create_vm(work_spec, queue_config)) - - # Propagate changed attributes - for work_spec, tmp_val in zip(work_spec_list, ret_val_list): - ret_val, tmp_dict = tmp_val - - work_spec.set_attributes_with_dict(tmp_dict) - work_spec.set_log_file("batch_log", f"{self.logBaseURL}/{work_spec.batchID}.log") - work_spec.set_log_file("stdout", f"{self.logBaseURL}/{work_spec.batchID}.out") - work_spec.set_log_file("stderr", f"{self.logBaseURL}/{work_spec.batchID}.err") - ret_list.append(ret_val) - - tmp_log.debug("done") - - return ret_list diff --git a/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py b/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py deleted file mode 100644 index 29a92037..00000000 --- a/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -import tempfile - -try: - import subprocess32 as subprocess -except BaseException: - import subprocess - -import random -import re -import uuid -from concurrent.futures import ThreadPoolExecutor - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestermisc.cloud_openstack_utils import OS_SimpleClient - -# setup base logger -baseLogger = core_utils.setup_logger("cloud_openstack_submitter") - - -def _init_script_replace(string, **kwarg): - new_string = string - macro_map = { - "\$\(workerID\)": str(kwarg["workerID"]), - "\$\(batchID\)": str(kwarg["batchID"]), - "\$\(accessPoint\)": str(kwarg["accessPoint"]), - } - for k, v in macro_map.items(): - new_string = re.sub(k, v, new_string) - return new_string - - -# make cloud initialization script -def _make_init_script(workspec, template_str): - # make logger - tmpLog = core_utils.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="_make_init_script") - - # make init tempfile - tmpFile = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_init.sh", dir=workspec.get_access_point()) - new_template_str = _init_script_replace(template_str, **workspec.__dict__) - tmpFile.write(new_template_str) - tmpFile.close() - tmpLog.debug("done") - return tmpFile.name - - -# Cloud Openstack submitter -class CloudOpenstackSubmitter(PluginBase): - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - self.nProcesses = 4 - self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - - def _submit_a_vm(self, workspec): - # set logger - tmpLog = 
self.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="_submit_a_vm") - - # initial return values - tmpRetVal = (None, "Nothing done") - - # decide id - vm_name = f"harvester-vm_{str(uuid.uuid4())}" - - # # decide image - vm_image_id = self.vmImageID - - # decide flavor - # FIXME - if workspec.nCore == 1: - vm_flavor_id = self.jobType_vmFlavor_map["SCORE"] - elif workspec.nCore == 8: - vm_flavor_id = self.jobType_vmFlavor_map["MCORE"] - else: - vm_flavor_id = self.jobType_vmFlavor_map["other"] - - # decide userdata - with open(self.initScriptTemplate) as _f: - template_str = _f.read() - vm_userdata_file = _make_init_script(workspec, template_str) - vm_userdata = open(vm_userdata_file, "r") - - # get image and flavor - try: - vm_image = self.vm_client.nova.glance.find_image(vm_image_id) - vm_flavor = self.vm_client.nova.flavors.get(vm_flavor_id) - except Exception as _e: - errStr = f"Failed to create a VM with name={vm_name} ; {_e}" - tmpLog.error(errStr) - tmpRetVal = (None, errStr) - return tmpRetVal - - # create a VM - try: - self.vm_client.nova.servers.create(name=vm_name, image=vm_image, flavor=vm_flavor, userdata=vm_userdata, **self.vmCreateAttributes) - except Exception as _e: - errStr = f"Failed to create a VM with name={vm_name} ; {_e}" - tmpLog.error(errStr) - tmpRetVal = (None, errStr) - else: - try: - vm_server = self.vm_client.nova.servers.list(search_opts={"name": vm_name}, limit=1)[0] - vm_id = vm_server.id - except Exception as _e: - errStr = f"Failed to create a VM with name={vm_name} ; {_e}" - tmpLog.error(errStr) - tmpRetVal = (None, errStr) - else: - workspec.batchID = vm_id - tmpLog.info(f"Created a VM with name={vm_name} id={vm_id}") - tmpRetVal = (True, "") - - vm_userdata.close() - - # return - return tmpRetVal - - # submit workers - - def submit_workers(self, workspec_list): - # set logger - tmpLog = self.make_logger(baseLogger, method_name="submit_workers") - - nWorkers = len(workspec_list) - tmpLog.debug(f"start nWorkers={nWorkers}") - - # exec with multi-thread - with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retValList = thread_pool.map(self._submit_a_vm, workspec_list) - tmpLog.debug(f"{nWorkers} workers submitted") - - # return - retList = list(retValList) - - tmpLog.debug("done") - - return retList diff --git a/pandaharvester/harvestersubmitter/lancium_submitter.py b/pandaharvester/harvestersubmitter/lancium_submitter.py deleted file mode 100644 index 8be2cbfb..00000000 --- a/pandaharvester/harvestersubmitter/lancium_submitter.py +++ /dev/null @@ -1,245 +0,0 @@ -import os -import socket -import traceback -from concurrent.futures import ThreadPoolExecutor - -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestermisc.info_utils import PandaQueuesDict -from pandaharvester.harvestermisc.lancium_utils import ( - SCRIPTS_PATH, - LanciumClient, - get_job_name_from_workspec, -) -from pandaharvester.harvestersubmitter import submitter_common - -base_logger = core_utils.setup_logger("lancium_submitter") - -voms_lancium_path = "/voms/voms" -script_lancium_path = "/scripts/pilots_starter.py" -mount_path = "input_files" -full_mount_path_secrets = "/jobDir/input_files/voms" - - -class LanciumSubmitter(PluginBase): - # constructor - def __init__(self, **kwarg): - self.hostname = socket.getfqdn() - - 
self.logBaseURL = None - PluginBase.__init__(self, **kwarg) - - # retrieve the configurations for the panda queues - self.panda_queues_dict = PandaQueuesDict() - - # allowed associated parameters from CRIC - self._allowed_agis_attrs = ("pilot_url",) - - # number of processes - try: - self.nProcesses - except AttributeError: - self.nProcesses = 1 - else: - if (not self.nProcesses) or (self.nProcesses < 1): - self.nProcesses = 1 - - self.lancium_client = LanciumClient(self.hostname, queue_name=self.queueName) - - # update or create the pilot starter executable - self.upload_pilots_starter() - - def upload_pilots_starter(self): - tmp_log = self.make_logger(base_logger, method_name="upload_pilots_starter") - tmp_log.debug("Start") - try: - base_name = "pilots_starter.py" - dir_name = os.path.dirname(__file__) - - local_file = os.path.join(dir_name, f"../harvestercloud/{base_name}") - lancium_file = os.path.join(SCRIPTS_PATH, base_name) - self.lancium_client.upload_file(local_file, lancium_file) - tmp_log.debug("Done") - except Exception: - tmp_log.error(f"Problem uploading proxy {local_file}. {traceback.format_exc()}") - - def _choose_proxy(self, workspec): - """ - Choose the proxy based on the job type - """ - cert = None - job_type = workspec.jobType - is_grandly_unified_queue = self.panda_queues_dict.is_grandly_unified_queue(self.queueName) - - if is_grandly_unified_queue and job_type in ("user", "panda", "analysis"): - if self.user_proxy: - cert = self.user_proxy - elif self.prod_proxy: - cert = self.prod_proxy - else: - if self.prod_proxy: - cert = self.prod_proxy - - return cert - - def _fill_params( - self, - workspec, - container_image, - cert, - physical_cores, - memory_gb, - maxwdir_prorated_gib, - max_time, - pilot_type, - pilot_url_str, - pilot_version, - prod_source_label, - pilot_python_option, - log_file_name, - ): - lancium_job_name = get_job_name_from_workspec(workspec) - - # submit the worker - params = { - "name": lancium_job_name, - "command_line": "python input_files/scripts/pilots_starter.py", - "image": container_image, # 'harvester/centos7-singularity' - "max_run_time": max_time, - "resources": {"core_count": physical_cores, "memory": memory_gb, "scratch": int(maxwdir_prorated_gib)}, - "input_files": [ - {"source_type": "data", "data": voms_lancium_path, "name": mount_path}, - {"source_type": "data", "data": script_lancium_path, "name": mount_path}, - ], - "environment": ( - {"variable": "pilotUrlOpt", "value": pilot_url_str}, # pilotUrlOpt, stdout_name - {"variable": "stdout_name", "value": log_file_name}, - {"variable": "PILOT_NOKILL", "value": "True"}, - {"variable": "computingSite", "value": self.queueName}, - {"variable": "pandaQueueName", "value": self.queueName}, - {"variable": "resourceType", "value": workspec.resourceType}, - {"variable": "prodSourceLabel", "value": prod_source_label}, - {"variable": "pilotType", "value": pilot_type}, - # {'variable': 'pythonOption', 'value': pilot_python_option}, - {"variable": "pilotVersion", "value": pilot_version}, - {"variable": "jobType", "value": prod_source_label}, - {"variable": "proxySecretPath", "value": os.path.join(full_mount_path_secrets, cert)}, - {"variable": "workerID", "value": str(workspec.workerID)}, - {"variable": "pilotProxyCheck", "value": "False"}, - {"variable": "logs_frontend_w", "value": harvester_config.pandacon.pandaCacheURL_W}, - {"variable": "logs_frontend_r", "value": harvester_config.pandacon.pandaCacheURL_R}, - {"variable": "PANDA_JSID", "value": "harvester-" + 
harvester_config.master.harvester_id}, - {"variable": "HARVESTER_WORKER_ID", "value": str(workspec.workerID)}, - {"variable": "HARVESTER_ID", "value": harvester_config.master.harvester_id}, - {"variable": "submit_mode", "value": "PULL"}, - {"variable": "TMPDIR", "value": "/jobDir"}, - {"variable": "HOME", "value": "/jobDir"}, - # {'variable': 'K8S_JOB_ID', 'value': lancium_job_name}, - ), - } - return params - - def submit_lancium_worker(self, workspec): - tmp_log = self.make_logger(base_logger, f"queueName={self.queueName}", method_name="submit_lancium_worker") - - this_panda_queue_dict = self.panda_queues_dict.get(self.queueName, dict()) - - try: - # get info from harvester queue config - _queueConfigMapper = QueueConfigMapper() - harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) - - # set the stdout log file - log_file_name = f"{harvester_config.master.harvester_id}_{workspec.workerID}.out" - workspec.set_log_file("stdout", f"{self.logBaseURL}/{log_file_name}") - - # choose the appropriate proxy - cert = self._choose_proxy(workspec) - if not cert: - err_str = "No proxy specified in proxySecretPath. Not submitted" - tmp_return_value = (False, err_str) - return tmp_return_value - - # set the container image - container_image = "harvester/centos7-singularity" # harvester_queue_config.container_image - physical_cores = workspec.nCore / 2 # lancium uses hyperthreading, but expects job size in physical cores - memory_gb = workspec.minRamCount / 2 / 1000 - maxwdir_prorated_gib = self.panda_queues_dict.get_prorated_maxwdir_GiB(workspec.computingSite, workspec.nCore) - max_time = this_panda_queue_dict.get("maxtime", None) - - associated_params_dict = {} - for key, val in self.panda_queues_dict.get_harvester_params(self.queueName).items(): - if key in self._allowed_agis_attrs: - associated_params_dict[key] = val - - pilot_url = associated_params_dict.get("pilot_url") - pilot_version = str(this_panda_queue_dict.get("pilot_version", "current")) - python_version = str(this_panda_queue_dict.get("python_version", "3")) - - prod_source_label_tmp = harvester_queue_config.get_source_label(workspec.jobType) - pilot_opt_dict = submitter_common.get_complicated_pilot_options(workspec.pilotType, pilot_url, pilot_version, prod_source_label_tmp) - if pilot_opt_dict is None: - prod_source_label = prod_source_label_tmp - pilot_type = workspec.pilotType - pilot_url_str = f"--piloturl {pilot_url}" if pilot_url else "" - else: - prod_source_label = pilot_opt_dict["prod_source_label"] - pilot_type = pilot_opt_dict["pilot_type_opt"] - pilot_url_str = pilot_opt_dict["pilot_url_str"] - - pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label) - - params = self._fill_params( - workspec, - container_image, - cert, - physical_cores, - memory_gb, - maxwdir_prorated_gib, - max_time, - pilot_type, - pilot_url_str, - pilot_version, - prod_source_label, - pilot_python_option, - log_file_name, - ) - - return_code, return_str = self.lancium_client.submit_job(**params) - if not return_code: - return return_code, return_str - - except Exception as _e: - tmp_log.error(traceback.format_exc()) - err_str = f"Failed to create a worker; {_e}" - tmp_return_value = (False, err_str) - else: - workspec.batchID = return_str - tmp_log.debug(f"Created worker {workspec.workerID} with batchID={workspec.batchID}") - tmp_return_value = (True, "") - - return tmp_return_value - - # submit workers - def submit_workers(self, workspec_list): - tmp_log = self.make_logger(base_logger, 
f"queueName={self.queueName}", method_name="submit_workers") - - n_workers = len(workspec_list) - tmp_log.debug(f"start, n_workers={n_workers}") - - ret_list = list() - if not workspec_list: - tmp_log.debug("empty workspec_list") - return ret_list - - with ThreadPoolExecutor(self.nProcesses) as thread_pool: - ret_val_list = thread_pool.map(self.submit_lancium_worker, workspec_list) - tmp_log.debug(f"{n_workers} workers submitted") - - ret_list = list(ret_val_list) - - tmp_log.debug("done") - - return ret_list diff --git a/pandaharvester/harvestersweeper/apfgrid_sweeper.py b/pandaharvester/harvestersweeper/apfgrid_sweeper.py deleted file mode 100644 index a0721dd3..00000000 --- a/pandaharvester/harvestersweeper/apfgrid_sweeper.py +++ /dev/null @@ -1,101 +0,0 @@ -import logging -import sys -import traceback - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.work_spec import WorkSpec -from pandaharvester.harvestersubmitter.apfgrid_submitter import APFGridSubmitter - -try: - from autopyfactory import condorlib -except ImportError: - logging.error(f"Unable to import htcondor/condorlib. sys.path={sys.path}") - -# setup base logger -baseLogger = core_utils.setup_logger() - - -class APFGridSweeperSingleton(type): - def __init__(self, *args, **kwargs): - super(APFGridSweeperSingleton, self).__init__(*args, **kwargs) - self.__instance = None - - def __call__(self, *args, **kwargs): - if self.__instance is None: - self.__instance = super(APFGridSweeperSingleton, self).__call__(*args, **kwargs) - return self.__instance - - -# dummy plugin for sweeper -class APFGridSweeper(object): - __metaclass__ = APFGridSweeperSingleton - - STATUS_MAP = { - 1: WorkSpec.ST_submitted, - 2: WorkSpec.ST_running, - 3: WorkSpec.ST_cancelled, - 4: WorkSpec.ST_finished, - 5: WorkSpec.ST_failed, - 6: WorkSpec.ST_ready, - } - - JOBQUERYATTRIBUTES = ["match_apf_queue", "jobstatus", "workerid", "apf_queue", "clusterid", "procid"] - - # constructor - def __init__(self, **kwarg): - self.log = core_utils.make_logger(baseLogger) - self.jobinfo = None - self.allbyworkerid = {} - - self.log.debug("APFGridSweeper initialized.") - - def _updateJobInfo(self): - self.log.debug("Getting job info from Condor...") - out = condorlib.condor_q(APFGridSweeper.JOBQUERYATTRIBUTES) - self.log.debug(f"Got jobinfo {out}") - self.jobinfo = out - for jobad in self.jobinfo: - try: - workerid = jobad["workerid"] - self.allbyworkerid[workerid] = jobad - except KeyError: - # some non-harvester jobs may not have workerids, ignore them - pass - self.log.debug(f"All jobs indexed by worker_id. {len(self.allbyworkerid)} entries.") - - # kill a worker - - def kill_worker(self, workspec): - """Kill a single worker in a scheduling system like batch systems and computing elements. 
- - :param workspec: worker specification - :type workspec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - self.allbyworkerid = {} - self._updateJobInfo() - try: - jobad = self.allbyworkerid(workspec.workerID) - clusterid = jobad["clusterid"] - procid = jobad["procid"] - killstr = f"{clusterid}.{procid}" - self.log.debug(f"Killing condor job {killstr} ...") - condorlib.condor_rm([killstr]) - self.log.debug(f"Killed condor job {killstr} with workerid {workspec.workerID}") - except KeyError: - self.log.warning(f"kill_worker called on non-existent workerid: {workspec.workerID}") - - return True, "" - - # cleanup for a worker - def sweep_worker(self, workspec): - """Perform cleanup procedures for a single worker, such as deletion of work directory. - - :param workspec: worker specification - :type workspec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - return True, "" diff --git a/pandaharvester/harvestersweeper/arc_sweeper.py b/pandaharvester/harvestersweeper/arc_sweeper.py deleted file mode 100644 index c9602d9f..00000000 --- a/pandaharvester/harvestersweeper/arc_sweeper.py +++ /dev/null @@ -1,152 +0,0 @@ -import arc -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestermisc import arc_utils - -# logger -baselogger = core_utils.setup_logger() - - -class ARCSweeper(PluginBase): - """Sweeper for killing and cleaning ARC jobs""" - - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - - # Credential dictionary role: proxy file - self.certs = dict(zip([r.split("=")[1] for r in list(harvester_config.credmanager.voms)], list(harvester_config.credmanager.outCertFile))) - self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials) - - def kill_worker(self, workspec): - """Cancel the ARC job. 
- - :param workspec: worker specification - :type workspec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - - # make logger - arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) - tmplog = arclog.log - - (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec) - if not job.JobID: - # Job not submitted - tmplog.info("Job was not submitted so cannot be cancelled") - return True, "" - - # Set certificate - userconfig = arc.UserConfig(self.cred_type) - try: - userconfig.ProxyPath(str(self.certs[proxyrole])) - except BaseException: - # Log a warning and return True so that job can be cleaned - tmplog.warning(f"Job {job.JobID}: no proxy found with role {proxyrole}") - return True, "" - - job_supervisor = arc.JobSupervisor(userconfig, [job]) - job_supervisor.Update() - job_supervisor.Cancel() - - notcancelled = job_supervisor.GetIDsNotProcessed() - - if job.JobID in notcancelled: - if job.State == arc.JobState.UNDEFINED: - # If longer than one hour since submission assume job never made it - if job.SubmissionTime + arc.Period(3600) < arc.Time(): - tmplog.warning("Assuming job is lost and marking as cancelled") - return True, "" - - # Job has not yet reached info system - tmplog.warning("Job is not yet in info system so cannot be cancelled") - return False, "Job is not yet in info system so could not be cancelled" - - # Log a warning and return True so that job can be cleaned - tmplog.warning("Job could not be cancelled") - return True, "" - - tmplog.info("Job cancelled successfully") - return True, "" - - def sweep_worker(self, workspec): - """Clean the ARC job - - :param workspec: worker specification - :type workspec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - - # make logger - arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) - tmplog = arclog.log - - (job, modtime, proxyrole) = arc_utils.workspec2arcjob(workspec) - if not job.JobID: - # Job not submitted - tmplog.info("Job was not submitted so cannot be cleaned") - return True, "" - - # Set certificate - userconfig = arc.UserConfig(self.cred_type) - try: - userconfig.ProxyPath(str(self.certs[proxyrole])) - except BaseException: - # Log a warning and return True so that job can be cleaned - tmplog.warning(f"Job {job.JobID}: no proxy found with role {proxyrole}") - return True, "" - - job_supervisor = arc.JobSupervisor(userconfig, [job]) - job_supervisor.Update() - job_supervisor.Clean() - - notcleaned = job_supervisor.GetIDsNotProcessed() - - if job.JobID in notcleaned: - # Log a warning and return True so that job can be finished - tmplog.warning("Job could not be cleaned") - return True, "" - - tmplog.info("Job cleaned successfully") - return True, "" - - -def test(jobid): - """Kill a job""" - import json - - from pandaharvester.harvestercore.work_spec import WorkSpec - - wspec = WorkSpec() - wspec.batchID = jobid - workAttributes = {"arcjob": {}} - workAttributes["arcjob"]["JobID"] = wspec.batchID - workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format( - urlparse.urlparse(jobid).netloc, wspec.batchID - ) - workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng" - jobmanagementurl = arc.URL(wspec.batchID) - jobmanagementurl.ChangePath("/jobs") - workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str() - 
workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob" - - wspec.workAttributes = workAttributes - print(wspec.workAttributes) - - sweeper = ARCSweeper() - print(sweeper.kill_worker(wspec)) - - -if __name__ == "__main__": - import sys - import time - - import urlparse - - if len(sys.argv) != 2: - print("Please give ARC job id") - sys.exit(1) - test(sys.argv[1]) diff --git a/pandaharvester/harvestersweeper/cloud_google_sweeper.py b/pandaharvester/harvestersweeper/cloud_google_sweeper.py deleted file mode 100644 index b774850f..00000000 --- a/pandaharvester/harvestersweeper/cloud_google_sweeper.py +++ /dev/null @@ -1,63 +0,0 @@ -import googleapiclient -from pandaharvester.harvestercloud.googlecloud import PROJECT, ZONE, compute -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper - -base_logger = core_utils.setup_logger("google_sweeper") - - -class GoogleSweeper(PluginBase): - """ - Sweeper with kill/clean-up functions for Google Compute Engine - """ - - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - self.queue_config_mapper = QueueConfigMapper() - - def kill_worker(self, work_spec): - """ - Sends the command to Google to destroy a VM - - :param work_spec: worker specification - :type work_spec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - - try: - vm_name = work_spec.batchID - - queue_config = self.queue_config_mapper.get_queue(work_spec.computingSite) - try: - zone = queue_config.zone - except AttributeError: - zone = ZONE - - base_logger.debug(f"Going to kill VM {vm_name}") - compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute() - base_logger.debug(f"Killed VM {vm_name}") - return True, "" - except googleapiclient.errors.HttpError as e: - if "was not found" in e.content: - # the VM was already killed or does not exist for any other reason - message = "VM does not exist".format(vm_name) - base_logger.debug(message) - return True, message - else: - # there was an issue killing the VM and it should be retried at another time - return False, f"Problems killing the VM: {e}" - except Exception as e: - return False, f"Problems killing the VM: {e}" - - def sweep_worker(self, work_spec): - """ - In the cloud, cleaning means destroying a VM - - :param work_spec: worker specification - :type work_spec: WorkSpec - :return: A tuple of return code (True for success, False otherwise) and error dialog - :rtype: (bool, string) - """ - return self.kill_worker(work_spec) diff --git a/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py b/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py deleted file mode 100644 index 8b478818..00000000 --- a/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py +++ /dev/null @@ -1,47 +0,0 @@ -import os - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestermisc.cloud_openstack_utils import OS_SimpleClient - -# setup base logger -baseLogger = core_utils.setup_logger("cloud_openstack_sweeper") - - -# Cloud Openstack submitter -class CloudOpenstackSweeper(PluginBase): - # constructor - def __init__(self, **kwarg): - PluginBase.__init__(self, **kwarg) - self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - - # kill a worker - - def 
kill_worker(self, workspec): - # set logger - tmpLog = self.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="kill_worker") - - # initial return values - tmpRetVal = (None, "Nothing done") - - # kill vm - vm_id = workspec.batchID - try: - self.vm_client.nova.servers.delete(vm_id) - except Exception as _e: - errStr = f"Failed to delete a VM with id={vm_id} ; {_e}" - tmpLog.error(errStr) - tmpRetVal = (False, errStr) - else: - tmpLog.info(f"Deleted a VM with id={vm_id}") - tmpRetVal = (True, "") - - return tmpRetVal - - # cleanup for a worker - - def sweep_worker(self, workspec): - # set logger - tmpLog = self.make_logger(baseLogger, f"workerID={workspec.workerID}", method_name="sweep_worker") - - return True, "" diff --git a/pandaharvester/harvestersweeper/lancium_sweeper.py b/pandaharvester/harvestersweeper/lancium_sweeper.py deleted file mode 100644 index e6d69661..00000000 --- a/pandaharvester/harvestersweeper/lancium_sweeper.py +++ /dev/null @@ -1,53 +0,0 @@ -import socket - -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestermisc.lancium_utils import LanciumClient -from pandaharvester.harvestersweeper.base_sweeper import BaseSweeper - -# logger -base_logger = core_utils.setup_logger("lancium_sweeper") - - -# sweeper for Lancium -class LanciumSweeper(BaseSweeper): - # constructor - def __init__(self, **kwarg): - BaseSweeper.__init__(self, **kwarg) - self.hostname = socket.getfqdn() - self.lancium_client = LanciumClient(self.hostname, queue_name=self.queueName) - - # kill workers - def kill_workers(self, workspec_list): - tmp_log = self.make_logger(base_logger, method_name="kill_workers") - tmp_log.debug("Start") - ret_list = [] - for workspec in workspec_list: - tmp_log.debug(f"Running kill_worker for {workspec.workerID}") - tmp_ret_val = self.kill_worker(workspec) - ret_list.append(tmp_ret_val) - tmp_log.debug("Done") - return ret_list - - def kill_worker(self, workspec): - tmp_log = self.make_logger(base_logger, f"workerID={workspec.workerID}", method_name="kill_worker") - batch_id = workspec.batchID - tmp_log.debug("Running kill_worker") - if batch_id: # sometimes there are missed workers that were not submitted - try: - self.lancium_client.delete_job(batch_id) - tmp_log.debug(f"Deleted job {batch_id}") - return True, "" - except Exception as _e: - err_str = f"Failed to delete a job with id={batch_id} ; {_e}" - tmp_log.error(err_str) - return False, err_str - - else: # the worker does not need be cleaned - tmp_log.debug("No action necessary, since no batch ID") - return True, "" - - def sweep_worker(self, workspec): - # cleanup for a worker - tmp_log = self.make_logger(base_logger, f"workerID={workspec.workerID}", method_name="sweep_worker") - tmp_log.debug("Returning kill_worker") - return self.kill_worker(workspec) diff --git a/pandaharvester/harvestertest/dumpTable.py b/pandaharvester/harvestertest/dumpTable.py deleted file mode 100644 index aa7e43e2..00000000 --- a/pandaharvester/harvestertest/dumpTable.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -import sys -import logging -import datetime -import sqlite3 -import pprint -from future.utils import iteritems -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy -from pandaharvester.harvestercore.job_spec import JobSpec -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestercore.communicator_pool import CommunicatorPool - - -for loggerName, 
loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) - -pp = pprint.PrettyPrinter(indent=4) - -queueConfigMapper = QueueConfigMapper() - -proxy = DBProxy() - -sqlJ ="SELECT * FROM job_table" - -resultsJobcur = proxy.execute(sqlJ) -resultsJob = resultsJobcur.fetchall() -proxy.commit() - -sqlF ="SELECT * FROM file_table" - -resultsFilescur = proxy.execute(sqlF) -resultsFiles = resultsFilescur.fetchall() -proxy.commit() - -print "job_table - " -print resultsJob[0].keys() -for row in resultsJob: - print tuple(row) -print - -print "file_table - " -print resultsFiles[0].keys() -for row in resultsFiles: - print tuple(row) -#pp.pprint(resultsFiles) -print diff --git a/pandaharvester/harvestertest/getQueuedata.py b/pandaharvester/harvestertest/getQueuedata.py deleted file mode 100644 index 733dbafc..00000000 --- a/pandaharvester/harvestertest/getQueuedata.py +++ /dev/null @@ -1,39 +0,0 @@ -import json -from pandaharvester.harvesterbody.cacher import Cacher -from pandaharvester.harvestercore.db_proxy import DBProxy -from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils - -# make communication channel to PanDA -com = CommunicatorPool() - -proxy = DBProxy() - -cacher = Cacher(com,single_mode=True) -cacher.run() - -# now get this results from DBProxy and print out data (if possible) -queuStat = proxy.get_cache('panda_queues.json',None) -if queuStat is None: - queueStat = dict() -else: - queueStat = queuStat.data - -#print "panda_queues.json data :",queueStat -#print "panda_queues.json type :",type(queueStat) -print -print '{{"{}":'.format('ALCF_Theta'),json.dumps(queueStat['ALCF_Theta']),"}" -print -#print "panda_queues.json data [ALCF_Theta] :",json.dumps(queueStat['ALCF_Theta'], indent=4) -print -print - -""" -globalDict = core_utils.get_global_dict() -print "printing globalDict " -for k,v in globalDict.iteritems(): - print "key: {}".format(k) - print "value: {}".format(v) - #print "key: {}, value: {}".format(k, v) -""" - diff --git a/pandaharvester/harvestertest/lancium/README.md b/pandaharvester/harvestertest/lancium/README.md deleted file mode 100644 index 427fb02a..00000000 --- a/pandaharvester/harvestertest/lancium/README.md +++ /dev/null @@ -1,29 +0,0 @@ -1. Export the API KEY -``` -export LANCIUM_API_KEY= -``` - -2. Add an image from docker hub -``` -lcli image add --name "centos7-singularity" --description "ADC Centos 7 image + singularity configuration" --type docker_image --url docker://fbarreir/adc-centos7-singularity:latest test/centos7-singularity -``` - -2. Upload the `pilots_starter.py` (distributed with this package [here](https://github.com/HSF/harvester/blob/lancium/pandaharvester/harvestercloud/pilots_starter.py)) and the `voms proxy`. The local paths are set in `contants.py`. -``` -python file_contants.py -``` - -3. Submit a job. The job will mount the files from step 2 into '/jobDir' -``` -python submit.py -``` -This will return the job id. - -3. Get the status of the job. 
You can either list all jobs -``` -python monitor.py -``` -or for a particular job -``` -python monitor.py -``` \ No newline at end of file diff --git a/pandaharvester/harvestertest/lancium/__init__.py b/pandaharvester/harvestertest/lancium/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pandaharvester/harvestertest/lancium/clean.py b/pandaharvester/harvestertest/lancium/clean.py deleted file mode 100644 index 8baafc14..00000000 --- a/pandaharvester/harvestertest/lancium/clean.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -from lancium.api.Job import Job - -if len(sys.argv) != 2: - print("Pass the job id as argument") - return - -job_id = sys.argv[1] - -# What is the difference between terminate, delete and destroy a job -# Does delete also terminate a job? -Job.delete(job_id) diff --git a/pandaharvester/harvestertest/lancium/constants.py b/pandaharvester/harvestertest/lancium/constants.py deleted file mode 100644 index d1fe82e9..00000000 --- a/pandaharvester/harvestertest/lancium/constants.py +++ /dev/null @@ -1,9 +0,0 @@ -# voms proxy definitions -voms_local_path = "/root/lancium/voms" -voms_lancium_path = "/secrets/test1" -voms_job_path = "voms" - -# pilot starter script -script_local_path = "/root/lancium/pilots_starter.py" -script_lancium_path = "/scripts/pilots_starter.py" -script_job_path = "pilots_starter.py" diff --git a/pandaharvester/harvestertest/lancium/file_uploads.py b/pandaharvester/harvestertest/lancium/file_uploads.py deleted file mode 100644 index fb0253bb..00000000 --- a/pandaharvester/harvestertest/lancium/file_uploads.py +++ /dev/null @@ -1,28 +0,0 @@ -import os - -from constants import ( - script_lancium_path, - script_local_path, - voms_lancium_path, - voms_local_path, -) -from lancium.api.Data import Data - - -def fake_callback(total_chunks, current_chunk): - pass - - -# https://lancium.github.io/compute-api-docs/library/lancium/api/Data.html#Data.create - -# 1. Upload a fake voms proxy -data = Data().create(voms_lancium_path, "file", source=os.path.abspath(voms_local_path), force=True) -data.upload(os.path.abspath(voms_local_path), fake_callback) -ex = data.show(voms_lancium_path)[0] -print(ex.__dict__) - -# 2. 
Upload the pilot starter -data = Data().create(script_lancium_path, "file", source=os.path.abspath(script_local_path), force=True) -data.upload(os.path.abspath(script_local_path), fake_callback) -ex = data.show(script_lancium_path)[0] -print(ex.__dict__) diff --git a/pandaharvester/harvestertest/lancium/monitor.py b/pandaharvester/harvestertest/lancium/monitor.py deleted file mode 100644 index a391483e..00000000 --- a/pandaharvester/harvestertest/lancium/monitor.py +++ /dev/null @@ -1,17 +0,0 @@ -import sys - -from lancium.api.Job import Job - -if len(sys.argv) == 2: - job_id = sys.argv[1] -else: - # print all job statuses - job_id = 0 - -if job_id == 0: - all_jobs = Job().all() - for job in all_jobs: - print(f"id: {job.id}, status: {job.status}") -else: - job = Job().get(job_id) - print(f"id: {job.id}, status: {job.status}") diff --git a/pandaharvester/harvestertest/lancium/submit.py b/pandaharvester/harvestertest/lancium/submit.py deleted file mode 100644 index f2a31256..00000000 --- a/pandaharvester/harvestertest/lancium/submit.py +++ /dev/null @@ -1,60 +0,0 @@ -import uuid - -from constants import ( - script_job_path, - script_lancium_path, - voms_job_path, - voms_lancium_path, -) -from lancium.api.Job import Job - -# https://lancium.github.io/compute-api-docs/library/lancium/api/Job.html#Job.create - -worker_id = uuid.uuid1() -core_count = 0.5 -memory = 1 -scratch = 20 - -params = { - "name": f"grid-job-{worker_id}", - "command_line": "python voms/scripts/pilots_starter.py", - "image": "harvester/centos7-singularity", - "resources": {"core_count": core_count, "memory": memory, "scratch": scratch}, - "input_files": [ - {"source_type": "data", "data": voms_lancium_path, "name": voms_job_path}, - {"source_type": "data", "data": script_lancium_path, "name": script_job_path}, - ], - # 'output_files': RETRIEVE THE PILOT LOG AND STORE IT IN HARVESTER? - "environment": ( - # pilotUrlOpt, stdout_name - {"variable": "PILOT_NOKILL", "value": "True"}, - {"variable": "computingSite", "value": "GOOGLE_EUW1"}, - {"variable": "pandaQueueName", "value": "GOOGLE_EUW1"}, - {"variable": "resourceType", "value": "SCORE"}, - {"variable": "prodSourceLabel", "value": "managed"}, - {"variable": "pilotType", "value": "PR"}, - # {'variable': 'pythonOption', 'value': '--pythonversion\ 3'}, - {"variable": "pilotVersion", "value": "3.5.0.31"}, - {"variable": "jobType", "value": "managed"}, - {"variable": "proxySecretPath", "value": "/jobDir/voms/secrets/test1"}, - {"variable": "workerID", "value": "1"}, - {"variable": "pilotProxyCheck", "value": "False"}, - {"variable": "logs_frontend_w", "value": "https://aipanda047.cern.ch:25443/server/panda"}, - {"variable": "logs_frontend_r", "value": "https://aipanda047.cern.ch:25443/cache"}, - {"variable": "PANDA_JSID", "value": "harvester-CERN_central_k8s"}, - {"variable": "HARVESTER_WORKER_ID", "value": "21421931"}, - {"variable": "HARVESTER_ID", "value": "CERN_central_k8s"}, - {"variable": "submit_mode", "value": "PULL"}, - {"variable": "TMPDIR", "value": "/jobDir"}, - {"variable": "HOME", "value": "/jobDir"}, - {"variable": "K8S_JOB_ID", "value": "grid-job-1"}, - ), -} - -# create the job -job = Job().create(**params) -print(f"Created! name: {job.name}, id: {job.id}, status: {job.status}") - -# submit the job -job.submit() -print(f"Submitted! 
name: {job.name}, id: {job.id}, status: {job.status}") diff --git a/pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py b/pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py deleted file mode 100644 index 2fe68706..00000000 --- a/pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py +++ /dev/null @@ -1,425 +0,0 @@ -import sys -import os -import os.path -import hashlib -import datetime -import uuid -import random -import string -import time -import threading -import logging -from future.utils import iteritems -from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore.job_spec import JobSpec -from pandaharvester.harvestercore.file_spec import FileSpec -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestercore.plugin_factory import PluginFactory -from pandaharvester.harvesterbody.cacher import Cacher -from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy -from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils -from pandaharvester.harvestermisc import globus_utils - -from globus_sdk import TransferClient -from globus_sdk import TransferData -from globus_sdk import NativeAppAuthClient -from globus_sdk import RefreshTokenAuthorizer - - -#initial variables -fileTableName = 'file_table' -queueName = 'ALCF_Theta' -begin_job_id = 1111 -end_job_id = 1113 -globus_sleep_time = 15 - -# connection lock -conLock = threading.Lock() - - -def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - - - -if len(sys.argv) > 1: - queueName = sys.argv[1] -if len(sys.argv) > 2: - begin_job_id = int(sys.argv[2]) -if len(sys.argv) > 3: - end_job_id = int(sys.argv[3]) -if len(sys.argv) > 4: - globus_sleep_time = int(sys.argv[4]) - -queueConfigMapper = QueueConfigMapper() -queueConfig = queueConfigMapper.get_queue(queueName) -initial_queueConfig_preparator = queueConfig.preparator -queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator' -queueConfig.preparator['name'] = 'GlobusBulkPreparator' -modified_queueConfig_preparator = queueConfig.preparator - -pluginFactory = PluginFactory() -# get stage-out plugin -preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) - -# logger -_logger = core_utils.setup_logger('stageInTest_go_bulk_preparator') -tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_bulk_preparator') -tmpLog.debug('start') - -for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) - -msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) -tmpLog.debug(msgStr) -msgStr = "Initial queueConfig.preparator = {}".format(initial_queueConfig_preparator) -tmpLog.debug(msgStr) -msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator) -tmpLog.debug(msgStr) - -scope = 'panda' - -proxy = DBProxy() -communicator = CommunicatorPool() -cacher = Cacher(communicator, single_mode=True) -cacher.run() - -Globus_srcPath = queueConfig.preparator['Globus_srcPath'] -srcEndpoint = 
queueConfig.preparator['srcEndpoint'] -basePath = queueConfig.preparator['basePath'] -Globus_dstPath = queueConfig.preparator['Globus_dstPath'] -dstEndpoint = queueConfig.preparator['dstEndpoint'] - -# check if db lock exits -locked = preparatorCore.dbInterface.get_object_lock('dummy_id_for_in_0',lock_interval=120) -if not locked: - tmpLog.debug('DB Already locked by another thread') -# now unlock db -unlocked = preparatorCore.dbInterface.release_object_lock('dummy_id_for_in_0') -if unlocked : - tmpLog.debug('unlocked db') -else: - tmpLog.debug(' Could not unlock db') - -# need to get client_id and refresh_token from PanDA server via harvester cache mechanism -c_data = preparatorCore.dbInterface.get_cache('globus_secret') -client_id = None -privateKey = None -if (not c_data == None) and c_data.data['StatusCode'] == 0 : - client_id = c_data.data['publicKey'] # client_id - refresh_token = c_data.data['privateKey'] # refresh_token -else : - client_id = None - refresh_token = None - tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' - tmpLog.error(errStr) - sys.exit(1) - -# create Globus transfer client to send initial files to remote Globus source -tmpStat, tc = globus_utils.create_globus_transfer_client(tmpLog,client_id,refresh_token) -if not tmpStat: - tc = None - errStr = 'failed to create Globus Transfer Client' - tmpLog.error(errStr) - sys.exit(1) -try: - # We are sending test files from our destination machine to the source machine - # Test endpoints for activation - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,tc,dstEndpoint) - tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,tc,srcEndpoint) - if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' - tmpLog.debug(errStr) - else: - errStr = '' - if not tmpStatsrc : - errStr += ' source Endpoint not activated ' - if not tmpStatdst : - errStr += ' destination Endpoint not activated ' - tmpLog.error(errStr) - sys.exit(2) - # both endpoints activated now prepare to transfer data - # We are sending test files from our destination machine to the source machine - tdata = TransferData(tc,dstEndpoint,srcEndpoint,sync_level="checksum") -except: - errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) - sys.exit(1) - -# loop over the job id's creating various JobSpecs -jobSpec_list = [] -for job_id in range(begin_job_id,end_job_id+1): - jobSpec = JobSpec() - jobSpec.jobParams = { - 'scopeLog': 'panda', - 'logFile': 'log', - } - jobSpec.computingSite = queueName - jobSpec.PandaID = job_id - jobSpec.modificationTime = datetime.datetime.now() - realDataset = 'panda.sgotest.' + uuid.uuid4().hex - ddmEndPointIn = 'BNL-OSG2_DATADISK' - inFiles_scope_str = '' - inFiles_str = '' - realDatasets_str = '' - realDatasetsIn_str = '' - ddmEndPointIn_str = '' - GUID_str = '' - fsize_str = '' - checksum_str = '' - scope_in_str = '' - - # create up 5 files for input - for index in range(random.randint(1, 5)): - fileSpec = FileSpec() - assFileSpec = FileSpec() - # some dummy inputs - GUID_str += 'd82e8e5e301b77489fd4da04bcdd6565,' - fsize_str += '3084569129,' - checksum_str += 'ad:9f60d29f,' - scope_in_str += 'panda,' - # - fileSpec.fileType = 'input' - assFileSpec.lfn = 'panda.sgotest.' 
+ uuid.uuid4().hex - fileSpec.lfn = assFileSpec.lfn - fileSpec.scope = 'panda' - inFiles_scope_str += 'panda,' - inFiles_str += fileSpec.lfn + ',' - realDatasets_str += realDataset + "," - realDatasetsIn_str += realDataset + "," - ddmEndPointIn_str += ddmEndPointIn + "," - assFileSpec.fileType = 'input' - assFileSpec.fsize = random.randint(10, 100) - # create source file - hash = hashlib.md5() - hash.update('%s:%s' % (fileSpec.scope, fileSpec.lfn)) - hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - fileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_dstPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - assFileSpec.path = fileSpec.path - fileSpec.add_associated_file(assFileSpec) - # add input file to jobSpec - jobSpec.add_in_file(fileSpec) - # now create the temporary file - tmpfile_path = "{mountPoint}/testdata/{lfn}".format(mountPoint=queueConfig.preparator['basePath'], - lfn=assFileSpec.lfn) - - if not os.path.exists(os.path.dirname(tmpfile_path)): - tmpLog.debug("os.makedirs({})".format(os.path.dirname(tmpfile_path))) - os.makedirs(os.path.dirname(tmpfile_path)) - oFile = open(tmpfile_path, 'w') - oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) - oFile.close() - # create destination file path - destfile_path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_srcPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - # add to Globus transfer list - tdata.add_item(tmpfile_path,destfile_path) - #print "dump(fileSpec)" - #dump(fileSpec) - # - tmpLog.debug("source file to transfer - {}".format(tmpfile_path)) - tmpLog.debug("destination file to transfer - {}".format(destfile_path)) - #print "dump(jobSpec)" - #dump(jobSpec) - # remove final "," - realDatasetsIn_str=realDatasetsIn_str[:-1] - inFiles_str = inFiles_str[:-1] - inFiles_scope_str = inFiles_scope_str[:-1] - GUID_str = GUID_str[:-1] - fsize_str = fsize_str[:-1] - checksum_str = checksum_str[:-1] - scope_in_str = scope_in_str[:-1] - jobSpec.jobParams['realDatasets'] = realDatasets_str - jobSpec.jobParams['ddmEndPointIn'] = ddmEndPointIn_str - jobSpec.jobParams['inFiles'] = inFiles_str - jobSpec.jobParams['GUID'] = GUID_str - jobSpec.jobParams['fsize'] = fsize_str - jobSpec.jobParams['checksum'] = checksum_str - jobSpec.jobParams['scopeIn'] = scope_in_str - jobSpec.jobParams['realDatasetsIn'] = realDatasetsIn_str - msgStr = "jobSpec.jobParams ={}".format(jobSpec.jobParams) - tmpLog.debug(msgStr) - jobSpec_list.append(jobSpec) - - -# now load into DB JobSpec's and output FileSpec's from jobSpec_list -tmpStat = proxy.insert_jobs(jobSpec_list) -if tmpStat: - msgStr = "OK Loaded jobs into DB" - tmpLog.debug(msgStr) -else: - msgStr = "NG Could not load jobs into DB" - tmpLog.debug(msgStr) -tmpStat = proxy.insert_files(jobSpec_list) -if tmpStat: - msgStr = "OK Loaded files into DB" - tmpLog.debug(msgStr) -else: - msgStr = "NG Could not load files into DB" - tmpLog.debug(msgStr) - -# transfer dummy files to Remote site for input -transfer_result = tc.submit_transfer(tdata) -# check status code and message -tmpLog.debug(str(transfer_result)) -if transfer_result['code'] == "Accepted": - # succeeded - # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - tmpLog.debug('done') -else: - tmpLog.error('Failed to send intial files') - 
sys.exit(3) - -print "sleep {0} seconds".format(globus_sleep_time) -time.sleep(globus_sleep_time) - -# enter polling loop to see if the intial files have transfered -maxloop = 5 -iloop = 0 -NotFound = True -while (iloop < maxloop) and NotFound : - # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,tc,transferID) - # return a temporary error when failed to get task - if not tmpStat: - errStr = 'failed to get transfer task' - tmpLog.error(errStr) - else: - # return a temporary error when task is missing - tmpLog.debug('transferTasks : {} '.format(transferTasks)) - if transferID not in transferTasks: - errStr = 'transfer task ID - {} is missing'.format(transferID) - tmpLog.error(errStr) - else: - # succeeded in finding a transfer task by tranferID - if transferTasks[transferID]['status'] == 'SUCCEEDED': - tmpLog.debug('transfer task {} succeeded'.format(transferID)) - NotFound = False - # failed - if transferTasks[transferID]['status'] == 'FAILED': - errStr = 'transfer task {} failed'.format(transferID) - tmpLog.error(errStr) - # another status - tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) - tmpLog.debug(tmpStr) - if NotFound : - print "sleep {0} seconds".format(globus_sleep_time) - time.sleep(globus_sleep_time) - ++iloop -if NotFound : - errStr = 'transfer task ID - {} is missing'.format(transferID) - tmpLog.error(errStr) - sys.exit(1) - - -print "plugin={0}".format(preparatorCore.__class__.__name__) - -print "testing stagein:" -print "BasePath from preparator configuration: %s " % preparatorCore.basePath - -# Now loop over the jobSpec's - -for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - msgStr = "testing trigger_stage_out" - tmpLog.debug(msgStr) - tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) - if tmpStat: - msgStr = " OK " - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - sys.exit(1) - print - # check status to actually trigger transfer - # get the files with the group_id and print out - msgStr = "dummy_transfer_id = {}".format(preparatorCore.get_dummy_transfer_id()) - files = proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - files = preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpLog.debug(msgStr) - tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - -# sleep for 10 minutes 1 second - -msgStr = "Sleep for 601 seconds" -#msgStr = "Sleep for 181 seconds" -tmpLog.debug(msgStr) -#time.sleep(181) -time.sleep(601) -msgStr = "now check the jobs" -tmpLog.debug(msgStr) - -for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - tmpLog.debug(msgStr) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG 
{0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - -# sleep for 3 minutes - -msgStr = "Sleep for 180 seconds" -tmpLog.debug(msgStr) -time.sleep(180) -msgStr = "now check the jobs" -tmpLog.debug(msgStr) - -for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - tmpLog.debug(msgStr) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) diff --git a/pandaharvester/harvestertest/stageOutTest_globus.py b/pandaharvester/harvestertest/stageOutTest_globus.py deleted file mode 100644 index a844a896..00000000 --- a/pandaharvester/harvestertest/stageOutTest_globus.py +++ /dev/null @@ -1,124 +0,0 @@ -import sys -import os -import os.path -import hashlib -import uuid -import random -import string -import time -from pandaharvester.harvestercore.job_spec import JobSpec -from pandaharvester.harvestercore.file_spec import FileSpec -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper -from pandaharvester.harvestercore.plugin_factory import PluginFactory - -def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - -ueueName = 'ALCF_Theta' -job_id = 1111 -globus_sleep_time = 15 # seconds - -if len(sys.argv) > 1: - queueName = sys.argv[1] -if len(sys.argv) > 2: - job_id = int(sys.argv[2]) -if len(sys.argv) > 3: - globus_sleep_time = int(sys.argv[3]) - -queueConfigMapper = QueueConfigMapper() -queueConfig = queueConfigMapper.get_queue(queueName) -print queueConfig.stager -print queueConfig.stager['Globus_srcPath'] -print queueConfig.stager['srcEndpoint'] -print queueConfig.stager['Globus_dstPath'] -print queueConfig.stager['dstEndpoint'] -print queueConfig.stager['zipDir'] - -print "Initial queueConfig.stager = ",queueConfig.stager -queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_stager' -queueConfig.stager['name'] = 'GlobusStager' -print "Modified queueConfig.stager = ",queueConfig.stager - -scope = 'panda' - -fileSpec = FileSpec() -fileSpec.fileType = 'es_output' -fileSpec.lfn = 'panda.sgotest.' + uuid.uuid4().hex + '.gz' -fileSpec.fileAttributes = {} -assFileSpec = FileSpec() -assFileSpec.lfn = 'panda.sgotest.' 
+ uuid.uuid4().hex -assFileSpec.fileType = 'es_output' -assFileSpec.fsize = random.randint(10, 100) -# create source file -hash = hashlib.md5() -hash.update('%s:%s' % (scope, fileSpec.lfn)) -hash_hex = hash.hexdigest() -correctedscope = "/".join(scope.split('.')) -assFileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.stager['Globus_srcPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=assFileSpec.lfn) -if not os.path.exists(os.path.dirname(assFileSpec.path)): - print "os.makedirs({})".format(os.path.dirname(assFileSpec.path)) - os.makedirs(os.path.dirname(assFileSpec.path)) -oFile = open(assFileSpec.path, 'w') -oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) -oFile.close() -fileSpec.add_associated_file(assFileSpec) - -jobSpec = JobSpec() -jobSpec.jobParams = {'outFiles': fileSpec.lfn + ',log', - 'scopeOut': 'panda', - 'scopeLog': 'panda', - 'logFile': 'log', - 'realDatasets': 'panda.' + fileSpec.lfn, - 'ddmEndPointOut': 'BNL-OSG2_DATADISK', - } -jobSpec.computingSite = queueName -jobSpec.PandaID = job_id -jobSpec.add_out_file(fileSpec) - -print "file to transfer - {}".format(assFileSpec.path) -print "dump(jobSpec)" -#dump(jobSpec) - - -pluginFactory = PluginFactory() - -# get stage-out plugin -stagerCore = pluginFactory.get_plugin(queueConfig.stager) -print "plugin={0}".format(stagerCore.__class__.__name__) - -print "testing zip" -tmpStat, tmpOut = stagerCore.zip_output(jobSpec) -if tmpStat: - print " OK" -else: - print " NG {0}".format(tmpOut) - -print - - - -print "testing stage-out" -tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec) -if tmpStat: - print " OK " -else: - print " NG {0}".format(tmpOut) - sys.exit(1) - -print "sleep {0} seconds".format(globus_sleep_time) -time.sleep(globus_sleep_time) - -print "checking status for transfer" -tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) -if tmpStat: - print " OK" -else: - print " NG {0}".format(tmpOut) - - diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index ba50f361..9e0bb825 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.3.4" +release_version = "0.4.0" diff --git a/templates/panda/panda_harvester-uwsgi.ini.rpmnew.template b/templates/panda/panda_harvester-uwsgi.ini.rpmnew.template index eab2e16b..ad016951 100644 --- a/templates/panda/panda_harvester-uwsgi.ini.rpmnew.template +++ b/templates/panda/panda_harvester-uwsgi.ini.rpmnew.template @@ -23,6 +23,7 @@ processes = 2 threads = 4 worker-reload-mercy = 2 +reload-on-exception = true buffer = 32768 post-buffering = 32768
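For orientation, the submitter plugins deleted in this change (ARC, Google Cloud, OpenStack, Lancium) all share one contract: a PluginBase subclass whose constructor takes the plugin parameters as keyword arguments, and whose submit_workers(workspec_list) returns one (status, diagnostic) tuple per workspec, setting workspec.batchID on success. The sketch below is illustrative only and is not part of this diff; it uses only calls visible in the removed code (core_utils.setup_logger, PluginBase.__init__, self.make_logger), while the batchID value and error handling are assumptions.

# Illustrative sketch only -- not part of this diff. It mirrors the contract shared
# by the submitters removed above: PluginBase subclass, submit_workers returning one
# (bool, message) tuple per workspec, and workspec.batchID set on success.
# The pandaharvester imports are the project's own modules; anything beyond what the
# removed code shows (batchID format, error handling) is an assumption.
import uuid

from pandaharvester.harvestercore import core_utils
from pandaharvester.harvestercore.plugin_base import PluginBase

base_logger = core_utils.setup_logger("example_submitter")


class ExampleSubmitter(PluginBase):
    # constructor: harvester passes the plugin configuration as keyword arguments
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

    # submit workers: return one (status, diagnostic) tuple per workspec, in order
    def submit_workers(self, workspec_list):
        tmp_log = self.make_logger(base_logger, method_name="submit_workers")
        ret_list = []
        for workspec in workspec_list:
            try:
                # a real plugin would contact a CE, batch system or cloud API here;
                # the batch identifier below is a placeholder
                workspec.batchID = f"example-{uuid.uuid4().hex}"
                ret_list.append((True, ""))
            except Exception as exc:
                tmp_log.error(f"submission failed: {exc}")
                ret_list.append((False, str(exc)))
        tmp_log.debug("done")
        return ret_list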