From 587f3f550ddc32d2af95734daa26c029d654c080 Mon Sep 17 00:00:00 2001
From: mightqxc
Date: Fri, 29 Sep 2023 13:13:40 +0200
Subject: [PATCH 1/7] rename hook to avoid name conflict with pre-commit

---
 git_hooks/{pre-commit => commit_timestamp} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename git_hooks/{pre-commit => commit_timestamp} (100%)

diff --git a/git_hooks/pre-commit b/git_hooks/commit_timestamp
similarity index 100%
rename from git_hooks/pre-commit
rename to git_hooks/commit_timestamp

From 0e4585d35e3eca96e9cae2735564388bed668703 Mon Sep 17 00:00:00 2001
From: mightqxc
Date: Fri, 29 Sep 2023 14:14:10 +0200
Subject: [PATCH 2/7] pre-commit

---
 pyproject.toml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..2a7c4e82
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
+[tool.black]
+line-length=160
+
+[tool.autopep8]
+# https://pypi.org/project/autopep8/#pyproject-toml
+max_line_length = 160
+ignore = ["E203", "E501", "W6"]
+in-place = true
+recursive = true
+aggressive = 3
\ No newline at end of file

From c654ed4d37819fd5927baa13c1e2e620f4b880f7 Mon Sep 17 00:00:00 2001
From: mightqxc
Date: Fri, 29 Sep 2023 14:16:58 +0200
Subject: [PATCH 3/7] pre-commit exclude python 2 code

---
 .pre-commit-config.yaml | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 .pre-commit-config.yaml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..8fa7542e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+repos:
+- repo: https://github.com/pre-commit/mirrors-autopep8
+  rev: v2.0.4
+  hooks:
+  - id: autopep8
+    # args: ["--global-config", "package/pyproject.toml"]
+- repo: https://github.com/psf/black
+  rev: 23.9.1
+  hooks:
+  - id: black
+    # args: ["--config", "package/pyproject.toml"]
+# exclude python 2 code which cannot be dealt with black
+exclude: |
+  (?x)(
+    ^pandaharvester/harvestermonitor/arc_monitor.py|
+    ^pandaharvester/harvestermisc/arc_utils.py|
+    ^pandaharvester/harvesterpayload/simple_wrapper_mpi.py|
+    ^pandaharvester/harvestersubmitter/apfgrid_submitter.py|
+    ^pandaharvester/harvestertest/dumpTable.py|
+    ^pandaharvester/harvestertest/getQueuedata.py|
+    ^pandaharvester/harvestermessenger/arc_messenger.py|
+    ^pandaharvester/harvestersubmitter/arc_submitter.py|
+    ^pandaharvester/harvestertest/stageOutTest_globus.py|
+    ^pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py|
+    ^pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py|
+    ^&
+  )
\ No newline at end of file

From 2ad8d34c207116380b9b10af297f35e414628097 Mon Sep 17 00:00:00 2001
From: mightqxc
Date: Fri, 29 Sep 2023 14:38:23 +0200
Subject: [PATCH 4/7] formatted by autopep8 and black

---
 .pre-commit-config.yaml | 1 +
 examples/k8s/k8s_atlas_scheduler.py | 98 +-
 pandaharvester/harvesterbody/agent_base.py | 5 +-
 pandaharvester/harvesterbody/cacher.py | 72 +-
 .../harvesterbody/command_manager.py | 49 +-
 pandaharvester/harvesterbody/cred_manager.py | 105 +-
 pandaharvester/harvesterbody/event_feeder.py | 57 +-
 pandaharvester/harvesterbody/file_syncer.py | 64 +-
 pandaharvester/harvesterbody/job_fetcher.py | 63 +-
 pandaharvester/harvesterbody/master.py | 137 +-
 pandaharvester/harvesterbody/monitor.py | 522 ++--
 pandaharvester/harvesterbody/preparator.py | 209 +-
 pandaharvester/harvesterbody/propagator.py | 156 +-
 .../harvesterbody/service_monitor.py | 43 +-
 pandaharvester/harvesterbody/stager.py | 207 +-
 pandaharvester/harvesterbody/submitter.py | 295 +--
 pandaharvester/harvesterbody/sweeper.py | 132 +-
 pandaharvester/harvesterbody/watcher.py | 113 +-
 .../harvesterbody/worker_adjuster.py | 138 +-
 pandaharvester/harvesterbody/worker_maker.py | 16 +-
 .../harvestercloud/aws_unhealthy_nodes.py | 24 +-
 pandaharvester/harvestercloud/cernvm_aux.py | 6 +-
 .../harvestercloud/gke_unhealthy_nodes.py | 20 +-
 .../harvestercloud/google_startup_script.py | 12 +-
 pandaharvester/harvestercloud/googlecloud.py | 141 +-
 .../harvestercloud/pilots_starter.py | 252 +-
 .../base_communicator.py | 25 +-
 .../panda_communicator.py | 617 ++---
 .../harvesterconfig/harvester_config.py | 36 +-
 pandaharvester/harvestercore/cache_spec.py | 6 +-
 pandaharvester/harvestercore/command_spec.py | 33 +-
 .../harvestercore/communicator_pool.py | 13 +-
 pandaharvester/harvestercore/core_utils.py | 136 +-
 pandaharvester/harvestercore/db_interface.py | 8 +-
 pandaharvester/harvestercore/db_proxy.py | 2342 ++++++++---------
 pandaharvester/harvestercore/db_proxy_pool.py | 16 +-
 pandaharvester/harvestercore/diag_spec.py | 28 +-
 pandaharvester/harvestercore/event_spec.py | 30 +-
 pandaharvester/harvestercore/fifos.py | 165 +-
 pandaharvester/harvestercore/file_spec.py | 64 +-
 pandaharvester/harvestercore/job_spec.py | 385 +--
 .../harvestercore/job_worker_relation_spec.py | 9 +-
 .../harvestercore/panda_queue_spec.py | 44 +-
 pandaharvester/harvestercore/pilot_errors.py | 66 +-
 pandaharvester/harvestercore/plugin_base.py | 2 +-
 .../harvestercore/plugin_factory.py | 26 +-
 .../harvestercore/process_lock_spec.py | 5 +-
 .../harvestercore/queue_config_dump_spec.py | 21 +-
 .../harvestercore/queue_config_mapper.py | 333 ++-
 .../harvestercore/resource_type_mapper.py | 27 +-
 .../harvestercore/seq_number_spec.py | 7 +-
 .../harvestercore/service_metrics_spec.py | 12 +-
 pandaharvester/harvestercore/spec_base.py | 32 +-
 pandaharvester/harvestercore/work_spec.py | 291 +-
 pandaharvester/harvestercore/worker_errors.py | 10 +-
 .../arcproxy_cred_manager.py | 30 +-
 .../harvestercredmanager/base_cred_manager.py | 3 +-
 .../dummy_cred_manager.py | 3 +-
 .../harvestercredmanager/grid_cred_manager.py | 25 +-
 .../iam_token_cred_manager.py | 122 +-
 .../k8s_secret_cred_manager.py | 34 +-
 .../lancium_cred_manager.py | 34 +-
 .../no_voms_cred_manager.py | 61 +-
 .../proxy_cache_cred_manager.py | 17 +-
 .../harvesterextractor/aux_extractor.py | 22 +-
 .../harvesterextractor/base_extractor.py | 6 +-
 pandaharvester/harvesterfifo/mysql_fifo.py | 293 +--
 pandaharvester/harvesterfifo/redis_fifo.py | 70 +-
 pandaharvester/harvesterfifo/sqlite_fifo.py | 165 +-
 .../harvesterfilesyncer/base_file_syncer.py | 3 +-
 .../harvesterfilesyncer/git_file_syncer.py | 103 +-
 .../harvestermessenger/act_messenger.py | 80 +-
 .../harvestermessenger/apache_messenger.py | 26 +-
 .../harvestermessenger/base_messenger.py | 2 +-
 .../http_server_messenger.py | 128 +-
 .../harvestermessenger/k8s_messenger.py | 22 +-
 .../shared_file_messenger.py | 401 ++-
 .../harvestermiddleware/direct_ssh_bot.py | 31 +-
 .../harvestermiddleware/direct_ssh_herder.py | 106 +-
 pandaharvester/harvestermiddleware/rpc_bot.py | 41 +-
 .../harvestermiddleware/rpc_herder.py | 200 +-
 .../harvestermiddleware/ssh_master_pool.py | 121 +-
 .../harvestermiddleware/ssh_tunnel_pool.py | 98 +-
 pandaharvester/harvestermisc/apfmon.py | 299 ++-
 pandaharvester/harvestermisc/arc_parser.py | 290 +-
 .../harvestermisc/cloud_openstack_utils.py | 23 +-
 .../harvestermisc/frontend_utils.py | 19 +-
 pandaharvester/harvestermisc/gitlab_utils.py | 5 +-
 pandaharvester/harvestermisc/globus_utils.py | 121 +-
 .../harvestermisc/htcondor_utils.py | 440 ++--
 pandaharvester/harvestermisc/idds_utils.py | 13 +-
 pandaharvester/harvestermisc/info_utils.py | 51 +-
 .../harvestermisc/info_utils_k8s.py | 91 +-
 pandaharvester/harvestermisc/k8s_utils.py | 444 ++--
 pandaharvester/harvestermisc/lancium_utils.py | 187 +-
 pandaharvester/harvestermisc/rucio_utils.py | 212 +-
 pandaharvester/harvestermisc/selfcheck.py | 16 +-
 pandaharvester/harvestermisc/titan_utils.py | 98 +-
 pandaharvester/harvestermisc/token_utils.py | 17 +-
 .../harvestermonitor/act_monitor.py | 49 +-
 .../harvestermonitor/apfgrid_monitor.py | 98 +-
 .../harvestermonitor/cloud_google_monitor.py | 49 +-
 .../cloud_openstack_monitor.py | 76 +-
 .../harvestermonitor/cobalt_monitor.py | 126 +-
 .../harvestermonitor/dummy_mcore_monitor.py | 21 +-
 .../harvestermonitor/dummy_monitor.py | 17 +-
 .../harvestermonitor/gitlab_monitor.py | 35 +-
 .../globus_compute_monitor.py | 128 +-
 .../harvestermonitor/htcondor_monitor.py | 206 +-
 .../harvestermonitor/k8s_monitor.py | 150 +-
 .../harvestermonitor/lancium_monitor.py | 127 +-
 .../harvestermonitor/lsf_monitor.py | 50 +-
 .../harvestermonitor/pbs_monitor.py | 38 +-
 .../harvestermonitor/saga_monitor.py | 75 +-
 .../harvestermonitor/slurm_bulk_monitor.py | 80 +-
 .../harvestermonitor/slurm_monitor.py | 36 +-
 .../harvestermonitor/slurm_squeue_monitor.py | 57 +-
 pandaharvester/harvestermover/mover_utils.py | 10 +-
 .../analysis_aux_preparator.py | 164 +-
 .../harvesterpreparator/aux_preparator.py | 6 +-
 .../dummy_bulk_preparator.py | 34 +-
 .../harvesterpreparator/dummy_preparator.py | 10 +-
 .../harvesterpreparator/go_bulk_preparator.py | 386 +--
 .../harvesterpreparator/go_preparator.py | 191 +-
 .../harvesterpreparator/gridftp_preparator.py | 58 +-
 .../pilotmover_mt_preparator.py | 88 +-
 .../pilotmover_mt_preparator_kari.py | 99 +-
 .../pilotmover_preparator.py | 65 +-
 .../rse_direct_preparator.py | 10 +-
 .../harvesterpreparator/rucio_preparator.py | 65 +-
 .../harvesterpreparator/xrdcp_preparator.py | 69 +-
 .../harvesterscripts/file_operation.py | 80 +-
 .../harvesterscripts/harvester_admin.py | 395 +--
 pandaharvester/harvesterscripts/prescript.py | 14 +-
 .../harvesterscripts/queue_config_tool.py | 33 +-
 .../harvesterscripts/remote_install.py | 121 +-
 pandaharvester/harvesterstager/act_stager.py | 70 +-
 .../harvesterstager/dummy_bulk_stager.py | 38 +-
 .../harvesterstager/dummy_stager.py | 30 +-
 pandaharvester/harvesterstager/fts_stager.py | 126 +-
 .../harvesterstager/go_bulk_stager.py | 372 ++-
 .../harvesterstager/go_rucio_stager.py | 200 +-
 pandaharvester/harvesterstager/go_stager.py | 225 +-
 .../harvesterstager/gridftp_stager.py | 80 +-
 .../harvesterstager/rse_direct_stager.py | 13 +-
 .../rucio_rse_direct_stager.py | 215 +-
 .../harvesterstager/rucio_stager.py | 120 +-
 .../harvesterstager/rucio_stager_hpc.py | 157 +-
 .../rucio_stager_hpc_minikui.py | 177 +-
 .../harvesterstager/xrdcp_stager.py | 103 +-
 .../harvesterstager/yoda_rse_direct_stager.py | 124 +-
 .../yoda_rucio_rse_direct_stager.py | 452 ++--
 .../harvestersubmitter/act_submitter.py | 86 +-
 .../cloud_google_submitter.py | 55 +-
 .../cloud_openstack_submitter.py | 64 +-
 .../harvestersubmitter/cobalt_submitter.py | 53 +-
 .../dummy_mcore_submitter.py | 46 +-
 .../dummy_singleton_submitter.py | 4 +-
 .../harvestersubmitter/dummy_submitter.py | 40 +-
 .../harvestersubmitter/gitlab_submitter.py | 33 +-
 .../globus_compute_submitter.py | 54 +-
 .../harvestersubmitter/htcondor_submitter.py | 618 ++---
 .../harvestersubmitter/k8s_submitter.py | 73 +-
 .../harvestersubmitter/lancium_submitter.py | 197 +-
 .../harvestersubmitter/lsf_submitter.py | 87 +-
 .../harvestersubmitter/pbs_submitter.py | 62 +-
 .../harvestersubmitter/saga_submitter.py | 68 +-
 .../harvestersubmitter/slurm_submitter.py | 90 +-
 .../slurm_submitter_jinja.py | 76 +-
 .../harvestersubmitter/submitter_common.py | 239 +-
 .../harvestersweeper/act_sweeper.py | 40 +-
 .../harvestersweeper/apfgrid_sweeper.py | 53 +-
 .../harvestersweeper/arc_sweeper.py | 64 +-
 .../harvestersweeper/cloud_google_sweeper.py | 18 +-
 .../cloud_openstack_sweeper.py | 20 +-
 .../harvestersweeper/cobalt_sweeper.py | 61 +-
 .../harvestersweeper/dummy_sweeper.py | 6 +-
 .../harvestersweeper/gitlab_sweeper.py | 29 +-
 .../harvestersweeper/htcondor_sweeper.py | 44 +-
 .../harvestersweeper/k8s_sweeper.py | 25 +-
 .../harvestersweeper/lancium_sweeper.py | 29 +-
 .../harvestersweeper/lsf_sweeper.py | 23 +-
 .../harvestersweeper/pbs_sweeper.py | 23 +-
 .../harvestersweeper/saga_sweeper.py | 38 +-
 .../harvestersweeper/slurm_sweeper.py | 21 +-
 pandaharvester/harvestertest/basicTest.py | 7 +-
 pandaharvester/harvestertest/cacherTest.py | 5 +-
 .../harvestertest/check_log_db_proxy_pool.py | 28 +-
 pandaharvester/harvestertest/cleanDN.py | 52 +-
 .../container_auxpreparator_test.py | 8 +-
 .../harvestertest/credMangerTest.py | 41 +-
 .../harvestertest/encryptForWatcher.py | 14 +-
 ...further_testing_go_bulk_preparator-test.py | 177 +-
 .../further_testing_go_bulk_preparator.py | 137 +-
 .../further_testing_go_bulk_stager.py | 137 +-
 .../harvestertest/getEventRangesTest.py | 6 +-
 pandaharvester/harvestertest/getEvents.py | 8 +-
 pandaharvester/harvestertest/getJob.py | 8 +-
 pandaharvester/harvestertest/getJobs.py | 15 +-
 .../harvestertest/k8s_node_occupancy.py | 24 +-
 pandaharvester/harvestertest/lancium/clean.py | 2 +-
 .../harvestertest/lancium/constants.py | 12 +-
 .../harvestertest/lancium/file_uploads.py | 7 +-
 .../harvestertest/lancium/submit.py | 82 +-
 .../harvestertest/monitorFifoTest.py | 27 +-
 .../read_shared_file_messenger_files.py | 30 +-
 pandaharvester/harvestertest/renice.py | 24 +-
 pandaharvester/harvestertest/sshTunnelTest.py | 50 +-
 pandaharvester/harvestertest/stageInTest.py | 45 +-
 .../harvestertest/stageInTest_GlobusOnline.py | 53 +-
 .../harvestertest/stageInTest_dpb.py | 45 +-
 .../harvestertest/stageInTest_globus.py | 383 ++-
 pandaharvester/harvestertest/stageOutTest.py | 63 +-
 .../stageOutTest_go_bulk_stager.py | 373 ++-
 pandaharvester/harvestertest/submitterTest.py | 56 +-
 .../harvestertest/testCommunication.py | 6 +-
 .../harvestertest/updateEventRangesTest.py | 15 +-
 pandaharvester/harvestertest/updateEvents.py | 22 +-
 pandaharvester/harvestertest/updateJob.py | 4 +-
 pandaharvester/harvestertest/watcherTest.py | 3 +-
 .../harvestertest/worker_pandajob_dump.py | 216 +-
 .../harvesterthrottler/simple_throttler.py | 53 +-
 .../dummy_dynamic_worker_maker.py | 11 +-
 .../multijob_worker_maker.py | 19 +-
 .../multinode_worker_maker.py | 32 +-
 .../simple_bf_es_worker_maker.py | 153 +-
 .../simple_worker_maker.py | 134 +-
 pandaharvester/harvesterzipper/base_zipper.py | 398 ++-
 .../harvesterzipper/dummy_zipper.py | 28 +-
 .../harvesterzipper/simple_zipper.py | 10 +-
 pandaharvester/harvesterzipper/ssh_zipper.py | 10 +-
 setup.py | 117 +-
 232 files changed, 11580 insertions(+), 12126 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8fa7542e..987bee82 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,5 +23,6 @@ exclude: |
^pandaharvester/harvestertest/stageOutTest_globus.py| ^pandaharvester/harvestertest/stageInTest_go_bulk_preparator.py| ^pandaharvester/harvesterpayload/ATLAS_simple_wrapper_mpi.py| + ^pandaharvester/harvestercloud/google_startup_script.py| ^& ) \ No newline at end of file diff --git a/examples/k8s/k8s_atlas_scheduler.py b/examples/k8s/k8s_atlas_scheduler.py index 58b0f2d1..ea9431b1 100644 --- a/examples/k8s/k8s_atlas_scheduler.py +++ b/examples/k8s/k8s_atlas_scheduler.py @@ -8,67 +8,66 @@ from kubernetes import client, config, watch -config.load_kube_config(config_file=os.environ.get('KUBECONFIG')) +config.load_kube_config(config_file=os.environ.get("KUBECONFIG")) corev1 = client.CoreV1Api() -scheduler_name = 'atlas_scheduler' +scheduler_name = "atlas_scheduler" def node_allocatable_map(node_status_allocatable): - cpu_str = node_status_allocatable['cpu'] - memory_str = node_status_allocatable['memory'] - mCpu = int(cpu_str)*1000 - _m = re.match('^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$', memory_str) + cpu_str = node_status_allocatable["cpu"] + memory_str = node_status_allocatable["memory"] + mCpu = int(cpu_str) * 1000 + _m = re.match("^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$", memory_str) if _m is None: - print('No memory allocatable in node') + print("No memory allocatable in node") memoryKB = 0 - elif 'M' in _m.group(2): - memoryKB = int(float(_m.group(1))*2**10) - elif 'G' in _m.group(2): - memoryKB = int(float(_m.group(1))*2**20) + elif "M" in _m.group(2): + memoryKB = int(float(_m.group(1)) * 2**10) + elif "G" in _m.group(2): + memoryKB = int(float(_m.group(1)) * 2**20) else: memoryKB = int(float(_m.group(1))) - return {'mCpu': mCpu, 'memoryKB': memoryKB} + return {"mCpu": mCpu, "memoryKB": memoryKB} def get_mcpu(containers): mcpu_req = 0 for c in containers: - if hasattr(c.resources, 'requests'): - mcpu_req_str = c.resources.requests['cpu'] - elif hasattr(c.resources, 'limits'): - mcpu_req_str = c.resources.limits['cpu'] + if hasattr(c.resources, "requests"): + mcpu_req_str = c.resources.requests["cpu"] + elif hasattr(c.resources, "limits"): + mcpu_req_str = c.resources.limits["cpu"] else: - mcpu_req_str = '' - _m = re.match('^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$', mcpu_req_str) + mcpu_req_str = "" + _m = re.match("^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$", mcpu_req_str) if _m is None: - print('No cpu reources requests or limits specified') + print("No cpu reources requests or limits specified") mcpu_req += 999 - elif _m.group(2) == '': - mcpu_req += int(float(_m.group(1))*1000) - elif _m.group(2) == 'm': + elif _m.group(2) == "": + mcpu_req += int(float(_m.group(1)) * 1000) + elif _m.group(2) == "m": mcpu_req += int(float(_m.group(1))) else: - print('Invalid cpu reources requests or limits specified') + print("Invalid cpu reources requests or limits specified") mcpu_req += 999 return mcpu_req -def get_allocated_resources(namespace='default'): +def get_allocated_resources(namespace="default"): node_allocated_resources_map = {} - ret = corev1.list_namespaced_pod(namespace=namespace, - field_selector='status.phase!=Succeeded,status.phase!=Failed') + ret = corev1.list_namespaced_pod(namespace=namespace, field_selector="status.phase!=Succeeded,status.phase!=Failed") for i in ret.items: # pod_info = {} # pod_info['name'] = i.metadata.name # pod_info['status'] = i.status.phase # pod_info['status_reason'] = i.status.conditions[0].reason if i.status.conditions else None # pod_info['status_message'] = i.status.conditions[0].message if i.status.conditions else None - 
nodeName = getattr(i.spec, 'node_name', None) + nodeName = getattr(i.spec, "node_name", None) if nodeName is None: continue node_allocated_resources_map.setdefault(nodeName, {}) - node_allocated_resources_map[nodeName].setdefault('mCpu', 0) - node_allocated_resources_map[nodeName]['mCpu'] += get_mcpu(i.spec.containers) + node_allocated_resources_map[nodeName].setdefault("mCpu", 0) + node_allocated_resources_map[nodeName]["mCpu"] += get_mcpu(i.spec.containers) return node_allocated_resources_map @@ -78,21 +77,20 @@ def nodes_available(): for node in corev1.list_node().items: node_name = node.metadata.name for status in node.status.conditions: - if status.status == 'True' and status.type == 'Ready': + if status.status == "True" and status.type == "Ready": node_allocatable_dict = node_allocatable_map(node.status.allocatable) - mcpu_available = node_allocatable_dict['mCpu'] \ - - allocated_resources_map.get(node_name, {'mCpu': 0})['mCpu'] - ready_nodes.append({'name': node_name, 'mCpu': mcpu_available}) - ready_nodes = sorted(ready_nodes, key=(lambda x: x['mCpu'])) + mcpu_available = node_allocatable_dict["mCpu"] - allocated_resources_map.get(node_name, {"mCpu": 0})["mCpu"] + ready_nodes.append({"name": node_name, "mCpu": mcpu_available}) + ready_nodes = sorted(ready_nodes, key=(lambda x: x["mCpu"])) return ready_nodes -def scheduler(name, node, namespace='default'): +def scheduler(name, node, namespace="default"): target = client.V1ObjectReference() - target.kind = 'Node' - target.apiVersion = 'corev1' + target.kind = "Node" + target.apiVersion = "corev1" target.name = node - print('target', target) + print("target", target) meta = client.V1ObjectMeta() meta.name = name body = client.V1Binding(metadata=meta, target=target) @@ -102,28 +100,26 @@ def scheduler(name, node, namespace='default'): def main(): w = watch.Watch() while True: - for event in w.stream(corev1.list_namespaced_pod, 'default', timeout_seconds=30): - pod = event['object'] - if pod.status.phase == 'Pending' and not pod.spec.node_name \ - and pod.spec.scheduler_name == scheduler_name: + for event in w.stream(corev1.list_namespaced_pod, "default", timeout_seconds=30): + pod = event["object"] + if pod.status.phase == "Pending" and not pod.spec.node_name and pod.spec.scheduler_name == scheduler_name: for node_info in nodes_available(): pod_mcpu_req = get_mcpu(pod.spec.containers) - node_mcpu_free = node_info['mCpu'] - to_bind = (pod_mcpu_req <= node_mcpu_free) - print('Node {0} has {1} mcpu ; pod requests {2} mcpu ; to_bind: {3}'.format( - node_info['name'], node_mcpu_free, pod_mcpu_req, to_bind)) + node_mcpu_free = node_info["mCpu"] + to_bind = pod_mcpu_req <= node_mcpu_free + print("Node {0} has {1} mcpu ; pod requests {2} mcpu ; to_bind: {3}".format(node_info["name"], node_mcpu_free, pod_mcpu_req, to_bind)) if to_bind: try: - print('Scheduling ' + pod.metadata.name) - res = scheduler(pod.metadata.name, node_info['name']) + print("Scheduling " + pod.metadata.name) + res = scheduler(pod.metadata.name, node_info["name"]) except ValueError as e: - print('ValueError (maybe harmless):', e) + print("ValueError (maybe harmless):", e) except client.rest.ApiException as e: - print(json.loads(e.body)['message']) + print(json.loads(e.body)["message"]) finally: break time.sleep(2**-4) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pandaharvester/harvesterbody/agent_base.py b/pandaharvester/harvesterbody/agent_base.py index e7786e97..9ffb6488 100644 --- a/pandaharvester/harvesterbody/agent_base.py +++ 
b/pandaharvester/harvesterbody/agent_base.py @@ -7,7 +7,6 @@ # base class for agents class AgentBase(threading.Thread): - # constructor def __init__(self, single_mode): threading.Thread.__init__(self) @@ -30,11 +29,11 @@ def terminated(self, wait_interval, randomize=True): # get process identifier def get_pid(self): thread_id = self.ident if self.ident else 0 - return '{0}_{1}-{2}'.format(self.hostname, self.os_pid, format(thread_id, 'x')) + return "{0}_{1}-{2}".format(self.hostname, self.os_pid, format(thread_id, "x")) # make logger def make_logger(self, base_log, token=None, method_name=None, send_dialog=True): - if send_dialog and hasattr(self, 'dbInterface'): + if send_dialog and hasattr(self, "dbInterface"): hook = self.dbInterface else: hook = None diff --git a/pandaharvester/harvesterbody/cacher.py b/pandaharvester/harvesterbody/cacher.py index f7ed9341..4605f9c6 100644 --- a/pandaharvester/harvesterbody/cacher.py +++ b/pandaharvester/harvesterbody/cacher.py @@ -14,7 +14,7 @@ from pandaharvester.harvesterbody.agent_base import AgentBase # logger -_logger = core_utils.setup_logger('cacher') +_logger = core_utils.setup_logger("cacher") # cache information @@ -36,79 +36,79 @@ def run(self): # main def execute(self, force_update=False, skip_lock=False, n_thread=0): - mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='execute') + mainLog = self.make_logger(_logger, "id={0}".format(self.get_pid()), method_name="execute") # get lock - locked = self.dbProxy.get_process_lock('cacher', self.get_pid(), harvester_config.cacher.sleepTime) + locked = self.dbProxy.get_process_lock("cacher", self.get_pid(), harvester_config.cacher.sleepTime) if locked or skip_lock: - mainLog.debug('getting information') - timeLimit = datetime.datetime.utcnow() - \ - datetime.timedelta(minutes=harvester_config.cacher.refreshInterval) + mainLog.debug("getting information") + timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=harvester_config.cacher.refreshInterval) itemsList = [] nItems = 4 for tmpStr in harvester_config.cacher.data: - tmpItems = tmpStr.split('|') + tmpItems = tmpStr.split("|") if len(tmpItems) < 3: continue tmpItems += [None] * (nItems - len(tmpItems)) tmpItems = tmpItems[:nItems] itemsList.append(tmpItems) # refresh cache function + def _refresh_cache(inputs): mainKey, subKey, infoURL, dumpFile = inputs - if subKey == '': + if subKey == "": subKey = None # check last update time lastUpdateTime = self.dbProxy.get_cache_last_update_time(mainKey, subKey) - if (not force_update) and lastUpdateTime is not None \ - and lastUpdateTime > timeLimit: + if (not force_update) and lastUpdateTime is not None and lastUpdateTime > timeLimit: return # get information tmpStat, newInfo = self.get_data(infoURL, mainLog) if not tmpStat: - mainLog.error('failed to get info for key={0} subKey={1}'.format(mainKey, subKey)) + mainLog.error("failed to get info for key={0} subKey={1}".format(mainKey, subKey)) return # update tmpStat = self.dbProxy.refresh_cache(mainKey, subKey, newInfo) if tmpStat: - mainLog.debug('refreshed key={0} subKey={1}'.format(mainKey, subKey)) + mainLog.debug("refreshed key={0} subKey={1}".format(mainKey, subKey)) if dumpFile is not None: try: - tmpFileName = dumpFile + '.tmp' - with open(tmpFileName, 'w') as tmpFile: + tmpFileName = dumpFile + ".tmp" + with open(tmpFileName, "w") as tmpFile: json.dump(newInfo, tmpFile) shutil.move(tmpFileName, dumpFile) except Exception: core_utils.dump_error_message(mainLog) else: - mainLog.error('failed to refresh 
key={0} subKey={1} due to a DB error'.format(mainKey, subKey)) + mainLog.error("failed to refresh key={0} subKey={1} due to a DB error".format(mainKey, subKey)) + # loop over all items if n_thread: - mainLog.debug('refresh cache with {0} threads'.format(n_thread)) + mainLog.debug("refresh cache with {0} threads".format(n_thread)) with ThreadPoolExecutor(n_thread) as thread_pool: thread_pool.map(_refresh_cache, itemsList) else: - mainLog.debug('refresh cache') + mainLog.debug("refresh cache") for inputs in itemsList: _refresh_cache(inputs) - mainLog.debug('done') - + mainLog.debug("done") # get new data + def get_data(self, info_url, tmp_log): retStat = False retVal = None # resolve env variable - match = re.search(r'\$\{*([^\}]+)\}*', info_url) + match = re.search(r"\$\{*([^\}]+)\}*", info_url) if match: var_name = match.group(1) if var_name not in os.environ: - errMsg = 'undefined environment variable: {}'.format(var_name) + errMsg = "undefined environment variable: {}".format(var_name) tmp_log.error(errMsg) else: info_url = os.environ[var_name] - if info_url.startswith('file:'): + if info_url.startswith("file:"): try: - with open(info_url.split(':')[-1], 'r') as infoFile: + with open(info_url.split(":")[-1], "r") as infoFile: retVal = infoFile.read() try: retVal = json.loads(retVal) @@ -116,23 +116,23 @@ def get_data(self, info_url, tmp_log): pass except Exception: core_utils.dump_error_message(tmp_log) - elif info_url.startswith('http:'): + elif info_url.startswith("http:"): try: res = requests.get(info_url, timeout=60) if res.status_code == 200: try: retVal = res.json() except Exception: - errMsg = 'corrupted json from {0} : {1}'.format(info_url, res.text) + errMsg = "corrupted json from {0} : {1}".format(info_url, res.text) tmp_log.error(errMsg) else: - errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(info_url, res.status_code, res.text) + errMsg = "failed to get {0} with StatusCode={1} {2}".format(info_url, res.status_code, res.text) tmp_log.error(errMsg) except requests.exceptions.ReadTimeout: - tmp_log.error('read timeout when getting data from {0}'.format(info_url)) + tmp_log.error("read timeout when getting data from {0}".format(info_url)) except Exception: core_utils.dump_error_message(tmp_log) - elif info_url.startswith('https:'): + elif info_url.startswith("https:"): try: try: # try with pandacon certificate @@ -148,7 +148,7 @@ def get_data(self, info_url, tmp_log): # try without certificate res = requests.get(info_url, timeout=60) except requests.exceptions.ReadTimeout: - tmp_log.error('read timeout when getting data from {0}'.format(info_url)) + tmp_log.error("read timeout when getting data from {0}".format(info_url)) except Exception: core_utils.dump_error_message(tmp_log) else: @@ -156,22 +156,22 @@ def get_data(self, info_url, tmp_log): try: retVal = res.json() except Exception: - errMsg = 'corrupted json from {0} : {1}'.format(info_url, res.text) + errMsg = "corrupted json from {0} : {1}".format(info_url, res.text) tmp_log.error(errMsg) else: - errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(info_url, res.status_code, res.text) + errMsg = "failed to get {0} with StatusCode={1} {2}".format(info_url, res.status_code, res.text) tmp_log.error(errMsg) - elif info_url.startswith('panda_cache:'): + elif info_url.startswith("panda_cache:"): try: - publicKey, privateKey = info_url.split(':')[-1].split('&') + publicKey, privateKey = info_url.split(":")[-1].split("&") retVal, outStr = self.communicator.get_key_pair(publicKey, privateKey) if retVal is None: 
tmp_log.error(outStr) except Exception: core_utils.dump_error_message(tmp_log) - elif info_url.startswith('panda_server:'): + elif info_url.startswith("panda_server:"): try: - method_name = info_url.split(':')[-1] + method_name = info_url.split(":")[-1] method_function = getattr(self.communicator, method_name) retVal, outStr = method_function() if not retVal: @@ -179,7 +179,7 @@ def get_data(self, info_url, tmp_log): except Exception: core_utils.dump_error_message(tmp_log) else: - errMsg = 'unsupported protocol for {0}'.format(info_url) + errMsg = "unsupported protocol for {0}".format(info_url) tmp_log.error(errMsg) if retVal is not None: retStat = True diff --git a/pandaharvester/harvesterbody/command_manager.py b/pandaharvester/harvesterbody/command_manager.py index c6e65050..ca971e8b 100644 --- a/pandaharvester/harvesterbody/command_manager.py +++ b/pandaharvester/harvesterbody/command_manager.py @@ -11,7 +11,7 @@ # logger -_logger = core_utils.setup_logger('command_manager') +_logger = core_utils.setup_logger("command_manager") # class to retrieve commands from panda server @@ -49,66 +49,57 @@ def run(self): """ main """ - main_log = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') + main_log = self.make_logger(_logger, "id={0}".format(self.get_pid()), method_name="run") bulk_size = harvester_config.commandmanager.commands_bulk_size - locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(), - harvester_config.commandmanager.sleepTime) + locked = self.db_proxy.get_process_lock("commandmanager", self.get_pid(), harvester_config.commandmanager.sleepTime) if locked: # send command list to be received siteNames = set() commandList = [] for queueName, queueConfig in iteritems(self.queueConfigMapper.get_active_queues()): - if queueConfig is None or queueConfig.runMode != 'slave': + if queueConfig is None or queueConfig.runMode != "slave": continue # one command for all queues in one site if queueConfig.siteName not in siteNames: - commandItem = {'command': CommandSpec.COM_reportWorkerStats, - 'computingSite': queueConfig.siteName, - 'resourceType': queueConfig.resourceType - } + commandItem = { + "command": CommandSpec.COM_reportWorkerStats, + "computingSite": queueConfig.siteName, + "resourceType": queueConfig.resourceType, + } commandList.append(commandItem) siteNames.add(queueConfig.siteName) # one command for each queue - commandItem = {'command': CommandSpec.COM_setNWorkers, - 'computingSite': queueConfig.siteName, - 'resourceType': queueConfig.resourceType - } + commandItem = {"command": CommandSpec.COM_setNWorkers, "computingSite": queueConfig.siteName, "resourceType": queueConfig.resourceType} commandList.append(commandItem) - data = {'startTime': datetime.datetime.utcnow(), - 'sw_version': panda_pkg_info.release_version, - 'commit_stamp': commit_timestamp.timestamp} + data = {"startTime": datetime.datetime.utcnow(), "sw_version": panda_pkg_info.release_version, "commit_stamp": commit_timestamp.timestamp} if len(commandList) > 0: - main_log.debug('sending command list to receive') - data['commands'] = commandList + main_log.debug("sending command list to receive") + data["commands"] = commandList self.communicator.is_alive(data) # main loop while True: # get lock - locked = self.db_proxy.get_process_lock('commandmanager', self.get_pid(), - harvester_config.commandmanager.sleepTime) + locked = self.db_proxy.get_process_lock("commandmanager", self.get_pid(), harvester_config.commandmanager.sleepTime) if locked or self.singleMode: - - 
main_log.debug('polling commands loop') + main_log.debug("polling commands loop") # send heartbeat - if self.lastHeartbeat is None \ - or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): + if self.lastHeartbeat is None or self.lastHeartbeat < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): self.lastHeartbeat = datetime.datetime.utcnow() self.communicator.is_alive({}) continuous_loop = True # as long as there are commands, retrieve them while continuous_loop: - # get commands from panda server for this harvester instance commands = self.communicator.get_commands(bulk_size) - main_log.debug('got {0} commands (bulk size: {1})'.format(len(commands), bulk_size)) + main_log.debug("got {0} commands (bulk size: {1})".format(len(commands), bulk_size)) command_specs = self.convert_to_command_specs(commands) # cache commands in internal DB self.db_proxy.store_commands(command_specs) - main_log.debug('cached {0} commands in internal DB'.format(len(command_specs))) + main_log.debug("cached {0} commands in internal DB".format(len(command_specs))) # retrieve processed commands from harvester cache command_ids_ack = self.db_proxy.get_commands_ack() @@ -116,7 +107,7 @@ def run(self): for shard in core_utils.create_shards(command_ids_ack, bulk_size): # post acknowledgements to panda server self.communicator.ack_commands(shard) - main_log.debug('acknowledged {0} commands to panda server'.format(len(shard))) + main_log.debug("acknowledged {0} commands to panda server".format(len(shard))) # clean acknowledged commands self.db_proxy.clean_commands_by_id(shard) @@ -130,5 +121,5 @@ def run(self): # check if being terminated if self.terminated(harvester_config.commandmanager.sleepTime, randomize=False): - main_log.debug('terminated') + main_log.debug("terminated") return diff --git a/pandaharvester/harvesterbody/cred_manager.py b/pandaharvester/harvesterbody/cred_manager.py index 8b4b4078..cd3c4111 100644 --- a/pandaharvester/harvesterbody/cred_manager.py +++ b/pandaharvester/harvesterbody/cred_manager.py @@ -8,12 +8,11 @@ from pandaharvester.harvesterbody.agent_base import AgentBase # logger -_logger = core_utils.setup_logger('cred_manager') +_logger = core_utils.setup_logger("cred_manager") # credential manager class CredManager(AgentBase): - # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) @@ -38,103 +37,95 @@ def get_list(self, data): # get plugin cores from harvester config def get_cores_from_harvester_config(self): # get module and class names - if hasattr(harvester_config.credmanager, 'moduleName'): + if hasattr(harvester_config.credmanager, "moduleName"): moduleNames = self.get_list(harvester_config.credmanager.moduleName) else: moduleNames = [] - if hasattr(harvester_config.credmanager, 'className'): + if hasattr(harvester_config.credmanager, "className"): classNames = self.get_list(harvester_config.credmanager.className) else: classNames = [] # file names of original certificates - if hasattr(harvester_config.credmanager, 'inCertFile'): + if hasattr(harvester_config.credmanager, "inCertFile"): inCertFiles = self.get_list(harvester_config.credmanager.inCertFile) - elif hasattr(harvester_config.credmanager, 'certFile'): + elif hasattr(harvester_config.credmanager, "certFile"): inCertFiles = self.get_list(harvester_config.credmanager.certFile) else: inCertFiles = [] # file names of certificates to be generated - if hasattr(harvester_config.credmanager, 'outCertFile'): + if 
hasattr(harvester_config.credmanager, "outCertFile"): outCertFiles = self.get_list(harvester_config.credmanager.outCertFile) else: # use the file name of the certificate for panda connection as output name outCertFiles = self.get_list(harvester_config.pandacon.cert_file) # VOMS - if hasattr(harvester_config.credmanager, 'voms'): + if hasattr(harvester_config.credmanager, "voms"): vomses = self.get_list(harvester_config.credmanager.voms) else: vomses = [] # direct and merged plugin configuration in json - if hasattr(harvester_config.credmanager, 'pluginConfigs'): + if hasattr(harvester_config.credmanager, "pluginConfigs"): pluginConfigs = harvester_config.credmanager.pluginConfigs else: pluginConfigs = [] # from traditional attributes - for moduleName, className, inCertFile, outCertFile, voms in \ - zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses): + for moduleName, className, inCertFile, outCertFile, voms in zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses): plugin_params = {} - plugin_params['module'] = moduleName - plugin_params['name'] = className - plugin_params['inCertFile'] = inCertFile - plugin_params['outCertFile'] = outCertFile - plugin_params['voms'] = voms + plugin_params["module"] = moduleName + plugin_params["name"] = className + plugin_params["inCertFile"] = inCertFile + plugin_params["outCertFile"] = outCertFile + plugin_params["voms"] = voms try: exe_core = self.pluginFactory.get_plugin(plugin_params) self.exe_cores.append(exe_core) except Exception: - _logger.error('failed to launch credmanager with traditional attributes for {0}'.format(plugin_params)) + _logger.error("failed to launch credmanager with traditional attributes for {0}".format(plugin_params)) core_utils.dump_error_message(_logger) # from pluginConfigs for pc in pluginConfigs: try: - setup_maps = pc['configs'] + setup_maps = pc["configs"] for setup_name, setup_map in setup_maps.items(): try: - plugin_params = {'module': pc['module'], - 'name': pc['name'], - 'setup_name': setup_name} + plugin_params = {"module": pc["module"], "name": pc["name"], "setup_name": setup_name} plugin_params.update(setup_map) exe_core = self.pluginFactory.get_plugin(plugin_params) self.exe_cores.append(exe_core) except Exception: - _logger.error('failed to launch credmanager in pluginConfigs for {0}'.format(plugin_params)) + _logger.error("failed to launch credmanager in pluginConfigs for {0}".format(plugin_params)) core_utils.dump_error_message(_logger) except Exception: - _logger.error('failed to parse pluginConfigs {0}'.format(pc)) + _logger.error("failed to parse pluginConfigs {0}".format(pc)) core_utils.dump_error_message(_logger) # update plugin cores from queue config def update_cores_from_queue_config(self): self.queue_exe_cores = [] for queue_name, queue_config in self.queue_config_mapper.get_all_queues().items(): - if queue_config.queueStatus == 'offline' \ - or not hasattr(queue_config, 'credmanagers') \ - or not isinstance(queue_config.credmanagers, list): + if queue_config.queueStatus == "offline" or not hasattr(queue_config, "credmanagers") or not isinstance(queue_config.credmanagers, list): continue for cm_setup in queue_config.credmanagers: try: - plugin_params = {'module': cm_setup['module'], - 'name': cm_setup['name'], - 'setup_name': queue_name, - 'queueName': queue_name} + plugin_params = {"module": cm_setup["module"], "name": cm_setup["name"], "setup_name": queue_name, "queueName": queue_name} for k, v in cm_setup.items(): - if k in ('module', 'name'): + if k in ("module", "name"): 
pass - if isinstance(v, str) and '$' in v: + if isinstance(v, str) and "$" in v: # replace placeholders value = v - patts = re.findall('\$\{([a-zA-Z\d_.]+)\}', v) + patts = re.findall("\$\{([a-zA-Z\d_.]+)\}", v) for patt in patts: - tmp_ph = '${' + patt + '}' + tmp_ph = "${" + patt + "}" tmp_val = None - if patt == 'harvesterID': + if patt == "harvesterID": tmp_val = harvester_config.master.harvester_id - elif patt == 'queueName': + elif patt == "queueName": tmp_val = queue_name - elif patt.startswith('common.'): + elif patt.startswith("common."): # values from common blocks - attr = patt.replace('common.', '') - if hasattr(queue_config, 'common') and attr in queue_config.common: + attr = patt.replace("common.", "") + if hasattr(queue_config, "common") and attr in queue_config.common: tmp_val = queue_config.common[attr] if tmp_val is not None: value = value.replace(tmp_ph, tmp_val) @@ -146,7 +137,7 @@ def update_cores_from_queue_config(self): exe_core = self.pluginFactory.get_plugin(plugin_params) self.queue_exe_cores.append(exe_core) except Exception: - _logger.error('failed to launch plugin for queue={0} and {1}'.format(queue_name, plugin_params)) + _logger.error("failed to launch plugin for queue={0} and {1}".format(queue_name, plugin_params)) core_utils.dump_error_message(_logger) # main loop @@ -165,8 +156,7 @@ def run(self): # main def execute(self): # get lock - locked = self.dbProxy.get_process_lock('credmanager', self.get_pid(), - harvester_config.credmanager.sleepTime) + locked = self.dbProxy.get_process_lock("credmanager", self.get_pid(), harvester_config.credmanager.sleepTime) if not locked: return # loop over all plugins @@ -175,30 +165,28 @@ def execute(self): if exe_core is None: continue # make logger - if hasattr(exe_core, 'setup_name'): + if hasattr(exe_core, "setup_name"): credmanager_name = exe_core.setup_name else: - credmanager_name = '{0} {1}'.format(exe_core.inCertFile, exe_core.outCertFile) - mainLog = self.make_logger(_logger, - '{0} {1}'.format(exe_core.__class__.__name__, credmanager_name), - method_name='execute') + credmanager_name = "{0} {1}".format(exe_core.inCertFile, exe_core.outCertFile) + mainLog = self.make_logger(_logger, "{0} {1}".format(exe_core.__class__.__name__, credmanager_name), method_name="execute") try: # check credential - mainLog.debug('check credential') + mainLog.debug("check credential") isValid = exe_core.check_credential() if isValid: - mainLog.debug('valid') + mainLog.debug("valid") elif not isValid: # renew it if necessary - mainLog.debug('invalid') - mainLog.debug('renew credential') + mainLog.debug("invalid") + mainLog.debug("renew credential") tmpStat, tmpOut = exe_core.renew_credential() if not tmpStat: - mainLog.error('failed : {0}'.format(tmpOut)) + mainLog.error("failed : {0}".format(tmpOut)) continue except Exception: core_utils.dump_error_message(mainLog) - mainLog.debug('done') + mainLog.debug("done") # monit main def execute_monit(self): @@ -212,22 +200,21 @@ def execute_monit(self): continue # make logger - if hasattr(exe_core, 'setup_name'): + if hasattr(exe_core, "setup_name"): credmanager_name = exe_core.setup_name else: - credmanager_name = '{0} {1}'.format(exe_core.inCertFile, exe_core.outCertFile) + credmanager_name = "{0} {1}".format(exe_core.inCertFile, exe_core.outCertFile) - sub_log = self.make_logger(_logger, '{0} {1}'.format(exe_core.__class__.__name__, credmanager_name), - method_name='execute_monit') + sub_log = self.make_logger(_logger, "{0} {1}".format(exe_core.__class__.__name__, credmanager_name), 
method_name="execute_monit") try: # check credential - sub_log.debug('check credential lifetime') + sub_log.debug("check credential lifetime") lifetime = exe_core.check_credential_lifetime() if lifetime is not None: metrics[exe_core.outCertFile] = lifetime except Exception: core_utils.dump_error_message(sub_log) - sub_log.debug('done') + sub_log.debug("done") return metrics diff --git a/pandaharvester/harvesterbody/event_feeder.py b/pandaharvester/harvesterbody/event_feeder.py index e358897f..0c73a252 100644 --- a/pandaharvester/harvesterbody/event_feeder.py +++ b/pandaharvester/harvesterbody/event_feeder.py @@ -8,7 +8,7 @@ from pandaharvester.harvesterbody.agent_base import AgentBase # logger -_logger = core_utils.setup_logger('event_feeder') +_logger = core_utils.setup_logger("event_feeder") # class to feed events to workers @@ -23,24 +23,24 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): # main loop def run(self): - lockedBy = 'eventfeeder-{0}'.format(self.get_pid()) + lockedBy = "eventfeeder-{0}".format(self.get_pid()) while True: - mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') - mainLog.debug('getting workers to feed events') - workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events(harvester_config.eventfeeder.maxWorkers, - harvester_config.eventfeeder.lockInterval, - lockedBy) - mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) + mainLog = self.make_logger(_logger, "id={0}".format(lockedBy), method_name="run") + mainLog.debug("getting workers to feed events") + workSpecsPerQueue = self.dbProxy.get_workers_to_feed_events( + harvester_config.eventfeeder.maxWorkers, harvester_config.eventfeeder.lockInterval, lockedBy + ) + mainLog.debug("got {0} queues".format(len(workSpecsPerQueue))) # loop over all workers for queueName, workSpecList in iteritems(workSpecsPerQueue): - tmpQueLog = self.make_logger(_logger, 'queue={0}'.format(queueName), method_name='run') + tmpQueLog = self.make_logger(_logger, "queue={0}".format(queueName), method_name="run") # check queue if not self.queueConfigMapper.has_queue(queueName): - tmpQueLog.error('config not found') + tmpQueLog.error("config not found") continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) - if hasattr(queueConfig, 'scatteredEvents') and queueConfig.scatteredEvents: + if hasattr(queueConfig, "scatteredEvents") and queueConfig.scatteredEvents: scattered = True else: scattered = False @@ -48,47 +48,42 @@ def run(self): messenger = self.pluginFactory.get_plugin(queueConfig.messenger) # loop over all workers for workSpec in workSpecList: - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID), - method_name='run') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workSpec.workerID), method_name="run") # lock worker again lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy) if not lockedFlag: - tmpLog.debug('skipped since locked by another') + tmpLog.debug("skipped since locked by another") continue # get events - tmpLog.debug('get events') - tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, - scattered, - workSpec.get_access_point()) + tmpLog.debug("get events") + tmpStat, events = self.communicator.get_event_ranges(workSpec.eventsRequestParams, scattered, workSpec.get_access_point()) # failed if tmpStat is False: - tmpLog.error('failed to get events with {0}'.format(events)) + tmpLog.error("failed to get events with 
{0}".format(events)) continue # lock worker again lockedFlag = self.dbProxy.lock_worker_again_to_feed_events(workSpec.workerID, lockedBy) if not lockedFlag: - tmpLog.debug('skipped before feeding since locked by another') + tmpLog.debug("skipped before feeding since locked by another") continue tmpStat = messenger.feed_events(workSpec, events) # failed if tmpStat is False: - tmpLog.error('failed to feed events') + tmpLog.error("failed to feed events") continue # dump for pandaID, eventList in iteritems(events): try: - nRanges = workSpec.eventsRequestParams[pandaID]['nRanges'] + nRanges = workSpec.eventsRequestParams[pandaID]["nRanges"] except Exception: nRanges = None - tmpLog.debug('got {0} events for PandaID={1} while getting {2} events'.format(len(eventList), - pandaID, - nRanges)) + tmpLog.debug("got {0} events for PandaID={1} while getting {2} events".format(len(eventList), pandaID, nRanges)) # disable multi workers if workSpec.mapType == WorkSpec.MT_MultiWorkers: if len(eventList) == 0 or (nRanges is not None and len(eventList) < nRanges): tmpStat = self.dbProxy.disable_multi_workers(pandaID) if tmpStat == 1: - tmpStr = 'disabled MultiWorkers for PandaID={0}'.format(pandaID) + tmpStr = "disabled MultiWorkers for PandaID={0}".format(pandaID) tmpLog.debug(tmpStr) # update worker workSpec.eventsRequest = WorkSpec.EV_useEvents @@ -96,11 +91,11 @@ def run(self): workSpec.eventFeedTime = None workSpec.eventFeedLock = None # update local database - tmpStat = self.dbProxy.update_worker(workSpec, {'eventFeedLock': lockedBy}) - tmpLog.debug('done with {0}'.format(tmpStat)) - tmpQueLog.debug('done') - mainLog.debug('done') + tmpStat = self.dbProxy.update_worker(workSpec, {"eventFeedLock": lockedBy}) + tmpLog.debug("done with {0}".format(tmpStat)) + tmpQueLog.debug("done") + mainLog.debug("done") # check if being terminated if self.terminated(harvester_config.eventfeeder.sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return diff --git a/pandaharvester/harvesterbody/file_syncer.py b/pandaharvester/harvesterbody/file_syncer.py index 0f98ba4d..71e58f50 100644 --- a/pandaharvester/harvesterbody/file_syncer.py +++ b/pandaharvester/harvesterbody/file_syncer.py @@ -8,12 +8,11 @@ from pandaharvester.harvesterbody.agent_base import AgentBase # logger -_logger = core_utils.setup_logger('file_syncer') +_logger = core_utils.setup_logger("file_syncer") # file syncer class FileSyncer(AgentBase): - # constructor def __init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) @@ -38,62 +37,54 @@ def get_list(self, data): # get plugin cores from harvester config def get_cores_from_harvester_config(self): # direct and merged plugin configuration in json - if hasattr(harvester_config, 'file_syncer') \ - and hasattr(harvester_config.file_syncer, 'pluginConfigs'): + if hasattr(harvester_config, "file_syncer") and hasattr(harvester_config.file_syncer, "pluginConfigs"): plugin_configs = harvester_config.file_syncer.pluginConfigs else: plugin_configs = [] # from plugin_configs for pc in plugin_configs: try: - setup_maps = pc['configs'] + setup_maps = pc["configs"] for setup_name, setup_map in setup_maps.items(): try: - plugin_params = {'module': pc['module'], - 'name': pc['name'], - 'setup_name': setup_name} + plugin_params = {"module": pc["module"], "name": pc["name"], "setup_name": setup_name} plugin_params.update(setup_map) exe_core = self.pluginFactory.get_plugin(plugin_params) self.exe_cores.append(exe_core) except Exception: - _logger.error('failed to 
launch file_syncer in pluginConfigs for {0}'.format(plugin_params)) + _logger.error("failed to launch file_syncer in pluginConfigs for {0}".format(plugin_params)) core_utils.dump_error_message(_logger) except Exception: - _logger.error('failed to parse pluginConfigs {0}'.format(pc)) + _logger.error("failed to parse pluginConfigs {0}".format(pc)) core_utils.dump_error_message(_logger) # update plugin cores from queue config def update_cores_from_queue_config(self): self.queue_exe_cores = [] for queue_name, queue_config in self.queue_config_mapper.get_all_queues().items(): - if queue_config.queueStatus == 'offline' \ - or not hasattr(queue_config, 'file_syncer') \ - or not isinstance(queue_config.file_syncer, list): + if queue_config.queueStatus == "offline" or not hasattr(queue_config, "file_syncer") or not isinstance(queue_config.file_syncer, list): continue for cm_setup in queue_config.file_syncer: try: - plugin_params = {'module': cm_setup['module'], - 'name': cm_setup['name'], - 'setup_name': queue_name, - 'queueName': queue_name} + plugin_params = {"module": cm_setup["module"], "name": cm_setup["name"], "setup_name": queue_name, "queueName": queue_name} for k, v in cm_setup.items(): - if k in ('module', 'name'): + if k in ("module", "name"): pass - if isinstance(v, str) and '$' in v: + if isinstance(v, str) and "$" in v: # replace placeholders value = v - patts = re.findall('\$\{([a-zA-Z\d_.]+)\}', v) + patts = re.findall("\$\{([a-zA-Z\d_.]+)\}", v) for patt in patts: - tmp_ph = '${' + patt + '}' + tmp_ph = "${" + patt + "}" tmp_val = None - if patt == 'harvesterID': + if patt == "harvesterID": tmp_val = harvester_config.master.harvester_id - elif patt == 'queueName': + elif patt == "queueName": tmp_val = queue_name - elif patt.startswith('common.'): + elif patt.startswith("common."): # values from common blocks - attr = patt.replace('common.', '') - if hasattr(queue_config, 'common') and attr in queue_config.common: + attr = patt.replace("common.", "") + if hasattr(queue_config, "common") and attr in queue_config.common: tmp_val = queue_config.common[attr] if tmp_val is not None: value = value.replace(tmp_ph, tmp_val) @@ -105,7 +96,7 @@ def update_cores_from_queue_config(self): exe_core = self.pluginFactory.get_plugin(plugin_params) self.queue_exe_cores.append(exe_core) except Exception: - _logger.error('failed to launch plugin for queue={0} and {1}'.format(queue_name, plugin_params)) + _logger.error("failed to launch plugin for queue={0} and {1}".format(queue_name, plugin_params)) core_utils.dump_error_message(_logger) # main loop @@ -122,8 +113,7 @@ def run(self): # main def execute(self): # get lock - locked = self.dbProxy.get_process_lock('file_syncer', self.get_pid(), - getattr(harvester_config.file_syncer, 'sleepTime', 10)) + locked = self.dbProxy.get_process_lock("file_syncer", self.get_pid(), getattr(harvester_config.file_syncer, "sleepTime", 10)) if not locked: return # loop over all plugins @@ -133,23 +123,21 @@ def execute(self): continue # make logger file_syncer_name = exe_core.setup_name - mainLog = self.make_logger(_logger, - '{0} {1}'.format(exe_core.__class__.__name__, file_syncer_name), - method_name='execute') + mainLog = self.make_logger(_logger, "{0} {1}".format(exe_core.__class__.__name__, file_syncer_name), method_name="execute") try: # check freshness - mainLog.debug('check') + mainLog.debug("check") to_update = exe_core.check() if not to_update: - mainLog.debug('no need to update, skip') + mainLog.debug("no need to update, skip") else: # update if necessary - 
mainLog.debug('updating') + mainLog.debug("updating") tmpStat, tmpOut = exe_core.update() if not tmpStat: - mainLog.error('failed : {0}'.format(tmpOut)) + mainLog.error("failed : {0}".format(tmpOut)) continue - mainLog.debug('updated') + mainLog.debug("updated") except Exception: core_utils.dump_error_message(mainLog) - mainLog.debug('done') + mainLog.debug("done") diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index b8eecfa4..1e32b46d 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -13,7 +13,7 @@ from pandaharvester.harvestermisc.info_utils import PandaQueuesDict # logger -_logger = core_utils.setup_logger('job_fetcher') +_logger = core_utils.setup_logger("job_fetcher") # class to fetch jobs @@ -30,12 +30,11 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): # main loop def run(self): while True: - mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') - mainLog.debug('getting number of jobs to be fetched') + mainLog = self.make_logger(_logger, "id={0}".format(self.get_pid()), method_name="run") + mainLog.debug("getting number of jobs to be fetched") # get number of jobs to be fetched - nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues, - harvester_config.jobfetcher.lookupTime) - mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) + nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues, harvester_config.jobfetcher.lookupTime) + mainLog.debug("got {0} queues".format(len(nJobsPerQueue))) # get up to date queue configuration pandaQueueDict = PandaQueuesDict() @@ -45,8 +44,7 @@ def run(self): # check queue if not self.queueConfigMapper.has_queue(queueName): continue - tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), - method_name='run') + tmpLog = self.make_logger(_logger, "queueName={0}".format(queueName), method_name="run") # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) siteName = queueConfig.siteName @@ -62,20 +60,17 @@ def run(self): default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue) - pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {}) + pdpm = getattr(queueConfig, "prodSourceLabelRandomWeightsPermille", {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) - tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel)) + tmpLog.debug("getting {0} jobs for prodSourceLabel {1}".format(nJobs, prodSourceLabel)) sw = core_utils.get_stopwatch() - jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, - prodSourceLabel, - self.nodeName, nJobs, - queueConfig.getJobCriteria) - tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time())) + jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel, self.nodeName, nJobs, queueConfig.getJobCriteria) + tmpLog.info("got {0} jobs with {1} {2}".format(len(jobs), errStr, sw.get_elapsed_time())) # convert to JobSpec if len(jobs) > 0: # get extractor plugin - if hasattr(queueConfig, 'extractor'): + if hasattr(queueConfig, "extractor"): extractorCore = self.pluginFactory.get_plugin(queueConfig.extractor) else: extractorCore = None @@ -87,13 +82,12 @@ def run(self): jobSpec = JobSpec() jobSpec.convert_job_json(job) jobSpec.computingSite = 
queueName - jobSpec.status = 'starting' - jobSpec.subStatus = 'fetched' + jobSpec.status = "starting" + jobSpec.subStatus = "fetched" jobSpec.creationTime = timeNow jobSpec.stateChangeTime = timeNow jobSpec.configID = queueConfig.configID - jobSpec.set_one_attribute('schedulerID', - 'harvester-{0}'.format(harvester_config.master.harvester_id)) + jobSpec.set_one_attribute("schedulerID", "harvester-{0}".format(harvester_config.master.harvester_id)) if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None: jobSpec.zipPerMB = queueConfig.zipPerMB fileGroupDictList = [jobSpec.get_input_file_attributes()] @@ -107,26 +101,23 @@ def run(self): fileSpec.taskID = jobSpec.taskID fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn - fileSpec.scope = fileAttrs['scope'] - if 'INTERNAL_FileType' in fileAttrs: - fileSpec.fileType = fileAttrs['INTERNAL_FileType'] + fileSpec.scope = fileAttrs["scope"] + if "INTERNAL_FileType" in fileAttrs: + fileSpec.fileType = fileAttrs["INTERNAL_FileType"] jobSpec.auxInput = JobSpec.AUX_hasAuxInput else: - fileSpec.fileType = 'input' + fileSpec.fileType = "input" # check file status if tmpLFN not in fileStatMap: - fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, fileSpec.fileType, - queueConfig.ddmEndpointIn, - 'starting') + fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, fileSpec.fileType, queueConfig.ddmEndpointIn, "starting") # set preparing to skip stage-in if the file is (being) taken care of by another job - if [x for x in ['ready', 'preparing', 'to_prepare', 'triggered'] - if x in fileStatMap[tmpLFN]]: - fileSpec.status = 'preparing' + if [x for x in ["ready", "preparing", "to_prepare", "triggered"] if x in fileStatMap[tmpLFN]]: + fileSpec.status = "preparing" else: - fileSpec.status = 'to_prepare' + fileSpec.status = "to_prepare" fileStatMap[tmpLFN].setdefault(fileSpec.status, None) - if 'INTERNAL_URL' in fileAttrs: - fileSpec.url = fileAttrs['INTERNAL_URL'] + if "INTERNAL_URL" in fileAttrs: + fileSpec.url = fileAttrs["INTERNAL_URL"] jobSpec.add_in_file(fileSpec) jobSpec.trigger_propagation() jobSpecs.append(jobSpec) @@ -134,9 +125,9 @@ def run(self): tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_startconvert.get_elapsed_time())) sw_insertdb = core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) - tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time())) - mainLog.debug('done') + tmpLog.debug("Insert of {0} jobs {1}".format(len(jobSpecs), sw_insertdb.get_elapsed_time())) + mainLog.debug("done") # check if being terminated if self.terminated(harvester_config.jobfetcher.sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return diff --git a/pandaharvester/harvesterbody/master.py b/pandaharvester/harvesterbody/master.py index 20a285a4..3ea26c05 100644 --- a/pandaharvester/harvesterbody/master.py +++ b/pandaharvester/harvesterbody/master.py @@ -10,6 +10,7 @@ import threading import cProfile from future.utils import iteritems + try: import pprofile except Exception: @@ -24,7 +25,7 @@ from pandaharvester.harvestermisc.apfmon import Apfmon # logger -_logger = core_utils.setup_logger('master') +_logger = core_utils.setup_logger("master") # for singleton master_instance = False @@ -33,7 +34,6 @@ # the master class which runs the main process class Master(object): - # constructor def __init__(self, single_mode=False, stop_event=None, daemon_mode=True): # initialize database and config @@ -41,10 +41,13 @@ def __init__(self, single_mode=False, 
stop_event=None, daemon_mode=True): self.stopEvent = stop_event self.daemonMode = daemon_mode from pandaharvester.harvestercore.communicator_pool import CommunicatorPool + self.communicatorPool = CommunicatorPool() from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper + self.queueConfigMapper = QueueConfigMapper() from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy + dbProxy = DBProxy() dbProxy.make_tables(self.queueConfigMapper, self.communicatorPool) @@ -54,6 +57,7 @@ def start(self): thrList = [] # Credential Manager from pandaharvester.harvesterbody.cred_manager import CredManager + thr = CredManager(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.execute() @@ -61,12 +65,14 @@ def start(self): thrList.append(thr) # Command manager from pandaharvester.harvesterbody.command_manager import CommandManager + thr = CommandManager(self.communicatorPool, self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Cacher from pandaharvester.harvesterbody.cacher import Cacher + thr = Cacher(self.communicatorPool, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.execute(force_update=True, skip_lock=True) @@ -74,88 +80,86 @@ def start(self): thrList.append(thr) # Watcher from pandaharvester.harvesterbody.watcher import Watcher + thr = Watcher(single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Job Fetcher from pandaharvester.harvesterbody.job_fetcher import JobFetcher + nThr = harvester_config.jobfetcher.nThreads for iThr in range(nThr): - thr = JobFetcher(self.communicatorPool, - self.queueConfigMapper, - single_mode=self.singleMode) + thr = JobFetcher(self.communicatorPool, self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Propagator from pandaharvester.harvesterbody.propagator import Propagator + nThr = harvester_config.propagator.nThreads for iThr in range(nThr): - thr = Propagator(self.communicatorPool, - self.queueConfigMapper, - single_mode=self.singleMode) + thr = Propagator(self.communicatorPool, self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Monitor from pandaharvester.harvesterbody.monitor import Monitor + nThr = harvester_config.monitor.nThreads for iThr in range(nThr): - thr = Monitor(self.queueConfigMapper, - single_mode=self.singleMode) + thr = Monitor(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Preparator from pandaharvester.harvesterbody.preparator import Preparator + nThr = harvester_config.preparator.nThreads for iThr in range(nThr): - thr = Preparator(self.communicatorPool, - self.queueConfigMapper, - single_mode=self.singleMode) + thr = Preparator(self.communicatorPool, self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Submitter from pandaharvester.harvesterbody.submitter import Submitter + nThr = harvester_config.submitter.nThreads for iThr in range(nThr): - thr = Submitter(self.queueConfigMapper, - single_mode=self.singleMode) + thr = Submitter(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Stager from pandaharvester.harvesterbody.stager import Stager + nThr = harvester_config.stager.nThreads 
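The hunks around this point only collapse multi-line constructor calls onto single lines; the agent-spawning logic in Master.start() is unchanged. As a reading aid, here is a minimal, self-contained sketch of that spawn pattern under simplified assumptions: AgentStub, spawn_agents and their arguments are illustrative stand-ins, not the Harvester API.

import threading


class AgentStub(threading.Thread):
    # illustrative stand-in for an agent such as Stager or Sweeper (not the real AgentBase)
    def __init__(self, queue_config_mapper, single_mode=False):
        super().__init__(daemon=True)
        self.queue_config_mapper = queue_config_mapper
        self.single_mode = single_mode
        self.stop_event = None

    def set_stop_event(self, stop_event):
        self.stop_event = stop_event

    def run(self):
        # a real agent loops over its work until the stop event is set
        while not self.stop_event.is_set():
            self.stop_event.wait(1)


def spawn_agents(agent_cls, n_threads, queue_config_mapper, stop_event):
    # start n_threads instances of one agent class, mirroring the pattern in Master.start()
    thread_list = []
    for _ in range(n_threads):
        thr = agent_cls(queue_config_mapper, single_mode=False)
        thr.set_stop_event(stop_event)
        thr.start()
        thread_list.append(thr)
    return thread_list


if __name__ == "__main__":
    stop = threading.Event()
    thr_list = spawn_agents(AgentStub, 2, None, stop)
    stop.set()
    for thr in thr_list:
        thr.join()

Each real agent in the patch (JobFetcher, Propagator, Monitor, Preparator, Submitter, Stager, EventFeeder, Sweeper) follows this shape: nThreads instances per agent, each handed the shared stop event before start().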
for iThr in range(nThr): - thr = Stager(self.queueConfigMapper, - single_mode=self.singleMode) + thr = Stager(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # EventFeeder from pandaharvester.harvesterbody.event_feeder import EventFeeder + nThr = harvester_config.eventfeeder.nThreads for iThr in range(nThr): - thr = EventFeeder(self.communicatorPool, - self.queueConfigMapper, - single_mode=self.singleMode) + thr = EventFeeder(self.communicatorPool, self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # Sweeper from pandaharvester.harvesterbody.sweeper import Sweeper + nThr = harvester_config.sweeper.nThreads for iThr in range(nThr): - thr = Sweeper(self.queueConfigMapper, - single_mode=self.singleMode) + thr = Sweeper(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() thrList.append(thr) # File Syncer from pandaharvester.harvesterbody.file_syncer import FileSyncer + thr = FileSyncer(self.queueConfigMapper, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.execute() @@ -169,6 +173,7 @@ def start(self): if sm_active: from pandaharvester.harvesterbody.service_monitor import ServiceMonitor + thr = ServiceMonitor(options.pid, single_mode=self.singleMode) thr.set_stop_event(self.stopEvent) thr.start() @@ -179,7 +184,6 @@ def start(self): apf_mon.create_factory() apf_mon.create_labels() - ################## # loop on stop event to be interruptable since thr.join blocks signal capture in python 2.7 while True: @@ -208,7 +212,7 @@ def __exit__(self, *x): class StdErrWrapper(object): def write(self, message): # set a header and footer to the message to make it easier to parse - wrapped_message = '#####START#####\n{0}#####END#####\n'.format(message) + wrapped_message = "#####START#####\n{0}#####END#####\n".format(message) _logger.error(wrapped_message) def flush(self): @@ -227,38 +231,35 @@ def isatty(self): options = None # main + + def main(daemon_mode=True): global prof global options # parse option parser = argparse.ArgumentParser() - parser.add_argument('--pid', action='store', dest='pid', default=None, - help='pid filename') - parser.add_argument('--single', action='store_true', dest='singleMode', default=False, - help='use single mode') - parser.add_argument('--hostname_file', action='store', dest='hostNameFile', default=None, - help='to record the hostname where harvester is launched') - parser.add_argument('--rotate_log', action='store_true', dest='rotateLog', default=False, - help='rollover log files before launching harvester') - parser.add_argument('--version', action='store_true', dest='showVersion', default=False, - help='show version information and exit') - parser.add_argument('--profile_output', action='store', dest='profileOutput', default=None, - help='filename to save the results of profiler') - parser.add_argument('--profile_mode', action='store', dest='profileMode', default='s', - help='profile mode. 
s (statistic), d (deterministic), or t (thread-aware)') - parser.add_argument('--memory_logging', action='store_true', dest='memLogging', default=False, - help='add information of memory usage in each logging message') - parser.add_argument('--foreground', action='store_true', dest='foreground', default=False, - help='run in the foreground not to be daemonized') + parser.add_argument("--pid", action="store", dest="pid", default=None, help="pid filename") + parser.add_argument("--single", action="store_true", dest="singleMode", default=False, help="use single mode") + parser.add_argument("--hostname_file", action="store", dest="hostNameFile", default=None, help="to record the hostname where harvester is launched") + parser.add_argument("--rotate_log", action="store_true", dest="rotateLog", default=False, help="rollover log files before launching harvester") + parser.add_argument("--version", action="store_true", dest="showVersion", default=False, help="show version information and exit") + parser.add_argument("--profile_output", action="store", dest="profileOutput", default=None, help="filename to save the results of profiler") + parser.add_argument( + "--profile_mode", action="store", dest="profileMode", default="s", help="profile mode. s (statistic), d (deterministic), or t (thread-aware)" + ) + parser.add_argument( + "--memory_logging", action="store_true", dest="memLogging", default=False, help="add information of memory usage in each logging message" + ) + parser.add_argument("--foreground", action="store_true", dest="foreground", default=False, help="run in the foreground not to be daemonized") options = parser.parse_args() # show version information if options.showVersion: - print ("Version : {0}".format(panda_pkg_info.release_version)) - print ("Last commit : {0}".format(commit_timestamp.timestamp)) + print("Version : {0}".format(panda_pkg_info.release_version)) + print("Last commit : {0}".format(commit_timestamp.timestamp)) return # check pid if options.pid is not None and os.path.exists(options.pid): - print ("ERROR: Cannot start since lock file {0} already exists".format(options.pid)) + print("ERROR: Cannot start since lock file {0} already exists".format(options.pid)) return # uid and gid uid = pwd.getpwnam(harvester_config.master.uname).pw_uid @@ -271,12 +272,12 @@ def main(daemon_mode=True): core_utils.enable_memory_profiling() # hostname if options.hostNameFile is not None: - with open(options.hostNameFile, 'w') as f: + with open(options.hostNameFile, "w") as f: f.write(socket.getfqdn()) # rollover log files if options.rotateLog: core_utils.do_log_rollover() - if hasattr(_logger.handlers[0], 'doRollover'): + if hasattr(_logger.handlers[0], "doRollover"): _logger.handlers[0].doRollover() if daemon_mode and not options.foreground: # redirect messages to stdout @@ -286,19 +287,15 @@ def main(daemon_mode=True): # collect streams not to be closed by daemon files_preserve = [] for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda'): + if loggerName.startswith("panda"): for handler in loggerObj.handlers: - if hasattr(handler, 'stream'): + if hasattr(handler, "stream"): files_preserve.append(handler.stream) sys.stderr = StdErrWrapper() # make daemon context - dc = daemon.DaemonContext(stdout=sys.stdout, - stderr=sys.stderr, - uid=uid, - gid=gid, - umask=umask, - files_preserve=files_preserve, - pidfile=daemon.pidfile.PIDLockFile(options.pid)) + dc = daemon.DaemonContext( + stdout=sys.stdout, stderr=sys.stderr, uid=uid, gid=gid, 
umask=umask, files_preserve=files_preserve, pidfile=daemon.pidfile.PIDLockFile(options.pid) + ) else: dc = DummyContext() with dc: @@ -307,9 +304,8 @@ def main(daemon_mode=True): dc.pidfile = None if options.pid: core_utils.set_file_permission(options.pid) - core_utils.set_file_permission(logger_config.daemon['logdir']) - _logger.info("start : version = {0}, last_commit = {1}".format(panda_pkg_info.release_version, - commit_timestamp.timestamp)) + core_utils.set_file_permission(logger_config.daemon["logdir"]) + _logger.info("start : version = {0}, last_commit = {1}".format(panda_pkg_info.release_version, commit_timestamp.timestamp)) # stop event stopEvent = threading.Event() @@ -318,10 +314,10 @@ def main(daemon_mode=True): prof = None if options.profileOutput is not None: # run with profiler - if options.profileMode == 'd': + if options.profileMode == "d": # deterministic prof = pprofile.Profile() - elif options.profileMode == 't': + elif options.profileMode == "t": # thread-aware prof = pprofile.ThreadProfile() else: @@ -348,7 +344,7 @@ def delete_pid(pid): # signal handlers def catch_sigkill(sig, frame): disable_profiler() - _logger.info('got signal={0} to be killed'.format(sig)) + _logger.info("got signal={0} to be killed".format(sig)) try: os.remove(options.pid) except Exception: @@ -360,9 +356,9 @@ def catch_sigkill(sig, frame): os.kill(os.getpid(), signal.SIGKILL) except Exception: core_utils.dump_error_message(_logger) - _logger.error('failed to be killed') + _logger.error("failed to be killed") - ''' + """ def catch_sigterm(sig, frame): _logger.info('got signal={0} to be terminated'.format(sig)) stopEvent.set() @@ -371,20 +367,21 @@ def catch_sigterm(sig, frame): atexit.register(delete_pid, options.pid) # set alarm just in case signal.alarm(30) - ''' + """ def catch_debug(sig, frame): - _logger.info('got signal={0} to go into debugger mode'.format(sig)) + _logger.info("got signal={0} to go into debugger mode".format(sig)) from trepan.interfaces import server from trepan.api import debug + try: portNum = harvester_config.master.debugger_port except Exception: portNum = 19550 - connection_opts = {'IO': 'TCP', 'PORT': portNum} + connection_opts = {"IO": "TCP", "PORT": portNum} interface = server.ServerInterface(connection_opts=connection_opts) - dbg_opts = {'interface': interface} - _logger.info('starting debugger on port {0}'.format(portNum)) + dbg_opts = {"interface": interface} + _logger.info("starting debugger on port {0}".format(portNum)) debug(dbg_opts=dbg_opts) # set handler @@ -408,7 +405,7 @@ def catch_debug(sig, frame): # disable profiler disable_profiler() if daemon_mode: - _logger.info('terminated') + _logger.info("terminated") if __name__ == "__main__": diff --git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index cb300106..5b87da53 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -16,14 +16,14 @@ from pandaharvester.harvestermisc.apfmon import Apfmon # logger -_logger = core_utils.setup_logger('monitor') +_logger = core_utils.setup_logger("monitor") # propagate important checkpoints to panda class Monitor(AgentBase): # constructor def __init__(self, queue_config_mapper, single_mode=False): - tmp_log = self.make_logger(_logger, method_name='__init__') + tmp_log = self.make_logger(_logger, method_name="__init__") AgentBase.__init__(self, single_mode) self.queueConfigMapper = queue_config_mapper self.dbProxy = DBProxy() @@ -32,107 +32,104 @@ def __init__(self, queue_config_mapper, 
single_mode=False): try: self.monitor_fifo = MonitorFIFO() except Exception: - tmp_log.error('failed to launch monitor-fifo') + tmp_log.error("failed to launch monitor-fifo") core_utils.dump_error_message(tmp_log) if self.monitor_fifo.enabled: try: self.monitor_event_fifo = MonitorEventFIFO() except Exception: - tmp_log.error('failed to launch monitor-event-fifo') + tmp_log.error("failed to launch monitor-event-fifo") core_utils.dump_error_message(tmp_log) else: self.monitor_event_fifo = None self.apfmon = Apfmon(self.queueConfigMapper) self.eventBasedMonCoreList = [] - if getattr(harvester_config.monitor, 'eventBasedEnable', False): + if getattr(harvester_config.monitor, "eventBasedEnable", False): for pluginConf in harvester_config.monitor.eventBasedPlugins: pluginFactory = PluginFactory() plugin_key = pluginFactory.get_plugin_key(pluginConf) try: self.eventBasedMonCoreList.append(pluginFactory.get_plugin(pluginConf)) except Exception: - tmp_log.error('failed to launch event-based-monitor plugin of {0}'.format(plugin_key)) + tmp_log.error("failed to launch event-based-monitor plugin of {0}".format(plugin_key)) core_utils.dump_error_message(tmp_log) # main loop def run(self): - lockedBy = 'monitor-{0}'.format(self.get_pid()) - mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') + lockedBy = "monitor-{0}".format(self.get_pid()) + mainLog = self.make_logger(_logger, "id={0}".format(lockedBy), method_name="run") # init messengers for queueName, queueConfig in self.queueConfigMapper.get_all_queues().items(): # just import for module initialization try: self.pluginFactory.get_plugin(queueConfig.messenger) except Exception: - mainLog.error('failed to launch messenger plugin for {0}'.format(queueName)) + mainLog.error("failed to launch messenger plugin for {0}".format(queueName)) core_utils.dump_error_message(mainLog) # main - fifoSleepTimeMilli = getattr(harvester_config.monitor, 'fifoSleepTimeMilli', 5000) - fifoCheckDuration = getattr(harvester_config.monitor, 'fifoCheckDuration', 30) - fifoMaxWorkersPerChunk = getattr(harvester_config.monitor, 'fifoMaxWorkersPerChunk', 500) - fifoProtectiveDequeue = getattr(harvester_config.monitor, 'fifoProtectiveDequeue', True) - eventBasedCheckInterval = getattr(harvester_config.monitor, 'eventBasedCheckInterval', 300) - eventBasedTimeWindow = getattr(harvester_config.monitor, 'eventBasedTimeWindow', 450) - eventBasedCheckMaxEvents = getattr(harvester_config.monitor, 'eventBasedCheckMaxEvents', 500) - eventBasedEventLifetime = getattr(harvester_config.monitor, 'eventBasedEventLifetime', 1800) - eventBasedRemoveMaxEvents = getattr(harvester_config.monitor, 'eventBasedRemoveMaxEvents', 2000) + fifoSleepTimeMilli = getattr(harvester_config.monitor, "fifoSleepTimeMilli", 5000) + fifoCheckDuration = getattr(harvester_config.monitor, "fifoCheckDuration", 30) + fifoMaxWorkersPerChunk = getattr(harvester_config.monitor, "fifoMaxWorkersPerChunk", 500) + fifoProtectiveDequeue = getattr(harvester_config.monitor, "fifoProtectiveDequeue", True) + eventBasedCheckInterval = getattr(harvester_config.monitor, "eventBasedCheckInterval", 300) + eventBasedTimeWindow = getattr(harvester_config.monitor, "eventBasedTimeWindow", 450) + eventBasedCheckMaxEvents = getattr(harvester_config.monitor, "eventBasedCheckMaxEvents", 500) + eventBasedEventLifetime = getattr(harvester_config.monitor, "eventBasedEventLifetime", 1800) + eventBasedRemoveMaxEvents = getattr(harvester_config.monitor, "eventBasedRemoveMaxEvents", 2000) last_DB_cycle_timestamp = 0 
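The monitor hunks below are formatting-only, but the FIFO scoring convention they touch is easy to lose in the reflowed lines: a worker chunk is put to the FIFO with score fifoCheckInterval + now and is checked only once that score is due, while subtracting 2**32 from the score preempts a chunk to the head of the queue. The SimpleScoreFIFO below is a toy restatement of that idea under those assumptions; it is not the Harvester MonitorFIFO API.

import heapq
import time


class SimpleScoreFIFO:
    # toy score-ordered queue; lower score is dequeued earlier (illustrative only)
    def __init__(self):
        self._heap = []

    def put(self, item, score):
        heapq.heappush(self._heap, (score, item))

    def get_if_due(self, now=None):
        # return the head item only when its score is not in the future
        now = time.time() if now is None else now
        if self._heap and self._heap[0][0] <= now:
            return heapq.heappop(self._heap)[1]
        return None


fifo = SimpleScoreFIFO()
check_interval = 300  # seconds, standing in for the monitor fifoCheckInterval
fifo.put(("queueA", ["workerChunk1"]), time.time() + check_interval)          # regular re-check later
fifo.put(("queueB", ["workerChunk2"]), time.time() + check_interval - 2**32)  # preempted to the head
assert fifo.get_if_due() == ("queueB", ["workerChunk2"])   # head item is already due
assert fifo.get_if_due() is None                           # queueA is not due yet

Ordering purely by a numeric score appears to be what lets one queue carry both regular re-check times and head-of-queue preemption for workers entering post-processing.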
last_event_delivery_timestamp = 0 last_event_digest_timestamp = 0 last_event_dispose_timestamp = 0 monitor_fifo = self.monitor_fifo - sleepTime = (fifoSleepTimeMilli / 1000.0) \ - if monitor_fifo.enabled else harvester_config.monitor.sleepTime + sleepTime = (fifoSleepTimeMilli / 1000.0) if monitor_fifo.enabled else harvester_config.monitor.sleepTime adjusted_sleepTime = sleepTime if monitor_fifo.enabled: monitor_fifo.restore() while True: sw_main = core_utils.get_stopwatch() - mainLog.debug('start a monitor cycle') - if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and \ - not (monitor_fifo.enabled and self.singleMode): + mainLog.debug("start a monitor cycle") + if time.time() >= last_DB_cycle_timestamp + harvester_config.monitor.sleepTime and not (monitor_fifo.enabled and self.singleMode): # run with workers from DB sw_db = core_utils.get_stopwatch() - mainLog.debug('starting run with DB') - mainLog.debug('getting workers to monitor') - workSpecsPerQueue = self.dbProxy.get_workers_to_update(harvester_config.monitor.maxWorkers, - harvester_config.monitor.checkInterval, - harvester_config.monitor.lockInterval, - lockedBy) - mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) + mainLog.debug("starting run with DB") + mainLog.debug("getting workers to monitor") + workSpecsPerQueue = self.dbProxy.get_workers_to_update( + harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy + ) + mainLog.debug("got {0} queues".format(len(workSpecsPerQueue))) # loop over all workers for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue): for configID, workSpecsList in iteritems(configIdWorkSpecs): try: - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB') + retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source="DB") except Exception as e: - mainLog.error('monitor_agent_core excepted with {0}'.format(e)) + mainLog.error("monitor_agent_core excepted with {0}".format(e)) retVal = None # skip the loop if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: - mainLog.debug('putting workers to FIFO') + mainLog.debug("putting workers to FIFO") try: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put((queueName, workSpecsToEnqueue), score) - mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score)) + mainLog.info("put workers of {0} to FIFO with score {1}".format(queueName, score)) except Exception as errStr: - mainLog.error('failed to put object from FIFO: {0}'.format(errStr)) + mainLog.error("failed to put object from FIFO: {0}".format(errStr)) if workSpecsToEnqueueToHead: - mainLog.debug('putting workers to FIFO head') + mainLog.debug("putting workers to FIFO head") try: score = fifoCheckInterval - timeNow_timestamp monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score) - mainLog.info('put workers of {0} to FIFO with score {1}'.format(queueName, score)) + mainLog.info("put workers of {0} to FIFO with score {1}".format(queueName, score)) except Exception as errStr: - mainLog.error('failed to put object from FIFO head: {0}'.format(errStr)) + mainLog.error("failed to put object from FIFO head: {0}".format(errStr)) last_DB_cycle_timestamp = time.time() if sw_db.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval: - mainLog.warning('a 
single DB cycle was longer than lockInterval ' + sw_db.get_elapsed_time()) + mainLog.warning("a single DB cycle was longer than lockInterval " + sw_db.get_elapsed_time()) else: - mainLog.debug('done a DB cycle' + sw_db.get_elapsed_time()) - mainLog.debug('ended run with DB') + mainLog.debug("done a DB cycle" + sw_db.get_elapsed_time()) + mainLog.debug("ended run with DB") elif monitor_fifo.enabled: # with FIFO sw = core_utils.get_stopwatch() @@ -151,16 +148,15 @@ def run(self): if self.monitor_event_fifo.enabled: # run with workers reported from plugin (event-based check) to_deliver = time.time() >= last_event_delivery_timestamp + eventBasedCheckInterval - to_digest = time.time() >= last_event_digest_timestamp + eventBasedCheckInterval/4 - to_dispose = time.time() >= last_event_dispose_timestamp + eventBasedCheckInterval/2 + to_digest = time.time() >= last_event_digest_timestamp + eventBasedCheckInterval / 4 + to_dispose = time.time() >= last_event_dispose_timestamp + eventBasedCheckInterval / 2 if to_deliver: # deliver events of worker update - got_lock = self.dbProxy.get_process_lock('monitor_event_deliverer', lockedBy, - eventBasedCheckInterval) + got_lock = self.dbProxy.get_process_lock("monitor_event_deliverer", lockedBy, eventBasedCheckInterval) if got_lock: self.monitor_event_deliverer(time_window=eventBasedTimeWindow) else: - mainLog.debug('did not get lock. Skip monitor_event_deliverer') + mainLog.debug("did not get lock. Skip monitor_event_deliverer") last_event_delivery_timestamp = time.time() if to_digest: # digest events of worker update @@ -185,7 +181,7 @@ def run(self): try: retVal, overhead_time = monitor_fifo.to_check_workers() except Exception as e: - mainLog.error('failed to check workers from FIFO: {0}'.format(e)) + mainLog.error("failed to check workers from FIFO: {0}".format(e)) if overhead_time is not None: n_chunk_peeked_stat += 1 sum_overhead_time_stat += overhead_time @@ -193,16 +189,16 @@ def run(self): # check fifo size try: fifo_size = monitor_fifo.size() - mainLog.debug('FIFO size is {0}'.format(fifo_size)) + mainLog.debug("FIFO size is {0}".format(fifo_size)) except Exception as e: - mainLog.error('failed to get size of FIFO: {0}'.format(e)) + mainLog.error("failed to get size of FIFO: {0}".format(e)) time.sleep(2) continue - mainLog.debug('starting run with FIFO') + mainLog.debug("starting run with FIFO") try: obj_gotten = monitor_fifo.get(timeout=1, protective=fifoProtectiveDequeue) except Exception as errStr: - mainLog.error('failed to get object from FIFO: {0}'.format(errStr)) + mainLog.error("failed to get object from FIFO: {0}".format(errStr)) time.sleep(2) continue else: @@ -211,7 +207,7 @@ def run(self): if fifoProtectiveDequeue: obj_dequeued_id_list.append(obj_gotten.id) queueName, workSpecsList = obj_gotten.item - mainLog.debug('got a chunk of {0} workers of {1} from FIFO'.format(len(workSpecsList), queueName) + sw.get_elapsed_time()) + mainLog.debug("got a chunk of {0} workers of {1} from FIFO".format(len(workSpecsList), queueName) + sw.get_elapsed_time()) sw.reset() configID = None for workSpecs in workSpecsList: @@ -224,12 +220,13 @@ def run(self): workSpec.pandaid_list = [j.PandaID for j in workSpec.get_jobspec_list()] else: workSpec.pandaid_list = [] - workSpec.force_update('pandaid_list') + workSpec.force_update("pandaid_list") try: - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True, - config_id=configID, check_source='FIFO') + retVal = self.monitor_agent_core( + lockedBy, queueName, workSpecsList, 
from_fifo=True, config_id=configID, check_source="FIFO" + ) except Exception as e: - mainLog.error('monitor_agent_core excepted with {0}'.format(e)) + mainLog.error("monitor_agent_core excepted with {0}".format(e)) retVal = None # skip the loop if retVal is not None: @@ -244,7 +241,7 @@ def run(self): to_break = True remaining_obj_to_enqueue_dict[qc_key] = [workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval] except Exception as errStr: - mainLog.error('failed to gather workers for FIFO: {0}'.format(errStr)) + mainLog.error("failed to gather workers for FIFO: {0}".format(errStr)) to_break = True try: if len(obj_to_enqueue_to_head_dict[qc_key][0]) + len(workSpecsToEnqueueToHead) <= fifoMaxWorkersPerChunk: @@ -255,89 +252,86 @@ def run(self): to_break = True remaining_obj_to_enqueue_to_head_dict[qc_key] = [workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval] except Exception as errStr: - mainLog.error('failed to gather workers for FIFO head: {0}'.format(errStr)) + mainLog.error("failed to gather workers for FIFO head: {0}".format(errStr)) to_break = True - mainLog.debug('checked {0} workers from FIFO'.format(len(workSpecsList)) + sw.get_elapsed_time()) + mainLog.debug("checked {0} workers from FIFO".format(len(workSpecsList)) + sw.get_elapsed_time()) else: - mainLog.debug('monitor_agent_core returned None. Skipped putting to FIFO') + mainLog.debug("monitor_agent_core returned None. Skipped putting to FIFO") if sw_fifo.get_elapsed_time_in_sec() > harvester_config.monitor.lockInterval: - mainLog.warning('a single FIFO cycle was longer than lockInterval ' + sw_fifo.get_elapsed_time()) + mainLog.warning("a single FIFO cycle was longer than lockInterval " + sw_fifo.get_elapsed_time()) else: - mainLog.debug('done a FIFO cycle' + sw_fifo.get_elapsed_time()) + mainLog.debug("done a FIFO cycle" + sw_fifo.get_elapsed_time()) n_loops_hit += 1 if to_break: break else: - mainLog.debug('got nothing in FIFO') + mainLog.debug("got nothing in FIFO") else: - mainLog.debug('workers in FIFO too young to check. Skipped') + mainLog.debug("workers in FIFO too young to check. 
Skipped") if self.singleMode: break if overhead_time is not None: - time.sleep(max(-overhead_time*random.uniform(0.1, 1), adjusted_sleepTime)) + time.sleep(max(-overhead_time * random.uniform(0.1, 1), adjusted_sleepTime)) else: - time.sleep(max(fifoCheckDuration*random.uniform(0.1, 1), adjusted_sleepTime)) - mainLog.debug('run {0} loops, including {1} FIFO cycles'.format(n_loops, n_loops_hit)) + time.sleep(max(fifoCheckDuration * random.uniform(0.1, 1), adjusted_sleepTime)) + mainLog.debug("run {0} loops, including {1} FIFO cycles".format(n_loops, n_loops_hit)) # enqueue to fifo sw.reset() n_chunk_put = 0 - mainLog.debug('putting worker chunks to FIFO') + mainLog.debug("putting worker chunks to FIFO") for _dct in (obj_to_enqueue_dict, remaining_obj_to_enqueue_dict): - for ((queueName, configID), obj_to_enqueue) in iteritems(_dct): + for (queueName, configID), obj_to_enqueue in iteritems(_dct): try: workSpecsToEnqueue, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue if workSpecsToEnqueue: score = fifoCheckInterval + timeNow_timestamp monitor_fifo.put((queueName, workSpecsToEnqueue), score) n_chunk_put += 1 - mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format( - len(workSpecsToEnqueue), queueName, score)) + mainLog.info("put a chunk of {0} workers of {1} to FIFO with score {2}".format(len(workSpecsToEnqueue), queueName, score)) except Exception as errStr: - mainLog.error('failed to put object from FIFO: {0}'.format(errStr)) - mainLog.debug('putting worker chunks to FIFO head') + mainLog.error("failed to put object from FIFO: {0}".format(errStr)) + mainLog.debug("putting worker chunks to FIFO head") for _dct in (obj_to_enqueue_to_head_dict, remaining_obj_to_enqueue_to_head_dict): - for ((queueName, configID), obj_to_enqueue_to_head) in iteritems(_dct): + for (queueName, configID), obj_to_enqueue_to_head in iteritems(_dct): try: workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = obj_to_enqueue_to_head if workSpecsToEnqueueToHead: score = fifoCheckInterval + timeNow_timestamp - 2**32 monitor_fifo.put((queueName, workSpecsToEnqueueToHead), score) n_chunk_put += 1 - mainLog.info('put a chunk of {0} workers of {1} to FIFO with score {2}'.format( - len(workSpecsToEnqueueToHead), queueName, score)) + mainLog.info("put a chunk of {0} workers of {1} to FIFO with score {2}".format(len(workSpecsToEnqueueToHead), queueName, score)) except Exception as errStr: - mainLog.error('failed to put object from FIFO head: {0}'.format(errStr)) + mainLog.error("failed to put object from FIFO head: {0}".format(errStr)) # delete protective dequeued objects if fifoProtectiveDequeue and len(obj_dequeued_id_list) > 0: try: monitor_fifo.delete(ids=obj_dequeued_id_list) except Exception as e: - mainLog.error('failed to delete object from FIFO: {0}'.format(e)) - mainLog.debug('put {0} worker chunks into FIFO'.format(n_chunk_put) + sw.get_elapsed_time()) + mainLog.error("failed to delete object from FIFO: {0}".format(e)) + mainLog.debug("put {0} worker chunks into FIFO".format(n_chunk_put) + sw.get_elapsed_time()) # adjust adjusted_sleepTime if n_chunk_peeked_stat > 0 and sum_overhead_time_stat > sleepTime: speedup_factor = (sum_overhead_time_stat - sleepTime) / (n_chunk_peeked_stat * harvester_config.monitor.checkInterval) speedup_factor = max(speedup_factor, 0) - adjusted_sleepTime = adjusted_sleepTime / (1. 
+ speedup_factor) + adjusted_sleepTime = adjusted_sleepTime / (1.0 + speedup_factor) elif n_chunk_peeked_stat == 0 or sum_overhead_time_stat < 0: - adjusted_sleepTime = (sleepTime + adjusted_sleepTime)/2 - mainLog.debug('adjusted_sleepTime becomes {0:.3f} sec'.format(adjusted_sleepTime)) + adjusted_sleepTime = (sleepTime + adjusted_sleepTime) / 2 + mainLog.debug("adjusted_sleepTime becomes {0:.3f} sec".format(adjusted_sleepTime)) # end run with fifo - mainLog.debug('ended run with FIFO') + mainLog.debug("ended run with FIFO") # time the cycle - mainLog.debug('done a monitor cycle' + sw_main.get_elapsed_time()) + mainLog.debug("done a monitor cycle" + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(adjusted_sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return # core of monitor agent to check workers in workSpecsList of queueName def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False, config_id=None, check_source=None): - tmpQueLog = self.make_logger(_logger, 'id={0} queue={1}'.format(lockedBy, queueName), - method_name='run') + tmpQueLog = self.make_logger(_logger, "id={0} queue={1}".format(lockedBy, queueName), method_name="run") # check queue if not self.queueConfigMapper.has_queue(queueName, config_id): - tmpQueLog.error('config not found') + tmpQueLog.error("config not found") return None # get queue queueConfig = self.queueConfigMapper.get_queue(queueName, config_id) @@ -352,7 +346,7 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False try: fifoCheckInterval = monCore.fifoCheckInterval except Exception: - if hasattr(harvester_config.monitor, 'fifoCheckInterval'): + if hasattr(harvester_config.monitor, "fifoCheckInterval"): fifoCheckInterval = harvester_config.monitor.fifoCheckInterval else: fifoCheckInterval = harvester_config.monitor.checkInterval @@ -366,11 +360,11 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False fifoMaxPreemptInterval = 60 # check workers allWorkers = [item for sublist in workSpecsList for item in sublist] - tmpQueLog.debug('checking {0} workers'.format(len(allWorkers))) + tmpQueLog.debug("checking {0} workers".format(len(allWorkers))) tmpStat, tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog, from_fifo) if tmpStat: # loop over all worker chunks - tmpQueLog.debug('update jobs and workers') + tmpQueLog.debug("update jobs and workers") iWorker = 0 for workSpecs in workSpecsList: jobSpecs = None @@ -381,31 +375,26 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False mapType = workSpecs[0].mapType # loop over workSpecs for workSpec in workSpecs: - tmpLog = self.make_logger(_logger, - 'id={0} workerID={1} from={2}'.format( - lockedBy, workSpec.workerID, check_source), - method_name='run') + tmpLog = self.make_logger(_logger, "id={0} workerID={1} from={2}".format(lockedBy, workSpec.workerID, check_source), method_name="run") tmpOut = tmpRetMap[workSpec.workerID] - oldStatus = tmpOut['oldStatus'] - newStatus = tmpOut['newStatus'] - monStatus = tmpOut['monStatus'] - diagMessage = tmpOut['diagMessage'] - workAttributes = tmpOut['workAttributes'] - eventsToUpdate = tmpOut['eventsToUpdate'] - filesToStageOut = tmpOut['filesToStageOut'] - eventsRequestParams = tmpOut['eventsRequestParams'] - nJobsToReFill = tmpOut['nJobsToReFill'] - pandaIDs = tmpOut['pandaIDs'] - isChecked = tmpOut['isChecked'] - tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} ' - tmpStr += 
'postProcessed={3} files={4}' - tmpLog.debug(tmpStr.format(newStatus, monStatus, diagMessage, - workSpec.is_post_processed(), - str(filesToStageOut))) + oldStatus = tmpOut["oldStatus"] + newStatus = tmpOut["newStatus"] + monStatus = tmpOut["monStatus"] + diagMessage = tmpOut["diagMessage"] + workAttributes = tmpOut["workAttributes"] + eventsToUpdate = tmpOut["eventsToUpdate"] + filesToStageOut = tmpOut["filesToStageOut"] + eventsRequestParams = tmpOut["eventsRequestParams"] + nJobsToReFill = tmpOut["nJobsToReFill"] + pandaIDs = tmpOut["pandaIDs"] + isChecked = tmpOut["isChecked"] + tmpStr = "newStatus={0} monitoredStatus={1} diag={2} " + tmpStr += "postProcessed={3} files={4}" + tmpLog.debug(tmpStr.format(newStatus, monStatus, diagMessage, workSpec.is_post_processed(), str(filesToStageOut))) iWorker += 1 # check status if newStatus not in WorkSpec.ST_LIST: - tmpLog.error('unknown status={0}'.format(newStatus)) + tmpLog.error("unknown status={0}".format(newStatus)) return # update worker workSpec.set_status(newStatus) @@ -421,7 +410,7 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False if not workSpec.has_pilot_error() and workSpec.errorCode is None: workSpec.set_pilot_error(PilotErrors.PANDAKILL, diagMessage) if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]: - workSpec.set_work_params({'finalMonStatus': monStatus}) + workSpec.set_work_params({"finalMonStatus": monStatus}) # request events if eventsRequestParams != {}: workSpec.eventsRequest = WorkSpec.EV_requestEvents @@ -431,10 +420,7 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False workSpec.nJobsToReFill = nJobsToReFill # get associated jobs for the worker chunk if workSpec.hasJob == 1 and jobSpecs is None: - jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, - None, - only_running=True, - slim=True) + jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, only_running=True, slim=True) # pandaIDs for push pandaIDsList.append(pandaIDs) if len(eventsToUpdate) > 0: @@ -443,8 +429,9 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False filesToStageOutList[workSpec.workerID] = filesToStageOut # apfmon status update if newStatus != oldStatus: - tmpQueLog.debug('newStatus: {0} monStatus: {1} oldStatus: {2} workSpecStatus: {3}'. 
- format(newStatus, monStatus, oldStatus, workSpec.status)) + tmpQueLog.debug( + "newStatus: {0} monStatus: {1} oldStatus: {2} workSpecStatus: {3}".format(newStatus, monStatus, oldStatus, workSpec.status) + ) self.apfmon.update_worker(workSpec, monStatus) # lock workers for fifo @@ -454,49 +441,44 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False for workSpec, isChecked in zip(workSpecs, isCheckedList): attrs = dict() if isChecked: - attrs['checkTime'] = workSpec.checkTime - workSpec.force_not_update('checkTime') + attrs["checkTime"] = workSpec.checkTime + workSpec.force_not_update("checkTime") if workSpec.has_updated_attributes(): - attrs['lockedBy'] = lockedBy + attrs["lockedBy"] = lockedBy workSpec.lockedBy = lockedBy - workSpec.force_not_update('lockedBy') + workSpec.force_not_update("lockedBy") else: - attrs['lockedBy'] = None + attrs["lockedBy"] = None worker_id_list[workSpec.workerID] = attrs - temRetLockWorker = self.dbProxy.lock_workers(worker_id_list, - harvester_config.monitor.lockInterval) + temRetLockWorker = self.dbProxy.lock_workers(worker_id_list, harvester_config.monitor.lockInterval) # skip if not locked if not temRetLockWorker: continue # update jobs and workers if jobSpecs is not None and len(jobSpecs) > 0: - tmpQueLog.debug('updating {0} jobs with {1} workers'.format(len(jobSpecs), len(workSpecs))) - core_utils.update_job_attributes_with_workers(mapType, jobSpecs, workSpecs, - filesToStageOutList, eventsToUpdateList) + tmpQueLog.debug("updating {0} jobs with {1} workers".format(len(jobSpecs), len(workSpecs))) + core_utils.update_job_attributes_with_workers(mapType, jobSpecs, workSpecs, filesToStageOutList, eventsToUpdateList) # update local database tmpRet = self.dbProxy.update_jobs_workers(jobSpecs, workSpecs, lockedBy, pandaIDsList) if not tmpRet: for workSpec in workSpecs: - tmpLog = self.make_logger(_logger, - 'id={0} workerID={1}'.format(lockedBy, workSpec.workerID), - method_name='run') + tmpLog = self.make_logger(_logger, "id={0} workerID={1}".format(lockedBy, workSpec.workerID), method_name="run") if from_fifo: - tmpLog.info('failed to update the DB. Maybe locked by other thread running with DB') + tmpLog.info("failed to update the DB. Maybe locked by other thread running with DB") else: if workSpec.status in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_missed]: - tmpLog.info('worker already in final status. Skipped') + tmpLog.info("worker already in final status. Skipped") else: - tmpLog.error('failed to update the DB. lockInterval may be too short') + tmpLog.error("failed to update the DB. 
lockInterval may be too short") else: if jobSpecs is not None: for jobSpec in jobSpecs: - tmpLog = self.make_logger(_logger, - 'id={0} PandaID={1}'.format(lockedBy, jobSpec.PandaID), - method_name='run') - tmpLog.debug('new status={0} subStatus={1} status_in_metadata={2}'.format( - jobSpec.status, - jobSpec.subStatus, - jobSpec.get_job_status_from_attributes())) + tmpLog = self.make_logger(_logger, "id={0} PandaID={1}".format(lockedBy, jobSpec.PandaID), method_name="run") + tmpLog.debug( + "new status={0} subStatus={1} status_in_metadata={2}".format( + jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes() + ) + ) # send ACK to workers for events and files if len(eventsToUpdateList) > 0 or len(filesToStageOutList) > 0: for workSpec in workSpecs: @@ -504,78 +486,93 @@ def monitor_agent_core(self, lockedBy, queueName, workSpecsList, from_fifo=False messenger.acknowledge_events_files(workSpec) except Exception: core_utils.dump_error_message(tmpQueLog) - tmpQueLog.error('failed to send ACK to workerID={0}'.format(workSpec.workerID)) + tmpQueLog.error("failed to send ACK to workerID={0}".format(workSpec.workerID)) # active workers for fifo if self.monitor_fifo.enabled and workSpecs: workSpec = workSpecs[0] tmpOut = tmpRetMap[workSpec.workerID] - newStatus = tmpOut['newStatus'] - monStatus = tmpOut['monStatus'] - if newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] \ - and workSpec.mapType != WorkSpec.MT_MultiWorkers \ - and workSpec.workAttributes is not None: + newStatus = tmpOut["newStatus"] + monStatus = tmpOut["monStatus"] + if ( + newStatus in [WorkSpec.ST_submitted, WorkSpec.ST_running, WorkSpec.ST_idle] + and workSpec.mapType != WorkSpec.MT_MultiWorkers + and workSpec.workAttributes is not None + ): timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() # get lastCheckAt - _bool, lastCheckAt = workSpec.get_work_params('lastCheckAt') + _bool, lastCheckAt = workSpec.get_work_params("lastCheckAt") try: last_check_period = timeNow_timestamp - lastCheckAt except TypeError: last_check_period = forceEnqueueInterval + 1.0 # get lastForceEnqueueAt - _bool, lastForceEnqueueAt = workSpec.get_work_params('lastForceEnqueueAt') + _bool, lastForceEnqueueAt = workSpec.get_work_params("lastForceEnqueueAt") if not (_bool and lastForceEnqueueAt is not None): lastForceEnqueueAt = 0 # notification - intolerable_delay = max(forceEnqueueInterval*2, harvester_config.monitor.checkInterval * 4) - if _bool and lastCheckAt is not None \ - and last_check_period > harvester_config.monitor.checkInterval \ - and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp: + intolerable_delay = max(forceEnqueueInterval * 2, harvester_config.monitor.checkInterval * 4) + if ( + _bool + and lastCheckAt is not None + and last_check_period > harvester_config.monitor.checkInterval + and timeNow_timestamp - harvester_config.monitor.checkInterval > self.startTimestamp + ): if last_check_period > intolerable_delay: - tmpQueLog.error('last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enquque worker by force. Please check why monitor checks worker slowly'.format( - workSpec.workerID, last_check_period)) + tmpQueLog.error( + "last check period of workerID={0} is {1} sec, intolerably longer than monitor checkInterval. Will NOT enquque worker by force. 
Please check why monitor checks worker slowly".format( + workSpec.workerID, last_check_period + ) + ) else: - tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor checkInterval'.format( - workSpec.workerID, last_check_period)) + tmpQueLog.warning( + "last check period of workerID={0} is {1} sec, longer than monitor checkInterval".format( + workSpec.workerID, last_check_period + ) + ) # prepartion to enqueue fifo - if (from_fifo) \ - or (not from_fifo - and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp - and last_check_period > forceEnqueueInterval - and last_check_period < intolerable_delay - and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval): + if (from_fifo) or ( + not from_fifo + and timeNow_timestamp - harvester_config.monitor.sleepTime > self.startTimestamp + and last_check_period > forceEnqueueInterval + and last_check_period < intolerable_delay + and timeNow_timestamp - lastForceEnqueueAt > 86400 + forceEnqueueInterval + ): if not from_fifo: # in DB cycle - tmpQueLog.warning('last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force'.format( - workSpec.workerID, last_check_period)) - workSpec.set_work_params({'lastForceEnqueueAt': timeNow_timestamp}) - workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) + tmpQueLog.warning( + "last check period of workerID={0} is {1} sec, longer than monitor forceEnqueueInterval. Enqueue the worker by force".format( + workSpec.workerID, last_check_period + ) + ) + workSpec.set_work_params({"lastForceEnqueueAt": timeNow_timestamp}) + workSpec.set_work_params({"lastCheckAt": timeNow_timestamp}) workSpec.lockedBy = None - workSpec.force_update('lockedBy') + workSpec.force_update("lockedBy") if monStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]: # for post-processing - _bool, startFifoPreemptAt = workSpec.get_work_params('startFifoPreemptAt') + _bool, startFifoPreemptAt = workSpec.get_work_params("startFifoPreemptAt") if not _bool or startFifoPreemptAt is None: startFifoPreemptAt = timeNow_timestamp - workSpec.set_work_params({'startFifoPreemptAt': startFifoPreemptAt}) - tmpQueLog.debug('workerID={0} , startFifoPreemptAt: {1}'.format(workSpec.workerID, startFifoPreemptAt)) + workSpec.set_work_params({"startFifoPreemptAt": startFifoPreemptAt}) + tmpQueLog.debug("workerID={0} , startFifoPreemptAt: {1}".format(workSpec.workerID, startFifoPreemptAt)) if timeNow_timestamp - startFifoPreemptAt < fifoMaxPreemptInterval: workSpecsToEnqueueToHead_dict[workSpec.workerID] = workSpecs else: - workSpec.set_work_params({'startFifoPreemptAt': timeNow_timestamp}) + workSpec.set_work_params({"startFifoPreemptAt": timeNow_timestamp}) workSpec.modificationTime = timeNow - workSpec.force_update('modificationTime') + workSpec.force_update("modificationTime") workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs else: workSpec.modificationTime = timeNow - workSpec.force_update('modificationTime') + workSpec.force_update("modificationTime") workSpecsToEnqueue_dict[workSpec.workerID] = workSpecs else: - tmpQueLog.error('failed to check workers') + tmpQueLog.error("failed to check workers") workSpecsToEnqueue = list(workSpecsToEnqueue_dict.values()) workSpecsToEnqueueToHead = list(workSpecsToEnqueueToHead_dict.values()) retVal = workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval - tmpQueLog.debug('done') + tmpQueLog.debug("done") return retVal # wrapper for 
checkWorkers @@ -603,10 +600,10 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, workAttributes = None filesToStageOut = [] nJobsToReFill = None - if workSpec.has_work_params('finalMonStatus'): + if workSpec.has_work_params("finalMonStatus"): # to post-process - _bool, finalMonStatus = workSpec.get_work_params('finalMonStatus') - _thing = (workSpec, (finalMonStatus, '')) + _bool, finalMonStatus = workSpec.get_work_params("finalMonStatus") + _thing = (workSpec, (finalMonStatus, "")) thingsToPostProcess.append(_thing) else: # job-level late binding @@ -627,127 +624,124 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, else: workersToCheck.append(workSpec) # add - retMap[workSpec.workerID] = {'oldStatus': workSpec.status, - 'newStatus': workStatus, - 'monStatus': workStatus, - 'workAttributes': workAttributes, - 'filesToStageOut': filesToStageOut, - 'eventsRequestParams': eventsRequestParams, - 'eventsToUpdate': eventsToUpdate, - 'diagMessage': '', - 'pandaIDs': pandaIDs, - 'nJobsToReFill': nJobsToReFill, - 'isChecked': True} + retMap[workSpec.workerID] = { + "oldStatus": workSpec.status, + "newStatus": workStatus, + "monStatus": workStatus, + "workAttributes": workAttributes, + "filesToStageOut": filesToStageOut, + "eventsRequestParams": eventsRequestParams, + "eventsToUpdate": eventsToUpdate, + "diagMessage": "", + "pandaIDs": pandaIDs, + "nJobsToReFill": nJobsToReFill, + "isChecked": True, + } # check workers - tmp_log.debug('checking workers with plugin') + tmp_log.debug("checking workers with plugin") try: if workersToCheck: tmpStat, tmpOut = mon_core.check_workers(workersToCheck) if not tmpStat: - tmp_log.error('failed to check workers with: {0}'.format(tmpOut)) + tmp_log.error("failed to check workers with: {0}".format(tmpOut)) workersToCheck = [] tmpOut = [] else: - tmp_log.debug('checked') + tmp_log.debug("checked") else: - tmp_log.debug('Nothing to be checked with plugin') + tmp_log.debug("Nothing to be checked with plugin") tmpOut = [] timeNow = datetime.datetime.utcnow() - for workSpec, (newStatus, diagMessage) in itertools.chain( - zip(workersToCheck, tmpOut), thingsToPostProcess): + for workSpec, (newStatus, diagMessage) in itertools.chain(zip(workersToCheck, tmpOut), thingsToPostProcess): workerID = workSpec.workerID - tmp_log.debug('Going to check workerID={0}'.format(workerID)) + tmp_log.debug("Going to check workerID={0}".format(workerID)) pandaIDs = [] if workerID in retMap: # failed to check status if newStatus is None: - tmp_log.warning('Failed to check workerID={0} with {1}'.format(workerID, diagMessage)) - retMap[workerID]['isChecked'] = False + tmp_log.warning("Failed to check workerID={0} with {1}".format(workerID, diagMessage)) + retMap[workerID]["isChecked"] = False # set status - if workSpec.checkTime is not None and checkTimeout is not None and \ - timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout): + if ( + workSpec.checkTime is not None + and checkTimeout is not None + and timeNow - workSpec.checkTime > datetime.timedelta(seconds=checkTimeout) + ): # kill due to timeout - tmp_log.debug('kill workerID={0} due to consecutive check failures'.format(workerID)) + tmp_log.debug("kill workerID={0} due to consecutive check failures".format(workerID)) self.dbProxy.mark_workers_to_kill_by_workerids([workSpec.workerID]) newStatus = WorkSpec.ST_cancelled - diagMessage = 'Killed by Harvester due to consecutive worker check failures. 
' + diagMessage + diagMessage = "Killed by Harvester due to consecutive worker check failures. " + diagMessage workSpec.set_pilot_error(PilotErrors.FAILEDBYSERVER, diagMessage) else: # use original status newStatus = workSpec.status # request kill if messenger.kill_requested(workSpec): - tmp_log.debug('kill workerID={0} as requested'.format(workerID)) + tmp_log.debug("kill workerID={0} as requested".format(workerID)) self.dbProxy.mark_workers_to_kill_by_workerids([workSpec.workerID]) # stuck queuing for too long - if workSpec.status == WorkSpec.ST_submitted \ - and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit): - tmp_log.debug('kill workerID={0} due to queuing longer than {1} seconds'.format( - workerID, workerQueueTimeLimit)) + if workSpec.status == WorkSpec.ST_submitted and timeNow > workSpec.submitTime + datetime.timedelta(seconds=workerQueueTimeLimit): + tmp_log.debug("kill workerID={0} due to queuing longer than {1} seconds".format(workerID, workerQueueTimeLimit)) self.dbProxy.mark_workers_to_kill_by_workerids([workSpec.workerID]) - diagMessage = 'Killed by Harvester due to worker queuing too long. ' + diagMessage + diagMessage = "Killed by Harvester due to worker queuing too long. " + diagMessage workSpec.set_pilot_error(PilotErrors.FAILEDBYSERVER, diagMessage) # set closed workSpec.set_pilot_closed() # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat - worker_heartbeat_limit = int(queue_config.messenger['worker_heartbeat']) + worker_heartbeat_limit = int(queue_config.messenger["worker_heartbeat"]) except (AttributeError, KeyError): worker_heartbeat_limit = None - tmp_log.debug( - 'workerID={0} heartbeat limit is configured to {1}'.format(workerID, - worker_heartbeat_limit)) + tmp_log.debug("workerID={0} heartbeat limit is configured to {1}".format(workerID, worker_heartbeat_limit)) if worker_heartbeat_limit: if messenger.is_alive(workSpec, worker_heartbeat_limit): - tmp_log.debug('heartbeat for workerID={0} is valid'.format(workerID)) + tmp_log.debug("heartbeat for workerID={0} is valid".format(workerID)) else: - tmp_log.debug('heartbeat for workerID={0} expired: sending kill request'.format(workerID)) + tmp_log.debug("heartbeat for workerID={0} expired: sending kill request".format(workerID)) self.dbProxy.mark_workers_to_kill_by_workerids([workSpec.workerID]) - diagMessage = 'Killed by Harvester due to worker heartbeat expired. ' + diagMessage + diagMessage = "Killed by Harvester due to worker heartbeat expired. 
" + diagMessage workSpec.set_pilot_error(PilotErrors.FAILEDBYSERVER, diagMessage) # get work attributes workAttributes = messenger.get_work_attributes(workSpec) - retMap[workerID]['workAttributes'] = workAttributes + retMap[workerID]["workAttributes"] = workAttributes # get output files filesToStageOut = messenger.get_files_to_stage_out(workSpec) - retMap[workerID]['filesToStageOut'] = filesToStageOut + retMap[workerID]["filesToStageOut"] = filesToStageOut # get events to update if workSpec.eventsRequest in [WorkSpec.EV_useEvents, WorkSpec.EV_requestEvents]: eventsToUpdate = messenger.events_to_update(workSpec) - retMap[workerID]['eventsToUpdate'] = eventsToUpdate + retMap[workerID]["eventsToUpdate"] = eventsToUpdate # request events if workSpec.eventsRequest == WorkSpec.EV_useEvents: eventsRequestParams = messenger.events_requested(workSpec) - retMap[workerID]['eventsRequestParams'] = eventsRequestParams + retMap[workerID]["eventsRequestParams"] = eventsRequestParams # get PandaIDs for pull model if workSpec.mapType == WorkSpec.MT_NoJob: pandaIDs = messenger.get_panda_ids(workSpec) - retMap[workerID]['pandaIDs'] = pandaIDs + retMap[workerID]["pandaIDs"] = pandaIDs # keep original new status - retMap[workerID]['monStatus'] = newStatus + retMap[workerID]["monStatus"] = newStatus # set running or idle while there are events to update or files to stage out if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled]: isOK = True - if len(retMap[workerID]['filesToStageOut']) > 0 or \ - len(retMap[workerID]['eventsToUpdate']) > 0: + if len(retMap[workerID]["filesToStageOut"]) > 0 or len(retMap[workerID]["eventsToUpdate"]) > 0: if workSpec.status == WorkSpec.ST_running: newStatus = WorkSpec.ST_running else: newStatus = WorkSpec.ST_idle elif not workSpec.is_post_processed(): - if (not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot) \ - or (hasattr(messenger, 'forcePostProcessing') and messenger.forcePostProcessing): + if (not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot) or ( + hasattr(messenger, "forcePostProcessing") and messenger.forcePostProcessing + ): # post processing unless heartbeat is suppressed - jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, - None, True, - only_running=True, - slim=True) + jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, True, only_running=True, slim=True) # post processing tmpStat = messenger.post_processing(workSpec, jobSpecs, workSpec.mapType) if tmpStat is None: # retry - ppTimeOut = getattr(harvester_config.monitor, 'postProcessTimeout', 0) + ppTimeOut = getattr(harvester_config.monitor, "postProcessTimeout", 0) if ppTimeOut > 0: timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=ppTimeOut) if workSpec.endTime is None or workSpec.endTime > timeLimit: @@ -763,10 +757,10 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, # reset modification time to immediately trigger subsequent lookup if isOK and not self.monitor_fifo.enabled: workSpec.trigger_next_lookup() - retMap[workerID]['newStatus'] = newStatus - retMap[workerID]['diagMessage'] = diagMessage + retMap[workerID]["newStatus"] = newStatus + retMap[workerID]["diagMessage"] = diagMessage else: - tmp_log.debug('workerID={0} not in retMap'.format(workerID)) + tmp_log.debug("workerID={0} not in retMap".format(workerID)) return True, retMap except Exception: core_utils.dump_error_message(tmp_log) @@ -774,71 +768,67 @@ def 
check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, # ask plugin for workers to update, get workspecs, and queue the event def monitor_event_deliverer(self, time_window): - tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_deliverer') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "id=monitor-{0}".format(self.get_pid()), method_name="monitor_event_deliverer") + tmpLog.debug("start") for mon_core in self.eventBasedMonCoreList: - tmpLog.debug('run with {0}'.format(mon_core.__class__.__name__)) + tmpLog.debug("run with {0}".format(mon_core.__class__.__name__)) worker_update_list = mon_core.report_updated_workers(time_window=time_window) for workerID, updateTimestamp in worker_update_list: retVal = self.monitor_event_fifo.putbyid(id=workerID, item=True, score=updateTimestamp) if not retVal: - retVal = self.monitor_event_fifo.update(id=workerID, score=updateTimestamp, temporary=0, cond_score='gt') + retVal = self.monitor_event_fifo.update(id=workerID, score=updateTimestamp, temporary=0, cond_score="gt") if retVal: - tmpLog.debug('updated event with workerID={0}'.format(workerID)) + tmpLog.debug("updated event with workerID={0}".format(workerID)) else: - tmpLog.debug('event with workerID={0} is updated. Skipped'.format(workerID)) + tmpLog.debug("event with workerID={0} is updated. Skipped".format(workerID)) else: - tmpLog.debug('put event with workerID={0}'.format(workerID)) - tmpLog.debug('done') + tmpLog.debug("put event with workerID={0}".format(workerID)) + tmpLog.debug("done") # get events and check workers def monitor_event_digester(self, locked_by, max_events): - tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_digester') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "id=monitor-{0}".format(self.get_pid()), method_name="monitor_event_digester") + tmpLog.debug("start") retMap = {} try: - obj_gotten_list = self.monitor_event_fifo.getmany(mode='first', count=max_events, protective=True) + obj_gotten_list = self.monitor_event_fifo.getmany(mode="first", count=max_events, protective=True) except Exception as e: obj_gotten_list = [] - tmpLog.error('monitor_event_fifo excepted with {0}'.format(e)) - workerID_list = [ obj_gotten.id for obj_gotten in obj_gotten_list ] - tmpLog.debug('got {0} worker events'.format(len(workerID_list))) + tmpLog.error("monitor_event_fifo excepted with {0}".format(e)) + workerID_list = [obj_gotten.id for obj_gotten in obj_gotten_list] + tmpLog.debug("got {0} worker events".format(len(workerID_list))) if len(workerID_list) > 0: updated_workers_dict = self.dbProxy.get_workers_from_ids(workerID_list) - tmpLog.debug('got workspecs for worker events') + tmpLog.debug("got workspecs for worker events") for queueName, _val in iteritems(updated_workers_dict): for configID, workSpecsList in iteritems(_val): qc_key = (queueName, configID) - tmpLog.debug('checking workers of queueName={0} configID={1}'.format(*qc_key)) + tmpLog.debug("checking workers of queueName={0} configID={1}".format(*qc_key)) try: - retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, - from_fifo=True, config_id=configID, - check_source='Event') + retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, from_fifo=True, config_id=configID, check_source="Event") except Exception as e: - tmpLog.error('monitor_agent_core excepted with {0}'.format(e)) + tmpLog.error("monitor_agent_core excepted with {0}".format(e)) retVal = 
None # skip the loop if retVal: retMap[qc_key] = retVal - tmpLog.debug('done') + tmpLog.debug("done") return retMap # remove outdated events def monitor_event_disposer(self, event_lifetime, max_events): - tmpLog = self.make_logger(_logger, 'id=monitor-{0}'.format(self.get_pid()), method_name='monitor_event_disposer') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "id=monitor-{0}".format(self.get_pid()), method_name="monitor_event_disposer") + tmpLog.debug("start") timeNow_timestamp = time.time() try: - obj_gotten_list = self.monitor_event_fifo.getmany(mode='first', - maxscore=(timeNow_timestamp-event_lifetime), - count=max_events, temporary=True) + obj_gotten_list = self.monitor_event_fifo.getmany(mode="first", maxscore=(timeNow_timestamp - event_lifetime), count=max_events, temporary=True) except Exception as e: obj_gotten_list = [] - tmpLog.error('monitor_event_fifo excepted with {0}'.format(e)) - tmpLog.debug('removed {0} events'.format(len(obj_gotten_list))) + tmpLog.error("monitor_event_fifo excepted with {0}".format(e)) + tmpLog.debug("removed {0} events".format(len(obj_gotten_list))) try: n_events = self.monitor_event_fifo.size() - tmpLog.debug('now {0} events in monitor-event fifo'.format(n_events)) + tmpLog.debug("now {0} events in monitor-event fifo".format(n_events)) except Exception as e: - tmpLog.error('failed to get size of monitor-event fifo: {0}'.format(e)) - tmpLog.debug('done') + tmpLog.error("failed to get size of monitor-event fifo: {0}".format(e)) + tmpLog.debug("done") diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 58386fa4..f785bdb1 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -10,7 +10,7 @@ from pandaharvester.harvestercore.file_spec import FileSpec # logger -_logger = core_utils.setup_logger('preparator') +_logger = core_utils.setup_logger("preparator") # class to prepare jobs @@ -23,14 +23,14 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): self.queueConfigMapper = queue_config_mapper self.pluginFactory = PluginFactory() - # main loop + def run(self): - lockedBy = 'preparator-{0}'.format(self.get_pid()) + lockedBy = "preparator-{0}".format(self.get_pid()) while True: sw = core_utils.get_stopwatch() - mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') - mainLog.debug('try to get jobs to check') + mainLog = self.make_logger(_logger, "id={0}".format(lockedBy), method_name="run") + mainLog.debug("try to get jobs to check") # get jobs to check preparation try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToCheck @@ -38,29 +38,30 @@ def run(self): maxFilesPerJob = None except Exception: maxFilesPerJob = None - jobsToCheck = self.dbProxy.get_jobs_in_sub_status('preparing', - harvester_config.preparator.maxJobsToCheck, - 'preparatorTime', 'lockedBy', - harvester_config.preparator.checkInterval, - harvester_config.preparator.lockInterval, - lockedBy, - max_files_per_job=maxFilesPerJob, - ng_file_status_list=['ready']) - mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) + jobsToCheck = self.dbProxy.get_jobs_in_sub_status( + "preparing", + harvester_config.preparator.maxJobsToCheck, + "preparatorTime", + "lockedBy", + harvester_config.preparator.checkInterval, + harvester_config.preparator.lockInterval, + lockedBy, + max_files_per_job=maxFilesPerJob, + ng_file_status_list=["ready"], + ) + mainLog.debug("got {0} jobs to check".format(len(jobsToCheck))) # 
loop over all jobs for jobSpec in jobsToCheck: - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('start checking') + tmpLog.debug("start checking") # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, jobSpec.configID) oldSubStatus = jobSpec.subStatus @@ -71,22 +72,21 @@ def run(self): preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator) if preparatorCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__)) # lock job again - lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "preparatorTime", "lockedBy", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue tmpStat, tmpStr = preparatorCore.check_stage_in_status(jobSpec) # still running if tmpStat is None: # update job jobSpec.lockedBy = None - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.debug('try to check later since still preparing with {0}'.format(tmpStr)) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.debug("try to check later since still preparing with {0}".format(tmpStr)) continue # succeeded if tmpStat is True: @@ -94,20 +94,20 @@ def run(self): tmpStat, tmpStr = preparatorCore.resolve_input_paths(jobSpec) if tmpStat is False: jobSpec.lockedBy = None - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.error('failed to resolve input file paths : {0}'.format(tmpStr)) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.error("failed to resolve input file paths : {0}".format(tmpStr)) continue # manipulate container-related job params jobSpec.manipulate_job_params_for_container() # update job jobSpec.lockedBy = None jobSpec.set_all_input_ready() - if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ - (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady]): + if (maxFilesPerJob is None and jobSpec.auxInput is None) or ( + len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inReady] + ): # all done allDone = True - jobSpec.subStatus = 'prepared' + jobSpec.subStatus = "prepared" jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allReady @@ -118,81 +118,78 @@ def run(self): # change auxInput flag to check auxiliary inputs if len(jobSpec.inFiles) == 0 and jobSpec.auxInput == JobSpec.AUX_allTriggered: jobSpec.auxInput = JobSpec.AUX_inReady - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}, - update_in_file=True) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, 
"subStatus": oldSubStatus}, update_in_file=True) if allDone: - tmpLog.debug('succeeded') + tmpLog.debug("succeeded") else: - tmpLog.debug('partially succeeded') + tmpLog.debug("partially succeeded") else: # update job - jobSpec.status = 'failed' - jobSpec.subStatus = 'failed_to_prepare' + jobSpec.status = "failed" + jobSpec.subStatus = "failed_to_prepare" jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() - errStr = 'stage-in failed with {0}'.format(tmpStr) + errStr = "stage-in failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) jobSpec.trigger_propagation() - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.error('failed with {0}'.format(tmpStr)) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.error("failed with {0}".format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger preparation - mainLog.debug('try to get jobs to prepare') + mainLog.debug("try to get jobs to prepare") try: maxFilesPerJob = harvester_config.preparator.maxFilesPerJobToPrepare if maxFilesPerJob <= 0: maxFilesPerJob = None except Exception: maxFilesPerJob = None - jobsToTrigger = self.dbProxy.get_jobs_in_sub_status('fetched', - harvester_config.preparator.maxJobsToTrigger, - 'preparatorTime', 'lockedBy', - harvester_config.preparator.triggerInterval, - harvester_config.preparator.lockInterval, - lockedBy, - 'preparing', - max_files_per_job=maxFilesPerJob, - ng_file_status_list=['triggered', - 'ready']) - mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) + jobsToTrigger = self.dbProxy.get_jobs_in_sub_status( + "fetched", + harvester_config.preparator.maxJobsToTrigger, + "preparatorTime", + "lockedBy", + harvester_config.preparator.triggerInterval, + harvester_config.preparator.lockInterval, + lockedBy, + "preparing", + max_files_per_job=maxFilesPerJob, + ng_file_status_list=["triggered", "ready"], + ) + mainLog.debug("got {0} jobs to prepare".format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() for jobSpec in jobsToTrigger: - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('try to trigger preparation') + tmpLog.debug("try to trigger preparation") # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) oldSubStatus = jobSpec.subStatus # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]: preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) - fileType = 'input' + fileType = "input" else: preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator) fileType = FileSpec.AUX_INPUT if preparatorCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__)) # lock job again - lockedAgain 
= self.dbProxy.lock_job_again(jobSpec.PandaID, 'preparatorTime', 'lockedBy', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "preparatorTime", "lockedBy", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue # check file status if queueConfig.ddmEndpointIn not in fileStatMap: @@ -200,46 +197,44 @@ def run(self): # check if has to_prepare hasToPrepare = False for fileSpec in jobSpec.inFiles: - if fileSpec.status == 'to_prepare': + if fileSpec.status == "to_prepare": hasToPrepare = True break newFileStatusData = [] toWait = False newInFiles = [] for fileSpec in jobSpec.inFiles: - if fileSpec.status in ['preparing', 'to_prepare']: + if fileSpec.status in ["preparing", "to_prepare"]: newInFiles.append(fileSpec) updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ - = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, - 'starting') - if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] = self.dbProxy.get_file_status( + fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, "starting" + ) + if "ready" in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready - fileSpec.status = 'ready' - if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path']: - fileSpec.path = list( - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path'])[0] + fileSpec.status = "ready" + if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]["ready"]["path"]: + fileSpec.path = list(fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]["ready"]["path"])[0] # set group info if any - groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, - queueConfig.ddmEndpointIn) + groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) if groupInfo is not None: - fileSpec.groupID = groupInfo['groupID'] - fileSpec.groupStatus = groupInfo['groupStatus'] - fileSpec.groupUpdateTime = groupInfo['groupUpdateTime'] + fileSpec.groupID = groupInfo["groupID"] + fileSpec.groupStatus = groupInfo["groupStatus"] + fileSpec.groupUpdateTime = groupInfo["groupUpdateTime"] updateStatus = True - elif (not hasToPrepare and - 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \ - 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: + elif (not hasToPrepare and "to_prepare" in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or "triggered" in fileStatMap[ + queueConfig.ddmEndpointIn + ][fileSpec.lfn]: # the file is being prepared by another toWait = True - if fileSpec.status != 'preparing': - fileSpec.status = 'preparing' + if fileSpec.status != "preparing": + fileSpec.status = "preparing" updateStatus = True else: # change file status if the file is not prepared by another - if fileSpec.status != 'to_prepare': - fileSpec.status = 'to_prepare' + if fileSpec.status != "to_prepare": + fileSpec.status = "to_prepare" updateStatus = True # set new status if updateStatus: @@ -251,9 +246,8 @@ def run(self): if toWait: # update job jobSpec.lockedBy = None - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.debug('wait since files are being prepared by another job') + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.debug("wait since files are 
being prepared by another job") continue # trigger preparation tmpStat, tmpStr = preparatorCore.trigger_preparation(jobSpec) @@ -261,11 +255,12 @@ def run(self): if tmpStat is True: # succeeded jobSpec.lockedBy = None - if (maxFilesPerJob is None and jobSpec.auxInput is None) or \ - (len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered]): + if (maxFilesPerJob is None and jobSpec.auxInput is None) or ( + len(jobSpec.inFiles) == 0 and jobSpec.auxInput in [None, JobSpec.AUX_inTriggered] + ): # all done allDone = True - jobSpec.subStatus = 'preparing' + jobSpec.subStatus = "preparing" jobSpec.preparatorTime = None if jobSpec.auxInput is not None: jobSpec.auxInput = JobSpec.AUX_allTriggered @@ -274,43 +269,39 @@ def run(self): # there could be more files to prepare allDone = False for fileSpec in jobSpec.inFiles: - if fileSpec.status == 'to_prepare': - fileSpec.status = 'triggered' + if fileSpec.status == "to_prepare": + fileSpec.status = "triggered" # immediate next lookup jobSpec.trigger_preparation() # change auxInput flag to prepare auxiliary inputs if len(jobSpec.inFiles) == 0 and jobSpec.auxInput == JobSpec.AUX_hasAuxInput: jobSpec.auxInput = JobSpec.AUX_inTriggered - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}, - update_in_file=True) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}, update_in_file=True) if allDone: - tmpLog.debug('triggered') + tmpLog.debug("triggered") else: - tmpLog.debug('partially triggered') + tmpLog.debug("partially triggered") elif tmpStat is False: # fatal error - jobSpec.status = 'failed' - jobSpec.subStatus = 'failed_to_prepare' + jobSpec.status = "failed" + jobSpec.subStatus = "failed_to_prepare" jobSpec.lockedBy = None jobSpec.preparatorTime = None jobSpec.stateChangeTime = datetime.datetime.utcnow() - errStr = 'stage-in failed with {0}'.format(tmpStr) + errStr = "stage-in failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEINFAILED, errStr) jobSpec.trigger_propagation() - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.debug('failed to trigger with {0}'.format(tmpStr)) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.debug("failed to trigger with {0}".format(tmpStr)) else: # temporary error jobSpec.lockedBy = None - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': oldSubStatus}) - tmpLog.debug('try to prepare later since {0}'.format(tmpStr)) + self.dbProxy.update_job(jobSpec, {"lockedBy": lockedBy, "subStatus": oldSubStatus}) + tmpLog.debug("try to prepare later since {0}".format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) - mainLog.debug('done' + sw.get_elapsed_time()) + mainLog.debug("done" + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.preparator.sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return diff --git a/pandaharvester/harvesterbody/propagator.py b/pandaharvester/harvesterbody/propagator.py index cca25093..d89b1001 100644 --- a/pandaharvester/harvesterbody/propagator.py +++ b/pandaharvester/harvesterbody/propagator.py @@ -9,12 +9,14 @@ from pandaharvester.harvestercore.pilot_errors import PilotErrors # logger -_logger = core_utils.setup_logger('propagator') +_logger = core_utils.setup_logger("propagator") STATS_PERIOD = 300 METRICS_PERIOD = 300 # propagate important checkpoints to panda + + class Propagator(AgentBase): # 
constructor def __init__(self, communicator, queue_config_mapper, single_mode=False): @@ -29,20 +31,19 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): def run(self): while True: sw_main = core_utils.get_stopwatch() - mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run') - mainLog.debug('getting jobs to propagate') + mainLog = self.make_logger(_logger, "id={0}".format(self.get_pid()), method_name="run") + mainLog.debug("getting jobs to propagate") sw = core_utils.get_stopwatch() - jobSpecs = self.dbProxy.get_jobs_to_propagate(harvester_config.propagator.maxJobs, - harvester_config.propagator.lockInterval, - harvester_config.propagator.updateInterval, - self.get_pid()) - mainLog.debug('got {0} jobs {1}'.format(len(jobSpecs), sw.get_elapsed_time())) + jobSpecs = self.dbProxy.get_jobs_to_propagate( + harvester_config.propagator.maxJobs, harvester_config.propagator.lockInterval, harvester_config.propagator.updateInterval, self.get_pid() + ) + mainLog.debug("got {0} jobs {1}".format(len(jobSpecs), sw.get_elapsed_time())) # update jobs in central database iJobs = 0 nJobs = harvester_config.propagator.nJobsInBulk hbSuppressMap = dict() while iJobs < len(jobSpecs): - jobList = jobSpecs[iJobs:iJobs + nJobs] + jobList = jobSpecs[iJobs : iJobs + nJobs] iJobs += nJobs # collect jobs to update or check jobListToSkip = [] @@ -51,135 +52,118 @@ def run(self): retList = [] for tmpJobSpec in jobList: if tmpJobSpec.computingSite not in hbSuppressMap: - queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite, - tmpJobSpec.configID) + queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite, tmpJobSpec.configID) if queueConfig: hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status() - else: # assume truepilot - hbSuppressMap[tmpJobSpec.computingSite] = ['running', 'transferring', 'finished', 'failed'] + else: # assume truepilot + hbSuppressMap[tmpJobSpec.computingSite] = ["running", "transferring", "finished", "failed"] # heartbeat is suppressed - if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \ - not tmpJobSpec.not_suppress_heartbeat(): + if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and not tmpJobSpec.not_suppress_heartbeat(): # check running job to detect lost heartbeat - if tmpJobSpec.status == 'running': + if tmpJobSpec.status == "running": jobListToCheck.append(tmpJobSpec) else: jobListToSkip.append(tmpJobSpec) - retList.append({'StatusCode': 0, 'command': None}) + retList.append({"StatusCode": 0, "command": None}) else: jobListToUpdate.append(tmpJobSpec) sw.reset() retList += self.communicator.check_jobs(jobListToCheck) - mainLog.debug('check_jobs for {0} jobs {1}'.format(len(jobListToCheck), sw.get_elapsed_time())) + mainLog.debug("check_jobs for {0} jobs {1}".format(len(jobListToCheck), sw.get_elapsed_time())) sw.reset() retList += self.communicator.update_jobs(jobListToUpdate, self.get_pid()) - mainLog.debug('update_jobs for {0} jobs took {1}'.format(len(jobListToUpdate), - sw.get_elapsed_time())) + mainLog.debug("update_jobs for {0} jobs took {1}".format(len(jobListToUpdate), sw.get_elapsed_time())) # logging - for tmpJobSpec, tmpRet in zip(jobListToSkip+jobListToCheck+jobListToUpdate, retList): - if tmpRet['StatusCode'] == 0: + for tmpJobSpec, tmpRet in zip(jobListToSkip + jobListToCheck + jobListToUpdate, retList): + if tmpRet["StatusCode"] == 0: if tmpJobSpec in jobListToUpdate: - mainLog.debug('updated PandaID={0} 
status={1}'.format(tmpJobSpec.PandaID, - tmpJobSpec.status)) + mainLog.debug("updated PandaID={0} status={1}".format(tmpJobSpec.PandaID, tmpJobSpec.status)) else: - mainLog.debug('skip updating PandaID={0} status={1}'.format(tmpJobSpec.PandaID, - tmpJobSpec.status)) + mainLog.debug("skip updating PandaID={0} status={1}".format(tmpJobSpec.PandaID, tmpJobSpec.status)) # release job tmpJobSpec.propagatorLock = None if tmpJobSpec.is_final_status() and tmpJobSpec.status == tmpJobSpec.get_status(): # unset to disable further updating tmpJobSpec.propagatorTime = None - tmpJobSpec.subStatus = 'done' + tmpJobSpec.subStatus = "done" tmpJobSpec.modificationTime = datetime.datetime.utcnow() elif tmpJobSpec.is_final_status() and not tmpJobSpec.all_events_done(): # trigger next propagation to update remaining events tmpJobSpec.trigger_propagation() else: # check event availability - if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \ - tmpJobSpec.subStatus != 'submitted': + if tmpJobSpec.status == "starting" and "eventService" in tmpJobSpec.jobParams and tmpJobSpec.subStatus != "submitted": tmpEvStat, tmpEvRet = self.communicator.check_event_availability(tmpJobSpec) if tmpEvStat: if tmpEvRet is not None: tmpJobSpec.nRemainingEvents = tmpEvRet if tmpEvRet == 0: - mainLog.debug('kill PandaID={0} due to no event'.format(tmpJobSpec.PandaID)) - tmpRet['command'] = 'tobekilled' + mainLog.debug("kill PandaID={0} due to no event".format(tmpJobSpec.PandaID)) + tmpRet["command"] = "tobekilled" # got kill command - if 'command' in tmpRet and tmpRet['command'] in ['tobekilled']: + if "command" in tmpRet and tmpRet["command"] in ["tobekilled"]: nWorkers = self.dbProxy.mark_workers_to_kill_by_pandaid(tmpJobSpec.PandaID) if nWorkers == 0: # no workers - tmpJobSpec.status = 'cancelled' - tmpJobSpec.subStatus = 'killed' - tmpJobSpec.set_pilot_error(PilotErrors.PANDAKILL, - PilotErrors.pilot_error_msg[PilotErrors.PANDAKILL]) + tmpJobSpec.status = "cancelled" + tmpJobSpec.subStatus = "killed" + tmpJobSpec.set_pilot_error(PilotErrors.PANDAKILL, PilotErrors.pilot_error_msg[PilotErrors.PANDAKILL]) tmpJobSpec.stateChangeTime = datetime.datetime.utcnow() tmpJobSpec.trigger_propagation() - self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.get_pid()}, - update_out_file=True) + self.dbProxy.update_job(tmpJobSpec, {"propagatorLock": self.get_pid()}, update_out_file=True) else: - mainLog.error('failed to update PandaID={0} status={1}'.format(tmpJobSpec.PandaID, - tmpJobSpec.status)) - mainLog.debug('getting workers to propagate') + mainLog.error("failed to update PandaID={0} status={1}".format(tmpJobSpec.PandaID, tmpJobSpec.status)) + mainLog.debug("getting workers to propagate") sw.reset() - workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers, - harvester_config.propagator.updateInterval) - mainLog.debug('got {0} workers {1}'.format(len(workSpecs), sw.get_elapsed_time())) + workSpecs = self.dbProxy.get_workers_to_propagate(harvester_config.propagator.maxWorkers, harvester_config.propagator.updateInterval) + mainLog.debug("got {0} workers {1}".format(len(workSpecs), sw.get_elapsed_time())) # update workers in central database sw.reset() iWorkers = 0 nWorkers = harvester_config.propagator.nWorkersInBulk while iWorkers < len(workSpecs): - workList = workSpecs[iWorkers:iWorkers + nWorkers] + workList = workSpecs[iWorkers : iWorkers + nWorkers] iWorkers += nWorkers retList, tmpErrStr = self.communicator.update_workers(workList) # logging if retList is None: 
- mainLog.error('failed to update workers with {0}'.format(tmpErrStr)) + mainLog.error("failed to update workers with {0}".format(tmpErrStr)) else: for tmpWorkSpec, tmpRet in zip(workList, retList): if tmpRet: - mainLog.debug('updated workerID={0} status={1}'.format(tmpWorkSpec.workerID, - tmpWorkSpec.status)) + mainLog.debug("updated workerID={0} status={1}".format(tmpWorkSpec.workerID, tmpWorkSpec.status)) # update logs - for logFilePath, logOffset, logSize, logRemoteName in \ - tmpWorkSpec.get_log_files_to_upload(): - with open(logFilePath, 'rb') as logFileObj: - tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj, - logOffset, logSize) + for logFilePath, logOffset, logSize, logRemoteName in tmpWorkSpec.get_log_files_to_upload(): + with open(logFilePath, "rb") as logFileObj: + tmpStat, tmpErr = self.communicator.upload_file(logRemoteName, logFileObj, logOffset, logSize) if tmpStat: - tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset+logSize) + tmpWorkSpec.update_log_files_to_upload(logFilePath, logOffset + logSize) # disable further update if tmpWorkSpec.is_final_status(): tmpWorkSpec.disable_propagation() - self.dbProxy.update_worker(tmpWorkSpec, {'workerID': tmpWorkSpec.workerID}) + self.dbProxy.update_worker(tmpWorkSpec, {"workerID": tmpWorkSpec.workerID}) else: - mainLog.error('failed to update workerID={0} status={1}'.format(tmpWorkSpec.workerID, - tmpWorkSpec.status)) - mainLog.debug('update_workers for {0} workers took {1}'.format(iWorkers, - sw.get_elapsed_time())) - mainLog.debug('getting commands') - commandSpecs = self.dbProxy.get_commands_for_receiver('propagator') - mainLog.debug('got {0} commands'.format(len(commandSpecs))) + mainLog.error("failed to update workerID={0} status={1}".format(tmpWorkSpec.workerID, tmpWorkSpec.status)) + mainLog.debug("update_workers for {0} workers took {1}".format(iWorkers, sw.get_elapsed_time())) + mainLog.debug("getting commands") + commandSpecs = self.dbProxy.get_commands_for_receiver("propagator") + mainLog.debug("got {0} commands".format(len(commandSpecs))) for commandSpec in commandSpecs: if commandSpec.command.startswith(CommandSpec.COM_reportWorkerStats): # get worker stats - siteName = commandSpec.command.split(':')[-1] + siteName = commandSpec.command.split(":")[-1] workerStats = self.dbProxy.get_worker_stats(siteName) if len(workerStats) == 0: - mainLog.error('failed to get worker stats for {0}'.format(siteName)) + mainLog.error("failed to get worker stats for {0}".format(siteName)) else: # report worker stats tmpRet, tmpStr = self.communicator.update_worker_stats(siteName, workerStats) if tmpRet: - mainLog.debug('updated worker stats (command) for {0}'.format(siteName)) + mainLog.debug("updated worker stats (command) for {0}".format(siteName)) else: - mainLog.error('failed to update worker stats (command) for {0} err={1}'.format(siteName, - tmpStr)) + mainLog.error("failed to update worker stats (command) for {0} err={1}".format(siteName, tmpStr)) if not self._last_stats_update or time.time() - self._last_stats_update > STATS_PERIOD: - # get active UPS queues. 
PanDA server needs to know about them and which harvester instance is taking # care of them active_ups_queues = self.queueConfigMapper.get_active_ups_queues() @@ -187,57 +171,53 @@ def run(self): # update worker stats for all sites worker_stats_bulk = self.dbProxy.get_worker_stats_bulk(active_ups_queues) if not worker_stats_bulk: - mainLog.error('failed to get worker stats in bulk') + mainLog.error("failed to get worker stats in bulk") else: for site_name in worker_stats_bulk: - tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name, - worker_stats_bulk[site_name]) + tmp_ret, tmp_str = self.communicator.update_worker_stats(site_name, worker_stats_bulk[site_name]) if tmp_ret: - mainLog.debug('update of worker stats (bulk) for {0}'.format(site_name)) + mainLog.debug("update of worker stats (bulk) for {0}".format(site_name)) self._last_stats_update = time.time() else: - mainLog.error('failed to update worker stats (bulk) for {0} err={1}'.format(site_name, - tmp_str)) + mainLog.error("failed to update worker stats (bulk) for {0} err={1}".format(site_name, tmp_str)) - if not self._last_metrics_update \ - or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD): + if not self._last_metrics_update or datetime.datetime.utcnow() - self._last_metrics_update > datetime.timedelta(seconds=METRICS_PERIOD): # get latest metrics from DB service_metrics_list = self.dbProxy.get_service_metrics(self._last_metrics_update) if not service_metrics_list: if self._last_metrics_update: - mainLog.error('failed to get service metrics') + mainLog.error("failed to get service metrics") self._last_metrics_update = datetime.datetime.utcnow() else: tmp_ret, tmp_str = self.communicator.update_service_metrics(service_metrics_list) if tmp_ret: - mainLog.debug('update of service metrics OK') + mainLog.debug("update of service metrics OK") self._last_metrics_update = datetime.datetime.utcnow() else: - mainLog.error('failed to update service metrics err={0}'.format(tmp_str)) + mainLog.error("failed to update service metrics err={0}".format(tmp_str)) # send dialog messages - mainLog.debug('getting dialog messages to propagate') + mainLog.debug("getting dialog messages to propagate") try: maxDialogs = harvester_config.propagator.maxDialogs except Exception: maxDialogs = 50 - diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs, - harvester_config.propagator.lockInterval) - mainLog.debug('got {0} dialogs'.format(len(diagSpecs))) + diagSpecs = self.dbProxy.get_dialog_messages_to_send(maxDialogs, harvester_config.propagator.lockInterval) + mainLog.debug("got {0} dialogs".format(len(diagSpecs))) if len(diagSpecs) > 0: tmpStat, tmpStr = self.communicator.send_dialog_messages(diagSpecs) if tmpStat: diagIDs = [diagSpec.diagID for diagSpec in diagSpecs] self.dbProxy.delete_dialog_messages(diagIDs) - mainLog.debug('sent {0} dialogs'.format(len(diagSpecs))) + mainLog.debug("sent {0} dialogs".format(len(diagSpecs))) else: - mainLog.error('failed to send dialogs err={0}'.format(tmpStr)) + mainLog.error("failed to send dialogs err={0}".format(tmpStr)) if sw_main.get_elapsed_time_in_sec() > harvester_config.propagator.lockInterval: - mainLog.warning('a single cycle was longer than lockInterval. done' + sw_main.get_elapsed_time()) + mainLog.warning("a single cycle was longer than lockInterval. 
done" + sw_main.get_elapsed_time()) else: - mainLog.debug('done' + sw_main.get_elapsed_time()) + mainLog.debug("done" + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.propagator.sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return diff --git a/pandaharvester/harvesterbody/service_monitor.py b/pandaharvester/harvesterbody/service_monitor.py index 22aa2c0c..d2d7c3b9 100644 --- a/pandaharvester/harvesterbody/service_monitor.py +++ b/pandaharvester/harvesterbody/service_monitor.py @@ -2,6 +2,7 @@ import psutil import re import multiprocessing + try: import subprocess32 as subprocess except Exception: @@ -16,7 +17,7 @@ from pandaharvester.harvesterbody.cred_manager import CredManager # logger -_logger = core_utils.setup_logger('service_monitor') +_logger = core_utils.setup_logger("service_monitor") def round_floats(value): @@ -53,7 +54,7 @@ def get_master_pid(self): :return: """ try: - fh = open(self.pid_file, 'r') + fh = open(self.pid_file, "r") pid = int(fh.readline()) fh.close() except Exception: @@ -63,7 +64,6 @@ def get_master_pid(self): return pid def refresh_children_list(self, children): - children_refreshed = [] for child_current in children: @@ -102,11 +102,11 @@ def get_memory_n_cpu(self): cpu_pc += child.cpu_percent() # convert bytes to MiB - rss_mib = rss / float(2 ** 20) + rss_mib = rss / float(2**20) # normalize cpu percentage by cpu count cpu_pc = cpu_pc * 1.0 / self.cpu_count except Exception: - _logger.error('Excepted with: {0}'.format(traceback.format_exc())) + _logger.error("Excepted with: {0}".format(traceback.format_exc())) rss_mib = None memory_pc = None cpu_pc = None @@ -119,7 +119,7 @@ def volume_use(self, volume_name): tmp_array = command.split() output = subprocess.Popen(tmp_array, stdout=subprocess.PIPE).communicate()[0].decode("utf-8") - for line in output.split('\n'): + for line in output.split("\n"): if re.search(volume_name, line): used_amount = re.search(r"(\d+)\%", line).group(1) @@ -127,7 +127,7 @@ def volume_use(self, volume_name): used_amount_float = float(used_amount) except ValueError: used_amount_float = None - _logger.error('Could not convert used amount {0} to float for volume {1}'.format(used_amount, volume_name)) + _logger.error("Could not convert used amount {0} to float for volume {1}".format(used_amount, volume_name)) return used_amount_float @@ -136,41 +136,40 @@ def cert_validities(self): cert_validities = self.cred_manager.execute_monit() return cert_validities except Exception: - _logger.error('Could not extract ') + _logger.error("Could not extract ") return {} # main loop def run(self): while True: - _logger.debug('Running service monitor') + _logger.debug("Running service monitor") service_metrics = {} # get memory usage rss_mib, memory_pc, cpu_pc = self.get_memory_n_cpu() - service_metrics['rss_mib'] = round_floats(rss_mib) - service_metrics['memory_pc'] = round_floats(memory_pc) - service_metrics['cpu_pc'] = round_floats(cpu_pc) - _logger.debug('Memory usage: {0} MiB/{1}%, CPU usage: {2}'.format(service_metrics['rss_mib'], - service_metrics['memory_pc'], - service_metrics['cpu_pc'])) + service_metrics["rss_mib"] = round_floats(rss_mib) + service_metrics["memory_pc"] = round_floats(memory_pc) + service_metrics["cpu_pc"] = round_floats(cpu_pc) + _logger.debug( + "Memory usage: {0} MiB/{1}%, CPU usage: {2}".format(service_metrics["rss_mib"], service_metrics["memory_pc"], service_metrics["cpu_pc"]) + ) # get volume usage try: - volumes = 
harvester_config.service_monitor.disk_volumes.split(',') + volumes = harvester_config.service_monitor.disk_volumes.split(",") except Exception: volumes = [] for volume in volumes: volume_use = self.volume_use(volume) - service_metrics['volume_{0}_pc'.format(volume)] = round_floats(volume_use) - _logger.debug('Disk usage of {0}: {1} %'.format(volume, - service_metrics['volume_{0}_pc'.format(volume)])) + service_metrics["volume_{0}_pc".format(volume)] = round_floats(volume_use) + _logger.debug("Disk usage of {0}: {1} %".format(volume, service_metrics["volume_{0}_pc".format(volume)])) # get certificate lifetimes. Not all plugins have implemented it - _logger.debug('Getting cert lifetimes') - service_metrics['cert_lifetime'] = {cert: round(value) for (cert, value) in self.cert_validities().items()} + _logger.debug("Getting cert lifetimes") + service_metrics["cert_lifetime"] = {cert: round(value) for (cert, value) in self.cert_validities().items()} - _logger.debug('Got cert validities: {0}'.format(service_metrics['cert_lifetime'])) + _logger.debug("Got cert validities: {0}".format(service_metrics["cert_lifetime"])) service_metrics_spec = ServiceMetricSpec(service_metrics) self.db_proxy.insert_service_metrics(service_metrics_spec) diff --git a/pandaharvester/harvesterbody/stager.py b/pandaharvester/harvesterbody/stager.py index c333c480..7e2351dd 100644 --- a/pandaharvester/harvesterbody/stager.py +++ b/pandaharvester/harvesterbody/stager.py @@ -7,7 +7,7 @@ from pandaharvester.harvestercore.pilot_errors import PilotErrors # logger -_logger = core_utils.setup_logger('stager') +_logger = core_utils.setup_logger("stager") # class for stage-out @@ -21,49 +21,50 @@ def __init__(self, queue_config_mapper, single_mode=False): # main loop def run(self): - lockedBy = 'stager-{0}'.format(self.get_pid()) + lockedBy = "stager-{0}".format(self.get_pid()) while True: sw = core_utils.get_stopwatch() - mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') - mainLog.debug('try to get jobs to check') + mainLog = self.make_logger(_logger, "id={0}".format(lockedBy), method_name="run") + mainLog.debug("try to get jobs to check") # get jobs to check preparation try: maxFilesPerJob = harvester_config.stager.maxFilesPerJobToCheck except Exception: maxFilesPerJob = None - jobsToCheck = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToCheck, - harvester_config.stager.checkInterval, - harvester_config.stager.lockInterval, - lockedBy, 'transferring', - JobSpec.HO_hasTransfer, - max_files_per_job=maxFilesPerJob) - mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) + jobsToCheck = self.dbProxy.get_jobs_for_stage_out( + harvester_config.stager.maxJobsToCheck, + harvester_config.stager.checkInterval, + harvester_config.stager.lockInterval, + lockedBy, + "transferring", + JobSpec.HO_hasTransfer, + max_files_per_job=maxFilesPerJob, + ) + mainLog.debug("got {0} jobs to check".format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('start checking') + tmpLog.debug("start checking") # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - 
configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue # lock job again - lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "stagerTime", "stagerLock", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue tmpLog.debug("plugin={0}".format(stagerCore.__class__.__name__)) tmpStat, tmpStr = stagerCore.check_stage_out_status(jobSpec) @@ -71,22 +72,22 @@ def run(self): if tmpStat is True: # succeeded newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('succeeded new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("succeeded new subStatus={0}".format(newSubStatus)) elif tmpStat is False: # fatal error - tmpLog.debug('fatal error when checking status with {0}'.format(tmpStr)) + tmpLog.debug("fatal error when checking status with {0}".format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: - if fileSpec.status != 'finished': - fileSpec.status = 'failed' - errStr = 'stage-out failed with {0}'.format(tmpStr) + if fileSpec.status != "finished": + fileSpec.status = "failed" + errStr = "stage-out failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("updated new subStatus={0}".format(newSubStatus)) else: # on-going - tmpLog.debug('try to check later since {0}'.format(tmpStr)) + tmpLog.debug("try to check later since {0}".format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to trigger stage-out @@ -94,40 +95,41 @@ def run(self): maxFilesPerJob = harvester_config.stager.maxFilesPerJobToTrigger except Exception: maxFilesPerJob = None - jobsToTrigger = self.dbProxy.get_jobs_for_stage_out(harvester_config.stager.maxJobsToTrigger, - harvester_config.stager.triggerInterval, - harvester_config.stager.lockInterval, - lockedBy, 'to_transfer', - JobSpec.HO_hasOutput, - [JobSpec.HO_hasZipOutput, JobSpec.HO_hasPostZipOutput], - max_files_per_job=maxFilesPerJob) - mainLog.debug('got {0} jobs to trigger'.format(len(jobsToTrigger))) + jobsToTrigger = self.dbProxy.get_jobs_for_stage_out( + harvester_config.stager.maxJobsToTrigger, + harvester_config.stager.triggerInterval, + harvester_config.stager.lockInterval, + lockedBy, + "to_transfer", + JobSpec.HO_hasOutput, + [JobSpec.HO_hasZipOutput, JobSpec.HO_hasPostZipOutput], + max_files_per_job=maxFilesPerJob, + ) + mainLog.debug("got {0} jobs to trigger".format(len(jobsToTrigger))) # loop over all jobs for jobSpec in jobsToTrigger: - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('try to trigger stage-out') + tmpLog.debug("try to trigger stage-out") # configID configID = jobSpec.configID if not 
core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin stagerCore = self.pluginFactory.get_plugin(queueConfig.stager) if stagerCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue # lock job again - lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "stagerTime", "stagerLock", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue # trigger stage-out tmpLog.debug("plugin={0}".format(stagerCore.__class__.__name__)) @@ -138,26 +140,26 @@ def run(self): jobSpec.trigger_stage_out() jobSpec.all_files_triggered_to_stage_out() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('triggered new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("triggered new subStatus={0}".format(newSubStatus)) elif tmpStat is False: # fatal error - tmpLog.debug('fatal error to trigger with {0}'.format(tmpStr)) + tmpLog.debug("fatal error to trigger with {0}".format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: - if fileSpec.status != 'finished': - fileSpec.status = 'failed' - errStr = 'stage-out failed with {0}'.format(tmpStr) + if fileSpec.status != "finished": + fileSpec.status = "failed" + errStr = "stage-out failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("updated new subStatus={0}".format(newSubStatus)) else: # temporary error - tmpLog.debug('try to trigger later since {0}'.format(tmpStr)) + tmpLog.debug("try to trigger later since {0}".format(tmpStr)) except Exception: core_utils.dump_error_message(tmpLog) # get jobs to zip output - if hasattr(harvester_config, 'zipper'): + if hasattr(harvester_config, "zipper"): pluginConf = harvester_config.zipper else: pluginConf = harvester_config.stager @@ -173,43 +175,44 @@ def run(self): usePostZipping = pluginConf.usePostZipping except Exception: usePostZipping = False - jobsToZip = self.dbProxy.get_jobs_for_stage_out(pluginConf.maxJobsToZip, - zipInterval, - pluginConf.lockInterval, - lockedBy, 'to_transfer', - JobSpec.HO_hasZipOutput, - [JobSpec.HO_hasOutput, JobSpec.HO_hasPostZipOutput], - max_files_per_job=maxFilesPerJob) - mainLog.debug('got {0} jobs to zip'.format(len(jobsToZip))) + jobsToZip = self.dbProxy.get_jobs_for_stage_out( + pluginConf.maxJobsToZip, + zipInterval, + pluginConf.lockInterval, + lockedBy, + "to_transfer", + JobSpec.HO_hasZipOutput, + [JobSpec.HO_hasOutput, JobSpec.HO_hasPostZipOutput], + max_files_per_job=maxFilesPerJob, + ) + mainLog.debug("got {0} jobs to zip".format(len(jobsToZip))) # loop over all jobs for jobSpec in jobsToZip: - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, 
"PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('try to zip output') + tmpLog.debug("try to zip output") # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin - if hasattr(queueConfig, 'zipper'): + if hasattr(queueConfig, "zipper"): zipperCore = self.pluginFactory.get_plugin(queueConfig.zipper) else: zipperCore = self.pluginFactory.get_plugin(queueConfig.stager) if zipperCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue # lock job again - lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', 'stagerLock', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "stagerTime", "stagerLock", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue # zipping tmpLog.debug("plugin={0}".format(zipperCore.__class__.__name__)) @@ -224,64 +227,64 @@ def run(self): jobSpec.all_files_zipped(usePostZipping) newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy) if usePostZipping: - tmpLog.debug('async zipped new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("async zipped new subStatus={0}".format(newSubStatus)) else: - tmpLog.debug('zipped new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("zipped new subStatus={0}".format(newSubStatus)) elif tmpStat is None: - tmpLog.debug('try later since {0}'.format(tmpStr)) + tmpLog.debug("try later since {0}".format(tmpStr)) else: # failed - tmpLog.debug('fatal error to zip with {0}'.format(tmpStr)) + tmpLog.debug("fatal error to zip with {0}".format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: - if fileSpec.status == 'zipping': - fileSpec.status = 'failed' - errStr = 'zip-output failed with {0}'.format(tmpStr) + if fileSpec.status == "zipping": + fileSpec.status = "failed" + errStr = "zip-output failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("updated new subStatus={0}".format(newSubStatus)) except Exception: core_utils.dump_error_message(tmpLog) if usePostZipping: - jobsToPostZip = self.dbProxy.get_jobs_for_stage_out(pluginConf.maxJobsToZip, - zipInterval, - pluginConf.lockInterval, - lockedBy, 'to_transfer', - JobSpec.HO_hasPostZipOutput, - [JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput], - max_files_per_job=maxFilesPerJob) - mainLog.debug('got {0} jobs to post-zip'.format(len(jobsToPostZip))) + jobsToPostZip = self.dbProxy.get_jobs_for_stage_out( + pluginConf.maxJobsToZip, + zipInterval, + pluginConf.lockInterval, + lockedBy, + "to_transfer", + JobSpec.HO_hasPostZipOutput, + [JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput], + max_files_per_job=maxFilesPerJob, + ) + mainLog.debug("got {0} jobs to post-zip".format(len(jobsToPostZip))) # loop over all jobs for jobSpec in jobsToPostZip: - tmpLog = 
self.make_logger(_logger, 'PandaID={0}'.format(jobSpec.PandaID), - method_name='run') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobSpec.PandaID), method_name="run") try: - tmpLog.debug('try to post-zip output') + tmpLog.debug("try to post-zip output") # configID configID = jobSpec.configID if not core_utils.dynamic_plugin_change(): configID = None # get queue if not self.queueConfigMapper.has_queue(jobSpec.computingSite, configID): - tmpLog.error('queue config for {0}/{1} not found'.format(jobSpec.computingSite, - configID)) + tmpLog.error("queue config for {0}/{1} not found".format(jobSpec.computingSite, configID)) continue queueConfig = self.queueConfigMapper.get_queue(jobSpec.computingSite, configID) # get plugin - if hasattr(queueConfig, 'zipper'): + if hasattr(queueConfig, "zipper"): zipperCore = self.pluginFactory.get_plugin(queueConfig.zipper) else: zipperCore = self.pluginFactory.get_plugin(queueConfig.stager) if zipperCore is None: # not found - tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) + tmpLog.error("plugin for {0} not found".format(jobSpec.computingSite)) continue # lock job again - lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, 'stagerTime', - 'stagerLock', lockedBy) + lockedAgain = self.dbProxy.lock_job_again(jobSpec.PandaID, "stagerTime", "stagerLock", lockedBy) if not lockedAgain: - tmpLog.debug('skip since locked by another thread') + tmpLog.debug("skip since locked by another thread") continue # post-zipping tmpLog.debug("plugin={0}".format(zipperCore.__class__.__name__)) @@ -292,27 +295,27 @@ def run(self): jobSpec.trigger_stage_out() jobSpec.all_files_zipped() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, False, lockedBy) - tmpLog.debug('post-zipped new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("post-zipped new subStatus={0}".format(newSubStatus)) elif tmpStat is None: # pending - tmpLog.debug('try to post-zip later since {0}'.format(tmpStr)) + tmpLog.debug("try to post-zip later since {0}".format(tmpStr)) else: # fatal error - tmpLog.debug('fatal error to post-zip since {0}'.format(tmpStr)) + tmpLog.debug("fatal error to post-zip since {0}".format(tmpStr)) # update job for fileSpec in jobSpec.outFiles: - if fileSpec.status == 'post_zipping': - fileSpec.status = 'failed' - errStr = 'post-zipping failed with {0}'.format(tmpStr) + if fileSpec.status == "post_zipping": + fileSpec.status = "failed" + errStr = "post-zipping failed with {0}".format(tmpStr) jobSpec.set_pilot_error(PilotErrors.STAGEOUTFAILED, errStr) jobSpec.trigger_propagation() newSubStatus = self.dbProxy.update_job_for_stage_out(jobSpec, True, lockedBy) - tmpLog.debug('updated new subStatus={0}'.format(newSubStatus)) + tmpLog.debug("updated new subStatus={0}".format(newSubStatus)) except Exception: core_utils.dump_error_message(tmpLog) - mainLog.debug('done' + sw.get_elapsed_time()) + mainLog.debug("done" + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.stager.sleepTime): - mainLog.debug('terminated') + mainLog.debug("terminated") return diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index e8e68e0b..6b02c5be 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -18,7 +18,7 @@ from pandaharvester.harvestermisc.apfmon import Apfmon # logger -_logger = core_utils.setup_logger('submitter') +_logger = core_utils.setup_logger("submitter") # class to submit workers @@ -36,28 +36,30 @@ def 
__init__(self, queue_config_mapper, single_mode=False): # main loop def run(self): - locked_by = 'submitter-{0}'.format(self.get_pid()) + locked_by = "submitter-{0}".format(self.get_pid()) monitor_fifo = self.monitor_fifo - queue_lock_interval = getattr(harvester_config.submitter, 'queueLockInterval', - harvester_config.submitter.lockInterval) + queue_lock_interval = getattr(harvester_config.submitter, "queueLockInterval", harvester_config.submitter.lockInterval) while True: sw_main = core_utils.get_stopwatch() - main_log = self.make_logger(_logger, 'id={0}'.format(locked_by), method_name='run') - main_log.debug('getting queues to submit workers') + main_log = self.make_logger(_logger, "id={0}".format(locked_by), method_name="run") + main_log.debug("getting queues to submit workers") # get queues associated to a site to submit workers - current_workers, site_name, res_map = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, - harvester_config.submitter.lookupTime, - harvester_config.submitter.lockInterval, - locked_by, queue_lock_interval) + current_workers, site_name, res_map = self.dbProxy.get_queues_to_submit( + harvester_config.submitter.nQueues, + harvester_config.submitter.lookupTime, + harvester_config.submitter.lockInterval, + locked_by, + queue_lock_interval, + ) submitted = False if site_name is not None: - main_log.debug('got {0} queues for site {1}'.format(len(current_workers), site_name)) + main_log.debug("got {0} queues for site {1}".format(len(current_workers), site_name)) # get commands from panda server - com_str = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, site_name) - command_specs = self.dbProxy.get_commands_for_receiver('submitter', com_str) - main_log.debug('got {0} {1} commands'.format(len(command_specs), com_str)) + com_str = "{0}:{1}".format(CommandSpec.COM_setNWorkers, site_name) + command_specs = self.dbProxy.get_commands_for_receiver("submitter", com_str) + main_log.debug("got {0} {1} commands".format(len(command_specs), com_str)) for command_spec in command_specs: new_limits = self.dbProxy.set_queue_limit(site_name, command_spec.params) for tmp_job_type, tmp_jt_vals in iteritems(new_limits): @@ -67,7 +69,7 @@ def run(self): if tmp_resource_type in res_map[tmp_job_type]: tmp_queue_name = res_map[tmp_job_type][tmp_resource_type] if tmp_queue_name in current_workers: - current_workers[tmp_queue_name][tmp_job_type][tmp_resource_type]['nNewWorkers'] = tmp_new_val + current_workers[tmp_queue_name][tmp_job_type][tmp_resource_type]["nNewWorkers"] = tmp_new_val # define number of new workers if len(current_workers) == 0: @@ -76,7 +78,7 @@ def run(self): n_workers_per_queue_jt_rt = self.workerAdjuster.define_num_workers(current_workers, site_name) if n_workers_per_queue_jt_rt is None: - main_log.error('WorkerAdjuster failed to define the number of workers') + main_log.error("WorkerAdjuster failed to define the number of workers") elif len(n_workers_per_queue_jt_rt) == 0: pass else: @@ -85,52 +87,48 @@ def run(self): for job_type in n_workers_per_queue_jt_rt[queue_name]: for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type] - tmp_log = self.make_logger(_logger, 'id={0} queue={1} jtype={2} rtype={3}'.format( - locked_by, queue_name, job_type, resource_type), method_name='run') + tmp_log = self.make_logger( + _logger, "id={0} queue={1} jtype={2} rtype={3}".format(locked_by, queue_name, job_type, resource_type), method_name="run" + ) try: - tmp_log.debug('start') - 
tmp_log.debug('workers status: %s' % tmp_val) - nWorkers = tmp_val['nNewWorkers'] + tmp_val['nReady'] - nReady = tmp_val['nReady'] + tmp_log.debug("start") + tmp_log.debug("workers status: %s" % tmp_val) + nWorkers = tmp_val["nNewWorkers"] + tmp_val["nReady"] + nReady = tmp_val["nReady"] # check queue if not self.queue_configMapper.has_queue(queue_name): - tmp_log.error('config not found') + tmp_log.error("config not found") continue # no new workers if nWorkers == 0: - tmp_log.debug('skipped since no new worker is needed based on current stats') + tmp_log.debug("skipped since no new worker is needed based on current stats") continue # get queue queue_config = self.queue_configMapper.get_queue(queue_name) workerMakerCore = self.workerMaker.get_plugin(queue_config) # check if resource is ready - if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: - numReadyResources = self.workerMaker.num_ready_resources(queue_config, - job_type, - resource_type, - workerMakerCore) - tmp_log.debug('numReadyResources: %s' % numReadyResources) + if hasattr(workerMakerCore, "dynamicSizing") and workerMakerCore.dynamicSizing is True: + numReadyResources = self.workerMaker.num_ready_resources(queue_config, job_type, resource_type, workerMakerCore) + tmp_log.debug("numReadyResources: %s" % numReadyResources) if not numReadyResources: - if hasattr(workerMakerCore, 'staticWorkers'): - nQRWorkers = tmp_val['nQueue'] + tmp_val['nRunning'] - tmp_log.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % - (workerMakerCore.staticWorkers, nQRWorkers)) + if hasattr(workerMakerCore, "staticWorkers"): + nQRWorkers = tmp_val["nQueue"] + tmp_val["nRunning"] + tmp_log.debug("staticWorkers: %s, nQRWorkers(Queue+Running): %s" % (workerMakerCore.staticWorkers, nQRWorkers)) if nQRWorkers >= workerMakerCore.staticWorkers: - tmp_log.debug('No left static workers, skip') + tmp_log.debug("No left static workers, skip") continue else: nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) - tmp_log.debug('staticWorkers: %s, nWorkers: %s' % - (workerMakerCore.staticWorkers, nWorkers)) + tmp_log.debug("staticWorkers: %s, nWorkers: %s" % (workerMakerCore.staticWorkers, nWorkers)) else: - tmp_log.debug('skip since no resources are ready') + tmp_log.debug("skip since no resources are ready") continue else: nWorkers = min(nWorkers, numReadyResources) # post action of worker maker - if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: + if hasattr(workerMakerCore, "skipOnFail") and workerMakerCore.skipOnFail is True: skipOnFail = True else: skipOnFail = False @@ -144,64 +142,74 @@ def run(self): # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queue_name, - nWorkers, nReady, 1, None, + nWorkers, + nReady, + 1, + None, queue_config.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, - locked_by) + locked_by, + ) elif queue_config.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs - nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queue_config, - nWorkers, - job_type, - resource_type, - maker=workerMakerCore) - tmp_log.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) + nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( + queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore + ) + tmp_log.debug("nJobsPerWorker={0}".format(nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queue_name, - nWorkers, nReady, 
nJobsPerWorker, None, + nWorkers, + nReady, + nJobsPerWorker, + None, queue_config.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, locked_by, - queue_config.allowJobMixture) + queue_config.allowJobMixture, + ) elif queue_config.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job - nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queue_config, - nWorkers, - job_type, - resource_type, - maker=workerMakerCore) + nWorkersPerJob = self.workerMaker.get_num_workers_per_job( + queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore + ) maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( - queue_config, job_type, resource_type, maker=workerMakerCore) + queue_config, job_type, resource_type, maker=workerMakerCore + ) maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( - queue_config, job_type, resource_type, maker=workerMakerCore) - tmp_log.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) + queue_config, job_type, resource_type, maker=workerMakerCore + ) + tmp_log.debug("nWorkersPerJob={0}".format(nWorkersPerJob)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queue_name, - nWorkers, nReady, None, nWorkersPerJob, + nWorkers, + nReady, + None, + nWorkersPerJob, queue_config.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, - locked_by, max_workers_per_job_in_total=maxWorkersPerJob, - max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) + locked_by, + max_workers_per_job_in_total=maxWorkersPerJob, + max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle, + ) else: - tmp_log.error('unknown mapType={0}'.format(queue_config.mapType)) + tmp_log.error("unknown mapType={0}".format(queue_config.mapType)) continue - tmp_log.debug('got {0} job chunks'.format(len(jobChunks))) + tmp_log.debug("got {0} job chunks".format(len(jobChunks))) if len(jobChunks) == 0: continue # make workers - okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queue_config, - nReady, job_type, resource_type, - maker=workerMakerCore) + okChunks, ngChunks = self.workerMaker.make_workers( + jobChunks, queue_config, nReady, job_type, resource_type, maker=workerMakerCore + ) if len(ngChunks) == 0: - tmp_log.debug('successfully made {0} workers'.format(len(okChunks))) + tmp_log.debug("successfully made {0} workers".format(len(okChunks))) else: - tmp_log.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), - len(ngChunks))) + tmp_log.debug("made {0} workers, while {1} workers failed".format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() timeNow_timestamp = time.time() pandaIDs = set() @@ -212,22 +220,20 @@ def run(self): # release jobs when workers are not made pandaIDs.add(job_spec.PandaID) else: - job_spec.status = 'failed' - job_spec.subStatus = 'failed_to_make' + job_spec.status = "failed" + job_spec.subStatus = "failed_to_make" job_spec.stateChangeTime = timeNow job_spec.locked_by = None - errStr = 'failed to make a worker' + errStr = "failed to make a worker" job_spec.set_pilot_error(PilotErrors.SETUPFAILURE, errStr) job_spec.trigger_propagation() - self.dbProxy.update_job(job_spec, {'locked_by': locked_by, - 'subStatus': 'prepared'}) + self.dbProxy.update_job(job_spec, {"locked_by": locked_by, "subStatus": "prepared"}) # OK work_specList = [] if len(okChunks) > 0: for work_spec, okJobs in okChunks: # has job - if (queue_config.useJobLateBinding and work_spec.workerID is None) \ - or 
queue_config.mapType == WorkSpec.MT_NoJob: + if (queue_config.useJobLateBinding and work_spec.workerID is None) or queue_config.mapType == WorkSpec.MT_NoJob: work_spec.hasJob = 0 else: work_spec.hasJob = 1 @@ -235,9 +241,9 @@ def run(self): work_spec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running - work_spec.set_jobspec_list(okJobs[:work_spec.nJobsToReFill]) + work_spec.set_jobspec_list(okJobs[: work_spec.nJobsToReFill]) work_spec.nJobsToReFill = None - for job_spec in okJobs[work_spec.nJobsToReFill:]: + for job_spec in okJobs[work_spec.nJobsToReFill :]: pandaIDs.add(job_spec.PandaID) work_spec.set_num_jobs_with_list() # map type @@ -245,14 +251,13 @@ def run(self): # queue name work_spec.computingSite = queue_config.queueName # set access point - work_spec.accessPoint = queue_config.messenger['accessPoint'] + work_spec.accessPoint = queue_config.messenger["accessPoint"] # sync level work_spec.syncLevel = queue_config.get_synchronization_level() # events - if len(okJobs) > 0 and \ - ('eventService' in okJobs[0].jobParams or - 'cloneJob' in okJobs[0].jobParams or - 'isHPO' in okJobs[0].jobParams): + if len(okJobs) > 0 and ( + "eventService" in okJobs[0].jobParams or "cloneJob" in okJobs[0].jobParams or "isHPO" in okJobs[0].jobParams + ): work_spec.eventsRequest = WorkSpec.EV_useEvents work_specList.append(work_spec) if len(work_specList) > 0: @@ -261,15 +266,13 @@ def run(self): submitterCore = self.pluginFactory.get_plugin(queue_config.submitter) if submitterCore is None: # not found - tmp_log.error( - 'submitter plugin for {0} not found'.format(job_spec.computingSite)) + tmp_log.error("submitter plugin for {0} not found".format(job_spec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin(queue_config.messenger) if messenger is None: # not found - tmp_log.error( - 'messenger plugin for {0} not found'.format(job_spec.computingSite)) + tmp_log.error("messenger plugin for {0} not found".format(job_spec.computingSite)) continue # setup access points messenger.setup_access_points(work_specList) @@ -278,21 +281,16 @@ def run(self): if work_spec.hasJob == 1: tmpStat = messenger.feed_jobs(work_spec, work_spec.get_jobspec_list()) if tmpStat is False: - tmp_log.error( - 'failed to send jobs to workerID={0}'.format(work_spec.workerID)) + tmp_log.error("failed to send jobs to workerID={0}".format(work_spec.workerID)) else: - tmp_log.debug( - 'sent jobs to workerID={0} with {1}'.format(work_spec.workerID, - tmpStat)) + tmp_log.debug("sent jobs to workerID={0} with {1}".format(work_spec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers(work_specList, locked_by) # submit sw.reset() - tmp_log.info('submitting {0} workers'.format(len(work_specList))) - work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, - work_specList) - tmp_log.debug('done submitting {0} workers'.format(len(work_specList)) - + sw.get_elapsed_time()) + tmp_log.info("submitting {0} workers".format(len(work_specList))) + work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, work_specList) + tmp_log.debug("done submitting {0} workers".format(len(work_specList)) + sw.get_elapsed_time()) # collect successful jobs okPandaIDs = set() for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): @@ -312,9 +310,7 @@ def run(self): # set status if not tmpRet: # failed submission - errStr = 'failed to submit a workerID={0} with {1}'.format( - work_spec.workerID, - tmpStr) + errStr = "failed to submit a 
workerID={0} with {1}".format(work_spec.workerID, tmpStr) tmp_log.error(errStr) work_spec.set_status(WorkSpec.ST_missed) work_spec.set_dialog_message(tmpStr) @@ -331,14 +327,10 @@ def run(self): job_spec.submissionAttempts = 0 job_spec.submissionAttempts += 1 # max attempt or permanent error - if tmpRet is False or \ - job_spec.submissionAttempts >= \ - queue_config.maxSubmissionAttempts: + if tmpRet is False or job_spec.submissionAttempts >= queue_config.maxSubmissionAttempts: newJobList.append(job_spec) else: - self.dbProxy.increment_submission_attempt( - job_spec.PandaID, - job_spec.submissionAttempts) + self.dbProxy.increment_submission_attempt(job_spec.PandaID, job_spec.submissionAttempts) jobList = newJobList elif queue_config.useJobLateBinding and work_spec.hasJob == 1: # directly go to running after feeding jobs for late biding @@ -350,32 +342,30 @@ def run(self): work_spec.modificationTime = timeNow work_spec.checkTime = timeNow if self.monitor_fifo.enabled: - work_spec.set_work_params({'lastCheckAt': timeNow_timestamp}) + work_spec.set_work_params({"lastCheckAt": timeNow_timestamp}) # prefetch events - if tmpRet and work_spec.hasJob == 1 and \ - work_spec.eventsRequest == WorkSpec.EV_useEvents and \ - queue_config.prefetchEvents: + if ( + tmpRet + and work_spec.hasJob == 1 + and work_spec.eventsRequest == WorkSpec.EV_useEvents + and queue_config.prefetchEvents + ): work_spec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for job_spec in jobList: - eventsRequestParams[job_spec.PandaID] = \ - {'pandaID': job_spec.PandaID, - 'taskID': job_spec.taskID, - 'jobsetID': job_spec.jobParams['jobsetID'], - 'nRanges': max(int(math.ceil(work_spec.nCore / len(jobList))), - job_spec.jobParams['coreCount']) * \ - queue_config.initEventsMultipler, - } - if 'isHPO' in job_spec.jobParams: - if 'sourceURL' in job_spec.jobParams: - sourceURL = job_spec.jobParams['sourceURL'] + eventsRequestParams[job_spec.PandaID] = { + "pandaID": job_spec.PandaID, + "taskID": job_spec.taskID, + "jobsetID": job_spec.jobParams["jobsetID"], + "nRanges": max(int(math.ceil(work_spec.nCore / len(jobList))), job_spec.jobParams["coreCount"]) + * queue_config.initEventsMultipler, + } + if "isHPO" in job_spec.jobParams: + if "sourceURL" in job_spec.jobParams: + sourceURL = job_spec.jobParams["sourceURL"] else: sourceURL = None - eventsRequestParams[job_spec.PandaID].update( - {'isHPO': True, - 'jobsetID': 0, - 'sourceURL': sourceURL - }) + eventsRequestParams[job_spec.PandaID].update({"isHPO": True, "jobsetID": 0, "sourceURL": sourceURL}) work_spec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker(work_spec, jobList, locked_by) @@ -384,63 +374,52 @@ def run(self): pandaIDs.add(job_spec.PandaID) if tmpStat: if tmpRet: - tmpStr = \ - 'submitted a workerID={0} for PandaID={1} with submissionHost={2} batchID={3}' - tmp_log.info(tmpStr.format(work_spec.workerID, - job_spec.PandaID, - work_spec.submissionHost, - work_spec.batchID)) + tmpStr = "submitted a workerID={0} for PandaID={1} with submissionHost={2} batchID={3}" + tmp_log.info( + tmpStr.format(work_spec.workerID, job_spec.PandaID, work_spec.submissionHost, work_spec.batchID) + ) else: - tmpStr = 'failed to submit a workerID={0} for PandaID={1}' - tmp_log.error(tmpStr.format(work_spec.workerID, - job_spec.PandaID)) + tmpStr = "failed to submit a workerID={0} for PandaID={1}" + tmp_log.error(tmpStr.format(work_spec.workerID, job_spec.PandaID)) else: - tmpStr = \ - 'failed to register a worker for 
PandaID={0} with submissionHost={1} batchID={2}' + tmpStr = "failed to register a worker for PandaID={0} with submissionHost={1} batchID={2}" tmp_log.error(tmpStr.format(job_spec.PandaID, work_spec.submissionHost, work_spec.batchID)) # enqueue to monitor fifo - if self.monitor_fifo.enabled \ - and queue_config.mapType != WorkSpec.MT_MultiWorkers: - work_specsToEnqueue = \ - [[w] for w in work_specList if w.status - in (WorkSpec.ST_submitted, WorkSpec.ST_running)] + if self.monitor_fifo.enabled and queue_config.mapType != WorkSpec.MT_MultiWorkers: + work_specsToEnqueue = [[w] for w in work_specList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] check_delay = min( - getattr(harvester_config.monitor, 'eventBasedCheckInterval', - harvester_config.monitor.checkInterval), - getattr(harvester_config.monitor, 'fifoCheckInterval', - harvester_config.monitor.checkInterval)) + getattr(harvester_config.monitor, "eventBasedCheckInterval", harvester_config.monitor.checkInterval), + getattr(harvester_config.monitor, "fifoCheckInterval", harvester_config.monitor.checkInterval), + ) monitor_fifo.put((queue_name, work_specsToEnqueue), time.time() + check_delay) - main_log.debug('put workers to monitor FIFO') + main_log.debug("put workers to monitor FIFO") submitted = True # release jobs self.dbProxy.release_jobs(pandaIDs, locked_by) - tmp_log.info('done') + tmp_log.info("done") except Exception: core_utils.dump_error_message(tmp_log) # release the site self.dbProxy.release_site(site_name, locked_by) if sw_main.get_elapsed_time_in_sec() > queue_lock_interval: - main_log.warning('a submitter cycle was longer than queue_lock_interval {0} sec'.format(queue_lock_interval) - + sw_main.get_elapsed_time()) - main_log.debug('done') + main_log.warning("a submitter cycle was longer than queue_lock_interval {0} sec".format(queue_lock_interval) + sw_main.get_elapsed_time()) + main_log.debug("done") # define sleep interval - if site_name is None or \ - (hasattr(harvester_config.submitter, 'respectSleepTime') and - harvester_config.submitter.respectSleepTime): + if site_name is None or (hasattr(harvester_config.submitter, "respectSleepTime") and harvester_config.submitter.respectSleepTime): sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 - if submitted and hasattr(harvester_config.submitter, 'minSubmissionInterval'): + if submitted and hasattr(harvester_config.submitter, "minSubmissionInterval"): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) - self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=site_name) + self.dbProxy.update_panda_queue_attribute("submitTime", newTime, site_name=site_name) # time the cycle - main_log.debug('done a submitter cycle' + sw_main.get_elapsed_time()) + main_log.debug("done a submitter cycle" + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): - main_log.debug('terminated') + main_log.debug("terminated") return # wrapper for submitWorkers to skip ready workers @@ -453,7 +432,7 @@ def submit_workers(self, submitter_core, workspec_list): if work_spec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: newSpecList.append(work_spec) retList.append(True) - strList.append('') + strList.append("") else: workersToSubmit.append(work_spec) tmpRetList = submitter_core.submit_workers(workersToSubmit) diff --git a/pandaharvester/harvesterbody/sweeper.py b/pandaharvester/harvesterbody/sweeper.py index 
1aaa912c..f9197f06 100644 --- a/pandaharvester/harvesterbody/sweeper.py +++ b/pandaharvester/harvesterbody/sweeper.py @@ -1,4 +1,5 @@ import os + try: from os import walk except ImportError: @@ -13,7 +14,7 @@ from pandaharvester.harvestercore.command_spec import CommandSpec # logger -_logger = core_utils.setup_logger('sweeper') +_logger = core_utils.setup_logger("sweeper") # class for cleanup @@ -29,36 +30,36 @@ def __init__(self, queue_config_mapper, single_mode=False): def process_kill_commands(self): # process commands for marking workers that need to be killed - tmp_log = self.make_logger(_logger, 'id={0}'.format(self.lockedBy), method_name='process_commands') + tmp_log = self.make_logger(_logger, "id={0}".format(self.lockedBy), method_name="process_commands") # 1. KILL_WORKER commands that were sent to panda server and forwarded to harvester stopwatch = core_utils.get_stopwatch() command_string = CommandSpec.COM_killWorkers - tmp_log.debug('try to get {0} commands'.format(command_string)) - command_specs = self.dbProxy.get_commands_for_receiver('sweeper', command_string) - tmp_log.debug('got {0} {1} commands'.format(len(command_specs), command_string)) + tmp_log.debug("try to get {0} commands".format(command_string)) + command_specs = self.dbProxy.get_commands_for_receiver("sweeper", command_string) + tmp_log.debug("got {0} {1} commands".format(len(command_specs), command_string)) for command_spec in command_specs: n_to_kill = self.dbProxy.mark_workers_to_kill_by_query(command_spec.params) - tmp_log.debug('will kill {0} workers with {1}'.format(n_to_kill, command_spec.params)) - tmp_log.debug('done handling {0} commands took {1}s'.format(command_string, stopwatch.get_elapsed_time())) + tmp_log.debug("will kill {0} workers with {1}".format(n_to_kill, command_spec.params)) + tmp_log.debug("done handling {0} commands took {1}s".format(command_string, stopwatch.get_elapsed_time())) # 2. 
SYNC_WORKERS_KILL commands from comparing worker status provided by pilot and harvester stopwatch = core_utils.get_stopwatch() command_string = CommandSpec.COM_syncWorkersKill - tmp_log.debug('try to get {0} commands'.format(command_string)) - command_specs = self.dbProxy.get_commands_for_receiver('sweeper', command_string) - tmp_log.debug('got {0} {1} commands'.format(len(command_specs), command_string)) + tmp_log.debug("try to get {0} commands".format(command_string)) + command_specs = self.dbProxy.get_commands_for_receiver("sweeper", command_string) + tmp_log.debug("got {0} {1} commands".format(len(command_specs), command_string)) for command_spec in command_specs: n_to_kill = self.dbProxy.mark_workers_to_kill_by_workerids(command_spec.params) - tmp_log.debug('will kill {0} workers with {1}'.format(n_to_kill, command_spec.params)) - tmp_log.debug('done handling {0} commands took {1}s'.format(command_string, stopwatch.get_elapsed_time())) + tmp_log.debug("will kill {0} workers with {1}".format(n_to_kill, command_spec.params)) + tmp_log.debug("done handling {0} commands took {1}s".format(command_string, stopwatch.get_elapsed_time())) # main loop def run(self): - self.lockedBy = 'sweeper-{0}'.format(self.get_pid()) + self.lockedBy = "sweeper-{0}".format(self.get_pid()) while True: sw_main = core_utils.get_stopwatch() - main_log = self.make_logger(_logger, 'id={0}'.format(self.lockedBy), method_name='run') + main_log = self.make_logger(_logger, "id={0}".format(self.lockedBy), method_name="run") # process commands that mark workers to be killed try: @@ -68,42 +69,40 @@ def run(self): # actual killing stage sw_kill = core_utils.get_stopwatch() - main_log.debug('try to get workers to kill') + main_log.debug("try to get workers to kill") # get workers to kill - workers_to_kill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers, - harvester_config.sweeper.checkInterval) - main_log.debug('got {0} queues to kill workers'.format(len(workers_to_kill))) + workers_to_kill = self.dbProxy.get_workers_to_kill(harvester_config.sweeper.maxWorkers, harvester_config.sweeper.checkInterval) + main_log.debug("got {0} queues to kill workers".format(len(workers_to_kill))) # loop over all workers sw = core_utils.get_stopwatch() for queue_name, configIdWorkSpecList in iteritems(workers_to_kill): for configID, workspec_list in iteritems(configIdWorkSpecList): # get sweeper if not self.queueConfigMapper.has_queue(queue_name, configID): - main_log.error('queue config for {0}/{1} not found'.format(queue_name, configID)) + main_log.error("queue config for {0}/{1} not found".format(queue_name, configID)) continue queue_config = self.queueConfigMapper.get_queue(queue_name, configID) try: sweeper_core = self.pluginFactory.get_plugin(queue_config.sweeper) except Exception: - main_log.error('failed to launch sweeper plugin for {0}/{1}'.format(queue_name, configID)) + main_log.error("failed to launch sweeper plugin for {0}/{1}".format(queue_name, configID)) core_utils.dump_error_message(main_log) continue sw.reset() n_workers = len(workspec_list) try: # try bulk method - tmp_log = self.make_logger(_logger, 'id={0}'.format(self.lockedBy), method_name='run') - tmp_log.debug('start killing') + tmp_log = self.make_logger(_logger, "id={0}".format(self.lockedBy), method_name="run") + tmp_log.debug("start killing") tmp_list = sweeper_core.kill_workers(workspec_list) except AttributeError: # fall back to single-worker method for workspec in workspec_list: - tmp_log = self.make_logger(_logger, 
'workerID={0}'.format(workspec.workerID), - method_name='run') + tmp_log = self.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="run") try: - tmp_log.debug('start killing one worker') + tmp_log.debug("start killing one worker") tmp_stat, tmp_out = sweeper_core.kill_worker(workspec) - tmp_log.debug('done killing with status={0} diag={1}'.format(tmp_stat, tmp_out)) + tmp_log.debug("done killing with status={0} diag={1}".format(tmp_stat, tmp_out)) except Exception: core_utils.dump_error_message(tmp_log) except Exception: @@ -112,13 +111,12 @@ def run(self): # bulk method n_killed = 0 for workspec, (tmp_stat, tmp_out) in zip(workspec_list, tmp_list): - tmp_log.debug('done killing workerID={0} with status={1} diag={2}'.format( - workspec.workerID, tmp_stat, tmp_out)) + tmp_log.debug("done killing workerID={0} with status={1} diag={2}".format(workspec.workerID, tmp_stat, tmp_out)) if tmp_stat: n_killed += 1 - tmp_log.debug('killed {0}/{1} workers'.format(n_killed, n_workers)) - main_log.debug('done killing {0} workers'.format(n_workers) + sw.get_elapsed_time()) - main_log.debug('done all killing' + sw_kill.get_elapsed_time()) + tmp_log.debug("killed {0}/{1} workers".format(n_killed, n_workers)) + main_log.debug("done killing {0} workers".format(n_workers) + sw.get_elapsed_time()) + main_log.debug("done all killing" + sw_kill.get_elapsed_time()) # cleanup stage sw_cleanup = core_utils.get_stopwatch() @@ -132,21 +130,21 @@ def run(self): except Exception: keep_pending = 24 # get workers for cleanup - statusTimeoutMap = {'finished': harvester_config.sweeper.keepFinished, - 'failed': harvester_config.sweeper.keepFailed, - 'cancelled': harvester_config.sweeper.keepCancelled, - 'missed': keep_missed, - 'pending': keep_pending - } - workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers, - statusTimeoutMap) - main_log.debug('got {0} queues for workers cleanup'.format(len(workersForCleanup))) + statusTimeoutMap = { + "finished": harvester_config.sweeper.keepFinished, + "failed": harvester_config.sweeper.keepFailed, + "cancelled": harvester_config.sweeper.keepCancelled, + "missed": keep_missed, + "pending": keep_pending, + } + workersForCleanup = self.dbProxy.get_workers_for_cleanup(harvester_config.sweeper.maxWorkers, statusTimeoutMap) + main_log.debug("got {0} queues for workers cleanup".format(len(workersForCleanup))) sw = core_utils.get_stopwatch() for queue_name, configIdWorkSpecList in iteritems(workersForCleanup): for configID, workspec_list in iteritems(configIdWorkSpecList): # get sweeper if not self.queueConfigMapper.has_queue(queue_name, configID): - main_log.error('queue config for {0}/{1} not found'.format(queue_name, configID)) + main_log.error("queue config for {0}/{1} not found".format(queue_name, configID)) continue queue_config = self.queueConfigMapper.get_queue(queue_name, configID) sweeper_core = self.pluginFactory.get_plugin(queue_config.sweeper) @@ -154,61 +152,57 @@ def run(self): sw.reset() n_workers = len(workspec_list) # make sure workers to clean up are all terminated - main_log.debug('making sure workers to clean up are all terminated') + main_log.debug("making sure workers to clean up are all terminated") try: # try bulk method tmp_list = sweeper_core.kill_workers(workspec_list) except AttributeError: # fall back to single-worker method for workspec in workspec_list: - tmp_log = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='run') + tmp_log = self.make_logger(_logger, 
"workerID={0}".format(workspec.workerID), method_name="run") try: tmp_stat, tmp_out = sweeper_core.kill_worker(workspec) except Exception: core_utils.dump_error_message(tmp_log) except Exception: core_utils.dump_error_message(main_log) - main_log.debug('made sure workers to clean up are all terminated') + main_log.debug("made sure workers to clean up are all terminated") # start cleanup for workspec in workspec_list: - tmp_log = self.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='run') + tmp_log = self.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="run") try: - tmp_log.debug('start cleaning up one worker') + tmp_log.debug("start cleaning up one worker") # sweep worker tmp_stat, tmp_out = sweeper_core.sweep_worker(workspec) - tmp_log.debug('swept_worker with status={0} diag={1}'.format(tmp_stat, tmp_out)) - tmp_log.debug('start messenger cleanup') + tmp_log.debug("swept_worker with status={0} diag={1}".format(tmp_stat, tmp_out)) + tmp_log.debug("start messenger cleanup") mc_tmp_stat, mc_tmp_out = messenger.clean_up(workspec) - tmp_log.debug('messenger cleaned up with status={0} diag={1}'.format(mc_tmp_stat, mc_tmp_out)) + tmp_log.debug("messenger cleaned up with status={0} diag={1}".format(mc_tmp_stat, mc_tmp_out)) if tmp_stat: self.dbProxy.delete_worker(workspec.workerID) except Exception: core_utils.dump_error_message(tmp_log) - main_log.debug('done cleaning up {0} workers'.format(n_workers) + sw.get_elapsed_time()) - main_log.debug('done all cleanup' + sw_cleanup.get_elapsed_time()) + main_log.debug("done cleaning up {0} workers".format(n_workers) + sw.get_elapsed_time()) + main_log.debug("done all cleanup" + sw_cleanup.get_elapsed_time()) # old-job-deletion stage sw_delete = core_utils.get_stopwatch() - main_log.debug('delete old jobs') + main_log.debug("delete old jobs") jobTimeout = max(statusTimeoutMap.values()) + 1 self.dbProxy.delete_old_jobs(jobTimeout) # delete orphaned job info self.dbProxy.delete_orphaned_job_info() - main_log.debug('done deletion of old jobs' + sw_delete.get_elapsed_time()) + main_log.debug("done deletion of old jobs" + sw_delete.get_elapsed_time()) # disk cleanup - if hasattr(harvester_config.sweeper, 'diskCleanUpInterval') and \ - hasattr(harvester_config.sweeper, 'diskHighWatermark'): - locked = self.dbProxy.get_process_lock('sweeper', self.get_pid(), - harvester_config.sweeper.diskCleanUpInterval*60*60) + if hasattr(harvester_config.sweeper, "diskCleanUpInterval") and hasattr(harvester_config.sweeper, "diskHighWatermark"): + locked = self.dbProxy.get_process_lock("sweeper", self.get_pid(), harvester_config.sweeper.diskCleanUpInterval * 60 * 60) if locked: try: all_active_files = None - for item in harvester_config.sweeper.diskHighWatermark.split(','): + for item in harvester_config.sweeper.diskHighWatermark.split(","): # dir name and watermark in GB - dir_name, watermark = item.split('|') - main_log.debug('checking {0} for cleanup with watermark {1} GB'.format(dir_name, watermark)) + dir_name, watermark = item.split("|") + main_log.debug("checking {0} for cleanup with watermark {1} GB".format(dir_name, watermark)) watermark = int(watermark) * 10**9 total_size = 0 file_dict = {} @@ -224,12 +218,14 @@ def run(self): # delete if necessary if total_size < watermark: main_log.debug( - 'skip cleanup {0} due to total_size {1} GB < watermark {2} GB'.format( - dir_name, total_size//(10**9), watermark//(10**9))) + "skip cleanup {0} due to total_size {1} GB < watermark {2} GB".format( + dir_name, total_size 
// (10**9), watermark // (10**9) + ) + ) else: main_log.debug( - 'cleanup {0} due to total_size {1} GB >= watermark {2} GB'.format( - dir_name, total_size//(10**9), watermark//(10**9))) + "cleanup {0} due to total_size {1} GB >= watermark {2} GB".format(dir_name, total_size // (10**9), watermark // (10**9)) + ) # get active input files if all_active_files is None: all_active_files = self.dbProxy.get_all_active_input_files() @@ -252,8 +248,8 @@ def run(self): except Exception: core_utils.dump_error_message(main_log) # time the cycle - main_log.debug('done a sweeper cycle' + sw_main.get_elapsed_time()) + main_log.debug("done a sweeper cycle" + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.sweeper.sleepTime): - main_log.debug('terminated') + main_log.debug("terminated") return diff --git a/pandaharvester/harvesterbody/watcher.py b/pandaharvester/harvesterbody/watcher.py index 6b93ed56..a7b1ef21 100644 --- a/pandaharvester/harvesterbody/watcher.py +++ b/pandaharvester/harvesterbody/watcher.py @@ -4,6 +4,7 @@ import socket import smtplib import datetime + try: import subprocess32 as subprocess except Exception: @@ -16,14 +17,14 @@ from pandalogger import logger_config -logDir = logger_config.daemon['logdir'] -if 'PANDA_LOCK_DIR' in os.environ: - lockFileName = os.path.join('PANDA_LOCK_DIR', 'watcher.lock') +logDir = logger_config.daemon["logdir"] +if "PANDA_LOCK_DIR" in os.environ: + lockFileName = os.path.join("PANDA_LOCK_DIR", "watcher.lock") else: - lockFileName = os.path.join(logDir, 'watcher.lock') + lockFileName = os.path.join(logDir, "watcher.lock") # logger -_logger = core_utils.setup_logger('watcher') +_logger = core_utils.setup_logger("watcher") # watching the system @@ -45,23 +46,22 @@ def run(self): # main def execute(self): # avoid too early check - if not self.singleMode and datetime.datetime.utcnow() - self.startTime \ - < datetime.timedelta(seconds=harvester_config.watcher.checkInterval): + if not self.singleMode and datetime.datetime.utcnow() - self.startTime < datetime.timedelta(seconds=harvester_config.watcher.checkInterval): return - mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='execute') - mainLog.debug('start') + mainLog = core_utils.make_logger(_logger, "id={0}".format(self.get_pid()), method_name="execute") + mainLog.debug("start") # get file lock try: with core_utils.get_file_lock(lockFileName, harvester_config.watcher.checkInterval): try: - logFileNameList = harvester_config.watcher.logFileNameList.split(',') + logFileNameList = harvester_config.watcher.logFileNameList.split(",") except Exception: - logFileNameList = ['panda-db_proxy.log'] + logFileNameList = ["panda-db_proxy.log"] lastTime = None logDuration = None lastTimeName = None logDurationName = None - actionsList = harvester_config.watcher.actions.split(',') + actionsList = harvester_config.watcher.actions.split(",") for logFileName in logFileNameList: logFilePath = os.path.join(logDir, logFileName) timeNow = datetime.datetime.utcnow() @@ -69,27 +69,28 @@ def execute(self): # get latest timestamp tmpLogDuration = None try: - p = subprocess.Popen(['tail', '-1', logFilePath], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(["tail", "-1", logFilePath], stdout=subprocess.PIPE, stderr=subprocess.PIPE) line = p.communicate()[0] - tmpLastTime = datetime.datetime.strptime(str(line[:23], 'utf-8'), "%Y-%m-%d %H:%M:%S,%f") + tmpLastTime = datetime.datetime.strptime(str(line[:23], "utf-8"), "%Y-%m-%d 
%H:%M:%S,%f") except Exception: tmpLastTime = None # get processing time for last 1000 queries try: - p = subprocess.Popen('tail -{0} {1} | head -1'.format(harvester_config.watcher.nMessages, - logFilePath), - stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen( + "tail -{0} {1} | head -1".format(harvester_config.watcher.nMessages, logFilePath), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) line = p.communicate()[0] - firstTime = datetime.datetime.strptime(str(line[:23], 'utf-8'), "%Y-%m-%d %H:%M:%S,%f") + firstTime = datetime.datetime.strptime(str(line[:23], "utf-8"), "%Y-%m-%d %H:%M:%S,%f") if tmpLastTime is not None: tmpLogDuration = tmpLastTime - firstTime except Exception as e: - mainLog.warning('Skip with error {0}: {1}'.format(e.__class__.__name__, e)) - tmpMsg = 'log={0} : last message at {0}. '.format(logFileName, tmpLastTime) + mainLog.warning("Skip with error {0}: {1}".format(e.__class__.__name__, e)) + tmpMsg = "log={0} : last message at {0}. ".format(logFileName, tmpLastTime) if tmpLogDuration is not None: - tmpMsg += '{0} messages took {1} sec'.format(harvester_config.watcher.nMessages, - tmpLogDuration.total_seconds()) + tmpMsg += "{0} messages took {1} sec".format(harvester_config.watcher.nMessages, tmpLogDuration.total_seconds()) mainLog.debug(tmpMsg) if tmpLastTime is not None and (lastTime is None or lastTime > tmpLastTime): lastTime = tmpLastTime @@ -99,28 +100,32 @@ def execute(self): logDurationName = logFileName # check timestamp doAction = False - if harvester_config.watcher.maxStalled > 0 and lastTime is not None and \ - timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled): - mainLog.warning('last log message is too old in {0}. seems to be stalled'.format(lastTimeName)) + if ( + harvester_config.watcher.maxStalled > 0 + and lastTime is not None + and timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled) + ): + mainLog.warning("last log message is too old in {0}. seems to be stalled".format(lastTimeName)) doAction = True - elif harvester_config.watcher.maxDuration > 0 and logDuration is not None and \ - logDuration.total_seconds() > harvester_config.watcher.maxDuration: - mainLog.warning('slow message generation in {0}. seems to be a performance issue'.format( - logDurationName)) + elif ( + harvester_config.watcher.maxDuration > 0 + and logDuration is not None + and logDuration.total_seconds() > harvester_config.watcher.maxDuration + ): + mainLog.warning("slow message generation in {0}. 
seems to be a performance issue".format(logDurationName)) doAction = True # take action if doAction: # email - if 'email' in actionsList: + if "email" in actionsList: # get pass phrase toSkip = False mailUser = None mailPass = None - if harvester_config.watcher.mailUser != '' and \ - harvester_config.watcher.mailPassword != '': + if harvester_config.watcher.mailUser != "" and harvester_config.watcher.mailPassword != "": envName = harvester_config.watcher.passphraseEnv if envName not in os.environ: - tmpMsg = '{0} is undefined in etc/sysconfig/panda_harvester'.format(envName) + tmpMsg = "{0} is undefined in etc/sysconfig/panda_harvester".format(envName) mainLog.error(tmpMsg) toSkip = True else: @@ -129,42 +134,38 @@ def execute(self): mailPass = core_utils.decrypt_string(key, harvester_config.watcher.mailPassword) if not toSkip: # message - msgBody = 'harvester {0} '.format(harvester_config.master.harvester_id) - msgBody += 'is having a problem on {0} '.format(socket.getfqdn()) - msgBody += 'at {0} (UTC)'.format(datetime.datetime.utcnow()) + msgBody = "harvester {0} ".format(harvester_config.master.harvester_id) + msgBody += "is having a problem on {0} ".format(socket.getfqdn()) + msgBody += "at {0} (UTC)".format(datetime.datetime.utcnow()) message = MIMEText(msgBody) - message['Subject'] = "Harvester Alarm" - message['From'] = harvester_config.watcher.mailFrom - message['To'] = harvester_config.watcher.mailTo + message["Subject"] = "Harvester Alarm" + message["From"] = harvester_config.watcher.mailFrom + message["To"] = harvester_config.watcher.mailTo # send email - mainLog.debug('sending email to {0}'.format(harvester_config.watcher.mailTo)) - server = smtplib.SMTP(harvester_config.watcher.mailServer, - harvester_config.watcher.mailPort) - if hasattr(harvester_config.watcher, 'mailUseSSL') and \ - harvester_config.watcher.mailUseSSL is True: + mainLog.debug("sending email to {0}".format(harvester_config.watcher.mailTo)) + server = smtplib.SMTP(harvester_config.watcher.mailServer, harvester_config.watcher.mailPort) + if hasattr(harvester_config.watcher, "mailUseSSL") and harvester_config.watcher.mailUseSSL is True: server.starttls() if mailUser is not None and mailPass is not None: server.login(mailUser, mailPass) server.ehlo() - server.sendmail(harvester_config.watcher.mailFrom, - harvester_config.watcher.mailTo.split(','), - message.as_string()) + server.sendmail(harvester_config.watcher.mailFrom, harvester_config.watcher.mailTo.split(","), message.as_string()) server.quit() # kill - if 'kill' in actionsList: + if "kill" in actionsList: # send USR2 fist - mainLog.debug('sending SIGUSR2') + mainLog.debug("sending SIGUSR2") os.killpg(os.getpgrp(), signal.SIGUSR2) time.sleep(60) - mainLog.debug('sending SIGKILL') + mainLog.debug("sending SIGKILL") os.killpg(os.getpgrp(), signal.SIGKILL) - elif 'terminate' in actionsList: - mainLog.debug('sending SIGTERM') + elif "terminate" in actionsList: + mainLog.debug("sending SIGTERM") os.killpg(os.getpgrp(), signal.SIGTERM) else: - mainLog.debug('No action needed for {0}'.format(logFileName)) + mainLog.debug("No action needed for {0}".format(logFileName)) except IOError: - mainLog.debug('skip as locked by another thread or too early to check') + mainLog.debug("skip as locked by another thread or too early to check") except Exception: core_utils.dump_error_message(mainLog) - mainLog.debug('done') + mainLog.debug("done") diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 
1726b5f6..9ba79516 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -8,7 +8,7 @@ from pandaharvester.harvestermisc.apfmon import Apfmon # logger -_logger = core_utils.setup_logger('worker_adjuster') +_logger = core_utils.setup_logger("worker_adjuster") # class to define number of workers to submit @@ -27,9 +27,9 @@ def __init__(self, queue_config_mapper): # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): - tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') - tmp_log.debug('start') - tmp_log.debug('static_num_workers: {0}'.format(static_num_workers)) + tmp_log = core_utils.make_logger(_logger, "site={0}".format(site_name), method_name="define_num_workers") + tmp_log.debug("start") + tmp_log.debug("static_num_workers: {0}".format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status @@ -49,37 +49,39 @@ def define_num_workers(self, static_num_workers, site_name): # get queue queue_config = self.queue_configMapper.get_queue(queue_name) worker_limits_dict = self.dbProxy.get_worker_limits(queue_name) - max_workers = worker_limits_dict.get('maxWorkers', 0) - n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0) - n_queue_limit_per_rt = worker_limits_dict['nQueueLimitWorkerPerRT'] + max_workers = worker_limits_dict.get("maxWorkers", 0) + n_queue_limit = worker_limits_dict.get("nQueueLimitWorker", 0) + n_queue_limit_per_rt = worker_limits_dict["nQueueLimitWorkerPerRT"] n_queue_total, n_ready_total, n_running_total = 0, 0, 0 apf_msg = None apf_data = None for job_type, jt_values in iteritems(static_num_workers[queue_name]): for resource_type, tmp_val in iteritems(jt_values): - tmp_log.debug('Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'. 
- format(queue_name, job_type, resource_type, tmp_val)) + tmp_log.debug( + "Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}".format( + queue_name, job_type, resource_type, tmp_val + ) + ) # set 0 to num of new workers when the queue is disabled - if queue_name in queue_stat and queue_stat[queue_name]['status'] in ['offline', 'standby', - 'maintenance']: - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 - ret_msg = 'set n_new_workers=0 since status={0}'.format(queue_stat[queue_name]['status']) + if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 since status={0}".format(queue_stat[queue_name]["status"]) tmp_log.debug(ret_msg) - apf_msg = 'Not submitting workers since queue status = {0}'.format(queue_stat[queue_name]['status']) + apf_msg = "Not submitting workers since queue status = {0}".format(queue_stat[queue_name]["status"]) continue # protection against not-up-to-date queue config if queue_config is None: - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 - ret_msg = 'set n_new_workers=0 due to missing queue_config' + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 due to missing queue_config" tmp_log.debug(ret_msg) - apf_msg = 'Not submitting workers because of missing queue_config' + apf_msg = "Not submitting workers because of missing queue_config" continue # get throttler if queue_name not in self.throttlerMap: - if hasattr(queue_config, 'throttler'): + if hasattr(queue_config, "throttler"): throttler = self.pluginFactory.get_plugin(queue_config.throttler) else: throttler = None @@ -90,24 +92,24 @@ def define_num_workers(self, static_num_workers, site_name): if throttler is not None: to_throttle, tmp_msg = throttler.to_be_throttled(queue_config) if to_throttle: - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 - ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmp_msg) + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 by {0}:{1}".format(throttler.__class__.__name__, tmp_msg) tmp_log.debug(ret_msg) continue # check stats - n_queue = tmp_val['nQueue'] - n_ready = tmp_val['nReady'] - n_running = tmp_val['nRunning'] - if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None: + n_queue = tmp_val["nQueue"] + n_ready = tmp_val["nReady"] + n_running = tmp_val["nRunning"] + if resource_type != "ANY" and job_type != "ANY" and job_type is not None: n_queue_total += n_queue n_ready_total += n_ready n_running_total += n_running - if queue_config.runMode == 'slave': - n_new_workers_def = tmp_val['nNewWorkers'] + if queue_config.runMode == "slave": + n_new_workers_def = tmp_val["nNewWorkers"] if n_new_workers_def == 0: - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 - ret_msg = 'set n_new_workers=0 by panda in slave mode' + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 by panda in slave mode" tmp_log.debug(ret_msg) continue else: @@ -117,20 +119,16 @@ def define_num_workers(self, static_num_workers, site_name): n_new_workers = 0 if n_queue >= n_queue_limit_per_rt > 0: # enough queued workers - ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(n_queue, - 
n_queue_limit_per_rt) + ret_msg = "No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})".format(n_queue, n_queue_limit_per_rt) tmp_log.debug(ret_msg) pass elif (n_queue + n_ready + n_running) >= max_workers > 0: # enough workers in the system - ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(n_queue, - n_ready, - n_running) - ret_msg += '>= max_workers({0})'.format(max_workers) + ret_msg = "No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) ".format(n_queue, n_ready, n_running) + ret_msg += ">= max_workers({0})".format(max_workers) tmp_log.debug(ret_msg) pass else: - max_queued_workers = None if n_queue_limit_per_rt > 0: # there is a limit set for the queue @@ -144,49 +142,44 @@ def define_num_workers(self, static_num_workers, site_name): else: max_queued_workers = maxQueuedWorkers_slave - elif queue_config.mapType == 'NoJob': # for pull mode, limit to activated jobs + elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs if job_stats is None: - tmp_log.warning('n_activated not defined, defaulting to configured queue limits') + tmp_log.warning("n_activated not defined, defaulting to configured queue limits") pass else: # limit the queue to the number of activated jobs to avoid empty pilots try: - n_activated = max(job_stats[queue_name]['activated'], 1) # avoid no activity queues + n_activated = max(job_stats[queue_name]["activated"], 1) # avoid no activity queues except KeyError: # zero job in the queue - tmp_log.debug('no job in queue') + tmp_log.debug("no job in queue") n_activated = 1 finally: queue_limit = max_queued_workers max_queued_workers = min(n_activated, max_queued_workers) - tmp_log.debug('limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'. 
- format(n_activated, queue_limit)) + tmp_log.debug("limiting max_queued_workers to min(n_activated={0}, queue_limit={1})".format(n_activated, queue_limit)) if max_queued_workers is None: # no value found, use default value max_queued_workers = 1 # new workers n_new_workers = max(max_queued_workers - n_queue, 0) - tmp_log.debug('setting n_new_workers to {0} in max_queued_workers calculation' - .format(n_new_workers)) + tmp_log.debug("setting n_new_workers to {0} in max_queued_workers calculation".format(n_new_workers)) if max_workers > 0: n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) - tmp_log.debug('setting n_new_workers to {0} to respect max_workers' - .format(n_new_workers)) + tmp_log.debug("setting n_new_workers to {0} to respect max_workers".format(n_new_workers)) if queue_config.maxNewWorkersPerCycle > 0: n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) - tmp_log.debug('setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle' - .format(n_new_workers)) + tmp_log.debug("setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle".format(n_new_workers)) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers = min(n_new_workers, self.maxNewWorkers) - tmp_log.debug('setting n_new_workers to {0} in order to respect universal maxNewWorkers' - .format(n_new_workers)) - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers + tmp_log.debug("setting n_new_workers to {0} in order to respect universal maxNewWorkers".format(n_new_workers)) + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: max_new_workers_per_cycle = 0 - ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config' + ret_msg = "set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config" tmp_log.debug(ret_msg) else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle @@ -194,10 +187,9 @@ def define_num_workers(self, static_num_workers, site_name): total_new_workers_rts = 0 for _jt in dyn_num_workers[queue_name]: for _rt in dyn_num_workers[queue_name][_jt]: - if _jt != 'ANY' and _rt != 'ANY': - total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] - n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), - max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) + if _jt != "ANY" and _rt != "ANY": + total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]["nNewWorkers"] + n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: @@ -208,28 +200,25 @@ def define_num_workers(self, static_num_workers, site_name): if n_new_workers_max_agg == 0: for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][job_type]: - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 - tmp_log.debug('No n_new_workers since n_new_workers_max_agg=0 for UCORE') + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + tmp_log.debug("No n_new_workers since 
n_new_workers_max_agg=0 for UCORE") else: - tmp_log.debug('n_new_workers_max_agg={0} for UCORE'.format(n_new_workers_max_agg)) + tmp_log.debug("n_new_workers_max_agg={0} for UCORE".format(n_new_workers_max_agg)) _d = dyn_num_workers[queue_name].copy() - del _d['ANY'] + del _d["ANY"] # TODO: needs to be recalculated simple_rt_nw_list = [] - for job_type in _d: # jt: job type + for job_type in _d: # jt: job type for resource_type in _d[job_type]: # rt: resource type - simple_rt_nw_list.append([(resource_type, job_type), _d[job_type][resource_type].get('nNewWorkers', 0), 0]) + simple_rt_nw_list.append([(resource_type, job_type), _d[job_type][resource_type].get("nNewWorkers", 0), 0]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: (resource_type, job_type), n_new_workers_orig, _r = _rt_list - n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, - total_new_workers_rts) - dyn_num_workers[queue_name][job_type].setdefault(resource_type, - {'nReady': 0, 'nRunning': 0, - 'nQueue': 0, 'nNewWorkers': 0}) - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers + n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) + dyn_num_workers[queue_name][job_type].setdefault(resource_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) @@ -237,15 +226,18 @@ def define_num_workers(self, static_num_workers, site_name): for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break - dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] += 1 + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] += 1 _countdown -= 1 for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][job_type]: - if job_type == 'ANY' or resource_type == 'ANY': + if job_type == "ANY" or resource_type == "ANY": continue - n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] - tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' - .format(n_new_workers, job_type, resource_type)) + n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] + tmp_log.debug( + "setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE".format( + n_new_workers, job_type, resource_type + ) + ) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queue_name]) @@ -253,7 +245,7 @@ def define_num_workers(self, static_num_workers, site_name): self.apf_mon.update_label(queue_name, apf_msg, apf_data) # dump - tmp_log.debug('defined {0}'.format(str(dyn_num_workers))) + tmp_log.debug("defined {0}".format(str(dyn_num_workers))) return dyn_num_workers except Exception: # dump error diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 8aae3440..89ba3e21 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ -3,7 +3,7 @@ from pandaharvester.harvestercore.plugin_factory import PluginFactory # logger -_logger = core_utils.setup_logger('worker_maker') +_logger = core_utils.setup_logger("worker_maker") # class to make worker @@ -19,16 +19,17 @@ def 
get_plugin(self, queue_config): # make workers def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None): - tmpLog = core_utils.make_logger(_logger, 'queue={0} jtype={1} rtype={2}'.format(queue_config.queueName, job_type, resource_type), - method_name='make_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger( + _logger, "queue={0} jtype={1} rtype={2}".format(queue_config.queueName, job_type, resource_type), method_name="make_workers" + ) + tmpLog.debug("start") try: # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) if maker is None: # not found - tmpLog.error('plugin for {0} not found'.format(queue_config.queueName)) + tmpLog.error("plugin for {0} not found".format(queue_config.queueName)) return [], jobchunk_list # get ready workers readyWorkers = self.dbProxy.get_ready_workers(queue_config.queueName, n_ready) @@ -51,13 +52,12 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_ continue # set workerID if workSpec.workerID is None: - workSpec.workerID = self.dbProxy.get_next_seq_number('SEQ_workerID') + workSpec.workerID = self.dbProxy.get_next_seq_number("SEQ_workerID") workSpec.configID = queue_config.configID workSpec.isNew = True okChunks.append((workSpec, jobChunk)) # dump - tmpLog.debug('made {0} workers while {1} chunks failed'.format(len(okChunks), - len(ngChunks))) + tmpLog.debug("made {0} workers while {1} chunks failed".format(len(okChunks), len(ngChunks))) return okChunks, ngChunks except Exception: # dump error diff --git a/pandaharvester/harvestercloud/aws_unhealthy_nodes.py b/pandaharvester/harvestercloud/aws_unhealthy_nodes.py index d03e9d1e..eb71a2b5 100644 --- a/pandaharvester/harvestercloud/aws_unhealthy_nodes.py +++ b/pandaharvester/harvestercloud/aws_unhealthy_nodes.py @@ -3,34 +3,34 @@ from kubernetes import client, config from subprocess import Popen, PIPE -config.load_kube_config(config_file='YOUR KUBECONFIG FILE') +config.load_kube_config(config_file="YOUR KUBECONFIG FILE") apis_api = client.CoreV1Api() # get running nodes running_nodes = {} nodes = apis_api.list_node() for node in nodes.items: - running_nodes[node.metadata.name] = node.spec.provider_id.split('/')[-1] + running_nodes[node.metadata.name] = node.spec.provider_id.split("/")[-1] # get events with FailedMounts and filter them by known error message -failed_mount_events = apis_api.list_namespaced_event(namespace='default', field_selector='reason=FailedMount') +failed_mount_events = apis_api.list_namespaced_event(namespace="default", field_selector="reason=FailedMount") unhealthy_node_ids = set() for event in failed_mount_events.items: node_name = event.source.host - if 'Argument list too long' in event.message and node_name in running_nodes: - unhealthy_node_ids.add(running_nodes[node_name]) + if "Argument list too long" in event.message and node_name in running_nodes: + unhealthy_node_ids.add(running_nodes[node_name]) # set the node as unhealthy using the AWS CLI -command = '/usr/local/bin/aws autoscaling set-instance-health --instance-id {0} --health-status Unhealthy' +command = "/usr/local/bin/aws autoscaling set-instance-health --instance-id {0} --health-status Unhealthy" for id in unhealthy_node_ids: command_with_id = command.format(id) - command_list = command_with_id.split(' ') + command_list = command_with_id.split(" ") p = Popen(command_list, stdin=PIPE, stdout=PIPE, stderr=PIPE) output, err = p.communicate() - print('------------------------------------') + 
print("------------------------------------") print(command_with_id) - print('return code: {0}'.format(p.returncode)) - print('output: {0}'.format(output)) - print('err: {0}'.format(err)) - print('------------------------------------') \ No newline at end of file + print("return code: {0}".format(p.returncode)) + print("output: {0}".format(output)) + print("err: {0}".format(err)) + print("------------------------------------") diff --git a/pandaharvester/harvestercloud/cernvm_aux.py b/pandaharvester/harvestercloud/cernvm_aux.py index 191066ab..05203027 100644 --- a/pandaharvester/harvestercloud/cernvm_aux.py +++ b/pandaharvester/harvestercloud/cernvm_aux.py @@ -5,8 +5,8 @@ def encode_user_data(user_data): attached_message = MIMEMultipart() - message = MIMEText(user_data, 'cloud-config', sys.getdefaultencoding()) - message.add_header('Content-Disposition', 'attachment; filename="%s"' % ("cs-cloud-init.yaml")) + message = MIMEText(user_data, "cloud-config", sys.getdefaultencoding()) + message.add_header("Content-Disposition", 'attachment; filename="%s"' % ("cs-cloud-init.yaml")) attached_message.attach(message) - return attached_message \ No newline at end of file + return attached_message diff --git a/pandaharvester/harvestercloud/gke_unhealthy_nodes.py b/pandaharvester/harvestercloud/gke_unhealthy_nodes.py index efe091be..201c3149 100644 --- a/pandaharvester/harvestercloud/gke_unhealthy_nodes.py +++ b/pandaharvester/harvestercloud/gke_unhealthy_nodes.py @@ -1,42 +1,44 @@ from kubernetes import client, config import datetime from subprocess import Popen, PIPE -config.load_kube_config(config_file='PATH TO YOUR CONFIG') -namespace = 'default' + +config.load_kube_config(config_file="PATH TO YOUR CONFIG") +namespace = "default" nodes = [] current_time = datetime.datetime.now().astimezone() corev1 = client.CoreV1Api() -aux = corev1.list_namespaced_pod(namespace=namespace, field_selector='status.phase=Pending') +aux = corev1.list_namespaced_pod(namespace=namespace, field_selector="status.phase=Pending") for item in aux.items: try: - if item.status.container_statuses[0].state.waiting.reason == 'ContainerCreating' and current_time - item.metadata.creation_timestamp > datetime.timedelta(minutes=30): + if item.status.container_statuses[ + 0 + ].state.waiting.reason == "ContainerCreating" and current_time - item.metadata.creation_timestamp > datetime.timedelta(minutes=30): if item.spec.node_name not in nodes: nodes.append(item.spec.node_name) except Exception: continue # delete the node -command_desc = '/bin/gcloud compute instances describe --format=value[](metadata.items.created-by) {0} --zone={1}' +command_desc = "/bin/gcloud compute instances describe --format=value[](metadata.items.created-by) {0} --zone={1}" command_del = "/bin/gcloud compute instance-groups managed delete-instances --instances={0} {1} --zone={2}" -zones = ['europe-west1-b', 'europe-west1-c', 'europe-west1-d'] +zones = ["europe-west1-b", "europe-west1-c", "europe-west1-d"] for node in nodes: for zone in zones: command_with_node = command_desc.format(node, zone) - command_list = command_with_node.split(' ') + command_list = command_with_node.split(" ") p = Popen(command_list, stdin=PIPE, stdout=PIPE, stderr=PIPE) output, err = p.communicate() if output: output_str = output[:-1].decode() command_del_with_vars = command_del.format(node, output_str, zone) - command_del_list = command_del_with_vars.split(' ') + command_del_list = command_del_with_vars.split(" ") p = Popen(command_del_list, stdin=PIPE, stdout=PIPE, stderr=PIPE) 
output, err = p.communicate() print(command_del_with_vars) print(output) print(err) print("--------------------") - diff --git a/pandaharvester/harvestercloud/google_startup_script.py b/pandaharvester/harvestercloud/google_startup_script.py index 95bb6968..3da11abc 100644 --- a/pandaharvester/harvestercloud/google_startup_script.py +++ b/pandaharvester/harvestercloud/google_startup_script.py @@ -10,7 +10,7 @@ import requests try: import subprocess32 as subprocess -except: +except BaseException: import subprocess import os import sys @@ -44,7 +44,7 @@ def upload_logs(url, log_file_name, destination_name, proxy_path): logging.debug('[upload_logs] finished with code={0} msg={1}'.format(res.status_code, res.text)) if res.status_code == 200: return True - except: + except BaseException: err_type, err_value = sys.exc_info()[:2] err_messsage = "failed to put with {0}:{1} ".format(err_type, err_value) err_messsage += traceback.format_exc() @@ -58,7 +58,7 @@ def contact_harvester(harvester_frontend, data, auth_token, proxy_path): headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer {0}'.format(auth_token)} cert = [proxy_path, proxy_path] - #verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory + # verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory verify = False resp = requests.post(harvester_frontend, json=data, headers=headers, cert=cert, verify=verify) logging.debug('[contact_harvester] harvester returned: {0}'.format(resp.text)) @@ -155,7 +155,7 @@ def get_configuration(): # the pilot should propagate the download link via the pilotId field in the job table destination_name = '{0}.log'.format(worker_id) log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name) - os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot + os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot # get the pilot wrapper wrapper_path = "/tmp/runpilot3-wrapper.sh" @@ -163,7 +163,7 @@ def get_configuration(): wrapper_string = get_url(wrapper_url) with open(wrapper_path, "w") as wrapper_file: wrapper_file.write(wrapper_string) - os.chmod(wrapper_path, 0544) # make pilot wrapper executable + os.chmod(wrapper_path, 0544) # make pilot wrapper executable logging.debug('[main] downloaded pilot wrapper') # execute the pilot wrapper @@ -182,4 +182,4 @@ def get_configuration(): # ask harvester to kill the VM and stop the heartbeat suicide(harvester_frontend, worker_id, auth_token, proxy_path) loop = False - heartbeat_thread.join() \ No newline at end of file + heartbeat_thread.join() diff --git a/pandaharvester/harvestercloud/googlecloud.py b/pandaharvester/harvestercloud/googlecloud.py index 241a7d77..542446ee 100644 --- a/pandaharvester/harvestercloud/googlecloud.py +++ b/pandaharvester/harvestercloud/googlecloud.py @@ -13,18 +13,18 @@ ZONE = harvester_config.googlecloud.zone PROJECT = harvester_config.googlecloud.project SERVICE_ACCOUNT_FILE = harvester_config.googlecloud.service_account_file -os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = SERVICE_ACCOUNT_FILE +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SERVICE_ACCOUNT_FILE -compute = googleapiclient.discovery.build('compute', 'v1') +compute = googleapiclient.discovery.build("compute", "v1") -class GoogleVM(): +class GoogleVM: def __init__(self, work_spec, queue_config): self.harvester_token = HarvesterToken() self.work_spec = work_spec self.queue_config = 
queue_config - harvester_id_clean = harvester_config.master.harvester_id.replace('-','').replace('_','').lower() - self.name = '{0}-gce-{1}'.format(harvester_id_clean, work_spec.workerID) + harvester_id_clean = harvester_config.master.harvester_id.replace("-", "").replace("_", "").lower() + self.name = "{0}-gce-{1}".format(harvester_id_clean, work_spec.workerID) # self.name = self.name.replace('_', '-') # underscores in VM names are not allowed by GCE self.image = self.resolve_image_url() self.instance_type = self.resolve_instance_type() @@ -37,8 +37,8 @@ def resolve_image_url(self): :return: URL pointing to the machine type to use """ # Get the latest Debian Jessie image - image_response = compute.images().getFromFamily(project=PROJECT, family='cernvm').execute() - source_disk_image = image_response['selfLink'] + image_response = compute.images().getFromFamily(project=PROJECT, family="cernvm").execute() + source_disk_image = image_response["selfLink"] return source_disk_image @@ -54,7 +54,7 @@ def resolve_instance_type(self): """ # Calculate the number of VCPUs - cores = 8 # default value. TODO: probably should except if we don't find a suitable number + cores = 8 # default value. TODO: probably should except if we don't find a suitable number standard_cores = [1, 2, 4, 8, 16, 32, 64, 96] for standard_core in standard_cores: if self.work_spec.nCore <= standard_core: @@ -64,7 +64,7 @@ def resolve_instance_type(self): # Calculate the memory: 2 GBs per core. It needs to be expressed in MB # https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type try: - ram_per_core = self.queue_config.submitter['ram_per_core'] + ram_per_core = self.queue_config.submitter["ram_per_core"] except KeyError: ram_per_core = 2 memory = cores * ram_per_core * 1024 @@ -74,9 +74,9 @@ def resolve_instance_type(self): except AttributeError: zone = ZONE - #instance_type = 'zones/{0}/machineTypes/n1-standard-{1}'.format(zone, cores) + # instance_type = 'zones/{0}/machineTypes/n1-standard-{1}'.format(zone, cores) # Use custom machine types to reduce cost - instance_type = 'zones/{0}/machineTypes/custom-{1}-{2}'.format(zone, cores, memory) + instance_type = "zones/{0}/machineTypes/custom-{1}-{2}".format(zone, cores, memory) return instance_type @@ -87,109 +87,46 @@ def prepare_metadata(self): """ # read the proxy - with open(PROXY_PATH, 'r') as proxy_file: + with open(PROXY_PATH, "r") as proxy_file: proxy_string = proxy_file.read() - with open(USER_DATA_PATH, 'r') as user_data_file: + with open(USER_DATA_PATH, "r") as user_data_file: user_data = user_data_file.read() try: - preemptible = self.queue_config.submitter['preemptible'] + preemptible = self.queue_config.submitter["preemptible"] except KeyError: preemptible = False try: - disk_size = self.queue_config.submitter['disk_size'] + disk_size = self.queue_config.submitter["disk_size"] except KeyError: disk_size = 50 config = { - 'name': self.name, - 'machineType': self.instance_type, - - 'scheduling': - { - 'preemptible': preemptible - }, - - # Specify the boot disk and the image to use as a source. - 'disks': - [ - { - 'boot': True, - 'autoDelete': True, - 'initializeParams': { - 'sourceImage': IMAGE, - 'diskSizeGb': 50} - } - ], - - # Specify a network interface with NAT to access the public internet - 'networkInterfaces': - [ - { - 'network': 'global/networks/default', - 'accessConfigs': - [ - { - 'type': 'ONE_TO_ONE_NAT', - 'name': 'External NAT' - } - ] - } - ], - - # Allow the instance to access cloud storage and logging. 
- 'serviceAccounts': - [ - { - 'email': 'default', - 'scopes': - [ - 'https://www.googleapis.com/auth/devstorage.read_write', - 'https://www.googleapis.com/auth/logging.write' - ] - } - ], - - 'metadata': - { - 'items': - [ - { - 'key': 'user-data', - 'value': str(cernvm_aux.encode_user_data(user_data)) - }, - { - 'key': 'proxy', - 'value': proxy_string - }, - { - 'key': 'panda_queue', - 'value': self.work_spec.computingSite - }, - { - 'key': 'harvester_frontend', - 'value': HARVESTER_FRONTEND - }, - { - 'key': 'worker_id', - 'value': self.work_spec.workerID - }, - { - 'key': 'auth_token', - 'value': self.harvester_token.generate(payload={'sub': str(self.work_spec.batchID)}) - }, - { - 'key': 'logs_url_w', - 'value': '{0}/{1}'.format(harvester_config.pandacon.pandaCacheURL_W, 'updateLog') - }, - { - 'key': 'logs_url_r', - 'value': harvester_config.pandacon.pandaCacheURL_R - } - ] - } - } + "name": self.name, + "machineType": self.instance_type, + "scheduling": {"preemptible": preemptible}, + # Specify the boot disk and the image to use as a source. + "disks": [{"boot": True, "autoDelete": True, "initializeParams": {"sourceImage": IMAGE, "diskSizeGb": 50}}], + # Specify a network interface with NAT to access the public internet + "networkInterfaces": [{"network": "global/networks/default", "accessConfigs": [{"type": "ONE_TO_ONE_NAT", "name": "External NAT"}]}], + # Allow the instance to access cloud storage and logging. + "serviceAccounts": [ + {"email": "default", "scopes": ["https://www.googleapis.com/auth/devstorage.read_write", "https://www.googleapis.com/auth/logging.write"]} + ], + "metadata": { + "items": [ + {"key": "user-data", "value": str(cernvm_aux.encode_user_data(user_data))}, + {"key": "proxy", "value": proxy_string}, + {"key": "panda_queue", "value": self.work_spec.computingSite}, + {"key": "harvester_frontend", "value": HARVESTER_FRONTEND}, + {"key": "worker_id", "value": self.work_spec.workerID}, + {"key": "auth_token", "value": self.harvester_token.generate(payload={"sub": str(self.work_spec.batchID)})}, + {"key": "logs_url_w", "value": "{0}/{1}".format(harvester_config.pandacon.pandaCacheURL_W, "updateLog")}, + {"key": "logs_url_r", "value": harvester_config.pandacon.pandaCacheURL_R}, + ] + }, + } return config diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py index 94658959..2bfb4b3e 100644 --- a/pandaharvester/harvestercloud/pilots_starter.py +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -32,18 +32,19 @@ import ssl import traceback -WORK_DIR = '/scratch' -CONFIG_DIR = '/scratch/jobconfig' -PJD = 'pandaJobData.out' -PFC = 'PoolFileCatalog_H.xml' +WORK_DIR = "/scratch" +CONFIG_DIR = "/scratch/jobconfig" +PJD = "pandaJobData.out" +PFC = "PoolFileCatalog_H.xml" CONFIG_FILES = [PJD, PFC] -logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', stream=sys.stdout) +logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(message)s", stream=sys.stdout) # This is necessary in Lancium, otherwise the wrapper breaks os.unsetenv("SINGULARITY_ENVIRONMENT") os.unsetenv("SINGULARITY_BIND") + def post_multipart(host, port, selector, files, proxy_cert): """ Post files to an http host as multipart/form-data. 
@@ -57,9 +58,9 @@ def post_multipart(host, port, selector, files, proxy_cert): h = httplib.HTTPSConnection(host, port, context=context, timeout=180) - h.putrequest('POST', selector) - h.putheader('content-type', content_type) - h.putheader('content-length', str(len(body))) + h.putrequest("POST", selector) + h.putheader("content-type", content_type) + h.putheader("content-length", str(len(body))) h.endheaders() h.send(body.encode()) response = h.getresponse() @@ -71,42 +72,42 @@ def encode_multipart_formdata(files): files is a sequence of (name, filename, value) elements for data to be uploaded as files Return (content_type, body) ready for httplib.HTTP instance """ - BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' - CRLF = '\r\n' + BOUNDARY = "----------ThIs_Is_tHe_bouNdaRY_$" + CRLF = "\r\n" L = [] - for (key, filename, value) in files: - L.append('--' + BOUNDARY) + for key, filename, value in files: + L.append("--" + BOUNDARY) L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) - L.append('Content-Type: %s' % get_content_type(filename)) - L.append('') + L.append("Content-Type: %s" % get_content_type(filename)) + L.append("") L.append(value) - L.append('--' + BOUNDARY + '--') - L.append('') + L.append("--" + BOUNDARY + "--") + L.append("") body = CRLF.join(L) - content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + content_type = "multipart/form-data; boundary=%s" % BOUNDARY return content_type, body def get_content_type(filename): - return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + return mimetypes.guess_type(filename)[0] or "application/octet-stream" def upload_logs(url, log_file_name, destination_name, proxy_cert): try: - full_url = url + '/putFile' + full_url = url + "/putFile" urlparts = urlparse.urlsplit(full_url) - logging.debug('[upload_logs] start') - files = [('file', destination_name, open(log_file_name).read())] + logging.debug("[upload_logs] start") + files = [("file", destination_name, open(log_file_name).read())] status, reason = post_multipart(urlparts.hostname, urlparts.port, urlparts.path, files, proxy_cert) - logging.debug('[upload_logs] finished with code={0} msg={1}'.format(status, reason)) + logging.debug("[upload_logs] finished with code={0} msg={1}".format(status, reason)) if status == 200: return True except Exception: err_type, err_value = sys.exc_info()[:2] err_messsage = "failed to put with {0}:{1} ".format(err_type, err_value) err_messsage += traceback.format_exc() - logging.debug('[upload_logs] excepted with:\n {0}'.format(err_messsage)) + logging.debug("[upload_logs] excepted with:\n {0}".format(err_messsage)) return False @@ -121,163 +122,198 @@ def copy_files_in_dir(src_dir, dst_dir): def str_to_bool(input_str, default=False): output_str = default try: - if input_str.upper() == 'FALSE': + if input_str.upper() == "FALSE": output_str = False - elif input_str.upper() == 'TRUE': + elif input_str.upper() == "TRUE": output_str = True - except: + except BaseException: pass return output_str def get_configuration(): # get the proxy certificate and save it - if os.environ.get('proxySecretPath'): - proxy_path = os.environ.get('proxySecretPath') + if os.environ.get("proxySecretPath"): + proxy_path = os.environ.get("proxySecretPath") else: - logging.debug('[main] no proxy specified in env var $proxySecretPath') - raise Exception('Found no voms proxy specified') - os.environ['X509_USER_PROXY'] = proxy_path - logging.debug('[main] initialized proxy') + logging.debug("[main] no proxy specified in env var 
$proxySecretPath") + raise Exception("Found no voms proxy specified") + os.environ["X509_USER_PROXY"] = proxy_path + logging.debug("[main] initialized proxy") # get the panda site name - panda_site = os.environ.get('computingSite') - logging.debug('[main] got panda site: {0}'.format(panda_site)) + panda_site = os.environ.get("computingSite") + logging.debug("[main] got panda site: {0}".format(panda_site)) # get the panda queue name - panda_queue = os.environ.get('pandaQueueName') - logging.debug('[main] got panda queue: {0}'.format(panda_queue)) + panda_queue = os.environ.get("pandaQueueName") + logging.debug("[main] got panda queue: {0}".format(panda_queue)) # get the resource type of the worker - resource_type = os.environ.get('resourceType') - logging.debug('[main] got resource type: {0}'.format(resource_type)) + resource_type = os.environ.get("resourceType") + logging.debug("[main] got resource type: {0}".format(resource_type)) - prodSourceLabel = os.environ.get('prodSourceLabel') - logging.debug('[main] got prodSourceLabel: {0}'.format(prodSourceLabel)) + prodSourceLabel = os.environ.get("prodSourceLabel") + logging.debug("[main] got prodSourceLabel: {0}".format(prodSourceLabel)) - job_type = os.environ.get('jobType') - logging.debug('[main] got job type: {0}'.format(job_type)) + job_type = os.environ.get("jobType") + logging.debug("[main] got job type: {0}".format(job_type)) - pilot_type = os.environ.get('pilotType', '') - logging.debug('[main] got pilotType: {0}'.format(pilot_type)) + pilot_type = os.environ.get("pilotType", "") + logging.debug("[main] got pilotType: {0}".format(pilot_type)) - pilot_url_option = os.environ.get('pilotUrlOpt', '') - logging.debug('[main] got pilotUrlOpt: {0}'.format(pilot_url_option)) + pilot_url_option = os.environ.get("pilotUrlOpt", "") + logging.debug("[main] got pilotUrlOpt: {0}".format(pilot_url_option)) - python_option = os.environ.get('pythonOption', '') - logging.debug('[main] got pythonOption: {0}'.format(python_option)) + python_option = os.environ.get("pythonOption", "") + logging.debug("[main] got pythonOption: {0}".format(python_option)) - pilot_version = os.environ.get('pilotVersion', '') - logging.debug('[main] got pilotVersion: {0}'.format(pilot_version)) + pilot_version = os.environ.get("pilotVersion", "") + logging.debug("[main] got pilotVersion: {0}".format(pilot_version)) - pilot_proxy_check_tmp = os.environ.get('pilotProxyCheck', 'False') + pilot_proxy_check_tmp = os.environ.get("pilotProxyCheck", "False") pilot_proxy_check = str_to_bool(pilot_proxy_check_tmp) - logging.debug('[main] got pilotProxyCheck: {0}'.format(pilot_proxy_check)) + logging.debug("[main] got pilotProxyCheck: {0}".format(pilot_proxy_check)) # get the Harvester ID - harvester_id = os.environ.get('HARVESTER_ID') - logging.debug('[main] got Harvester ID: {0}'.format(harvester_id)) + harvester_id = os.environ.get("HARVESTER_ID") + logging.debug("[main] got Harvester ID: {0}".format(harvester_id)) # get the worker id - worker_id = os.environ.get('workerID') - logging.debug('[main] got worker ID: {0}'.format(worker_id)) + worker_id = os.environ.get("workerID") + logging.debug("[main] got worker ID: {0}".format(worker_id)) # get the URL (e.g. panda cache) to upload logs - logs_frontend_w = os.environ.get('logs_frontend_w') - logging.debug('[main] got url to upload logs') + logs_frontend_w = os.environ.get("logs_frontend_w") + logging.debug("[main] got url to upload logs") # get the URL (e.g. 
panda cache) where the logs can be downloaded afterwards - logs_frontend_r = os.environ.get('logs_frontend_r') - logging.debug('[main] got url to download logs') + logs_frontend_r = os.environ.get("logs_frontend_r") + logging.debug("[main] got url to download logs") # get the filename to use for the stdout log - stdout_name = os.environ.get('stdout_name') + stdout_name = os.environ.get("stdout_name") if not stdout_name: - stdout_name = '{0}_{1}.out'.format(harvester_id, worker_id) + stdout_name = "{0}_{1}.out".format(harvester_id, worker_id) - logging.debug('[main] got filename for the stdout log') + logging.debug("[main] got filename for the stdout log") # get the submission mode (push/pull) for the pilot - submit_mode = os.environ.get('submit_mode') + submit_mode = os.environ.get("submit_mode") if not submit_mode: - submit_mode = 'PULL' + submit_mode = "PULL" # see if there is a work directory specified - tmpdir = os.environ.get('TMPDIR') + tmpdir = os.environ.get("TMPDIR") if tmpdir: global WORK_DIR WORK_DIR = tmpdir - return proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, pilot_type, \ - pilot_url_option, python_option, pilot_proxy_check, pilot_version, harvester_id, worker_id, logs_frontend_w, \ - logs_frontend_r, stdout_name, submit_mode + return ( + proxy_path, + panda_site, + panda_queue, + resource_type, + prodSourceLabel, + job_type, + pilot_type, + pilot_url_option, + python_option, + pilot_proxy_check, + pilot_version, + harvester_id, + worker_id, + logs_frontend_w, + logs_frontend_r, + stdout_name, + submit_mode, + ) if __name__ == "__main__": - # get all the configuration from environment - proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, pilot_type, pilot_url_opt, \ - python_option, pilot_proxy_check, pilot_version, harvester_id, worker_id, logs_frontend_w, logs_frontend_r, \ - destination_name, submit_mode = get_configuration() + ( + proxy_path, + panda_site, + panda_queue, + resource_type, + prodSourceLabel, + job_type, + pilot_type, + pilot_url_opt, + python_option, + pilot_proxy_check, + pilot_version, + harvester_id, + worker_id, + logs_frontend_w, + logs_frontend_r, + destination_name, + submit_mode, + ) = get_configuration() # the pilot should propagate the download link via the pilotId field in the job table - log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name) - os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot + log_download_url = "{0}/{1}".format(logs_frontend_r, destination_name) + os.environ["GTAG"] = log_download_url # GTAG env variable is read by pilot # execute the pilot wrapper - logging.debug('[main] starting pilot wrapper...') - resource_type_option = '' + logging.debug("[main] starting pilot wrapper...") + resource_type_option = "" if resource_type: - resource_type_option = '--resource-type {0}'.format(resource_type) + resource_type_option = "--resource-type {0}".format(resource_type) if prodSourceLabel: - psl_option = '-j {0}'.format(prodSourceLabel) + psl_option = "-j {0}".format(prodSourceLabel) else: - psl_option = '-j managed' + psl_option = "-j managed" - job_type_option = '' + job_type_option = "" if job_type: - job_type_option = '--job-type {0}'.format(job_type) + job_type_option = "--job-type {0}".format(job_type) - pilot_type_option = '-i PR' + pilot_type_option = "-i PR" if pilot_type: - pilot_type_option = '-i {0}'.format(pilot_type) + pilot_type_option = "-i {0}".format(pilot_type) - pilot_proxy_check_option = '-t' # This disables the 
proxy check + pilot_proxy_check_option = "-t" # This disables the proxy check if pilot_proxy_check: - pilot_proxy_check_option = '' # Empty enables the proxy check (default pilot behaviour) - + pilot_proxy_check_option = "" # Empty enables the proxy check (default pilot behaviour) - pilot_version_option = '--pilotversion 2' + pilot_version_option = "--pilotversion 2" if pilot_version: - pilot_version_option = '--pilotversion {0}'.format(pilot_version) - - wrapper_params = '-a {0} -s {1} -r {2} -q {3} {4} {5} {6} {7} {8} {9} {10} {11}'.format(WORK_DIR, panda_site, - panda_queue, panda_queue, - resource_type_option, - psl_option, - pilot_type_option, - job_type_option, - pilot_url_opt, - python_option, - pilot_version_option, - pilot_proxy_check_option) - - if submit_mode == 'PUSH': + pilot_version_option = "--pilotversion {0}".format(pilot_version) + + wrapper_params = "-a {0} -s {1} -r {2} -q {3} {4} {5} {6} {7} {8} {9} {10} {11}".format( + WORK_DIR, + panda_site, + panda_queue, + panda_queue, + resource_type_option, + psl_option, + pilot_type_option, + job_type_option, + pilot_url_opt, + python_option, + pilot_version_option, + pilot_proxy_check_option, + ) + + if submit_mode == "PUSH": # job configuration files need to be copied, because k8s configmap mounts as read-only file system # and therefore the pilot cannot execute in the same directory copy_files_in_dir(CONFIG_DIR, WORK_DIR) wrapper_executable = "/cvmfs/atlas.cern.ch/repo/sw/PandaPilotWrapper/latest/runpilot2-wrapper.sh" - command = "sh {0} {1} -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={2} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ - format(wrapper_executable, wrapper_params, submit_mode) + command = "sh {0} {1} -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={2} --allow-same-user=False | tee /tmp/wrapper-wid.log".format( + wrapper_executable, wrapper_params, submit_mode + ) try: subprocess.call(command, shell=True) - except: + except BaseException: logging.error(traceback.format_exc()) - logging.debug('[main] pilot wrapper done...') + logging.debug("[main] pilot wrapper done...") # upload logs to e.g. 
panda cache or similar - upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) - logging.debug('[main] FINISHED') \ No newline at end of file + upload_logs(logs_frontend_w, "/tmp/wrapper-wid.log", destination_name, proxy_path) + logging.debug("[main] FINISHED") diff --git a/pandaharvester/harvestercommunicator/base_communicator.py b/pandaharvester/harvestercommunicator/base_communicator.py index 5bca53de..2ab83c0b 100644 --- a/pandaharvester/harvestercommunicator/base_communicator.py +++ b/pandaharvester/harvestercommunicator/base_communicator.py @@ -9,7 +9,7 @@ from pandaharvester.harvestercore import core_utils # logger -_logger = core_utils.setup_logger('communicator') +_logger = core_utils.setup_logger("communicator") # base class for communication with WMS @@ -23,13 +23,12 @@ def make_logger(self, tag=None, method_name=None): return core_utils.make_logger(_logger, token=tag, method_name=method_name) # get jobs - def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs, - additional_criteria): - return [], '' + def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs, additional_criteria): + return [], "" # update jobs def update_jobs(self, jobspec_list, id): - return [{'StatusCode': 0, 'ErrorDiag': '', 'command': ''}] * len(jobspec_list) + return [{"StatusCode": 0, "ErrorDiag": "", "command": ""}] * len(jobspec_list) # get events def get_event_ranges(self, data_map, scattered, base_path): @@ -38,8 +37,8 @@ def get_event_ranges(self, data_map, scattered, base_path): # update events def update_event_ranges(self, event_ranges, tmp_log): retMap = dict() - retMap['StatusCode'] = 0 - retMap['Returns'] = [True] * len(event_ranges) + retMap["StatusCode"] = 0 + retMap["Returns"] = [True] * len(event_ranges) return retMap # get commands @@ -52,24 +51,24 @@ def ack_commands(self, command_ids): # update workers def update_workers(self, workspec_list): - return [True] * len(workspec_list), '' + return [True] * len(workspec_list), "" # send heartbeat of harvester instance def is_alive(self, key_values): - return True, '' + return True, "" # update worker stats def update_worker_stats(self, site_name, stats): - return True, '' + return True, "" # check jobs def check_jobs(self, jobspec_list): - return [{'StatusCode': 0, 'ErrorDiag': '', 'command': ''}] * len(jobspec_list) + return [{"StatusCode": 0, "ErrorDiag": "", "command": ""}] * len(jobspec_list) # send dialog messages def send_dialog_messages(self, dialog_list): - return True, '' + return True, "" # update service metrics def update_service_metrics(self, service_metrics_list): - return True, '' \ No newline at end of file + return True, "" diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index 0f28a4fe..cb266d82 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -3,6 +3,7 @@ """ import ssl + try: # disable SNI for TLSV1_UNRECOGNIZED_NAME before importing requests ssl.HAS_SNI = False @@ -18,8 +19,10 @@ import datetime import traceback from future.utils import iteritems + # TO BE REMOVED for python2.7 import requests.packages.urllib3 + try: requests.packages.urllib3.disable_warnings() except Exception: @@ -38,27 +41,26 @@ class PandaCommunicator(BaseCommunicator): def __init__(self): BaseCommunicator.__init__(self) self.useInspect = False - if hasattr(harvester_config.pandacon, 'verbose') 
and harvester_config.pandacon.verbose: + if hasattr(harvester_config.pandacon, "verbose") and harvester_config.pandacon.verbose: self.verbose = True - if hasattr(harvester_config.pandacon, 'useInspect') and harvester_config.pandacon.useInspect is True: + if hasattr(harvester_config.pandacon, "useInspect") and harvester_config.pandacon.useInspect is True: self.useInspect = True else: self.verbose = False - if hasattr(harvester_config.pandacon, 'auth_type'): + if hasattr(harvester_config.pandacon, "auth_type"): self.auth_type = harvester_config.pandacon.auth_type else: - self.auth_type = 'x509' + self.auth_type = "x509" self.auth_token = None self.auth_token_last_update = None # renew token def renew_token(self): - if hasattr(harvester_config.pandacon, 'auth_token'): - if harvester_config.pandacon.auth_token.startswith('file:'): - if self.auth_token_last_update is not None and \ - datetime.datetime.utcnow() - self.auth_token_last_update < datetime.timedelta(minutes=60): + if hasattr(harvester_config.pandacon, "auth_token"): + if harvester_config.pandacon.auth_token.startswith("file:"): + if self.auth_token_last_update is not None and datetime.datetime.utcnow() - self.auth_token_last_update < datetime.timedelta(minutes=60): return - with open(harvester_config.pandacon.auth_token.split(':')[-1]) as f: + with open(harvester_config.pandacon.auth_token.split(":")[-1]) as f: self.auth_token = f.read() self.auth_token_last_update = datetime.datetime.utcnow() else: @@ -71,27 +73,22 @@ def post(self, path, data): try: tmpLog = None if self.verbose: - tmpLog = self.make_logger(method_name='post') + tmpLog = self.make_logger(method_name="post") if self.useInspect: tmpExec = inspect.stack()[1][3] - tmpExec += '/' + tmpExec += "/" tmpExec = str(uuid.uuid4()) - url = '{0}/{1}'.format(harvester_config.pandacon.pandaURL, path) + url = "{0}/{1}".format(harvester_config.pandacon.pandaURL, path) if self.verbose: - tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data))) + tmpLog.debug("exec={0} URL={1} data={2}".format(tmpExec, url, str(data))) session = get_http_adapter_with_random_dns_resolution() - res = session.post(url, - data=data, - headers={"Accept": "application/json", - "Connection": "close"}, - timeout=harvester_config.pandacon.timeout) + res = session.post(url, data=data, headers={"Accept": "application/json", "Connection": "close"}, timeout=harvester_config.pandacon.timeout) if self.verbose: - tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) + tmpLog.debug("exec={0} code={1} return={2}".format(tmpExec, res.status_code, res.text)) if res.status_code == 200: return True, res else: - errMsg = 'StatusCode={0} {1}'.format(res.status_code, - res.text) + errMsg = "StatusCode={0} {1}".format(res.status_code, res.text) except Exception: errType, errValue = sys.exc_info()[:2] errMsg = "failed to post with {0}:{1} ".format(errType, errValue) @@ -103,43 +100,34 @@ def post_ssl(self, path, data, cert=None, base_url=None): try: tmpLog = None if self.verbose: - tmpLog = self.make_logger(method_name='post_ssl') + tmpLog = self.make_logger(method_name="post_ssl") if self.useInspect: tmpExec = inspect.stack()[1][3] - tmpExec += '/' + tmpExec += "/" tmpExec = str(uuid.uuid4()) if base_url is None: base_url = harvester_config.pandacon.pandaURLSSL - url = '{0}/{1}'.format(base_url, path) + url = "{0}/{1}".format(base_url, path) if self.verbose: - tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data))) - headers = {"Accept": "application/json", - 
"Connection": "close"} - if self.auth_type == 'oidc': + tmpLog.debug("exec={0} URL={1} data={2}".format(tmpExec, url, str(data))) + headers = {"Accept": "application/json", "Connection": "close"} + if self.auth_type == "oidc": self.renew_token() cert = None - headers['Authorization'] = 'Bearer {0}'.format(self.auth_token) - headers['Origin'] = harvester_config.pandacon.auth_origin + headers["Authorization"] = "Bearer {0}".format(self.auth_token) + headers["Origin"] = harvester_config.pandacon.auth_origin else: if cert is None: - cert = (harvester_config.pandacon.cert_file, - harvester_config.pandacon.key_file) + cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) session = get_http_adapter_with_random_dns_resolution() sw = core_utils.get_stopwatch() - res = session.post(url, - data=data, - headers=headers, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + res = session.post(url, data=data, headers=headers, timeout=harvester_config.pandacon.timeout, verify=harvester_config.pandacon.ca_cert, cert=cert) if self.verbose: - tmpLog.debug('exec={0} code={1} {3}. return={2}'.format(tmpExec, res.status_code, res.text, - sw.get_elapsed_time())) + tmpLog.debug("exec={0} code={1} {3}. return={2}".format(tmpExec, res.status_code, res.text, sw.get_elapsed_time())) if res.status_code == 200: return True, res else: - errMsg = 'StatusCode={0} {1}'.format(res.status_code, - res.text) + errMsg = "StatusCode={0} {1}".format(res.status_code, res.text) except Exception: errType, errValue = sys.exc_info()[:2] errMsg = "failed to post with {0}:{1} ".format(errType, errValue) @@ -152,41 +140,36 @@ def put_ssl(self, path, files, cert=None, base_url=None): tmpLog = None tmpExec = None if self.verbose: - tmpLog = self.make_logger(method_name='put_ssl') + tmpLog = self.make_logger(method_name="put_ssl") if self.useInspect: tmpExec = inspect.stack()[1][3] - tmpExec += '/' + tmpExec += "/" tmpExec = str(uuid.uuid4()) if base_url is None: base_url = harvester_config.pandacon.pandaCacheURL_W - url = '{0}/{1}'.format(base_url, path) + url = "{0}/{1}".format(base_url, path) if self.verbose: - tmpLog.debug('exec={0} URL={1} files={2}'.format(tmpExec, url, files['file'][0])) - if self.auth_type == 'oidc': + tmpLog.debug("exec={0} URL={1} files={2}".format(tmpExec, url, files["file"][0])) + if self.auth_type == "oidc": self.renew_token() cert = None headers = dict() - headers['Authorization'] = 'Bearer {0}'.format(self.auth_token) - headers['Origin'] = harvester_config.pandacon.auth_origin + headers["Authorization"] = "Bearer {0}".format(self.auth_token) + headers["Origin"] = harvester_config.pandacon.auth_origin else: headers = None if cert is None: - cert = (harvester_config.pandacon.cert_file, - harvester_config.pandacon.key_file) + cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) session = get_http_adapter_with_random_dns_resolution() - res = session.post(url, - files=files, - headers=headers, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + res = session.post( + url, files=files, headers=headers, timeout=harvester_config.pandacon.timeout, verify=harvester_config.pandacon.ca_cert, cert=cert + ) if self.verbose: - tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) + tmpLog.debug("exec={0} code={1} return={2}".format(tmpExec, res.status_code, res.text)) if res.status_code == 200: return True, res else: - errMsg = 
'StatusCode={0} {1}'.format(res.status_code, - res.text) + errMsg = "StatusCode={0} {1}".format(res.status_code, res.text) except Exception: errType, errValue = sys.exc_info()[:2] errMsg = "failed to put with {0}:{1} ".format(errType, errValue) @@ -195,46 +178,45 @@ def put_ssl(self, path, files, cert=None, base_url=None): # check server def check_panda(self): - tmpStat, tmpRes = self.post_ssl('isAlive', {}) + tmpStat, tmpRes = self.post_ssl("isAlive", {}) if tmpStat: return tmpStat, tmpRes.status_code, tmpRes.text else: return tmpStat, tmpRes # get jobs - def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs, - additional_criteria): + def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n_jobs, additional_criteria): # get logger - tmpLog = self.make_logger('siteName={0}'.format(site_name), method_name='get_jobs') - tmpLog.debug('try to get {0} jobs'.format(n_jobs)) + tmpLog = self.make_logger("siteName={0}".format(site_name), method_name="get_jobs") + tmpLog.debug("try to get {0} jobs".format(n_jobs)) data = {} - data['siteName'] = site_name - data['node'] = node_name - data['prodSourceLabel'] = prod_source_label - data['computingElement'] = computing_element - data['nJobs'] = n_jobs - data['schedulerID'] = 'harvester-{0}'.format(harvester_config.master.harvester_id) + data["siteName"] = site_name + data["node"] = node_name + data["prodSourceLabel"] = prod_source_label + data["computingElement"] = computing_element + data["nJobs"] = n_jobs + data["schedulerID"] = "harvester-{0}".format(harvester_config.master.harvester_id) if additional_criteria is not None: for tmpKey, tmpVal in iteritems(additional_criteria): data[tmpKey] = tmpVal sw = core_utils.get_stopwatch() - tmpStat, tmpRes = self.post_ssl('getJob', data) - tmpLog.debug('getJob for {0} jobs {1}'.format(n_jobs, sw.get_elapsed_time())) - errStr = 'OK' + tmpStat, tmpRes = self.post_ssl("getJob", data) + tmpLog.debug("getJob for {0} jobs {1}".format(n_jobs, sw.get_elapsed_time())) + errStr = "OK" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) else: try: tmpDict = tmpRes.json() - tmpLog.debug('StatusCode={0}'.format(tmpDict['StatusCode'])) - if tmpDict['StatusCode'] == 0: - tmpLog.debug('got {0} jobs'.format(len(tmpDict['jobs']))) - return tmpDict['jobs'], errStr + tmpLog.debug("StatusCode={0}".format(tmpDict["StatusCode"])) + if tmpDict["StatusCode"] == 0: + tmpLog.debug("got {0} jobs".format(len(tmpDict["jobs"]))) + return tmpDict["jobs"], errStr else: - if 'errorDialog' in tmpDict: - errStr = tmpDict['errorDialog'] + if "errorDialog" in tmpDict: + errStr = tmpDict["errorDialog"] else: - errStr = "StatusCode={0}".format(tmpDict['StatusCode']) + errStr = "StatusCode={0}".format(tmpDict["StatusCode"]) return [], errStr except Exception: errStr = core_utils.dump_error_message(tmpLog, tmpRes) @@ -243,98 +225,95 @@ def get_jobs(self, site_name, node_name, prod_source_label, computing_element, n # update jobs def update_jobs(self, jobspec_list, id): sw = core_utils.get_stopwatch() - tmpLogG = self.make_logger('id={0}'.format(id), method_name='update_jobs') - tmpLogG.debug('update {0} jobs'.format(len(jobspec_list))) + tmpLogG = self.make_logger("id={0}".format(id), method_name="update_jobs") + tmpLogG.debug("update {0} jobs".format(len(jobspec_list))) retList = [] # upload checkpoints for jobSpec in jobspec_list: if jobSpec.outFiles: - tmpLogG.debug('upload {0} checkpoint files for PandaID={1}'.format(len(jobSpec.outFiles), - jobSpec.PandaID)) + 
tmpLogG.debug("upload {0} checkpoint files for PandaID={1}".format(len(jobSpec.outFiles), jobSpec.PandaID)) for fileSpec in jobSpec.outFiles: - if 'sourceURL' in jobSpec.jobParams: - tmpS = self.upload_checkpoint(jobSpec.jobParams['sourceURL'], jobSpec.taskID, - jobSpec.PandaID, fileSpec.lfn, fileSpec.path) + if "sourceURL" in jobSpec.jobParams: + tmpS = self.upload_checkpoint(jobSpec.jobParams["sourceURL"], jobSpec.taskID, jobSpec.PandaID, fileSpec.lfn, fileSpec.path) if tmpS: - fileSpec.status = 'done' + fileSpec.status = "done" # update events for jobSpec in jobspec_list: eventRanges, eventSpecs = jobSpec.to_event_data(max_events=10000) if eventRanges != []: - tmpLogG.debug('update {0} events for PandaID={1}'.format(len(eventSpecs), jobSpec.PandaID)) + tmpLogG.debug("update {0} events for PandaID={1}".format(len(eventSpecs), jobSpec.PandaID)) tmpRet = self.update_event_ranges(eventRanges, tmpLogG) - if tmpRet['StatusCode'] == 0: - for eventSpec, retVal in zip(eventSpecs, tmpRet['Returns']): + if tmpRet["StatusCode"] == 0: + for eventSpec, retVal in zip(eventSpecs, tmpRet["Returns"]): if retVal in [True, False] and eventSpec.is_final_status(): - eventSpec.subStatus = 'done' + eventSpec.subStatus = "done" # update jobs in bulk nLookup = 100 iLookup = 0 while iLookup < len(jobspec_list): dataList = [] - jobSpecSubList = jobspec_list[iLookup:iLookup+nLookup] + jobSpecSubList = jobspec_list[iLookup : iLookup + nLookup] for jobSpec in jobSpecSubList: data = jobSpec.get_job_attributes_for_panda() - data['jobId'] = jobSpec.PandaID - data['siteName'] = jobSpec.computingSite - data['state'] = jobSpec.get_status() - data['attemptNr'] = jobSpec.attemptNr - data['jobSubStatus'] = jobSpec.subStatus + data["jobId"] = jobSpec.PandaID + data["siteName"] = jobSpec.computingSite + data["state"] = jobSpec.get_status() + data["attemptNr"] = jobSpec.attemptNr + data["jobSubStatus"] = jobSpec.subStatus # change cancelled to failed to be accepted by panda server - if data['state'] in ['cancelled', 'missed']: + if data["state"] in ["cancelled", "missed"]: if jobSpec.is_pilot_closed(): - data['jobSubStatus'] = 'pilot_closed' + data["jobSubStatus"] = "pilot_closed" else: - data['jobSubStatus'] = data['state'] - data['state'] = 'failed' - if jobSpec.startTime is not None and 'startTime' not in data: - data['startTime'] = jobSpec.startTime.strftime('%Y-%m-%d %H:%M:%S') - if jobSpec.endTime is not None and 'endTime' not in data: - data['endTime'] = jobSpec.endTime.strftime('%Y-%m-%d %H:%M:%S') - if 'coreCount' not in data and jobSpec.nCore is not None: - data['coreCount'] = jobSpec.nCore + data["jobSubStatus"] = data["state"] + data["state"] = "failed" + if jobSpec.startTime is not None and "startTime" not in data: + data["startTime"] = jobSpec.startTime.strftime("%Y-%m-%d %H:%M:%S") + if jobSpec.endTime is not None and "endTime" not in data: + data["endTime"] = jobSpec.endTime.strftime("%Y-%m-%d %H:%M:%S") + if "coreCount" not in data and jobSpec.nCore is not None: + data["coreCount"] = jobSpec.nCore if jobSpec.is_final_status() and jobSpec.status == jobSpec.get_status(): if jobSpec.metaData is not None: - data['metaData'] = json.dumps(jobSpec.metaData) + data["metaData"] = json.dumps(jobSpec.metaData) if jobSpec.outputFilesToReport is not None: - data['xml'] = jobSpec.outputFilesToReport + data["xml"] = jobSpec.outputFilesToReport dataList.append(data) harvester_id = harvester_config.master.harvester_id - tmpData = {'jobList': json.dumps(dataList), 'harvester_id': harvester_id} - tmpStat, tmpRes = 
self.post_ssl('updateJobsInBulk', tmpData) + tmpData = {"jobList": json.dumps(dataList), "harvester_id": harvester_id} + tmpStat, tmpRes = self.post_ssl("updateJobsInBulk", tmpData) retMaps = None - errStr = '' + errStr = "" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLogG, tmpRes) else: try: tmpStat, retMaps = tmpRes.json() if tmpStat is False: - tmpLogG.error('updateJobsInBulk failed with {0}'.format(retMaps)) + tmpLogG.error("updateJobsInBulk failed with {0}".format(retMaps)) retMaps = None except Exception: errStr = core_utils.dump_error_message(tmpLogG) if retMaps is None: retMap = {} - retMap['content'] = {} - retMap['content']['StatusCode'] = 999 - retMap['content']['ErrorDiag'] = errStr + retMap["content"] = {} + retMap["content"]["StatusCode"] = 999 + retMap["content"]["ErrorDiag"] = errStr retMaps = [json.dumps(retMap)] * len(jobSpecSubList) for jobSpec, retMap, data in zip(jobSpecSubList, retMaps, dataList): - tmpLog = self.make_logger('id={0} PandaID={1}'.format(id, jobSpec.PandaID), - method_name='update_jobs') + tmpLog = self.make_logger("id={0} PandaID={1}".format(id, jobSpec.PandaID), method_name="update_jobs") try: - retMap = json.loads(retMap['content']) + retMap = json.loads(retMap["content"]) except Exception: - errStr = 'failed to json_load {}'.format(str(retMap)) + errStr = "failed to json_load {}".format(str(retMap)) retMap = {} - retMap['StatusCode'] = 999 - retMap['ErrorDiag'] = errStr - tmpLog.debug('data={0}'.format(str(data))) - tmpLog.debug('done with {0}'.format(str(retMap))) + retMap["StatusCode"] = 999 + retMap["ErrorDiag"] = errStr + tmpLog.debug("data={0}".format(str(data))) + tmpLog.debug("done with {0}".format(str(retMap))) retList.append(retMap) iLookup += nLookup - tmpLogG.debug('done' + sw.get_elapsed_time()) + tmpLogG.debug("done" + sw.get_elapsed_time()) return retList # get events @@ -347,106 +326,99 @@ def get_event_ranges(self, data_map, scattered, base_path): getEventsChunkSize = 5120 for pandaID, data in iteritems(data_map): # get logger - tmpLog = self.make_logger('PandaID={0}'.format(data['pandaID']), - method_name='get_event_ranges') - if 'nRanges' in data: - nRanges = data['nRanges'] + tmpLog = self.make_logger("PandaID={0}".format(data["pandaID"]), method_name="get_event_ranges") + if "nRanges" in data: + nRanges = data["nRanges"] else: nRanges = 1 if scattered: - data['scattered'] = True - if 'isHPO' in data: - isHPO = data['isHPO'] - del data['isHPO'] + data["scattered"] = True + if "isHPO" in data: + isHPO = data["isHPO"] + del data["isHPO"] else: isHPO = False - if 'sourceURL' in data: - sourceURL = data['sourceURL'] - del data['sourceURL'] + if "sourceURL" in data: + sourceURL = data["sourceURL"] + del data["sourceURL"] else: sourceURL = None - tmpLog.debug('start nRanges={0}'.format(nRanges)) + tmpLog.debug("start nRanges={0}".format(nRanges)) while nRanges > 0: # use a small chunk size to avoid timeout chunkSize = min(getEventsChunkSize, nRanges) - data['nRanges'] = chunkSize - tmpStat, tmpRes = self.post_ssl('getEventRanges', data) + data["nRanges"] = chunkSize + tmpStat, tmpRes = self.post_ssl("getEventRanges", data) if tmpStat is False: core_utils.dump_error_message(tmpLog, tmpRes) else: try: tmpDict = tmpRes.json() - if tmpDict['StatusCode'] == 0: + if tmpDict["StatusCode"] == 0: retStat = True - retVal.setdefault(data['pandaID'], []) + retVal.setdefault(data["pandaID"], []) if not isHPO: - retVal[data['pandaID']] += tmpDict['eventRanges'] + retVal[data["pandaID"]] += tmpDict["eventRanges"] else: - for event 
in tmpDict['eventRanges']: - event_id = event['eventRangeID'] - task_id = event_id.split('-')[0] - point_id = event_id.split('-')[3] + for event in tmpDict["eventRanges"]: + event_id = event["eventRangeID"] + task_id = event_id.split("-")[0] + point_id = event_id.split("-")[3] # get HP point - tmpSI, tmpOI = idds_utils.get_hp_point(harvester_config.pandacon.iddsURL, - task_id, point_id, - tmpLog, self.verbose) + tmpSI, tmpOI = idds_utils.get_hp_point(harvester_config.pandacon.iddsURL, task_id, point_id, tmpLog, self.verbose) if tmpSI: - event['hp_point'] = tmpOI + event["hp_point"] = tmpOI # get checkpoint if sourceURL: - tmpSO, tmpOO = self.download_checkpoint(sourceURL, task_id, - data['pandaID'], - point_id, base_path) + tmpSO, tmpOO = self.download_checkpoint(sourceURL, task_id, data["pandaID"], point_id, base_path) if tmpSO: - event['checkpoint'] = tmpOO - retVal[data['pandaID']].append(event) + event["checkpoint"] = tmpOO + retVal[data["pandaID"]].append(event) else: core_utils.dump_error_message(tmpLog, tmpOI) # got empty - if len(tmpDict['eventRanges']) == 0: + if len(tmpDict["eventRanges"]) == 0: break except Exception: core_utils.dump_error_message(tmpLog) break nRanges -= chunkSize - tmpLog.debug('done with {0}'.format(str(retVal))) + tmpLog.debug("done with {0}".format(str(retVal))) return retStat, retVal # update events def update_event_ranges(self, event_ranges, tmp_log): - tmp_log.debug('start update_event_ranges') + tmp_log.debug("start update_event_ranges") # loop over for HPO for item in event_ranges: new_event_ranges = [] - for event in item['eventRanges']: + for event in item["eventRanges"]: # report loss to idds - if 'loss' in event: - event_id = event['eventRangeID'] - task_id = event_id.split('-')[0] - point_id = event_id.split('-')[3] - tmpSI, tmpOI = idds_utils.update_hp_point(harvester_config.pandacon.iddsURL, - task_id, point_id, event['loss'], - tmp_log, self.verbose) + if "loss" in event: + event_id = event["eventRangeID"] + task_id = event_id.split("-")[0] + point_id = event_id.split("-")[3] + tmpSI, tmpOI = idds_utils.update_hp_point(harvester_config.pandacon.iddsURL, task_id, point_id, event["loss"], tmp_log, self.verbose) if not tmpSI: core_utils.dump_error_message(tmp_log, tmpOI) - tmp_log.error('skip {0} since cannot update iDDS'.format(event_id)) + tmp_log.error("skip {0} since cannot update iDDS".format(event_id)) continue else: # clear checkpoint - if 'sourceURL' in item: - tmpSC, tmpOC = self.clear_checkpoint(item['sourceURL'], task_id, point_id) + if "sourceURL" in item: + tmpSC, tmpOC = self.clear_checkpoint(item["sourceURL"], task_id, point_id) if not tmpSC: core_utils.dump_error_message(tmp_log, tmpOC) - del event['loss'] + del event["loss"] new_event_ranges.append(event) - item['eventRanges'] = new_event_ranges + item["eventRanges"] = new_event_ranges # update in panda data = {} - data['eventRanges'] = json.dumps(event_ranges) - data['version'] = 1 - tmp_log.debug('data={0}'.format(str(data))) - tmpStat, tmpRes = self.post_ssl('updateEventRanges', data) + data["eventRanges"] = json.dumps(event_ranges) + data["version"] = 1 + tmp_log.debug("data={0}".format(str(data))) + tmpStat, tmpRes = self.post_ssl("updateEventRanges", data) retMap = None if tmpStat is False: core_utils.dump_error_message(tmp_log, tmpRes) @@ -457,28 +429,27 @@ def update_event_ranges(self, event_ranges, tmp_log): core_utils.dump_error_message(tmp_log) if retMap is None: retMap = {} - retMap['StatusCode'] = 999 - tmp_log.debug('done updateEventRanges with 
{0}'.format(str(retMap))) + retMap["StatusCode"] = 999 + tmp_log.debug("done updateEventRanges with {0}".format(str(retMap))) return retMap # get commands def get_commands(self, n_commands): harvester_id = harvester_config.master.harvester_id - tmpLog = self.make_logger('harvesterID={0}'.format(harvester_id), - method_name='get_commands') - tmpLog.debug('Start retrieving {0} commands'.format(n_commands)) + tmpLog = self.make_logger("harvesterID={0}".format(harvester_id), method_name="get_commands") + tmpLog.debug("Start retrieving {0} commands".format(n_commands)) data = {} - data['harvester_id'] = harvester_id - data['n_commands'] = n_commands - tmp_stat, tmp_res = self.post_ssl('getCommands', data) + data["harvester_id"] = harvester_id + data["n_commands"] = n_commands + tmp_stat, tmp_res = self.post_ssl("getCommands", data) if tmp_stat is False: core_utils.dump_error_message(tmpLog, tmp_res) else: try: tmp_dict = tmp_res.json() - if tmp_dict['StatusCode'] == 0: - tmpLog.debug('Commands {0}'.format(tmp_dict['Commands'])) - return tmp_dict['Commands'] + if tmp_dict["StatusCode"] == 0: + tmpLog.debug("Commands {0}".format(tmp_dict["Commands"])) + return tmp_dict["Commands"] return [] except Exception: core_utils.dump_error_message(tmpLog, tmp_res) @@ -487,19 +458,18 @@ def get_commands(self, n_commands): # send ACKs def ack_commands(self, command_ids): harvester_id = harvester_config.master.harvester_id - tmpLog = self.make_logger('harvesterID={0}'.format(harvester_id), - method_name='ack_commands') - tmpLog.debug('Start acknowledging {0} commands (command_ids={1})'.format(len(command_ids), command_ids)) + tmpLog = self.make_logger("harvesterID={0}".format(harvester_id), method_name="ack_commands") + tmpLog.debug("Start acknowledging {0} commands (command_ids={1})".format(len(command_ids), command_ids)) data = {} - data['command_ids'] = json.dumps(command_ids) - tmp_stat, tmp_res = self.post_ssl('ackCommands', data) + data["command_ids"] = json.dumps(command_ids) + tmp_stat, tmp_res = self.post_ssl("ackCommands", data) if tmp_stat is False: core_utils.dump_error_message(tmpLog, tmp_res) else: try: tmp_dict = tmp_res.json() - if tmp_dict['StatusCode'] == 0: - tmpLog.debug('Finished acknowledging commands') + if tmp_dict["StatusCode"] == 0: + tmpLog.debug("Finished acknowledging commands") return True return False except Exception: @@ -509,48 +479,48 @@ def ack_commands(self, command_ids): # get proxy def get_proxy(self, voms_role, cert=None): retVal = None - retMsg = '' + retMsg = "" # get logger - tmpLog = self.make_logger(method_name='get_proxy') - tmpLog.debug('start') - data = {'role': voms_role} - tmpStat, tmpRes = self.post_ssl('getProxy', data, cert) + tmpLog = self.make_logger(method_name="get_proxy") + tmpLog.debug("start") + data = {"role": voms_role} + tmpStat, tmpRes = self.post_ssl("getProxy", data, cert) if tmpStat is False: core_utils.dump_error_message(tmpLog, tmpRes) else: try: tmpDict = tmpRes.json() - if tmpDict['StatusCode'] == 0: - retVal = tmpDict['userProxy'] + if tmpDict["StatusCode"] == 0: + retVal = tmpDict["userProxy"] else: - retMsg = tmpDict['errorDialog'] + retMsg = tmpDict["errorDialog"] core_utils.dump_error_message(tmpLog, retMsg) tmpStat = False except Exception: retMsg = core_utils.dump_error_message(tmpLog, tmpRes) tmpStat = False if tmpStat: - tmpLog.debug('done with {0}'.format(str(retVal))) + tmpLog.debug("done with {0}".format(str(retVal))) return retVal, retMsg # get resource types def get_resource_types(self): - tmp_log = 
self.make_logger(method_name='get_resource_types') - tmp_log.debug('Start retrieving resource types') + tmp_log = self.make_logger(method_name="get_resource_types") + tmp_log.debug("Start retrieving resource types") data = {} - ret_msg = '' + ret_msg = "" ret_val = None - tmp_stat, tmp_res = self.post_ssl('getResourceTypes', data) + tmp_stat, tmp_res = self.post_ssl("getResourceTypes", data) if tmp_stat is False: core_utils.dump_error_message(tmp_log, tmp_res) else: try: tmp_dict = tmp_res.json() - if tmp_dict['StatusCode'] == 0: - ret_val = tmp_dict['ResourceTypes'] - tmp_log.debug('Resource types: {0}'.format(ret_val)) + if tmp_dict["StatusCode"] == 0: + ret_val = tmp_dict["ResourceTypes"] + tmp_log.debug("Resource types: {0}".format(ret_val)) else: - ret_msg = tmp_dict['errorDialog'] + ret_msg = tmp_dict["errorDialog"] core_utils.dump_error_message(tmp_log, ret_msg) except Exception: core_utils.dump_error_message(tmp_log, tmp_res) @@ -559,38 +529,38 @@ def get_resource_types(self): # get job statistics def get_job_stats(self): - tmp_log = self.make_logger(method_name='get_job_stats') - tmp_log.debug('start') + tmp_log = self.make_logger(method_name="get_job_stats") + tmp_log.debug("start") - tmp_stat, tmp_res = self.post_ssl('getJobStatisticsPerSite', {}) + tmp_stat, tmp_res = self.post_ssl("getJobStatisticsPerSite", {}) stats = {} if tmp_stat is False: - ret_msg = 'FAILED' + ret_msg = "FAILED" core_utils.dump_error_message(tmp_log, tmp_res) else: try: stats = pickle.loads(tmp_res.content) - ret_msg = 'OK' + ret_msg = "OK" except Exception: - ret_msg = 'Exception' + ret_msg = "Exception" core_utils.dump_error_message(tmp_log) return stats, ret_msg # update workers def update_workers(self, workspec_list): - tmpLog = self.make_logger(method_name='update_workers') - tmpLog.debug('start') + tmpLog = self.make_logger(method_name="update_workers") + tmpLog.debug("start") dataList = [] for workSpec in workspec_list: dataList.append(workSpec.convert_to_propagate()) data = dict() - data['harvesterID'] = harvester_config.master.harvester_id - data['workers'] = json.dumps(dataList) - tmpLog.debug('update {0} workers'.format(len(dataList))) - tmpStat, tmpRes = self.post_ssl('updateWorkers', data) + data["harvesterID"] = harvester_config.master.harvester_id + data["workers"] = json.dumps(dataList) + tmpLog.debug("update {0} workers".format(len(dataList))) + tmpStat, tmpRes = self.post_ssl("updateWorkers", data) retList = None - errStr = 'OK' + errStr = "OK" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) else: @@ -602,26 +572,26 @@ def update_workers(self, workspec_list): tmpStat = False except Exception: errStr = core_utils.dump_error_message(tmpLog) - tmpLog.error('conversion failure from {0}'.format(tmpRes.text)) + tmpLog.error("conversion failure from {0}".format(tmpRes.text)) tmpStat = False if tmpStat: - tmpLog.debug('done with {0}'.format(errStr)) + tmpLog.debug("done with {0}".format(errStr)) return retList, errStr # send heartbeat of harvester instance def is_alive(self, key_values): - tmpLog = self.make_logger(method_name='is_alive') - tmpLog.debug('start') + tmpLog = self.make_logger(method_name="is_alive") + tmpLog.debug("start") # convert datetime for tmpKey, tmpVal in iteritems(key_values): if isinstance(tmpVal, datetime.datetime): - tmpVal = 'datetime/' + tmpVal.strftime('%Y-%m-%d %H:%M:%S.%f') + tmpVal = "datetime/" + tmpVal.strftime("%Y-%m-%d %H:%M:%S.%f") key_values[tmpKey] = tmpVal # send data data = dict() - data['harvesterID'] = 
harvester_config.master.harvester_id - data['data'] = json.dumps(key_values) - tmpStat, tmpRes = self.post_ssl('harvesterIsAlive', data) + data["harvesterID"] = harvester_config.master.harvester_id + data["data"] = json.dumps(key_values) + tmpStat, tmpRes = self.post_ssl("harvesterIsAlive", data) retCode = False if tmpStat is False: tmpStr = core_utils.dump_error_message(tmpLog, tmpRes) @@ -630,23 +600,23 @@ def is_alive(self, key_values): retCode, tmpStr = tmpRes.json() except Exception: tmpStr = core_utils.dump_error_message(tmpLog) - tmpLog.error('conversion failure from {0}'.format(tmpRes.text)) + tmpLog.error("conversion failure from {0}".format(tmpRes.text)) tmpStat = False if tmpStat: - tmpLog.debug('done with {0} : {1}'.format(retCode, tmpStr)) + tmpLog.debug("done with {0} : {1}".format(retCode, tmpStr)) return retCode, tmpStr # update worker stats def update_worker_stats(self, site_name, stats): - tmpLog = self.make_logger(method_name='update_worker_stats') - tmpLog.debug('start') + tmpLog = self.make_logger(method_name="update_worker_stats") + tmpLog.debug("start") data = dict() - data['harvesterID'] = harvester_config.master.harvester_id - data['siteName'] = site_name - data['paramsList'] = json.dumps(stats) - tmpLog.debug('update stats for {0}, stats: {1}'.format(site_name, stats)) - tmpStat, tmpRes = self.post_ssl('reportWorkerStats_jobtype', data) - errStr = 'OK' + data["harvesterID"] = harvester_config.master.harvester_id + data["siteName"] = site_name + data["paramsList"] = json.dumps(stats) + tmpLog.debug("update stats for {0}, stats: {1}".format(site_name, stats)) + tmpStat, tmpRes = self.post_ssl("reportWorkerStats_jobtype", data) + errStr = "OK" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) else: @@ -658,27 +628,27 @@ def update_worker_stats(self, site_name, stats): except Exception: tmpStat = False errStr = core_utils.dump_error_message(tmpLog) - tmpLog.error('conversion failure from {0}'.format(tmpRes.text)) + tmpLog.error("conversion failure from {0}".format(tmpRes.text)) if tmpStat: - tmpLog.debug('done with {0}:{1}'.format(tmpStat, errStr)) + tmpLog.debug("done with {0}:{1}".format(tmpStat, errStr)) return tmpStat, errStr # check jobs def check_jobs(self, jobspec_list): - tmpLog = self.make_logger(method_name='check_jobs') - tmpLog.debug('start') + tmpLog = self.make_logger(method_name="check_jobs") + tmpLog.debug("start") retList = [] nLookup = 100 iLookup = 0 while iLookup < len(jobspec_list): ids = [] - for jobSpec in jobspec_list[iLookup:iLookup+nLookup]: + for jobSpec in jobspec_list[iLookup : iLookup + nLookup]: ids.append(str(jobSpec.PandaID)) iLookup += nLookup data = dict() - data['ids'] = ','.join(ids) - tmpStat, tmpRes = self.post_ssl('checkJobStatus', data) - errStr = 'OK' + data["ids"] = ",".join(ids) + tmpStat, tmpRes = self.post_ssl("checkJobStatus", data) + errStr = "OK" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) tmpRes = None @@ -689,26 +659,26 @@ def check_jobs(self, jobspec_list): tmpRes = None errStr = core_utils.dump_error_message(tmpLog) for idx, pandaID in enumerate(ids): - if tmpRes is None or 'data' not in tmpRes or idx >= len(tmpRes['data']): + if tmpRes is None or "data" not in tmpRes or idx >= len(tmpRes["data"]): retMap = dict() - retMap['StatusCode'] = 999 - retMap['ErrorDiag'] = errStr + retMap["StatusCode"] = 999 + retMap["ErrorDiag"] = errStr else: - retMap = tmpRes['data'][idx] - retMap['StatusCode'] = 0 - retMap['ErrorDiag'] = errStr + retMap = tmpRes["data"][idx] + 
retMap["StatusCode"] = 0 + retMap["ErrorDiag"] = errStr retList.append(retMap) - tmpLog.debug('got {0} for PandaID={1}'.format(str(retMap), pandaID)) + tmpLog.debug("got {0} for PandaID={1}".format(str(retMap), pandaID)) return retList # get key pair def get_key_pair(self, public_key_name, private_key_name): - tmpLog = self.make_logger(method_name='get_key_pair') - tmpLog.debug('start for {0}:{1}'.format(public_key_name, private_key_name)) + tmpLog = self.make_logger(method_name="get_key_pair") + tmpLog.debug("start for {0}:{1}".format(public_key_name, private_key_name)) data = dict() - data['publicKeyName'] = public_key_name - data['privateKeyName'] = private_key_name - tmpStat, tmpRes = self.post_ssl('getKeyPair', data) + data["publicKeyName"] = public_key_name + data["privateKeyName"] = private_key_name + tmpStat, tmpRes = self.post_ssl("getKeyPair", data) retMap = None errStr = None if tmpStat is False: @@ -716,73 +686,71 @@ def get_key_pair(self, public_key_name, private_key_name): else: try: retMap = tmpRes.json() - if retMap['StatusCode'] != 0: - errStr = 'failed to get key with StatusCode={0} : {1}'.format(retMap['StatusCode'], - retMap['errorDialog']) + if retMap["StatusCode"] != 0: + errStr = "failed to get key with StatusCode={0} : {1}".format(retMap["StatusCode"], retMap["errorDialog"]) tmpLog.error(errStr) retMap = None else: - tmpLog.debug('got {0} with'.format(str(retMap), errStr)) + tmpLog.debug("got {0} with".format(str(retMap), errStr)) except Exception: errStr = core_utils.dump_error_message(tmpLog) return retMap, errStr # upload file def upload_file(self, file_name, file_object, offset, read_bytes): - tmpLog = self.make_logger(method_name='upload_file') - tmpLog.debug('start for {0} {1}:{2}'.format(file_name, offset, read_bytes)) + tmpLog = self.make_logger(method_name="upload_file") + tmpLog.debug("start for {0} {1}:{2}".format(file_name, offset, read_bytes)) file_object.seek(offset) - files = {'file': (file_name, zlib.compress(file_object.read(read_bytes)))} - tmpStat, tmpRes = self.put_ssl('updateLog', files) + files = {"file": (file_name, zlib.compress(file_object.read(read_bytes)))} + tmpStat, tmpRes = self.put_ssl("updateLog", files) errStr = None if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) else: errStr = tmpRes.text - tmpLog.debug('got {0}'.format(errStr)) + tmpLog.debug("got {0}".format(errStr)) return tmpStat, errStr # check event availability def check_event_availability(self, jobspec): retStat = False retVal = None - tmpLog = self.make_logger('PandaID={0}'.format(jobspec.PandaID), - method_name='check_event_availability') - tmpLog.debug('start') + tmpLog = self.make_logger("PandaID={0}".format(jobspec.PandaID), method_name="check_event_availability") + tmpLog.debug("start") data = dict() - data['taskID'] = jobspec.taskID - data['pandaID'] = jobspec.PandaID + data["taskID"] = jobspec.taskID + data["pandaID"] = jobspec.PandaID if jobspec.jobsetID is None: - data['jobsetID'] = jobspec.jobParams['jobsetID'] + data["jobsetID"] = jobspec.jobParams["jobsetID"] else: - data['jobsetID'] = jobspec.jobsetID - tmpStat, tmpRes = self.post_ssl('checkEventsAvailability', data) + data["jobsetID"] = jobspec.jobsetID + tmpStat, tmpRes = self.post_ssl("checkEventsAvailability", data) if tmpStat is False: core_utils.dump_error_message(tmpLog, tmpRes) else: try: tmpDict = tmpRes.json() - if tmpDict['StatusCode'] == 0: + if tmpDict["StatusCode"] == 0: retStat = True - retVal = tmpDict['nEventRanges'] + retVal = tmpDict["nEventRanges"] except 
Exception: core_utils.dump_error_message(tmpLog, tmpRes) - tmpLog.debug('done with {0}'.format(retVal)) + tmpLog.debug("done with {0}".format(retVal)) return retStat, retVal # send dialog messages def send_dialog_messages(self, dialog_list): - tmpLog = self.make_logger(method_name='send_dialog_messages') - tmpLog.debug('start') + tmpLog = self.make_logger(method_name="send_dialog_messages") + tmpLog.debug("start") dataList = [] for diagSpec in dialog_list: dataList.append(diagSpec.convert_to_propagate()) data = dict() - data['harvesterID'] = harvester_config.master.harvester_id - data['dialogs'] = json.dumps(dataList) - tmpLog.debug('send {0} messages'.format(len(dataList))) - tmpStat, tmpRes = self.post_ssl('addHarvesterDialogs', data) - errStr = 'OK' + data["harvesterID"] = harvester_config.master.harvester_id + data["dialogs"] = json.dumps(dataList) + tmpLog.debug("send {0} messages".format(len(dataList))) + tmpStat, tmpRes = self.post_ssl("addHarvesterDialogs", data) + errStr = "OK" if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) else: @@ -793,22 +761,22 @@ def send_dialog_messages(self, dialog_list): tmpStat = False except Exception: errStr = core_utils.dump_error_message(tmpLog) - tmpLog.error('conversion failure from {0}'.format(tmpRes.text)) + tmpLog.error("conversion failure from {0}".format(tmpRes.text)) tmpStat = False if tmpStat: - tmpLog.debug('done with {0}'.format(errStr)) + tmpLog.debug("done with {0}".format(errStr)) return tmpStat, errStr # update service metrics def update_service_metrics(self, service_metrics_list): - tmp_log = self.make_logger(method_name='update_service_metrics') - tmp_log.debug('start') + tmp_log = self.make_logger(method_name="update_service_metrics") + tmp_log.debug("start") data = dict() - data['harvesterID'] = harvester_config.master.harvester_id - data['metrics'] = json.dumps(service_metrics_list) - tmp_log.debug('updating metrics...') - tmp_stat, tmp_res = self.post_ssl('updateServiceMetrics', data) - err_str = 'OK' + data["harvesterID"] = harvester_config.master.harvester_id + data["metrics"] = json.dumps(service_metrics_list) + tmp_log.debug("updating metrics...") + tmp_stat, tmp_res = self.post_ssl("updateServiceMetrics", data) + err_str = "OK" if tmp_stat is False: err_str = core_utils.dump_error_message(tmp_log, tmp_res) else: @@ -820,23 +788,22 @@ def update_service_metrics(self, service_metrics_list): except Exception: tmp_stat = False err_str = core_utils.dump_error_message(tmp_log) - tmp_log.error('conversion failure from {0}'.format(tmp_res.text)) + tmp_log.error("conversion failure from {0}".format(tmp_res.text)) if tmp_stat: - tmp_log.debug('done with {0}:{1}'.format(tmp_stat, err_str)) + tmp_log.debug("done with {0}:{1}".format(tmp_stat, err_str)) return tmp_stat, err_str # upload checkpoint def upload_checkpoint(self, base_url, task_id, panda_id, file_name, file_path): - tmp_log = self.make_logger('taskID={0} pandaID={1}'.format(task_id, panda_id), - method_name='upload_checkpoint') - tmp_log.debug('start for {0}'.format(file_name)) + tmp_log = self.make_logger("taskID={0} pandaID={1}".format(task_id, panda_id), method_name="upload_checkpoint") + tmp_log.debug("start for {0}".format(file_name)) try: - files = {'file': (file_name, open(file_path).read())} - tmpStat, tmpRes = self.put_ssl('server/panda/put_checkpoint', files, base_url=base_url) + files = {"file": (file_name, open(file_path).read())} + tmpStat, tmpRes = self.put_ssl("server/panda/put_checkpoint", files, base_url=base_url) if tmpStat is 
False: core_utils.dump_error_message(tmp_log, tmpRes) else: - tmp_log.debug('got {0}'.format(tmpRes.text)) + tmp_log.debug("got {0}".format(tmpRes.text)) return tmpStat except Exception: core_utils.dump_error_message(tmp_log) @@ -844,20 +811,19 @@ def upload_checkpoint(self, base_url, task_id, panda_id, file_name, file_path): # download checkpoint def download_checkpoint(self, base_url, task_id, panda_id, point_id, base_path): - tmp_log = self.make_logger('taskID={0} pandaID={1}'.format(task_id, panda_id), - method_name='download_checkpoint') - tmp_log.debug('start for ID={0}'.format(point_id)) + tmp_log = self.make_logger("taskID={0} pandaID={1}".format(task_id, panda_id), method_name="download_checkpoint") + tmp_log.debug("start for ID={0}".format(point_id)) try: - path = 'cache/hpo_cp_{0}_{1}'.format(task_id, point_id) + path = "cache/hpo_cp_{0}_{1}".format(task_id, point_id) tmpStat, tmpRes = self.post_ssl(path, {}, base_url=base_url) file_name = None if tmpStat is False: core_utils.dump_error_message(tmp_log, tmpRes) else: file_name = os.path.join(base_path, str(uuid.uuid4())) - with open(file_name, 'w') as f: + with open(file_name, "w") as f: f.write(tmpRes.content) - tmp_log.debug('got {0}'.format(file_name)) + tmp_log.debug("got {0}".format(file_name)) return tmpStat, file_name except Exception: core_utils.dump_error_message(tmp_log) @@ -865,13 +831,12 @@ def download_checkpoint(self, base_url, task_id, panda_id, point_id, base_path): # clear checkpoint def clear_checkpoint(self, base_url, task_id, point_id): - tmp_log = self.make_logger('taskID={0} pointID={1}'.format(task_id, point_id), - method_name='clear_checkpoints') + tmp_log = self.make_logger("taskID={0} pointID={1}".format(task_id, point_id), method_name="clear_checkpoints") data = dict() - data['task_id'] = task_id - data['sub_id'] = point_id - tmp_log.debug('start') - tmpStat, tmpRes = self.post_ssl('server/panda/delete_checkpoint', data, base_url=base_url) + data["task_id"] = task_id + data["sub_id"] = point_id + tmp_log.debug("start") + tmpStat, tmpRes = self.post_ssl("server/panda/delete_checkpoint", data, base_url=base_url) retMap = None if tmpStat is False: core_utils.dump_error_message(tmp_log, tmpRes) @@ -882,16 +847,16 @@ def clear_checkpoint(self, base_url, task_id, point_id): core_utils.dump_error_message(tmp_log) if retMap is None: retMap = {} - retMap['StatusCode'] = 999 - tmp_log.debug('done with {0}'.format(str(retMap))) + retMap["StatusCode"] = 999 + tmp_log.debug("done with {0}".format(str(retMap))) return retMap # check event availability def get_max_worker_id(self): - tmpLog = self.make_logger(method_name='get_max_worker_id') - tmpLog.debug('start') - data = {'harvester_id': harvester_config.master.harvester_id} - retStat, retVal = self.post_ssl('get_max_worker_id', data) + tmpLog = self.make_logger(method_name="get_max_worker_id") + tmpLog.debug("start") + data = {"harvester_id": harvester_config.master.harvester_id} + retStat, retVal = self.post_ssl("get_max_worker_id", data) if retStat is False: core_utils.dump_error_message(tmpLog, retVal) else: @@ -901,5 +866,5 @@ def get_max_worker_id(self): core_utils.dump_error_message(tmpLog, retVal.text) retStat = False retVal = retVal.text - tmpLog.debug('done with {} {}'.format(retStat, retVal)) + tmpLog.debug("done with {} {}".format(retStat, retVal)) return retStat, retVal diff --git a/pandaharvester/harvesterconfig/harvester_config.py b/pandaharvester/harvesterconfig/harvester_config.py index 80b7b9c9..852827d6 100644 --- 
a/pandaharvester/harvesterconfig/harvester_config.py +++ b/pandaharvester/harvesterconfig/harvester_config.py @@ -12,26 +12,26 @@ tmpConf = LiveConfigParser() # URL for config file if any -configEnv = 'HARVESTER_INSTANCE_CONFIG_URL' +configEnv = "HARVESTER_INSTANCE_CONFIG_URL" if configEnv in os.environ: configURL = os.environ[configEnv] else: configURL = None # read -tmpConf.read('panda_harvester.cfg', configURL) +tmpConf.read("panda_harvester.cfg", configURL) # get the value of env var in the config def env_var_parse(val): - match = re.search('\$\{*([^\}]+)\}*', val) + match = re.search("\$\{*([^\}]+)\}*", val) if match is None: return val var_name = match.group(1) - if var_name.upper() == 'HOSTNAME': + if var_name.upper() == "HOSTNAME": return socket.gethostname().split(".")[0] if var_name not in os.environ: - raise KeyError('{0} in the cfg is an undefined environment variable.'.format(var_name)) + raise KeyError("{0} in the cfg is an undefined environment variable.".format(var_name)) else: return os.environ[var_name] @@ -60,9 +60,9 @@ def __init__(self): # load configmap config_map_data = {} -if 'PANDA_HOME' in os.environ: - config_map_name = 'panda_harvester_configmap.json' - config_map_path = os.path.join(os.environ['PANDA_HOME'], 'etc/configmap', config_map_name) +if "PANDA_HOME" in os.environ: + config_map_name = "panda_harvester_configmap.json" + config_map_path = os.path.join(os.environ["PANDA_HOME"], "etc/configmap", config_map_name) if os.path.exists(config_map_path): with open(config_map_path) as f: config_map_data = json.load(f) @@ -87,30 +87,28 @@ def __init__(self): # expand all values for tmpKey, tmpVal in iteritems(tmpDict): # use env vars - if isinstance(tmpVal, str) and tmpVal.startswith('$'): + if isinstance(tmpVal, str) and tmpVal.startswith("$"): tmpVal = env_var_parse(tmpVal) # convert string to bool/int if not isinstance(tmpVal, six.string_types): pass - elif tmpVal == 'True': + elif tmpVal == "True": tmpVal = True - elif tmpVal == 'False': + elif tmpVal == "False": tmpVal = False - elif tmpVal == 'None': + elif tmpVal == "None": tmpVal = None - elif re.match('^\d+$', tmpVal): + elif re.match("^\d+$", tmpVal): tmpVal = int(tmpVal) - elif '\n' in tmpVal and ( - re.match(r'^\W*\[.*\]\W*$', tmpVal.replace('\n', '')) - or re.match(r'^\W*\{.*\}\W*$', tmpVal.replace('\n', ''))): + elif "\n" in tmpVal and (re.match(r"^\W*\[.*\]\W*$", tmpVal.replace("\n", "")) or re.match(r"^\W*\{.*\}\W*$", tmpVal.replace("\n", ""))): tmpVal = json.loads(tmpVal) nested_obj_env_var_sub(tmpVal) - elif '\n' in tmpVal: - tmpVal = tmpVal.split('\n') + elif "\n" in tmpVal: + tmpVal = tmpVal.split("\n") # remove empty tmpVal = [x.strip() for x in tmpVal if x.strip()] # update dict setattr(tmpSelf, tmpKey, tmpVal) # update config dict - if not any(ss in tmpKey.lower() for ss in ['password', 'passphrase', 'secret']): + if not any(ss in tmpKey.lower() for ss in ["password", "passphrase", "secret"]): config_dict[tmpSection][tmpKey] = tmpVal diff --git a/pandaharvester/harvestercore/cache_spec.py b/pandaharvester/harvestercore/cache_spec.py index 33cb2c4e..6d22b1ca 100644 --- a/pandaharvester/harvestercore/cache_spec.py +++ b/pandaharvester/harvestercore/cache_spec.py @@ -7,11 +7,7 @@ class CacheSpec(SpecBase): # attributes - attributesWithTypes = ('mainKey:text', - 'subKey:text', - 'data:blob', - 'lastUpdate:timestamp' - ) + attributesWithTypes = ("mainKey:text", "subKey:text", "data:blob", "lastUpdate:timestamp") # constructor def __init__(self): diff --git 
a/pandaharvester/harvestercore/command_spec.py b/pandaharvester/harvestercore/command_spec.py index fe6253d0..f122789a 100644 --- a/pandaharvester/harvestercore/command_spec.py +++ b/pandaharvester/harvestercore/command_spec.py @@ -7,26 +7,15 @@ class CommandSpec(SpecBase): # attributes - attributesWithTypes = ('command_id:integer primary key', - 'command:text', - 'receiver:text', - 'params:blob', - 'ack_requested:integer', - 'processed:integer' - ) + attributesWithTypes = ("command_id:integer primary key", "command:text", "receiver:text", "params:blob", "ack_requested:integer", "processed:integer") # commands - COM_reportWorkerStats = 'REPORT_WORKER_STATS' - COM_setNWorkers = 'SET_N_WORKERS_JOBTYPE' - COM_killWorkers = 'KILL_WORKERS' - COM_syncWorkersKill = 'SYNC_WORKERS_KILL' + COM_reportWorkerStats = "REPORT_WORKER_STATS" + COM_setNWorkers = "SET_N_WORKERS_JOBTYPE" + COM_killWorkers = "KILL_WORKERS" + COM_syncWorkersKill = "SYNC_WORKERS_KILL" # mapping between command and receiver - receiver_map = { - COM_reportWorkerStats: 'propagator', - COM_setNWorkers: 'submitter', - COM_killWorkers: 'sweeper', - COM_syncWorkersKill: 'sweeper' - } + receiver_map = {COM_reportWorkerStats: "propagator", COM_setNWorkers: "submitter", COM_killWorkers: "sweeper", COM_syncWorkersKill: "sweeper"} # constructor def __init__(self): @@ -35,16 +24,16 @@ def __init__(self): # convert from Command JSON def convert_command_json(self, data): # mandatory fields - self.command_id = data['command_id'] - self.command = data['command'] - self.params = data['params'] - self.ack_requested = data['ack_requested'] + self.command_id = data["command_id"] + self.command = data["command"] + self.params = data["params"] + self.ack_requested = data["ack_requested"] # For the day we want to parse the creation_date # from datetime import datetime # c = datetime.strptime(b, "%Y-%m-%dT%H:%M:%S.%f") # optional field try: - self.processed = data['processed'] + self.processed = data["processed"] except KeyError: self.processed = 0 diff --git a/pandaharvester/harvestercore/communicator_pool.py b/pandaharvester/harvestercore/communicator_pool.py index 4cb12312..49aefdc6 100644 --- a/pandaharvester/harvestercore/communicator_pool.py +++ b/pandaharvester/harvestercore/communicator_pool.py @@ -5,7 +5,7 @@ from . import core_utils # logger -_logger = core_utils.setup_logger('communicator_pool') +_logger = core_utils.setup_logger("communicator_pool") # method wrapper @@ -17,19 +17,19 @@ def __init__(self, method_name, pool): # method emulation def __call__(self, *args, **kwargs): - tmpLog = core_utils.make_logger(_logger, 'method={0}'.format(self.methodName), method_name='call') + tmpLog = core_utils.make_logger(_logger, "method={0}".format(self.methodName), method_name="call") sw = core_utils.get_stopwatch() try: # get connection con = self.pool.get() - tmpLog.debug('got lock. qsize={0} {1}'.format(self.pool.qsize(), sw.get_elapsed_time())) + tmpLog.debug("got lock. 
qsize={0} {1}".format(self.pool.qsize(), sw.get_elapsed_time())) sw.reset() # get function func = getattr(con, self.methodName) # exec return func(*args, **kwargs) finally: - tmpLog.debug('release lock' + sw.get_elapsed_time()) + tmpLog.debug("release lock" + sw.get_elapsed_time()) self.pool.put(con) @@ -38,7 +38,7 @@ class CommunicatorPool(object): # constructor def __init__(self): # install members - object.__setattr__(self, 'pool', None) + object.__setattr__(self, "pool", None) # connection pool try: nConnections = harvester_config.communicator.nConnections @@ -46,8 +46,7 @@ def __init__(self): nConnections = harvester_config.pandacon.nConnections self.pool = queue.Queue(nConnections) try: - Communicator = importlib.import_module(harvester_config.communicator.className, - harvester_config.communicator.moduleName) + Communicator = importlib.import_module(harvester_config.communicator.className, harvester_config.communicator.moduleName) except Exception: from pandaharvester.harvestercommunicator.panda_communicator import PandaCommunicator as Communicator for i in range(nConnections): diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index 7642b36d..6c8663ac 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -51,11 +51,13 @@ # synchronize decorator def synchronize(func): - """ synchronize decorator """ + """synchronize decorator""" + @functools.wraps(func) def wrapper(*args, **kwargs): with sync_lock: return func(*args, **kwargs) + return wrapper @@ -68,8 +70,7 @@ def __init__(self): # get elapsed time def get_elapsed_time(self): diff = datetime.datetime.utcnow() - self.startTime - return " : took {0}.{1:03} sec".format(diff.seconds + diff.days * 24 * 3600, - diff.microseconds // 1000) + return " : took {0}.{1:03} sec".format(diff.seconds + diff.days * 24 * 3600, diff.microseconds // 1000) # get elapsed time in seconds def get_elapsed_time_in_sec(self, precise=False): @@ -113,13 +114,13 @@ def iteritems(self): # singleton distinguishable with id class SingletonWithID(type): - def __init__(cls, *args,**kwargs): + def __init__(cls, *args, **kwargs): cls.__instance = {} super(SingletonWithID, cls).__init__(*args, **kwargs) @synchronize def __call__(cls, *args, **kwargs): - obj_id = str(kwargs.get('id', '')) + obj_id = str(kwargs.get("id", "")) if obj_id not in cls.__instance: cls.__instance[obj_id] = super(SingletonWithID, cls).__call__(*args, **kwargs) return cls.__instance.get(obj_id) @@ -127,14 +128,14 @@ def __call__(cls, *args, **kwargs): # singleton distinguishable with each thread and id class SingletonWithThreadAndID(type): - def __init__(cls, *args,**kwargs): + def __init__(cls, *args, **kwargs): cls.__instance = {} super(SingletonWithThreadAndID, cls).__init__(*args, **kwargs) @synchronize def __call__(cls, *args, **kwargs): thread_id = get_ident() - obj_id = (thread_id, str(kwargs.get('id', ''))) + obj_id = (thread_id, str(kwargs.get("id", ""))) if obj_id not in cls.__instance: cls.__instance[obj_id] = super(SingletonWithThreadAndID, cls).__call__(*args, **kwargs) return cls.__instance.get(obj_id) @@ -151,7 +152,7 @@ def setup_logger(name=None): if name is None: frm = inspect.stack()[1][0] mod = inspect.getmodule(frm) - name = mod.__name__.split('.')[-1] + name = mod.__name__.split(".")[-1] try: log_level = getattr(harvester_config.log_level, name) return PandaLogger().getLogger(name, log_level=log_level) @@ -168,9 +169,9 @@ def make_logger(tmp_log, token=None, 
method_name=None, hook=None): else: tmpStr = method_name if token is not None: - tmpStr += ' <{0}>'.format(token) + tmpStr += " <{0}>".format(token) else: - tmpStr += ' :'.format(token) + tmpStr += " :".format(token) newLog = LogWrapper(tmp_log, tmpStr, seeMem=with_memory_profile, hook=hook) return newLog @@ -178,9 +179,9 @@ def make_logger(tmp_log, token=None, method_name=None, hook=None): # dump error message def dump_error_message(tmp_log, err_str=None, no_message=False): if not isinstance(tmp_log, LogWrapper): - methodName = '{0} : '.format(inspect.stack()[1][3]) + methodName = "{0} : ".format(inspect.stack()[1][3]) else: - methodName = '' + methodName = "" # error if err_str is None: errtype, errvalue = sys.exc_info()[:2] @@ -230,7 +231,9 @@ def make_pool_file_catalog(jobspec_list): - """.format(guid=inFile['guid'], lfn=inLFN) + """.format( + guid=inFile["guid"], lfn=inLFN + ) xmlStr += "" return xmlStr @@ -239,14 +242,14 @@ def make_pool_file_catalog(jobspec_list): def calc_adler32(file_name): val = 1 blockSize = 32 * 1024 * 1024 - with open(file_name, 'rb') as fp: + with open(file_name, "rb") as fp: while True: data = fp.read(blockSize) if not data: break val = zlib.adler32(data, val) if val < 0: - val += 2 ** 32 + val += 2**32 return hex(val)[2:10].zfill(8).lower() @@ -263,18 +266,18 @@ def get_output_file_report(jobspec): # body for fileSpec in jobspec.outFiles: # only successful files - if fileSpec.status != 'finished': + if fileSpec.status != "finished": continue # extract guid - if 'guid' in fileSpec.fileAttributes: - guid = fileSpec.fileAttributes['guid'] - elif fileSpec.fileType == 'log': - guid = jobspec.get_logfile_info()['guid'] + if "guid" in fileSpec.fileAttributes: + guid = fileSpec.fileAttributes["guid"] + elif fileSpec.fileType == "log": + guid = jobspec.get_logfile_info()["guid"] else: guid = str(uuid.uuid4()) # checksum - if fileSpec.chksum is not None and ':' in fileSpec.chksum: - chksum = fileSpec.chksum.split(':')[-1] + if fileSpec.chksum is not None and ":" in fileSpec.chksum: + chksum = fileSpec.chksum.split(":")[-1] else: chksum = fileSpec.chksum xml += """ @@ -284,9 +287,11 @@ def get_output_file_report(jobspec): - """.format(guid=guid, lfn=fileSpec.lfn, fsize=fileSpec.fsize, chksum=chksum) + """.format( + guid=guid, lfn=fileSpec.lfn, fsize=fileSpec.fsize, chksum=chksum + ) # skipped files - skippedLFNs = jobspec.get_one_attribute('skippedInputs') + skippedLFNs = jobspec.get_one_attribute("skippedInputs") if skippedLFNs: for tmpLFN in skippedLFNs: xml += """ @@ -296,7 +301,9 @@ def get_output_file_report(jobspec): - """.format(lfn=tmpLFN) + """.format( + lfn=tmpLFN + ) # tailor xml += """ @@ -321,15 +328,14 @@ def create_shards(input_list, size): # update job attributes with workers -def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, files_to_stage_out_list, - events_to_update_list): +def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, files_to_stage_out_list, events_to_update_list): if map_type in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiJobs]: workSpec = workspec_list[0] for jobSpec in jobspec_list: jobSpec.set_attributes(workSpec.workAttributes) # delete job metadata from worker attributes try: - del workSpec.workAttributes[jobSpec.PandaID]['metaData'] + del workSpec.workAttributes[jobSpec.PandaID]["metaData"] except Exception: pass # set start and end times @@ -348,9 +354,9 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi except Exception: pass # batch ID - if not 
jobSpec.has_attribute('batchID'): + if not jobSpec.has_attribute("batchID"): if workSpec.batchID is not None: - jobSpec.set_one_attribute('batchID', workSpec.batchID) + jobSpec.set_one_attribute("batchID", workSpec.batchID) # add files outFileAttrs = jobSpec.get_output_file_attributes() for tmpWorkerID, files_to_stage_out in iteritems(files_to_stage_out_list): @@ -361,25 +367,25 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi fileSpec.lfn = lfn fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID - fileSpec.path = fileAtters['path'] - fileSpec.fsize = fileAtters['fsize'] - fileSpec.fileType = fileAtters['type'] + fileSpec.path = fileAtters["path"] + fileSpec.fsize = fileAtters["fsize"] + fileSpec.fileType = fileAtters["type"] fileSpec.fileAttributes = fileAtters fileSpec.workerID = tmpWorkerID - if 'isZip' in fileAtters: - fileSpec.isZip = fileAtters['isZip'] - if 'chksum' in fileAtters: - fileSpec.chksum = fileAtters['chksum'] - if 'eventRangeID' in fileAtters: - fileSpec.eventRangeID = fileAtters['eventRangeID'] + if "isZip" in fileAtters: + fileSpec.isZip = fileAtters["isZip"] + if "chksum" in fileAtters: + fileSpec.chksum = fileAtters["chksum"] + if "eventRangeID" in fileAtters: + fileSpec.eventRangeID = fileAtters["eventRangeID"] # use input fileID as provenanceID try: - provenanceID = fileSpec.eventRangeID.split('-')[2] + provenanceID = fileSpec.eventRangeID.split("-")[2] except Exception: provenanceID = None fileSpec.provenanceID = provenanceID if lfn in outFileAttrs: - fileSpec.scope = outFileAttrs[lfn]['scope'] + fileSpec.scope = outFileAttrs[lfn]["scope"] jobSpec.add_out_file(fileSpec) # add events for events_to_update in events_to_update_list: @@ -451,25 +457,25 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi fileSpec.lfn = lfn fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID - fileSpec.path = fileAtters['path'] - fileSpec.fsize = fileAtters['fsize'] - fileSpec.fileType = fileAtters['type'] + fileSpec.path = fileAtters["path"] + fileSpec.fsize = fileAtters["fsize"] + fileSpec.fileType = fileAtters["type"] fileSpec.fileAttributes = fileAtters fileSpec.workerID = tmpWorkerID - if 'isZip' in fileAtters: - fileSpec.isZip = fileAtters['isZip'] - if 'chksum' in fileAtters: - fileSpec.chksum = fileAtters['chksum'] - if 'eventRangeID' in fileAtters: - fileSpec.eventRangeID = fileAtters['eventRangeID'] + if "isZip" in fileAtters: + fileSpec.isZip = fileAtters["isZip"] + if "chksum" in fileAtters: + fileSpec.chksum = fileAtters["chksum"] + if "eventRangeID" in fileAtters: + fileSpec.eventRangeID = fileAtters["eventRangeID"] # use input fileID as provenanceID try: - provenanceID = fileSpec.eventRangeID.split('-')[2] + provenanceID = fileSpec.eventRangeID.split("-")[2] except Exception: provenanceID = None fileSpec.provenanceID = provenanceID if lfn in outFileAttrs: - fileSpec.scope = outFileAttrs[lfn]['scope'] + fileSpec.scope = outFileAttrs[lfn]["scope"] jobSpec.add_out_file(fileSpec) # add events for events_to_update in events_to_update_list: @@ -488,7 +494,7 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi else: jobSpec.status, jobSpec.subStatus = workSpec.convert_to_job_status(WorkSpec.ST_cancelled) else: - if isRunning or jobSpec.status == 'running': + if isRunning or jobSpec.status == "running": jobSpec.status, jobSpec.subStatus = workSpec.convert_to_job_status(WorkSpec.ST_running) else: jobSpec.status, jobSpec.subStatus = 
workSpec.convert_to_job_status(WorkSpec.ST_submitted) @@ -518,9 +524,9 @@ def get_global_dict(): @contextmanager def get_file_lock(file_name, lock_interval): if os.path.exists(file_name): - opt = 'r+' + opt = "r+" else: - opt = 'w+' + opt = "w+" with open(file_name, opt) as f: locked = False try: @@ -570,9 +576,9 @@ def encrypt_string(key_phrase, plain_text): def decrypt_string(key_phrase, cipher_text): cipher_text = base64.b64decode(cipher_text) k = convert_phrase_to_key(key_phrase) - v = cipher_text[:Cryptodome.Cipher.AES.block_size] + v = cipher_text[: Cryptodome.Cipher.AES.block_size] c = Cryptodome.Cipher.AES.new(k, Cryptodome.Cipher.AES.MODE_CFB, v) - cipher_text = cipher_text[Cryptodome.Cipher.AES.block_size:] + cipher_text = cipher_text[Cryptodome.Cipher.AES.block_size :] return c.decrypt(cipher_text) @@ -601,14 +607,14 @@ def set_file_permission(path): # get URL of queues config file def get_queues_config_url(): try: - return os.environ['HARVESTER_QUEUE_CONFIG_URL'] + return os.environ["HARVESTER_QUEUE_CONFIG_URL"] except Exception: return None # get unique queue name def get_unique_queue_name(queue_name, resource_type, job_type): - return '{0}:{1}:{2}'.format(queue_name, resource_type, job_type) + return "{0}:{1}:{2}".format(queue_name, resource_type, job_type) # capability to dynamically change plugins @@ -638,20 +644,20 @@ def make_choice_list(pdpm={}, default=None): real_weight = int(weight * 1000 / weight_sum) else: real_weight = int(weight) - ret_list.extend([candidate]*real_weight) + ret_list.extend([candidate] * real_weight) weight_default -= real_weight - ret_list.extend([default]*weight_default) + ret_list.extend([default] * weight_default) return ret_list # pickle to text def pickle_to_text(data): - return codecs.encode(pickle.dumps(data), 'base64').decode() + return codecs.encode(pickle.dumps(data), "base64").decode() # unpickle from text def unpickle_from_text(text): - return pickle.loads(codecs.decode(text.encode(), 'base64')) + return pickle.loads(codecs.decode(text.encode(), "base64")) # increasing retry period after timeout or failure @@ -661,7 +667,7 @@ def retry_period_sec(nth_retry, increment=1, max_retries=None, max_seconds=None, if max_retries and nth_retry > max_retries: return False else: - ret_period += (nth - 1)*increment + ret_period += (nth - 1) * increment if max_seconds: ret_period = min(ret_period, max_seconds) return ret_period @@ -670,4 +676,4 @@ def retry_period_sec(nth_retry, increment=1, max_retries=None, max_seconds=None, # safe dictionary to retrun original strings for missing keys class SafeDict(dict): def __missing__(self, key): - return '{' + key + '}' + return "{" + key + "}" diff --git a/pandaharvester/harvestercore/db_interface.py b/pandaharvester/harvestercore/db_interface.py index 36f19e53..0ea875a6 100644 --- a/pandaharvester/harvestercore/db_interface.py +++ b/pandaharvester/harvestercore/db_interface.py @@ -38,7 +38,7 @@ def get_locked_by(self): thrName = currentThr.ident else: thrName = None - return 'plugin-{0}-{1}'.format(os.getpid(), thrName) + return "plugin-{0}-{1}".format(os.getpid(), thrName) # get a lock for an object def get_object_lock(self, object_name, lock_interval): @@ -73,9 +73,9 @@ def get_worker_ce_backend_throughput(self, site_name, time_window): # add dialog message def add_dialog_message(self, message, level, module_name, identifier=None): # set level - validLevels = ['DEBUG', 'INFO', 'ERROR', 'WARNING'] + validLevels = ["DEBUG", "INFO", "ERROR", "WARNING"] if level not in validLevels: - level = 'INFO' + level = 
"INFO" levelNum = getattr(logging, level) # get minimum level try: @@ -83,7 +83,7 @@ def add_dialog_message(self, message, level, module_name, identifier=None): except Exception: minLevel = None if minLevel not in validLevels: - minLevel = 'WARNING' + minLevel = "WARNING" minLevelNum = getattr(logging, minLevel) # check level to avoid redundant db lock if levelNum < minLevelNum: diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 3578032f..f3021356 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -32,22 +32,22 @@ from pandaharvester.harvesterconfig import harvester_config # logger -_logger = core_utils.setup_logger('db_proxy') +_logger = core_utils.setup_logger("db_proxy") # table names -commandTableName = 'command_table' -jobTableName = 'job_table' -workTableName = 'work_table' -fileTableName = 'file_table' -cacheTableName = 'cache_table' -eventTableName = 'event_table' -seqNumberTableName = 'seq_table' -pandaQueueTableName = 'pq_table' -jobWorkerTableName = 'jw_table' -processLockTableName = 'lock_table' -diagTableName = 'diag_table' -queueConfigDumpTableName = 'qcdump_table' -serviceMetricsTableName = 'sm_table' +commandTableName = "command_table" +jobTableName = "job_table" +workTableName = "work_table" +fileTableName = "file_table" +cacheTableName = "cache_table" +eventTableName = "event_table" +seqNumberTableName = "seq_table" +pandaQueueTableName = "pq_table" +jobWorkerTableName = "jw_table" +processLockTableName = "lock_table" +diagTableName = "diag_table" +queueConfigDumpTableName = "qcdump_table" +serviceMetricsTableName = "sm_table" # connection lock conLock = threading.Lock() @@ -62,41 +62,41 @@ def __init__(self, thr_name=None, read_only=False): self.useInspect = False self.reconnectTimeout = 300 self.read_only = read_only - if hasattr(harvester_config.db, 'reconnectTimeout'): + if hasattr(harvester_config.db, "reconnectTimeout"): self.reconnectTimeout = harvester_config.db.reconnectTimeout if harvester_config.db.verbose: - self.verbLog = core_utils.make_logger(_logger, method_name='execute') + self.verbLog = core_utils.make_logger(_logger, method_name="execute") if self.thrName is None: currentThr = threading.current_thread() if currentThr is not None: self.thrName = currentThr.ident - if hasattr(harvester_config.db, 'useInspect') and harvester_config.db.useInspect is True: + if hasattr(harvester_config.db, "useInspect") and harvester_config.db.useInspect is True: self.useInspect = True # connect DB self._connect_db() self.lockDB = False # using application side lock if DB doesn't have a mechanism for exclusive access - if harvester_config.db.engine == 'mariadb': + if harvester_config.db.engine == "mariadb": self.usingAppLock = False else: self.usingAppLock = True # connect DB def _connect_db(self): - if harvester_config.db.engine == 'mariadb': - if hasattr(harvester_config.db, 'host'): + if harvester_config.db.engine == "mariadb": + if hasattr(harvester_config.db, "host"): host = harvester_config.db.host else: - host = '127.0.0.1' - if hasattr(harvester_config.db, 'port'): + host = "127.0.0.1" + if hasattr(harvester_config.db, "port"): port = harvester_config.db.port else: port = 3306 - if hasattr(harvester_config.db, 'useMySQLdb') and harvester_config.db.useMySQLdb is True: + if hasattr(harvester_config.db, "useMySQLdb") and harvester_config.db.useMySQLdb is True: import MySQLdb import MySQLdb.cursors - class MyCursor (MySQLdb.cursors.Cursor): + class 
MyCursor(MySQLdb.cursors.Cursor): def fetchone(self): tmpRet = MySQLdb.cursors.Cursor.fetchone(self) if tmpRet is None: @@ -117,49 +117,58 @@ def fetchall(self): newTmpRets.append(tmpRet) return newTmpRets - self.con = MySQLdb.connect(user=harvester_config.db.user, passwd=harvester_config.db.password, - db=harvester_config.db.schema, host=host, port=port, - cursorclass=MyCursor, charset='utf8') + self.con = MySQLdb.connect( + user=harvester_config.db.user, + passwd=harvester_config.db.password, + db=harvester_config.db.schema, + host=host, + port=port, + cursorclass=MyCursor, + charset="utf8", + ) self.cur = self.con.cursor() else: import mysql.connector - self.con = mysql.connector.connect(user=harvester_config.db.user, passwd=harvester_config.db.password, - db=harvester_config.db.schema, host=host, port=port) + + self.con = mysql.connector.connect( + user=harvester_config.db.user, passwd=harvester_config.db.password, db=harvester_config.db.schema, host=host, port=port + ) self.cur = self.con.cursor(named_tuple=True, buffered=True) else: import sqlite3 + if self.read_only: fd = os.open(harvester_config.db.database_filename, os.O_RDONLY) - database_filename = '/dev/fd/{0}'.format(fd) + database_filename = "/dev/fd/{0}".format(fd) else: database_filename = harvester_config.db.database_filename - self.con = sqlite3.connect(database_filename, - detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES, - check_same_thread=False) + self.con = sqlite3.connect(database_filename, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES, check_same_thread=False) core_utils.set_file_permission(harvester_config.db.database_filename) # change the row factory to use Row self.con.row_factory = sqlite3.Row self.cur = self.con.cursor() - self.cur.execute('PRAGMA journal_mode') + self.cur.execute("PRAGMA journal_mode") resJ = self.cur.fetchone() - if resJ[0] != 'wal': - self.cur.execute('PRAGMA journal_mode = WAL') + if resJ[0] != "wal": + self.cur.execute("PRAGMA journal_mode = WAL") # read to avoid database lock self.cur.fetchone() # exception handler for type of DBs def _handle_exception(self, exc): - tmpLog = core_utils.make_logger(_logger, 'thr={0}'.format(self.thrName), method_name='_handle_exception') - if harvester_config.db.engine == 'mariadb': - tmpLog.warning('exception of mysql {0} occurred'.format(exc.__class__.__name__)) + tmpLog = core_utils.make_logger(_logger, "thr={0}".format(self.thrName), method_name="_handle_exception") + if harvester_config.db.engine == "mariadb": + tmpLog.warning("exception of mysql {0} occurred".format(exc.__class__.__name__)) # Case to try renew connection isOperationalError = False - if hasattr(harvester_config.db, 'useMySQLdb') and harvester_config.db.useMySQLdb is True: + if hasattr(harvester_config.db, "useMySQLdb") and harvester_config.db.useMySQLdb is True: import MySQLdb + if isinstance(exc, MySQLdb.OperationalError): isOperationalError = True else: import mysql.connector + if isinstance(exc, mysql.connector.errors.OperationalError): isOperationalError = True if isOperationalError: @@ -170,19 +179,19 @@ def _handle_exception(self, exc): try: self.cur.close() except Exception as e: - tmpLog.error('failed to close cursor: {0}'.format(e)) + tmpLog.error("failed to close cursor: {0}".format(e)) # close DB connection try: self.con.close() except Exception as e: - tmpLog.error('failed to close connection: {0}'.format(e)) + tmpLog.error("failed to close connection: {0}".format(e)) # restart the proxy instance try: self._connect_db() - tmpLog.info('renewed 
connection') + tmpLog.info("renewed connection") break except Exception as e: - tmpLog.error('failed to renew connection ({0} retries); {1}'.format(n_retry, e)) + tmpLog.error("failed to renew connection ({0} retries); {1}".format(n_retry, e)) sleep_time = core_utils.retry_period_sec(n_retry, increment=2, max_seconds=300, min_seconds=1) if not sleep_time: break @@ -193,36 +202,36 @@ def _handle_exception(self, exc): # convert param dict to list def convert_params(self, sql, varmap): # lock database if application side lock is used - if self.usingAppLock and \ - (re.search('^INSERT', sql, re.I) is not None - or re.search('^UPDATE', sql, re.I) is not None - or re.search(' FOR UPDATE', sql, re.I) is not None - or re.search('^DELETE', sql, re.I) is not None - ): - self.lockDB = True + if self.usingAppLock and ( + re.search("^INSERT", sql, re.I) is not None + or re.search("^UPDATE", sql, re.I) is not None + or re.search(" FOR UPDATE", sql, re.I) is not None + or re.search("^DELETE", sql, re.I) is not None + ): + self.lockDB = True # remove FOR UPDATE for sqlite - if harvester_config.db.engine == 'sqlite': - sql = re.sub(' FOR UPDATE', ' ', sql, re.I) - sql = re.sub('INSERT IGNORE', 'INSERT OR IGNORE', sql, re.I) + if harvester_config.db.engine == "sqlite": + sql = re.sub(" FOR UPDATE", " ", sql, re.I) + sql = re.sub("INSERT IGNORE", "INSERT OR IGNORE", sql, re.I) else: - sql = re.sub('INSERT OR IGNORE', 'INSERT IGNORE', sql, re.I) + sql = re.sub("INSERT OR IGNORE", "INSERT IGNORE", sql, re.I) # no conversation unless dict if not isinstance(varmap, dict): # using the printf style syntax for mariaDB - if harvester_config.db.engine == 'mariadb': - sql = re.sub(':[^ $,)]+', '%s', sql) + if harvester_config.db.engine == "mariadb": + sql = re.sub(":[^ $,)]+", "%s", sql) return sql, varmap paramList = [] # extract placeholders - items = re.findall(':[^ $,)]+', sql) + items = re.findall(":[^ $,)]+", sql) for item in items: if item not in varmap: - raise KeyError('{0} is missing in SQL parameters'.format(item)) + raise KeyError("{0} is missing in SQL parameters".format(item)) if item not in paramList: paramList.append(varmap[item]) # using the printf style syntax for mariaDB - if harvester_config.db.engine == 'mariadb': - sql = re.sub(':[^ $,)]+', '%s', sql) + if harvester_config.db.engine == "mariadb": + sql = re.sub(":[^ $,)]+", "%s", sql) return sql, paramList # wrapper for execute @@ -233,20 +242,18 @@ def execute(self, sql, varmap=None): # get lock if application side lock is used if self.usingAppLock and not self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} locking'.format(self.thrName)) + self.verbLog.debug("thr={0} locking".format(self.thrName)) conLock.acquire() if harvester_config.db.verbose: - self.verbLog.debug('thr={0} locked'.format(self.thrName)) + self.verbLog.debug("thr={0} locked".format(self.thrName)) # execute try: # verbose if harvester_config.db.verbose: if not self.useInspect: - self.verbLog.debug('thr={2} sql={0} var={1}'.format(sql, str(varmap), self.thrName)) + self.verbLog.debug("thr={2} sql={0} var={1}".format(sql, str(varmap), self.thrName)) else: - self.verbLog.debug('thr={3} sql={0} var={1} exec={2}'.format(sql, str(varmap), - inspect.stack()[1][3], - self.thrName)) + self.verbLog.debug("thr={3} sql={0} var={1} exec={2}".format(sql, str(varmap), inspect.stack()[1][3], self.thrName)) # convert param dict newSQL, params = self.convert_params(sql, varmap) # execute @@ -255,18 +262,17 @@ def execute(self, sql, varmap=None): except Exception as e: 
self._handle_exception(e) if harvester_config.db.verbose: - self.verbLog.debug('thr={0} exception during execute'.format(self.thrName)) + self.verbLog.debug("thr={0} exception during execute".format(self.thrName)) raise finally: # release lock if self.usingAppLock and not self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} release'.format(self.thrName)) + self.verbLog.debug("thr={0} release".format(self.thrName)) conLock.release() # return if harvester_config.db.verbose: - self.verbLog.debug('thr={0} {1} sql=[{2}]'.format(self.thrName, sw.get_elapsed_time(), - newSQL.replace('\n', ' ').strip())) + self.verbLog.debug("thr={0} {1} sql=[{2}]".format(self.thrName, sw.get_elapsed_time(), newSQL.replace("\n", " ").strip())) return retVal # wrapper for executemany @@ -274,19 +280,17 @@ def executemany(self, sql, varmap_list): # get lock if self.usingAppLock and not self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} locking'.format(self.thrName)) + self.verbLog.debug("thr={0} locking".format(self.thrName)) conLock.acquire() if harvester_config.db.verbose: - self.verbLog.debug('thr={0} locked'.format(self.thrName)) + self.verbLog.debug("thr={0} locked".format(self.thrName)) try: # verbose if harvester_config.db.verbose: if not self.useInspect: - self.verbLog.debug('thr={2} sql={0} var={1}'.format(sql, str(varmap_list), self.thrName)) + self.verbLog.debug("thr={2} sql={0} var={1}".format(sql, str(varmap_list), self.thrName)) else: - self.verbLog.debug('thr={3} sql={0} var={1} exec={2}'.format(sql, str(varmap_list), - inspect.stack()[1][3], - self.thrName)) + self.verbLog.debug("thr={3} sql={0} var={1} exec={2}".format(sql, str(varmap_list), inspect.stack()[1][3], self.thrName)) # convert param dict paramList = [] newSQL = sql @@ -297,25 +301,25 @@ def executemany(self, sql, varmap_list): paramList.append(params) # execute try: - if harvester_config.db.engine == 'sqlite': + if harvester_config.db.engine == "sqlite": retVal = [] iList = 0 nList = 5000 while iList < len(paramList): - retVal += self.cur.executemany(newSQL, paramList[iList:iList+nList]) + retVal += self.cur.executemany(newSQL, paramList[iList : iList + nList]) iList += nList else: retVal = self.cur.executemany(newSQL, paramList) except Exception as e: self._handle_exception(e) if harvester_config.db.verbose: - self.verbLog.debug('thr={0} exception during executemany'.format(self.thrName)) + self.verbLog.debug("thr={0} exception during executemany".format(self.thrName)) raise finally: # release lock if self.usingAppLock and not self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} release'.format(self.thrName)) + self.verbLog.debug("thr={0} release".format(self.thrName)) conLock.release() # return return retVal @@ -327,11 +331,11 @@ def commit(self): except Exception as e: self._handle_exception(e) if harvester_config.db.verbose: - self.verbLog.debug('thr={0} exception during commit'.format(self.thrName)) + self.verbLog.debug("thr={0} exception during commit".format(self.thrName)) raise if self.usingAppLock and self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} release with commit'.format(self.thrName)) + self.verbLog.debug("thr={0} release with commit".format(self.thrName)) conLock.release() self.lockDB = False @@ -342,35 +346,35 @@ def rollback(self): except Exception as e: self._handle_exception(e) if harvester_config.db.verbose: - self.verbLog.debug('thr={0} exception during rollback'.format(self.thrName)) + self.verbLog.debug("thr={0} 
exception during rollback".format(self.thrName)) finally: if self.usingAppLock and self.lockDB: if harvester_config.db.verbose: - self.verbLog.debug('thr={0} release with rollback'.format(self.thrName)) + self.verbLog.debug("thr={0} release with rollback".format(self.thrName)) conLock.release() self.lockDB = False # type conversion def type_conversion(self, attr_type): # remove decorator - attr_type = attr_type.split('/')[0] + attr_type = attr_type.split("/")[0] attr_type = attr_type.strip() - if attr_type == 'timestamp': + if attr_type == "timestamp": # add NULL attribute to disable automatic update - attr_type += ' null' + attr_type += " null" # type conversion - if harvester_config.db.engine == 'mariadb': - if attr_type.startswith('text'): - attr_type = attr_type.replace('text', 'varchar(256)') - elif attr_type.startswith('blob'): - attr_type = attr_type.replace('blob', 'longtext') - elif attr_type.startswith('integer'): - attr_type = attr_type.replace('integer', 'bigint') - attr_type = attr_type.replace('autoincrement', 'auto_increment') - elif harvester_config.db.engine == 'sqlite': - if attr_type.startswith('varchar'): - attr_type = re.sub('varchar\(\d+\)', 'text', attr_type) - attr_type = attr_type.replace('auto_increment', 'autoincrement') + if harvester_config.db.engine == "mariadb": + if attr_type.startswith("text"): + attr_type = attr_type.replace("text", "varchar(256)") + elif attr_type.startswith("blob"): + attr_type = attr_type.replace("blob", "longtext") + elif attr_type.startswith("integer"): + attr_type = attr_type.replace("integer", "bigint") + attr_type = attr_type.replace("autoincrement", "auto_increment") + elif harvester_config.db.engine == "sqlite": + if attr_type.startswith("varchar"): + attr_type = re.sub("varchar\(\d+\)", "text", attr_type) + attr_type = attr_type.replace("auto_increment", "autoincrement") return attr_type # check if index is needed @@ -378,11 +382,11 @@ def need_index(self, attr): isIndex = False isUnique = False # look for separator - if '/' in attr: - decorators = attr.split('/')[-1].split() - if 'index' in decorators: + if "/" in attr: + decorators = attr.split("/")[-1].split() + if "index" in decorators: isIndex = True - if 'unique' in decorators: + if "unique" in decorators: isIndex = True isUnique = True return isIndex, isUnique @@ -390,14 +394,14 @@ def need_index(self, attr): def initialize_jobType(self, table_name): # initialize old NULL entries to ANY in pq_table and work_table # get logger - tmp_log = core_utils.make_logger(_logger, method_name='initialize_jobType') + tmp_log = core_utils.make_logger(_logger, method_name="initialize_jobType") sql_update = "UPDATE {0} SET jobType = 'ANY' WHERE jobType is NULL ".format(table_name) try: self.execute(sql_update) # commit self.commit() - tmp_log.debug('initialized entries in {0}'.format(table_name)) + tmp_log.debug("initialized entries in {0}".format(table_name)) except Exception: core_utils.dump_error_message(tmp_log) @@ -405,17 +409,17 @@ def initialize_jobType(self, table_name): def make_table(self, cls, table_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='make_table') - tmpLog.debug('table={0}'.format(table_name)) + tmpLog = core_utils.make_logger(_logger, method_name="make_table") + tmpLog.debug("table={0}".format(table_name)) # check if table already exists varMap = dict() - varMap[':name'] = table_name - if harvester_config.db.engine == 'mariadb': - varMap[':schema'] = harvester_config.db.schema - sqlC = 'SELECT * FROM information_schema.tables WHERE 
table_schema=:schema AND table_name=:name ' + varMap[":name"] = table_name + if harvester_config.db.engine == "mariadb": + varMap[":schema"] = harvester_config.db.schema + sqlC = "SELECT * FROM information_schema.tables WHERE table_schema=:schema AND table_name=:name " else: - varMap[':type'] = 'table' - sqlC = 'SELECT name FROM sqlite_master WHERE type=:type AND tbl_name=:name ' + varMap[":type"] = "table" + sqlC = "SELECT name FROM sqlite_master WHERE type=:type AND tbl_name=:name " self.execute(sqlC, varMap) resC = self.cur.fetchone() indexes = [] @@ -423,11 +427,11 @@ def make_table(self, cls, table_name): # not exists if resC is None: # sql to make table - sqlM = 'CREATE TABLE {0}('.format(table_name) + sqlM = "CREATE TABLE {0}(".format(table_name) # collect columns for attr in cls.attributesWithTypes: # split to name and type - attrName, attrType = attr.split(':') + attrName, attrType = attr.split(":") attrType = self.type_conversion(attrType) # check if index is needed isIndex, isUnique = self.need_index(attr) @@ -435,21 +439,21 @@ def make_table(self, cls, table_name): indexes.append(attrName) if isUnique: uniques.add(attrName) - sqlM += '{0} {1},'.format(attrName, attrType) + sqlM += "{0} {1},".format(attrName, attrType) sqlM = sqlM[:-1] - sqlM += ')' + sqlM += ")" # make table self.execute(sqlM) # commit self.commit() - tmpLog.debug('made {0}'.format(table_name)) + tmpLog.debug("made {0}".format(table_name)) else: # check table missingAttrs = self.check_table(cls, table_name, True) if len(missingAttrs) > 0: for attr in cls.attributesWithTypes: # split to name and type - attrName, attrType = attr.split(':') + attrName, attrType = attr.split(":") attrType = self.type_conversion(attrType) # ony missing if attrName not in missingAttrs: @@ -461,24 +465,23 @@ def make_table(self, cls, table_name): if isUnique: uniques.add(attrName) # add column - sqlA = 'ALTER TABLE {0} ADD COLUMN '.format(table_name) - sqlA += '{0} {1}'.format(attrName, attrType) + sqlA = "ALTER TABLE {0} ADD COLUMN ".format(table_name) + sqlA += "{0} {1}".format(attrName, attrType) try: self.execute(sqlA) # commit self.commit() - tmpLog.debug('added {0} to {1}'.format(attr, table_name)) + tmpLog.debug("added {0} to {1}".format(attr, table_name)) except Exception: core_utils.dump_error_message(tmpLog) # if we just added the jobType, old entries need to be initialized - if (table_name == pandaQueueTableName and attrName == 'jobType') \ - or (table_name == pandaQueueTableName and attrName == 'jobType'): + if (table_name == pandaQueueTableName and attrName == "jobType") or (table_name == pandaQueueTableName and attrName == "jobType"): self.initialize_jobType(table_name) # make indexes for index in indexes: - indexName = 'idx_{0}_{1}'.format(index, table_name) + indexName = "idx_{0}_{1}".format(index, table_name) if index in uniques: sqlI = "CREATE UNIQUE INDEX " else: @@ -488,7 +491,7 @@ def make_table(self, cls, table_name): self.execute(sqlI) # commit self.commit() - tmpLog.debug('added {0}'.format(indexName)) + tmpLog.debug("added {0}".format(indexName)) except Exception: core_utils.dump_error_message(tmpLog) except Exception: @@ -501,8 +504,8 @@ def make_table(self, cls, table_name): # make tables def make_tables(self, queue_config_mapper, communicator_pool): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='make_tables') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="make_tables") + tmpLog.debug("start") outStrs = [] outStrs += self.make_table(CommandSpec, 
commandTableName) outStrs += self.make_table(JobSpec, jobTableName) @@ -524,52 +527,52 @@ def make_tables(self, queue_config_mapper, communicator_pool): errMsg += "Please add missing columns, or drop those tables " errMsg += "so that harvester automatically re-creates those tables." errMsg += "\n" - print (errMsg) + print(errMsg) for outStr in outStrs: - print (outStr) + print(outStr) sys.exit(1) # sync workerID init_worker = 1 - if hasattr(harvester_config.db, 'syncMaxWorkerID') and harvester_config.db.syncMaxWorkerID: + if hasattr(harvester_config.db, "syncMaxWorkerID") and harvester_config.db.syncMaxWorkerID: retVal, out = communicator_pool.get_max_worker_id() if not retVal: - tmpLog.warning('failed to get max workerID with {}'.format(out)) + tmpLog.warning("failed to get max workerID with {}".format(out)) elif not out: - tmpLog.debug('max workerID is undefined') + tmpLog.debug("max workerID is undefined") else: - tmpLog.debug('got max_workerID={}'.format(out)) + tmpLog.debug("got max_workerID={}".format(out)) init_worker = out + 1 # add sequential numbers - self.add_seq_number('SEQ_workerID', init_worker) - self.add_seq_number('SEQ_configID', 1) + self.add_seq_number("SEQ_workerID", init_worker) + self.add_seq_number("SEQ_configID", 1) # fill PandaQueue table queue_config_mapper.load_data() # delete process locks self.clean_process_locks() - tmpLog.debug('done') + tmpLog.debug("done") # check table def check_table(self, cls, table_name, get_missing=False): # get columns in DB varMap = dict() - if harvester_config.db.engine == 'mariadb': - varMap[':name'] = table_name - varMap[':schema'] = harvester_config.db.schema - sqlC = 'SELECT column_name,column_type FROM information_schema.columns WHERE table_schema=:schema AND table_name=:name ' + if harvester_config.db.engine == "mariadb": + varMap[":name"] = table_name + varMap[":schema"] = harvester_config.db.schema + sqlC = "SELECT column_name,column_type FROM information_schema.columns WHERE table_schema=:schema AND table_name=:name " else: - sqlC = 'PRAGMA table_info({0}) '.format(table_name) + sqlC = "PRAGMA table_info({0}) ".format(table_name) self.execute(sqlC, varMap) resC = self.cur.fetchall() colMap = dict() for tmpItem in resC: - if harvester_config.db.engine == 'mariadb': - if hasattr(tmpItem, '_asdict'): + if harvester_config.db.engine == "mariadb": + if hasattr(tmpItem, "_asdict"): tmpItem = tmpItem._asdict() try: - columnName, columnType = tmpItem['column_name'], tmpItem['column_type'] + columnName, columnType = tmpItem["column_name"], tmpItem["column_type"] except KeyError: - columnName, columnType = tmpItem['COLUMN_NAME'], tmpItem['COLUMN_TYPE'] + columnName, columnType = tmpItem["COLUMN_NAME"], tmpItem["COLUMN_TYPE"] else: columnName, columnType = tmpItem[1], tmpItem[2] colMap[columnName] = columnType @@ -577,20 +580,20 @@ def check_table(self, cls, table_name, get_missing=False): # check with class definition outStrs = [] for attr in cls.attributesWithTypes: - attrName, attrType = attr.split(':') + attrName, attrType = attr.split(":") if attrName not in colMap: if get_missing: outStrs.append(attrName) else: attrType = self.type_conversion(attrType) - outStrs.append('{0} {1} is missing in {2}'.format(attrName, attrType, table_name)) + outStrs.append("{0} {1} is missing in {2}".format(attrName, attrType, table_name)) return outStrs # insert jobs def insert_jobs(self, jobspec_list): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='insert_jobs') - tmpLog.debug('{0} jobs'.format(len(jobspec_list))) + tmpLog = 
core_utils.make_logger(_logger, method_name="insert_jobs") + tmpLog.debug("{0} jobs".format(len(jobspec_list))) try: # sql to insert a job sqlJ = "INSERT INTO {0} ({1}) ".format(jobTableName, JobSpec.column_names()) @@ -616,7 +619,7 @@ def insert_jobs(self, jobspec_list): for jobSpec in jobspec_list: # delete job just in case varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlDJ, varMap) iDel = self.cur.rowcount if iDel > 0: @@ -653,14 +656,14 @@ def insert_jobs(self, jobspec_list): def get_job(self, panda_id): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(panda_id), method_name='get_job') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "PandaID={0}".format(panda_id), method_name="get_job") + tmpLog.debug("start") # sql to get job sql = "SELECT {0} FROM {1} ".format(JobSpec.column_names(), jobTableName) sql += "WHERE PandaID=:pandaID " # get job varMap = dict() - varMap[':pandaID'] = panda_id + varMap[":pandaID"] = panda_id self.execute(sql, varMap) resJ = self.cur.fetchone() if resJ is None: @@ -673,7 +676,7 @@ def get_job(self, panda_id): sqlF = "SELECT {0} FROM {1} ".format(FileSpec.column_names(), fileTableName) sqlF += "WHERE PandaID=:PandaID " varMap = dict() - varMap[':PandaID'] = panda_id + varMap[":PandaID"] = panda_id self.execute(sqlF, varMap) resFileList = self.cur.fetchall() for resFile in resFileList: @@ -682,7 +685,7 @@ def get_job(self, panda_id): jobSpec.add_file(fileSpec) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") # return return jobSpec except Exception: @@ -697,8 +700,8 @@ def get_job(self, panda_id): def get_jobs(self): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_jobs') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_jobs") + tmpLog.debug("start") # sql to get job sql = "SELECT {0} FROM {1} ".format(JobSpec.column_names(), jobTableName) sql += "WHERE PandaID IS NOT NULL" @@ -708,13 +711,13 @@ def get_jobs(self): resJobs = self.cur.fetchall() if resJobs is None: return None - jobSpecList=[] + jobSpecList = [] # make jobs list for resJ in resJobs: jobSpec = JobSpec() jobSpec.pack(resJ) jobSpecList.append(jobSpec) - tmpLog.debug('done') + tmpLog.debug("done") # return return jobSpecList except Exception: @@ -729,10 +732,8 @@ def get_jobs(self): def update_job(self, jobspec, criteria=None, update_in_file=False, update_out_file=False): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'PandaID={0} subStatus={1}'.format(jobspec.PandaID, - jobspec.subStatus), - method_name='update_job') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "PandaID={0} subStatus={1}".format(jobspec.PandaID, jobspec.subStatus), method_name="update_job") + tmpLog.debug("start") if criteria is None: criteria = {} # sql to update job @@ -741,10 +742,10 @@ def update_job(self, jobspec, criteria=None, update_in_file=False, update_out_fi # update job varMap = jobspec.values_map(only_changed=True) for tmpKey, tmpVal in iteritems(criteria): - mapKey = ':{0}_cr'.format(tmpKey) + mapKey = ":{0}_cr".format(tmpKey) sql += "AND {0}={1} ".format(tmpKey, mapKey) varMap[mapKey] = tmpVal - varMap[':PandaID'] = jobspec.PandaID + varMap[":PandaID"] = jobspec.PandaID self.execute(sql, varMap) nRow = self.cur.rowcount if nRow > 0: @@ -754,26 +755,25 @@ def update_job(self, jobspec, criteria=None, update_in_file=False, update_out_fi if varMap != {}: sqlE = "UPDATE {0} SET {1} 
".format(eventTableName, eventSpec.bind_update_changes_expression()) sqlE += "WHERE eventRangeID=:eventRangeID " - varMap[':eventRangeID'] = eventSpec.eventRangeID + varMap[":eventRangeID"] = eventSpec.eventRangeID self.execute(sqlE, varMap) # update input file if update_in_file: for fileSpec in jobspec.inFiles: varMap = fileSpec.values_map(only_changed=True) if varMap != {}: - sqlF = "UPDATE {0} SET {1} ".format(fileTableName, - fileSpec.bind_update_changes_expression()) + sqlF = "UPDATE {0} SET {1} ".format(fileTableName, fileSpec.bind_update_changes_expression()) sqlF += "WHERE fileID=:fileID " - varMap[':fileID'] = fileSpec.fileID + varMap[":fileID"] = fileSpec.fileID self.execute(sqlF, varMap) else: # set file status to done if jobs are done if jobspec.is_final_status(): varMap = dict() - varMap[':PandaID'] = jobspec.PandaID - varMap[':type1'] = 'input' - varMap[':type2'] = FileSpec.AUX_INPUT - varMap[':status'] = 'done' + varMap[":PandaID"] = jobspec.PandaID + varMap[":type1"] = "input" + varMap[":type2"] = FileSpec.AUX_INPUT + varMap[":status"] = "done" sqlF = "UPDATE {0} SET status=:status ".format(fileTableName) sqlF += "WHERE PandaID=:PandaID AND fileType IN (:type1,:type2) " self.execute(sqlF, varMap) @@ -782,22 +782,21 @@ def update_job(self, jobspec, criteria=None, update_in_file=False, update_out_fi for fileSpec in jobspec.outFiles: varMap = fileSpec.values_map(only_changed=True) if varMap != {}: - sqlF = "UPDATE {0} SET {1} ".format(fileTableName, - fileSpec.bind_update_changes_expression()) + sqlF = "UPDATE {0} SET {1} ".format(fileTableName, fileSpec.bind_update_changes_expression()) sqlF += "WHERE fileID=:fileID " - varMap[':fileID'] = fileSpec.fileID + varMap[":fileID"] = fileSpec.fileID self.execute(sqlF, varMap) # set to_delete flag - if jobspec.subStatus == 'done': + if jobspec.subStatus == "done": sqlD = "UPDATE {0} SET todelete=:to_delete ".format(fileTableName) sqlD += "WHERE PandaID=:PandaID " varMap = dict() - varMap[':PandaID'] = jobspec.PandaID - varMap[':to_delete'] = 1 + varMap[":PandaID"] = jobspec.PandaID + varMap[":to_delete"] = 1 self.execute(sqlD, varMap) # commit self.commit() - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) # return return nRow except Exception: @@ -809,10 +808,10 @@ def update_job(self, jobspec, criteria=None, update_in_file=False, update_out_fi return None # insert output files into database - def insert_files(self,jobspec_list): + def insert_files(self, jobspec_list): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='insert_files') - tmpLog.debug('{0} jobs'.format(len(jobspec_list))) + tmpLog = core_utils.make_logger(_logger, method_name="insert_files") + tmpLog.debug("{0} jobs".format(len(jobspec_list))) try: # sql to insert a file sqlF = "INSERT INTO {0} ({1}) ".format(fileTableName, FileSpec.column_names()) @@ -841,9 +840,8 @@ def insert_files(self,jobspec_list): def update_worker(self, workspec, criteria=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='update_worker') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="update_worker") + tmpLog.debug("start") if criteria is None: criteria = {} # sql to update job @@ -853,18 +851,18 @@ def update_worker(self, workspec, criteria=None): varMap = workspec.values_map(only_changed=True) if len(varMap) > 0: for tmpKey, tmpVal in iteritems(criteria): - mapKey = ':{0}_cr'.format(tmpKey) 
+ mapKey = ":{0}_cr".format(tmpKey) sql += "AND {0}={1} ".format(tmpKey, mapKey) varMap[mapKey] = tmpVal - varMap[':workerID'] = workspec.workerID + varMap[":workerID"] = workspec.workerID self.execute(sql, varMap) nRow = self.cur.rowcount # commit self.commit() - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) else: nRow = None - tmpLog.debug('skip since no updated attributes') + tmpLog.debug("skip since no updated attributes") # return return nRow except Exception: @@ -879,20 +877,20 @@ def update_worker(self, workspec, criteria=None): def fill_panda_queue_table(self, panda_queue_list, queue_config_mapper, refill_table=False): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='fill_panda_queue_table') - tmpLog.debug('start, refill={0}'.format(refill_table)) + tmpLog = core_utils.make_logger(_logger, method_name="fill_panda_queue_table") + tmpLog.debug("start, refill={0}".format(refill_table)) # get existing queues sqlE = "SELECT queueName FROM {0} ".format(pandaQueueTableName) varMap = dict() self.execute(sqlE, varMap) resE = self.cur.fetchall() - for queueName, in resE: + for (queueName,) in resE: # delete if not listed in cfg if queueName not in panda_queue_list: sqlD = "DELETE FROM {0} ".format(pandaQueueTableName) sqlD += "WHERE queueName=:queueName " varMap = dict() - varMap[':queueName'] = queueName + varMap[":queueName"] = queueName self.execute(sqlD, varMap) # commit self.commit() @@ -905,51 +903,59 @@ def fill_panda_queue_table(self, panda_queue_list, queue_config_mapper, refill_t sqlC += "WHERE queueName=:queueName " sqlC += " AND resourceType=:resourceType AND jobType=:jobType " varMap = dict() - varMap[':queueName'] = queueName - varMap[':resourceType'] = PandaQueueSpec.RT_catchall - varMap[':jobType'] = PandaQueueSpec.JT_catchall + varMap[":queueName"] = queueName + varMap[":resourceType"] = PandaQueueSpec.RT_catchall + varMap[":jobType"] = PandaQueueSpec.JT_catchall self.execute(sqlC, varMap) resC = self.cur.fetchone() if refill_table: sqlD = "DELETE FROM {0} ".format(pandaQueueTableName) sqlD += "WHERE queueName=:queueName " varMap = dict() - varMap[':queueName'] = queueName + varMap[":queueName"] = queueName self.execute(sqlD, varMap) if resC is not None and not refill_table: # update limits just in case varMap = dict() sqlU = "UPDATE {0} SET ".format(pandaQueueTableName) - for qAttr in ['nQueueLimitJob', 'nQueueLimitWorker', 'maxWorkers', - 'nQueueLimitJobRatio', 'nQueueLimitJobMax', 'nQueueLimitJobMin', - 'nQueueLimitWorkerRatio', 'nQueueLimitWorkerMax', 'nQueueLimitWorkerMin']: + for qAttr in [ + "nQueueLimitJob", + "nQueueLimitWorker", + "maxWorkers", + "nQueueLimitJobRatio", + "nQueueLimitJobMax", + "nQueueLimitJobMin", + "nQueueLimitWorkerRatio", + "nQueueLimitWorkerMax", + "nQueueLimitWorkerMin", + ]: if hasattr(queueConfig, qAttr): - sqlU += '{0}=:{0},'.format(qAttr) - varMap[':{0}'.format(qAttr)] = getattr(queueConfig, qAttr) + sqlU += "{0}=:{0},".format(qAttr) + varMap[":{0}".format(qAttr)] = getattr(queueConfig, qAttr) if len(varMap) == 0: continue sqlU = sqlU[:-1] sqlU += " WHERE queueName=:queueName " - varMap[':queueName'] = queueName + varMap[":queueName"] = queueName self.execute(sqlU, varMap) else: # insert queue varMap = dict() - varMap[':queueName'] = queueName + varMap[":queueName"] = queueName attrName_list = [] tmpKey_list = [] - for attrName in PandaQueueSpec.column_names().split(','): + for attrName in PandaQueueSpec.column_names().split(","): if hasattr(queueConfig, attrName): - tmpKey 
= ':{0}'.format(attrName) + tmpKey = ":{0}".format(attrName) attrName_list.append(attrName) tmpKey_list.append(tmpKey) varMap[tmpKey] = getattr(queueConfig, attrName) - sqlP = "INSERT IGNORE INTO {0} ({1}) ".format(pandaQueueTableName, ','.join(attrName_list)) - sqlS = "VALUES ({0}) ".format(','.join(tmpKey_list)) + sqlP = "INSERT IGNORE INTO {0} ({1}) ".format(pandaQueueTableName, ",".join(attrName_list)) + sqlS = "VALUES ({0}) ".format(",".join(tmpKey_list)) self.execute(sqlP + sqlS, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") # return return True except Exception: @@ -963,9 +969,9 @@ def fill_panda_queue_table(self, panda_queue_list, queue_config_mapper, refill_t # get number of jobs to fetch def get_num_jobs_to_fetch(self, n_queues, interval): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_num_jobs_to_fetch') + tmpLog = core_utils.make_logger(_logger, method_name="get_num_jobs_to_fetch") try: - tmpLog.debug('start') + tmpLog.debug("start") retMap = {} # sql to get queues sqlQ = "SELECT queueName,nQueueLimitJob,nQueueLimitJobRatio,nQueueLimitJobMax,nQueueLimitJobMin " @@ -983,17 +989,16 @@ def get_num_jobs_to_fetch(self, n_queues, interval): # get queues timeNow = datetime.datetime.utcnow() varMap = dict() - varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=interval) + varMap[":timeLimit"] = timeNow - datetime.timedelta(seconds=interval) self.execute(sqlQ, varMap) resQ = self.cur.fetchall() iQueues = 0 - for queueName, nQueueLimitJob, nQueueLimitJobRatio, \ - nQueueLimitJobMax, nQueueLimitJobMin in resQ: + for queueName, nQueueLimitJob, nQueueLimitJobRatio, nQueueLimitJobMax, nQueueLimitJobMin in resQ: # update timestamp to lock the queue varMap = dict() - varMap[':queueName'] = queueName - varMap[':jobFetchTime'] = timeNow - varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=interval) + varMap[":queueName"] = queueName + varMap[":jobFetchTime"] = timeNow + varMap[":timeLimit"] = timeNow - datetime.timedelta(seconds=interval) self.execute(sqlU, varMap) nRow = self.cur.rowcount # commit @@ -1003,9 +1008,9 @@ def get_num_jobs_to_fetch(self, n_queues, interval): continue # count nQueue varMap = dict() - varMap[':computingSite'] = queueName - varMap[':status1'] = 'starting' - varMap[':status2'] = 'running' + varMap[":computingSite"] = queueName + varMap[":status1"] = "starting" + varMap[":status2"] = "running" self.execute(sqlN, varMap) resN = self.cur.fetchall() nsMap = dict() @@ -1013,13 +1018,13 @@ def get_num_jobs_to_fetch(self, n_queues, interval): nsMap[tmpStatus] = tmpN # get num of queued jobs try: - nQueue = nsMap['starting'] + nQueue = nsMap["starting"] except Exception: nQueue = 0 # dynamic nQueueLimitJob if nQueueLimitJobRatio is not None and nQueueLimitJobRatio > 0: try: - nRunning = nsMap['running'] + nRunning = nsMap["running"] except Exception: nRunning = 0 nQueueLimitJob = int(nRunning * nQueueLimitJobRatio / 100) @@ -1035,7 +1040,7 @@ def get_num_jobs_to_fetch(self, n_queues, interval): iQueues += 1 if iQueues >= n_queues: break - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -1049,9 +1054,8 @@ def get_num_jobs_to_fetch(self, n_queues, interval): def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'thr={0}'.format(locked_by), - method_name='get_jobs_to_propagate') - tmpLog.debug('start') + tmpLog = 
core_utils.make_logger(_logger, "thr={0}".format(locked_by), method_name="get_jobs_to_propagate") + tmpLog.debug("start") # sql to get jobs sql = "SELECT PandaID FROM {0} ".format(jobTableName) sql += "WHERE propagatorTime IS NOT NULL " @@ -1070,9 +1074,7 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked sqlE = "SELECT {0} FROM {1} ".format(EventSpec.column_names(), eventTableName) sqlE += "WHERE PandaID=:PandaID AND subStatus IN (:statusFinished,:statusFailed) " # sql to get file - sqlF = "SELECT DISTINCT {0} FROM {1} f, {2} e, {1} f2 ".format(FileSpec.column_names('f2'), - fileTableName, - eventTableName) + sqlF = "SELECT DISTINCT {0} FROM {1} f, {2} e, {1} f2 ".format(FileSpec.column_names("f2"), fileTableName, eventTableName) sqlF += "WHERE e.PandaID=:PandaID AND e.fileID=f.fileID " sqlF += "AND e.subStatus IN (:statusFinished,:statusFailed) " sqlF += "AND f2.fileID=f.zipFileID " @@ -1088,12 +1090,12 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked lockTimeLimit = timeNow - datetime.timedelta(seconds=lock_interval) updateTimeLimit = timeNow - datetime.timedelta(seconds=update_interval) varMap = dict() - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':updateTimeLimit'] = updateTimeLimit + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":updateTimeLimit"] = updateTimeLimit self.execute(sql, varMap) resList = self.cur.fetchall() pandaIDs = [] - for pandaID, in resList: + for (pandaID,) in resList: pandaIDs.append(pandaID) # partially randomise to increase success rate for lock nJobs = int(max_jobs * 0.2) @@ -1109,11 +1111,11 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked break # lock job varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':timeNow'] = timeNow - varMap[':lockedBy'] = locked_by - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':updateTimeLimit'] = updateTimeLimit + varMap[":PandaID"] = pandaID + varMap[":timeNow"] = timeNow + varMap[":lockedBy"] = locked_by + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":updateTimeLimit"] = updateTimeLimit self.execute(sqlL, varMap) nRow = self.cur.rowcount # commit @@ -1121,7 +1123,7 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked if nRow > 0: # read job varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJ, varMap) res = self.cur.fetchone() # make job @@ -1132,18 +1134,18 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked zipIdMap = dict() # get zipIDs varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':statusFinished'] = 'finished' - varMap[':statusFailed'] = 'failed' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":statusFinished"] = "finished" + varMap[":statusFailed"] = "failed" self.execute(sqlZ, varMap) resZ = self.cur.fetchall() for tmpFileID, tmpZipFileID in resZ: zipIdMap[tmpFileID] = tmpZipFileID # get zip files varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':statusFinished'] = 'finished' - varMap[':statusFailed'] = 'failed' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":statusFinished"] = "finished" + varMap[":statusFailed"] = "failed" self.execute(sqlF, varMap) resFs = self.cur.fetchall() for resF in resFs: @@ -1152,9 +1154,9 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked zipFiles[fileSpec.fileID] = fileSpec # read events varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':statusFinished'] = 'finished' - 
varMap[':statusFailed'] = 'failed' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":statusFinished"] = "finished" + varMap[":statusFailed"] = "failed" self.execute(sqlE, varMap) resEs = self.cur.fetchall() for resE in resEs: @@ -1172,9 +1174,9 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked iEvents += 1 # read checkpoint files varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':type'] = 'checkpoint' - varMap[':status'] = 'renewed' + varMap[":PandaID"] = pandaID + varMap[":type"] = "checkpoint" + varMap[":status"] = "renewed" self.execute(sqlC, varMap) resC = self.cur.fetchall() for resFile in resC: @@ -1183,7 +1185,7 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked jobSpec.add_out_file(fileSpec) # add to job list jobSpecList.append(jobSpec) - tmpLog.debug('got {0} jobs'.format(len(jobSpecList))) + tmpLog.debug("got {0} jobs".format(len(jobSpecList))) return jobSpecList except Exception: # roll back @@ -1194,18 +1196,27 @@ def get_jobs_to_propagate(self, max_jobs, lock_interval, update_interval, locked return [] # get jobs in sub status - def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_column=None, - interval_without_lock=None, interval_with_lock=None, - locked_by=None, new_sub_status=None, max_files_per_job=None, - ng_file_status_list=None): + def get_jobs_in_sub_status( + self, + sub_status, + max_jobs, + time_column=None, + lock_column=None, + interval_without_lock=None, + interval_with_lock=None, + locked_by=None, + new_sub_status=None, + max_files_per_job=None, + ng_file_status_list=None, + ): try: # get logger if locked_by is None: msgPfx = None else: - msgPfx = 'id={0}'.format(locked_by) - tmpLog = core_utils.make_logger(_logger, msgPfx, method_name='get_jobs_in_sub_status') - tmpLog.debug('start subStatus={0} timeColumn={1}'.format(sub_status, time_column)) + msgPfx = "id={0}".format(locked_by) + tmpLog = core_utils.make_logger(_logger, msgPfx, method_name="get_jobs_in_sub_status") + tmpLog.debug("start subStatus={0} timeColumn={1}".format(sub_status, time_column)) timeNow = datetime.datetime.utcnow() # sql to count jobs being processed sqlC = "SELECT COUNT(*) cnt FROM {0} ".format(jobTableName) @@ -1216,17 +1227,16 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co # count jobs if max_jobs > 0 and new_sub_status is not None: varMap = dict() - varMap[':subStatus'] = sub_status - varMap[':newSubStatus'] = new_sub_status + varMap[":subStatus"] = sub_status + varMap[":newSubStatus"] = new_sub_status if time_column is not None and interval_with_lock is not None: - varMap[':lockTimeLimit'] = timeNow - datetime.timedelta(seconds=interval_with_lock) + varMap[":lockTimeLimit"] = timeNow - datetime.timedelta(seconds=interval_with_lock) self.execute(sqlC, varMap) - nProcessing, = self.cur.fetchone() + (nProcessing,) = self.cur.fetchone() if nProcessing >= max_jobs: # commit self.commit() - tmpLog.debug('enough jobs {0} are being processed in {1} state'.format(nProcessing, - new_sub_status)) + tmpLog.debug("enough jobs {0} are being processed in {1} state".format(nProcessing, new_sub_status)) return [] max_jobs -= nProcessing # sql to get job IDs @@ -1238,7 +1248,7 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co sql += "OR ({0}<:lockTimeLimit AND {1} IS NOT NULL) ".format(time_column, lock_column) if interval_without_lock is not None: sql += "OR ({0}<:updateTimeLimit AND {1} IS NULL) ".format(time_column, lock_column) - 
sql += ') ' + sql += ") " sql += "ORDER BY {0} ".format(time_column) # sql to lock job sqlL = "UPDATE {0} SET {1}=:timeNow,{2}=:lockedBy ".format(jobTableName, time_column, lock_column) @@ -1249,7 +1259,7 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co sqlL += "OR ({0}<:lockTimeLimit AND {1} IS NOT NULL) ".format(time_column, lock_column) if interval_without_lock is not None: sqlL += "OR ({0}<:updateTimeLimit AND {1} IS NULL) ".format(time_column, lock_column) - sqlL += ') ' + sqlL += ") " # sql to get jobs sqlGJ = "SELECT {0} FROM {1} ".format(JobSpec.column_names(), jobTableName) sqlGJ += "WHERE PandaID=:PandaID " @@ -1259,7 +1269,7 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co if ng_file_status_list is not None: sqlGF += "AND status NOT IN (" for tmpStatus in ng_file_status_list: - tmpKey = ':status_{0}'.format(tmpStatus) + tmpKey = ":status_{0}".format(tmpStatus) sqlGF += "{0},".format(tmpKey) sqlGF = sqlGF[:-1] sqlGF += ") " @@ -1267,15 +1277,15 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co sqlGF += "LIMIT {0} ".format(max_files_per_job) # get jobs varMap = dict() - varMap[':subStatus'] = sub_status + varMap[":subStatus"] = sub_status if interval_with_lock is not None: - varMap[':lockTimeLimit'] = timeNow - datetime.timedelta(seconds=interval_with_lock) + varMap[":lockTimeLimit"] = timeNow - datetime.timedelta(seconds=interval_with_lock) if interval_without_lock is not None: - varMap[':updateTimeLimit'] = timeNow - datetime.timedelta(seconds=interval_without_lock) + varMap[":updateTimeLimit"] = timeNow - datetime.timedelta(seconds=interval_without_lock) self.execute(sql, varMap) resList = self.cur.fetchall() pandaIDs = [] - for pandaID, in resList: + for (pandaID,) in resList: pandaIDs.append(pandaID) # partially randomise to increase success rate for lock nJobs = int(max_jobs * 0.2) @@ -1288,14 +1298,14 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co # lock job if locked_by is not None: varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':timeNow'] = timeNow - varMap[':lockedBy'] = locked_by - varMap[':subStatus'] = sub_status + varMap[":PandaID"] = pandaID + varMap[":timeNow"] = timeNow + varMap[":lockedBy"] = locked_by + varMap[":subStatus"] = sub_status if interval_with_lock is not None: - varMap[':lockTimeLimit'] = timeNow - datetime.timedelta(seconds=interval_with_lock) + varMap[":lockTimeLimit"] = timeNow - datetime.timedelta(seconds=interval_with_lock) if interval_without_lock is not None: - varMap[':updateTimeLimit'] = timeNow - datetime.timedelta(seconds=interval_without_lock) + varMap[":updateTimeLimit"] = timeNow - datetime.timedelta(seconds=interval_without_lock) self.execute(sqlL, varMap) nRow = self.cur.rowcount # commit @@ -1305,7 +1315,7 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co if nRow > 0: # get job varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlGJ, varMap) resGJ = self.cur.fetchone() # make job @@ -1316,14 +1326,14 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co setattr(jobSpec, time_column, timeNow) # get files varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput, JobSpec.AUX_allTriggered]: - varMap[':type'] = 'input' + varMap[":type"] = "input" else: - varMap[':type'] = FileSpec.AUX_INPUT + 
varMap[":type"] = FileSpec.AUX_INPUT if ng_file_status_list is not None: for tmpStatus in ng_file_status_list: - tmpKey = ':status_{0}'.format(tmpStatus) + tmpKey = ":status_{0}".format(tmpStatus) varMap[tmpKey] = tmpStatus self.execute(sqlGF, varMap) resGF = self.cur.fetchall() @@ -1333,7 +1343,7 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co jobSpec.add_in_file(fileSpec) # append jobSpecList.append(jobSpec) - tmpLog.debug('got {0} jobs'.format(len(jobSpecList))) + tmpLog.debug("got {0} jobs".format(len(jobSpecList))) return jobSpecList except Exception: # roll back @@ -1345,10 +1355,9 @@ def get_jobs_in_sub_status(self, sub_status, max_jobs, time_column=None, lock_co # register a worker def register_worker(self, workspec, jobspec_list, locked_by): - tmpLog = core_utils.make_logger(_logger, 'batchID={0}'.format(workspec.batchID), - method_name='register_worker') + tmpLog = core_utils.make_logger(_logger, "batchID={0}".format(workspec.batchID), method_name="register_worker") try: - tmpLog.debug('start') + tmpLog.debug("start") # sql to check if exists sqlE = "SELECT 1 c FROM {0} WHERE workerID=:workerID ".format(workTableName) # sql to insert job and worker relationship @@ -1366,7 +1375,7 @@ def register_worker(self, workspec, jobspec_list, locked_by): isNew = False if workspec.isNew: varMap = dict() - varMap[':workerID'] = workspec.workerID + varMap[":workerID"] = workspec.workerID self.execute(sqlE, varMap) resE = self.cur.fetchone() if resE is None: @@ -1379,16 +1388,16 @@ def register_worker(self, workspec, jobspec_list, locked_by): self.execute(sqlI, varMap) # decrement nNewWorkers varMap = dict() - varMap[':queueName'] = workspec.computingSite + varMap[":queueName"] = workspec.computingSite self.execute(sqlDN, varMap) else: # not update workerID - workspec.force_not_update('workerID') + workspec.force_not_update("workerID") # update a worker sqlU = "UPDATE {0} SET {1} ".format(workTableName, workspec.bind_update_changes_expression()) sqlU += "WHERE workerID=:workerID " varMap = workspec.values_map(only_changed=True) - varMap[':workerID'] = workspec.workerID + varMap[":workerID"] = workspec.workerID self.execute(sqlU, varMap) # collect values to update jobs or insert job/worker mapping varMapsR = [] @@ -1396,18 +1405,18 @@ def register_worker(self, workspec, jobspec_list, locked_by): for jobSpec in jobspec_list: # get number of workers for the job varMap = dict() - varMap[':pandaID'] = jobSpec.PandaID - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + varMap[":pandaID"] = jobSpec.PandaID + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlNW, varMap) resNW = self.cur.fetchall() workerIDs = set() workerIDs.add(workspec.workerID) - for tmpWorkerID, in resNW: + for (tmpWorkerID,) in resNW: workerIDs.add(tmpWorkerID) # update attributes - if jobSpec.subStatus in ['submitted', 'running']: + if jobSpec.subStatus in ["submitted", "running"]: jobSpec.nWorkers = len(workerIDs) try: jobSpec.nWorkersInTotal += 1 @@ -1418,11 +1427,10 @@ def register_worker(self, workspec, jobspec_list, locked_by): # not update if other workers are active if len(workerIDs) > 1: continue - core_utils.update_job_attributes_with_workers(workspec.mapType, [jobSpec], - [workspec], {}, {}) + core_utils.update_job_attributes_with_workers(workspec.mapType, [jobSpec], [workspec], {}, {}) 
jobSpec.trigger_propagation() else: - jobSpec.subStatus = 'submitted' + jobSpec.subStatus = "submitted" jobSpec.nWorkers = len(workerIDs) try: jobSpec.nWorkersInTotal += 1 @@ -1433,21 +1441,20 @@ def register_worker(self, workspec, jobspec_list, locked_by): # not update if other workers are active if len(workerIDs) > 1: continue - core_utils.update_job_attributes_with_workers(workspec.mapType, [jobSpec], - [workspec], {}, {}) + core_utils.update_job_attributes_with_workers(workspec.mapType, [jobSpec], [workspec], {}, {}) jobSpec.trigger_propagation() else: - jobSpec.subStatus = 'queued' + jobSpec.subStatus = "queued" # sql to update job if len(jobSpec.values_map(only_changed=True)) > 0: sqlJ = "UPDATE {0} SET {1} ".format(jobTableName, jobSpec.bind_update_changes_expression()) sqlJ += "WHERE PandaID=:cr_PandaID AND lockedBy=:cr_lockedBy " # update job varMap = jobSpec.values_map(only_changed=True) - varMap[':cr_PandaID'] = jobSpec.PandaID - varMap[':cr_lockedBy'] = locked_by + varMap[":cr_PandaID"] = jobSpec.PandaID + varMap[":cr_lockedBy"] = locked_by self.execute(sqlJ, varMap) - if jobSpec.subStatus in ['submitted', 'running']: + if jobSpec.subStatus in ["submitted", "running"]: # values for job/worker mapping jwRelation = JobWorkerRelationSpec() jwRelation.PandaID = jobSpec.PandaID @@ -1471,10 +1478,9 @@ def register_worker(self, workspec, jobspec_list, locked_by): # insert workers def insert_workers(self, workspec_list, locked_by): - tmpLog = core_utils.make_logger(_logger, 'locked_by={0}'.format(locked_by), - method_name='insert_workers') + tmpLog = core_utils.make_logger(_logger, "locked_by={0}".format(locked_by), method_name="insert_workers") try: - tmpLog.debug('start') + tmpLog.debug("start") timeNow = datetime.datetime.utcnow() # sql to insert a worker sqlI = "INSERT INTO {0} ({1}) ".format(workTableName, WorkSpec.column_names()) @@ -1504,8 +1510,8 @@ def insert_workers(self, workspec_list, locked_by): def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_by, queue_lock_interval): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_queues_to_submit') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_queues_to_submit") + tmpLog.debug("start") retMap = dict() siteName = None resourceMap = dict() @@ -1539,17 +1545,17 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ # get sites timeNow = datetime.datetime.utcnow() varMap = dict() - varMap[':lockTimeLimit'] = timeNow - datetime.timedelta(seconds=queue_lock_interval) - varMap[':lookupTimeLimit'] = timeNow - datetime.timedelta(seconds=lookup_interval) + varMap[":lockTimeLimit"] = timeNow - datetime.timedelta(seconds=queue_lock_interval) + varMap[":lookupTimeLimit"] = timeNow - datetime.timedelta(seconds=lookup_interval) self.execute(sqlS, varMap) resS = self.cur.fetchall() - for siteName, in resS: + for (siteName,) in resS: # update timestamp to lock the site varMap = dict() - varMap[':siteName'] = siteName - varMap[':submitTime'] = timeNow - varMap[':lockedBy'] = locked_by - varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=lookup_interval) + varMap[":siteName"] = siteName + varMap[":submitTime"] = timeNow + varMap[":lockedBy"] = locked_by + varMap[":timeLimit"] = timeNow - datetime.timedelta(seconds=lookup_interval) self.execute(sqlU, varMap) nRow = self.cur.rowcount # commit @@ -1559,42 +1565,41 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ continue # get queues 
varMap = dict() - varMap[':siteName'] = siteName + varMap[":siteName"] = siteName self.execute(sqlQ, varMap) resQ = self.cur.fetchall() for queueName, jobType, resourceType, nNewWorkers in resQ: - # delete orphaned workers varMap = dict() - varMap[':computingSite'] = queueName - varMap[':status'] = WorkSpec.ST_pending - varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=lock_interval) + varMap[":computingSite"] = queueName + varMap[":status"] = WorkSpec.ST_pending + varMap[":timeLimit"] = timeNow - datetime.timedelta(seconds=lock_interval) sqlO_tmp = sqlO - if jobType != 'ANY': - varMap[':jobType'] = jobType + if jobType != "ANY": + varMap[":jobType"] = jobType sqlO_tmp += "AND jobType=:jobType " - if resourceType != 'ANY': - varMap[':resourceType'] = resourceType + if resourceType != "ANY": + varMap[":resourceType"] = resourceType sqlO_tmp += "AND resourceType=:resourceType " self.execute(sqlO_tmp, varMap) resO = self.cur.fetchall() - for tmpWorkerID, in resO: + for (tmpWorkerID,) in resO: varMap = dict() - varMap[':workerID'] = tmpWorkerID + varMap[":workerID"] = tmpWorkerID self.execute(sqlD, varMap) # commit self.commit() # count nQueue varMap = dict() - varMap[':computingSite'] = queueName - varMap[':resourceType'] = resourceType + varMap[":computingSite"] = queueName + varMap[":resourceType"] = resourceType sqlN_tmp = sqlN - if jobType != 'ANY': - varMap[':jobType'] = jobType + if jobType != "ANY": + varMap[":jobType"] = jobType sqlN_tmp += "AND jobType=:jobType " - if resourceType != 'ANY': - varMap[':resourceType'] = resourceType + if resourceType != "ANY": + varMap[":resourceType"] = resourceType sqlN_tmp += "AND resourceType=:resourceType " sqlN_tmp += "GROUP BY status " self.execute(sqlN_tmp, varMap) @@ -1611,33 +1616,30 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ # count nFillers varMap = dict() - varMap[':computingSite'] = queueName - varMap[':status'] = WorkSpec.ST_running + varMap[":computingSite"] = queueName + varMap[":status"] = WorkSpec.ST_running sqlR_tmp = sqlR - if jobType != 'ANY': - varMap[':jobType'] = jobType + if jobType != "ANY": + varMap[":jobType"] = jobType sqlR_tmp += "AND jobType=:jobType " - if resourceType != 'ANY': - varMap[':resourceType'] = resourceType + if resourceType != "ANY": + varMap[":resourceType"] = resourceType sqlR_tmp += "AND resourceType=:resourceType " self.execute(sqlR_tmp, varMap) - nReFill, = self.cur.fetchone() + (nReFill,) = self.cur.fetchone() nReady += nReFill # add retMap.setdefault(queueName, {}) retMap[queueName].setdefault(jobType, {}) - retMap[queueName][jobType][resourceType] = {'nReady': nReady, - 'nRunning': nRunning, - 'nQueue': nQueue, - 'nNewWorkers': nNewWorkers} + retMap[queueName][jobType][resourceType] = {"nReady": nReady, "nRunning": nRunning, "nQueue": nQueue, "nNewWorkers": nNewWorkers} resourceMap.setdefault(jobType, {}) resourceMap[jobType][resourceType] = queueName # enough queues if len(retMap) >= 0: break - tmpLog.debug('got retMap {0}'.format(str(retMap))) - tmpLog.debug('got siteName {0}'.format(str(siteName))) - tmpLog.debug('got resourceMap {0}'.format(str(resourceMap))) + tmpLog.debug("got retMap {0}".format(str(retMap))) + tmpLog.debug("got siteName {0}".format(str(siteName))) + tmpLog.debug("got resourceMap {0}".format(str(resourceMap))) return retMap, siteName, resourceMap except Exception: # roll back @@ -1648,16 +1650,26 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ return {}, None, {} # get job chunks to make 
workers - def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_worker, n_workers_per_job, - use_job_late_binding, check_interval, lock_interval, locked_by, - allow_job_mixture=False, max_workers_per_job_in_total=None, - max_workers_per_job_per_cycle=None): + def get_job_chunks_for_workers( + self, + queue_name, + n_workers, + n_ready, + n_jobs_per_worker, + n_workers_per_job, + use_job_late_binding, + check_interval, + lock_interval, + locked_by, + allow_job_mixture=False, + max_workers_per_job_in_total=None, + max_workers_per_job_per_cycle=None, + ): toCommit = False try: # get logger - tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format(queue_name), - method_name='get_job_chunks_for_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "queue={0}".format(queue_name), method_name="get_job_chunks_for_workers") + tmpLog.debug("start") # define maxJobs if n_jobs_per_worker is not None: maxJobs = (n_workers + n_ready) * n_jobs_per_worker @@ -1700,61 +1712,61 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ sqlC += sqlCore # count jobs varMap = dict() - varMap[':subStat1'] = 'prepared' - varMap[':subStat2'] = 'queued' - varMap[':subStat3'] = 'submitted' - varMap[':subStat4'] = 'running' - varMap[':queueName'] = queue_name - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':checkTimeLimit'] = checkTimeLimit + varMap[":subStat1"] = "prepared" + varMap[":subStat2"] = "queued" + varMap[":subStat3"] = "submitted" + varMap[":subStat4"] = "running" + varMap[":queueName"] = queue_name + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":checkTimeLimit"] = checkTimeLimit self.execute(sqlC, varMap) - nAvailableJobs, = self.cur.fetchone() + (nAvailableJobs,) = self.cur.fetchone() maxJobs = int(min(maxJobs, nAvailableJobs) / n_jobs_per_worker) * n_jobs_per_worker - tmpStr = 'n_workers={0} n_ready={1} '.format(n_workers, n_ready) - tmpStr += 'n_jobs_per_worker={0} n_workers_per_job={1} '.format(n_jobs_per_worker, n_workers_per_job) - tmpStr += 'n_ava_jobs={0}'.format(nAvailableJobs) + tmpStr = "n_workers={0} n_ready={1} ".format(n_workers, n_ready) + tmpStr += "n_jobs_per_worker={0} n_workers_per_job={1} ".format(n_jobs_per_worker, n_workers_per_job) + tmpStr += "n_ava_jobs={0}".format(nAvailableJobs) tmpLog.debug(tmpStr) if maxJobs == 0: - tmpStr = 'skip due to maxJobs=0' + tmpStr = "skip due to maxJobs=0" tmpLog.debug(tmpStr) else: # get job IDs varMap = dict() - varMap[':subStat1'] = 'prepared' - varMap[':subStat2'] = 'queued' - varMap[':subStat3'] = 'submitted' - varMap[':subStat4'] = 'running' - varMap[':queueName'] = queue_name - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':checkTimeLimit'] = checkTimeLimit + varMap[":subStat1"] = "prepared" + varMap[":subStat2"] = "queued" + varMap[":subStat3"] = "submitted" + varMap[":subStat4"] = "running" + varMap[":queueName"] = queue_name + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":checkTimeLimit"] = checkTimeLimit self.execute(sqlP, varMap) resP = self.cur.fetchall() - tmpStr = 'fetched {0} jobs'.format(len(resP)) + tmpStr = "fetched {0} jobs".format(len(resP)) tmpLog.debug(tmpStr) jobChunk = [] iJobs = 0 - for pandaID, in resP: + for (pandaID,) in resP: toCommit = True toEscape = False # lock job varMap = dict() - varMap[':subStat1'] = 'prepared' - varMap[':subStat2'] = 'queued' - varMap[':subStat3'] = 'submitted' - varMap[':subStat4'] = 'running' - varMap[':queueName'] = queue_name - varMap[':lockTimeLimit'] = lockTimeLimit - 
varMap[':checkTimeLimit'] = checkTimeLimit - varMap[':PandaID'] = pandaID - varMap[':timeNow'] = timeNow - varMap[':lockedBy'] = locked_by + varMap[":subStat1"] = "prepared" + varMap[":subStat2"] = "queued" + varMap[":subStat3"] = "submitted" + varMap[":subStat4"] = "running" + varMap[":queueName"] = queue_name + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":checkTimeLimit"] = checkTimeLimit + varMap[":PandaID"] = pandaID + varMap[":timeNow"] = timeNow + varMap[":lockedBy"] = locked_by self.execute(sqlL, varMap) nRow = self.cur.rowcount if nRow > 0: iJobs += 1 # get job varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJ, varMap) resJ = self.cur.fetchone() # make job @@ -1763,9 +1775,9 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ jobSpec.lockedBy = locked_by # get files varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':type1'] = 'input' - varMap[':type2'] = FileSpec.AUX_INPUT + varMap[":PandaID"] = pandaID + varMap[":type1"] = "input" + varMap[":type2"] = FileSpec.AUX_INPUT self.execute(sqlGF, varMap) resGF = self.cur.fetchall() for resFile in resGF: @@ -1774,17 +1786,17 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ jobSpec.add_in_file(fileSpec) # new chunk if len(jobChunk) > 0 and jobChunk[0].taskID != jobSpec.taskID and not allow_job_mixture: - tmpLog.debug('new chunk with {0} jobs due to taskID change'.format(len(jobChunk))) + tmpLog.debug("new chunk with {0} jobs due to taskID change".format(len(jobChunk))) jobChunkList.append(jobChunk) jobChunk = [] # only prepared for new worker - if len(jobChunkList) >= n_ready and jobSpec.subStatus == 'queued': + if len(jobChunkList) >= n_ready and jobSpec.subStatus == "queued": toCommit = False else: jobChunk.append(jobSpec) # enough jobs in chunk if n_jobs_per_worker is not None and len(jobChunk) >= n_jobs_per_worker: - tmpLog.debug('new chunk with {0} jobs due to n_jobs_per_worker'.format(len(jobChunk))) + tmpLog.debug("new chunk with {0} jobs due to n_jobs_per_worker".format(len(jobChunk))) jobChunkList.append(jobChunk) jobChunk = [] # one job per multiple workers @@ -1793,18 +1805,14 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ jobSpec.nWorkersLimit = n_workers_per_job if max_workers_per_job_in_total is not None: jobSpec.maxWorkersInTotal = max_workers_per_job_in_total - nMultiWorkers = min(jobSpec.nWorkersLimit - jobSpec.nWorkers, - n_workers - len(jobChunkList)) + nMultiWorkers = min(jobSpec.nWorkersLimit - jobSpec.nWorkers, n_workers - len(jobChunkList)) if jobSpec.maxWorkersInTotal is not None and jobSpec.nWorkersInTotal is not None: - nMultiWorkers = min(nMultiWorkers, - jobSpec.maxWorkersInTotal - jobSpec.nWorkersInTotal) + nMultiWorkers = min(nMultiWorkers, jobSpec.maxWorkersInTotal - jobSpec.nWorkersInTotal) if max_workers_per_job_per_cycle is not None: nMultiWorkers = min(nMultiWorkers, max_workers_per_job_per_cycle) if nMultiWorkers < 0: nMultiWorkers = 0 - tmpLog.debug( - 'new {0} chunks with {1} jobs due to n_workers_per_job'.format(nMultiWorkers, - len(jobChunk))) + tmpLog.debug("new {0} chunks with {1} jobs due to n_workers_per_job".format(nMultiWorkers, len(jobChunk))) for i in range(nMultiWorkers): jobChunkList.append(jobChunk) jobChunk = [] @@ -1817,7 +1825,7 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ self.rollback() if toEscape or iJobs >= maxJobs: break - tmpLog.debug('got {0} job 
chunks'.format(len(jobChunkList))) + tmpLog.debug("got {0} job chunks".format(len(jobChunkList))) return jobChunkList except Exception: # roll back @@ -1832,8 +1840,8 @@ def get_job_chunks_for_workers(self, queue_name, n_workers, n_ready, n_jobs_per_ def get_workers_to_update(self, max_workers, check_interval, lock_interval, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_to_update') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_to_update") + tmpLog.debug("start") # sql to get workers sqlW = "SELECT workerID,configID,mapType FROM {0} ".format(workTableName) sqlW += "WHERE status IN (:st_submitted,:st_running,:st_idle) " @@ -1867,11 +1875,11 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock lockTimeLimit = timeNow - datetime.timedelta(seconds=lock_interval) checkTimeLimit = timeNow - datetime.timedelta(seconds=check_interval) varMap = dict() - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':checkTimeLimit'] = checkTimeLimit + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":checkTimeLimit"] = checkTimeLimit self.execute(sqlW, varMap) resW = self.cur.fetchall() tmpWorkers = set() @@ -1888,14 +1896,14 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock continue # get associated workerIDs varMap = dict() - varMap[':workerID'] = workerID - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + varMap[":workerID"] = workerID + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlA, varMap) resA = self.cur.fetchall() workerIDtoScan = set() - for tmpWorkID, in resA: + for (tmpWorkID,) in resA: workerIDtoScan.add(tmpWorkID) # add original ID just in case since no relation when job is not yet bound workerIDtoScan.add(workerID) @@ -1904,22 +1912,22 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock if workerID != min(workerIDtoScan): # update modification time varMap = dict() - varMap[':workerID'] = workerID - varMap[':timeNow'] = timeNow + varMap[":workerID"] = workerID + varMap[":timeNow"] = timeNow self.execute(sqlLM, varMap) # commit self.commit() continue # lock worker varMap = dict() - varMap[':workerID'] = workerID - varMap[':lockedBy'] = locked_by - varMap[':timeNow'] = timeNow - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':checkTimeLimit'] = checkTimeLimit + varMap[":workerID"] = workerID + varMap[":lockedBy"] = locked_by + varMap[":timeNow"] = timeNow + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":checkTimeLimit"] = checkTimeLimit self.execute(sqlLT, varMap) nRow = self.cur.rowcount # commit @@ -1934,7 +1942,7 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock checkedIDs.add(tmpWorkID) # get worker varMap = dict() - varMap[':workerID'] = 
tmpWorkID + varMap[":workerID"] = tmpWorkID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -1944,23 +1952,23 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock workersList.append(workSpec) # get associated PandaIDs varMap = dict() - varMap[':workerID'] = tmpWorkID + varMap[":workerID"] = tmpWorkID self.execute(sqlP, varMap) resP = self.cur.fetchall() workSpec.pandaid_list = [] - for tmpPandaID, in resP: + for (tmpPandaID,) in resP: workSpec.pandaid_list.append(tmpPandaID) if len(workSpec.pandaid_list) > 0: workSpec.nJobs = len(workSpec.pandaid_list) # lock worker if tmpWorkID != workerID: varMap = dict() - varMap[':workerID'] = tmpWorkID - varMap[':lockedBy'] = locked_by - varMap[':timeNow'] = timeNow + varMap[":workerID"] = tmpWorkID + varMap[":lockedBy"] = locked_by + varMap[":timeNow"] = timeNow self.execute(sqlL, varMap) workSpec.lockedBy = locked_by - workSpec.force_not_update('lockedBy') + workSpec.force_not_update("lockedBy") # commit self.commit() # add @@ -1968,7 +1976,7 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock retVal.setdefault(queueName, dict()) retVal[queueName].setdefault(configID, []) retVal[queueName][configID].append(workersList) - tmpLog.debug('got {0}'.format(str(retVal))) + tmpLog.debug("got {0}".format(str(retVal))) return retVal except Exception: # roll back @@ -1982,8 +1990,8 @@ def get_workers_to_update(self, max_workers, check_interval, lock_interval, lock def get_workers_to_propagate(self, max_workers, check_interval): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_to_propagate') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_to_propagate") + tmpLog.debug("start") # sql to get worker IDs sqlW = "SELECT workerID FROM {0} ".format(workTableName) sqlW += "WHERE lastUpdate IS NOT NULL AND lastUpdate<:checkTimeLimit " @@ -2002,11 +2010,11 @@ def get_workers_to_propagate(self, max_workers, check_interval): timeLimit = timeNow - datetime.timedelta(seconds=check_interval) # get workerIDs varMap = dict() - varMap[':checkTimeLimit'] = timeLimit + varMap[":checkTimeLimit"] = timeLimit self.execute(sqlW, varMap) resW = self.cur.fetchall() tmpWorkers = [] - for workerID, in resW: + for (workerID,) in resW: tmpWorkers.append(workerID) # partially randomize to increase hit rate nWorkers = int(max_workers * 0.2) @@ -2018,15 +2026,15 @@ def get_workers_to_propagate(self, max_workers, check_interval): for workerID in tmpWorkers: # lock worker varMap = dict() - varMap[':workerID'] = workerID - varMap[':timeNow'] = timeNow - varMap[':checkTimeLimit'] = timeLimit + varMap[":workerID"] = workerID + varMap[":timeNow"] = timeNow + varMap[":checkTimeLimit"] = timeLimit self.execute(sqlL, varMap) nRow = self.cur.rowcount if nRow > 0: # get worker varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -2034,15 +2042,15 @@ def get_workers_to_propagate(self, max_workers, check_interval): retVal.append(workSpec) # get associated PandaIDs varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlA, varMap) resA = self.cur.fetchall() workSpec.pandaid_list = [] - for pandaID, in resA: + for (pandaID,) in resA: workSpec.pandaid_list.append(pandaID) # commit self.commit() - tmpLog.debug('got {0} workers'.format(len(retVal))) + tmpLog.debug("got {0} 
workers".format(len(retVal))) return retVal except Exception: # roll back @@ -2056,8 +2064,8 @@ def get_workers_to_propagate(self, max_workers, check_interval): def get_workers_to_feed_events(self, max_workers, lock_interval, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_to_feed_events') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_to_feed_events") + tmpLog.debug("start") # sql to get workers sqlW = "SELECT workerID, status FROM {0} ".format(workTableName) sqlW += "WHERE eventsRequest=:eventsRequest AND status IN (:status1,:status2) " @@ -2075,10 +2083,10 @@ def get_workers_to_feed_events(self, max_workers, lock_interval, locked_by): timeNow = datetime.datetime.utcnow() lockTimeLimit = timeNow - datetime.timedelta(seconds=lock_interval) varMap = dict() - varMap[':status1'] = WorkSpec.ST_running - varMap[':status2'] = WorkSpec.ST_submitted - varMap[':eventsRequest'] = WorkSpec.EV_requestEvents - varMap[':lockTimeLimit'] = lockTimeLimit + varMap[":status1"] = WorkSpec.ST_running + varMap[":status2"] = WorkSpec.ST_submitted + varMap[":eventsRequest"] = WorkSpec.EV_requestEvents + varMap[":lockTimeLimit"] = lockTimeLimit self.execute(sqlW, varMap) resW = self.cur.fetchall() tmpWorkers = dict() @@ -2088,12 +2096,12 @@ def get_workers_to_feed_events(self, max_workers, lock_interval, locked_by): for workerID, workStatus in iteritems(tmpWorkers): # lock worker varMap = dict() - varMap[':workerID'] = workerID - varMap[':timeNow'] = timeNow - varMap[':status'] = workStatus - varMap[':eventsRequest'] = WorkSpec.EV_requestEvents - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':lockedBy'] = locked_by + varMap[":workerID"] = workerID + varMap[":timeNow"] = timeNow + varMap[":status"] = workStatus + varMap[":eventsRequest"] = WorkSpec.EV_requestEvents + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":lockedBy"] = locked_by self.execute(sqlL, varMap) nRow = self.cur.rowcount # commit @@ -2103,7 +2111,7 @@ def get_workers_to_feed_events(self, max_workers, lock_interval, locked_by): continue # get worker varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -2111,7 +2119,7 @@ def get_workers_to_feed_events(self, max_workers, lock_interval, locked_by): if workSpec.computingSite not in retVal: retVal[workSpec.computingSite] = [] retVal[workSpec.computingSite].append(workSpec) - tmpLog.debug('got {0} workers'.format(len(retVal))) + tmpLog.debug("got {0} workers".format(len(retVal))) return retVal except Exception: # roll back @@ -2179,37 +2187,36 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ else: isMultiWorkers = False for jobSpec in jobspec_list: - tmpLog = core_utils.make_logger(_logger, 'PandaID={0} by {1}'.format(jobSpec.PandaID, locked_by), - method_name='update_jobs_workers') + tmpLog = core_utils.make_logger(_logger, "PandaID={0} by {1}".format(jobSpec.PandaID, locked_by), method_name="update_jobs_workers") # check job varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlCJ, varMap) resCJ = self.cur.fetchone() - tmpJobStatus, = resCJ + (tmpJobStatus,) = resCJ # don't update cancelled jobs - if tmpJobStatus == ['cancelled']: + if tmpJobStatus == ["cancelled"]: pass else: # get nWorkers - tmpLog.debug('start') + tmpLog.debug("start") activeWorkers = set() if isMultiWorkers: varMap = dict() - 
varMap[':PandaID'] = jobSpec.PandaID - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + varMap[":PandaID"] = jobSpec.PandaID + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlNW, varMap) resNW = self.cur.fetchall() - for tmpWorkerID, in resNW: + for (tmpWorkerID,) in resNW: activeWorkers.add(tmpWorkerID) jobSpec.nWorkers = len(activeWorkers) # get all LFNs allLFNs = dict() varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':type'] = 'input' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":type"] = "input" self.execute(sqlFL, varMap) resFL = self.cur.fetchall() for tmpLFN, tmpFileID in resFL: @@ -2222,13 +2229,13 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ # insert file if fileSpec.lfn not in allLFNs: if jobSpec.zipPerMB is None or fileSpec.isZip in [0, 1]: - if fileSpec.fileType != 'checkpoint': - fileSpec.status = 'defined' + if fileSpec.fileType != "checkpoint": + fileSpec.status = "defined" jobSpec.hasOutFile = JobSpec.HO_hasOutput else: - fileSpec.status = 'renewed' + fileSpec.status = "renewed" else: - fileSpec.status = 'pending' + fileSpec.status = "pending" varMap = fileSpec.values_list() self.execute(sqlFI, varMap) fileSpec.fileID = self.cur.lastrowid @@ -2239,24 +2246,24 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ # associate to itself if fileSpec.isZip == 1: varMap = dict() - varMap[':status'] = fileSpec.status - varMap[':fileID'] = fileSpec.fileID - varMap[':zipFileID'] = fileSpec.fileID + varMap[":status"] = fileSpec.status + varMap[":fileID"] = fileSpec.fileID + varMap[":zipFileID"] = fileSpec.fileID self.execute(sqlFU, varMap) elif fileSpec.isZip == 1 and fileSpec.eventRangeID is not None: # add a fake file with eventRangeID which has the same lfn/zipFileID as zip file varMap = dict() - varMap[':PandaID'] = fileSpec.PandaID - varMap[':lfn'] = fileSpec.lfn - varMap[':eventRangeID'] = fileSpec.eventRangeID + varMap[":PandaID"] = fileSpec.PandaID + varMap[":lfn"] = fileSpec.lfn + varMap[":eventRangeID"] = fileSpec.eventRangeID self.execute(sqlFE, varMap) resFE = self.cur.fetchone() if resFE is None: if fileSpec.lfn not in zipFileRes: # get file varMap = dict() - varMap[':PandaID'] = fileSpec.PandaID - varMap[':lfn'] = fileSpec.lfn + varMap[":PandaID"] = fileSpec.PandaID + varMap[":lfn"] = fileSpec.lfn self.execute(sqlFC, varMap) resFC = self.cur.fetchone() zipFileRes[fileSpec.lfn] = resFC @@ -2264,83 +2271,81 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ resFC = zipFileRes[fileSpec.lfn] zipFileSpec = FileSpec() zipFileSpec.pack(resFC) - fileSpec.status = 'zipped' + fileSpec.status = "zipped" fileSpec.zipFileID = zipFileSpec.zipFileID varMap = fileSpec.values_list() self.execute(sqlFI, varMap) nFiles += 1 # mapping between event range ID and file ID fileIdMap[fileSpec.eventRangeID] = self.cur.lastrowid - elif fileSpec.fileType == 'checkpoint': + elif fileSpec.fileType == "checkpoint": # reset status of checkpoint to be uploaded again varMap = dict() - varMap[':status'] = 'renewed' - varMap[':fileID'] = allLFNs[fileSpec.lfn] - varMap[':zipFileID'] = None + varMap[":status"] = "renewed" + varMap[":fileID"] = allLFNs[fileSpec.lfn] + varMap[":zipFileID"] = None self.execute(sqlFU, varMap) if nFiles > 0: - tmpLog.debug('inserted {0} files'.format(nFiles)) + 
tmpLog.debug("inserted {0} files".format(nFiles)) # check pending files - if jobSpec.zipPerMB is not None and \ - not (jobSpec.zipPerMB == 0 and jobSpec.subStatus != 'to_transfer'): + if jobSpec.zipPerMB is not None and not (jobSpec.zipPerMB == 0 and jobSpec.subStatus != "to_transfer"): # get workerID and provenanceID of pending files zippedFileIDs = [] varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':status'] = 'pending' - varMap[':type'] = 'input' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":status"] = "pending" + varMap[":type"] = "input" self.execute(sqlPW, varMap) resPW = self.cur.fetchall() for subTotalSize, tmpProvenanceID, tmpWorkerID in resPW: - if jobSpec.subStatus == 'to_transfer' \ - or (jobSpec.zipPerMB > 0 and subTotalSize > jobSpec.zipPerMB * 1024 * 1024) \ - or (tmpWorkerID is not None and tmpWorkerID not in activeWorkers): + if ( + jobSpec.subStatus == "to_transfer" + or (jobSpec.zipPerMB > 0 and subTotalSize > jobSpec.zipPerMB * 1024 * 1024) + or (tmpWorkerID is not None and tmpWorkerID not in activeWorkers) + ): sqlFPx = sqlFP varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':status'] = 'pending' - varMap[':type'] = 'input' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":status"] = "pending" + varMap[":type"] = "input" if tmpProvenanceID is None: - sqlFPx += 'AND provenanceID IS NULL ' + sqlFPx += "AND provenanceID IS NULL " else: - varMap[':provenanceID'] = tmpProvenanceID - sqlFPx += 'AND provenanceID=:provenanceID ' + varMap[":provenanceID"] = tmpProvenanceID + sqlFPx += "AND provenanceID=:provenanceID " if tmpWorkerID is None: - sqlFPx += 'AND workerID IS NULL ' + sqlFPx += "AND workerID IS NULL " else: - varMap[':workerID'] = tmpWorkerID - sqlFPx += 'AND workerID=:workerID' + varMap[":workerID"] = tmpWorkerID + sqlFPx += "AND workerID=:workerID" # get pending files self.execute(sqlFPx, varMap) resFP = self.cur.fetchall() - tmpLog.debug('got {0} pending files for workerID={1} provenanceID={2}'.format( - len(resFP), - tmpWorkerID, - tmpProvenanceID)) + tmpLog.debug("got {0} pending files for workerID={1} provenanceID={2}".format(len(resFP), tmpWorkerID, tmpProvenanceID)) # make subsets subTotalSize = 0 subFileIDs = [] for tmpFileID, tmpFsize, tmpLFN in resFP: - if jobSpec.zipPerMB > 0 and subTotalSize > 0 \ - and (subTotalSize + tmpFsize > jobSpec.zipPerMB * 1024 * 1024): + if jobSpec.zipPerMB > 0 and subTotalSize > 0 and (subTotalSize + tmpFsize > jobSpec.zipPerMB * 1024 * 1024): zippedFileIDs.append(subFileIDs) subFileIDs = [] subTotalSize = 0 subTotalSize += tmpFsize subFileIDs.append((tmpFileID, tmpLFN)) - if (jobSpec.subStatus == 'to_transfer' - or (jobSpec.zipPerMB > 0 and subTotalSize > jobSpec.zipPerMB * 1024 * 1024) - or (tmpWorkerID is not None and tmpWorkerID not in activeWorkers)) \ - and len(subFileIDs) > 0: + if ( + jobSpec.subStatus == "to_transfer" + or (jobSpec.zipPerMB > 0 and subTotalSize > jobSpec.zipPerMB * 1024 * 1024) + or (tmpWorkerID is not None and tmpWorkerID not in activeWorkers) + ) and len(subFileIDs) > 0: zippedFileIDs.append(subFileIDs) # make zip files for subFileIDs in zippedFileIDs: # insert zip file fileSpec = FileSpec() - fileSpec.status = 'zipping' - fileSpec.lfn = 'panda.' + subFileIDs[0][-1] + '.zip' - fileSpec.scope = 'panda' - fileSpec.fileType = 'zip_output' + fileSpec.status = "zipping" + fileSpec.lfn = "panda." 
+ subFileIDs[0][-1] + ".zip" + fileSpec.scope = "panda" + fileSpec.fileType = "zip_output" fileSpec.PandaID = jobSpec.PandaID fileSpec.taskID = jobSpec.taskID fileSpec.isZip = 1 @@ -2350,9 +2355,9 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ varMaps = [] for tmpFileID, tmpLFN in subFileIDs: varMap = dict() - varMap[':status'] = 'zipped' - varMap[':fileID'] = tmpFileID - varMap[':zipFileID'] = self.cur.lastrowid + varMap[":status"] = "zipped" + varMap[":fileID"] = tmpFileID + varMap[":zipFileID"] = self.cur.lastrowid varMaps.append(varMap) self.executemany(sqlFU, varMaps) # set zip output flag @@ -2365,17 +2370,17 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ if len(jobSpec.events) > 0: # get event ranges varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlEC, varMap) resEC = self.cur.fetchall() for tmpEventRangeID, tmpEventStatus in resEC: - if tmpEventStatus in ['running']: + if tmpEventStatus in ["running"]: eventRangesSet.add(tmpEventRangeID) else: doneEventRangesSet.add(tmpEventRangeID) # check associated file varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlEF, varMap) resEF = self.cur.fetchall() for tmpEventRangeID, tmpStat in resEF: @@ -2388,16 +2393,15 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ if eventSpec.eventRangeID in doneEventRangesSet: continue # set subStatus - if eventSpec.eventStatus == 'finished': + if eventSpec.eventStatus == "finished": # check associated file - if eventSpec.eventRangeID not in eventFileStat or \ - eventFileStat[eventSpec.eventRangeID] == 'finished': - eventSpec.subStatus = 'finished' - elif eventFileStat[eventSpec.eventRangeID] == 'failed': - eventSpec.eventStatus = 'failed' - eventSpec.subStatus = 'failed' + if eventSpec.eventRangeID not in eventFileStat or eventFileStat[eventSpec.eventRangeID] == "finished": + eventSpec.subStatus = "finished" + elif eventFileStat[eventSpec.eventRangeID] == "failed": + eventSpec.eventStatus = "failed" + eventSpec.subStatus = "failed" else: - eventSpec.subStatus = 'transferring' + eventSpec.subStatus = "transferring" else: eventSpec.subStatus = eventSpec.eventStatus # set fileID @@ -2409,40 +2413,39 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ varMapsEI.append(varMap) else: varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':eventRangeID'] = eventSpec.eventRangeID - varMap[':eventStatus'] = eventSpec.eventStatus - varMap[':subStatus'] = eventSpec.subStatus + varMap[":PandaID"] = jobSpec.PandaID + varMap[":eventRangeID"] = eventSpec.eventRangeID + varMap[":eventStatus"] = eventSpec.eventStatus + varMap[":subStatus"] = eventSpec.subStatus varMapsEU.append(varMap) if len(varMapsEI) > 0: self.executemany(sqlEI, varMapsEI) - tmpLog.debug('inserted {0} event'.format(len(varMapsEI))) + tmpLog.debug("inserted {0} event".format(len(varMapsEI))) if len(varMapsEU) > 0: self.executemany(sqlEU, varMapsEU) - tmpLog.debug('updated {0} event'.format(len(varMapsEU))) + tmpLog.debug("updated {0} event".format(len(varMapsEU))) # update job varMap = jobSpec.values_map(only_changed=True) if len(varMap) > 0: - tmpLog.debug('update job') + tmpLog.debug("update job") # sql to update job sqlJ = "UPDATE {0} SET {1} ".format(jobTableName, jobSpec.bind_update_changes_expression()) sqlJ += "WHERE PandaID=:PandaID " jobSpec.lockedBy = None 
jobSpec.modificationTime = timeNow varMap = jobSpec.values_map(only_changed=True) - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlJ, varMap) nRow = self.cur.rowcount - tmpLog.debug('done with {0}'.format(nRow)) - tmpLog.debug('all done for job') + tmpLog.debug("done with {0}".format(nRow)) + tmpLog.debug("all done for job") # commit self.commit() # update worker retVal = True for idxW, workSpec in enumerate(workspec_list): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workSpec.workerID), - method_name='update_jobs_workers') - tmpLog.debug('update worker') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workSpec.workerID), method_name="update_jobs_workers") + tmpLog.debug("update worker") workSpec.lockedBy = None if workSpec.status == WorkSpec.ST_running and workSpec.startTime is None: workSpec.startTime = timeNow @@ -2462,15 +2465,15 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ sqlW += "AND (status NOT IN (:st1,:st2,:st3,:st4)) " varMap = workSpec.values_map(only_changed=True) if len(varMap) > 0: - varMap[':workerID'] = workSpec.workerID - varMap[':cr_lockedBy'] = locked_by - varMap[':st1'] = WorkSpec.ST_cancelled - varMap[':st2'] = WorkSpec.ST_finished - varMap[':st3'] = WorkSpec.ST_failed - varMap[':st4'] = WorkSpec.ST_missed + varMap[":workerID"] = workSpec.workerID + varMap[":cr_lockedBy"] = locked_by + varMap[":st1"] = WorkSpec.ST_cancelled + varMap[":st2"] = WorkSpec.ST_finished + varMap[":st3"] = WorkSpec.ST_failed + varMap[":st4"] = WorkSpec.ST_missed self.execute(sqlW, varMap) nRow = self.cur.rowcount - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) if nRow == 0: retVal = False # insert relationship if necessary @@ -2478,8 +2481,8 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ varMapsIR = [] for pandaID in panda_ids_list[idxW]: varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':workerID'] = workSpec.workerID + varMap[":PandaID"] = pandaID + varMap[":workerID"] = workSpec.workerID self.execute(sqlCR, varMap) resCR = self.cur.fetchone() if resCR is None: @@ -2490,7 +2493,7 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ varMapsIR.append(varMap) if len(varMapsIR) > 0: self.executemany(sqlIR, varMapsIR) - tmpLog.debug('all done for worker') + tmpLog.debug("all done for worker") # commit self.commit() # return @@ -2507,9 +2510,8 @@ def update_jobs_workers(self, jobspec_list, workspec_list, locked_by, panda_ids_ def get_jobs_with_worker_id(self, worker_id, locked_by, with_file=False, only_running=False, slim=False): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(worker_id), - method_name='get_jobs_with_worker_id') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(worker_id), method_name="get_jobs_with_worker_id") + tmpLog.debug("start") # sql to get PandaIDs sqlP = "SELECT PandaID FROM {0} ".format(jobWorkerTableName) sqlP += "WHERE workerID=:workerID " @@ -2529,41 +2531,41 @@ def get_jobs_with_worker_id(self, worker_id, locked_by, with_file=False, only_ru jobChunkList = [] timeNow = datetime.datetime.utcnow() varMap = dict() - varMap[':workerID'] = worker_id + varMap[":workerID"] = worker_id self.execute(sqlP, varMap) resW = self.cur.fetchall() - for pandaID, in resW: + for (pandaID,) in resW: # get job varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = 
pandaID self.execute(sqlJ, varMap) resJ = self.cur.fetchone() # make job jobSpec = JobSpec() jobSpec.pack(resJ, slim=slim) - if only_running and jobSpec.subStatus not in ['running', 'submitted', 'queued', 'idle']: + if only_running and jobSpec.subStatus not in ["running", "submitted", "queued", "idle"]: continue jobSpec.lockedBy = locked_by # for old jobs without extractions if jobSpec.jobParamsExtForLog is None: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJJ, varMap) resJJ = self.cur.fetchone() - jobSpec.set_blob_attribute('jobParams', resJJ[0]) + jobSpec.set_blob_attribute("jobParams", resJJ[0]) jobSpec.get_output_file_attributes() jobSpec.get_logfile_info() # lock job if locked_by is not None: varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':lockedBy'] = locked_by - varMap[':timeNow'] = timeNow + varMap[":PandaID"] = pandaID + varMap[":lockedBy"] = locked_by + varMap[":timeNow"] = timeNow self.execute(sqlL, varMap) # get files if with_file: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlF, varMap) resFileList = self.cur.fetchall() for resFile in resFileList: @@ -2574,7 +2576,7 @@ def get_jobs_with_worker_id(self, worker_id, locked_by, with_file=False, only_ru jobChunkList.append(jobSpec) # commit self.commit() - tmpLog.debug('got {0} job chunks'.format(len(jobChunkList))) + tmpLog.debug("got {0} job chunks".format(len(jobChunkList))) return jobChunkList except Exception: # roll back @@ -2588,9 +2590,8 @@ def get_jobs_with_worker_id(self, worker_id, locked_by, with_file=False, only_ru def get_ready_workers(self, queue_name, n_ready): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'queue={0}'.format(queue_name), - method_name='get_ready_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "queue={0}".format(queue_name), method_name="get_ready_workers") + tmpLog.debug("start") # sql to get workers sqlG = "SELECT {0} FROM {1} ".format(WorkSpec.column_names(), workTableName) sqlG += "WHERE computingSite=:queueName AND (status=:status_ready OR (status=:status_running " @@ -2601,9 +2602,9 @@ def get_ready_workers(self, queue_name, n_ready): sqlP += "WHERE workerID=:workerID " # get workers varMap = dict() - varMap[':status_ready'] = WorkSpec.ST_ready - varMap[':status_running'] = WorkSpec.ST_running - varMap[':queueName'] = queue_name + varMap[":status_ready"] = WorkSpec.ST_ready + varMap[":status_running"] = WorkSpec.ST_running + varMap[":queueName"] = queue_name self.execute(sqlG, varMap) resList = self.cur.fetchall() retVal = [] @@ -2612,7 +2613,7 @@ def get_ready_workers(self, queue_name, n_ready): workSpec.pack(res) # get number of jobs varMap = dict() - varMap[':workerID'] = workSpec.workerID + varMap[":workerID"] = workSpec.workerID self.execute(sqlP, varMap) resP = self.cur.fetchone() if resP is not None and resP[0] > 0: @@ -2620,7 +2621,7 @@ def get_ready_workers(self, queue_name, n_ready): retVal.append(workSpec) # commit self.commit() - tmpLog.debug('got {0}'.format(str(retVal))) + tmpLog.debug("got {0}".format(str(retVal))) return retVal except Exception: # roll back @@ -2634,22 +2635,21 @@ def get_ready_workers(self, queue_name, n_ready): def get_worker_with_id(self, worker_id): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(worker_id), - method_name='get_worker_with_id') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(worker_id), method_name="get_worker_with_id") 
+ tmpLog.debug("start") # sql to get a worker sqlG = "SELECT {0} FROM {1} ".format(WorkSpec.column_names(), workTableName) sqlG += "WHERE workerID=:workerID " # get a worker varMap = dict() - varMap[':workerID'] = worker_id + varMap[":workerID"] = worker_id self.execute(sqlG, varMap) res = self.cur.fetchone() workSpec = WorkSpec() workSpec.pack(res) # commit self.commit() - tmpLog.debug('got') + tmpLog.debug("got") return workSpec except Exception: # roll back @@ -2660,14 +2660,22 @@ def get_worker_with_id(self, worker_id): return None # get jobs to trigger or check output transfer or zip output - def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_lock, locked_by, - sub_status, has_out_file_flag, bad_has_out_file_flag_list=None, - max_files_per_job=None): + def get_jobs_for_stage_out( + self, + max_jobs, + interval_without_lock, + interval_with_lock, + locked_by, + sub_status, + has_out_file_flag, + bad_has_out_file_flag_list=None, + max_files_per_job=None, + ): try: # get logger - msgPfx = 'thr={0}'.format(locked_by) - tmpLog = core_utils.make_logger(_logger, msgPfx, method_name='get_jobs_for_stage_out') - tmpLog.debug('start') + msgPfx = "thr={0}".format(locked_by) + tmpLog = core_utils.make_logger(_logger, msgPfx, method_name="get_jobs_for_stage_out") + tmpLog.debug("start") # sql to get PandaIDs without FOR UPDATE which causes deadlock in MariaDB sql = "SELECT PandaID FROM {0} ".format(jobTableName) sql += "WHERE " @@ -2675,7 +2683,7 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ if bad_has_out_file_flag_list is not None: sql += "AND (hasOutFile IS NULL OR hasOutFile NOT IN (" for badFlag in bad_has_out_file_flag_list: - tmpKey = ':badHasOutFile{0}'.format(badFlag) + tmpKey = ":badHasOutFile{0}".format(badFlag) sql += "{0},".format(tmpKey) sql = sql[:-1] sql += ")) " @@ -2692,7 +2700,7 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ if bad_has_out_file_flag_list is not None: sqlL += "AND (hasOutFile IS NULL OR hasOutFile NOT IN (" for badFlag in bad_has_out_file_flag_list: - tmpKey = ':badHasOutFile{0}'.format(badFlag) + tmpKey = ":badHasOutFile{0}".format(badFlag) sqlL += "{0},".format(tmpKey) sqlL = sqlL[:-1] sqlL += ")) " @@ -2722,30 +2730,30 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ lockTimeLimit = timeNow - datetime.timedelta(seconds=interval_with_lock) updateTimeLimit = timeNow - datetime.timedelta(seconds=interval_without_lock) varMap = dict() - varMap[':subStatus'] = sub_status - varMap[':hasOutFile'] = has_out_file_flag + varMap[":subStatus"] = sub_status + varMap[":hasOutFile"] = has_out_file_flag if bad_has_out_file_flag_list is not None: for badFlag in bad_has_out_file_flag_list: - tmpKey = ':badHasOutFile{0}'.format(badFlag) + tmpKey = ":badHasOutFile{0}".format(badFlag) varMap[tmpKey] = badFlag - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':updateTimeLimit'] = updateTimeLimit + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":updateTimeLimit"] = updateTimeLimit self.execute(sql, varMap) resList = self.cur.fetchall() jobSpecList = [] - for pandaID, in resList: + for (pandaID,) in resList: # lock job varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':timeNow'] = timeNow - varMap[':lockedBy'] = locked_by - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':updateTimeLimit'] = updateTimeLimit - varMap[':subStatus'] = sub_status - varMap[':hasOutFile'] = has_out_file_flag + varMap[":PandaID"] = pandaID + 
varMap[":timeNow"] = timeNow + varMap[":lockedBy"] = locked_by + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":updateTimeLimit"] = updateTimeLimit + varMap[":subStatus"] = sub_status + varMap[":hasOutFile"] = has_out_file_flag if bad_has_out_file_flag_list is not None: for badFlag in bad_has_out_file_flag_list: - tmpKey = ':badHasOutFile{0}'.format(badFlag) + tmpKey = ":badHasOutFile{0}".format(badFlag) varMap[tmpKey] = badFlag self.execute(sqlL, varMap) nRow = self.cur.rowcount @@ -2754,7 +2762,7 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ if nRow > 0: # get job varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJ, varMap) resJ = self.cur.fetchone() # make job @@ -2765,26 +2773,26 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ # for old jobs without extractions if jobSpec.jobParamsExtForLog is None: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJJ, varMap) resJJ = self.cur.fetchone() - jobSpec.set_blob_attribute('jobParams', resJJ[0]) + jobSpec.set_blob_attribute("jobParams", resJJ[0]) jobSpec.get_output_file_attributes() jobSpec.get_logfile_info() # get files varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID - varMap[':type1'] = 'input' - varMap[':type2'] = FileSpec.AUX_INPUT - varMap[':type3'] = 'checkpoint' + varMap[":PandaID"] = jobSpec.PandaID + varMap[":type1"] = "input" + varMap[":type2"] = FileSpec.AUX_INPUT + varMap[":type3"] = "checkpoint" if has_out_file_flag == JobSpec.HO_hasOutput: - varMap[':status'] = 'defined' + varMap[":status"] = "defined" elif has_out_file_flag == JobSpec.HO_hasZipOutput: - varMap[':status'] = 'zipping' + varMap[":status"] = "zipping" elif has_out_file_flag == JobSpec.HO_hasPostZipOutput: - varMap[':status'] = 'post_zipping' + varMap[":status"] = "post_zipping" else: - varMap[':status'] = 'transferring' + varMap[":status"] = "transferring" self.execute(sqlF, varMap) resFileList = self.cur.fetchall() for resFile in resFileList: @@ -2794,7 +2802,7 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ jobSpec.add_out_file(fileSpec) # increment attempt number varMap = dict() - varMap[':fileID'] = fileSpec.fileID + varMap[":fileID"] = fileSpec.fileID self.execute(sqlFU, varMap) jobSpecList.append(jobSpec) # commit @@ -2804,11 +2812,11 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ if has_out_file_flag in [JobSpec.HO_hasZipOutput, JobSpec.HO_hasPostZipOutput]: for fileSpec in jobSpec.outFiles: varMap = dict() - varMap[':PandaID'] = fileSpec.PandaID - varMap[':zipFileID'] = fileSpec.fileID - varMap[':type1'] = 'input' - varMap[':type2'] = FileSpec.AUX_INPUT - varMap[':type3'] = 'checkpoint' + varMap[":PandaID"] = fileSpec.PandaID + varMap[":zipFileID"] = fileSpec.fileID + varMap[":type1"] = "input" + varMap[":type2"] = FileSpec.AUX_INPUT + varMap[":type3"] = "checkpoint" self.execute(sqlAF, varMap) resAFs = self.cur.fetchall() for resAF in resAFs: @@ -2818,7 +2826,7 @@ def get_jobs_for_stage_out(self, max_jobs, interval_without_lock, interval_with_ # get associated workers tmpWorkers = self.get_workers_with_job_id(jobSpec.PandaID, use_commit=False) jobSpec.add_workspec_list(tmpWorkers) - tmpLog.debug('got {0} jobs'.format(len(jobSpecList))) + tmpLog.debug("got {0} jobs".format(len(jobSpecList))) return jobSpecList except Exception: # roll back @@ -2832,12 +2840,10 @@ def get_jobs_for_stage_out(self, max_jobs, 
interval_without_lock, interval_with_ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, - 'PandaID={0} subStatus={1} thr={2}'.format(jobspec.PandaID, - jobspec.subStatus, - locked_by), - method_name='update_job_for_stage_out') - tmpLog.debug('start') + tmpLog = core_utils.make_logger( + _logger, "PandaID={0} subStatus={1} thr={2}".format(jobspec.PandaID, jobspec.subStatus, locked_by), method_name="update_job_for_stage_out" + ) + tmpLog.debug("start") # sql to update event sqlEU = "UPDATE {0} ".format(eventTableName) sqlEU += "SET eventStatus=:eventStatus,subStatus=:subStatus " @@ -2858,15 +2864,15 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): sqlLC += "WHERE PandaID=:PandaID " # lock varMap = dict() - varMap[':PandaID'] = jobspec.PandaID - varMap[':lockedBy'] = locked_by - varMap[':timeNow'] = datetime.datetime.utcnow() + varMap[":PandaID"] = jobspec.PandaID + varMap[":lockedBy"] = locked_by + varMap[":timeNow"] = datetime.datetime.utcnow() self.execute(sqlLJ, varMap) nRow = self.cur.rowcount # check just in case since nRow can be 0 if two lock actions are too close in time if nRow == 0: varMap = dict() - varMap[':PandaID'] = jobspec.PandaID + varMap[":PandaID"] = jobspec.PandaID self.execute(sqlLC, varMap) resLC = self.cur.fetchone() if resLC is not None and resLC[0] == locked_by: @@ -2874,10 +2880,10 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): # commit self.commit() if nRow == 0: - tmpLog.debug('skip since locked by another') + tmpLog.debug("skip since locked by another") return None # update files - tmpLog.debug('update {0} files'.format(len(jobspec.outFiles))) + tmpLog.debug("update {0} files".format(len(jobspec.outFiles))) for fileSpec in jobspec.outFiles: # sql to update file sqlF = "UPDATE {0} SET {1} ".format(fileTableName, fileSpec.bind_update_changes_expression()) @@ -2885,45 +2891,45 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): varMap = fileSpec.values_map(only_changed=True) updated = False if len(varMap) > 0: - varMap[':PandaID'] = fileSpec.PandaID - varMap[':fileID'] = fileSpec.fileID + varMap[":PandaID"] = fileSpec.PandaID + varMap[":fileID"] = fileSpec.fileID self.execute(sqlF, varMap) updated = True # update event status if update_event_status: if fileSpec.eventRangeID is not None: varMap = dict() - varMap[':eventRangeID'] = fileSpec.eventRangeID - varMap[':eventStatus'] = fileSpec.status - varMap[':subStatus'] = fileSpec.status - varMap[':statusFailed'] = 'failed' - varMap[':statusDone'] = 'done' + varMap[":eventRangeID"] = fileSpec.eventRangeID + varMap[":eventStatus"] = fileSpec.status + varMap[":subStatus"] = fileSpec.status + varMap[":statusFailed"] = "failed" + varMap[":statusDone"] = "done" self.execute(sqlEU, varMap) updated = True if fileSpec.isZip == 1: # update files associated with zip file varMap = dict() - varMap[':PandaID'] = fileSpec.PandaID - varMap[':zipFileID'] = fileSpec.fileID + varMap[":PandaID"] = fileSpec.PandaID + varMap[":zipFileID"] = fileSpec.fileID self.execute(sqlAE1, varMap) resAE1 = self.cur.fetchall() - for eventRangeID, in resAE1: + for (eventRangeID,) in resAE1: varMap = dict() - varMap[':eventRangeID'] = eventRangeID - varMap[':eventStatus'] = fileSpec.status - varMap[':subStatus'] = fileSpec.status - varMap[':statusFailed'] = 'failed' - varMap[':statusDone'] = 'done' + varMap[":eventRangeID"] = eventRangeID + varMap[":eventStatus"] = 
fileSpec.status + varMap[":subStatus"] = fileSpec.status + varMap[":statusFailed"] = "failed" + varMap[":statusDone"] = "done" self.execute(sqlAE, varMap) updated = True nRow = self.cur.rowcount - tmpLog.debug('updated {0} events'.format(nRow)) + tmpLog.debug("updated {0} events".format(nRow)) if updated: # lock job again varMap = dict() - varMap[':PandaID'] = jobspec.PandaID - varMap[':lockedBy'] = locked_by - varMap[':timeNow'] = datetime.datetime.utcnow() + varMap[":PandaID"] = jobspec.PandaID + varMap[":lockedBy"] = locked_by + varMap[":timeNow"] = datetime.datetime.utcnow() self.execute(sqlLJ, varMap) # commit self.commit() @@ -2931,19 +2937,19 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): # check just in case since nRow can be 0 if two lock actions are too close in time if nRow == 0: varMap = dict() - varMap[':PandaID'] = jobspec.PandaID + varMap[":PandaID"] = jobspec.PandaID self.execute(sqlLC, varMap) resLC = self.cur.fetchone() if resLC is not None and resLC[0] == locked_by: nRow = 1 if nRow == 0: - tmpLog.debug('skip since locked by another') + tmpLog.debug("skip since locked by another") return None # count files sqlC = "SELECT COUNT(*) cnt,status FROM {0} ".format(fileTableName) sqlC += "WHERE PandaID=:PandaID GROUP BY status " varMap = dict() - varMap[':PandaID'] = jobspec.PandaID + varMap[":PandaID"] = jobspec.PandaID self.execute(sqlC, varMap) resC = self.cur.fetchall() cntMap = {} @@ -2951,40 +2957,39 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): cntMap[fileStatus] = cnt # set job attributes jobspec.stagerLock = None - if 'zipping' in cntMap: + if "zipping" in cntMap: jobspec.hasOutFile = JobSpec.HO_hasZipOutput - elif 'post_zipping' in cntMap: + elif "post_zipping" in cntMap: jobspec.hasOutFile = JobSpec.HO_hasPostZipOutput - elif 'defined' in cntMap: + elif "defined" in cntMap: jobspec.hasOutFile = JobSpec.HO_hasOutput - elif 'transferring' in cntMap: + elif "transferring" in cntMap: jobspec.hasOutFile = JobSpec.HO_hasTransfer else: jobspec.hasOutFile = JobSpec.HO_noOutput - if jobspec.subStatus == 'to_transfer': + if jobspec.subStatus == "to_transfer": # change subStatus when no more files to trigger transfer - if jobspec.hasOutFile not in \ - [JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput, JobSpec.HO_hasPostZipOutput]: - jobspec.subStatus = 'transferring' + if jobspec.hasOutFile not in [JobSpec.HO_hasOutput, JobSpec.HO_hasZipOutput, JobSpec.HO_hasPostZipOutput]: + jobspec.subStatus = "transferring" jobspec.stagerTime = None - elif jobspec.subStatus == 'transferring': + elif jobspec.subStatus == "transferring": # all done if jobspec.hasOutFile == JobSpec.HO_noOutput: jobspec.trigger_propagation() - if 'failed' in cntMap: - jobspec.status = 'failed' - jobspec.subStatus = 'failed_to_stage_out' + if "failed" in cntMap: + jobspec.status = "failed" + jobspec.subStatus = "failed_to_stage_out" else: - jobspec.subStatus = 'staged' + jobspec.subStatus = "staged" # get finished files jobspec.reset_out_file() sqlFF = "SELECT {0} FROM {1} ".format(FileSpec.column_names(), fileTableName) sqlFF += "WHERE PandaID=:PandaID AND status=:status AND fileType IN (:type1,:type2) " varMap = dict() - varMap[':PandaID'] = jobspec.PandaID - varMap[':status'] = 'finished' - varMap[':type1'] = 'output' - varMap[':type2'] = 'log' + varMap[":PandaID"] = jobspec.PandaID + varMap[":status"] = "finished" + varMap[":type1"] = "output" + varMap[":type2"] = "log" self.execute(sqlFF, varMap) resFileList = self.cur.fetchall() for resFile in 
resFileList: @@ -2998,12 +3003,12 @@ def update_job_for_stage_out(self, jobspec, update_event_status, locked_by): sqlJ += "WHERE PandaID=:PandaID AND stagerLock=:lockedBy " # update job varMap = jobspec.values_map(only_changed=True) - varMap[':PandaID'] = jobspec.PandaID - varMap[':lockedBy'] = locked_by + varMap[":PandaID"] = jobspec.PandaID + varMap[":lockedBy"] = locked_by self.execute(sqlJ, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") # return return jobspec.subStatus except Exception: @@ -3020,7 +3025,7 @@ def add_seq_number(self, number_name, init_value): # check if already there sqlC = "SELECT curVal FROM {0} WHERE numberName=:numberName ".format(seqNumberTableName) varMap = dict() - varMap[':numberName'] = number_name + varMap[":numberName"] = number_name self.execute(sqlC, varMap) res = self.cur.fetchone() # insert if missing @@ -3049,22 +3054,21 @@ def add_seq_number(self, number_name, init_value): def get_next_seq_number(self, number_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'name={0}'.format(number_name), - method_name='get_next_seq_number') + tmpLog = core_utils.make_logger(_logger, "name={0}".format(number_name), method_name="get_next_seq_number") # increment sqlU = "UPDATE {0} SET curVal=curVal+1 WHERE numberName=:numberName ".format(seqNumberTableName) varMap = dict() - varMap[':numberName'] = number_name + varMap[":numberName"] = number_name self.execute(sqlU, varMap) # get sqlG = "SELECT curVal FROM {0} WHERE numberName=:numberName ".format(seqNumberTableName) varMap = dict() - varMap[':numberName'] = number_name + varMap[":numberName"] = number_name self.execute(sqlG, varMap) - retVal, = self.cur.fetchone() + (retVal,) = self.cur.fetchone() # commit self.commit() - tmpLog.debug('got {0}'.format(retVal)) + tmpLog.debug("got {0}".format(retVal)) return retVal except Exception: # roll back @@ -3078,8 +3082,7 @@ def get_next_seq_number(self, number_name): def get_cache_last_update_time(self, main_key, sub_key): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'mainKey={0} subKey={1}'.format(main_key, sub_key), - method_name='get_cache_last_update_time') + tmpLog = core_utils.make_logger(_logger, "mainKey={0} subKey={1}".format(main_key, sub_key), method_name="get_cache_last_update_time") # get varMap = dict() varMap[":mainKey"] = main_key @@ -3090,10 +3093,10 @@ def get_cache_last_update_time(self, main_key, sub_key): self.execute(sqlU, varMap) retVal = self.cur.fetchone() if retVal is not None: - retVal, = retVal + (retVal,) = retVal # commit self.commit() - tmpLog.debug('got {0}'.format(retVal)) + tmpLog.debug("got {0}".format(retVal)) return retVal except Exception: # roll back @@ -3107,8 +3110,7 @@ def get_cache_last_update_time(self, main_key, sub_key): def refresh_cache(self, main_key, sub_key, new_info): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'mainKey={0} subKey={1}'.format(main_key, sub_key), - method_name='refresh_cache') + tmpLog = core_utils.make_logger(_logger, "mainKey={0} subKey={1}".format(main_key, sub_key), method_name="refresh_cache") # make spec cacheSpec = CacheSpec() cacheSpec.lastUpdate = datetime.datetime.utcnow() @@ -3142,12 +3144,12 @@ def refresh_cache(self, main_key, sub_key, new_info): # commit self.commit() # put into global dict - cacheKey = 'cache|{0}|{1}'.format(main_key, sub_key) + cacheKey = "cache|{0}|{1}".format(main_key, sub_key) globalDict = core_utils.get_global_dict() globalDict.acquire() globalDict[cacheKey] = cacheSpec.data 
globalDict.release() - tmpLog.debug('refreshed') + tmpLog.debug("refreshed") return True except Exception: # roll back @@ -3162,11 +3164,10 @@ def get_cache(self, main_key, sub_key=None, from_local_cache=True): useDB = False try: # get logger - tmpLog = core_utils.make_logger(_logger, 'mainKey={0} subKey={1}'.format(main_key, sub_key), - method_name='get_cache') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "mainKey={0} subKey={1}".format(main_key, sub_key), method_name="get_cache") + tmpLog.debug("start") # get from global dict - cacheKey = 'cache|{0}|{1}'.format(main_key, sub_key) + cacheKey = "cache|{0}|{1}".format(main_key, sub_key) globalDict = core_utils.get_global_dict() # lock dict globalDict.acquire() @@ -3202,7 +3203,7 @@ def get_cache(self, main_key, sub_key=None, from_local_cache=True): globalDict[cacheKey] = cacheSpec.data # release dict globalDict.release() - tmpLog.debug('done') + tmpLog.debug("done") # return return cacheSpec except Exception: @@ -3217,8 +3218,8 @@ def get_cache(self, main_key, sub_key=None, from_local_cache=True): # store commands def store_commands(self, command_specs): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='store_commands') - tmpLog.debug('{0} commands'.format(len(command_specs))) + tmpLog = core_utils.make_logger(_logger, method_name="store_commands") + tmpLog.debug("{0} commands".format(len(command_specs))) if not command_specs: return True try: @@ -3248,17 +3249,17 @@ def store_commands(self, command_specs): def get_commands_for_receiver(self, receiver, command_pattern=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_commands_for_receiver') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_commands_for_receiver") + tmpLog.debug("start") # sql to get commands varMap = dict() - varMap[':receiver'] = receiver - varMap[':processed'] = 0 + varMap[":receiver"] = receiver + varMap[":processed"] = 0 sqlG = "SELECT {0} FROM {1} ".format(CommandSpec.column_names(), commandTableName) sqlG += "WHERE receiver=:receiver AND processed=:processed " if command_pattern is not None: - varMap[':command'] = command_pattern - if '%' in command_pattern: + varMap[":command"] = command_pattern + if "%" in command_pattern: sqlG += "AND command LIKE :command " else: sqlG += "AND command=:command " @@ -3273,14 +3274,14 @@ def get_commands_for_receiver(self, receiver, command_pattern=None): commandSpec.pack(res) # lock varMap = dict() - varMap[':command_id'] = commandSpec.command_id - varMap[':processed'] = 1 + varMap[":command_id"] = commandSpec.command_id + varMap[":processed"] = 1 self.execute(sqlL, varMap) # append commandSpecList.append(commandSpec) # commit self.commit() - tmpLog.debug('got {0} commands'.format(len(commandSpecList))) + tmpLog.debug("got {0} commands".format(len(commandSpecList))) return commandSpecList except Exception: # dump error @@ -3292,17 +3293,19 @@ def get_commands_for_receiver(self, receiver, command_pattern=None): def get_commands_ack(self): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_commands_ack') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_commands_ack") + tmpLog.debug("start") # sql to get commands that have been processed and need acknowledgement sql = """ SELECT command_id FROM {0} WHERE ack_requested=1 AND processed=1 - """.format(commandTableName) + """.format( + commandTableName + ) self.execute(sql) command_ids = [row[0] for row in 
self.cur.fetchall()] - tmpLog.debug('command_ids {0}'.format(command_ids)) + tmpLog.debug("command_ids {0}".format(command_ids)) return command_ids except Exception: # dump error @@ -3315,15 +3318,17 @@ def clean_commands_by_id(self, commands_ids): Deletes the commands specified in a list of IDs """ # get logger - tmpLog = core_utils.make_logger(_logger, method_name='clean_commands_by_id') + tmpLog = core_utils.make_logger(_logger, method_name="clean_commands_by_id") try: # sql to delete a specific command sql = """ DELETE FROM {0} - WHERE command_id=:command_id""".format(commandTableName) + WHERE command_id=:command_id""".format( + commandTableName + ) for command_id in commands_ids: - var_map = {':command_id': command_id} + var_map = {":command_id": command_id} self.execute(sql, var_map) self.commit() return True @@ -3336,13 +3341,15 @@ def clean_processed_commands(self): """ Deletes the commands that have been processed and do not need acknowledgement """ - tmpLog = core_utils.make_logger(_logger, method_name='clean_processed_commands') + tmpLog = core_utils.make_logger(_logger, method_name="clean_processed_commands") try: # sql to delete all processed commands that do not need an ACK sql = """ DELETE FROM {0} WHERE (ack_requested=0 AND processed=1) - """.format(commandTableName) + """.format( + commandTableName + ) self.execute(sql) self.commit() return True @@ -3355,8 +3362,8 @@ def clean_processed_commands(self): def get_workers_to_kill(self, max_workers, check_interval): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_to_kill') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_to_kill") + tmpLog.debug("start") # sql to get worker IDs sqlW = "SELECT workerID,status,configID FROM {0} ".format(workTableName) sqlW += "WHERE killTime IS NOT NULL AND killTime<:checkTimeLimit " @@ -3372,7 +3379,7 @@ def get_workers_to_kill(self, max_workers, check_interval): timeLimit = timeNow - datetime.timedelta(seconds=check_interval) # get workerIDs varMap = dict() - varMap[':checkTimeLimit'] = timeLimit + varMap[":checkTimeLimit"] = timeLimit self.execute(sqlW, varMap) resW = self.cur.fetchall() retVal = dict() @@ -3382,20 +3389,20 @@ def get_workers_to_kill(self, max_workers, check_interval): configID = None # lock or release worker varMap = dict() - varMap[':workerID'] = workerID - varMap[':checkTimeLimit'] = timeLimit + varMap[":workerID"] = workerID + varMap[":checkTimeLimit"] = timeLimit if workerStatus in (WorkSpec.ST_cancelled, WorkSpec.ST_failed, WorkSpec.ST_finished): # release - varMap[':setTime'] = None + varMap[":setTime"] = None else: # lock - varMap[':setTime'] = timeNow + varMap[":setTime"] = timeNow self.execute(sqlL, varMap) # get worker nRow = self.cur.rowcount - if nRow == 1 and varMap[':setTime'] is not None: + if nRow == 1 and varMap[":setTime"] is not None: varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -3406,7 +3413,7 @@ def get_workers_to_kill(self, max_workers, check_interval): retVal[queueName][configID].append(workSpec) # commit self.commit() - tmpLog.debug('got {0} workers'.format(len(retVal))) + tmpLog.debug("got {0} workers".format(len(retVal))) return retVal except Exception: # roll back @@ -3420,23 +3427,21 @@ def get_workers_to_kill(self, max_workers, check_interval): def get_worker_stats(self, site_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, 
method_name='get_worker_stats') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_worker_stats") + tmpLog.debug("start") # sql to get nQueueLimit sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) sqlQ += "WHERE siteName=:siteName " # get nQueueLimit varMap = dict() - varMap[':siteName'] = site_name + varMap[":siteName"] = site_name self.execute(sqlQ, varMap) resQ = self.cur.fetchall() retMap = dict() for computingSite, jobType, resourceType, nNewWorkers in resQ: retMap.setdefault(jobType, {}) if resourceType not in retMap[jobType]: - retMap[jobType][resourceType] = {'running': 0, - 'submitted': 0, - 'to_submit': nNewWorkers} + retMap[jobType][resourceType] = {"running": 0, "submitted": 0, "to_submit": nNewWorkers} # get worker stats sqlW = "SELECT wt.status, wt.computingSite, pq.jobType, pq.resourceType, COUNT(*) cnt " @@ -3445,22 +3450,19 @@ def get_worker_stats(self, site_name): sqlW += "GROUP BY wt.status, wt.computingSite, pq.jobType, pq.resourceType " # get worker stats varMap = dict() - varMap[':siteName'] = site_name - varMap[':st1'] = 'running' - varMap[':st2'] = 'submitted' + varMap[":siteName"] = site_name + varMap[":st1"] = "running" + varMap[":st2"] = "submitted" self.execute(sqlW, varMap) resW = self.cur.fetchall() for workerStatus, computingSite, jobType, resourceType, cnt in resW: retMap.setdefault(jobType, {}) if resourceType not in retMap: - retMap[jobType][resourceType] = {'running': 0, - 'submitted': 0, - 'to_submit': 0 - } + retMap[jobType][resourceType] = {"running": 0, "submitted": 0, "to_submit": 0} retMap[jobType][resourceType][workerStatus] = cnt # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -3474,8 +3476,8 @@ def get_worker_stats(self, site_name): def get_worker_stats_bulk(self, active_ups_queues): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats_bulk') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_worker_stats_bulk") + tmpLog.debug("start") # sql to get nQueueLimit sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) @@ -3486,10 +3488,8 @@ def get_worker_stats_bulk(self, active_ups_queues): for computingSite, jobType, resourceType, nNewWorkers in resQ: retMap.setdefault(computingSite, {}) retMap[computingSite].setdefault(jobType, {}) - if resourceType and resourceType != 'ANY' and resourceType not in retMap[computingSite][jobType]: - retMap[computingSite][jobType][resourceType] = {'running': 0, - 'submitted': 0, - 'to_submit': nNewWorkers} + if resourceType and resourceType != "ANY" and resourceType not in retMap[computingSite][jobType]: + retMap[computingSite][jobType][resourceType] = {"running": 0, "submitted": 0, "to_submit": nNewWorkers} # get worker stats sqlW = "SELECT wt.status, wt.computingSite, wt.jobType, wt.resourceType, COUNT(*) cnt " @@ -3498,29 +3498,27 @@ def get_worker_stats_bulk(self, active_ups_queues): sqlW += "GROUP BY wt.status,wt.computingSite, wt.jobType, wt.resourceType " # get worker stats varMap = dict() - varMap[':st1'] = 'running' - varMap[':st2'] = 'submitted' + varMap[":st1"] = "running" + varMap[":st2"] = "submitted" self.execute(sqlW, varMap) resW = self.cur.fetchall() for workerStatus, computingSite, jobType, resourceType, cnt in resW: - if resourceType and resourceType != 'ANY': + if resourceType and 
resourceType != "ANY": retMap.setdefault(computingSite, {}) retMap[computingSite].setdefault(jobType, {}) - retMap[computingSite][jobType].setdefault(resourceType, {'running': 0, - 'submitted': 0, - 'to_submit': 0}) + retMap[computingSite][jobType].setdefault(resourceType, {"running": 0, "submitted": 0, "to_submit": 0}) retMap[computingSite][jobType][resourceType][workerStatus] = cnt # if there are no jobs for an active UPS queue, it needs to be initialized so that the pilot streaming # on panda server starts processing the queue if active_ups_queues: for ups_queue in active_ups_queues: - if ups_queue not in retMap or not retMap[ups_queue] or retMap[ups_queue] == {'ANY': {}}: - retMap[ups_queue] = {'managed': {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}}} + if ups_queue not in retMap or not retMap[ups_queue] or retMap[ups_queue] == {"ANY": {}}: + retMap[ups_queue] = {"managed": {"SCORE": {"running": 0, "submitted": 0, "to_submit": 0}}} # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -3534,18 +3532,18 @@ def get_worker_stats_bulk(self, active_ups_queues): def get_worker_stats_full(self, filter_site_list=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats_full') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_worker_stats_full") + tmpLog.debug("start") # sql to get nQueueLimit varMap = dict() sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) if filter_site_list is not None: site_var_name_list = [] for j, site in enumerate(filter_site_list): - site_var_name = ':site{0}'.format(j) + site_var_name = ":site{0}".format(j) site_var_name_list.append(site_var_name) varMap[site_var_name] = site - filter_queue_str = ','.join(site_var_name_list) + filter_queue_str = ",".join(site_var_name_list) sqlQ += "WHERE siteName IN ({0}) ".format(filter_queue_str) # get nQueueLimit self.execute(sqlQ, varMap) @@ -3557,9 +3555,7 @@ def get_worker_stats_full(self, filter_site_list=None): resourceType = str(resourceType) retMap.setdefault(computingSite, {}) retMap[computingSite].setdefault(jobType, {}) - retMap[computingSite][jobType][resourceType] = {'running': 0, - 'submitted': 0, - 'to_submit': nNewWorkers} + retMap[computingSite][jobType][resourceType] = {"running": 0, "submitted": 0, "to_submit": nNewWorkers} # get worker stats varMap = dict() sqlW = "SELECT wt.status, wt.computingSite, wt.jobType, wt.resourceType, COUNT(*) cnt " @@ -3567,10 +3563,10 @@ def get_worker_stats_full(self, filter_site_list=None): if filter_site_list is not None: site_var_name_list = [] for j, site in enumerate(filter_site_list): - site_var_name = ':site{0}'.format(j) + site_var_name = ":site{0}".format(j) site_var_name_list.append(site_var_name) varMap[site_var_name] = site - filter_queue_str = ','.join(site_var_name_list) + filter_queue_str = ",".join(site_var_name_list) sqlW += "WHERE wt.computingSite IN ({0}) ".format(filter_queue_str) sqlW += "GROUP BY wt.status,wt.computingSite, wt.jobType, wt.resourceType " # get worker stats @@ -3583,13 +3579,11 @@ def get_worker_stats_full(self, filter_site_list=None): resourceType = str(resourceType) retMap.setdefault(computingSite, {}) retMap[computingSite].setdefault(jobType, {}) - retMap[computingSite][jobType].setdefault(resourceType, {'running': 0, - 'submitted': 0, - 'to_submit': 0}) + 
retMap[computingSite][jobType].setdefault(resourceType, {"running": 0, "submitted": 0, "to_submit": 0}) retMap[computingSite][jobType][resourceType][workerStatus] = cnt # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -3603,9 +3597,8 @@ def get_worker_stats_full(self, filter_site_list=None): def mark_workers_to_kill_by_pandaid(self, panda_id, delay_seconds=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(panda_id), - method_name='mark_workers_to_kill_by_pandaid') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "PandaID={0}".format(panda_id), method_name="mark_workers_to_kill_by_pandaid") + tmpLog.debug("start") # sql to set killTime sqlL = "UPDATE {0} SET killTime=:setTime ".format(workTableName) sqlL += "WHERE workerID=:workerID AND killTime IS NULL AND NOT status IN (:st1,:st2,:st3) " @@ -3621,23 +3614,23 @@ def mark_workers_to_kill_by_pandaid(self, panda_id, delay_seconds=None): setTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=delay_seconds) # get workers varMap = dict() - varMap[':pandaID'] = panda_id + varMap[":pandaID"] = panda_id self.execute(sqlA, varMap) resA = self.cur.fetchall() nRow = 0 - for workerID, in resA: + for (workerID,) in resA: # set killTime varMap = dict() - varMap[':workerID'] = workerID - varMap[':setTime'] = setTime - varMap[':st1'] = WorkSpec.ST_finished - varMap[':st2'] = WorkSpec.ST_failed - varMap[':st3'] = WorkSpec.ST_cancelled + varMap[":workerID"] = workerID + varMap[":setTime"] = setTime + varMap[":st1"] = WorkSpec.ST_finished + varMap[":st2"] = WorkSpec.ST_failed + varMap[":st3"] = WorkSpec.ST_cancelled self.execute(sqlL, varMap) nRow += self.cur.rowcount # commit self.commit() - tmpLog.debug('set killTime to {0} workers'.format(nRow)) + tmpLog.debug("set killTime to {0} workers".format(nRow)) return nRow except Exception: # roll back @@ -3651,8 +3644,8 @@ def mark_workers_to_kill_by_pandaid(self, panda_id, delay_seconds=None): def mark_workers_to_kill_by_workerids(self, worker_ids, delay_seconds=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='mark_workers_to_kill_by_workerids') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="mark_workers_to_kill_by_workerids") + tmpLog.debug("start") # sql to set killTime sqlL = "UPDATE {0} SET killTime=:setTime ".format(workTableName) sqlL += "WHERE workerID=:workerID AND killTime IS NULL AND NOT status IN (:st1,:st2,:st3) " @@ -3666,17 +3659,17 @@ def mark_workers_to_kill_by_workerids(self, worker_ids, delay_seconds=None): varMaps = [] for worker_id in worker_ids: varMap = dict() - varMap[':workerID'] = worker_id - varMap[':setTime'] = setTime - varMap[':st1'] = WorkSpec.ST_finished - varMap[':st2'] = WorkSpec.ST_failed - varMap[':st3'] = WorkSpec.ST_cancelled + varMap[":workerID"] = worker_id + varMap[":setTime"] = setTime + varMap[":st1"] = WorkSpec.ST_finished + varMap[":st2"] = WorkSpec.ST_failed + varMap[":st3"] = WorkSpec.ST_cancelled varMaps.append(varMap) self.executemany(sqlL, varMaps) nRow = self.cur.rowcount # commit self.commit() - tmpLog.debug('set killTime with {0}'.format(nRow)) + tmpLog.debug("set killTime with {0}".format(nRow)) return nRow except Exception: # roll back @@ -3690,24 +3683,24 @@ def mark_workers_to_kill_by_workerids(self, worker_ids, delay_seconds=None): def get_workers_for_cleanup(self, max_workers, status_timeout_map): try: # get 
logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_for_cleanup') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_for_cleanup") + tmpLog.debug("start") # sql to get worker IDs timeNow = datetime.datetime.utcnow() modTimeLimit = timeNow - datetime.timedelta(minutes=60) varMap = dict() - varMap[':timeLimit'] = modTimeLimit + varMap[":timeLimit"] = modTimeLimit sqlW = "SELECT workerID, configID FROM {0} ".format(workTableName) sqlW += "WHERE lastUpdate IS NULL AND (" for tmpStatus, tmpTimeout in iteritems(status_timeout_map): - tmpStatusKey = ':status_{0}'.format(tmpStatus) - tmpTimeoutKey = ':timeLimit_{0}'.format(tmpStatus) - sqlW += '(status={0} AND endTime<={1}) OR '.format(tmpStatusKey, tmpTimeoutKey) + tmpStatusKey = ":status_{0}".format(tmpStatus) + tmpTimeoutKey = ":timeLimit_{0}".format(tmpStatus) + sqlW += "(status={0} AND endTime<={1}) OR ".format(tmpStatusKey, tmpTimeoutKey) varMap[tmpStatusKey] = tmpStatus varMap[tmpTimeoutKey] = timeNow - datetime.timedelta(hours=tmpTimeout) sqlW = sqlW[:-4] - sqlW += ') ' - sqlW += 'AND modificationTime<:timeLimit ' + sqlW += ") " + sqlW += "AND modificationTime<:timeLimit " sqlW += "ORDER BY modificationTime LIMIT {0} ".format(max_workers) # sql to lock or release worker sqlL = "UPDATE {0} SET modificationTime=:setTime ".format(workTableName) @@ -3740,9 +3733,9 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): for workerID, configID in resW: # lock worker varMap = dict() - varMap[':workerID'] = workerID - varMap[':setTime'] = timeNow - varMap[':timeLimit'] = modTimeLimit + varMap[":workerID"] = workerID + varMap[":setTime"] = timeNow + varMap[":timeLimit"] = modTimeLimit self.execute(sqlL, varMap) # commit self.commit() @@ -3753,14 +3746,14 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): configID = None # check associated jobs varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlA, varMap) - nActJobs, = self.cur.fetchone() + (nActJobs,) = self.cur.fetchone() # cleanup when there is no active job if nActJobs == 0: # get worker varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -3774,12 +3767,12 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): checkedLFNs = set() keepLFNs = set() varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID self.execute(sqlP, varMap) resP = self.cur.fetchall() - for pandaID, in resP: + for (pandaID,) in resP: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlJ, varMap) resJ = self.cur.fetchone() jobSpec = JobSpec() @@ -3787,9 +3780,9 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): jobSpecs.append(jobSpec) # get LFNs not to be deleted varMap = dict() - varMap[':PandaID'] = pandaID - varMap[':fileType1'] = 'input' - varMap[':fileType2'] = FileSpec.AUX_INPUT + varMap[":PandaID"] = pandaID + varMap[":fileType1"] = "input" + varMap[":fileType2"] = FileSpec.AUX_INPUT self.execute(sqlD, varMap) resDs = self.cur.fetchall() for tmpLFN, tmpTodelete in resDs: @@ -3797,7 +3790,7 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): keepLFNs.add(tmpLFN) # get files to be deleted varMap = dict() - varMap[':PandaID'] = jobSpec.PandaID + varMap[":PandaID"] = jobSpec.PandaID self.execute(sqlF, varMap) resFs = self.cur.fetchall() for resF in 
resFs: @@ -3812,7 +3805,7 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): jobSpec.add_file(fileSpec) workSpec.set_jobspec_list(jobSpecs) iWorkers += 1 - tmpLog.debug('got {0} workers'.format(iWorkers)) + tmpLog.debug("got {0} workers".format(iWorkers)) return retVal except Exception: # roll back @@ -3826,9 +3819,8 @@ def get_workers_for_cleanup(self, max_workers, status_timeout_map): def delete_worker(self, worker_id): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(worker_id), - method_name='delete_worker') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(worker_id), method_name="delete_worker") + tmpLog.debug("start") # sql to get jobs sqlJ = "SELECT PandaID FROM {0} ".format(jobWorkerTableName) sqlJ += "WHERE workerID=:workerID " @@ -3849,12 +3841,12 @@ def delete_worker(self, worker_id): sqlDW += "WHERE workerID=:workerID " # get jobs varMap = dict() - varMap[':workerID'] = worker_id + varMap[":workerID"] = worker_id self.execute(sqlJ, varMap) resJ = self.cur.fetchall() - for pandaID, in resJ: + for (pandaID,) in resJ: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID # delete job self.execute(sqlDJ, varMap) # delete files @@ -3865,11 +3857,11 @@ def delete_worker(self, worker_id): self.execute(sqlDR, varMap) # delete worker varMap = dict() - varMap[':workerID'] = worker_id + varMap[":workerID"] = worker_id self.execute(sqlDW, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -3883,22 +3875,22 @@ def delete_worker(self, worker_id): def release_jobs(self, panda_ids, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='release_jobs') - tmpLog.debug('start for {0} jobs'.format(len(panda_ids))) + tmpLog = core_utils.make_logger(_logger, method_name="release_jobs") + tmpLog.debug("start for {0} jobs".format(len(panda_ids))) # sql to release job sql = "UPDATE {0} SET lockedBy=NULL ".format(jobTableName) sql += "WHERE PandaID=:pandaID AND lockedBy=:lockedBy " nJobs = 0 for pandaID in panda_ids: varMap = dict() - varMap[':pandaID'] = pandaID - varMap[':lockedBy'] = locked_by + varMap[":pandaID"] = pandaID + varMap[":lockedBy"] = locked_by self.execute(sql, varMap) if self.cur.rowcount > 0: nJobs += 1 # commit self.commit() - tmpLog.debug('released {0} jobs'.format(nJobs)) + tmpLog.debug("released {0} jobs".format(nJobs)) # return return True except Exception: @@ -3913,38 +3905,39 @@ def release_jobs(self, panda_ids, locked_by): def clone_queue_with_new_job_and_resource_type(self, site_name, queue_name, job_type, resource_type, new_workers): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'site_name={0} queue_name={1}'.format(site_name, queue_name), - method_name='clone_queue_with_new_job_and_resource_type') - tmpLog.debug('start') + tmpLog = core_utils.make_logger( + _logger, "site_name={0} queue_name={1}".format(site_name, queue_name), method_name="clone_queue_with_new_job_and_resource_type" + ) + tmpLog.debug("start") # get the values from one of the existing queues sql_select_queue = "SELECT {0} FROM {1} ".format(PandaQueueSpec.column_names(), pandaQueueTableName) sql_select_queue += "WHERE siteName=:siteName " var_map = dict() - var_map[':siteName'] = site_name + var_map[":siteName"] = site_name self.execute(sql_select_queue, var_map) queue = self.cur.fetchone() - if queue: # a queue to clone was found + if queue: # a queue to clone was found var_map 
= {} attribute_list = [] attr_binding_list = [] - for attribute, value in zip(PandaQueueSpec.column_names().split(','), queue): - attr_binding = ':{0}'.format(attribute) - if attribute == 'resourceType': + for attribute, value in zip(PandaQueueSpec.column_names().split(","), queue): + attr_binding = ":{0}".format(attribute) + if attribute == "resourceType": var_map[attr_binding] = resource_type - elif attribute == 'jobType': + elif attribute == "jobType": var_map[attr_binding] = job_type - elif attribute == 'nNewWorkers': + elif attribute == "nNewWorkers": var_map[attr_binding] = new_workers - elif attribute == 'uniqueName': + elif attribute == "uniqueName": var_map[attr_binding] = core_utils.get_unique_queue_name(queue_name, resource_type, job_type) else: var_map[attr_binding] = value attribute_list.append(attribute) attr_binding_list.append(attr_binding) - sql_insert = "INSERT IGNORE INTO {0} ({1}) ".format(pandaQueueTableName, ','.join(attribute_list)) - sql_values = "VALUES ({0}) ".format(','.join(attr_binding_list)) + sql_insert = "INSERT IGNORE INTO {0} ({1}) ".format(pandaQueueTableName, ",".join(attribute_list)) + sql_values = "VALUES ({0}) ".format(",".join(attr_binding_list)) self.execute(sql_insert + sql_values, var_map) else: @@ -3960,8 +3953,8 @@ def clone_queue_with_new_job_and_resource_type(self, site_name, queue_name, job_ def set_queue_limit(self, site_name, params): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'siteName={0}'.format(site_name), method_name='set_queue_limit') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "siteName={0}".format(site_name), method_name="set_queue_limit") + tmpLog.debug("start") # sql to reset queue limits before setting new command to avoid old values being repeated again and again sql_reset = "UPDATE {0} ".format(pandaQueueTableName) @@ -3985,13 +3978,13 @@ def set_queue_limit(self, site_name, params): # reset nqueued for all job & resource types varMap = dict() - varMap[':zero'] = 0 - varMap[':siteName'] = site_name + varMap[":zero"] = 0 + varMap[":siteName"] = site_name self.execute(sql_reset, varMap) # get job & resource types varMap = dict() - varMap[':siteName'] = site_name + varMap[":siteName"] = site_name self.execute(sql_get_job_resource, varMap) results = self.cur.fetchall() job_resource_type_list = set() @@ -4006,25 +3999,25 @@ def set_queue_limit(self, site_name, params): for job_type, job_values in iteritems(params): ret_map.setdefault(job_type, {}) for resource_type, value in iteritems(job_values): - tmpLog.debug('Processing rt {0} -> {1}'.format(resource_type, value)) + tmpLog.debug("Processing rt {0} -> {1}".format(resource_type, value)) # get num of submitted workers varMap = dict() - varMap[':siteName'] = site_name - varMap[':jobType'] = job_type - varMap[':resourceType'] = resource_type - varMap[':status'] = 'submitted' + varMap[":siteName"] = site_name + varMap[":jobType"] = job_type + varMap[":resourceType"] = resource_type + varMap[":status"] = "submitted" self.execute(sql_count_workers, varMap) res = self.cur.fetchone() - tmpLog.debug('{0} has {1} submitted workers'.format(resource_type, res)) + tmpLog.debug("{0} has {1} submitted workers".format(resource_type, res)) if value is None: value = 0 varMap = dict() - varMap[':nQueue'] = value - varMap[':siteName'] = site_name - varMap[':jobType'] = job_type - varMap[':resourceType'] = resource_type + varMap[":nQueue"] = value + varMap[":siteName"] = site_name + varMap[":jobType"] = job_type + varMap[":resourceType"] = resource_type 
self.execute(sql_update_queue, varMap) iUp = self.cur.rowcount @@ -4034,19 +4027,17 @@ def set_queue_limit(self, site_name, params): ret_map[job_type][resource_type] = value else: # no queue was updated, we need to create a new one for the resource type - cloned = self.clone_queue_with_new_job_and_resource_type(site_name, queue_name, job_type, - resource_type, value) + cloned = self.clone_queue_with_new_job_and_resource_type(site_name, queue_name, job_type, resource_type, value) if cloned: ret_map[job_type][resource_type] = value iUp = 1 nUp += iUp - tmpLog.debug('set nNewWorkers={0} to {1}:{2}:{3} with {4}'.format(value, queue_name, job_type, - resource_type, iUp)) + tmpLog.debug("set nNewWorkers={0} to {1}:{2}:{3} with {4}".format(value, queue_name, job_type, resource_type, iUp)) # commit self.commit() - tmpLog.debug('updated {0} queues'.format(nUp)) + tmpLog.debug("updated {0} queues".format(nUp)) return ret_map except Exception: @@ -4061,9 +4052,8 @@ def set_queue_limit(self, site_name, params): def get_num_missed_workers(self, queue_name, criteria): try: # get logger - tmpLog = core_utils.make_logger(_logger,"queue={0}".format(queue_name), - method_name='get_num_missed_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "queue={0}".format(queue_name), method_name="get_num_missed_workers") + tmpLog.debug("start") # get worker stats sqlW = "SELECT COUNT(*) cnt " sqlW += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName) @@ -4071,25 +4061,25 @@ def get_num_missed_workers(self, queue_name, criteria): # get worker stats varMap = dict() for attr, val in iteritems(criteria): - if attr == 'timeLimit': + if attr == "timeLimit": sqlW += "AND wt.submitTime>:timeLimit " - varMap[':timeLimit'] = val - elif attr in ['siteName']: + varMap[":timeLimit"] = val + elif attr in ["siteName"]: sqlW += "AND pq.{0}=:{0} ".format(attr) - varMap[':{0}'.format(attr)] = val - elif attr in ['computingSite', 'computingElement']: + varMap[":{0}".format(attr)] = val + elif attr in ["computingSite", "computingElement"]: sqlW += "AND wt.{0}=:{0} ".format(attr) - varMap[':{0}'.format(attr)] = val - varMap[':status'] = 'missed' + varMap[":{0}".format(attr)] = val + varMap[":status"] = "missed" self.execute(sqlW, varMap) resW = self.cur.fetchone() if resW is None: nMissed = 0 else: - nMissed, = resW + (nMissed,) = resW # commit self.commit() - tmpLog.debug('got nMissed={0} for {1}'.format(nMissed, str(criteria))) + tmpLog.debug("got nMissed={0} for {1}".format(nMissed, str(criteria))) return nMissed except Exception: # roll back @@ -4103,9 +4093,8 @@ def get_num_missed_workers(self, queue_name, criteria): def get_workers_with_job_id(self, panda_id, use_commit=True): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'pandaID={0}'.format(panda_id), - method_name='get_workers_with_job_id') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "pandaID={0}".format(panda_id), method_name="get_workers_with_job_id") + tmpLog.debug("start") # sql to get workerIDs sqlW = "SELECT workerID FROM {0} WHERE PandaID=:PandaID ".format(jobWorkerTableName) sqlW += "ORDER BY workerID " @@ -4114,13 +4103,13 @@ def get_workers_with_job_id(self, panda_id, use_commit=True): sqlG += "WHERE workerID=:workerID " # get workerIDs varMap = dict() - varMap[':PandaID'] = panda_id + varMap[":PandaID"] = panda_id self.execute(sqlW, varMap) retList = [] - for worker_id, in self.cur.fetchall(): + for (worker_id,) in self.cur.fetchall(): # get a worker varMap = dict() - varMap[':workerID'] = 
worker_id + varMap[":workerID"] = worker_id self.execute(sqlG, varMap) res = self.cur.fetchone() workSpec = WorkSpec() @@ -4129,7 +4118,7 @@ def get_workers_with_job_id(self, panda_id, use_commit=True): # commit if use_commit: self.commit() - tmpLog.debug('got {0} workers'.format(len(retList))) + tmpLog.debug("got {0} workers".format(len(retList))) return retList except Exception: # roll back @@ -4144,15 +4133,15 @@ def get_workers_with_job_id(self, panda_id, use_commit=True): def clean_process_locks(self): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='clean_process_locks') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="clean_process_locks") + tmpLog.debug("start") # delete locks sqlW = "DELETE FROM {0} ".format(processLockTableName) # get worker stats self.execute(sqlW) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4166,14 +4155,13 @@ def clean_process_locks(self): def get_process_lock(self, process_name, locked_by, lock_interval): try: # get logger - tmpLog = core_utils.make_logger(_logger, "proc={0} by={1}".format(process_name, locked_by), - method_name='get_process_lock') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "proc={0} by={1}".format(process_name, locked_by), method_name="get_process_lock") + tmpLog.debug("start") # delete old lock sqlD = "DELETE FROM {0} ".format(processLockTableName) sqlD += "WHERE lockTime<:timeLimit " varMap = dict() - varMap[':timeLimit'] = datetime.datetime.utcnow() - datetime.timedelta(hours=6) + varMap[":timeLimit"] = datetime.datetime.utcnow() - datetime.timedelta(hours=6) self.execute(sqlD, varMap) # commit self.commit() @@ -4181,7 +4169,7 @@ def get_process_lock(self, process_name, locked_by, lock_interval): sqlC = "SELECT lockTime FROM {0} ".format(processLockTableName) sqlC += "WHERE processName=:processName " varMap = dict() - varMap[':processName'] = process_name + varMap[":processName"] = process_name self.execute(sqlC, varMap) resC = self.cur.fetchone() retVal = False @@ -4198,23 +4186,23 @@ def get_process_lock(self, process_name, locked_by, lock_interval): self.execute(sqlI, varMap) retVal = True else: - oldLockTime, = resC + (oldLockTime,) = resC timeLimit = timeNow - datetime.timedelta(seconds=lock_interval) if oldLockTime <= timeLimit: # update lock if old sqlU = "UPDATE {0} SET lockedBy=:lockedBy,lockTime=:timeNow ".format(processLockTableName) sqlU += "WHERE processName=:processName AND lockTime<=:timeLimit " varMap = dict() - varMap[':processName'] = process_name - varMap[':lockedBy'] = locked_by - varMap[':timeLimit'] = timeLimit - varMap[':timeNow'] = timeNow + varMap[":processName"] = process_name + varMap[":lockedBy"] = locked_by + varMap[":timeLimit"] = timeLimit + varMap[":timeNow"] = timeNow self.execute(sqlU, varMap) if self.cur.rowcount > 0: retVal = True # commit self.commit() - tmpLog.debug('done with {0}'.format(retVal)) + tmpLog.debug("done with {0}".format(retVal)) return retVal except Exception: # roll back @@ -4228,19 +4216,18 @@ def get_process_lock(self, process_name, locked_by, lock_interval): def release_process_lock(self, process_name, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, "proc={0} by={1}".format(process_name, locked_by), - method_name='release_process_lock') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "proc={0} by={1}".format(process_name, locked_by), method_name="release_process_lock") + 
tmpLog.debug("start") # delete old lock sqlC = "DELETE FROM {0} ".format(processLockTableName) sqlC += "WHERE processName=:processName AND lockedBy=:lockedBy " varMap = dict() - varMap[':processName'] = process_name - varMap[':lockedBy'] = locked_by + varMap[":processName"] = process_name + varMap[":lockedBy"] = locked_by self.execute(sqlC, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4254,9 +4241,8 @@ def release_process_lock(self, process_name, locked_by): def get_file_status(self, lfn, file_type, endpoint, job_status): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'lfn={0} endpoint={1}'.format(lfn, endpoint), - method_name='get_file_status') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "lfn={0} endpoint={1}".format(lfn, endpoint), method_name="get_file_status") + tmpLog.debug("start") # sql to get files sqlF = "SELECT f.status, f.path, COUNT(*) cnt FROM {0} f, {1} j ".format(fileTableName, jobTableName) sqlF += "WHERE j.PandaID=f.PandaID AND j.status=:jobStatus " @@ -4266,20 +4252,20 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): sqlF += "GROUP BY f.status, f.path " # get files varMap = dict() - varMap[':lfn'] = lfn - varMap[':type'] = file_type - varMap[':jobStatus'] = job_status + varMap[":lfn"] = lfn + varMap[":type"] = file_type + varMap[":jobStatus"] = job_status if endpoint is not None: - varMap[':endpoint'] = endpoint + varMap[":endpoint"] = endpoint self.execute(sqlF, varMap) retMap = dict() for status, path, cnt in self.cur.fetchall(): - retMap.setdefault(status, {'cnt': 0, 'path': set()}) - retMap[status]['cnt'] += cnt - retMap[status]['path'].add(path) + retMap.setdefault(status, {"cnt": 0, "path": set()}) + retMap[status]["cnt"] += cnt + retMap[status]["path"].add(path) # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -4293,8 +4279,8 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): def change_file_status(self, panda_id, data, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(panda_id), method_name='change_file_status') - tmpLog.debug('start lockedBy={0}'.format(locked_by)) + tmpLog = core_utils.make_logger(_logger, "PandaID={0}".format(panda_id), method_name="change_file_status") + tmpLog.debug("start lockedBy={0}".format(locked_by)) # sql to check lock of job sqlJ = "SELECT lockedBy FROM {0} ".format(jobTableName) sqlJ += "WHERE PandaID=:PandaID FOR UPDATE " @@ -4303,26 +4289,26 @@ def change_file_status(self, panda_id, data, locked_by): sqlF += "SET status=:status WHERE fileID=:fileID " # check lock varMap = dict() - varMap[':PandaID'] = panda_id + varMap[":PandaID"] = panda_id self.execute(sqlJ, varMap) resJ = self.cur.fetchone() if resJ is None: - tmpLog.debug('skip since job not found') + tmpLog.debug("skip since job not found") else: - lockedBy, = resJ + (lockedBy,) = resJ if lockedBy != locked_by: - tmpLog.debug('skip since lockedBy is inconsistent in DB {0}'.format(lockedBy)) + tmpLog.debug("skip since lockedBy is inconsistent in DB {0}".format(lockedBy)) else: # update files for tmpFileID, tmpLFN, newStatus in data: varMap = dict() - varMap[':fileID'] = tmpFileID - varMap[':status'] = newStatus + varMap[":fileID"] = tmpFileID + varMap[":status"] = newStatus self.execute(sqlF, varMap) - tmpLog.debug('set new status {0} to {1}'.format(newStatus, tmpLFN)) + 
tmpLog.debug("set new status {0} to {1}".format(newStatus, tmpLFN)) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4336,9 +4322,8 @@ def change_file_status(self, panda_id, data, locked_by): def get_group_for_file(self, lfn, file_type, endpoint): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'lfn={0} endpoint={1}'.format(lfn, endpoint), - method_name='get_group_for_file') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "lfn={0} endpoint={1}".format(lfn, endpoint), method_name="get_group_for_file") + tmpLog.debug("start") # sql to get group with the latest update sqlF = "SELECT * FROM (" sqlF += "SELECT groupID,groupStatus,groupUpdateTime FROM {0} ".format(fileTableName) @@ -4350,21 +4335,21 @@ def get_group_for_file(self, lfn, file_type, endpoint): sqlF += ") AS TMP LIMIT 1 " # get group varMap = dict() - varMap[':lfn'] = lfn - varMap[':type'] = file_type - varMap[':ngStatus'] = 'failed' + varMap[":lfn"] = lfn + varMap[":type"] = file_type + varMap[":ngStatus"] = "failed" if endpoint is not None: - varMap[':endpoint'] = endpoint + varMap[":endpoint"] = endpoint self.execute(sqlF, varMap) resF = self.cur.fetchone() if resF is None: retVal = None else: groupID, groupStatus, groupUpdateTime = resF - retVal = {'groupID': groupID, 'groupStatus': groupStatus, 'groupUpdateTime': groupUpdateTime} + retVal = {"groupID": groupID, "groupStatus": groupStatus, "groupUpdateTime": groupUpdateTime} # commit self.commit() - tmpLog.debug('got {0}'.format(str(retVal))) + tmpLog.debug("got {0}".format(str(retVal))) return retVal except Exception: # roll back @@ -4378,15 +4363,14 @@ def get_group_for_file(self, lfn, file_type, endpoint): def get_files_with_group_id(self, group_id): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'groupID={0}'.format(group_id), - method_name='get_files_with_group_id') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "groupID={0}".format(group_id), method_name="get_files_with_group_id") + tmpLog.debug("start") # sql to get files sqlF = "SELECT {0} FROM {1} ".format(FileSpec.column_names(), fileTableName) sqlF += "WHERE groupID=:groupID " # get files varMap = dict() - varMap[':groupID'] = group_id + varMap[":groupID"] = group_id retList = [] self.execute(sqlF, varMap) for resFile in self.cur.fetchall(): @@ -4395,7 +4379,7 @@ def get_files_with_group_id(self, group_id): retList.append(fileSpec) # commit self.commit() - tmpLog.debug('got {0} files'.format(len(retList))) + tmpLog.debug("got {0} files".format(len(retList))) return retList except Exception: # roll back @@ -4409,21 +4393,20 @@ def get_files_with_group_id(self, group_id): def update_file_group_status(self, group_id, status_string): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'groupID={0}'.format(group_id), - method_name='update_file_group_status') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "groupID={0}".format(group_id), method_name="update_file_group_status") + tmpLog.debug("start") # sql to get files sqlF = "UPDATE {0} set groupStatus=:groupStatus ".format(fileTableName) sqlF += "WHERE groupID=:groupID " # get files varMap = dict() - varMap[':groupID'] = group_id - varMap[':groupStatus'] = status_string + varMap[":groupID"] = group_id + varMap[":groupStatus"] = status_string self.execute(sqlF, varMap) nRow = self.cur.rowcount # commit self.commit() - tmpLog.debug('updated {0} files'.format(nRow)) + tmpLog.debug("updated {0} files".format(nRow)) 
return True except Exception: # roll back @@ -4437,23 +4420,22 @@ def update_file_group_status(self, group_id, status_string): def get_file_group_status(self, group_id): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'groupID={0}'.format(group_id), - method_name='get_file_group_status') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "groupID={0}".format(group_id), method_name="get_file_group_status") + tmpLog.debug("start") # sql to get files sqlF = "SELECT DISTINCT groupStatus FROM {0} ".format(fileTableName) sqlF += "WHERE groupID=:groupID " # get files varMap = dict() - varMap[':groupID'] = group_id + varMap[":groupID"] = group_id self.execute(sqlF, varMap) res = self.cur.fetchall() retVal = set() - for groupStatus, in res: + for (groupStatus,) in res: retVal.add(groupStatus) # commit self.commit() - tmpLog.debug('get {0}'.format(str(retVal))) + tmpLog.debug("get {0}".format(str(retVal))) return retVal except Exception: # roll back @@ -4466,35 +4448,35 @@ def get_file_group_status(self, group_id): # lock job again def lock_job_again(self, panda_id, time_column, lock_column, locked_by): try: - tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(panda_id), method_name='lock_job_again') - tmpLog.debug('start column={0} id={1}'.format(lock_column, locked_by)) + tmpLog = core_utils.make_logger(_logger, "PandaID={0}".format(panda_id), method_name="lock_job_again") + tmpLog.debug("start column={0} id={1}".format(lock_column, locked_by)) # check lock sqlC = "SELECT {0},{1} FROM {2} ".format(lock_column, time_column, jobTableName) sqlC += "WHERE PandaID=:pandaID " sqlC += "FOR UPDATE " varMap = dict() - varMap[':pandaID'] = panda_id + varMap[":pandaID"] = panda_id self.execute(sqlC, varMap) resC = self.cur.fetchone() if resC is None: retVal = False - tmpLog.debug('not found') + tmpLog.debug("not found") else: oldLockedBy, oldLockedTime = resC if oldLockedBy != locked_by: - tmpLog.debug('locked by another {0} at {1}'.format(oldLockedBy, oldLockedTime)) + tmpLog.debug("locked by another {0} at {1}".format(oldLockedBy, oldLockedTime)) retVal = False else: # update locked time sqlU = "UPDATE {0} SET {1}=:timeNow WHERE pandaID=:pandaID ".format(jobTableName, time_column) varMap = dict() - varMap[':pandaID'] = panda_id - varMap[':timeNow'] = datetime.datetime.utcnow() + varMap[":pandaID"] = panda_id + varMap[":timeNow"] = datetime.datetime.utcnow() self.execute(sqlU, varMap) retVal = True # commit self.commit() - tmpLog.debug('done with {0}'.format(retVal)) + tmpLog.debug("done with {0}".format(retVal)) # return return retVal except Exception: @@ -4509,9 +4491,8 @@ def lock_job_again(self, panda_id, time_column, lock_column, locked_by): def set_file_group(self, file_specs, group_id, status_string): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'groupID={0}'.format(group_id), - method_name='set_file_group') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "groupID={0}".format(group_id), method_name="set_file_group") + tmpLog.debug("start") timeNow = datetime.datetime.utcnow() # sql to update files sqlF = "UPDATE {0} ".format(fileTableName) @@ -4520,14 +4501,14 @@ def set_file_group(self, file_specs, group_id, status_string): # update files for fileSpec in file_specs: varMap = dict() - varMap[':groupID'] = group_id - varMap[':groupStatus'] = status_string - varMap[':groupUpdateTime'] = timeNow - varMap[':lfn'] = fileSpec.lfn + varMap[":groupID"] = group_id + varMap[":groupStatus"] = status_string + varMap[":groupUpdateTime"] = 
timeNow + varMap[":lfn"] = fileSpec.lfn self.execute(sqlF, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4541,16 +4522,15 @@ def set_file_group(self, file_specs, group_id, status_string): def refresh_file_group_info(self, job_spec): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'pandaID={0}'.format(job_spec.PandaID), - method_name='refresh_file_group_info') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "pandaID={0}".format(job_spec.PandaID), method_name="refresh_file_group_info") + tmpLog.debug("start") # sql to get info sqlF = "SELECT groupID,groupStatus,groupUpdateTime FROM {0} ".format(fileTableName) sqlF += "WHERE lfn=:lfn " # get info for fileSpec in job_spec.inFiles.union(job_spec.outFiles): varMap = dict() - varMap[':lfn'] = fileSpec.lfn + varMap[":lfn"] = fileSpec.lfn self.execute(sqlF, varMap) resF = self.cur.fetchone() if resF is None: @@ -4561,7 +4541,7 @@ def refresh_file_group_info(self, job_spec): fileSpec.groupUpdateTime = groupUpdateTime # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4575,19 +4555,18 @@ def refresh_file_group_info(self, job_spec): def increment_submission_attempt(self, panda_id, new_number): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'pandaID={0}'.format(panda_id), - method_name='increment_submission_attempt') - tmpLog.debug('start with newNum={0}'.format(new_number)) + tmpLog = core_utils.make_logger(_logger, "pandaID={0}".format(panda_id), method_name="increment_submission_attempt") + tmpLog.debug("start with newNum={0}".format(new_number)) # sql to update attempt number sqlL = "UPDATE {0} SET submissionAttempts=:newNum ".format(jobTableName) sqlL += "WHERE PandaID=:PandaID " varMap = dict() - varMap[':PandaID'] = panda_id - varMap[':newNum'] = new_number + varMap[":PandaID"] = panda_id + varMap[":newNum"] = new_number self.execute(sqlL, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4601,8 +4580,8 @@ def increment_submission_attempt(self, panda_id, new_number): def get_worker_limits(self, site_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, token='site_name={0}'.format(site_name), method_name='get_worker_limits') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, token="site_name={0}".format(site_name), method_name="get_worker_limits") + tmpLog.debug("start") # sql to get queue limits sqlQ = "SELECT maxWorkers, nQueueLimitWorker, nQueueLimitWorkerRatio," @@ -4619,19 +4598,19 @@ def get_worker_limits(self, site_name): # get varMap = dict() - varMap[':siteName'] = site_name + varMap[":siteName"] = site_name self.execute(sqlQ, varMap) resQ = self.cur.fetchall() # count resource types varMap = dict() - varMap[':computingSite'] = site_name - varMap[':siteName'] = site_name + varMap[":computingSite"] = site_name + varMap[":siteName"] = site_name self.execute(sqlNT, varMap) resNT = self.cur.fetchall() # count running workers varMap = dict() - varMap[':computingSite'] = site_name - varMap[':status1'] = 'running' + varMap[":computingSite"] = site_name + varMap[":status1"] = "running" self.execute(sqlNR, varMap) resNR = self.cur.fetchall() @@ -4639,12 +4618,11 @@ def get_worker_limits(self, site_name): retMap = dict() nRunning = 0 nRT = 1 - for cnt, in resNR: + for (cnt,) in resNR: nRunning = cnt - for cnt, in resNT: + for (cnt,) in resNT: nRT = max(nRT, cnt) 
- for maxWorkers, nQueueLimitWorker_orig, nQueueLimitWorkerRatio, \ - nQueueLimitWorkerMax, nQueueLimitWorkerMin_orig in resQ: + for maxWorkers, nQueueLimitWorker_orig, nQueueLimitWorkerRatio, nQueueLimitWorkerMax, nQueueLimitWorkerMin_orig in resQ: if nQueueLimitWorkerRatio is not None and nQueueLimitWorkerRatio > 0: nQueueLimitWorkerByRatio = int(nRunning * nQueueLimitWorkerRatio / 100) nQueueLimitWorkerMin = 1 @@ -4666,14 +4644,16 @@ def get_worker_limits(self, site_name): nQueueLimitWorker = maxWorkers nQueueLimitWorkerPerRT = nQueueLimitWorker nQueueLimitWorker = min(nQueueLimitWorker, maxWorkers) - retMap.update({ - 'maxWorkers': maxWorkers, - 'nQueueLimitWorker': nQueueLimitWorker, - 'nQueueLimitWorkerPerRT': nQueueLimitWorkerPerRT, - }) + retMap.update( + { + "maxWorkers": maxWorkers, + "nQueueLimitWorker": nQueueLimitWorker, + "nQueueLimitWorkerPerRT": nQueueLimitWorkerPerRT, + } + ) # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -4687,8 +4667,8 @@ def get_worker_limits(self, site_name): def get_worker_ce_stats(self, site_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_ce_stats') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_worker_ce_stats") + tmpLog.debug("start") # get worker CE stats sqlW = "SELECT wt.status,wt.computingSite,wt.computingElement,COUNT(*) cnt " sqlW += "FROM {0} wt ".format(workTableName) @@ -4696,22 +4676,22 @@ def get_worker_ce_stats(self, site_name): sqlW += "GROUP BY wt.status,wt.computingElement " # get worker CE stats varMap = dict() - varMap[':siteName'] = site_name - varMap[':st1'] = 'running' - varMap[':st2'] = 'submitted' + varMap[":siteName"] = site_name + varMap[":st1"] = "running" + varMap[":st2"] = "submitted" self.execute(sqlW, varMap) resW = self.cur.fetchall() retMap = dict() for workerStatus, computingSite, computingElement, cnt in resW: if computingElement not in retMap: retMap[computingElement] = { - 'running': 0, - 'submitted': 0, + "running": 0, + "submitted": 0, } retMap[computingElement][workerStatus] = cnt # commit self.commit() - tmpLog.debug('got {0}'.format(str(retMap))) + tmpLog.debug("got {0}".format(str(retMap))) return retMap except Exception: # roll back @@ -4725,8 +4705,8 @@ def get_worker_ce_stats(self, site_name): def get_worker_ce_backend_throughput(self, site_name, time_window): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_ce_backend_throughput') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_worker_ce_backend_throughput") + tmpLog.debug("start") # get worker CE throughput sqlW = "SELECT wt.computingElement,wt.status,COUNT(*) cnt " sqlW += "FROM {0} wt ".format(workTableName) @@ -4739,31 +4719,30 @@ def get_worker_ce_backend_throughput(self, site_name, time_window): # time window start and end timeWindowEnd = datetime.datetime.utcnow() timeWindowStart = timeWindowEnd - datetime.timedelta(seconds=time_window) - timeWindowMiddle = timeWindowEnd - datetime.timedelta(seconds=time_window/2) + timeWindowMiddle = timeWindowEnd - datetime.timedelta(seconds=time_window / 2) # get worker CE throughput varMap = dict() - varMap[':siteName'] = site_name - varMap[':st1'] = 'submitted' - varMap[':st2'] = 'running' - varMap[':st3'] = 'finished' - varMap[':timeWindowStart'] = timeWindowStart - varMap[':timeWindowEnd'] = timeWindowEnd - varMap[':timeWindowMiddle'] = 
timeWindowMiddle + varMap[":siteName"] = site_name + varMap[":st1"] = "submitted" + varMap[":st2"] = "running" + varMap[":st3"] = "finished" + varMap[":timeWindowStart"] = timeWindowStart + varMap[":timeWindowEnd"] = timeWindowEnd + varMap[":timeWindowMiddle"] = timeWindowMiddle self.execute(sqlW, varMap) resW = self.cur.fetchall() retMap = dict() for computingElement, workerStatus, cnt in resW: if computingElement not in retMap: retMap[computingElement] = { - 'submitted': 0, - 'running': 0, - 'finished': 0, + "submitted": 0, + "running": 0, + "finished": 0, } retMap[computingElement][workerStatus] = cnt # commit self.commit() - tmpLog.debug('got {0} with time_window={1} for site {2}'.format( - str(retMap), time_window, site_name)) + tmpLog.debug("got {0} with time_window={1} for site {2}".format(str(retMap), time_window, site_name)) return retMap except Exception: # roll back @@ -4777,20 +4756,20 @@ def get_worker_ce_backend_throughput(self, site_name, time_window): def add_dialog_message(self, message, level, module_name, identifier=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='add_dialog_message') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="add_dialog_message") + tmpLog.debug("start") # delete old messages sqlS = "SELECT diagID FROM {0} ".format(diagTableName) sqlS += "WHERE creationTime<:timeLimit " varMap = dict() - varMap[':timeLimit'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) + varMap[":timeLimit"] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60) self.execute(sqlS, varMap) resS = self.cur.fetchall() sqlD = "DELETE FROM {0} ".format(diagTableName) sqlD += "WHERE diagID=:diagID " - for diagID, in resS: + for (diagID,) in resS: varMap = dict() - varMap[':diagID'] = diagID + varMap[":diagID"] = diagID self.execute(sqlD, varMap) # commit self.commit() @@ -4811,7 +4790,7 @@ def add_dialog_message(self, message, level, module_name, identifier=None): self.execute(sqlI, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4825,8 +4804,8 @@ def add_dialog_message(self, message, level, module_name, identifier=None): def get_dialog_messages_to_send(self, n_messages, lock_interval): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_dialog_messages_to_send') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_dialog_messages_to_send") + tmpLog.debug("start") # sql to select messages sqlD = "SELECT diagID FROM {0} ".format(diagTableName) sqlD += "WHERE (lockTime IS NULL OR lockTime<:timeLimit) " @@ -4841,22 +4820,22 @@ def get_dialog_messages_to_send(self, n_messages, lock_interval): # select messages timeLimit = datetime.datetime.utcnow() - datetime.timedelta(seconds=lock_interval) varMap = dict() - varMap[':timeLimit'] = timeLimit + varMap[":timeLimit"] = timeLimit self.execute(sqlD, varMap) resD = self.cur.fetchall() diagList = [] - for diagID, in resD: + for (diagID,) in resD: # lock varMap = dict() - varMap[':diagID'] = diagID - varMap[':timeLimit'] = timeLimit - varMap[':timeNow'] = datetime.datetime.utcnow() + varMap[":diagID"] = diagID + varMap[":timeLimit"] = timeLimit + varMap[":timeNow"] = datetime.datetime.utcnow() self.execute(sqlL, varMap) nRow = self.cur.rowcount if nRow == 1: # get varMap = dict() - varMap[':diagID'] = diagID + varMap[":diagID"] = diagID self.execute(sqlM, varMap) resM = self.cur.fetchone() # make spec @@ -4865,7 +4844,7 @@ def 
get_dialog_messages_to_send(self, n_messages, lock_interval): diagList.append(diagSpec) # commit self.commit() - tmpLog.debug('got {0} messages'.format(len(diagList))) + tmpLog.debug("got {0} messages".format(len(diagList))) return diagList except Exception: # roll back @@ -4879,19 +4858,19 @@ def get_dialog_messages_to_send(self, n_messages, lock_interval): def delete_dialog_messages(self, ids): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='delete_dialog_messages') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="delete_dialog_messages") + tmpLog.debug("start") # sql to delete message sqlM = "DELETE FROM {0} ".format(diagTableName) sqlM += "WHERE diagID=:diagID " for diagID in ids: # lock varMap = dict() - varMap[':diagID'] = diagID + varMap[":diagID"] = diagID self.execute(sqlM, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -4905,9 +4884,8 @@ def delete_dialog_messages(self, ids): def delete_old_jobs(self, timeout): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'timeout={0}'.format(timeout), - method_name='delete_old_jobs') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "timeout={0}".format(timeout), method_name="delete_old_jobs") + tmpLog.debug("start") # sql to get old jobs to be deleted sqlGJ = "SELECT PandaID FROM {0} ".format(jobTableName) sqlGJ += "WHERE subStatus=:subStatus AND propagatorTime IS NULL " @@ -4927,15 +4905,15 @@ def delete_old_jobs(self, timeout): sqlDR += "WHERE PandaID=:PandaID " # get jobs varMap = dict() - varMap[':subStatus'] = 'done' - varMap[':timeLimit1'] = datetime.datetime.utcnow() - datetime.timedelta(hours=timeout) - varMap[':timeLimit2'] = datetime.datetime.utcnow() - datetime.timedelta(hours=timeout*2) + varMap[":subStatus"] = "done" + varMap[":timeLimit1"] = datetime.datetime.utcnow() - datetime.timedelta(hours=timeout) + varMap[":timeLimit2"] = datetime.datetime.utcnow() - datetime.timedelta(hours=timeout * 2) self.execute(sqlGJ, varMap) resGJ = self.cur.fetchall() nDel = 0 - for pandaID, in resGJ: + for (pandaID,) in resGJ: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID # delete job self.execute(sqlDJ, varMap) iDel = self.cur.rowcount @@ -4949,7 +4927,7 @@ def delete_old_jobs(self, timeout): self.execute(sqlDR, varMap) # commit self.commit() - tmpLog.debug('deleted {0} jobs'.format(nDel)) + tmpLog.debug("deleted {0} jobs".format(nDel)) return True except Exception: # roll back @@ -4963,31 +4941,34 @@ def delete_old_jobs(self, timeout): def get_active_workers(self, n_workers, seconds_ago=0): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_active_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_active_workers") + tmpLog.debug("start") # sql to get workers sqlW = "SELECT {0} FROM {1} ".format(WorkSpec.column_names(), workTableName) sqlW += "WHERE status IN (:st_submitted,:st_running,:st_idle) " sqlW += "AND modificationTime<:timeLimit " sqlW += "ORDER BY modificationTime,computingSite LIMIT {0} ".format(n_workers) # sql to get jobs - sqlJ = "SELECT j.{columns} FROM {jobWorkerTableName} jw, {jobTableName} j ".format(columns=JobSpec.column_names(), jobTableName=jobTableName, jobWorkerTableName=jobWorkerTableName) + sqlJ = "SELECT j.{columns} FROM {jobWorkerTableName} jw, {jobTableName} j ".format( + columns=JobSpec.column_names(), jobTableName=jobTableName, 
jobWorkerTableName=jobWorkerTableName + ) sqlJ += "WHERE j.PandaID=jw.PandaID AND jw.workerID=:workerID " # parameter map varMap = dict() - varMap[':timeLimit'] = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds_ago) - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + varMap[":timeLimit"] = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds_ago) + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlW, varMap) resW = self.cur.fetchall() + def _get_workspec_from_record(rec): workspec = WorkSpec() workspec.pack(rec) jobspec_list = [] workspec.pandaid_list = [] varMap = dict() - varMap[':workerID'] = workspec.workerID + varMap[":workerID"] = workspec.workerID self.execute(sqlJ, varMap) resJ = self.cur.fetchall() for one_job in resJ: @@ -4997,8 +4978,9 @@ def _get_workspec_from_record(rec): workspec.pandaid_list.append(jobspec.PandaID) workspec.set_jobspec_list(jobspec_list) return workspec + retVal = map(_get_workspec_from_record, resW) - tmpLog.debug('got {0} workers'.format(len(resW))) + tmpLog.debug("got {0} workers".format(len(resW))) return retVal except Exception: # roll back @@ -5015,34 +4997,34 @@ def lock_workers(self, worker_id_list, lock_interval): lockTimeLimit = timeNow - datetime.timedelta(seconds=lock_interval) retVal = True # get logger - tmpLog = core_utils.make_logger(_logger, method_name='lock_worker') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="lock_worker") + tmpLog.debug("start") # loop for worker_id, attrs in iteritems(worker_id_list): varMap = dict() - varMap[':workerID'] = worker_id - varMap[':timeNow'] = timeNow - varMap[':lockTimeLimit'] = lockTimeLimit - varMap[':st1'] = WorkSpec.ST_cancelled - varMap[':st2'] = WorkSpec.ST_finished - varMap[':st3'] = WorkSpec.ST_failed - varMap[':st4'] = WorkSpec.ST_missed + varMap[":workerID"] = worker_id + varMap[":timeNow"] = timeNow + varMap[":lockTimeLimit"] = lockTimeLimit + varMap[":st1"] = WorkSpec.ST_cancelled + varMap[":st2"] = WorkSpec.ST_finished + varMap[":st3"] = WorkSpec.ST_failed + varMap[":st4"] = WorkSpec.ST_missed # extract lockedBy - varMap[':lockedBy'] = attrs['lockedBy'] - if attrs['lockedBy'] is None: - del attrs['lockedBy'] + varMap[":lockedBy"] = attrs["lockedBy"] + if attrs["lockedBy"] is None: + del attrs["lockedBy"] # sql to lock worker sqlL = "UPDATE {0} SET modificationTime=:timeNow".format(workTableName) for attrKey, attrVal in iteritems(attrs): - sqlL += ',{0}=:{0}'.format(attrKey) - varMap[':{0}'.format(attrKey)] = attrVal + sqlL += ",{0}=:{0}".format(attrKey) + varMap[":{0}".format(attrKey)] = attrVal sqlL += " WHERE workerID=:workerID AND (lockedBy IS NULL " sqlL += "OR (modificationTime<:lockTimeLimit AND lockedBy IS NOT NULL)) " sqlL += "AND (status NOT IN (:st1,:st2,:st3,:st4)) " # lock worker self.execute(sqlL, varMap) nRow = self.cur.rowcount - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) # false if failed to lock if nRow == 0: retVal = False @@ -5066,18 +5048,18 @@ def get_queue_config_dumps(self): # time limit timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_queue_config_dumps') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_queue_config_dumps") + tmpLog.debug("start") # sql to 
get used IDs sqlIJ = "SELECT DISTINCT configID FROM {0} ".format(jobTableName) self.execute(sqlIJ) resIJ = self.cur.fetchall() - for tmpID, in resIJ: + for (tmpID,) in resIJ: configIDs.add(tmpID) sqlIW = "SELECT DISTINCT configID FROM {0} ".format(workTableName) self.execute(sqlIW) resIW = self.cur.fetchall() - for tmpID, in resIW: + for (tmpID,) in resIW: configIDs.add(tmpID) # sql to delete sqlD = "DELETE FROM {0} WHERE configID=:configID ".format(queueConfigDumpTableName) @@ -5094,7 +5076,7 @@ def get_queue_config_dumps(self): # delete if unused and too old if dumpSpec.configID not in configIDs and dumpSpec.creationTime < timeLimit: varMap = dict() - varMap[':configID'] = dumpSpec.configID + varMap[":configID"] = dumpSpec.configID self.execute(sqlD, varMap) iDel += 1 else: @@ -5102,7 +5084,7 @@ def get_queue_config_dumps(self): iDump += 1 # commit self.commit() - tmpLog.debug('got {0} dumps and delete {1} dumps'.format(iDump, iDel)) + tmpLog.debug("got {0} dumps and delete {1} dumps".format(iDump, iDel)) # return return retVal except Exception: @@ -5120,14 +5102,14 @@ def add_queue_config_dump(self, dump_spec): sqlJ = "INSERT INTO {0} ({1}) ".format(queueConfigDumpTableName, QueueConfigDumpSpec.column_names()) sqlJ += QueueConfigDumpSpec.bind_values_expression() # get logger - tmpLog = core_utils.make_logger(_logger, method_name='add_queue_config_dumps') - tmpLog.debug('start for {0}'.format(dump_spec.dumpUniqueName)) + tmpLog = core_utils.make_logger(_logger, method_name="add_queue_config_dumps") + tmpLog.debug("start for {0}".format(dump_spec.dumpUniqueName)) varMap = dump_spec.values_list() # insert self.execute(sqlJ, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") # return return True except Exception: @@ -5145,19 +5127,19 @@ def get_config_id_dump(self, dump_spec): sqlJ = "SELECT configID FROM {0} ".format(queueConfigDumpTableName) sqlJ += "WHERE queueName=:queueName AND dumpUniqueName=:dumpUniqueName " # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_config_id_for_dump') - tmpLog.debug('start for {0}:{1}'.format(dump_spec.queueName, dump_spec.dumpUniqueName)) + tmpLog = core_utils.make_logger(_logger, method_name="get_config_id_for_dump") + tmpLog.debug("start for {0}:{1}".format(dump_spec.queueName, dump_spec.dumpUniqueName)) # get varMap = dict() - varMap[':queueName'] = dump_spec.queueName - varMap[':dumpUniqueName'] = dump_spec.dumpUniqueName + varMap[":queueName"] = dump_spec.queueName + varMap[":dumpUniqueName"] = dump_spec.dumpUniqueName self.execute(sqlJ, varMap) resJ = self.cur.fetchone() if resJ is not None: - configID, = resJ + (configID,) = resJ else: configID = None - tmpLog.debug('got configID={0}'.format(configID)) + tmpLog.debug("got configID={0}".format(configID)) # return return configID except Exception: @@ -5172,9 +5154,8 @@ def get_config_id_dump(self, dump_spec): def purge_pq(self, queue_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, 'queueName={0}'.format(queue_name), - method_name='purge_pq') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "queueName={0}".format(queue_name), method_name="purge_pq") + tmpLog.debug("start") # sql to get jobs sqlJ = "SELECT PandaID FROM {0} ".format(jobTableName) sqlJ += "WHERE computingSite=:computingSite " @@ -5210,12 +5191,12 @@ def purge_pq(self, queue_name): sqlDP += "WHERE queueName=:queueName " # get jobs varMap = dict() - varMap[':computingSite'] = queue_name + varMap[":computingSite"] = queue_name self.execute(sqlJ, varMap) 
resJ = self.cur.fetchall() - for pandaID, in resJ: + for (pandaID,) in resJ: varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID # delete job self.execute(sqlDJ, varMap) # delete files @@ -5226,33 +5207,33 @@ def purge_pq(self, queue_name): self.execute(sqlDRJ, varMap) # get workers varMap = dict() - varMap[':computingSite'] = queue_name + varMap[":computingSite"] = queue_name self.execute(sqlW, varMap) resW = self.cur.fetchall() - for workerID, in resW: + for (workerID,) in resW: varMap = dict() - varMap[':workerID'] = workerID + varMap[":workerID"] = workerID # delete workers self.execute(sqlDW, varMap) # delete relations self.execute(sqlDRW, varMap) # get queue configs varMap = dict() - varMap[':queueName'] = queue_name + varMap[":queueName"] = queue_name self.execute(sqlQ, varMap) resQ = self.cur.fetchall() - for configID, in resQ: + for (configID,) in resQ: varMap = dict() - varMap[':configID'] = configID + varMap[":configID"] = configID # delete queue configs self.execute(sqlDQ, varMap) # delete panda queue varMap = dict() - varMap[':queueName'] = queue_name + varMap[":queueName"] = queue_name self.execute(sqlDP, varMap) # commit self.commit() - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: # roll back @@ -5267,21 +5248,20 @@ def disable_multi_workers(self, panda_id): tmpLog = None try: # get logger - tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(panda_id), - method_name='disable_multi_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "PandaID={0}".format(panda_id), method_name="disable_multi_workers") + tmpLog.debug("start") # sql to update flag sqlJ = "UPDATE {0} SET moreWorkers=0 ".format(jobTableName) sqlJ += "WHERE PandaID=:pandaID AND nWorkers IS NOT NULL AND nWorkersLimit IS NOT NULL " sqlJ += "AND nWorkers>0 " # set flag varMap = dict() - varMap[':pandaID'] = panda_id + varMap[":pandaID"] = panda_id self.execute(sqlJ, varMap) nRow = self.cur.rowcount # commit self.commit() - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) # return return nRow except Exception: @@ -5297,26 +5277,25 @@ def update_panda_queue_attribute(self, key, value, site_name=None, queue_name=No tmpLog = None try: # get logger - tmpLog = core_utils.make_logger(_logger, 'site={0} queue={1}'.format(site_name, queue_name), - method_name='update_panda_queue') - tmpLog.debug('start key={0}'.format(key)) + tmpLog = core_utils.make_logger(_logger, "site={0} queue={1}".format(site_name, queue_name), method_name="update_panda_queue") + tmpLog.debug("start key={0}".format(key)) # sql to update sqlJ = "UPDATE {0} SET {1}=:{1} ".format(pandaQueueTableName, key) sqlJ += "WHERE " varMap = dict() - varMap[':{0}'.format(key)] = value + varMap[":{0}".format(key)] = value if site_name is not None: sqlJ += "siteName=:siteName " - varMap[':siteName'] = site_name + varMap[":siteName"] = site_name else: sqlJ += "queueName=:queueName " - varMap[':queueName'] = queue_name + varMap[":queueName"] = queue_name # update self.execute(sqlJ, varMap) nRow = self.cur.rowcount # commit self.commit() - tmpLog.debug('done with {0}'.format(nRow)) + tmpLog.debug("done with {0}".format(nRow)) # return return True except Exception: @@ -5331,9 +5310,8 @@ def update_panda_queue_attribute(self, key, value, site_name=None, queue_name=No def delete_orphaned_job_info(self): try: # get logger - tmpLog = core_utils.make_logger(_logger, - method_name='delete_orphaned_job_info') - tmpLog.debug('start') + tmpLog = 
core_utils.make_logger(_logger, method_name="delete_orphaned_job_info") + tmpLog.debug("start") # sql to get job info to be deleted sqlGJ = "SELECT PandaID FROM {0} " sqlGJ += "WHERE PandaID NOT IN (" @@ -5356,17 +5334,17 @@ def delete_orphaned_job_info(self): self.execute(sqlGJ.format(tableName, jobTableName)) resGJ = self.cur.fetchall() nDel = 0 - for pandaID, in resGJ: + for (pandaID,) in resGJ: # delete varMap = dict() - varMap[':PandaID'] = pandaID + varMap[":PandaID"] = pandaID self.execute(sqlDJ.format(tableName), varMap) iDel = self.cur.rowcount if iDel > 0: nDel += iDel # commit self.commit() - tmpLog.debug('deleted {0} records from {1}'.format(nDel, tableName)) + tmpLog.debug("deleted {0} records from {1}".format(nDel, tableName)) return True except Exception: # roll back @@ -5379,36 +5357,35 @@ def delete_orphaned_job_info(self): # lock worker again to feed events def lock_worker_again_to_feed_events(self, worker_id, locked_by): try: - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(worker_id), - method_name='lock_worker_again_to_feed_events') - tmpLog.debug('start id={0}'.format(locked_by)) + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(worker_id), method_name="lock_worker_again_to_feed_events") + tmpLog.debug("start id={0}".format(locked_by)) # check lock sqlC = "SELECT eventFeedLock,eventFeedTime FROM {0} ".format(workTableName) sqlC += "WHERE workerID=:workerID " sqlC += "FOR UPDATE " varMap = dict() - varMap[':workerID'] = worker_id + varMap[":workerID"] = worker_id self.execute(sqlC, varMap) resC = self.cur.fetchone() if resC is None: retVal = False - tmpLog.debug('not found') + tmpLog.debug("not found") else: oldLockedBy, oldLockedTime = resC if oldLockedBy != locked_by: - tmpLog.debug('locked by another {0} at {1}'.format(oldLockedBy, oldLockedTime)) + tmpLog.debug("locked by another {0} at {1}".format(oldLockedBy, oldLockedTime)) retVal = False else: # update locked time sqlU = "UPDATE {0} SET eventFeedTime=:timeNow WHERE workerID=:workerID ".format(workTableName) varMap = dict() - varMap[':workerID'] = worker_id - varMap[':timeNow'] = datetime.datetime.utcnow() + varMap[":workerID"] = worker_id + varMap[":timeNow"] = datetime.datetime.utcnow() self.execute(sqlU, varMap) retVal = True # commit self.commit() - tmpLog.debug('done with {0}'.format(retVal)) + tmpLog.debug("done with {0}".format(retVal)) # return return retVal except Exception: @@ -5422,8 +5399,8 @@ def lock_worker_again_to_feed_events(self, worker_id, locked_by): # insert service metrics def insert_service_metrics(self, service_metric_spec): # get logger - tmpLog = core_utils.make_logger(_logger, method_name='insert_service_metrics') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="insert_service_metrics") + tmpLog.debug("start") try: sql = "INSERT INTO {0} ({1}) ".format(serviceMetricsTableName, ServiceMetricSpec.column_names()) sql += ServiceMetricSpec.bind_values_expression() @@ -5445,12 +5422,12 @@ def insert_service_metrics(self, service_metric_spec): def get_service_metrics(self, last_update): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_service_metrics') - tmpLog.debug('start with last_update: {0}'.format(last_update)) + tmpLog = core_utils.make_logger(_logger, method_name="get_service_metrics") + tmpLog.debug("start with last_update: {0}".format(last_update)) sql = "SELECT creationTime, hostName, metrics FROM {0} ".format(serviceMetricsTableName) sql += "WHERE creationTime>=:last_update " - var_map = 
{':last_update': last_update} + var_map = {":last_update": last_update} self.execute(sql, var_map) res = self.cur.fetchall() @@ -5458,13 +5435,13 @@ def get_service_metrics(self, last_update): res_corrected = [] for entry in res: try: - res_corrected.append([entry[0].strftime('%Y-%m-%d %H:%M:%S.%f'), entry[1], entry[2]]) + res_corrected.append([entry[0].strftime("%Y-%m-%d %H:%M:%S.%f"), entry[1], entry[2]]) except Exception: pass # commit self.commit() - tmpLog.debug('got {0}'.format(str(res))) + tmpLog.debug("got {0}".format(str(res))) return res_corrected except Exception: # roll back @@ -5478,23 +5455,23 @@ def get_service_metrics(self, last_update): def release_site(self, site_name, locked_by): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='release_site') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="release_site") + tmpLog.debug("start") # sql to release site sql = "UPDATE {0} SET lockedBy=NULL ".format(pandaQueueTableName) sql += "WHERE siteName=:siteName AND lockedBy=:lockedBy " # release site varMap = dict() - varMap[':siteName'] = site_name - varMap[':lockedBy'] = locked_by + varMap[":siteName"] = site_name + varMap[":lockedBy"] = locked_by self.execute(sql, varMap) n_done = self.cur.rowcount > 0 # commit self.commit() if n_done >= 1: - tmpLog.debug('released {0}'.format(site_name)) + tmpLog.debug("released {0}".format(site_name)) else: - tmpLog.debug('found nothing to release. Skipped'.format(site_name)) + tmpLog.debug("found nothing to release. Skipped".format(site_name)) # return return True except Exception: @@ -5509,36 +5486,28 @@ def release_site(self, site_name, locked_by): def get_workers_from_ids(self, ids): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_workers_from_ids') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_workers_from_ids") + tmpLog.debug("start") # sql to get workers sqlW = ( - "SELECT workerID,configID,mapType FROM {workTableName} " - "WHERE workerID IN ({ids_str}) " - "AND status IN (:st_submitted,:st_running,:st_idle) " - ).format(workTableName=workTableName, ids_str=','.join([ str(_) for _ in ids])) + "SELECT workerID,configID,mapType FROM {workTableName} " "WHERE workerID IN ({ids_str}) " "AND status IN (:st_submitted,:st_running,:st_idle) " + ).format(workTableName=workTableName, ids_str=",".join([str(_) for _ in ids])) # sql to get associated workerIDs sqlA = ( "SELECT t.workerID FROM {jobWorkerTableName} t, {jobWorkerTableName} s, {workTableName} w " "WHERE s.PandaID=t.PandaID AND s.workerID=:workerID " "AND w.workerID=t.workerID AND w.status IN (:st_submitted,:st_running,:st_idle) " - ).format(jobWorkerTableName=jobWorkerTableName, workTableName=workTableName) + ).format(jobWorkerTableName=jobWorkerTableName, workTableName=workTableName) # sql to get associated workers - sqlG = ( - "SELECT {0} FROM {1} " - "WHERE workerID=:workerID " - ).format(WorkSpec.column_names(), workTableName) + sqlG = ("SELECT {0} FROM {1} " "WHERE workerID=:workerID ").format(WorkSpec.column_names(), workTableName) # sql to get associated PandaIDs - sqlP = ( - "SELECT PandaID FROM {0} " - "WHERE workerID=:workerID " - ).format(jobWorkerTableName) + sqlP = ("SELECT PandaID FROM {0} " "WHERE workerID=:workerID ").format(jobWorkerTableName) # get workerIDs timeNow = datetime.datetime.utcnow() varMap = dict() - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + 
varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlW, varMap) resW = self.cur.fetchall() tmpWorkers = set() @@ -5555,14 +5524,14 @@ def get_workers_from_ids(self, ids): continue # get associated workerIDs varMap = dict() - varMap[':workerID'] = workerID - varMap[':st_submitted'] = WorkSpec.ST_submitted - varMap[':st_running'] = WorkSpec.ST_running - varMap[':st_idle'] = WorkSpec.ST_idle + varMap[":workerID"] = workerID + varMap[":st_submitted"] = WorkSpec.ST_submitted + varMap[":st_running"] = WorkSpec.ST_running + varMap[":st_idle"] = WorkSpec.ST_idle self.execute(sqlA, varMap) resA = self.cur.fetchall() workerIDtoScan = set() - for tmpWorkID, in resA: + for (tmpWorkID,) in resA: workerIDtoScan.add(tmpWorkID) # add original ID just in case since no relation when job is not yet bound workerIDtoScan.add(workerID) @@ -5577,7 +5546,7 @@ def get_workers_from_ids(self, ids): checkedIDs.add(tmpWorkID) # get worker varMap = dict() - varMap[':workerID'] = tmpWorkID + varMap[":workerID"] = tmpWorkID self.execute(sqlG, varMap) resG = self.cur.fetchone() workSpec = WorkSpec() @@ -5587,11 +5556,11 @@ def get_workers_from_ids(self, ids): workersList.append(workSpec) # get associated PandaIDs varMap = dict() - varMap[':workerID'] = tmpWorkID + varMap[":workerID"] = tmpWorkID self.execute(sqlP, varMap) resP = self.cur.fetchall() workSpec.pandaid_list = [] - for tmpPandaID, in resP: + for (tmpPandaID,) in resP: workSpec.pandaid_list.append(tmpPandaID) if len(workSpec.pandaid_list) > 0: workSpec.nJobs = len(workSpec.pandaid_list) @@ -5602,7 +5571,7 @@ def get_workers_from_ids(self, ids): retVal.setdefault(queueName, dict()) retVal[queueName].setdefault(configID, []) retVal[queueName][configID].append(workersList) - tmpLog.debug('got {0}'.format(str(retVal))) + tmpLog.debug("got {0}".format(str(retVal))) return retVal except Exception: # roll back @@ -5616,31 +5585,33 @@ def get_workers_from_ids(self, ids): def mark_workers_to_kill_by_query(self, params, delay_seconds=None): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='mark_workers_to_kill_by_query') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="mark_workers_to_kill_by_query") + tmpLog.debug("start") # sql to set killTime sqlL = "UPDATE {0} SET killTime=:setTime ".format(workTableName) sqlL += "WHERE workerID=:workerID AND killTime IS NULL AND NOT status IN (:st1,:st2,:st3) " # sql to get workers constraints_query_string_list = [] tmp_varMap = {} - constraint_map = {'status': params.get('status', [WorkSpec.ST_submitted]), - 'computingSite': params.get('computingSite', []), - 'computingElement': params.get('computingElement', []), - 'submissionHost': params.get('submissionHost', [])} - tmpLog.debug('query {0}'.format(constraint_map)) + constraint_map = { + "status": params.get("status", [WorkSpec.ST_submitted]), + "computingSite": params.get("computingSite", []), + "computingElement": params.get("computingElement", []), + "submissionHost": params.get("submissionHost", []), + } + tmpLog.debug("query {0}".format(constraint_map)) for attribute, match_list in iteritems(constraint_map): - if match_list == 'ALL': + if match_list == "ALL": pass elif not match_list: - tmpLog.debug('{0} constraint is not specified in the query. Skipped'.format(attribute)) + tmpLog.debug("{0} constraint is not specified in the query. 
Skipped".format(attribute)) return 0 else: - one_param_list = [':param_{0}_{1}'.format(attribute, v_i) for v_i in range(len(match_list))] + one_param_list = [":param_{0}_{1}".format(attribute, v_i) for v_i in range(len(match_list))] tmp_varMap.update(zip(one_param_list, match_list)) - params_string = '(' + ','.join(one_param_list) + ')' - constraints_query_string_list.append('{0} IN {1}'.format(attribute, params_string)) - constraints_query_string = ' AND '.join(constraints_query_string_list) + params_string = "(" + ",".join(one_param_list) + ")" + constraints_query_string_list.append("{0} IN {1}".format(attribute, params_string)) + constraints_query_string = " AND ".join(constraints_query_string_list) sqlW = "SELECT workerID FROM {0} ".format(workTableName) sqlW += "WHERE {0} ".format(constraints_query_string) # set time to trigger sweeper @@ -5656,19 +5627,19 @@ def mark_workers_to_kill_by_query(self, params, delay_seconds=None): self.execute(sqlW, varMap) resW = self.cur.fetchall() nRow = 0 - for workerID, in resW: + for (workerID,) in resW: # set killTime varMap = dict() - varMap[':workerID'] = workerID - varMap[':setTime'] = setTime - varMap[':st1'] = WorkSpec.ST_finished - varMap[':st2'] = WorkSpec.ST_failed - varMap[':st3'] = WorkSpec.ST_cancelled + varMap[":workerID"] = workerID + varMap[":setTime"] = setTime + varMap[":st1"] = WorkSpec.ST_finished + varMap[":st2"] = WorkSpec.ST_failed + varMap[":st3"] = WorkSpec.ST_cancelled self.execute(sqlL, varMap) nRow += self.cur.rowcount # commit self.commit() - tmpLog.debug('set killTime to {0} workers'.format(nRow)) + tmpLog.debug("set killTime to {0} workers".format(nRow)) return nRow except Exception: # roll back @@ -5682,23 +5653,22 @@ def mark_workers_to_kill_by_query(self, params, delay_seconds=None): def get_all_active_input_files(self): try: # get logger - tmpLog = core_utils.make_logger(_logger, - method_name='get_all_active_input_files') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="get_all_active_input_files") + tmpLog.debug("start") # sql to get files sqlF = "SELECT lfn FROM {0} ".format(fileTableName) sqlF += "WHERE fileType IN (:type1,:type2) " # get files varMap = dict() - varMap[':type1'] = 'input' - varMap[':type2'] = FileSpec.AUX_INPUT + varMap[":type1"] = "input" + varMap[":type2"] = FileSpec.AUX_INPUT self.execute(sqlF, varMap) ret = set() - for lfn, in self.cur.fetchall(): + for (lfn,) in self.cur.fetchall(): ret.add(lfn) # commit self.commit() - tmpLog.debug('got {0} files'.format(len(ret))) + tmpLog.debug("got {0} files".format(len(ret))) return ret except Exception: # roll back diff --git a/pandaharvester/harvestercore/db_proxy_pool.py b/pandaharvester/harvestercore/db_proxy_pool.py index 58aa40de..bd4fc502 100644 --- a/pandaharvester/harvestercore/db_proxy_pool.py +++ b/pandaharvester/harvestercore/db_proxy_pool.py @@ -6,7 +6,7 @@ from . import core_utils # logger -_logger = core_utils.setup_logger('db_proxy_pool') +_logger = core_utils.setup_logger("db_proxy_pool") # method wrapper @@ -18,19 +18,19 @@ def __init__(self, method_name, pool): # method emulation def __call__(self, *args, **kwargs): - tmpLog = core_utils.make_logger(_logger, 'method={0}'.format(self.methodName), method_name='call') + tmpLog = core_utils.make_logger(_logger, "method={0}".format(self.methodName), method_name="call") sw = core_utils.get_stopwatch() try: # get connection con = self.pool.get() - tmpLog.debug('got lock. qsize={0} {1}'.format(self.pool.qsize(), sw.get_elapsed_time())) + tmpLog.debug("got lock. 
qsize={0} {1}".format(self.pool.qsize(), sw.get_elapsed_time())) sw.reset() # get function func = getattr(con, self.methodName) # exec return func(*args, **kwargs) finally: - tmpLog.debug('release lock' + sw.get_elapsed_time()) + tmpLog.debug("release lock" + sw.get_elapsed_time()) self.pool.put(con) @@ -46,7 +46,7 @@ def __init__(self, read_only=False): # initialize def initialize(self, read_only=False): # install members - object.__setattr__(self, 'pool', None) + object.__setattr__(self, "pool", None) # connection pool self.pool = queue.Queue(harvester_config.db.nConnections) currentThr = threading.current_thread() @@ -54,9 +54,9 @@ def initialize(self, read_only=False): thrID = None else: thrID = currentThr.ident - thrName = '{0}-{1}'.format(os.getpid(), thrID) + thrName = "{0}-{1}".format(os.getpid(), thrID) for i in range(harvester_config.db.nConnections): - con = DBProxy(thr_name='{0}-{1}'.format(thrName, i), read_only=read_only) + con = DBProxy(thr_name="{0}-{1}".format(thrName, i), read_only=read_only) self.pool.put(con) # override __new__ to have a singleton @@ -64,7 +64,7 @@ def __new__(cls, *args, **kwargs): if cls.instance is None: with cls.lock: if cls.instance is None: - if 'read_only' in kwargs and kwargs['read_only']: + if "read_only" in kwargs and kwargs["read_only"]: read_only = True else: read_only = False diff --git a/pandaharvester/harvestercore/diag_spec.py b/pandaharvester/harvestercore/diag_spec.py index ac1bbbf4..7123278c 100644 --- a/pandaharvester/harvestercore/diag_spec.py +++ b/pandaharvester/harvestercore/diag_spec.py @@ -9,15 +9,16 @@ class DiagSpec(SpecBase): # attributes - attributesWithTypes = ('diagID:integer primary key autoincrement', - 'moduleName:text / index', - 'identifier:text', - 'creationTime:timestamp / index', - 'lockTime:timestamp / index', - 'messageLevel:text', - 'lockedBy:integer / index', - 'diagMessage:varchar(500)' - ) + attributesWithTypes = ( + "diagID:integer primary key autoincrement", + "moduleName:text / index", + "identifier:text", + "creationTime:timestamp / index", + "lockTime:timestamp / index", + "messageLevel:text", + "lockedBy:integer / index", + "diagMessage:varchar(500)", + ) # constructor def __init__(self): @@ -26,14 +27,9 @@ def __init__(self): # convert to propagate def convert_to_propagate(self): data = dict() - for attr in ['diagID', - 'moduleName', - 'identifier', - 'creationTime', - 'messageLevel', - 'diagMessage']: + for attr in ["diagID", "moduleName", "identifier", "creationTime", "messageLevel", "diagMessage"]: val = getattr(self, attr) if isinstance(val, datetime.datetime): - val = val.strftime('%Y-%m-%d %H:%M:%S.%f') + val = val.strftime("%Y-%m-%d %H:%M:%S.%f") data[attr] = val return data diff --git a/pandaharvester/harvestercore/event_spec.py b/pandaharvester/harvestercore/event_spec.py index e8cc974f..2048ebd8 100644 --- a/pandaharvester/harvestercore/event_spec.py +++ b/pandaharvester/harvestercore/event_spec.py @@ -11,15 +11,16 @@ class EventSpec(SpecBase): # attributes - attributesWithTypes = ('eventRangeID:text / index', - 'PandaID:integer / index', - 'eventStatus:text', - 'coreCount:integer', - 'cpuConsumptionTime:integer', - 'subStatus:text / index', - 'fileID:integer', - 'loss:text' - ) + attributesWithTypes = ( + "eventRangeID:text / index", + "PandaID:integer / index", + "eventStatus:text", + "coreCount:integer", + "cpuConsumptionTime:integer", + "subStatus:text / index", + "fileID:integer", + "loss:text", + ) # constructor def __init__(self): @@ -30,14 +31,13 @@ def to_data(self): data = {} for 
attr in self.attributes: # ignore some attributes - if attr not in ['eventRangeID', 'eventStatus', 'coreCount', - 'cpuConsumptionTime', 'loss']: + if attr not in ["eventRangeID", "eventStatus", "coreCount", "cpuConsumptionTime", "loss"]: continue val = getattr(self, attr) # don't propagate finished until subStatus is finished - if attr == 'eventStatus': - if val == 'finished' and not self.is_final_status(): - val = 'running' + if attr == "eventStatus": + if val == "finished" and not self.is_final_status(): + val = "running" if val is not None: data[attr] = val return data @@ -53,4 +53,4 @@ def from_data(self, data, panda_id): # final status def is_final_status(self): - return self.subStatus in ['finished', 'done', 'failed'] + return self.subStatus in ["finished", "done", "failed"] diff --git a/pandaharvester/harvestercore/fifos.py b/pandaharvester/harvestercore/fifos.py index d90da70a..e1c03ab9 100644 --- a/pandaharvester/harvestercore/fifos.py +++ b/pandaharvester/harvestercore/fifos.py @@ -7,6 +7,7 @@ from future.utils import iteritems import json + try: import cPickle as pickle except ImportError: @@ -24,15 +25,17 @@ from pandaharvester.harvestercore.db_interface import DBInterface # attribute list -_attribute_list = ['id', 'item', 'score'] +_attribute_list = ["id", "item", "score"] # fifo object spec -FifoObject = collections.namedtuple('FifoObject', _attribute_list, rename=False) +FifoObject = collections.namedtuple("FifoObject", _attribute_list, rename=False) # logger -_logger = core_utils.setup_logger('fifos') +_logger = core_utils.setup_logger("fifos") # base class of fifo message queue + + class FIFOBase(object): # constructor def __init__(self, **kwarg): @@ -48,11 +51,11 @@ def get_pid(self): thread_id = get_ident() if thread_id is None: thread_id = 0 - return '{0}_{1}-{2}'.format(self.hostname, self.os_pid, format(get_ident(), 'x')) + return "{0}_{1}-{2}".format(self.hostname, self.os_pid, format(get_ident(), "x")) # make logger def make_logger(self, base_log, token=None, method_name=None, send_dialog=True): - if send_dialog and hasattr(self, 'dbInterface'): + if send_dialog and hasattr(self, "dbInterface"): hook = self.dbInterface else: hook = None @@ -60,25 +63,33 @@ def make_logger(self, base_log, token=None, method_name=None, send_dialog=True): # intialize fifo from harvester configuration def _initialize_fifo(self, force_enable=False): - self.fifoName = '{0}_fifo'.format(self.titleName) + self.fifoName = "{0}_fifo".format(self.titleName) self.config = getattr(harvester_config, self.titleName) if force_enable: self.enabled = True - elif hasattr(self.config, 'fifoEnable') and self.config.fifoEnable: + elif hasattr(self.config, "fifoEnable") and self.config.fifoEnable: self.enabled = True else: self.enabled = False return pluginConf = vars(self.config).copy() - pluginConf.update( {'titleName': self.titleName} ) - if hasattr(self.config, 'fifoModule') and hasattr(self.config, 'fifoClass'): - pluginConf.update( {'module': self.config.fifoModule, - 'name': self.config.fifoClass,} ) + pluginConf.update({"titleName": self.titleName}) + if hasattr(self.config, "fifoModule") and hasattr(self.config, "fifoClass"): + pluginConf.update( + { + "module": self.config.fifoModule, + "name": self.config.fifoClass, + } + ) else: - if not hasattr(harvester_config, 'fifo'): + if not hasattr(harvester_config, "fifo"): return - pluginConf.update( {'module': harvester_config.fifo.fifoModule, - 'name': harvester_config.fifo.fifoClass,} ) + pluginConf.update( + { + "module": 
harvester_config.fifo.fifoModule, + "name": harvester_config.fifo.fifoClass, + } + ) pluginFactory = PluginFactory() self.fifo = pluginFactory.get_plugin(pluginConf) @@ -94,14 +105,14 @@ def decode(self, item_serialized): # size of queue def size(self): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='size') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="size") retVal = self.fifo.size() - mainLog.debug('size={0}'.format(retVal)) + mainLog.debug("size={0}".format(retVal)) return retVal # enqueue def put(self, item, score=None, encode_item=True): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='put') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="put") if encode_item: item_serialized = self.encode(item) else: @@ -109,12 +120,12 @@ def put(self, item, score=None, encode_item=True): if score is None: score = time.time() retVal = self.fifo.put(item_serialized, score) - mainLog.debug('score={0}'.format(score)) + mainLog.debug("score={0}".format(score)) return retVal # enqueue by id, which is unique def putbyid(self, id, item, score=None, encode_item=True): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='putbyid') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="putbyid") if encode_item: item_serialized = self.encode(item) else: @@ -122,12 +133,12 @@ def putbyid(self, id, item, score=None, encode_item=True): if score is None: score = time.time() retVal = self.fifo.putbyid(id, item_serialized, score) - mainLog.debug('id={0} score={1}'.format(id, score)) + mainLog.debug("id={0} score={1}".format(id, score)) return retVal # dequeue to get the first fifo object def get(self, timeout=None, protective=False, decode_item=True): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='get') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="get") object_tuple = self.fifo.get(timeout, protective) if object_tuple is None: retVal = None @@ -138,12 +149,12 @@ def get(self, timeout=None, protective=False, decode_item=True): else: item = item_serialized retVal = FifoObject(id, item, score) - mainLog.debug('called. protective={0} decode_item={1}'.format(protective, decode_item)) + mainLog.debug("called. protective={0} decode_item={1}".format(protective, decode_item)) return retVal # dequeue to get the last fifo object def getlast(self, timeout=None, protective=False, decode_item=True): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='getlast') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="getlast") object_tuple = self.fifo.getlast(timeout, protective) if object_tuple is None: retVal = None @@ -154,16 +165,15 @@ def getlast(self, timeout=None, protective=False, decode_item=True): else: item = item_serialized retVal = FifoObject(id, item, score) - mainLog.debug('called. protective={0} decode_item={1}'.format(protective, decode_item)) + mainLog.debug("called. 
protective={0} decode_item={1}".format(protective, decode_item)) return retVal # dequeue list of objects with some conditions - def getmany(self, mode='first', minscore=None, maxscore=None, count=None, - protective=False, temporary=False, decode_item=True): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='getmany') + def getmany(self, mode="first", minscore=None, maxscore=None, count=None, protective=False, temporary=False, decode_item=True): + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="getmany") object_tuple_list = self.fifo.getmany(mode, minscore, maxscore, count, protective, temporary) if not object_tuple_list: - mainLog.debug('empty list') + mainLog.debug("empty list") ret_list = [] for object_tuple in object_tuple_list: id, item_serialized, score = object_tuple @@ -173,18 +183,21 @@ def getmany(self, mode='first', minscore=None, maxscore=None, count=None, item = item_serialized val_tuple = FifoObject(id, item, score) ret_list.append(val_tuple) - mainLog.debug('mode={0} minscore={1} maxscore={2} count={3} protective={4} temporary={5} decode_item={6}'.format( - mode, minscore, maxscore, count, protective, temporary, decode_item)) + mainLog.debug( + "mode={0} minscore={1} maxscore={2} count={3} protective={4} temporary={5} decode_item={6}".format( + mode, minscore, maxscore, count, protective, temporary, decode_item + ) + ) return ret_list # get tuple of the first object and its score without dequeuing # If item is large un unnecessary to show int peek, set skip_item=True def peek(self, skip_item=False): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='peek') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="peek") object_tuple = self.fifo.peek(skip_item=skip_item) if object_tuple is None: retVal = None - mainLog.debug('fifo empty') + mainLog.debug("fifo empty") else: id, item_serialized, score = object_tuple if item_serialized is None and score is None: @@ -193,16 +206,16 @@ def peek(self, skip_item=False): if score is None: score = time.time() retVal = FifoObject(id, item_serialized, score) - mainLog.debug('score={0}'.format(score)) + mainLog.debug("score={0}".format(score)) return retVal # get tuple of the last object and its score without dequeuing def peeklast(self, skip_item=False): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='peeklast') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="peeklast") object_tuple = self.fifo.peeklast(skip_item=skip_item) if object_tuple is None: retVal = None - mainLog.debug('fifo empty') + mainLog.debug("fifo empty") else: id, item_serialized, score = object_tuple if item_serialized is None and score is None: @@ -211,16 +224,16 @@ def peeklast(self, skip_item=False): if score is None: score = time.time() retVal = FifoObject(id, item_serialized, score) - mainLog.debug('score={0}'.format(score)) + mainLog.debug("score={0}".format(score)) return retVal # get tuple of the object by id without dequeuing def peekbyid(self, id, temporary=False, skip_item=False): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='peekbyid') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="peekbyid") object_tuple = self.fifo.peekbyid(id, temporary, 
skip_item=skip_item) if object_tuple is None: retVal = None - mainLog.debug('fifo empty') + mainLog.debug("fifo empty") else: id_gotten, item_serialized, score = object_tuple if item_serialized is None and score is None: @@ -229,15 +242,15 @@ def peekbyid(self, id, temporary=False, skip_item=False): if score is None: score = time.time() retVal = FifoObject(id, item_serialized, score) - mainLog.debug('id={0} score={1} temporary={2}'.format(id, score, temporary)) + mainLog.debug("id={0} score={1} temporary={2}".format(id, score, temporary)) return retVal # get list of object tuples without dequeuing - def peekmany(self, mode='first', minscore=None, maxscore=None, count=None, skip_item=False): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='peekmany') + def peekmany(self, mode="first", minscore=None, maxscore=None, count=None, skip_item=False): + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="peekmany") object_tuple_list = self.fifo.peekmany(mode, minscore, maxscore, count, skip_item) if not object_tuple_list: - mainLog.debug('empty list') + mainLog.debug("empty list") ret_list = [] for object_tuple in object_tuple_list: id_gotten, item_serialized, score = object_tuple @@ -248,39 +261,39 @@ def peekmany(self, mode='first', minscore=None, maxscore=None, count=None, skip_ score = time.time() val_tuple = FifoObject(id, item_serialized, score) ret_list.append(val_tuple) - mainLog.debug('mode={0} minscore={1} maxscore={2} count={3}'.format(mode, minscore, maxscore, count)) + mainLog.debug("mode={0} minscore={1} maxscore={2} count={3}".format(mode, minscore, maxscore, count)) return ret_list # delete objects by list of ids from temporary space, return the number of objects successfully deleted def delete(self, ids): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='release') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="release") retVal = self.fifo.delete(ids) - mainLog.debug('released {0} objects in {1}'.format(retVal, ids)) + mainLog.debug("released {0} objects in {1}".format(retVal, ids)) return retVal # restore objects by list of ids from temporary space to fifo; ids=None to restore all objects def restore(self, ids=None): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='restore') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="restore") retVal = self.fifo.restore(ids) if ids is None: - mainLog.debug('restored all objects') + mainLog.debug("restored all objects") else: - mainLog.debug('restored objects in {0}'.format(ids)) + mainLog.debug("restored objects in {0}".format(ids)) return retVal # update a object by its id with some conditions - def update(self, id, item=None, score=None, temporary=None, cond_score='gt'): - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='update') + def update(self, id, item=None, score=None, temporary=None, cond_score="gt"): + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="update") retVal = self.fifo.update(id, item, score, temporary, cond_score) update_report_list = [] if item is not None: - update_report_list.append('item={0}'.format(item)) + update_report_list.append("item={0}".format(item)) if score is not None: - 
update_report_list.append('score={0}'.format(score)) + update_report_list.append("score={0}".format(score)) if temporary is not None: - update_report_list.append('temporary={0}'.format(temporary)) - update_report = ' '.join(update_report_list) - mainLog.debug('update id={0} cond_score={1}: return={2}, {3}'.format(id, cond_score, retVal, update_report)) + update_report_list.append("temporary={0}".format(temporary)) + update_report = " ".join(update_report_list) + mainLog.debug("update id={0} cond_score={1}: return={2}, {3}".format(id, cond_score, retVal, update_report)) return retVal @@ -289,23 +302,27 @@ class SpecialFIFOBase(FIFOBase): # constructor def __init__(self, **kwarg): FIFOBase.__init__(self, **kwarg) - self.fifoName = '{0}_fifo'.format(self.titleName) + self.fifoName = "{0}_fifo".format(self.titleName) pluginConf = {} - pluginConf.update( {'titleName': self.titleName} ) - pluginConf.update( {'module': harvester_config.fifo.fifoModule, - 'name': harvester_config.fifo.fifoClass,} ) + pluginConf.update({"titleName": self.titleName}) + pluginConf.update( + { + "module": harvester_config.fifo.fifoModule, + "name": harvester_config.fifo.fifoClass, + } + ) pluginFactory = PluginFactory() self.fifo = pluginFactory.get_plugin(pluginConf) # Benchmark fifo class BenchmarkFIFO(SpecialFIFOBase): - titleName = 'benchmark' + titleName = "benchmark" # monitor fifo class MonitorFIFO(FIFOBase): - titleName = 'monitor' + titleName = "monitor" # constructor def __init__(self, **kwarg): @@ -334,7 +351,7 @@ def populate(self, seconds_ago=0, clear_fifo=False): timeNow_timestamp = time.time() score = timeNow_timestamp for workspec in workspec_iterator: - workspec.set_work_params({'lastCheckAt': timeNow_timestamp}) + workspec.set_work_params({"lastCheckAt": timeNow_timestamp}) if last_queueName is None: try: score = timegm(workspec.modificationTime.utctimetuple()) @@ -342,8 +359,7 @@ def populate(self, seconds_ago=0, clear_fifo=False): pass workspec_chunk = [[workspec]] last_queueName = workspec.computingSite - elif workspec.computingSite == last_queueName \ - and len(workspec_chunk) < fifoMaxWorkersPerChunk: + elif workspec.computingSite == last_queueName and len(workspec_chunk) < fifoMaxWorkersPerChunk: workspec_chunk.append([workspec]) else: self.put((last_queueName, workspec_chunk), score) @@ -363,7 +379,7 @@ def to_check_workers(self, check_interval=harvester_config.monitor.checkInterval retVal False otherwise. Return retVal, overhead_time """ - mainLog = self.make_logger(_logger, 'id={0}-{1}'.format(self.fifoName, self.get_pid()), method_name='to_check_worker') + mainLog = self.make_logger(_logger, "id={0}-{1}".format(self.fifoName, self.get_pid()), method_name="to_check_worker") retVal = False overhead_time = None timeNow_timestamp = time.time() @@ -374,27 +390,26 @@ def to_check_workers(self, check_interval=harvester_config.monitor.checkInterval if overhead_time > 0: retVal = True if score < 0: - mainLog.debug('True. Preempting') + mainLog.debug("True. Preempting") overhead_time = None else: - mainLog.debug('True') - mainLog.info('Overhead time is {0:.3f} sec'.format(overhead_time)) + mainLog.debug("True") + mainLog.info("Overhead time is {0:.3f} sec".format(overhead_time)) else: - mainLog.debug('False. Workers too young to check') - mainLog.debug('Overhead time is {0:.3f} sec'.format(overhead_time)) + mainLog.debug("False. Workers too young to check") + mainLog.debug("Overhead time is {0:.3f} sec".format(overhead_time)) else: - mainLog.debug('False. Got nothing in FIFO') + mainLog.debug("False. 
Got nothing in FIFO") return retVal, overhead_time class MonitorEventFIFO(SpecialFIFOBase): - titleName = 'monitorEvent' + titleName = "monitorEvent" # constructor def __init__(self, **kwarg): - self.config = getattr(harvester_config, 'monitor') + self.config = getattr(harvester_config, "monitor") self.enabled = False - if hasattr(self.config, 'fifoEnable') and self.config.fifoEnable \ - and getattr(self.config, 'eventBasedEnable', False): + if hasattr(self.config, "fifoEnable") and self.config.fifoEnable and getattr(self.config, "eventBasedEnable", False): self.enabled = True SpecialFIFOBase.__init__(self, **kwarg) diff --git a/pandaharvester/harvestercore/file_spec.py b/pandaharvester/harvestercore/file_spec.py index 55be4487..6e8c1624 100644 --- a/pandaharvester/harvestercore/file_spec.py +++ b/pandaharvester/harvestercore/file_spec.py @@ -7,48 +7,46 @@ class FileSpec(SpecBase): - # file type - AUX_INPUT = 'aux_input' + AUX_INPUT = "aux_input" # attributes - attributesWithTypes = ('fileID:integer primary key autoincrement', - 'PandaID:integer / index', - 'taskID:integer', - 'lfn:text / index', - 'status:text / index', - 'fsize:integer', - 'chksum:text', - 'path:text', - 'fileType:text', - 'eventRangeID:text', - 'modificationTime:timestamp', - 'fileAttributes:blob', - 'isZip:integer', - 'zipFileID:integer / index', - 'objstoreID:integer', - 'endpoint:text', - 'groupID:text / index', - 'groupStatus:text / index', - 'groupUpdateTime:timestamp / index', - 'attemptNr:integer', - 'todelete:integer / index', - 'scope:text', - 'pathConvention:integer', - 'provenanceID:text / index', - 'workerID:integer / index', - 'url:text' - ) + attributesWithTypes = ( + "fileID:integer primary key autoincrement", + "PandaID:integer / index", + "taskID:integer", + "lfn:text / index", + "status:text / index", + "fsize:integer", + "chksum:text", + "path:text", + "fileType:text", + "eventRangeID:text", + "modificationTime:timestamp", + "fileAttributes:blob", + "isZip:integer", + "zipFileID:integer / index", + "objstoreID:integer", + "endpoint:text", + "groupID:text / index", + "groupStatus:text / index", + "groupUpdateTime:timestamp / index", + "attemptNr:integer", + "todelete:integer / index", + "scope:text", + "pathConvention:integer", + "provenanceID:text / index", + "workerID:integer / index", + "url:text", + ) # attributes initialized with 0 - zeroAttrs = ('attemptNr', - 'todelete' - ) + zeroAttrs = ("attemptNr", "todelete") # constructor def __init__(self): SpecBase.__init__(self) - object.__setattr__(self, 'associatedFiles', set()) + object.__setattr__(self, "associatedFiles", set()) # add associated files def add_associated_file(self, filespec): diff --git a/pandaharvester/harvestercore/job_spec.py b/pandaharvester/harvestercore/job_spec.py index 227abdc6..6f2116c1 100644 --- a/pandaharvester/harvestercore/job_spec.py +++ b/pandaharvester/harvestercore/job_spec.py @@ -28,69 +28,67 @@ class JobSpec(SpecBase): AUX_allReady = 4 # attributes - attributesWithTypes = ('PandaID:integer primary key', - 'taskID:integer / index', - 'attemptNr:integer', - 'status:text', - 'subStatus:text / index', - 'currentPriority:integer / index', - 'computingSite:text / index', - 'creationTime:timestamp', - 'modificationTime:timestamp / index', - 'stateChangeTime:timestamp', - 'startTime:timestamp', - 'endTime:timestamp', - 'nCore:integer', - 'jobParams:blob', - 'jobAttributes:blob', - 'hasOutFile:integer', - 'metaData:blob', - 'outputFilesToReport:blob', - 'lockedBy:text', - 'propagatorLock:text', - 'propagatorTime:timestamp / 
index', - 'preparatorTime:timestamp / index', - 'submitterTime:timestamp', - 'stagerLock:text', - 'stagerTime:timestamp / index', - 'zipPerMB:integer', - 'nWorkers:integer', - 'nWorkersLimit:integer', - 'submissionAttempts:integer', - 'jobsetID:integer', - 'pilotClosed:integer', - 'configID:integer / index', - 'nRemainingEvents:integer', - 'moreWorkers:integer', - 'maxWorkersInTotal:integer', - 'nWorkersInTotal:integer', - 'jobParamsExtForOutput:blob', - 'jobParamsExtForLog:blob', - 'auxInput:integer' - ) + attributesWithTypes = ( + "PandaID:integer primary key", + "taskID:integer / index", + "attemptNr:integer", + "status:text", + "subStatus:text / index", + "currentPriority:integer / index", + "computingSite:text / index", + "creationTime:timestamp", + "modificationTime:timestamp / index", + "stateChangeTime:timestamp", + "startTime:timestamp", + "endTime:timestamp", + "nCore:integer", + "jobParams:blob", + "jobAttributes:blob", + "hasOutFile:integer", + "metaData:blob", + "outputFilesToReport:blob", + "lockedBy:text", + "propagatorLock:text", + "propagatorTime:timestamp / index", + "preparatorTime:timestamp / index", + "submitterTime:timestamp", + "stagerLock:text", + "stagerTime:timestamp / index", + "zipPerMB:integer", + "nWorkers:integer", + "nWorkersLimit:integer", + "submissionAttempts:integer", + "jobsetID:integer", + "pilotClosed:integer", + "configID:integer / index", + "nRemainingEvents:integer", + "moreWorkers:integer", + "maxWorkersInTotal:integer", + "nWorkersInTotal:integer", + "jobParamsExtForOutput:blob", + "jobParamsExtForLog:blob", + "auxInput:integer", + ) # attributes initialized with 0 - zeroAttrs = ('nWorkers', - 'submissionAttempts', - 'nWorkersInTotal' - ) + zeroAttrs = ("nWorkers", "submissionAttempts", "nWorkersInTotal") # attributes to skip when slim reading - skipAttrsToSlim = ('jobParams') + skipAttrsToSlim = "jobParams" # constructor def __init__(self): SpecBase.__init__(self) - object.__setattr__(self, 'events', set()) - object.__setattr__(self, 'zipEventMap', {}) - object.__setattr__(self, 'inFiles', set()) - object.__setattr__(self, 'outFiles', set()) - object.__setattr__(self, 'zipFileMap', {}) - object.__setattr__(self, 'workspec_list', []) + object.__setattr__(self, "events", set()) + object.__setattr__(self, "zipEventMap", {}) + object.__setattr__(self, "inFiles", set()) + object.__setattr__(self, "outFiles", set()) + object.__setattr__(self, "zipFileMap", {}) + object.__setattr__(self, "workspec_list", []) # add file def add_file(self, filespec): - if filespec.fileType == 'input': + if filespec.fileType == "input": self.add_in_file(filespec) else: self.add_out_file(filespec) @@ -122,35 +120,34 @@ def add_event(self, event_spec, zip_filespec): else: zipFileID = zip_filespec.fileID if zipFileID not in self.zipEventMap: - self.zipEventMap[zipFileID] = {'events': set(), - 'zip': zip_filespec} - self.zipEventMap[zipFileID]['events'].add(event_spec) + self.zipEventMap[zipFileID] = {"events": set(), "zip": zip_filespec} + self.zipEventMap[zipFileID]["events"].add(event_spec) self.events.add(event_spec) # convert from Job JSON def convert_job_json(self, data): # decode secrets try: - if 'secrets' in data: - data['secrets'] = json.loads(data['secrets']) + if "secrets" in data: + data["secrets"] = json.loads(data["secrets"]) except Exception: pass - self.PandaID = data['PandaID'] - if data['taskID'] == 'NULL': + self.PandaID = data["PandaID"] + if data["taskID"] == "NULL": self.taskID = None else: - self.taskID = data['taskID'] - self.attemptNr = 
data['attemptNr'] - if data['jobsetID'] == 'NULL': + self.taskID = data["taskID"] + self.attemptNr = data["attemptNr"] + if data["jobsetID"] == "NULL": self.jobsetID = None else: - self.jobsetID = data['jobsetID'] - self.currentPriority = data['currentPriority'] + self.jobsetID = data["jobsetID"] + self.currentPriority = data["currentPriority"] self.jobParams = data self.jobParamsExtForOutput = self.get_output_file_attributes() self.jobParamsExtForLog = self.get_logfile_info() - if 'zipPerMB' in data: - self.zipPerMB = data['zipPerMB'] + if "zipPerMB" in data: + self.zipPerMB = data["zipPerMB"] # trigger propagation def trigger_propagation(self): @@ -170,7 +167,7 @@ def set_attributes(self, attrs): return attrs = copy.copy(attrs) # set work attribute - for attName in ['pilotErrorCode', 'pilotErrorDiag', 'exeErrorCode', 'exeErrorDiag']: + for attName in ["pilotErrorCode", "pilotErrorDiag", "exeErrorCode", "exeErrorDiag"]: if attName in attrs: if self.PandaID not in attrs: attrs[self.PandaID] = dict() @@ -180,19 +177,19 @@ def set_attributes(self, attrs): return attrs = copy.copy(attrs[self.PandaID]) # set metadata and outputs to dedicated attributes - if 'metaData' in attrs: - self.metaData = attrs['metaData'] - del attrs['metaData'] - if 'xml' in attrs: - self.outputFilesToReport = attrs['xml'] - del attrs['xml'] + if "metaData" in attrs: + self.metaData = attrs["metaData"] + del attrs["metaData"] + if "xml" in attrs: + self.outputFilesToReport = attrs["xml"] + del attrs["xml"] if self.jobAttributes is None: self.jobAttributes = attrs else: for key, val in iteritems(attrs): if key not in self.jobAttributes or self.jobAttributes[key] != val: self.jobAttributes[key] = val - self.force_update('jobAttributes') + self.force_update("jobAttributes") # set one attribute def set_one_attribute(self, attr, value): @@ -200,7 +197,7 @@ def set_one_attribute(self, attr, value): self.jobAttributes = dict() if attr not in self.jobAttributes or self.jobAttributes[attr] != value: self.jobAttributes[attr] = value - self.force_update('jobAttributes') + self.force_update("jobAttributes") # check if an attribute is there def has_attribute(self, attr): @@ -218,21 +215,20 @@ def get_one_attribute(self, attr): def is_final_status(self, job_status=None): if job_status is None: job_status = self.status - return job_status in ['finished', 'failed', 'cancelled', 'missed'] + return job_status in ["finished", "failed", "cancelled", "missed"] # get status def get_status(self): # don't report the final status while staging-out - if self.is_final_status() and self.subStatus not in ['killed'] and \ - (self.subStatus in ['to_transfer', 'transferring'] or not self.all_events_done()): - return 'transferring' + if self.is_final_status() and self.subStatus not in ["killed"] and (self.subStatus in ["to_transfer", "transferring"] or not self.all_events_done()): + return "transferring" return self.status # check if all events are done def all_events_done(self): retVal = True for eventSpec in self.events: - if eventSpec.subStatus != 'done': + if eventSpec.subStatus != "done": retVal = False break return retVal @@ -240,19 +236,19 @@ def all_events_done(self): # all files are triggered to stage-out def all_files_triggered_to_stage_out(self): for fileSpec in self.outFiles: - if fileSpec.status not in ['finished', 'failed']: - fileSpec.status = 'transferring' + if fileSpec.status not in ["finished", "failed"]: + fileSpec.status = "transferring" fileSpec.attemptNr = 0 # all files are zipped def all_files_zipped(self, 
use_post_zipping=False): for fileSpec in self.outFiles: - if fileSpec.status not in ['finished', 'failed']: + if fileSpec.status not in ["finished", "failed"]: fileSpec.attemptNr = 0 if use_post_zipping: - fileSpec.status = 'post_zipping' + fileSpec.status = "post_zipping" else: - fileSpec.status = 'defined' + fileSpec.status = "defined" fileSpec.groupID = None fileSpec.groupStatus = None fileSpec.groupUpdateTime = None @@ -266,31 +262,30 @@ def to_event_data(self, max_events=None): if max_events is not None and iEvents > max_events: break eventRanges = [] - for eventSpec in eventsData['events']: + for eventSpec in eventsData["events"]: eventRanges.append(eventSpec.to_data()) eventSpecs.append(eventSpec) iEvents += 1 tmpData = {} - tmpData['eventRanges'] = eventRanges - if 'sourceURL' in self.jobParams: - tmpData['sourceURL'] = self.jobParams['sourceURL'] + tmpData["eventRanges"] = eventRanges + if "sourceURL" in self.jobParams: + tmpData["sourceURL"] = self.jobParams["sourceURL"] if zipFileID is not None: - zipFileSpec = eventsData['zip'] - if zipFileSpec.status == 'finished': + zipFileSpec = eventsData["zip"] + if zipFileSpec.status == "finished": objstoreID = "{0}".format(zipFileSpec.objstoreID) if zipFileSpec.pathConvention is not None: objstoreID += "/{0}".format(zipFileSpec.pathConvention) - tmpData['zipFile'] = {'lfn': zipFileSpec.lfn, - 'objstoreID': objstoreID} + tmpData["zipFile"] = {"lfn": zipFileSpec.lfn, "objstoreID": objstoreID} if zipFileSpec.fsize not in [None, 0]: - tmpData['zipFile']['fsize'] = zipFileSpec.fsize + tmpData["zipFile"]["fsize"] = zipFileSpec.fsize if zipFileSpec.chksum is not None: - if zipFileSpec.chksum.startswith('md:'): - tmpData['zipFile']['md5'] = zipFileSpec.chksum.split(':')[-1] - elif zipFileSpec.chksum.startswith('ad:'): - tmpData['zipFile']['adler32'] = zipFileSpec.chksum.split(':')[-1] + if zipFileSpec.chksum.startswith("md:"): + tmpData["zipFile"]["md5"] = zipFileSpec.chksum.split(":")[-1] + elif zipFileSpec.chksum.startswith("ad:"): + tmpData["zipFile"]["adler32"] = zipFileSpec.chksum.split(":")[-1] else: - tmpData['zipFile']['adler32'] = zipFileSpec.chksum + tmpData["zipFile"]["adler32"] = zipFileSpec.chksum data.append(tmpData) return data, eventSpecs @@ -300,20 +295,19 @@ def get_input_file_attributes(self, skip_ready=False): attemptNrMap = dict() pathMap = dict() for fileSpec in self.inFiles: - if skip_ready and fileSpec.status == 'ready': + if skip_ready and fileSpec.status == "ready": lfnToSkip.add(fileSpec.lfn) attemptNrMap[fileSpec.lfn] = fileSpec.attemptNr pathMap[fileSpec.lfn] = fileSpec.path inFiles = {} - lfns = self.jobParams['inFiles'].split(',') - guids = self.jobParams['GUID'].split(',') - fsizes = self.jobParams['fsize'].split(',') - chksums = self.jobParams['checksum'].split(',') - scopes = self.jobParams['scopeIn'].split(',') - datasets = self.jobParams['realDatasetsIn'].split(',') - endpoints = self.jobParams['ddmEndPointIn'].split(',') - for lfn, guid, fsize, chksum, scope, dataset, endpoint in \ - zip(lfns, guids, fsizes, chksums, scopes, datasets, endpoints): + lfns = self.jobParams["inFiles"].split(",") + guids = self.jobParams["GUID"].split(",") + fsizes = self.jobParams["fsize"].split(",") + chksums = self.jobParams["checksum"].split(",") + scopes = self.jobParams["scopeIn"].split(",") + datasets = self.jobParams["realDatasetsIn"].split(",") + endpoints = self.jobParams["ddmEndPointIn"].split(",") + for lfn, guid, fsize, chksum, scope, dataset, endpoint in zip(lfns, guids, fsizes, chksums, scopes, datasets, endpoints): 
try: fsize = long(fsize) except Exception: @@ -324,24 +318,18 @@ def get_input_file_attributes(self, skip_ready=False): attemptNr = attemptNrMap[lfn] else: attemptNr = 0 - inFiles[lfn] = {'fsize': fsize, - 'guid': guid, - 'checksum': chksum, - 'scope': scope, - 'dataset': dataset, - 'endpoint': endpoint, - 'attemptNr': attemptNr} + inFiles[lfn] = {"fsize": fsize, "guid": guid, "checksum": chksum, "scope": scope, "dataset": dataset, "endpoint": endpoint, "attemptNr": attemptNr} # add path - if 'inFilePaths' in self.jobParams: + if "inFilePaths" in self.jobParams: for lfn in lfns: if lfn not in inFiles or lfn not in pathMap: continue - inFiles[lfn]['path'] = pathMap[lfn] + inFiles[lfn]["path"] = pathMap[lfn] # delete empty file - if '' in inFiles: - del inFiles[''] - if 'NULL' in inFiles: - del inFiles['NULL'] + if "" in inFiles: + del inFiles[""] + if "NULL" in inFiles: + del inFiles["NULL"] return inFiles # set input file paths @@ -349,39 +337,37 @@ def set_input_file_paths(self, in_files): lfns = self.get_input_file_attributes().keys() paths = [] for lfn in lfns: - # check for consistency - if lfn in in_files: - paths.append(in_files[lfn]['path']) - self.jobParams['inFilePaths'] = ','.join(paths) + # check for consistency + if lfn in in_files: + paths.append(in_files[lfn]["path"]) + self.jobParams["inFilePaths"] = ",".join(paths) # trigger updating - self.force_update('jobParams') + self.force_update("jobParams") # update file specs for fileSpec in self.inFiles: if fileSpec.lfn in in_files: - fileSpec.path = in_files[fileSpec.lfn]['path'] + fileSpec.path = in_files[fileSpec.lfn]["path"] # set ready to all input files def set_all_input_ready(self): # update file specs for fileSpec in self.inFiles: - fileSpec.status = 'ready' + fileSpec.status = "ready" # get output file attributes def get_output_file_attributes(self): if self.jobParamsExtForOutput is not None: return self.jobParamsExtForOutput outFiles = {} - lfns = self.jobParams['outFiles'].split(',') - scopes = self.jobParams['scopeOut'].split(',') - scopeLog = self.jobParams['scopeLog'] - logLFN = self.jobParams['logFile'] + lfns = self.jobParams["outFiles"].split(",") + scopes = self.jobParams["scopeOut"].split(",") + scopeLog = self.jobParams["scopeLog"] + logLFN = self.jobParams["logFile"] scopes.insert(lfns.index(logLFN), scopeLog) - datasets = self.jobParams['realDatasets'].split(',') - endpoints = self.jobParams['ddmEndPointOut'].split(',') + datasets = self.jobParams["realDatasets"].split(",") + endpoints = self.jobParams["ddmEndPointOut"].split(",") for lfn, scope, dataset, endpoint in zip(lfns, scopes, datasets, endpoints): - outFiles[lfn] = {'scope': scope, - 'dataset': dataset, - 'endpoint': endpoint} + outFiles[lfn] = {"scope": scope, "dataset": dataset, "endpoint": endpoint} self.jobParamsExtForOutput = outFiles return outFiles @@ -390,8 +376,8 @@ def get_logfile_info(self): if self.jobParamsExtForLog is not None: return self.jobParamsExtForLog retMap = dict() - retMap['lfn'] = self.jobParams['logFile'] - retMap['guid'] = self.jobParams['logGUID'] + retMap["lfn"] = self.jobParams["logFile"] + retMap["guid"] = self.jobParams["logGUID"] self.jobParamsExtForLog = retMap return retMap @@ -425,15 +411,53 @@ def get_job_attributes_for_panda(self): return data # extract only panda attributes # FIXME use set literal for python >=2.7 - panda_attributes = ['token', 'transExitCode', 'pilotErrorCode', 'pilotErrorDiag', 'timestamp', - 'node', 'workdir', 'cpuConsumptionTime', 'cpuConsumptionUnit', 'remainingSpace', - 'schedulerID', 
'pilotID', 'siteName', 'messageLevel', 'pilotLog', - 'cpuConversionFactor', 'exeErrorCode', 'exeErrorDiag', 'pilotTiming', - 'computingElement', 'startTime', 'endTime', 'nEvents', 'nInputFiles', - 'batchID', 'attemptNr', 'jobMetrics', - 'stdout', 'coreCount', 'maxRSS', 'maxVMEM', 'maxSWAP', 'maxPSS', - 'avgRSS', 'avgVMEM', 'avgSWAP', 'avgPSS', 'totRCHAR', 'totWCHAR', 'totRBYTES', - 'totWBYTES', 'rateRCHAR', 'rateWCHAR', 'rateRBYTES', 'rateWBYTES'] + panda_attributes = [ + "token", + "transExitCode", + "pilotErrorCode", + "pilotErrorDiag", + "timestamp", + "node", + "workdir", + "cpuConsumptionTime", + "cpuConsumptionUnit", + "remainingSpace", + "schedulerID", + "pilotID", + "siteName", + "messageLevel", + "pilotLog", + "cpuConversionFactor", + "exeErrorCode", + "exeErrorDiag", + "pilotTiming", + "computingElement", + "startTime", + "endTime", + "nEvents", + "nInputFiles", + "batchID", + "attemptNr", + "jobMetrics", + "stdout", + "coreCount", + "maxRSS", + "maxVMEM", + "maxSWAP", + "maxPSS", + "avgRSS", + "avgVMEM", + "avgSWAP", + "avgPSS", + "totRCHAR", + "totWCHAR", + "totRBYTES", + "totWBYTES", + "rateRCHAR", + "rateWCHAR", + "rateRBYTES", + "rateWBYTES", + ] panda_attributes = set(panda_attributes) for aName, aValue in iteritems(self.jobAttributes): if aName in panda_attributes: @@ -444,11 +468,11 @@ def get_job_attributes_for_panda(self): # get job status from attributes def get_job_status_from_attributes(self): - if self.jobAttributes is None or 'jobStatus' not in self.jobAttributes: + if self.jobAttributes is None or "jobStatus" not in self.jobAttributes: return None - if self.jobAttributes['jobStatus'] not in ['finished', 'failed']: + if self.jobAttributes["jobStatus"] not in ["finished", "failed"]: return None - return self.jobAttributes['jobStatus'] + return self.jobAttributes["jobStatus"] # set group to files def set_groups_to_files(self, id_map): @@ -456,13 +480,13 @@ def set_groups_to_files(self, id_map): # reverse mapping revMap = dict() for gID, items in iteritems(id_map): - for lfn in items['lfns']: + for lfn in items["lfns"]: revMap[lfn] = gID # update file specs for fileSpec in self.inFiles.union(self.outFiles): if fileSpec.lfn in revMap: fileSpec.groupID = revMap[fileSpec.lfn] - fileSpec.groupStatus = id_map[fileSpec.groupID]['groupStatus'] + fileSpec.groupStatus = id_map[fileSpec.groupID]["groupStatus"] fileSpec.groupUpdateTime = timeNow # update group status in files @@ -478,18 +502,16 @@ def update_group_status_in_files(self, group_id, group_status): def get_groups_of_input_files(self, skip_ready=False): groups = dict() for fileSpec in self.inFiles: - if skip_ready and fileSpec.status == 'ready': + if skip_ready and fileSpec.status == "ready": continue - groups[fileSpec.groupID] = {'groupUpdateTime': fileSpec.groupUpdateTime, - 'groupStatus': fileSpec.groupStatus} + groups[fileSpec.groupID] = {"groupUpdateTime": fileSpec.groupUpdateTime, "groupStatus": fileSpec.groupStatus} return groups # get groups of output files def get_groups_of_output_files(self): groups = dict() for fileSpec in self.outFiles: - groups[fileSpec.groupID] = {'groupUpdateTime': fileSpec.groupUpdateTime, - 'groupStatus': fileSpec.groupStatus} + groups[fileSpec.groupID] = {"groupUpdateTime": fileSpec.groupUpdateTime, "groupStatus": fileSpec.groupStatus} return groups # get output file specs @@ -499,7 +521,7 @@ def get_output_file_specs(self, skip_done=False): else: retList = [] for fileSpec in self.outFiles: - if fileSpec.status not in ['finished', 'failed']: + if fileSpec.status not in ["finished", 
"failed"]: retList.append(fileSpec) return retList @@ -508,21 +530,21 @@ def get_input_file_specs(self, group_id, skip_ready=False): retList = [] for fileSpec in self.inFiles: if fileSpec.groupID == group_id: - if skip_ready and fileSpec.status in ['ready', 'failed']: + if skip_ready and fileSpec.status in ["ready", "failed"]: continue retList.append(fileSpec) return retList # set pilot error def set_pilot_error(self, error_code, error_dialog): - if not self.has_attribute('pilotErrorCode'): - self.set_one_attribute('pilotErrorCode', error_code) - if not self.has_attribute('pilotErrorDiag'): - self.set_one_attribute('pilotErrorDiag', error_dialog) + if not self.has_attribute("pilotErrorCode"): + self.set_one_attribute("pilotErrorCode", error_code) + if not self.has_attribute("pilotErrorDiag"): + self.set_one_attribute("pilotErrorDiag", error_dialog) # not to suppress heartbeat def not_suppress_heartbeat(self): - if self.subStatus in ['missed']: + if self.subStatus in ["missed"]: return True return False @@ -541,26 +563,25 @@ def get_job_params(self, strip): else: newParams = dict() for k, v in iteritems(self.jobParams): - if k in ['prodDBlocks', 'realDatasetsIn', 'dispatchDblock', 'ddmEndPointIn', 'scopeIn', - 'dispatchDBlockToken', 'prodDBlockToken']: + if k in ["prodDBlocks", "realDatasetsIn", "dispatchDblock", "ddmEndPointIn", "scopeIn", "dispatchDBlockToken", "prodDBlockToken"]: continue newParams[k] = v return newParams # get pilot type def get_pilot_type(self): - if 'prodSourceLabel' not in self.jobParams: + if "prodSourceLabel" not in self.jobParams: return None - if self.jobParams['prodSourceLabel'] == 'rc_test': - return 'RC' - elif self.jobParams['prodSourceLabel'] == 'rc_test2': - return 'RC' - elif self.jobParams['prodSourceLabel'] == 'rc_alrb': - return 'ALRB' - elif self.jobParams['prodSourceLabel'] == 'ptest': - return 'PT' - elif self.jobParams['prodSourceLabel']: - return 'PR' + if self.jobParams["prodSourceLabel"] == "rc_test": + return "RC" + elif self.jobParams["prodSourceLabel"] == "rc_test2": + return "RC" + elif self.jobParams["prodSourceLabel"] == "rc_alrb": + return "ALRB" + elif self.jobParams["prodSourceLabel"] == "ptest": + return "PT" + elif self.jobParams["prodSourceLabel"]: + return "PR" else: return None @@ -570,11 +591,11 @@ def manipulate_job_params_for_container(self): for fileSpec in self.inFiles: for k, v in iteritems(self.jobParams): # only container image - if k == 'container_name': + if k == "container_name": if v == fileSpec.url: self.jobParams[k] = fileSpec.path updated = True - elif k == 'containerOptions': + elif k == "containerOptions": for kk, vv in iteritems(v): if kk == "containerImage": if vv == fileSpec.url: @@ -583,4 +604,4 @@ def manipulate_job_params_for_container(self): continue # trigger updating if updated: - self.force_update('jobParams') + self.force_update("jobParams") diff --git a/pandaharvester/harvestercore/job_worker_relation_spec.py b/pandaharvester/harvestercore/job_worker_relation_spec.py index a79f3b85..bf53ef99 100644 --- a/pandaharvester/harvestercore/job_worker_relation_spec.py +++ b/pandaharvester/harvestercore/job_worker_relation_spec.py @@ -9,10 +9,11 @@ # relationship spec class JobWorkerRelationSpec(SpecBase): # attributes - attributesWithTypes = ('PandaID:integer / index', - 'workerID:integer / index', - 'relationType:text', - ) + attributesWithTypes = ( + "PandaID:integer / index", + "workerID:integer / index", + "relationType:text", + ) # constructor def __init__(self): diff --git 
a/pandaharvester/harvestercore/panda_queue_spec.py b/pandaharvester/harvestercore/panda_queue_spec.py index 210e29d7..985b8d8b 100644 --- a/pandaharvester/harvestercore/panda_queue_spec.py +++ b/pandaharvester/harvestercore/panda_queue_spec.py @@ -8,29 +8,31 @@ class PandaQueueSpec(SpecBase): # attributes - attributesWithTypes = ('queueName:text / index', - 'nQueueLimitJob:integer', - 'nQueueLimitWorker:integer', - 'maxWorkers:integer', - 'jobFetchTime:timestamp / index', - 'submitTime:timestamp / index', - 'lockedBy:text', - 'siteName:text / index', - 'jobType:text', - 'resourceType:text', - 'nNewWorkers:integer', - 'uniqueName:text / unique', - 'nQueueLimitJobRatio:integer', - 'nQueueLimitJobMax:integer', - 'nQueueLimitJobMin:integer', - 'nQueueLimitWorkerRatio:integer', - 'nQueueLimitWorkerMax:integer', - 'nQueueLimitWorkerMin:integer', - ) + attributesWithTypes = ( + "queueName:text / index", + "nQueueLimitJob:integer", + "nQueueLimitWorker:integer", + "maxWorkers:integer", + "jobFetchTime:timestamp / index", + "submitTime:timestamp / index", + "lockedBy:text", + "siteName:text / index", + "jobType:text", + "resourceType:text", + "nNewWorkers:integer", + "uniqueName:text / unique", + "nQueueLimitJobRatio:integer", + "nQueueLimitJobMax:integer", + "nQueueLimitJobMin:integer", + "nQueueLimitWorkerRatio:integer", + "nQueueLimitWorkerMax:integer", + "nQueueLimitWorkerMin:integer", + ) # catchall resource type - RT_catchall = 'ANY' - JT_catchall = 'ANY' + RT_catchall = "ANY" + JT_catchall = "ANY" # constructor + def __init__(self): SpecBase.__init__(self) diff --git a/pandaharvester/harvestercore/pilot_errors.py b/pandaharvester/harvestercore/pilot_errors.py index 5bf3ef91..2fa41a7a 100644 --- a/pandaharvester/harvestercore/pilot_errors.py +++ b/pandaharvester/harvestercore/pilot_errors.py @@ -3,29 +3,58 @@ class PilotErrors(PilotErrorCodesObj): - """ Pilot error handling """ + """Pilot error handling""" pilot_error_msg = PilotErrorCodesObj._error_messages - pilot_error_msg.update({ - # can have additional error codes here - }) + pilot_error_msg.update( + { + # can have additional error codes here + } + ) getErrorCodes = [1097, 1099, 1100, 1103, 1107, 1113, 1130, 1145, 1151, 1164, 1167, 1168, 1171, 1175, 1178, 1179, 1180, 1182] putErrorCodes = [1101, 1114, 1122, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1140, 1141, 1152, 1154, 1155, 1181] recoverableErrorCodes = [0] + putErrorCodes # Error codes that will issue a Pilot-controlled resubmission - PilotResubmissionErrorCodes = [1008, 1098, 1099, 1110, 1113, 1114, 1115, 1116, 1117, 1137, 1139, 1151, 1152, 1171, 1172, 1177, 1179, 1180, 1181, 1182, 1188, 1189, 1195, 1196, 1197, 1219] + PilotResubmissionErrorCodes = [ + 1008, + 1098, + 1099, + 1110, + 1113, + 1114, + 1115, + 1116, + 1117, + 1137, + 1139, + 1151, + 1152, + 1171, + 1172, + 1177, + 1179, + 1180, + 1181, + 1182, + 1188, + 1189, + 1195, + 1196, + 1197, + 1219, + ] # Error codes used with FAX fail-over (only an error code in this list will allow FAX fail-over) PilotFAXErrorCodes = [1103] + PilotResubmissionErrorCodes # Mapping between payload exit code and pilot errors pilot_code_dict = PilotAux.get_error_code_translation_dictionary() - avail_exit_codes = [ value[0] for value in pilot_code_dict.values() ] + avail_exit_codes = [value[0] for value in pilot_code_dict.values()] def getPilotErrorDiag(self, code=0): - """ Return text corresponding to error code """ + """Return text corresponding to error code""" pilotErrorDiag = "" if code in self.pilot_error_msg.keys(): pilotErrorDiag 
= self.pilot_error_msg[code] @@ -34,14 +63,14 @@ def getPilotErrorDiag(self, code=0): return pilotErrorDiag def isGetErrorCode(self, code=0): - """ Determine whether code is in the put error list or not """ + """Determine whether code is in the put error list or not""" state = False if code in self.getErrorCodes: state = True return state def isPutErrorCode(self, code=0): - """ Determine whether code is in the put error list or not """ + """Determine whether code is in the put error list or not""" state = False if code in self.putErrorCodes: state = True @@ -49,18 +78,18 @@ def isPutErrorCode(self, code=0): @classmethod def isRecoverableErrorCode(self, code=0): - """ Determine whether code is a recoverable error code or not """ + """Determine whether code is a recoverable error code or not""" return code in self.recoverableErrorCodes def isPilotResubmissionErrorCode(self, code=0): - """ Determine whether code issues a Pilot-controlled resubmission """ + """Determine whether code issues a Pilot-controlled resubmission""" state = False if code in self.PilotResubmissionErrorCodes: state = True return state def isPilotFAXErrorCode(self, code=0): - """ Determine whether code allows for a FAX fail-over """ + """Determine whether code allows for a FAX fail-over""" state = False if code in self.PilotFAXErrorCodes: state = True @@ -72,10 +101,10 @@ def getErrorStr(self, code): Avoids exception if an error is not in the dictionary. An empty string is returned if the error is not in the dictionary. """ - return self.pilot_error_msg.get(code, '') + return self.pilot_error_msg.get(code, "") def getErrorName(self, code): - """ From the error code to get the error name""" + """From the error code to get the error name""" for k in self.__class__.__dict__.keys(): if self.__class__.__dict__[k] == code: return k @@ -85,7 +114,7 @@ def convertToPilotErrors(self, exit_code): """ Convert payload exit code to pilot error code and error dialogue message """ - pilot_error_code, pilot_error_diag = None, '' + pilot_error_code, pilot_error_diag = None, "" if exit_code in self.avail_exit_codes: pilot_error_code = PilotAux.convert_to_pilot_error_code(exit_code) pilot_error_diag = self.getPilotErrorDiag(pilot_error_code) @@ -93,8 +122,7 @@ def convertToPilotErrors(self, exit_code): class PilotException(Exception): - - def __init__(self, message, code=PilotErrors.GENERALERROR, state='', *args): + def __init__(self, message, code=PilotErrors.GENERALERROR, state="", *args): self.code = code self.state = state self.message = message @@ -110,7 +138,7 @@ def code(self, code): self.code_description = PilotErrors.getErrorStr(code) def __str__(self): - return "%s: %s: %s%s" % (self.__class__.__name__, self.code, self.message, ' : %s' % self.args if self.args else '') + return "%s: %s: %s%s" % (self.__class__.__name__, self.code, self.message, " : %s" % self.args if self.args else "") def __repr__(self): - return "%s: %s: %s%s" % (self.__class__.__name__, repr(self.code), repr(self.message), ' : %s' % repr(self.args) if self.args else '') + return "%s: %s: %s%s" % (self.__class__.__name__, repr(self.code), repr(self.message), " : %s" % repr(self.args) if self.args else "") diff --git a/pandaharvester/harvestercore/plugin_base.py b/pandaharvester/harvestercore/plugin_base.py index 09050f80..286d3356 100644 --- a/pandaharvester/harvestercore/plugin_base.py +++ b/pandaharvester/harvestercore/plugin_base.py @@ -9,7 +9,7 @@ def __init__(self, **kwarg): # make logger def make_logger(self, base_log, token=None, method_name=None, 
send_dialog=True): - if send_dialog and hasattr(self, 'dbInterface'): + if send_dialog and hasattr(self, "dbInterface"): hook = self.dbInterface else: hook = None diff --git a/pandaharvester/harvestercore/plugin_factory.py b/pandaharvester/harvestercore/plugin_factory.py index d376c0a8..298c33be 100644 --- a/pandaharvester/harvestercore/plugin_factory.py +++ b/pandaharvester/harvestercore/plugin_factory.py @@ -4,7 +4,7 @@ from .db_interface import DBInterface # logger -_logger = core_utils.setup_logger('plugin_factory') +_logger = core_utils.setup_logger("plugin_factory") # plugin factory @@ -17,26 +17,26 @@ def __init__(self, no_db=False): # get plugin key def get_plugin_key(self, plugin_conf): # use module + class as key - moduleName = plugin_conf['module'] - className = plugin_conf['name'] - pluginKey = '{0}.{1}'.format(moduleName, className) + moduleName = plugin_conf["module"] + className = plugin_conf["name"] + pluginKey = "{0}.{1}".format(moduleName, className) return pluginKey # get plugin instance def get_plugin(self, plugin_conf): # use module + class as key - moduleName = plugin_conf['module'] - className = plugin_conf['name'] - pluginKey = '{0}.{1}'.format(moduleName, className) + moduleName = plugin_conf["module"] + className = plugin_conf["name"] + pluginKey = "{0}.{1}".format(moduleName, className) if moduleName is None or className is None: return None # get class if pluginKey not in self.classMap: - tmpLog = core_utils.make_logger(_logger, method_name='get_plugin') + tmpLog = core_utils.make_logger(_logger, method_name="get_plugin") # import module tmpLog.debug("importing {0}".format(moduleName)) mod = __import__(moduleName) - for subModuleName in moduleName.split('.')[1:]: + for subModuleName in moduleName.split(".")[1:]: mod = getattr(mod, subModuleName) # get class tmpLog.debug("getting class {0}".format(className)) @@ -46,17 +46,17 @@ def get_plugin(self, plugin_conf): # make args args = {} for tmpKey, tmpVal in iteritems(plugin_conf): - if tmpKey in ['module', 'name']: + if tmpKey in ["module", "name"]: continue args[tmpKey] = tmpVal # add database interface if not self.noDB: - args['dbInterface'] = DBInterface() + args["dbInterface"] = DBInterface() # instantiate cls = self.classMap[pluginKey] impl = cls(**args) # bare instance when middleware is used - if 'original_config' in plugin_conf and 'bareFunctions' in plugin_conf: - bare_impl = self.get_plugin(plugin_conf['original_config']) + if "original_config" in plugin_conf and "bareFunctions" in plugin_conf: + bare_impl = self.get_plugin(plugin_conf["original_config"]) impl.bare_impl = bare_impl return impl diff --git a/pandaharvester/harvestercore/process_lock_spec.py b/pandaharvester/harvestercore/process_lock_spec.py index 358fbc8f..897c6cc9 100644 --- a/pandaharvester/harvestercore/process_lock_spec.py +++ b/pandaharvester/harvestercore/process_lock_spec.py @@ -8,10 +8,7 @@ class ProcessLockSpec(SpecBase): # attributes - attributesWithTypes = ('processName:text primary key', - 'lockedBy:text', - 'lockTime:timestamp' - ) + attributesWithTypes = ("processName:text primary key", "lockedBy:text", "lockTime:timestamp") # constructor def __init__(self): diff --git a/pandaharvester/harvestercore/queue_config_dump_spec.py b/pandaharvester/harvestercore/queue_config_dump_spec.py index 26f9eb55..6baaa2d1 100644 --- a/pandaharvester/harvestercore/queue_config_dump_spec.py +++ b/pandaharvester/harvestercore/queue_config_dump_spec.py @@ -11,13 +11,14 @@ class QueueConfigDumpSpec(SpecBase): # attributes - attributesWithTypes = 
('configID:integer primary key', - 'queueName:text / index', - 'checksum:text', - 'dumpUniqueName:text / unique', - 'creationTime:timestamp / index', - 'data:blob' - ) + attributesWithTypes = ( + "configID:integer primary key", + "queueName:text / index", + "checksum:text", + "dumpUniqueName:text / unique", + "creationTime:timestamp / index", + "data:blob", + ) # constructor def __init__(self): @@ -28,12 +29,12 @@ def set_data(self, data): self.data = copy.deepcopy(data) # don't record status try: - del self.data['queueStatus'] + del self.data["queueStatus"] except Exception: pass # get checksum m = hashlib.md5() - m.update(json.dumps(self.data).encode('utf-8')) + m.update(json.dumps(self.data).encode("utf-8")) self.checksum = m.hexdigest() # set unique name - self.dumpUniqueName = '{0}_{1}'.format(self.queueName, self.checksum) + self.dumpUniqueName = "{0}_{1}".format(self.queueName, self.checksum) diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index 42dbb3ee..ea6aaf1f 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -26,7 +26,7 @@ # logger -_logger = core_utils.setup_logger('queue_config_mapper') +_logger = core_utils.setup_logger("queue_config_mapper") _dbInterface = DBInterface() @@ -41,21 +41,20 @@ def _make_logger(base_log=_logger, token=None, method_name=None, send_dialog=Tru # class for queue config class QueueConfig(object): - def __init__(self, queue_name): self.queueName = queue_name self.pandaQueueName = None - self.prodSourceLabel = 'managed' + self.prodSourceLabel = "managed" # default parameters self.mapType = WorkSpec.MT_OneToOne self.useJobLateBinding = False self.zipPerMB = None - self.siteName = '' + self.siteName = "" self.maxWorkers = 0 self.nNewWorkers = 0 self.maxNewWorkersPerCycle = 0 - self.noHeartbeat = '' - self.runMode = 'self' + self.noHeartbeat = "" + self.runMode = "self" self.resourceType = PandaQueueSpec.RT_catchall self.jobType = PandaQueueSpec.JT_catchall self.getJobCriteria = None @@ -71,7 +70,7 @@ def __init__(self, queue_name): # get list of status without heartbeat def get_no_heartbeat_status(self): - return self.noHeartbeat.split(',') + return self.noHeartbeat.split(",") # check if status without heartbeat def is_no_heartbeat_status(self, status): @@ -85,12 +84,12 @@ def get_source_label(self, job_type=None, is_gu=None): # return 'test' # grandly unified queues: prodsourcelabel in job has precedence over queue prodsourcelabel - if job_type in ('user', 'panda'): - return 'user' + if job_type in ("user", "panda"): + return "user" # grandly unified queues: call to getJobs should not request for a particular prodSourceLabel if is_gu: - return 'unified' + return "unified" return self.prodSourceLabel @@ -105,28 +104,26 @@ def update_attributes(self, data): # get synchronization level between job and worker def get_synchronization_level(self): - if self.mapType == WorkSpec.MT_NoJob or self.truePilot or self.is_no_heartbeat_status('finished'): + if self.mapType == WorkSpec.MT_NoJob or self.truePilot or self.is_no_heartbeat_status("finished"): return 1 return None # str def __str__(self): - header = self.queueName + '\n' + '-' * len(self.queueName) + '\n' - tmpStr = '' - pluginStr = '' - keys = list(self.__dict__.keys()) - keys.sort() + header = self.queueName + "\n" + "-" * len(self.queueName) + "\n" + tmpStr = "" + pluginStr = "" + keys = sorted(self.__dict__.keys()) for key in keys: val = self.__dict__[key] if 
isinstance(val, dict): - pluginStr += ' {0} :\n'.format(key) - pKeys = list(val.keys()) - pKeys.sort() + pluginStr += " {0} :\n".format(key) + pKeys = sorted(val.keys()) for pKey in pKeys: pVal = val[pKey] - pluginStr += ' {0} = {1}\n'.format(pKey, pVal) + pluginStr += " {0} = {1}\n".format(pKey, pVal) else: - tmpStr += ' {0} = {1}\n'.format(key, val) + tmpStr += " {0} = {1}\n".format(key, val) return header + tmpStr + pluginStr @@ -146,41 +143,34 @@ class QueueConfigMapper(six.with_metaclass(SingletonWithID, object)): FQ = FT updated with RQ, then updated with DQ, then updated with LQ """ - mandatory_attrs = set([ - 'messenger', - 'monitor', - 'preparator', - 'stager', - 'submitter', - 'sweeper', - 'workerMaker', - ]) - dynamic_queue_generic_attrs = set([ - 'nQueueLimitWorker', - 'maxWorkers', - 'maxNewWorkersPerCycle', - 'nQueueLimitJob', - 'nQueueLimitJobRatio', - 'nQueueLimitJobMax', - 'nQueueLimitJobMin', - 'nQueueLimitWorkerRatio', - 'nQueueLimitWorkerMax', - 'nQueueLimitWorkerMin', - ]) - updatable_plugin_attrs = set([ - 'common', - 'messenger', - 'monitor', - 'preparator', - 'stager', - 'submitter', - 'sweeper', - 'workerMaker', - 'throttler', - 'zipper', - 'aux_preparator', - 'extractor' - ]) + mandatory_attrs = set( + [ + "messenger", + "monitor", + "preparator", + "stager", + "submitter", + "sweeper", + "workerMaker", + ] + ) + dynamic_queue_generic_attrs = set( + [ + "nQueueLimitWorker", + "maxWorkers", + "maxNewWorkersPerCycle", + "nQueueLimitJob", + "nQueueLimitJobRatio", + "nQueueLimitJobMax", + "nQueueLimitJobMin", + "nQueueLimitWorkerRatio", + "nQueueLimitWorkerMax", + "nQueueLimitWorkerMin", + ] + ) + updatable_plugin_attrs = set( + ["common", "messenger", "monitor", "preparator", "stager", "submitter", "sweeper", "workerMaker", "throttler", "zipper", "aux_preparator", "extractor"] + ) # constructor def __init__(self, update_db=True): @@ -207,61 +197,56 @@ def __init__(self, update_db=True): # load config from DB cache of URL with validation def _load_config_from_cache(self): - mainLog = _make_logger(method_name='QueueConfigMapper._load_config_from_cache') + mainLog = _make_logger(method_name="QueueConfigMapper._load_config_from_cache") # load config json on URL if self.configFromCacher: - queueConfig_cacheSpec = self.dbProxy.get_cache('queues_config_file') + queueConfig_cacheSpec = self.dbProxy.get_cache("queues_config_file") if queueConfig_cacheSpec is not None: queueConfigJson = queueConfig_cacheSpec.data if isinstance(queueConfigJson, dict): return queueConfigJson else: - mainLog.error('Invalid JSON in cache queues_config_file. Skipped') + mainLog.error("Invalid JSON in cache queues_config_file. Skipped") else: - mainLog.debug('queues config not fount in cache. Skipped') + mainLog.debug("queues config not fount in cache. Skipped") else: - mainLog.debug('queues config URL not set. Skipped') + mainLog.debug("queues config URL not set. 
Skipped") return None # load config from local json file with syntax validation @staticmethod def _load_config_from_file(): - mainLog = _make_logger(method_name='QueueConfigMapper._load_config_from_file') + mainLog = _make_logger(method_name="QueueConfigMapper._load_config_from_file") # define config file path if os.path.isabs(harvester_config.qconf.configFile): confFilePath = harvester_config.qconf.configFile else: # check if in PANDA_HOME confFilePath = None - if 'PANDA_HOME' in os.environ: - confFilePath = os.path.join(os.environ['PANDA_HOME'], - 'etc/panda', - harvester_config.qconf.configFile) + if "PANDA_HOME" in os.environ: + confFilePath = os.path.join(os.environ["PANDA_HOME"], "etc/panda", harvester_config.qconf.configFile) if not os.path.exists(confFilePath): confFilePath = None # look into /etc/panda if confFilePath is None: - confFilePath = os.path.join('/etc/panda', - harvester_config.qconf.configFile) + confFilePath = os.path.join("/etc/panda", harvester_config.qconf.configFile) # load from config file try: with open(confFilePath) as f: queueConfigJson = json.load(f) except OSError as e: - mainLog.error('Cannot read file: {0} ; {1}'.format(confFilePath, e)) + mainLog.error("Cannot read file: {0} ; {1}".format(confFilePath, e)) return None except JSONDecodeError as e: - mainLog.error('Invalid JSON in file: {0} ; {1}'.format(confFilePath, e)) + mainLog.error("Invalid JSON in file: {0} ; {1}".format(confFilePath, e)) return None return queueConfigJson # get resolver module @staticmethod def _get_resolver(): - if hasattr(harvester_config.qconf, 'resolverModule') and \ - hasattr(harvester_config.qconf, 'resolverClass'): - pluginConf = {'module': harvester_config.qconf.resolverModule, - 'name': harvester_config.qconf.resolverClass} + if hasattr(harvester_config.qconf, "resolverModule") and hasattr(harvester_config.qconf, "resolverClass"): + pluginConf = {"module": harvester_config.qconf.resolverModule, "name": harvester_config.qconf.resolverClass} pluginFactory = PluginFactory() resolver = pluginFactory.get_plugin(pluginConf) else: @@ -272,12 +257,12 @@ def _get_resolver(): def _update_last_reload_time(self, ts=None): if ts is None: ts = time.time() - new_info = '{0:.3f}'.format(ts) - return self.dbProxy.refresh_cache('_qconf_last_reload', '_universal', new_info) + new_info = "{0:.3f}".format(ts) + return self.dbProxy.refresh_cache("_qconf_last_reload", "_universal", new_info) # get last reload time def _get_last_reload_time(self): - cacheSpec = self.dbProxy.get_cache('_qconf_last_reload', '_universal', from_local_cache=False) + cacheSpec = self.dbProxy.get_cache("_qconf_last_reload", "_universal", from_local_cache=False) if cacheSpec is None: return None timestamp = float(cacheSpec.data) @@ -285,39 +270,41 @@ def _get_last_reload_time(self): # load data def load_data(self, refill_table=False): - mainLog = _make_logger(method_name='QueueConfigMapper.load_data') + mainLog = _make_logger(method_name="QueueConfigMapper.load_data") # check if to update with self.lock: time_now = datetime.datetime.utcnow() updateInterval_td = datetime.timedelta(seconds=self.updateInterval) checkInterval_td = datetime.timedelta(seconds=self.checkInterval) # skip if lastCheck is fresh (within checkInterval) - if (self.lastCheck is not None - and time_now - self.lastCheck < checkInterval_td): + if self.lastCheck is not None and time_now - self.lastCheck < checkInterval_td: return self.lastCheck = time_now # get last_reload_timestamp from DB last_reload_timestamp = self._get_last_reload_time() 
self.lastReload = None if last_reload_timestamp is None else datetime.datetime.utcfromtimestamp(last_reload_timestamp) # skip if lastReload is fresh and lastUpdate fresher than lastReload (within updateInterval) - if (self.lastReload is not None and self.lastUpdate is not None + if ( + self.lastReload is not None + and self.lastUpdate is not None and self.lastReload < self.lastUpdate - and time_now - self.lastReload < updateInterval_td): + and time_now - self.lastReload < updateInterval_td + ): return # start with self.lock: # update timesatmp of last reload, lock with check interval - got_timesatmp_update_lock = self.dbProxy.get_process_lock('qconf_reload', 'qconf_universal', self.updateInterval) + got_timesatmp_update_lock = self.dbProxy.get_process_lock("qconf_reload", "qconf_universal", self.updateInterval) if got_timesatmp_update_lock: now_ts = time.time() retVal = self._update_last_reload_time(now_ts) self.lastReload = datetime.datetime.utcfromtimestamp(now_ts) if retVal: - mainLog.debug('updated last reload timestamp') + mainLog.debug("updated last reload timestamp") else: - mainLog.warning('failed to update last reload timestamp. Skipped') + mainLog.warning("failed to update last reload timestamp. Skipped") else: - mainLog.debug('did not get qconf_reload timestamp lock. Skipped to update last reload timestamp') + mainLog.debug("did not get qconf_reload timestamp lock. Skipped to update last reload timestamp") # init newQueueConfig = dict() localTemplatesDict = dict() @@ -333,53 +320,48 @@ def load_data(self, refill_table=False): # get resolver resolver = self._get_resolver() if resolver is None: - mainLog.debug('No resolver is configured') + mainLog.debug("No resolver is configured") # load config json from cacher (RT & RQ) queueConfigJson_cacher = self._load_config_from_cache() if queueConfigJson_cacher is not None: for queueName, queueDict in iteritems(queueConfigJson_cacher): - if queueDict.get('isTemplateQueue') is True \ - or queueName.endswith('_TEMPLATE'): + if queueDict.get("isTemplateQueue") is True or queueName.endswith("_TEMPLATE"): # is RT - queueDict['isTemplateQueue'] = True - queueDict.pop('templateQueueName', None) + queueDict["isTemplateQueue"] = True + queueDict.pop("templateQueueName", None) remoteTemplatesDict[queueName] = queueDict else: # is RQ - queueDict['isTemplateQueue'] = False + queueDict["isTemplateQueue"] = False remoteQueuesDict[queueName] = queueDict # load config from local json file (LT & LQ) queueConfigJson_local = self._load_config_from_file() if queueConfigJson_local is not None: for queueName, queueDict in iteritems(queueConfigJson_local): - if queueDict.get('isTemplateQueue') is True \ - or queueName.endswith('_TEMPLATE'): + if queueDict.get("isTemplateQueue") is True or queueName.endswith("_TEMPLATE"): # is LT - queueDict['isTemplateQueue'] = True - queueDict.pop('templateQueueName', None) + queueDict["isTemplateQueue"] = True + queueDict.pop("templateQueueName", None) localTemplatesDict[queueName] = queueDict else: # is LQ - queueDict['isTemplateQueue'] = False + queueDict["isTemplateQueue"] = False localQueuesDict[queueName] = queueDict else: - mainLog.warning('Failed to load config from local json file. Skipped') + mainLog.warning("Failed to load config from local json file. 
Skipped") # fill in final template (FT) finalTemplatesDict.update(remoteTemplatesDict) finalTemplatesDict.update(localTemplatesDict) finalTemplatesDict.pop(None, None) # remove queues with invalid templateQueueName - for acr, queuesDict in [('RQ', remoteQueuesDict), ('LQ', localQueuesDict)]: + for acr, queuesDict in [("RQ", remoteQueuesDict), ("LQ", localQueuesDict)]: for queueName, queueDict in iteritems(queuesDict.copy()): - templateQueueName = queueDict.get('templateQueueName') - if templateQueueName is not None \ - and templateQueueName not in finalTemplatesDict: + templateQueueName = queueDict.get("templateQueueName") + if templateQueueName is not None and templateQueueName not in finalTemplatesDict: del queuesDict[queueName] - mainLog.warning('Invalid templateQueueName "{0}" for {1} ({2}). Skipped'.format( - templateQueueName, queueName, acr)) + mainLog.warning('Invalid templateQueueName "{0}" for {1} ({2}). Skipped'.format(templateQueueName, queueName, acr)) # get queue names from resolver and fill in dynamic queue (DQ) - if resolver is not None \ - and 'DYNAMIC' in harvester_config.qconf.queueList: + if resolver is not None and "DYNAMIC" in harvester_config.qconf.queueList: getQueuesDynamic = True dynamicQueuesNameList = resolver.get_all_queue_names() for queueName in dynamicQueuesNameList.copy(): @@ -393,16 +375,13 @@ def load_data(self, refill_table=False): if resolver_harvester_template: templateQueueName = resolver_harvester_template elif not (resolver_type is None or resolver_workflow is None): - templateQueueName = '{pq_type}.{workflow}'.format( - pq_type=resolver_type, - workflow=resolver_workflow) + templateQueueName = "{pq_type}.{workflow}".format(pq_type=resolver_type, workflow=resolver_workflow) else: templateQueueName = harvester_config.qconf.defaultTemplateQueueName if templateQueueName not in finalTemplatesDict: # remove queues with invalid templateQueueName dynamicQueuesNameList.discard(queueName) - mainLog.warning('Invalid templateQueueName "{0}" for {1} (DQ). Skipped'.format( - templateQueueName, queueName)) + mainLog.warning('Invalid templateQueueName "{0}" for {1} (DQ). 
Skipped'.format(templateQueueName, queueName)) continue # parameters resolver_harvester_params = resolver.get_harvester_params(queueName) @@ -410,8 +389,8 @@ def load_data(self, refill_table=False): if key in self.dynamic_queue_generic_attrs: queueDict[key] = val # fill in dynamic queue configs - queueDict['templateQueueName'] = templateQueueName - queueDict['isTemplateQueue'] = False + queueDict["templateQueueName"] = templateQueueName + queueDict["isTemplateQueue"] = False dynamicQueuesDict[queueName] = queueDict # fill in all queue name list (names of RQ + DQ + LQ) allQueuesNameList |= set(remoteQueuesDict) @@ -429,103 +408,96 @@ def load_data(self, refill_table=False): if queueName not in queuesDict: continue tmp_queueDict = queuesDict[queueName] - tmp_templateQueueName = tmp_queueDict.get('templateQueueName') + tmp_templateQueueName = tmp_queueDict.get("templateQueueName") if tmp_templateQueueName is not None: templateQueueName = tmp_templateQueueName # prepare queueDict queueDict = dict() if templateQueueName in finalTemplatesDict: queueDict.update(copy.deepcopy(finalTemplatesDict[templateQueueName])) - for acr, templatesDict in [('RT', remoteTemplatesDict), ('LT', localTemplatesDict)]: + for acr, templatesDict in [("RT", remoteTemplatesDict), ("LT", localTemplatesDict)]: if templateQueueName in templatesDict: templateSourceList.append(acr) # update queueDict - for acr, queuesDict in [('RQ', remoteQueuesDict), - ('DQ', dynamicQueuesDict), ('LQ', localQueuesDict)]: + for acr, queuesDict in [("RQ", remoteQueuesDict), ("DQ", dynamicQueuesDict), ("LQ", localQueuesDict)]: if queueName not in queuesDict: continue queueSourceList.append(acr) tmp_queueDict = queuesDict[queueName] for key, val in iteritems(tmp_queueDict): val = copy.deepcopy(val) - if key in self.updatable_plugin_attrs \ - and isinstance(queueDict.get(key), dict) \ - and isinstance(val, dict): + if key in self.updatable_plugin_attrs and isinstance(queueDict.get(key), dict) and isinstance(val, dict): # update plugin parameters instead of overwriting whole plugin section queueDict[key].update(val) else: queueDict[key] = val # record sources of the queue config and its templates in log if templateQueueName: - mainLog.debug(('queue {queueName} comes from {queueSource} ' - '(with template {templateName} ' - 'from {templateSource})').format( - queueName=queueName, - templateName=templateQueueName, - queueSource=','.join(queueSourceList), - templateSource=','.join(templateSourceList) )) + mainLog.debug( + ("queue {queueName} comes from {queueSource} " "(with template {templateName} " "from {templateSource})").format( + queueName=queueName, + templateName=templateQueueName, + queueSource=",".join(queueSourceList), + templateSource=",".join(templateSourceList), + ) + ) else: - mainLog.debug('queue {queueName} comes from {queueSource}'.format( - queueName=queueName, - queueSource=','.join(queueSourceList))) + mainLog.debug("queue {queueName} comes from {queueSource}".format(queueName=queueName, queueSource=",".join(queueSourceList))) # prepare queueConfig if queueName in newQueueConfig: queueConfig = newQueueConfig[queueName] else: queueConfig = QueueConfig(queueName) # queueName = siteName/resourceType - queueConfig.siteName = queueConfig.queueName.split('/')[0] + queueConfig.siteName = queueConfig.queueName.split("/")[0] if queueConfig.siteName != queueConfig.queueName: - queueConfig.resourceType = queueConfig.queueName.split('/')[-1] + queueConfig.resourceType = queueConfig.queueName.split("/")[-1] # get common attributes 
commonAttrDict = dict() - if isinstance(queueDict.get('common'), dict): - commonAttrDict = queueDict.get('common') + if isinstance(queueDict.get("common"), dict): + commonAttrDict = queueDict.get("common") # according to queueDict for key, val in iteritems(queueDict): - if isinstance(val, dict) and 'module' in val and 'name' in val: + if isinstance(val, dict) and "module" in val and "name" in val: # plugin attributes val = copy.deepcopy(val) # fill in common attributes for all plugins for c_key, c_val in iteritems(commonAttrDict): - if c_key not in val and c_key not in ('module', 'name'): + if c_key not in val and c_key not in ("module", "name"): val[c_key] = c_val # check module and class name try: - _t3mP_1Mp0R7_mO6U1e__ = importlib.import_module(val['module']) - _t3mP_1Mp0R7_N4m3__ = getattr(_t3mP_1Mp0R7_mO6U1e__, val['name']) + _t3mP_1Mp0R7_mO6U1e__ = importlib.import_module(val["module"]) + _t3mP_1Mp0R7_N4m3__ = getattr(_t3mP_1Mp0R7_mO6U1e__, val["name"]) except Exception as _e: invalidQueueList.add(queueConfig.queueName) - mainLog.error('Module or class not found. Omitted {0} in queue config ({1})'.format( - queueConfig.queueName, _e)) + mainLog.error("Module or class not found. Omitted {0} in queue config ({1})".format(queueConfig.queueName, _e)) continue else: del _t3mP_1Mp0R7_mO6U1e__ del _t3mP_1Mp0R7_N4m3__ # fill in siteName and queueName - if 'siteName' not in val: - val['siteName'] = queueConfig.siteName - if 'queueName' not in val: - val['queueName'] = queueConfig.queueName + if "siteName" not in val: + val["siteName"] = queueConfig.siteName + if "queueName" not in val: + val["queueName"] = queueConfig.queueName # middleware - if 'middleware' in val and val['middleware'] in queueDict: + if "middleware" in val and val["middleware"] in queueDict: # keep original config - val['original_config'] = copy.deepcopy(val) + val["original_config"] = copy.deepcopy(val) # overwrite with middleware config - for m_key, m_val in iteritems(queueDict[val['middleware']]): + for m_key, m_val in iteritems(queueDict[val["middleware"]]): val[m_key] = m_val setattr(queueConfig, key, val) # delete isTemplateQueue attribute try: - if getattr(queueConfig, 'isTemplateQueue'): - mainLog.error('Internal error: isTemplateQueue is True. Omitted {0} in queue config'.format( - queueConfig.queueName)) + if getattr(queueConfig, "isTemplateQueue"): + mainLog.error("Internal error: isTemplateQueue is True. Omitted {0} in queue config".format(queueConfig.queueName)) invalidQueueList.add(queueConfig.queueName) else: - delattr(queueConfig, 'isTemplateQueue') + delattr(queueConfig, "isTemplateQueue") except AttributeError as _e: - mainLog.error('Internal error with attr "isTemplateQueue". Omitted {0} in queue config ({1})'.format( - queueConfig.queueName, _e)) + mainLog.error('Internal error with attr "isTemplateQueue". 
Omitted {0} in queue config ({1})'.format(queueConfig.queueName, _e)) invalidQueueList.add(queueConfig.queueName) # get Panda Queue Name if resolver is not None: @@ -533,8 +505,8 @@ def load_data(self, refill_table=False): # additional criteria for getJob if queueConfig.getJobCriteria is not None: tmpCriteria = dict() - for tmpItem in queueConfig.getJobCriteria.split(','): - tmpKey, tmpVal = tmpItem.split('=') + for tmpItem in queueConfig.getJobCriteria.split(","): + tmpKey, tmpVal = tmpItem.split("=") tmpCriteria[tmpKey] = tmpVal if len(tmpCriteria) == 0: queueConfig.getJobCriteria = None @@ -542,12 +514,11 @@ def load_data(self, refill_table=False): queueConfig.getJobCriteria = tmpCriteria # nullify job attributes if NoJob mapType if queueConfig.mapType == WorkSpec.MT_NoJob: - for attName in ['nQueueLimitJob', 'nQueueLimitJobRatio', - 'nQueueLimitJobMax', 'nQueueLimitJobMin']: + for attName in ["nQueueLimitJob", "nQueueLimitJobRatio", "nQueueLimitJobMax", "nQueueLimitJobMin"]: setattr(queueConfig, attName, None) # heartbeat suppression - if queueConfig.truePilot and queueConfig.noHeartbeat == '': - queueConfig.noHeartbeat = 'running,transferring,finished,failed' + if queueConfig.truePilot and queueConfig.noHeartbeat == "": + queueConfig.noHeartbeat = "running,transferring,finished,failed" # set unique name queueConfig.set_unique_name() # put into new queue configs @@ -561,16 +532,16 @@ def load_data(self, refill_table=False): invalidQueueList.add(queueConfig.queueName) missing_attr_list.append(_attr) if missing_attr_list: - mainLog.error('Missing mandatory attributes {0} . Omitted {1} in queue config'.format( - ','.join(missing_attr_list), queueConfig.queueName)) + mainLog.error( + "Missing mandatory attributes {0} . Omitted {1} in queue config".format(",".join(missing_attr_list), queueConfig.queueName) + ) # delete invalid queues for invalidQueueName in invalidQueueList: if invalidQueueName in newQueueConfig: del newQueueConfig[invalidQueueName] # auto blacklisting autoBlacklist = False - if resolver is not None and hasattr(harvester_config.qconf, 'autoBlacklist') and \ - harvester_config.qconf.autoBlacklist: + if resolver is not None and hasattr(harvester_config.qconf, "autoBlacklist") and harvester_config.qconf.autoBlacklist: autoBlacklist = True # get queue dumps queueConfigDumps = self.dbProxy.get_queue_config_dumps() @@ -581,14 +552,14 @@ def load_data(self, refill_table=False): if queueConfig.queueStatus is None and autoBlacklist: queueConfig.queueStatus = resolver.get_queue_status(queueName) # get dynamic information - if 'DYNAMIC' in harvester_config.qconf.queueList: + if "DYNAMIC" in harvester_config.qconf.queueList: # UPS queue if resolver is not None and resolver.is_ups_queue(queueName): - queueConfig.runMode = 'slave' - queueConfig.mapType = 'NoJob' + queueConfig.runMode = "slave" + queueConfig.mapType = "NoJob" # set online if undefined if queueConfig.queueStatus is None: - queueConfig.queueStatus = 'online' + queueConfig.queueStatus = "online" queueConfig.queueStatus = queueConfig.queueStatus.lower() # look for configID dumpSpec = QueueConfigDumpSpec() @@ -599,26 +570,30 @@ def load_data(self, refill_table=False): else: # add dump dumpSpec.creationTime = datetime.datetime.utcnow() - dumpSpec.configID = self.dbProxy.get_next_seq_number('SEQ_configID') + dumpSpec.configID = self.dbProxy.get_next_seq_number("SEQ_configID") tmpStat = self.dbProxy.add_queue_config_dump(dumpSpec) if not tmpStat: dumpSpec.configID = self.dbProxy.get_config_id_dump(dumpSpec) if dumpSpec.configID 
is None: - mainLog.error('failed to get configID for {0}'.format(dumpSpec.dumpUniqueName)) + mainLog.error("failed to get configID for {0}".format(dumpSpec.dumpUniqueName)) continue queueConfigDumps[dumpSpec.dumpUniqueName] = dumpSpec queueConfig.configID = dumpSpec.configID # ignore offline - if queueConfig.queueStatus == 'offline': + if queueConfig.queueStatus == "offline": continue # filter for pilot version - if hasattr(harvester_config.qconf, 'pilotVersion') and \ - pandaQueueDict.get(queueConfig.siteName) is not None and \ - pandaQueueDict.get(queueConfig.siteName).get('pilot_version') != str(harvester_config.qconf.pilotVersion): + if ( + hasattr(harvester_config.qconf, "pilotVersion") + and pandaQueueDict.get(queueConfig.siteName) is not None + and pandaQueueDict.get(queueConfig.siteName).get("pilot_version") != str(harvester_config.qconf.pilotVersion) + ): continue - if 'ALL' not in harvester_config.qconf.queueList and \ - 'DYNAMIC' not in harvester_config.qconf.queueList and \ - queueName not in harvester_config.qconf.queueList: + if ( + "ALL" not in harvester_config.qconf.queueList + and "DYNAMIC" not in harvester_config.qconf.queueList + and queueName not in harvester_config.qconf.queueList + ): continue activeQueues[queueName] = queueConfig self.queueConfig = newQueueConfig @@ -636,9 +611,9 @@ def load_data(self, refill_table=False): # update database if self.toUpdateDB: self.dbProxy.fill_panda_queue_table(self.activeQueues.keys(), self, refill_table=refill_table) - mainLog.debug('updated to DB') + mainLog.debug("updated to DB") # done - mainLog.debug('done') + mainLog.debug("done") # check if valid queue def has_queue(self, queue_name, config_id=None): @@ -675,7 +650,7 @@ def get_active_ups_queues(self): active_queues = self.get_active_queues() for queue_name, queue_attribs in iteritems(active_queues): try: - if queue_attribs.runMode == 'slave' and queue_attribs.mapType == 'NoJob': + if queue_attribs.runMode == "slave" and queue_attribs.mapType == "NoJob": active_ups_queues.append(queue_name) except KeyError: continue diff --git a/pandaharvester/harvestercore/resource_type_mapper.py b/pandaharvester/harvestercore/resource_type_mapper.py index c627054e..1b168d4a 100644 --- a/pandaharvester/harvestercore/resource_type_mapper.py +++ b/pandaharvester/harvestercore/resource_type_mapper.py @@ -8,39 +8,35 @@ class ResourceType(object): - def __init__(self, resource_type_dict): """ Initialize resource type name and attributes """ # name - self.resource_name = resource_type_dict['resource_name'] + self.resource_name = resource_type_dict["resource_name"] # cores - self.min_core = resource_type_dict['mincore'] - self.max_core = resource_type_dict['maxcore'] + self.min_core = resource_type_dict["mincore"] + self.max_core = resource_type_dict["maxcore"] # memory - self.min_ram_per_core = resource_type_dict['minrampercore'] - self.max_ram_per_core = resource_type_dict['maxrampercore'] + self.min_ram_per_core = resource_type_dict["minrampercore"] + self.max_ram_per_core = resource_type_dict["maxrampercore"] class ResourceTypeMapper(object): - def __init__(self): self.lock = threading.Lock() self.resource_types = {} self.last_update = None def load_data(self): - with self.lock: - # check interval time_now = datetime.datetime.utcnow() if self.last_update is not None and time_now - self.last_update < datetime.timedelta(minutes=10): return db_proxy = DBProxy() - resource_type_cache = db_proxy.get_cache('resource_types.json') + resource_type_cache = db_proxy.get_cache("resource_types.json") if 
resource_type_cache: resource_type_list = resource_type_cache.data else: @@ -49,7 +45,7 @@ def load_data(self): for resource_type_dict in resource_type_list: try: resource_type = ResourceType(resource_type_dict) - resource_name = resource_type_dict['resource_name'] + resource_name = resource_type_dict["resource_name"] self.resource_types[resource_name] = resource_type except KeyError: continue @@ -70,10 +66,10 @@ def calculate_worker_requirements(self, resource_name, queue_config): resource_type = self.resource_types[resource_name] # retrieve the queue configuration - site_maxrss = queue_config.get('maxrss', 0) or 0 - site_corecount = queue_config.get('corecount', 1) or 1 + site_maxrss = queue_config.get("maxrss", 0) or 0 + site_corecount = queue_config.get("corecount", 1) or 1 - unified_queue = queue_config.get('capability', '') == 'ucore' + unified_queue = queue_config.get("capability", "") == "ucore" if not unified_queue: # site is not unified, just request whatever is configured in AGIS return site_maxrss, site_corecount @@ -84,8 +80,7 @@ def calculate_worker_requirements(self, resource_name, queue_config): worker_cores = site_corecount if resource_type.max_ram_per_core: - worker_memory = min(resource_type.max_ram_per_core * worker_cores, - (site_maxrss / site_corecount) * worker_cores) + worker_memory = min(resource_type.max_ram_per_core * worker_cores, (site_maxrss / site_corecount) * worker_cores) else: worker_memory = (site_maxrss / site_corecount) * worker_cores worker_memory = int(math.ceil(worker_memory)) diff --git a/pandaharvester/harvestercore/seq_number_spec.py b/pandaharvester/harvestercore/seq_number_spec.py index 9f54092f..9a3c6ba3 100644 --- a/pandaharvester/harvestercore/seq_number_spec.py +++ b/pandaharvester/harvestercore/seq_number_spec.py @@ -8,9 +8,10 @@ class SeqNumberSpec(SpecBase): # attributes - attributesWithTypes = ('numberName:text', - 'curVal:integer', - ) + attributesWithTypes = ( + "numberName:text", + "curVal:integer", + ) # constructor def __init__(self): diff --git a/pandaharvester/harvestercore/service_metrics_spec.py b/pandaharvester/harvestercore/service_metrics_spec.py index e4c31d81..60758ef1 100644 --- a/pandaharvester/harvestercore/service_metrics_spec.py +++ b/pandaharvester/harvestercore/service_metrics_spec.py @@ -8,12 +8,14 @@ import json import socket + class ServiceMetricSpec(SpecBase): # attributes - attributesWithTypes = ('creationTime:timestamp / index', - 'hostName:text', - 'metrics:blob', - ) + attributesWithTypes = ( + "creationTime:timestamp / index", + "hostName:text", + "metrics:blob", + ) # constructor def __init__(self, service_metrics): @@ -21,4 +23,4 @@ def __init__(self, service_metrics): self.creationTime = datetime.datetime.utcnow() self.hostName = socket.getfqdn() - self.metrics = service_metrics # blobs are automatically translated to json \ No newline at end of file + self.metrics = service_metrics # blobs are automatically translated to json diff --git a/pandaharvester/harvestercore/spec_base.py b/pandaharvester/harvestercore/spec_base.py index bf9bf231..f866f6b4 100644 --- a/pandaharvester/harvestercore/spec_base.py +++ b/pandaharvester/harvestercore/spec_base.py @@ -21,14 +21,14 @@ def default(self, obj): if isinstance(obj, rpyc.core.netref.BaseNetref): retVal = rpyc.utils.classic.obtain(obj) else: - retVal = {'_non_json_object': pickle.dumps(obj)} + retVal = {"_non_json_object": pickle.dumps(obj)} return retVal # hook for decoder def as_python_object(dct): - if '_non_json_object' in dct: - return 
pickle.loads(str(dct['_non_json_object'])) + if "_non_json_object" in dct: + return pickle.loads(str(dct["_non_json_object"])) return dct @@ -42,13 +42,13 @@ class SpecBase(object): # constructor def __init__(self): # remove types - object.__setattr__(self, 'attributes', []) - object.__setattr__(self, 'serializedAttrs', set()) + object.__setattr__(self, "attributes", []) + object.__setattr__(self, "serializedAttrs", set()) for attr in self.attributesWithTypes: - attr, attrType = attr.split(':') + attr, attrType = attr.split(":") attrType = attrType.split()[0] self.attributes.append(attr) - if attrType in ['blob']: + if attrType in ["blob"]: self.serializedAttrs.add(attr) # install attributes for attr in self.attributes: @@ -57,7 +57,7 @@ def __init__(self): else: object.__setattr__(self, attr, None) # map of changed attributes - object.__setattr__(self, 'changedAttrs', {}) + object.__setattr__(self, "changedAttrs", {}) # override __setattr__ to collect changed attributes def __setattr__(self, name, value): @@ -71,7 +71,7 @@ def __setattr__(self, name, value): # keep state for pickle def __getstate__(self): odict = self.__dict__.copy() - del odict['changedAttrs'] + del odict["changedAttrs"] return odict # restore state from the unpickled state values @@ -82,7 +82,7 @@ def __setstate__(self, state): # reset changed attribute list def reset_changed_list(self): - object.__setattr__(self, 'changedAttrs', {}) + object.__setattr__(self, "changedAttrs", {}) # force update def force_update(self, name): @@ -100,7 +100,7 @@ def has_updated_attributes(self): # pack into attributes def pack(self, values, slim=False): - if hasattr(values, '_asdict'): + if hasattr(values, "_asdict"): values = values._asdict() for attr in self.attributes: if slim and attr in self.skipAttrsToSlim: @@ -126,7 +126,7 @@ def set_blob_attribute(self, key, val): def column_names(cls, prefix=None, slim=False): ret = "" for attr in cls.attributesWithTypes: - attr = attr.split(':')[0] + attr = attr.split(":")[0] if slim and attr in cls.skipAttrsToSlim: continue if prefix is None: @@ -142,7 +142,7 @@ def column_names(cls, prefix=None, slim=False): def bind_values_expression(cls): ret = "VALUES(" for attr in cls.attributesWithTypes: - attr = attr.split(':')[0] + attr = attr.split(":")[0] ret += ":%s," % attr ret = ret[:-1] ret += ")" @@ -155,9 +155,9 @@ def bind_update_changes_expression(self): ret = "" for attr in self.attributes: if attr in self.changedAttrs: - ret += '%s=:%s,' % (attr, attr) + ret += "%s=:%s," % (attr, attr) ret = ret[:-1] - ret += ' ' + ret += " " return ret # return map of values @@ -176,7 +176,7 @@ def values_map(self, only_changed=False): val = None if attr in self.serializedAttrs: val = json.dumps(val, cls=PythonObjectEncoder) - ret[':%s' % attr] = val + ret[":%s" % attr] = val return ret # return list of values diff --git a/pandaharvester/harvestercore/work_spec.py b/pandaharvester/harvestercore/work_spec.py index 2e16680d..f927b1e4 100644 --- a/pandaharvester/harvestercore/work_spec.py +++ b/pandaharvester/harvestercore/work_spec.py @@ -16,31 +16,24 @@ # work spec class WorkSpec(SpecBase): # worker statuses - ST_submitted = 'submitted' - ST_running = 'running' - ST_finished = 'finished' - ST_failed = 'failed' - ST_ready = 'ready' - ST_cancelled = 'cancelled' - ST_idle = 'idle' - ST_missed = 'missed' - ST_pending = 'pending' + ST_submitted = "submitted" + ST_running = "running" + ST_finished = "finished" + ST_failed = "failed" + ST_ready = "ready" + ST_cancelled = "cancelled" + ST_idle = "idle" + ST_missed = 
"missed" + ST_pending = "pending" # list of worker statuses - ST_LIST = [ST_submitted, - ST_running, - ST_finished, - ST_failed, - ST_ready, - ST_cancelled, - ST_idle, - ST_missed] + ST_LIST = [ST_submitted, ST_running, ST_finished, ST_failed, ST_ready, ST_cancelled, ST_idle, ST_missed] # type of mapping between job and worker - MT_NoJob = 'NoJob' - MT_OneToOne = 'OneToOne' - MT_MultiJobs = 'ManyToOne' - MT_MultiWorkers = 'OneToMany' + MT_NoJob = "NoJob" + MT_OneToOne = "OneToOne" + MT_MultiJobs = "ManyToOne" + MT_MultiWorkers = "OneToMany" # events EV_noEvents = 0 @@ -48,73 +41,74 @@ class WorkSpec(SpecBase): EV_requestEvents = 2 # attributes - attributesWithTypes = ('workerID:integer primary key', - 'batchID:text', - 'mapType:text', - 'queueName:text', - 'status:text / index', - 'hasJob:integer', - 'workParams:blob', - 'workAttributes:blob', - 'eventsRequestParams:blob', - 'eventsRequest:integer / index', - 'computingSite:text / index', - 'creationTime:timestamp', - 'submitTime:timestamp / index', - 'startTime:timestamp', - 'endTime:timestamp', - 'nCore:integer', - 'walltime:timestamp', - 'accessPoint:text', - 'modificationTime:timestamp / index', - 'lastUpdate:timestamp / index', - 'eventFeedTime:timestamp / index', - 'lockedBy:text', - 'postProcessed:integer', - 'nodeID:text', - 'minRamCount:integer', - 'maxDiskCount:integer', - 'maxWalltime:integer', - 'killTime:timestamp / index', - 'computingElement:text', - 'nJobsToReFill:integer / index', - 'logFilesToUpload:blob', - 'jobType:text', - 'resourceType:text', - 'nativeExitCode:integer', - 'nativeStatus:text', - 'diagMessage:varchar(500)', - 'nJobs:integer', - 'submissionHost:text', - 'configID:integer / index', - 'syncLevel:integer', - 'checkTime:timestamp', - 'ioIntensity:integer', - 'harvesterHost:text', - 'pilotType:text', - 'eventFeedLock:text', - 'errorCode:integer', - 'errorDiag:text' - ) + attributesWithTypes = ( + "workerID:integer primary key", + "batchID:text", + "mapType:text", + "queueName:text", + "status:text / index", + "hasJob:integer", + "workParams:blob", + "workAttributes:blob", + "eventsRequestParams:blob", + "eventsRequest:integer / index", + "computingSite:text / index", + "creationTime:timestamp", + "submitTime:timestamp / index", + "startTime:timestamp", + "endTime:timestamp", + "nCore:integer", + "walltime:timestamp", + "accessPoint:text", + "modificationTime:timestamp / index", + "lastUpdate:timestamp / index", + "eventFeedTime:timestamp / index", + "lockedBy:text", + "postProcessed:integer", + "nodeID:text", + "minRamCount:integer", + "maxDiskCount:integer", + "maxWalltime:integer", + "killTime:timestamp / index", + "computingElement:text", + "nJobsToReFill:integer / index", + "logFilesToUpload:blob", + "jobType:text", + "resourceType:text", + "nativeExitCode:integer", + "nativeStatus:text", + "diagMessage:varchar(500)", + "nJobs:integer", + "submissionHost:text", + "configID:integer / index", + "syncLevel:integer", + "checkTime:timestamp", + "ioIntensity:integer", + "harvesterHost:text", + "pilotType:text", + "eventFeedLock:text", + "errorCode:integer", + "errorDiag:text", + ) # attributes to skip when slim reading - skipAttrsToSlim = ('workParams', 'workAttributes') + skipAttrsToSlim = ("workParams", "workAttributes") # constructor def __init__(self): SpecBase.__init__(self) - object.__setattr__(self, 'isNew', False) - object.__setattr__(self, 'nextLookup', False) - object.__setattr__(self, 'jobspec_list', None) - object.__setattr__(self, 'pandaid_list', None) - object.__setattr__(self, 'new_status', 
False) - object.__setattr__(self, 'pilot_closed', False) + object.__setattr__(self, "isNew", False) + object.__setattr__(self, "nextLookup", False) + object.__setattr__(self, "jobspec_list", None) + object.__setattr__(self, "pandaid_list", None) + object.__setattr__(self, "new_status", False) + object.__setattr__(self, "pilot_closed", False) # keep state for pickle def __getstate__(self): odict = SpecBase.__getstate__(self) - del odict['isNew'] - del odict['new_status'] + del odict["isNew"] + del odict["new_status"] return odict # set status @@ -134,30 +128,30 @@ def set_status(self, value): # get access point def get_access_point(self): # replace placeholders - if '$' in self.accessPoint: - patts = re.findall('\$\{([a-zA-Z\d_.]+)\}', self.accessPoint) + if "$" in self.accessPoint: + patts = re.findall("\$\{([a-zA-Z\d_.]+)\}", self.accessPoint) for patt in patts: - tmpKey = '${' + patt + '}' + tmpKey = "${" + patt + "}" tmpVar = None if hasattr(self, patt): tmpVar = str(getattr(self, patt)) - elif patt == 'harvesterID': + elif patt == "harvesterID": tmpVar = harvester_config.master.harvester_id else: - _match = re.search('^_workerID_((?:\d+.)*\d)$', patt) + _match = re.search("^_workerID_((?:\d+.)*\d)$", patt) if _match: workerID_str = str(self.workerID) - digit_list = _match.group(1).split('.') + digit_list = _match.group(1).split(".") string_list = [] for _d in digit_list: digit = int(_d) try: - _n = workerID_str[(-1-digit)] + _n = workerID_str[(-1 - digit)] except IndexError: - string_list.append('0') + string_list.append("0") else: string_list.append(_n) - tmpVar = ''.join(string_list) + tmpVar = "".join(string_list) if tmpVar is not None: self.accessPoint = self.accessPoint.replace(tmpKey, tmpVar) return self.accessPoint @@ -185,16 +179,16 @@ def convert_to_job_status(self, status=None): if status is None: status = self.status if status in [self.ST_submitted, self.ST_ready]: - jobStatus = 'starting' + jobStatus = "starting" jobSubStatus = status elif status in [self.ST_finished, self.ST_failed, self.ST_cancelled]: jobStatus = status - jobSubStatus = 'to_transfer' + jobSubStatus = "to_transfer" elif status in [self.ST_missed]: - jobStatus = 'missed' + jobStatus = "missed" jobSubStatus = status else: - jobStatus = 'running' + jobStatus = "running" jobSubStatus = status return jobStatus, jobSubStatus @@ -218,7 +212,7 @@ def trigger_propagation(self): # disable propagation def disable_propagation(self): self.lastUpdate = None - self.force_update('lastUpdate') + self.force_update("lastUpdate") # final status def is_final_status(self): @@ -227,43 +221,40 @@ def is_final_status(self): # convert to propagate def convert_to_propagate(self): data = dict() - for attr in ['workerID', - 'batchID', - 'queueName', - 'status', - 'computingSite', - 'nCore', - 'nodeID', - 'submitTime', - 'startTime', - 'endTime', - 'jobType', - 'resourceType', - 'nativeExitCode', - 'nativeStatus', - 'diagMessage', - 'nJobs', - 'computingElement', - 'syncLevel', - 'submissionHost', - 'harvesterHost', - 'errorCode' - ]: + for attr in [ + "workerID", + "batchID", + "queueName", + "status", + "computingSite", + "nCore", + "nodeID", + "submitTime", + "startTime", + "endTime", + "jobType", + "resourceType", + "nativeExitCode", + "nativeStatus", + "diagMessage", + "nJobs", + "computingElement", + "syncLevel", + "submissionHost", + "harvesterHost", + "errorCode", + ]: val = getattr(self, attr) if val is not None: if isinstance(val, datetime.datetime): - val = 'datetime/' + val.strftime('%Y-%m-%d %H:%M:%S.%f') + val = 
"datetime/" + val.strftime("%Y-%m-%d %H:%M:%S.%f") data[attr] = val - if self.errorCode not in [None, 0] and self.errorDiag not in [None, '']: - data['diagMessage'] = self.errorDiag + if self.errorCode not in [None, 0] and self.errorDiag not in [None, ""]: + data["diagMessage"] = self.errorDiag if self.pandaid_list is not None: - data['pandaid_list'] = self.pandaid_list + data["pandaid_list"] = self.pandaid_list if self.workAttributes is not None: - for attr in ['stdOut', - 'stdErr', - 'batchLog', - 'jdl' - ]: + for attr in ["stdOut", "stdErr", "batchLog", "jdl"]: if attr in self.workAttributes: data[attr] = self.workAttributes[attr] return data @@ -287,7 +278,7 @@ def set_work_params(self, data): for key, val in iteritems(data): if key not in self.workParams or self.workParams[key] != val: self.workParams[key] = val - self.force_update('workParams') + self.force_update("workParams") # get work params def get_work_params(self, name): @@ -310,7 +301,7 @@ def set_work_attributes(self, data): for key, val in iteritems(data): if key not in self.workAttributes or self.workAttributes[key] != val: self.workAttributes[key] = val - self.force_update('workAttributes') + self.force_update("workAttributes") # get work attribute def get_work_attribute(self, name): @@ -331,34 +322,30 @@ def update_log_files_to_upload(self, file_path, position, remote_name=None, stre if stream_type is not None: # delete existing stream for tmp_file_path, tmpDict in iteritems(self.logFilesToUpload.copy()): - if tmpDict['stream_type'] == stream_type: + if tmpDict["stream_type"] == stream_type: del self.logFilesToUpload[tmp_file_path] if file_path not in self.logFilesToUpload: - self.logFilesToUpload[file_path] = {'position': position, - 'remote_name': remote_name, - 'stream_type': stream_type} - self.force_update('logFilesToUpload') - elif self.logFilesToUpload[file_path]['position'] != position: - self.logFilesToUpload[file_path]['position'] = position - self.force_update('logFilesToUpload') + self.logFilesToUpload[file_path] = {"position": position, "remote_name": remote_name, "stream_type": stream_type} + self.force_update("logFilesToUpload") + elif self.logFilesToUpload[file_path]["position"] != position: + self.logFilesToUpload[file_path]["position"] = position + self.force_update("logFilesToUpload") # set log file def set_log_file(self, log_type, stream): - if log_type == 'stdout': - keyName = 'stdOut' - elif log_type == 'stderr': - keyName = 'stdErr' - elif log_type == 'jdl': - keyName = 'jdl' + if log_type == "stdout": + keyName = "stdOut" + elif log_type == "stderr": + keyName = "stdErr" + elif log_type == "jdl": + keyName = "jdl" else: - keyName = 'batchLog' - if stream.startswith('http'): + keyName = "batchLog" + if stream.startswith("http"): url = stream else: - remoteName = '{0}__{1}'.format(harvester_config.master.harvester_id, - os.path.basename(stream)) - url = '{0}/{1}'.format(harvester_config.pandacon.pandaCacheURL_R, - remoteName) + remoteName = "{0}__{1}".format(harvester_config.master.harvester_id, os.path.basename(stream)) + url = "{0}/{1}".format(harvester_config.pandacon.pandaCacheURL_R, remoteName) # set file to periodically upload self.update_log_files_to_upload(stream, 0, remoteName, keyName) self.set_work_attributes({keyName: url}) @@ -371,26 +358,24 @@ def get_log_files_to_upload(self): if not os.path.exists(filePath): continue fileSize = os.stat(filePath).st_size - if fileSize <= fileInfo['position']: + if fileSize <= fileInfo["position"]: continue - retList.append((filePath, 
fileInfo['position'], fileSize-fileInfo['position'], - fileInfo['remote_name'])) + retList.append((filePath, fileInfo["position"], fileSize - fileInfo["position"], fileInfo["remote_name"])) return retList # set dialog message def set_dialog_message(self, msg): - if msg not in (None, ''): + if msg not in (None, ""): msg = msg[:500] self.diagMessage = msg # set pilot error def set_pilot_error(self, error_code, error_dialog): - self.set_work_attributes({'pilotErrorCode': error_code, - 'pilotErrorDiag': error_dialog}) + self.set_work_attributes({"pilotErrorCode": error_code, "pilotErrorDiag": error_dialog}) # check if has pilot error def has_pilot_error(self): - return self.has_work_attribute('pilotErrorCode') + return self.has_work_attribute("pilotErrorCode") # set pilot_closed def set_pilot_closed(self): @@ -400,5 +385,5 @@ def set_pilot_closed(self): def set_supplemental_error(self, error_code, error_diag): if error_code is not None: self.errorCode = error_code - if error_diag not in (None, ''): + if error_diag not in (None, ""): self.errorDiag = str(error_diag)[:256] diff --git a/pandaharvester/harvestercore/worker_errors.py b/pandaharvester/harvestercore/worker_errors.py index 2c975dca..20478075 100644 --- a/pandaharvester/harvestercore/worker_errors.py +++ b/pandaharvester/harvestercore/worker_errors.py @@ -2,10 +2,10 @@ # Integer values for errors must be not less than 1000 -class WorkerErrors (object): +class WorkerErrors(object): error_codes = { - 'SUCCEEDED': 0, - 'UNKNOWN': 1000, - 'PREEMPTED': 1001, - 'GENERAL_ERROR': 9000, + "SUCCEEDED": 0, + "UNKNOWN": 1000, + "PREEMPTED": 1001, + "GENERAL_ERROR": 9000, } diff --git a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py index e206c81a..cddc9496 100644 --- a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py +++ b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py @@ -5,7 +5,7 @@ from pandaharvester.harvestercore import core_utils # logger -_logger = core_utils.setup_logger('arcproxy_cred_manager') +_logger = core_utils.setup_logger("arcproxy_cred_manager") # credential manager with no-voms proxy using arcproxy @@ -17,24 +17,21 @@ def __init__(self, **kwarg): # check proxy def check_credential(self): # make logger - mainLog = self.make_logger(_logger, method_name='check_credential') + mainLog = self.make_logger(_logger, method_name="check_credential") # output is lifetime left of voms extension in seconds comStr = "arcproxy -i vomsACvalidityLeft -P {0}".format(self.outCertFile) mainLog.debug(comStr) try: - p = subprocess.run(comStr.split(), - encoding='utf-8', - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.run(comStr.split(), encoding="utf-8", stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut = p.stdout.strip() stdErr = p.stderr retCode = p.returncode except Exception: core_utils.dump_error_message(mainLog) return False - mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) - if retCode != 0 or not re.match(r'\d+', stdOut): - mainLog.error('Unexpected output from arcproxy: {0}'.format(stdOut)) + mainLog.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) + if retCode != 0 or not re.match(r"\d+", stdOut): + mainLog.error("Unexpected output from arcproxy: {0}".format(stdOut)) return False # return whether lifetime is greater than three days return int(stdOut) > 3600 * 72 @@ -42,22 +39,17 @@ def check_credential(self): # renew proxy def renew_credential(self): # 
make logger - mainLog = self.make_logger(_logger, method_name='renew_credential') - comStr = "arcproxy -S {0} -P {1} -c validityPeriod=96h -c vomsACvalidityPeriod=96h -C {2} -K {2}".format(self.voms, - self.outCertFile, - self.inCertFile) + mainLog = self.make_logger(_logger, method_name="renew_credential") + comStr = "arcproxy -S {0} -P {1} -c validityPeriod=96h -c vomsACvalidityPeriod=96h -C {2} -K {2}".format(self.voms, self.outCertFile, self.inCertFile) mainLog.debug(comStr) try: - p = subprocess.run(comStr.split(), - encoding='utf-8', - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.run(comStr.split(), encoding="utf-8", stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut = p.stdout stdErr = p.stderr retCode = p.returncode - mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + mainLog.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) except Exception: - stdOut = '' + stdOut = "" stdErr = core_utils.dump_error_message(mainLog) retCode = -1 return retCode == 0, "{0} {1}".format(stdOut, stdErr) diff --git a/pandaharvester/harvestercredmanager/base_cred_manager.py b/pandaharvester/harvestercredmanager/base_cred_manager.py index e11d5329..f86084a6 100644 --- a/pandaharvester/harvestercredmanager/base_cred_manager.py +++ b/pandaharvester/harvestercredmanager/base_cred_manager.py @@ -3,7 +3,6 @@ # base credential manager class BaseCredManager(PluginBase): - # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) @@ -18,4 +17,4 @@ def check_credential(self): # renew proxy def renew_credential(self): - return True, '' + return True, "" diff --git a/pandaharvester/harvestercredmanager/dummy_cred_manager.py b/pandaharvester/harvestercredmanager/dummy_cred_manager.py index 8145bedd..0632d154 100644 --- a/pandaharvester/harvestercredmanager/dummy_cred_manager.py +++ b/pandaharvester/harvestercredmanager/dummy_cred_manager.py @@ -3,7 +3,6 @@ # dummy credential manager class DummyCredManager(BaseCredManager): - # constructor def __init__(self, **kwarg): BaseCredManager.__init__(self, **kwarg) @@ -14,4 +13,4 @@ def check_credential(self): # renew proxy def renew_credential(self): - return True, '' + return True, "" diff --git a/pandaharvester/harvestercredmanager/grid_cred_manager.py b/pandaharvester/harvestercredmanager/grid_cred_manager.py index 512768ed..329df790 100644 --- a/pandaharvester/harvestercredmanager/grid_cred_manager.py +++ b/pandaharvester/harvestercredmanager/grid_cred_manager.py @@ -8,7 +8,7 @@ # logger -_logger = core_utils.setup_logger('grid_cred_manager') +_logger = core_utils.setup_logger("grid_cred_manager") # credential manager using grid-proxy @@ -20,39 +20,32 @@ def __init__(self, **kwarg): # check proxy def check_credential(self): # make logger - mainLog = self.make_logger(_logger, method_name='check_credential') + mainLog = self.make_logger(_logger, method_name="check_credential") comStr = "grid-proxy-info -exists -hours 72 -file {0}".format(self.outCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode except Exception: core_utils.dump_error_message(mainLog) return False - mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + mainLog.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, 
stdErr)) return retCode == 0 # renew proxy def renew_credential(self): # make logger - mainLog = self.make_logger(_logger, method_name='renew_credential') - comStr = "grid-proxy-init -out {0} -valid 96:00 -cert {1}".format(self.outCertFile, - self.inCertFile) + mainLog = self.make_logger(_logger, method_name="renew_credential") + comStr = "grid-proxy-init -out {0} -valid 96:00 -cert {1}".format(self.outCertFile, self.inCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode - mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + mainLog.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) except Exception: - stdOut = '' + stdOut = "" stdErr = core_utils.dump_error_message(mainLog) retCode = -1 return retCode == 0, "{0} {1}".format(stdOut, stdErr) diff --git a/pandaharvester/harvestercredmanager/iam_token_cred_manager.py b/pandaharvester/harvestercredmanager/iam_token_cred_manager.py index 12a0ef0c..7d455e81 100644 --- a/pandaharvester/harvestercredmanager/iam_token_cred_manager.py +++ b/pandaharvester/harvestercredmanager/iam_token_cred_manager.py @@ -10,31 +10,33 @@ from pandaharvester.harvestermisc.token_utils import endpoint_to_filename, WLCG_scopes, IssuerBroker # logger -_logger = core_utils.setup_logger('iam_token_cred_manager') +_logger = core_utils.setup_logger("iam_token_cred_manager") # allowed target types -ALL_TARGET_TYPES = ['common', 'ce'] +ALL_TARGET_TYPES = ["common", "ce"] # default port for CEs default_port_map = { - 'htcondor-ce': 9619, - } + "htcondor-ce": 9619, +} # credential manager with IAM token + + class IamTokenCredManager(BaseCredManager): # constructor def __init__(self, **kwarg): BaseCredManager.__init__(self, **kwarg) # make logger - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='__init__') + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="__init__") # attributes - if hasattr(self, 'inFile'): + if hasattr(self, "inFile"): # parse inFile setup configuration try: with open(self.inFile) as f: self.setupMap = json.load(f) except Exception as e: - tmp_log.error('Error with inFile. {0}: {1}'.format(e.__class__.__name__, e)) + tmp_log.error("Error with inFile. 
{0}: {1}".format(e.__class__.__name__, e)) self.setupMap = {} raise else: @@ -42,77 +44,73 @@ def __init__(self, **kwarg): self.setupMap = dict(vars(self)) # validate setupMap try: - self.client_cred_file = self.setupMap['client_cred_file'] + self.client_cred_file = self.setupMap["client_cred_file"] with open(self.client_cred_file) as f: client_cred_dict = json.load(f) - self.issuer = client_cred_dict['issuer'] - self.client_id = client_cred_dict['client_id'] - self.client_secret = client_cred_dict['client_secret'] - self.target_type = self.setupMap['target_type'] - self.out_dir = self.setupMap['out_dir'] - self.lifetime = self.setupMap.get('lifetime', 14*24*60*60) - self.target_list = self.setupMap.get('target_list') - self.target_list_file = self.setupMap.get('target_list_file') - self.update_ts_path = self.setupMap.get('update_ts_path', os.path.join(self.out_dir, '_last_update')) - self.check_interval = self.setupMap.get('check_interval', 300) - self.refresh_interval = self.setupMap.get('refresh_interval', 3600) + self.issuer = client_cred_dict["issuer"] + self.client_id = client_cred_dict["client_id"] + self.client_secret = client_cred_dict["client_secret"] + self.target_type = self.setupMap["target_type"] + self.out_dir = self.setupMap["out_dir"] + self.lifetime = self.setupMap.get("lifetime", 14 * 24 * 60 * 60) + self.target_list = self.setupMap.get("target_list") + self.target_list_file = self.setupMap.get("target_list_file") + self.update_ts_path = self.setupMap.get("update_ts_path", os.path.join(self.out_dir, "_last_update")) + self.check_interval = self.setupMap.get("check_interval", 300) + self.refresh_interval = self.setupMap.get("refresh_interval", 3600) except KeyError as e: - tmp_log.error('Missing attributes in setup. {0}'.format(traceback.format_exc())) + tmp_log.error("Missing attributes in setup. 
{0}".format(traceback.format_exc())) raise else: if self.target_type not in ALL_TARGET_TYPES: - tmp_log.error('Unsupported target_type: {0}'.format(self.target_type)) - raise Exception('Unsupported target_type') + tmp_log.error("Unsupported target_type: {0}".format(self.target_type)) + raise Exception("Unsupported target_type") # initialize self.targets_dict = dict() # handle targets self._handle_target_types() # issuer broker - self.issuer_broker = IssuerBroker(self.issuer, self.client_id, self.client_secret, - name=self.setup_name) + self.issuer_broker = IssuerBroker(self.issuer, self.client_id, self.client_secret, name=self.setup_name) def _is_updated(self): now_time = time.time() ret = False - if os.path.isfile(self.update_ts_path) \ - and now_time - os.path.getmtime(self.update_ts_path) < self.check_interval: + if os.path.isfile(self.update_ts_path) and now_time - os.path.getmtime(self.update_ts_path) < self.check_interval: ret = True return ret - + def _is_fresh(self, token_path): now_time = time.time() ret = False - if os.path.isfile(token_path) and os.path.getsize(token_path) > 0 \ - and now_time - os.path.getmtime(token_path) < self.refresh_interval: + if os.path.isfile(token_path) and os.path.getsize(token_path) > 0 and now_time - os.path.getmtime(token_path) < self.refresh_interval: ret = True return ret def _update_ts(self): - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='_update_ts') - with open(self.update_ts_path, 'w') as f: + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="_update_ts") + with open(self.update_ts_path, "w") as f: f.write(str(self.out_dir)) - tmp_log.debug('updated timestamp file {0}'.format(self.update_ts_path)) - + tmp_log.debug("updated timestamp file {0}".format(self.update_ts_path)) def _clean_up(self): - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='_clean_up') + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="_clean_up") now_time = time.time() for filename in os.listdir(self.out_dir): file_path = os.path.join(self.out_dir, filename) if now_time - os.path.getmtime(file_path) > self.lifetime: if os.path.isfile(file_path): os.remove(file_path) - tmp_log.debug('deleted old token file {0}'.format(file_path)) + tmp_log.debug("deleted old token file {0}".format(file_path)) def _handle_target_types(self): # make logger - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='_handle_target_types') + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="_handle_target_types") try: self.panda_queues_dict = PandaQueuesDict() except Exception as e: - tmp_log.error('Problem calling PandaQueuesDict. {0}'.format(traceback.format_exc())) + tmp_log.error("Problem calling PandaQueuesDict. {0}".format(traceback.format_exc())) raise - if self.target_type == 'common': + if self.target_type == "common": if not self.target_list: pass else: @@ -120,14 +118,14 @@ def _handle_target_types(self): self.targets_dict[target] = {} # scope self.scope = "" - elif self.target_type == 'ce': + elif self.target_type == "ce": try: # retrieve CEs from CRIC for site, val in self.panda_queues_dict.items(): - if val.get('status') == 'offline': + if val.get("status") == "offline": # do not generate token for offline PQs, but for online, brokeroff, pause, ... 
continue - ce_q_list = val.get('queues') + ce_q_list = val.get("queues") if ce_q_list: # loop over all ce queues for ce_q in ce_q_list: @@ -135,26 +133,26 @@ def _handle_target_types(self): # if not ce_status or ce_status == 'DISABLED': # # skip disabled ce queues # continue - ce_endpoint = ce_q.get('ce_endpoint') - ce_hostname = re.sub(':\w*', '', ce_endpoint) - ce_flavour = ce_q.get('ce_flavour') + ce_endpoint = ce_q.get("ce_endpoint") + ce_hostname = re.sub(":\w*", "", ce_endpoint) + ce_flavour = ce_q.get("ce_flavour") ce_flavour_str = str(ce_flavour).lower() ce_endpoint_modified = ce_endpoint if ce_endpoint == ce_hostname: # no port, add default port if ce_flavour_str in default_port_map: default_port = default_port_map[ce_flavour_str] - ce_endpoint_modified = '{0}:{1}'.format(ce_hostname, default_port) + ce_endpoint_modified = "{0}:{1}".format(ce_hostname, default_port) if ce_endpoint_modified and ce_flavour: target_attr_dict = { - 'ce_flavour': ce_flavour, - } + "ce_flavour": ce_flavour, + } self.targets_dict[ce_endpoint_modified] = target_attr_dict else: # do not generate token if no queues of CE continue except Exception as e: - tmp_log.error('Problem retrieving CEs from CRIC. {0}'.format(traceback.format_exc())) + tmp_log.error("Problem retrieving CEs from CRIC. {0}".format(traceback.format_exc())) raise # retrieve CEs from local file if self.target_list_file: @@ -164,11 +162,11 @@ def _handle_target_types(self): if target_str: target = target_str.rstrip() target_attr_dict = { - 'ce_flavour': None, - } + "ce_flavour": None, + } self.targets_dict[target] = target_attr_dict except Exception as e: - tmp_log.error('Problem retrieving CEs from local file. {0}'.format(traceback.format_exc())) + tmp_log.error("Problem retrieving CEs from local file. 
{0}".format(traceback.format_exc())) raise # scope for CE self.scope = WLCG_scopes.COMPUTE_ALL @@ -176,24 +174,24 @@ def _handle_target_types(self): # check proxy def check_credential(self): # make logger - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='check_credential') + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="check_credential") # clean up self._clean_up() # same update period as credmanager agent is_fresh = self._is_updated() if is_fresh: - tmp_log.debug('last renewal is still recent, skipped') + tmp_log.debug("last renewal is still recent, skipped") else: - tmp_log.debug('to renew tokens') + tmp_log.debug("to renew tokens") return is_fresh # renew proxy def renew_credential(self): # make logger - tmp_log = self.make_logger(_logger, 'config={0}'.format(self.setup_name), method_name='renew_credential') + tmp_log = self.make_logger(_logger, "config={0}".format(self.setup_name), method_name="renew_credential") # go all_ok = True - all_err_str = '' + all_err_str = "" for target in self.targets_dict: try: # write to file @@ -202,21 +200,21 @@ def renew_credential(self): # check token freshness if self._is_fresh(token_path): # token still fresh, skip it - tmp_log.debug('token for {0} at {1} still fresh; skipped'.format(target, token_path)) + tmp_log.debug("token for {0} at {1} still fresh; skipped".format(target, token_path)) else: # renew access token of target access_token = self.issuer_broker.get_access_token(aud=target, scope=self.scope) - with open(token_path, 'w') as f: + with open(token_path, "w") as f: f.write(access_token) - tmp_log.info('renewed token for {0} at {1}'.format(target, token_path)) + tmp_log.info("renewed token for {0} at {1}".format(target, token_path)) except Exception as e: - err_str = 'Problem getting token for {0}. {1}'.format(target, traceback.format_exc()) + err_str = "Problem getting token for {0}. {1}".format(target, traceback.format_exc()) tmp_log.error(err_str) all_ok = False - all_err_str = 'failed to get some tokens. Check the plugin log for details ' + all_err_str = "failed to get some tokens. Check the plugin log for details " continue # update last timestamp self._update_ts() - tmp_log.debug('done') + tmp_log.debug("done") # return return all_ok, all_err_str diff --git a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py index 85941170..b5d1d827 100644 --- a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py +++ b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py @@ -7,7 +7,7 @@ from pandaharvester.harvestermisc.info_utils_k8s import PandaQueuesDictK8s # logger -_logger = core_utils.setup_logger('k8s_secret_cred_manager') +_logger = core_utils.setup_logger("k8s_secret_cred_manager") # credential manager with k8s secret @@ -16,9 +16,9 @@ class K8sSecretCredManager(BaseCredManager): def __init__(self, **kwarg): BaseCredManager.__init__(self, **kwarg) # make logger - tmp_log = self.make_logger(_logger, method_name='__init__') + tmp_log = self.make_logger(_logger, method_name="__init__") # attributes - if hasattr(self, 'inFile') or hasattr(self, 'inCertFile'): + if hasattr(self, "inFile") or hasattr(self, "inCertFile"): # set up with json in inFile try: self.inFile @@ -29,7 +29,7 @@ def __init__(self, **kwarg): with open(self.inFile) as f: self.setupMap = json.load(f) except Exception as e: - tmp_log.error('Error with inFile/inCertFile . 
{0}: {1}'.format(e.__class__.__name__, e)) + tmp_log.error("Error with inFile/inCertFile . {0}: {1}".format(e.__class__.__name__, e)) self.setupMap = {} raise else: @@ -37,11 +37,11 @@ def __init__(self, **kwarg): self.setupMap = dict(vars(self)) # validate setupMap try: - self.k8s_config_file = self.setupMap['k8s_config_file'] - self.proxy_files = self.setupMap['proxy_files'] - self.secret_name = self.setupMap.get('secret_name', 'proxy-secret') + self.k8s_config_file = self.setupMap["k8s_config_file"] + self.proxy_files = self.setupMap["proxy_files"] + self.secret_name = self.setupMap.get("secret_name", "proxy-secret") except KeyError as e: - tmp_log.error('Missing attributes in setup. {0}: {1}'.format(e.__class__.__name__, e)) + tmp_log.error("Missing attributes in setup. {0}: {1}".format(e.__class__.__name__, e)) raise try: @@ -49,11 +49,9 @@ def __init__(self, **kwarg): self.panda_queues_dict = PandaQueuesDictK8s() self.namespace = self.panda_queues_dict.get_k8s_namespace(self.queueName) # k8s client - self.k8s_client = k8s_Client(namespace=self.namespace, queue_name=self.queueName, - config_file=self.k8s_config_file) + self.k8s_client = k8s_Client(namespace=self.namespace, queue_name=self.queueName, config_file=self.k8s_config_file) except Exception as e: - tmp_log.error('Problem instantiating k8s client for {0}. {1}'.format(self.k8s_config_file, - traceback.format_exc())) + tmp_log.error("Problem instantiating k8s client for {0}. {1}".format(self.k8s_config_file, traceback.format_exc())) raise # check proxy @@ -65,15 +63,13 @@ def check_credential(self): # renew proxy def renew_credential(self): # make logger - tmp_log = self.make_logger(_logger, 'queueName={0}'.format(self.queueName), method_name='renew_credential') + tmp_log = self.make_logger(_logger, "queueName={0}".format(self.queueName), method_name="renew_credential") # go try: - rsp = self.k8s_client.create_or_patch_secret( - file_list=self.proxy_files, secret_name=self.secret_name) - tmp_log.debug('done') + rsp = self.k8s_client.create_or_patch_secret(file_list=self.proxy_files, secret_name=self.secret_name) + tmp_log.debug("done") except KeyError as e: - errStr = 'Error when renew proxy secret . {0}: {1}'.format( - e.__class__.__name__, e) + errStr = "Error when renew proxy secret . {0}: {1}".format(e.__class__.__name__, e) return False, errStr else: - return True, '' + return True, "" diff --git a/pandaharvester/harvestercredmanager/lancium_cred_manager.py b/pandaharvester/harvestercredmanager/lancium_cred_manager.py index 0440419c..0a9a4f2f 100644 --- a/pandaharvester/harvestercredmanager/lancium_cred_manager.py +++ b/pandaharvester/harvestercredmanager/lancium_cred_manager.py @@ -9,7 +9,7 @@ from pandaharvester.harvestermisc.info_utils import PandaQueuesDict # logger -_logger = core_utils.setup_logger('lancium_cred_manager') +_logger = core_utils.setup_logger("lancium_cred_manager") # upload cred to Lancium periodically @@ -18,10 +18,10 @@ def __init__(self, **kwarg): self.hostname = socket.getfqdn() BaseCredManager.__init__(self, **kwarg) - tmp_log = self.make_logger(_logger, method_name='__init__') + tmp_log = self.make_logger(_logger, method_name="__init__") # attributes - if hasattr(self, 'inFile') or hasattr(self, 'inCertFile'): + if hasattr(self, "inFile") or hasattr(self, "inCertFile"): # set up with json in inFile try: self.inFile @@ -32,7 +32,7 @@ def __init__(self, **kwarg): with open(self.inFile) as f: self.setupMap = json.load(f) except Exception as e: - tmp_log.error('Error with inFile/inCertFile . 
{0}: {1}'.format(e.__class__.__name__, e)) + tmp_log.error("Error with inFile/inCertFile . {0}: {1}".format(e.__class__.__name__, e)) self.setupMap = {} raise else: @@ -40,17 +40,17 @@ def __init__(self, **kwarg): self.setupMap = dict(vars(self)) # validate setupMap try: - self.proxy_files = self.setupMap['proxy_files'] - self.secret_name = self.setupMap.get('secret_name', 'proxy-secret') + self.proxy_files = self.setupMap["proxy_files"] + self.secret_name = self.setupMap.get("secret_name", "proxy-secret") except KeyError as e: - tmp_log.error('Missing attributes in setup. {0}: {1}'.format(e.__class__.__name__, e)) + tmp_log.error("Missing attributes in setup. {0}: {1}".format(e.__class__.__name__, e)) raise try: self.panda_queues_dict = PandaQueuesDict() self.lancium_client = LanciumClient(self.hostname, queue_name=self.queueName) except Exception as e: - tmp_log.error('Problem instantiating lancium client. {1}'.format(traceback.format_exc())) + tmp_log.error("Problem instantiating lancium client. {1}".format(traceback.format_exc())) raise # check proxy @@ -59,29 +59,29 @@ def check_credential(self): return False def upload_proxies(self, proxy_files): - tmp_log = self.make_logger(_logger, method_name='upload_proxies') + tmp_log = self.make_logger(_logger, method_name="upload_proxies") - tmp_log.debug('Start uploading proxies') + tmp_log.debug("Start uploading proxies") for local_file in proxy_files: try: - tmp_log.debug('Uploading proxy {0}...'.format(local_file)) + tmp_log.debug("Uploading proxy {0}...".format(local_file)) base_name = os.path.basename(local_file) lancium_file = os.path.join(SECRETS_PATH, base_name) self.lancium_client.upload_file(local_file, lancium_file) except Exception: - tmp_log.error('Problem uploading proxy {0}. {1}'.format(local_file, traceback.format_exc())) + tmp_log.error("Problem uploading proxy {0}. {1}".format(local_file, traceback.format_exc())) - tmp_log.debug('Done uploading proxies') + tmp_log.debug("Done uploading proxies") # renew proxy def renew_credential(self): - tmp_log = self.make_logger(_logger, 'queueName={0}'.format(self.queueName), method_name='renew_credential') + tmp_log = self.make_logger(_logger, "queueName={0}".format(self.queueName), method_name="renew_credential") try: self.upload_proxies(self.proxy_files) - tmp_log.debug('done') + tmp_log.debug("done") except KeyError as e: - err_str = 'Error renewing proxy secret. {0}: {1}'.format(e.__class__.__name__, e) + err_str = "Error renewing proxy secret. 
{0}: {1}".format(e.__class__.__name__, e) return False, err_str else: - return True, '' + return True, "" diff --git a/pandaharvester/harvestercredmanager/no_voms_cred_manager.py b/pandaharvester/harvestercredmanager/no_voms_cred_manager.py index c60a3bc2..46aeb82c 100644 --- a/pandaharvester/harvestercredmanager/no_voms_cred_manager.py +++ b/pandaharvester/harvestercredmanager/no_voms_cred_manager.py @@ -7,7 +7,7 @@ from pandaharvester.harvestercore import core_utils # logger -_logger = core_utils.setup_logger('no_voms_cred_manager') +_logger = core_utils.setup_logger("no_voms_cred_manager") # credential manager with no-voms proxy @@ -16,81 +16,77 @@ class NoVomsCredManager(BaseCredManager): def __init__(self, **kwarg): BaseCredManager.__init__(self, **kwarg) # make logger - main_log = self.make_logger(_logger, method_name='__init__') + main_log = self.make_logger(_logger, method_name="__init__") # set up with direct attributes self.setupMap = dict(vars(self)) # setupMap - self.genFromKeyCert = self.setupMap.get('genFromKeyCert') - self.key = self.setupMap.get('key') - self.cert = self.setupMap.get('cert') - self.checkPeriod = self.setupMap.get('checkPeriod', 1) - self.lifetime = self.setupMap.get('lifetime', 96) - self.renewCommand = self.setupMap.get('renewCommand', 'voms-proxy-init') - self.extraRenewOpts = self.setupMap.get('extraRenewOpts', '') - self.lifetimeOptFormat = self.setupMap.get('lifetimeOptFormat', '-valid {lifetime}:00') + self.genFromKeyCert = self.setupMap.get("genFromKeyCert") + self.key = self.setupMap.get("key") + self.cert = self.setupMap.get("cert") + self.checkPeriod = self.setupMap.get("checkPeriod", 1) + self.lifetime = self.setupMap.get("lifetime", 96) + self.renewCommand = self.setupMap.get("renewCommand", "voms-proxy-init") + self.extraRenewOpts = self.setupMap.get("extraRenewOpts", "") + self.lifetimeOptFormat = self.setupMap.get("lifetimeOptFormat", "-valid {lifetime}:00") # check proxy lifetime for monitoring/alerting purposes def check_credential_lifetime(self): - main_log = self.make_logger(_logger, method_name='check_credential_lifetime') + main_log = self.make_logger(_logger, method_name="check_credential_lifetime") lifetime = None try: command_str = "voms-proxy-info -timeleft -file {0}".format(self.outCertFile) p = subprocess.Popen(command_str.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() return_code = p.returncode - main_log.debug('retCode={0} stdout={1} stderr={2}'.format(return_code, stdout, stderr)) + main_log.debug("retCode={0} stdout={1} stderr={2}".format(return_code, stdout, stderr)) if return_code == 0: # OK lifetime = int(stdout) / 3600 except Exception: core_utils.dump_error_message(main_log) if isinstance(lifetime, float): - main_log.debug('returning lifetime {0:.3f}'.format(lifetime)) + main_log.debug("returning lifetime {0:.3f}".format(lifetime)) else: - main_log.debug('returning lifetime {0}'.format(lifetime)) + main_log.debug("returning lifetime {0}".format(lifetime)) return lifetime # check proxy def check_credential(self): # make logger - main_log = self.make_logger(_logger, method_name='check_credential') + main_log = self.make_logger(_logger, method_name="check_credential") # lifetime threshold to trigger renew in hour threshold = max(self.lifetime - self.checkPeriod, 0) comStr = "voms-proxy-info -exists -hours {0} -file {1}".format(threshold, self.outCertFile) main_log.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - 
stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode except Exception: core_utils.dump_error_message(main_log) return False - main_log.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + main_log.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) return retCode == 0 # renew proxy def renew_credential(self): # make logger - main_log = self.make_logger(_logger, method_name='renew_credential') + main_log = self.make_logger(_logger, method_name="renew_credential") # voms or no-voms - voms_option = '' + voms_option = "" if self.voms is not None: - voms_option = '-voms {0}'.format(self.voms) + voms_option = "-voms {0}".format(self.voms) # generate proxy with a long lifetime proxy (default) or from key/cert pair if self.genFromKeyCert: - noregen_option = '' + noregen_option = "" usercert_value = self.cert userkey_value = self.key else: - noregen_option = '-noregen' + noregen_option = "-noregen" usercert_value = self.inCertFile userkey_value = self.inCertFile lifetimeOpt = self.lifetimeOptFormat.format(lifetime=self.lifetime) # command - comStr = "{renew_command} -rfc {noregen_option} {voms_option} "\ - "-out {out} {lifetime} -cert={cert} -key={key} {extrea_renew_opts}".format( + comStr = "{renew_command} -rfc {noregen_option} {voms_option} " "-out {out} {lifetime} -cert={cert} -key={key} {extrea_renew_opts}".format( renew_command=self.renewCommand, noregen_option=noregen_option, voms_option=voms_option, @@ -98,19 +94,16 @@ def renew_credential(self): lifetime=lifetimeOpt, cert=usercert_value, key=userkey_value, - extrea_renew_opts=self.extraRenewOpts + extrea_renew_opts=self.extraRenewOpts, ) main_log.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode - main_log.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + main_log.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) except Exception: - stdOut = '' + stdOut = "" stdErr = core_utils.dump_error_message(main_log) retCode = -1 return retCode == 0, "{0} {1}".format(stdOut, stdErr) diff --git a/pandaharvester/harvestercredmanager/proxy_cache_cred_manager.py b/pandaharvester/harvestercredmanager/proxy_cache_cred_manager.py index 16fb868d..67f4f245 100644 --- a/pandaharvester/harvestercredmanager/proxy_cache_cred_manager.py +++ b/pandaharvester/harvestercredmanager/proxy_cache_cred_manager.py @@ -8,7 +8,7 @@ from pandaharvester.harvestercore.communicator_pool import CommunicatorPool # logger -_logger = core_utils.setup_logger('proxy_cache_cred_manager') +_logger = core_utils.setup_logger("proxy_cache_cred_manager") # credential manager with proxy cache @@ -20,33 +20,30 @@ def __init__(self, **kwarg): # check proxy def check_credential(self): # make logger - mainLog = self.make_logger(_logger, method_name='check_credential') + mainLog = self.make_logger(_logger, method_name="check_credential") comStr = "voms-proxy-info -exists -hours 72 -file {0}".format(self.outCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 
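# Illustrative sketch, not part of the patch: the Popen/communicate/returncode
# pattern used throughout these credential managers amounts to running a proxy
# check command and treating exit code 0 as "credential still valid". A minimal
# standalone equivalent (the proxy path below is a hypothetical placeholder):
#
#     import subprocess
#     cmd = "voms-proxy-info -exists -hours 72 -file /tmp/x509up_u12345"
#     completed = subprocess.run(cmd.split(), capture_output=True)
#     credential_ok = (completed.returncode == 0)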
stdOut, stdErr = p.communicate() retCode = p.returncode except Exception: core_utils.dump_error_message(mainLog) return False - mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) + mainLog.debug("retCode={0} stdOut={1} stdErr={2}".format(retCode, stdOut, stdErr)) return retCode == 0 # renew proxy def renew_credential(self): # make logger - mainLog = self.make_logger(_logger, method_name='renew_credential') + mainLog = self.make_logger(_logger, method_name="renew_credential") # make communication channel to PanDA com = CommunicatorPool() proxy, msg = com.get_proxy(self.voms, (self.inCertFile, self.inCertFile)) if proxy is not None: - pFile = open(self.outCertFile, 'w') + pFile = open(self.outCertFile, "w") pFile.write(proxy) pFile.close() else: - mainLog.error('failed to renew credential with a server message : {0}'.format(msg)) + mainLog.error("failed to renew credential with a server message : {0}".format(msg)) return proxy is not None, msg diff --git a/pandaharvester/harvesterextractor/aux_extractor.py b/pandaharvester/harvesterextractor/aux_extractor.py index 33cbe672..c9760e95 100644 --- a/pandaharvester/harvesterextractor/aux_extractor.py +++ b/pandaharvester/harvesterextractor/aux_extractor.py @@ -12,28 +12,28 @@ def __init__(self, **kwarg): # get auxiliary input files def get_aux_inputs(self, jobspec): url_list = [] - jobPars = jobspec.jobParams['jobPars'] + jobPars = jobspec.jobParams["jobPars"] # transformation - trf = jobspec.jobParams['transformation'] - if trf is not None and trf.startswith('http'): + trf = jobspec.jobParams["transformation"] + if trf is not None and trf.startswith("http"): url_list.append(trf) # extract source URL - tmpM = re.search(' --sourceURL\s+([^\s]+)', jobPars) + tmpM = re.search(" --sourceURL\s+([^\s]+)", jobPars) if tmpM is not None: sourceURL = tmpM.group(1) - jobspec.jobParams['sourceURL'] = sourceURL + jobspec.jobParams["sourceURL"] = sourceURL # extract sandbox - if jobspec.jobParams['prodSourceLabel'] == 'user': - tmpM = re.search('-a\s+([^\s]+)', jobPars) + if jobspec.jobParams["prodSourceLabel"] == "user": + tmpM = re.search("-a\s+([^\s]+)", jobPars) else: - tmpM = re.search('-i\s+([^\s]+)', jobPars) + tmpM = re.search("-i\s+([^\s]+)", jobPars) if tmpM is not None: lfn = tmpM.group(1) - url = '{0}/cache/{1}'.format(sourceURL, lfn) + url = "{0}/cache/{1}".format(sourceURL, lfn) url_list.append(url) # extract container image - if 'container_name' in jobspec.jobParams: - url = jobspec.jobParams['container_name'] + if "container_name" in jobspec.jobParams: + url = jobspec.jobParams["container_name"] if self.containerPrefix is not None and not url.startswith(self.containerPrefix): url = self.containerPrefix + url url_list.append(url) diff --git a/pandaharvester/harvesterextractor/base_extractor.py b/pandaharvester/harvesterextractor/base_extractor.py index 337bd327..3ed9a260 100644 --- a/pandaharvester/harvesterextractor/base_extractor.py +++ b/pandaharvester/harvesterextractor/base_extractor.py @@ -4,7 +4,6 @@ # base extractor class BaseExtractor(PluginBase): - # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) @@ -13,7 +12,6 @@ def __init__(self, **kwarg): def make_aux_inputs(self, url_list): retVal = dict() for url in url_list: - lfn = url.split('/')[-1] - retVal[lfn] = {'scope': 'aux_input', 'INTERNAL_FileType': FileSpec.AUX_INPUT, - 'INTERNAL_URL': url} + lfn = url.split("/")[-1] + retVal[lfn] = {"scope": "aux_input", "INTERNAL_FileType": FileSpec.AUX_INPUT, "INTERNAL_URL": url} return 
retVal diff --git a/pandaharvester/harvesterfifo/mysql_fifo.py b/pandaharvester/harvesterfifo/mysql_fifo.py index dcba48ad..1d70c189 100644 --- a/pandaharvester/harvesterfifo/mysql_fifo.py +++ b/pandaharvester/harvesterfifo/mysql_fifo.py @@ -13,12 +13,12 @@ class MysqlFifo(PluginBase): # constructor def __init__(self, **kwarg): self.reconnectTimeout = 300 - if hasattr(harvester_config, 'fifo') and hasattr(harvester_config.fifo, 'reconnectTimeout'): + if hasattr(harvester_config, "fifo") and hasattr(harvester_config.fifo, "reconnectTimeout"): self.reconnectTimeout = harvester_config.db.reconnectTimeout - elif hasattr(harvester_config.db, 'reconnectTimeout'): + elif hasattr(harvester_config.db, "reconnectTimeout"): self.reconnectTimeout = harvester_config.db.reconnectTimeout PluginBase.__init__(self, **kwarg) - self.tableName = '{title}_FIFO'.format(title=self.titleName) + self.tableName = "{title}_FIFO".format(title=self.titleName) # get connection, cursor and error types self._connect_db() # create table for fifo @@ -33,29 +33,29 @@ def __init__(self, **kwarg): # get connection, cursor and error types def _connect_db(self): # DB access attribues - if hasattr(self, 'db_host'): + if hasattr(self, "db_host"): db_host = self.db_host else: try: db_host = harvester_config.fifo.db_host except AttributeError: - db_host = '127.0.0.1' - if hasattr(self, 'db_port'): + db_host = "127.0.0.1" + if hasattr(self, "db_port"): db_port = self.db_port else: try: db_port = harvester_config.fifo.db_port except AttributeError: db_port = 3306 - if hasattr(self, 'db_user'): + if hasattr(self, "db_user"): db_user = self.db_user else: db_user = harvester_config.fifo.db_user - if hasattr(self, 'db_password'): + if hasattr(self, "db_password"): db_password = self.db_password else: db_password = harvester_config.fifo.db_password - if hasattr(self, 'db_schema'): + if hasattr(self, "db_schema"): db_schema = self.db_schema else: db_schema = harvester_config.fifo.db_schema @@ -66,15 +66,13 @@ def _connect_db(self): try: import mysql.connector except ImportError: - raise Exception('No available MySQL DB API installed. Please pip install mysqlclient or mysql-connection-python') + raise Exception("No available MySQL DB API installed. 
Please pip install mysqlclient or mysql-connection-python") else: - self.con = mysql.connector.connect(user=db_user, passwd=db_password, db=db_schema, - host=db_host, port=db_port, charset='utf8') + self.con = mysql.connector.connect(user=db_user, passwd=db_password, db=db_schema, host=db_host, port=db_port, charset="utf8") self.cur = self.con.cursor(buffered=True) self.OperationalError = mysql.connector.errors.OperationalError else: - self.con = MySQLdb.connect(user=db_user, passwd=db_password, - db=db_schema, host=db_host, port=db_port) + self.con = MySQLdb.connect(user=db_user, passwd=db_password, db=db_schema, host=db_host, port=db_port) self.cur = self.con.cursor() self.OperationalError = MySQLdb.OperationalError @@ -119,7 +117,9 @@ def _wrapped_method(self, *args, **kwargs): raise exc else: raise exc + return _wrapped_method + return _decorator(method) # wrapper for execute @@ -147,40 +147,29 @@ def rollback(self): # make table def _make_table(self): sql_make_table = ( - 'CREATE TABLE IF NOT EXISTS {table_name} ' - '(' - ' id BIGINT NOT NULL AUTO_INCREMENT,' - ' item LONGBLOB,' - ' score DOUBLE,' - ' temporary TINYINT DEFAULT 0,' - ' PRIMARY KEY (id) ' - ')' - ).format(table_name=self.tableName) + "CREATE TABLE IF NOT EXISTS {table_name} " + "(" + " id BIGINT NOT NULL AUTO_INCREMENT," + " item LONGBLOB," + " score DOUBLE," + " temporary TINYINT DEFAULT 0," + " PRIMARY KEY (id) " + ")" + ).format(table_name=self.tableName) self.execute(sql_make_table) # make index def _make_index(self): - sql_make_index = ( - 'CREATE INDEX IF NOT EXISTS score_index ON {table_name} ' - '(score)' - ).format(table_name=self.tableName) + sql_make_index = ("CREATE INDEX IF NOT EXISTS score_index ON {table_name} " "(score)").format(table_name=self.tableName) self.execute(sql_make_index) def _push(self, item, score): - sql_push = ( - 'INSERT INTO {table_name} ' - '(item, score) ' - 'VALUES (%s, %s) ' - ).format(table_name=self.tableName) + sql_push = ("INSERT INTO {table_name} " "(item, score) " "VALUES (%s, %s) ").format(table_name=self.tableName) params = (item, score) self.execute(sql_push, params) def _push_by_id(self, id, item, score): - sql_push = ( - 'INSERT IGNORE INTO {table_name} ' - '(id, item, score) ' - 'VALUES (%s, %s, %s) ' - ).format(table_name=self.tableName) + sql_push = ("INSERT IGNORE INTO {table_name} " "(id, item, score) " "VALUES (%s, %s, %s) ").format(table_name=self.tableName) params = (id, item, score) self.execute(sql_push, params) n_row = self.cur.rowcount @@ -189,29 +178,15 @@ def _push_by_id(self, id, item, score): else: return False - def _pop(self, timeout=None, protective=False, mode='first'): - sql_pop_get_first = ( - 'SELECT id, item, score FROM {table_name} ' - 'WHERE temporary = 0 ' - 'ORDER BY score LIMIT 1 ' - ).format(table_name=self.tableName) - sql_pop_get_last = ( - 'SELECT id, item, score FROM {table_name} ' - 'WHERE temporary = 0 ' - 'ORDER BY score DESC LIMIT 1 ' - ).format(table_name=self.tableName) - sql_pop_to_temp = ( - 'UPDATE {table_name} SET temporary = 1 ' - 'WHERE id = %s AND temporary = 0 ' - ).format(table_name=self.tableName) - sql_pop_del = ( - 'DELETE FROM {table_name} ' - 'WHERE id = %s AND temporary = 0 ' - ).format(table_name=self.tableName) + def _pop(self, timeout=None, protective=False, mode="first"): + sql_pop_get_first = ("SELECT id, item, score FROM {table_name} " "WHERE temporary = 0 " "ORDER BY score LIMIT 1 ").format(table_name=self.tableName) + sql_pop_get_last = ("SELECT id, item, score FROM {table_name} " "WHERE temporary = 0 " "ORDER 
BY score DESC LIMIT 1 ").format(table_name=self.tableName) + sql_pop_to_temp = ("UPDATE {table_name} SET temporary = 1 " "WHERE id = %s AND temporary = 0 ").format(table_name=self.tableName) + sql_pop_del = ("DELETE FROM {table_name} " "WHERE id = %s AND temporary = 0 ").format(table_name=self.tableName) mode_sql_map = { - 'first': sql_pop_get_first, - 'last': sql_pop_get_last, - } + "first": sql_pop_get_first, + "last": sql_pop_get_last, + } sql_pop_get = mode_sql_map[mode] keep_polling = True got_object = False @@ -250,41 +225,31 @@ def _pop(self, timeout=None, protective=False, mode='first'): raise _exc tries += 1 time.sleep(wait) - wait = min(max_wait, tries/10.0 + wait) + wait = min(max_wait, tries / 10.0 + wait) return None - def _peek(self, mode='first', id=None, skip_item=False): + def _peek(self, mode="first", id=None, skip_item=False): if skip_item: - columns_str = 'id, score' + columns_str = "id, score" else: - columns_str = 'id, item, score' - sql_peek_first = ( - 'SELECT {columns} FROM {table_name} ' - 'WHERE temporary = 0 ' - 'ORDER BY score LIMIT 1 ' - ).format(columns=columns_str, table_name=self.tableName) - sql_peek_last = ( - 'SELECT {columns} FROM {table_name} ' - 'WHERE temporary = 0 ' - 'ORDER BY score DESC LIMIT 1 ' - ).format(columns=columns_str, table_name=self.tableName) - sql_peek_by_id = ( - 'SELECT {columns} FROM {table_name} ' - 'WHERE id = %s AND temporary = 0 ' - ).format(columns=columns_str, table_name=self.tableName) - sql_peek_by_id_temp = ( - 'SELECT {columns} FROM {table_name} ' - 'WHERE id = %s AND temporary = 1 ' - ).format(columns=columns_str, table_name=self.tableName) + columns_str = "id, item, score" + sql_peek_first = ("SELECT {columns} FROM {table_name} " "WHERE temporary = 0 " "ORDER BY score LIMIT 1 ").format( + columns=columns_str, table_name=self.tableName + ) + sql_peek_last = ("SELECT {columns} FROM {table_name} " "WHERE temporary = 0 " "ORDER BY score DESC LIMIT 1 ").format( + columns=columns_str, table_name=self.tableName + ) + sql_peek_by_id = ("SELECT {columns} FROM {table_name} " "WHERE id = %s AND temporary = 0 ").format(columns=columns_str, table_name=self.tableName) + sql_peek_by_id_temp = ("SELECT {columns} FROM {table_name} " "WHERE id = %s AND temporary = 1 ").format(columns=columns_str, table_name=self.tableName) mode_sql_map = { - 'first': sql_peek_first, - 'last': sql_peek_last, - 'id': sql_peek_by_id, - 'idtemp': sql_peek_by_id_temp, - } + "first": sql_peek_first, + "last": sql_peek_last, + "id": sql_peek_by_id, + "idtemp": sql_peek_by_id_temp, + } sql_peek = mode_sql_map[mode] try: - if mode in ('id', 'idtemp'): + if mode in ("id", "idtemp"): params = (id,) self.execute(sql_peek, params) else: @@ -306,32 +271,29 @@ def _peek(self, mode='first', id=None, skip_item=False): def _update(self, id, item=None, score=None, temporary=None, cond_score=None): cond_score_str_map = { - 'gt': 'AND score < %s', - 'ge': 'AND score <= %s', - 'lt': 'AND score > %s', - 'le': 'AND score >= %s', - } - cond_score_str = cond_score_str_map.get(cond_score, '') + "gt": "AND score < %s", + "ge": "AND score <= %s", + "lt": "AND score > %s", + "le": "AND score >= %s", + } + cond_score_str = cond_score_str_map.get(cond_score, "") attr_set_list = [] params = [] if item is not None: - attr_set_list.append('item = %s') + attr_set_list.append("item = %s") params.append(item) if score is not None: - attr_set_list.append('score = %s') + attr_set_list.append("score = %s") params.append(score) if temporary is not None: - attr_set_list.append('temporary = %s') + 
attr_set_list.append("temporary = %s") params.append(temporary) - attr_set_str = ' , '.join(attr_set_list) + attr_set_str = " , ".join(attr_set_list) if not attr_set_str: return False - sql_update = ( - 'UPDATE IGNORE {table_name} SET ' - '{attr_set_str} ' - 'WHERE id = %s ' - '{cond_score_str} ' - ).format(table_name=self.tableName, attr_set_str=attr_set_str, cond_score_str=cond_score_str) + sql_update = ("UPDATE IGNORE {table_name} SET " "{attr_set_str} " "WHERE id = %s " "{cond_score_str} ").format( + table_name=self.tableName, attr_set_str=attr_set_str, cond_score_str=cond_score_str + ) params.append(id) if cond_score_str: params.append(score) @@ -348,9 +310,7 @@ def _update(self, id, item=None, score=None, temporary=None, cond_score=None): # number of objects in queue def size(self): - sql_size = ( - 'SELECT COUNT(id) FROM {table_name}' - ).format(table_name=self.tableName) + sql_size = ("SELECT COUNT(id) FROM {table_name}").format(table_name=self.tableName) try: self.execute(sql_size) res = self.cur.fetchall() @@ -387,44 +347,38 @@ def get(self, timeout=None, protective=False): # dequeue the last object def getlast(self, timeout=None, protective=False): - return self._pop(timeout=timeout, protective=protective, mode='last') + return self._pop(timeout=timeout, protective=protective, mode="last") # dequeue list of objects with some conditions - def getmany(self, mode='first', minscore=None, maxscore=None, count=None, - protective=False, temporary=False): - temporary_str = 'temporary = 1' if temporary else 'temporary = 0' - minscore_str = '' if minscore is None else 'AND score >= {0}'.format(float(minscore)) - maxscore_str = '' if maxscore is None else 'AND score <= {0}'.format(float(maxscore)) - count_str = '' if count is None else 'LIMIT {0}'.format(int(count)) + def getmany(self, mode="first", minscore=None, maxscore=None, count=None, protective=False, temporary=False): + temporary_str = "temporary = 1" if temporary else "temporary = 0" + minscore_str = "" if minscore is None else "AND score >= {0}".format(float(minscore)) + maxscore_str = "" if maxscore is None else "AND score <= {0}".format(float(maxscore)) + count_str = "" if count is None else "LIMIT {0}".format(int(count)) mode_rank_map = { - 'first': '', - 'last': 'DESC', - } + "first": "", + "last": "DESC", + } sql_get_many = ( - 'SELECT id, item, score FROM {table_name} ' - 'WHERE ' - '{temporary_str} ' - '{minscore_str} ' - '{maxscore_str} ' - 'ORDER BY score {rank} ' - '{count_str} ' - ).format(table_name=self.tableName, temporary_str=temporary_str, - minscore_str=minscore_str, maxscore_str=maxscore_str, - rank=mode_rank_map[mode], count_str=count_str) - sql_pop_to_temp = ( - 'UPDATE {table_name} SET temporary = 1 ' - 'WHERE id = %s AND temporary = 0 ' - ).format(table_name=self.tableName) - sql_pop_del = ( - 'DELETE FROM {table_name} ' - 'WHERE id = %s AND temporary = {temporary} ' - ).format(table_name=self.tableName, temporary=(1 if temporary else 0)) + "SELECT id, item, score FROM {table_name} " "WHERE " "{temporary_str} " "{minscore_str} " "{maxscore_str} " "ORDER BY score {rank} " "{count_str} " + ).format( + table_name=self.tableName, + temporary_str=temporary_str, + minscore_str=minscore_str, + maxscore_str=maxscore_str, + rank=mode_rank_map[mode], + count_str=count_str, + ) + sql_pop_to_temp = ("UPDATE {table_name} SET temporary = 1 " "WHERE id = %s AND temporary = 0 ").format(table_name=self.tableName) + sql_pop_del = ("DELETE FROM {table_name} " "WHERE id = %s AND temporary = {temporary} ").format( + 
table_name=self.tableName, temporary=(1 if temporary else 0) + ) ret_list = [] try: self.execute(sql_get_many) res = self.cur.fetchall() for _rec in res: - got_object =False + got_object = False id, item, score = _rec params = (id,) if protective: @@ -448,38 +402,33 @@ def peek(self, skip_item=False): # get tuple of (id, item, score) of the last object without dequeuing it def peeklast(self, skip_item=False): - return self._peek(mode='last', skip_item=skip_item) + return self._peek(mode="last", skip_item=skip_item) # get tuple of (id, item, score) of object by id without dequeuing it def peekbyid(self, id, temporary=False, skip_item=False): if temporary: - return self._peek(mode='idtemp', id=id, skip_item=skip_item) + return self._peek(mode="idtemp", id=id, skip_item=skip_item) else: - return self._peek(mode='id', id=id, skip_item=skip_item) + return self._peek(mode="id", id=id, skip_item=skip_item) # get list of object tuples without dequeuing it - def peekmany(self, mode='first', minscore=None, maxscore=None, count=None, skip_item=False): - minscore_str = '' if minscore is None else 'AND score >= {0}'.format(float(minscore)) - maxscore_str = '' if maxscore is None else 'AND score <= {0}'.format(float(maxscore)) - count_str = '' if count is None else 'LIMIT {0}'.format(int(count)) + def peekmany(self, mode="first", minscore=None, maxscore=None, count=None, skip_item=False): + minscore_str = "" if minscore is None else "AND score >= {0}".format(float(minscore)) + maxscore_str = "" if maxscore is None else "AND score <= {0}".format(float(maxscore)) + count_str = "" if count is None else "LIMIT {0}".format(int(count)) mode_rank_map = { - 'first': '', - 'last': 'DESC', - } + "first": "", + "last": "DESC", + } if skip_item: - columns_str = 'id, score' + columns_str = "id, score" else: - columns_str = 'id, item, score' + columns_str = "id, item, score" sql_peek_many = ( - 'SELECT {columns} FROM {table_name} ' - 'WHERE temporary = 0 ' - '{minscore_str} ' - '{maxscore_str} ' - 'ORDER BY score {rank} ' - '{count_str} ' - ).format(columns=columns_str, table_name=self.tableName, - minscore_str=minscore_str, maxscore_str=maxscore_str, - rank=mode_rank_map[mode], count_str=count_str) + "SELECT {columns} FROM {table_name} " "WHERE temporary = 0 " "{minscore_str} " "{maxscore_str} " "ORDER BY score {rank} " "{count_str} " + ).format( + columns=columns_str, table_name=self.tableName, minscore_str=minscore_str, maxscore_str=maxscore_str, rank=mode_rank_map[mode], count_str=count_str + ) try: self.execute(sql_peek_many) res = self.cur.fetchall() @@ -499,12 +448,8 @@ def peekmany(self, mode='first', minscore=None, maxscore=None, count=None, skip_ # drop all objects in queue and index and reset the table def clear(self): - sql_clear_index = ( - 'DROP INDEX IF EXISTS score_index ON {table_name} ' - ).format(table_name=self.tableName) - sql_clear_table = ( - 'DROP TABLE IF EXISTS {table_name} ' - ).format(table_name=self.tableName) + sql_clear_index = ("DROP INDEX IF EXISTS score_index ON {table_name} ").format(table_name=self.tableName) + sql_clear_table = ("DROP TABLE IF EXISTS {table_name} ").format(table_name=self.tableName) # self.execute(sql_clear_index) try: self.execute(sql_clear_table) @@ -515,11 +460,10 @@ def clear(self): # delete objects by list of id def delete(self, ids): - sql_delete_template = 'DELETE FROM {table_name} WHERE id in ({placeholders} ) ' + sql_delete_template = "DELETE FROM {table_name} WHERE id in ({placeholders} ) " if isinstance(ids, (list, tuple)): - placeholders_str = 
','.join([' %s'] * len(ids)) - sql_delete = sql_delete_template.format( - table_name=self.tableName, placeholders=placeholders_str) + placeholders_str = ",".join([" %s"] * len(ids)) + sql_delete = sql_delete_template.format(table_name=self.tableName, placeholders=placeholders_str) try: self.execute(sql_delete, ids) n_row = self.cur.rowcount @@ -529,22 +473,19 @@ def delete(self, ids): raise _e return n_row else: - raise TypeError('ids should be list or tuple') + raise TypeError("ids should be list or tuple") # Move objects in temporary space to the queue def restore(self, ids): if ids is None: - sql_restore = ( - 'UPDATE {table_name} SET temporary = 0 WHERE temporary != 0 ' - ).format(table_name=self.tableName) + sql_restore = ("UPDATE {table_name} SET temporary = 0 WHERE temporary != 0 ").format(table_name=self.tableName) elif isinstance(ids, (list, tuple)): - placeholders_str = ','.join([' %s'] * len(ids)) - sql_restore = ( - 'UPDATE {table_name} SET temporary = 0 ' - 'WHERE temporary != 0 AND id in ({placeholders} ) ' - ).format(table_name=self.tableName, placeholders=placeholders_str) + placeholders_str = ",".join([" %s"] * len(ids)) + sql_restore = ("UPDATE {table_name} SET temporary = 0 " "WHERE temporary != 0 AND id in ({placeholders} ) ").format( + table_name=self.tableName, placeholders=placeholders_str + ) else: - raise TypeError('ids should be list or tuple or None') + raise TypeError("ids should be list or tuple or None") try: self.execute(sql_restore) self.commit() diff --git a/pandaharvester/harvesterfifo/redis_fifo.py b/pandaharvester/harvesterfifo/redis_fifo.py index fe949909..ec35997d 100644 --- a/pandaharvester/harvesterfifo/redis_fifo.py +++ b/pandaharvester/harvesterfifo/redis_fifo.py @@ -18,44 +18,44 @@ class RedisFifo(PluginBase): def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) _redis_conn_opt_dict = {} - if hasattr(self, 'redisHost'): - _redis_conn_opt_dict['host'] = self.redisHost - elif hasattr(harvester_config.fifo, 'redisHost'): - _redis_conn_opt_dict['host'] = harvester_config.fifo.redisHost - if hasattr(self, 'redisPort'): - _redis_conn_opt_dict['port'] = self.redisPort - elif hasattr(harvester_config.fifo, 'redisPort'): - _redis_conn_opt_dict['port'] = harvester_config.fifo.redisPort - if hasattr(self, 'redisDB'): - _redis_conn_opt_dict['db'] = self.redisDB - elif hasattr(harvester_config.fifo, 'redisDB'): - _redis_conn_opt_dict['db'] = harvester_config.fifo.redisDB - if hasattr(self, 'redisPassword'): - _redis_conn_opt_dict['password'] = self.redisPassword - elif hasattr(harvester_config.fifo, 'redisPassword'): - _redis_conn_opt_dict['password'] = harvester_config.fifo.redisPassword + if hasattr(self, "redisHost"): + _redis_conn_opt_dict["host"] = self.redisHost + elif hasattr(harvester_config.fifo, "redisHost"): + _redis_conn_opt_dict["host"] = harvester_config.fifo.redisHost + if hasattr(self, "redisPort"): + _redis_conn_opt_dict["port"] = self.redisPort + elif hasattr(harvester_config.fifo, "redisPort"): + _redis_conn_opt_dict["port"] = harvester_config.fifo.redisPort + if hasattr(self, "redisDB"): + _redis_conn_opt_dict["db"] = self.redisDB + elif hasattr(harvester_config.fifo, "redisDB"): + _redis_conn_opt_dict["db"] = harvester_config.fifo.redisDB + if hasattr(self, "redisPassword"): + _redis_conn_opt_dict["password"] = self.redisPassword + elif hasattr(harvester_config.fifo, "redisPassword"): + _redis_conn_opt_dict["password"] = harvester_config.fifo.redisPassword self.qconn = redis.StrictRedis(**_redis_conn_opt_dict) - self.id_score = 
'{0}-fifo_id-score'.format(self.titleName) - self.id_item = '{0}-fifo_id-item'.format(self.titleName) - self.id_temp = '{0}-fifo_id-temp'.format(self.titleName) + self.id_score = "{0}-fifo_id-score".format(self.titleName) + self.id_item = "{0}-fifo_id-item".format(self.titleName) + self.id_temp = "{0}-fifo_id-temp".format(self.titleName) def __len__(self): return self.qconn.zcard(self.id_score) - def _peek(self, mode='first', id=None, skip_item=False): - if mode == 'first': + def _peek(self, mode="first", id=None, skip_item=False): + if mode == "first": try: id_gotten, score = self.qconn.zrange(self.id_score, 0, 0, withscores=True)[0] except IndexError: return None - elif mode == 'last': + elif mode == "last": try: id_gotten, score = self.qconn.zrevrange(self.id_score, 0, 0, withscores=True)[0] except IndexError: return None else: resVal = self.qconn.sismember(self.id_temp, id) - if (mode == 'id' and not resVal) or (mode == 'idtemp' and resVal): + if (mode == "id" and not resVal) or (mode == "idtemp" and resVal): id_gotten = id score = self.qconn.zscore(self.id_score, id) else: @@ -69,7 +69,7 @@ def _peek(self, mode='first', id=None, skip_item=False): else: return (id_gotten, item, score) - def _pop(self, timeout=None, protective=False, mode='first'): + def _pop(self, timeout=None, protective=False, mode="first"): keep_polling = True wait = 0.1 max_wait = 2 @@ -80,7 +80,7 @@ def _pop(self, timeout=None, protective=False, mode='first'): peeked_tuple = self._peek(mode=mode) if peeked_tuple is None: time.sleep(wait) - wait = min(max_wait, tries/10.0 + wait) + wait = min(max_wait, tries / 10.0 + wait) else: id, item, score = peeked_tuple while True: @@ -123,7 +123,7 @@ def put(self, item, score): try: pipeline.watch(self.id_score, self.id_item) pipeline.multi() - pipeline.execute_command('ZADD', self.id_score, 'NX', score, id) + pipeline.execute_command("ZADD", self.id_score, "NX", score, id) pipeline.hsetnx(self.id_item, id, item) resVal = pipeline.execute() except redis.WatchError: @@ -134,7 +134,7 @@ def put(self, item, score): if resVal[-2] == 1 and resVal[-1] == 1: return True if time.time() > generate_id_attempt_timestamp + 60: - raise Exception('Cannot generate unique id') + raise Exception("Cannot generate unique id") return False time.sleep(0.0001) return False @@ -146,7 +146,7 @@ def putbyid(self, id, item, score): try: pipeline.watch(self.id_score, self.id_item) pipeline.multi() - pipeline.execute_command('ZADD', self.id_score, 'NX', score, id) + pipeline.execute_command("ZADD", self.id_score, "NX", score, id) pipeline.hsetnx(self.id_item, id, item) resVal = pipeline.execute() except redis.WatchError: @@ -160,11 +160,11 @@ def putbyid(self, id, item, score): # dequeue the first object def get(self, timeout=None, protective=False): - return self._pop(timeout=timeout, protective=protective, mode='first') + return self._pop(timeout=timeout, protective=protective, mode="first") # dequeue the last object def getlast(self, timeout=None, protective=False): - return self._pop(timeout=timeout, protective=protective, mode='last') + return self._pop(timeout=timeout, protective=protective, mode="last") # get tuple of (id, item, score) of the first object without dequeuing it def peek(self, skip_item=False): @@ -172,14 +172,14 @@ def peek(self, skip_item=False): # get tuple of (id, item, score) of the last object without dequeuing it def peeklast(self, skip_item=False): - return self._peek(mode='last', skip_item=skip_item) + return self._peek(mode="last", skip_item=skip_item) # get tuple of (id, 
item, score) of object by id without dequeuing it def peekbyid(self, id, temporary=False, skip_item=False): if temporary: - return self._peek(mode='idtemp', id=id, skip_item=skip_item) + return self._peek(mode="idtemp", id=id, skip_item=skip_item) else: - return self._peek(mode='id', id=id, skip_item=skip_item) + return self._peek(mode="id", id=id, skip_item=skip_item) # drop all objects in queue def clear(self): @@ -215,7 +215,7 @@ def delete(self, ids): n_row = resVal[-1] return n_row else: - raise TypeError('ids should be list or tuple') + raise TypeError("ids should be list or tuple") # Move objects in temporary space to the queue def restore(self, ids): @@ -234,7 +234,7 @@ def restore(self, ids): pipeline.srem(self.id_temp, *ids) pipeline.execute() else: - raise TypeError('ids should be list or tuple or None') + raise TypeError("ids should be list or tuple or None") except redis.WatchError: continue else: diff --git a/pandaharvester/harvesterfifo/sqlite_fifo.py b/pandaharvester/harvesterfifo/sqlite_fifo.py index b550f205..d0299dc4 100644 --- a/pandaharvester/harvesterfifo/sqlite_fifo.py +++ b/pandaharvester/harvesterfifo/sqlite_fifo.py @@ -19,78 +19,41 @@ class SqliteFifo(PluginBase): - # template of SQL commands - _create_sql = ( - 'CREATE TABLE IF NOT EXISTS queue_table ' - '(' - ' id INTEGER PRIMARY KEY,' - ' item BLOB,' - ' score REAL,' - ' temporary INTEGER DEFAULT 0 ' - ')' - ) - _create_index_sql = ( - 'CREATE INDEX IF NOT EXISTS score_index ON queue_table ' - '(score)' - ) - _count_sql = 'SELECT COUNT(id) FROM queue_table' - _iterate_sql = 'SELECT id, item, score FROM queue_table' - _write_lock_sql = 'BEGIN IMMEDIATE' - _exclusive_lock_sql = 'BEGIN EXCLUSIVE' - _push_sql = 'INSERT INTO queue_table (item,score) VALUES (?,?)' - _push_by_id_sql = 'INSERT OR IGNORE INTO queue_table (id,item,score) VALUES (?,?,?)' - _lpop_get_sql_template = ( - 'SELECT {columns} FROM queue_table ' - 'WHERE temporary = 0 ' - 'ORDER BY score LIMIT 1' - ) - _rpop_get_sql_template = ( - 'SELECT {columns} FROM queue_table ' - 'WHERE temporary = 0 ' - 'ORDER BY score DESC LIMIT 1' - ) - _get_by_id_sql_template = ( - 'SELECT {columns} FROM queue_table ' - 'WHERE id = ? ' - 'AND temporary = {temp}' - ) + _create_sql = "CREATE TABLE IF NOT EXISTS queue_table " "(" " id INTEGER PRIMARY KEY," " item BLOB," " score REAL," " temporary INTEGER DEFAULT 0 " ")" + _create_index_sql = "CREATE INDEX IF NOT EXISTS score_index ON queue_table " "(score)" + _count_sql = "SELECT COUNT(id) FROM queue_table" + _iterate_sql = "SELECT id, item, score FROM queue_table" + _write_lock_sql = "BEGIN IMMEDIATE" + _exclusive_lock_sql = "BEGIN EXCLUSIVE" + _push_sql = "INSERT INTO queue_table (item,score) VALUES (?,?)" + _push_by_id_sql = "INSERT OR IGNORE INTO queue_table (id,item,score) VALUES (?,?,?)" + _lpop_get_sql_template = "SELECT {columns} FROM queue_table " "WHERE temporary = 0 " "ORDER BY score LIMIT 1" + _rpop_get_sql_template = "SELECT {columns} FROM queue_table " "WHERE temporary = 0 " "ORDER BY score DESC LIMIT 1" + _get_by_id_sql_template = "SELECT {columns} FROM queue_table " "WHERE id = ? " "AND temporary = {temp}" _get_many_template = ( - 'SELECT id, item, score FROM queue_table ' - 'WHERE ' - '{temporary_str} ' - '{minscore_str} ' - '{maxscore_str} ' - 'ORDER BY score {rank} ' - '{count_str} ' - ) - _pop_del_sql = 'DELETE FROM queue_table WHERE id = ?' - _move_to_temp_sql = 'UPDATE queue_table SET temporary = 1 WHERE id = ?' 
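# Illustrative sketch, not part of the patch: these SQL templates implement a
# score-ordered FIFO with a "temporary" flag. put() inserts an item with a
# score, get() pops the lowest-score item first, and a protective get() parks
# the row with temporary = 1 until it is acknowledged via delete() or pushed
# back via restore(). A minimal usage sketch, assuming `fifo` is an already
# configured SqliteFifo (or MysqlFifo/RedisFifo) plugin instance:
#
#     fifo.put(b"payload-a", 10.0)
#     fifo.put(b"payload-b", 5.0)
#     obj_id, item, score = fifo.get(timeout=1, protective=True)  # lowest score first
#     fifo.delete([obj_id])     # acknowledge the popped item
#     # fifo.restore([obj_id])  # or return it to the queue instead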
- _move_many_to_temp_sql_template = 'UPDATE queue_table SET temporary = 1 WHERE id in ({0})' - _del_sql_template = 'DELETE FROM queue_table WHERE id in ({0})' - _clear_delete_table_sql = 'DELETE FROM queue_table' - _clear_drop_table_sql = 'DROP TABLE IF EXISTS queue_table' + "SELECT id, item, score FROM queue_table " "WHERE " "{temporary_str} " "{minscore_str} " "{maxscore_str} " "ORDER BY score {rank} " "{count_str} " + ) + _pop_del_sql = "DELETE FROM queue_table WHERE id = ?" + _move_to_temp_sql = "UPDATE queue_table SET temporary = 1 WHERE id = ?" + _move_many_to_temp_sql_template = "UPDATE queue_table SET temporary = 1 WHERE id in ({0})" + _del_sql_template = "DELETE FROM queue_table WHERE id in ({0})" + _clear_delete_table_sql = "DELETE FROM queue_table" + _clear_drop_table_sql = "DROP TABLE IF EXISTS queue_table" _clear_zero_id_sql = 'DELETE FROM sqlite_sequence WHERE name = "queue_table"' - _peek_sql = ( - 'SELECT id, item, score FROM queue_table ' - 'WHERE temporary = 0 ' - 'ORDER BY score LIMIT 1' - ) - _restore_sql = 'UPDATE queue_table SET temporary = 0 WHERE temporary != 0' - _restore_sql_template = ( - 'UPDATE queue_table SET temporary = 0 ' - 'WHERE temporary != 0 AND id in ({0})' - ) + _peek_sql = "SELECT id, item, score FROM queue_table " "WHERE temporary = 0 " "ORDER BY score LIMIT 1" + _restore_sql = "UPDATE queue_table SET temporary = 0 WHERE temporary != 0" + _restore_sql_template = "UPDATE queue_table SET temporary = 0 " "WHERE temporary != 0 AND id in ({0})" # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - if hasattr(self, 'database_filename'): + if hasattr(self, "database_filename"): _db_filename = self.database_filename else: _db_filename = harvester_config.fifo.database_filename - _db_filename = re.sub('\$\(TITLE\)', self.titleName, _db_filename) - _db_filename = re.sub('\$\(AGENT\)', self.titleName, _db_filename) + _db_filename = re.sub("\$\(TITLE\)", self.titleName, _db_filename) + _db_filename = re.sub("\$\(AGENT\)", self.titleName, _db_filename) self.db_path = os.path.abspath(_db_filename) self._connection_cache = {} with self._get_conn() as conn: @@ -138,7 +101,7 @@ def _pop(self, get_sql, timeout=None, protective=False): continue tries += 1 time.sleep(wait) - wait = min(max_wait, tries/10.0 + wait) + wait = min(max_wait, tries / 10.0 + wait) if id is not None: if protective: conn.execute(self._move_to_temp_sql, (id,)) @@ -149,10 +112,10 @@ def _pop(self, get_sql, timeout=None, protective=False): return None def _peek(self, peek_sql_template, skip_item=False, id=None, temporary=False): - columns = 'id, item, score' + columns = "id, item, score" temp = 0 if skip_item: - columns = 'id, score' + columns = "id, score" if temporary: temp = 1 peek_sql = peek_sql_template.format(columns=columns, temp=temp) @@ -182,7 +145,7 @@ def put(self, item, score): with self._get_conn() as conn: conn.execute(self._write_lock_sql) cursor = conn.execute(self._push_sql, (item_buf, score)) - n_row = cursor.rowcount + n_row = cursor.rowcount if n_row == 1: retVal = True return retVal @@ -193,35 +156,34 @@ def putbyid(self, id, item, score): item_buf = memoryviewOrBuffer(item) with self._get_conn() as conn: cursor = conn.execute(self._push_by_id_sql, (id, item_buf, score)) - n_row = cursor.rowcount + n_row = cursor.rowcount if n_row == 1: retVal = True return retVal # dequeue the first object def get(self, timeout=None, protective=False): - sql_str = self._lpop_get_sql_template.format(columns='id, item, score') + sql_str = 
self._lpop_get_sql_template.format(columns="id, item, score") return self._pop(get_sql=sql_str, timeout=timeout, protective=protective) # dequeue the last object def getlast(self, timeout=None, protective=False): - sql_str = self._rpop_get_sql_template.format(columns='id, item, score') + sql_str = self._rpop_get_sql_template.format(columns="id, item, score") return self._pop(get_sql=sql_str, timeout=timeout, protective=protective) # dequeue list of objects with some conditions - def getmany(self, mode='first', minscore=None, maxscore=None, count=None, - protective=False, temporary=False): - temporary_str = 'temporary = 1' if temporary else 'temporary = 0' - minscore_str = '' if minscore is None else 'AND score >= {0}'.format(float(minscore)) - maxscore_str = '' if maxscore is None else 'AND score <= {0}'.format(float(maxscore)) - count_str = '' if count is None else 'LIMIT {0}'.format(int(count)) + def getmany(self, mode="first", minscore=None, maxscore=None, count=None, protective=False, temporary=False): + temporary_str = "temporary = 1" if temporary else "temporary = 0" + minscore_str = "" if minscore is None else "AND score >= {0}".format(float(minscore)) + maxscore_str = "" if maxscore is None else "AND score <= {0}".format(float(maxscore)) + count_str = "" if count is None else "LIMIT {0}".format(int(count)) mode_rank_map = { - 'first': '', - 'last': 'DESC', - } - get_many_sql = self._get_many_template.format(temporary_str=temporary_str, - minscore_str=minscore_str, maxscore_str=maxscore_str, - rank=mode_rank_map[mode], count_str=count_str) + "first": "", + "last": "DESC", + } + get_many_sql = self._get_many_template.format( + temporary_str=temporary_str, minscore_str=minscore_str, maxscore_str=maxscore_str, rank=mode_rank_map[mode], count_str=count_str + ) ret_list = [] with self._get_conn() as conn: conn.execute(self._write_lock_sql) @@ -230,7 +192,7 @@ def getmany(self, mode='first', minscore=None, maxscore=None, count=None, for id, item_buf, score in cursor: ret_list.append((id, bytes(item_buf), score)) ids.append(id) - placeholders_str = ','.join('?' * len(ids)) + placeholders_str = ",".join("?" * len(ids)) if protective: conn.execute(self._move_many_to_temp_sql_template.format(placeholders_str), ids) else: @@ -265,7 +227,7 @@ def clear(self): # delete objects by list of id def delete(self, ids): if isinstance(ids, (list, tuple)): - placeholders_str = ','.join('?' * len(ids)) + placeholders_str = ",".join("?" * len(ids)) with self._get_conn() as conn: conn.execute(self._exclusive_lock_sql) cursor = conn.execute(self._del_sql_template.format(placeholders_str), ids) @@ -273,7 +235,7 @@ def delete(self, ids): conn.commit() return n_row else: - raise TypeError('ids should be list or tuple') + raise TypeError("ids should be list or tuple") # Move objects in temporary space to the queue def restore(self, ids): @@ -282,48 +244,45 @@ def restore(self, ids): if ids is None: conn.execute(self._restore_sql) elif isinstance(ids, (list, tuple)): - placeholders_str = ','.join('?' * len(ids)) + placeholders_str = ",".join("?" 
* len(ids)) conn.execute(self._restore_sql_template.format(placeholders_str), ids) else: - raise TypeError('ids should be list or tuple or None') + raise TypeError("ids should be list or tuple or None") # update a object by its id with some conditions def update(self, id, item=None, score=None, temporary=None, cond_score=None): cond_score_str_map = { - 'gt': 'AND score < ?', - 'ge': 'AND score <= ?', - 'lt': 'AND score > ?', - 'le': 'AND score >= ?', - } - cond_score_str = cond_score_str_map.get(cond_score, '') + "gt": "AND score < ?", + "ge": "AND score <= ?", + "lt": "AND score > ?", + "le": "AND score >= ?", + } + cond_score_str = cond_score_str_map.get(cond_score, "") attr_set_list = [] params = [] if item is not None: item_buf = memoryviewOrBuffer(item) - attr_set_list.append('item = ?') + attr_set_list.append("item = ?") params.append(item_buf) if score is not None: - attr_set_list.append('score = ?') + attr_set_list.append("score = ?") params.append(score) if temporary is not None: - attr_set_list.append('temporary = ?') + attr_set_list.append("temporary = ?") params.append(temporary) - attr_set_str = ' , '.join(attr_set_list) + attr_set_str = " , ".join(attr_set_list) if not attr_set_str: return False - sql_update = ( - 'UPDATE OR IGNORE queue_table SET ' - '{attr_set_str} ' - 'WHERE id = ? ' - '{cond_score_str} ' - ).format(attr_set_str=attr_set_str, id=id, cond_score_str=cond_score_str) + sql_update = ("UPDATE OR IGNORE queue_table SET " "{attr_set_str} " "WHERE id = ? " "{cond_score_str} ").format( + attr_set_str=attr_set_str, id=id, cond_score_str=cond_score_str + ) params.append(id) if cond_score_str: params.append(score) retVal = False with self._get_conn() as conn: cursor = conn.execute(sql_update, params) - n_row = cursor.rowcount + n_row = cursor.rowcount if n_row >= 1: retVal = True return retVal diff --git a/pandaharvester/harvesterfilesyncer/base_file_syncer.py b/pandaharvester/harvesterfilesyncer/base_file_syncer.py index 9e07df65..7b494e88 100644 --- a/pandaharvester/harvesterfilesyncer/base_file_syncer.py +++ b/pandaharvester/harvesterfilesyncer/base_file_syncer.py @@ -3,7 +3,6 @@ # base file syncer class BaseFileSyncer(PluginBase): - # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) @@ -14,4 +13,4 @@ def check(self): # update files def update(self): - return True, '' + return True, "" diff --git a/pandaharvester/harvesterfilesyncer/git_file_syncer.py b/pandaharvester/harvesterfilesyncer/git_file_syncer.py index 1623de78..e394df50 100644 --- a/pandaharvester/harvesterfilesyncer/git_file_syncer.py +++ b/pandaharvester/harvesterfilesyncer/git_file_syncer.py @@ -5,16 +5,12 @@ from pandaharvester.harvestercore import core_utils # logger -_logger = core_utils.setup_logger('git_file_syncer') +_logger = core_utils.setup_logger("git_file_syncer") # run command def run_command(command_str, cwd=None): - p = subprocess.Popen(command_str.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=cwd) + p = subprocess.Popen(command_str.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) std_out, std_err = p.communicate() ret_code = p.returncode return ret_code, std_out, std_err @@ -26,82 +22,97 @@ class GitFileSyncer(BaseFileSyncer): def __init__(self, **kwarg): BaseFileSyncer.__init__(self, **kwarg) # make logger - main_log = self.make_logger(_logger, method_name='__init__') + main_log = self.make_logger(_logger, method_name="__init__") # set up with direct attributes self.setupMap = dict(vars(self)) # 
setupMap # self.checkPeriod = self.setupMap.get('checkPeriod', 1) # self.lifetime = self.setupMap.get('lifetime', 96) - self.targetDir = self.setupMap.get('targetDir') - self.sourceURL = self.setupMap.get('sourceURL') - self.sourceBranch = self.setupMap.get('sourceBranch', 'master') - self.sourceRemoteName = self.setupMap.get('sourceRemoteName', 'origin') - self.sourceSubdir = self.setupMap.get('sourceSubdir', '') + self.targetDir = self.setupMap.get("targetDir") + self.sourceURL = self.setupMap.get("sourceURL") + self.sourceBranch = self.setupMap.get("sourceBranch", "master") + self.sourceRemoteName = self.setupMap.get("sourceRemoteName", "origin") + self.sourceSubdir = self.setupMap.get("sourceSubdir", "") # update def update(self): # make logger - main_log = self.make_logger(_logger, method_name='update') - main_log.info('start') + main_log = self.make_logger(_logger, method_name="update") + main_log.info("start") # initialize err_msg_list = [] ret_val = False # execute command and store result in result list + def execute_command(command, **kwargs): ret_code, std_out, std_err = run_command(command, **kwargs) if ret_code != 0: - main_log.error('command: {} ; kwargs: {} ; ret_code={} ; stdout: {} ; stderr: {}'.format( - command, str(kwargs), ret_code, std_out, std_err)) + main_log.error("command: {} ; kwargs: {} ; ret_code={} ; stdout: {} ; stderr: {}".format(command, str(kwargs), ret_code, std_out, std_err)) err_msg_list.append(std_err) else: - main_log.debug('command: {} ; kwargs: {} ; ret_code={} ; stdout: {} ; stderr: {}'.format( - command, str(kwargs), ret_code, std_out, std_err)) + main_log.debug("command: {} ; kwargs: {} ; ret_code={} ; stdout: {} ; stderr: {}".format(command, str(kwargs), ret_code, std_out, std_err)) return ret_code, std_out, std_err + # run try: # assure the local target directory target_dir_path = pathlib.Path(self.targetDir) target_dir_path.mkdir(mode=0o755, parents=True, exist_ok=True) - main_log.debug('assure local target directory {}'.format(str(target_dir_path))) + main_log.debug("assure local target directory {}".format(str(target_dir_path))) # git init - execute_command('git init -q', cwd=target_dir_path) + execute_command("git init -q", cwd=target_dir_path) # git remote - ret_code, std_out, std_err = execute_command('git remote set-url {name} {url}'.format( - name=self.sourceRemoteName, - url=self.sourceURL, - ), cwd=target_dir_path) + ret_code, std_out, std_err = execute_command( + "git remote set-url {name} {url}".format( + name=self.sourceRemoteName, + url=self.sourceURL, + ), + cwd=target_dir_path, + ) if ret_code == 128: - execute_command('git remote add -f -t {branch} {name} {url}'.format( - branch=self.sourceBranch, - name=self.sourceRemoteName, - url=self.sourceURL, - ), cwd=target_dir_path) + execute_command( + "git remote add -f -t {branch} {name} {url}".format( + branch=self.sourceBranch, + name=self.sourceRemoteName, + url=self.sourceURL, + ), + cwd=target_dir_path, + ) else: - execute_command('git remote set-branches {name} {branch}'.format( - branch=self.sourceBranch, - name=self.sourceRemoteName, - ), cwd=target_dir_path) + execute_command( + "git remote set-branches {name} {branch}".format( + branch=self.sourceBranch, + name=self.sourceRemoteName, + ), + cwd=target_dir_path, + ) # git config - execute_command('git config core.sparseCheckout true', cwd=target_dir_path) + execute_command("git config core.sparseCheckout true", cwd=target_dir_path) # modify sparse checkout list file - sparse_checkout_config_path = target_dir_path / 
'.git/info/sparse-checkout' - with sparse_checkout_config_path.open('w') as f: + sparse_checkout_config_path = target_dir_path / ".git/info/sparse-checkout" + with sparse_checkout_config_path.open("w") as f: f.write(self.sourceSubdir) - main_log.debug('wrote {} in git sparse-checkout file'.format(self.sourceSubdir)) + main_log.debug("wrote {} in git sparse-checkout file".format(self.sourceSubdir)) # git fetch (without refspec so remote can be updated) - execute_command('git fetch {name}'.format( - name=self.sourceRemoteName, - ), cwd=target_dir_path) + execute_command( + "git fetch {name}".format( + name=self.sourceRemoteName, + ), + cwd=target_dir_path, + ) # git reset to the branch - execute_command('git reset --hard {name}/{branch}'.format( - name=self.sourceRemoteName, - branch=self.sourceBranch, - ), cwd=target_dir_path) + execute_command( + "git reset --hard {name}/{branch}".format( + name=self.sourceRemoteName, + branch=self.sourceBranch, + ), + cwd=target_dir_path, + ) # git clean - execute_command('git clean -d -x -f', cwd=target_dir_path) + execute_command("git clean -d -x -f", cwd=target_dir_path) # return val ret_val = True - main_log.info('done') + main_log.info("done") except Exception: err_msg_list.append(core_utils.dump_error_message(main_log)) return ret_val, str(err_msg_list) diff --git a/pandaharvester/harvestermessenger/act_messenger.py b/pandaharvester/harvestermessenger/act_messenger.py index 0d964fe9..6d3a2a19 100644 --- a/pandaharvester/harvestermessenger/act_messenger.py +++ b/pandaharvester/harvestermessenger/act_messenger.py @@ -16,19 +16,20 @@ jsonEventsUpdateFileName = harvester_config.payload_interaction.updateEventsFile # suffix to read json -suffixReadJson = '.read' +suffixReadJson = ".read" # logger -baseLogger = core_utils.setup_logger('act_messenger') +baseLogger = core_utils.setup_logger("act_messenger") + class ACTMessenger(BaseMessenger): - '''Mechanism for passing information about completed jobs back to harvester.''' + """Mechanism for passing information about completed jobs back to harvester.""" def __init__(self, **kwarg): BaseMessenger.__init__(self, **kwarg) # Set up aCT DB connection - self.log = core_utils.make_logger(baseLogger, 'aCT messenger', method_name='__init__') + self.log = core_utils.make_logger(baseLogger, "aCT messenger", method_name="__init__") self.actDB = aCTDBPanda(self.log) # get access point @@ -40,39 +41,35 @@ def get_access_point(self, workspec, panda_id): return accessPoint def post_processing(self, workspec, jobspec_list, map_type): - '''Now done in stager''' + """Now done in stager""" return True def get_work_attributes(self, workspec): - '''Get info from the job to pass back to harvester''' + """Get info from the job to pass back to harvester""" # Just return existing attributes. 
Attributes are added to workspec for # finished jobs in post_processing return workspec.workAttributes def events_requested(self, workspec): - '''Used to tell harvester that the worker requests events.''' + """Used to tell harvester that the worker requests events.""" # Not yet implemented, dynamic event fetching not supported yet return {} def feed_events(self, workspec, events_dict): - ''' + """ Harvester has an event range to pass to job events_dict is {pandaid: [{eventrange1}, {eventrange2}, ..]} - ''' + """ # get logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='feed_events') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="feed_events") retVal = True if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]: # insert the event range into aCT DB and mark the job ready to go for pandaid, eventranges in events_dict.items(): - desc = {'eventranges': json.dumps(eventranges), - 'actpandastatus': 'sent', - 'pandastatus': 'sent', - 'arcjobid': None} - tmpLog.info('Inserting {0} events for job {1}'.format(len(eventranges), pandaid)) + desc = {"eventranges": json.dumps(eventranges), "actpandastatus": "sent", "pandastatus": "sent", "arcjobid": None} + tmpLog.info("Inserting {0} events for job {1}".format(len(eventranges), pandaid)) try: self.actDB.updateJob(pandaid, desc) except Exception as e: @@ -81,15 +78,14 @@ def feed_events(self, workspec, events_dict): elif workspec.mapType == WorkSpec.MT_MultiJobs: # TOBEFIXED pass - tmpLog.debug('done') + tmpLog.debug("done") return retVal def events_to_update(self, workspec): - '''Report events processed for harvester to update''' + """Report events processed for harvester to update""" # get logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='events_to_update') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="events_to_update") # look for the json just under the access point retDict = dict() for pandaID in workspec.pandaid_list: @@ -99,20 +95,20 @@ def events_to_update(self, workspec): jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName) readJsonPath = jsonFilePath + suffixReadJson # first look for json.read which is not yet acknowledged - tmpLog.debug('looking for event update file {0}'.format(readJsonPath)) + tmpLog.debug("looking for event update file {0}".format(readJsonPath)) if os.path.exists(readJsonPath): pass else: - tmpLog.debug('looking for event update file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for event update file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") continue try: # rename to prevent from being overwritten os.rename(jsonFilePath, readJsonPath) except Exception: - tmpLog.error('failed to rename json') + tmpLog.error("failed to rename json") continue # load json nData = 0 @@ -126,40 +122,39 @@ def events_to_update(self, workspec): retDict[tmpPandaID] = tmpDict nData += len(tmpDict) except Exception as x: - tmpLog.error('failed to load json: {0}'.format(str(x))) + tmpLog.error("failed to load json: {0}".format(str(x))) # delete empty file if nData == 0: try: os.remove(readJsonPath) except Exception: pass - tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID)) + tmpLog.debug("got {0} events for PandaID={1}".format(nData, pandaID)) return retDict def 
acknowledge_events_files(self, workspec): - '''Acknowledge that events were picked up by harvester''' + """Acknowledge that events were picked up by harvester""" # get logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='acknowledge_events_files') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="acknowledge_events_files") # remove request file for pandaID in workspec.pandaid_list: accessPoint = self.get_access_point(workspec, pandaID) try: jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName) jsonFilePath += suffixReadJson - jsonFilePath_rename = jsonFilePath + '.' + str(datetime.datetime.utcnow()) + jsonFilePath_rename = jsonFilePath + "." + str(datetime.datetime.utcnow()) os.rename(jsonFilePath, jsonFilePath_rename) except Exception: pass try: jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName) jsonFilePath += suffixReadJson - jsonFilePath_rename = jsonFilePath + '.' + str(datetime.datetime.utcnow()) + jsonFilePath_rename = jsonFilePath + "." + str(datetime.datetime.utcnow()) os.rename(jsonFilePath, jsonFilePath_rename) except Exception: pass - tmpLog.debug('done') + tmpLog.debug("done") return # setup access points @@ -180,43 +175,44 @@ def setup_access_points(self, workspec_list): return True except Exception: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='setup_access_points') + tmpLog = core_utils.make_logger(_logger, method_name="setup_access_points") core_utils.dump_error_message(tmpLog) return False - # The remaining methods do not apply to ARC + def feed_jobs(self, workspec, jobspec_list): - '''Pass job to worker. No-op for Grid''' + """Pass job to worker. No-op for Grid""" return True def get_files_to_stage_out(self, workspec): - '''Not required in Grid case''' + """Not required in Grid case""" return {} def job_requested(self, workspec): - '''Used in pull model to say that worker is ready for a job''' + """Used in pull model to say that worker is ready for a job""" return False def setup_access_points(self, workspec_list): - '''Access is through CE so nothing to set up here''' + """Access is through CE so nothing to set up here""" pass def get_panda_ids(self, workspec): - '''For pull model, get panda IDs assigned to jobs''' + """For pull model, get panda IDs assigned to jobs""" return [] def kill_requested(self, workspec): - '''Worker wants to kill itself (?)''' + """Worker wants to kill itself (?)""" return False def is_alive(self, workspec, time_limit): - '''Check if worker is alive, not for Grid''' + """Check if worker is alive, not for Grid""" return True def test(): pass -if __name__ == '__main__': + +if __name__ == "__main__": test() diff --git a/pandaharvester/harvestermessenger/apache_messenger.py b/pandaharvester/harvestermessenger/apache_messenger.py index 6925fdd9..0903d38d 100644 --- a/pandaharvester/harvestermessenger/apache_messenger.py +++ b/pandaharvester/harvestermessenger/apache_messenger.py @@ -6,7 +6,7 @@ # logger -_logger = core_utils.setup_logger('apache_messenger') +_logger = core_utils.setup_logger("apache_messenger") http_server_messenger.set_logger(_logger) @@ -16,7 +16,7 @@ def __init__(self, *args, **kwargs): self.responseCode = None self.form = dict() self.message = None - self.headerList = [('Content-Type', 'text/plain')] + self.headerList = [("Content-Type", "text/plain")] http_server_messenger.HttpHandler.__init__(self, *args, **kwargs) def setup(self): @@ -38,7 +38,7 @@ def set_form(self, 
form): self.form = form def do_postprocessing(self, message): - self.message = message.encode('ascii') + self.message = message.encode("ascii") def send_header(self, keyword, value): self.headerList = [(keyword, value)] @@ -49,22 +49,22 @@ def application(environ, start_response): try: # get params try: - request_body_size = int(environ.get('CONTENT_LENGTH', 0)) + request_body_size = int(environ.get("CONTENT_LENGTH", 0)) except Exception as e: - _logger.warning('Zero request body due to {0}: {1}'.format(e.__class__.__name__, e)) + _logger.warning("Zero request body due to {0}: {1}".format(e.__class__.__name__, e)) request_body_size = 0 # check token - if getattr(harvester_config.frontend, 'authEnable', True): + if getattr(harvester_config.frontend, "authEnable", True): try: - auth_str = environ.get('HTTP_AUTHORIZATION', '').split()[-1] + auth_str = environ.get("HTTP_AUTHORIZATION", "").split()[-1] token = HarvesterToken() payload = token.get_payload(auth_str) except Exception as e: - _logger.warning('Invalid token due to {0}: {1}'.format(e.__class__.__name__, e)) - errMsg = 'Auth failed: Invalid token' - start_response('403 Forbidden', [('Content-Type', 'text/plain')]) - return [errMsg.encode('ascii')] - request_body = environ['wsgi.input'].read(request_body_size) + _logger.warning("Invalid token due to {0}: {1}".format(e.__class__.__name__, e)) + errMsg = "Auth failed: Invalid token" + start_response("403 Forbidden", [("Content-Type", "text/plain")]) + return [errMsg.encode("ascii")] + request_body = environ["wsgi.input"].read(request_body_size) params = json.loads(request_body) # make handler handler = ApacheHandler(None, None, None) @@ -77,5 +77,5 @@ def application(environ, start_response): return [handler.message] except Exception: errMsg = core_utils.dump_error_message(_logger) - start_response('500 Phrase', [('Content-Type', 'text/plain')]) + start_response("500 Phrase", [("Content-Type", "text/plain")]) return [errMsg] diff --git a/pandaharvester/harvestermessenger/base_messenger.py b/pandaharvester/harvestermessenger/base_messenger.py index e58eacc2..1dcac818 100644 --- a/pandaharvester/harvestermessenger/base_messenger.py +++ b/pandaharvester/harvestermessenger/base_messenger.py @@ -77,4 +77,4 @@ def is_alive(self, workspec, time_limit): # clean up. 
Called by sweeper agent to clean up stuff made by messenger for the worker def clean_up(self, workspec): - return (None, 'skipped') + return (None, "skipped") diff --git a/pandaharvester/harvestermessenger/http_server_messenger.py b/pandaharvester/harvestermessenger/http_server_messenger.py index 20304583..51bf43be 100644 --- a/pandaharvester/harvestermessenger/http_server_messenger.py +++ b/pandaharvester/harvestermessenger/http_server_messenger.py @@ -10,6 +10,7 @@ from queue import Queue from http.server import HTTPServer, BaseHTTPRequestHandler + # try: # from urllib.parse import parse_qsl # except ImportError: @@ -21,7 +22,7 @@ from pandaharvester.harvestermessenger import shared_file_messenger # logger -_logger = core_utils.setup_logger('http_server_messenger') +_logger = core_utils.setup_logger("http_server_messenger") shared_file_messenger.set_logger(_logger) @@ -33,7 +34,6 @@ def set_logger(master_logger): # handler for http front-end class HttpHandler(BaseHTTPRequestHandler): - def __init__(self, *args, **kwargs): self.dbProxy = DBProxy() self.tmpLog = None @@ -44,7 +44,7 @@ def log_message(self, format, *args): pass def get_form(self): - dataStr = self.rfile.read(int(self.headers['Content-Length'])) + dataStr = self.rfile.read(int(self.headers["Content-Length"])) return json.loads(dataStr) def do_postprocessing(self, message): @@ -59,97 +59,87 @@ def do_POST(self): form = None methodName = None dataStr = None - message = '' + message = "" # parse the form data posted try: form = self.get_form() except Exception: - message = 'corrupted json' + message = "corrupted json" toSkip = True # check parameters if not toSkip: toSkip = True # method is not set - if 'methodName' not in form: - message = 'methodName is not given' + if "methodName" not in form: + message = "methodName is not given" self.send_response(400) - elif 'workerID' not in form: - message = 'workerID is not given' + elif "workerID" not in form: + message = "workerID is not given" self.send_response(400) - elif 'data' not in form: - message = 'data is not given' + elif "data" not in form: + message = "data is not given" self.send_response(400) else: toSkip = False # get worker if not toSkip: try: - workerID = form['workerID'] + workerID = form["workerID"] workSpec = self.dbProxy.get_worker_with_id(workerID) if workSpec is None: - message = 'workerID={0} not found in DB'.format(workerID) + message = "workerID={0} not found in DB".format(workerID) self.send_response(400) else: # chose file and operation for each action - methodName = form['methodName'] + methodName = form["methodName"] opType = None - filePath = '' - if methodName == 'requestJobs': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonJobRequestFileName) - opType = 'w' - elif methodName == 'getJobs': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jobSpecFileName) - opType = 'r' - elif methodName == 'requestEventRanges': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonEventsRequestFileName) - opType = 'w' - elif methodName == 'getEventRanges': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonEventsFeedFileName) - opType = 'r' - elif methodName == 'updateJobs': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonAttrsFileName) - opType = 'w' - elif methodName == 'uploadJobReport': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonJobReport) - opType = 'w' - elif 
methodName == 'uploadEventOutputDump': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.jsonOutputsFileName) - opType = 'w' - elif methodName == 'setPandaIDs': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.pandaIDsFile) - opType = 'w' - elif methodName == 'killWorker': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.killWorkerFile) - opType = 'w' - elif methodName == 'heartbeat': - filePath = os.path.join(workSpec.get_access_point(), - shared_file_messenger.heartbeatFile) - opType = 'w' + filePath = "" + if methodName == "requestJobs": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonJobRequestFileName) + opType = "w" + elif methodName == "getJobs": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jobSpecFileName) + opType = "r" + elif methodName == "requestEventRanges": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonEventsRequestFileName) + opType = "w" + elif methodName == "getEventRanges": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonEventsFeedFileName) + opType = "r" + elif methodName == "updateJobs": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonAttrsFileName) + opType = "w" + elif methodName == "uploadJobReport": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonJobReport) + opType = "w" + elif methodName == "uploadEventOutputDump": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.jsonOutputsFileName) + opType = "w" + elif methodName == "setPandaIDs": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.pandaIDsFile) + opType = "w" + elif methodName == "killWorker": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.killWorkerFile) + opType = "w" + elif methodName == "heartbeat": + filePath = os.path.join(workSpec.get_access_point(), shared_file_messenger.heartbeatFile) + opType = "w" else: self.send_response(501) - message = 'method not implemented' + message = "method not implemented" toSkip = True # take action if not toSkip: # write actions - if opType == 'w': + if opType == "w": # check if file exists. 
Methods such as heartbeat however need to overwrite the file - if os.path.exists(filePath) and methodName not in ['heartbeat']: - message = 'previous request is not yet processed' + if os.path.exists(filePath) and methodName not in ["heartbeat"]: + message = "previous request is not yet processed" self.send_response(503) else: - with open(filePath, 'w') as fileHandle: - json.dump(form['data'], fileHandle) - message = 'OK' + with open(filePath, "w") as fileHandle: + json.dump(form["data"], fileHandle) + message = "OK" self.send_response(200) else: # read actions @@ -158,22 +148,21 @@ def do_POST(self): try: _message = json.load(fileHandle) message = json.dumps(_message) - self.send_header('Content-Type', 'application/json') + self.send_header("Content-Type", "application/json") except JSONDecodeError: _f_qs = open(filePath).read() # _message = dict(parse_qsl(_f_qs, keep_blank_values=True)) message = _f_qs - self.send_header('Content-Type', 'text/plain') + self.send_header("Content-Type", "text/plain") self.send_response(200) else: - message = 'previous request is not yet processed' + message = "previous request is not yet processed" self.send_response(503) except Exception: self.send_response(500) message = core_utils.dump_error_message(_logger) if harvester_config.frontend.verbose: - self.tmpLog.debug('ip={3} - method={0} json={1} msg={2}'.format(methodName, dataStr, message, - self.client_address[0])) + self.tmpLog.debug("ip={3} - method={0} json={1} msg={2}".format(methodName, dataStr, message, self.client_address[0])) # set the response self.do_postprocessing(message) return @@ -216,8 +205,8 @@ def __new__(cls, *args, **kwargs): if cls.instance is None: with cls.lock: if cls.instance is None: - if harvester_config.frontend.type == 'simple': - httpd = ThreadedHttpServer(('', harvester_config.frontend.portNumber), HttpHandler) + if harvester_config.frontend.type == "simple": + httpd = ThreadedHttpServer(("", harvester_config.frontend.portNumber), HttpHandler) thr = threading.Thread(target=httpd.serve_forever) thr.daemon = True thr.start() @@ -226,6 +215,7 @@ def __new__(cls, *args, **kwargs): cls.instance = 1 return cls.instance + # start frontend frontend = FrontendLauncher() diff --git a/pandaharvester/harvestermessenger/k8s_messenger.py b/pandaharvester/harvestermessenger/k8s_messenger.py index b770bb1d..448fa025 100644 --- a/pandaharvester/harvestermessenger/k8s_messenger.py +++ b/pandaharvester/harvestermessenger/k8s_messenger.py @@ -6,18 +6,17 @@ from pandaharvester.harvestermisc.info_utils_k8s import PandaQueuesDictK8s # logger -_logger = core_utils.setup_logger('k8s_messenger') +_logger = core_utils.setup_logger("k8s_messenger") # Messenger for generic Kubernetes clusters class K8sMessenger(BaseMessenger): - def __init__(self, **kwargs): BaseMessenger.__init__(self, **kwargs) try: self.logDir except AttributeError: - print('K8sMessenger: Missing attribute logDir') + print("K8sMessenger: Missing attribute logDir") raise # retrieve the k8s namespace from CRIC @@ -34,29 +33,28 @@ def post_processing(self, workspec, jobspec_list, map_type): - Store or upload logs """ # get logger - tmp_log = core_utils.make_logger(_logger, 'queueName={0} workerID={1}'.format(self.queueName, workspec.workerID), - method_name='post_processing') - tmp_log.debug('start') + tmp_log = core_utils.make_logger(_logger, "queueName={0} workerID={1}".format(self.queueName, workspec.workerID), method_name="post_processing") + tmp_log.debug("start") if self._all_pods_list is None: - tmp_log.error('No pod 
information') - tmp_log.debug('done') + tmp_log.error("No pod information") + tmp_log.debug("done") return None try: # fetch and store logs job_id = workspec.batchID pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) - pod_name_list = [pods_info['name'] for pods_info in pods_list] - outlog_filename = os.path.join(self.logDir, 'gridK8S.{0}.{1}.out'.format(workspec.workerID, workspec.batchID)) - with open(outlog_filename, 'w') as f: + pod_name_list = [pods_info["name"] for pods_info in pods_list] + outlog_filename = os.path.join(self.logDir, "gridK8S.{0}.{1}.out".format(workspec.workerID, workspec.batchID)) + with open(outlog_filename, "w") as f: for pod_name in pod_name_list: current_log_str = self.k8s_client.get_pod_logs(pod_name) f.write(current_log_str) # upload logs pass # return - tmp_log.debug('done') + tmp_log.debug("done") return True except Exception: core_utils.dump_error_message(tmp_log) diff --git a/pandaharvester/harvestermessenger/shared_file_messenger.py b/pandaharvester/harvestermessenger/shared_file_messenger.py index 1fe2a26e..fefccb19 100644 --- a/pandaharvester/harvestermessenger/shared_file_messenger.py +++ b/pandaharvester/harvestermessenger/shared_file_messenger.py @@ -73,37 +73,37 @@ try: killWorkerFile = harvester_config.payload_interaction.killWorkerFile except Exception: - killWorkerFile = 'kill_worker.json' + killWorkerFile = "kill_worker.json" # json for heartbeats from the worker try: heartbeatFile = harvester_config.payload_interaction.heartbeatFile except Exception: - heartbeatFile = 'worker_heartbeat.json' + heartbeatFile = "worker_heartbeat.json" # task specific persistent dir try: taskWorkBaseDir = harvester_config.payload_interaction.taskWorkBaseDir except Exception: - taskWorkBaseDir = '/tmp/workdir' + taskWorkBaseDir = "/tmp/workdir" # task-level work state file try: taskWorkStateFile = harvester_config.payload_interaction.taskWorkStateFile except Exception: - taskWorkStateFile = 'state.json' + taskWorkStateFile = "state.json" # task-level work dir -taskWorkDirPathFile = 'task_workdir_path.txt' +taskWorkDirPathFile = "task_workdir_path.txt" # post-processing job attributes -postProcessAttrs = 'post_process_job_attrs.json' +postProcessAttrs = "post_process_job_attrs.json" # suffix to read json -suffixReadJson = '.read' +suffixReadJson = ".read" # logger -_logger = core_utils.setup_logger('shared_file_messenger') +_logger = core_utils.setup_logger("shared_file_messenger") def set_logger(master_logger): @@ -113,16 +113,16 @@ def set_logger(master_logger): # filter for log.tgz def filter_log_tgz(extra=None): - patt = ['*.log', '*.txt', '*.xml', '*.json', 'log*'] + patt = ["*.log", "*.txt", "*.xml", "*.json", "log*"] if extra is not None: patt += extra - return '-o '.join(['-name "{0}" '.format(i) for i in patt]) + return "-o ".join(['-name "{0}" '.format(i) for i in patt]) # tar a single directory def tar_directory(dir_name, tar_name=None, max_depth=None, extra_files=None, sub_tarball_name=None): if tar_name is None: - tarFilePath = os.path.join(os.path.dirname(dir_name), '{0}.subdir.tar.gz'.format(os.path.basename(dir_name))) + tarFilePath = os.path.join(os.path.dirname(dir_name), "{0}.subdir.tar.gz".format(os.path.basename(dir_name))) else: tarFilePath = tar_name # check if sub-tarball already exists @@ -130,25 +130,22 @@ def tar_directory(dir_name, tar_name=None, max_depth=None, extra_files=None, sub if sub_tarball_name is not None: subTarballPath = os.path.join(dir_name, sub_tarball_name) if 
os.path.exists(subTarballPath): - com = 'mv {} {}'.format(subTarballPath, tarFilePath) + com = "mv {} {}".format(subTarballPath, tarFilePath) # make sub-tarball if com is None: - com = 'cd {0}; '.format(dir_name) - com += 'find . ' + com = "cd {0}; ".format(dir_name) + com += "find . " if max_depth is not None: - com += '-maxdepth {0} '.format(max_depth) - com += r'-type f \( ' + filter_log_tgz(extra_files) + r'\) ' + com += "-maxdepth {0} ".format(max_depth) + com += r"-type f \( " + filter_log_tgz(extra_files) + r"\) " com += r'| grep -v {0} | tr "\n" "\0" | '.format(jobSpecFileName) - com += 'tar ' - if distutils.spawn.find_executable('pigz') is None: - com += '-z ' + com += "tar " + if distutils.spawn.find_executable("pigz") is None: + com += "-z " else: - com += '-I pigz ' - com += '-c -f {0} --null -T -'.format(tarFilePath) - p = subprocess.Popen(com, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + com += "-I pigz " + com += "-c -f {0} --null -T -".format(tarFilePath) + p = subprocess.Popen(com, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode return com, retCode, stdOut, stdErr @@ -181,24 +178,24 @@ def scan_files_in_dir(dir_name, patterns=None, zip_patterns=None): # make dict tmpFileDict = dict() pfn = os.path.join(root, filename) - tmpFileDict['path'] = pfn - tmpFileDict['fsize'] = os.stat(pfn).st_size - tmpFileDict['guid'] = str(uuid.uuid4()) - tmpFileDict['chksum'] = core_utils.calc_adler32(pfn) - tmpFileDict['eventStatus'] = "finished" + tmpFileDict["path"] = pfn + tmpFileDict["fsize"] = os.stat(pfn).st_size + tmpFileDict["guid"] = str(uuid.uuid4()) + tmpFileDict["chksum"] = core_utils.calc_adler32(pfn) + tmpFileDict["eventStatus"] = "finished" if is_zipped: lfns = [] # extract actual event filenames from zip with tarfile.open(pfn) as f: for tar_info in f.getmembers(): lfns.append(os.path.basename(tar_info.name)) - tmpFileDict['type'] = 'zip_output' + tmpFileDict["type"] = "zip_output" else: lfns = [os.path.basename(pfn)] - tmpFileDict['type'] = 'es_output' + tmpFileDict["type"] = "es_output" for lfn in lfns: tmpDict = copy.copy(tmpFileDict) - tmpDict['eventRangeID'] = lfn.split('.')[-1] + tmpDict["eventRangeID"] = lfn.split(".")[-1] fileList.append(tmpDict) return fileList @@ -208,7 +205,7 @@ def scan_files_in_dir(dir_name, patterns=None, zip_patterns=None): class SharedFileMessenger(BaseMessenger): # constructor def __init__(self, **kwarg): - self.jobSpecFileFormat = 'json' + self.jobSpecFileFormat = "json" self.stripJobParams = False self.scanInPostProcess = False self.leftOverPatterns = None @@ -234,7 +231,7 @@ def get_task_access_point(self, workspec, jobspec): if os.path.exists(tmp_file): with open(tmp_file) as f: return f.read() - if jobspec.jobParams and 'onSiteMerging' in jobspec.jobParams: + if jobspec.jobParams and "onSiteMerging" in jobspec.jobParams: return os.path.join(taskWorkBaseDir, str(jobspec.taskID)) return None @@ -242,8 +239,7 @@ def get_task_access_point(self, workspec, jobspec): # * the worker needs to put a json under the access point def get_work_attributes(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='get_work_attributes') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_work_attributes") allRetDict = dict() numofreads = 0 sw_readreports = core_utils.get_stopwatch() @@ -251,49 +247,48 @@ def get_work_attributes(self, workspec): # 
look for the json just under the access point accessPoint = self.get_access_point(workspec, pandaID) jsonFilePath = os.path.join(accessPoint, jsonAttrsFileName) - tmpLog.debug('looking for attributes file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for attributes file {0}".format(jsonFilePath)) retDict = dict() if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found attributes file') + tmpLog.debug("not found attributes file") else: try: with open(jsonFilePath) as jsonFile: retDict = json.load(jsonFile) except Exception: - tmpLog.debug('failed to load {0}'.format(jsonFilePath)) + tmpLog.debug("failed to load {0}".format(jsonFilePath)) # look for job report jsonFilePath = os.path.join(accessPoint, jsonJobReport) - tmpLog.debug('looking for job report file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for job report file {0}".format(jsonFilePath)) sw_checkjobrep = core_utils.get_stopwatch() if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found job report file') + tmpLog.debug("not found job report file") else: try: sw_readrep = core_utils.get_stopwatch() with open(jsonFilePath) as jsonFile: tmpDict = json.load(jsonFile) - retDict['metaData'] = tmpDict - tmpLog.debug('got {0} kB of job report. {1} sec.'.format(os.stat(jsonFilePath).st_size / 1024, - sw_readrep.get_elapsed_time())) + retDict["metaData"] = tmpDict + tmpLog.debug("got {0} kB of job report. {1} sec.".format(os.stat(jsonFilePath).st_size / 1024, sw_readrep.get_elapsed_time())) numofreads += 1 except Exception: - tmpLog.debug('failed to load {0}'.format(jsonFilePath)) + tmpLog.debug("failed to load {0}".format(jsonFilePath)) tmpLog.debug("Check file and read file time: {0} sec.".format(sw_checkjobrep.get_elapsed_time())) # loop for post-processing job attributes jsonFilePath = os.path.join(accessPoint, postProcessAttrs) - tmpLog.debug('looking for post-processing job attributes file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for post-processing job attributes file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found post-processing job attributes file') + tmpLog.debug("not found post-processing job attributes file") else: try: with open(jsonFilePath) as jsonFile: tmpDict = json.load(jsonFile) retDict.update(tmpDict) except Exception: - tmpLog.debug('failed to load {0}'.format(jsonFilePath)) + tmpLog.debug("failed to load {0}".format(jsonFilePath)) allRetDict[pandaID] = retDict tmpLog.debug("Reading {0} job report files {1}".format(numofreads, sw_readreports.get_elapsed_time())) @@ -303,8 +298,7 @@ def get_work_attributes(self, workspec): # * the worker needs to put a json under the access point def get_files_to_stage_out(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='get_files_to_stage_out') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_files_to_stage_out") fileDict = dict() # look for the json just under the access point for pandaID in workspec.pandaid_list: @@ -313,21 +307,21 @@ def get_files_to_stage_out(self, workspec): jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName) readJsonPath = jsonFilePath + suffixReadJson # first look for json.read which is not yet acknowledged - tmpLog.debug('looking for output file {0}'.format(readJsonPath)) + tmpLog.debug("looking for output file {0}".format(readJsonPath)) if os.path.exists(readJsonPath): pass else: - tmpLog.debug('looking for 
output file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for output file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") continue try: - tmpLog.debug('found') + tmpLog.debug("found") # rename to prevent from being overwritten os.rename(jsonFilePath, readJsonPath) except Exception: - tmpLog.error('failed to rename json') + tmpLog.error("failed to rename json") continue # load json toSkip = False @@ -336,12 +330,12 @@ def get_files_to_stage_out(self, workspec): with open(readJsonPath) as jsonFile: loadDict = json.load(jsonFile) except Exception: - tmpLog.error('failed to load json') + tmpLog.error("failed to load json") toSkip = True # test validity of data format (ie it should be a Dictionary) if not toSkip: if not isinstance(loadDict, dict): - tmpLog.error('loaded data is not a dictionary') + tmpLog.error("loaded data is not a dictionary") toSkip = True # collect files and events nData = 0 @@ -353,69 +347,68 @@ def get_files_to_stage_out(self, workspec): tmpPandaID = long(tmpPandaID) # test if tmpEventMapList is a list if not isinstance(tmpEventMapList, list): - tmpLog.error('loaded data item is not a list') + tmpLog.error("loaded data item is not a list") toSkip = True break for tmpEventInfo in tmpEventMapList: try: nData += 1 - if 'eventRangeID' in tmpEventInfo: - tmpEventRangeID = tmpEventInfo['eventRangeID'] + if "eventRangeID" in tmpEventInfo: + tmpEventRangeID = tmpEventInfo["eventRangeID"] else: tmpEventRangeID = None - if 'path' in tmpEventInfo: + if "path" in tmpEventInfo: tmpFileDict = dict() - pfn = tmpEventInfo['path'] + pfn = tmpEventInfo["path"] lfn = os.path.basename(pfn) - tmpFileDict['path'] = pfn + tmpFileDict["path"] = pfn if pfn not in sizeMap: - if 'fsize' in tmpEventInfo: - sizeMap[pfn] = tmpEventInfo['fsize'] + if "fsize" in tmpEventInfo: + sizeMap[pfn] = tmpEventInfo["fsize"] else: sizeMap[pfn] = os.stat(pfn).st_size - tmpFileDict['fsize'] = sizeMap[pfn] - tmpFileDict['type'] = tmpEventInfo['type'] - if tmpEventInfo['type'] in ['log', 'output', 'checkpoint']: + tmpFileDict["fsize"] = sizeMap[pfn] + tmpFileDict["type"] = tmpEventInfo["type"] + if tmpEventInfo["type"] in ["log", "output", "checkpoint"]: # disable zipping - tmpFileDict['isZip'] = 0 - elif tmpEventInfo['type'] == 'zip_output': + tmpFileDict["isZip"] = 0 + elif tmpEventInfo["type"] == "zip_output": # already zipped - tmpFileDict['isZip'] = 1 - elif 'isZip' in tmpEventInfo: - tmpFileDict['isZip'] = tmpEventInfo['isZip'] + tmpFileDict["isZip"] = 1 + elif "isZip" in tmpEventInfo: + tmpFileDict["isZip"] = tmpEventInfo["isZip"] # guid - if 'guid' in tmpEventInfo: - tmpFileDict['guid'] = tmpEventInfo['guid'] + if "guid" in tmpEventInfo: + tmpFileDict["guid"] = tmpEventInfo["guid"] else: - tmpFileDict['guid'] = str(uuid.uuid4()) + tmpFileDict["guid"] = str(uuid.uuid4()) # get checksum if pfn not in chksumMap: - if 'chksum' in tmpEventInfo: - chksumMap[pfn] = tmpEventInfo['chksum'] + if "chksum" in tmpEventInfo: + chksumMap[pfn] = tmpEventInfo["chksum"] else: chksumMap[pfn] = core_utils.calc_adler32(pfn) - tmpFileDict['chksum'] = chksumMap[pfn] + tmpFileDict["chksum"] = chksumMap[pfn] if tmpPandaID not in fileDict: fileDict[tmpPandaID] = dict() if lfn not in fileDict[tmpPandaID]: fileDict[tmpPandaID][lfn] = [] fileDict[tmpPandaID][lfn].append(tmpFileDict) # skip if unrelated to events - if tmpFileDict['type'] not in ['es_output', 'zip_output']: + if tmpFileDict["type"] not in ["es_output", "zip_output"]: continue - 
tmpFileDict['eventRangeID'] = tmpEventRangeID + tmpFileDict["eventRangeID"] = tmpEventRangeID if tmpPandaID not in eventsList: eventsList[tmpPandaID] = list() - eventsList[tmpPandaID].append({'eventRangeID': tmpEventRangeID, - 'eventStatus': tmpEventInfo['eventStatus']}) + eventsList[tmpPandaID].append({"eventRangeID": tmpEventRangeID, "eventStatus": tmpEventInfo["eventStatus"]}) except Exception: core_utils.dump_error_message(tmpLog) # dump events if not toSkip: if len(eventsList) > 0: curName = os.path.join(accessPoint, jsonEventsUpdateFileName) - newName = curName + '.new' - f = open(newName, 'w') + newName = curName + ".new" + f = open(newName, "w") json.dump(eventsList, f) f.close() os.rename(newName, curName) @@ -425,39 +418,37 @@ def get_files_to_stage_out(self, workspec): os.remove(readJsonPath) except Exception: pass - tmpLog.debug('got {0} files for PandaID={1}'.format(nData, pandaID)) + tmpLog.debug("got {0} files for PandaID={1}".format(nData, pandaID)) return fileDict # check if job is requested. # * the worker needs to put a json under the access point def job_requested(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='job_requested') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="job_requested") # look for the json just under the access point jsonFilePath = os.path.join(workspec.get_access_point(), jsonJobRequestFileName) - tmpLog.debug('looking for job request file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for job request file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") return False # read nJobs try: with open(jsonFilePath) as jsonFile: tmpDict = json.load(jsonFile) - nJobs = tmpDict['nJobs'] + nJobs = tmpDict["nJobs"] except Exception: # request 1 job by default nJobs = 1 - tmpLog.debug('requesting {0} jobs'.format(nJobs)) + tmpLog.debug("requesting {0} jobs".format(nJobs)) return nJobs # feed jobs # * worker_jobspec.json is put under the access point def feed_jobs(self, workspec, jobspec_list): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='feed_jobs') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="feed_jobs") retVal = True # get PFC pfc = core_utils.make_pool_file_catalog(jobspec_list) @@ -466,17 +457,17 @@ def feed_jobs(self, workspec, jobspec_list): accessPoint = self.get_access_point(workspec, jobSpec.PandaID) jobSpecFilePath = os.path.join(accessPoint, jobSpecFileName) xmlFilePath = os.path.join(accessPoint, xmlPoolCatalogFileName) - tmpLog.debug('feeding jobs to {0}'.format(jobSpecFilePath)) + tmpLog.debug("feeding jobs to {0}".format(jobSpecFilePath)) try: # put job spec file - with open(jobSpecFilePath, 'w') as jobSpecFile: + with open(jobSpecFilePath, "w") as jobSpecFile: jobParams = jobSpec.get_job_params(self.stripJobParams) - if self.jobSpecFileFormat == 'cgi': + if self.jobSpecFileFormat == "cgi": jobSpecFile.write(urlencode(jobParams)) else: json.dump({jobSpec.PandaID: jobParams}, jobSpecFile) # put PFC.xml - with open(xmlFilePath, 'w') as pfcFile: + with open(xmlFilePath, "w") as pfcFile: pfcFile.write(pfc) # make symlink for fileSpec in jobSpec.inFiles: @@ -494,7 +485,7 @@ def feed_jobs(self, workspec, jobspec_list): # put PandaIDs file try: jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile) - with 
open(jsonFilePath, 'w') as jsonPandaIDsFile: + with open(jsonFilePath, "w") as jsonPandaIDsFile: json.dump(pandaIDs, jsonPandaIDsFile) except Exception: core_utils.dump_error_message(tmpLog) @@ -505,44 +496,42 @@ def feed_jobs(self, workspec, jobspec_list): os.remove(reqFilePath) except Exception: pass - tmpLog.debug('done') + tmpLog.debug("done") return retVal # request events. # * the worker needs to put a json under the access point def events_requested(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='events_requested') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="events_requested") # look for the json just under the access point jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsRequestFileName) - tmpLog.debug('looking for event request file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for event request file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") return {} try: with open(jsonFilePath) as jsonFile: retDict = json.load(jsonFile) except Exception: - tmpLog.debug('failed to load json') + tmpLog.debug("failed to load json") return {} - tmpLog.debug('found') + tmpLog.debug("found") return retDict # feed events # * worker_events.json is put under the access point def feed_events(self, workspec, events_dict): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='feed_events') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="feed_events") retVal = True if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]: # put the json just under the access point jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName) - tmpLog.debug('feeding events to {0}'.format(jsonFilePath)) + tmpLog.debug("feeding events to {0}".format(jsonFilePath)) try: - with open(jsonFilePath, 'w') as jsonFile: + with open(jsonFilePath, "w") as jsonFile: json.dump(events_dict, jsonFile) except Exception: core_utils.dump_error_message(tmpLog) @@ -556,15 +545,14 @@ def feed_events(self, workspec, events_dict): os.remove(jsonFilePath) except Exception: pass - tmpLog.debug('done') + tmpLog.debug("done") return retVal # update events. 
# * the worker needs to put a json under the access point def events_to_update(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='events_to_update') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="events_to_update") # look for the json just under the access point retDict = dict() for pandaID in workspec.pandaid_list: @@ -574,20 +562,20 @@ def events_to_update(self, workspec): jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName) readJsonPath = jsonFilePath + suffixReadJson # first look for json.read which is not yet acknowledged - tmpLog.debug('looking for event update file {0}'.format(readJsonPath)) + tmpLog.debug("looking for event update file {0}".format(readJsonPath)) if os.path.exists(readJsonPath): pass else: - tmpLog.debug('looking for event update file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for event update file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") continue try: # rename to prevent from being overwritten os.rename(jsonFilePath, readJsonPath) except Exception: - tmpLog.error('failed to rename json') + tmpLog.error("failed to rename json") continue # load json nData = 0 @@ -601,40 +589,39 @@ def events_to_update(self, workspec): retDict[tmpPandaID] = tmpDict nData += len(tmpDict) except Exception: - tmpLog.error('failed to load json') + tmpLog.error("failed to load json") # delete empty file if nData == 0: try: os.remove(readJsonPath) except Exception: pass - tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID)) + tmpLog.debug("got {0} events for PandaID={1}".format(nData, pandaID)) return retDict # acknowledge events and files # * delete json.read files def acknowledge_events_files(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='acknowledge_events_files') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="acknowledge_events_files") # remove request file for pandaID in workspec.pandaid_list: accessPoint = self.get_access_point(workspec, pandaID) try: jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName) jsonFilePath += suffixReadJson - jsonFilePath_rename = jsonFilePath + '.' + datetime.datetime.utcnow().strftime('%Y-%m-%d_%H_%M_%S.%f') + jsonFilePath_rename = jsonFilePath + "." + datetime.datetime.utcnow().strftime("%Y-%m-%d_%H_%M_%S.%f") os.rename(jsonFilePath, jsonFilePath_rename) except Exception: pass try: jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName) jsonFilePath += suffixReadJson - jsonFilePath_rename = jsonFilePath + '.' + datetime.datetime.utcnow().strftime('%Y-%m-%d_%H_%M_%S.%f') + jsonFilePath_rename = jsonFilePath + "." 
+ datetime.datetime.utcnow().strftime("%Y-%m-%d_%H_%M_%S.%f") os.rename(jsonFilePath, jsonFilePath_rename) except Exception: pass - tmpLog.debug('done') + tmpLog.debug("done") return # setup access points @@ -660,18 +647,18 @@ def setup_access_points(self, workspec_list): if taskAccessDir: if not os.path.exists(taskAccessDir): os.mkdir(taskAccessDir) - with open(os.path.join(subAccessPoint, taskWorkDirPathFile), 'w') as f: + with open(os.path.join(subAccessPoint, taskWorkDirPathFile), "w") as f: f.write(taskAccessDir) return True except Exception: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='setup_access_points') + tmpLog = core_utils.make_logger(_logger, method_name="setup_access_points") core_utils.dump_error_message(tmpLog) return False # filter for log.tar.gz def filter_log_tgz(self, name): - for tmpPatt in ['*.log', '*.txt', '*.xml', '*.json', 'log*']: + for tmpPatt in ["*.log", "*.txt", "*.xml", "*.json", "log*"]: if fnmatch.fnmatch(name, tmpPatt): return True return False @@ -679,14 +666,13 @@ def filter_log_tgz(self, name): # post-processing (archiving log files and collecting job metrics) def post_processing(self, workspec, jobspec_list, map_type): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='post_processing') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="post_processing") try: for jobSpec in jobspec_list: # check if log is already there hasLog = False for fileSpec in jobSpec.outFiles: - if fileSpec.fileType == 'log': + if fileSpec.fileType == "log": hasLog = True break fileDict = dict() @@ -698,80 +684,67 @@ def post_processing(self, workspec, jobspec_list, map_type): if not hasLog: logFileInfo = jobSpec.get_logfile_info() # make log.tar.gz - logFilePath = os.path.join(accessPoint, logFileInfo['lfn']) + logFilePath = os.path.join(accessPoint, logFileInfo["lfn"]) if map_type == WorkSpec.MT_MultiWorkers: # append suffix - logFilePath += '._{0}'.format(workspec.workerID) - tmpLog.debug('making {0}'.format(logFilePath)) - dirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) - if os.path.isdir(os.path.join(accessPoint, name))] + logFilePath += "._{0}".format(workspec.workerID) + tmpLog.debug("making {0}".format(logFilePath)) + dirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) if os.path.isdir(os.path.join(accessPoint, name))] # tar sub dirs - tmpLog.debug('tar for {0} sub dirs'.format(len(dirs))) - with Pool(max_workers=self.maxWorkersForZip if self.maxWorkersForZip - else multiprocessing.cpu_count()) as pool: - retValList = pool.map(lambda x, y: tar_directory(x, sub_tarball_name=y), - dirs, itertools.repeat(self.subTarballName)) + tmpLog.debug("tar for {0} sub dirs".format(len(dirs))) + with Pool(max_workers=self.maxWorkersForZip if self.maxWorkersForZip else multiprocessing.cpu_count()) as pool: + retValList = pool.map(lambda x, y: tar_directory(x, sub_tarball_name=y), dirs, itertools.repeat(self.subTarballName)) for dirName, (comStr, retCode, stdOut, stdErr) in zip(dirs, retValList): if retCode != 0: - tmpLog.warning('failed to sub-tar {0} with {1} -> {2}:{3}'.format( - dirName, comStr, stdOut, stdErr)) + tmpLog.warning("failed to sub-tar {0} with {1} -> {2}:{3}".format(dirName, comStr, stdOut, stdErr)) # tar main dir - tmpLog.debug('tar for main dir') + tmpLog.debug("tar for main dir") comStr, retCode, stdOut, stdErr = tar_directory(accessPoint, logFilePath, 1, ["*.subdir.tar.gz"]) - 
tmpLog.debug('used command : ' + comStr) + tmpLog.debug("used command : " + comStr) if retCode != 0: - tmpLog.warning('failed to tar {0} with {1} -> {2}:{3}'.format( - accessPoint, comStr, stdOut, stdErr)) + tmpLog.warning("failed to tar {0} with {1} -> {2}:{3}".format(accessPoint, comStr, stdOut, stdErr)) # make file dict fileDict.setdefault(jobSpec.PandaID, []) - fileDict[jobSpec.PandaID].append({'path': logFilePath, - 'type': 'log', - 'isZip': 0}) + fileDict[jobSpec.PandaID].append({"path": logFilePath, "type": "log", "isZip": 0}) # look for leftovers if self.scanInPostProcess: - tmpLog.debug('scanning leftovers in {0}'.format(accessPoint)) + tmpLog.debug("scanning leftovers in {0}".format(accessPoint)) # set the directory paths to scan for left over files dirs = [] if self.outputSubDir is None: - dirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) - if os.path.isdir(os.path.join(accessPoint, name))] + dirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) if os.path.isdir(os.path.join(accessPoint, name))] else: # loop over directories first level from accessPoint and then add subdirectory name. - upperdirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) - if os.path.isdir(os.path.join(accessPoint, name))] - dirs = [os.path.join(dirname, self.outputSubDir) for dirname in upperdirs - if os.path.isdir(os.path.join(dirname, self.outputSubDir))] + upperdirs = [os.path.join(accessPoint, name) for name in os.listdir(accessPoint) if os.path.isdir(os.path.join(accessPoint, name))] + dirs = [os.path.join(dirname, self.outputSubDir) for dirname in upperdirs if os.path.isdir(os.path.join(dirname, self.outputSubDir))] patterns = [] patterns_zip = [] - for tmp_patterns, tmp_left_over_patterns in \ - [[patterns, self.leftOverPatterns], [patterns_zip, self.leftOverZipPatterns]]: + for tmp_patterns, tmp_left_over_patterns in [[patterns, self.leftOverPatterns], [patterns_zip, self.leftOverZipPatterns]]: if tmp_left_over_patterns is None: continue for scanPat in tmp_left_over_patterns: # replace placeholders - if '%PANDAID' in scanPat: - scanPat = scanPat.replace('%PANDAID', str(jobSpec.PandaID)) - if '%TASKID' in scanPat: - scanPat = scanPat.replace('%TASKID', str(jobSpec.taskID)) - if '%OUTPUT_FILE' in scanPat: - logFileName = jobSpec.get_logfile_info()['lfn'] + if "%PANDAID" in scanPat: + scanPat = scanPat.replace("%PANDAID", str(jobSpec.PandaID)) + if "%TASKID" in scanPat: + scanPat = scanPat.replace("%TASKID", str(jobSpec.taskID)) + if "%OUTPUT_FILE" in scanPat: + logFileName = jobSpec.get_logfile_info()["lfn"] for outputName in jobSpec.get_output_file_attributes().keys(): if outputName == logFileName: continue - tmp_patterns.append(scanPat.replace('%OUTPUT_FILE', outputName)) + tmp_patterns.append(scanPat.replace("%OUTPUT_FILE", outputName)) else: tmp_patterns.append(scanPat) # scan files nLeftOvers = 0 - with Pool(max_workers=self.maxWorkersForZip if self.maxWorkersForZip - else multiprocessing.cpu_count()) as pool: - retValList = pool.map(scan_files_in_dir, dirs, [patterns] * len(dirs), - [patterns_zip] * len(dirs)) + with Pool(max_workers=self.maxWorkersForZip if self.maxWorkersForZip else multiprocessing.cpu_count()) as pool: + retValList = pool.map(scan_files_in_dir, dirs, [patterns] * len(dirs), [patterns_zip] * len(dirs)) for retVal in retValList: fileDict.setdefault(jobSpec.PandaID, []) fileDict[jobSpec.PandaID] += retVal nLeftOvers += len(retVal) - tmpLog.debug('got {0} leftovers'.format(nLeftOvers)) + tmpLog.debug("got 
{0} leftovers".format(nLeftOvers)) # look into task-level work state file taskAccessDir = self.get_task_access_point(workspec, jobSpec) if taskAccessDir: @@ -785,39 +758,39 @@ def post_processing(self, workspec, jobspec_list, map_type): if "merged" in tmpData: output_lfns = set() fileDict.setdefault(jobSpec.PandaID, []) - for tmpIn, tmpOuts in iteritems(tmpData['merged']): + for tmpIn, tmpOuts in iteritems(tmpData["merged"]): for tmpLFN, tmpFileDict in iteritems(tmpOuts): if tmpLFN in output_lfns: continue output_lfns.add(tmpLFN) nInTaskState += 1 - pfn = tmpFileDict['path'] - if 'fsize' not in tmpFileDict: - tmpFileDict['fsize'] = os.stat(pfn).st_size - tmpFileDict['type'] = 'output' - if 'guid' not in tmpFileDict: - tmpFileDict['guid'] = str(uuid.uuid4()) - if 'chksum' not in tmpFileDict: - tmpFileDict['chksum'] = core_utils.calc_adler32(pfn) + pfn = tmpFileDict["path"] + if "fsize" not in tmpFileDict: + tmpFileDict["fsize"] = os.stat(pfn).st_size + tmpFileDict["type"] = "output" + if "guid" not in tmpFileDict: + tmpFileDict["guid"] = str(uuid.uuid4()) + if "chksum" not in tmpFileDict: + tmpFileDict["chksum"] = core_utils.calc_adler32(pfn) fileDict.setdefault(jobSpec.PandaID, []) fileDict[jobSpec.PandaID].append(tmpFileDict) doneInputs.add(tmpIn) except Exception: core_utils.dump_error_message(tmpLog) - tmpLog.error('failed to parse task-level work state file {0}'.format(taskWorkStatePath)) + tmpLog.error("failed to parse task-level work state file {0}".format(taskWorkStatePath)) raise - tmpLog.debug('got {0} output files from task state file'.format(nInTaskState)) + tmpLog.debug("got {0} output files from task state file".format(nInTaskState)) # skipped files skippedInputs = [fileSpec.lfn for fileSpec in jobSpec.inFiles if fileSpec.lfn not in doneInputs] - with open(os.path.join(accessPoint, postProcessAttrs), 'w') as f: - json.dump({'skippedInputs': skippedInputs}, f) - tmpLog.debug('set {0} input files to skip'.format(len(skippedInputs))) + with open(os.path.join(accessPoint, postProcessAttrs), "w") as f: + json.dump({"skippedInputs": skippedInputs}, f) + tmpLog.debug("set {0} input files to skip".format(len(skippedInputs))) # make json to stage-out if len(fileDict) > 0: jsonFilePath = os.path.join(origAccessPoint, jsonOutputsFileName) - with open(jsonFilePath, 'w') as jsonFile: + with open(jsonFilePath, "w") as jsonFile: json.dump(fileDict, jsonFile) - tmpLog.debug('done') + tmpLog.debug("done") return True except Exception: core_utils.dump_error_message(tmpLog) @@ -826,96 +799,92 @@ def post_processing(self, workspec, jobspec_list, map_type): # get PandaIDs for pull model def get_panda_ids(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='get_panda_ids') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_panda_ids") # look for the json just under the access point jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile) - tmpLog.debug('looking for PandaID file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for PandaID file {0}".format(jsonFilePath)) retVal = [] if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") return retVal try: with open(jsonFilePath) as jsonFile: retVal = json.load(jsonFile) except Exception: - tmpLog.debug('failed to load json') + tmpLog.debug("failed to load json") return retVal - tmpLog.debug('found') + tmpLog.debug("found") return retVal # check if requested to 
kill the worker itself def kill_requested(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_requested') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="kill_requested") # look for the json just under the access point jsonFilePath = os.path.join(workspec.get_access_point(), killWorkerFile) - tmpLog.debug('looking for kill request file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for kill request file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - tmpLog.debug('not found') + tmpLog.debug("not found") return False - tmpLog.debug('kill requested') + tmpLog.debug("kill requested") return True # check if the worker is alive def is_alive(self, workspec, time_limit): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='is_alive') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="is_alive") # json file jsonFilePath = os.path.join(workspec.get_access_point(), heartbeatFile) - tmpLog.debug('looking for heartbeat file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for heartbeat file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # no heartbeat file was found - tmpLog.debug('startTime: {0}, now: {1}'.format(workspec.startTime, datetime.datetime.utcnow())) + tmpLog.debug("startTime: {0}, now: {1}".format(workspec.startTime, datetime.datetime.utcnow())) if not workspec.startTime: # the worker didn't even have time to start - tmpLog.debug('heartbeat not found, but no startTime yet for worker') + tmpLog.debug("heartbeat not found, but no startTime yet for worker") return True elif datetime.datetime.utcnow() - workspec.startTime < datetime.timedelta(minutes=time_limit): # the worker is too young and maybe didn't have time to generate the heartbeat - tmpLog.debug('heartbeat not found, but worker too young') + tmpLog.debug("heartbeat not found, but worker too young") return True else: # the worker is old and the heartbeat should be expected - tmpLog.debug('not found') + tmpLog.debug("not found") return None try: mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(jsonFilePath)) - tmpLog.debug('last modification time : {0}'.format(mtime)) + tmpLog.debug("last modification time : {0}".format(mtime)) if datetime.datetime.utcnow() - mtime > datetime.timedelta(minutes=time_limit): - tmpLog.debug('too old') + tmpLog.debug("too old") return False - tmpLog.debug('OK') + tmpLog.debug("OK") return True except Exception: - tmpLog.debug('failed to get mtime') + tmpLog.debug("failed to get mtime") return None # clean up. 
Called by sweeper agent to clean up stuff made by messenger for the worker # for shared_file_messenger, clean up worker the directory of access point def clean_up(self, workspec): # get logger - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), - method_name='clean_up') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="clean_up") # Remove from top directory of access point of worker - errStr = '' + errStr = "" worker_accessPoint = workspec.get_access_point() if os.path.isdir(worker_accessPoint): try: shutil.rmtree(worker_accessPoint) except Exception as _e: - errStr = 'failed to remove directory {0} : {1}'.format(worker_accessPoint, _e) + errStr = "failed to remove directory {0} : {1}".format(worker_accessPoint, _e) tmpLog.error(errStr) else: - tmpLog.debug('done') + tmpLog.debug("done") return (True, errStr) elif not os.path.exists(worker_accessPoint): - tmpLog.debug('accessPoint directory already gone. Skipped') + tmpLog.debug("accessPoint directory already gone. Skipped") return (None, errStr) else: - errStr = '{0} is not a directory'.format(worker_accessPoint) + errStr = "{0} is not a directory".format(worker_accessPoint) tmpLog.error(errStr) return (False, errStr) diff --git a/pandaharvester/harvestermiddleware/direct_ssh_bot.py b/pandaharvester/harvestermiddleware/direct_ssh_bot.py index b96b6664..537fcba3 100644 --- a/pandaharvester/harvestermiddleware/direct_ssh_bot.py +++ b/pandaharvester/harvestermiddleware/direct_ssh_bot.py @@ -6,48 +6,47 @@ from pandaharvester.harvestercore.plugin_factory import PluginFactory # logger -_logger = core_utils.setup_logger('direct_ssh_bot') +_logger = core_utils.setup_logger("direct_ssh_bot") # SSH bot runs a function and exits immediately class DirectSshBot(object): - # execution def run(self): tmpLog = _logger try: # get parameters param_dict = json.load(sys.stdin) - plugin_config = param_dict['plugin_config'] - function_name = param_dict['function_name'] - tmpLog = core_utils.make_logger(_logger, 'pid={0}'.format(os.getpid()), - method_name=function_name) - tmpLog.debug('start') - args = core_utils.unpickle_from_text(str(param_dict['args'])) - kwargs = core_utils.unpickle_from_text(str(param_dict['kwargs'])) + plugin_config = param_dict["plugin_config"] + function_name = param_dict["function_name"] + tmpLog = core_utils.make_logger(_logger, "pid={0}".format(os.getpid()), method_name=function_name) + tmpLog.debug("start") + args = core_utils.unpickle_from_text(str(param_dict["args"])) + kwargs = core_utils.unpickle_from_text(str(param_dict["kwargs"])) # get plugin pluginFactory = PluginFactory(no_db=True) core = pluginFactory.get_plugin(plugin_config) # execute ret = getattr(core, function_name)(*args, **kwargs) # make return - return_dict = {'return': core_utils.pickle_to_text(ret), - 'args': core_utils.pickle_to_text(args), - 'kwargs': core_utils.pickle_to_text(kwargs)} - tmpLog.debug('done') + return_dict = {"return": core_utils.pickle_to_text(ret), "args": core_utils.pickle_to_text(args), "kwargs": core_utils.pickle_to_text(kwargs)} + tmpLog.debug("done") except Exception as e: errMsg = core_utils.dump_error_message(tmpLog) - return_dict = {'exception': core_utils.pickle_to_text(e), - 'dialog': core_utils.pickle_to_text(errMsg)} + return_dict = {"exception": core_utils.pickle_to_text(e), "dialog": core_utils.pickle_to_text(errMsg)} return json.dumps(return_dict) + # main body + + def main(): # run bot bot = DirectSshBot() ret = bot.run() # propagate results via 
stdout - print (ret) + print(ret) + if __name__ == "__main__": main() diff --git a/pandaharvester/harvestermiddleware/direct_ssh_herder.py b/pandaharvester/harvestermiddleware/direct_ssh_herder.py index a200a62c..fafbdd6d 100644 --- a/pandaharvester/harvestermiddleware/direct_ssh_herder.py +++ b/pandaharvester/harvestermiddleware/direct_ssh_herder.py @@ -8,12 +8,12 @@ from .ssh_master_pool import sshMasterPool # logger -_logger = core_utils.setup_logger('direct_ssh_herder') +_logger = core_utils.setup_logger("direct_ssh_herder") # is mutable object to handle def is_mutable(obj): - return isinstance(obj, (list, dict)) or hasattr(obj, '__dict__') + return isinstance(obj, (list, dict)) or hasattr(obj, "__dict__") # update changes recursively of an object from a new object @@ -40,14 +40,14 @@ def update_object(old_obj, new_obj): update_object(old_obj[k], new_obj[k]) else: old_obj[k] = new_obj[k] - elif hasattr(old_obj, '__dict__'): + elif hasattr(old_obj, "__dict__"): for k in old_obj.__dict__: try: new_obj.__dict__[k] except KeyError: pass else: - if k in ['isNew', 'new_status']: + if k in ["isNew", "new_status"]: # skip attributes omitted in workspec pickling pass elif is_mutable(old_obj.__dict__[k]): @@ -58,7 +58,6 @@ def update_object(old_obj, new_obj): # function class class Method(object): - # constructor def __init__(self, plugin_config, function_name, conn): self.plugin_config = plugin_config @@ -68,74 +67,73 @@ def __init__(self, plugin_config, function_name, conn): # execution def __call__(self, *args, **kwargs): tmpLog = core_utils.make_logger(_logger, method_name=self.function_name) - tmpLog.debug('start') + tmpLog.debug("start") if self.conn is None: - tmpLog.warning('connection is not alive; method {0} returns None'.format(self.function_name)) + tmpLog.warning("connection is not alive; method {0} returns None".format(self.function_name)) return None - params = {'plugin_config': self.plugin_config, - 'function_name': self.function_name, - 'args': core_utils.pickle_to_text(args), - 'kwargs': core_utils.pickle_to_text(kwargs)} + params = { + "plugin_config": self.plugin_config, + "function_name": self.function_name, + "args": core_utils.pickle_to_text(args), + "kwargs": core_utils.pickle_to_text(kwargs), + } stdout, stderr = self.conn.communicate(input=six.b(json.dumps(params))) if self.conn.returncode == 0: return_dict = json.loads(stdout) - if 'exception' in return_dict: - errMsg = core_utils.unpickle_from_text(str(return_dict['dialog'])) - tmpLog.error('Exception from remote : ' + errMsg) - raise core_utils.unpickle_from_text(str(return_dict['exception'])) + if "exception" in return_dict: + errMsg = core_utils.unpickle_from_text(str(return_dict["dialog"])) + tmpLog.error("Exception from remote : " + errMsg) + raise core_utils.unpickle_from_text(str(return_dict["exception"])) # propagate changes in mutable args - new_args = core_utils.unpickle_from_text(str(return_dict['args'])) + new_args = core_utils.unpickle_from_text(str(return_dict["args"])) for old_arg, new_arg in zip(args, new_args): update_object(old_arg, new_arg) - new_kwargs = core_utils.unpickle_from_text(str(return_dict['kwargs'])) + new_kwargs = core_utils.unpickle_from_text(str(return_dict["kwargs"])) for key in kwargs: old_kwarg = kwargs[key] new_kwarg = new_kwargs[key] update_object(old_kwarg, new_kwarg) - return core_utils.unpickle_from_text(str(return_dict['return'])) + return core_utils.unpickle_from_text(str(return_dict["return"])) else: - tmpLog.error('execution failed with {0}; method={1} returns 
None'.format(self.conn.returncode, - self.function_name)) + tmpLog.error("execution failed with {0}; method={1} returns None".format(self.conn.returncode, self.function_name)) return None # Direct SSH herder class DirectSshHerder(PluginBase): - # constructor def __init__(self, **kwarg): - tmpLog = core_utils.make_logger(_logger, method_name='__init__') + tmpLog = core_utils.make_logger(_logger, method_name="__init__") PluginBase.__init__(self, **kwarg) self.bare_impl = None - self.sshUserName = getattr(self, 'sshUserName', None) - self.sshPassword = getattr(self, 'sshPassword', None) - self.privateKey = getattr(self, 'privateKey', None) - self.passPhrase = getattr(self, 'passPhrase', None) - self.jumpHost = getattr(self, 'jumpHost', None) - self.jumpPort = getattr(self, 'jumpPort', 22) - self.remoteHost = getattr(self, 'remoteHost', None) - self.remotePort = getattr(self, 'remotePort', 22) - self.bareFunctions = getattr(self, 'bareFunctions', list()) - self.sockDir = getattr(self, 'sockDir', '/tmp') - self.numMasters = getattr(self, 'numMasters', 1) - self.execStr = getattr(self, 'execStr', '') - self.connectionLifetime = getattr(self, 'connectionLifetime', None) + self.sshUserName = getattr(self, "sshUserName", None) + self.sshPassword = getattr(self, "sshPassword", None) + self.privateKey = getattr(self, "privateKey", None) + self.passPhrase = getattr(self, "passPhrase", None) + self.jumpHost = getattr(self, "jumpHost", None) + self.jumpPort = getattr(self, "jumpPort", 22) + self.remoteHost = getattr(self, "remoteHost", None) + self.remotePort = getattr(self, "remotePort", 22) + self.bareFunctions = getattr(self, "bareFunctions", list()) + self.sockDir = getattr(self, "sockDir", "/tmp") + self.numMasters = getattr(self, "numMasters", 1) + self.execStr = getattr(self, "execStr", "") + self.connectionLifetime = getattr(self, "connectionLifetime", None) try: self._get_connection() except Exception as e: core_utils.dump_error_message(tmpLog) - tmpLog.error('failed to get connection') + tmpLog.error("failed to get connection") # get attribute def __getattr__(self, item): if item in self.__dict__: - return self.__dict__[item] + return self.__dict__[item] # bare functions - if 'bareFunctions' in self.__dict__ and self.__dict__['bareFunctions'] is not None \ - and item in self.__dict__['bareFunctions']: - return getattr(object.__getattribute__(self, 'bare_impl'), item) + if "bareFunctions" in self.__dict__ and self.__dict__["bareFunctions"] is not None and item in self.__dict__["bareFunctions"]: + return getattr(object.__getattribute__(self, "bare_impl"), item) # remote functions - bare_impl = object.__getattribute__(self, 'bare_impl') + bare_impl = object.__getattribute__(self, "bare_impl") if hasattr(bare_impl, item): if isinstance(getattr(bare_impl, item), types.MethodType): conn = self._get_connection() @@ -147,16 +145,24 @@ def __getattr__(self, item): # ssh connection def _get_connection(self): - tmpLog = core_utils.make_logger(_logger, method_name='_get_connection') - tmpLog.debug('start') - sshMasterPool.make_control_master(self.remoteHost, self.remotePort, self.numMasters, - ssh_username=self.sshUserName, ssh_password=self.sshPassword, - private_key=self.privateKey, pass_phrase=self.passPhrase, - jump_host=self.jumpHost, jump_port=self.jumpPort, sock_dir=self.sockDir, - connection_lifetime=self.connectionLifetime) + tmpLog = core_utils.make_logger(_logger, method_name="_get_connection") + tmpLog.debug("start") + sshMasterPool.make_control_master( + self.remoteHost, + self.remotePort, + 
self.numMasters, + ssh_username=self.sshUserName, + ssh_password=self.sshPassword, + private_key=self.privateKey, + pass_phrase=self.passPhrase, + jump_host=self.jumpHost, + jump_port=self.jumpPort, + sock_dir=self.sockDir, + connection_lifetime=self.connectionLifetime, + ) conn = sshMasterPool.get_connection(self.remoteHost, self.remotePort, self.execStr) if conn is not None: - tmpLog.debug('connected successfully') + tmpLog.debug("connected successfully") else: - tmpLog.error('failed to connect') + tmpLog.error("failed to connect") return conn diff --git a/pandaharvester/harvestermiddleware/rpc_bot.py b/pandaharvester/harvestermiddleware/rpc_bot.py index 23ad9c25..3a21ef73 100644 --- a/pandaharvester/harvestermiddleware/rpc_bot.py +++ b/pandaharvester/harvestermiddleware/rpc_bot.py @@ -10,8 +10,8 @@ # rpyc configuration -rpyc.core.protocol.DEFAULT_CONFIG['allow_pickle'] = True -rpyc.core.protocol.DEFAULT_CONFIG['sync_request_timeout'] = 1800 +rpyc.core.protocol.DEFAULT_CONFIG["allow_pickle"] = True +rpyc.core.protocol.DEFAULT_CONFIG["sync_request_timeout"] = 1800 # logger setup @@ -20,19 +20,21 @@ def setupLogger(logger, pid=None, to_file=None): hdlr = logging.FileHandler(to_file) else: hdlr = logging.StreamHandler() + def emit_decorator(fn): def func(*args): - formatter = logging.Formatter('%(asctime)s %(levelname)s]({0})(%(name)s.%(funcName)s) %(message)s'.format(pid)) + formatter = logging.Formatter("%(asctime)s %(levelname)s]({0})(%(name)s.%(funcName)s) %(message)s".format(pid)) hdlr.setFormatter(formatter) return fn(*args) + return func + hdlr.emit = emit_decorator(hdlr.emit) logger.addHandler(hdlr) # RPC bot running on remote node class RpcBot(rpyc.Service): - # initialization action def on_connect(self, conn): self.pluginFactory = PluginFactory(no_db=True) @@ -185,33 +187,24 @@ def exposed_resolve_input_paths(self, plugin_config, jobspec): def main(): # arguments parser = argparse.ArgumentParser() - parser.add_argument('--pid', action='store', dest='pid', default='/var/tmp/harvester_rpc.pid', - help='pid filename') - parser.add_argument('--port', dest='port', type=int, default=18861, - help='the TCP port to bind to') - parser.add_argument('--backlog', dest='backlog', type=int, default=10, - help='backlog for the port') - parser.add_argument('--stdout', action='store', dest='stdout', default='/var/tmp/harvester_rpc.out', - help='stdout filename') - parser.add_argument('--stderr', action='store', dest='stderr', default='/var/tmp/harvester_rpc.err', - help='stderr filename') + parser.add_argument("--pid", action="store", dest="pid", default="/var/tmp/harvester_rpc.pid", help="pid filename") + parser.add_argument("--port", dest="port", type=int, default=18861, help="the TCP port to bind to") + parser.add_argument("--backlog", dest="backlog", type=int, default=10, help="backlog for the port") + parser.add_argument("--stdout", action="store", dest="stdout", default="/var/tmp/harvester_rpc.out", help="stdout filename") + parser.add_argument("--stderr", action="store", dest="stderr", default="/var/tmp/harvester_rpc.err", help="stderr filename") options = parser.parse_args() # logger - _logger = logging.getLogger('rpc_bot') + _logger = logging.getLogger("rpc_bot") setupLogger(_logger, pid=os.getpid()) # make daemon context - outfile = open(options.stdout, 'a+') - errfile = open(options.stderr, 'a+') - dc = daemon.DaemonContext( - pidfile=daemon.pidfile.PIDLockFile(options.pid), - stdout=outfile, - stderr=errfile) + outfile = open(options.stdout, "a+") + errfile = open(options.stderr, "a+") 
+ dc = daemon.DaemonContext(pidfile=daemon.pidfile.PIDLockFile(options.pid), stdout=outfile, stderr=errfile) # run thread server with dc: from rpyc.utils.server import ThreadPoolServer - t = ThreadPoolServer(RpcBot, port=options.port, backlog=options.backlog, - logger=_logger, - protocol_config={"allow_all_attrs": True}) + + t = ThreadPoolServer(RpcBot, port=options.port, backlog=options.backlog, logger=_logger, protocol_config={"allow_all_attrs": True}) t.start() # finalize outfile.close() diff --git a/pandaharvester/harvestermiddleware/rpc_herder.py b/pandaharvester/harvestermiddleware/rpc_herder.py index cb3552a6..d924cda8 100644 --- a/pandaharvester/harvestermiddleware/rpc_herder.py +++ b/pandaharvester/harvestermiddleware/rpc_herder.py @@ -8,17 +8,16 @@ # logger -_logger = core_utils.setup_logger('rpc_herder') +_logger = core_utils.setup_logger("rpc_herder") # rpyc configuration -rpyc.core.protocol.DEFAULT_CONFIG['allow_pickle'] = True -rpyc.core.protocol.DEFAULT_CONFIG['sync_request_timeout'] = 1800 +rpyc.core.protocol.DEFAULT_CONFIG["allow_pickle"] = True +rpyc.core.protocol.DEFAULT_CONFIG["sync_request_timeout"] = 1800 # RPC herder class RpcHerder(PluginBase): - # decorator def require_alive(func): @functools.wraps(func) @@ -30,44 +29,51 @@ def wrapper(self, *args, **kwargs): return rpyc.utils.classic.obtain(retVal) else: tmpLog = core_utils.make_logger(_logger, method_name=func.__name__) - tmpLog.warning('instance not alive; method {0} returns None'.format(func.__name__)) + tmpLog.warning("instance not alive; method {0} returns None".format(func.__name__)) return None + return wrapper # constructor def __init__(self, **kwarg): - tmpLog = core_utils.make_logger(_logger, method_name='__init__') + tmpLog = core_utils.make_logger(_logger, method_name="__init__") PluginBase.__init__(self, **kwarg) - self.sshUserName = getattr(self, 'sshUserName', None) - self.sshPassword = getattr(self, 'sshPassword', None) - self.privateKey = getattr(self, 'privateKey', None) - self.passPhrase = getattr(self, 'passPhrase', None) - self.jumpHost = getattr(self, 'jumpHost', None) - self.jumpPort = getattr(self, 'jumpPort', 22) - self.remotePort = getattr(self, 'remotePort', 22) - self.bareFunctions = getattr(self, 'bareFunctions', list()) + self.sshUserName = getattr(self, "sshUserName", None) + self.sshPassword = getattr(self, "sshPassword", None) + self.privateKey = getattr(self, "privateKey", None) + self.passPhrase = getattr(self, "passPhrase", None) + self.jumpHost = getattr(self, "jumpHost", None) + self.jumpPort = getattr(self, "jumpPort", 22) + self.remotePort = getattr(self, "remotePort", 22) + self.bareFunctions = getattr(self, "bareFunctions", list()) # is connected only if ssh forwarding works self.is_connected = False try: self._get_connection() except Exception as e: - tmpLog.error('failed to get connection ; {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("failed to get connection ; {0}: {1}".format(e.__class__.__name__, e)) else: self.is_connected = True # ssh and rpc connect def _get_connection(self): - tmpLog = core_utils.make_logger(_logger, method_name='_get_connection') - tmpLog.debug('start') - sshTunnelPool.make_tunnel_server(self.remoteHost, self.remotePort, self.remoteBindPort, self.numTunnels, - ssh_username=self.sshUserName, ssh_password=self.sshPassword, - private_key=self.privateKey, pass_phrase=self.passPhrase, - jump_host=self.jumpHost, jump_port=self.jumpPort) + tmpLog = core_utils.make_logger(_logger, method_name="_get_connection") + tmpLog.debug("start") + 
sshTunnelPool.make_tunnel_server( + self.remoteHost, + self.remotePort, + self.remoteBindPort, + self.numTunnels, + ssh_username=self.sshUserName, + ssh_password=self.sshPassword, + private_key=self.privateKey, + pass_phrase=self.passPhrase, + jump_host=self.jumpHost, + jump_port=self.jumpPort, + ) tunnelHost, tunnelPort, tunnelCore = sshTunnelPool.get_tunnel(self.remoteHost, self.remotePort) - self.conn = rpyc.connect(tunnelHost, tunnelPort, config={"allow_all_attrs": True, - "allow_setattr": True, - "allow_delattr": True}) - tmpLog.debug('connected successfully to {0}:{1}'.format(tunnelHost, tunnelPort)) + self.conn = rpyc.connect(tunnelHost, tunnelPort, config={"allow_all_attrs": True, "allow_setattr": True, "allow_delattr": True}) + tmpLog.debug("connected successfully to {0}:{1}".format(tunnelHost, tunnelPort)) ###################### # submitter section @@ -75,15 +81,15 @@ def _get_connection(self): # submit workers @require_alive def submit_workers(self, workspec_list): - tmpLog = core_utils.make_logger(_logger, method_name='submit_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="submit_workers") + tmpLog.debug("start") try: ret = self.conn.root.submit_workers(self.original_config, workspec_list) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret ###################### @@ -92,15 +98,15 @@ def submit_workers(self, workspec_list): # check workers @require_alive def check_workers(self, workspec_list): - tmpLog = core_utils.make_logger(_logger, method_name='check_workers') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="check_workers") + tmpLog.debug("start") try: ret = self.conn.root.check_workers(self.original_config, workspec_list) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret ###################### @@ -109,15 +115,15 @@ def check_workers(self, workspec_list): # kill worker @require_alive def kill_worker(self, workspec): - tmpLog = core_utils.make_logger(_logger, method_name='kill_worker') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="kill_worker") + tmpLog.debug("start") try: ret = self.conn.root.kill_worker(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # FIXME: cannot have this yet otherwise sweeper agent see this method while the real plugin may not implemented this method yet... 
@@ -138,15 +144,15 @@ def kill_worker(self, workspec): # cleanup for a worker @require_alive def sweep_worker(self, workspec): - tmpLog = core_utils.make_logger(_logger, method_name='sweep_worker') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="sweep_worker") + tmpLog.debug("start") try: ret = self.conn.root.sweep_worker(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret ###################### @@ -155,197 +161,197 @@ def sweep_worker(self, workspec): # setup access points @require_alive def setup_access_points(self, workspec_list): - tmpLog = core_utils.make_logger(_logger, method_name='setup_access_points') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="setup_access_points") + tmpLog.debug("start") try: ret = self.conn.root.setup_access_points(self.original_config, workspec_list) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # feed jobs @require_alive def feed_jobs(self, workspec, jobspec_list): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='feed_jobs') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="feed_jobs") + tmpLog.debug("start") try: ret = self.conn.root.feed_jobs(self.original_config, workspec, jobspec_list) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # request job @require_alive def job_requested(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='job_requested') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="job_requested") + tmpLog.debug("start") try: ret = self.conn.root.job_requested(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # request kill @require_alive def kill_requested(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='kill_requested') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="kill_requested") + tmpLog.debug("start") try: ret = self.conn.root.kill_requested(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # is alive @require_alive def is_alive(self, workspec, worker_heartbeat_limit): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='is_alive') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="is_alive") + tmpLog.debug("start") try: ret = self.conn.root.is_alive(self.original_config, workspec, worker_heartbeat_limit) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # get work attributes @require_alive def get_work_attributes(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='get_work_attributes') - tmpLog.debug('start') + tmpLog = 
core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_work_attributes") + tmpLog.debug("start") try: ret = self.conn.root.get_work_attributes(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # get output files @require_alive def get_files_to_stage_out(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='get_files_to_stage_out') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_files_to_stage_out") + tmpLog.debug("start") try: ret = self.conn.root.get_files_to_stage_out(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # feed events @require_alive def feed_events(self, workspec, events_dict): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='feed_events') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="feed_events") + tmpLog.debug("start") try: ret = self.conn.root.feed_events(self.original_config, workspec, events_dict) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # get events @require_alive def events_to_update(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='events_to_update') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="events_to_update") + tmpLog.debug("start") try: ret = self.conn.root.events_to_update(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # request events @require_alive def events_requested(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='events_requested') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="events_requested") + tmpLog.debug("start") try: ret = self.conn.root.events_requested(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # get PandaIDs @require_alive def get_panda_ids(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='get_panda_ids') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="get_panda_ids") + tmpLog.debug("start") try: ret = self.conn.root.get_panda_ids(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # post processing @require_alive def post_processing(self, workspec, jobspec_list, map_type): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='post_processing') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="post_processing") + tmpLog.debug("start") try: ret = 
self.conn.root.post_processing(self.original_config, workspec, jobspec_list, map_type) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # send ACK @require_alive def acknowledge_events_files(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='acknowledge_events_files') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="acknowledge_events_files") + tmpLog.debug("start") try: ret = self.conn.root.acknowledge_events_files(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # clean up @require_alive def clean_up(self, workspec): - tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='clean_up') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, "workerID={0}".format(workspec.workerID), method_name="clean_up") + tmpLog.debug("start") try: ret = self.conn.root.clean_up(self.original_config, workspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret ###################### @@ -354,43 +360,43 @@ def clean_up(self, workspec): # check stage out status @require_alive def check_stage_out_status(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="check_stage_out_status") + tmpLog.debug("start") try: ret = self.conn.root.check_stage_out_status(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # trigger stage out @require_alive def trigger_stage_out(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="trigger_stage_out") + tmpLog.debug("start") try: ret = self.conn.root.trigger_stage_out(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # zip output files @require_alive def zip_output(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='zip_output') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="zip_output") + tmpLog.debug("start") try: ret = self.conn.root.zip_output(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret ###################### @@ -399,41 +405,41 @@ def zip_output(self, jobspec): # check stage in status @require_alive def check_stage_in_status(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='check_stage_in_status') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="check_stage_in_status") + tmpLog.debug("start") try: ret = self.conn.root.check_stage_in_status(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # trigger preparation @require_alive def trigger_preparation(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='trigger_preparation') - 
tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="trigger_preparation") + tmpLog.debug("start") try: ret = self.conn.root.trigger_preparation(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret # resolve input file paths @require_alive def resolve_input_paths(self, jobspec): - tmpLog = core_utils.make_logger(_logger, method_name='resolve_input_paths') - tmpLog.debug('start') + tmpLog = core_utils.make_logger(_logger, method_name="resolve_input_paths") + tmpLog.debug("start") try: ret = self.conn.root.resolve_input_paths(self.original_config, jobspec) except Exception: core_utils.dump_error_message(tmpLog) ret = None else: - tmpLog.debug('done') + tmpLog.debug("done") return ret diff --git a/pandaharvester/harvestermiddleware/ssh_master_pool.py b/pandaharvester/harvestermiddleware/ssh_master_pool.py index 8d5bc3eb..b9c36731 100644 --- a/pandaharvester/harvestermiddleware/ssh_master_pool.py +++ b/pandaharvester/harvestermiddleware/ssh_master_pool.py @@ -7,6 +7,7 @@ import six import pexpect import tempfile + try: import subprocess32 as subprocess except Exception: @@ -20,12 +21,11 @@ pexpect_spawn = pexpect.spawnu # logger -baseLogger = core_utils.setup_logger('ssh_master_pool') +baseLogger = core_utils.setup_logger("ssh_master_pool") # Pool of SSH control masters class SshMasterPool(object): - # constructor def __init__(self): self.lock = threading.Lock() @@ -34,13 +34,26 @@ def __init__(self): # make a dict key def make_dict_key(self, host, port): - return '{0}:{1}'.format(host, port) + return "{0}:{1}".format(host, port) # make a control master - def make_control_master(self, remote_host, remote_port, num_masters=1, - ssh_username=None, ssh_password=None, private_key=None, pass_phrase=None, - jump_host=None, jump_port=None, login_timeout=60, reconnect=False, - with_lock=True, sock_dir=None, connection_lifetime=None): + def make_control_master( + self, + remote_host, + remote_port, + num_masters=1, + ssh_username=None, + ssh_password=None, + private_key=None, + pass_phrase=None, + jump_host=None, + jump_port=None, + login_timeout=60, + reconnect=False, + with_lock=True, + sock_dir=None, + connection_lifetime=None, + ): dict_key = self.make_dict_key(remote_host, remote_port) if with_lock: self.lock.acquire() @@ -49,32 +62,33 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, self.pool[dict_key] = [] # preserve parameters if not reconnect: - self.params[dict_key] = {'num_masters': num_masters, - 'ssh_username': ssh_username, - 'ssh_password': ssh_password, - 'private_key': private_key, - 'pass_phrase': pass_phrase, - 'jump_host': jump_host, - 'jump_port': jump_port, - 'login_timeout': login_timeout, - 'sock_dir': sock_dir, - 'connection_lifetime': connection_lifetime, - } + self.params[dict_key] = { + "num_masters": num_masters, + "ssh_username": ssh_username, + "ssh_password": ssh_password, + "private_key": private_key, + "pass_phrase": pass_phrase, + "jump_host": jump_host, + "jump_port": jump_port, + "login_timeout": login_timeout, + "sock_dir": sock_dir, + "connection_lifetime": connection_lifetime, + } else: - num_masters = self.params[dict_key]['num_masters'] - ssh_username = self.params[dict_key]['ssh_username'] - ssh_password = self.params[dict_key]['ssh_password'] - private_key = self.params[dict_key]['private_key'] - pass_phrase = self.params[dict_key]['pass_phrase'] - jump_host = self.params[dict_key]['jump_host'] - 
jump_port = self.params[dict_key]['jump_port'] - login_timeout = self.params[dict_key]['login_timeout'] - sock_dir = self.params[dict_key]['sock_dir'] - connection_lifetime = self.params[dict_key]['connection_lifetime'] + num_masters = self.params[dict_key]["num_masters"] + ssh_username = self.params[dict_key]["ssh_username"] + ssh_password = self.params[dict_key]["ssh_password"] + private_key = self.params[dict_key]["private_key"] + pass_phrase = self.params[dict_key]["pass_phrase"] + jump_host = self.params[dict_key]["jump_host"] + jump_port = self.params[dict_key]["jump_port"] + login_timeout = self.params[dict_key]["login_timeout"] + sock_dir = self.params[dict_key]["sock_dir"] + connection_lifetime = self.params[dict_key]["connection_lifetime"] # make a master for i in range(num_masters - len(self.pool[dict_key])): # make a socket file - sock_file = os.path.join(sock_dir, 'sock_{0}_{1}'.format(remote_host, uuid.uuid4().hex)) + sock_file = os.path.join(sock_dir, "sock_{0}_{1}".format(remote_host, uuid.uuid4().hex)) com = "ssh -M -S {sock_file} " com += "-p {remote_port} {ssh_username}@{remote_host} " com += "-o ServerAliveInterval=120 -o ServerAliveCountMax=2 " @@ -82,22 +96,28 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, com += "-i {private_key} " if jump_host is not None and jump_port is not None: com += '-o ProxyCommand="ssh -p {jump_port} {ssh_username}@{jump_host} -W %h:%p" ' - com = com.format(remote_host=remote_host, remote_port=remote_port, - ssh_username=ssh_username, private_key=private_key, jump_host=jump_host, - jump_port=jump_port, sock_file=sock_file) - loginString = 'login_to_be_confirmed_with ' + uuid.uuid4().hex + com = com.format( + remote_host=remote_host, + remote_port=remote_port, + ssh_username=ssh_username, + private_key=private_key, + jump_host=jump_host, + jump_port=jump_port, + sock_file=sock_file, + ) + loginString = "login_to_be_confirmed_with " + uuid.uuid4().hex com += "'echo {0}; bash".format(loginString) # list of expected strings expected_list = [ pexpect.EOF, pexpect.TIMEOUT, "(?i)are you sure you want to continue connecting", - '(?i)password:', - '(?i)enter passphrase for key.*', + "(?i)password:", + "(?i)enter passphrase for key.*", loginString, - ] + ] c = pexpect_spawn(com, echo=False) - baseLogger.debug('pexpect_spawn') + baseLogger.debug("pexpect_spawn") c.logfile_read = baseLogger.handlers[0].stream isOK = False for iTry in range(3): @@ -108,8 +128,7 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, break if idx == 1: # timeout - baseLogger.error('timeout when making a master with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("timeout when making a master with com={0} out={1}".format(com, c.buffer)) c.close() break if idx == 2: @@ -118,8 +137,7 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, idx = c.expect(expected_list, timeout=login_timeout) if idx == 1: # timeout - baseLogger.error('timeout after accepting new cert with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("timeout after accepting new cert with com={0} out={1}".format(com, c.buffer)) c.close() break if idx == 3: @@ -129,12 +147,11 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, # passphrase prompt c.sendline(pass_phrase) elif idx == 0: - baseLogger.error('something weired with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("something weired with com={0} out={1}".format(com, c.buffer)) c.close() break # exec to confirm login - 
c.sendline('echo {0}'.format(loginString)) + c.sendline("echo {0}".format(loginString)) if isOK: conn_exp_time = (time.time() + connection_lifetime) if connection_lifetime is not None else None self.pool[dict_key].append((sock_file, c, conn_exp_time)) @@ -143,7 +160,7 @@ def make_control_master(self, remote_host, remote_port, num_masters=1, # get a connection def get_connection(self, remote_host, remote_port, exec_string): - baseLogger.debug('get_connection start') + baseLogger.debug("get_connection start") dict_key = self.make_dict_key(remote_host, remote_port) self.lock.acquire() active_masters = [] @@ -156,20 +173,18 @@ def get_connection(self, remote_host, remote_port, exec_string): self.pool[dict_key].remove((sock_file, child, conn_exp_time)) someClosed = True if child.isalive(): - baseLogger.debug('a connection process is dead') + baseLogger.debug("a connection process is dead") else: - baseLogger.debug('a connection is expired') + baseLogger.debug("a connection is expired") if someClosed: self.make_control_master(remote_host, remote_port, reconnect=True, with_lock=False) active_masters = [item for item in self.pool[dict_key] if os.path.exists(item[0])] - baseLogger.debug('reconnected; now {0} active connections'.format(len(active_masters))) + baseLogger.debug("reconnected; now {0} active connections".format(len(active_masters))) if len(active_masters) > 0: sock_file, child, conn_exp_time = random.choice(active_masters) - con = subprocess.Popen(['ssh', 'dummy', '-S', sock_file, exec_string], - shell=False, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + con = subprocess.Popen( + ["ssh", "dummy", "-S", sock_file, exec_string], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) else: con = None self.lock.release() diff --git a/pandaharvester/harvestermiddleware/ssh_tunnel_pool.py b/pandaharvester/harvestermiddleware/ssh_tunnel_pool.py index b6ca1805..724e53bf 100644 --- a/pandaharvester/harvestermiddleware/ssh_tunnel_pool.py +++ b/pandaharvester/harvestermiddleware/ssh_tunnel_pool.py @@ -14,12 +14,11 @@ pexpect_spawn = pexpect.spawnu # logger -baseLogger = core_utils.setup_logger('ssh_tunnel_pool') +baseLogger = core_utils.setup_logger("ssh_tunnel_pool") # Pool of SSH tunnels class SshTunnelPool(object): - # constructor def __init__(self): self.lock = threading.Lock() @@ -28,13 +27,25 @@ def __init__(self): # make a dict key def make_dict_key(self, host, port): - return '{0}:{1}'.format(host, port) + return "{0}:{1}".format(host, port) # make a tunnel server - def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, num_tunnels=1, - ssh_username=None, ssh_password=None, private_key=None, pass_phrase=None, - jump_host=None, jump_port=None, login_timeout=60, reconnect=False, - with_lock=True): + def make_tunnel_server( + self, + remote_host, + remote_port, + remote_bind_port=None, + num_tunnels=1, + ssh_username=None, + ssh_password=None, + private_key=None, + pass_phrase=None, + jump_host=None, + jump_port=None, + login_timeout=60, + reconnect=False, + with_lock=True, + ): dict_key = self.make_dict_key(remote_host, remote_port) if with_lock: self.lock.acquire() @@ -43,31 +54,32 @@ def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, nu self.pool[dict_key] = [] # preserve parameters if not reconnect: - self.params[dict_key] = {'remote_bind_port': remote_bind_port, - 'num_tunnels': num_tunnels, - 'ssh_username': ssh_username, - 'ssh_password': ssh_password, - 'private_key': 
private_key, - 'pass_phrase': pass_phrase, - 'jump_host': jump_host, - 'jump_port': jump_port, - 'login_timeout': login_timeout - } + self.params[dict_key] = { + "remote_bind_port": remote_bind_port, + "num_tunnels": num_tunnels, + "ssh_username": ssh_username, + "ssh_password": ssh_password, + "private_key": private_key, + "pass_phrase": pass_phrase, + "jump_host": jump_host, + "jump_port": jump_port, + "login_timeout": login_timeout, + } else: - remote_bind_port = self.params[dict_key]['remote_bind_port'] - num_tunnels = self.params[dict_key]['num_tunnels'] - ssh_username = self.params[dict_key]['ssh_username'] - ssh_password = self.params[dict_key]['ssh_password'] - private_key = self.params[dict_key]['private_key'] - pass_phrase = self.params[dict_key]['pass_phrase'] - jump_host = self.params[dict_key]['jump_host'] - jump_port = self.params[dict_key]['jump_port'] - login_timeout = self.params[dict_key]['login_timeout'] + remote_bind_port = self.params[dict_key]["remote_bind_port"] + num_tunnels = self.params[dict_key]["num_tunnels"] + ssh_username = self.params[dict_key]["ssh_username"] + ssh_password = self.params[dict_key]["ssh_password"] + private_key = self.params[dict_key]["private_key"] + pass_phrase = self.params[dict_key]["pass_phrase"] + jump_host = self.params[dict_key]["jump_host"] + jump_port = self.params[dict_key]["jump_port"] + login_timeout = self.params[dict_key]["login_timeout"] # make a tunnel server for i in range(num_tunnels - len(self.pool[dict_key])): # get a free port s = socket.socket() - s.bind(('', 0)) + s.bind(("", 0)) com = "ssh -L {local_bind_port}:127.0.0.1:{remote_bind_port} " com += "-p {remote_port} {ssh_username}@{remote_host} " com += "-o ServerAliveInterval=120 -o ServerAliveCountMax=2 " @@ -76,20 +88,27 @@ def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, nu if jump_port is not None: com += '-o ProxyCommand="ssh -p {jump_port} {ssh_username}@{jump_host} -W %h:%p" ' local_bind_port = s.getsockname()[1] - com = com.format(remote_host=remote_host, remote_port=remote_port, remote_bind_port=remote_bind_port, - ssh_username=ssh_username, private_key=private_key, jump_host=jump_host, - jump_port=jump_port, local_bind_port=local_bind_port) + com = com.format( + remote_host=remote_host, + remote_port=remote_port, + remote_bind_port=remote_bind_port, + ssh_username=ssh_username, + private_key=private_key, + jump_host=jump_host, + jump_port=jump_port, + local_bind_port=local_bind_port, + ) s.close() # list of expected strings - loginString = 'login_to_be_confirmed_with ' + uuid.uuid4().hex + loginString = "login_to_be_confirmed_with " + uuid.uuid4().hex expected_list = [ pexpect.EOF, pexpect.TIMEOUT, "(?i)are you sure you want to continue connecting", - '(?i)password:', - '(?i)enter passphrase for key.*', + "(?i)password:", + "(?i)enter passphrase for key.*", loginString, - ] + ] c = pexpect_spawn(com, echo=False) c.logfile_read = baseLogger.handlers[0].stream isOK = False @@ -101,8 +120,7 @@ def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, nu break if idx == 1: # timeout - baseLogger.error('timeout when making a tunnel with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("timeout when making a tunnel with com={0} out={1}".format(com, c.buffer)) c.close() break if idx == 2: @@ -111,8 +129,7 @@ def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, nu idx = c.expect(expected_list, timeout=login_timeout) if idx == 1: # timeout - baseLogger.error('timeout after accepting 
new cert with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("timeout after accepting new cert with com={0} out={1}".format(com, c.buffer)) c.close() break if idx == 3: @@ -122,12 +139,11 @@ def make_tunnel_server(self, remote_host, remote_port, remote_bind_port=None, nu # passphrase prompt c.sendline(pass_phrase) elif idx == 0: - baseLogger.error('something weired with com={0} out={1}'.format(com, - c.buffer)) + baseLogger.error("something weired with com={0} out={1}".format(com, c.buffer)) c.close() break # exec to confirm login - c.sendline('echo {0}'.format(loginString)) + c.sendline("echo {0}".format(loginString)) if isOK: self.pool[dict_key].append((local_bind_port, c)) if with_lock: diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index 2f7a6675..98176adf 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -13,8 +13,9 @@ from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestermisc.info_utils import PandaQueuesDict -_base_logger = core_utils.setup_logger('apfmon') -NO_CE = 'noCE' +_base_logger = core_utils.setup_logger("apfmon") +NO_CE = "noCE" + def apfmon_active(method, *args, **kwargs): if cls.__active: @@ -24,43 +25,41 @@ def apfmon_active(method, *args, **kwargs): def clean_ce(ce): - return ce.split('.')[0].split('://')[-1] + return ce.split(".")[0].split("://")[-1] class Apfmon(object): - def __init__(self, queue_config_mapper): - try: self.__active = harvester_config.apfmon.active - except: + except BaseException: self.__active = False try: self.__worker_timeout = harvester_config.apfmon.worker_timeout - except: + except BaseException: self.__worker_timeout = 0.5 try: self.__worker_update_timeout = harvester_config.apfmon.worker_timeout - except: + except BaseException: self.__worker_update_timeout = 0.2 try: self.__label_timeout = harvester_config.apfmon.worker_timeout - except: + except BaseException: self.__label_timeout = 1 # TODO: make proper exception handling and defaults try: self.harvester_id = harvester_config.master.harvester_id - except: - self.harvester_id = 'DUMMY' + except BaseException: + self.harvester_id = "DUMMY" try: self.base_url = harvester_config.apfmon.base_url - except: - self.base_url = 'http://apfmon.lancs.ac.uk/api' + except BaseException: + self.base_url = "http://apfmon.lancs.ac.uk/api" self.queue_config_mapper = queue_config_mapper @@ -70,46 +69,42 @@ def create_factory(self): """ start_time = time.time() - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='create_factory') + tmp_log = core_utils.make_logger(_base_logger, "harvester_id={0}".format(self.harvester_id), method_name="create_factory") if not self.__active: - tmp_log.debug('APFMon reporting not enabled') + tmp_log.debug("APFMon reporting not enabled") return try: - tmp_log.debug('start') + tmp_log.debug("start") - url = '{0}/factories/{1}'.format(self.base_url, self.harvester_id) + url = "{0}/factories/{1}".format(self.base_url, self.harvester_id) - f = {'url': 'url_to_logs', - 'email': 'atlas-adc-harvester-central-support@cern.ch', - 'version': panda_pkg_info.release_version} + f = {"url": "url_to_logs", "email": "atlas-adc-harvester-central-support@cern.ch", "version": panda_pkg_info.release_version} payload = json.dumps(f) r = requests.put(url, data=payload, timeout=self.__label_timeout) - tmp_log.debug('registration ended with {0} {1}'.format(r.status_code, r.text)) + 
tmp_log.debug("registration ended with {0} {1}".format(r.status_code, r.text)) end_time = time.time() - tmp_log.debug('done (took {0})'.format(end_time - start_time)) - except: - tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) + tmp_log.debug("done (took {0})".format(end_time - start_time)) + except BaseException: + tmp_log.error("Excepted with: {0}".format(traceback.format_exc())) def create_labels(self): """ Creates or updates a collection of labels (=panda queue+CE) """ start_time = time.time() - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='create_labels') + tmp_log = core_utils.make_logger(_base_logger, "harvester_id={0}".format(self.harvester_id), method_name="create_labels") if not self.__active: - tmp_log.debug('APFMon reporting not enabled') + tmp_log.debug("APFMon reporting not enabled") return try: - tmp_log.debug('start') + tmp_log.debug("start") - url = '{0}/labels'.format(self.base_url) + url = "{0}/labels".format(self.base_url) # get the active queues from the config mapper all_sites = self.queue_config_mapper.get_active_queues().keys() @@ -122,61 +117,56 @@ def create_labels(self): try: site_info = panda_queues_dict.get(site, dict()) if not site_info: - tmp_log.warning('No site info for {0}'.format(site)) + tmp_log.warning("No site info for {0}".format(site)) continue - + # when no CEs associated to a queue, e.g. P1, HPCs, etc. Try to see if there is something # in local configuration, otherwise set it to a dummy value try: - ce = self.queue_config_mapper.queueConfig[site].submitter['ceEndpoint'] - queues = [{'ce_endpoint': ce}] + ce = self.queue_config_mapper.queueConfig[site].submitter["ceEndpoint"] + queues = [{"ce_endpoint": ce}] except KeyError: - if site_info['queues']: - queues = site_info['queues'] + if site_info["queues"]: + queues = site_info["queues"] else: - queues = [{'ce_endpoint': NO_CE}] + queues = [{"ce_endpoint": NO_CE}] for queue in queues: try: - ce = clean_ce(queue['ce_endpoint']) - except: - ce = '' + ce = clean_ce(queue["ce_endpoint"]) + except BaseException: + ce = "" try: - ce_queue_id = queue['ce_queue_id'] + ce_queue_id = queue["ce_queue_id"] except KeyError: ce_queue_id = 0 - labels.append({'name': '{0}-{1}'.format(site, ce), - 'wmsqueue': site, - 'ce_queue_id': ce_queue_id, - 'factory': self.harvester_id}) - except: - tmp_log.error('Excepted for site {0} with: {1}'.format(site, traceback.format_exc())) + labels.append({"name": "{0}-{1}".format(site, ce), "wmsqueue": site, "ce_queue_id": ce_queue_id, "factory": self.harvester_id}) + except BaseException: + tmp_log.error("Excepted for site {0} with: {1}".format(site, traceback.format_exc())) continue payload = json.dumps(labels) r = requests.put(url, data=payload, timeout=self.__label_timeout) - tmp_log.debug('label creation for {0} ended with {1} {2}'.format(sites, r.status_code, r.text)) + tmp_log.debug("label creation for {0} ended with {1} {2}".format(sites, r.status_code, r.text)) end_time = time.time() - tmp_log.debug('done (took {0})'.format(end_time - start_time)) - except: - tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) + tmp_log.debug("done (took {0})".format(end_time - start_time)) + except BaseException: + tmp_log.error("Excepted with: {0}".format(traceback.format_exc())) def massage_label_data(self, data): - - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='massage_label_data') + tmp_log = core_utils.make_logger(_base_logger, 
"harvester_id={0}".format(self.harvester_id), method_name="massage_label_data") if not data: return data try: - any = data['ANY'] + any = data["ANY"] agg = {} for rtype in data: - if rtype == 'ANY': + if rtype == "ANY": continue else: for value in data[rtype]: @@ -184,14 +174,14 @@ def massage_label_data(self, data): agg[value] += data[rtype][value] if agg: - data['ANY'] = agg + data["ANY"] = agg else: - data['ANY'] = any + data["ANY"] = any - tmp_log.debug('Massaged to data: {0}'.format(data)) + tmp_log.debug("Massaged to data: {0}".format(data)) except Exception: - tmp_log.debug('Exception in data: {0}'.format(data)) + tmp_log.debug("Exception in data: {0}".format(data)) return data @@ -200,15 +190,14 @@ def update_label(self, site, msg, data): Updates a label (=panda queue+CE) """ start_time = time.time() - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='update_label') + tmp_log = core_utils.make_logger(_base_logger, "harvester_id={0}".format(self.harvester_id), method_name="update_label") if not self.__active: - tmp_log.debug('APFMon reporting not enabled') + tmp_log.debug("APFMon reporting not enabled") return try: - tmp_log.debug('start') + tmp_log.debug("start") data = self.massage_label_data(data) # get the active queues from the config mapper @@ -217,58 +206,57 @@ def update_label(self, site, msg, data): site_info = panda_queues_dict.get(site, dict()) if not site_info: - tmp_log.warning('No site info for {0}'.format(site)) + tmp_log.warning("No site info for {0}".format(site)) return # when no CEs associated to a queue, e.g. P1, HPCs, etc. Try to see if there is something # in local configuration, otherwise set it to a dummy value try: - ce = self.queue_config_mapper.queueConfig[site].submitter['ceEndpoint'] - queues = [{'ce_endpoint': ce}] + ce = self.queue_config_mapper.queueConfig[site].submitter["ceEndpoint"] + queues = [{"ce_endpoint": ce}] except KeyError: - if site_info['queues']: - queues = site_info['queues'] + if site_info["queues"]: + queues = site_info["queues"] else: - queues = [{'ce_endpoint': NO_CE}] + queues = [{"ce_endpoint": NO_CE}] for queue in queues: try: try: - ce = clean_ce(queue['ce_endpoint']) - except: - ce = '' + ce = clean_ce(queue["ce_endpoint"]) + except BaseException: + ce = "" - label_data = {'status': msg, 'data': data} - label = '{0}-{1}'.format(site, ce) - label_id = '{0}:{1}'.format(self.harvester_id, label) - url = '{0}/labels/{1}'.format(self.base_url, label_id) + label_data = {"status": msg, "data": data} + label = "{0}-{1}".format(site, ce) + label_id = "{0}:{1}".format(self.harvester_id, label) + url = "{0}/labels/{1}".format(self.base_url, label_id) r = requests.post(url, data=json.dumps(label_data), timeout=self.__label_timeout) - tmp_log.debug('label update for {0} ended with {1} {2}'.format(label, r.status_code, r.text)) - except: - tmp_log.error('Excepted for site {0} with: {1}'.format(label, traceback.format_exc())) + tmp_log.debug("label update for {0} ended with {1} {2}".format(label, r.status_code, r.text)) + except BaseException: + tmp_log.error("Excepted for site {0} with: {1}".format(label, traceback.format_exc())) end_time = time.time() - tmp_log.debug('done (took {0})'.format(end_time - start_time)) - except: - tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) + tmp_log.debug("done (took {0})".format(end_time - start_time)) + except BaseException: + tmp_log.error("Excepted with: {0}".format(traceback.format_exc())) def create_workers(self, 
worker_spec_list): """ Creates a worker """ start_time = time.time() - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='create_workers') + tmp_log = core_utils.make_logger(_base_logger, "harvester_id={0}".format(self.harvester_id), method_name="create_workers") if not self.__active: - tmp_log.debug('APFMon reporting not enabled') + tmp_log.debug("APFMon reporting not enabled") return try: - tmp_log.debug('start') + tmp_log.debug("start") - url = '{0}/jobs'.format(self.base_url) + url = "{0}/jobs".format(self.base_url) for worker_spec_shard in core_utils.create_shards(worker_spec_list, 20): apfmon_workers = [] @@ -276,59 +264,58 @@ def create_workers(self, worker_spec_list): batch_id = worker_spec.batchID worker_id = worker_spec.workerID if not batch_id: - tmp_log.debug('no batchID found for workerID {0}... skipping'.format(worker_id)) + tmp_log.debug("no batchID found for workerID {0}... skipping".format(worker_id)) continue factory = self.harvester_id computingsite = worker_spec.computingSite try: ce = clean_ce(worker_spec.computingElement) except AttributeError: - tmp_log.debug('no CE found for workerID {0} batchID {1}'.format(worker_id, batch_id)) + tmp_log.debug("no CE found for workerID {0} batchID {1}".format(worker_id, batch_id)) ce = NO_CE # extract the log URLs - stdout_url = '' - stderr_url = '' - log_url = '' - jdl_url = '' + stdout_url = "" + stderr_url = "" + log_url = "" + jdl_url = "" work_attribs = worker_spec.workAttributes if work_attribs: - if 'stdOut' in work_attribs: - stdout_url = work_attribs['stdOut'] + if "stdOut" in work_attribs: + stdout_url = work_attribs["stdOut"] # jdl_url = '{0}.jdl'.format(stdout_url[:-4]) - if 'stdErr' in work_attribs: - stderr_url = work_attribs['stdErr'] - if 'batchLog' in work_attribs: - log_url = work_attribs['batchLog'] - if 'jdl' in work_attribs: - jdl_url = work_attribs['jdl'] - - apfmon_worker = {'cid': batch_id, - 'factory': factory, - 'label': '{0}-{1}'.format(computingsite, ce), - 'jdlurl': jdl_url, - 'stdouturl': stdout_url, - 'stderrurl': stderr_url, - 'logurl': log_url - } - tmp_log.debug('packed worker: {0}'.format(apfmon_worker)) + if "stdErr" in work_attribs: + stderr_url = work_attribs["stdErr"] + if "batchLog" in work_attribs: + log_url = work_attribs["batchLog"] + if "jdl" in work_attribs: + jdl_url = work_attribs["jdl"] + + apfmon_worker = { + "cid": batch_id, + "factory": factory, + "label": "{0}-{1}".format(computingsite, ce), + "jdlurl": jdl_url, + "stdouturl": stdout_url, + "stderrurl": stderr_url, + "logurl": log_url, + } + tmp_log.debug("packed worker: {0}".format(apfmon_worker)) apfmon_workers.append(apfmon_worker) payload = json.dumps(apfmon_workers) try: r = requests.put(url, data=payload, timeout=self.__worker_timeout) - tmp_log.debug('worker creation for {0} ended with {1} {2}'.format(apfmon_workers, r.status_code, r.text)) - except: - tmp_log.debug( - 'worker creation for {0} failed with'.format(apfmon_workers, format(traceback.format_exc()))) - + tmp_log.debug("worker creation for {0} ended with {1} {2}".format(apfmon_workers, r.status_code, r.text)) + except BaseException: + tmp_log.debug("worker creation for {0} failed with".format(apfmon_workers, format(traceback.format_exc()))) end_time = time.time() - tmp_log.debug('done (took {0})'.format(end_time - start_time)) - except: - tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) + tmp_log.debug("done (took {0})".format(end_time - start_time)) + except BaseException: + 
tmp_log.error("Excepted with: {0}".format(traceback.format_exc())) def convert_status(self, harvester_status): """ @@ -336,14 +323,14 @@ def convert_status(self, harvester_status): :param harvester_status :return: list with apfmon_status. Usually it's just one status, except for exiting&done """ - if harvester_status == 'submitted': - return 'created' - if harvester_status in ['running', 'idle']: - return 'running' - if harvester_status in ['missed', 'failed', 'cancelled']: - return 'fault' - if harvester_status == 'finished': - return 'done' + if harvester_status == "submitted": + return "created" + if harvester_status in ["running", "idle"]: + return "running" + if harvester_status in ["missed", "failed", "cancelled"]: + return "fault" + if harvester_status == "finished": + return "done" def update_worker(self, worker_spec, worker_status): """ @@ -351,45 +338,45 @@ def update_worker(self, worker_spec, worker_status): connectivity on the worker node """ start_time = time.time() - tmp_log = core_utils.make_logger(_base_logger, 'harvester_id={0}'.format(self.harvester_id), - method_name='update_worker') + tmp_log = core_utils.make_logger(_base_logger, "harvester_id={0}".format(self.harvester_id), method_name="update_worker") if not self.__active: - tmp_log.debug('APFMon reporting not enabled') + tmp_log.debug("APFMon reporting not enabled") return try: - tmp_log.debug('start') + tmp_log.debug("start") batch_id = worker_spec.batchID factory = self.harvester_id - url = '{0}/jobs/{1}:{2}'.format(self.base_url, factory, batch_id) + url = "{0}/jobs/{1}:{2}".format(self.base_url, factory, batch_id) apfmon_status = self.convert_status(worker_status) apfmon_worker = {} - apfmon_worker['state'] = apfmon_status + apfmon_worker["state"] = apfmon_status # For final states include panda id's if available (push mode only) - if apfmon_status in ('fault', 'done') and hasattr(worker_spec, 'pandaid_list') and worker_spec.pandaid_list: - apfmon_worker['ids'] = ','.join(str(x) for x in worker_spec.pandaid_list) + if apfmon_status in ("fault", "done") and hasattr(worker_spec, "pandaid_list") and worker_spec.pandaid_list: + apfmon_worker["ids"] = ",".join(str(x) for x in worker_spec.pandaid_list) - tmp_log.debug('updating worker {0}: {1}'.format(batch_id, apfmon_worker)) + tmp_log.debug("updating worker {0}: {1}".format(batch_id, apfmon_worker)) r = requests.post(url, data=apfmon_worker, timeout=self.__worker_update_timeout) - tmp_log.debug('worker update for {0} ended with {1} {2}'.format(batch_id, r.status_code, r.text)) + tmp_log.debug("worker update for {0} ended with {1} {2}".format(batch_id, r.status_code, r.text)) end_time = time.time() - tmp_log.debug('done (took {0})'.format(end_time - start_time)) - except: - tmp_log.error('Excepted with: {0}'.format(traceback.format_exc())) + tmp_log.debug("done (took {0})".format(end_time - start_time)) + except BaseException: + tmp_log.error("Excepted with: {0}".format(traceback.format_exc())) -if __name__== "__main__": +if __name__ == "__main__": """ Quick tests """ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper + queue_config_mapper = QueueConfigMapper() apfmon = Apfmon(queue_config_mapper) @@ -398,23 +385,31 @@ def update_worker(self, worker_spec, worker_status): worker_a = WorkSpec() worker_a.batchID = 1 - worker_a.computingSite = 'CERN-PROD-DEV_UCORE' - worker_a.computingElement = 'bla1' - worker_a.workAttributes = {"batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", "stdErr": 
"https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", "stdOut": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.out"} + worker_a.computingSite = "CERN-PROD-DEV_UCORE" + worker_a.computingElement = "bla1" + worker_a.workAttributes = { + "batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", + "stdErr": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", + "stdOut": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.out", + } worker_a.pandaid_list = [1234, 5678] worker_b = WorkSpec() worker_b.batchID = 2 - worker_b.computingSite = 'CERN-PROD-DEV_UCORE' - worker_b.computingElement = 'bla2' - worker_b.workAttributes = {"batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", "stdErr": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", "stdOut": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.out"} + worker_b.computingSite = "CERN-PROD-DEV_UCORE" + worker_b.computingElement = "bla2" + worker_b.workAttributes = { + "batchLog": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.log", + "stdErr": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.err", + "stdOut": "https://aipanda024.cern.ch/condor_logs/18-07-19_09/grid.9659.0.out", + } workers = [worker_a, worker_b] apfmon.create_workers(workers) - worker_a.status = 'running' - worker_b.status = 'running' + worker_a.status = "running" + worker_b.status = "running" apfmon.update_workers(workers) - worker_a.status = 'finished' - worker_b.status = 'failed' + worker_a.status = "finished" + worker_b.status = "failed" apfmon.update_workers(workers) diff --git a/pandaharvester/harvestermisc/arc_parser.py b/pandaharvester/harvestermisc/arc_parser.py index 08d7da21..30287483 100644 --- a/pandaharvester/harvestermisc/arc_parser.py +++ b/pandaharvester/harvestermisc/arc_parser.py @@ -5,7 +5,7 @@ class ARCParser: - '''Converts panda job description to ARC job description using AGIS info''' + """Converts panda job description to ARC job description using AGIS info""" def __init__(self, jobdesc, pandaqueue, siteinfo, logurl, schedulerid, osmap, tmpdir, eventranges, log): self.log = log @@ -13,27 +13,27 @@ def __init__(self, jobdesc, pandaqueue, siteinfo, logurl, schedulerid, osmap, tm self.pandajob = urllib.urlencode(jobdesc) # The json of the job description self.jobdesc = jobdesc - self.pandaid = self.jobdesc['PandaID'] + self.pandaid = self.jobdesc["PandaID"] self.xrsl = {} self.siteinfo = siteinfo - self.ncores = siteinfo['corecount'] + self.ncores = siteinfo["corecount"] self.logurl = logurl self.schedulerid = schedulerid self.defaults = {} - self.defaults['memory'] = 2000 - self.defaults['cputime'] = 2*1440*60 - self.sitename = siteinfo['panda_resource'] + self.defaults["memory"] = 2000 + self.defaults["cputime"] = 2 * 1440 * 60 + self.sitename = siteinfo["panda_resource"] self.schedconfig = pandaqueue - self.truepilot = 'mv' not in siteinfo['copytools'] or len(siteinfo['copytools']) > 1 + self.truepilot = "mv" not in siteinfo["copytools"] or len(siteinfo["copytools"]) > 1 self.osmap = osmap # Set to 1 week if not specified - self.maxwalltime = siteinfo['maxtime']/60 + self.maxwalltime = siteinfo["maxtime"] / 60 if self.maxwalltime == 0: - self.maxwalltime = 7*24*60 + self.maxwalltime = 7 * 24 * 60 self.tmpdir = tmpdir - self.inputfiledir = os.path.join(self.tmpdir, 'inputfiles') + self.inputfiledir = os.path.join(self.tmpdir, "inputfiles") self.inputjobdir = 
os.path.join(self.inputfiledir, str(self.pandaid)) self.eventranges = eventranges self.longjob = False @@ -41,115 +41,109 @@ def __init__(self, jobdesc, pandaqueue, siteinfo, logurl, schedulerid, osmap, tm if len(self.pandajob) > 50000: self.longjob = True self.artes = None - self.cvmfs = siteinfo['is_cvmfs'] + self.cvmfs = siteinfo["is_cvmfs"] # ES merge jobs need unique guids because pilot uses them as dict keys - if not self.truepilot and self.jobdesc.get('eventServiceMerge') == 'True': - if self.pandajob.startswith('GUID'): - esjobdesc = self.pandajob[self.pandajob.find('&'):] + if not self.truepilot and self.jobdesc.get("eventServiceMerge") == "True": + if self.pandajob.startswith("GUID"): + esjobdesc = self.pandajob[self.pandajob.find("&") :] else: - esjobdesc = self.pandajob[:self.pandajob.find('&GUID')] + self.pandajob[self.pandajob.find('&', self.pandajob.find('&GUID')+5):] - esjobdesc += '&GUID=%s' % '%2C'.join(['DUMMYGUID%i' % i for i in range(len(self.jobdesc['GUID'].split(',')))]) + esjobdesc = self.pandajob[: self.pandajob.find("&GUID")] + self.pandajob[self.pandajob.find("&", self.pandajob.find("&GUID") + 5) :] + esjobdesc += "&GUID=%s" % "%2C".join(["DUMMYGUID%i" % i for i in range(len(self.jobdesc["GUID"].split(",")))]) self.pandajob = esjobdesc - def getNCores(self): - # Unified panda queues: always use coreCount from job description try: - self.ncores = int(self.jobdesc.get('coreCount', 1)) - except: # corecount is NULL + self.ncores = int(self.jobdesc.get("coreCount", 1)) + except BaseException: # corecount is NULL self.ncores = 1 - self.xrsl['count'] = '(count=%d)' % self.ncores + self.xrsl["count"] = "(count=%d)" % self.ncores # force single-node jobs for now if self.ncores > 1: - self.xrsl['countpernode'] = '(runtimeenvironment = APPS/HEP/ATLAS-MULTICORE-1.0)' - if self.sitename.find('RAL-LCG2') < 0 and self.sitename.find('TOKYO') < 0 and self.sitename.find('FZK') < 0: - self.xrsl['countpernode'] = '(runtimeenvironment = APPS/HEP/ATLAS-MULTICORE-1.0)' + self.xrsl["countpernode"] = "(runtimeenvironment = APPS/HEP/ATLAS-MULTICORE-1.0)" + if self.sitename.find("RAL-LCG2") < 0 and self.sitename.find("TOKYO") < 0 and self.sitename.find("FZK") < 0: + self.xrsl["countpernode"] = "(runtimeenvironment = APPS/HEP/ATLAS-MULTICORE-1.0)" return self.ncores def setJobname(self): - - if 'jobName' in self.jobdesc: - jobname = self.jobdesc['jobName'] + if "jobName" in self.jobdesc: + jobname = self.jobdesc["jobName"] else: jobname = "pandajob" - self.xrsl['jobname'] = '(jobname = "%s")' % jobname + self.xrsl["jobname"] = '(jobname = "%s")' % jobname def setDisk(self): - - if 'maxDiskCount' in self.jobdesc: - disk = int(self.jobdesc['maxDiskCount']) + if "maxDiskCount" in self.jobdesc: + disk = int(self.jobdesc["maxDiskCount"]) else: disk = 500 # Add input file sizes - if 'fsize' in self.jobdesc: - disk += sum([int(f) for f in self.jobdesc['fsize'].split(',')]) / 1000000 + if "fsize" in self.jobdesc: + disk += sum([int(f) for f in self.jobdesc["fsize"].split(",")]) / 1000000 # Add safety factor disk += 2000 - self.log.debug('%s: disk space %d' % (self.pandaid, disk)) - self.xrsl['disk'] = "(disk = %d)" % disk + self.log.debug("%s: disk space %d" % (self.pandaid, disk)) + self.xrsl["disk"] = "(disk = %d)" % disk def setTime(self): - - if 'maxCpuCount' in self.jobdesc: - cpucount = int(self.jobdesc['maxCpuCount']) + if "maxCpuCount" in self.jobdesc: + cpucount = int(self.jobdesc["maxCpuCount"]) # hack for group production!!! 
if cpucount == 600: - cpucount = 24*3600 + cpucount = 24 * 3600 cpucount = int(2 * cpucount) - self.log.info('%s: job maxCpuCount %s' % (self.pandaid, cpucount)) + self.log.info("%s: job maxCpuCount %s" % (self.pandaid, cpucount)) else: - cpucount = 2*24*3600 - self.log.info('%s: Using default maxCpuCount %s' % (self.pandaid, cpucount)) + cpucount = 2 * 24 * 3600 + self.log.info("%s: Using default maxCpuCount %s" % (self.pandaid, cpucount)) if cpucount == 0: - cpucount = 2*24*3600 + cpucount = 2 * 24 * 3600 # shorten installation jobs try: - if self.jobdesc['prodSourceLabel'] == 'install': - cpucount = 12*3600 - except: + if self.jobdesc["prodSourceLabel"] == "install": + cpucount = 12 * 3600 + except BaseException: pass if int(cpucount) <= 0: - cpucount = self.defaults['cputime'] + cpucount = self.defaults["cputime"] walltime = int(cpucount / 60) # JEDI analysis hack walltime = max(60, walltime) walltime = min(self.maxwalltime, walltime) - if self.sitename.startswith('BOINC'): + if self.sitename.startswith("BOINC"): walltime = min(240, walltime) cputime = self.getNCores() * walltime - self.log.info('%s: walltime: %d, cputime: %d' % (self.pandaid, walltime, cputime)) + self.log.info("%s: walltime: %d, cputime: %d" % (self.pandaid, walltime, cputime)) - self.xrsl['time'] = '(walltime=%d)(cputime=%d)' % (walltime, cputime) + self.xrsl["time"] = "(walltime=%d)(cputime=%d)" % (walltime, cputime) def setMemory(self): - - if 'minRamCount' in self.jobdesc: - memory = int(self.jobdesc['minRamCount']) - elif not self.sitename.startswith('ANALY'): + if "minRamCount" in self.jobdesc: + memory = int(self.jobdesc["minRamCount"]) + elif not self.sitename.startswith("ANALY"): memory = 4000 else: memory = 2000 if memory <= 0: - memory = self.defaults['memory'] + memory = self.defaults["memory"] # fix until maxrrs in pandajob is better known if memory <= 500: memory = 500 - if 'BOINC' in self.sitename: + if "BOINC" in self.sitename: memory = 2400 if self.getNCores() > 1: @@ -160,108 +154,109 @@ def setMemory(self): memory = memory / self.getNCores() # fix memory to 500MB units - memory = int(memory-1)/500*500 + 500 + memory = int(memory - 1) / 500 * 500 + 500 - self.xrsl['memory'] = '(memory = %d)' % (memory) + self.xrsl["memory"] = "(memory = %d)" % (memory) def setRTE(self): - - self.artes = '' + self.artes = "" if self.truepilot: - self.xrsl['rtes'] = "(runtimeenvironment = ENV/PROXY)(runtimeenvironment = APPS/HEP/ATLAS-SITE-LCG)" + self.xrsl["rtes"] = "(runtimeenvironment = ENV/PROXY)(runtimeenvironment = APPS/HEP/ATLAS-SITE-LCG)" return - if self.siteinfo['type'] == 'analysis': + if self.siteinfo["type"] == "analysis": # Require proxy for analysis - self.xrsl['rtes'] = "(runtimeenvironment = ENV/PROXY)(runtimeenvironment = APPS/HEP/ATLAS-SITE)" + self.xrsl["rtes"] = "(runtimeenvironment = ENV/PROXY)(runtimeenvironment = APPS/HEP/ATLAS-SITE)" return if self.cvmfs: # Normal sites with cvmfs - self.xrsl['rtes'] = "(runtimeenvironment = APPS/HEP/ATLAS-SITE)" + self.xrsl["rtes"] = "(runtimeenvironment = APPS/HEP/ATLAS-SITE)" return # Old-style RTEs for special sites with no cvmfs atlasrtes = [] - for (package, cache) in zip(self.jobdesc['swRelease'].split('\n'), self.jobdesc['homepackage'].split('\n')): - if cache.find('Production') > 1 and cache.find('AnalysisTransforms') < 0: - rte = package.split('-')[0].upper() + '-' + cache.split('/')[1] - elif cache.find('AnalysisTransforms') != -1: + for package, cache in zip(self.jobdesc["swRelease"].split("\n"), self.jobdesc["homepackage"].split("\n")): + if 
cache.find("Production") > 1 and cache.find("AnalysisTransforms") < 0: + rte = package.split("-")[0].upper() + "-" + cache.split("/")[1] + elif cache.find("AnalysisTransforms") != -1: rte = package.upper() - res = re.match('AnalysisTransforms-(.+)_(.+)', cache) + res = re.match("AnalysisTransforms-(.+)_(.+)", cache) if res is not None: - if res.group(1).find('AtlasProduction') != -1: + if res.group(1).find("AtlasProduction") != -1: rte = "ATLAS-" + res.group(2) else: rte = "ATLAS-" + res.group(1).upper() + "-" + res.group(2) else: - rte = cache.replace('Atlas', 'Atlas-').replace('/', '-').upper() + rte = cache.replace("Atlas", "Atlas-").replace("/", "-").upper() rte = str(rte) - rte = rte.replace('ATLAS-', '') - rte += "-"+self.jobdesc['cmtConfig'].upper() - - if cache.find('AnalysisTransforms') < 0: - rte = rte.replace('PHYSICS-', 'ATLASPHYSICS-') - rte = rte.replace('PROD2-', 'ATLASPROD2-') - rte = rte.replace('PROD1-', 'ATLASPROD1-') - rte = rte.replace('DERIVATION-', 'ATLASDERIVATION-') - rte = rte.replace('P1HLT-', 'ATLASP1HLT-') - rte = rte.replace('TESTHLT-', 'ATLASTESTHLT-') - rte = rte.replace('CAFHLT-', 'ATLASCAFHLT-') - rte = rte.replace('21.0.13.1', 'ATLASPRODUCTION-21.0.13.1') - rte = rte.replace('21.0.20.1', 'ATLASPRODUCTION-21.0.20.1') - if cache.find('AnalysisTransforms') != -1: - res=re.match('(21\..+)', rte) + rte = rte.replace("ATLAS-", "") + rte += "-" + self.jobdesc["cmtConfig"].upper() + + if cache.find("AnalysisTransforms") < 0: + rte = rte.replace("PHYSICS-", "ATLASPHYSICS-") + rte = rte.replace("PROD2-", "ATLASPROD2-") + rte = rte.replace("PROD1-", "ATLASPROD1-") + rte = rte.replace("DERIVATION-", "ATLASDERIVATION-") + rte = rte.replace("P1HLT-", "ATLASP1HLT-") + rte = rte.replace("TESTHLT-", "ATLASTESTHLT-") + rte = rte.replace("CAFHLT-", "ATLASCAFHLT-") + rte = rte.replace("21.0.13.1", "ATLASPRODUCTION-21.0.13.1") + rte = rte.replace("21.0.20.1", "ATLASPRODUCTION-21.0.20.1") + if cache.find("AnalysisTransforms") != -1: + res = re.match("(21\..+)", rte) if res is not None: - rte = rte.replace('21', 'OFFLINE-21') + rte = rte.replace("21", "OFFLINE-21") - if rte.find('NULL') != -1: - rte = 'PYTHON-CVMFS-X86_64-SLC6-GCC47-OPT' + if rte.find("NULL") != -1: + rte = "PYTHON-CVMFS-X86_64-SLC6-GCC47-OPT" atlasrtes.append(rte) - self.xrsl['rtes'] = "" + self.xrsl["rtes"] = "" for rte in atlasrtes[-1:]: - self.xrsl['rtes'] += "(runtimeenvironment = APPS/HEP/ATLAS-" + rte + ")" + self.xrsl["rtes"] += "(runtimeenvironment = APPS/HEP/ATLAS-" + rte + ")" - if self.siteinfo['type'] == 'analysis': - self.xrsl['rtes'] += "(runtimeenvironment = ENV/PROXY)" + if self.siteinfo["type"] == "analysis": + self.xrsl["rtes"] += "(runtimeenvironment = ENV/PROXY)" self.artes = ",".join(atlasrtes) def setExecutable(self): - - self.xrsl['executable'] = "(executable = ARCpilot)" + self.xrsl["executable"] = "(executable = ARCpilot)" def setArguments(self): - if self.artes is None: self.setRTE() # Set options for NG/true pilot if self.truepilot: - pargs = '"pilot3/pilot.py" "-h" "%s" "-s" "%s" "-f" "false" "-p" "25443" "-d" "{HOME}" "-w" "https://pandaserver.cern.ch"' % (self.schedconfig, self.sitename) + pargs = '"pilot3/pilot.py" "-h" "%s" "-s" "%s" "-f" "false" "-p" "25443" "-d" "{HOME}" "-w" "https://pandaserver.cern.ch"' % ( + self.schedconfig, + self.sitename, + ) else: - pargs = '"pilot3/pilot.py" "-h" "%s" "-s" "%s" "-F" "Nordugrid-ATLAS" "-d" "{HOME}" "-j" "false" "-f" "false" "-z" "true" "-b" "2" "-t" "false"' % (self.sitename, self.sitename) + pargs = '"pilot3/pilot.py" "-h" "%s" "-s" 
"%s" "-F" "Nordugrid-ATLAS" "-d" "{HOME}" "-j" "false" "-f" "false" "-z" "true" "-b" "2" "-t" "false"' % ( + self.sitename, + self.sitename, + ) pandajobarg = self.pandajob if self.longjob: pandajobarg = "FILE" - self.xrsl['arguments'] = '(arguments = "' + self.artes + '" "' + pandajobarg + '" ' + pargs + ')' + self.xrsl["arguments"] = '(arguments = "' + self.artes + '" "' + pandajobarg + '" ' + pargs + ")" def setInputsES(self, inf): - - for f, s, i in zip(self.jobdesc['inFiles'].split(","), self.jobdesc['scopeIn'].split(","), self.jobdesc['prodDBlockToken'].split(",")): - if i == 'None': + for f, s, i in zip(self.jobdesc["inFiles"].split(","), self.jobdesc["scopeIn"].split(","), self.jobdesc["prodDBlockToken"].split(",")): + if i == "None": # Rucio file - lfn = '/'.join(["rucio://rucio-lb-prod.cern.ch;rucioaccount=pilot;transferprotocol=gsiftp;cache=invariant/replicas", s, f]) + lfn = "/".join(["rucio://rucio-lb-prod.cern.ch;rucioaccount=pilot;transferprotocol=gsiftp;cache=invariant/replicas", s, f]) elif int(i) in self.osmap: - lfn = '/'.join([self.osmap[int(i)], f]) + lfn = "/".join([self.osmap[int(i)], f]) else: # TODO this exception is ignored by panda2arc raise Exception("No OS defined in AGIS for bucket id %s" % i) inf[f] = lfn def setInputs(self): - x = "" if self.truepilot: x += '(ARCpilot "http://aipanda404.cern.ch;cache=check/data/releases/ARCpilot-true")' @@ -270,7 +265,7 @@ def setInputs(self): else: x += '(ARCpilot "http://aipanda404.cern.ch;cache=check/data/releases/ARCpilot")' - if self.jobdesc['prodSourceLabel'] == 'rc_test': + if self.jobdesc["prodSourceLabel"] == "rc_test": x += '(pilotcode.tar.gz "http://pandaserver.cern.ch:25080;cache=check/cache/pilot/pilotcode-rc.tar.gz")' else: x += '(pilotcode.tar.gz "http://pandaserver.cern.ch:25080;cache=check/cache/pilot/pilotcode-PICARD.tar.gz")' @@ -281,39 +276,37 @@ def setInputs(self): if self.longjob: try: os.makedirs(self.inputjobdir) - except: + except BaseException: pass - tmpfile = self.inputjobdir+"/pandaJobData.out" + tmpfile = self.inputjobdir + "/pandaJobData.out" with open(tmpfile, "w") as f: f.write(self.pandajob) x += '(pandaJobData.out "%s/pandaJobData.out")' % self.inputjobdir if not self.truepilot: x += '(queuedata.pilot.json "http://pandaserver.cern.ch:25085;cache=check/cache/schedconfig/%s.all.json")' % self.schedconfig - if self.sitename.find('BEIJING') != -1: + if self.sitename.find("BEIJING") != -1: x += '(agis_ddmendpoints.json "/cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json")' - if 'inFiles' in self.jobdesc and not self.truepilot: + if "inFiles" in self.jobdesc and not self.truepilot: inf = {} - if self.jobdesc.get('eventServiceMerge') == 'True': + if self.jobdesc.get("eventServiceMerge") == "True": self.setInputsES(inf) - for filename, scope in zip(self.jobdesc['inFiles'].split(","), - self.jobdesc['scopeIn'].split(",")): - + for filename, scope in zip(self.jobdesc["inFiles"].split(","), self.jobdesc["scopeIn"].split(",")): # Skip files which use direct I/O: site has it enabled, token is # not 'local', file is root file and --useLocalIO is not used # don't use direct I/O - pending new mover switch - #if token != 'local' and self.siteinfo.get('direct_access_lan', False) and \ + # if token != 'local' and self.siteinfo.get('direct_access_lan', False) and \ # not ('.tar.gz' in filename or '.lib.tgz' in filename or '.raw.' 
in filename) and \ # '--useLocalIO' not in self.jobdesc['jobPars'][0]: # continue # Hard-coded pilot rucio account - should change based on proxy # Rucio does not expose mtime, set cache=invariant so not to download too much - urloptions = ';rucioaccount=pilot;transferprotocol=gsiftp;cache=invariant' - ruciourl = 'rucio://rucio-lb-prod.cern.ch%s/replicas' % urloptions - lfn = '/'.join([ruciourl, scope, filename]) + urloptions = ";rucioaccount=pilot;transferprotocol=gsiftp;cache=invariant" + ruciourl = "rucio://rucio-lb-prod.cern.ch%s/replicas" % urloptions + lfn = "/".join([ruciourl, scope, filename]) inf[filename] = lfn @@ -321,28 +314,25 @@ def setInputs(self): for k, v in inf.items(): x += '(%s "%s")' % (k, v) - if self.jobdesc.get('eventService') and self.eventranges: + if self.jobdesc.get("eventService") and self.eventranges: # Create tmp json file to upload with job - tmpjsonfile = os.path.join(self.tmpdir, 'eventranges-%d.json' % self.pandaid) + tmpjsonfile = os.path.join(self.tmpdir, "eventranges-%d.json" % self.pandaid) jsondata = json.loads(self.eventranges) - with open(tmpjsonfile, 'w') as f: + with open(tmpjsonfile, "w") as f: json.dump(jsondata, f) - x += '("eventranges.json" "%s")' % tmpjsonfile + x += '("eventranges.json" "%s")' % tmpjsonfile - self.xrsl['inputfiles'] = "(inputfiles = %s )" % x + self.xrsl["inputfiles"] = "(inputfiles = %s )" % x def setLog(self): - - logfile = self.jobdesc.get('logFile', 'LOGFILE') - self.xrsl['log'] = '(stdout = "%s")(join = yes)' % logfile.replace('.tgz', '') + logfile = self.jobdesc.get("logFile", "LOGFILE") + self.xrsl["log"] = '(stdout = "%s")(join = yes)' % logfile.replace(".tgz", "") def setGMLog(self): - - self.xrsl['gmlog'] = '(gmlog = "gmlog")' - self.xrsl['rerun'] = '(rerun = "2")' + self.xrsl["gmlog"] = '(gmlog = "gmlog")' + self.xrsl["rerun"] = '(rerun = "2")' def setOutputs(self): - # dynamic outputs output = '("jobSmallFiles.tgz" "")' @@ -350,53 +340,51 @@ def setOutputs(self): # needed for SCEAPI # generated output file list" output += '("output.list" "")' - self.xrsl['outputs'] = "(outputfiles = %s )" % output + self.xrsl["outputs"] = "(outputfiles = %s )" % output if self.truepilot: - self.xrsl['outputs'] = "" + self.xrsl["outputs"] = "" def setPriority(self): - - if 'currentPriority' in self.jobdesc: + if "currentPriority" in self.jobdesc: prio = 50 try: - prio = int(self.jobdesc['currentPriority']) + prio = int(self.jobdesc["currentPriority"]) if prio < 1: prio = 1 if prio > 0 and prio < 1001: - prio = prio * 90 / 1000. + prio = prio * 90 / 1000.0 prio = int(prio) if prio > 1000 and prio < 10001: - prio = 90 + (prio - 1000) / 900. 
+ prio = 90 + (prio - 1000) / 900.0 prio = int(prio) if prio > 10000: prio = 100 - except: + except BaseException: pass - self.xrsl['priority'] = '(priority = %d )' % prio - if 'wuppertalprod' in self.sitename: - self.xrsl['priority'] = "" + self.xrsl["priority"] = "(priority = %d )" % prio + if "wuppertalprod" in self.sitename: + self.xrsl["priority"] = "" def setEnvironment(self): environment = {} - environment['PANDA_JSID'] = self.schedulerid + environment["PANDA_JSID"] = self.schedulerid if self.logurl: - environment['GTAG'] = self.logurl + environment["GTAG"] = self.logurl # Vars for APFMon if self.truepilot and self.monitorurl: - environment['APFCID'] = self.pandaid - environment['APFFID'] = schedid - environment['APFMON'] = self.monitorurl - environment['FACTORYQUEUE'] = self.sitename - - self.xrsl['environment'] = '(environment = %s)' % ''.join(['("%s" "%s")' % (k,v) for (k,v) in environment.items()]) + environment["APFCID"] = self.pandaid + environment["APFFID"] = schedid + environment["APFMON"] = self.monitorurl + environment["FACTORYQUEUE"] = self.sitename + self.xrsl["environment"] = "(environment = %s)" % "".join(['("%s" "%s")' % (k, v) for (k, v) in environment.items()]) def parse(self): self.setTime() self.setJobname() - #self.setDisk() + # self.setDisk() self.setMemory() self.setRTE() self.setExecutable() @@ -410,5 +398,5 @@ def parse(self): def getXrsl(self): x = "&" - x += '\n'.join([val for val in self.xrsl.values()]) + x += "\n".join([val for val in self.xrsl.values()]) return x diff --git a/pandaharvester/harvestermisc/cloud_openstack_utils.py b/pandaharvester/harvestermisc/cloud_openstack_utils.py index 1f9ed456..f1174670 100644 --- a/pandaharvester/harvestermisc/cloud_openstack_utils.py +++ b/pandaharvester/harvestermisc/cloud_openstack_utils.py @@ -3,26 +3,27 @@ from keystoneauth1 import loading as loading from keystoneauth1 import session as session from novaclient import client as nova_cl -#from cinderclient import client as cinder_cl + +# from cinderclient import client as cinder_cl class OS_SimpleClient(object): def __init__(self, auth_config_json_file=None): - with open(auth_config_json_file, 'r') as _f: + with open(auth_config_json_file, "r") as _f: auth_config_dict = json.load(_f) # Openstack API version - version = '2.0' #FIXME - if version == '2.0': - loader = loading.get_plugin_loader('v2password') - elif version >= '3.0': - loader = loading.get_plugin_loader('password') + version = "2.0" # FIXME + if version == "2.0": + loader = loading.get_plugin_loader("v2password") + elif version >= "3.0": + loader = loading.get_plugin_loader("password") auth = loader.load_from_options(**auth_config_dict) - #sess = keystoneauth1.session.Session(auth=auth) + # sess = keystoneauth1.session.Session(auth=auth) sess = session.Session(auth=auth) - #self.nova = novaclient.client.Client(version, session=sess) + # self.nova = novaclient.client.Client(version, session=sess) self.nova = nova_cl.Client(version, session=sess) - #self.cinder = cinderclient.client.Client(version, session=sess) - #self.cinder = cinder_cl.client.Client(version, session=sess) + # self.cinder = cinderclient.client.Client(version, session=sess) + # self.cinder = cinder_cl.client.Client(version, session=sess) diff --git a/pandaharvester/harvestermisc/frontend_utils.py b/pandaharvester/harvestermisc/frontend_utils.py index 9b1762ec..9f243b13 100644 --- a/pandaharvester/harvestermisc/frontend_utils.py +++ b/pandaharvester/harvestermisc/frontend_utils.py @@ -5,11 +5,12 @@ from pandaharvester.harvesterconfig 
import harvester_config -class HarvesterToken(): +class HarvesterToken: """ Methods of JSON Web Token used in harvester frontend """ - algorithm = 'HS256' + + algorithm = "HS256" def __init__(self, **kwarg): # load secret from file @@ -23,11 +24,11 @@ def __init__(self, **kwarg): self.default_lifetime = 345600 # default payload spec self.default_payload_dict = { - 'sub': 'Subject', - 'exp': 0, - 'iss': harvester_config.master.harvester_id, - 'iat': 0, - } + "sub": "Subject", + "exp": 0, + "iss": harvester_config.master.harvester_id, + "iat": 0, + } def generate(self, payload=None, header=None): """ @@ -36,8 +37,8 @@ def generate(self, payload=None, header=None): """ timestamp_now = int(time.time()) payload_dict = self.default_payload_dict.copy() - payload_dict['iat'] = timestamp_now - payload_dict['exp'] = timestamp_now + self.default_lifetime + payload_dict["iat"] = timestamp_now + payload_dict["exp"] = timestamp_now + self.default_lifetime if payload: payload_dict.update(payload) token = jwt.encode(payload_dict, key=self.secret, algorithm=self.algorithm, headers=header) diff --git a/pandaharvester/harvestermisc/gitlab_utils.py b/pandaharvester/harvestermisc/gitlab_utils.py index d4d373a7..af60b154 100644 --- a/pandaharvester/harvestermisc/gitlab_utils.py +++ b/pandaharvester/harvestermisc/gitlab_utils.py @@ -1,6 +1,7 @@ import os import json -job_params_file = '__job_params__' + +job_params_file = "__job_params__" # get job params from file @@ -11,7 +12,7 @@ def get_job_params(work_spec): # store job params in file def store_job_params(work_spec, params): - with open(make_file_path(work_spec), 'w') as f: + with open(make_file_path(work_spec), "w") as f: json.dump(params, f) diff --git a/pandaharvester/harvestermisc/globus_utils.py b/pandaharvester/harvestermisc/globus_utils.py index 636e3098..2a0e581c 100644 --- a/pandaharvester/harvestermisc/globus_utils.py +++ b/pandaharvester/harvestermisc/globus_utils.py @@ -21,11 +21,13 @@ from pandalogger.LogWrapper import LogWrapper # handle exception from globus software + + def handle_globus_exception(tmp_log): if not isinstance(tmp_log, LogWrapper): - methodName = '{0} : '.format(inspect.stack()[1][3]) + methodName = "{0} : ".format(inspect.stack()[1][3]) else: - methodName = '' + methodName = "" # extract errtype and check if it a GlobusError Class errtype, errvalue = sys.exc_info()[:2] errStat = None @@ -34,24 +36,19 @@ def handle_globus_exception(tmp_log): # Error response from the REST service, check the code and message for # details. errStat = None - errMsg += "HTTP status code: {0} Error Code: {1} Error Message: {2} ".format(errvalue.http_status, - errvalue.code, - errvalue.message) + errMsg += "HTTP status code: {0} Error Code: {1} Error Message: {2} ".format(errvalue.http_status, errvalue.code, errvalue.message) elif isinstance(errvalue, TransferAPIError): err_args = list(errvalue._get_args()) - errStat = None - errMsg += " http_status: {0} code: {1} message: {2} requestID: {3} ".format(err_args[0], - err_args[1], - err_args[2], - err_args[3]) + errStat = None + errMsg += " http_status: {0} code: {1} message: {2} requestID: {3} ".format(err_args[0], err_args[1], err_args[2], err_args[3]) elif isinstance(errvalue, NetworkError): - errStat = None + errStat = None errMsg += "Network Failure. Possibly a firewall or connectivity issue " elif isinstance(errvalue, GlobusConnectionError): - errStat = None + errStat = None errMsg += "A connection error occured while making a REST request. 
" elif isinstance(errvalue, GlobusTimeoutError): - errStat = None + errStat = None errMsg += "A REST request timeout. " elif isinstance(errvalue, GlobusError): errStat = False @@ -59,74 +56,81 @@ def handle_globus_exception(tmp_log): else: # some other error errStat = False errMsg = "{0} ".format(errvalue) - #errMsg += traceback.format_exc() + # errMsg += traceback.format_exc() tmp_log.error(errMsg) - return (errStat,errMsg) + return (errStat, errMsg) + # Globus create transfer client -def create_globus_transfer_client(tmpLog,globus_client_id,globus_refresh_token): + + +def create_globus_transfer_client(tmpLog, globus_client_id, globus_refresh_token): """ create Globus Transfer Client and return the transfer client """ # get logger - tmpLog.info('Creating instance of GlobusTransferClient') - # start the Native App authentication process + tmpLog.info("Creating instance of GlobusTransferClient") + # start the Native App authentication process # use the refresh token to get authorizer # create the Globus Transfer Client tc = None ErrStat = True try: client = NativeAppAuthClient(client_id=globus_client_id) - authorizer = RefreshTokenAuthorizer(refresh_token=globus_refresh_token,auth_client=client) + authorizer = RefreshTokenAuthorizer(refresh_token=globus_refresh_token, auth_client=client) tc = TransferClient(authorizer=authorizer) - except: + except BaseException: errStat, errMsg = handle_globus_exception(tmpLog) - return ErrStat,tc + return ErrStat, tc + -def check_endpoint_activation (tmpLog,tc,endpoint_id): +def check_endpoint_activation(tmpLog, tc, endpoint_id): """ - check if endpoint is activated + check if endpoint is activated """ # test we have a Globus Transfer Client - if not tc : - errStr = 'failed to get Globus Transfer Client' + if not tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr - try: + try: endpoint = tc.get_endpoint(endpoint_id) r = tc.endpoint_autoactivate(endpoint_id, if_expires_in=3600) - tmpLog.info("Endpoint - %s - activation status code %s"%(endpoint["display_name"],str(r["code"]))) - if r['code'] == 'AutoActivationFailed': - errStr = 'Endpoint({0}) Not Active! Error! Source message: {1}'.format(endpoint_id, r['message']) + tmpLog.info("Endpoint - %s - activation status code %s" % (endpoint["display_name"], str(r["code"]))) + if r["code"] == "AutoActivationFailed": + errStr = "Endpoint({0}) Not Active! Error! 
Source message: {1}".format(endpoint_id, r["message"]) tmpLog.debug(errStr) return False, errStr - elif r['code'] == 'AutoActivated.CachedCredential': - errStr = 'Endpoint({0}) autoactivated using a cached credential.'.format(endpoint_id) + elif r["code"] == "AutoActivated.CachedCredential": + errStr = "Endpoint({0}) autoactivated using a cached credential.".format(endpoint_id) tmpLog.debug(errStr) - return True,errStr - elif r['code'] == 'AutoActivated.GlobusOnlineCredential': - errStr = 'Endpoint({0}) autoactivated using a built-in Globus '.format(endpoint_id) + return True, errStr + elif r["code"] == "AutoActivated.GlobusOnlineCredential": + errStr = "Endpoint({0}) autoactivated using a built-in Globus ".format(endpoint_id) tmpLog.debug(errStr) - return True,errStr - elif r['code'] == 'AlreadyActivated': - errStr = 'Endpoint({0}) already active until at least {1}'.format(endpoint_id,3600) + return True, errStr + elif r["code"] == "AlreadyActivated": + errStr = "Endpoint({0}) already active until at least {1}".format(endpoint_id, 3600) tmpLog.debug(errStr) - return True,errStr - except: - errStat,errMsg = handle_globus_exception(tmpLog) + return True, errStr + except BaseException: + errStat, errMsg = handle_globus_exception(tmpLog) return errStat, {} + # get transfer tasks -def get_transfer_task_by_id(tmpLog,tc,transferID=None): + + +def get_transfer_task_by_id(tmpLog, tc, transferID=None): # test we have a Globus Transfer Client - if not tc : - errStr = 'failed to get Globus Transfer Client' + if not tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr - if transferID == None: + if transferID is None: # error need to have task ID - errStr = 'failed to provide transfer task ID ' + errStr = "failed to provide transfer task ID " tmpLog.error(errStr) return False, errStr try: @@ -136,35 +140,38 @@ def get_transfer_task_by_id(tmpLog,tc,transferID=None): tasks = {} tasks[transferID] = gRes # return - tmpLog.debug('got {0} tasks'.format(len(tasks))) + tmpLog.debug("got {0} tasks".format(len(tasks))) return True, tasks - except: - errStat,errMsg = handle_globus_exception(tmpLog) + except BaseException: + errStat, errMsg = handle_globus_exception(tmpLog) return errStat, {} + # get transfer tasks -def get_transfer_tasks(tmpLog,tc,label=None): + + +def get_transfer_tasks(tmpLog, tc, label=None): # test we have a Globus Transfer Client - if not tc : - errStr = 'failed to get Globus Transfer Client' + if not tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr try: # execute - if label == None: + if label is None: params = {"filter": "type:TRANSFER/status:SUCCEEDED,INACTIVE,FAILED,SUCCEEDED"} - gRes = tc.task_list(num_results=1000,**params) + gRes = tc.task_list(num_results=1000, **params) else: params = {"filter": "type:TRANSFER/status:SUCCEEDED,INACTIVE,FAILED,SUCCEEDED/label:{0}".format(label)} gRes = tc.task_list(**params) # parse output tasks = {} for res in gRes: - reslabel = res.data['label'] + reslabel = res.data["label"] tasks[reslabel] = res.data # return - tmpLog.debug('got {0} tasks'.format(len(tasks))) + tmpLog.debug("got {0} tasks".format(len(tasks))) return True, tasks - except: - errStat,errMsg = handle_globus_exception(tmpLog) + except BaseException: + errStat, errMsg = handle_globus_exception(tmpLog) return errStat, {} diff --git a/pandaharvester/harvestermisc/htcondor_utils.py b/pandaharvester/harvestermisc/htcondor_utils.py index ed91bc75..feeb92b1 100644 --- 
a/pandaharvester/harvestermisc/htcondor_utils.py +++ b/pandaharvester/harvestermisc/htcondor_utils.py @@ -1,5 +1,4 @@ - -#=== Imports =================================================== +# === Imports =================================================== import re import time @@ -32,16 +31,16 @@ try: import htcondor except ImportError: - CONDOR_API = 'command' + CONDOR_API = "command" else: - CONDOR_API = 'python' + CONDOR_API = "python" -#=============================================================== +# =============================================================== -#=== Definitions =============================================== +# === Definitions =============================================== # logger -baseLogger = core_utils.setup_logger('htcondor_utils') +baseLogger = core_utils.setup_logger("htcondor_utils") # module level lock @@ -50,28 +49,38 @@ # List of job ads required CONDOR_JOB_ADS_LIST = [ - 'ClusterId', 'ProcId', 'JobStatus', 'LastJobStatus', - 'JobStartDate', 'EnteredCurrentStatus', 'ExitCode', - 'HoldReason', 'LastHoldReason', 'RemoveReason', - 'harvesterWorkerID', + "ClusterId", + "ProcId", + "JobStatus", + "LastJobStatus", + "JobStartDate", + "EnteredCurrentStatus", + "ExitCode", + "HoldReason", + "LastHoldReason", + "RemoveReason", + "harvesterWorkerID", ] # harvesterID harvesterID = harvester_config.master.harvester_id -#=============================================================== +# =============================================================== + +# === Functions ================================================= -#=== Functions ================================================= def synchronize(func): """ synchronize decorator """ + @functools.wraps(func) def wrapper(*args, **kwargs): with moduleLock: return func(*args, **kwargs) + return wrapper @@ -92,9 +101,9 @@ def condor_job_id_from_workspec(workspec): """ batchid_str = str(workspec.batchID) # backward compatibility if workspec.batchID does not contain ProcId - if '.' not in batchid_str: - batchid_str += '.0' - return '{0}#{1}'.format(workspec.submissionHost, batchid_str) + if "." not in batchid_str: + batchid_str += ".0" + return "{0}#{1}".format(workspec.submissionHost, batchid_str) def get_host_batchid_map(workspec_list): @@ -110,8 +119,8 @@ def get_host_batchid_map(workspec_list): continue batchid_str = str(batchid) # backward compatibility if workspec.batchID does not contain ProcId - if '.' not in batchid_str: - batchid_str += '.0' + if "." not in batchid_str: + batchid_str += ".0" try: host_batchid_map[host].append(batchid_str) except KeyError: @@ -123,7 +132,7 @@ def get_batchid_from_job(job_ads_dict): """ Get batchID string from condor job dict """ - batchid = '{0}.{1}'.format(job_ads_dict['ClusterId'], job_ads_dict['ProcId']) + batchid = "{0}.{1}".format(job_ads_dict["ClusterId"], job_ads_dict["ProcId"]) return batchid @@ -131,7 +140,7 @@ def get_job_id_tuple_from_batchid(batchid): """ Get tuple (ClusterId, ProcId) from batchID string """ - batchid_str_list = str(batchid).split('.') + batchid_str_list = str(batchid).split(".") clusterid = batchid_str_list[0] procid = batchid_str_list[1] if not procid: @@ -158,17 +167,17 @@ def condor_submit_process(mp_queue, host, jdl_map_list): Function for new process to submit condor """ # initialization - errStr = '' + errStr = "" batchIDs_list = [] # parse schedd and pool name condor_schedd, condor_pool = None, None - if host in ('LOCAL', 'None'): - tmpLog.debug('submissionHost is {0}, treated as local schedd. 
Skipped'.format(host)) + if host in ("LOCAL", "None"): + tmpLog.debug("submissionHost is {0}, treated as local schedd. Skipped".format(host)) else: try: - condor_schedd, condor_pool = host.split(',')[0:2] + condor_schedd, condor_pool = host.split(",")[0:2] except ValueError: - tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host)) + tmpLog.error("Invalid submissionHost: {0} . Skipped".format(host)) # get schedd try: if condor_pool: @@ -181,7 +190,7 @@ def condor_submit_process(mp_queue, host, jdl_map_list): scheddAd = collector.locate(htcondor.DaemonTypes.Schedd) schedd = htcondor.Schedd(scheddAd) except Exception as e: - errStr = 'create condor collector and schedd failed; {0}: {1}'.format(e.__class__.__name__, e) + errStr = "create condor collector and schedd failed; {0}: {1}".format(e.__class__.__name__, e) else: submit_obj = htcondor.Submit() try: @@ -191,28 +200,30 @@ def condor_submit_process(mp_queue, host, jdl_map_list): clusterid = submit_result.cluster() first_proc = submit_result.first_proc() num_proc = submit_result.num_procs() - batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid) - for procid in range(first_proc, first_proc + num_proc)]) + batchIDs_list.extend(["{0}.{1}".format(clusterid, procid) for procid in range(first_proc, first_proc + num_proc)]) except RuntimeError as e: - errStr = 'submission failed; {0}: {1}'.format(e.__class__.__name__, e) + errStr = "submission failed; {0}: {1}".format(e.__class__.__name__, e) mp_queue.put((batchIDs_list, errStr)) -#=============================================================== -#=== Classes =================================================== +# =============================================================== + +# === Classes =================================================== # Condor queue cache fifo + + class CondorQCacheFifo(six.with_metaclass(SingletonWithID, SpecialFIFOBase)): global_lock_id = -1 def __init__(self, target, *args, **kwargs): - name_suffix = target.split('.')[0] - name_suffix = re.sub('-', '_', name_suffix) - self.titleName = 'CondorQCache_{0}'.format(name_suffix) + name_suffix = target.split(".")[0] + name_suffix = re.sub("-", "_", name_suffix) + self.titleName = "CondorQCache_{0}".format(name_suffix) SpecialFIFOBase.__init__(self) def lock(self, score=None): - lock_key = format(int(random.random() * 2**32), 'x') + lock_key = format(int(random.random() * 2**32), "x") if score is None: score = time.time() retVal = self.putbyid(self.global_lock_id, lock_key, score) @@ -241,9 +252,10 @@ def renew_session_and_retry(cls, func): # FIXME: currently hard-coded to_retry = True # Wrapper + def wrapper(self, *args, **kwargs): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.renew_session_if_error') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorClient.renew_session_if_error") func_name = func.__name__ try: self.schedd @@ -252,79 +264,80 @@ def wrapper(self, *args, **kwargs): is_renewed = self.renew_session() self.lock.release() if not is_renewed: - errStr = 'failed to communicate with {0}'.format(self.submissionHost) + errStr = "failed to communicate with {0}".format(self.submissionHost) tmpLog.error(errStr) - tmpLog.debug('got RuntimeError: {0}'.format(e)) + tmpLog.debug("got RuntimeError: {0}".format(e)) raise Exception(errStr) try: ret = func(self, *args, **kwargs) except RuntimeError as e: - tmpLog.debug('got RuntimeError: {0}'.format(e)) + 
tmpLog.debug("got RuntimeError: {0}".format(e)) if self.lock.acquire(False): is_renewed = self.renew_session() self.lock.release() if is_renewed: if to_retry: - tmpLog.debug('condor session renewed. Retrying {0}'.format(func_name)) + tmpLog.debug("condor session renewed. Retrying {0}".format(func_name)) ret = func(self, *args, **kwargs) else: - tmpLog.debug('condor session renewed') + tmpLog.debug("condor session renewed") raise else: - tmpLog.error('failed to renew condor session') + tmpLog.error("failed to renew condor session") raise else: - tmpLog.debug('another thread is renewing condor session; skipped...') + tmpLog.debug("another thread is renewing condor session; skipped...") raise - tmpLog.debug('done') + tmpLog.debug("done") return ret + return wrapper def __init__(self, submissionHost, *args, **kwargs): self.submissionHost = submissionHost # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.__init__') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorClient.__init__") # Initialize - tmpLog.debug('Initializing client') + tmpLog.debug("Initializing client") self.lock = threading.Lock() self.condor_api = CONDOR_API self.condor_schedd = None self.condor_pool = None # Parse condor command remote options from workspec - if self.submissionHost in ('LOCAL', 'None'): - tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost)) + if self.submissionHost in ("LOCAL", "None"): + tmpLog.debug("submissionHost is {0}, treated as local schedd. Skipped".format(self.submissionHost)) else: try: - self.condor_schedd, self.condor_pool = self.submissionHost.split(',')[0:2] - if self.condor_schedd in ['None']: + self.condor_schedd, self.condor_pool = self.submissionHost.split(",")[0:2] + if self.condor_schedd in ["None"]: self.condor_schedd = None - if self.condor_pool in ['None']: + if self.condor_pool in ["None"]: self.condor_pool = None except ValueError: - tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost)) + tmpLog.error("Invalid submissionHost: {0} . Skipped".format(self.submissionHost)) # Use Python API or fall back to command - if self.condor_api == 'python': + if self.condor_api == "python": try: self.secman = htcondor.SecMan() self.renew_session(init=True) except Exception as e: - tmpLog.error('Error when using htcondor Python API. Exception {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("Error when using htcondor Python API. 
Exception {0}: {1}".format(e.__class__.__name__, e)) raise - tmpLog.debug('Initialized client') + tmpLog.debug("Initialized client") @synchronize def renew_session(self, retry=3, init=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.renew_session') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorClient.renew_session") # Clear security session if not initialization if not init: - tmpLog.info('Renew condor session') + tmpLog.info("Renew condor session") self.secman.invalidateAllSessions() # Recreate collector and schedd object i_try = 1 while i_try <= retry: try: - tmpLog.info('Try {0}'.format(i_try)) + tmpLog.info("Try {0}".format(i_try)) if self.condor_pool: self.collector = htcondor.Collector(self.condor_pool) else: @@ -334,14 +347,14 @@ def renew_session(self, retry=3, init=False): else: self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd) self.schedd = htcondor.Schedd(self.scheddAd) - tmpLog.info('Success') + tmpLog.info("Success") break except Exception as e: - tmpLog.warning('Recreate condor collector and schedd failed: {0}'.format(e)) + tmpLog.warning("Recreate condor collector and schedd failed: {0}".format(e)) if i_try < retry: - tmpLog.warning('Failed. Retry...') + tmpLog.warning("Failed. Retry...") else: - tmpLog.warning('Retry {0} times. Still failed. Skipped'.format(i_try)) + tmpLog.warning("Retry {0} times. Still failed. Skipped".format(i_try)) return False i_try += 1 self.secman.invalidateAllSessions() @@ -357,8 +370,8 @@ class CondorJobQuery(six.with_metaclass(SingletonWithID, CondorClient)): classLock = threading.Lock() # Query commands orig_comStr_list = [ - 'condor_q -xml', - 'condor_history -xml', + "condor_q -xml", + "condor_history -xml", ] # Bad text of redundant xml roots to eleminate from condor XML badtext = """ @@ -370,12 +383,14 @@ class CondorJobQuery(six.with_metaclass(SingletonWithID, CondorClient)): """ def __init__(self, cacheEnable=False, cacheRefreshInterval=None, useCondorHistory=True, *args, **kwargs): - self.submissionHost = str(kwargs.get('id')) + self.submissionHost = str(kwargs.get("id")) # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost, get_ident(), id(self)), method_name='CondorJobQuery.__init__') + tmpLog = core_utils.make_logger( + baseLogger, "submissionHost={0} thrid={1} oid={2}".format(self.submissionHost, get_ident(), id(self)), method_name="CondorJobQuery.__init__" + ) # Initialize with self.classLock: - tmpLog.debug('Start') + tmpLog.debug("Start") CondorClient.__init__(self, self.submissionHost, *args, **kwargs) # For condor_q cache self.cacheEnable = cacheEnable @@ -383,19 +398,19 @@ def __init__(self, cacheEnable=False, cacheRefreshInterval=None, useCondorHistor self.cache = ([], 0) self.cacheRefreshInterval = cacheRefreshInterval self.useCondorHistory = useCondorHistory - tmpLog.debug('Initialize done') + tmpLog.debug("Initialize done") def get_all(self, batchIDs_list=[], allJobs=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobQuery.get_all') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobQuery.get_all") # Get all - tmpLog.debug('Start') + tmpLog.debug("Start") job_ads_all_dict = {} - if self.condor_api == 'python': + if 
self.condor_api == "python": try: job_ads_all_dict = self.query_with_python(batchIDs_list, allJobs) except Exception as e: - tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("Exception {0}: {1}".format(e.__class__.__name__, e)) raise else: job_ads_all_dict = self.query_with_command(batchIDs_list) @@ -403,56 +418,55 @@ def get_all(self, batchIDs_list=[], allJobs=False): def query_with_command(self, batchIDs_list=[]): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobQuery.query_with_command') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobQuery.query_with_command") # Start query - tmpLog.debug('Start query') + tmpLog.debug("Start query") job_ads_all_dict = {} batchIDs_set = set(batchIDs_list) for orig_comStr in self.orig_comStr_list: # String of batchIDs - batchIDs_str = ' '.join(list(batchIDs_set)) + batchIDs_str = " ".join(list(batchIDs_set)) # Command - if 'condor_q' in orig_comStr or ('condor_history' in orig_comStr and batchIDs_set): - name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else '' - pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else '' + if "condor_q" in orig_comStr or ("condor_history" in orig_comStr and batchIDs_set): + name_opt = "-name {0}".format(self.condor_schedd) if self.condor_schedd else "" + pool_opt = "-pool {0}".format(self.condor_pool) if self.condor_pool else "" ids = batchIDs_str - comStr = '{cmd} {name_opt} {pool_opt} {ids}'.format(cmd=orig_comStr, - name_opt=name_opt, - pool_opt=pool_opt, - ids=ids) + comStr = "{cmd} {name_opt} {pool_opt} {ids}".format(cmd=orig_comStr, name_opt=name_opt, pool_opt=pool_opt, ids=ids) else: # tmpLog.debug('No batch job left to query in this cycle by this thread') continue - tmpLog.debug('check with {0}'.format(comStr)) + tmpLog.debug("check with {0}".format(comStr)) (retCode, stdOut, stdErr) = _runShell(comStr) if retCode == 0: # Command succeeded - job_ads_xml_str = '\n'.join(str(stdOut).split(self.badtext)) - if '' in job_ads_xml_str: + job_ads_xml_str = "\n".join(str(stdOut).split(self.badtext)) + if "" in job_ads_xml_str: # Found at least one job # XML parsing xml_root = ET.fromstring(job_ads_xml_str) + def _getAttribute_tuple(attribute_xml_element): # Attribute name - _n = str(attribute_xml_element.get('n')) + _n = str(attribute_xml_element.get("n")) # Attribute value text - _t = ' '.join(attribute_xml_element.itertext()) + _t = " ".join(attribute_xml_element.itertext()) return (_n, _t) + # Every batch job - for _c in xml_root.findall('c'): + for _c in xml_root.findall("c"): job_ads_dict = dict() # Every attribute - attribute_iter = map(_getAttribute_tuple, _c.findall('a')) + attribute_iter = map(_getAttribute_tuple, _c.findall("a")) job_ads_dict.update(attribute_iter) batchid = get_batchid_from_job(job_ads_dict) - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) job_ads_all_dict[condor_job_id] = job_ads_dict # Remove batch jobs already gotten from the list if batchid in batchIDs_set: batchIDs_set.discard(batchid) else: # Job not found - tmpLog.debug('job not found with {0}'.format(comStr)) + tmpLog.debug("job not found with {0}".format(comStr)) continue else: # Command failed @@ -461,73 +475,73 @@ def _getAttribute_tuple(attribute_xml_element): if len(batchIDs_set) > 0: # Job unfound via both condor_q or 
condor_history, marked as unknown worker in harvester for batchid in batchIDs_set: - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) job_ads_all_dict[condor_job_id] = dict() - tmpLog.info( 'Unfound batch jobs of submissionHost={0}: {1}'.format( - self.submissionHost, ' '.join(list(batchIDs_set)) ) ) + tmpLog.info("Unfound batch jobs of submissionHost={0}: {1}".format(self.submissionHost, " ".join(list(batchIDs_set)))) # Return return job_ads_all_dict @CondorClient.renew_session_and_retry def query_with_python(self, batchIDs_list=[], allJobs=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobQuery.query_with_python') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobQuery.query_with_python") # Start query - tmpLog.debug('Start query') + tmpLog.debug("Start query") cache_fifo = None job_ads_all_dict = {} # make id sets batchIDs_set = set(batchIDs_list) clusterids_set = set([get_job_id_tuple_from_batchid(batchid)[0] for batchid in batchIDs_list]) # query from cache + def cache_query(constraint=None, projection=CONDOR_JOB_ADS_LIST, timeout=60): # query from condor xquery and update cache to fifo def update_cache(lockInterval=90): - tmpLog.debug('update_cache') + tmpLog.debug("update_cache") # acquire lock with score timestamp score = time.time() - self.cacheRefreshInterval + lockInterval lock_key = cache_fifo.lock(score=score) if lock_key is not None: # acquired lock, update from condor schedd - tmpLog.debug('got lock, updating cache') + tmpLog.debug("got lock, updating cache") jobs_iter_orig = self.schedd.xquery(constraint=constraint, projection=projection) jobs_iter = [] for job in jobs_iter_orig: try: jobs_iter.append(dict(job)) except Exception as e: - tmpLog.error('In updating cache schedd xquery; got exception {0}: {1} ; {2}'.format( - e.__class__.__name__, e, repr(job))) + tmpLog.error("In updating cache schedd xquery; got exception {0}: {1} ; {2}".format(e.__class__.__name__, e, repr(job))) timeNow = time.time() cache_fifo.put(jobs_iter, timeNow) self.cache = (jobs_iter, timeNow) # release lock retVal = cache_fifo.unlock(key=lock_key) if retVal: - tmpLog.debug('done update cache and unlock') + tmpLog.debug("done update cache and unlock") else: - tmpLog.warning('cannot unlock... Maybe something wrong') + tmpLog.warning("cannot unlock... Maybe something wrong") return jobs_iter else: - tmpLog.debug('cache fifo locked by other thread. Skipped') + tmpLog.debug("cache fifo locked by other thread. Skipped") return None + # remove invalid or outdated caches from fifo + def cleanup_cache(timeout=60): - tmpLog.debug('cleanup_cache') + tmpLog.debug("cleanup_cache") id_list = list() attempt_timestamp = time.time() n_cleanup = 0 while True: if time.time() > attempt_timestamp + timeout: - tmpLog.debug('time is up when cleanup cache. Skipped') + tmpLog.debug("time is up when cleanup cache. 
Skipped") break peeked_tuple = cache_fifo.peek(skip_item=True) if peeked_tuple is None: - tmpLog.debug('empty cache fifo') + tmpLog.debug("empty cache fifo") break - elif peeked_tuple.score is not None \ - and time.time() <= peeked_tuple.score + self.cacheRefreshInterval: - tmpLog.debug('nothing expired') + elif peeked_tuple.score is not None and time.time() <= peeked_tuple.score + self.cacheRefreshInterval: + tmpLog.debug("nothing expired") break elif peeked_tuple.id is not None: retVal = cache_fifo.delete([peeked_tuple.id]) @@ -535,9 +549,10 @@ def cleanup_cache(timeout=60): n_cleanup += retVal else: # problematic - tmpLog.warning('got nothing when cleanup cache, maybe problematic. Skipped') + tmpLog.warning("got nothing when cleanup cache, maybe problematic. Skipped") break - tmpLog.debug('cleaned up {0} objects in cache fifo'.format(n_cleanup)) + tmpLog.debug("cleaned up {0} objects in cache fifo".format(n_cleanup)) + # start jobs_iter = tuple() try: @@ -545,7 +560,7 @@ def cleanup_cache(timeout=60): while True: if time.time() > attempt_timestamp + timeout: # skip cache_query if too long - tmpLog.debug('cache_query got timeout ({0} seconds). Skipped '.format(timeout)) + tmpLog.debug("cache_query got timeout ({0} seconds). Skipped ".format(timeout)) break # get latest cache peeked_tuple = cache_fifo.peeklast(skip_item=True) @@ -554,12 +569,12 @@ def cleanup_cache(timeout=60): if peeked_tuple.id == cache_fifo.global_lock_id: if time.time() <= peeked_tuple.score + self.cacheRefreshInterval: # lock - tmpLog.debug('got fifo locked. Wait and retry...') + tmpLog.debug("got fifo locked. Wait and retry...") time.sleep(random.uniform(1, 5)) continue else: # expired lock - tmpLog.debug('got lock expired. Clean up and retry...') + tmpLog.debug("got lock expired. Clean up and retry...") cleanup_cache() continue elif time.time() <= peeked_tuple.score + self.cacheRefreshInterval: @@ -567,24 +582,26 @@ def cleanup_cache(timeout=60): _obj, _last_update = self.cache if _last_update >= peeked_tuple.score: # valid local cache - tmpLog.debug('valid local cache') + tmpLog.debug("valid local cache") jobs_iter = _obj else: # valid fifo cache - tmpLog.debug('update local cache from fifo') + tmpLog.debug("update local cache from fifo") peeked_tuple_with_item = cache_fifo.peeklast() - if peeked_tuple_with_item is not None \ - and peeked_tuple.id != cache_fifo.global_lock_id \ - and peeked_tuple_with_item.item is not None: + if ( + peeked_tuple_with_item is not None + and peeked_tuple.id != cache_fifo.global_lock_id + and peeked_tuple_with_item.item is not None + ): jobs_iter = cache_fifo.decode(peeked_tuple_with_item.item) self.cache = (jobs_iter, peeked_tuple_with_item.score) else: - tmpLog.debug('peeked invalid cache fifo object. Wait and retry...') + tmpLog.debug("peeked invalid cache fifo object. 
Wait and retry...") time.sleep(random.uniform(1, 5)) continue else: # cache expired - tmpLog.debug('update cache in fifo') + tmpLog.debug("update cache in fifo") retVal = update_cache() if retVal is not None: jobs_iter = retVal @@ -595,7 +612,7 @@ def cleanup_cache(timeout=60): if cache_fifo.size() == 0: if time.time() > attempt_timestamp + random.uniform(10, 30): # have waited for long enough, update cache - tmpLog.debug('waited enough, update cache in fifo') + tmpLog.debug("waited enough, update cache in fifo") retVal = update_cache() if retVal is not None: jobs_iter = retVal @@ -606,25 +623,26 @@ def cleanup_cache(timeout=60): continue except Exception as _e: tb_str = traceback.format_exc() - tmpLog.error('Error querying from cache fifo; {0} ; {1}'.format(_e, tb_str)) + tmpLog.error("Error querying from cache fifo; {0} ; {1}".format(_e, tb_str)) return jobs_iter + # query method options query_method_list = [self.schedd.xquery] if self.cacheEnable: - cache_fifo = CondorQCacheFifo(target=self.submissionHost, id='{0},{1}'.format(self.submissionHost, get_ident())) + cache_fifo = CondorQCacheFifo(target=self.submissionHost, id="{0},{1}".format(self.submissionHost, get_ident())) query_method_list.insert(0, cache_query) if self.useCondorHistory: query_method_list.append(self.schedd.history) # Go for query_method in query_method_list: # Make constraint - clusterids_str = ','.join(list(clusterids_set)) + clusterids_str = ",".join(list(clusterids_set)) if query_method is cache_query or allJobs: constraint = 'harvesterID =?= "{0}"'.format(harvesterID) else: - constraint = 'member(ClusterID, {{{0}}})'.format(clusterids_str) + constraint = "member(ClusterID, {{{0}}})".format(clusterids_str) if allJobs: - tmpLog.debug('Query method: {0} ; allJobs'.format(query_method.__name__)) + tmpLog.debug("Query method: {0} ; allJobs".format(query_method.__name__)) else: tmpLog.debug('Query method: {0} ; clusterids: "{1}"'.format(query_method.__name__, clusterids_str)) # Query @@ -633,10 +651,9 @@ def cleanup_cache(timeout=60): try: job_ads_dict = dict(job) except Exception as e: - tmpLog.error('In doing schedd xquery or history; got exception {0}: {1} ; {2}'.format( - e.__class__.__name__, e, repr(job))) + tmpLog.error("In doing schedd xquery or history; got exception {0}: {1} ; {2}".format(e.__class__.__name__, e, repr(job))) batchid = get_batchid_from_job(job_ads_dict) - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) job_ads_all_dict[condor_job_id] = job_ads_dict # Remove batch jobs already gotten from the list if not allJobs: @@ -647,10 +664,9 @@ def cleanup_cache(timeout=60): if not allJobs and len(batchIDs_set) > 0: # Job unfound via both condor_q or condor_history, marked as unknown worker in harvester for batchid in batchIDs_set: - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) job_ads_all_dict[condor_job_id] = dict() - tmpLog.info( 'Unfound batch jobs of submissionHost={0}: {1}'.format( - self.submissionHost, ' '.join(list(batchIDs_set)) ) ) + tmpLog.info("Unfound batch jobs of submissionHost={0}: {1}".format(self.submissionHost, " ".join(list(batchIDs_set)))) # Return return job_ads_all_dict @@ -661,22 +677,24 @@ class CondorJobSubmit(six.with_metaclass(SingletonWithID, CondorClient)): classLock = threading.Lock() def __init__(self, *args, **kwargs): - self.submissionHost = str(kwargs.get('id')) + self.submissionHost = str(kwargs.get("id")) # 
Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost, get_ident(), id(self)), method_name='CondorJobSubmit.__init__') + tmpLog = core_utils.make_logger( + baseLogger, "submissionHost={0} thrid={1} oid={2}".format(self.submissionHost, get_ident(), id(self)), method_name="CondorJobSubmit.__init__" + ) # Initialize - tmpLog.debug('Start') + tmpLog.debug("Start") self.lock = threading.Lock() CondorClient.__init__(self, self.submissionHost, *args, **kwargs) - tmpLog.debug('Initialize done') + tmpLog.debug("Initialize done") def submit(self, jdl_list, use_spool=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobSubmit.submit") # Get all - tmpLog.debug('Start') + tmpLog.debug("Start") job_ads_all_dict = {} - if self.condor_api == 'python': + if self.condor_api == "python": try: # TODO: submit_with_python will meet segfault or c++ error after many times of submission; need help from condor team # TODO: submit_with_python_proces has no such error but spawns some processes that will not terminate after harvester stops @@ -685,79 +703,78 @@ def submit(self, jdl_list, use_spool=False): # retVal = self.submit_with_python_proces(jdl_list, use_spool) retVal = self.submit_with_command(jdl_list, use_spool) except Exception as e: - tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("Exception {0}: {1}".format(e.__class__.__name__, e)) raise else: retVal = self.submit_with_command(jdl_list, use_spool) return retVal - def submit_with_command(self, jdl_list, use_spool=False, tmp_str='', keep_temp_sdf=False): + def submit_with_command(self, jdl_list, use_spool=False, tmp_str="", keep_temp_sdf=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_command') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobSubmit.submit_with_command") # Initialize - errStr = '' + errStr = "" batchIDs_list = [] # make sdf temp file from jdls - tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=(not keep_temp_sdf), - suffix='_{0}_cluster_submit.sdf'.format(tmp_str)) + tmpFile = tempfile.NamedTemporaryFile(mode="w", delete=(not keep_temp_sdf), suffix="_{0}_cluster_submit.sdf".format(tmp_str)) sdf_file = tmpFile.name - tmpFile.write('\n\n'.join(jdl_list)) + tmpFile.write("\n\n".join(jdl_list)) tmpFile.flush() # make condor remote options - name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else '' - pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else '' - spool_opt = '-remote -spool' if use_spool and self.condor_schedd else '' + name_opt = "-name {0}".format(self.condor_schedd) if self.condor_schedd else "" + pool_opt = "-pool {0}".format(self.condor_pool) if self.condor_pool else "" + spool_opt = "-remote -spool" if use_spool and self.condor_schedd else "" # command - comStr = 'condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format( - sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt) + comStr = "condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}".format( + sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, 
spool_opt=spool_opt + ) # submit - tmpLog.debug('submit with command: {0}'.format(comStr)) + tmpLog.debug("submit with command: {0}".format(comStr)) try: - p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode except Exception as e: - stdOut = '' + stdOut = "" stdErr = core_utils.dump_error_message(tmpLog, no_message=True) retCode = 1 - errStr = '{0}: {1}'.format(e.__class__.__name__, e) + errStr = "{0}: {1}".format(e.__class__.__name__, e) finally: tmpFile.close() - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) if retCode == 0: # extract clusterid and n_jobs job_id_match = None - for tmp_line_str in stdOut.split('\n'): - job_id_match = re.search('^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str) + for tmp_line_str in stdOut.split("\n"): + job_id_match = re.search("^(\d+) job[(]s[)] submitted to cluster (\d+)\.$", tmp_line_str) if job_id_match: break if job_id_match is not None: n_jobs = int(job_id_match.group(1)) clusterid = job_id_match.group(2) - batchIDs_list = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)] - tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list))) + batchIDs_list = ["{0}.{1}".format(clusterid, procid) for procid in range(n_jobs)] + tmpLog.debug("submitted {0} jobs: {1}".format(n_jobs, " ".join(batchIDs_list))) else: - errStr = 'no job submitted: {0}'.format(errStr) + errStr = "no job submitted: {0}".format(errStr) tmpLog.error(errStr) else: - errStr = '{0} ; {1}'.format(stdErr, errStr) - tmpLog.error('submission failed: {0}'.format(errStr)) + errStr = "{0} ; {1}".format(stdErr, errStr) + tmpLog.error("submission failed: {0}".format(errStr)) # Return return (batchIDs_list, errStr) @CondorClient.renew_session_and_retry def submit_with_python(self, jdl_list, use_spool=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_python') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobSubmit.submit_with_python") # Start - tmpLog.debug('Start') + tmpLog.debug("Start") # Initialize - errStr = '' + errStr = "" batchIDs_list = [] # Make list of jdl map with dummy submit objects - jdl_map_list = [ dict(htcondor.Submit(jdl).items()) for jdl in jdl_list ] + jdl_map_list = [dict(htcondor.Submit(jdl).items()) for jdl in jdl_list] # Go submit_obj = htcondor.Submit() try: @@ -767,28 +784,27 @@ def submit_with_python(self, jdl_list, use_spool=False): clusterid = submit_result.cluster() first_proc = submit_result.first_proc() num_proc = submit_result.num_procs() - batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid) - for procid in range(first_proc, first_proc + num_proc)]) + batchIDs_list.extend(["{0}.{1}".format(clusterid, procid) for procid in range(first_proc, first_proc + num_proc)]) except RuntimeError as e: - errStr = '{0}: {1}'.format(e.__class__.__name__, e) - tmpLog.error('submission failed: {0}'.format(errStr)) + errStr = "{0}: {1}".format(e.__class__.__name__, e) + tmpLog.error("submission failed: {0}".format(errStr)) raise if batchIDs_list: n_jobs = len(batchIDs_list) - tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' 
'.join(batchIDs_list))) + tmpLog.debug("submitted {0} jobs: {1}".format(n_jobs, " ".join(batchIDs_list))) elif not errStr: - tmpLog.error('submitted nothing') - tmpLog.debug('Done') + tmpLog.error("submitted nothing") + tmpLog.debug("Done") # Return return (batchIDs_list, errStr) def submit_with_python_process(self, jdl_list, use_spool=False): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_python_process') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobSubmit.submit_with_python_process") # Start - tmpLog.debug('Start') + tmpLog.debug("Start") # Make list of jdl map with dummy submit objects - jdl_map_list = [ dict(htcondor.Submit(jdl).items()) for jdl in jdl_list ] + jdl_map_list = [dict(htcondor.Submit(jdl).items()) for jdl in jdl_list] # Go mp_queue = multiprocessing.Queue() mp_process = multiprocessing.Process(target=condor_submit_process, args=(mp_queue, self.submissionHost, jdl_map_list)) @@ -800,10 +816,10 @@ def submit_with_python_process(self, jdl_list, use_spool=False): mp_process.join() if batchIDs_list: n_jobs = len(batchIDs_list) - tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list))) + tmpLog.debug("submitted {0} jobs: {1}".format(n_jobs, " ".join(batchIDs_list))) elif not errStr: - tmpLog.error('submitted nothing') - tmpLog.debug('Done') + tmpLog.error("submitted nothing") + tmpLog.debug("Done") # Return return (batchIDs_list, errStr) @@ -814,26 +830,28 @@ class CondorJobManage(six.with_metaclass(SingletonWithID, CondorClient)): classLock = threading.Lock() def __init__(self, *args, **kwargs): - self.submissionHost = str(kwargs.get('id')) + self.submissionHost = str(kwargs.get("id")) # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost, get_ident(), id(self)), method_name='CondorJobManage.__init__') + tmpLog = core_utils.make_logger( + baseLogger, "submissionHost={0} thrid={1} oid={2}".format(self.submissionHost, get_ident(), id(self)), method_name="CondorJobManage.__init__" + ) # Initialize - tmpLog.debug('Start') + tmpLog.debug("Start") self.lock = threading.Lock() CondorClient.__init__(self, self.submissionHost, *args, **kwargs) - tmpLog.debug('Initialize done') + tmpLog.debug("Initialize done") def remove(self, batchIDs_list=[]): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobManage.remove") # Get all - tmpLog.debug('Start') + tmpLog.debug("Start") job_ads_all_dict = {} - if self.condor_api == 'python': + if self.condor_api == "python": try: retVal = self.remove_with_python(batchIDs_list) except Exception as e: - tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("Exception {0}: {1}".format(e.__class__.__name__, e)) raise else: retVal = self.remove_with_command(batchIDs_list) @@ -841,7 +859,7 @@ def remove(self, batchIDs_list=[]): def remove_with_command(self, batchIDs_list=[]): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove_with_command') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), 
method_name="CondorJobManage.remove_with_command") # if workspec.batchID is None: # tmpLog.info('Found workerID={0} has submissionHost={1} batchID={2} . Cannot kill. Skipped '.format( # workspec.workerID, workspec.submissionHost, workspec.batchID)) @@ -894,12 +912,12 @@ def remove_with_command(self, batchIDs_list=[]): @CondorClient.renew_session_and_retry def remove_with_python(self, batchIDs_list=[]): # Make logger - tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove_with_python') + tmpLog = core_utils.make_logger(baseLogger, "submissionHost={0}".format(self.submissionHost), method_name="CondorJobManage.remove_with_python") # Start - tmpLog.debug('Start') + tmpLog.debug("Start") # Acquire class lock with self.classLock: - tmpLog.debug('Got class lock') + tmpLog.debug("Got class lock") # Initialize ret_list = [] retMap = {} @@ -907,18 +925,18 @@ def remove_with_python(self, batchIDs_list=[]): n_jobs = len(batchIDs_list) act_ret = self.schedd.act(htcondor.JobAction.Remove, batchIDs_list) # Check if all jobs clear (off from schedd queue) - is_all_clear = (n_jobs == act_ret['TotalAlreadyDone'] + act_ret['TotalNotFound'] + act_ret['TotalSuccess']) + is_all_clear = n_jobs == act_ret["TotalAlreadyDone"] + act_ret["TotalNotFound"] + act_ret["TotalSuccess"] if act_ret and is_all_clear: - tmpLog.debug('removed {0} jobs: {1}'.format(n_jobs, ','.join(batchIDs_list))) + tmpLog.debug("removed {0} jobs: {1}".format(n_jobs, ",".join(batchIDs_list))) for batchid in batchIDs_list: - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) - retMap[condor_job_id] = (True, '') + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) + retMap[condor_job_id] = (True, "") else: - tmpLog.error('job removal failed; batchIDs_list={0}, got: {1}'.format(batchIDs_list, act_ret)) + tmpLog.error("job removal failed; batchIDs_list={0}, got: {1}".format(batchIDs_list, act_ret)) # need to query queue for unterminated jobs not removed yet - clusterids_set = set([ get_job_id_tuple_from_batchid(batchid)[0] for batchid in batchIDs_list ]) - clusterids_str = ','.join(list(clusterids_set)) - constraint = 'member(ClusterID, {{{0}}}) && JobStatus =!= 3 && JobStatus =!= 4'.format(clusterids_str) + clusterids_set = set([get_job_id_tuple_from_batchid(batchid)[0] for batchid in batchIDs_list]) + clusterids_str = ",".join(list(clusterids_set)) + constraint = "member(ClusterID, {{{0}}}) && JobStatus =!= 3 && JobStatus =!= 4".format(clusterids_str) jobs_iter = self.schedd.xquery(constraint=constraint, projection=CONDOR_JOB_ADS_LIST) all_batchid_map = {} ok_batchid_list = [] @@ -928,17 +946,21 @@ def remove_with_python(self, batchIDs_list=[]): batchid = get_batchid_from_job(job_ads_dict) all_batchid_map[batchid] = job_ads_dict for batchid in batchIDs_list: - condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid) + condor_job_id = "{0}#{1}".format(self.submissionHost, batchid) if batchid in all_batchid_map: ng_batchid_list.append(batchid) - retMap[condor_job_id] = (False, 'batchID={0} still unterminated in condor queue'.format(batchid)) + retMap[condor_job_id] = (False, "batchID={0} still unterminated in condor queue".format(batchid)) else: ok_batchid_list.append(batchid) - retMap[condor_job_id] = (True, '') - tmpLog.debug('removed {0} jobs: {1} ; failed to remove {2} jobs: {3}'.format( - len(ok_batchid_list), ','.join(ok_batchid_list), len(ng_batchid_list), ','.join(ng_batchid_list))) - tmpLog.debug('Done') + retMap[condor_job_id] = 
(True, "") + tmpLog.debug( + "removed {0} jobs: {1} ; failed to remove {2} jobs: {3}".format( + len(ok_batchid_list), ",".join(ok_batchid_list), len(ng_batchid_list), ",".join(ng_batchid_list) + ) + ) + tmpLog.debug("Done") # Return return retMap -#=============================================================== + +# =============================================================== diff --git a/pandaharvester/harvestermisc/idds_utils.py b/pandaharvester/harvestermisc/idds_utils.py index 9f93a4c5..daded67f 100644 --- a/pandaharvester/harvestermisc/idds_utils.py +++ b/pandaharvester/harvestermisc/idds_utils.py @@ -1,5 +1,6 @@ import os import requests + try: import subprocess32 as subprocess except ImportError: @@ -8,18 +9,18 @@ # get HP point def get_hp_point(idds_url, task_id, point_id, tmp_log, verbose): - url = os.path.join(idds_url, 'idds', 'hpo', str(task_id), 'null', str(point_id), 'null', 'null') + url = os.path.join(idds_url, "idds", "hpo", str(task_id), "null", str(point_id), "null", "null") try: if verbose: tmp_log.debug("getting HP point from {0}".format(url)) r = requests.get(url, verify=False) if verbose: - tmp_log.debug('status: {0}, body: {1}'.format(r.status_code, r.text)) + tmp_log.debug("status: {0}, body: {1}".format(r.status_code, r.text)) if r.status_code != requests.codes.ok: False, "bad http status {0} when getting point (ID={1}) : {2}".format(r.status_code, point_id, r.text) tmp_dict = r.json() for i in tmp_dict: - if i['id'] == point_id: + if i["id"] == point_id: return True, i except Exception as e: errStr = "failed to get point (ID={0}) : {1}".format(point_id, str(e)) @@ -29,17 +30,17 @@ def get_hp_point(idds_url, task_id, point_id, tmp_log, verbose): # update HP point def update_hp_point(idds_url, task_id, point_id, loss, tmp_log, verbose): - url = os.path.join(idds_url, 'idds', 'hpo', str(task_id), 'null', str(point_id), str(loss)) + url = os.path.join(idds_url, "idds", "hpo", str(task_id), "null", str(point_id), str(loss)) try: if verbose: tmp_log.debug("updating HP point at {0}".format(url)) r = requests.put(url, verify=False) if verbose: - tmp_log.debug('status: {0}, body: {1}'.format(r.status_code, r.text)) + tmp_log.debug("status: {0}, body: {1}".format(r.status_code, r.text)) if r.status_code != requests.codes.ok: False, "bad http status {0} when updating point (ID={1}) : {2}".format(r.status_code, point_id, r.text) tmp_dict = r.json() - if tmp_dict['status'] == 0: + if tmp_dict["status"] == 0: return True, None except Exception as e: errStr = "failed to update point (ID={0}) : {1}".format(point_id, str(e)) diff --git a/pandaharvester/harvestermisc/info_utils.py b/pandaharvester/harvestermisc/info_utils.py index b7a25b29..5b81f937 100644 --- a/pandaharvester/harvestermisc/info_utils.py +++ b/pandaharvester/harvestermisc/info_utils.py @@ -11,7 +11,7 @@ harvesterID = harvester_config.master.harvester_id -resolver_config = getattr(harvester_config.qconf, 'resolverConfig', {}) +resolver_config = getattr(harvester_config.qconf, "resolverConfig", {}) class PandaQueuesDict(six.with_metaclass(SingletonWithID, dict, PluginBase)): @@ -20,13 +20,14 @@ class PandaQueuesDict(six.with_metaclass(SingletonWithID, dict, PluginBase)): Key is PanDA Resource name (rather than PanDA Queue name) Able to query with either PanDA Queue name or PanDA Resource name """ + def __init__(self, **kwargs): dict.__init__(self) PluginBase.__init__(self, **kwargs) self.lock = threading.Lock() self.dbInterface = DBInterface() - self.cacher_key = kwargs.get('cacher_key', 'panda_queues.json') - 
self.refresh_period = resolver_config.get('refreshPeriod', 300) + self.cacher_key = kwargs.get("cacher_key", "panda_queues.json") + self.refresh_period = resolver_config.get("refreshPeriod", 300) self.last_refresh_ts = 0 self._refresh() @@ -44,10 +45,10 @@ def _refresh(self): self.last_refresh_ts = time.time() if panda_queues_cache and isinstance(panda_queues_cache.data, dict): panda_queues_dict = panda_queues_cache.data - for (k, v) in iteritems(panda_queues_dict): + for k, v in iteritems(panda_queues_dict): try: - panda_resource = v['panda_resource'] - assert k == v['nickname'] + panda_resource = v["panda_resource"] + assert k == v["nickname"] except Exception: pass else: @@ -76,7 +77,7 @@ def get_panda_queue_name(self, panda_resource): Return PanDA Queue name with specified PanDA Resource name """ try: - panda_queue = self.get(panda_resource).get('nickname') + panda_queue = self.get(panda_resource).get("nickname") return panda_queue except Exception: return None @@ -87,17 +88,15 @@ def get_queue_status(self, panda_resource): if panda_queue_dict is None: return None # offline if not with harvester or not of this harvester instance - if panda_queue_dict.get('pilot_manager') not in ['Harvester'] \ - or panda_queue_dict.get('harvester') != harvesterID: - return 'offline' - return panda_queue_dict['status'] + if panda_queue_dict.get("pilot_manager") not in ["Harvester"] or panda_queue_dict.get("harvester") != harvesterID: + return "offline" + return panda_queue_dict["status"] # get all queue names of this harvester instance def get_all_queue_names(self): names = set() for queue_name, queue_dict in iteritems(self): - if queue_dict.get('pilot_manager') in ['Harvester'] \ - and queue_dict.get('harvester') == harvesterID: + if queue_dict.get("pilot_manager") in ["Harvester"] and queue_dict.get("harvester") == harvesterID: names.add(queue_name) return names @@ -106,8 +105,7 @@ def is_ups_queue(self, panda_resource): panda_queue_dict = self.get(panda_resource) if panda_queue_dict is None: return False - if panda_queue_dict.get('capability') == 'ucore' \ - and panda_queue_dict.get('workflow') == 'pull_ups': + if panda_queue_dict.get("capability") == "ucore" and panda_queue_dict.get("workflow") == "pull_ups": return True return False @@ -118,8 +116,7 @@ def is_grandly_unified_queue(self, panda_resource): return False # initial, temporary nomenclature - if 'grandly_unified' in panda_queue_dict.get('catchall') \ - or panda_queue_dict.get('type') == 'unified': + if "grandly_unified" in panda_queue_dict.get("catchall") or panda_queue_dict.get("type") == "unified": return True return False @@ -130,7 +127,7 @@ def get_harvester_params(self, panda_resource): if panda_queue_dict is None: return dict() else: - return panda_queue_dict.get('params', dict()) + return panda_queue_dict.get("params", dict()) # get harvester_template def get_harvester_template(self, panda_resource): @@ -138,7 +135,7 @@ def get_harvester_template(self, panda_resource): if panda_queue_dict is None: return None else: - return panda_queue_dict.get('harvester_template', '') + return panda_queue_dict.get("harvester_template", "") # get a tuple of type (production, analysis, etc.) 
and workflow def get_type_workflow(self, panda_resource): @@ -147,22 +144,22 @@ def get_type_workflow(self, panda_resource): pq_type = None workflow = None else: - pq_type = panda_queue_dict.get('type') - if pq_type == 'unified': # use production templates - pq_type = 'production' - workflow = panda_queue_dict.get('workflow') + pq_type = panda_queue_dict.get("type") + if pq_type == "unified": # use production templates + pq_type = "production" + workflow = panda_queue_dict.get("workflow") return pq_type, workflow def get_prorated_maxwdir_GiB(self, panda_resource, worker_corecount): try: panda_queue_dict = self.get(panda_resource) - maxwdir = panda_queue_dict.get('maxwdir') / 1024 # convert to GiB - corecount = panda_queue_dict.get('corecount') - if panda_queue_dict.get('capability') == 'ucore': + maxwdir = panda_queue_dict.get("maxwdir") / 1024 # convert to GiB + corecount = panda_queue_dict.get("corecount") + if panda_queue_dict.get("capability") == "ucore": maxwdir_prorated = maxwdir * worker_corecount / corecount else: maxwdir_prorated = maxwdir except Exception: maxwdir_prorated = 0 - return maxwdir_prorated \ No newline at end of file + return maxwdir_prorated diff --git a/pandaharvester/harvestermisc/info_utils_k8s.py b/pandaharvester/harvestermisc/info_utils_k8s.py index 88af4555..69308514 100644 --- a/pandaharvester/harvestermisc/info_utils_k8s.py +++ b/pandaharvester/harvestermisc/info_utils_k8s.py @@ -2,39 +2,38 @@ class PandaQueuesDictK8s(PandaQueuesDict): - def get_k8s_scheduler_settings(self, panda_resource): # this is how the affinity settings are declared in CRIC - key_affinity = 'k8s.scheduler.use_score_mcore_affinity' - key_anti_affinity = 'k8s.scheduler.use_score_mcore_anti_affinity' + key_affinity = "k8s.scheduler.use_score_mcore_affinity" + key_anti_affinity = "k8s.scheduler.use_score_mcore_anti_affinity" params = self.get_harvester_params(panda_resource) ret_map = {} try: - ret_map['use_affinity'] = params[key_affinity] + ret_map["use_affinity"] = params[key_affinity] except KeyError: # return default value - ret_map['use_affinity'] = True + ret_map["use_affinity"] = True try: - ret_map['use_anti_affinity'] = params[key_anti_affinity] + ret_map["use_anti_affinity"] = params[key_anti_affinity] except KeyError: # return default value - ret_map['use_anti_affinity'] = False + ret_map["use_anti_affinity"] = False # this is how the affinity settings are declared in CRIC - key_priority_class = 'k8s.scheduler.priorityClassName' - key_priority_class_score = 'k8s.scheduler.priorityClassName.score' - key_priority_class_score_himem = 'k8s.scheduler.priorityClassName.score_himem' - key_priority_class_mcore = 'k8s.scheduler.priorityClassName.mcore' - key_priority_class_mcore_himem = 'k8s.scheduler.priorityClassName.mcore_himem' - - ret_map['priority_class'] = params.get(key_priority_class, None) - ret_map['priority_class_score'] = params.get(key_priority_class_score, None) - ret_map['priority_class_score_himem'] = params.get(key_priority_class_score_himem, None) - ret_map['priority_class_mcore'] = params.get(key_priority_class_mcore, None) - ret_map['priority_class_mcore_himem'] = params.get(key_priority_class_mcore_himem, None) + key_priority_class = "k8s.scheduler.priorityClassName" + key_priority_class_score = "k8s.scheduler.priorityClassName.score" + key_priority_class_score_himem = "k8s.scheduler.priorityClassName.score_himem" + key_priority_class_mcore = "k8s.scheduler.priorityClassName.mcore" + key_priority_class_mcore_himem = "k8s.scheduler.priorityClassName.mcore_himem" + + 
ret_map["priority_class"] = params.get(key_priority_class, None) + ret_map["priority_class_score"] = params.get(key_priority_class_score, None) + ret_map["priority_class_score_himem"] = params.get(key_priority_class_score_himem, None) + ret_map["priority_class_mcore"] = params.get(key_priority_class_mcore, None) + ret_map["priority_class_mcore_himem"] = params.get(key_priority_class_mcore_himem, None) return ret_map @@ -43,49 +42,49 @@ def get_k8s_resource_settings(self, panda_resource): ret_map = {} # this is how the CPU parameters are declared in CRIC - key_cpu_scheduling_ratio = 'k8s.resources.requests.cpu_scheduling_ratio' - ret_map['cpu_scheduling_ratio'] = params.get(key_cpu_scheduling_ratio, 90) + key_cpu_scheduling_ratio = "k8s.resources.requests.cpu_scheduling_ratio" + ret_map["cpu_scheduling_ratio"] = params.get(key_cpu_scheduling_ratio, 90) # this is how the memory parameters are declared in CRIC - key_memory_limit = 'k8s.resources.limits.use_memory_limit' - key_memory_limit_safety_factor = 'k8s.resources.limits.memory_limit_safety_factor' - key_memory_limit_min_offset = 'k8s.resources.limits.memory_limit_min_offset' - key_memory_scheduling_ratio = 'k8s.resources.requests.memory_scheduling_ratio' + key_memory_limit = "k8s.resources.limits.use_memory_limit" + key_memory_limit_safety_factor = "k8s.resources.limits.memory_limit_safety_factor" + key_memory_limit_min_offset = "k8s.resources.limits.memory_limit_min_offset" + key_memory_scheduling_ratio = "k8s.resources.requests.memory_scheduling_ratio" - ret_map['use_memory_limit'] = params.get(key_memory_limit, False) - ret_map['memory_limit_safety_factor'] = params.get(key_memory_limit_safety_factor, 100) - ret_map['memory_limit_min_offset'] = params.get(key_memory_limit_min_offset, 0) # in MiB to be consistent with minRamCount - ret_map['memory_scheduling_ratio'] = params.get(key_memory_scheduling_ratio, 100) + ret_map["use_memory_limit"] = params.get(key_memory_limit, False) + ret_map["memory_limit_safety_factor"] = params.get(key_memory_limit_safety_factor, 100) + ret_map["memory_limit_min_offset"] = params.get(key_memory_limit_min_offset, 0) # in MiB to be consistent with minRamCount + ret_map["memory_scheduling_ratio"] = params.get(key_memory_scheduling_ratio, 100) # this is how the ephemeral storage parameters are declared in CRIC - key_ephemeral_storage = 'k8s.resources.use_ephemeral_storage_resource_specs' - key_ephemeral_storage_resources_offset = 'k8s.resources.ephemeral_storage_offset' - key_ephemeral_storage_limit_safety_factor = 'k8s.resources.limits.ephemeral_storage_limit_safety_factor' + key_ephemeral_storage = "k8s.resources.use_ephemeral_storage_resource_specs" + key_ephemeral_storage_resources_offset = "k8s.resources.ephemeral_storage_offset" + key_ephemeral_storage_limit_safety_factor = "k8s.resources.limits.ephemeral_storage_limit_safety_factor" - ret_map['use_ephemeral_storage'] = params.get(key_ephemeral_storage, True) # use ephemeral storage unless explicitly disabled - ret_map['ephemeral_storage_limit_safety_factor'] = params.get(key_ephemeral_storage_limit_safety_factor, 100) - ret_map['ephemeral_storage_offset'] = params.get(key_ephemeral_storage_resources_offset, 0) # should come in MiB + ret_map["use_ephemeral_storage"] = params.get(key_ephemeral_storage, True) # use ephemeral storage unless explicitly disabled + ret_map["ephemeral_storage_limit_safety_factor"] = params.get(key_ephemeral_storage_limit_safety_factor, 100) + ret_map["ephemeral_storage_offset"] = 
params.get(key_ephemeral_storage_resources_offset, 0) # should come in MiB # decide whether to kill on maxtime - use_active_deadline_seconds = 'k8s.use_active_deadline_seconds' + use_active_deadline_seconds = "k8s.use_active_deadline_seconds" - ret_map['use_active_deadline_seconds'] = params.get(use_active_deadline_seconds, True) # kill on max time + ret_map["use_active_deadline_seconds"] = params.get(use_active_deadline_seconds, True) # kill on max time return ret_map def get_k8s_namespace(self, panda_resource): - default_namespace = 'default' + default_namespace = "default" # 1. check if there is an associated CE and use the queue name as namespace panda_queue_dict = self.get(panda_resource, {}) try: - namespace = panda_queue_dict['queues'][0]['ce_queue_name'] + namespace = panda_queue_dict["queues"][0]["ce_queue_name"] return namespace except (KeyError, TypeError, IndexError, ValueError): pass # 2. alternatively, check if namespace defined in the associated parameter section - key_namespace = 'k8s.namespace' + key_namespace = "k8s.namespace" params = self.get_harvester_params(panda_resource) try: @@ -97,28 +96,25 @@ def get_k8s_namespace(self, panda_resource): return namespace def get_k8s_host_image(self, panda_resource): - # check if host_image defined in the associated parameter section - key_host_image = 'k8s.host_image' + key_host_image = "k8s.host_image" params = self.get_harvester_params(panda_resource) host_image = params.get(key_host_image, None) return host_image def get_k8s_pilot_dir(self, panda_resource): - # TODO: check if it can be replaced by an existing PQ field like tmpdir or wntmpdir # check if pilot_dir_mount defined in the associated parameter section - key_pilot_dir = 'k8s.volumes.pilot_dir_mount' + key_pilot_dir = "k8s.volumes.pilot_dir_mount" params = self.get_harvester_params(panda_resource) - pilot_dir = params.get(key_pilot_dir, '/pilotdir') + pilot_dir = params.get(key_pilot_dir, "/pilotdir") return pilot_dir def get_k8s_annotations(self, panda_resource): - # check if there are annotations to be specified - key_safe_to_evict = 'k8s.annotations.safe_to_evict' + key_safe_to_evict = "k8s.annotations.safe_to_evict" params = self.get_harvester_params(panda_resource) safe_to_evict = params.get(key_safe_to_evict, None) @@ -126,10 +122,9 @@ def get_k8s_annotations(self, panda_resource): return safe_to_evict def get_k8s_pilot_proxy_check(self, panda_resource): - # by default we tell the pilot not to run the proxy checks # but some sites insist on activating it - key_pilot_proxy_check = 'k8s.pilot_proxy_check' + key_pilot_proxy_check = "k8s.pilot_proxy_check" params = self.get_harvester_params(panda_resource) pilot_proxy_check = params.get(key_pilot_proxy_check, False) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 8d8f6444..b9d5f1ec 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -14,26 +14,26 @@ from pandaharvester.harvestermisc.info_utils_k8s import PandaQueuesDictK8s from pandaharvester.harvestercore import core_utils -base_logger = core_utils.setup_logger('k8s_utils') +base_logger = core_utils.setup_logger("k8s_utils") -CONFIG_DIR = '/scratch/jobconfig' -EXEC_DIR = '/scratch/executables' -GiB_TO_GB = 2 ** 30 / 10.0 ** 9 +CONFIG_DIR = "/scratch/jobconfig" +EXEC_DIR = "/scratch/executables" +GiB_TO_GB = 2**30 / 10.0**9 # command and image defaults DEF_COMMAND = ["/usr/bin/bash"] DEF_ARGS = ["-c", "cd; python $EXEC_DIR/pilots_starter.py || true"] 
-DEF_IMAGE = 'atlasadc/atlas-grid-centos7' +DEF_IMAGE = "atlasadc/atlas-grid-centos7" -class k8s_Client(object): +class k8s_Client(object): def __init__(self, namespace, config_file=None, queue_name=None): if not os.path.isfile(config_file): - raise RuntimeError('Cannot find k8s config file: {0}'.format(config_file)) + raise RuntimeError("Cannot find k8s config file: {0}".format(config_file)) config.load_kube_config(config_file=config_file) self.corev1 = client.CoreV1Api() self.batchv1 = client.BatchV1Api() - self.deletev1 = client.V1DeleteOptions(propagation_policy='Background') + self.deletev1 = client.V1DeleteOptions(propagation_policy="Background") self.panda_queues_dict = PandaQueuesDictK8s() self.namespace = namespace @@ -45,211 +45,202 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str, - pilot_python_option, pilot_version, host_image, cert, max_time=None): - - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='create_job_from_yaml') + def create_job_from_yaml( + self, yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str, pilot_python_option, pilot_version, host_image, cert, max_time=None + ): + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="create_job_from_yaml") # consider PULL mode as default, unless specified - submit_mode = 'PULL' + submit_mode = "PULL" # create the configmap in push mode worker_id = None - if work_spec.mapType != 'NoJob': - submit_mode = 'PUSH' + if work_spec.mapType != "NoJob": + submit_mode = "PUSH" worker_id = str(work_spec.workerID) res = self.create_configmap(work_spec) if not res: # if the configmap creation failed, don't submit a job because the pod creation will hang - return res, 'Failed to create a configmap' + return res, "Failed to create a configmap" # retrieve panda queue information queue_name = self.panda_queues_dict.get_panda_queue_name(work_spec.computingSite) # set the worker name - worker_name = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID) # this will be the batch id later on - yaml_content['metadata']['name'] = worker_name + worker_name = yaml_content["metadata"]["name"] + "-" + str(work_spec.workerID) # this will be the batch id later on + yaml_content["metadata"]["name"] = worker_name # set the resource type and other metadata to filter the pods - yaml_content['spec']['template'].setdefault('metadata', {}) - yaml_content['spec']['template']['metadata'].update({'labels': - {'resourceType': str(work_spec.resourceType), - 'prodSourceLabel': str(prod_source_label), - 'pq': str(work_spec.computingSite) - } - }) + yaml_content["spec"]["template"].setdefault("metadata", {}) + yaml_content["spec"]["template"]["metadata"].update( + {"labels": {"resourceType": str(work_spec.resourceType), "prodSourceLabel": str(prod_source_label), "pq": str(work_spec.computingSite)}} + ) # this flag should be respected by the k8s autoscaler not relocate (kill) the job during a scale down safe_to_evict = self.panda_queues_dict.get_k8s_annotations(work_spec.computingSite) if safe_to_evict is False: - yaml_content['spec']['template']['metadata'].update({'annotations': - { - 'cluster-autoscaler.kubernetes.io/safe-to-evict': 'false' - } - }) + yaml_content["spec"]["template"]["metadata"].update({"annotations": {"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"}}) # fill the container details. 
we can only handle one container (take the first, delete the rest) - yaml_containers = yaml_content['spec']['template']['spec']['containers'] - del (yaml_containers[1:len(yaml_containers)]) + yaml_containers = yaml_content["spec"]["template"]["spec"]["containers"] + del yaml_containers[1 : len(yaml_containers)] container_env = yaml_containers[0] - container_env.setdefault('resources', {}) + container_env.setdefault("resources", {}) # set the container image if host_image: # images defined in CRIC have absolute preference - container_env['image'] = host_image - elif 'image' not in container_env: # take default image only if not defined in yaml template - container_env['image'] = DEF_IMAGE + container_env["image"] = host_image + elif "image" not in container_env: # take default image only if not defined in yaml template + container_env["image"] = DEF_IMAGE - if 'command' not in container_env: - container_env['command'] = DEF_COMMAND - container_env['args'] = DEF_ARGS + if "command" not in container_env: + container_env["command"] = DEF_COMMAND + container_env["args"] = DEF_ARGS # set the resources (CPU and memory) we need for the container # note that predefined values in the yaml template will NOT be overwritten # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod # The CPU & memory settings will affect the QoS for the pod - container_env.setdefault('resources', {}) + container_env.setdefault("resources", {}) resource_settings = self.panda_queues_dict.get_k8s_resource_settings(work_spec.computingSite) pilot_dir = self.panda_queues_dict.get_k8s_pilot_dir(work_spec.computingSite) # CPU resources - cpu_scheduling_ratio = resource_settings['cpu_scheduling_ratio'] + cpu_scheduling_ratio = resource_settings["cpu_scheduling_ratio"] if work_spec.nCore > 0: # CPU requests - container_env['resources'].setdefault('requests', {}) - if 'cpu' not in container_env['resources']['requests']: - container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_scheduling_ratio / 100.0) + container_env["resources"].setdefault("requests", {}) + if "cpu" not in container_env["resources"]["requests"]: + container_env["resources"]["requests"]["cpu"] = str(work_spec.nCore * cpu_scheduling_ratio / 100.0) # CPU limits - container_env['resources'].setdefault('limits', {}) - if 'cpu' not in container_env['resources']['limits']: - container_env['resources']['limits']['cpu'] = str(work_spec.nCore) + container_env["resources"].setdefault("limits", {}) + if "cpu" not in container_env["resources"]["limits"]: + container_env["resources"]["limits"]["cpu"] = str(work_spec.nCore) # Memory resources - use_memory_limit = resource_settings['use_memory_limit'] - memory_limit_safety_factor = resource_settings['memory_limit_safety_factor'] - memory_limit_min_offset = resource_settings['memory_limit_min_offset'] - memory_scheduling_ratio = resource_settings['memory_scheduling_ratio'] + use_memory_limit = resource_settings["use_memory_limit"] + memory_limit_safety_factor = resource_settings["memory_limit_safety_factor"] + memory_limit_min_offset = resource_settings["memory_limit_min_offset"] + memory_scheduling_ratio = resource_settings["memory_scheduling_ratio"] if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB # memory requests - container_env['resources'].setdefault('requests', {}) - if 'memory' not in container_env['resources']['requests']: + container_env["resources"].setdefault("requests", {}) + if "memory" not in container_env["resources"]["requests"]: 
memory_request = str(work_spec.minRamCount * memory_scheduling_ratio / 100.0) - container_env['resources']['requests']['memory'] = str(memory_request) + 'Mi' + container_env["resources"]["requests"]["memory"] = str(memory_request) + "Mi" # memory limits: kubernetes is very aggressive killing jobs due to memory, hence making this field optional # and adding configuration possibilities to add a safety factor if use_memory_limit: - container_env['resources'].setdefault('limits', {}) - if 'memory' not in container_env['resources']['limits']: - mem_limit = max(work_spec.minRamCount + memory_limit_min_offset, - work_spec.minRamCount * memory_limit_safety_factor / 100.0) - container_env['resources']['limits']['memory'] = str(mem_limit) + 'Mi' + container_env["resources"].setdefault("limits", {}) + if "memory" not in container_env["resources"]["limits"]: + mem_limit = max(work_spec.minRamCount + memory_limit_min_offset, work_spec.minRamCount * memory_limit_safety_factor / 100.0) + container_env["resources"]["limits"]["memory"] = str(mem_limit) + "Mi" # Ephemeral storage resources - use_ephemeral_storage = resource_settings['use_ephemeral_storage'] - ephemeral_storage_offset_GiB = resource_settings['ephemeral_storage_offset'] / 1024 - ephemeral_storage_limit_safety_factor = resource_settings['ephemeral_storage_limit_safety_factor'] + use_ephemeral_storage = resource_settings["use_ephemeral_storage"] + ephemeral_storage_offset_GiB = resource_settings["ephemeral_storage_offset"] / 1024 + ephemeral_storage_limit_safety_factor = resource_settings["ephemeral_storage_limit_safety_factor"] if use_ephemeral_storage: - maxwdir_prorated_GiB = self.panda_queues_dict.get_prorated_maxwdir_GiB(work_spec.computingSite, - work_spec.nCore) + maxwdir_prorated_GiB = self.panda_queues_dict.get_prorated_maxwdir_GiB(work_spec.computingSite, work_spec.nCore) # ephemeral storage requests - container_env['resources'].setdefault('requests', {}) - if 'ephemeral-storage' not in container_env['resources']['requests']: + container_env["resources"].setdefault("requests", {}) + if "ephemeral-storage" not in container_env["resources"]["requests"]: eph_storage_request_GiB = maxwdir_prorated_GiB + ephemeral_storage_offset_GiB eph_storage_request_MiB = round(eph_storage_request_GiB * 1024, 2) - container_env['resources']['requests']['ephemeral-storage'] = str(eph_storage_request_MiB) + 'Mi' + container_env["resources"]["requests"]["ephemeral-storage"] = str(eph_storage_request_MiB) + "Mi" # ephemeral storage limits - container_env['resources'].setdefault('limits', {}) - if 'ephemeral-storage' not in container_env['resources']['limits']: - eph_storage_limit_GiB = (maxwdir_prorated_GiB + ephemeral_storage_offset_GiB) \ - * ephemeral_storage_limit_safety_factor / 100.0 + container_env["resources"].setdefault("limits", {}) + if "ephemeral-storage" not in container_env["resources"]["limits"]: + eph_storage_limit_GiB = (maxwdir_prorated_GiB + ephemeral_storage_offset_GiB) * ephemeral_storage_limit_safety_factor / 100.0 eph_storage_limit_MiB = round(eph_storage_limit_GiB * 1024, 2) - container_env['resources']['limits']['ephemeral-storage'] = str(eph_storage_limit_MiB) + 'Mi' + container_env["resources"]["limits"]["ephemeral-storage"] = str(eph_storage_limit_MiB) + "Mi" # add the ephemeral storage and mount it on pilot_dir - yaml_content['spec']['template']['spec'].setdefault('volumes', []) - yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] - exists = list(filter(lambda vol: vol['name'] == 'pilot-dir', yaml_volumes)) + 
yaml_content["spec"]["template"]["spec"].setdefault("volumes", []) + yaml_volumes = yaml_content["spec"]["template"]["spec"]["volumes"] + exists = list(filter(lambda vol: vol["name"] == "pilot-dir", yaml_volumes)) if not exists: - yaml_volumes.append({'name': 'pilot-dir', 'emptyDir': {}}) + yaml_volumes.append({"name": "pilot-dir", "emptyDir": {}}) - container_env.setdefault('volumeMounts', []) - exists = list(filter(lambda vol_mount: vol_mount['name'] == 'pilot-dir', container_env['volumeMounts'])) + container_env.setdefault("volumeMounts", []) + exists = list(filter(lambda vol_mount: vol_mount["name"] == "pilot-dir", container_env["volumeMounts"])) if not exists: - container_env['volumeMounts'].append({'name': 'pilot-dir', 'mountPath': pilot_dir}) + container_env["volumeMounts"].append({"name": "pilot-dir", "mountPath": pilot_dir}) - container_env.setdefault('env', []) + container_env.setdefault("env", []) # try to retrieve the stdout log file name try: - log_file_name = work_spec.workAttributes['stdout'] + log_file_name = work_spec.workAttributes["stdout"] except (KeyError, AttributeError): - tmp_log.debug('work_spec does not have stdout workAttribute, using default') - log_file_name = '' + tmp_log.debug("work_spec does not have stdout workAttribute, using default") + log_file_name = "" # get the option to activate the pilot proxy check pilot_proxy_check = self.panda_queues_dict.get_k8s_pilot_proxy_check(work_spec.computingSite) - container_env['env'].extend([ - {'name': 'computingSite', 'value': work_spec.computingSite}, - {'name': 'pandaQueueName', 'value': queue_name}, - {'name': 'resourceType', 'value': work_spec.resourceType}, - {'name': 'prodSourceLabel', 'value': prod_source_label}, - {'name': 'pilotType', 'value': pilot_type}, - {'name': 'pilotUrlOpt', 'value': pilot_url_str}, - {'name': 'pythonOption', 'value': pilot_python_option}, - {'name': 'pilotVersion', 'value': pilot_version}, - {'name': 'jobType', 'value': work_spec.jobType}, - {'name': 'proxySecretPath', 'value': cert}, - {'name': 'workerID', 'value': str(work_spec.workerID)}, - {'name': 'pilotProxyCheck', 'value': str(pilot_proxy_check)}, - {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W}, - {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R}, - {'name': 'stdout_name', 'value': log_file_name}, - {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id}, - {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)}, - {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}, - {'name': 'submit_mode', 'value': submit_mode}, - {'name': 'EXEC_DIR', 'value': EXEC_DIR}, - {'name': 'TMPDIR', 'value': pilot_dir}, - {'name': 'HOME', 'value': pilot_dir}, - {'name': 'PANDA_HOSTNAME', 'valueFrom': {'fieldRef': {'apiVersion': 'v1', 'fieldPath': 'spec.nodeName'}}}, - {'name': 'K8S_JOB_ID', 'value': worker_name} - ]) + container_env["env"].extend( + [ + {"name": "computingSite", "value": work_spec.computingSite}, + {"name": "pandaQueueName", "value": queue_name}, + {"name": "resourceType", "value": work_spec.resourceType}, + {"name": "prodSourceLabel", "value": prod_source_label}, + {"name": "pilotType", "value": pilot_type}, + {"name": "pilotUrlOpt", "value": pilot_url_str}, + {"name": "pythonOption", "value": pilot_python_option}, + {"name": "pilotVersion", "value": pilot_version}, + {"name": "jobType", "value": work_spec.jobType}, + {"name": "proxySecretPath", "value": cert}, + {"name": "workerID", "value": 
str(work_spec.workerID)}, + {"name": "pilotProxyCheck", "value": str(pilot_proxy_check)}, + {"name": "logs_frontend_w", "value": harvester_config.pandacon.pandaCacheURL_W}, + {"name": "logs_frontend_r", "value": harvester_config.pandacon.pandaCacheURL_R}, + {"name": "stdout_name", "value": log_file_name}, + {"name": "PANDA_JSID", "value": "harvester-" + harvester_config.master.harvester_id}, + {"name": "HARVESTER_WORKER_ID", "value": str(work_spec.workerID)}, + {"name": "HARVESTER_ID", "value": harvester_config.master.harvester_id}, + {"name": "submit_mode", "value": submit_mode}, + {"name": "EXEC_DIR", "value": EXEC_DIR}, + {"name": "TMPDIR", "value": pilot_dir}, + {"name": "HOME", "value": pilot_dir}, + {"name": "PANDA_HOSTNAME", "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "spec.nodeName"}}}, + {"name": "K8S_JOB_ID", "value": worker_name}, + ] + ) # add the pilots starter configmap - yaml_content['spec']['template']['spec'].setdefault('volumes', []) - yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] - yaml_volumes.append({'name': 'pilots-starter', 'configMap': {'name': 'pilots-starter'}}) + yaml_content["spec"]["template"]["spec"].setdefault("volumes", []) + yaml_volumes = yaml_content["spec"]["template"]["spec"]["volumes"] + yaml_volumes.append({"name": "pilots-starter", "configMap": {"name": "pilots-starter"}}) # mount the volume to the filesystem - container_env.setdefault('volumeMounts', []) - container_env['volumeMounts'].append({'name': 'pilots-starter', 'mountPath': EXEC_DIR}) + container_env.setdefault("volumeMounts", []) + container_env["volumeMounts"].append({"name": "pilots-starter", "mountPath": EXEC_DIR}) # in push mode, add the configmap as a volume to the pod - if submit_mode == 'PUSH' and worker_id: - yaml_content['spec']['template']['spec'].setdefault('volumes', []) - yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] - yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}}) + if submit_mode == "PUSH" and worker_id: + yaml_content["spec"]["template"]["spec"].setdefault("volumes", []) + yaml_volumes = yaml_content["spec"]["template"]["spec"]["volumes"] + yaml_volumes.append({"name": "job-config", "configMap": {"name": worker_id}}) # mount the volume to the filesystem - container_env.setdefault('volumeMounts', []) - container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR}) + container_env.setdefault("volumeMounts", []) + container_env["volumeMounts"].append({"name": "job-config", "mountPath": CONFIG_DIR}) # set the affinity scheduling_settings = self.panda_queues_dict.get_k8s_scheduler_settings(work_spec.computingSite) - use_affinity = scheduling_settings['use_affinity'] - use_anti_affinity = scheduling_settings['use_anti_affinity'] - if (use_affinity or use_anti_affinity) and 'affinity' not in yaml_content['spec']['template']['spec']: + use_affinity = scheduling_settings["use_affinity"] + use_anti_affinity = scheduling_settings["use_anti_affinity"] + if (use_affinity or use_anti_affinity) and "affinity" not in yaml_content["spec"]["template"]["spec"]: yaml_content = self.set_affinity(yaml_content, use_affinity, use_anti_affinity) # set the priority classes. 
Specific priority classes have precedence over general priority classes priority_class = None - priority_class_key = 'priority_class_{0}'.format(work_spec.resourceType.lower()) + priority_class_key = "priority_class_{0}".format(work_spec.resourceType.lower()) priority_class_specific = scheduling_settings.get(priority_class_key, None) - priority_class_key = 'priority_class' + priority_class_key = "priority_class" priority_class_general = scheduling_settings.get(priority_class_key, None) if priority_class_specific: @@ -257,17 +248,17 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot elif priority_class_general: priority_class = priority_class_general - if priority_class and 'priorityClassName' not in yaml_content['spec']['template']['spec']: - yaml_content['spec']['template']['spec']['priorityClassName'] = priority_class + if priority_class and "priorityClassName" not in yaml_content["spec"]["template"]["spec"]: + yaml_content["spec"]["template"]["spec"]["priorityClassName"] = priority_class # set max_time to avoid having a pod running forever - use_active_deadline_seconds = resource_settings['use_active_deadline_seconds'] - if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec'] and use_active_deadline_seconds: + use_active_deadline_seconds = resource_settings["use_active_deadline_seconds"] + if "activeDeadlineSeconds" not in yaml_content["spec"]["template"]["spec"] and use_active_deadline_seconds: if not max_time: # 4 days max_time = 4 * 24 * 23600 - yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time + yaml_content["spec"]["template"]["spec"]["activeDeadlineSeconds"] = max_time - tmp_log.debug('creating job {0}'.format(yaml_content)) + tmp_log.debug("creating job {0}".format(yaml_content)) rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp, yaml_content @@ -275,24 +266,23 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, pilot def generate_ls_from_wsl(self, workspec_list=[]): if workspec_list: batch_ids_list = [workspec.batchID for workspec in workspec_list if workspec.batchID] - batch_ids_concat = ','.join(batch_ids_list) - label_selector = 'job-name in ({0})'.format(batch_ids_concat) + batch_ids_concat = ",".join(batch_ids_list) + label_selector = "job-name in ({0})".format(batch_ids_concat) else: - label_selector = '' + label_selector = "" return label_selector def get_workers_info(self, workspec_list=[]): - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='get_workers_info') - tmp_log.debug('start') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="get_workers_info") + tmp_log.debug("start") label_selector = self.generate_ls_from_wsl(workspec_list) # get detailed information for available pods pods_dict = self.get_pods_info(label_selector) if pods_dict is None: # communication failure to the cluster - tmp_log.error('Communication failure to cluster. Stopping') + tmp_log.error("Communication failure to cluster. 
Stopping") return None # complement pod information with coarse job information @@ -309,7 +299,7 @@ def get_workers_info(self, workspec_list=[]): worker_info.update(jobs_dict[batch_id]) workers_dict[batch_id] = worker_info - tmp_log.debug('done') + tmp_log.debug("done") return workers_dict def get_pods_info(self, label_selector): @@ -317,35 +307,33 @@ def get_pods_info(self, label_selector): # We use job information in case the pod has been deleted (e.g. in Google bulk exercises), because the job # should persist up the TTL. - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='get_pods_info') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="get_pods_info") try: ret = self.corev1.list_namespaced_pod(namespace=self.namespace, label_selector=label_selector) except Exception as _e: - tmp_log.error('Failed call to list_namespaced_pod with: {0}'.format(_e)) + tmp_log.error("Failed call to list_namespaced_pod with: {0}".format(_e)) return None # None needs to be treated differently than [] by the caller pods_dict = {} for i in ret.items: - - job_name = i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None + job_name = i.metadata.labels["job-name"] if i.metadata.labels and "job-name" in i.metadata.labels else None # pod information pod_info = { - 'pod_name': i.metadata.name, - 'pod_start_time': i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time, - 'pod_status': i.status.phase, - 'pod_status_conditions': i.status.conditions, - 'pod_status_message': i.status.message, - 'containers_state': [] + "pod_name": i.metadata.name, + "pod_start_time": i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time, + "pod_status": i.status.phase, + "pod_status_conditions": i.status.conditions, + "pod_status_message": i.status.message, + "containers_state": [], } # sub-container information if i.status.container_statuses: for cs in i.status.container_statuses: if cs.state: - pod_info['containers_state'].append(cs.state) + pod_info["containers_state"].append(cs.state) pods_dict[job_name] = pod_info @@ -353,13 +341,11 @@ def get_pods_info(self, label_selector): def filter_pods_info(self, pods_list, job_name=None): if job_name: - pods_list = [i for i in pods_list if i['job_name'] == job_name] + pods_list = [i for i in pods_list if i["job_name"] == job_name] return pods_list def get_jobs_info(self, label_selector): - - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='get_jobs_info') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="get_jobs_info") jobs_dict = {} @@ -381,103 +367,92 @@ def get_jobs_info(self, label_selector): n_pods_failed = i.status.failed job_info = { - 'job_status': status, - 'job_status_reason': status_reason, - 'job_status_message': status_message, - 'n_pods_succeeded': n_pods_succeeded, - 'n_pods_failed': n_pods_failed + "job_status": status, + "job_status_reason": status_reason, + "job_status_message": status_message, + "n_pods_succeeded": n_pods_succeeded, + "n_pods_failed": n_pods_failed, } jobs_dict[name] = job_info except Exception as _e: - tmp_log.error('Failed call to list_namespaced_job with: {0}'.format(_e)) + tmp_log.error("Failed call to list_namespaced_job with: {0}".format(_e)) return jobs_dict def delete_pods(self, pod_name_list): - tmp_log = 
core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='delete_pods') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="delete_pods") - tmp_log.debug('Going to delete {0} PODs: {1}'.format(len(pod_name_list), pod_name_list)) + tmp_log.debug("Going to delete {0} PODs: {1}".format(len(pod_name_list), pod_name_list)) ret_list = list() for pod_name in pod_name_list: - rsp = {'name': pod_name} + rsp = {"name": pod_name} try: - self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, - grace_period_seconds=0) + self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) except ApiException as _e: - rsp['errMsg'] = '' if _e.status == 404 else _e.reason + rsp["errMsg"] = "" if _e.status == 404 else _e.reason except Exception as _e: - rsp['errMsg'] = _e.reason + rsp["errMsg"] = _e.reason else: - rsp['errMsg'] = '' + rsp["errMsg"] = "" ret_list.append(rsp) - tmp_log.debug('Done with: {0}'.format(ret_list)) + tmp_log.debug("Done with: {0}".format(ret_list)) return ret_list def delete_job(self, job_name): - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0} job_name={1}'.format(self.queue_name, job_name), - method_name='delete_job') - tmp_log.debug('Going to delete JOB {0}'.format(job_name)) + tmp_log = core_utils.make_logger(base_logger, "queue_name={0} job_name={1}".format(self.queue_name, job_name), method_name="delete_job") + tmp_log.debug("Going to delete JOB {0}".format(job_name)) try: - self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, - grace_period_seconds=0) - tmp_log.debug('Deleted JOB {0}'.format(job_name)) + self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + tmp_log.debug("Deleted JOB {0}".format(job_name)) except Exception as _e: - tmp_log.error('Failed to delete JOB {0} with: {1}'.format(job_name, _e)) + tmp_log.error("Failed to delete JOB {0} with: {1}".format(job_name, _e)) def delete_config_map(self, config_map_name): - self.corev1.delete_namespaced_config_map(name=config_map_name, namespace=self.namespace, body=self.deletev1, - grace_period_seconds=0) + self.corev1.delete_namespaced_config_map(name=config_map_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) def set_affinity(self, yaml_content, use_affinity, use_anti_affinity): - if not use_affinity and not use_anti_affinity: # we are not supposed to use any affinity setting for this queue return yaml_content - yaml_content['spec']['template']['spec']['affinity'] = {} - yaml_affinity = yaml_content['spec']['template']['spec']['affinity'] + yaml_content["spec"]["template"]["spec"]["affinity"] = {} + yaml_affinity = yaml_content["spec"]["template"]["spec"]["affinity"] - scores = ['SCORE', 'SCORE_HIMEM'] - mcores = ['MCORE', 'MCORE_HIMEM'] + scores = ["SCORE", "SCORE_HIMEM"] + mcores = ["MCORE", "MCORE_HIMEM"] - anti_affinity_matrix = {'SCORE': mcores, - 'SCORE_HIMEM': mcores, - 'MCORE': scores, - 'MCORE_HIMEM': scores} + anti_affinity_matrix = {"SCORE": mcores, "SCORE_HIMEM": mcores, "MCORE": scores, "MCORE_HIMEM": scores} affinity_spec = { - 'preferredDuringSchedulingIgnoredDuringExecution': [ - {'weight': 100, 'podAffinityTerm': { - 'labelSelector': { - 'matchExpressions': [ - { - 'key': 'resourceType', - 'operator': 'In', - 'values': ['SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'] - } - ] + 
"preferredDuringSchedulingIgnoredDuringExecution": [ + { + "weight": 100, + "podAffinityTerm": { + "labelSelector": { + "matchExpressions": [{"key": "resourceType", "operator": "In", "values": ["SCORE", "SCORE_HIMEM", "MCORE", "MCORE_HIMEM"]}] + }, + "topologyKey": "kubernetes.io/hostname", }, - 'topologyKey': 'kubernetes.io/hostname'} - } + } ] } - resource_type = yaml_content['spec']['template']['metadata']['labels']['resourceType'] + resource_type = yaml_content["spec"]["template"]["metadata"]["labels"]["resourceType"] if use_affinity and resource_type in scores: # resource type SCORE* should attract each other instead of spreading across the nodes - yaml_affinity['podAffinity'] = copy.deepcopy(affinity_spec) + yaml_affinity["podAffinity"] = copy.deepcopy(affinity_spec) if use_anti_affinity: # SCORE* will repel MCORE* and viceversa. The main reasoning was to keep nodes for MCORE # This setting depends on the size of the node vs the MCORE job - yaml_affinity['podAntiAffinity'] = copy.deepcopy(affinity_spec) - yaml_affinity['podAntiAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm'][ - 'labelSelector']['matchExpressions'][0]['values'] = anti_affinity_matrix[resource_type] + yaml_affinity["podAntiAffinity"] = copy.deepcopy(affinity_spec) + yaml_affinity["podAntiAffinity"]["preferredDuringSchedulingIgnoredDuringExecution"][0]["podAffinityTerm"]["labelSelector"]["matchExpressions"][0][ + "values" + ] = anti_affinity_matrix[resource_type] return yaml_content @@ -486,46 +461,44 @@ def create_or_patch_secret(self, file_list, secret_name): # kind = 'Secret' # type='kubernetes.io/tls' rsp = None - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='create_or_patch_secret') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="create_or_patch_secret") - metadata = {'name': secret_name, 'namespace': self.namespace} + metadata = {"name": secret_name, "namespace": self.namespace} data = {} for file_name in file_list: filename = os.path.basename(file_name) - with open(file_name, 'rb') as f: + with open(file_name, "rb") as f: content = f.read() data[filename] = base64.b64encode(content).decode() body = client.V1Secret(data=data, metadata=metadata) try: try: rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) - tmp_log.debug('Patched secret') + tmp_log.debug("Patched secret") except ApiException as e: - tmp_log.debug('Exception when patching secret: {0} . Try to create secret instead...'.format(e)) + tmp_log.debug("Exception when patching secret: {0} . Try to create secret instead...".format(e)) rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) - tmp_log.debug('Created secret') + tmp_log.debug("Created secret") except Exception as e: - tmp_log.error('Exception when patching or creating secret: {0}.'.format(e)) + tmp_log.error("Exception when patching or creating secret: {0}.".format(e)) return rsp def create_configmap(self, work_spec): # useful guide: https://matthewpalmer.net/kubernetes-app-developer/articles/ultimate-configmap-guide-kubernetes.html - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='create_configmap') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="create_configmap") try: worker_id = str(work_spec.workerID) # Get the access point. 
The messenger should have dropped the input files for the pilot here access_point = work_spec.get_access_point() - pjd = 'pandaJobData.out' + pjd = "pandaJobData.out" job_data_file = os.path.join(access_point, pjd) with open(job_data_file) as f: job_data_contents = f.read() - pfc = 'PoolFileCatalog_H.xml' + pfc = "PoolFileCatalog_H.xml" pool_file_catalog_file = os.path.join(access_point, pfc) with open(pool_file_catalog_file) as f: pool_file_catalog_contents = f.read() @@ -534,60 +507,57 @@ def create_configmap(self, work_spec): data = {pjd: job_data_contents, pfc: pool_file_catalog_contents} # instantiate the configmap object - metadata = {'name': worker_id, 'namespace': self.namespace} + metadata = {"name": worker_id, "namespace": self.namespace} config_map = client.V1ConfigMap(api_version="v1", kind="ConfigMap", data=data, metadata=metadata) # create the configmap object in K8s api_response = self.corev1.create_namespaced_config_map(namespace=self.namespace, body=config_map) - tmp_log.debug('Created configmap for worker id: {0}'.format(worker_id)) + tmp_log.debug("Created configmap for worker id: {0}".format(worker_id)) return True except Exception as e: - tmp_log.error('Could not create configmap with: {0}'.format(e)) + tmp_log.error("Could not create configmap with: {0}".format(e)) return False def create_or_patch_configmap_starter(self): # useful guide: https://matthewpalmer.net/kubernetes-app-developer/articles/ultimate-configmap-guide-kubernetes.html - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='create_or_patch_configmap_starter') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="create_or_patch_configmap_starter") try: - fn = 'pilots_starter.py' + fn = "pilots_starter.py" dirname = os.path.dirname(__file__) - pilots_starter_file = os.path.join(dirname, '../harvestercloud/{0}'.format(fn)) + pilots_starter_file = os.path.join(dirname, "../harvestercloud/{0}".format(fn)) with open(pilots_starter_file) as f: pilots_starter_contents = f.read() data = {fn: pilots_starter_contents} - name = 'pilots-starter' + name = "pilots-starter" # instantiate the configmap object - metadata = {'name': name, 'namespace': self.namespace} + metadata = {"name": name, "namespace": self.namespace} config_map = client.V1ConfigMap(api_version="v1", kind="ConfigMap", data=data, metadata=metadata) try: api_response = self.corev1.patch_namespaced_config_map(name=name, body=config_map, namespace=self.namespace) - tmp_log.debug('Patched pilots-starter config_map') + tmp_log.debug("Patched pilots-starter config_map") except ApiException as e: - tmp_log.debug('Exception when patching pilots-starter config_map: {0} . Try to create it instead...' - .format(e)) + tmp_log.debug("Exception when patching pilots-starter config_map: {0} . 
Try to create it instead...".format(e)) api_response = self.corev1.create_namespaced_config_map(namespace=self.namespace, body=config_map) - tmp_log.debug('Created pilots-starter config_map') + tmp_log.debug("Created pilots-starter config_map") return True except Exception as e: - tmp_log.error('Could not create configmap with: {0}'.format(e)) + tmp_log.error("Could not create configmap with: {0}".format(e)) return False def get_pod_logs(self, pod_name, previous=False): - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='get_pod_logs') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="get_pod_logs") try: rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous) - tmp_log.debug('Log file retrieved for {0}'.format(pod_name)) + tmp_log.debug("Log file retrieved for {0}".format(pod_name)) except Exception as e: - tmp_log.debug('Exception when getting logs for pod {0} : {1}. Skipped'.format(pod_name, e)) + tmp_log.debug("Exception when getting logs for pod {0} : {1}. Skipped".format(pod_name, e)) raise else: return rsp diff --git a/pandaharvester/harvestermisc/lancium_utils.py b/pandaharvester/harvestermisc/lancium_utils.py index a397b976..9aedfa3f 100644 --- a/pandaharvester/harvestermisc/lancium_utils.py +++ b/pandaharvester/harvestermisc/lancium_utils.py @@ -2,6 +2,8 @@ Lancium python API wrapper functions """ +from lancium.api.Data import Data +from lancium.api.Job import Job import os import time import datetime @@ -21,40 +23,41 @@ try: api_key = harvester_config.lancium.api_key except AttributeError: - raise RuntimeError('The configuration is missing the [lancium] section and/or the api_key entry') + raise RuntimeError("The configuration is missing the [lancium] section and/or the api_key entry") # The key needs to be set before importing the lancium API -os.environ['LANCIUM_API_KEY'] = api_key +os.environ["LANCIUM_API_KEY"] = api_key -from lancium.api.Job import Job -from lancium.api.Data import Data # logger -base_logger = core_utils.setup_logger('lancium_utils') +base_logger = core_utils.setup_logger("lancium_utils") -SECRETS_PATH = '/voms/' -SCRIPTS_PATH = '/scripts/' +SECRETS_PATH = "/voms/" +SCRIPTS_PATH = "/scripts/" LANCIUM_JOB_ATTRS_LIST = [ - 'id', - 'name', - 'status', - 'created_at', - 'updated_at', - 'submitted_at', - 'completed_at', - 'exit_code', + "id", + "name", + "status", + "created_at", + "updated_at", + "submitted_at", + "completed_at", + "exit_code", ] + def fake_callback(total_chunks, current_chunk): pass + def get_job_name_from_workspec(workspec): - job_name = '{0}:{1}'.format(harvester_config.master.harvester_id, workspec.workerID) - return job_name + job_name = "{0}:{1}".format(harvester_config.master.harvester_id, workspec.workerID) + return job_name + def get_workerid_from_job_name(job_name): - tmp_str_list = job_name.split(':') + tmp_str_list = job_name.split(":") harvester_id = None worker_id = None try: @@ -64,14 +67,17 @@ def get_workerid_from_job_name(job_name): pass return (harvester_id, worker_id) + def get_full_batch_id(submission_host, batch_id): - full_batch_id = '{0}#{1}'.format(submission_host, batch_id) + full_batch_id = "{0}#{1}".format(submission_host, batch_id) return full_batch_id + def get_full_batch_id_from_workspec(workspec): - full_batch_id = '{0}#{1}'.format(workspec.submissionHost, workspec.batchID) + full_batch_id = "{0}#{1}".format(workspec.submissionHost, workspec.batchID) return 
full_batch_id + def get_host_batch_id_map(workspec_list): """ Get a dictionary of submissionHost: list of batchIDs from workspec_list @@ -89,76 +95,75 @@ def get_host_batch_id_map(workspec_list): host_batch_id_map[host] = [batch_id] return host_batch_id_map + def timestamp_to_datetime(timestamp_str): - return datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S.%fZ') + return datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%fZ") class LanciumClient(object): - def __init__(self, submission_host, queue_name=None): self.submission_host = submission_host self.queue_name = queue_name - + def upload_file(self, local_path, lancium_path, force=True): - tmp_log = core_utils.make_logger(base_logger, method_name='upload_file') + tmp_log = core_utils.make_logger(base_logger, method_name="upload_file") try: tmp_log.debug("Uploading file {0}".format(local_path)) - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='upload_file') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="upload_file") - data = Data().create(lancium_path, 'file', source=os.path.abspath(local_path), force=force) + data = Data().create(lancium_path, "file", source=os.path.abspath(local_path), force=force) data.upload(os.path.abspath(local_path), fake_callback) ex = data.show(lancium_path)[0] tmp_log.debug("Done: {0}".format(ex.__dict__)) - return True, '' + return True, "" except Exception as _e: - error_message = 'Failed to upload file with {0}'.format(_e) - tmp_log.error('Failed to upload the file with {0}'.format(traceback.format_exc())) + error_message = "Failed to upload file with {0}".format(_e) + tmp_log.error("Failed to upload the file with {0}".format(traceback.format_exc())) return False, error_message def submit_job(self, **jobparams): # create and submit a job to lancium - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0}'.format(self.queue_name), - method_name='submit_job') + tmp_log = core_utils.make_logger(base_logger, "queue_name={0}".format(self.queue_name), method_name="submit_job") try: - tmp_log.debug('Creating and submitting a job') + tmp_log.debug("Creating and submitting a job") job = Job().create(**jobparams) - tmp_log.debug('Job created. name: {0}, id: {1}, status: {2}'.format(job.name, job.id, job.status)) + tmp_log.debug("Job created. name: {0}, id: {1}, status: {2}".format(job.name, job.id, job.status)) job.submit() - tmp_log.debug('Job submitted. name: {0}, id: {1}, status: {2}'.format(job.name, job.id, job.status)) + tmp_log.debug("Job submitted. 
name: {0}, id: {1}, status: {2}".format(job.name, job.id, job.status)) batch_id = str(job.id) return True, batch_id except Exception as _e: - error_message = 'Failed to create or submit a job with {0}'.format(_e) - tmp_log.error('Failed to create or submit a job with {0}'.format(traceback.format_exc())) + error_message = "Failed to create or submit a job with {0}".format(_e) + tmp_log.error("Failed to create or submit a job with {0}".format(traceback.format_exc())) return False, error_message def delete_job(self, job_id): # delete job by job ID - tmp_log = core_utils.make_logger(base_logger, 'queue_name={0} job_id={1}'.format(self.queue_name, job_id), - method_name='delete_job') - tmp_log.debug('Going to delete job {0}'.format(job_id)) + tmp_log = core_utils.make_logger(base_logger, "queue_name={0} job_id={1}".format(self.queue_name, job_id), method_name="delete_job") + tmp_log.debug("Going to delete job {0}".format(job_id)) Job.delete(job_id) - tmp_log.debug('Deleted job {0}'.format(job_id)) + tmp_log.debug("Deleted job {0}".format(job_id)) + class LanciumJobsCacheFifo(SpecialFIFOBase, metaclass=SingletonWithID): """ Cache FIFO for Lancium jobs """ + global_lock_id = -1 def __init__(self, target, *args, **kwargs): - name_suffix = target.split('.')[0] - name_suffix = re.sub('-', '_', name_suffix) - self.titleName = 'LanciumJobsCache_{0}'.format(name_suffix) + name_suffix = target.split(".")[0] + name_suffix = re.sub("-", "_", name_suffix) + self.titleName = "LanciumJobsCache_{0}".format(name_suffix) SpecialFIFOBase.__init__(self) def lock(self, score=None): - lock_key = format(int(random.random() * 2**32), 'x') + lock_key = format(int(random.random() * 2**32), "x") if score is None: score = time.time() retVal = self.putbyid(self.global_lock_id, lock_key, score) @@ -182,40 +187,42 @@ class LanciumJobQuery(object, metaclass=SingletonWithID): classLock = threading.Lock() def __init__(self, cacheEnable=False, cacheRefreshInterval=None, *args, **kwargs): - self.submission_host = str(kwargs.get('id')) + self.submission_host = str(kwargs.get("id")) # Make logger - tmpLog = core_utils.make_logger(base_logger, 'submissionHost={0} thrid={1} oid={2}'.format( - self.submission_host, get_ident(), id(self)), method_name='LanciumJobQuery.__init__') + tmpLog = core_utils.make_logger( + base_logger, "submissionHost={0} thrid={1} oid={2}".format(self.submission_host, get_ident(), id(self)), method_name="LanciumJobQuery.__init__" + ) # Initialize with self.classLock: - tmpLog.debug('Start') + tmpLog.debug("Start") # For cache self.cacheEnable = cacheEnable if self.cacheEnable: self.cache = ([], 0) self.cacheRefreshInterval = cacheRefreshInterval - tmpLog.debug('Initialize done') + tmpLog.debug("Initialize done") def query_jobs(self, batchIDs_list=[], all_jobs=False): # Make logger - tmpLog = core_utils.make_logger(base_logger, 'submissionHost={0}'.format(self.submission_host), method_name='LanciumJobQuery.query_jobs') + tmpLog = core_utils.make_logger(base_logger, "submissionHost={0}".format(self.submission_host), method_name="LanciumJobQuery.query_jobs") # Start query - tmpLog.debug('Start query') + tmpLog.debug("Start query") cache_fifo = None job_attr_all_dict = {} # make id sets batchIDs_set = set(batchIDs_list) # query from cache + def cache_query(batch_id_set, timeout=60): # query from lancium job and update cache to fifo def update_cache(lockInterval=90): - tmpLog.debug('update_cache') + tmpLog.debug("update_cache") # acquire lock with score timestamp score = time.time() - self.cacheRefreshInterval + 
lockInterval lock_key = cache_fifo.lock(score=score) if lock_key is not None: # acquired lock, update - tmpLog.debug('got lock, updating cache') + tmpLog.debug("got lock, updating cache") all_jobs_light_list = Job().all() jobs_iter = [] for job in all_jobs_light_list: @@ -227,38 +234,38 @@ def update_cache(lockInterval=90): one_job_dict[attr] = getattr(one_job_attr, attr, None) jobs_iter.append(one_job_dict) except Exception as e: - tmpLog.error('In update_cache all job; got exception {0}: {1} ; {2}'.format( - e.__class__.__name__, e, repr(job))) + tmpLog.error("In update_cache all job; got exception {0}: {1} ; {2}".format(e.__class__.__name__, e, repr(job))) timeNow = time.time() cache_fifo.put(jobs_iter, timeNow) self.cache = (jobs_iter, timeNow) # release lock retVal = cache_fifo.unlock(key=lock_key) if retVal: - tmpLog.debug('done update cache and unlock') + tmpLog.debug("done update cache and unlock") else: - tmpLog.warning('cannot unlock... Maybe something wrong') + tmpLog.warning("cannot unlock... Maybe something wrong") return jobs_iter else: - tmpLog.debug('cache fifo locked by other thread. Skipped') + tmpLog.debug("cache fifo locked by other thread. Skipped") return None + # remove invalid or outdated caches from fifo + def cleanup_cache(timeout=60): - tmpLog.debug('cleanup_cache') + tmpLog.debug("cleanup_cache") id_list = list() attempt_timestamp = time.time() n_cleanup = 0 while True: if time.time() > attempt_timestamp + timeout: - tmpLog.debug('time is up when cleanup cache. Skipped') + tmpLog.debug("time is up when cleanup cache. Skipped") break peeked_tuple = cache_fifo.peek(skip_item=True) if peeked_tuple is None: - tmpLog.debug('empty cache fifo') + tmpLog.debug("empty cache fifo") break - elif peeked_tuple.score is not None \ - and time.time() <= peeked_tuple.score + self.cacheRefreshInterval: - tmpLog.debug('nothing expired') + elif peeked_tuple.score is not None and time.time() <= peeked_tuple.score + self.cacheRefreshInterval: + tmpLog.debug("nothing expired") break elif peeked_tuple.id is not None: retVal = cache_fifo.delete([peeked_tuple.id]) @@ -266,9 +273,10 @@ def cleanup_cache(timeout=60): n_cleanup += retVal else: # problematic - tmpLog.warning('got nothing when cleanup cache, maybe problematic. Skipped') + tmpLog.warning("got nothing when cleanup cache, maybe problematic. Skipped") break - tmpLog.debug('cleaned up {0} objects in cache fifo'.format(n_cleanup)) + tmpLog.debug("cleaned up {0} objects in cache fifo".format(n_cleanup)) + # start jobs_iter = tuple() try: @@ -276,7 +284,7 @@ def cleanup_cache(timeout=60): while True: if time.time() > attempt_timestamp + timeout: # skip cache_query if too long - tmpLog.debug('cache_query got timeout ({0} seconds). Skipped '.format(timeout)) + tmpLog.debug("cache_query got timeout ({0} seconds). Skipped ".format(timeout)) break # get latest cache peeked_tuple = cache_fifo.peeklast(skip_item=True) @@ -285,12 +293,12 @@ def cleanup_cache(timeout=60): if peeked_tuple.id == cache_fifo.global_lock_id: if time.time() <= peeked_tuple.score + self.cacheRefreshInterval: # lock - tmpLog.debug('got fifo locked. Wait and retry...') + tmpLog.debug("got fifo locked. Wait and retry...") time.sleep(random.uniform(1, 5)) continue else: # expired lock - tmpLog.debug('got lock expired. Clean up and retry...') + tmpLog.debug("got lock expired. 
Clean up and retry...") cleanup_cache() continue elif time.time() <= peeked_tuple.score + self.cacheRefreshInterval: @@ -298,24 +306,26 @@ def cleanup_cache(timeout=60): _obj, _last_update = self.cache if _last_update >= peeked_tuple.score: # valid local cache - tmpLog.debug('valid local cache') + tmpLog.debug("valid local cache") jobs_iter = _obj else: # valid fifo cache - tmpLog.debug('update local cache from fifo') + tmpLog.debug("update local cache from fifo") peeked_tuple_with_item = cache_fifo.peeklast() - if peeked_tuple_with_item is not None \ - and peeked_tuple.id != cache_fifo.global_lock_id \ - and peeked_tuple_with_item.item is not None: + if ( + peeked_tuple_with_item is not None + and peeked_tuple.id != cache_fifo.global_lock_id + and peeked_tuple_with_item.item is not None + ): jobs_iter = cache_fifo.decode(peeked_tuple_with_item.item) self.cache = (jobs_iter, peeked_tuple_with_item.score) else: - tmpLog.debug('peeked invalid cache fifo object. Wait and retry...') + tmpLog.debug("peeked invalid cache fifo object. Wait and retry...") time.sleep(random.uniform(1, 5)) continue else: # cache expired - tmpLog.debug('update cache in fifo') + tmpLog.debug("update cache in fifo") retVal = update_cache() if retVal is not None: jobs_iter = retVal @@ -326,7 +336,7 @@ def cleanup_cache(timeout=60): if cache_fifo.size() == 0: if time.time() > attempt_timestamp + random.uniform(10, 30): # have waited for long enough, update cache - tmpLog.debug('waited enough, update cache in fifo') + tmpLog.debug("waited enough, update cache in fifo") retVal = update_cache() if retVal is not None: jobs_iter = retVal @@ -337,9 +347,9 @@ def cleanup_cache(timeout=60): continue except Exception as _e: tb_str = traceback.format_exc() - tmpLog.error('Error querying from cache fifo; {0} ; {1}'.format(_e, tb_str)) + tmpLog.error("Error querying from cache fifo; {0} ; {1}".format(_e, tb_str)) return jobs_iter - + def direct_query(batch_id_set, **kwargs): jobs_iter = [] batch_ids = batch_id_set @@ -351,8 +361,7 @@ def direct_query(batch_id_set, **kwargs): lancium_job_id = job.id batch_ids.add(lancium_job_id) except Exception as e: - tmpLog.error('In doing Job().all(); got exception {0}: {1} '.format( - e.__class__.__name__, e)) + tmpLog.error("In doing Job().all(); got exception {0}: {1} ".format(e.__class__.__name__, e)) for batch_id in batch_id_set: try: lancium_job_id = batch_id @@ -362,13 +371,13 @@ def direct_query(batch_id_set, **kwargs): one_job_dict[attr] = getattr(one_job_attr, attr, None) jobs_iter.append(one_job_dict) except Exception as e: - tmpLog.error('In doing Job().get({0}); got exception {1}: {2} '.format( - batch_id, e.__class__.__name__, e)) + tmpLog.error("In doing Job().get({0}); got exception {1}: {2} ".format(batch_id, e.__class__.__name__, e)) return jobs_iter + # query method options query_method_list = [direct_query] if self.cacheEnable: - cache_fifo = LanciumJobsCacheFifo(target=self.submission_host, idlist='{0},{1}'.format(self.submission_host, get_ident())) + cache_fifo = LanciumJobsCacheFifo(target=self.submission_host, idlist="{0},{1}".format(self.submission_host, get_ident())) query_method_list.insert(0, cache_query) # Go for query_method in query_method_list: @@ -377,10 +386,9 @@ def direct_query(batch_id_set, **kwargs): for job in jobs_iter: try: job_attr_dict = dict(job) - batch_id = job_attr_dict['id'] + batch_id = job_attr_dict["id"] except Exception as e: - tmpLog.error('in querying; got exception {0}: {1} ; {2}'.format( - e.__class__.__name__, e, repr(job))) + 
tmpLog.error("in querying; got exception {0}: {1} ; {2}".format(e.__class__.__name__, e, repr(job))) else: full_batch_id = get_full_batch_id(self.submission_host, batch_id) job_attr_all_dict[full_batch_id] = job_attr_dict @@ -395,7 +403,6 @@ def direct_query(batch_id_set, **kwargs): for batch_id in batchIDs_set: full_batch_id = get_full_batch_id(self.submission_host, batch_id) job_attr_all_dict[full_batch_id] = dict() - tmpLog.info('Unfound batch jobs of submissionHost={0}: {1}'.format(self.submission_host, - ' '.join(list(batchIDs_set)))) + tmpLog.info("Unfound batch jobs of submissionHost={0}: {1}".format(self.submission_host, " ".join(list(batchIDs_set)))) # Return return job_attr_all_dict diff --git a/pandaharvester/harvestermisc/rucio_utils.py b/pandaharvester/harvestermisc/rucio_utils.py index 92c026ad..3b0f8404 100644 --- a/pandaharvester/harvestermisc/rucio_utils.py +++ b/pandaharvester/harvestermisc/rucio_utils.py @@ -4,225 +4,210 @@ """ try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils -def rucio_create_dataset(tmpLog,datasetScope,datasetName): +def rucio_create_dataset(tmpLog, datasetScope, datasetName): # create the dataset try: # register dataset - lifetime = 7*24*60*60 - tmpLog.debug('register {0}:{1} lifetime = {2}' - .format(datasetScope, datasetName,lifetime)) - executable = ['/usr/bin/env', - 'rucio', 'add-dataset'] - executable += [ '--lifetime',('%d' %lifetime)] + lifetime = 7 * 24 * 60 * 60 + tmpLog.debug("register {0}:{1} lifetime = {2}".format(datasetScope, datasetName, lifetime)) + executable = ["/usr/bin/env", "rucio", "add-dataset"] + executable += ["--lifetime", ("%d" % lifetime)] executable += [datasetName] - tmpLog.debug('rucio add-dataset command: {0} '.format(executable)) - tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable)) - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True) - stdout,stderr = process.communicate() + tmpLog.debug("rucio add-dataset command: {0} ".format(executable)) + tmpLog.debug("rucio add-dataset command (for human): %s " % " ".join(executable)) + process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) + stdout, stderr = process.communicate() if process.returncode == 0: tmpLog.debug(stdout) - return True,'' + return True, "" else: # check what failed dataset_exists = False rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'Data Identifier Already Exists' in line: + for line in stdout.split("\n"): + if "Data Identifier Already Exists" in line: dataset_exists = True break - elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + elif "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True break if dataset_exists: - errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, - datasetName) + errMsg = "dataset {0}:{1} already exists".format(datasetScope, datasetName) tmpLog.debug(errMsg) - return True,errMsg + return True, errMsg elif rucio_sessions_limit_error: # do nothing - errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) + errStr = "Rucio returned error, will retry: stdout: {0}".format(stdout) tmpLog.warning(errStr) - return None,errStr + return None, errStr else: # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + errStr = "Rucio returned error : stdout: 
{0}".format(stdout) tmpLog.error(errStr) - return False,errStr + return False, errStr except Exception as e: - errMsg = 'Could not create dataset {0}:{1} with {2}'.format(datasetScope, - datasetName, - str(e)) + errMsg = "Could not create dataset {0}:{1} with {2}".format(datasetScope, datasetName, str(e)) core_utils.dump_error_message(tmpLog) tmpLog.error(errMsg) - return False,errMsg + return False, errMsg -def rucio_add_files_to_dataset(tmpLog,datasetScope,datasetName,fileList): - # add files to dataset + +def rucio_add_files_to_dataset(tmpLog, datasetScope, datasetName, fileList): + # add files to dataset try: - #create the to DID - to_did = '{0}:{1}'.format(datasetScope,datasetName) - executable = ['/usr/bin/env', - 'rucio', 'attach', to_did] + # create the to DID + to_did = "{0}:{1}".format(datasetScope, datasetName) + executable = ["/usr/bin/env", "rucio", "attach", to_did] # loop over the files to add for filename in fileList: - from_did = '{0}:{1}'.format(filename['scope'],filename['name']) + from_did = "{0}:{1}".format(filename["scope"], filename["name"]) executable += [from_did] - #print executable - tmpLog.debug('rucio attach command: {0} '.format(executable)) - tmpLog.debug('rucio attach command (for human): %s ' % ' '.join(executable)) + # print executable + tmpLog.debug("rucio attach command: {0} ".format(executable)) + tmpLog.debug("rucio attach command (for human): %s " % " ".join(executable)) + + process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + stdout, stderr = process.communicate() - stdout,stderr = process.communicate() - if process.returncode == 0: tmpLog.debug(stdout) - return True,'' + return True, "" else: # check what failed rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + for line in stdout.split("\n"): + if "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True break if rucio_sessions_limit_error: # do nothing - errStr = 'Rucio returned Sessions Limit error, will retry: stdout: {0}'.format(stdout) + errStr = "Rucio returned Sessions Limit error, will retry: stdout: {0}".format(stdout) tmpLog.warning(errStr) - return None,errStr + return None, errStr else: - # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + # some other Rucio error + errStr = "Rucio returned error : stdout: {0}".format(stdout) tmpLog.error(errStr) - return False,errStr - #except FileAlreadyExists: + return False, errStr + # except FileAlreadyExists: # # ignore if files already exist # pass except Exception: - errMsg = 'Could not add files to DS - {0}:{1} files - {2}'.format(datasetScope, - datasetName, - fileList) + errMsg = "Could not add files to DS - {0}:{1} files - {2}".format(datasetScope, datasetName, fileList) core_utils.dump_error_message(tmpLog) tmpLog.error(errMsg) - return False,errMsg + return False, errMsg + -def rucio_add_rule(tmpLog,datasetScope,datasetName,dstRSE): +def rucio_add_rule(tmpLog, datasetScope, datasetName, dstRSE): # add rule try: - tmpLog.debug('rucio add-rule {0}:{1} 1 {2}'.format(datasetScope, datasetName, - dstRSE)) - did = '{0}:{1}'.format(datasetScope,datasetName) - executable = ['/usr/bin/env', - 'rucio', 'add-rule',did,'1',dstRSE] + tmpLog.debug("rucio add-rule {0}:{1} 1 {2}".format(datasetScope, datasetName, dstRSE)) + did = "{0}:{1}".format(datasetScope, datasetName) + 
executable = ["/usr/bin/env", "rucio", "add-rule", did, "1", dstRSE] - #print executable + # print executable - tmpLog.debug('rucio add-rule command: {0} '.format(executable)) - tmpLog.debug('rucio add-rule command (for human): %s ' % ' '.join(executable)) + tmpLog.debug("rucio add-rule command: {0} ".format(executable)) + tmpLog.debug("rucio add-rule command (for human): %s " % " ".join(executable)) - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + + stdout, stderr = process.communicate() - stdout,stderr = process.communicate() - if process.returncode == 0: tmpLog.debug(stdout) - #parse stdout for rule id - rule_id = stdout.split('\n')[0] - return True,rule_id + # parse stdout for rule id + rule_id = stdout.split("\n")[0] + return True, rule_id else: # check what failed rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + for line in stdout.split("\n"): + if "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True break if rucio_sessions_limit_error: # do nothing - errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) + errStr = "Rucio returned error, will retry: stdout: {0}".format(stdout) tmpLog.warning(errStr) - return None,errStr + return None, errStr else: - # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + # some other Rucio error + errStr = "Rucio returned error : stdout: {0}".format(stdout) tmpLog.error(errStr) - return False,errStr + return False, errStr except Exception: core_utils.dump_error_message(tmpLog) - # treat as a temporary error + # treat as a temporary error tmpStat = False - tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName) - return tmpStat,tmpMsg + tmpMsg = "failed to add a rule for {0}:{1}".format(datasetScope, datasetName) + return tmpStat, tmpMsg + # !!!!!!! # Need to add files to replica. Need to write code # # !!!!!!! 
-def rucio_rule_info(tmpLog,rucioRule): + +def rucio_rule_info(tmpLog, rucioRule): # get rule-info - tmpLog.debug('rucio rule-info {0}'.format(rucioRule)) - try: - executable = ['/usr/bin/env', - 'rucio', 'rule-info',rucioRule] - #print executable + tmpLog.debug("rucio rule-info {0}".format(rucioRule)) + try: + executable = ["/usr/bin/env", "rucio", "rule-info", rucioRule] + # print executable + + tmpLog.debug("rucio rule-info command: {0} ".format(executable)) + tmpLog.debug("rucio rule-info command (for human): %s " % " ".join(executable)) - tmpLog.debug('rucio rule-info command: {0} '.format(executable)) - tmpLog.debug('rucio rule-info command (for human): %s ' % ' '.join(executable)) + process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + stdout, stderr = process.communicate() - stdout,stderr = process.communicate() - if process.returncode == 0: tmpLog.debug(stdout) # parse the output to get the state: - for line in stdout.split('\n'): - if 'State:' in line: + for line in stdout.split("\n"): + if "State:" in line: # get the State varible result = line.split() - return True,result - return None,'' + return True, result + return None, "" else: # check what failed rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + for line in stdout.split("\n"): + if "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True break if rucio_sessions_limit_error: # do nothing - errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) + errStr = "Rucio returned error, will retry: stdout: {0}".format(stdout) tmpLog.warning(errStr) - return None,errStr + return None, errStr else: - # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + # some other Rucio error + errStr = "Rucio returned error : stdout: {0}".format(stdout) tmpLog.error(errStr) - return False,errStr + return False, errStr except Exception: - errMsg = 'Could not run rucio rule-info {0}'.format(rucioRule) + errMsg = "Could not run rucio rule-info {0}".format(rucioRule) core_utils.dump_error_message(tmpLog) tmpLog.error(errMsg) - return False,errMsg + return False, errMsg + -''' +""" [dbenjamin@atlas28 ~]$ rucio rule-info 66a88e4d468a4845adcc66a66080b710 Id: 66a88e4d468a4845adcc66a66080b710 Account: pilot @@ -249,5 +234,4 @@ def rucio_rule_info(tmpLog,rucioRule): Notification: NO End of life: None Child Rule Id: None -''' - +""" diff --git a/pandaharvester/harvestermisc/selfcheck.py b/pandaharvester/harvestermisc/selfcheck.py index ea0f6d9f..8861aa41 100644 --- a/pandaharvester/harvestermisc/selfcheck.py +++ b/pandaharvester/harvestermisc/selfcheck.py @@ -7,12 +7,10 @@ from pandaharvester.panda_pkg_info import release_version as releaseVersion - - class harvesterPackageInfo(object): - """ - """ - _attributes = ('commit_info', 'version', 'info_digest') + """ """ + + _attributes = ("commit_info", "version", "info_digest") def __init__(self, local_info_file): self.local_info_file = local_info_file @@ -22,7 +20,7 @@ def __init__(self, local_info_file): @staticmethod def _get_hash(data): h = hashlib.md5() - h.update(str(data).encode('utf-8')) + h.update(str(data).encode("utf-8")) return h.hexdigest() @property @@ -33,7 +31,7 @@ def info_digest(self): def _local_info_dict(self): info_dict = {} try: - with open(self.local_info_file, 'r') as f: + with 
open(self.local_info_file, "r") as f: info_dict = json.load(f) except IOError as e: if e.errno == errno.ENOENT: @@ -46,9 +44,9 @@ def renew_local_info(self): info_dict = {} for attr in self._attributes: info_dict[attr] = getattr(self, attr) - with open(self.local_info_file, 'w') as f: + with open(self.local_info_file, "w") as f: json.dump(info_dict, f) @property def package_changed(self): - return self.info_digest != self._local_info_dict.get('info_digest') + return self.info_digest != self._local_info_dict.get("info_digest") diff --git a/pandaharvester/harvestermisc/titan_utils.py b/pandaharvester/harvestermisc/titan_utils.py index e66c492b..4c769f52 100644 --- a/pandaharvester/harvestermisc/titan_utils.py +++ b/pandaharvester/harvestermisc/titan_utils.py @@ -10,14 +10,14 @@ import datetime # logger -baseLogger = core_utils.setup_logger('titan_utils') +baseLogger = core_utils.setup_logger("titan_utils") class TitanUtils(PluginBase): # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - tmpLog = self.make_logger(baseLogger, method_name='__init__') + tmpLog = self.make_logger(baseLogger, method_name="__init__") tmpLog.info("Titan utils initiated") def get_batchjob_info(self, batchid): @@ -27,22 +27,22 @@ def get_batchjob_info(self, batchid): :return res - dictonary with job state and some timing: """ """ - :param batchid: - :return: + :param batchid: + :return: """ - tmpLog = self.make_logger(baseLogger, method_name='get_batchjob_info') + tmpLog = self.make_logger(baseLogger, method_name="get_batchjob_info") res = {} tmpLog.info("Collect job info for batchid {}".format(batchid)) info_dict = self.get_moabjob_info(batchid) tmpLog.info("Got: {0}".format(info_dict)) if info_dict: tmpLog.debug("Translate results") - res['status'] = self.translate_status(info_dict['state']) - res['nativeStatus'] = info_dict['state'] - res['nativeExitCode'] = info_dict['exit_code'] - res['nativeExitMsg'] = self.get_message(info_dict['exit_code']) - res['start_time'] = self.fixdate(info_dict['start_time']) - res['finish_time'] = self.fixdate(info_dict['finish_time']) + res["status"] = self.translate_status(info_dict["state"]) + res["nativeStatus"] = info_dict["state"] + res["nativeExitCode"] = info_dict["exit_code"] + res["nativeExitMsg"] = self.get_message(info_dict["exit_code"]) + res["start_time"] = self.fixdate(info_dict["start_time"]) + res["finish_time"] = self.fixdate(info_dict["finish_time"]) tmpLog.info("Collected job info: {0}".format(res)) return res @@ -51,21 +51,12 @@ def get_moabjob_info(self, batchid): Parsing of checkjob output to get job state, exit code, start time, finish time (if available) :return job_info dictonary: """ - tmpLog = self.make_logger(baseLogger, method_name='get_moabjob_info') + tmpLog = self.make_logger(baseLogger, method_name="get_moabjob_info") - job_info = { - 'state': "", - 'exit_code': None, - 'queued_time': None, - 'start_time': None, - 'finish_time': None - } + job_info = {"state": "", "exit_code": None, "queued_time": None, "start_time": None, "finish_time": None} - cmd = 'checkjob -v {0}'.format(batchid) - p = subprocess.Popen(cmd.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + cmd = "checkjob -v {0}".format(batchid) + p = subprocess.Popen(cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode @@ -78,15 +69,15 @@ def get_moabjob_info(self, batchid): if checkjob_str: checkjob_out = checkjob_str.splitlines() for l in 
checkjob_out: - if l.startswith('State: '): - job_info['state'] = l[7:].split()[0] - elif l.startswith('Completion Code: '): - job_info['exit_code'] = int(l[17:].split()[0]) - if 'Time: ' in l: - job_info['finish_time'] = l[l.index('Time: ') + 6:] - elif l.startswith('StartTime: '): - job_info['start_time'] = l[11:] - elif l.startswith('WallTime: '): + if l.startswith("State: "): + job_info["state"] = l[7:].split()[0] + elif l.startswith("Completion Code: "): + job_info["exit_code"] = int(l[17:].split()[0]) + if "Time: " in l: + job_info["finish_time"] = l[l.index("Time: ") + 6 :] + elif l.startswith("StartTime: "): + job_info["start_time"] = l[11:] + elif l.startswith("WallTime: "): tmpLog.info(l) tmpLog.debug("checkjob parsing results: {0}".format(job_info)) return job_info @@ -97,11 +88,11 @@ def translate_status(self, status): :param status: :return: """ - submited = ['deferred', 'hold', 'idle', 'migrated', 'staged'] - running = ['starting', 'running', 'suspended', 'canceling'] - finished = ['completed'] - cancelled = ['removed'] - failed = ['vacated'] + submited = ["deferred", "hold", "idle", "migrated", "staged"] + running = ["starting", "running", "suspended", "canceling"] + finished = ["completed"] + cancelled = ["removed"] + failed = ["vacated"] status = status.lower() if status in submited: return ws.ST_submitted @@ -144,7 +135,7 @@ def get_message(self, exit_code): -10: "Job exceeded a memory limit (stopped by the resource manager)", -11: "Job exceeded a walltime limit", -12: "Job exceeded a CPU time limit", - -13: "Could not create the jobs control groups (cgroups)" + -13: "Could not create the jobs control groups (cgroups)", } if exit_code in codes_messages.keys(): @@ -159,11 +150,11 @@ def fixdate(self, date_str): :param date_str: :return: date (datetime object) """ - tmpLog = self.make_logger(baseLogger, method_name='fixdate') + tmpLog = self.make_logger(baseLogger, method_name="fixdate") if not date_str: return None tmpLog.debug("Date to fix: {0}".format(date_str)) - format_str = '%a %b %d %H:%M:%S %Y' + format_str = "%a %b %d %H:%M:%S %Y" date_str = " ".join([date_str, str(datetime.datetime.now().year)]) date = datetime.datetime.strptime(date_str, format_str) if date > datetime.datetime.now(): @@ -171,7 +162,7 @@ def fixdate(self, date_str): date = datetime.datetime.strptime(date_str, format_str) tmpLog.debug("Full date: {0}".format(str(date))) - utc_offset = datetime.timedelta(0, 18000, 0) # 5H UTC offset for Oak-Ridge + utc_offset = datetime.timedelta(0, 18000, 0) # 5H UTC offset for Oak-Ridge tmpLog.debug("UTC offset: {0}".format(str(utc_offset))) fixed_date = date + utc_offset tmpLog.debug("Fixed date: {0}".format(str(fixed_date))) @@ -181,11 +172,11 @@ def fixdate(self, date_str): def get_resources(self): """ Fucnction to provide number of nodes with walltime limit to worker maker - :return: - nodes: integer + :return: + nodes: integer walltime: intger, seconds """ - tmpLog = self.make_logger(baseLogger, method_name='get_backfill') + tmpLog = self.make_logger(baseLogger, method_name="get_backfill") tmpLog.info("Looking for gap more than '%s' sec" % self.minWalltime) nodes = 0 walltime = self.minWalltime @@ -205,21 +196,17 @@ def get_resources(self): return nodes, walltime def get_backfill(self): - # Function collect information about current available resources and # return number of nodes with possible maximum value for walltime according Titan policy # - tmpLog = self.make_logger(baseLogger, method_name='get_backfill') + tmpLog = self.make_logger(baseLogger, 
method_name="get_backfill") res = {} - cmd = 'showbf --blocking -p %s' % self.partition - p = subprocess.Popen(cmd.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + cmd = "showbf --blocking -p %s" % self.partition + p = subprocess.Popen(cmd.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.info('retCode={0}'.format(retCode)) + tmpLog.info("retCode={0}".format(retCode)) showbf_str = "" if retCode == 0: showbf_str = stdOut @@ -234,11 +221,10 @@ def get_backfill(self): for l in shobf_out[2:]: d = l.split() nodes = int(d[2]) - if not d[3] == 'INFINITY': + if not d[3] == "INFINITY": walltime_arr = d[3].split(":") if len(walltime_arr) < 4: - walltime_sec = int(walltime_arr[0]) * (60 * 60) + int(walltime_arr[1]) * 60 + int( - walltime_arr[2]) + walltime_sec = int(walltime_arr[0]) * (60 * 60) + int(walltime_arr[1]) * 60 + int(walltime_arr[2]) if walltime_sec > 24 * 3600: # in case we will have more than 24H walltime_sec = 24 * 3600 else: diff --git a/pandaharvester/harvestermisc/token_utils.py b/pandaharvester/harvestermisc/token_utils.py index 7ac11ff3..5af120e4 100644 --- a/pandaharvester/harvestermisc/token_utils.py +++ b/pandaharvester/harvestermisc/token_utils.py @@ -16,6 +16,7 @@ def _md5sum(data): hash_hex = hash.hexdigest() return hash_hex + def endpoint_to_filename(endpoint): """ get token file name according to service (CE, storage, etc.) endpoint @@ -26,7 +27,7 @@ def endpoint_to_filename(endpoint): class WLCG_scopes(object): - COMPUTE_ALL = 'compute.read compute.modify compute.create compute.cancel' + COMPUTE_ALL = "compute.read compute.modify compute.create compute.cancel" class IssuerBroker(object): @@ -34,19 +35,19 @@ class IssuerBroker(object): Talk to token issuer with client credentials flow """ - def __init__(self, issuer, client_id, client_secret, name='unknown'): + def __init__(self, issuer, client_id, client_secret, name="unknown"): self.issuer = issuer self.client_id = client_id self.client_secret = client_secret self.name = name self.timeout = 3 # derived attributes - self.token_request_url = '{0}/token'.format(self.issuer) + self.token_request_url = "{0}/token".format(self.issuer) self._base_post_data = { - 'grant_type': 'client_credentials', - 'client_id': self.client_id, - 'client_secret': self.client_secret, - } + "grant_type": "client_credentials", + "client_id": self.client_id, + "client_secret": self.client_secret, + } def _post(self, **kwarg): data_dict = copy.deepcopy(self._base_post_data) @@ -62,7 +63,7 @@ def get_access_token(self, aud=None, scope=None): resp_dict = json.loads(resp.text) except Exception as e: raise - token = resp_dict['access_token'] + token = resp_dict["access_token"] return token else: resp.raise_for_status() diff --git a/pandaharvester/harvestermonitor/act_monitor.py b/pandaharvester/harvestermonitor/act_monitor.py index a20b6431..c06224b2 100644 --- a/pandaharvester/harvestermonitor/act_monitor.py +++ b/pandaharvester/harvestermonitor/act_monitor.py @@ -11,7 +11,7 @@ jsonJobReport = harvester_config.payload_interaction.jobReportFile # logger -baseLogger = core_utils.setup_logger('act_monitor') +baseLogger = core_utils.setup_logger("act_monitor") # monitor for aCT plugin @@ -21,11 +21,11 @@ def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # Set up aCT DB connection - self.log = core_utils.make_logger(baseLogger, 'aCT submitter', method_name='__init__') + self.log = 
core_utils.make_logger(baseLogger, "aCT submitter", method_name="__init__") try: self.actDB = aCTDBPanda(self.log) except Exception as e: - self.log.error('Could not connect to aCT database: {0}'.format(str(e))) + self.log.error("Could not connect to aCT database: {0}".format(str(e))) self.actDB = None # check workers @@ -33,20 +33,19 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") queueconfigmapper = QueueConfigMapper() queueconfig = queueconfigmapper.get_queue(workSpec.computingSite) try: - tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID)) - columns = ['actpandastatus', 'pandastatus', 'computingElement', 'node', 'error'] + tmpLog.debug("Querying aCT for id {0}".format(workSpec.batchID)) + columns = ["actpandastatus", "pandastatus", "computingElement", "node", "error"] actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns) except Exception as e: if self.actDB: tmpLog.error("Failed to query aCT DB: {0}".format(str(e))) # send back current status - retList.append((workSpec.status, '')) + retList.append((workSpec.status, "")) continue if not actjobs: @@ -55,42 +54,42 @@ def check_workers(self, workspec_list): retList.append((WorkSpec.ST_failed, "Job not found in aCT")) continue - actstatus = actjobs[0]['actpandastatus'] + actstatus = actjobs[0]["actpandastatus"] workSpec.nativeStatus = actstatus newStatus = WorkSpec.ST_running - errorMsg = '' - if actstatus in ['waiting', 'sent', 'starting']: + errorMsg = "" + if actstatus in ["waiting", "sent", "starting"]: newStatus = WorkSpec.ST_submitted # Handle post running states if queueconfig.truePilot: # True pilot: keep in running until really done - if actstatus in ['done', 'donecancelled']: + if actstatus in ["done", "donecancelled"]: newStatus = WorkSpec.ST_finished - elif actstatus == 'donefailed': + elif actstatus == "donefailed": # set failed here with workspec sup error - errorMsg = actjobs[0]['error'] or 'Unknown error' - error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') + errorMsg = actjobs[0]["error"] or "Unknown error" + error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") workSpec.set_supplemental_error(error_code=error_code, error_diag=errorMsg) newStatus = WorkSpec.ST_failed - tmpLog.info('ID {0} failed with error {1})'.format(workSpec.batchID, errorMsg)) - elif actstatus in ['done', 'donefailed', 'donecancelled', 'transferring', 'tovalidate']: + tmpLog.info("ID {0} failed with error {1})".format(workSpec.batchID, errorMsg)) + elif actstatus in ["done", "donefailed", "donecancelled", "transferring", "tovalidate"]: # NG mode: all post processing is now done in the stager newStatus = WorkSpec.ST_finished if newStatus != workSpec.status: - tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(workSpec.batchID, workSpec.status, newStatus, actstatus)) + tmpLog.info("ID {0} updated status {1} -> {2} ({3})".format(workSpec.batchID, workSpec.status, newStatus, actstatus)) else: - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(actstatus, newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(actstatus, newStatus)) - if actjobs[0]['computingElement']: - workSpec.computingElement = actjobs[0]['computingElement'] - if actjobs[0]['node']: + if actjobs[0]["computingElement"]: + 
workSpec.computingElement = actjobs[0]["computingElement"] + if actjobs[0]["node"]: try: pandaid = workSpec.get_jobspec_list()[0].PandaID - workSpec.set_work_attributes({pandaid: {'node': actjobs[0]['node']}}) - except: - tmpLog.warning('Could not extract panda ID for worker {0}'.format(workSpec.batchID)) + workSpec.set_work_attributes({pandaid: {"node": actjobs[0]["node"]}}) + except BaseException: + tmpLog.warning("Could not extract panda ID for worker {0}".format(workSpec.batchID)) retList.append((newStatus, errorMsg)) diff --git a/pandaharvester/harvestermonitor/apfgrid_monitor.py b/pandaharvester/harvestermonitor/apfgrid_monitor.py index c032f224..a16b126c 100644 --- a/pandaharvester/harvestermonitor/apfgrid_monitor.py +++ b/pandaharvester/harvestermonitor/apfgrid_monitor.py @@ -29,60 +29,62 @@ def __call__(self, *args, **kwargs): class APFGridMonitor(object): - ''' - 1 WorkSpec.ST_submitted = 'submitted' - 2 WorkSpec.ST_running = 'running' - 4 WorkSpec.ST_finished = 'finished' - 5 WorkSpec.ST_failed = 'failed' - 6 WorkSpec.ST_ready = 'ready' - 3 WorkSpec.ST_cancelled = 'cancelled ' - - CONDOR_JOBSTATUS - 1 Idle I + """ + 1 WorkSpec.ST_submitted = 'submitted' + 2 WorkSpec.ST_running = 'running' + 4 WorkSpec.ST_finished = 'finished' + 5 WorkSpec.ST_failed = 'failed' + 6 WorkSpec.ST_ready = 'ready' + 3 WorkSpec.ST_cancelled = 'cancelled ' + + CONDOR_JOBSTATUS + 1 Idle I 2 Running R 3 Removed X 4 Completed C 5 Held H 6 Submission_err E - ''' + """ + __metaclass__ = APFGridMonitorSingleton STATUS_MAP = { - 1 : WorkSpec.ST_submitted, - 2 : WorkSpec.ST_running, - 3 : WorkSpec.ST_cancelled, - 4 : WorkSpec.ST_finished, - 5 : WorkSpec.ST_failed, - 6 : WorkSpec.ST_ready, - } - - JOBQUERYATTRIBUTES = ['match_apf_queue', - 'jobstatus', - 'workerid', - 'apf_queue', - 'apf_logurl', - 'apf_outurl', - 'apf_errurl', - ] + 1: WorkSpec.ST_submitted, + 2: WorkSpec.ST_running, + 3: WorkSpec.ST_cancelled, + 4: WorkSpec.ST_finished, + 5: WorkSpec.ST_failed, + 6: WorkSpec.ST_ready, + } + + JOBQUERYATTRIBUTES = [ + "match_apf_queue", + "jobstatus", + "workerid", + "apf_queue", + "apf_logurl", + "apf_outurl", + "apf_errurl", + ] def __init__(self, **kwarg): self.log = core_utils.make_logger(baseLogger) self.jobinfo = None - self.historyinfo = None - self.log.debug('APFGridMonitor initialized.') - + self.historyinfo = None + self.log.debug("APFGridMonitor initialized.") + def _updateJobInfo(self): self.log.debug("Getting job info from Condor...") out = condorlib.condor_q(APFGridMonitor.JOBQUERYATTRIBUTES) self.log.debug("Got jobinfo %s" % out) self.jobinfo = out - out = condorlib.condor_history(attributes = APFGridMonitor.JOBQUERYATTRIBUTES, constraints=[]) + out = condorlib.condor_history(attributes=APFGridMonitor.JOBQUERYATTRIBUTES, constraints=[]) self.log.debug("Got history info %s" % out) self.historyinfo = out alljobs = self.jobinfo + self.historyinfo for jobad in alljobs: try: - workerid = jobad['workerid'] - self.allbyworkerid[workerid]= jobad + workerid = jobad["workerid"] + self.allbyworkerid[workerid] = jobad except KeyError: # some non-harvester jobs may not have workerids, ignore them pass @@ -90,7 +92,7 @@ def _updateJobInfo(self): # check workers def check_workers(self, workspec_list): - '''Check status of workers. This method takes a list of WorkSpecs as input argument + """Check status of workers. This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses. Nth element if the return list corresponds to the status of Nth WorkSpec in the given list. 
Worker's status is one of WorkSpec.ST_finished, WorkSpec.ST_failed, WorkSpec.ST_cancelled, WorkSpec.ST_running, @@ -99,28 +101,26 @@ def check_workers(self, workspec_list): :param workspec_list: a list of work specs instances :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses. :rtype: (bool, [string,]) - - ''' + + """ self.jobinfo = [] self.historyinfo = [] self.allbyworkerid = {} self._updateJobInfo() - + retlist = [] for workSpec in workspec_list: - self.log.debug("Worker(workerId=%s queueName=%s computingSite=%s status=%s )" % (workSpec.workerID, - workSpec.queueName, - workSpec.computingSite, - workSpec.status) ) + self.log.debug( + "Worker(workerId=%s queueName=%s computingSite=%s status=%s )" + % (workSpec.workerID, workSpec.queueName, workSpec.computingSite, workSpec.status) + ) try: jobad = self.allbyworkerid[workSpec.workerID] - self.log.debug("Found matching job: ID %s" % jobad['workerid']) - jobstatus = int(jobad['jobstatus']) - retlist.append((APFGridMonitor.STATUS_MAP[jobstatus], '')) - except KeyError: + self.log.debug("Found matching job: ID %s" % jobad["workerid"]) + jobstatus = int(jobad["jobstatus"]) + retlist.append((APFGridMonitor.STATUS_MAP[jobstatus], "")) + except KeyError: self.log.error("No corresponding job for workspec %s" % workSpec) - retlist.append((WorkSpec.ST_cancelled, '')) - self.log.debug('retlist=%s' % retlist) + retlist.append((WorkSpec.ST_cancelled, "")) + self.log.debug("retlist=%s" % retlist) return True, retlist - - \ No newline at end of file diff --git a/pandaharvester/harvestermonitor/cloud_google_monitor.py b/pandaharvester/harvestermonitor/cloud_google_monitor.py index f178cc6b..f2723e10 100644 --- a/pandaharvester/harvestermonitor/cloud_google_monitor.py +++ b/pandaharvester/harvestermonitor/cloud_google_monitor.py @@ -4,7 +4,8 @@ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper from pandaharvester.harvestercloud.googlecloud import compute, ZONE, PROJECT -base_logger = core_utils.setup_logger('google_monitor') +base_logger = core_utils.setup_logger("google_monitor") + class GoogleMonitor(PluginBase): def __init__(self, **kwarg): @@ -13,12 +14,12 @@ def __init__(self, **kwarg): # States taken from: https://cloud.google.com/compute/docs/instances/checking-instance-status self.vm_to_worker_status = { - 'RUNNING': WorkSpec.ST_running, - 'TERMINATED': WorkSpec.ST_running, # the VM is stopped, but has to be fully deleted - 'STOPPING': WorkSpec.ST_finished, - 'PROVISIONING': WorkSpec.ST_submitted, - 'STAGING': WorkSpec.ST_submitted - } + "RUNNING": WorkSpec.ST_running, + "TERMINATED": WorkSpec.ST_running, # the VM is stopped, but has to be fully deleted + "STOPPING": WorkSpec.ST_finished, + "PROVISIONING": WorkSpec.ST_submitted, + "STAGING": WorkSpec.ST_submitted, + } def list_vms(self, zone): """ @@ -30,22 +31,22 @@ def list_vms(self, zone): result = compute.instances().list(project=PROJECT, zone=zone).execute() try: - vm_instances = result['items'] + vm_instances = result["items"] except KeyError: # there are no VMs running return [], {} # make a list with the VM names - vm_names = map(lambda vm_instance: vm_instance['name'], vm_instances) + vm_names = map(lambda vm_instance: vm_instance["name"], vm_instances) # make a dictionary so we can retrieve a VM by its name vm_name_to_status = {} for vm_instance in vm_instances: - vm_name_to_status[vm_instance['name']] = vm_instance['status'] + vm_name_to_status[vm_instance["name"]] = vm_instance["status"] return vm_names, 
vm_name_to_status - except: + except BaseException: return None, None def kill_worker(self, vm_name, zone): @@ -54,11 +55,11 @@ def kill_worker(self, vm_name, zone): """ try: - base_logger.debug('Going to kill VM {0}'.format(vm_name)) + base_logger.debug("Going to kill VM {0}".format(vm_name)) compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute() - base_logger.debug('Killed VM {0}'.format(vm_name)) + base_logger.debug("Killed VM {0}".format(vm_name)) except Exception as e: - base_logger.error('Problems killing the VM: {0}'.format(e)) + base_logger.error("Problems killing the VM: {0}".format(e)) def check_workers(self, workers): """ @@ -71,7 +72,7 @@ def check_workers(self, workers): """ if not workers: - return False, 'Empty workers list received' + return False, "Empty workers list received" # it assumes that all workers belong to the same queue, which is currently the case # we assume all work_specs in the list belong to the same queue @@ -84,37 +85,37 @@ def check_workers(self, workers): # running instances vm_names, vm_name_to_status = self.list_vms(zone) if vm_names is None and vm_name_to_status is None: - error_string = 'Could not list the VMs' + error_string = "Could not list the VMs" base_logger.error(error_string) return False, error_string # extract the list of batch IDs batch_IDs = map(lambda x: str(x.batchID), workers) - base_logger.debug('Batch IDs: {0}'.format(batch_IDs)) + base_logger.debug("Batch IDs: {0}".format(batch_IDs)) ret_list = [] for batch_ID in batch_IDs: - tmp_log = self.make_logger(base_logger, 'batch ID={0}'.format(batch_ID), method_name='check_workers') + tmp_log = self.make_logger(base_logger, "batch ID={0}".format(batch_ID), method_name="check_workers") if batch_ID not in vm_names: new_status = WorkSpec.ST_finished - message = 'VM not found' + message = "VM not found" else: try: new_status = self.vm_to_worker_status[vm_name_to_status[batch_ID]] - message = 'VM status returned by GCE API' + message = "VM status returned by GCE API" # Preemptible VMs: GCE terminates a VM, but a stopped VM with its disk is left and needs to be # explicitly deleted - if vm_name_to_status[batch_ID] == 'TERMINATED': + if vm_name_to_status[batch_ID] == "TERMINATED": self.kill_worker(batch_ID, zone) except KeyError: new_status = WorkSpec.ST_missed - message = 'Unknown status to Harvester: {0}'.format(vm_name_to_status[batch_ID]) + message = "Unknown status to Harvester: {0}".format(vm_name_to_status[batch_ID]) - tmp_log.debug('new_status={0}'.format(new_status)) + tmp_log.debug("new_status={0}".format(new_status)) ret_list.append((new_status, message)) - base_logger.debug('ret_list: {0}'.format(ret_list)) + base_logger.debug("ret_list: {0}".format(ret_list)) return True, ret_list diff --git a/pandaharvester/harvestermonitor/cloud_openstack_monitor.py b/pandaharvester/harvestermonitor/cloud_openstack_monitor.py index 91fd877f..9d160a49 100644 --- a/pandaharvester/harvestermonitor/cloud_openstack_monitor.py +++ b/pandaharvester/harvestermonitor/cloud_openstack_monitor.py @@ -9,39 +9,39 @@ # setup base logger -baseLogger = core_utils.setup_logger('cloud_openstack_monitor') +baseLogger = core_utils.setup_logger("cloud_openstack_monitor") # status map -#FIXME +# FIXME vm_worker_status_map_dict = { - 'ACTIVE': WorkSpec.ST_running, - 'BUILD': WorkSpec.ST_submitted, - 'DELETED': WorkSpec.ST_finished, - 'ERROR': WorkSpec.ST_failed, - 'HARD_REBOOT': WorkSpec.ST_pending, - 'MIGRATING': WorkSpec.ST_pending, - 'PASSWORD': WorkSpec.ST_pending, - 'PAUSED': 
WorkSpec.ST_pending, - 'REBOOT': WorkSpec.ST_pending, - 'REBUILD': WorkSpec.ST_pending, - 'RESCUE': WorkSpec.ST_pending, - 'RESIZE': WorkSpec.ST_pending, - 'REVERT_RESIZE': WorkSpec.ST_pending, - 'SHELVED': WorkSpec.ST_pending, - 'SHELVED_OFFLOADED': WorkSpec.ST_pending, - 'SHUTOFF': WorkSpec.ST_cancelled, - 'SOFT_DELETED': WorkSpec.ST_pending, - 'SUSPENDED': WorkSpec.ST_pending, - 'UNKNOWN': WorkSpec.ST_failed, - 'VERIFY_RESIZE': WorkSpec.ST_pending, + "ACTIVE": WorkSpec.ST_running, + "BUILD": WorkSpec.ST_submitted, + "DELETED": WorkSpec.ST_finished, + "ERROR": WorkSpec.ST_failed, + "HARD_REBOOT": WorkSpec.ST_pending, + "MIGRATING": WorkSpec.ST_pending, + "PASSWORD": WorkSpec.ST_pending, + "PAUSED": WorkSpec.ST_pending, + "REBOOT": WorkSpec.ST_pending, + "REBUILD": WorkSpec.ST_pending, + "RESCUE": WorkSpec.ST_pending, + "RESIZE": WorkSpec.ST_pending, + "REVERT_RESIZE": WorkSpec.ST_pending, + "SHELVED": WorkSpec.ST_pending, + "SHELVED_OFFLOADED": WorkSpec.ST_pending, + "SHUTOFF": WorkSpec.ST_cancelled, + "SOFT_DELETED": WorkSpec.ST_pending, + "SUSPENDED": WorkSpec.ST_pending, + "UNKNOWN": WorkSpec.ST_failed, + "VERIFY_RESIZE": WorkSpec.ST_pending, } # whether to kill the vm def _toKillVM(*some_info): retVal = False - #FIXME + # FIXME # information should come from harvester messenger or else return retVal @@ -54,54 +54,54 @@ def __init__(self, **kwarg): self.nProcesses = 4 self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - # kill a vm + def _kill_a_vm(self, vm_id): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_kill_a_vm') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_kill_a_vm") try: self.vm_client.nova.delete(vm_id) except Exception as _e: - errStr = 'Failed to delete a VM with id={0} ; {1}'.format(vm_id, _e) + errStr = "Failed to delete a VM with id={0} ; {1}".format(vm_id, _e) tmpLog.error(errStr) tmpRetVal = (False, errStr) else: - tmpLog.info('Deleted a VM with id={0}'.format(vm_id)) - tmpRetVal = (True, '') + tmpLog.info("Deleted a VM with id={0}".format(vm_id)) + tmpRetVal = (True, "") return tmpRetVal - # check a vm + def _check_a_vm(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_check_a_vm') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_check_a_vm") - ## initialization + # initialization vm_id = workspec.batchID newStatus = workspec.status - errStr = '' + errStr = "" try: vm_server = self.vm_client.nova.servers.get(vm_id) vm_status = vm_server.status except Exception as _e: - errStr = 'Failed to get VM status of id={0} ; {1}'.format(vm_id, _e) + errStr = "Failed to get VM status of id={0} ; {1}".format(vm_id, _e) tmpLog.error(errStr) - tmpLog.info('Force to cancel the worker due to failure to get VM status') + tmpLog.info("Force to cancel the worker due to failure to get VM status") newStatus = WorkSpec.ST_cancelled else: newStatus = vm_worker_status_map_dict.get(vm_status) - tmpLog.info('batchID={0}: vm_status {1} -> worker_status {2}'.format(workspec.batchID, vm_status, newStatus)) + tmpLog.info("batchID={0}: vm_status {1} -> worker_status {2}".format(workspec.batchID, vm_status, newStatus)) - if _toKillVM(): #FIXME + if _toKillVM(): # FIXME self._kill_a_vm(vm_id) return (newStatus, errStr) - # check workers + def check_workers(self, workspec_list): - ## Check for all workers + # Check for all workers with 
ThreadPoolExecutor(self.nProcesses) as thread_pool: retIterator = thread_pool.map(self._check_a_vm, workspec_list) diff --git a/pandaharvester/harvestermonitor/cobalt_monitor.py b/pandaharvester/harvestermonitor/cobalt_monitor.py index ff57ccdf..38320662 100644 --- a/pandaharvester/harvestermonitor/cobalt_monitor.py +++ b/pandaharvester/harvestermonitor/cobalt_monitor.py @@ -1,7 +1,8 @@ import re + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess import json import os.path @@ -12,7 +13,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('cobalt_monitor') +baseLogger = core_utils.setup_logger("cobalt_monitor") # qstat output @@ -20,8 +21,9 @@ # =================================================== # 77734 fcurtis 06:00:00 64 queued None + # monitor for HTCONDOR batch system -class CobaltMonitor (PluginBase): +class CobaltMonitor(PluginBase): # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) @@ -35,33 +37,28 @@ def check_workers(self, workspec_list): # print "pprint(vars(workSpec))" # pprint(vars(workSpec)) # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") # first command comStr = "qstat {0}".format(workSpec.batchID) # first check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True) + tmpLog.debug("check with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) oldStatus = workSpec.status newStatus = None # first check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode= {0}'.format(retCode)) - tmpLog.debug('stdOut = {0}'.format(stdOut)) - tmpLog.debug('stdErr = {0}'.format(stdErr)) - errStr = '' + tmpLog.debug("retCode= {0}".format(retCode)) + tmpLog.debug("stdOut = {0}".format(stdOut)) + tmpLog.debug("stdErr = {0}".format(stdErr)) + errStr = "" if retCode == 0: # batch job is still running and has a state, output looks like this: # JobID User WallTime Nodes State Location # =================================================== # 124559 hdshin 06:00:00 64 queued None - - lines = stdOut.split('\n') + + lines = stdOut.split("\n") parts = lines[2].split() batchid = parts[0] user = parts[1] @@ -70,82 +67,81 @@ def check_workers(self, workspec_list): state = parts[4] if int(batchid) != int(workSpec.batchID): - errStr += 'qstat returned status for wrong batch id %s != %s' % (batchid,workSpec.batchID) + errStr += "qstat returned status for wrong batch id %s != %s" % (batchid, workSpec.batchID) newStatus = WorkSpec.ST_failed else: - if 'running' in state: - newStatus = WorkSpec.ST_running - elif 'queued' in state: - newStatus = WorkSpec.ST_submitted - elif 'user_hold' in state: - newStatus = WorkSpec.ST_submitted - elif 'starting' in state: - newStatus = WorkSpec.ST_running - elif 'killing' in state: - newStatus = WorkSpec.ST_failed - elif 'exiting' in state: - newStatus = WorkSpec.ST_running - elif 'maxrun_hold' in state: - newStatus = WorkSpec.ST_submitted + if "running" in state: + newStatus = WorkSpec.ST_running + elif "queued" in state: + newStatus = WorkSpec.ST_submitted + elif "user_hold" in state: + newStatus = WorkSpec.ST_submitted + elif "starting" 
in state: + newStatus = WorkSpec.ST_running + elif "killing" in state: + newStatus = WorkSpec.ST_failed + elif "exiting" in state: + newStatus = WorkSpec.ST_running + elif "maxrun_hold" in state: + newStatus = WorkSpec.ST_submitted else: - raise Exception('failed to parse job state "%s" qstat stdout: %s\n stderr: %s' % (state,stdOut,stdErr)) - + raise Exception('failed to parse job state "%s" qstat stdout: %s\n stderr: %s' % (state, stdOut, stdErr)) + retList.append((newStatus, errStr)) elif retCode == 1 and len(stdOut.strip()) == 0 and len(stdErr.strip()) == 0: - tmpLog.debug('job has already exited, checking cobalt log for exit status') + tmpLog.debug("job has already exited, checking cobalt log for exit status") # exit code 1 and stdOut/stdErr has no content means job exited # need to look at cobalt log to determine exit status - cobalt_logfile = os.path.join(workSpec.get_access_point(),'cobalt.log') + cobalt_logfile = os.path.join(workSpec.get_access_point(), "cobalt.log") if os.path.exists(cobalt_logfile): return_code = None job_cancelled = False for line in open(cobalt_logfile): # looking for line like this: # Thu Aug 24 19:01:20 2017 +0000 (UTC) Info: task completed normally with an exit code of 0; initiating job cleanup and removal - if 'task completed normally' in line: - start_index = line.find('exit code of ') + len('exit code of ') - end_index = line.find(';',start_index) + if "task completed normally" in line: + start_index = line.find("exit code of ") + len("exit code of ") + end_index = line.find(";", start_index) str_return_code = line[start_index:end_index] - if 'None' in str_return_code: - return_code = -1 + if "None" in str_return_code: + return_code = -1 else: - return_code = int(str_return_code) + return_code = int(str_return_code) break - elif 'maximum execution time exceeded' in line: - errStr += ' batch job exceeded wall clock time ' - elif 'user delete requested' in line: - errStr += ' job was cancelled ' + elif "maximum execution time exceeded" in line: + errStr += " batch job exceeded wall clock time " + elif "user delete requested" in line: + errStr += " job was cancelled " job_cancelled = True - - + if return_code == 0: - tmpLog.debug('job finished normally') + tmpLog.debug("job finished normally") newStatus = WorkSpec.ST_finished - retList.append((newStatus,errStr)) + retList.append((newStatus, errStr)) elif return_code is None: if job_cancelled: - tmpLog.debug('job was cancelled') - errStr += ' job cancelled ' + tmpLog.debug("job was cancelled") + errStr += " job cancelled " newStatus = WorkSpec.ST_cancelled - retList.append((newStatus,errStr)) + retList.append((newStatus, errStr)) else: - tmpLog.debug('job has no exit code, failing job') - errStr += ' exit code not found in cobalt log file %s ' % cobalt_logfile + tmpLog.debug("job has no exit code, failing job") + errStr += " exit code not found in cobalt log file %s " % cobalt_logfile newStatus = WorkSpec.ST_failed - retList.append((newStatus,errStr)) + retList.append((newStatus, errStr)) else: - tmpLog.debug(' non zero exit code %s from batch job id %s' % (return_code,workSpec.batchID)) - errStr += ' non-zero exit code %s from batch job id %s ' % (return_code,workSpec.batchID) + tmpLog.debug(" non zero exit code %s from batch job id %s" % (return_code, workSpec.batchID)) + errStr += " non-zero exit code %s from batch job id %s " % (return_code, workSpec.batchID) newStatus = WorkSpec.ST_failed - retList.append((newStatus,errStr)) + retList.append((newStatus, errStr)) else: - tmpLog.debug(' cobalt log file 
does not exist') - errStr += ' cobalt log file %s does not exist ' % cobalt_logfile + tmpLog.debug(" cobalt log file does not exist") + errStr += " cobalt log file %s does not exist " % cobalt_logfile newStatus = WorkSpec.ST_failed - retList.append((newStatus,errStr)) - - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(oldStatus, newStatus)) - tmpLog.debug('errStr: %s' % errStr) + retList.append((newStatus, errStr)) + + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(oldStatus, newStatus)) + tmpLog.debug("errStr: %s" % errStr) return True, retList diff --git a/pandaharvester/harvestermonitor/dummy_mcore_monitor.py b/pandaharvester/harvestermonitor/dummy_mcore_monitor.py index 21c292f8..435e18c9 100644 --- a/pandaharvester/harvestermonitor/dummy_mcore_monitor.py +++ b/pandaharvester/harvestermonitor/dummy_mcore_monitor.py @@ -6,25 +6,24 @@ from pandaharvester.harvestercore import core_utils # logger -baseLogger = core_utils.setup_logger('dummy_mcore_monitor') +baseLogger = core_utils.setup_logger("dummy_mcore_monitor") # check a worker def check_a_worker(workspec): # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='check_a_worker') - dummyFilePath = os.path.join(workspec.get_access_point(), 'status.txt') - tmpLog.debug('look for {0}'.format(dummyFilePath)) + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="check_a_worker") + dummyFilePath = os.path.join(workspec.get_access_point(), "status.txt") + tmpLog.debug("look for {0}".format(dummyFilePath)) newStatus = WorkSpec.ST_finished try: with open(dummyFilePath) as dummyFile: newStatus = dummyFile.readline() newStatus = newStatus.strip() - except: + except BaseException: pass - tmpLog.debug('newStatus={0}'.format(newStatus)) - return (newStatus, '') + tmpLog.debug("newStatus={0}".format(newStatus)) + return (newStatus, "") # dummy monitor with multi-cores @@ -36,9 +35,9 @@ def __init__(self, **kwarg): # check workers def check_workers(self, workspec_list): # make logger - tmpLog = self.make_logger(baseLogger, method_name='check_workers') - tmpLog.debug('start nWorkers={0}'.format(len(workspec_list))) + tmpLog = self.make_logger(baseLogger, method_name="check_workers") + tmpLog.debug("start nWorkers={0}".format(len(workspec_list))) with Pool() as pool: retList = pool.map(check_a_worker, workspec_list) - tmpLog.debug('done') + tmpLog.debug("done") return True, retList diff --git a/pandaharvester/harvestermonitor/dummy_monitor.py b/pandaharvester/harvestermonitor/dummy_monitor.py index a5966c5b..56396352 100644 --- a/pandaharvester/harvestermonitor/dummy_monitor.py +++ b/pandaharvester/harvestermonitor/dummy_monitor.py @@ -4,7 +4,7 @@ from pandaharvester.harvestercore import core_utils # logger -baseLogger = core_utils.setup_logger('dummy_monitor') +baseLogger = core_utils.setup_logger("dummy_monitor") # dummy monitor @@ -29,20 +29,19 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') - dummyFilePath = os.path.join(workSpec.get_access_point(), 'status.txt') - tmpLog.debug('look for {0}'.format(dummyFilePath)) + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") + dummyFilePath = os.path.join(workSpec.get_access_point(), "status.txt") + tmpLog.debug("look for {0}".format(dummyFilePath)) newStatus 
= WorkSpec.ST_finished try: with open(dummyFilePath) as dummyFile: newStatus = dummyFile.readline() newStatus = newStatus.strip() - if newStatus == 'finished': + if newStatus == "finished": workSpec.nativeExitCode = 0 - workSpec.nativeStatus = 'done' + workSpec.nativeStatus = "done" except Exception: pass - tmpLog.debug('newStatus={0}'.format(newStatus)) - retList.append((newStatus, 'dialog_message')) + tmpLog.debug("newStatus={0}".format(newStatus)) + retList.append((newStatus, "dialog_message")) return True, retList diff --git a/pandaharvester/harvestermonitor/gitlab_monitor.py b/pandaharvester/harvestermonitor/gitlab_monitor.py index 2f59c3db..60f07d79 100644 --- a/pandaharvester/harvestermonitor/gitlab_monitor.py +++ b/pandaharvester/harvestermonitor/gitlab_monitor.py @@ -7,7 +7,7 @@ from pandaharvester.harvestermisc.gitlab_utils import get_job_params # logger -baseLogger = core_utils.setup_logger('gitlab_monitor') +baseLogger = core_utils.setup_logger("gitlab_monitor") # dummy monitor @@ -22,41 +22,38 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") try: params = get_job_params(workSpec) - url = '{}/{}/pipelines/{}'.format(params['project_api'], params['project_id'], - workSpec.batchID.split()[0]) + url = "{}/{}/pipelines/{}".format(params["project_api"], params["project_id"], workSpec.batchID.split()[0]) try: - tmpLog.debug('check pipeline at {}'.format(url)) - r = requests.get(url, headers={'PRIVATE-TOKEN': params['secrets'][params['access_token']]}, - timeout=self.timeout) + tmpLog.debug("check pipeline at {}".format(url)) + r = requests.get(url, headers={"PRIVATE-TOKEN": params["secrets"][params["access_token"]]}, timeout=self.timeout) response = r.json() - tmpLog.debug('got {}'.format(str(response))) + tmpLog.debug("got {}".format(str(response))) except Exception: err_str = core_utils.dump_error_message(tmpLog) retList.append((WorkSpec.ST_idle, err_str)) continue - newMsg = '' - if 'status' not in response: + newMsg = "" + if "status" not in response: newStatus = WorkSpec.ST_idle - if 'message' in response: - newMsg = response['message'] + if "message" in response: + newMsg = response["message"] else: - newMsg = 'failed to check due to unknown reason' + newMsg = "failed to check due to unknown reason" else: - if response['status'] == 'success': + if response["status"] == "success": newStatus = WorkSpec.ST_finished - elif response['status'] == 'failed': + elif response["status"] == "failed": newStatus = WorkSpec.ST_failed - elif response['status'] == 'created': + elif response["status"] == "created": newStatus = WorkSpec.ST_submitted - elif response['status'] == 'pending': + elif response["status"] == "pending": newStatus = WorkSpec.ST_pending else: newStatus = WorkSpec.ST_running - tmpLog.debug('newStatus={0}'.format(newStatus)) + tmpLog.debug("newStatus={0}".format(newStatus)) retList.append((newStatus, newMsg)) except Exception: err_str = core_utils.dump_error_message(tmpLog) diff --git a/pandaharvester/harvestermonitor/globus_compute_monitor.py b/pandaharvester/harvestermonitor/globus_compute_monitor.py index ac98ef50..17baa5b3 100644 --- a/pandaharvester/harvestermonitor/globus_compute_monitor.py +++ b/pandaharvester/harvestermonitor/globus_compute_monitor.py @@ -19,7 +19,7 @@ # logger -baseLogger = 
core_utils.setup_logger('globus_compute_monitor') +baseLogger = core_utils.setup_logger("globus_compute_monitor") # monitor for globus compute batch system @@ -42,21 +42,21 @@ def get_messenger(self, workSpec): def get_panda_argparser(self): if self.parser is None: - parser = argparse.ArgumentParser(description='PanDA argparser') - parser.add_argument('-j', type=str, required=False, default='', help='j') - parser.add_argument('--sourceURL', type=str, required=False, default='', help='source url') - parser.add_argument('-r', type=str, required=False, default='', help='directory') - parser.add_argument('-l', '--lib', required=False, action='store_true', default=False, help='library') - parser.add_argument('-o', '--output', type=str, required=False, default='', help='output') - parser.add_argument('-p', '--program', type=str, required=False, default='', help='program') - parser.add_argument('-a', '--archive', type=str, required=False, default='', help='source archive file') + parser = argparse.ArgumentParser(description="PanDA argparser") + parser.add_argument("-j", type=str, required=False, default="", help="j") + parser.add_argument("--sourceURL", type=str, required=False, default="", help="source url") + parser.add_argument("-r", type=str, required=False, default="", help="directory") + parser.add_argument("-l", "--lib", required=False, action="store_true", default=False, help="library") + parser.add_argument("-o", "--output", type=str, required=False, default="", help="output") + parser.add_argument("-p", "--program", type=str, required=False, default="", help="program") + parser.add_argument("-a", "--archive", type=str, required=False, default="", help="source archive file") self.parser = parser return self.parser def get_out_file_infos(self, workSpec, jobSpec, logFile, ret, logger): base_dir = os.path.dirname(logFile) - job_pars = jobSpec.jobParams['jobPars'] + job_pars = jobSpec.jobParams["jobPars"] job_arguments = shlex.split(job_pars) parser = self.get_panda_argparser() job_args, _ = parser.parse_known_args(job_arguments) @@ -65,7 +65,7 @@ def get_out_file_infos(self, workSpec, jobSpec, logFile, ret, logger): outFileInfos = [] if output: - scopes = jobSpec.jobParams['scopeOut'].split(',') + scopes = jobSpec.jobParams["scopeOut"].split(",") output = ast.literal_eval(output) keys = list(output.keys()) @@ -74,13 +74,13 @@ def get_out_file_infos(self, workSpec, jobSpec, logFile, ret, logger): scope = scopes[0] pfn = os.path.join(base_dir, lfn) - with open(pfn, 'w') as fp: + with open(pfn, "w") as fp: result = None if ret: - result = ret.get('result', None) + result = ret.get("result", None) fp.write(str(result)) - outFileInfo = {'lfn': lfn, 'path': pfn} + outFileInfo = {"lfn": lfn, "path": pfn} outFileInfos.append(outFileInfo) for key, scope in zip(keys[1:], scopes[1:]): @@ -89,41 +89,42 @@ def get_out_file_infos(self, workSpec, jobSpec, logFile, ret, logger): dest = os.path.join(base_dir, lfn) if os.path.exists(src): os.rename(src, dest) - outFileInfo = {'lfn': lfn, 'path': dest} + outFileInfo = {"lfn": lfn, "path": dest} outFileInfos.append(outFileInfo) return outFileInfos def get_state_data_structure(self, workSpec, jobSpec, ret, error): if ret: - status = ret.get('status', None) + status = ret.get("status", None) else: status = None - state = 'failed' + state = "failed" if status: - if status in ['success']: - state = 'finished' - data = {'jobId': jobSpec.PandaID, - 'state': state, - # 'timestamp': time_stamp(), - 'siteName': workSpec.computingSite, # args.site, - 'node': None, - # 
'attemptNr': None, - 'startTime': None, - 'jobMetrics': None, - 'metaData': None, - 'xml': None, - 'coreCount': 1, - 'cpuConsumptionTime': None, - 'cpuConversionFactor': None, - 'cpuConsumptionUnit': None, - 'cpu_architecture_level': None, - # 'maxRSS', 'maxVMEM', 'maxSWAP', 'maxPSS', 'avgRSS', 'avgVMEM', 'avgSWAP', 'avgPSS' - } + if status in ["success"]: + state = "finished" + data = { + "jobId": jobSpec.PandaID, + "state": state, + # 'timestamp': time_stamp(), + "siteName": workSpec.computingSite, # args.site, + "node": None, + # 'attemptNr': None, + "startTime": None, + "jobMetrics": None, + "metaData": None, + "xml": None, + "coreCount": 1, + "cpuConsumptionTime": None, + "cpuConversionFactor": None, + "cpuConsumptionUnit": None, + "cpu_architecture_level": None, + # 'maxRSS', 'maxVMEM', 'maxSWAP', 'maxPSS', 'avgRSS', 'avgVMEM', 'avgSWAP', 'avgPSS' + } return data def set_work_attributes(self, workSpec, logFile, work_rets, logger): - rets = work_rets.get('ret', {}) - error = work_rets.get('err', None) + rets = work_rets.get("ret", {}) + error = work_rets.get("err", None) messenger = self.get_messenger(workSpec) jsonAttrsFileName = harvester_config.payload_interaction.workerAttributesFile @@ -131,11 +132,7 @@ def set_work_attributes(self, workSpec, logFile, work_rets, logger): jsonJobReport = harvester_config.payload_interaction.jobReportFile jsonOutputsFileName = harvester_config.payload_interaction.eventStatusDumpJsonFile - jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, - None, - with_file=True, - only_running=False, - slim=False) + jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, with_file=True, only_running=False, slim=False) jobSpec_map = {} for jobSpec in jobSpecs: jobSpec_map[jobSpec.PandaID] = jobSpec @@ -145,7 +142,7 @@ def set_work_attributes(self, workSpec, logFile, work_rets, logger): ret = rets.get(pandaID, None) logger.debug("pandaID %s ret: %s" % (pandaID, str(ret))) if ret: - ret = ret.get('ret', {}) + ret = ret.get("ret", {}) attrs = self.get_state_data_structure(workSpec, jobSpec, ret, error) accessPoint = messenger.get_access_point(workSpec, pandaID) @@ -154,29 +151,28 @@ def set_work_attributes(self, workSpec, logFile, work_rets, logger): # outputs jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName) - logger.debug('set attributes file {0}'.format(jsonFilePath)) - logger.debug('jobSpec: %s' % str(jobSpec)) + logger.debug("set attributes file {0}".format(jsonFilePath)) + logger.debug("jobSpec: %s" % str(jobSpec)) # logger.debug('jobSpec jobParams: %s' % str(jobSpec.jobParams)) outFile_infos = self.get_out_file_infos(workSpec, jobSpec, logFile, ret, logger) logger.debug("outFile_infos: %s" % str(outFile_infos)) out_files = {str(pandaID): []} for outFile_info in outFile_infos: - out_files[str(pandaID)].append({'path': outFile_info['path'], - 'type': 'output'}) - with open(jsonFilePath, 'w') as jsonFile: + out_files[str(pandaID)].append({"path": outFile_info["path"], "type": "output"}) + with open(jsonFilePath, "w") as jsonFile: json.dump(out_files, jsonFile) # work attr jsonFilePath = os.path.join(accessPoint, jsonAttrsFileName) - logger.debug('set attributes file {0}'.format(jsonFilePath)) - with open(jsonFilePath, 'w') as jsonFile: + logger.debug("set attributes file {0}".format(jsonFilePath)) + with open(jsonFilePath, "w") as jsonFile: json.dump(attrs, jsonFile) # job report jsonFilePath = os.path.join(accessPoint, jsonJobReport) - logger.debug('set attributes file {0}'.format(jsonFilePath)) - with 
open(jsonFilePath, 'w') as jsonFile: + logger.debug("set attributes file {0}".format(jsonFilePath)) + with open(jsonFilePath, "w") as jsonFile: json.dump(attrs, jsonFile) # post process @@ -193,14 +189,12 @@ def check_workers(self, workspec_list): if self.gc_client is None: self.gc_client = Client() except Exception as ex: - tmpLog = self.make_logger(baseLogger, "init_gc_client", - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "init_gc_client", method_name="check_workers") tmpLog.error("Failed to init gc client: %s" % str(ex)) for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") errStr, errLogStr, outLogStr = None, None, None work_rets = {} @@ -211,7 +205,7 @@ def check_workers(self, workspec_list): errLogStr = errStr newStatus = WorkSpec.ST_failed tmpRetVal = (newStatus, errStr) - work_rets['err'] = errStr + work_rets["err"] = errStr else: try: # jobSpecs = workSpec.get_jobspec_list() @@ -239,7 +233,7 @@ def check_workers(self, workspec_list): tmpLog.info("worker terminated: %s" % ex) tmpLog.debug(traceback.format_exc()) errLogStr = errStr + "\n" + str(traceback.format_exc()) - work_rets['err'] = errStr + work_rets["err"] = errStr else: newStatus = None all_finished = True @@ -249,13 +243,13 @@ def check_workers(self, workspec_list): all_finished = False newStatus = WorkSpec.ST_running break - if rets[batch_id].get("pending", True) or rets[batch_id].get("status", None) in ['waiting-for-launch', 'running']: + if rets[batch_id].get("pending", True) or rets[batch_id].get("status", None) in ["waiting-for-launch", "running"]: newStatus = WorkSpec.ST_running all_finished = False break else: batch_status = rets[batch_id].get("status", None) - if batch_status and batch_status != 'success': + if batch_status and batch_status != "success": all_finished = False if newStatus is None: @@ -269,17 +263,17 @@ def check_workers(self, workspec_list): if newStatus in [WorkSpec.ST_finished, WorkSpec.ST_failed]: new_rets = {} for panda_id, batch_id in zip(panda_ids, list(rets.keys())): - new_rets[panda_id] = {'funcx_id': batch_id, 'ret': rets[batch_id]} + new_rets[panda_id] = {"funcx_id": batch_id, "ret": rets[batch_id]} outLogStr = str(new_rets) - work_rets['ret'] = new_rets + work_rets["ret"] = new_rets except Exception as ex: newStatus = WorkSpec.ST_failed errStr = "Failed to parse worker result: %s" % ex tmpLog.error(errStr) tmpLog.debug(traceback.format_exc()) errLogStr = errStr + "\n" + str(traceback.format_exc()) - work_rets['err'] = errStr + work_rets["err"] = errStr tmpRetVal = (newStatus, errStr) except Exception as ex: @@ -287,7 +281,7 @@ def check_workers(self, workspec_list): errStr = str(ex) tmpLog.error(errStr) tmpLog.debug(traceback.format_exc()) - work_rets['err'] = errStr + work_rets["err"] = errStr newStatus = WorkSpec.ST_failed tmpRetVal = (newStatus, errStr) @@ -299,9 +293,9 @@ def check_workers(self, workspec_list): stdOut = os.path.join(baseDir, stdOut) stdErr = os.path.join(baseDir, stdErr) tmpLog.info("stdout: %s, stderr: %s" % (stdOut, stdErr)) - with open(stdOut, 'w') as fp: + with open(stdOut, "w") as fp: fp.write(str(outLogStr)) - with open(stdErr, 'w') as fp: + with open(stdErr, "w") as fp: fp.write(str(errLogStr)) try: diff --git a/pandaharvester/harvestermonitor/htcondor_monitor.py b/pandaharvester/harvestermonitor/htcondor_monitor.py index 
279767a4..a860d05b 100644 --- a/pandaharvester/harvestermonitor/htcondor_monitor.py +++ b/pandaharvester/harvestermonitor/htcondor_monitor.py @@ -16,26 +16,26 @@ # logger -baseLogger = core_utils.setup_logger('htcondor_monitor') +baseLogger = core_utils.setup_logger("htcondor_monitor") # Native HTCondor job status map CONDOR_JOB_STATUS_MAP = { - '1': 'idle', - '2': 'running', - '3': 'removed', - '4': 'completed', - '5': 'held', - '6': 'transferring_output', - '7': 'suspended', - } + "1": "idle", + "2": "running", + "3": "removed", + "4": "completed", + "5": "held", + "6": "transferring_output", + "7": "suspended", +} # Condor jobs held with these reasons should be killed TO_KILL_HOLD_REASONS = [ - 'Job not found', - 'Failed to start GAHP', - ] + "Job not found", + "Failed to start GAHP", +] # pilot error object @@ -45,109 +45,117 @@ # Check one worker def _check_one_worker(workspec, job_ads_all_dict, cancel_unknown=False, held_timeout=3600): # Make logger for one single worker - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_check_one_worker') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_check_one_worker") # Initialize newStatus newStatus = workspec.status - errStr = '' + errStr = "" try: job_ads_dict = job_ads_all_dict[condor_job_id_from_workspec(workspec)] except KeyError: got_job_ads = False except Exception as e: got_job_ads = False - tmpLog.error('With error {0}'.format(e)) + tmpLog.error("With error {0}".format(e)) else: got_job_ads = True # Parse job ads if got_job_ads: # Check JobStatus try: - batchStatus = str(job_ads_dict['JobStatus']) + batchStatus = str(job_ads_dict["JobStatus"]) except KeyError: # Propagate native condor job status as unknown - workspec.nativeStatus = 'unknown' + workspec.nativeStatus = "unknown" if cancel_unknown: newStatus = WorkSpec.ST_cancelled - errStr = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Regard the worker as canceled'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get JobStatus of job submissionHost={0} batchID={1}. Regard the worker as canceled".format( + workspec.submissionHost, workspec.batchID + ) tmpLog.error(errStr) else: newStatus = None - errStr = 'cannot get JobStatus of job submissionHost={0} batchID={1}. Skipped'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get JobStatus of job submissionHost={0} batchID={1}. 
Skipped".format(workspec.submissionHost, workspec.batchID) tmpLog.warning(errStr) else: # Try to get LastJobStatus - lastBatchStatus = str(job_ads_dict.get('LastJobStatus', '')) + lastBatchStatus = str(job_ads_dict.get("LastJobStatus", "")) # Set batchStatus if lastBatchStatus is terminated status - if (lastBatchStatus in ['3', '4'] and batchStatus not in ['3', '4']) \ - or (lastBatchStatus in ['4'] and batchStatus in ['3']): + if (lastBatchStatus in ["3", "4"] and batchStatus not in ["3", "4"]) or (lastBatchStatus in ["4"] and batchStatus in ["3"]): batchStatus = lastBatchStatus - tmpLog.warning('refer to LastJobStatus={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (Jobstatus={3})'.format( - lastBatchStatus, workspec.submissionHost, workspec.batchID, str(job_ads_dict['JobStatus']))) + tmpLog.warning( + "refer to LastJobStatus={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (Jobstatus={3})".format( + lastBatchStatus, workspec.submissionHost, workspec.batchID, str(job_ads_dict["JobStatus"]) + ) + ) # Propagate native condor job status - workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get(batchStatus, 'unexpected') - if batchStatus in ['2', '6']: + workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get(batchStatus, "unexpected") + if batchStatus in ["2", "6"]: # 2 running, 6 transferring output newStatus = WorkSpec.ST_running - elif batchStatus in ['1', '7']: + elif batchStatus in ["1", "7"]: # 1 idle, 7 suspended - if job_ads_dict.get('JobStartDate'): + if job_ads_dict.get("JobStartDate"): newStatus = WorkSpec.ST_idle else: newStatus = WorkSpec.ST_submitted - elif batchStatus in ['3']: + elif batchStatus in ["3"]: # 3 removed if not errStr: - errStr = 'Condor HoldReason: {0} ; Condor RemoveReason: {1} '.format( - job_ads_dict.get('LastHoldReason'), job_ads_dict.get('RemoveReason')) + errStr = "Condor HoldReason: {0} ; Condor RemoveReason: {1} ".format(job_ads_dict.get("LastHoldReason"), job_ads_dict.get("RemoveReason")) newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['5']: + elif batchStatus in ["5"]: # 5 held - hold_reason = job_ads_dict.get('HoldReason') - errStr = 'Condor HoldReason: {0} '.format(hold_reason) - if ( - hold_reason in TO_KILL_HOLD_REASONS - or int(time.time()) - int(job_ads_dict.get('EnteredCurrentStatus', 0)) > held_timeout - ): + hold_reason = job_ads_dict.get("HoldReason") + errStr = "Condor HoldReason: {0} ".format(hold_reason) + if hold_reason in TO_KILL_HOLD_REASONS or int(time.time()) - int(job_ads_dict.get("EnteredCurrentStatus", 0)) > held_timeout: # Kill the job if held too long or other reasons if hold_reason in TO_KILL_HOLD_REASONS: - tmpLog.debug('trying to kill job submissionHost={0} batchID={1} due to HoldReason: {2}'.format(workspec.submissionHost, workspec.batchID, hold_reason)) + tmpLog.debug( + "trying to kill job submissionHost={0} batchID={1} due to HoldReason: {2}".format( + workspec.submissionHost, workspec.batchID, hold_reason + ) + ) else: - tmpLog.debug('trying to kill job submissionHost={0} batchID={1} due to held too long'.format(workspec.submissionHost, workspec.batchID)) + tmpLog.debug("trying to kill job submissionHost={0} batchID={1} due to held too long".format(workspec.submissionHost, workspec.batchID)) for submissionHost, batchIDs_list in six.iteritems(get_host_batchid_map([workspec])): condor_job_manage = CondorJobManage(id=workspec.submissionHost) try: ret_map = condor_job_manage.remove(batchIDs_list) except Exception as e: ret_map = {} - ret_err_str = 'failed to 
kill job. Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "failed to kill job. Exception {0}: {1}".format(e.__class__.__name__, e) tmpLog.error(ret_err_str) else: ret = ret_map.get(condor_job_id_from_workspec(workspec)) if ret and ret[0]: - tmpLog.info('killed held job submissionHost={0} batchID={1}'.format(workspec.submissionHost, workspec.batchID)) + tmpLog.info("killed held job submissionHost={0} batchID={1}".format(workspec.submissionHost, workspec.batchID)) else: - tmpLog.error('cannot kill held job submissionHost={0} batchID={1}'.format(workspec.submissionHost, workspec.batchID)) + tmpLog.error("cannot kill held job submissionHost={0} batchID={1}".format(workspec.submissionHost, workspec.batchID)) newStatus = WorkSpec.ST_cancelled - errStr += ' ; Worker canceled by harvester due to held too long or not found' + errStr += " ; Worker canceled by harvester due to held too long or not found" # Mark the PanDA job as closed instead of failed workspec.set_pilot_closed() - tmpLog.debug('Called workspec set_pilot_closed') + tmpLog.debug("Called workspec set_pilot_closed") else: - if job_ads_dict.get('JobStartDate'): + if job_ads_dict.get("JobStartDate"): newStatus = WorkSpec.ST_idle else: newStatus = WorkSpec.ST_submitted - elif batchStatus in ['4']: + elif batchStatus in ["4"]: # 4 completed try: - payloadExitCode_str = str(job_ads_dict['ExitCode']) + payloadExitCode_str = str(job_ads_dict["ExitCode"]) payloadExitCode = int(payloadExitCode_str) except KeyError: - errStr = 'cannot get ExitCode of job submissionHost={0} batchID={1}. Regard the worker as failed'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get ExitCode of job submissionHost={0} batchID={1}. Regard the worker as failed".format( + workspec.submissionHost, workspec.batchID + ) tmpLog.warning(errStr) newStatus = WorkSpec.ST_failed except ValueError: - errStr = 'got invalid ExitCode {0} of job submissionHost={1} batchID={2}. Regard the worker as failed'.format(payloadExitCode_str, workspec.submissionHost, workspec.batchID) + errStr = "got invalid ExitCode {0} of job submissionHost={1} batchID={2}. Regard the worker as failed".format( + payloadExitCode_str, workspec.submissionHost, workspec.batchID + ) tmpLog.warning(errStr) newStatus = WorkSpec.ST_failed else: @@ -159,38 +167,41 @@ def _check_one_worker(workspec, job_ads_all_dict, cancel_unknown=False, held_tim else: # Other return codes are considered failed newStatus = WorkSpec.ST_failed - errStr = 'Payload execution error: returned non-zero {0}'.format(payloadExitCode) + errStr = "Payload execution error: returned non-zero {0}".format(payloadExitCode) tmpLog.debug(errStr) # Map return code to Pilot error code reduced_exit_code = payloadExitCode // 256 if (payloadExitCode % 256 == 0) else payloadExitCode pilot_error_code, pilot_error_diag = PILOT_ERRORS.convertToPilotErrors(reduced_exit_code) if pilot_error_code is not None: workspec.set_pilot_error(pilot_error_code, pilot_error_diag) - tmpLog.info('Payload return code = {0}'.format(payloadExitCode)) + tmpLog.info("Payload return code = {0}".format(payloadExitCode)) else: - errStr = 'cannot get reasonable JobStatus of job submissionHost={0} batchID={1}. Regard the worker as failed by default'.format( - workspec.submissionHost, workspec.batchID) + errStr = "cannot get reasonable JobStatus of job submissionHost={0} batchID={1}. 
Regard the worker as failed by default".format( + workspec.submissionHost, workspec.batchID + ) tmpLog.error(errStr) newStatus = WorkSpec.ST_failed - tmpLog.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format( - workspec.submissionHost, workspec.batchID, batchStatus, newStatus)) + tmpLog.info( + "submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}".format(workspec.submissionHost, workspec.batchID, batchStatus, newStatus) + ) else: # Propagate native condor job status as unknown - workspec.nativeStatus = 'unknown' + workspec.nativeStatus = "unknown" if cancel_unknown: - errStr = 'condor job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default'.format( - workspec.submissionHost, workspec.batchID) + errStr = "condor job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default".format( + workspec.submissionHost, workspec.batchID + ) tmpLog.error(errStr) newStatus = WorkSpec.ST_cancelled - tmpLog.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format( - workspec.submissionHost, workspec.batchID, '3', newStatus)) + tmpLog.info( + "submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}".format(workspec.submissionHost, workspec.batchID, "3", newStatus) + ) else: - errStr = 'condor job submissionHost={0} batchID={1} not found. Skipped'.format( - workspec.submissionHost, workspec.batchID) + errStr = "condor job submissionHost={0} batchID={1} not found. Skipped".format(workspec.submissionHost, workspec.batchID) tmpLog.warning(errStr) newStatus = None # Set supplemental error message - error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if errStr else WorkerErrors.error_codes.get('SUCCEEDED') + error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") if errStr else WorkerErrors.error_codes.get("SUCCEEDED") workspec.set_supplemental_error(error_code=error_code, error_diag=errStr) # Return return (newStatus, errStr) @@ -239,84 +250,79 @@ def __init__(self, **kwarg): # check workers def check_workers(self, workspec_list): # Make logger for batch job query - tmpLog = self.make_logger(baseLogger, '{0}'.format('batch job query'), - method_name='check_workers') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "{0}".format("batch job query"), method_name="check_workers") + tmpLog.debug("start") # Loop over submissionHost job_ads_all_dict = {} for submissionHost, batchIDs_list in six.iteritems(get_host_batchid_map(workspec_list)): # Record batch job query result to this dict, with key = batchID try: - job_query = CondorJobQuery( cacheEnable=self.cacheEnable, - cacheRefreshInterval=self.cacheRefreshInterval, - useCondorHistory=self.useCondorHistory, - id=submissionHost) + job_query = CondorJobQuery( + cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, useCondorHistory=self.useCondorHistory, id=submissionHost + ) host_job_ads_dict = job_query.get_all(batchIDs_list=batchIDs_list) except Exception as e: host_job_ads_dict = {} - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmpLog.error(ret_err_str) job_ads_all_dict.update(host_job_ads_dict) # Check for all workers with Pool(self.nProcesses) as _pool: - retIterator = _pool.map(lambda _x: _check_one_worker( - _x, job_ads_all_dict, - cancel_unknown=self.cancelUnknown, - held_timeout=self.heldTimeout), - workspec_list) + retIterator = _pool.map( + lambda _x: _check_one_worker(_x, 
job_ads_all_dict, cancel_unknown=self.cancelUnknown, held_timeout=self.heldTimeout), workspec_list + ) retList = list(retIterator) - tmpLog.debug('done') + tmpLog.debug("done") return True, retList # report updated workers info to monitor to check def report_updated_workers(self, time_window): - ## Make logger for batch job query - tmpLog = self.make_logger(baseLogger, method_name='report_updated_workers') - tmpLog.debug('start') - ## Get now timestamp + # Make logger for batch job query + tmpLog = self.make_logger(baseLogger, method_name="report_updated_workers") + tmpLog.debug("start") + # Get now timestamp timeNow = time.time() - ## Set of submission hosts + # Set of submission hosts submission_host_set = set() for submissionHost in self.submissionHost_list: submission_host_set.add(submissionHost) for condorHostConfig in self.condorHostConfig_list: try: - with open(condorHostConfig, 'r') as f: + with open(condorHostConfig, "r") as f: condor_host_config_map = json.load(f) for _schedd, _cm in condor_host_config_map.items(): - _pool = _cm['pool'] - submissionHost = '{0},{1}'.format(_schedd, _pool) + _pool = _cm["pool"] + submissionHost = "{0},{1}".format(_schedd, _pool) submission_host_set.add(submissionHost) except Exception as e: - err_str = 'failed to parse condorHostConfig {0}; {1}: {2}'.format(condorHostConfig, e.__class__.__name__, e) + err_str = "failed to parse condorHostConfig {0}; {1}: {2}".format(condorHostConfig, e.__class__.__name__, e) tmpLog.error(err_str) continue - ## Loop over submissionHost and get all jobs + # Loop over submissionHost and get all jobs job_ads_all_dict = {} for submissionHost in submission_host_set: try: - job_query = CondorJobQuery( cacheEnable=self.cacheEnable, - cacheRefreshInterval=self.cacheRefreshInterval, - useCondorHistory=self.useCondorHistory, - id=submissionHost) + job_query = CondorJobQuery( + cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, useCondorHistory=self.useCondorHistory, id=submissionHost + ) job_ads_all_dict.update(job_query.get_all(allJobs=True)) - tmpLog.debug('got information of condor jobs on {0}'.format(submissionHost)) + tmpLog.debug("got information of condor jobs on {0}".format(submissionHost)) except Exception as e: - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmpLog.error(ret_err_str) - ## Choose workers updated within a time window + # Choose workers updated within a time window workers_to_check_list = [] for condor_job_id, job_ads in six.iteritems(job_ads_all_dict): - ## put in worker cache fifo, with lock mechanism - job_EnteredCurrentStatus = job_ads.get('EnteredCurrentStatus') + # put in worker cache fifo, with lock mechanism + job_EnteredCurrentStatus = job_ads.get("EnteredCurrentStatus") if not (job_EnteredCurrentStatus > timeNow - time_window): continue - workerid = job_ads.get('harvesterWorkerID') + workerid = job_ads.get("harvesterWorkerID") if workerid is None: continue else: workerid = int(workerid) workers_to_check_list.append((workerid, job_EnteredCurrentStatus)) - tmpLog.debug('got {0} workers'.format(len(workers_to_check_list))) - tmpLog.debug('done') + tmpLog.debug("got {0} workers".format(len(workers_to_check_list))) + tmpLog.debug("done") return workers_to_check_list diff --git a/pandaharvester/harvestermonitor/k8s_monitor.py b/pandaharvester/harvestermonitor/k8s_monitor.py index d65a4c41..c2a70b64 100644 --- a/pandaharvester/harvestermonitor/k8s_monitor.py +++ 
b/pandaharvester/harvestermonitor/k8s_monitor.py @@ -11,11 +11,13 @@ from pandaharvester.harvestermisc.info_utils_k8s import PandaQueuesDictK8s # logger -base_logger = core_utils.setup_logger('k8s_monitor') +base_logger = core_utils.setup_logger("k8s_monitor") -BAD_CONTAINER_STATES = ['CreateContainerError', 'CrashLoopBackOff', "FailedMount"] +BAD_CONTAINER_STATES = ["CreateContainerError", "CrashLoopBackOff", "FailedMount"] # monitor for K8S + + class K8sMonitor(PluginBase): # constructor def __init__(self, **kwarg): @@ -46,70 +48,69 @@ def __init__(self, **kwarg): self._all_workers_dict = [] def check_pods_status(self, pods_status_list, containers_state_list, pod_status_message_list): - sub_msg = '' + sub_msg = "" - if 'Unknown' in pods_status_list: - if all(item == 'Unknown' for item in pods_status_list): + if "Unknown" in pods_status_list: + if all(item == "Unknown" for item in pods_status_list): new_status = None - elif 'Running' in pods_status_list: + elif "Running" in pods_status_list: new_status = WorkSpec.ST_running else: new_status = WorkSpec.ST_idle else: # Pod in Pending status - if all(item == 'Pending' for item in pods_status_list): + if all(item == "Pending" for item in pods_status_list): new_status = WorkSpec.ST_submitted # default is submitted, but consider certain cases for item in containers_state_list: if item.waiting and item.waiting.reason in BAD_CONTAINER_STATES: new_status = WorkSpec.ST_failed # change state to failed # Pod in Succeeded status - elif 'Succeeded' in pods_status_list: - if all((item.terminated is not None and item.terminated.reason == 'Completed') for item in containers_state_list): + elif "Succeeded" in pods_status_list: + if all((item.terminated is not None and item.terminated.reason == "Completed") for item in containers_state_list): new_status = WorkSpec.ST_finished else: sub_mesg_list = [] for item in containers_state_list: - msg_str = '' + msg_str = "" if item.terminated is None: - state = 'UNKNOWN' + state = "UNKNOWN" if item.running is not None: - state = 'running' + state = "running" elif item.waiting is not None: - state = 'waiting' - msg_str = 'container not terminated yet ({0}) while pod Succeeded'.format(state) - elif item.terminated.reason != 'Completed': - msg_str = 'container terminated by k8s for reason {0}'.format(item.terminated.reason) + state = "waiting" + msg_str = "container not terminated yet ({0}) while pod Succeeded".format(state) + elif item.terminated.reason != "Completed": + msg_str = "container terminated by k8s for reason {0}".format(item.terminated.reason) sub_mesg_list.append(msg_str) - sub_msg = ';'.join(sub_mesg_list) + sub_msg = ";".join(sub_mesg_list) new_status = WorkSpec.ST_cancelled # Pod in Running status - elif 'Running' in pods_status_list: + elif "Running" in pods_status_list: new_status = WorkSpec.ST_running # Pod in Failed status - elif 'Failed' in pods_status_list: + elif "Failed" in pods_status_list: new_status = WorkSpec.ST_failed try: - sub_msg = ';'.join(pod_status_message_list) - except: - sub_msg = '' + sub_msg = ";".join(pod_status_message_list) + except BaseException: + sub_msg = "" else: new_status = WorkSpec.ST_idle return new_status, sub_msg - def check_job_status(self, job_status, job_status_reason, job_status_message, - n_pods_succeeded, n_pods_failed): + def check_job_status(self, job_status, job_status_reason, job_status_message, n_pods_succeeded, n_pods_failed): new_status = None - sub_msg = '' + sub_msg = "" - if n_pods_succeeded or job_status == 'Complete': + if n_pods_succeeded or 
job_status == "Complete": new_status = WorkSpec.ST_finished - sub_msg = '' - elif n_pods_failed or job_status == 'Failed': + sub_msg = "" + elif n_pods_failed or job_status == "Failed": new_status = WorkSpec.ST_failed sub_msg = job_status_message + job_status_reason # in principle the job information should only apply to final states, but consider other states in the future @@ -117,20 +118,20 @@ def check_job_status(self, job_status, job_status_reason, job_status_message, def check_a_worker(self, workspec): # set logger - tmp_log = self.make_logger(base_logger, 'queueName={0} workerID={1} batchID={2}'. - format(self.queueName, workspec.workerID, workspec.batchID), - method_name='check_a_worker') + tmp_log = self.make_logger( + base_logger, "queueName={0} workerID={1} batchID={2}".format(self.queueName, workspec.workerID, workspec.batchID), method_name="check_a_worker" + ) # initialization job_id = workspec.batchID - err_str = '' + err_str = "" time_now = datetime.datetime.utcnow() pods_status_list = [] pods_status_message_list = [] pods_name_to_delete_list = [] - job_status = '' - job_status_reason = '' - job_status_message = '' + job_status = "" + job_status_reason = "" + job_status_message = "" n_pods_succeeded = 0 n_pods_failed = 0 try: @@ -140,91 +141,92 @@ def check_a_worker(self, workspec): worker_info = self._all_workers_dict[job_id] # make list of status of the pods belonging to our job - if 'pod_status' in worker_info and 'containers_state' in worker_info and 'pod_name' in worker_info: - pods_status_list.append(worker_info['pod_status']) - pods_status_message_list.append(worker_info['pod_status_message']) - containers_state_list.extend(worker_info['containers_state']) - pods_sup_diag_list.append(worker_info['pod_name']) + if "pod_status" in worker_info and "containers_state" in worker_info and "pod_name" in worker_info: + pods_status_list.append(worker_info["pod_status"]) + pods_status_message_list.append(worker_info["pod_status_message"]) + containers_state_list.extend(worker_info["containers_state"]) + pods_sup_diag_list.append(worker_info["pod_name"]) # get backup info about the job - if 'job_status' in worker_info and 'job_status_reason' in worker_info and 'job_status_message' in worker_info: - job_status = worker_info['job_status'] - job_status_reason = worker_info['job_status_reason'] - job_status_message = worker_info['job_status_message'] - n_pods_succeeded = worker_info['n_pods_succeeded'] - n_pods_failed = worker_info['n_pods_failed'] + if "job_status" in worker_info and "job_status_reason" in worker_info and "job_status_message" in worker_info: + job_status = worker_info["job_status"] + job_status_reason = worker_info["job_status_reason"] + job_status_message = worker_info["job_status_message"] + n_pods_succeeded = worker_info["n_pods_succeeded"] + n_pods_failed = worker_info["n_pods_failed"] # make a list of pods that should be removed # 1. pods being queued too long - if 'pod_status' in worker_info and worker_info['pod_status'] in ['Pending', 'Unknown'] \ - and worker_info['pod_start_time'] \ - and time_now - worker_info['pod_start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): - pods_name_to_delete_list.append(worker_info['pod_name']) + if ( + "pod_status" in worker_info + and worker_info["pod_status"] in ["Pending", "Unknown"] + and worker_info["pod_start_time"] + and time_now - worker_info["pod_start_time"] > datetime.timedelta(seconds=self.podQueueTimeLimit) + ): + pods_name_to_delete_list.append(worker_info["pod_name"]) # 2. 
pods with containers in bad states - if 'pod_status' in worker_info and worker_info['pod_status'] in ['Pending', 'Unknown']: - for item in worker_info['containers_state']: + if "pod_status" in worker_info and worker_info["pod_status"] in ["Pending", "Unknown"]: + for item in worker_info["containers_state"]: if item.waiting and item.waiting.reason in BAD_CONTAINER_STATES: - pods_name_to_delete_list.append(worker_info['pod_name']) + pods_name_to_delete_list.append(worker_info["pod_name"]) except Exception as _e: - err_str = 'Failed to get status for id={0} ; {1}'.format(job_id, traceback.format_exc()) + err_str = "Failed to get status for id={0} ; {1}".format(job_id, traceback.format_exc()) tmp_log.error(err_str) new_status = None else: # we didn't find neither the pod nor the job for the worker if not pods_status_list and not job_status: # there were no pods found belonging to our job - err_str = 'JOB id={0} not found'.format(job_id) + err_str = "JOB id={0} not found".format(job_id) tmp_log.error(err_str) - tmp_log.info('Force to cancel the worker due to JOB not found') + tmp_log.info("Force to cancel the worker due to JOB not found") new_status = WorkSpec.ST_cancelled # we found a pod for the worker, it has precedence over the job information elif pods_status_list: # we found pods belonging to our job. Obtain the final status - tmp_log.debug('pods_status_list={0}'.format(pods_status_list)) + tmp_log.debug("pods_status_list={0}".format(pods_status_list)) new_status, sub_msg = self.check_pods_status(pods_status_list, containers_state_list, pods_status_message_list) if sub_msg: err_str += sub_msg - tmp_log.debug('new_status={0}'.format(new_status)) + tmp_log.debug("new_status={0}".format(new_status)) # we didn't find the pod, but there was still a job for the worker else: - new_status, sub_msg = self.check_job_status(job_status, job_status_reason, job_status_message, - n_pods_succeeded, n_pods_failed) + new_status, sub_msg = self.check_job_status(job_status, job_status_reason, job_status_message, n_pods_succeeded, n_pods_failed) if sub_msg: err_str += sub_msg - tmp_log.debug('new_status={0}'.format(new_status)) + tmp_log.debug("new_status={0}".format(new_status)) # delete pods that have been queueing too long if pods_name_to_delete_list: - tmp_log.debug('Deleting pods queuing too long') + tmp_log.debug("Deleting pods queuing too long") ret_list = self.k8s_client.delete_pods(pods_name_to_delete_list) deleted_pods_list = [] for item in ret_list: - if item['errMsg'] == '': - deleted_pods_list.append(item['name']) - tmp_log.debug('Deleted pods queuing too long: {0}'.format( - ','.join(deleted_pods_list))) + if item["errMsg"] == "": + deleted_pods_list.append(item["name"]) + tmp_log.debug("Deleted pods queuing too long: {0}".format(",".join(deleted_pods_list))) # supplemental diag messages - sup_error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if err_str else WorkerErrors.error_codes.get('SUCCEEDED') - sup_error_diag = 'PODs=' + ','.join(pods_sup_diag_list) + ' ; ' + err_str + sup_error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") if err_str else WorkerErrors.error_codes.get("SUCCEEDED") + sup_error_diag = "PODs=" + ",".join(pods_sup_diag_list) + " ; " + err_str workspec.set_supplemental_error(error_code=sup_error_code, error_diag=sup_error_diag) return new_status, err_str def check_workers(self, workspec_list): - tmp_log = self.make_logger(base_logger, 'queueName={0}'.format(self.queueName), method_name='check_workers') - tmp_log.debug('start') + tmp_log = 
self.make_logger(base_logger, "queueName={0}".format(self.queueName), method_name="check_workers") + tmp_log.debug("start") ret_list = list() if not workspec_list: - err_str = 'empty workspec_list' + err_str = "empty workspec_list" tmp_log.debug(err_str) - ret_list.append(('', err_str)) + ret_list.append(("", err_str)) return False, ret_list workers_info = self.k8s_client.get_workers_info(workspec_list=workspec_list) if workers_info is None: # there was a communication issue to the K8S cluster - tmp_log.debug('done without answer') + tmp_log.debug("done without answer") return False, ret_list self._all_workers_dict = workers_info @@ -235,5 +237,5 @@ def check_workers(self, workspec_list): ret_list = list(ret_iterator) - tmp_log.debug('done') + tmp_log.debug("done") return True, ret_list diff --git a/pandaharvester/harvestermonitor/lancium_monitor.py b/pandaharvester/harvestermonitor/lancium_monitor.py index 9b8fc621..43d11312 100644 --- a/pandaharvester/harvestermonitor/lancium_monitor.py +++ b/pandaharvester/harvestermonitor/lancium_monitor.py @@ -12,9 +12,8 @@ from pandaharvester.harvestermisc.lancium_utils import LanciumJobQuery - # logger -base_logger = core_utils.setup_logger('lancium_monitor') +base_logger = core_utils.setup_logger("lancium_monitor") # pilot error object PILOT_ERRORS = PilotErrors() @@ -23,66 +22,72 @@ # Check one worker def _check_one_worker(workspec, job_attr_all_dict, cancel_unknown=False, held_timeout=3600): # Make logger for one single worker - tmp_log = core_utils.make_logger(base_logger, 'workerID={0}'.format(workspec.workerID), method_name='_check_one_worker') + tmp_log = core_utils.make_logger(base_logger, "workerID={0}".format(workspec.workerID), method_name="_check_one_worker") # Initialize newStatus newStatus = workspec.status - errStr = '' + errStr = "" try: job_attr_dict = job_attr_all_dict[get_full_batch_id_from_workspec(workspec)] except KeyError: got_job_attr = False except Exception as e: got_job_attr = False - tmp_log.error('With error {0}'.format(e)) + tmp_log.error("With error {0}".format(e)) else: got_job_attr = True # Parse job ads if got_job_attr: - # Check + # Check try: # FIXME - new_batch_status = job_attr_dict['status'] + new_batch_status = job_attr_dict["status"] except KeyError: # Propagate native job status as unknown - workspec.nativeStatus = 'unknown' + workspec.nativeStatus = "unknown" if cancel_unknown: newStatus = WorkSpec.ST_cancelled - errStr = 'cannot get job status of submissionHost={0} batchID={1}. Regard the worker as canceled'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get job status of submissionHost={0} batchID={1}. Regard the worker as canceled".format( + workspec.submissionHost, workspec.batchID + ) tmp_log.error(errStr) else: newStatus = None - errStr = 'cannot get job status of submissionHost={0} batchID={1}. Skipped'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get job status of submissionHost={0} batchID={1}. 
Skipped".format(workspec.submissionHost, workspec.batchID) tmp_log.warning(errStr) else: # Possible native statuses: "created" "submitted" "queued" "ready" "running" "error" "finished" "delete pending" last_batch_status = workspec.nativeStatus batchStatus = new_batch_status # Set batchStatus if last_batch_status is terminated status - if (last_batch_status in ['error', 'finished', 'delete pending'] and new_batch_status not in ['error', 'finished', 'delete pending']) \ - or (last_batch_status in ['error', 'finished'] and new_batch_status in ['delete pending']): + if (last_batch_status in ["error", "finished", "delete pending"] and new_batch_status not in ["error", "finished", "delete pending"]) or ( + last_batch_status in ["error", "finished"] and new_batch_status in ["delete pending"] + ): batchStatus = last_batch_status - tmp_log.warning('refer to last_batch_status={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (new_batch_status={3})'.format( - last_batch_status, workspec.submissionHost, workspec.batchID, new_batch_status)) + tmp_log.warning( + "refer to last_batch_status={0} as new status of job submissionHost={1} batchID={2} to avoid reversal in status (new_batch_status={3})".format( + last_batch_status, workspec.submissionHost, workspec.batchID, new_batch_status + ) + ) # Propagate native job status workspec.nativeStatus = batchStatus - if batchStatus in ['running']: + if batchStatus in ["running"]: # running newStatus = WorkSpec.ST_running - elif batchStatus in ['created', 'submitted', 'queued', 'ready']: + elif batchStatus in ["created", "submitted", "queued", "ready"]: # pre-running newStatus = WorkSpec.ST_submitted - elif batchStatus in ['error']: + elif batchStatus in ["error"]: # failed - errStr += 'job error_string: {0} '.format(job_attr_dict.get('error_string')) + errStr += "job error_string: {0} ".format(job_attr_dict.get("error_string")) newStatus = WorkSpec.ST_failed - elif batchStatus in ['delete pending']: + elif batchStatus in ["delete pending"]: # cancelled - errStr = 'job error_string: {0} '.format(job_attr_dict.get('error_string')) + errStr = "job error_string: {0} ".format(job_attr_dict.get("error_string")) newStatus = WorkSpec.ST_cancelled # Mark the PanDA job as closed instead of failed workspec.set_pilot_closed() - tmp_log.debug('Called workspec set_pilot_closed') - elif batchStatus in ['finished']: + tmp_log.debug("Called workspec set_pilot_closed") + elif batchStatus in ["finished"]: # finished # try: # payloadExitCode_str = str(job_attr_dict['exit_code']) @@ -116,42 +121,45 @@ def _check_one_worker(workspec, job_attr_all_dict, cancel_unknown=False, held_ti # finished newStatus = WorkSpec.ST_finished try: - payloadExitCode_str = str(job_attr_dict['exit_code']) + payloadExitCode_str = str(job_attr_dict["exit_code"]) payloadExitCode = int(payloadExitCode_str) except KeyError: - errStr = 'cannot get exit_code of submissionHost={0} batchID={1}'.format(workspec.submissionHost, workspec.batchID) + errStr = "cannot get exit_code of submissionHost={0} batchID={1}".format(workspec.submissionHost, workspec.batchID) tmp_log.warning(errStr) except ValueError: - errStr = 'got invalid exit_code {0} of submissionHost={1} batchID={2}'.format(payloadExitCode_str, workspec.submissionHost, workspec.batchID) + errStr = "got invalid exit_code {0} of submissionHost={1} batchID={2}".format( + payloadExitCode_str, workspec.submissionHost, workspec.batchID + ) tmp_log.warning(errStr) else: # Propagate exit_code code workspec.nativeExitCode = 
payloadExitCode - tmp_log.info('Payload return code = {0}'.format(payloadExitCode)) + tmp_log.info("Payload return code = {0}".format(payloadExitCode)) else: - errStr = 'cannot get reasonable job status of submissionHost={0} batchID={1}. Regard the worker as failed by default'.format( - workspec.submissionHost, workspec.batchID) + errStr = "cannot get reasonable job status of submissionHost={0} batchID={1}. Regard the worker as failed by default".format( + workspec.submissionHost, workspec.batchID + ) tmp_log.error(errStr) newStatus = WorkSpec.ST_failed - tmp_log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format( - workspec.submissionHost, workspec.batchID, batchStatus, newStatus)) + tmp_log.info( + "submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}".format(workspec.submissionHost, workspec.batchID, batchStatus, newStatus) + ) else: # Propagate native job status as unknown - workspec.nativeStatus = 'unknown' + workspec.nativeStatus = "unknown" if cancel_unknown: - errStr = 'job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default'.format( - workspec.submissionHost, workspec.batchID) + errStr = "job submissionHost={0} batchID={1} not found. Regard the worker as canceled by default".format(workspec.submissionHost, workspec.batchID) tmp_log.error(errStr) newStatus = WorkSpec.ST_cancelled - tmp_log.info('submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}'.format( - workspec.submissionHost, workspec.batchID, '3', newStatus)) + tmp_log.info( + "submissionHost={0} batchID={1} : batchStatus {2} -> workerStatus {3}".format(workspec.submissionHost, workspec.batchID, "3", newStatus) + ) else: - errStr = 'job submissionHost={0} batchID={1} not found. Skipped'.format( - workspec.submissionHost, workspec.batchID) + errStr = "job submissionHost={0} batchID={1} not found. 
Skipped".format(workspec.submissionHost, workspec.batchID) tmp_log.warning(errStr) newStatus = None # Set supplemental error message - error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if errStr else WorkerErrors.error_codes.get('SUCCEEDED') + error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") if errStr else WorkerErrors.error_codes.get("SUCCEEDED") workspec.set_supplemental_error(error_code=error_code, error_diag=errStr) # Return return (newStatus, errStr) @@ -192,39 +200,34 @@ def __init__(self, **kwarg): # check workers def check_workers(self, workspec_list): # Make logger for batch job query - tmp_log = self.make_logger(base_logger, '{0}'.format('batch job query'), - method_name='check_workers') - tmp_log.debug('start') + tmp_log = self.make_logger(base_logger, "{0}".format("batch job query"), method_name="check_workers") + tmp_log.debug("start") # Loop over submissionHost job_attr_all_dict = {} for submissionHost, batchIDs_list in get_host_batch_id_map(workspec_list).items(): # Record batch job query result to this dict, with key = batchID try: - job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, - cacheRefreshInterval=self.cacheRefreshInterval, - id=submissionHost) + job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, id=submissionHost) host_job_attr_dict = job_query.query_jobs(batchIDs_list=batchIDs_list) except Exception as e: host_job_attr_dict = {} - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmp_log.error(ret_err_str) job_attr_all_dict.update(host_job_attr_dict) # Check for all workers with Pool(self.nProcesses) as _pool: - retIterator = _pool.map(lambda _x: _check_one_worker( - _x, job_attr_all_dict, - cancel_unknown=self.cancelUnknown, - held_timeout=self.heldTimeout), - workspec_list) + retIterator = _pool.map( + lambda _x: _check_one_worker(_x, job_attr_all_dict, cancel_unknown=self.cancelUnknown, held_timeout=self.heldTimeout), workspec_list + ) retList = list(retIterator) - tmp_log.debug('done') + tmp_log.debug("done") return True, retList # report updated workers info to monitor to check def report_updated_workers(self, time_window): # Make logger for batch job query - tmp_log = self.make_logger(base_logger, method_name='report_updated_workers') - tmp_log.debug('start') + tmp_log = self.make_logger(base_logger, method_name="report_updated_workers") + tmp_log.debug("start") # Get now timestamp timeNow = time.time() # Set of submission hosts @@ -235,32 +238,30 @@ def report_updated_workers(self, time_window): job_attr_all_dict = {} for submissionHost in submission_host_set: try: - job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, - cacheRefreshInterval=self.cacheRefreshInterval, - id=submissionHost) + job_query = LanciumJobQuery(cacheEnable=self.cacheEnable, cacheRefreshInterval=self.cacheRefreshInterval, id=submissionHost) job_attr_all_dict.update(job_query.query_jobs(all_jobs=True)) - tmp_log.debug('got information of jobs on {0}'.format(submissionHost)) + tmp_log.debug("got information of jobs on {0}".format(submissionHost)) except Exception as e: - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmp_log.error(ret_err_str) # Choose workers updated within a time window workers_to_check_list = [] for full_batch_id, job_attr_dict in job_attr_all_dict.items(): # put in worker cache fifo, with lock mechanism - 
job_update_at_str = job_attr_dict.get('updated_at') + job_update_at_str = job_attr_dict.get("updated_at") try: job_update_at = timestamp_to_datetime(job_update_at_str) except Exception as e: - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmp_log.error(ret_err_str) job_update_at = None if job_update_at is not None and not (job_update_at > timeNow - time_window): continue - job_name = job_attr_dict.get('name') + job_name = job_attr_dict.get("name") harvester_id, worker_id = get_workerid_from_job_name(job_name) if worker_id is None or harvester_id != harvester_config.master.harvester_id: continue workers_to_check_list.append((worker_id, job_update_at)) - tmp_log.debug('got {0} workers'.format(len(workers_to_check_list))) - tmp_log.debug('done') + tmp_log.debug("got {0} workers".format(len(workers_to_check_list))) + tmp_log.debug("done") return workers_to_check_list diff --git a/pandaharvester/harvestermonitor/lsf_monitor.py b/pandaharvester/harvestermonitor/lsf_monitor.py index ea226f60..73b2574d 100644 --- a/pandaharvester/harvestermonitor/lsf_monitor.py +++ b/pandaharvester/harvestermonitor/lsf_monitor.py @@ -4,7 +4,7 @@ try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils @@ -12,7 +12,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('lsf_monitor') +baseLogger = core_utils.setup_logger("lsf_monitor") # monitor for LSF batch system @@ -26,24 +26,20 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") # command - comStr = 'bjobs -a -noheader -o {0} {1} '.format(quote("jobid:10 stat:10"),workSpec.batchID) + comStr = "bjobs -a -noheader -o {0} {1} ".format(quote("jobid:10 stat:10"), workSpec.batchID) comStr_split = split(comStr) # check - p = subprocess.Popen(comStr_split, - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(comStr_split, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('len(stdOut) = {0} stdOut={1}'.format(len(str(stdOut)),stdOut)) - tmpLog.debug('len(stdErr) = {0} stdErr={1}'.format(len(str(stdErr)),stdErr)) - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("len(stdOut) = {0} stdOut={1}".format(len(str(stdOut)), stdOut)) + tmpLog.debug("len(stdErr) = {0} stdErr={1}".format(len(str(stdErr)), stdErr)) + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" if retCode == 0: # check if any came back on stdOut otherwise check stdErr tempresponse = "" @@ -51,39 +47,37 @@ def check_workers(self, workspec_list): tempresponse = str(stdOut) else: tempresponse = str(stdErr) - #tmpLog.debug('tempresponse = {0}'.format(tempresponse)) + # tmpLog.debug('tempresponse = {0}'.format(tempresponse)) # parse - for tmpLine in tempresponse.split('\n'): - tmpMatch = re.search('{0}'.format(workSpec.batchID), tmpLine) - tmpLog.debug('tmpLine = {0} tmpMatch = {1}'.format(tmpLine,tmpMatch)) + for tmpLine in tempresponse.split("\n"): + tmpMatch = re.search("{0}".format(workSpec.batchID), tmpLine) + 
tmpLog.debug("tmpLine = {0} tmpMatch = {1}".format(tmpLine, tmpMatch)) if tmpMatch is not None: errStr = tmpLine # search for phrase is not found - tmpMatch = re.search('is not found', tmpLine) + tmpMatch = re.search("is not found", tmpLine) if tmpMatch is not None: - batchStatus = 'Job {0} is not found'.format(workSpec.batchID) + batchStatus = "Job {0} is not found".format(workSpec.batchID) newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - retCode)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, retCode)) else: batchStatus = tmpLine.split()[-2] - if batchStatus in ['RUN']: + if batchStatus in ["RUN"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['DONE']: + elif batchStatus in ["DONE"]: newStatus = WorkSpec.ST_finished - elif batchStatus in ['PEND', 'PROV','WAIT']: + elif batchStatus in ["PEND", "PROV", "WAIT"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) break retList.append((newStatus, errStr)) else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) - if 'Unknown Job Id Error' in errStr: + if "Unknown Job Id Error" in errStr: tmpLog.info("Mark job as finished.") newStatus = WorkSpec.ST_finished retList.append((newStatus, errStr)) diff --git a/pandaharvester/harvestermonitor/pbs_monitor.py b/pandaharvester/harvestermonitor/pbs_monitor.py index cca7c005..cc719e50 100644 --- a/pandaharvester/harvestermonitor/pbs_monitor.py +++ b/pandaharvester/harvestermonitor/pbs_monitor.py @@ -1,7 +1,8 @@ import re + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils @@ -9,7 +10,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('pbs_monitor') +baseLogger = core_utils.setup_logger("pbs_monitor") # monitor for PBS batch system @@ -23,48 +24,43 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") # command comStr = "qstat {0}".format(workSpec.batchID) # check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("check with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" if retCode == 0: # parse - for tmpLine in stdOut.split('\n'): - tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine) + for tmpLine in stdOut.split("\n"): + tmpMatch = re.search("{0} ".format(workSpec.batchID), tmpLine) if tmpMatch is not None: errStr = tmpLine batchStatus = tmpLine.split()[-2] - if batchStatus in ['R', 'E']: + if batchStatus in ["R", "E"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['C', 'H']: + elif batchStatus in ["C", "H"]: newStatus = 
WorkSpec.ST_finished - elif batchStatus in ['CANCELLED']: + elif batchStatus in ["CANCELLED"]: newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['Q', 'W', 'S']: + elif batchStatus in ["Q", "W", "S"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) break retList.append((newStatus, errStr)) else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) - if 'Unknown Job Id Error' in errStr: + if "Unknown Job Id Error" in errStr: tmpLog.info("Mark job as finished.") newStatus = WorkSpec.ST_finished retList.append((newStatus, errStr)) diff --git a/pandaharvester/harvestermonitor/saga_monitor.py b/pandaharvester/harvestermonitor/saga_monitor.py index 9710e21c..32e53704 100644 --- a/pandaharvester/harvestermonitor/saga_monitor.py +++ b/pandaharvester/harvestermonitor/saga_monitor.py @@ -12,7 +12,7 @@ from pandaharvester.harvestersubmitter.saga_submitter import SAGASubmitter # logger -baseLogger = core_utils.setup_logger('saga_monitor') +baseLogger = core_utils.setup_logger("saga_monitor") # monitor through SAGA @@ -22,14 +22,14 @@ def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) self.pluginFactory = PluginFactory() self.queue_config_mapper = QueueConfigMapper() - tmpLog = self.make_logger(baseLogger, method_name='__init__') + tmpLog = self.make_logger(baseLogger, method_name="__init__") tmpLog.info("[{0}] SAGA adaptor will be used.".format(self.adaptor)) # check workers def check_workers(self, workspec_list): """Check status of workers. This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses. - + :param workspec_list: a list of work specs instances :return: A tuple of return code (True for success, False otherwise) and a list of worker's statuses. 
:rtype: (bool, [string,]) @@ -39,25 +39,22 @@ def check_workers(self, workspec_list): except saga.SagaException as ex: time.sleep(10) self.check_workers(workspec_list) - sagadateformat_str = '%a %b %d %H:%M:%S %Y' + sagadateformat_str = "%a %b %d %H:%M:%S %Y" retList = [] for workSpec in workspec_list: # make logger - errStr = '' - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + errStr = "" + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") tmpLog.debug("SAGA monitor started") if workSpec.batchID: - saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID) + saga_submission_id = "[{0}]-[{1}]".format(self.adaptor, workSpec.batchID) try: worker = job_service.get_job(saga_submission_id) - tmpLog.debug( - 'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state)) + tmpLog.debug("SAGA State for submission with batchid: {0} is: {1}".format(workSpec.batchID, worker.state)) harvester_job_state = SAGASubmitter.status_translator(worker.state) workSpec.nativeStatus = worker.state workSpec.set_status(harvester_job_state) - tmpLog.debug( - 'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID, harvester_job_state, worker.exit_code)) + tmpLog.debug("Worker state with batchid: {0} is: {1} exit code: {2}".format(workSpec.batchID, harvester_job_state, worker.exit_code)) workSpec.set_status(harvester_job_state) if worker.created: tmpLog.debug("Worker created (SAGA): {0}".format(worker.created)) @@ -75,7 +72,8 @@ def check_workers(self, workspec_list): if workSpec.nativeExitCode != 0: # let's try to find exit code, exit message etc... tmpLog.info("Deep check to find exit code and exit status required") harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob( - workSpec.batchID, workSpec.workerID) + workSpec.batchID, workSpec.workerID + ) if harvester_job_state == "": harvester_job_state = workSpec.ST_finished if not workSpec.startTime: @@ -83,17 +81,18 @@ def check_workers(self, workspec_list): if endtime: workSpec.endTime = endtime workSpec.set_status(harvester_job_state) - tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format( - workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state)) - tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished)) + tmpLog.info( + "Worker {2} with BatchID={0} finished with exit code {1} and state {3}".format( + workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state + ) + ) + tmpLog.debug("Started: [{0}] finished: [{1}]".format(worker.started, worker.finished)) if worker.state == saga.job.PENDING: queue_time = (datetime.now() - workSpec.submitTime).total_seconds() tmpLog.info("Worker queued for {0} sec.".format(queue_time)) - if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime: - tmpLog.info( - "Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time, - self.maxqueuetime)) + if hasattr(self, "maxqueuetime") and queue_time > self.maxqueuetime: + tmpLog.info("Queue time {0} is longer than limit {1} worker will be canceled".format(queue_time, self.maxqueuetime)) worker.cancel() worker.wait() workSpec.nativeExitCode = worker.exit_code @@ -103,17 +102,17 @@ def check_workers(self, workspec_list): workSpec.set_pilot_closed() workSpec.set_status(workSpec.ST_cancelled) harvester_job_state = 
workSpec.ST_cancelled - tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state, - workSpec.nativeExitCode)) + tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state, workSpec.nativeExitCode)) # proper processing of jobs for worker will be required, to avoid 'fake' fails except saga.SagaException as ex: - tmpLog.info('An exception occured during retriving worker information {0}'.format(workSpec.batchID)) + tmpLog.info("An exception occured during retriving worker information {0}".format(workSpec.batchID)) tmpLog.info(ex.get_message()) # probably 'fnished' is not proper state in this case, 'undefined' looks a bit better # some more work for SAGA to get proper state harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus, starttime, endtime, errStr = self.deep_checkjob( - workSpec.batchID, workSpec.workerID) + workSpec.batchID, workSpec.workerID + ) if harvester_job_state == "": harvester_job_state = workSpec.ST_finished if not workSpec.startTime: @@ -121,10 +120,10 @@ def check_workers(self, workspec_list): if endtime: workSpec.endTime = endtime workSpec.set_status(harvester_job_state) - tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state)) + tmpLog.debug("Worker state set to: {0} ({1})".format(workSpec.status, harvester_job_state)) retList.append((harvester_job_state, errStr)) # for compatibility with dummy monitor - f = open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') + f = open(os.path.join(workSpec.accessPoint, "status.txt"), "w") f.write(workSpec.status) f.close() @@ -132,7 +131,7 @@ def check_workers(self, workspec_list): tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID)) job_service.close() - tmpLog.debug('Results: {0}'.format(retList)) + tmpLog.debug("Results: {0}".format(retList)) return True, retList @@ -143,7 +142,7 @@ def deep_checkjob(self, batchid, workerid): :param batchid: :return harvester_job_state, nativeExitCode, nativeStatus, startTime, endTime, diagMessage """ - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workerid), method_name='deep_checkjob') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workerid), method_name="deep_checkjob") harvester_job_state = None nativeexitcode = None nativestatus = None @@ -151,7 +150,7 @@ def deep_checkjob(self, batchid, workerid): starttime = None endtime = None queue_config = self.queue_config_mapper.get_queue(self.queueName) - if hasattr(queue_config, 'resource'): + if hasattr(queue_config, "resource"): resource_utils = self.pluginFactory.get_plugin(queue_config.resource) else: tmpLog.debug("Resource configuration missed for: {0}".format(self.queueName)) @@ -159,14 +158,14 @@ def deep_checkjob(self, batchid, workerid): if resource_utils: batchjob_info = resource_utils.get_batchjob_info(batchid) if batchjob_info: - tmpLog.info('Batch job info collected: {0}'.format(batchjob_info)) - harvester_job_state = batchjob_info['status'] - nativeexitcode = batchjob_info['nativeExitCode'] - nativestatus = batchjob_info['nativeStatus'] - diagmessage = batchjob_info['nativeExitMsg'] - if batchjob_info['start_time']: - starttime = batchjob_info['start_time'] - if batchjob_info['finish_time']: - endtime = batchjob_info['finish_time'] + tmpLog.info("Batch job info collected: {0}".format(batchjob_info)) + harvester_job_state = batchjob_info["status"] + nativeexitcode = batchjob_info["nativeExitCode"] + nativestatus = batchjob_info["nativeStatus"] + diagmessage = 
batchjob_info["nativeExitMsg"] + if batchjob_info["start_time"]: + starttime = batchjob_info["start_time"] + if batchjob_info["finish_time"]: + endtime = batchjob_info["finish_time"] return harvester_job_state, nativeexitcode, nativestatus, starttime, endtime, diagmessage diff --git a/pandaharvester/harvestermonitor/slurm_bulk_monitor.py b/pandaharvester/harvestermonitor/slurm_bulk_monitor.py index 9f78973d..75ecfa7a 100644 --- a/pandaharvester/harvestermonitor/slurm_bulk_monitor.py +++ b/pandaharvester/harvestermonitor/slurm_bulk_monitor.py @@ -8,7 +8,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('slurm_monitor') +baseLogger = core_utils.setup_logger("slurm_monitor") # monitor for SLURM batch system @@ -16,7 +16,7 @@ class SlurmBulkMonitor(PluginBase): # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - if not hasattr(self, 'use_squeue_monitor'): + if not hasattr(self, "use_squeue_monitor"): self.use_squeue_monitor = False self.use_squeue_monitor = bool(self.use_squeue_monitor) @@ -31,12 +31,11 @@ def check_workers(self, workspec_list, bulk_size=100): def check_workers_sacct(self, workspec_list, bulk_size=100): retList = [] batch_id_status_map = {} - workspec_list_chunks = [workspec_list[i:i + bulk_size] for i in range(0, len(workspec_list), bulk_size)] + workspec_list_chunks = [workspec_list[i : i + bulk_size] for i in range(0, len(workspec_list), bulk_size)] for workspec_list_chunk in workspec_list_chunks: # make logger # worker_ids = [workSpec.workerID for workSpec in workspec_list_chunk] - tmpLog = self.make_logger(baseLogger, 'bulkWorkers', - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "bulkWorkers", method_name="check_workers") batch_id_list = [] for workSpec in workspec_list_chunk: @@ -45,23 +44,20 @@ def check_workers_sacct(self, workspec_list, bulk_size=100): # command comStr = "sacct -X --jobs={0}".format(batch_id_list_str) # check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("check with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() tmpLog.debug("stdout={0}".format(stdOut_str)) tmpLog.debug("stderr={0}".format(stdErr_str)) if retCode == 0: - for tmpLine in stdOut_str.split('\n'): + for tmpLine in stdOut_str.split("\n"): if len(tmpLine) == 0 or tmpLine.startswith("JobID") or tmpLine.startswith("--"): continue batchID = tmpLine.split()[0].strip() @@ -70,24 +66,23 @@ def check_workers_sacct(self, workspec_list, bulk_size=100): else: batchStatus = tmpLine.split()[5].strip() - if batchStatus in ['RUNNING', 'COMPLETING', 'STOPPED', 'SUSPENDED']: + if batchStatus in ["RUNNING", "COMPLETING", "STOPPED", "SUSPENDED"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['COMPLETED', 'PREEMPTED', 'TIMEOUT']: + elif batchStatus in ["COMPLETED", "PREEMPTED", "TIMEOUT"]: newStatus = WorkSpec.ST_finished - elif batchStatus in ['CANCELLED']: + elif batchStatus in ["CANCELLED"]: 
newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['CONFIGURING', 'PENDING']: + elif batchStatus in ["CONFIGURING", "PENDING"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) batch_id_status_map[batchID] = (newStatus, stdErr_str) else: # failed - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) - if 'slurm_load_jobs error: Invalid job id specified' in errStr: + if "slurm_load_jobs error: Invalid job id specified" in errStr: newStatus = WorkSpec.ST_failed for batchID in batch_id_list: batch_id_status_map[batchID] = (newStatus, errStr) @@ -101,69 +96,62 @@ def check_workers_sacct(self, workspec_list, bulk_size=100): newStatus = WorkSpec.ST_failed errStr = "Unknown batchID" retList.append((newStatus, errStr)) - tmpLog.debug("Worker {0} -> workerStatus {1} errStr {2}".format(workSpec.workerID, - newStatus, - errStr)) + tmpLog.debug("Worker {0} -> workerStatus {1} errStr {2}".format(workSpec.workerID, newStatus, errStr)) return True, retList def check_workers_squeue(self, workspec_list, bulk_size=100): retList = [] batch_id_status_map = {} - workspec_list_chunks = [workspec_list[i:i + bulk_size] for i in range(0, len(workspec_list), bulk_size)] + workspec_list_chunks = [workspec_list[i : i + bulk_size] for i in range(0, len(workspec_list), bulk_size)] for workspec_list_chunk in workspec_list_chunks: # make logger # worker_ids = [workSpec.workerID for workSpec in workspec_list_chunk] - tmpLog = self.make_logger(baseLogger, 'bulkWorkers', - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "bulkWorkers", method_name="check_workers") batch_id_list = [] for workSpec in workspec_list_chunk: batch_id_list.append(str(workSpec.batchID)) batch_id_list_str = ",".join(batch_id_list) # command - comStr = 'squeue -t all --jobs={0}'.format(batch_id_list_str) + comStr = "squeue -t all --jobs={0}".format(batch_id_list_str) # check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("check with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() tmpLog.debug("stdout={0}".format(stdOut_str)) tmpLog.debug("stderr={0}".format(stdErr_str)) if retCode == 0: - for tmpLine in stdOut_str.split('\n'): + for tmpLine in stdOut_str.split("\n"): tmpLine = tmpLine.strip() if len(tmpLine) == 0 or tmpLine.startswith("JobID") or tmpLine.startswith("--") or tmpLine.startswith("JOBID"): continue batchID = tmpLine.split()[0].strip() batchStatus = tmpLine.split()[4].strip() - if batchStatus in ['R', 'CG', 'ST', 'S']: + if batchStatus in ["R", "CG", "ST", "S"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['CD', 'PR', 'TO']: + elif batchStatus in ["CD", "PR", "TO"]: newStatus = WorkSpec.ST_finished - elif batchStatus in ['CA']: + elif batchStatus in ["CA"]: 
newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['CF', 'PD']: + elif batchStatus in ["CF", "PD"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) batch_id_status_map[batchID] = (newStatus, stdErr_str) else: # failed - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) - if 'slurm_load_jobs error: Invalid job id specified' in errStr: + if "slurm_load_jobs error: Invalid job id specified" in errStr: newStatus = WorkSpec.ST_failed for batchID in batch_id_list: batch_id_status_map[batchID] = (newStatus, errStr) @@ -177,7 +165,5 @@ def check_workers_squeue(self, workspec_list, bulk_size=100): newStatus = WorkSpec.ST_failed errStr = "Unknown batchID" retList.append((newStatus, errStr)) - tmpLog.debug("Worker {0} -> workerStatus {1} errStr {2}".format(workSpec.workerID, - newStatus, - errStr)) + tmpLog.debug("Worker {0} -> workerStatus {1} errStr {2}".format(workSpec.workerID, newStatus, errStr)) return True, retList diff --git a/pandaharvester/harvestermonitor/slurm_monitor.py b/pandaharvester/harvestermonitor/slurm_monitor.py index 77d215d7..bf2693dc 100644 --- a/pandaharvester/harvestermonitor/slurm_monitor.py +++ b/pandaharvester/harvestermonitor/slurm_monitor.py @@ -1,4 +1,5 @@ import re + try: import subprocess32 as subprocess except ImportError: @@ -9,7 +10,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('slurm_monitor') +baseLogger = core_utils.setup_logger("slurm_monitor") # monitor for SLURM batch system @@ -23,49 +24,44 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") # command comStr = "sacct --jobs={0}".format(workSpec.batchID) # check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("check with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: - for tmpLine in stdOut_str.split('\n'): - tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine) + for tmpLine in stdOut_str.split("\n"): + tmpMatch = re.search("{0} ".format(workSpec.batchID), tmpLine) if tmpMatch is not None: errStr = tmpLine batchStatus = tmpLine.split()[5] - if batchStatus in ['RUNNING', 'COMPLETING', 'STOPPED', 'SUSPENDED']: + if batchStatus in ["RUNNING", "COMPLETING", "STOPPED", "SUSPENDED"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['COMPLETED', 'PREEMPTED', 'TIMEOUT']: + elif batchStatus in ["COMPLETED", "PREEMPTED", "TIMEOUT"]: newStatus = WorkSpec.ST_finished - elif batchStatus in 
['CANCELLED']: + elif batchStatus in ["CANCELLED"]: newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['CONFIGURING', 'PENDING']: + elif batchStatus in ["CONFIGURING", "PENDING"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) break retList.append((newStatus, errStr)) else: # failed - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) - if 'slurm_load_jobs error: Invalid job id specified' in errStr: + if "slurm_load_jobs error: Invalid job id specified" in errStr: newStatus = WorkSpec.ST_failed retList.append((newStatus, errStr)) return True, retList diff --git a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py index d9a68e01..0ca90690 100644 --- a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py +++ b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py @@ -1,4 +1,5 @@ import re + try: import subprocess32 as subprocess except ImportError: @@ -12,12 +13,12 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('slurm_squeue_monitor') +baseLogger = core_utils.setup_logger("slurm_squeue_monitor") # monitor for SLURM batch system with squeue class SlurmSqueueMonitor(PluginBase): - _HARVESTER_POSTMORTEM_FILENAME="FINISHED" + _HARVESTER_POSTMORTEM_FILENAME = "FINISHED" # constructor def __init__(self, **kwarg): @@ -28,75 +29,69 @@ def check_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") # here try to load file - current_postmortem_fname = '%s/%s' %(workSpec.accessPoint, SlurmSqueueMonitor._HARVESTER_POSTMORTEM_FILENAME) + current_postmortem_fname = "%s/%s" % (workSpec.accessPoint, SlurmSqueueMonitor._HARVESTER_POSTMORTEM_FILENAME) if os.path.exists(current_postmortem_fname): with open(current_postmortem_fname) as postmortem: try: worker_status_json = json.load(postmortem) - if 'worker_status' in worker_status_json: + if "worker_status" in worker_status_json: worker_status = None - if worker_status_json['worker_status']=='finished': - worker_status = WorkSpec.ST_finished - if worker_status_json['worker_status']=='failed': - worker_status = WorkSpec.ST_failed + if worker_status_json["worker_status"] == "finished": + worker_status = WorkSpec.ST_finished + if worker_status_json["worker_status"] == "failed": + worker_status = WorkSpec.ST_failed if worker_status is not None: - retList.append((worker_status, '')) - continue + retList.append((worker_status, "")) + continue except json.JSONDecodeError: - tmpLog.debug('Not able to parse JSON in postmortem for a worker: %s, continung with SLURM CLI' % current_postmortem_fname) + tmpLog.debug("Not able to parse JSON in postmortem for a worker: %s, continung with SLURM CLI" % current_postmortem_fname) # command comStr = "squeue -j {0}".format(workSpec.batchID) # check - tmpLog.debug('check with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("check with {0}".format(comStr)) + p = 
subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) newStatus = workSpec.status # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - errStr = '' + tmpLog.debug("retCode={0}".format(retCode)) + errStr = "" stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: - for tmpLine in stdOut_str.split('\n'): - tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine) + for tmpLine in stdOut_str.split("\n"): + tmpMatch = re.search("{0} ".format(workSpec.batchID), tmpLine) if tmpMatch is not None: errStr = tmpLine batchStatus = tmpLine.split()[4] - if batchStatus in ['R', 'RUNNING', 'COMPLETING', 'STOPPED', 'SUSPENDED']: + if batchStatus in ["R", "RUNNING", "COMPLETING", "STOPPED", "SUSPENDED"]: newStatus = WorkSpec.ST_running - elif batchStatus in ['COMPLETED', 'PREEMPTED', 'TIMEOUT']: + elif batchStatus in ["COMPLETED", "PREEMPTED", "TIMEOUT"]: newStatus = WorkSpec.ST_finished - elif batchStatus in ['CANCELLED']: + elif batchStatus in ["CANCELLED"]: newStatus = WorkSpec.ST_cancelled - elif batchStatus in ['PD', 'CONFIGURING', 'PENDING']: + elif batchStatus in ["PD", "CONFIGURING", "PENDING"]: newStatus = WorkSpec.ST_submitted else: newStatus = WorkSpec.ST_failed - tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, - newStatus)) + tmpLog.debug("batchStatus {0} -> workerStatus {1}".format(batchStatus, newStatus)) break retList.append((newStatus, errStr)) else: # squeue does not show finished jobs, gives return code 1 # Assume finished for now. Maybe look in workdir. newStatus = WorkSpec.ST_finished - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) - #if 'slurm_load_jobs error: Invalid job id specified' in errStr: + # if 'slurm_load_jobs error: Invalid job id specified' in errStr: # newStatus = WorkSpec.ST_failed retList.append((newStatus, errStr)) return True, retList - def _get_worker_completion_details(): # try to open FINISHED file pass diff --git a/pandaharvester/harvestermover/mover_utils.py b/pandaharvester/harvestermover/mover_utils.py index 72f678dd..2f99809a 100644 --- a/pandaharvester/harvestermover/mover_utils.py +++ b/pandaharvester/harvestermover/mover_utils.py @@ -6,12 +6,8 @@ # construct file path def construct_file_path(base_path, scope, lfn): hash = hashlib.md5() - hash.update(six.b('%s:%s' % (scope, lfn))) + hash.update(six.b("%s:%s" % (scope, lfn))) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - dstURL = "{basePath}/{scope}/{hash1}/{hash2}/{lfn}".format(basePath=base_path, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=lfn) + correctedscope = "/".join(scope.split(".")) + dstURL = "{basePath}/{scope}/{hash1}/{hash2}/{lfn}".format(basePath=base_path, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=lfn) return dstURL diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index bad5b061..e4a186a1 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -1,5 +1,6 @@ import os import shutil + try: import subprocess32 as subprocess except Exception: @@ -13,7 +14,7 @@ from 
pandaharvester.harvestermover import mover_utils # logger -baseLogger = core_utils.setup_logger('analysis_aux_preparator') +baseLogger = core_utils.setup_logger("analysis_aux_preparator") # preparator plugin for analysis auxiliary inputs @@ -28,19 +29,18 @@ def __init__(self, **kwarg): # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") # loop over all inputs allDone = True bulkExtCommand = {} - tmpLog.debug('number of inFiles : {0}'.format(len(jobspec.inFiles))) + tmpLog.debug("number of inFiles : {0}".format(len(jobspec.inFiles))) for tmpFileSpec in jobspec.inFiles: # local access path url = tmpFileSpec.url accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) - accPathTmp = accPath + '.tmp' - tmpLog.debug('url : {0} accPath : {1}'.format(url,accPath)) + accPathTmp = accPath + ".tmp" + tmpLog.debug("url : {0} accPath : {1}".format(url, accPath)) # check if already exits if os.path.exists(accPath): continue @@ -53,59 +53,59 @@ def trigger_preparation(self, jobspec): if url.startswith(protocol): extCommand = self.externalCommand[protocol] # collect file info to execute the command later - bulkExtCommand.setdefault(protocol, {'command': extCommand, 'url': [], 'dst': [], 'lfn': []}) - bulkExtCommand[protocol]['url'].append(url) - bulkExtCommand[protocol]['dst'].append(accPath) - bulkExtCommand[protocol]['lfn'].append(tmpFileSpec.lfn) + bulkExtCommand.setdefault(protocol, {"command": extCommand, "url": [], "dst": [], "lfn": []}) + bulkExtCommand[protocol]["url"].append(url) + bulkExtCommand[protocol]["dst"].append(accPath) + bulkExtCommand[protocol]["lfn"].append(tmpFileSpec.lfn) break # execute the command later if extCommand is not None: continue # execute return_code = 1 - if url.startswith('http'): + if url.startswith("http"): try: - tmpLog.debug('getting via http from {0} to {1}'.format(url, accPathTmp)) + tmpLog.debug("getting via http from {0} to {1}".format(url, accPathTmp)) res = requests.get(url, timeout=180, verify=False) if res.status_code == 200: - with open(accPathTmp, 'wb') as f: + with open(accPathTmp, "wb") as f: f.write(res.content) - tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) + tmpLog.debug("Successfully fetched file - {0}".format(accPathTmp)) return_code = 0 else: - errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text) + errMsg = "failed to get {0} with StatusCode={1} {2}".format(url, res.status_code, res.text) tmpLog.error(errMsg) except requests.exceptions.ReadTimeout: - tmpLog.error('read timeout when getting data from {0}'.format(url)) + tmpLog.error("read timeout when getting data from {0}".format(url)) except Exception: core_utils.dump_error_message(tmpLog) - elif url.startswith('docker'): + elif url.startswith("docker"): if self.containerRuntime is None: - tmpLog.debug('container downloading is disabled') + tmpLog.debug("container downloading is disabled") continue - if self.containerRuntime == 'docker': - args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] - return_code = self.make_image(jobspec,args) - elif self.containerRuntime == 'singularity': - args = ['singularity', 'build', '--sandbox', accPathTmp, url ] - return_code = self.make_image(jobspec,args) - elif 
self.containerRuntime == 'shifter': - args = ['shifterimg', 'pull', url ] - return_code = self.make_image(jobspec,args) + if self.containerRuntime == "docker": + args = ["docker", "save", "-o", accPathTmp, url.split("://")[-1]] + return_code = self.make_image(jobspec, args) + elif self.containerRuntime == "singularity": + args = ["singularity", "build", "--sandbox", accPathTmp, url] + return_code = self.make_image(jobspec, args) + elif self.containerRuntime == "shifter": + args = ["shifterimg", "pull", url] + return_code = self.make_image(jobspec, args) else: - tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) - elif url.startswith('/'): + tmpLog.error("unsupported container runtime : {0}".format(self.containerRuntime)) + elif url.startswith("/"): try: shutil.copyfile(url, accPathTmp) return_code = 0 except Exception: core_utils.dump_error_message(tmpLog) else: - tmpLog.error('unsupported protocol in {0}'.format(url)) + tmpLog.error("unsupported protocol in {0}".format(url)) # remove empty files if os.path.exists(accPathTmp) and os.path.getsize(accPathTmp) == 0: return_code = 1 - tmpLog.debug('remove empty file - {0}'.format(accPathTmp)) + tmpLog.debug("remove empty file - {0}".format(accPathTmp)) try: os.remove(accPathTmp) except Exception: @@ -121,35 +121,35 @@ def trigger_preparation(self, jobspec): allDone = False # execute external command execIdMap = {} - tmpLog.debug('bulkExtCommand : {0}'.format(bulkExtCommand)) + tmpLog.debug("bulkExtCommand : {0}".format(bulkExtCommand)) for protocol in bulkExtCommand: args = [] - for arg in bulkExtCommand[protocol]['command']['trigger']['args']: - if arg == '{src}': - arg = ','.join(bulkExtCommand[protocol]['url']) - elif arg == '{dst}': - arg = ','.join(bulkExtCommand[protocol]['dst']) + for arg in bulkExtCommand[protocol]["command"]["trigger"]["args"]: + if arg == "{src}": + arg = ",".join(bulkExtCommand[protocol]["url"]) + elif arg == "{dst}": + arg = ",".join(bulkExtCommand[protocol]["dst"]) args.append(arg) # execute try: - tmpLog.debug('executing external command: ' + ' '.join(args)) + tmpLog.debug("executing external command: " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is None: - stdout = '' + stdout = "" if stderr is None: - stderr = '' + stderr = "" # get ID of command execution such as transfer ID and batch job ID executionID = None - if return_code == 0 and 'check' in bulkExtCommand[protocol]['command']: - executionID = [s for s in stdout.split('\n') if s][-1] - dst = ','.join(bulkExtCommand[protocol]['dst']) - executionID = '{0}:{1}:{2}'.format(protocol, executionID, dst) - tmpLog.debug('executionID - {0}'.format(executionID)) - execIdMap[executionID] = {'lfns': bulkExtCommand[protocol]['lfn'], 'groupStatus': 'active'} - stdout = stdout.replace('\n', ' ') - stderr = stderr.replace('\n', ' ') + if return_code == 0 and "check" in bulkExtCommand[protocol]["command"]: + executionID = [s for s in stdout.split("\n") if s][-1] + dst = ",".join(bulkExtCommand[protocol]["dst"]) + executionID = "{0}:{1}:{2}".format(protocol, executionID, dst) + tmpLog.debug("executionID - {0}".format(executionID)) + execIdMap[executionID] = {"lfns": bulkExtCommand[protocol]["lfn"], "groupStatus": "active"} + stdout = stdout.replace("\n", " ") + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: {0}".format(stdout)) tmpLog.debug("stderr: {0}".format(stderr)) if executionID is not 
None: @@ -158,20 +158,20 @@ def trigger_preparation(self, jobspec): core_utils.dump_error_message(tmpLog) allDone = False # keep execution ID to check later - tmpLog.debug('execIdMap : {0}'.format(execIdMap)) + tmpLog.debug("execIdMap : {0}".format(execIdMap)) if execIdMap: jobspec.set_groups_to_files(execIdMap) # done if allDone: - tmpLog.debug('succeeded') - return True, '' + tmpLog.debug("succeeded") + return True, "" else: - errMsg = 'failed' + errMsg = "failed" tmpLog.error(errMsg) # check attemptNr for tmpFileSpec in jobspec.inFiles: if tmpFileSpec.attemptNr >= self.maxAttempts: - errMsg = 'gave up due to max attempts' + errMsg = "gave up due to max attempts" tmpLog.error(errMsg) return (False, errMsg) return None, errMsg @@ -179,42 +179,41 @@ def trigger_preparation(self, jobspec): # check status def check_stage_in_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_in_status') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_in_status") + tmpLog.debug("start") allDone = True - errMsg = '' + errMsg = "" transferGroups = jobspec.get_groups_of_input_files(skip_ready=True) for tmpGroupID in transferGroups: if tmpGroupID is None: continue - tmpGroupID_parts = tmpGroupID.split(':', 2) - tmpLog.debug('transfer group ID : {0} components: {1}'.format(tmpGroupID, tmpGroupID_parts)) - protocol, executionID, dst = tmpGroupID.split(':', 2) + tmpGroupID_parts = tmpGroupID.split(":", 2) + tmpLog.debug("transfer group ID : {0} components: {1}".format(tmpGroupID, tmpGroupID_parts)) + protocol, executionID, dst = tmpGroupID.split(":", 2) args = [] - for arg in self.externalCommand[protocol]['check']['args']: - if arg == '{id}': + for arg in self.externalCommand[protocol]["check"]["args"]: + if arg == "{id}": arg = executionID - elif arg == '{dst}': + elif arg == "{dst}": arg = dst args.append(arg) # execute try: - tmpLog.debug('executing external command: ' + ' '.join(args)) + tmpLog.debug("executing external command: " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is None: - stdout = '' + stdout = "" if stderr is None: - stderr = '' - stdout = stdout.replace('\n', ' ') - stderr = stderr.replace('\n', ' ') + stderr = "" + stdout = stdout.replace("\n", " ") + stderr = stderr.replace("\n", " ") tmpLog.debug("return_code: {0}".format(return_code)) tmpLog.debug("stdout: {0}".format(stdout)) tmpLog.debug("stderr: {0}".format(stderr)) if return_code != 0: - errMsg = '{0} is not ready'.format(tmpGroupID) + errMsg = "{0} is not ready".format(tmpGroupID) allDone = False break except Exception: @@ -225,21 +224,20 @@ def check_stage_in_status(self, jobspec): tmpLog.debug("check_stage_in_status: Return : None errMsg : {0}".format(errMsg)) return None, errMsg tmpLog.debug("check_stage_in_status: Return : True") - return True, '' + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='resolve_input_paths') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="resolve_input_paths") pathInfo = dict() for tmpFileSpec in jobspec.inFiles: url = tmpFileSpec.lfn accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) - 
pathInfo[tmpFileSpec.lfn] = {'path': accPath} - tmpLog.debug('lfn: {0} scope : {1} accPath : {2} pathInfo : {3}'.format(url, tmpFileSpec.scope, accPath, pathInfo)) + pathInfo[tmpFileSpec.lfn] = {"path": accPath} + tmpLog.debug("lfn: {0} scope : {1} accPath : {2} pathInfo : {3}".format(url, tmpFileSpec.scope, accPath, pathInfo)) jobspec.set_input_file_paths(pathInfo) - return True, '' + return True, "" # make local access path def make_local_access_path(self, scope, lfn): @@ -248,23 +246,21 @@ def make_local_access_path(self, scope, lfn): # run the command to create the image def make_image(self, jobspec, args): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='make_image') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="make_image") + tmpLog.debug("start") return_code = 1 try: - tmpLog.debug('executing ' + ' '.join(args)) + tmpLog.debug("executing " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is not None: - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: {0}".format(stdout)) tmpLog.debug("stderr: [0}".format(stderr)) except Exception: core_utils.dump_error_message(tmpLog) - tmpLog.debug('end with return code {0}'.format(return_code)) + tmpLog.debug("end with return code {0}".format(return_code)) return return_code - diff --git a/pandaharvester/harvesterpreparator/aux_preparator.py b/pandaharvester/harvesterpreparator/aux_preparator.py index 4fd306cb..b75e8777 100644 --- a/pandaharvester/harvesterpreparator/aux_preparator.py +++ b/pandaharvester/harvesterpreparator/aux_preparator.py @@ -3,11 +3,11 @@ from pandaharvester.harvestercore import core_utils # logger -baseLogger = core_utils.setup_logger('aux_preparator') +baseLogger = core_utils.setup_logger("aux_preparator") analysis_aux_preparator.baseLogger = baseLogger # preparator plugin for auxiliary inputs -class AuxPreparator (AnalysisAuxPreparator): - pass \ No newline at end of file +class AuxPreparator(AnalysisAuxPreparator): + pass diff --git a/pandaharvester/harvesterpreparator/dummy_bulk_preparator.py b/pandaharvester/harvesterpreparator/dummy_bulk_preparator.py index 403415d2..9e53cdfc 100644 --- a/pandaharvester/harvesterpreparator/dummy_bulk_preparator.py +++ b/pandaharvester/harvesterpreparator/dummy_bulk_preparator.py @@ -8,10 +8,10 @@ from pandaharvester.harvesterconfig import harvester_config # dummy transfer identifier -dummy_transfer_id_base = 'dummy_id_for_in' +dummy_transfer_id_base = "dummy_id_for_in" # logger -_logger = core_utils.setup_logger('dummy_bulk_preparator') +_logger = core_utils.setup_logger("dummy_bulk_preparator") # lock to get a unique ID uLock = threading.Lock() @@ -28,7 +28,7 @@ def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) with uLock: global uID - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, uID) + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, uID) uID += 1 uID %= harvester_config.preparator.nThreads @@ -39,11 +39,8 @@ def trigger_preparation(self, jobspec): lfns = inFiles.keys() for inLFN in inFiles.keys(): lfns.append(inLFN) - jobspec.set_groups_to_files({self.dummy_transfer_id: {'lfns': lfns, - 'groupStatus': 'pending'} - } - ) - return 
True, '' + jobspec.set_groups_to_files({self.dummy_transfer_id: {"lfns": lfns, "groupStatus": "pending"}}) + return True, "" # check status def check_stage_in_status(self, jobspec): @@ -55,7 +52,7 @@ def check_stage_in_status(self, jobspec): locked = self.dbInterface.get_object_lock(self.dummy_transfer_id, lock_interval=120) if not locked: # escape since locked by another thread - msgStr = 'escape since locked by another thread' + msgStr = "escape since locked by another thread" return None, msgStr # refresh group information since that could have been updated by another thread before getting the lock self.dbInterface.refresh_file_group_info(jobspec) @@ -63,21 +60,20 @@ def check_stage_in_status(self, jobspec): groups = jobspec.get_groups_of_input_files(skip_ready=True) # the dummy transfer ID is still there if self.dummy_transfer_id in groups: - groupUpdateTime = groups[self.dummy_transfer_id]['groupUpdateTime'] + groupUpdateTime = groups[self.dummy_transfer_id]["groupUpdateTime"] # get files with the dummy transfer ID across jobs fileSpecs = self.dbInterface.get_files_with_group_id(self.dummy_transfer_id) # submit transfer if there are more than 10 files or the group was made before more than 10 min. # those thresholds may be config params. - if len(fileSpecs) >= 10 or \ - groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): + if len(fileSpecs) >= 10 or groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): # submit transfer and get a real transfer ID # ... transferID = str(uuid.uuid4()) # set the real transfer ID - self.dbInterface.set_file_group(fileSpecs, transferID, 'running') - msgStr = 'real transfer submitted with ID={0}'.format(transferID) + self.dbInterface.set_file_group(fileSpecs, transferID, "running") + msgStr = "real transfer submitted with ID={0}".format(transferID) else: - msgStr = 'wait until enough files are pooled with {0}'.format(self.dummy_transfer_id) + msgStr = "wait until enough files are pooled with {0}".format(self.dummy_transfer_id) # release the lock self.dbInterface.release_object_lock(self.dummy_transfer_id) # return None to retry later @@ -88,8 +84,8 @@ def check_stage_in_status(self, jobspec): # ... 
# then update transfer status if successful for transferID, transferInfo in iteritems(groups): - jobspec.update_group_status_in_files(transferID, 'done') - return True, '' + jobspec.update_group_status_in_files(transferID, "done") + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): @@ -106,7 +102,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = 'dummypath/{0}'.format(inLFN) + inFile["path"] = "dummypath/{0}".format(inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/dummy_preparator.py b/pandaharvester/harvesterpreparator/dummy_preparator.py index f6271b95..615315ec 100644 --- a/pandaharvester/harvesterpreparator/dummy_preparator.py +++ b/pandaharvester/harvesterpreparator/dummy_preparator.py @@ -5,7 +5,7 @@ from pandaharvester.harvestercore import core_utils # logger -_logger = core_utils.setup_logger('dummy_preparator') +_logger = core_utils.setup_logger("dummy_preparator") # dummy plugin for preparator @@ -54,7 +54,7 @@ def trigger_preparation(self, jobspec): # -- set transfer ID which are used for later lookup # jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}}) # tmpLog.debug('done') - return True, '' + return True, "" # check status def check_stage_in_status(self, jobspec): @@ -78,7 +78,7 @@ def check_stage_in_status(self, jobspec): # -- update transfer status # for transferID, transferInfo in iteritems(transferGroups): # jobspec.update_group_status_in_files(transferID, 'done') - return True, '' + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): @@ -96,7 +96,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # -- set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = 'dummypath/{0}'.format(inLFN) + inFile["path"] = "dummypath/{0}".format(inLFN) # -- set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/go_bulk_preparator.py b/pandaharvester/harvesterpreparator/go_bulk_preparator.py index 614fb9ac..f9d01601 100644 --- a/pandaharvester/harvesterpreparator/go_bulk_preparator.py +++ b/pandaharvester/harvesterpreparator/go_bulk_preparator.py @@ -1,3 +1,4 @@ +from pandaharvester.harvestermisc import globus_utils import time import datetime import uuid @@ -17,9 +18,10 @@ # TO BE REMOVED for python2.7 import requests.packages.urllib3 + try: requests.packages.urllib3.disable_warnings() -except: +except BaseException: pass from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase @@ -29,7 +31,7 @@ # Define dummy transfer identifier -dummy_transfer_id_base = 'dummy_id_for_in' +dummy_transfer_id_base = "dummy_id_for_in" # lock to get a unique ID uLock = threading.Lock() @@ -37,18 +39,18 @@ uID = 0 # logger -_logger = core_utils.setup_logger('go_bulk_preparator') +_logger = core_utils.setup_logger("go_bulk_preparator") -from pandaharvester.harvestermisc import globus_utils def validate_transferid(transferid): - tmptransferid = transferid.replace('-','') + tmptransferid = transferid.replace("-", "") return all(c in string.hexdigits for c in tmptransferid) + def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) + for attr in dir(obj): + if 
hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) # Globus plugin for stager with bulk transfers. For JobSpec and DBInterface methods, see @@ -56,60 +58,60 @@ def dump(obj): class GlobusBulkPreparator(PluginBase): next_id = 0 # constructor + def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # make logger - tmpLog = self.make_logger(_logger, 'ThreadID={0}'.format(threading.current_thread().ident), - method_name='GlobusBulkPreparator __init__ {} ') - tmpLog.debug('__init__ start') + tmpLog = self.make_logger(_logger, "ThreadID={0}".format(threading.current_thread().ident), method_name="GlobusBulkPreparator __init__ {} ") + tmpLog.debug("__init__ start") self.thread_id = threading.current_thread().ident self.id = GlobusBulkPreparator.next_id GlobusBulkPreparator.next_id += 1 with uLock: global uID - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX') + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, "XXXX") uID += 1 uID %= harvester_config.preparator.nThreads # create Globus Transfer Client try: self.tc = None # need to get client_id and refresh_token from PanDA server via harvester cache mechanism - tmpLog.debug('about to call dbInterface.get_cache(globus_secret)') - c_data = self.dbInterface.get_cache('globus_secret') - if (not c_data == None) and c_data.data['StatusCode'] == 0 : - tmpLog.debug('Got the globus_secrets from PanDA') - self.client_id = c_data.data['publicKey'] # client_id - self.refresh_token = c_data.data['privateKey'] # refresh_token - tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token) + tmpLog.debug("about to call dbInterface.get_cache(globus_secret)") + c_data = self.dbInterface.get_cache("globus_secret") + if (c_data is not None) and c_data.data["StatusCode"] == 0: + tmpLog.debug("Got the globus_secrets from PanDA") + self.client_id = c_data.data["publicKey"] # client_id + self.refresh_token = c_data.data["privateKey"] # refresh_token + tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog, self.client_id, self.refresh_token) if not tmpStat: self.tc = None - errStr = 'failed to create Globus Transfer Client' + errStr = "failed to create Globus Transfer Client" tmpLog.error(errStr) - else : + else: self.client_id = None self.refresh_token = None self.tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' + errStr = "failed to get Globus Client ID and Refresh Token" tmpLog.error(errStr) - except: + except BaseException: core_utils.dump_error_message(tmpLog) # tmp debugging - tmpLog.debug('self.id = {0}'.format(self.id)) - tmpLog.debug('self.dummy_transfer_id = {0}'.format(self.dummy_transfer_id)) + tmpLog.debug("self.id = {0}".format(self.id)) + tmpLog.debug("self.dummy_transfer_id = {0}".format(self.dummy_transfer_id)) # tmp debugging - tmpLog.debug('__init__ finish') - + tmpLog.debug("__init__ finish") # get dummy_transfer_id + def get_dummy_transfer_id(self): return self.dummy_transfer_id # set dummy_transfer_id for testing - def set_dummy_transfer_id_testing(self,dummy_transfer_id): + def set_dummy_transfer_id_testing(self, dummy_transfer_id): self.dummy_transfer_id = dummy_transfer_id - # set FileSpec.status - def set_FileSpec_status(self,jobspec,status): + # set FileSpec.status + def set_FileSpec_status(self, jobspec, status): # loop over all input files for fileSpec in jobspec.inFiles: fileSpec.status = status @@ -117,121 +119,134 @@ def set_FileSpec_status(self,jobspec,status): # check status def 
check_stage_in_status(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='check_stage_in_status') - tmpLog.debug('start') + tmpLog = self.make_logger( + _logger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="check_stage_in_status" + ) + tmpLog.debug("start") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # show the dummy transfer id and set to a value with the jobspec.computingSite if needed. - tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id)) - if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') : + tmpLog.debug("self.dummy_transfer_id = {}".format(self.dummy_transfer_id)) + if self.dummy_transfer_id == "{0}_{1}".format(dummy_transfer_id_base, "XXXX"): old_dummy_transfer_id = self.dummy_transfer_id - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.computingSite) - tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id)) - + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, jobspec.computingSite) + tmpLog.debug("Change self.dummy_transfer_id from {0} to {1}".format(old_dummy_transfer_id, self.dummy_transfer_id)) + # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # set flag if have db lock - have_db_lock = False + have_db_lock = False queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # test we have a Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # set transferID to None transferID = None # get transfer groups groups = jobspec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_input_files() = : {0}".format(groups)) # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests for dummy_transferID in groups: # skip if valid transfer ID not dummy one - if validate_transferid(dummy_transferID) : + if validate_transferid(dummy_transferID): continue # lock for 120 sec - tmpLog.debug('attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) + tmpLog.debug( + "attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}".format( + self.id, self.dummy_transfer_id, dummy_transferID + ) + ) have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120) - tmpLog.debug(' DB lock result - {0}'.format(have_db_lock)) + tmpLog.debug(" DB lock result - {0}".format(have_db_lock)) if not have_db_lock: # escape since locked by another thread - msgStr = 'escape since locked by another thread' + msgStr = "escape since locked by another thread" tmpLog.debug(msgStr) return None, msgStr 
# refresh group information since that could have been updated by another thread before getting the lock - tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)') + tmpLog.debug("self.dbInterface.refresh_file_group_info(jobspec)") self.dbInterface.refresh_file_group_info(jobspec) - tmpLog.debug('after self.dbInterface.refresh_file_group_info(jobspec)') + tmpLog.debug("after self.dbInterface.refresh_file_group_info(jobspec)") # get transfer groups again with refreshed info - tmpLog.debug('groups = jobspec.get_groups_of_input_files(skip_ready=True)') + tmpLog.debug("groups = jobspec.get_groups_of_input_files(skip_ready=True)") groups = jobspec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('after db lock and refresh - jobspec.get_groups_of_input_files(skip_ready=True) = : {0}'.format(groups)) + tmpLog.debug("after db lock and refresh - jobspec.get_groups_of_input_files(skip_ready=True) = : {0}".format(groups)) # the dummy transfer ID is still there if dummy_transferID in groups: - groupUpdateTime = groups[dummy_transferID]['groupUpdateTime'] + groupUpdateTime = groups[dummy_transferID]["groupUpdateTime"] # get files with the dummy transfer ID across jobs fileSpecs_allgroups = self.dbInterface.get_files_with_group_id(dummy_transferID) - msgStr = 'dummy_transferID = {0} self.dbInterface.get_files_with_group_id(dummy_transferID) number of files = {1}'.format(dummy_transferID,len(fileSpecs_allgroups)) + msgStr = "dummy_transferID = {0} self.dbInterface.get_files_with_group_id(dummy_transferID) number of files = {1}".format( + dummy_transferID, len(fileSpecs_allgroups) + ) tmpLog.debug(msgStr) fileSpecs = jobspec.get_input_file_specs(dummy_transferID, skip_ready=True) - msgStr = 'dummy_transferID = {0} jobspec.get_input_file_specs(dummy_transferID,skip_ready=True) number of files = {1}'.format(dummy_transferID,len(fileSpecs)) + msgStr = "dummy_transferID = {0} jobspec.get_input_file_specs(dummy_transferID,skip_ready=True) number of files = {1}".format( + dummy_transferID, len(fileSpecs) + ) tmpLog.debug(msgStr) # submit transfer if there are more than 10 files or the group was made before more than 10 min - if len(fileSpecs) >= 10 or \ - groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): - tmpLog.debug('prepare to transfer files') + if len(fileSpecs) >= 10 or groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): + tmpLog.debug("prepare to transfer files") # submit transfer and get a real transfer ID - # set the Globus destination Endpoint id and path will get them from Agis eventually - self.Globus_srcPath = queueConfig.preparator['Globus_srcPath'] - self.srcEndpoint = queueConfig.preparator['srcEndpoint'] + # set the Globus destination Endpoint id and path will get them from Agis eventually + self.Globus_srcPath = queueConfig.preparator["Globus_srcPath"] + self.srcEndpoint = queueConfig.preparator["srcEndpoint"] self.Globus_dstPath = self.basePath - #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath'] - self.dstEndpoint = queueConfig.preparator['dstEndpoint'] - # Test the endpoints and create the transfer data class + # self.Globus_dstPath = queueConfig.preparator['Globus_dstPath'] + self.dstEndpoint = queueConfig.preparator["dstEndpoint"] + # Test the endpoints and create the transfer data class errMsg = None try: # Test endpoints for activation - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint) - tmpStatdst, dstStr = 
globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint) + tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint) + tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint) if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' + errStr = "source Endpoint and destination Endpoint activated" tmpLog.debug(errStr) else: - errMsg = '' - if not tmpStatsrc : - errMsg += ' source Endpoint not activated ' - if not tmpStatdst : - errMsg += ' destination Endpoint not activated ' + errMsg = "" + if not tmpStatsrc: + errMsg += " source Endpoint not activated " + if not tmpStatdst: + errMsg += " destination Endpoint not activated " # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) + tmpLog.debug( + "attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}".format( + self.id, self.dummy_transfer_id, dummy_transferID + ) + ) have_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not have_db_lock: - errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) - tmpRetVal = (None,errMsg) + tmpRetVal = (None, errMsg) return tmpRetVal # both endpoints activated now prepare to transfer data tdata = None - tdata = TransferData(self.tc, - self.srcEndpoint, - self.dstEndpoint, - sync_level="exists") -# sync_level="checksum") - tmpLog.debug('size of tdata[DATA] - {}'.format(len(tdata['DATA']))) + tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="exists") + # sync_level="checksum") + tmpLog.debug("size of tdata[DATA] - {}".format(len(tdata["DATA"]))) - except: + except BaseException: errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}'.format(self.id,self.dummy_transfer_id,dummy_transferID)) + tmpLog.debug( + "attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}, dummy_transferID - {2}".format( + self.id, self.dummy_transfer_id, dummy_transferID + ) + ) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: - errMsg += ' - Could not release DB lock for {}'.format(self.dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(self.dummy_transferID) tmpLog.error(errMsg) tmpRetVal = (errStat, errMsg) return tmpRetVal @@ -239,205 +254,209 @@ def check_stage_in_status(self, jobspec): ifile = 0 for fileSpec in fileSpecs: # only print to log file first 25 files - if ifile < 25 : + if ifile < 25: msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) - if ifile == 25 : + if ifile == 25: msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) # end debug log file test - scope = 'panda' - if fileSpec.scope is not None : + scope = "panda" + if fileSpec.scope is not None: scope = fileSpec.scope hash = hashlib.md5() if sys.version_info.major == 2: - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) if sys.version_info.major == 3: hash_string = "{0}:{1}".format(scope, fileSpec.lfn) - 
hash.update(bytes(hash_string, 'utf-8')) + hash.update(bytes(hash_string, "utf-8")) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - #srcURL = fileSpec.path - srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_srcPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) + correctedscope = "/".join(scope.split(".")) + # srcURL = fileSpec.path + srcURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.Globus_srcPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.Globus_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) # add files to transfer object - tdata - if ifile < 25 : - tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL)) - tdata.add_item(srcURL,dstURL) + if ifile < 25: + tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL)) + tdata.add_item(srcURL, dstURL) ifile += 1 - # submit transfer - tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA']))) + # submit transfer + tmpLog.debug("Number of files to transfer - {}".format(len(tdata["DATA"]))) try: transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) - if transfer_result['code'] == "Accepted": + if transfer_result["code"] == "Accepted": # succeeded # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - tmpLog.debug('successfully submitted id={0}'.format(transferID)) + transferID = transfer_result["task_id"] + tmpLog.debug("successfully submitted id={0}".format(transferID)) # set status for files - self.dbInterface.set_file_group(fileSpecs, transferID, 'running') - msgStr = 'submitted transfer with ID={0}'.format(transferID) + self.dbInterface.set_file_group(fileSpecs, transferID, "running") + msgStr = "submitted transfer with ID={0}".format(transferID) tmpLog.debug(msgStr) else: # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('Released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("Released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) have_db_lock = False else: - errMsg = 'Could not release DB lock for {}'.format(dummy_transferID) + errMsg = "Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) - tmpRetVal = (None, transfer_result['message']) + tmpRetVal = (None, transfer_result["message"]) return tmpRetVal except Exception as e: - errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) 
release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('Released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("Released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) have_db_lock = False - else : - errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) + else: + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) return errStat, errMsg else: - msgStr = 'wait until enough files are pooled' + msgStr = "wait until enough files are pooled" tmpLog.debug(msgStr) # release the lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - have_db_lock = False + tmpLog.debug("released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + have_db_lock = False else: - msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) + msgStr += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(msgStr) # return None to retry later return None, msgStr # release the db lock if needed if have_db_lock: - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - have_db_lock = False + tmpLog.debug("released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + have_db_lock = False else: - msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) + msgStr += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(msgStr) return None, msgStr # check transfer with real transfer IDs - # get transfer groups + # get transfer groups tmpLog.debug("groups = jobspec.get_groups_of_input_files(skip_ready=True)") groups = jobspec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('Number of transfer groups (skip_ready)- {0}'.format(len(groups))) - tmpLog.debug('transfer groups any state (skip_ready)- {0}'.format(groups)) + tmpLog.debug("Number of transfer groups (skip_ready)- {0}".format(len(groups))) + tmpLog.debug("transfer groups any state (skip_ready)- {0}".format(groups)) tmpLog.debug("groups = jobspec.get_groups_of_input_files()") groups = jobspec.get_groups_of_input_files() - tmpLog.debug('Number of transfer groups - {0}'.format(len(groups))) - tmpLog.debug('transfer groups any state - {0}'.format(groups)) + tmpLog.debug("Number of transfer groups - {0}".format(len(groups))) + tmpLog.debug("transfer groups any state - {0}".format(groups)) tmpLog.debug("groups = jobspec.get_groups_of_input_files(skip_ready=True)") groups = 
jobspec.get_groups_of_input_files(skip_ready=True) if len(groups) == 0: tmpLog.debug("jobspec.get_groups_of_input_files(skip_ready=True) returned no files ") tmpLog.debug("check_stage_in_status return status - True ") - return True,'' + return True, "" for transferID in groups: # allow only valid UUID - if validate_transferid(transferID) : + if validate_transferid(transferID): # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID) + tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID) # return a temporary error when failed to get task if not tmpStat: - errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc),str(transferID)) + errStr = "failed to get transfer task; tc = %s; transferID = %s" % (str(self.tc), str(transferID)) tmpLog.error(errStr) return None, errStr - # return a temporary error when task is missing + # return a temporary error when task is missing if transferID not in transferTasks: - errStr = 'transfer task ID - {} is missing'.format(transferID) + errStr = "transfer task ID - {} is missing".format(transferID) tmpLog.error(errStr) return None, errStr # succeeded in finding a transfer task by tranferID - if transferTasks[transferID]['status'] == 'SUCCEEDED': - tmpLog.debug('transfer task {} succeeded'.format(transferID)) - self.set_FileSpec_status(jobspec,'finished') - return True, '' + if transferTasks[transferID]["status"] == "SUCCEEDED": + tmpLog.debug("transfer task {} succeeded".format(transferID)) + self.set_FileSpec_status(jobspec, "finished") + return True, "" # failed - if transferTasks[transferID]['status'] == 'FAILED': - errStr = 'transfer task {} failed'.format(transferID) + if transferTasks[transferID]["status"] == "FAILED": + errStr = "transfer task {} failed".format(transferID) tmpLog.error(errStr) - self.set_FileSpec_status(jobspec,'failed') + self.set_FileSpec_status(jobspec, "failed") return False, errStr # another status - tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) + tmpStr = "transfer task {0} status: {1}".format(transferID, transferTasks[transferID]["status"]) tmpLog.debug(tmpStr) return None, tmpStr # end of loop over transfer groups - tmpLog.debug('End of loop over transfers groups - ending check_stage_in_status function') - return None,'no valid transfer id found' + tmpLog.debug("End of loop over transfers groups - ending check_stage_in_status function") + return None, "no valid transfer id found" + # trigger preparation + def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger( + _logger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="trigger_preparation" + ) + tmpLog.debug("start") # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # test we have a 
Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # show the dummy transfer id and set to a value with the computingSite if needed. - tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id)) - if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') : + tmpLog.debug("self.dummy_transfer_id = {}".format(self.dummy_transfer_id)) + if self.dummy_transfer_id == "{0}_{1}".format(dummy_transfer_id_base, "XXXX"): old_dummy_transfer_id = self.dummy_transfer_id - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.computingSite) - tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id)) + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, jobspec.computingSite) + tmpLog.debug("Change self.dummy_transfer_id from {0} to {1}".format(old_dummy_transfer_id, self.dummy_transfer_id)) # set the dummy transfer ID which will be replaced with a real ID in check_stage_in_status() inFiles = jobspec.get_input_file_attributes(skip_ready=True) lfns = list(inFiles.keys()) - #for inLFN in inFiles.keys(): + # for inLFN in inFiles.keys(): # lfns.append(inLFN) - tmpLog.debug('number of lfns - {0} type(lfns) - {1}'.format(len(lfns),type(lfns))) - jobspec.set_groups_to_files({self.dummy_transfer_id: {'lfns': lfns,'groupStatus': 'pending'}}) + tmpLog.debug("number of lfns - {0} type(lfns) - {1}".format(len(lfns), type(lfns))) + jobspec.set_groups_to_files({self.dummy_transfer_id: {"lfns": lfns, "groupStatus": "pending"}}) if len(lfns) < 10: - msgStr = 'jobspec.set_groups_to_files - self.dummy_tranfer_id - {0}, lfns - {1}, groupStatus - pending'.format(self.dummy_transfer_id,lfns) + msgStr = "jobspec.set_groups_to_files - self.dummy_transfer_id - {0}, lfns - {1}, groupStatus - pending".format(self.dummy_transfer_id, lfns) else: tmp_lfns = lfns[:10] - msgStr = 'jobspec.set_groups_to_files - self.dummy_tranfer_id - {0}, lfns (first 25) - {1}, groupStatus - pending'.format(self.dummy_transfer_id,tmp_lfns) + msgStr = "jobspec.set_groups_to_files - self.dummy_transfer_id - {0}, lfns (first 10) - {1}, groupStatus - pending".format( + self.dummy_transfer_id, tmp_lfns + ) tmpLog.debug(msgStr) fileSpec_list = jobspec.get_input_file_specs(self.dummy_transfer_id, skip_ready=True) - tmpLog.debug('call jobspec.get_input_file_specs({0}, skip_ready=True) num files returned = {1}'.format(self.dummy_transfer_id,len(fileSpec_list))) - tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_input_file_specs(self.dummy_transfer_id,skip_ready=True),self.dummy_transfer_id,pending)') - tmpStat = self.dbInterface.set_file_group(fileSpec_list,self.dummy_transfer_id,'pending') - msgStr = 'called self.dbInterface.set_file_group(jobspec.get_input_file_specs(self.dummy_transfer_id,skip_ready=True),self.dummy_transfer_id,pending) return Status {}'.format(tmpStat) + tmpLog.debug("call jobspec.get_input_file_specs({0}, skip_ready=True) num files returned = {1}".format(self.dummy_transfer_id, len(fileSpec_list))) + tmpLog.debug( + "call self.dbInterface.set_file_group(jobspec.get_input_file_specs(self.dummy_transfer_id,skip_ready=True),self.dummy_transfer_id,pending)" + ) + tmpStat = self.dbInterface.set_file_group(fileSpec_list, self.dummy_transfer_id, "pending") + msgStr = "called 
self.dbInterface.set_file_group(jobspec.get_input_file_specs(self.dummy_transfer_id,skip_ready=True),self.dummy_transfer_id,pending) return Status {}".format( + tmpStat + ) tmpLog.debug(msgStr) - return True, '' - + return True, "" # make label for transfer task + def make_label(self, jobspec): - return "IN-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, - PandaID=jobspec.PandaID) + return "IN-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, PandaID=jobspec.PandaID) # resolve input file paths def resolve_input_paths(self, jobspec): @@ -445,10 +464,9 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" # Globus specific commands - diff --git a/pandaharvester/harvesterpreparator/go_preparator.py b/pandaharvester/harvesterpreparator/go_preparator.py index ee674b5b..f5372030 100644 --- a/pandaharvester/harvesterpreparator/go_preparator.py +++ b/pandaharvester/harvesterpreparator/go_preparator.py @@ -1,3 +1,4 @@ +from pandaharvester.harvestermisc import globus_utils import sys import os from future.utils import iteritems @@ -14,14 +15,11 @@ # logger _logger = core_utils.setup_logger() -from pandaharvester.harvestermisc import globus_utils - def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) # preparator with Globus Online @@ -30,181 +28,172 @@ class GoPreparator(PluginBase): def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # create Globus Transfer Client - tmpLog = self.make_logger(_logger, method_name='GoPreparator __init__ ') + tmpLog = self.make_logger(_logger, method_name="GoPreparator __init__ ") try: self.tc = None # need to get client_id and refresh_token from PanDA server via harvester cache mechanism - tmpLog.debug('about to call dbInterface.get_cache(globus_secret)') - c_data = self.dbInterface.get_cache('globus_secret') - if (not c_data == None) and c_data.data['StatusCode'] == 0 : - tmpLog.debug('Got the globus_secrets from PanDA') - self.client_id = c_data.data['publicKey'] # client_id - self.refresh_token = c_data.data['privateKey'] # refresh_token - tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token) - if not tmpStat: - self.tc = None - errStr = 'failed to create Globus Transfer Client' - tmpLog.error(errStr) - else : - self.client_id = None - self.refresh_token = None - self.tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' - tmpLog.error(errStr) - except: + tmpLog.debug("about to call dbInterface.get_cache(globus_secret)") + c_data = self.dbInterface.get_cache("globus_secret") + if (c_data is not None) and c_data.data["StatusCode"] == 0: + tmpLog.debug("Got the globus_secrets from PanDA") + self.client_id = c_data.data["publicKey"] # client_id + self.refresh_token = c_data.data["privateKey"] # refresh_token + tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog, self.client_id, self.refresh_token) + if not tmpStat: + self.tc = None + errStr = "failed to create Globus Transfer Client" + tmpLog.error(errStr) + 
else: + self.client_id = None + self.refresh_token = None + self.tc = None + errStr = "failed to get Globus Client ID and Refresh Token" + tmpLog.error(errStr) + except BaseException: core_utils.dump_error_message(tmpLog) - tmpLog.debug('__init__ finished') + tmpLog.debug("__init__ finished") # check status def check_stage_in_status(self, jobspec): # get logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_in_status') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_in_status") # get groups of input files except ones already in ready state transferGroups = jobspec.get_groups_of_input_files(skip_ready=True) - #print type(transferGroups)," ",transferGroups + # print type(transferGroups)," ",transferGroups # update transfer status # get label label = self.make_label(jobspec) - tmpLog.debug('label={0}'.format(label)) + tmpLog.debug("label={0}".format(label)) # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label) + tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label) # return a temporary error when failed to get task if not tmpStat: - errStr = 'failed to get transfer task' + errStr = "failed to get transfer task" tmpLog.error(errStr) return None, errStr # return a fatal error when task is missing # FIXME retry instead? if label not in transferTasks: - errStr = 'transfer task is missing' + errStr = "transfer task is missing" tmpLog.error(errStr) return False, errStr # succeeded - if transferTasks[label]['status'] == 'SUCCEEDED': - transferID = transferTasks[label]['task_id'] - jobspec.update_group_status_in_files(transferID, 'done') - tmpLog.debug('transfer task succeeded') - return True, '' + if transferTasks[label]["status"] == "SUCCEEDED": + transferID = transferTasks[label]["task_id"] + jobspec.update_group_status_in_files(transferID, "done") + tmpLog.debug("transfer task succeeded") + return True, "" # failed - if transferTasks[label]['status'] == 'FAILED': - errStr = 'transfer task failed' + if transferTasks[label]["status"] == "FAILED": + errStr = "transfer task failed" tmpLog.error(errStr) return False, errStr # another status - tmpStr = 'transfer task is in {0}'.format(transferTasks[label]['status']) + tmpStr = "transfer task is in {0}".format(transferTasks[label]["status"]) tmpLog.debug(tmpStr) - return None, '' + return None, "" # trigger preparation def trigger_preparation(self, jobspec): # get logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # test we have a Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # get label label = self.make_label(jobspec) - 
tmpLog.debug('label={0}'.format(label)) + tmpLog.debug("label={0}".format(label)) # get transfer tasks - tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label) + tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label) if not tmpStat: - errStr = 'failed to get transfer tasks' + errStr = "failed to get transfer tasks" tmpLog.error(errStr) return False, errStr # check if already queued if label in transferTasks: - tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label]))) - return True, '' - # set the Globus destination Endpoint id and path will get them from Agis eventually + tmpLog.debug("skip since already queued with {0}".format(str(transferTasks[label]))) + return True, "" + # set the Globus destination Endpoint id and path will get them from Agis eventually from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper + queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) - self.Globus_srcPath = queueConfig.preparator['Globus_srcPath'] - self.srcEndpoint = queueConfig.preparator['srcEndpoint'] + self.Globus_srcPath = queueConfig.preparator["Globus_srcPath"] + self.srcEndpoint = queueConfig.preparator["srcEndpoint"] self.Globus_dstPath = self.basePath - #self.Globus_dstPath = queueConfig.preparator['Globus_dstPath'] - self.dstEndpoint = queueConfig.preparator['dstEndpoint'] + # self.Globus_dstPath = queueConfig.preparator['Globus_dstPath'] + self.dstEndpoint = queueConfig.preparator["dstEndpoint"] # get input files files = [] lfns = [] inFiles = jobspec.get_input_file_attributes(skip_ready=True) for inLFN, inFile in iteritems(inFiles): # set path to each file - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) - dstpath = inFile['path'] + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) + dstpath = inFile["path"] # check if path exists if not create it. 
if not os.access(self.basePath, os.F_OK): os.makedirs(self.basePath) - # create the file paths for the Globus source and destination endpoints - Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile['scope'], inLFN) - Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile['scope'], inLFN) - files.append({'scope': inFile['scope'], - 'name': inLFN, - 'Globus_dstPath': Globus_dstpath, - 'Globus_srcPath': Globus_srcpath}) + # create the file paths for the Globus source and destination endpoints + Globus_srcpath = mover_utils.construct_file_path(self.Globus_srcPath, inFile["scope"], inLFN) + Globus_dstpath = mover_utils.construct_file_path(self.Globus_dstPath, inFile["scope"], inLFN) + files.append({"scope": inFile["scope"], "name": inLFN, "Globus_dstPath": Globus_dstpath, "Globus_srcPath": Globus_srcpath}) lfns.append(inLFN) - tmpLog.debug('files[] {0}'.format(files)) + tmpLog.debug("files[] {0}".format(files)) try: # Test endpoints for activation - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint) - tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint) + tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint) + tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint) if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' + errStr = "source Endpoint and destination Endpoint activated" tmpLog.debug(errStr) else: - errStr = '' - if not tmpStatsrc : - errStr += ' source Endpoint not activated ' - if not tmpStatdst : - errStr += ' destination Endpoint not activated ' + errStr = "" + if not tmpStatsrc: + errStr += " source Endpoint not activated " + if not tmpStatdst: + errStr += " destination Endpoint not activated " tmpLog.error(errStr) - return False,errStr + return False, errStr # both endpoints activated now prepare to transfer data if len(files) > 0: - tdata = TransferData(self.tc, - self.srcEndpoint, - self.dstEndpoint, - label=label, - sync_level="checksum") - # loop over all input files and add + tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum") + # loop over all input files and add for myfile in files: - tdata.add_item(myfile['Globus_srcPath'],myfile['Globus_dstPath']) + tdata.add_item(myfile["Globus_srcPath"], myfile["Globus_dstPath"]) # submit transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) - if transfer_result['code'] == "Accepted": + if transfer_result["code"] == "Accepted": # succeeded # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}}) - tmpLog.debug('done') - return True,'' + transferID = transfer_result["task_id"] + jobspec.set_groups_to_files({transferID: {"lfns": lfns, "groupStatus": "active"}}) + tmpLog.debug("done") + return True, "" else: - return False,transfer_result['message'] + return False, transfer_result["message"] # if no files to transfer return True - return True, 'No files to transfer' - except: - errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) + return True, "No files to transfer" + except BaseException: + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) return errStat, {} - # make label for transfer task + def make_label(self, jobspec): - return 
"IN-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, - PandaID=jobspec.PandaID) + return "IN-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, PandaID=jobspec.PandaID) # resolve input file paths def resolve_input_paths(self, jobspec): @@ -212,9 +201,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' - - + return True, "" diff --git a/pandaharvester/harvesterpreparator/gridftp_preparator.py b/pandaharvester/harvesterpreparator/gridftp_preparator.py index 08d7eaa3..ab6b04a2 100644 --- a/pandaharvester/harvesterpreparator/gridftp_preparator.py +++ b/pandaharvester/harvesterpreparator/gridftp_preparator.py @@ -1,5 +1,6 @@ import os import tempfile + try: import subprocess32 as subprocess except Exception: @@ -10,7 +11,7 @@ from pandaharvester.harvestermover import mover_utils # logger -baseLogger = core_utils.setup_logger('gridftp_preparator') +baseLogger = core_utils.setup_logger("gridftp_preparator") # preparator plugin with GridFTP @@ -33,6 +34,8 @@ "gulOpts": "-cred /tmp/x509_u1234 -sync -sync-level 3 -verify-checksum -v" } """ + + class GridFtpPreparator(PluginBase): # constructor def __init__(self, **kwarg): @@ -45,65 +48,61 @@ def __init__(self, **kwarg): # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") # loop over all inputs inFileInfo = jobspec.get_input_file_attributes() gucInput = None for tmpFileSpec in jobspec.inFiles: # construct source and destination paths - srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) - dstPath = mover_utils.construct_file_path(self.dstBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) + srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) + dstPath = mover_utils.construct_file_path(self.dstBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) # local access path - accPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) + accPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) if self.checkLocalPath: # check if already exits if os.path.exists(accPath): # calculate checksum checksum = core_utils.calc_adler32(accPath) - checksum = 'ad:{0}'.format(checksum) - if checksum == inFileInfo[tmpFileSpec.lfn]['checksum']: + checksum = "ad:{0}".format(checksum) + if checksum == inFileInfo[tmpFileSpec.lfn]["checksum"]: continue # make directories if needed if not os.path.isdir(os.path.dirname(accPath)): os.makedirs(os.path.dirname(accPath)) # make input for globus-url-copy if gucInput is None: - gucInput = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_guc_in.tmp') + gucInput = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_guc_in.tmp") gucInput.write("{0} {1}\n".format(srcPath, 
dstPath)) tmpFileSpec.attemptNr += 1 # nothing to transfer if gucInput is None: - tmpLog.debug('done with no transfers') - return True, '' + tmpLog.debug("done with no transfers") + return True, "" # transfer - tmpLog.debug('execute globus-url-copy') + tmpLog.debug("execute globus-url-copy") gucInput.close() - args = ['globus-url-copy', '-f', gucInput.name, '-cd'] + args = ["globus-url-copy", "-f", gucInput.name, "-cd"] if self.gulOpts is not None: args += self.gulOpts.split() try: - tmpLog.debug('execute: ' + ' '.join(args)) + tmpLog.debug("execute: " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: stdout, stderr = p.communicate(timeout=self.timeout) except subprocess.TimeoutExpired: p.kill() stdout, stderr = p.communicate() - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") return_code = p.returncode if stdout is not None: if not isinstance(stdout, str): stdout = stdout.decode() - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: if not isinstance(stderr, str): stderr = stderr.decode() - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) except Exception: @@ -111,22 +110,22 @@ def trigger_preparation(self, jobspec): return_code = 1 os.remove(gucInput.name) if return_code == 0: - tmpLog.debug('succeeded') - return True, '' + tmpLog.debug("succeeded") + return True, "" else: - errMsg = 'failed with {0}'.format(return_code) + errMsg = "failed with {0}".format(return_code) tmpLog.error(errMsg) # check attemptNr for tmpFileSpec in jobspec.inFiles: if tmpFileSpec.attemptNr >= self.maxAttempts: - errMsg = 'gave up due to max attempts' + errMsg = "gave up due to max attempts" tmpLog.error(errMsg) return (False, errMsg) return None, errMsg # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): @@ -134,8 +133,7 @@ def resolve_input_paths(self, jobspec): inFileInfo = jobspec.get_input_file_attributes() pathInfo = dict() for tmpFileSpec in jobspec.inFiles: - accPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) - pathInfo[tmpFileSpec.lfn] = {'path': accPath} + accPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) + pathInfo[tmpFileSpec.lfn] = {"path": accPath} jobspec.set_input_file_paths(pathInfo) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator.py b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator.py index b87bf5e0..0732a8b7 100644 --- a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator.py +++ b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator.py @@ -14,13 +14,14 @@ from pilot.info import infosys # logger -baseLogger = core_utils.setup_logger('pilotmover_mt_preparator') +baseLogger = core_utils.setup_logger("pilotmover_mt_preparator") # plugin for preparator based on Pilot2.0 Data API, MultipleThreads # Pilot 2.0 should be deployed as library # default self.basePath came from preparator section of configuration file + class PilotmoverMTPreparator(PluginBase): """ Praparator bring files from remote ATLAS/Rucio storage to local facility. 
@@ -35,11 +36,11 @@ def __init__(self, **kwarg): # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" def stage_in(self, tmpLog, jobspec, files): try: - tmpLog.debug('To stagein files[] {0}'.format(files)) + tmpLog.debug("To stagein files[] {0}".format(files)) # get infosys # infoservice = InfoService() # infoservice.init(jobspec.computingSite, infosys.confinfo, infosys.extinfo) @@ -48,13 +49,13 @@ def stage_in(self, tmpLog, jobspec, files): infosys.queuedata.direct_access_lan = False infosys.queuedata.direct_access_wan = False # set data client, always use rucio - data_client = data.StageInClient(infosys, acopytools={'default': ['rucio']}, default_copytools='rucio') + data_client = data.StageInClient(infosys, acopytools={"default": ["rucio"]}, default_copytools="rucio") allChecked = True - ErrMsg = 'These files failed to download : ' + ErrMsg = "These files failed to download : " if len(files) > 0: result = data_client.transfer(files, use_vp=False) - tmpLog.debug('pilot.api data.StageInClient.transfer(files) result: {0}'.format(result)) + tmpLog.debug("pilot.api data.StageInClient.transfer(files) result: {0}".format(result)) # loop over each file check result all must be true for entire result to be true if result: @@ -63,12 +64,12 @@ def stage_in(self, tmpLog, jobspec, files): allChecked = False ErrMsg = ErrMsg + (" %s " % answer.lfn) else: - tmpLog.info('Looks like all files already inplace: {0}'.format(files)) + tmpLog.info("Looks like all files already inplace: {0}".format(files)) # return - tmpLog.debug('stop thread') + tmpLog.debug("stop thread") if allChecked: - return True, '' + return True, "" else: return False, ErrMsg except Exception as ex: @@ -79,56 +80,57 @@ def stage_in(self, tmpLog, jobspec, files): # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") try: # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get input files files = [] inFiles = jobspec.get_input_file_attributes(skip_ready=True) # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) - tmpLog.debug('To check file: %s' % inFile) - if os.path.exists(inFile['path']): - checksum = core_utils.calc_adler32(inFile['path']) - checksum = 'ad:%s' % checksum - tmpLog.debug('checksum for file %s is %s' % (inFile['path'], checksum)) - if 'checksum' in inFile and inFile['checksum'] and inFile['checksum'] == checksum: - tmpLog.debug('File %s already exists at %s' % (inLFN, inFile['path'])) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) + tmpLog.debug("To check file: %s" % inFile) + if os.path.exists(inFile["path"]): + checksum = core_utils.calc_adler32(inFile["path"]) + checksum = "ad:%s" % checksum + tmpLog.debug("checksum for file %s 
is %s" % (inFile["path"], checksum)) + if "checksum" in inFile and inFile["checksum"] and inFile["checksum"] == checksum: + tmpLog.debug("File %s already exists at %s" % (inLFN, inFile["path"])) continue - dstpath = os.path.dirname(inFile['path']) + dstpath = os.path.dirname(inFile["path"]) # check if path exists if not create it. if not os.access(dstpath, os.F_OK): os.makedirs(dstpath) - file_data = {'scope': inFile['scope'], - 'dataset': inFile.get('dataset'), - 'lfn': inLFN, - 'ddmendpoint': inFile.get('endpoint'), - 'guid': inFile.get('guid'), - 'workdir': dstpath} - pilotfilespec = PilotFileSpec(type='input', **file_data) + file_data = { + "scope": inFile["scope"], + "dataset": inFile.get("dataset"), + "lfn": inLFN, + "ddmendpoint": inFile.get("endpoint"), + "guid": inFile.get("guid"), + "workdir": dstpath, + } + pilotfilespec = PilotFileSpec(type="input", **file_data) files.append(pilotfilespec) - tmpLog.debug('files[] {0}'.format(files)) + tmpLog.debug("files[] {0}".format(files)) allChecked = True - ErrMsg = 'These files failed to download : ' + ErrMsg = "These files failed to download : " if files: threads = [] n_files_per_thread = int((len(files) + self.n_threads - 1) / self.n_threads) - tmpLog.debug('num files per thread: %s' % n_files_per_thread) + tmpLog.debug("num files per thread: %s" % n_files_per_thread) for i in range(0, len(files), n_files_per_thread): - sub_files = files[i:i + n_files_per_thread] - thread = threading.Thread(target=self.stage_in, kwargs={'tmpLog': tmpLog, 'jobspec': jobspec, 'files': sub_files}) + sub_files = files[i : i + n_files_per_thread] + thread = threading.Thread(target=self.stage_in, kwargs={"tmpLog": tmpLog, "jobspec": jobspec, "files": sub_files}) threads.append(thread) [t.start() for t in threads] tmpLog.debug("threads: %s" % str(threads)) @@ -136,23 +138,23 @@ def trigger_preparation(self, jobspec): time.sleep(1) threads = [t for t in threads if t and t.is_alive()] - tmpLog.info('Checking all files: {0}'.format(files)) + tmpLog.info("Checking all files: {0}".format(files)) for file in files: if file.status_code != 0: allChecked = False ErrMsg = ErrMsg + (" %s " % file.lfn) for inLFN, inFile in iteritems(inFiles): - if not os.path.isfile(inFile['path']): + if not os.path.isfile(inFile["path"]): allChecked = False ErrMsg = ErrMsg + (" %s " % file.lfn) except Exception as ex: tmpLog.error(ex) tmpLog.error(traceback.format_exc()) # return - tmpLog.debug('stop') + tmpLog.debug("stop") if allChecked: - tmpLog.info('Looks like all files are successfully downloaded.') - return True, '' + tmpLog.info("Looks like all files are successfully downloaded.") + return True, "" else: return False, ErrMsg @@ -162,7 +164,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py index 0bef1361..f8bf995f 100644 --- a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py +++ b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py @@ -12,7 +12,7 @@ from pilot.info import infosys # logger -baseLogger = 
core_utils.setup_logger('pilotmover_mt_preparator_kari') +baseLogger = core_utils.setup_logger("pilotmover_mt_preparator_kari") # plugin for preparator based on Pilot2.0 Data API, MultipleThreads @@ -21,6 +21,7 @@ # Modified by FaHui Lin to be compatible with current pilot 2 code + class PilotmoverMTPreparator(PluginBase): """ Praparator bring files from remote ATLAS/Rucio storage to local facility. @@ -35,10 +36,10 @@ def __init__(self, **kwarg): # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" def stage_in(self, tmpLog, jobspec, files): - tmpLog.debug('To stagein files[] {0}'.format(files)) + tmpLog.debug("To stagein files[] {0}".format(files)) # get infosys # infoservice = InfoService() # infoservice.init(jobspec.computingSite, infosys.confinfo, infosys.extinfo) @@ -47,9 +48,9 @@ def stage_in(self, tmpLog, jobspec, files): infosys.queuedata.direct_access_lan = False infosys.queuedata.direct_access_wan = False # set data client, always use rucio - data_client = data.StageInClient(infosys, acopytools={'default': ['rucio']}, default_copytools='rucio') + data_client = data.StageInClient(infosys, acopytools={"default": ["rucio"]}, default_copytools="rucio") allChecked = True - ErrMsg = 'These files failed to download : ' + ErrMsg = "These files failed to download : " # change directory to basPath for input to pass pilot check_availablespace os.chdir(self.basePath) # transfer @@ -57,10 +58,10 @@ def stage_in(self, tmpLog, jobspec, files): try: result = data_client.transfer(files) except Exception as e: - tmpLog.error('error when stage_in: {0} ; {1}'.format(e.__class__.__name__, e)) + tmpLog.error("error when stage_in: {0} ; {1}".format(e.__class__.__name__, e)) raise else: - tmpLog.debug('pilot.api data.StageInClient.transfer(files) result: {0}'.format(result)) + tmpLog.debug("pilot.api data.StageInClient.transfer(files) result: {0}".format(result)) # loop over each file check result all must be true for entire result to be true if result: @@ -69,36 +70,35 @@ def stage_in(self, tmpLog, jobspec, files): allChecked = False ErrMsg = ErrMsg + (" %s " % answer.lfn) else: - tmpLog.info('Looks like all files already inplace: {0}'.format(files)) + tmpLog.info("Looks like all files already inplace: {0}".format(files)) # return - tmpLog.debug('stop thread') + tmpLog.debug("stop thread") if allChecked: - return True, '' + return True, "" else: return False, ErrMsg # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get input files files = [] inFiles = jobspec.get_input_file_attributes(skip_ready=True) # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) - tmpLog.debug('To check file: %s' % inFile) - 
if os.path.exists(inFile['path']): + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) + tmpLog.debug("To check file: %s" % inFile) + if os.path.exists(inFile["path"]): # checksum = core_utils.calc_adler32(inFile['path']) # checksum = 'ad:%s' % checksum # tmpLog.debug('checksum for file %s is %s' % (inFile['path'], checksum)) @@ -107,61 +107,64 @@ def trigger_preparation(self, jobspec): # continue # lazy but unsafe check to be faster... - file_size = os.stat(inFile['path']).st_size - tmpLog.debug('file size for file %s is %s' % (inFile['path'], file_size)) - if 'fsize' in inFile and inFile['fsize'] and inFile['fsize'] == file_size: - tmpLog.debug('File %s already exists at %s' % (inLFN, inFile['path'])) + file_size = os.stat(inFile["path"]).st_size + tmpLog.debug("file size for file %s is %s" % (inFile["path"], file_size)) + if "fsize" in inFile and inFile["fsize"] and inFile["fsize"] == file_size: + tmpLog.debug("File %s already exists at %s" % (inLFN, inFile["path"])) continue - dstpath = os.path.dirname(inFile['path']) + dstpath = os.path.dirname(inFile["path"]) # check if path exists if not create it. if not os.access(dstpath, os.F_OK): os.makedirs(dstpath) file_data = { - 'scope': inFile['scope'], - 'dataset': inFile.get('dataset'), - 'lfn': inLFN, - 'ddmendpoint': inFile.get('endpoint'), - 'guid': inFile.get('guid'), - 'workdir': dstpath, - } - pilotfilespec = PilotFileSpec(type='input', **file_data) + "scope": inFile["scope"], + "dataset": inFile.get("dataset"), + "lfn": inLFN, + "ddmendpoint": inFile.get("endpoint"), + "guid": inFile.get("guid"), + "workdir": dstpath, + } + pilotfilespec = PilotFileSpec(type="input", **file_data) files.append(pilotfilespec) # tmpLog.debug('files[] {0}'.format(files)) - tmpLog.debug('path set') + tmpLog.debug("path set") allChecked = True - ErrMsg = 'These files failed to download : ' + ErrMsg = "These files failed to download : " if files: threads = [] n_files_per_thread = (len(files) + self.n_threads - 1) // self.n_threads - tmpLog.debug('num files per thread: %s' % n_files_per_thread) + tmpLog.debug("num files per thread: %s" % n_files_per_thread) for i in range(0, len(files), n_files_per_thread): - sub_files = files[i:i + n_files_per_thread] - thread = threading.Thread(target=self.stage_in, kwargs={ - 'tmpLog': tmpLog, - 'jobspec': jobspec, - 'files': sub_files, - }) + sub_files = files[i : i + n_files_per_thread] + thread = threading.Thread( + target=self.stage_in, + kwargs={ + "tmpLog": tmpLog, + "jobspec": jobspec, + "files": sub_files, + }, + ) threads.append(thread) [t.start() for t in threads] while len(threads) > 0: time.sleep(1) threads = [t for t in threads if t and t.isAlive()] - tmpLog.info('Checking all files: {0}'.format(files)) + tmpLog.info("Checking all files: {0}".format(files)) for file in files: if file.status_code != 0: allChecked = False ErrMsg = ErrMsg + (" %s " % file.lfn) for inLFN, inFile in iteritems(inFiles): - if not os.path.isfile(inFile['path']): + if not os.path.isfile(inFile["path"]): allChecked = False ErrMsg = ErrMsg + (" %s " % file.lfn) # return - tmpLog.debug('stop') + tmpLog.debug("stop") if allChecked: - tmpLog.info('Looks like all files are successfully downloaded.') - return True, '' + tmpLog.info("Looks like all files are successfully downloaded.") + return True, "" else: # keep retrying return None, ErrMsg @@ -173,7 +176,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in 
iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/pilotmover_preparator.py b/pandaharvester/harvesterpreparator/pilotmover_preparator.py index b751bf99..bdc8d46a 100644 --- a/pandaharvester/harvesterpreparator/pilotmover_preparator.py +++ b/pandaharvester/harvesterpreparator/pilotmover_preparator.py @@ -8,89 +8,86 @@ from pilot.api import data # logger -baseLogger = core_utils.setup_logger('pilotmover_preparator') +baseLogger = core_utils.setup_logger("pilotmover_preparator") # plugin for preparator based on Pilot2.0 Data API # Pilot 2.0 should be deployed as library # default self.basePath came from preparator section of configuration file + class PilotmoverPreparator(PluginBase): """ - Praparator bring files from remote ATLAS/Rucio storage to local facility. + Praparator bring files from remote ATLAS/Rucio storage to local facility. """ - # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('Start. Trigger data transfer for job: {0}'.format(jobspec.PandaID)) - + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("Start. Trigger data transfer for job: {0}".format(jobspec.PandaID)) + # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get input files files = [] inFiles = jobspec.get_input_file_attributes(skip_ready=True) # set path to each file tmpLog.info("Prepare files to download (construct path and verifiy existing files)") for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # check if file exist. 
Skip alrady downoladed files - if os.path.exists(inFile['path']): - checksum = core_utils.calc_adler32(inFile['path']) - checksum = 'ad:%s' % checksum - #tmpLog.debug('checksum for file %s is %s' % (inFile['path'], checksum)) - if 'checksum' in inFile and inFile['checksum'] and inFile['checksum'] == checksum: - #tmpLog.debug('File %s already exists at %s' % (inLFN, inFile['path'])) + if os.path.exists(inFile["path"]): + checksum = core_utils.calc_adler32(inFile["path"]) + checksum = "ad:%s" % checksum + # tmpLog.debug('checksum for file %s is %s' % (inFile['path'], checksum)) + if "checksum" in inFile and inFile["checksum"] and inFile["checksum"] == checksum: + # tmpLog.debug('File %s already exists at %s' % (inLFN, inFile['path'])) continue - dstpath = os.path.dirname(inFile['path']) + dstpath = os.path.dirname(inFile["path"]) # check if path exists if not create it. if not os.access(dstpath, os.F_OK): os.makedirs(dstpath) - files.append({'scope': inFile['scope'], - 'name': inLFN, - 'destination': dstpath}) - tmpLog.info('Number of files to dowload: {0} for job: {1}'.format(len(files), jobspec.PandaID)) - #tmpLog.debug('files {0}'.format(files)) - tmpLog.info('Setup of Pilot2 API client') + files.append({"scope": inFile["scope"], "name": inLFN, "destination": dstpath}) + tmpLog.info("Number of files to dowload: {0} for job: {1}".format(len(files), jobspec.PandaID)) + # tmpLog.debug('files {0}'.format(files)) + tmpLog.info("Setup of Pilot2 API client") data_client = data.StageInClient(site=jobspec.computingSite) allChecked = True - ErrMsg = 'These files failed to download : ' + ErrMsg = "These files failed to download : " if len(files) > 0: tmpLog.info("Going to transfer {0} of files with one call to Pilot2 Data API".format(len(files))) try: result = data_client.transfer(files) except Exception as e: tmpLog.error("Pilot2 Data API rise error: {0}".format(e.message)) - tmpLog.debug('data_client.transfer(files) result:\n{0}'.format(result)) + tmpLog.debug("data_client.transfer(files) result:\n{0}".format(result)) tmpLog.info("Transfer call to Pilot2 Data API completed") # loop over each file check result all must be true for entire result to be true if result: for answer in result: - if answer['errno'] != 0: + if answer["errno"] != 0: allChecked = False - ErrMsg = ErrMsg + (" %s " % answer['name']) + ErrMsg = ErrMsg + (" %s " % answer["name"]) else: - tmpLog.info('Looks like all files in place. Number of files: {0}'.format(len(files))) + tmpLog.info("Looks like all files in place. 
Number of files: {0}".format(len(files))) # return - tmpLog.debug('Finished data transfer with {0} files for job {1}'.format(len(files), jobspec.PandaID)) + tmpLog.debug("Finished data transfer with {0} files for job {1}".format(len(files), jobspec.PandaID)) if allChecked: - return True, '' + return True, "" else: return False, ErrMsg @@ -100,7 +97,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/rse_direct_preparator.py b/pandaharvester/harvesterpreparator/rse_direct_preparator.py index 22c4ebe7..27fa4385 100644 --- a/pandaharvester/harvesterpreparator/rse_direct_preparator.py +++ b/pandaharvester/harvesterpreparator/rse_direct_preparator.py @@ -12,17 +12,19 @@ class RseDirectPreparator(PluginBase): the job and constructs input file paths that point to pfns in the storage. This means that the job directly read input files from the storage. """ + # constructor + def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" # trigger preparation def trigger_preparation(self, jobspec): - return True, '' + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): @@ -30,7 +32,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/rucio_preparator.py b/pandaharvester/harvesterpreparator/rucio_preparator.py index ffe0de2a..4188eeb3 100644 --- a/pandaharvester/harvesterpreparator/rucio_preparator.py +++ b/pandaharvester/harvesterpreparator/rucio_preparator.py @@ -9,7 +9,7 @@ from pandaharvester.harvestermover import mover_utils # logger -baseLogger = core_utils.setup_logger('rucio_preparator') +baseLogger = core_utils.setup_logger("rucio_preparator") def get_num_files(logs): @@ -46,14 +46,14 @@ def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) if not hasattr(self, "rucioEnv"): self.rucioEnv = None - if not hasattr(self, 'timeout'): + if not hasattr(self, "timeout"): self.timeout = 30 * 60 # Default x509 proxy for a queue try: self.x509UserProxy except AttributeError: - self.x509UserProxy = os.getenv('X509_USER_PROXY') + self.x509UserProxy = os.getenv("X509_USER_PROXY") try: self.cacheDir @@ -67,25 +67,24 @@ def __init__(self, **kwarg): # check status def check_stage_in_status(self, jobspec): - return True, '' + return True, "" # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('Start. Trigger data transfer for job: {0}'.format(jobspec.PandaID)) + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("Start. 
Trigger data transfer for job: {0}".format(jobspec.PandaID)) try: - params = json.loads(jobspec.jobParams['jobPars']) - if 'input_datasets' not in params or 'input_location' not in params: - errMsg = 'input_datasets or input_location not in job parameters' + params = json.loads(jobspec.jobParams["jobPars"]) + if "input_datasets" not in params or "input_location" not in params: + errMsg = "input_datasets or input_location not in job parameters" tmpLog.error(errMsg) return True, errMsg - datasets = params['input_datasets'] # a comma-separated string + datasets = params["input_datasets"] # a comma-separated string datasets = datasets.split(",") - base_dir = params['input_location'] # dir name in EOS + base_dir = params["input_location"] # dir name in EOS if not base_dir: tmpLog.debug("input_location is not defined. will use harvester defaultDest: %s" % self.defaultDest) @@ -99,30 +98,34 @@ def trigger_preparation(self, jobspec): for dataset in datasets: upload_src_dir = os.path.join(self.cacheDir, dataset) if self.rucioEnv: - command = "%s; export X509_USER_PROXY=%s; rucio download --dir %s %s; gfal-copy -f -r -v %s %s" % (self.rucioEnv, - self.x509UserProxy, - self.cacheDir, - dataset, - upload_src_dir, - base_dir) + command = "%s; export X509_USER_PROXY=%s; rucio download --dir %s %s; gfal-copy -f -r -v %s %s" % ( + self.rucioEnv, + self.x509UserProxy, + self.cacheDir, + dataset, + upload_src_dir, + base_dir, + ) else: # command = "rucio download --dir %s %s" % (base_dir, dataset) - command = "export X509_USER_PROXY=%s; rucio download --dir %s %s; gfal-copy -f -r -v %s %s" % (self.x509UserProxy, - self.cacheDir, - dataset, - upload_src_dir, - base_dir) - tmpLog.debug('execute: ' + command) + command = "export X509_USER_PROXY=%s; rucio download --dir %s %s; gfal-copy -f -r -v %s %s" % ( + self.x509UserProxy, + self.cacheDir, + dataset, + upload_src_dir, + base_dir, + ) + tmpLog.debug("execute: " + command) exit_code = 0 try: - p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8', errors='replace') + p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8", errors="replace") stdout, stderr = p.communicate(timeout=self.timeout) exit_code = p.poll() except subprocess.TimeoutExpired: p.kill() stdout, stderr = p.communicate() exit_code = -1 - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) @@ -142,7 +145,7 @@ def trigger_preparation(self, jobspec): downloaded_datasets += 1 if final_exit_code == 0 and total_datasets == downloaded_datasets: tmpLog.info("All datasets have been downloaded") - return True, '' + return True, "" else: errMsg = "Not all datasets have been downloaded" tmpLog.error(errMsg) @@ -155,14 +158,14 @@ def trigger_preparation(self, jobspec): # resolve input file paths def resolve_input_paths(self, jobspec): # get input base location - params = json.loads(jobspec.jobParams['jobPars']) - base_dir = params['input_location'] # dir name in EOS + params = json.loads(jobspec.jobParams["jobPars"]) + base_dir = params["input_location"] # dir name in EOS # get input files inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(base_dir, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(base_dir, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - 
return True, '' + return True, "" diff --git a/pandaharvester/harvesterpreparator/xrdcp_preparator.py b/pandaharvester/harvesterpreparator/xrdcp_preparator.py index cf17b1e0..d4dfd2f3 100644 --- a/pandaharvester/harvesterpreparator/xrdcp_preparator.py +++ b/pandaharvester/harvesterpreparator/xrdcp_preparator.py @@ -1,5 +1,6 @@ import os import tempfile + try: import subprocess32 as subprocess except Exception: @@ -10,7 +11,7 @@ from pandaharvester.harvestermover import mover_utils # logger -baseLogger = core_utils.setup_logger('xrdcp_preparator') +baseLogger = core_utils.setup_logger("xrdcp_preparator") # preparator plugin with https://xrootd.slac.stanford.edu/ xrdcp @@ -31,6 +32,8 @@ "xrdcpOpts": "--retry 3 --cksum adler32 --debug 1" } """ + + class XrdcpPreparator(PluginBase): # constructor def __init__(self, **kwarg): @@ -43,98 +46,95 @@ def __init__(self, **kwarg): # trigger preparation def trigger_preparation(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_preparation') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_preparation") + tmpLog.debug("start") # get the environment harvester_env = os.environ.copy() - #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + # tmpLog.debug('Harvester environment : {}'.format(harvester_env)) # loop over all inputs inFileInfo = jobspec.get_input_file_attributes() xrdcpInput = None - allfiles_transfered = True + allfiles_transfered = True overall_errMsg = "" for tmpFileSpec in jobspec.inFiles: # construct source and destination paths - srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) + srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) # local path - localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) if self.checkLocalPath: # check if already exits if os.path.exists(localPath): # calculate checksum checksum = core_utils.calc_adler32(localPath) - checksum = 'ad:{0}'.format(checksum) - if checksum == inFileInfo[tmpFileSpec.lfn]['checksum']: + checksum = "ad:{0}".format(checksum) + if checksum == inFileInfo[tmpFileSpec.lfn]["checksum"]: continue # make directories if needed if not os.path.isdir(os.path.dirname(localPath)): os.makedirs(os.path.dirname(localPath)) - tmpLog.debug('Make directory - {0}'.format(os.path.dirname(localPath))) + tmpLog.debug("Make directory - {0}".format(os.path.dirname(localPath))) # collect list of input files if xrdcpInput is None: xrdcpInput = [srcPath] else: xrdcpInput.append[srcPath] # transfer using xrdcp one file at a time - tmpLog.debug('execute xrdcp') - args = ['xrdcp', '--nopbar', '--force'] - args_files = [srcPath,localPath] + tmpLog.debug("execute xrdcp") + args = ["xrdcp", "--nopbar", "--force"] + args_files = [srcPath, localPath] if self.xrdcpOpts is not None: args += self.xrdcpOpts.split() args += args_files tmpFileSpec.attemptNr += 1 try: - xrdcp_cmd = ' '.join(args) - tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + xrdcp_cmd = " ".join(args) + tmpLog.debug("execute: {0}".format(xrdcp_cmd)) p = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) try: stdout, stderr = 
p.communicate(timeout=self.timeout) except subprocess.TimeoutExpired: p.kill() stdout, stderr = p.communicate() - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") return_code = p.returncode if stdout is not None: if not isinstance(stdout, str): stdout = stdout.decode() - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: if not isinstance(stderr, str): stderr = stderr.decode() - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) except Exception: core_utils.dump_error_message(tmpLog) return_code = 1 if return_code != 0: - overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) - allfiles_transfered = False - errMsg = 'failed with {0}'.format(return_code) + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath, return_code) + allfiles_transfered = False + errMsg = "failed with {0}".format(return_code) tmpLog.error(errMsg) # check attemptNr if tmpFileSpec.attemptNr >= self.maxAttempts: - errMsg = 'gave up due to max attempts' + errMsg = "gave up due to max attempts" tmpLog.error(errMsg) return (False, errMsg) # end loop over input files # nothing to transfer if xrdcpInput is None: - tmpLog.debug('done with no transfers') - return True, '' + tmpLog.debug("done with no transfers") + return True, "" # check if all files were transfered - if allfiles_transfered : - return True, '' + if allfiles_transfered: + return True, "" else: return None, overall_errMsg - # check status + def check_stage_in_status(self, jobspec): - return True, '' + return True, "" # resolve input file paths def resolve_input_paths(self, jobspec): @@ -142,8 +142,7 @@ def resolve_input_paths(self, jobspec): inFileInfo = jobspec.get_input_file_attributes() pathInfo = dict() for tmpFileSpec in jobspec.inFiles: - localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], - tmpFileSpec.lfn) - pathInfo[tmpFileSpec.lfn] = {'path': localPath} + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]["scope"], tmpFileSpec.lfn) + pathInfo[tmpFileSpec.lfn] = {"path": localPath} jobspec.set_input_file_paths(pathInfo) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterscripts/file_operation.py b/pandaharvester/harvesterscripts/file_operation.py index e44815a3..678ef514 100644 --- a/pandaharvester/harvesterscripts/file_operation.py +++ b/pandaharvester/harvesterscripts/file_operation.py @@ -12,83 +12,95 @@ import re -#=== Command functions ========================================================= +# === Command functions ========================================================= + def test(arguments): - print('file_operation: test') + print("file_operation: test") + # calculate adler32 + + def adler32(arguments): file_name = arguments.file val = 1 blockSize = 32 * 1024 * 1024 - with open(file_name, 'rb') as fp: + with open(file_name, "rb") as fp: while True: data = fp.read(blockSize) if not data: break val = zlib.adler32(data, val) if val < 0: - val += 2 ** 32 + val += 2**32 retVal = hex(val)[2:10].zfill(8).lower() print(retVal) + # write data into a temporary file; return the file name + + def write_tmpfile(arguments): - tmpArgFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=arguments.suffix, - dir=arguments.dir) - in_data = re.sub(r'\\\\', r'\\', arguments.data) - in_data = re.sub(r'\\n', '\n', in_data) 
+ tmpArgFile = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=arguments.suffix, dir=arguments.dir) + in_data = re.sub(r"\\\\", r"\\", arguments.data) + in_data = re.sub(r"\\n", "\n", in_data) tmpArgFile.write(in_data) tmpArgFile.close() print(tmpArgFile.name) + # remove file + + def remove_file(arguments): os.remove(arguments.path) -#=== Command map =============================================================== + +# === Command map =============================================================== + commandMap = { - # test commands - 'test': test, - # adler32 commands - 'adler32': adler32, - 'write_tmpfile': write_tmpfile, - 'remove_file': remove_file, - } + # test commands + "test": test, + # adler32 commands + "adler32": adler32, + "write_tmpfile": write_tmpfile, + "remove_file": remove_file, +} + +# === Main ====================================================================== -#=== Main ====================================================================== def main(): # main parser - oparser = argparse.ArgumentParser(prog='file_operations.py', add_help=True) + oparser = argparse.ArgumentParser(prog="file_operations.py", add_help=True) subparsers = oparser.add_subparsers() # test command - test_parser = subparsers.add_parser('test', help='for testing only') - test_parser.set_defaults(which='test') + test_parser = subparsers.add_parser("test", help="for testing only") + test_parser.set_defaults(which="test") # adler32 command - adler32_parser = subparsers.add_parser('adler32', help='get adler32 checksum of the file') - adler32_parser.set_defaults(which='adler32') - adler32_parser.add_argument('file', type=str, action='store', metavar='', help='file path') + adler32_parser = subparsers.add_parser("adler32", help="get adler32 checksum of the file") + adler32_parser.set_defaults(which="adler32") + adler32_parser.add_argument("file", type=str, action="store", metavar="", help="file path") # write_tmpfile command - write_tmpfile_parser = subparsers.add_parser('write_tmpfile', help='write data to a temporary file') - write_tmpfile_parser.set_defaults(which='write_tmpfile') - write_tmpfile_parser.add_argument('--suffix', type=str, action='store', metavar='', default='xxx.tmp', help='name suffix of temporary file') - write_tmpfile_parser.add_argument('--dir', type=str, action='store', metavar='', default='/tmp', help='directory of temorary file') - write_tmpfile_parser.add_argument('data', type=str, action='store', metavar='', help='data to write in temporary file') + write_tmpfile_parser = subparsers.add_parser("write_tmpfile", help="write data to a temporary file") + write_tmpfile_parser.set_defaults(which="write_tmpfile") + write_tmpfile_parser.add_argument("--suffix", type=str, action="store", metavar="", default="xxx.tmp", help="name suffix of temporary file") + write_tmpfile_parser.add_argument("--dir", type=str, action="store", metavar="", default="/tmp", help="directory of temorary file") + write_tmpfile_parser.add_argument("data", type=str, action="store", metavar="", help="data to write in temporary file") # remove_file command - remove_file_parser = subparsers.add_parser('remove_file', help='remove a file') - remove_file_parser.set_defaults(which='remove_file') - remove_file_parser.add_argument('path', type=str, action='store', metavar='', help='file path') + remove_file_parser = subparsers.add_parser("remove_file", help="remove a file") + remove_file_parser.set_defaults(which="remove_file") + remove_file_parser.add_argument("path", type=str, action="store", metavar="", 
help="file path") # start parsing if len(sys.argv) == 1: oparser.print_help() sys.exit(1) arguments = oparser.parse_args(sys.argv[1:]) - ## Run command functions + # Run command functions try: command = commandMap.get(arguments.which) except AttributeError: @@ -98,9 +110,9 @@ def main(): result = command(arguments) sys.exit(result) except Exception as e: - sys.stderr.write('{0}: {1}'.format(e.__class__.__name__, e)) + sys.stderr.write("{0}: {1}".format(e.__class__.__name__, e)) sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pandaharvester/harvesterscripts/harvester_admin.py b/pandaharvester/harvesterscripts/harvester_admin.py index c8f805fa..e4694e40 100644 --- a/pandaharvester/harvesterscripts/harvester_admin.py +++ b/pandaharvester/harvesterscripts/harvester_admin.py @@ -15,62 +15,70 @@ from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestermisc.selfcheck import harvesterPackageInfo -#=== Logger =================================================== +# === Logger =================================================== + def setupLogger(logger): logger.setLevel(logging.DEBUG) hdlr = logging.StreamHandler() + def emit_decorator(fn): def func(*args): levelno = args[0].levelno - if(levelno >= logging.CRITICAL): - color = '\033[35;1m' - elif(levelno >= logging.ERROR): - color = '\033[31;1m' - elif(levelno >= logging.WARNING): - color = '\033[33;1m' - elif(levelno >= logging.INFO): - color = '\033[32;1m' - elif(levelno >= logging.DEBUG): - color = '\033[36;1m' + if levelno >= logging.CRITICAL: + color = "\033[35;1m" + elif levelno >= logging.ERROR: + color = "\033[31;1m" + elif levelno >= logging.WARNING: + color = "\033[33;1m" + elif levelno >= logging.INFO: + color = "\033[32;1m" + elif levelno >= logging.DEBUG: + color = "\033[36;1m" else: - color = '\033[0m' - formatter = logging.Formatter('{0}[%(asctime)s %(levelname)s] %(message)s\033[0m'.format(color)) + color = "\033[0m" + formatter = logging.Formatter("{0}[%(asctime)s %(levelname)s] %(message)s\033[0m".format(color)) hdlr.setFormatter(formatter) return fn(*args) + return func + hdlr.emit = emit_decorator(hdlr.emit) logger.addHandler(hdlr) -mainLogger = logging.getLogger('HarvesterAdminTool') +mainLogger = logging.getLogger("HarvesterAdminTool") setupLogger(mainLogger) -#=== Operation functions ======================================================== +# === Operation functions ======================================================== + def json_print(data): print(json.dumps(data, sort_keys=True, indent=4)) + def multithread_executer(func, n_object, n_thread): with ThreadPoolExecutor(n_thread) as _pool: retIterator = _pool.map(func, range(n_object)) return retIterator + def get_harvester_attributes(): attr_list = [ - 'harvesterID', - 'version', - 'commit_info', - 'harvester_config', + "harvesterID", + "version", + "commit_info", + "harvester_config", ] return attr_list + def repopulate_fifos(*names): fifo_class_name_map = { - 'monitor': 'MonitorFIFO', - } + "monitor": "MonitorFIFO", + } if len(names) > 0: - fifo_class_name_list = [ fifo_class_name_map.get(name) for name in names ] + fifo_class_name_list = [fifo_class_name_map.get(name) for name in names] else: fifo_class_name_list = fifo_class_name_map.values() for fifo_class_name in fifo_class_name_list: @@ -80,185 +88,213 @@ def repopulate_fifos(*names): if not fifo.enabled: continue fifo.populate(clear_fifo=True) - print('Repopulated {0} fifo'.format(fifo.titleName)) + print("Repopulated {0} 
fifo".format(fifo.titleName)) + # TODO -#=== Command functions ======================================================== +# === Command functions ======================================================== + def test(arguments): - mainLogger.critical('Harvester Admin Tool: test CRITICAL') - mainLogger.error('Harvester Admin Tool: test ERROR') - mainLogger.warning('Harvester Admin Tool: test WARNING') - mainLogger.info('Harvester Admin Tool: test INFO') - mainLogger.debug('Harvester Admin Tool: test DEBUG') - print('Harvester Admin Tool: test') + mainLogger.critical("Harvester Admin Tool: test CRITICAL") + mainLogger.error("Harvester Admin Tool: test ERROR") + mainLogger.warning("Harvester Admin Tool: test WARNING") + mainLogger.info("Harvester Admin Tool: test INFO") + mainLogger.debug("Harvester Admin Tool: test DEBUG") + print("Harvester Admin Tool: test") + def get(arguments): attr = arguments.attribute hpi = harvesterPackageInfo(None) if attr not in get_harvester_attributes(): - mainLogger.error('Invalid attribute: {0}'.format(attr)) + mainLogger.error("Invalid attribute: {0}".format(attr)) return - elif attr == 'version': + elif attr == "version": print(hpi.version) - elif attr == 'commit_info': + elif attr == "commit_info": print(hpi.commit_info) - elif attr == 'harvesterID': + elif attr == "harvesterID": print(harvester_config.master.harvester_id) - elif attr == 'harvester_config': + elif attr == "harvester_config": json_print(harvester_config.config_dict) + def fifo_benchmark(arguments): n_object = arguments.n_object n_thread = arguments.n_thread mq = harvesterFifos.BenchmarkFIFO() sw = core_utils.get_stopwatch() sum_dict = { - 'put_n' : 0, - 'put_time' : 0.0, - 'get_time' : 0.0, - 'get_protective_time' : 0.0, - 'clear_time' : 0.0, - } + "put_n": 0, + "put_time": 0.0, + "get_time": 0.0, + "get_protective_time": 0.0, + "clear_time": 0.0, + } + def _put_object(i_index): workspec = WorkSpec() workspec.workerID = i_index - data = {'random': [(i_index**2) % 2**16, random.random()]} + data = {"random": [(i_index**2) % 2**16, random.random()]} workspec.workAttributes = data mq.put(workspec) + def _get_object(i_index): return mq.get(timeout=3, protective=False) + def _get_object_protective(i_index): return mq.get(timeout=3, protective=True) + def put_test(): sw.reset() multithread_executer(_put_object, n_object, n_thread) - sum_dict['put_time'] += sw.get_elapsed_time_in_sec(True) - sum_dict['put_n'] += 1 - print('Put {0} objects by {1} threads'.format(n_object, n_thread) + sw.get_elapsed_time()) - print('Now fifo size is {0}'.format(mq.size())) + sum_dict["put_time"] += sw.get_elapsed_time_in_sec(True) + sum_dict["put_n"] += 1 + print("Put {0} objects by {1} threads".format(n_object, n_thread) + sw.get_elapsed_time()) + print("Now fifo size is {0}".format(mq.size())) + def get_test(): sw.reset() multithread_executer(_get_object, n_object, n_thread) - sum_dict['get_time'] = sw.get_elapsed_time_in_sec(True) - print('Get {0} objects by {1} threads'.format(n_object, n_thread) + sw.get_elapsed_time()) - print('Now fifo size is {0}'.format(mq.size())) + sum_dict["get_time"] = sw.get_elapsed_time_in_sec(True) + print("Get {0} objects by {1} threads".format(n_object, n_thread) + sw.get_elapsed_time()) + print("Now fifo size is {0}".format(mq.size())) + def get_protective_test(): sw.reset() multithread_executer(_get_object_protective, n_object, n_thread) - sum_dict['get_protective_time'] = sw.get_elapsed_time_in_sec(True) - print('Get {0} objects protective dequeue by {1} threads'.format(n_object, 
n_thread) + sw.get_elapsed_time()) - print('Now fifo size is {0}'.format(mq.size())) + sum_dict["get_protective_time"] = sw.get_elapsed_time_in_sec(True) + print("Get {0} objects protective dequeue by {1} threads".format(n_object, n_thread) + sw.get_elapsed_time()) + print("Now fifo size is {0}".format(mq.size())) + def clear_test(): sw.reset() mq.fifo.clear() - sum_dict['clear_time'] = sw.get_elapsed_time_in_sec(True) - print('Cleared fifo' + sw.get_elapsed_time()) - print('Now fifo size is {0}'.format(mq.size())) + sum_dict["clear_time"] = sw.get_elapsed_time_in_sec(True) + print("Cleared fifo" + sw.get_elapsed_time()) + print("Now fifo size is {0}".format(mq.size())) + # Benchmark - print('Start fifo benchmark ...') + print("Start fifo benchmark ...") mq.fifo.clear() - print('Cleared fifo') + print("Cleared fifo") put_test() get_test() put_test() get_protective_test() put_test() clear_test() - print('Finished fifo benchmark') + print("Finished fifo benchmark") # summary - print('Summary:') - print('FIFO plugin is: {0}'.format(mq.fifo.__class__.__name__)) - print('Benchmark with {0} objects by {1} threads'.format(n_object, n_thread)) - print('Put : {0:.3f} ms / obj'.format(1000. * sum_dict['put_time']/(sum_dict['put_n']*n_object))) - print('Get : {0:.3f} ms / obj'.format(1000. * sum_dict['get_time']/n_object)) - print('Get protective : {0:.3f} ms / obj'.format(1000. * sum_dict['get_protective_time']/n_object)) - print('Clear : {0:.3f} ms / obj'.format(1000. * sum_dict['clear_time']/n_object)) + print("Summary:") + print("FIFO plugin is: {0}".format(mq.fifo.__class__.__name__)) + print("Benchmark with {0} objects by {1} threads".format(n_object, n_thread)) + print("Put : {0:.3f} ms / obj".format(1000.0 * sum_dict["put_time"] / (sum_dict["put_n"] * n_object))) + print("Get : {0:.3f} ms / obj".format(1000.0 * sum_dict["get_time"] / n_object)) + print("Get protective : {0:.3f} ms / obj".format(1000.0 * sum_dict["get_protective_time"] / n_object)) + print("Clear : {0:.3f} ms / obj".format(1000.0 * sum_dict["clear_time"] / n_object)) + def fifo_repopulate(arguments): - if 'ALL' in arguments.name_list: + if "ALL" in arguments.name_list: repopulate_fifos() else: repopulate_fifos(*arguments.name_list) + def cacher_refresh(arguments): from pandaharvester.harvestercore.communicator_pool import CommunicatorPool from pandaharvester.harvesterbody.cacher import Cacher + communicatorPool = CommunicatorPool() cacher = Cacher(communicatorPool) cacher.execute(force_update=True, skip_lock=True, n_thread=4) + def qconf_list(arguments): from pandaharvester.harvesterscripts import queue_config_tool + if arguments.all: queue_config_tool.list_config_ids() else: queue_config_tool.list_active_queues() + def qconf_refresh(arguments): from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper + qcm = QueueConfigMapper() qcm._update_last_reload_time() qcm.lastUpdate = None qcm.load_data(refill_table=arguments.refill) + def qconf_dump(arguments): from pandaharvester.harvesterscripts import queue_config_tool + to_print = not arguments.json try: if arguments.id_list: - res_list = [ vars(queue_config_tool.dump_queue_with_config_id(configID, to_print)) - for configID in arguments.id_list ] - resObj = { obj.get('queueName'): obj for obj in res_list } + res_list = [vars(queue_config_tool.dump_queue_with_config_id(configID, to_print)) for configID in arguments.id_list] + resObj = {obj.get("queueName"): obj for obj in res_list} elif arguments.all: res_list = 
queue_config_tool.dump_all_active_queues(to_print) if res_list is None or to_print: resObj = {} else: - resObj = { vars(qm).get('queueName', ''): vars(qm) for qm in res_list } + resObj = {vars(qm).get("queueName", ""): vars(qm) for qm in res_list} else: - resObj = { queue: vars(queue_config_tool.dump_active_queue(queue, to_print)) - for queue in arguments.queue_list } + resObj = {queue: vars(queue_config_tool.dump_active_queue(queue, to_print)) for queue in arguments.queue_list} except TypeError as e: - if str(e) == 'vars() argument must have __dict__ attribute': + if str(e) == "vars() argument must have __dict__ attribute": resObj = {} else: raise if arguments.json: json_print(resObj) + def qconf_purge(arguments): queueName = arguments.queue dbProxy = DBProxy() retVal = dbProxy.purge_pq(queueName) if retVal: - print('Purged {0} from harvester DB'.format(queueName)) + print("Purged {0} from harvester DB".format(queueName)) else: - mainLogger.critical('Failed to purge {0} . See panda-db_proxy.log'.format(queueName)) + mainLogger.critical("Failed to purge {0} . See panda-db_proxy.log".format(queueName)) + def kill_workers(arguments): - status_in = 'ALL' if (len(arguments.status) == 1 and arguments.status[0] == 'ALL') else arguments.status - computingSite_in = 'ALL' if (len(arguments.sites) == 1 and arguments.sites[0] == 'ALL') else arguments.sites - computingElement_in = 'ALL' if (len(arguments.ces) == 1 and arguments.ces[0] == 'ALL') else arguments.ces - submissionHost_in = 'ALL' if (len(arguments.submissionhosts) == 1 and arguments.submissionhosts[0] == 'ALL') else arguments.submissionhosts + status_in = "ALL" if (len(arguments.status) == 1 and arguments.status[0] == "ALL") else arguments.status + computingSite_in = "ALL" if (len(arguments.sites) == 1 and arguments.sites[0] == "ALL") else arguments.sites + computingElement_in = "ALL" if (len(arguments.ces) == 1 and arguments.ces[0] == "ALL") else arguments.ces + submissionHost_in = "ALL" if (len(arguments.submissionhosts) == 1 and arguments.submissionhosts[0] == "ALL") else arguments.submissionhosts dbProxy = DBProxy() - retVal = dbProxy.mark_workers_to_kill_by_query({'status': status_in, - 'computingSite': computingSite_in, - 'computingElement': computingElement_in, - 'submissionHost': submissionHost_in}) + retVal = dbProxy.mark_workers_to_kill_by_query( + {"status": status_in, "computingSite": computingSite_in, "computingElement": computingElement_in, "submissionHost": submissionHost_in} + ) if retVal is not None: msg_temp = ( - 'Sweeper will soon kill {n_workers} workers, with ' - 'status in {status_in}, ' - 'computingSite in {computingSite_in}, ' - 'computingElement in {computingElement_in}, ' - 'submissionHost in {submissionHost_in}' + "Sweeper will soon kill {n_workers} workers, with " + "status in {status_in}, " + "computingSite in {computingSite_in}, " + "computingElement in {computingElement_in}, " + "submissionHost in {submissionHost_in}" + ) + print( + msg_temp.format( + n_workers=retVal, + status_in=status_in, + computingSite_in=computingSite_in, + computingElement_in=computingElement_in, + submissionHost_in=submissionHost_in, ) - print(msg_temp.format(n_workers=retVal, status_in=status_in, computingSite_in=computingSite_in, - computingElement_in=computingElement_in, submissionHost_in=submissionHost_in)) + ) else: - mainLogger.critical('Failed to kill workers. See panda-db_proxy.log') + mainLogger.critical("Failed to kill workers. 
See panda-db_proxy.log") + def query_workers(arguments): dbProxy = DBProxy() @@ -271,108 +307,135 @@ def query_workers(arguments): except TypeError as e: raise -#=== Command map ======================================================= + +# === Command map ======================================================= + commandMap = { - # test commands - 'test': test, - # get commands - 'get': get, - # fifo commands - 'fifo_benchmark': fifo_benchmark, - 'fifo_repopulate': fifo_repopulate, - # cacher commands - 'cacher_refresh': cacher_refresh, - # qconf commands - 'qconf_list': qconf_list, - 'qconf_dump': qconf_dump, - 'qconf_refresh': qconf_refresh, - 'qconf_purge': qconf_purge, - # kill commands - 'kill_workers': kill_workers, - # query commands - 'query_workers': query_workers, - } - -#=== Main ====================================================== + # test commands + "test": test, + # get commands + "get": get, + # fifo commands + "fifo_benchmark": fifo_benchmark, + "fifo_repopulate": fifo_repopulate, + # cacher commands + "cacher_refresh": cacher_refresh, + # qconf commands + "qconf_list": qconf_list, + "qconf_dump": qconf_dump, + "qconf_refresh": qconf_refresh, + "qconf_purge": qconf_purge, + # kill commands + "kill_workers": kill_workers, + # query commands + "query_workers": query_workers, +} + +# === Main ====================================================== + def main(): # main parser - oparser = argparse.ArgumentParser(prog='harvester-admin', add_help=True) + oparser = argparse.ArgumentParser(prog="harvester-admin", add_help=True) subparsers = oparser.add_subparsers() - oparser.add_argument('-v', '--verbose', '--debug', action='store_true', dest='debug', help="Print more verbose output. (Debug mode !)") + oparser.add_argument("-v", "--verbose", "--debug", action="store_true", dest="debug", help="Print more verbose output. 
(Debug mode !)") # test command - test_parser = subparsers.add_parser('test', help='for testing only') - test_parser.set_defaults(which='test') + test_parser = subparsers.add_parser("test", help="for testing only") + test_parser.set_defaults(which="test") # get command - get_parser = subparsers.add_parser('get', help='get attributes of this harvester') - get_parser.set_defaults(which='get') - get_parser.add_argument('attribute', type=str, action='store', metavar='', choices=get_harvester_attributes(), help='attribute') + get_parser = subparsers.add_parser("get", help="get attributes of this harvester") + get_parser.set_defaults(which="get") + get_parser.add_argument("attribute", type=str, action="store", metavar="", choices=get_harvester_attributes(), help="attribute") # fifo parser - fifo_parser = subparsers.add_parser('fifo', help='fifo related') + fifo_parser = subparsers.add_parser("fifo", help="fifo related") fifo_subparsers = fifo_parser.add_subparsers() # fifo benchmark command - fifo_benchmark_parser = fifo_subparsers.add_parser('benchmark', help='benchmark fifo backend') - fifo_benchmark_parser.set_defaults(which='fifo_benchmark') - fifo_benchmark_parser.add_argument('-n', type=int, dest='n_object', action='store', default=500, metavar='', help='Benchmark with N objects') - fifo_benchmark_parser.add_argument('-t', type=int, dest='n_thread', action='store', default=1, metavar='', help='Benchmark with N threads') + fifo_benchmark_parser = fifo_subparsers.add_parser("benchmark", help="benchmark fifo backend") + fifo_benchmark_parser.set_defaults(which="fifo_benchmark") + fifo_benchmark_parser.add_argument("-n", type=int, dest="n_object", action="store", default=500, metavar="", help="Benchmark with N objects") + fifo_benchmark_parser.add_argument("-t", type=int, dest="n_thread", action="store", default=1, metavar="", help="Benchmark with N threads") # fifo repopuate command - fifo_repopulate_parser = fifo_subparsers.add_parser('repopulate', help='Repopulate agent fifo') - fifo_repopulate_parser.set_defaults(which='fifo_repopulate') - fifo_repopulate_parser.add_argument('name_list', nargs='+', type=str, action='store', metavar='', help='Name of agent fifo, e.g. "monitor" ("ALL" for all)') + fifo_repopulate_parser = fifo_subparsers.add_parser("repopulate", help="Repopulate agent fifo") + fifo_repopulate_parser.set_defaults(which="fifo_repopulate") + fifo_repopulate_parser.add_argument( + "name_list", nargs="+", type=str, action="store", metavar="", help='Name of agent fifo, e.g. "monitor" ("ALL" for all)' + ) # cacher parser - cacher_parser = subparsers.add_parser('cacher', help='cacher related') + cacher_parser = subparsers.add_parser("cacher", help="cacher related") cacher_subparsers = cacher_parser.add_subparsers() # cacher refresh command - cacher_refresh_parser = cacher_subparsers.add_parser('refresh', help='refresh cacher immediately') - cacher_refresh_parser.set_defaults(which='cacher_refresh') + cacher_refresh_parser = cacher_subparsers.add_parser("refresh", help="refresh cacher immediately") + cacher_refresh_parser.set_defaults(which="cacher_refresh") # qconf (queue configuration) parser - qconf_parser = subparsers.add_parser('qconf', help='queue configuration') + qconf_parser = subparsers.add_parser("qconf", help="queue configuration") qconf_subparsers = qconf_parser.add_subparsers() # qconf list command - qconf_list_parser = qconf_subparsers.add_parser('list', help='List queues. 
Only active queues listed by default') - qconf_list_parser.set_defaults(which='qconf_list') - qconf_list_parser.add_argument('-a', '--all', dest='all', action='store_true', help='List name and configID of all queues') + qconf_list_parser = qconf_subparsers.add_parser("list", help="List queues. Only active queues listed by default") + qconf_list_parser.set_defaults(which="qconf_list") + qconf_list_parser.add_argument("-a", "--all", dest="all", action="store_true", help="List name and configID of all queues") # qconf dump command - qconf_dump_parser = qconf_subparsers.add_parser('dump', help='Dump queue configurations') - qconf_dump_parser.set_defaults(which='qconf_dump') - qconf_dump_parser.add_argument('-J', '--json', dest='json', action='store_true', help='Dump configuration in JSON format') - qconf_dump_parser.add_argument('-a', '--all', dest='all', action='store_true', help='Dump configuration of all active queues') - qconf_dump_parser.add_argument('queue_list', nargs='*', type=str, action='store', metavar='', help='Name of active queue') - qconf_dump_parser.add_argument('-i', '--id', dest='id_list', nargs='+', type=int, action='store', metavar='', help='Dump configuration of queue with configID') + qconf_dump_parser = qconf_subparsers.add_parser("dump", help="Dump queue configurations") + qconf_dump_parser.set_defaults(which="qconf_dump") + qconf_dump_parser.add_argument("-J", "--json", dest="json", action="store_true", help="Dump configuration in JSON format") + qconf_dump_parser.add_argument("-a", "--all", dest="all", action="store_true", help="Dump configuration of all active queues") + qconf_dump_parser.add_argument("queue_list", nargs="*", type=str, action="store", metavar="", help="Name of active queue") + qconf_dump_parser.add_argument( + "-i", "--id", dest="id_list", nargs="+", type=int, action="store", metavar="", help="Dump configuration of queue with configID" + ) # qconf refresh command - qconf_refresh_parser = qconf_subparsers.add_parser('refresh', help='refresh queue configuration immediately') - qconf_refresh_parser.set_defaults(which='qconf_refresh') - qconf_refresh_parser.add_argument('-R', '--refill', dest='refill', action='store_true', help='Refill pq_table before refresh (cleaner)') + qconf_refresh_parser = qconf_subparsers.add_parser("refresh", help="refresh queue configuration immediately") + qconf_refresh_parser.set_defaults(which="qconf_refresh") + qconf_refresh_parser.add_argument("-R", "--refill", dest="refill", action="store_true", help="Refill pq_table before refresh (cleaner)") # qconf purge command - qconf_purge_parser = qconf_subparsers.add_parser('purge', help='Purge the queue thoroughly from harvester DB (Be careful !!)') - qconf_purge_parser.set_defaults(which='qconf_purge') - qconf_purge_parser.add_argument('queue', type=str, action='store', metavar='', help='Name of panda queue to purge') + qconf_purge_parser = qconf_subparsers.add_parser("purge", help="Purge the queue thoroughly from harvester DB (Be careful !!)") + qconf_purge_parser.set_defaults(which="qconf_purge") + qconf_purge_parser.add_argument("queue", type=str, action="store", metavar="", help="Name of panda queue to purge") # kill parser - kill_parser = subparsers.add_parser('kill', help='kill something alive') + kill_parser = subparsers.add_parser("kill", help="kill something alive") kill_subparsers = kill_parser.add_subparsers() # kill workers command - kill_workers_parser = kill_subparsers.add_parser('workers', help='Kill active workers by query') - 
kill_workers_parser.set_defaults(which='kill_workers') - kill_workers_parser.add_argument('--status', nargs='+', dest='status', action='store', metavar='', default=['submitted'], help='worker status (only "submitted", "idle", "running" are valid)') - kill_workers_parser.add_argument('--sites', nargs='+', dest='sites', action='store', metavar='', required=True, help='site (computingSite); "ALL" for all sites') - kill_workers_parser.add_argument('--ces', nargs='+', dest='ces', action='store', metavar='', required=True, help='CE (computingElement); "ALL" for all CEs') - kill_workers_parser.add_argument('--submissionhosts', nargs='+', dest='submissionhosts', action='store', metavar='', required=True, help='submission host (submissionHost); "ALL" for all submission hosts') + kill_workers_parser = kill_subparsers.add_parser("workers", help="Kill active workers by query") + kill_workers_parser.set_defaults(which="kill_workers") + kill_workers_parser.add_argument( + "--status", + nargs="+", + dest="status", + action="store", + metavar="", + default=["submitted"], + help='worker status (only "submitted", "idle", "running" are valid)', + ) + kill_workers_parser.add_argument( + "--sites", nargs="+", dest="sites", action="store", metavar="", required=True, help='site (computingSite); "ALL" for all sites' + ) + kill_workers_parser.add_argument( + "--ces", nargs="+", dest="ces", action="store", metavar="", required=True, help='CE (computingElement); "ALL" for all CEs' + ) + kill_workers_parser.add_argument( + "--submissionhosts", + nargs="+", + dest="submissionhosts", + action="store", + metavar="", + required=True, + help='submission host (submissionHost); "ALL" for all submission hosts', + ) # query parser - query_parser = subparsers.add_parser('query', help='query current status about harvester') + query_parser = subparsers.add_parser("query", help="query current status about harvester") query_subparsers = query_parser.add_subparsers() # query worker_stats command - query_workers_parser = query_subparsers.add_parser('workers', help='Query statistiscs of workers in queues') - query_workers_parser.set_defaults(which='query_workers') - query_workers_parser.add_argument('-a', '--all', dest='all', action='store_true', help='Show results of all queues') - query_workers_parser.add_argument('queue_list', nargs='*', type=str, action='store', metavar='', help='Name of active queue') + query_workers_parser = query_subparsers.add_parser("workers", help="Query statistiscs of workers in queues") + query_workers_parser.set_defaults(which="query_workers") + query_workers_parser.add_argument("-a", "--all", dest="all", action="store_true", help="Show results of all queues") + query_workers_parser.add_argument("queue_list", nargs="*", type=str, action="store", metavar="", help="Name of active queue") # start parsing if len(sys.argv) == 1: @@ -381,11 +444,11 @@ def main(): arguments = oparser.parse_args(sys.argv[1:]) # log level if arguments.debug: - ## Debug mode of logger + # Debug mode of logger mainLogger.setLevel(logging.DEBUG) else: mainLogger.setLevel(logging.WARNING) - ## Run command functions + # Run command functions try: command = commandMap.get(arguments.which) except AttributeError: @@ -396,17 +459,17 @@ def main(): result = command(arguments) end_time = time.time() if arguments.debug: - mainLogger.debug('ARGS: {arguments} ; RESULT: {result} '.format(arguments=arguments, result=result)) - mainLogger.debug('Action completed in {0:.3f} seconds'.format(end_time - start_time)) + mainLogger.debug("ARGS: 
{arguments} ; RESULT: {result} ".format(arguments=arguments, result=result)) + mainLogger.debug("Action completed in {0:.3f} seconds".format(end_time - start_time)) sys.exit(result) except (RuntimeError, NotImplementedError) as e: - mainLogger.critical('ERROR: {0}'.format(e)) + mainLogger.critical("ERROR: {0}".format(e)) sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": try: main() except KeyboardInterrupt: - mainLogger.critical('Command Interrupted !!') + mainLogger.critical("Command Interrupted !!") sys.exit(1) diff --git a/pandaharvester/harvesterscripts/prescript.py b/pandaharvester/harvesterscripts/prescript.py index cd1b9314..088ada27 100644 --- a/pandaharvester/harvesterscripts/prescript.py +++ b/pandaharvester/harvesterscripts/prescript.py @@ -7,11 +7,11 @@ def main(): - oparser = argparse.ArgumentParser(prog='prescript', add_help=True) - oparser.add_argument('-f', '--local_info_file', action='store', dest='local_info_file', help='path of harvester local info file') + oparser = argparse.ArgumentParser(prog="prescript", add_help=True) + oparser.add_argument("-f", "--local_info_file", action="store", dest="local_info_file", help="path of harvester local info file") if len(sys.argv) == 1: - print('No argument or flag specified. Did nothing') + print("No argument or flag specified. Did nothing") sys.exit(0) args = oparser.parse_args(sys.argv[1:]) @@ -19,13 +19,13 @@ def main(): hpi = harvesterPackageInfo(local_info_file=local_info_file) if hpi.package_changed: - print('Harvester package changed') - #TODO + print("Harvester package changed") + # TODO pass hpi.renew_local_info() else: - print('Harvester package unchanged. Skipped') + print("Harvester package unchanged. Skipped") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/pandaharvester/harvesterscripts/queue_config_tool.py b/pandaharvester/harvesterscripts/queue_config_tool.py index aba47786..ac2e5cc1 100644 --- a/pandaharvester/harvesterscripts/queue_config_tool.py +++ b/pandaharvester/harvesterscripts/queue_config_tool.py @@ -8,31 +8,29 @@ def list_active_queues(): """list all active queue names""" qs = qcm.get_active_queues() - ks = list(qs.keys()) - ks.sort() + ks = sorted(qs.keys()) for k in ks: - print (k) + print(k) def list_config_ids(): """list all configIDs and queue names""" qs = qcm.get_all_queues_with_config_ids() - ks = list(qs.keys()) - ks.sort() - print ('configID : queue name') - print ('--------- ------------') + ks = sorted(qs.keys()) + print("configID : queue name") + print("--------- ------------") for k in ks: - print ('{0:8} : {1}'.format(k, qs[k].queueName)) + print("{0:8} : {1}".format(k, qs[k].queueName)) def dump_active_queue(name, to_print=True): """dump configuration of an active queue with name""" if not qcm.has_queue(name): - print ("ERROR : {0} is not available".format(name)) + print("ERROR : {0} is not available".format(name)) return q = qcm.get_queue(name) if to_print: - print (q) + print(q) else: return q @@ -41,10 +39,9 @@ def dump_all_active_queues(to_print=True): """dump configuration of all active queues""" qs = qcm.get_active_queues() if to_print: - ks = list(qs.keys()) - ks.sort() + ks = sorted(qs.keys()) for k in ks: - print (qs[k]) + print(qs[k]) else: return list(qs.values()) @@ -52,11 +49,11 @@ def dump_all_active_queues(to_print=True): def dump_queue_with_config_id(config_id, to_print=True): """dump configuration of a queue with configID""" if not qcm.has_queue(None, config_id): - print ("ERROR : configID={0} is not 
available".format(config_id)) + print("ERROR : configID={0} is not available".format(config_id)) return q = qcm.get_queue(None, config_id) if to_print: - print (q) + print(q) else: return q @@ -67,9 +64,9 @@ def help(o=None): __builtins__.help(o) else: maxLen = len(max(globals(), key=len)) - print (('{0:' + str(maxLen) + '} : {1}').format('function name', 'description')) - print ('-' * maxLen + '- -' + '-' * maxLen) + print(("{0:" + str(maxLen) + "} : {1}").format("function name", "description")) + print("-" * maxLen + "- -" + "-" * maxLen) for i in sorted(globals()): v = globals()[i] if isinstance(v, types.FunctionType): - print (('{0:' + str(maxLen) + '} : {1}').format(i, v.__doc__)) + print(("{0:" + str(maxLen) + "} : {1}").format(i, v.__doc__)) diff --git a/pandaharvester/harvesterscripts/remote_install.py b/pandaharvester/harvesterscripts/remote_install.py index e97d2373..3fbc2bfc 100644 --- a/pandaharvester/harvesterscripts/remote_install.py +++ b/pandaharvester/harvesterscripts/remote_install.py @@ -20,27 +20,23 @@ def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) -def make_ssh_connection(ssh_host, ssh_port, ssh_username, ssh_password, pass_phrase, private_key, - jump_host, jump_port): +def make_ssh_connection(ssh_host, ssh_port, ssh_username, ssh_password, pass_phrase, private_key, jump_host, jump_port): # ssh sshClient = paramiko.SSHClient() sshClient.set_missing_host_key_policy(paramiko.AutoAddPolicy()) if jump_host is None: # direct SSH - sshClient.connect(ssh_host, ssh_port, username=ssh_username, password=ssh_password, - passphrase=pass_phrase, key_filename=private_key) + sshClient.connect(ssh_host, ssh_port, username=ssh_username, password=ssh_password, passphrase=pass_phrase, key_filename=private_key) else: # via jump host - sshClient.connect(jump_host, jump_port, username=ssh_username, password=ssh_password, - passphrase=pass_phrase, key_filename=private_key) + sshClient.connect(jump_host, jump_port, username=ssh_username, password=ssh_password, passphrase=pass_phrase, key_filename=private_key) transport = sshClient.get_transport() dst_address = (ssh_host, ssh_port) src_address = (jump_host, jump_port) channel = transport.open_channel("direct-tcpip", dst_address, src_address) jumpClient = paramiko.SSHClient() jumpClient.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - jumpClient.connect(ssh_host, ssh_port, username=ssh_username, password=ssh_password, - passphrase=pass_phrase, key_filename=private_key, sock=channel) + jumpClient.connect(ssh_host, ssh_port, username=ssh_username, password=ssh_password, passphrase=pass_phrase, key_filename=private_key, sock=channel) return sshClient @@ -48,46 +44,48 @@ def main(): logging.basicConfig() parser = argparse.ArgumentParser() - parser.add_argument('--remoteDir', action='store', dest='remoteDir', default='harvester', - help='directory on the remote target machine where harvester is installed') - parser.add_argument('--remoteBuildDir', action='store', dest='remoteBuildDir', default='harvester_build', - help='directory on the remote target machine where harvester is build') - parser.add_argument('--remotePythonSetup', action='store', dest='remotePythonSetup', default='', - help='python setup on remote target machine') - parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True, - help='the name of queue where harvester is installed') - parser.add_argument('--middleware', action='store', dest='middleware', default='rpc', - help='middleware to access the remote 
target machine') + parser.add_argument( + "--remoteDir", action="store", dest="remoteDir", default="harvester", help="directory on the remote target machine where harvester is installed" + ) + parser.add_argument( + "--remoteBuildDir", + action="store", + dest="remoteBuildDir", + default="harvester_build", + help="directory on the remote target machine where harvester is build", + ) + parser.add_argument("--remotePythonSetup", action="store", dest="remotePythonSetup", default="", help="python setup on remote target machine") + parser.add_argument("--queueName", action="store", dest="queueName", default=None, required=True, help="the name of queue where harvester is installed") + parser.add_argument("--middleware", action="store", dest="middleware", default="rpc", help="middleware to access the remote target machine") options = parser.parse_args() # remove ~/ which doesn't work with sftp - options.remoteDir = re.sub('^~/', '', options.remoteDir) - options.remoteBuildDir = re.sub('^~/', '', options.remoteBuildDir) + options.remoteDir = re.sub("^~/", "", options.remoteDir) + options.remoteBuildDir = re.sub("^~/", "", options.remoteBuildDir) # get queue qcm = QueueConfigMapper() qcm.load_data() queueConfig = qcm.get_queue(options.queueName) if queueConfig is None: - print ('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName)) + print("ERROR: queue={0} not found in panda_queueconfig.json".format(options.queueName)) sys.exit(1) # get middleware if not hasattr(queueConfig, options.middleware): - print ('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware, - options.queueName)) + print("ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json".format(options.middleware, options.queueName)) sys.exit(1) middleware = getattr(queueConfig, options.middleware) # get ssh parameters - sshHost = middleware['remoteHost'] + sshHost = middleware["remoteHost"] try: - sshPort = middleware['remotePort'] + sshPort = middleware["remotePort"] except Exception: sshPort = 22 - sshUserName = middleware['sshUserName'] + sshUserName = middleware["sshUserName"] try: - sshPassword = middleware['sshPassword'] + sshPassword = middleware["sshPassword"] except Exception: sshPassword = None @@ -95,71 +93,62 @@ def main(): passPhrase = None if sshPassword is None: try: - privateKey = middleware['privateKey'] + privateKey = middleware["privateKey"] except Exception: - print ("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware)) + print("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware)) sys.exit(1) try: - passPhrase = middleware['passPhrase'] + passPhrase = middleware["passPhrase"] except Exception: passPhrase = None try: - jumpHost = middleware['jumpHost'] + jumpHost = middleware["jumpHost"] except Exception: jumpHost = None try: - jumpPort = middleware['jumpPort'] + jumpPort = middleware["jumpPort"] except Exception: jumpPort = 22 # ssh - sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey, - jumpHost, jumpPort) + sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey, jumpHost, jumpPort) # get remote python version - exec_out = sshClient.exec_command( - ';'.join([options.remotePythonSetup, - """python -c 'import sys;print("{0}{1}".format(*(sys.version_info[:2])))' """]) - ) + exec_out = sshClient.exec_command(";".join([options.remotePythonSetup, """python -c 'import 
sys;print("{0}{1}".format(*(sys.version_info[:2])))' """])) remotePythonVer = exec_out[1].read().rstrip() sshClient.close() - print ('remote python version : {0}'.format(remotePythonVer)) + print("remote python version : {0}".format(remotePythonVer)) # make tmp dir with TemporaryDirectory() as tmpDir: harvesterGit = "git+git://github.com/PanDAWMS/panda-harvester.git" # get all dependencies - print ("getting dependencies") - p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit), - stdout=subprocess.PIPE, - shell=True) + print("getting dependencies") + p = subprocess.Popen("pip download -d {0} {1}; rm -rf {0}/*".format(tmpDir, harvesterGit), stdout=subprocess.PIPE, shell=True) stdout, stderr = p.communicate() packages = [] - for line in stdout.split('\n'): - if line.startswith('Successfully downloaded'): + for line in stdout.split("\n"): + if line.startswith("Successfully downloaded"): packages = line.split()[2:] packages.append(harvesterGit) - packages.append('pip') - packages.remove('pandaharvester') + packages.append("pip") + packages.remove("pandaharvester") # download packages - print ("pip download to {0}".format(tmpDir)) + print("pip download to {0}".format(tmpDir)) for package in packages: - print ("getting {0}".format(package)) - ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer, - tmpDir, package), - shell=True) + print("getting {0}".format(package)) + ret = subprocess.call("pip download --no-deps --python-version {0} -d {1} {2}".format(remotePythonVer, tmpDir, package), shell=True) if ret != 0: - print ("ERROR: failed to download {0}".format(package)) + print("ERROR: failed to download {0}".format(package)) sys.exit(1) # sftp - sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey, - jumpHost, jumpPort) + sshClient = make_ssh_connection(sshHost, sshPort, sshUserName, sshPassword, passPhrase, privateKey, jumpHost, jumpPort) try: - sshClient.exec_command('rm -rf {0}; mkdir -p {0}'.format(options.remoteBuildDir)) + sshClient.exec_command("rm -rf {0}; mkdir -p {0}".format(options.remoteBuildDir)) except Exception: pass sftp = sshClient.open_sftp() @@ -168,21 +157,21 @@ def main(): if os.path.isdir(path): continue remotePath = os.path.join(options.remoteBuildDir, name) - print ("copy {0} to {1}".format(name, remotePath)) + print("copy {0} to {1}".format(name, remotePath)) sftp.put(path, remotePath) # install - print ("install harvester") + print("install harvester") buildDir = options.remoteBuildDir - if not buildDir.startswith('/'): - buildDir = '~/' + buildDir + if not buildDir.startswith("/"): + buildDir = "~/" + buildDir exec_out = sshClient.exec_command( - ';'.join([options.remotePythonSetup, - 'cd {0}'.format(options.remoteDir), - 'pip install pip pandaharvester --no-index --find-links {0}'.format(buildDir)]) - ) - print (exec_out[1].read()) - print (exec_out[2].read()) + ";".join( + [options.remotePythonSetup, "cd {0}".format(options.remoteDir), "pip install pip pandaharvester --no-index --find-links {0}".format(buildDir)] + ) + ) + print(exec_out[1].read()) + print(exec_out[2].read()) sshClient.close() diff --git a/pandaharvester/harvesterstager/act_stager.py b/pandaharvester/harvesterstager/act_stager.py index 17a2aad4..dd549e22 100644 --- a/pandaharvester/harvesterstager/act_stager.py +++ b/pandaharvester/harvesterstager/act_stager.py @@ -11,23 +11,25 @@ from act.atlas.aCTDBPanda import aCTDBPanda # logger -baseLogger = 
core_utils.setup_logger('act_stager') +baseLogger = core_utils.setup_logger("act_stager") # json for job report jsonJobReport = harvester_config.payload_interaction.jobReportFile # aCT stager plugin + + class ACTStager(BaseStager): # constructor def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) # Set up aCT DB connection - self.log = core_utils.make_logger(baseLogger, 'aCT stager', method_name='__init__') + self.log = core_utils.make_logger(baseLogger, "aCT stager", method_name="__init__") try: self.actDB = aCTDBPanda(self.log) except Exception as e: - self.log.error('Could not connect to aCT database: {0}'.format(str(e))) + self.log.error("Could not connect to aCT database: {0}".format(str(e))) self.actDB = None # check status @@ -46,11 +48,10 @@ def check_stage_out_status(self, jobspec): workSpec = jobspec.get_workspec_list()[0] # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='check_workers') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="check_workers") try: - tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID)) - columns = ['actpandastatus', 'error'] + tmpLog.debug("Querying aCT for id {0}".format(workSpec.batchID)) + columns = ["actpandastatus", "error"] actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns) except Exception as e: if self.actDB: @@ -62,35 +63,35 @@ def check_stage_out_status(self, jobspec): tmpLog.error("Job with id {0} not found in aCT".format(workSpec.batchID)) return False, "Job not found in aCT" - actstatus = actjobs[0]['actpandastatus'] + actstatus = actjobs[0]["actpandastatus"] # Only check for final states - if actstatus == 'done': + if actstatus == "done": # Do post processing self.post_processing(workSpec, jobspec) - elif actstatus == 'donefailed': + elif actstatus == "donefailed": # Call post processing to collect attributes set by aCT for failed jobs self.post_processing(workSpec, jobspec) # Set error reported by aCT - errorMsg = actjobs[0]['error'] or 'Unknown error' - error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') - jobspec.status = 'failed' + errorMsg = actjobs[0]["error"] or "Unknown error" + error_code = WorkerErrors.error_codes.get("GENERAL_ERROR") + jobspec.status = "failed" # No way to update workspec here - #workSpec.set_supplemental_error(error_code=error_code, error_diag=errorMsg) + # workSpec.set_supplemental_error(error_code=error_code, error_diag=errorMsg) jobspec.set_pilot_error(error_code, errorMsg) - tmpLog.info('Job {0} failed with error {1}'.format(jobspec.PandaID, errorMsg)) - elif actstatus == 'donecancelled': + tmpLog.info("Job {0} failed with error {1}".format(jobspec.PandaID, errorMsg)) + elif actstatus == "donecancelled": # Nothing to do pass else: # Still staging - return None, 'still staging' + return None, "still staging" - tmpLog.info('ID {0} completed in state {1}'.format(workSpec.batchID, actstatus)) + tmpLog.info("ID {0} completed in state {1}".format(workSpec.batchID, actstatus)) # Set dummy output file to finished for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' - return True, '' + fileSpec.status = "finished" + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): @@ -107,43 +108,42 @@ def trigger_stage_out(self, jobspec): fileSpec = FileSpec() fileSpec.PandaID = jobspec.PandaID fileSpec.taskID = jobspec.taskID - fileSpec.lfn = 'dummy.{0}'.format(jobspec.PandaID) - fileSpec.scope = 
'dummy' - fileSpec.fileType = 'output' + fileSpec.lfn = "dummy.{0}".format(jobspec.PandaID) + fileSpec.scope = "dummy" + fileSpec.fileType = "output" jobspec.add_in_file(fileSpec) - return True, '' + return True, "" # zip output files def zip_output(self, jobspec): """Dummy""" - return True, '' + return True, "" def post_processing(self, workspec, jobspec): - ''' + """ Take the jobReport placed by aCT in the access point and fill metadata attributes of the jobspec. - ''' + """ # get logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='post_processing') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="post_processing") # look for job report jsonFilePath = os.path.join(workspec.get_access_point(), jsonJobReport) - tmpLog.debug('looking for job report file {0}'.format(jsonFilePath)) + tmpLog.debug("looking for job report file {0}".format(jsonFilePath)) try: with open(jsonFilePath) as jsonFile: jobreport = json.load(jsonFile) - except: + except BaseException: # Assume no job report available means true pilot or push mode # If job report is not available in full push mode aCT would have failed the job - tmpLog.debug('no job report at {0}'.format(jsonFilePath)) + tmpLog.debug("no job report at {0}".format(jsonFilePath)) return - tmpLog.debug('got {0} kB of job report'.format(os.stat(jsonFilePath).st_size / 1024)) + tmpLog.debug("got {0} kB of job report".format(os.stat(jsonFilePath).st_size / 1024)) tmpLog.debug("pilot info for {0}: {1}".format(jobspec.PandaID, jobreport)) # Set info for final heartbeat and final status jobspec.set_attributes({jobspec.PandaID: jobreport}) - jobspec.set_one_attribute('jobStatus', jobreport.get('state', 'failed')) - jobspec.status = jobreport.get('state', 'failed') + jobspec.set_one_attribute("jobStatus", jobreport.get("state", "failed")) + jobspec.status = jobreport.get("state", "failed") diff --git a/pandaharvester/harvesterstager/dummy_bulk_stager.py b/pandaharvester/harvesterstager/dummy_bulk_stager.py index d6bfa5d3..4f762739 100644 --- a/pandaharvester/harvesterstager/dummy_bulk_stager.py +++ b/pandaharvester/harvesterstager/dummy_bulk_stager.py @@ -5,10 +5,10 @@ from .base_stager import BaseStager # dummy transfer identifier -dummy_transfer_id = 'dummy_id_for_out' +dummy_transfer_id = "dummy_id_for_out" # logger -baseLogger = core_utils.setup_logger('dummy_bulk_stager') +baseLogger = core_utils.setup_logger("dummy_bulk_stager") # dummy plugin for stager with bulk transfers. 
For JobSpec and DBInterface methods, see @@ -21,9 +21,8 @@ def __init__(self, **kwarg): # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") # get transfer groups groups = jobspec.get_groups_of_output_files() # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests @@ -32,7 +31,7 @@ def check_stage_out_status(self, jobspec): locked = self.dbInterface.get_object_lock(dummy_transfer_id, lock_interval=120) if not locked: # escape since locked by another thread - msgStr = 'escape since locked by another thread' + msgStr = "escape since locked by another thread" tmpLog.debug(msgStr) return None, msgStr # refresh group information since that could have been updated by another thread before getting the lock @@ -41,22 +40,21 @@ def check_stage_out_status(self, jobspec): groups = jobspec.get_groups_of_output_files() # the dummy transfer ID is still there if dummy_transfer_id in groups: - groupUpdateTime = groups[dummy_transfer_id]['groupUpdateTime'] + groupUpdateTime = groups[dummy_transfer_id]["groupUpdateTime"] # get files with the dummy transfer ID across jobs fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transfer_id) # submit transfer if there are more than 10 files or the group was made before more than 10 min. # those thresholds may be config params. - if len(fileSpecs) >= 10 or \ - groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): + if len(fileSpecs) >= 10 or groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): # submit transfer and get a real transfer ID # ... transferID = str(uuid.uuid4()) # set the real transfer ID - self.dbInterface.set_file_group(fileSpecs, transferID, 'running') - msgStr = 'submitted transfer with ID={0}'.format(transferID) + self.dbInterface.set_file_group(fileSpecs, transferID, "running") + msgStr = "submitted transfer with ID={0}".format(transferID) tmpLog.debug(msgStr) else: - msgStr = 'wait until enough files are pooled' + msgStr = "wait until enough files are pooled" tmpLog.debug(msgStr) # release the lock self.dbInterface.release_object_lock(dummy_transfer_id) @@ -68,9 +66,9 @@ def check_stage_out_status(self, jobspec): # ... 
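# --- illustrative sketch, not part of the patch above ---
# The bulk stagers pool output files under a dummy transfer ID and only submit a
# real transfer once enough files have accumulated or the group has aged out.
# The rule shown above boils down to the check below; MAX_FILES and MAX_AGE are
# assumed names for the thresholds (10 files / 10 minutes in the code here), not
# actual harvester configuration parameters.
import datetime

MAX_FILES = 10
MAX_AGE = datetime.timedelta(minutes=10)

def ready_to_submit(file_specs, group_update_time, now=None):
    """Return True when a pooled file group should be flushed as one transfer."""
    now = now or datetime.datetime.utcnow()
    return len(file_specs) >= MAX_FILES or group_update_time < now - MAX_AGE
# --- end of sketch ---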
# then set file status if successful for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' - tmpLog.debug('all finished') - return True, '' + fileSpec.status = "finished" + tmpLog.debug("all finished") + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): @@ -78,14 +76,10 @@ def trigger_stage_out(self, jobspec): lfns = [] for fileSpec in jobspec.get_output_file_specs(skip_done=True): lfns.append(fileSpec.lfn) - jobspec.set_groups_to_files({dummy_transfer_id: {'lfns': lfns, - 'groupStatus': 'pending'} - } - ) - return True, '' + jobspec.set_groups_to_files({dummy_transfer_id: {"lfns": lfns, "groupStatus": "pending"}}) + return True, "" # zip output files def zip_output(self, jobspec): - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/dummy_stager.py b/pandaharvester/harvesterstager/dummy_stager.py index f0611543..caf0cb17 100644 --- a/pandaharvester/harvesterstager/dummy_stager.py +++ b/pandaharvester/harvesterstager/dummy_stager.py @@ -4,7 +4,7 @@ import uuid # logger -_logger = core_utils.setup_logger('dummy_stager') +_logger = core_utils.setup_logger("dummy_stager") # dummy plugin for stager @@ -31,8 +31,8 @@ def check_stage_out_status(self, jobspec): :rtype: (bool, string) """ for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' - return True, '' + fileSpec.status = "finished" + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): @@ -51,7 +51,7 @@ def trigger_stage_out(self, jobspec): # fileSpec.objstoreID = 123 # fileSpec.fileAttributes['guid'] pass - return True, '' + return True, "" # zip output files def zip_output(self, jobspec): @@ -68,8 +68,7 @@ def zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) # asynchronous zip output @@ -88,18 +87,14 @@ def async_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # set some ID which can be used for lookup in post_zip_output() groupID = str(uuid.uuid4()) lfns = [] for fileSpec in jobspec.outFiles: lfns.append(fileSpec.lfn) - jobspec.set_groups_to_files({groupID: {'lfns': lfns, - 'groupStatus': 'zipping'} - } - ) - return True, '' + jobspec.set_groups_to_files({groupID: {"lfns": lfns, "groupStatus": "zipping"}}) + return True, "" # post zipping def post_zip_output(self, jobspec): @@ -114,15 +109,14 @@ def post_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # get groups for lookup groups = jobspec.get_groups_of_output_files() # do something with groupIDs pass # update file attributes for fileSpec in jobspec.outFiles: - fileSpec.path = '/path/to/zip' + fileSpec.path 
= "/path/to/zip" fileSpec.fsize = 12345 - fileSpec.chksum = '66bb0985' - return True, '' + fileSpec.chksum = "66bb0985" + return True, "" diff --git a/pandaharvester/harvesterstager/fts_stager.py b/pandaharvester/harvesterstager/fts_stager.py index 6afd18b1..0e337d34 100644 --- a/pandaharvester/harvesterstager/fts_stager.py +++ b/pandaharvester/harvesterstager/fts_stager.py @@ -4,16 +4,17 @@ # TO BE REMOVED for python2.7 import requests.packages.urllib3 + try: requests.packages.urllib3.disable_warnings() -except: +except BaseException: pass from pandaharvester.harvestercore import core_utils from .base_stager import BaseStager from pandaharvester.harvesterconfig import harvester_config # logger -baseLogger = core_utils.setup_logger('fts_stager') +baseLogger = core_utils.setup_logger("fts_stager") # plugin for stager with FTS @@ -25,68 +26,59 @@ def __init__(self, **kwarg): # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") # loop over all files allChecked = True oneErrMsg = None trasnferStatus = {} for fileSpec in jobspec.outFiles: # get transfer ID - transferID = fileSpec.fileAttributes['transferID'] + transferID = fileSpec.fileAttributes["transferID"] if transferID not in trasnferStatus: # get status errMsg = None try: - url = "{0}/jobs/{1}".format(self.ftsServer, - transferID) - res = requests.get(url, - timeout=self.ftsLookupTimeout, - verify=self.ca_cert, - cert=(harvester_config.pandacon.cert_file, - harvester_config.pandacon.key_file) - ) + url = "{0}/jobs/{1}".format(self.ftsServer, transferID) + res = requests.get( + url, timeout=self.ftsLookupTimeout, verify=self.ca_cert, cert=(harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) + ) if res.status_code == 200: transferData = res.json() trasnferStatus[transferID] = transferData["job_state"] - tmpLog.debug('got {0} for {1}'.format(trasnferStatus[transferID], - transferID)) + tmpLog.debug("got {0} for {1}".format(trasnferStatus[transferID], transferID)) else: - errMsg = 'StatusCode={0} {1}'.format(res.status_code, - res.text) - except: + errMsg = "StatusCode={0} {1}".format(res.status_code, res.text) + except BaseException: if errMsg is None: errtype, errvalue = sys.exc_info()[:2] errMsg = "{0} {1}".format(errtype.__name__, errvalue) # failed if errMsg is not None: allChecked = False - tmpLog.error('failed to get status for {0} with {1}'.format(transferID, - errMsg)) + tmpLog.error("failed to get status for {0} with {1}".format(transferID, errMsg)) # set dummy not to lookup again trasnferStatus[transferID] = None # keep one message if oneErrMsg is None: oneErrMsg = errMsg # final status - if trasnferStatus[transferID] == 'DONE': - fileSpec.status = 'finished' - elif trasnferStatus[transferID] in ['FAILED', 'CANCELED']: - fileSpec.status = 'failed' + if trasnferStatus[transferID] == "DONE": + fileSpec.status = "finished" + elif trasnferStatus[transferID] in ["FAILED", "CANCELED"]: + fileSpec.status = "failed" if allChecked: - return True, '' + return True, "" else: return False, oneErrMsg # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = 
self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # loop over all files files = [] lfns = set() @@ -96,41 +88,39 @@ def trigger_stage_out(self, jobspec): if fileSpec.zipFileID is not None: continue # source and destination URLs - if fileSpec.fileType == 'es_output': + if fileSpec.fileType == "es_output": srcURL = self.srcEndpointES + fileSpec.path dstURL = self.dstEndpointES + fileSpec.path # set OS ID fileSpec.objstoreID = self.esObjStoreID else: - scope = fileAttrs[fileSpec.lfn]['scope'] + scope = fileAttrs[fileSpec.lfn]["scope"] hash = hashlib.md5() - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - if fileSpec.fileType == 'output': + correctedscope = "/".join(scope.split(".")) + if fileSpec.fileType == "output": srcURL = self.srcEndpointOut + fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.dstEndpointOut, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - elif fileSpec.fileType == 'log': + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.dstEndpointOut, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + elif fileSpec.fileType == "log": # skip if no endpoint - if self.srcEndpointLog == None: + if self.srcEndpointLog is None: continue srcURL = self.srcEndpointLog + fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.dstEndpointLog, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.dstEndpointLog, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) else: continue - tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) - files.append({ - "sources": [srcURL], - "destinations": [dstURL], - }) + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) + files.append( + { + "sources": [srcURL], + "destinations": [dstURL], + } + ) lfns.add(fileSpec.lfn) # submit if files != []: @@ -138,41 +128,39 @@ def trigger_stage_out(self, jobspec): errMsg = None try: url = "{0}/jobs".format(self.ftsServer) - res = requests.post(url, - json={"Files": files}, - timeout=self.ftsLookupTimeout, - verify=self.ca_cert, - cert=(harvester_config.pandacon.cert_file, - harvester_config.pandacon.key_file) - ) + res = requests.post( + url, + json={"Files": files}, + timeout=self.ftsLookupTimeout, + verify=self.ca_cert, + cert=(harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file), + ) if res.status_code == 200: transferData = res.json() transferID = transferData["job_id"] - tmpLog.debug('successfully submitted id={0}'.format(transferID)) + tmpLog.debug("successfully submitted id={0}".format(transferID)) # set for fileSpec in jobspec.outFiles: - if fileSpec.fileAttributes == None: + if fileSpec.fileAttributes is None: fileSpec.fileAttributes = {} - fileSpec.fileAttributes['transferID'] = transferID + fileSpec.fileAttributes["transferID"] = transferID else: # HTTP error - errMsg = 'StatusCode={0} {1}'.format(res.status_code, - res.text) - except: + errMsg = "StatusCode={0} {1}".format(res.status_code, res.text) + except BaseException: if errMsg is None: errtype, errvalue = 
sys.exc_info()[:2] errMsg = "{0} {1}".format(errtype.__name__, errvalue) # failed if errMsg is not None: - tmpLog.error('failed to submit transfer to {0} with {1}'.format(url, errMsg)) + tmpLog.error("failed to submit transfer to {0} with {1}".format(url, errMsg)) tmpRetVal = (False, errMsg) # return - tmpLog.debug('done') + tmpLog.debug("done") return tmpRetVal # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/go_bulk_stager.py b/pandaharvester/harvesterstager/go_bulk_stager.py index b09ebd49..424937c7 100644 --- a/pandaharvester/harvesterstager/go_bulk_stager.py +++ b/pandaharvester/harvesterstager/go_bulk_stager.py @@ -17,9 +17,10 @@ # TO BE REMOVED for python2.7 import requests.packages.urllib3 + try: requests.packages.urllib3.disable_warnings() -except: +except BaseException: pass from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase @@ -30,7 +31,7 @@ from pandaharvester.harvesterstager.base_stager import BaseStager # Define dummy transfer identifier -dummy_transfer_id_base = 'dummy_id_for_out' +dummy_transfer_id_base = "dummy_id_for_out" # lock to get a unique ID uLock = threading.Lock() @@ -38,16 +39,17 @@ uID = 0 # logger -_logger = core_utils.setup_logger('go_bulk_stager') +_logger = core_utils.setup_logger("go_bulk_stager") def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) + def validate_transferid(transferid): - tmptransferid = transferid.replace('-','') + tmptransferid = transferid.replace("-", "") return all(c in string.hexdigits for c in tmptransferid) @@ -56,66 +58,66 @@ def validate_transferid(transferid): class GlobusBulkStager(BaseStager): next_id = 0 # constructor + def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # make logger - tmpLog = self.make_logger(_logger, 'ThreadID={0}'.format(threading.current_thread().ident), - method_name='GlobusBulkStager __init__ ') - tmpLog.debug('start') - self.EventServicejob = False + tmpLog = self.make_logger(_logger, "ThreadID={0}".format(threading.current_thread().ident), method_name="GlobusBulkStager __init__ ") + tmpLog.debug("start") + self.EventServicejob = False self.pathConvention = None self.id = GlobusBulkStager.next_id self.changeFileStatusOnSuccess = True GlobusBulkStager.next_id += 1 with uLock: global uID - #self.dummy_transfer_id = '{0}_{1}_{2}'.format(dummy_transfer_id_base,self.id,int(round(time.time() * 1000))) - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base, 'XXXX') + # self.dummy_transfer_id = '{0}_{1}_{2}'.format(dummy_transfer_id_base,self.id,int(round(time.time() * 1000))) + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, "XXXX") uID += 1 uID %= harvester_config.stager.nThreads # create Globus Transfer Client try: self.tc = None # need to get client_id and refresh_token from PanDA server via harvester cache mechanism - tmpLog.debug('about to call dbInterface.get_cache(globus_secret)') - c_data = self.dbInterface.get_cache('globus_secret') - if (not c_data == None) and c_data.data['StatusCode'] == 0 : - tmpLog.debug('Got the 
globus_secrets from PanDA') - self.client_id = c_data.data['publicKey'] # client_id - self.refresh_token = c_data.data['privateKey'] # refresh_token - tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token) + tmpLog.debug("about to call dbInterface.get_cache(globus_secret)") + c_data = self.dbInterface.get_cache("globus_secret") + if (c_data is not None) and c_data.data["StatusCode"] == 0: + tmpLog.debug("Got the globus_secrets from PanDA") + self.client_id = c_data.data["publicKey"] # client_id + self.refresh_token = c_data.data["privateKey"] # refresh_token + tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog, self.client_id, self.refresh_token) if not tmpStat: self.tc = None - errStr = 'failed to create Globus Transfer Client' + errStr = "failed to create Globus Transfer Client" tmpLog.error(errStr) - else : + else: self.client_id = None self.refresh_token = None self.tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' + errStr = "failed to get Globus Client ID and Refresh Token" tmpLog.error(errStr) - except: + except BaseException: core_utils.dump_error_message(tmpLog) - tmpLog.debug('__init__ finish') - + tmpLog.debug("__init__ finish") # get dummy_transfer_id + def get_dummy_transfer_id(self): return self.dummy_transfer_id # set dummy_transfer_id for testing - def set_dummy_transfer_id_testing(self,dummy_transfer_id): + def set_dummy_transfer_id_testing(self, dummy_transfer_id): self.dummy_transfer_id = dummy_transfer_id - # set FileSpec.objstoreID - def set_FileSpec_objstoreID(self,jobspec, objstoreID, pathConvention): + # set FileSpec.objstoreID + def set_FileSpec_objstoreID(self, jobspec, objstoreID, pathConvention): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.objstoreID = objstoreID fileSpec.pathConvention = pathConvention - # set FileSpec.status - def set_FileSpec_status(self,jobspec,status): + # set FileSpec.status + def set_FileSpec_status(self, jobspec, status): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.status = status @@ -123,50 +125,51 @@ def set_FileSpec_status(self,jobspec,status): # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger( + _logger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="check_stage_out_status" + ) + tmpLog.debug("start") # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # show the dummy transfer id and set to a value with the PandaID if needed. 
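# --- illustrative sketch, not part of the patch: the per-site dummy transfer ID
# logic in the lines that follow. The function name and the default value of
# `base` (mirroring dummy_transfer_id_base defined above) are assumptions. ---
def resolve_dummy_transfer_id(current_id, computing_site, base="dummy_id_for_out"):
    # "XXXX" is the placeholder set in __init__ before any job has been seen;
    # the first job from a queue rebinds the ID to that queue's computingSite so
    # files from different sites are pooled in separate groups.
    if current_id == "{0}_{1}".format(base, "XXXX"):
        return "{0}_{1}".format(base, computing_site)
    return current_id
# --- end of sketch ---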
- tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id)) - if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') : + tmpLog.debug("self.dummy_transfer_id = {}".format(self.dummy_transfer_id)) + if self.dummy_transfer_id == "{0}_{1}".format(dummy_transfer_id_base, "XXXX"): old_dummy_transfer_id = self.dummy_transfer_id - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.computingSite) - tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id)) + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, jobspec.computingSite) + tmpLog.debug("Change self.dummy_transfer_id from {0} to {1}".format(old_dummy_transfer_id, self.dummy_transfer_id)) # set flag if have db lock - have_db_lock = False + have_db_lock = False # get the queueConfig and corresponding objStoreID_ES queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # check queueConfig stager section to see if jobtype is set - if 'jobtype' in queueConfig.stager: - if queueConfig.stager['jobtype'] == "EventService" : + if "jobtype" in queueConfig.stager: + if queueConfig.stager["jobtype"] == "EventService": self.EventServicejob = True - tmpLog.debug('Setting job type to EventService') + tmpLog.debug("Setting job type to EventService") # guard against old parameter in queue config - if queueConfig.stager['jobtype'] == "Yoda" : + if queueConfig.stager["jobtype"] == "Yoda": self.EventServicejob = True - tmpLog.debug('Setting job type to EventService') + tmpLog.debug("Setting job type to EventService") # set the location of the files in fileSpec.objstoreID - # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json - self.objstoreID = int(queueConfig.stager['objStoreID_ES']) - if self.EventServicejob : - self.pathConvention = int(queueConfig.stager['pathConvention']) - tmpLog.debug('EventService Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention)) + # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json + self.objstoreID = int(queueConfig.stager["objStoreID_ES"]) + if self.EventServicejob: + self.pathConvention = int(queueConfig.stager["pathConvention"]) + tmpLog.debug("EventService Job - PandaID = {0} objstoreID = {1} pathConvention ={2}".format(jobspec.PandaID, self.objstoreID, self.pathConvention)) else: self.pathConvention = None - tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID)) + tmpLog.debug("PandaID = {0} objstoreID = {1}".format(jobspec.PandaID, self.objstoreID)) # test we have a Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # set transferID to None @@ -175,80 +178,76 @@ def check_stage_out_status(self, jobspec): outfileattrib = jobspec.get_output_file_attributes() # get transfer groups groups = jobspec.get_groups_of_output_files() - tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_output_files() = : {0}".format(groups)) # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests for dummy_transferID in groups: if validate_transferid(dummy_transferID): continue # lock for 120 sec - tmpLog.debug('attempt to set DB lock for self.id - {0} dummy_transferID - 
{1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to set DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) have_db_lock = self.dbInterface.get_object_lock(dummy_transferID, lock_interval=120) if not have_db_lock: # escape since locked by another thread - msgStr = 'escape since locked by another thread' + msgStr = "escape since locked by another thread" tmpLog.debug(msgStr) return None, msgStr # refresh group information since that could have been updated by another thread before getting the lock - tmpLog.debug('self.dbInterface.refresh_file_group_info(jobspec)') + tmpLog.debug("self.dbInterface.refresh_file_group_info(jobspec)") self.dbInterface.refresh_file_group_info(jobspec) # get transfer groups again with refreshed info - tmpLog.debug('After db refresh call groups=jobspec.get_groups_of_output_files()') + tmpLog.debug("After db refresh call groups=jobspec.get_groups_of_output_files()") groups = jobspec.get_groups_of_output_files() - tmpLog.debug('jobspec.get_groups_of_output_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_output_files() = : {0}".format(groups)) # the dummy transfer ID is still there if dummy_transferID in groups: - groupUpdateTime = groups[dummy_transferID]['groupUpdateTime'] + groupUpdateTime = groups[dummy_transferID]["groupUpdateTime"] # get files with the dummy transfer ID across jobs fileSpecs = self.dbInterface.get_files_with_group_id(dummy_transferID) # submit transfer if there are more than 10 files or the group was made before more than 10 min - msgStr = 'dummy_transferID = {0} number of files = {1}'.format(dummy_transferID,len(fileSpecs)) + msgStr = "dummy_transferID = {0} number of files = {1}".format(dummy_transferID, len(fileSpecs)) tmpLog.debug(msgStr) - if len(fileSpecs) >= 10 or \ - groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): - tmpLog.debug('prepare to transfer files') + if len(fileSpecs) >= 10 or groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): + tmpLog.debug("prepare to transfer files") # submit transfer and get a real transfer ID - # set the Globus destination Endpoint id and path will get them from Agis eventually - #self.Globus_srcPath = queueConfig.stager['Globus_srcPath'] - self.srcEndpoint = queueConfig.stager['srcEndpoint'] + # set the Globus destination Endpoint id and path will get them from Agis eventually + # self.Globus_srcPath = queueConfig.stager['Globus_srcPath'] + self.srcEndpoint = queueConfig.stager["srcEndpoint"] self.Globus_srcPath = self.basePath - self.Globus_dstPath = queueConfig.stager['Globus_dstPath'] - self.dstEndpoint = queueConfig.stager['dstEndpoint'] - # Test the endpoints and create the transfer data class + self.Globus_dstPath = queueConfig.stager["Globus_dstPath"] + self.dstEndpoint = queueConfig.stager["dstEndpoint"] + # Test the endpoints and create the transfer data class errMsg = None try: # Test endpoints for activation - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint) - tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint) + tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint) + tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint) if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' + errStr = "source Endpoint and destination Endpoint activated" tmpLog.debug(errStr) else: - 
errMsg = '' - if not tmpStatsrc : - errMsg += ' source Endpoint not activated ' - if not tmpStatdst : - errMsg += ' destination Endpoint not activated ' + errMsg = "" + if not tmpStatsrc: + errMsg += " source Endpoint not activated " + if not tmpStatdst: + errMsg += " destination Endpoint not activated " # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) self.have_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not self.have_db_lock: - errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) - tmpRetVal = (None,errMsg) + tmpRetVal = (None, errMsg) return tmpRetVal # both endpoints activated now prepare to transfer data tdata = None - tdata = TransferData(self.tc, - self.srcEndpoint, - self.dstEndpoint, - sync_level="checksum") - except: + tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="checksum") + except BaseException: errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: - errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) tmpRetVal = (errStat, errMsg) return tmpRetVal @@ -256,219 +255,216 @@ def check_stage_out_status(self, jobspec): ifile = 0 for fileSpec in fileSpecs: # protect against blank lfn's - if not fileSpec.lfn : - msgStr = 'fileSpec.lfn is empty' + if not fileSpec.lfn: + msgStr = "fileSpec.lfn is empty" tmpLog.debug(msgStr) continue logfile = False - scope ='panda' - if fileSpec.scope is not None : + scope = "panda" + if fileSpec.scope is not None: scope = fileSpec.scope # for EventService job set the scope to transient for non log files - if self.EventServicejob : - scope = 'transient' + if self.EventServicejob: + scope = "transient" # only print to log file first 25 files - if ifile < 25 : + if ifile < 25: msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) - if ifile == 25 : + if ifile == 25: msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) hash = hashlib.md5() if sys.version_info.major == 2: - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) if sys.version_info.major == 3: hash_string = "{0}:{1}".format(scope, fileSpec.lfn) - hash.update(bytes(hash_string, 'utf-8')) + hash.update(bytes(hash_string, "utf-8")) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) + correctedscope = "/".join(scope.split(".")) srcURL = fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - if logfile : - tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) - if ifile < 25 : - tmpLog.debug('src={srcURL} 
dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.Globus_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + if logfile: + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) + if ifile < 25: + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) # add files to transfer object - tdata if os.access(srcURL, os.R_OK): - if ifile < 25 : - tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL)) - tdata.add_item(srcURL,dstURL) + if ifile < 25: + tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL)) + tdata.add_item(srcURL, dstURL) else: errMsg = "source file {} does not exist".format(srcURL) # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: - errMsg += ' - Could not release DB lock for {}'.format(dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) - tmpRetVal = (False,errMsg) + tmpRetVal = (False, errMsg) return tmpRetVal ifile += 1 - # submit transfer - tmpLog.debug('Number of files to transfer - {}'.format(len(tdata['DATA']))) + # submit transfer + tmpLog.debug("Number of files to transfer - {}".format(len(tdata["DATA"]))) try: transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) - if transfer_result['code'] == "Accepted": + if transfer_result["code"] == "Accepted": # succeeded # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - tmpLog.debug('successfully submitted id={0}'.format(transferID)) + transferID = transfer_result["task_id"] + tmpLog.debug("successfully submitted id={0}".format(transferID)) # set status for files - self.dbInterface.set_file_group(fileSpecs, transferID, 'running') - msgStr = 'submitted transfer with ID={0}'.format(transferID) + self.dbInterface.set_file_group(fileSpecs, transferID, "running") + msgStr = "submitted transfer with ID={0}".format(transferID) tmpLog.debug(msgStr) else: # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: - errMsg = 'Could not release DB lock for {}'.format(dummy_transferID) + errMsg = "Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) - tmpRetVal = (None, transfer_result['message']) + tmpRetVal = (None, transfer_result["message"]) return tmpRetVal except Exception as e: - errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) # release process lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if not release_db_lock: - errMsg += 
' - Could not release DB lock for {}'.format(dummy_transferID) + errMsg += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(errMsg) return errStat, errMsg else: - msgStr = 'wait until enough files are pooled' + msgStr = "wait until enough files are pooled" tmpLog.debug(msgStr) # release the lock - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) + tmpLog.debug("released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) have_db_lock = False else: - msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) + msgStr += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(msgStr) # return None to retry later return None, msgStr # release the db lock if needed if have_db_lock: - tmpLog.debug('attempt to release DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) + tmpLog.debug("attempt to release DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + release_db_lock = self.dbInterface.release_object_lock(dummy_transferID) if release_db_lock: - tmpLog.debug('released DB lock for self.id - {0} dummy_transferID - {1}'.format(self.id,dummy_transferID)) - have_db_lock = False + tmpLog.debug("released DB lock for self.id - {0} dummy_transferID - {1}".format(self.id, dummy_transferID)) + have_db_lock = False else: - msgStr += ' - Could not release DB lock for {}'.format(dummy_transferID) + msgStr += " - Could not release DB lock for {}".format(dummy_transferID) tmpLog.error(msgStr) return None, msgStr # check transfer with real transfer IDs - # get transfer groups + # get transfer groups tmpLog.debug("groups = jobspec.get_groups_of_output_files()") groups = jobspec.get_groups_of_output_files() - tmpLog.debug('Number of transfer groups - {0}'.format(len(groups))) - tmpLog.debug('transfer groups any state - {0}'.format(groups)) + tmpLog.debug("Number of transfer groups - {0}".format(len(groups))) + tmpLog.debug("transfer groups any state - {0}".format(groups)) if len(groups) == 0: tmpLog.debug("jobspec.get_groups_of_output_files(skip_done=True) returned no files ") tmpLog.debug("check_stage_out_status return status - True ") - return True,'' + return True, "" for transferID in groups: # allow only valid UUID - if validate_transferid(transferID) : + if validate_transferid(transferID): # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,self.tc,transferID) + tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, self.tc, transferID) # return a temporary error when failed to get task if not tmpStat: - errStr = 'failed to get transfer task; tc = %s; transferID = %s' % (str(self.tc),str(transferID)) + errStr = "failed to get transfer task; tc = %s; transferID = %s" % (str(self.tc), str(transferID)) tmpLog.error(errStr) return None, errStr - # return a temporary error when task is missing + # return a temporary error when task is missing if transferID not in 
transferTasks: - errStr = 'transfer task ID - {} is missing'.format(transferID) + errStr = "transfer task ID - {} is missing".format(transferID) tmpLog.error(errStr) return None, errStr # succeeded in finding a transfer task by tranferID - if transferTasks[transferID]['status'] == 'SUCCEEDED': - tmpLog.debug('transfer task {} succeeded'.format(transferID)) + if transferTasks[transferID]["status"] == "SUCCEEDED": + tmpLog.debug("transfer task {} succeeded".format(transferID)) self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) if self.changeFileStatusOnSuccess: - self.set_FileSpec_status(jobspec, 'finished') - return True, '' + self.set_FileSpec_status(jobspec, "finished") + return True, "" # failed - if transferTasks[transferID]['status'] == 'FAILED': - errStr = 'transfer task {} failed'.format(transferID) + if transferTasks[transferID]["status"] == "FAILED": + errStr = "transfer task {} failed".format(transferID) tmpLog.error(errStr) - self.set_FileSpec_status(jobspec,'failed') + self.set_FileSpec_status(jobspec, "failed") return False, errStr # another status - tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) + tmpStr = "transfer task {0} status: {1}".format(transferID, transferTasks[transferID]["status"]) tmpLog.debug(tmpStr) - return None, '' + return None, "" # end of loop over transfer groups - tmpLog.debug('End of loop over transfers groups - ending check_stage_out_status function') - return None,'no valid transfer id found' + tmpLog.debug("End of loop over transfers groups - ending check_stage_out_status function") + return None, "no valid transfer id found" # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger( + _logger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="trigger_stage_out" + ) + tmpLog.debug("start") # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # test we have a Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # show the dummy transfer id and set to a value with the PandaID if needed. 
- tmpLog.debug('self.dummy_transfer_id = {}'.format(self.dummy_transfer_id)) - if self.dummy_transfer_id == '{0}_{1}'.format(dummy_transfer_id_base,'XXXX') : + tmpLog.debug("self.dummy_transfer_id = {}".format(self.dummy_transfer_id)) + if self.dummy_transfer_id == "{0}_{1}".format(dummy_transfer_id_base, "XXXX"): old_dummy_transfer_id = self.dummy_transfer_id - self.dummy_transfer_id = '{0}_{1}'.format(dummy_transfer_id_base,jobspec.computingSite) - tmpLog.debug('Change self.dummy_transfer_id from {0} to {1}'.format(old_dummy_transfer_id,self.dummy_transfer_id)) + self.dummy_transfer_id = "{0}_{1}".format(dummy_transfer_id_base, jobspec.computingSite) + tmpLog.debug("Change self.dummy_transfer_id from {0} to {1}".format(old_dummy_transfer_id, self.dummy_transfer_id)) # set the dummy transfer ID which will be replaced with a real ID in check_stage_out_status() lfns = [] for fileSpec in jobspec.get_output_file_specs(skip_done=True): # test if fileSpec.lfn is not empty - if not fileSpec.lfn : - msgStr = 'fileSpec.lfn is empty' + if not fileSpec.lfn: + msgStr = "fileSpec.lfn is empty" else: - msgStr = 'fileSpec.lfn is {0}'.format(fileSpec.lfn) + msgStr = "fileSpec.lfn is {0}".format(fileSpec.lfn) lfns.append(fileSpec.lfn) tmpLog.debug(msgStr) - jobspec.set_groups_to_files({self.dummy_transfer_id: {'lfns': lfns,'groupStatus': 'pending'}}) - msgStr = 'jobspec.set_groups_to_files - self.dummy_tranfer_id - {0}, lfns - {1}, groupStatus - pending'.format(self.dummy_transfer_id,lfns) + jobspec.set_groups_to_files({self.dummy_transfer_id: {"lfns": lfns, "groupStatus": "pending"}}) + msgStr = "jobspec.set_groups_to_files - self.dummy_tranfer_id - {0}, lfns - {1}, groupStatus - pending".format(self.dummy_transfer_id, lfns) tmpLog.debug(msgStr) - tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),self.dummy_transfer_id,pending)') - tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),self.dummy_transfer_id,'pending') - tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),self.dummy_transfer_id,pending)') - return True, '' + tmpLog.debug("call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),self.dummy_transfer_id,pending)") + tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), self.dummy_transfer_id, "pending") + tmpLog.debug("called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),self.dummy_transfer_id,pending)") + return True, "" # use tar despite name for output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) # make label for transfer task def make_label(self, jobspec): - return "OUT-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, - PandaID=jobspec.PandaID) + return "OUT-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, PandaID=jobspec.PandaID) # resolve input file paths def resolve_input_paths(self, jobspec): @@ -476,7 +472,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = 
mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterstager/go_rucio_stager.py b/pandaharvester/harvesterstager/go_rucio_stager.py index 17342789..fab4a993 100644 --- a/pandaharvester/harvesterstager/go_rucio_stager.py +++ b/pandaharvester/harvesterstager/go_rucio_stager.py @@ -1,6 +1,5 @@ from rucio.client import Client as RucioClient -from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, \ - FileAlreadyExists +from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, FileAlreadyExists from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import harvester_config @@ -10,15 +9,13 @@ from pandaharvester.harvesterstager.go_bulk_stager import GlobusBulkStager - # logger -_logger = core_utils.setup_logger('go_rucio_stager') +_logger = core_utils.setup_logger("go_rucio_stager") go_bulk_stager._logger = _logger # plugin with Globus + Rucio + bulk transfers class GlobusRucioStager(GlobusBulkStager): - # constructor def __init__(self, **kwarg): GlobusBulkStager.__init__(self, **kwarg) @@ -27,11 +24,10 @@ def __init__(self, **kwarg): # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('executing base check_stage_out_status') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("executing base check_stage_out_status") tmpStat, tmpMsg = GlobusBulkStager.check_stage_out_status(self, jobspec) - tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg)) + tmpLog.debug("got {0} {1}".format(tmpStat, tmpMsg)) if tmpStat is not True: return tmpStat, tmpMsg # get transfer groups @@ -42,42 +38,42 @@ def check_stage_out_status(self, jobspec): queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # write to debug log queueConfig.stager - tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager)) + tmpLog.debug("jobspec.computingSite - {0} queueConfig.stager {1}".format(jobspec.computingSite, queueConfig.stager)) # check queueConfig stager section to see if srcRSE is set - if 'srcRSE' in queueConfig.stager: - srcRSE = queueConfig.stager['srcRSE'] + if "srcRSE" in queueConfig.stager: + srcRSE = queueConfig.stager["srcRSE"] else: - tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file') + tmpLog.debug("Warning srcRSE not defined in stager portion of queue config file") # get destination endpoint - nucleus = jobspec.jobParams['nucleus'] - agis = self.dbInterface.get_cache('panda_queues.json').data - dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] - # if debugging log source and destination RSEs - tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE)) + nucleus = jobspec.jobParams["nucleus"] + agis = self.dbInterface.get_cache("panda_queues.json").data + dstRSE = [agis[x]["astorages"]["pr"][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] + # if debugging log source and destination RSEs + tmpLog.debug("srcRSE - {0} dstRSE - {1}".format(srcRSE, dstRSE)) # test 
that srcRSE and dstRSE are defined - tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE)) - errStr = '' + tmpLog.debug("srcRSE - {0} dstRSE - {1}".format(srcRSE, dstRSE)) + errStr = "" if srcRSE is None: - errStr = 'Source RSE is not defined ' + errStr = "Source RSE is not defined " if dstRSE is None: - errStr = errStr + ' Desitination RSE is not defined' - if (srcRSE is None) or (dstRSE is None) : - tmpLog.error(errStr) - return None,errStr + errStr = errStr + " Desitination RSE is not defined" + if (srcRSE is None) or (dstRSE is None): + tmpLog.error(errStr) + return None, errStr # check queueConfig stager section to see if jobtype is set - if 'jobtype' in queueConfig.stager: - if queueConfig.stager['jobtype'] == "Yoda" : + if "jobtype" in queueConfig.stager: + if queueConfig.stager["jobtype"] == "Yoda": self.Yodajob = True # set the location of the files in fileSpec.objstoreID - # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data - self.objstoreID = ddm[dstRSE]['id'] - if self.Yodajob : - self.pathConvention = int(queueConfig.stager['pathConvention']) - tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention)) + # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data + self.objstoreID = ddm[dstRSE]["id"] + if self.Yodajob: + self.pathConvention = int(queueConfig.stager["pathConvention"]) + tmpLog.debug("Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}".format(jobspec.PandaID, self.objstoreID, self.pathConvention)) else: self.pathConvention = None - tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID)) + tmpLog.debug("PandaID = {0} objstoreID = {1}".format(jobspec.PandaID, self.objstoreID)) # set the location of the files in fileSpec.objstoreID self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) # create the Rucio Client @@ -85,162 +81,148 @@ def check_stage_out_status(self, jobspec): # register dataset rucioAPI = RucioClient() except Exception: - core_utils.dump_error_message(tmpLog) - # treat as a temporary error - tmpStat = None - tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName) - return tmpStat,tmpMsg + core_utils.dump_error_message(tmpLog) + # treat as a temporary error + tmpStat = None + tmpMsg = "failed to add a rule for {0}:{1}".format(datasetScope, datasetName) + return tmpStat, tmpMsg # loop over all transfers tmpStat = True - tmpMsg = '' + tmpMsg = "" for transferID in groups: if transferID is None: continue - datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID, transferID) - datasetScope = 'transient' + datasetName = "panda.harvester.{0}.{1}".format(jobspec.PandaID, transferID) + datasetScope = "transient" # lock have_db_lock = self.dbInterface.get_object_lock(transferID, lock_interval=120) if not have_db_lock: - msgStr = 'escape since {0} is locked by another thread'.format(transferID) + msgStr = "escape since {0} is locked by another thread".format(transferID) tmpLog.debug(msgStr) return None, msgStr # get transfer status groupStatus = self.dbInterface.get_file_group_status(transferID) - if 'hopped' in groupStatus: + if "hopped" in groupStatus: # already succeeded pass - elif 'failed' in groupStatus: + elif "failed" in groupStatus: # transfer failure tmpStat = False - tmpMsg = 'rucio rule for {0}:{1} already 
failed'.format(datasetScope, datasetName) - elif 'hopping' in groupStatus: + tmpMsg = "rucio rule for {0}:{1} already failed".format(datasetScope, datasetName) + elif "hopping" in groupStatus: # check rucio rule - ruleStatus = 'FAILED' + ruleStatus = "FAILED" try: - tmpLog.debug('check state for {0}:{1}'.format(datasetScope, datasetName)) + tmpLog.debug("check state for {0}:{1}".format(datasetScope, datasetName)) for ruleInfo in rucioAPI.list_did_rules(datasetScope, datasetName): - if ruleInfo['rse_expression'] != dstRSE: + if ruleInfo["rse_expression"] != dstRSE: continue - ruleStatus = ruleInfo['state'] - tmpLog.debug('got state={0}'.format(ruleStatus)) - if ruleStatus == 'OK': + ruleStatus = ruleInfo["state"] + tmpLog.debug("got state={0}".format(ruleStatus)) + if ruleStatus == "OK": break except DataIdentifierNotFound: - tmpLog.error('dataset not found') + tmpLog.error("dataset not found") except Exception: core_utils.dump_error_message(tmpLog) ruleStatus = None - if ruleStatus in ['FAILED', 'CANCELED']: + if ruleStatus in ["FAILED", "CANCELED"]: # transfer failure tmpStat = False - tmpMsg = 'rucio rule for {0}:{1} failed with {2}'.format(datasetScope, datasetName, ruleStatus) + tmpMsg = "rucio rule for {0}:{1} failed with {2}".format(datasetScope, datasetName, ruleStatus) # update file group status - self.dbInterface.update_file_group_status(transferID, 'failed') - elif ruleStatus == 'OK': + self.dbInterface.update_file_group_status(transferID, "failed") + elif ruleStatus == "OK": # update successful file group status - self.dbInterface.update_file_group_status(transferID, 'hopped') + self.dbInterface.update_file_group_status(transferID, "hopped") else: # replicating or temporary error tmpStat = None - tmpMsg = 'replicating or temporary error for {0}:{1}'.format(datasetScope, datasetName) + tmpMsg = "replicating or temporary error for {0}:{1}".format(datasetScope, datasetName) else: # make rucio rule fileSpecs = self.dbInterface.get_files_with_group_id(transferID) fileList = [] for fileSpec in fileSpecs: tmpFile = dict() - tmpFile['scope'] = datasetScope - tmpFile['name'] = fileSpec.lfn - tmpFile['bytes'] = fileSpec.fsize - tmpFile['adler32'] = fileSpec.chksum - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']} - else : - tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn)) - tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn)) + tmpFile["scope"] = datasetScope + tmpFile["name"] = fileSpec.lfn + tmpFile["bytes"] = fileSpec.fsize + tmpFile["adler32"] = fileSpec.chksum + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + tmpFile["meta"] = {"guid": fileSpec.fileAttributes["guid"]} + else: + tmpLog.debug("File - {0} does not have a guid value".format(fileSpec.lfn)) + tmpLog.debug("Adding file {0} to fileList".format(fileSpec.lfn)) fileList.append(tmpFile) # get source RSE if srcRSE is None and fileSpec.objstoreID is not None: - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0] try: # register dataset - tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}' - .format(datasetScope, datasetName,srcRSE,(30*24*60*60))) + tmpLog.debug("register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}".format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60))) try: - 
rucioAPI.add_dataset(datasetScope, datasetName, - meta={'hidden': True}, - lifetime=30 * 24 * 60 * 60, - rse=srcRSE - ) + rucioAPI.add_dataset(datasetScope, datasetName, meta={"hidden": True}, lifetime=30 * 24 * 60 * 60, rse=srcRSE) except DataIdentifierAlreadyExists: # ignore even if the dataset already exists pass except Exception: - errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, - datasetName, - srcRSE) + errMsg = "Could not create dataset {0}:{1} srcRSE - {2}".format(datasetScope, datasetName, srcRSE) core_utils.dump_error_message(tmpLog) - tmpLog.error(errMsg) + tmpLog.error(errMsg) raise # return None,errMsg # add files to dataset # add 500 files at a time numfiles = len(fileList) maxfiles = 500 - numslices = numfiles/maxfiles - if (numfiles%maxfiles) > 0 : + numslices = numfiles / maxfiles + if (numfiles % maxfiles) > 0: numslices = numslices + 1 start = 0 - for i in range(numslices) : + for i in range(numslices): try: stop = start + maxfiles - if stop > numfiles : + if stop > numfiles: stop = numfiles - rucioAPI.add_files_to_datasets([{'scope': datasetScope, - 'name': datasetName, - 'dids': fileList[start:stop], - 'rse': srcRSE}], - ignore_duplicate=True) + rucioAPI.add_files_to_datasets( + [{"scope": datasetScope, "name": datasetName, "dids": fileList[start:stop], "rse": srcRSE}], ignore_duplicate=True + ) start = stop except FileAlreadyExists: # ignore if files already exist pass except Exception: - errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, - datasetName, - srcRSE, - fileList) + errMsg = "Could not add files to DS - {0}:{1} rse - {2} files - {3}".format(datasetScope, datasetName, srcRSE, fileList) core_utils.dump_error_message(tmpLog) tmpLog.error(errMsg) - return None,errMsg + return None, errMsg # add rule try: tmpDID = dict() - tmpDID['scope'] = datasetScope - tmpDID['name'] = datasetName - tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, - lifetime=30 * 24 * 60 * 60) + tmpDID["scope"] = datasetScope + tmpDID["name"] = datasetName + tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, lifetime=30 * 24 * 60 * 60) ruleIDs = tmpRet[0] - tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, - str(ruleIDs))) + tmpLog.debug("registered dataset {0}:{1} with rule {2}".format(datasetScope, datasetName, str(ruleIDs))) except DuplicateRule: # ignore duplicated rule - tmpLog.debug('rule is already available') + tmpLog.debug("rule is already available") except Exception: - errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName) + errMsg = "Error creating rule for dataset {0}:{1}".format(datasetScope, datasetName) core_utils.dump_error_message(tmpLog) tmpLog.debug(errMsg) - #raise - return None,errMsg + # raise + return None, errMsg # update file group status - self.dbInterface.update_file_group_status(transferID, 'hopping') + self.dbInterface.update_file_group_status(transferID, "hopping") except Exception: core_utils.dump_error_message(tmpLog) # treat as a temporary error tmpStat = None - tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName) + tmpMsg = "failed to add a rule for {0}:{1}".format(datasetScope, datasetName) # release lock self.dbInterface.release_object_lock(transferID) # escape if already failed @@ -248,6 +230,6 @@ def check_stage_out_status(self, jobspec): break # all done if tmpStat is True: - self.set_FileSpec_status(jobspec, 'finished') - tmpLog.debug('done with {0} : {1}'.format(tmpStat, 
tmpMsg)) + self.set_FileSpec_status(jobspec, "finished") + tmpLog.debug("done with {0} : {1}".format(tmpStat, tmpMsg)) return tmpStat, tmpMsg diff --git a/pandaharvester/harvesterstager/go_stager.py b/pandaharvester/harvesterstager/go_stager.py index ffa80a8e..687ab399 100644 --- a/pandaharvester/harvesterstager/go_stager.py +++ b/pandaharvester/harvesterstager/go_stager.py @@ -12,9 +12,10 @@ # TO BE REMOVED for python2.7 import requests.packages.urllib3 + try: requests.packages.urllib3.disable_warnings() -except: +except BaseException: pass from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase @@ -23,14 +24,13 @@ from pandaharvester.harvestermisc import globus_utils # logger -_logger = core_utils.setup_logger('go_stager') +_logger = core_utils.setup_logger("go_stager") def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) # plugin for stager with FTS @@ -39,32 +39,32 @@ class GlobusStager(PluginBase): def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) # create Globus Transfer Client - tmpLog = self.make_logger(_logger, method_name='GlobusStager __init__ ') + tmpLog = self.make_logger(_logger, method_name="GlobusStager __init__ ") try: self.tc = None # need to get client_id and refresh_token from PanDA server via harvester cache mechanism - tmpLog.debug('about to call dbInterface.get_cache(globus_secret)') - c_data = self.dbInterface.get_cache('globus_secret') - if (not c_data == None) and c_data.data['StatusCode'] == 0 : - tmpLog.debug('Got the globus_secrets from PanDA') - self.client_id = c_data.data['publicKey'] # client_id - self.refresh_token = c_data.data['privateKey'] # refresh_token - tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog,self.client_id,self.refresh_token) - if not tmpStat: - self.tc = None - errStr = 'failed to create Globus Transfer Client' - tmpLog.error(errStr) - else : - self.client_id = None - self.refresh_token = None - self.tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' - tmpLog.error(errStr) - except: + tmpLog.debug("about to call dbInterface.get_cache(globus_secret)") + c_data = self.dbInterface.get_cache("globus_secret") + if (c_data is not None) and c_data.data["StatusCode"] == 0: + tmpLog.debug("Got the globus_secrets from PanDA") + self.client_id = c_data.data["publicKey"] # client_id + self.refresh_token = c_data.data["privateKey"] # refresh_token + tmpStat, self.tc = globus_utils.create_globus_transfer_client(tmpLog, self.client_id, self.refresh_token) + if not tmpStat: + self.tc = None + errStr = "failed to create Globus Transfer Client" + tmpLog.error(errStr) + else: + self.client_id = None + self.refresh_token = None + self.tc = None + errStr = "failed to get Globus Client ID and Refresh Token" + tmpLog.error(errStr) + except BaseException: core_utils.dump_error_message(tmpLog) - # set FileSpec.status - def set_FileSpec_status(self,jobspec,status): + # set FileSpec.status + def set_FileSpec_status(self, jobspec, status): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.status = status @@ -72,172 +72,164 @@ def set_FileSpec_status(self,jobspec,status): # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - 
tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") # get label label = self.make_label(jobspec) - tmpLog.debug('label={0}'.format(label)) + tmpLog.debug("label={0}".format(label)) # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label) + tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label) # return a temporary error when failed to get task if not tmpStat: - errStr = 'failed to get transfer task' + errStr = "failed to get transfer task" tmpLog.error(errStr) return None, errStr # return a fatal error when task is missing # FIXME retry instead? if label not in transferTasks: - errStr = 'transfer task is missing' + errStr = "transfer task is missing" tmpLog.error(errStr) return False, errStr # succeeded - transferID = transferTasks[label]['task_id'] - if transferTasks[label]['status'] == 'SUCCEEDED': - tmpLog.debug('transfer task {} succeeded'.format(transferID)) - self.set_FileSpec_status(jobspec,'finished') - return True, '' + transferID = transferTasks[label]["task_id"] + if transferTasks[label]["status"] == "SUCCEEDED": + tmpLog.debug("transfer task {} succeeded".format(transferID)) + self.set_FileSpec_status(jobspec, "finished") + return True, "" # failed - if transferTasks[label]['status'] == 'FAILED': - errStr = 'transfer task {} failed'.format(transferID) + if transferTasks[label]["status"] == "FAILED": + errStr = "transfer task {} failed".format(transferID) tmpLog.error(errStr) - self.set_FileSpec_status(jobspec,'failed') + self.set_FileSpec_status(jobspec, "failed") return False, errStr # another status - tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[label]['status']) + tmpStr = "transfer task {0} status: {1}".format(transferID, transferTasks[label]["status"]) tmpLog.debug(tmpStr) - return None, '' - + return None, "" # trigger stage out + def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # default return - tmpRetVal = (True, '') + tmpRetVal = (True, "") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # test we have a Globus Transfer Client - if not self.tc : - errStr = 'failed to get Globus Transfer Client' + if not self.tc: + errStr = "failed to get Globus Transfer Client" tmpLog.error(errStr) return False, errStr # get label label = self.make_label(jobspec) - tmpLog.debug('label={0}'.format(label)) + tmpLog.debug("label={0}".format(label)) # get transfer tasks - tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog,self.tc,label) + tmpStat, transferTasks = globus_utils.get_transfer_tasks(tmpLog, self.tc, label) if not tmpStat: - errStr = 'failed to get transfer tasks' + errStr = "failed to get transfer tasks" tmpLog.error(errStr) return False, errStr # check if already queued if label in 
transferTasks: - tmpLog.debug('skip since already queued with {0}'.format(str(transferTasks[label]))) - return True, '' - # set the Globus destination Endpoint id and path will get them from Agis eventually + tmpLog.debug("skip since already queued with {0}".format(str(transferTasks[label]))) + return True, "" + # set the Globus destination Endpoint id and path will get them from Agis eventually from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper + queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) - #self.Globus_srcPath = queueConfig.stager['Globus_srcPath'] - self.srcEndpoint = queueConfig.stager['srcEndpoint'] + # self.Globus_srcPath = queueConfig.stager['Globus_srcPath'] + self.srcEndpoint = queueConfig.stager["srcEndpoint"] self.Globus_srcPath = self.basePath - self.Globus_dstPath = queueConfig.stager['Globus_dstPath'] - self.dstEndpoint = queueConfig.stager['dstEndpoint'] - # Test the endpoints and create the transfer data class + self.Globus_dstPath = queueConfig.stager["Globus_dstPath"] + self.dstEndpoint = queueConfig.stager["dstEndpoint"] + # Test the endpoints and create the transfer data class errMsg = None try: # Test endpoints for activation - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.srcEndpoint) - tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,self.tc,self.dstEndpoint) + tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.srcEndpoint) + tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, self.tc, self.dstEndpoint) if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' + errStr = "source Endpoint and destination Endpoint activated" tmpLog.debug(errStr) else: - errMsg = '' - if not tmpStatsrc : - errMsg += ' source Endpoint not activated ' - if not tmpStatdst : - errMsg += ' destination Endpoint not activated ' + errMsg = "" + if not tmpStatsrc: + errMsg += " source Endpoint not activated " + if not tmpStatdst: + errMsg += " destination Endpoint not activated " tmpLog.error(errMsg) - tmpRetVal = (False,errMsg) + tmpRetVal = (False, errMsg) return tmpRetVal # both endpoints activated now prepare to transfer data - tdata = TransferData(self.tc, - self.srcEndpoint, - self.dstEndpoint, - label=label, - sync_level="checksum") - except: - errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) + tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, label=label, sync_level="checksum") + except BaseException: + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) tmpRetVal = (errStat, errMsg) return tmpRetVal # loop over all files fileAttrs = jobspec.get_output_file_attributes() lfns = [] for fileSpec in jobspec.outFiles: - scope = fileAttrs[fileSpec.lfn]['scope'] + scope = fileAttrs[fileSpec.lfn]["scope"] hash = hashlib.md5() - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) + correctedscope = "/".join(scope.split(".")) srcURL = fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.Globus_dstPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.Globus_dstPath, scope=correctedscope, 
hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) # add files to transfer object - tdata if os.access(srcURL, os.R_OK): - tmpLog.debug("tdata.add_item({},{})".format(srcURL,dstURL)) - tdata.add_item(srcURL,dstURL) + tmpLog.debug("tdata.add_item({},{})".format(srcURL, dstURL)) + tdata.add_item(srcURL, dstURL) lfns.append(fileSpec.lfn) else: errMsg = "source file {} does not exist".format(srcURL) tmpLog.error(errMsg) - tmpRetVal = (False,errMsg) + tmpRetVal = (False, errMsg) return tmpRetVal - # submit transfer + # submit transfer try: transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) - if transfer_result['code'] == "Accepted": + if transfer_result["code"] == "Accepted": # succeeded # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - tmpLog.debug('successfully submitted id={0}'.format(transferID)) - jobspec.set_groups_to_files({transferID: {'lfns': lfns, 'groupStatus': 'active'}}) + transferID = transfer_result["task_id"] + tmpLog.debug("successfully submitted id={0}".format(transferID)) + jobspec.set_groups_to_files({transferID: {"lfns": lfns, "groupStatus": "active"}}) # set for fileSpec in jobspec.outFiles: - if fileSpec.fileAttributes == None: + if fileSpec.fileAttributes is None: fileSpec.fileAttributes = {} - fileSpec.fileAttributes['transferID'] = transferID + fileSpec.fileAttributes["transferID"] = transferID else: - tmpRetVal = (False, transfer_result['message']) + tmpRetVal = (False, transfer_result["message"]) except Exception as e: - errStat,errMsg = globus_utils.handle_globus_exception(tmpLog) + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) if errMsg is None: errtype, errvalue = sys.exc_info()[:2] errMsg = "{0} {1}".format(errtype.__name__, errvalue) - tmpRetVal = (errStat,errMsg) + tmpRetVal = (errStat, errMsg) # return - tmpLog.debug('done') + tmpLog.debug("done") return tmpRetVal # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") + tmpLog.debug("start") try: for fileSpec in jobspec.outFiles: if self.zipDir == "${SRCDIR}": @@ -249,27 +241,26 @@ def zip_output(self, jobspec): # remove zip file just in case try: os.remove(zipPath) - except: + except BaseException: pass # make zip file with zipfile.ZipFile(zipPath, "w", zipfile.ZIP_STORED) as zf: for assFileSpec in fileSpec.associatedFiles: - zf.write(assFileSpec.path,os.path.basename(assFileSpec.path)) + zf.write(assFileSpec.path, os.path.basename(assFileSpec.path)) # set path fileSpec.path = zipPath # get size statInfo = os.stat(zipPath) fileSpec.fsize = statInfo.st_size - except: + except BaseException: errMsg = core_utils.dump_error_message(tmpLog) - return False, 'failed to zip with {0}'.format(errMsg) - tmpLog.debug('done') - return True, '' + return False, "failed to zip with {0}".format(errMsg) + tmpLog.debug("done") + return True, "" # make label for transfer task def make_label(self, jobspec): - return "OUT-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, - PandaID=jobspec.PandaID) + return "OUT-{computingSite}-{PandaID}".format(computingSite=jobspec.computingSite, PandaID=jobspec.PandaID) # resolve input file paths def resolve_input_paths(self, 
jobspec): @@ -277,7 +268,7 @@ def resolve_input_paths(self, jobspec): inFiles = jobspec.get_input_file_attributes() # set path to each file for inLFN, inFile in iteritems(inFiles): - inFile['path'] = mover_utils.construct_file_path(self.basePath, inFile['scope'], inLFN) + inFile["path"] = mover_utils.construct_file_path(self.basePath, inFile["scope"], inLFN) # set jobspec.set_input_file_paths(inFiles) - return True, '' + return True, "" diff --git a/pandaharvester/harvesterstager/gridftp_stager.py b/pandaharvester/harvesterstager/gridftp_stager.py index b2678551..2b93dc04 100644 --- a/pandaharvester/harvesterstager/gridftp_stager.py +++ b/pandaharvester/harvesterstager/gridftp_stager.py @@ -1,6 +1,7 @@ import os import re import tempfile + try: import subprocess32 as subprocess except Exception: @@ -11,7 +12,7 @@ from pandaharvester.harvestermover import mover_utils # logger -baseLogger = core_utils.setup_logger('gridftp_stager') +baseLogger = core_utils.setup_logger("gridftp_stager") # stager plugin with GridFTP @@ -37,10 +38,12 @@ "intermediateBasePaths":[ ["file:///nfs/at3/scratch/at3sgm001/", "gsiftp://some.dtn.server//nfs/at3/scratch/at3sgm001/"], "gsiftp://another.dtn.server//scratch/data/" ] } """ + + class GridFtpStager(BaseStager): # constructor def __init__(self, **kwarg): - self.scopeForTmp = 'transient' + self.scopeForTmp = "transient" self.pathConvention = 1000 self.objstoreID = None self.gulOpts = None @@ -52,30 +55,29 @@ def __init__(self, **kwarg): # check status def check_stage_out_status(self, jobspec): for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' + fileSpec.status = "finished" fileSpec.pathConvention = self.pathConvention fileSpec.objstoreID = self.objstoreID - return True, '' + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # loop over all files gucInput = None is_multistep = isinstance(self.intermediateBasePaths, list) and len(self.intermediateBasePaths) > 0 guc_inputs_list = [None] * (len(self.intermediateBasePaths) + 1) if is_multistep else [] for fileSpec in jobspec.outFiles: # skip if already done - if fileSpec.status in ['finished', 'failed']: + if fileSpec.status in ["finished", "failed"]: continue # scope - if fileSpec.fileType in ['es_output', 'zip_output']: + if fileSpec.fileType in ["es_output", "zip_output"]: scope = self.scopeForTmp else: - scope = fileSpec.fileAttributes.get('scope') + scope = fileSpec.fileAttributes.get("scope") if scope is None: scope = fileSpec.scope # construct source and destination paths @@ -85,8 +87,8 @@ def trigger_stage_out(self, jobspec): if is_multistep: # multi-step transfer for ibp_i in range(len(self.intermediateBasePaths) + 1): - base_paths_old = self.intermediateBasePaths[ibp_i - 1] if ibp_i > 0 else '' - base_paths_new = self.intermediateBasePaths[ibp_i] if ibp_i < len(self.intermediateBasePaths) else '' + base_paths_old = self.intermediateBasePaths[ibp_i - 1] if ibp_i > 0 else "" + base_paths_new = self.intermediateBasePaths[ibp_i] if ibp_i < len(self.intermediateBasePaths) else "" src_base = base_paths_old[1] if isinstance(base_paths_old, list) else base_paths_old dst_base = base_paths_new[0] if isinstance(base_paths_new, list) else base_paths_new # construct 
temporary source and destination paths @@ -94,7 +96,7 @@ def trigger_stage_out(self, jobspec): tmp_dest_path = re.sub(self.srcNewBasePath, dst_base, srcPath) # make input for globus-url-copy if guc_inputs_list[ibp_i] is None: - guc_inputs_list[ibp_i] = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_guc_out_{0}.tmp'.format(ibp_i)) + guc_inputs_list[ibp_i] = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_guc_out_{0}.tmp".format(ibp_i)) guc_input = guc_inputs_list[ibp_i] if ibp_i == 0: guc_input.write("{0} {1}\n".format(srcPath, tmp_dest_path)) @@ -109,46 +111,46 @@ def trigger_stage_out(self, jobspec): # single-step transfer # make input for globus-url-copy if gucInput is None: - gucInput = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_guc_out.tmp') + gucInput = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_guc_out.tmp") gucInput.write("{0} {1}\n".format(srcPath, dstPath)) fileSpec.attemptNr += 1 # nothing to transfer if is_multistep: for guc_input in guc_inputs_list: if guc_input is None: - tmpLog.debug('done with no transfers (multistep)') - return True, '' + tmpLog.debug("done with no transfers (multistep)") + return True, "" else: if gucInput is None: - tmpLog.debug('done with no transfers') - return True, '' + tmpLog.debug("done with no transfers") + return True, "" # transfer if is_multistep: - [ guc_input.close() for guc_input in guc_inputs_list ] - tmpLog.debug('start multistep transfer') + [guc_input.close() for guc_input in guc_inputs_list] + tmpLog.debug("start multistep transfer") guc_input_i = 1 for guc_input in guc_inputs_list: - args = ['globus-url-copy', '-f', guc_input.name, '-cd'] + args = ["globus-url-copy", "-f", guc_input.name, "-cd"] if self.gulOpts is not None: args += self.gulOpts.split() try: - tmpLog.debug('execute: ' + ' '.join(args)) + tmpLog.debug("execute: " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: stdout, stderr = p.communicate(timeout=self.timeout) except subprocess.TimeoutExpired: p.kill() stdout, stderr = p.communicate() - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") return_code = p.returncode if stdout is not None: if not isinstance(stdout, str): stdout = stdout.decode() - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: if not isinstance(stderr, str): stderr = stderr.decode() - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) except Exception: @@ -156,43 +158,43 @@ def trigger_stage_out(self, jobspec): return_code = 1 os.remove(guc_input.name) if return_code == 0: - tmpLog.debug('step {0} succeeded'.format(guc_input_i)) + tmpLog.debug("step {0} succeeded".format(guc_input_i)) guc_input_i += 1 else: - errMsg = 'step {0} failed with {1}'.format(guc_input_i, return_code) + errMsg = "step {0} failed with {1}".format(guc_input_i, return_code) tmpLog.error(errMsg) # check attemptNr for fileSpec in jobspec.inFiles: if fileSpec.attemptNr >= self.maxAttempts: - errMsg = 'gave up due to max attempts' + errMsg = "gave up due to max attempts" tmpLog.error(errMsg) return (False, errMsg) return None, errMsg - tmpLog.debug('multistep transfer ({0} steps) succeeded'.format(len(guc_inputs_list))) - return True, '' + tmpLog.debug("multistep transfer ({0} steps) succeeded".format(len(guc_inputs_list))) + return True, "" else: gucInput.close() - args = ['globus-url-copy', '-f', gucInput.name, '-cd'] 
+ args = ["globus-url-copy", "-f", gucInput.name, "-cd"] if self.gulOpts is not None: args += self.gulOpts.split() try: - tmpLog.debug('execute: ' + ' '.join(args)) + tmpLog.debug("execute: " + " ".join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) try: stdout, stderr = p.communicate(timeout=self.timeout) except subprocess.TimeoutExpired: p.kill() stdout, stderr = p.communicate() - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") return_code = p.returncode if stdout is not None: if not isinstance(stdout, str): stdout = stdout.decode() - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: if not isinstance(stderr, str): stderr = stderr.decode() - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) except Exception: @@ -200,15 +202,15 @@ def trigger_stage_out(self, jobspec): return_code = 1 os.remove(gucInput.name) if return_code == 0: - tmpLog.debug('succeeded') - return True, '' + tmpLog.debug("succeeded") + return True, "" else: - errMsg = 'failed with {0}'.format(return_code) + errMsg = "failed with {0}".format(return_code) tmpLog.error(errMsg) # check attemptNr for fileSpec in jobspec.inFiles: if fileSpec.attemptNr >= self.maxAttempts: - errMsg = 'gave up due to max attempts' + errMsg = "gave up due to max attempts" tmpLog.error(errMsg) return (False, errMsg) return None, errMsg diff --git a/pandaharvester/harvesterstager/rse_direct_stager.py b/pandaharvester/harvesterstager/rse_direct_stager.py index 47c18ccb..34c822c8 100644 --- a/pandaharvester/harvesterstager/rse_direct_stager.py +++ b/pandaharvester/harvesterstager/rse_direct_stager.py @@ -2,30 +2,31 @@ from .base_stager import BaseStager # logger -baseLogger = core_utils.setup_logger('rse_direct_stager') +baseLogger = core_utils.setup_logger("rse_direct_stager") # stager plugin with RSE + no data motion class RseDirectStager(BaseStager): """In the workflow for RseDirectStager, workers directly upload output files to RSE and thus there is no data motion in Harvester.""" + # constructor + def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) # check status def check_stage_out_status(self, jobspec): for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' - return True, '' + fileSpec.status = "finished" + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): - return True, '' + return True, "" # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/rucio_rse_direct_stager.py b/pandaharvester/harvesterstager/rucio_rse_direct_stager.py index 59e55a62..690051b5 100644 --- a/pandaharvester/harvesterstager/rucio_rse_direct_stager.py +++ b/pandaharvester/harvesterstager/rucio_rse_direct_stager.py @@ -1,6 +1,5 @@ from rucio.client import Client as RucioClient -from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, \ - FileAlreadyExists +from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, FileAlreadyExists from pandaharvester.harvestercore import core_utils from 
pandaharvester.harvesterstager import yoda_rucio_rse_direct_stager @@ -8,234 +7,220 @@ # logger -_logger = core_utils.setup_logger('rucio_rse_direct_stager') +_logger = core_utils.setup_logger("rucio_rse_direct_stager") + +# stager plugin to use Rucio to transfer files from local RSE via local sitemover to nucleus -# stager plugin to use Rucio to transfer files from local RSE via local sitemover to nucleus -class RucioRseDirectStager(YodaRseDirectStager): +class RucioRseDirectStager(YodaRseDirectStager): # constructor def __init__(self, **kwarg): YodaRseDirectStager.__init__(self, **kwarg) self.changeFileStatusOnSuccess = False - # set FileSpec.objstoreID - def set_FileSpec_objstoreID(self,jobspec, objstoreID, pathConvention): + # set FileSpec.objstoreID + def set_FileSpec_objstoreID(self, jobspec, objstoreID, pathConvention): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.objstoreID = objstoreID fileSpec.pathConvention = pathConvention + # set FileSpec.status - # set FileSpec.status - def set_FileSpec_status(self,jobspec,status): + def set_FileSpec_status(self, jobspec, status): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.status = status - # check status of Rucio transfer # check status + def check_stage_out_status(self, jobspec): tmpStat = True - tmpMsg = '' + tmpMsg = "" # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='check_stage_out_status') - tmpLog.debug('start') - # Get the files grouped by Rucio Rule ID + tmpLog = self.make_logger( + baseLogger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="check_stage_out_status" + ) + tmpLog.debug("start") + # Get the files grouped by Rucio Rule ID groups = jobspec.get_groups_of_output_files() if len(groups) == 0: - tmpLog.debug('No Rucio Rules') - return None,'No Rucio Rules' - tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups),groups)) - + tmpLog.debug("No Rucio Rules") + return None, "No Rucio Rules" + tmpLog.debug("#Rucio Rules - {0} - Rules - {1}".format(len(groups), groups)) + try: rucioAPI = RucioClient() - except: - tmpLog.error('failure to get Rucio Client try again later') - return None,'failure to get Rucio Client try again later' + except BaseException: + tmpLog.error("failure to get Rucio Client try again later") + return None, "failure to get Rucio Client try again later" - # loop over the Rucio rules + # loop over the Rucio rules for rucioRule in groups: if rucioRule is None: continue # lock have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120) if not have_db_lock: - msgStr = 'escape since {0} is locked by another thread'.format(rucioRule) + msgStr = "escape since {0} is locked by another thread".format(rucioRule) tmpLog.debug(msgStr) return None, msgStr # get transfer status groupStatus = self.dbInterface.get_file_group_status(rucioRule) - if 'transferred' in groupStatus : + if "transferred" in groupStatus: # already succeeded pass - elif 'failed' in groupStatus : + elif "failed" in groupStatus: # transfer failure tmpStat = False - tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName) - elif 'transferring' in groupStatus: + tmpMsg = "rucio rule for {0}:{1} already failed".format(datasetScope, datasetName) + elif "transferring" in groupStatus: # transfer started in Rucio check status try: - result = rucioAPI.get_replication_rule(rucioRule,False) - if 
result['state'] == "OK" : + result = rucioAPI.get_replication_rule(rucioRule, False) + if result["state"] == "OK": # files transfered to nucleus - tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule)) - self.dbInterface.update_file_group_status(rucioRule, 'transferred') - # set the fileSpec status for these files - self.set_FileSpec_status(jobspec,'finished') - elif result['state'] == "FAILED" : + tmpLog.debug("Files for Rucio Rule {0} successfully transferred".format(rucioRule)) + self.dbInterface.update_file_group_status(rucioRule, "transferred") + # set the fileSpec status for these files + self.set_FileSpec_status(jobspec, "finished") + elif result["state"] == "FAILED": # failed Rucio Transfer tmpStat = False - tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule) + tmpMsg = "Failed Rucio Transfer - Rucio Rule - {0}".format(rucioRule) tmpLog.debug(tmpMsg) - self.set_FileSpec_status(jobspec,'failed') - elif result['state'] == 'STUCK' : + self.set_FileSpec_status(jobspec, "failed") + elif result["state"] == "STUCK": tmpStat = None - tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule) + tmpMsg = "Rucio Transfer Rule {0} Stuck".format(rucioRule) tmpLog.debug(tmpMsg) - except: - tmpStat = None - tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule) - tmpLog.error(tmpMsg) - pass + except BaseException: + tmpStat = None + tmpMsg = "Could not get information or Rucio Rule {0}".format(rucioRule) + tmpLog.error(tmpMsg) + pass # release the lock if have_db_lock: - tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule)) - release_db_lock = self.dbInterface.release_object_lock(rucioRule) + tmpLog.debug("attempt to release DB lock for Rucio Rule {0}".format(rucioRule)) + release_db_lock = self.dbInterface.release_object_lock(rucioRule) if release_db_lock: - tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule)) - have_db_lock = False + tmpLog.debug("released DB lock for rucioRule - {0}".format(rucioRule)) + have_db_lock = False else: - msgStr = ' Could not release DB lock for {}'.format(rucioRule) + msgStr = " Could not release DB lock for {}".format(rucioRule) tmpLog.error(msgStr) return None, msgStr - tmpLog.debug('stop') - return tmpStat, tmpMsg - + tmpLog.debug("stop") + return tmpStat, tmpMsg # trigger stageout via Rucio to nucleus site + def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0} ThreadID={1} '.format(jobspec.PandaID, - threading.current_thread().ident), - method_name='trigger_stage_out') - tmpLog.debug('executing base trigger_stage_out') + tmpLog = self.make_logger( + _logger, "PandaID={0} ThreadID={1} ".format(jobspec.PandaID, threading.current_thread().ident), method_name="trigger_stage_out" + ) + tmpLog.debug("executing base trigger_stage_out") tmpStat, tmpMsg = YodaRseDirect.trigger_stage_out(self, jobspec) - tmpLog.debug('got {0} {1}'.format(tmpStat, tmpMsg)) + tmpLog.debug("got {0} {1}".format(tmpStat, tmpMsg)) if tmpStat is not True: return tmpStat, tmpMsg # Now that output files have been all copied to Local RSE register transient dataset # loop over all transfers tmpStat = None - tmpMsg = '' + tmpMsg = "" srcRSE = None dstRSE = None - datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID,str(uuid.uuid4())) - datasetScope = 'transient' + datasetName = "panda.harvester.{0}.{1}".format(jobspec.PandaID, str(uuid.uuid4())) + datasetScope = "transient" # get destination endpoint - nucleus = 
jobspec.jobParams['nucleus'] - agis = self.dbInterface.get_cache('panda_queues.json').data - dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] - + nucleus = jobspec.jobParams["nucleus"] + agis = self.dbInterface.get_cache("panda_queues.json").data + dstRSE = [agis[x]["astorages"]["pr"][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] + # get the list of output files to transfer fileSpecs = jobspec.get_output_file_specs(skip_done=True) fileList = [] lfns = [] for fileSpec in fileSpecs: tmpFile = dict() - tmpFile['scope'] = datasetScope - tmpFile['name'] = fileSpec.lfn - tmpFile['bytes'] = fileSpec.fsize - tmpFile['adler32'] = fileSpec.chksum - tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']} + tmpFile["scope"] = datasetScope + tmpFile["name"] = fileSpec.lfn + tmpFile["bytes"] = fileSpec.fsize + tmpFile["adler32"] = fileSpec.chksum + tmpFile["meta"] = {"guid": fileSpec.fileAttributes["guid"]} fileList.append(tmpFile) lfns.append(fileSpec.lfn) # get source RSE if srcRSE is None and fileSpec.objstoreID is not None: - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0] # test that srcRSE and dstRSE are defined - errStr = '' + errStr = "" if srcRSE is None: - errStr = 'Source RSE is not defined ' + errStr = "Source RSE is not defined " if dstRSE is None: - errStr = errStr + ' Desitination RSE is not defined' - if (srcRSE is None) or (dstRSE is None) : - tmpLog.error(errStr) - return False,errStr + errStr = errStr + " Desitination RSE is not defined" + if (srcRSE is None) or (dstRSE is None): + tmpLog.error(errStr) + return False, errStr - # create the dataset and add files to it and create a transfer rule try: # register dataset - tmpLog.debug('register {0}:{1}'.format(datasetScope, datasetName)) + tmpLog.debug("register {0}:{1}".format(datasetScope, datasetName)) rucioAPI = RucioClient() try: - rucioAPI.add_dataset(datasetScope, datasetName, - meta={'hidden': True}, - lifetime=30 * 24 * 60 * 60, - rse=srcRSE - ) + rucioAPI.add_dataset(datasetScope, datasetName, meta={"hidden": True}, lifetime=30 * 24 * 60 * 60, rse=srcRSE) except DataIdentifierAlreadyExists: # ignore even if the dataset already exists pass except Exception: - tmpLog.error('Could not create dataset with scope: {0} Name: {1} in Rucio' - .format(datasetScope,datasetName)) + tmpLog.error("Could not create dataset with scope: {0} Name: {1} in Rucio".format(datasetScope, datasetName)) raise # add files to dataset try: - rucioAPI.add_files_to_datasets([{'scope': datasetScope, - 'name': datasetName, - 'dids': fileList, - 'rse': srcRSE}], - ignore_duplicate=True) + rucioAPI.add_files_to_datasets([{"scope": datasetScope, "name": datasetName, "dids": fileList, "rse": srcRSE}], ignore_duplicate=True) except FileAlreadyExists: # ignore if files already exist pass except Exception: - tmpLog.error('Could add files to dataset with scope: {0} Name: {1} in Rucio' - .format(datasetScope,datasetName)) + tmpLog.error("Could add files to dataset with scope: {0} Name: {1} in Rucio".format(datasetScope, datasetName)) raise # add rule try: tmpDID = dict() - tmpDID['scope'] = datasetScope - tmpDID['name'] = datasetName - tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, - lifetime=30 * 24 * 60 * 60) + tmpDID["scope"] = datasetScope + tmpDID["name"] = datasetName + tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, lifetime=30 * 
24 * 60 * 60) ruleIDs = tmpRet[0] - tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, - str(ruleIDs))) + tmpLog.debug("registered dataset {0}:{1} with rule {2}".format(datasetScope, datasetName, str(ruleIDs))) # group the output files together by the Rucio transfer rule - jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns,'groupStatus': 'pending'}}) - msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs,lfns) + jobspec.set_groups_to_files({ruleIDs: {"lfns": lfns, "groupStatus": "pending"}}) + msgStr = "jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending".format(ruleIDs, lfns) tmpLog.debug(msgStr) - tmpLog.debug('call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)') - tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,'pending') - tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)') + tmpLog.debug("call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)") + tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), ruleIDs, "pending") + tmpLog.debug("called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)") tmpStat = True - tmpMsg = 'created Rucio rule successfully' + tmpMsg = "created Rucio rule successfully" except DuplicateRule: # ignore duplicated rule - tmpLog.debug('rule is already available') + tmpLog.debug("rule is already available") except Exception: - tmpLog.debug('Error creating rule for dataset {0}:{1}' - .format(datasetScope, datasetName)) + tmpLog.debug("Error creating rule for dataset {0}:{1}".format(datasetScope, datasetName)) raise # update file group status - self.dbInterface.update_file_group_status(ruleIDs, 'transferring') + self.dbInterface.update_file_group_status(ruleIDs, "transferring") except Exception: - core_utils.dump_error_message(tmpLog) - # treat as a temporary error - tmpStat = None - tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName) - - return tmpStat,tmpMsg - + core_utils.dump_error_message(tmpLog) + # treat as a temporary error + tmpStat = None + tmpMsg = "failed to add a rule for {0}:{1}".format(datasetScope, datasetName) + return tmpStat, tmpMsg diff --git a/pandaharvester/harvesterstager/rucio_stager.py b/pandaharvester/harvesterstager/rucio_stager.py index 2e7613e7..fa4aab19 100644 --- a/pandaharvester/harvesterstager/rucio_stager.py +++ b/pandaharvester/harvesterstager/rucio_stager.py @@ -13,7 +13,7 @@ from rucio.common.exception import RuleNotFound # logger -baseLogger = core_utils.setup_logger('rucio_stager') +baseLogger = core_utils.setup_logger("rucio_stager") # plugin for stage-out with Rucio @@ -21,63 +21,61 @@ class RucioStager(BaseStager): # constructor def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - if not hasattr(self, 'scopeForTmp'): - self.scopeForTmp = 'panda' + if not hasattr(self, "scopeForTmp"): + self.scopeForTmp = "panda" # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") # loop over all files 
allChecked = True oneErrMsg = None transferStatus = dict() for fileSpec in jobspec.outFiles: # skip already don - if fileSpec.status in ['finished', 'failed']: + if fileSpec.status in ["finished", "failed"]: continue # get transfer ID - transferID = fileSpec.fileAttributes['transferID'] + transferID = fileSpec.fileAttributes["transferID"] if transferID not in transferStatus: # get status try: rucioAPI = RucioClient() ruleInfo = rucioAPI.get_replication_rule(transferID) - tmpTransferStatus = ruleInfo['state'] - tmpLog.debug('got state={0} for rule={1}'.format(tmpTransferStatus, transferID)) + tmpTransferStatus = ruleInfo["state"] + tmpLog.debug("got state={0} for rule={1}".format(tmpTransferStatus, transferID)) except RuleNotFound: - tmpLog.error('rule {0} not found'.format(transferID)) - tmpTransferStatus = 'FAILED' - except: + tmpLog.error("rule {0} not found".format(transferID)) + tmpTransferStatus = "FAILED" + except BaseException: err_type, err_value = sys.exc_info()[:2] errMsg = "{0} {1}".format(err_type.__name__, err_value) - tmpLog.error('failed to get status for rule={0} with {1}'.format(transferID, errMsg)) + tmpLog.error("failed to get status for rule={0} with {1}".format(transferID, errMsg)) # set dummy not to lookup again tmpTransferStatus = None allChecked = False # keep one message if oneErrMsg is None: oneErrMsg = errMsg - tmpTransferStatus = 'OK' + tmpTransferStatus = "OK" transferStatus[transferID] = tmpTransferStatus # final status - if transferStatus[transferID] == 'OK': - fileSpec.status = 'finished' - elif transferStatus[transferID] in ['FAILED', 'CANCELED']: - fileSpec.status = 'failed' + if transferStatus[transferID] == "OK": + fileSpec.status = "finished" + elif transferStatus[transferID] in ["FAILED", "CANCELED"]: + fileSpec.status = "failed" if allChecked: - return True, '' + return True, "" else: return False, oneErrMsg # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # loop over all files files = dict() transferIDs = dict() @@ -88,39 +86,39 @@ def trigger_stage_out(self, jobspec): if fileSpec.zipFileID is not None: continue # skip if already processed - if 'transferDataset' in fileSpec.fileAttributes: + if "transferDataset" in fileSpec.fileAttributes: if fileSpec.fileType not in transferDatasets: - transferDatasets[fileSpec.fileType] = fileSpec.fileAttributes['transferDataset'] + transferDatasets[fileSpec.fileType] = fileSpec.fileAttributes["transferDataset"] if fileSpec.fileType not in transferIDs: - transferIDs[fileSpec.fileType] = fileSpec.fileAttributes['transferID'] + transferIDs[fileSpec.fileType] = fileSpec.fileAttributes["transferID"] continue # set OS ID - if fileSpec.fileType == ['es_output', 'zip_output']: + if fileSpec.fileType == ["es_output", "zip_output"]: fileSpec.objstoreID = self.objStoreID_ES # make path where file is copied for transfer - if fileSpec.fileType != 'zip_output': - scope = fileAttrs[fileSpec.lfn]['scope'] - datasetName = fileAttrs[fileSpec.lfn]['dataset'] + if fileSpec.fileType != "zip_output": + scope = fileAttrs[fileSpec.lfn]["scope"] + datasetName = fileAttrs[fileSpec.lfn]["dataset"] else: # use panda scope for zipped files scope = self.scopeForTmp - datasetName = 'dummy' + datasetName = "dummy" srcPath = fileSpec.path dstPath = 
mover_utils.construct_file_path(self.srcBasePath, scope, fileSpec.lfn) # remove if os.path.exists(dstPath): os.remove(dstPath) # copy - tmpLog.debug('copy src={srcPath} dst={dstPath}'.format(srcPath=srcPath, dstPath=dstPath)) + tmpLog.debug("copy src={srcPath} dst={dstPath}".format(srcPath=srcPath, dstPath=dstPath)) dstDir = os.path.dirname(dstPath) if not os.path.exists(dstDir): os.makedirs(dstDir) shutil.copyfile(srcPath, dstPath) # collect files tmpFile = dict() - tmpFile['scope'] = scope - tmpFile['name'] = fileSpec.lfn - tmpFile['bytes'] = fileSpec.fsize + tmpFile["scope"] = scope + tmpFile["name"] = fileSpec.lfn + tmpFile["bytes"] = fileSpec.fsize if fileSpec.fileType not in files: files[fileSpec.fileType] = [] files[fileSpec.fileType].append(tmpFile) @@ -128,14 +126,14 @@ def trigger_stage_out(self, jobspec): rucioAPI = RucioClient() for fileType, fileList in iteritems(files): # set destination RSE - if fileType in ['es_output', 'zip_output']: + if fileType in ["es_output", "zip_output"]: dstRSE = self.dstRSE_ES - elif fileType == 'output': + elif fileType == "output": dstRSE = self.dstRSE_Out - elif fileType == 'log': + elif fileType == "log": dstRSE = self.dstRSE_Log else: - errMsg = 'unsupported file type {0}'.format(fileType) + errMsg = "unsupported file type {0}".format(fileType) tmpLog.error(errMsg) return (False, errMsg) # skip if destination is None @@ -145,25 +143,18 @@ def trigger_stage_out(self, jobspec): if fileType not in transferDatasets: try: tmpScope = self.scopeForTmp - tmpDS = 'panda.harvester_stage_out.{0}'.format(str(uuid.uuid4())) - rucioAPI.add_dataset(tmpScope, tmpDS, - meta={'hidden': True}, - lifetime=30*24*60*60, - files=fileList, - rse=self.srcRSE - ) + tmpDS = "panda.harvester_stage_out.{0}".format(str(uuid.uuid4())) + rucioAPI.add_dataset(tmpScope, tmpDS, meta={"hidden": True}, lifetime=30 * 24 * 60 * 60, files=fileList, rse=self.srcRSE) transferDatasets[fileType] = tmpDS # add rule tmpDID = dict() - tmpDID['scope'] = tmpScope - tmpDID['name'] = tmpDS - tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, - lifetime=30*24*60*60 - ) + tmpDID["scope"] = tmpScope + tmpDID["name"] = tmpDS + tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, lifetime=30 * 24 * 60 * 60) tmpTransferIDs = tmpRet[0] transferIDs[fileType] = tmpTransferIDs - tmpLog.debug('register dataset {0} with rule {1}'.format(tmpDS, str(tmpTransferIDs))) - except: + tmpLog.debug("register dataset {0} with rule {1}".format(tmpDS, str(tmpTransferIDs))) + except BaseException: errMsg = core_utils.dump_error_message(tmpLog) return (False, errMsg) else: @@ -172,8 +163,8 @@ def trigger_stage_out(self, jobspec): tmpScope = self.scopeForTmp tmpDS = transferDatasets[fileType] rucioAPI.add_files_to_dataset(tmpScope, tmpDS, fileList, self.srcRSE) - tmpLog.debug('added files to {0}'.format(tmpDS)) - except: + tmpLog.debug("added files to {0}".format(tmpDS)) + except BaseException: errMsg = core_utils.dump_error_message(tmpLog) return (False, errMsg) # set transfer datasets and rules @@ -182,28 +173,27 @@ def trigger_stage_out(self, jobspec): if fileSpec.zipFileID is not None: continue # skip already done - if fileSpec.status in ['finished', 'failed']: + if fileSpec.status in ["finished", "failed"]: continue # skip if already processed - if 'transferDataset' in fileSpec.fileAttributes: + if "transferDataset" in fileSpec.fileAttributes: continue # no destination if fileSpec.fileType not in transferDatasets: - fileSpec.status = 'finished' + fileSpec.status = "finished" continue # set dataset 
- fileSpec.fileAttributes['transferDataset'] = transferDatasets[fileSpec.fileType] + fileSpec.fileAttributes["transferDataset"] = transferDatasets[fileSpec.fileType] # set rule - fileSpec.fileAttributes['transferID'] = transferIDs[fileSpec.fileType] + fileSpec.fileAttributes["transferID"] = transferIDs[fileSpec.fileType] # force update - fileSpec.force_update('fileAttributes') + fileSpec.force_update("fileAttributes") # return - tmpLog.debug('done') - return (True, '') + tmpLog.debug("done") + return (True, "") # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc.py b/pandaharvester/harvesterstager/rucio_stager_hpc.py index 95c49a0b..f52c114b 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc.py @@ -11,7 +11,7 @@ # TODO: retry of failed transfers # logger -baseLogger = core_utils.setup_logger('rucio_stager_hpc') +baseLogger = core_utils.setup_logger("rucio_stager_hpc") # plugin for stage-out with Rucio on an HPC site that must copy output elsewhere @@ -19,52 +19,50 @@ class RucioStagerHPC(BaseStager): # constructor def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - if not hasattr(self, 'scopeForTmp'): - self.scopeForTmp = 'panda' - if not hasattr(self, 'pathConvention'): + if not hasattr(self, "scopeForTmp"): + self.scopeForTmp = "panda" + if not hasattr(self, "pathConvention"): self.pathConvention = None - if not hasattr(self, 'objstoreID'): + if not hasattr(self, "objstoreID"): self.objstoreID = None - if not hasattr(self, 'maxAttempts'): + if not hasattr(self, "maxAttempts"): self.maxAttempts = 3 - if not hasattr(self, 'objectstore_additions'): + if not hasattr(self, "objectstore_additions"): self.objectstore_additions = None # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('start') - return (True, '') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") + return (True, "") # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # loop over all files allChecked = True - ErrMsg = 'These files failed to upload : ' - zip_datasetName = 'harvester_stage_out.{0}'.format(str(uuid.uuid4())) + ErrMsg = "These files failed to upload : " + zip_datasetName = "harvester_stage_out.{0}".format(str(uuid.uuid4())) fileAttrs = jobspec.get_output_file_attributes() for fileSpec in jobspec.outFiles: # fileSpec.fileAttributes['transferID'] = None # synchronius transfer # skip already done - tmpLog.debug('file: %s status: %s' % (fileSpec.lfn, fileSpec.status)) - if fileSpec.status in ['finished', 'failed']: + tmpLog.debug("file: %s status: %s" % (fileSpec.lfn, fileSpec.status)) + if fileSpec.status in ["finished", "failed"]: continue fileSpec.pathConvention = self.pathConvention 
fileSpec.objstoreID = self.objstoreID # set destination RSE - if fileSpec.fileType in ['es_output', 'zip_output', 'output']: + if fileSpec.fileType in ["es_output", "zip_output", "output"]: dstRSE = self.dstRSE_Out - elif fileSpec.fileType == 'log': + elif fileSpec.fileType == "log": dstRSE = self.dstRSE_Log else: - errMsg = 'unsupported file type {0}'.format(fileSpec.fileType) + errMsg = "unsupported file type {0}".format(fileSpec.fileType) tmpLog.error(errMsg) return (False, errMsg) # skip if destination is None @@ -72,17 +70,17 @@ def trigger_stage_out(self, jobspec): continue # get/set scope and dataset name - if fileSpec.fileType == 'log': + if fileSpec.fileType == "log": if fileSpec.lfn in fileAttrs: - scope = fileAttrs[fileSpec.lfn]['scope'] - datasetName = fileAttrs[fileSpec.lfn]['dataset'] + scope = fileAttrs[fileSpec.lfn]["scope"] + datasetName = fileAttrs[fileSpec.lfn]["dataset"] else: - lfnWithoutWorkerID = ".".join(fileSpec.lfn.split('.')[:-1]) - scope = fileAttrs[lfnWithoutWorkerID]['scope'] - datasetName = fileAttrs[lfnWithoutWorkerID]['dataset'] - elif fileSpec.fileType != 'zip_output' and fileSpec.lfn in fileAttrs: - scope = fileAttrs[fileSpec.lfn]['scope'] - datasetName = fileAttrs[fileSpec.lfn]['dataset'] + lfnWithoutWorkerID = ".".join(fileSpec.lfn.split(".")[:-1]) + scope = fileAttrs[lfnWithoutWorkerID]["scope"] + datasetName = fileAttrs[lfnWithoutWorkerID]["dataset"] + elif fileSpec.fileType != "zip_output" and fileSpec.lfn in fileAttrs: + scope = fileAttrs[fileSpec.lfn]["scope"] + datasetName = fileAttrs[fileSpec.lfn]["dataset"] else: # use panda scope for zipped files scope = self.scopeForTmp @@ -93,46 +91,44 @@ def trigger_stage_out(self, jobspec): executable_prefix = None pfn_prefix = None if self.objectstore_additions and dstRSE in self.objectstore_additions: - if 'storage_id' in self.objectstore_additions[dstRSE]: - fileSpec.objstoreID = self.objectstore_additions[dstRSE]['storage_id'] - if 'access_key' in self.objectstore_additions[dstRSE] and \ - 'secret_key' in self.objectstore_additions[dstRSE] and \ - 'is_secure' in self.objectstore_additions[dstRSE]: + if "storage_id" in self.objectstore_additions[dstRSE]: + fileSpec.objstoreID = self.objectstore_additions[dstRSE]["storage_id"] + if ( + "access_key" in self.objectstore_additions[dstRSE] + and "secret_key" in self.objectstore_additions[dstRSE] + and "is_secure" in self.objectstore_additions[dstRSE] + ): executable_prefix = "export S3_ACCESS_KEY=%s; export S3_SECRET_KEY=%s; export S3_IS_SECURE=%s" % ( - self.objectstore_additions[dstRSE]['access_key'], - self.objectstore_additions[dstRSE]['secret_key'], - self.objectstore_additions[dstRSE]['is_secure']) - if 'pfn_prefix' in self.objectstore_additions[dstRSE]: - pfn_prefix = self.objectstore_additions[dstRSE]['pfn_prefix'] - - executable = ['/usr/bin/env', 'rucio', '-v', 'upload'] - executable += ['--no-register'] - if hasattr(self, 'lifetime'): - executable += ['--lifetime', ('%d' % self.lifetime)] - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - executable += ['--guid', fileSpec.fileAttributes['guid']] - - executable += ['--rse', dstRSE] - executable += ['--scope', scope] + self.objectstore_additions[dstRSE]["access_key"], + self.objectstore_additions[dstRSE]["secret_key"], + self.objectstore_additions[dstRSE]["is_secure"], + ) + if "pfn_prefix" in self.objectstore_additions[dstRSE]: + pfn_prefix = self.objectstore_additions[dstRSE]["pfn_prefix"] + + executable = ["/usr/bin/env", "rucio", "-v", "upload"] + executable += 
["--no-register"] + if hasattr(self, "lifetime"): + executable += ["--lifetime", ("%d" % self.lifetime)] + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + executable += ["--guid", fileSpec.fileAttributes["guid"]] + + executable += ["--rse", dstRSE] + executable += ["--scope", scope] if pfn_prefix: - executable += ['--pfn %s' % os.path.join(pfn_prefix, os.path.basename(fileSpec.path))] + executable += ["--pfn %s" % os.path.join(pfn_prefix, os.path.basename(fileSpec.path))] else: - executable += [('%s:%s' % (scope, datasetName))] - executable += [('%s' % fileSpec.path)] + executable += [("%s:%s" % (scope, datasetName))] + executable += [("%s" % fileSpec.path)] - tmpLog.debug('rucio upload command: {0} '.format(executable)) - tmpLog.debug('rucio upload command (for human): %s ' % ' '.join(executable)) + tmpLog.debug("rucio upload command: {0} ".format(executable)) + tmpLog.debug("rucio upload command (for human): %s " % " ".join(executable)) if executable_prefix: - cmd = executable_prefix + "; " + ' '.join(executable) - process = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True) + cmd = executable_prefix + "; " + " ".join(executable) + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) else: - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + process = subprocess.Popen(executable, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = process.communicate() fileSpec.attemptNr += 1 @@ -140,54 +136,53 @@ def trigger_stage_out(self, jobspec): tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: - fileSpec.status = 'finished' + fileSpec.status = "finished" else: # check what failed file_exists = False rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'File name in specified scope already exists' in line: + for line in stdout.split("\n"): + if "File name in specified scope already exists" in line: file_exists = True break - elif 'File already exists on RSE' in line: + elif "File already exists on RSE" in line: # can skip if file exist on RSE since no register - tmpLog.warning('rucio skipped upload and returned stdout: %s' % stdout) + tmpLog.warning("rucio skipped upload and returned stdout: %s" % stdout) file_exists = True break - elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + elif "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True if file_exists: - tmpLog.debug('file exists, marking transfer as finished') - fileSpec.status = 'finished' + tmpLog.debug("file exists, marking transfer as finished") + fileSpec.status = "finished" elif rucio_sessions_limit_error: # do nothing - tmpLog.warning('rucio returned error, will retry: stdout: %s' % stdout) + tmpLog.warning("rucio returned error, will retry: stdout: %s" % stdout) # do not change fileSpec.status and Harvester will retry if this function returns False allChecked = False continue else: - tmpLog.error('rucio upload failed with stdout: %s' % stdout) + tmpLog.error("rucio upload failed with stdout: %s" % stdout) ErrMsg += '%s failed with rucio error stdout="%s"' % (fileSpec.lfn, stdout) allChecked = False if fileSpec.attemptNr >= self.maxAttempts: - tmpLog.error('reached maxattempts: %s, marked it as failed' % self.maxAttempts) - fileSpec.status = 'failed' + tmpLog.error("reached maxattempts: %s, marked it as failed" % self.maxAttempts) + fileSpec.status = 
"failed" # force update - fileSpec.force_update('status') + fileSpec.force_update("status") - tmpLog.debug('file: %s status: %s' % (fileSpec.lfn, fileSpec.status)) + tmpLog.debug("file: %s status: %s" % (fileSpec.lfn, fileSpec.status)) # return - tmpLog.debug('done') + tmpLog.debug("done") if allChecked: - return True, '' + return True, "" else: return False, ErrMsg # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py index 0abb90bd..bbc84c21 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py @@ -11,12 +11,12 @@ from pandaharvester.harvestercore import core_utils from .base_stager import BaseStager -### Meant to put all heavy file operations on hpc side via ssh and use multithreading to accelerate. -### Other file operations require shared fs (e.g. sshfs) with the same path of all files. -### Works for MareNostrums +# Meant to put all heavy file operations on hpc side via ssh and use multithreading to accelerate. +# Other file operations require shared fs (e.g. sshfs) with the same path of all files. +# Works for MareNostrums # logger -baseLogger = core_utils.setup_logger('rucio_stager_hpc_minikui') +baseLogger = core_utils.setup_logger("rucio_stager_hpc_minikui") # plugin for stage-out with Rucio on an HPC site that must copy output elsewhere @@ -24,26 +24,25 @@ class RucioStagerHPC(BaseStager): # constructor def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - if not hasattr(self, 'scopeForTmp'): - self.scopeForTmp = 'panda' - if not hasattr(self, 'pathConvention'): + if not hasattr(self, "scopeForTmp"): + self.scopeForTmp = "panda" + if not hasattr(self, "pathConvention"): self.pathConvention = None - if not hasattr(self, 'objstoreID'): + if not hasattr(self, "objstoreID"): self.objstoreID = None - if not hasattr(self, 'maxAttempts'): + if not hasattr(self, "maxAttempts"): self.maxAttempts = 3 - if not hasattr(self, 'objectstore_additions'): + if not hasattr(self, "objectstore_additions"): self.objectstore_additions = None - if not hasattr(self, 'nThreadsForUpload'): + if not hasattr(self, "nThreadsForUpload"): self.nThreadsForUpload = 4 # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='check_stage_out_status') - tmpLog.debug('start') - return (True, '') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="check_stage_out_status") + tmpLog.debug("start") + return (True, "") # trigger stage out def trigger_stage_out(self, jobspec): @@ -51,51 +50,50 @@ def trigger_stage_out(self, jobspec): gc.collect() # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # loop over all files - zip_datasetName = 'harvester_stage_out.{0}'.format(str(uuid.uuid4())) + zip_datasetName = 
"harvester_stage_out.{0}".format(str(uuid.uuid4())) fileAttrs = jobspec.get_output_file_attributes() def _stage_one_file(fileSpec): isChecked = True # ErrMsg = 'These files failed to upload : ' - ErrMsg = '' + ErrMsg = "" # fileSpec.fileAttributes['transferID'] = None # synchronius transfer # skip already done - tmpLog.debug('file: %s status: %s' % (fileSpec.lfn, fileSpec.status)) - if fileSpec.status in ['finished', 'failed']: - return False, 'file {0} already {1}'.format(fileSpec.lfn, fileSpec.status) + tmpLog.debug("file: %s status: %s" % (fileSpec.lfn, fileSpec.status)) + if fileSpec.status in ["finished", "failed"]: + return False, "file {0} already {1}".format(fileSpec.lfn, fileSpec.status) fileSpec.pathConvention = self.pathConvention fileSpec.objstoreID = self.objstoreID # set destination RSE - if fileSpec.fileType in ['es_output', 'zip_output', 'output']: + if fileSpec.fileType in ["es_output", "zip_output", "output"]: dstRSE = self.dstRSE_Out - elif fileSpec.fileType == 'log': + elif fileSpec.fileType == "log": dstRSE = self.dstRSE_Log else: - errMsg = '{0} unsupported file type {1}'.format(fileSpec.lfn, fileSpec.fileType) + errMsg = "{0} unsupported file type {1}".format(fileSpec.lfn, fileSpec.fileType) tmpLog.error(errMsg) return (False, errMsg) # skip if destination is None if dstRSE is None: - return False, 'file {0} dstRSE is None'.format(fileSpec.lfn) + return False, "file {0} dstRSE is None".format(fileSpec.lfn) # get/set scope and dataset name - if fileSpec.fileType == 'log': + if fileSpec.fileType == "log": if fileSpec.lfn in fileAttrs: - scope = fileAttrs[fileSpec.lfn]['scope'] - datasetName = fileAttrs[fileSpec.lfn]['dataset'] + scope = fileAttrs[fileSpec.lfn]["scope"] + datasetName = fileAttrs[fileSpec.lfn]["dataset"] else: - lfnWithoutWorkerID = ".".join(fileSpec.lfn.split('.')[:-1]) - scope = fileAttrs[lfnWithoutWorkerID]['scope'] - datasetName = fileAttrs[lfnWithoutWorkerID]['dataset'] - elif fileSpec.fileType != 'zip_output' and fileSpec.lfn in fileAttrs: - scope = fileAttrs[fileSpec.lfn]['scope'] - datasetName = fileAttrs[fileSpec.lfn]['dataset'] + lfnWithoutWorkerID = ".".join(fileSpec.lfn.split(".")[:-1]) + scope = fileAttrs[lfnWithoutWorkerID]["scope"] + datasetName = fileAttrs[lfnWithoutWorkerID]["dataset"] + elif fileSpec.fileType != "zip_output" and fileSpec.lfn in fileAttrs: + scope = fileAttrs[fileSpec.lfn]["scope"] + datasetName = fileAttrs[fileSpec.lfn]["dataset"] else: # use panda scope for zipped files scope = self.scopeForTmp @@ -106,48 +104,44 @@ def _stage_one_file(fileSpec): executable_prefix = None pfn_prefix = None if self.objectstore_additions and dstRSE in self.objectstore_additions: - if 'storage_id' in self.objectstore_additions[dstRSE]: - fileSpec.objstoreID = self.objectstore_additions[dstRSE]['storage_id'] - if 'access_key' in self.objectstore_additions[dstRSE] and \ - 'secret_key' in self.objectstore_additions[dstRSE] and \ - 'is_secure' in self.objectstore_additions[dstRSE]: + if "storage_id" in self.objectstore_additions[dstRSE]: + fileSpec.objstoreID = self.objectstore_additions[dstRSE]["storage_id"] + if ( + "access_key" in self.objectstore_additions[dstRSE] + and "secret_key" in self.objectstore_additions[dstRSE] + and "is_secure" in self.objectstore_additions[dstRSE] + ): executable_prefix = "export S3_ACCESS_KEY=%s; export S3_SECRET_KEY=%s; export S3_IS_SECURE=%s" % ( - self.objectstore_additions[dstRSE]['access_key'], - self.objectstore_additions[dstRSE]['secret_key'], - self.objectstore_additions[dstRSE]['is_secure']) - if 
'pfn_prefix' in self.objectstore_additions[dstRSE]: - pfn_prefix = self.objectstore_additions[dstRSE]['pfn_prefix'] - - executable = ['/usr/bin/env', 'rucio', '-v', 'upload'] - executable += ['--no-register'] - if hasattr(self, 'lifetime'): - executable += ['--lifetime', ('%d' % self.lifetime)] - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - executable += ['--guid', fileSpec.fileAttributes['guid']] - - executable += ['--rse', dstRSE] - executable += ['--scope', scope] + self.objectstore_additions[dstRSE]["access_key"], + self.objectstore_additions[dstRSE]["secret_key"], + self.objectstore_additions[dstRSE]["is_secure"], + ) + if "pfn_prefix" in self.objectstore_additions[dstRSE]: + pfn_prefix = self.objectstore_additions[dstRSE]["pfn_prefix"] + + executable = ["/usr/bin/env", "rucio", "-v", "upload"] + executable += ["--no-register"] + if hasattr(self, "lifetime"): + executable += ["--lifetime", ("%d" % self.lifetime)] + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + executable += ["--guid", fileSpec.fileAttributes["guid"]] + + executable += ["--rse", dstRSE] + executable += ["--scope", scope] if pfn_prefix: - executable += ['--pfn %s' % os.path.join(pfn_prefix, os.path.basename(fileSpec.path))] + executable += ["--pfn %s" % os.path.join(pfn_prefix, os.path.basename(fileSpec.path))] else: - executable += [('%s:%s' % (scope, datasetName))] - executable += [('%s' % fileSpec.path)] + executable += [("%s:%s" % (scope, datasetName))] + executable += [("%s" % fileSpec.path)] - tmpLog.debug('rucio upload command: {0} '.format(executable)) - tmpLog.debug('rucio upload command (for human): %s ' % ' '.join(executable)) + tmpLog.debug("rucio upload command: {0} ".format(executable)) + tmpLog.debug("rucio upload command (for human): %s " % " ".join(executable)) if executable_prefix: - cmd = executable_prefix + "; " + ' '.join(executable) - process = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - close_fds=True, - shell=True) + cmd = executable_prefix + "; " + " ".join(executable) + process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True, shell=True) else: - process = subprocess.Popen(executable, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + process = subprocess.Popen(executable, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = process.communicate() fileSpec.attemptNr += 1 @@ -155,42 +149,42 @@ def _stage_one_file(fileSpec): tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: - fileSpec.status = 'finished' + fileSpec.status = "finished" else: # check what failed file_exists = False rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'File name in specified scope already exists' in line: + for line in stdout.split("\n"): + if "File name in specified scope already exists" in line: file_exists = True break - elif 'File already exists on RSE' in line: + elif "File already exists on RSE" in line: # can skip if file exist on RSE since no register - tmpLog.warning('rucio skipped upload and returned stdout: %s' % stdout) + tmpLog.warning("rucio skipped upload and returned stdout: %s" % stdout) file_exists = True break - elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + elif "exceeded simultaneous SESSIONS_PER_USER limit" in line: rucio_sessions_limit_error = True if file_exists: - tmpLog.debug('file exists, marking transfer as 
finished') - fileSpec.status = 'finished' + tmpLog.debug("file exists, marking transfer as finished") + fileSpec.status = "finished" elif rucio_sessions_limit_error: # do nothing - tmpLog.warning('rucio returned error, will retry: stdout: %s' % stdout) + tmpLog.warning("rucio returned error, will retry: stdout: %s" % stdout) # do not change fileSpec.status and Harvester will retry if this function returns False - return False, 'rucio returned error' + return False, "rucio returned error" else: - tmpLog.error('rucio upload failed with stdout: %s' % stdout) + tmpLog.error("rucio upload failed with stdout: %s" % stdout) ErrMsg += '%s failed with rucio error stdout="%s"' % (fileSpec.lfn, stdout) isChecked = False if fileSpec.attemptNr >= self.maxAttempts: - tmpLog.error('reached maxattempts: %s, marked it as failed' % self.maxAttempts) - fileSpec.status = 'failed' + tmpLog.error("reached maxattempts: %s, marked it as failed" % self.maxAttempts) + fileSpec.status = "failed" # force update - fileSpec.force_update('status') + fileSpec.force_update("status") - tmpLog.debug('file: %s status: %s' % (fileSpec.lfn, fileSpec.status)) + tmpLog.debug("file: %s status: %s" % (fileSpec.lfn, fileSpec.status)) del process, stdout, stderr @@ -200,14 +194,14 @@ def _stage_one_file(fileSpec): with Pool(max_workers=self.nThreadsForUpload) as pool: ret_list = list(pool.map(_stage_one_file, list(jobspec.outFiles))) - isChecked_list, ErrMsg_list = zip(*ret_list) if ret_list else ([], '') + isChecked_list, ErrMsg_list = zip(*ret_list) if ret_list else ([], "") allChecked = all(isChecked_list) - ErrMsg_all = ';'.join(ErrMsg_list) + ErrMsg_all = ";".join(ErrMsg_list) # return - tmpLog.debug('done') + tmpLog.debug("done") if allChecked: - return True, '' + return True, "" else: return False, ErrMsg_all @@ -217,7 +211,6 @@ def zip_output(self, jobspec): gc.collect() # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.ssh_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py index 0905bdfe..4fd470d6 100644 --- a/pandaharvester/harvesterstager/xrdcp_stager.py +++ b/pandaharvester/harvesterstager/xrdcp_stager.py @@ -14,7 +14,7 @@ import uuid # logger -_logger = core_utils.setup_logger('xrdcp_stager') +_logger = core_utils.setup_logger("xrdcp_stager") # stager plugin with https://xrootd.slac.stanford.edu/ xrdcp """ @@ -41,13 +41,13 @@ class XrdcpStager(BaseStager): # constructor def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - if not hasattr(self, 'xrdcpOpts'): + if not hasattr(self, "xrdcpOpts"): self.xrdcpOpts = None - if not hasattr(self, 'maxAttempts'): + if not hasattr(self, "maxAttempts"): self.maxAttempts = 3 - if not hasattr(self, 'timeout'): + if not hasattr(self, "timeout"): self.timeout = None - if not hasattr(self, 'checkLocalPath'): + if not hasattr(self, "checkLocalPath"): self.checkLocalPath = True # check status @@ -68,8 +68,8 @@ def check_stage_out_status(self, jobspec): :rtype: (bool, string) """ for fileSpec in jobspec.get_output_file_specs(skip_done=True): - fileSpec.status = 'finished' - return True, '' + fileSpec.status = "finished" + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): @@ -89,15 +89,14 @@ def trigger_stage_out(self, jobspec): gc.collect() # make logger - tmpLog = 
self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="trigger_stage_out") + tmpLog.debug("start") # get the environment harvester_env = os.environ.copy() - #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + # tmpLog.debug('Harvester environment : {}'.format(harvester_env)) xrdcpOutput = None - allfiles_transfered = True + allfiles_transfered = True overall_errMsg = "" fileAttrs = jobspec.get_output_file_attributes() # loop over all output files @@ -105,91 +104,89 @@ def trigger_stage_out(self, jobspec): # fileSpec.objstoreID = 123 # fileSpec.fileAttributes['guid'] # construct source and destination paths - dstPath = mover_utils.construct_file_path(self.dstBasePath, fileAttrs[fileSpec.lfn]['scope'], - fileSpec.lfn) + dstPath = mover_utils.construct_file_path(self.dstBasePath, fileAttrs[fileSpec.lfn]["scope"], fileSpec.lfn) # local path - localPath = mover_utils.construct_file_path(self.localBasePath, fileAttrs[fileSpec.lfn]['scope'], - fileSpec.lfn) - tmpLog.debug('fileSpec.path - {0} fileSpec.lfn = {1}'.format(fileSpec.path,fileSpec.lfn)) + localPath = mover_utils.construct_file_path(self.localBasePath, fileAttrs[fileSpec.lfn]["scope"], fileSpec.lfn) + tmpLog.debug("fileSpec.path - {0} fileSpec.lfn = {1}".format(fileSpec.path, fileSpec.lfn)) localPath = fileSpec.path if self.checkLocalPath: # check if already exits if os.path.exists(localPath): # calculate checksum checksum = core_utils.calc_adler32(localPath) - checksum = 'ad:{0}'.format(checksum) - if checksum == fileAttrs[fileSpec.lfn]['checksum']: + checksum = "ad:{0}".format(checksum) + if checksum == fileAttrs[fileSpec.lfn]["checksum"]: continue # collect list of output files if xrdcpOutput is None: xrdcpOutput = [dstPath] else: - if dstPath not in xrdcpOutput : + if dstPath not in xrdcpOutput: xrdcpOutput.append(dstPath) # transfer using xrdcp one file at a time - tmpLog.debug('execute xrdcp') - args = ['xrdcp', '--nopbar', '--force'] - args_files = [localPath,dstPath] + tmpLog.debug("execute xrdcp") + args = ["xrdcp", "--nopbar", "--force"] + args_files = [localPath, dstPath] if self.xrdcpOpts is not None: args += self.xrdcpOpts.split() args += args_files fileSpec.attemptNr += 1 try: - xrdcp_cmd = ' '.join(args) - tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + xrdcp_cmd = " ".join(args) + tmpLog.debug("execute: {0}".format(xrdcp_cmd)) process = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) try: stdout, stderr = process.communicate(timeout=self.timeout) except subprocess.TimeoutExpired: process.kill() stdout, stderr = process.communicate() - tmpLog.warning('command timeout') + tmpLog.warning("command timeout") return_code = process.returncode if stdout is not None: if not isinstance(stdout, str): stdout = stdout.decode() - stdout = stdout.replace('\n', ' ') + stdout = stdout.replace("\n", " ") if stderr is not None: if not isinstance(stderr, str): stderr = stderr.decode() - stderr = stderr.replace('\n', ' ') + stderr = stderr.replace("\n", " ") tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) except Exception: core_utils.dump_error_message(tmpLog) return_code = 1 if return_code == 0: - fileSpec.status = 'finished' + fileSpec.status = "finished" else: - overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) - allfiles_transfered = False - 
errMsg = 'failed with {0}'.format(return_code) + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath, return_code) + allfiles_transfered = False + errMsg = "failed with {0}".format(return_code) tmpLog.error(errMsg) # check attemptNr if fileSpec.attemptNr >= self.maxAttempts: - tmpLog.error('reached maxattempts: {0}, marked it as failed'.format(self.maxAttempts)) - fileSpec.status = 'failed' + tmpLog.error("reached maxattempts: {0}, marked it as failed".format(self.maxAttempts)) + fileSpec.status = "failed" # force update - fileSpec.force_update('status') - tmpLog.debug('file: {0} status: {1}'.format(fileSpec.lfn, fileSpec.status)) + fileSpec.force_update("status") + tmpLog.debug("file: {0} status: {1}".format(fileSpec.lfn, fileSpec.status)) del process, stdout, stderr # end loop over output files # nothing to transfer if xrdcpOutput is None: - tmpLog.debug('done with no transfers') - return True, '' + tmpLog.debug("done with no transfers") + return True, "" # check if all files were transfered - tmpLog.debug('done') - if allfiles_transfered : - return True, '' + tmpLog.debug("done") + if allfiles_transfered: + return True, "" else: return None, overall_errMsg - # zip output files + def zip_output(self, jobspec): """OBSOLETE : zip functions should be implemented in zipper plugins. Zip output files. This method loops over jobspec.outFiles, which is a list of zip file's FileSpecs, @@ -204,8 +201,7 @@ def zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) # asynchronous zip output @@ -224,18 +220,14 @@ def async_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # set some ID which can be used for lookup in post_zip_output() groupID = str(uuid.uuid4()) lfns = [] for fileSpec in jobspec.outFiles: lfns.append(fileSpec.lfn) - jobspec.set_groups_to_files({groupID: {'lfns': lfns, - 'groupStatus': 'zipping'} - } - ) - return True, '' + jobspec.set_groups_to_files({groupID: {"lfns": lfns, "groupStatus": "zipping"}}) + return True, "" # post zipping def post_zip_output(self, jobspec): @@ -250,15 +242,14 @@ def post_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # get groups for lookup groups = jobspec.get_groups_of_output_files() # do something with groupIDs pass # update file attributes for fileSpec in jobspec.outFiles: - fileSpec.path = '/path/to/zip' + fileSpec.path = "/path/to/zip" fileSpec.fsize = 12345 - fileSpec.chksum = '66bb0985' - return True, '' + fileSpec.chksum = "66bb0985" + return True, "" diff --git a/pandaharvester/harvesterstager/yoda_rse_direct_stager.py b/pandaharvester/harvesterstager/yoda_rse_direct_stager.py index 14596248..64128279 100644 --- a/pandaharvester/harvesterstager/yoda_rse_direct_stager.py +++ b/pandaharvester/harvesterstager/yoda_rse_direct_stager.py @@ -24,148 +24,150 @@ from .base_stager import BaseStager # logger -baseLogger = 
core_utils.setup_logger('yoda_rse_direct_stager') +baseLogger = core_utils.setup_logger("yoda_rse_direct_stager") + class Error(EnvironmentError): pass + class SpecialFileError(EnvironmentError): """Raised when trying to do a kind of operation (e.g. copying) which is not supported on a special file (e.g. a named pipe)""" + class ExecError(EnvironmentError): """Raised when a command could not be executed""" - # stager plugin with RSE + local site move behaviour for Yoda zip files class YodaRseDirectStager(BaseStager): """In the workflow for RseDirectStager, workers directly upload output files to RSE and thus there is no data motion in Harvester.""" + # constructor + def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - tmpLog = self.make_logger(baseLogger, 'ThreadID={0}'.format(threading.current_thread().ident), - method_name='YodaRseDirectStager __init__ ') - tmpLog.debug('start') - self.Yodajob = False + tmpLog = self.make_logger(baseLogger, "ThreadID={0}".format(threading.current_thread().ident), method_name="YodaRseDirectStager __init__ ") + tmpLog.debug("start") + self.Yodajob = False self.pathConvention = None self.objstoreID = None self.changeFileStatusOnSuccess = True - tmpLog.debug('stop') + tmpLog.debug("stop") # check status def check_stage_out_status(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger( + baseLogger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="check_stage_out_status" + ) + tmpLog.debug("start") for fileSpec in jobspec.get_output_file_specs(skip_done=True): fileSpec.objstoreID = self.objstoreID fileSpec.pathConvention = self.pathConvention - fileSpec.status = 'finished' - tmpLog.debug('stop') - return True, '' + fileSpec.status = "finished" + tmpLog.debug("stop") + return True, "" # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger( + baseLogger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="trigger_stage_out" + ) + tmpLog.debug("start") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get the queueConfig and corresponding objStoreID_ES queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # write to debug log queueConfig.stager - tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager)) + tmpLog.debug("jobspec.computingSite - {0} queueConfig.stager {1}".format(jobspec.computingSite, queueConfig.stager)) # check queueConfig stager section to see if jobtype is set - if 'jobtype' in queueConfig.stager: - if queueConfig.stager['jobtype'] == "Yoda" : + if "jobtype" in queueConfig.stager: + if 
queueConfig.stager["jobtype"] == "Yoda": self.Yodajob = True # set the location of the files in fileSpec.objstoreID - # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json - self.objstoreID = int(queueConfig.stager['objStoreID_ES']) - if self.Yodajob : - self.pathConvention = int(queueConfig.stager['pathConvention']) - tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention)) + # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json + self.objstoreID = int(queueConfig.stager["objStoreID_ES"]) + if self.Yodajob: + self.pathConvention = int(queueConfig.stager["pathConvention"]) + tmpLog.debug("Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}".format(jobspec.PandaID, self.objstoreID, self.pathConvention)) else: self.pathConvention = None - tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID)) - self.RSE_dstpath = queueConfig.stager['RSE_dstPath'] + tmpLog.debug("PandaID = {0} objstoreID = {1}".format(jobspec.PandaID, self.objstoreID)) + self.RSE_dstpath = queueConfig.stager["RSE_dstPath"] # loop over the output files and copy the files ifile = 0 errors = [] for fileSpec in jobspec.get_output_file_specs(skip_done=True): - scope ='panda' - if fileSpec.scope is not None : + scope = "panda" + if fileSpec.scope is not None: scope = fileSpec.scope - # for Yoda job set the scope to transient - if self.Yodajob : - scope = 'transient' + # for Yoda job set the scope to transient + if self.Yodajob: + scope = "transient" # only print to log file first 25 files - if ifile < 25 : + if ifile < 25: msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) - if ifile == 25 : + if ifile == 25: msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) hash = hashlib.md5() - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) + correctedscope = "/".join(scope.split(".")) srcURL = fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - if ifile < 25 : - tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.RSE_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + if ifile < 25: + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) # copy the source file from source to destination skip over if file already exists if os.path.exists(dstURL): - tmpLog.debug('Already copied file {0}'.format(dstURL)) + tmpLog.debug("Already copied file {0}".format(dstURL)) # Set the file spec status if self.changeFileStatusOnSuccess: - fileSpec.status = 'finished' - else : - if os.path.exists(srcURL) : + fileSpec.status = "finished" + else: + if os.path.exists(srcURL): # check if destination directory exists if not create it dstDIR = os.path.dirname(dstURL) try: - if not os.path.exists(dstDIR) : + if not os.path.exists(dstDIR): os.makedirs(dstDIR) - mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP + mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP mode = mode | 
stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID - os.chmod(dstDIR,mode) + os.chmod(dstDIR, mode) # copy the source file to destination file shutil.copy2(srcURL, dstURL) # Set the file spec status if self.changeFileStatusOnSuccess: - self.set_FileSpec_status(jobspec, 'finished') + self.set_FileSpec_status(jobspec, "finished") except (IOError, os.error) as why: errors.append((srcURL, dstURL, str(why))) - else : - errors.append((srcURL, dstURL, 'Source file missing')) + else: + errors.append((srcURL, dstURL, "Source file missing")) ifile += 1 # Now test for any errors if errors: for error in errors: - tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0],error[1],error[2])) + tmpLog.debug("copy error source {0} destination {1} Reason {2}".format(error[0], error[1], error[2])) raise Error(errors) - # otherwise we are OK - tmpLog.debug('stop') - return True, '' + # otherwise we are OK + tmpLog.debug("stop") + return True, "" # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvesterstager/yoda_rucio_rse_direct_stager.py b/pandaharvester/harvesterstager/yoda_rucio_rse_direct_stager.py index c8d42c5b..947bc108 100644 --- a/pandaharvester/harvesterstager/yoda_rucio_rse_direct_stager.py +++ b/pandaharvester/harvesterstager/yoda_rucio_rse_direct_stager.py @@ -15,8 +15,7 @@ from future.utils import iteritems from rucio.client import Client as RucioClient -from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, \ - FileAlreadyExists +from rucio.common.exception import DataIdentifierNotFound, DuplicateRule, DataIdentifierAlreadyExists, FileAlreadyExists from pandaharvester.harvestercore import core_utils @@ -28,52 +27,55 @@ from .base_stager import BaseStager -def dump(tmpLog,obj): - for attr in dir(obj): - if hasattr( obj, attr ): - tmpLog.debug( "obj.%s = %s" % (attr, getattr(obj, attr))) +def dump(tmpLog, obj): + for attr in dir(obj): + if hasattr(obj, attr): + tmpLog.debug("obj.%s = %s" % (attr, getattr(obj, attr))) # logger -baseLogger = core_utils.setup_logger('yoda_rucio_rse_direct_stager') +baseLogger = core_utils.setup_logger("yoda_rucio_rse_direct_stager") + class Error(EnvironmentError): pass + class SpecialFileError(EnvironmentError): """Raised when trying to do a kind of operation (e.g. copying) which is not supported on a special file (e.g. 
a named pipe)""" + class ExecError(EnvironmentError): """Raised when a command could not be executed""" - # stager plugin with RSE + local site move behaviour for Yoda zip files class YodaRucioRseDirectStager(BaseStager): """In the workflow for RseDirectStager, workers directly upload output files to RSE and thus there is no data motion in Harvester.""" + # constructor + def __init__(self, **kwarg): BaseStager.__init__(self, **kwarg) - tmpLog = self.make_logger(baseLogger, 'ThreadID={0}'.format(threading.current_thread().ident), - method_name='YodaRucioRseDirectStager __init__ ') - tmpLog.debug('start') - self.Yodajob = False + tmpLog = self.make_logger(baseLogger, "ThreadID={0}".format(threading.current_thread().ident), method_name="YodaRucioRseDirectStager __init__ ") + tmpLog.debug("start") + self.Yodajob = False self.pathConvention = None self.objstoreID = None - tmpLog.debug('stop') + tmpLog.debug("stop") - # set FileSpec.objstoreID - def set_FileSpec_objstoreID(self,jobspec, objstoreID, pathConvention): + # set FileSpec.objstoreID + def set_FileSpec_objstoreID(self, jobspec, objstoreID, pathConvention): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.objstoreID = objstoreID fileSpec.pathConvention = pathConvention - # set FileSpec.status - def set_FileSpec_status(self,jobspec,status): + # set FileSpec.status + def set_FileSpec_status(self, jobspec, status): # loop over all output files for fileSpec in jobspec.outFiles: fileSpec.status = status @@ -81,170 +83,172 @@ def set_FileSpec_status(self,jobspec,status): # check status def check_stage_out_status(self, jobspec): tmpStat = True - tmpMsg = '' + tmpMsg = "" # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='check_stage_out_status') - tmpLog.debug('start') + tmpLog = self.make_logger( + baseLogger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="check_stage_out_status" + ) + tmpLog.debug("start") # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get the queueConfig and corresponding objStoreID_ES queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # write to debug log queueConfig.stager - tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager)) + tmpLog.debug("jobspec.computingSite - {0} queueConfig.stager {1}".format(jobspec.computingSite, queueConfig.stager)) # check queueConfig stager section to see if jobtype is set - if 'jobtype' in queueConfig.stager: - if queueConfig.stager['jobtype'] == "Yoda" : + if "jobtype" in queueConfig.stager: + if queueConfig.stager["jobtype"] == "Yoda": self.Yodajob = True # get destination endpoint - nucleus = jobspec.jobParams['nucleus'] - agis = self.dbInterface.get_cache('panda_queues.json').data - dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] + nucleus = jobspec.jobParams["nucleus"] + agis = self.dbInterface.get_cache("panda_queues.json").data + 
dstRSE = [agis[x]["astorages"]["pr"][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] # set the location of the files in fileSpec.objstoreID - # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data - self.objstoreID = ddm[dstRSE]['id'] - if self.Yodajob : - self.pathConvention = int(queueConfig.stager['pathConvention']) - tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention)) + # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data + self.objstoreID = ddm[dstRSE]["id"] + if self.Yodajob: + self.pathConvention = int(queueConfig.stager["pathConvention"]) + tmpLog.debug("Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}".format(jobspec.PandaID, self.objstoreID, self.pathConvention)) else: self.pathConvention = None - tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID)) + tmpLog.debug("PandaID = {0} objstoreID = {1}".format(jobspec.PandaID, self.objstoreID)) # set the location of the files in fileSpec.objstoreID self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) - # Get the files grouped by Rucio Rule ID + # Get the files grouped by Rucio Rule ID groups = jobspec.get_groups_of_output_files() if len(groups) == 0: - tmpLog.debug('No Rucio Rules') - return None,'No Rucio Rules' - tmpLog.debug('#Rucio Rules - {0} - Rules - {1}'.format(len(groups),groups)) - + tmpLog.debug("No Rucio Rules") + return None, "No Rucio Rules" + tmpLog.debug("#Rucio Rules - {0} - Rules - {1}".format(len(groups), groups)) + try: rucioAPI = RucioClient() - except: - tmpLog.error('failure to get Rucio Client try again later') - return None,'failure to get Rucio Client try again later' + except BaseException: + tmpLog.error("failure to get Rucio Client try again later") + return None, "failure to get Rucio Client try again later" - # loop over the Rucio rules + # loop over the Rucio rules for rucioRule in groups: if rucioRule is None: continue # lock have_db_lock = self.dbInterface.get_object_lock(rucioRule, lock_interval=120) if not have_db_lock: - msgStr = 'escape since {0} is locked by another thread'.format(rucioRule) + msgStr = "escape since {0} is locked by another thread".format(rucioRule) tmpLog.debug(msgStr) return None, msgStr # get transfer status groupStatus = self.dbInterface.get_file_group_status(rucioRule) - tmpLog.debug('rucioRule - {0} - groupStatus - {1}'.format(rucioRule,groupStatus)) - if 'transferred' in groupStatus: - # already succeeded - set the fileSpec status for these files + tmpLog.debug("rucioRule - {0} - groupStatus - {1}".format(rucioRule, groupStatus)) + if "transferred" in groupStatus: + # already succeeded - set the fileSpec status for these files self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) pass - elif 'failed' in groupStatus : + elif "failed" in groupStatus: # transfer failure tmpStat = False - tmpMsg = 'rucio rule for {0}:{1} already failed'.format(datasetScope, datasetName) - elif 'transferring' in groupStatus or 'pending' in groupStatus: + tmpMsg = "rucio rule for {0}:{1} already failed".format(datasetScope, datasetName) + elif "transferring" in groupStatus or "pending" in groupStatus: # transfer started in Rucio check status try: - result = rucioAPI.get_replication_rule(rucioRule,False) - if result['state'] == "OK" : + result = 
rucioAPI.get_replication_rule(rucioRule, False) + if result["state"] == "OK": # files transfered to nucleus - tmpLog.debug('Files for Rucio Rule {0} successfully transferred'.format(rucioRule)) - self.dbInterface.update_file_group_status(rucioRule, 'transferred') - # set the fileSpec status for these files + tmpLog.debug("Files for Rucio Rule {0} successfully transferred".format(rucioRule)) + self.dbInterface.update_file_group_status(rucioRule, "transferred") + # set the fileSpec status for these files self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) - self.set_FileSpec_status(jobspec,'finished') - elif result['state'] == "FAILED" : + self.set_FileSpec_status(jobspec, "finished") + elif result["state"] == "FAILED": # failed Rucio Transfer tmpStat = False - tmpMsg = 'Failed Rucio Transfer - Rucio Rule - {0}'.format(rucioRule) + tmpMsg = "Failed Rucio Transfer - Rucio Rule - {0}".format(rucioRule) tmpLog.debug(tmpMsg) - self.set_FileSpec_status(jobspec,'failed') - elif result['state'] == 'STUCK' : + self.set_FileSpec_status(jobspec, "failed") + elif result["state"] == "STUCK": tmpStat = None - tmpMsg = 'Rucio Transfer Rule {0} Stuck'.format(rucioRule) + tmpMsg = "Rucio Transfer Rule {0} Stuck".format(rucioRule) tmpLog.debug(tmpMsg) - except: - tmpStat = None - tmpMsg = 'Could not get information or Rucio Rule {0}'.format(rucioRule) - tmpLog.error(tmpMsg) - pass + except BaseException: + tmpStat = None + tmpMsg = "Could not get information or Rucio Rule {0}".format(rucioRule) + tmpLog.error(tmpMsg) + pass # release the lock if have_db_lock: - tmpLog.debug('attempt to release DB lock for Rucio Rule {0}'.format(rucioRule)) - release_db_lock = self.dbInterface.release_object_lock(rucioRule) + tmpLog.debug("attempt to release DB lock for Rucio Rule {0}".format(rucioRule)) + release_db_lock = self.dbInterface.release_object_lock(rucioRule) if release_db_lock: - tmpLog.debug('released DB lock for rucioRule - {0}'.format(rucioRule)) - have_db_lock = False + tmpLog.debug("released DB lock for rucioRule - {0}".format(rucioRule)) + have_db_lock = False else: - msgStr = ' Could not release DB lock for {}'.format(rucioRule) + msgStr = " Could not release DB lock for {}".format(rucioRule) tmpLog.error(msgStr) return None, msgStr - tmpLog.debug('stop') - return tmpStat, tmpMsg + tmpLog.debug("stop") + return tmpStat, tmpMsg # trigger stage out def trigger_stage_out(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0} ThreadID={1}'.format(jobspec.PandaID,threading.current_thread().ident), - method_name='trigger_stage_out') - tmpLog.debug('start') + tmpLog = self.make_logger( + baseLogger, "PandaID={0} ThreadID={1}".format(jobspec.PandaID, threading.current_thread().ident), method_name="trigger_stage_out" + ) + tmpLog.debug("start") # initialize some values tmpStat = None - tmpMsg = '' + tmpMsg = "" srcRSE = None dstRSE = None - datasetName = 'panda.harvester.{0}.{1}'.format(jobspec.PandaID,str(uuid.uuid4())) - datasetScope = 'transient' + datasetName = "panda.harvester.{0}.{1}".format(jobspec.PandaID, str(uuid.uuid4())) + datasetScope = "transient" # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found - tmpLog.error('jobspec.computingSite is not defined') - return False, 'jobspec.computingSite is not defined' + tmpLog.error("jobspec.computingSite is not defined") + return False, "jobspec.computingSite is not defined" else: - tmpLog.debug('jobspec.computingSite : {0}'.format(jobspec.computingSite)) + 
tmpLog.debug("jobspec.computingSite : {0}".format(jobspec.computingSite)) # get the queueConfig and corresponding objStoreID_ES queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(jobspec.computingSite) # write to debug log queueConfig.stager - tmpLog.debug('jobspec.computingSite - {0} queueConfig.stager {1}'.format(jobspec.computingSite,queueConfig.stager)) + tmpLog.debug("jobspec.computingSite - {0} queueConfig.stager {1}".format(jobspec.computingSite, queueConfig.stager)) # check queueConfig stager section to see if jobtype is set - if 'jobtype' in queueConfig.stager: - if queueConfig.stager['jobtype'] == "Yoda" : + if "jobtype" in queueConfig.stager: + if queueConfig.stager["jobtype"] == "Yoda": self.Yodajob = True # get destination endpoint - nucleus = jobspec.jobParams['nucleus'] - agis = self.dbInterface.get_cache('panda_queues.json').data - dstRSE = [agis[x]["astorages"]['pr'][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] - # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data - self.objstoreID = ddm[dstRSE]['id'] - if self.Yodajob : - self.pathConvention = int(queueConfig.stager['pathConvention']) - tmpLog.debug('Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}'.format(jobspec.PandaID,self.objstoreID,self.pathConvention)) + nucleus = jobspec.jobParams["nucleus"] + agis = self.dbInterface.get_cache("panda_queues.json").data + dstRSE = [agis[x]["astorages"]["pr"][0] for x in agis if agis[x]["atlas_site"] == nucleus][0] + # see file /cvmfs/atlas.cern.ch/repo/sw/local/etc/agis_ddmendpoints.json + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data + self.objstoreID = ddm[dstRSE]["id"] + if self.Yodajob: + self.pathConvention = int(queueConfig.stager["pathConvention"]) + tmpLog.debug("Yoda Job - PandaID = {0} objstoreID = {1} pathConvention ={2}".format(jobspec.PandaID, self.objstoreID, self.pathConvention)) else: self.pathConvention = None - tmpLog.debug('PandaID = {0} objstoreID = {1}'.format(jobspec.PandaID,self.objstoreID)) + tmpLog.debug("PandaID = {0} objstoreID = {1}".format(jobspec.PandaID, self.objstoreID)) # set the location of the files in fileSpec.objstoreID self.set_FileSpec_objstoreID(jobspec, self.objstoreID, self.pathConvention) - self.RSE_dstpath = queueConfig.stager['RSE_dstPath'] + self.RSE_dstpath = queueConfig.stager["RSE_dstPath"] # check queueConfig stager section to see if srcRSE is set - if 'srcRSE' in queueConfig.stager: - srcRSE = queueConfig.stager['srcRSE'] + if "srcRSE" in queueConfig.stager: + srcRSE = queueConfig.stager["srcRSE"] else: - tmpLog.debug('Warning srcRSE not defined in stager portion of queue config file') - tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE)) - + tmpLog.debug("Warning srcRSE not defined in stager portion of queue config file") + tmpLog.debug("srcRSE - {0} dstRSE - {1}".format(srcRSE, dstRSE)) + # loop over the output files and copy the files ifile = 0 errors = [] @@ -252,219 +256,201 @@ def trigger_stage_out(self, jobspec): lfns = [] fileSpec_list = [] fileSpec_list = jobspec.get_output_file_specs(skip_done=False) - msgStr = '#(jobspec.get_output_file_specs(skip_done=False)) = {0}'\ - .format(len(fileSpec_list)) + msgStr = "#(jobspec.get_output_file_specs(skip_done=False)) = {0}".format(len(fileSpec_list)) tmpLog.debug(msgStr) for fileSpec in fileSpec_list: - msgstr = 'fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}'\ - 
.format(datasetScope,fileSpec.lfn,fileSpec.fsize,fileSpec.chksum) - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - msgstr += ' guid - {0}'.format(fileSpec.fileAttributes['guid']) - tmpLog.debug(msgstr) + msgstr = "fileSpec: dataset scope - {0} file name - {1} size(Bytes) - {2} adler32 - {3}".format( + datasetScope, fileSpec.lfn, fileSpec.fsize, fileSpec.chksum + ) + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + msgstr += " guid - {0}".format(fileSpec.fileAttributes["guid"]) + tmpLog.debug(msgstr) - - #for fileSpec in jobspec.get_output_file_specs(skip_done=True): + # for fileSpec in jobspec.get_output_file_specs(skip_done=True): for fileSpec in jobspec.get_output_file_specs(skip_done=False): - scope ='panda' - if fileSpec.scope is not None : + scope = "panda" + if fileSpec.scope is not None: scope = fileSpec.scope - # for Yoda job set the scope to transient - if self.Yodajob : - scope = 'transient' + # for Yoda job set the scope to transient + if self.Yodajob: + scope = "transient" # only print to log file first 25 files - if ifile < 25 : + if ifile < 25: msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) - if ifile == 25 : + if ifile == 25: msgStr = "printed first 25 files skipping the rest".format(fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) hash = hashlib.md5() - hash.update('%s:%s' % (scope, fileSpec.lfn)) + hash.update("%s:%s" % (scope, fileSpec.lfn)) hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) + correctedscope = "/".join(scope.split(".")) srcURL = fileSpec.path - dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=self.RSE_dstPath, - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - if ifile < 25 : - tmpLog.debug('src={srcURL} dst={dstURL}'.format(srcURL=srcURL, dstURL=dstURL)) + dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=self.RSE_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + if ifile < 25: + tmpLog.debug("src={srcURL} dst={dstURL}".format(srcURL=srcURL, dstURL=dstURL)) tmpFile = dict() # copy the source file from source to destination skip over if file already exists if os.path.exists(dstURL): - tmpLog.debug('Already copied file {0}'.format(dstURL)) + tmpLog.debug("Already copied file {0}".format(dstURL)) # save for adding to rucio dataset - tmpFile['scope'] = datasetScope - tmpFile['name'] = fileSpec.lfn - tmpFile['bytes'] = fileSpec.fsize - tmpFile['adler32'] = fileSpec.chksum - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']} - else : - tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn)) - tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn)) + tmpFile["scope"] = datasetScope + tmpFile["name"] = fileSpec.lfn + tmpFile["bytes"] = fileSpec.fsize + tmpFile["adler32"] = fileSpec.chksum + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + tmpFile["meta"] = {"guid": fileSpec.fileAttributes["guid"]} + else: + tmpLog.debug("File - {0} does not have a guid value".format(fileSpec.lfn)) + tmpLog.debug("Adding file {0} to fileList".format(fileSpec.lfn)) fileList.append(tmpFile) lfns.append(fileSpec.lfn) # get source RSE if srcRSE is None and fileSpec.objstoreID is not None: - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data + ddm = 
self.dbInterface.get_cache("agis_ddmendpoints.json").data srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0] - tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE)) - else : - if os.path.exists(srcURL) : + tmpLog.debug("srcRSE - {0} defined from agis_ddmendpoints.json".format(srcRSE)) + else: + if os.path.exists(srcURL): # check if destination directory exists if not create it dstDIR = os.path.dirname(dstURL) try: - if not os.path.exists(dstDIR) : + if not os.path.exists(dstDIR): os.makedirs(dstDIR) - mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP + mode = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP mode = mode | stat.S_IROTH | stat.S_IXOTH | stat.S_ISGID - os.chmod(dstDIR,mode) + os.chmod(dstDIR, mode) # copy the source file to destination file shutil.copy2(srcURL, dstURL) # save for adding to rucio dataset - tmpFile['scope'] = datasetScope - tmpFile['name'] = fileSpec.lfn - tmpFile['bytes'] = fileSpec.fsize - tmpFile['adler32'] = fileSpec.chksum - if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes: - tmpFile['meta'] = {'guid': fileSpec.fileAttributes['guid']} - else : - tmpLog.debug('File - {0} does not have a guid value'.format(fileSpec.lfn)) - tmpLog.debug('Adding file {0} to fileList'.format(fileSpec.lfn)) + tmpFile["scope"] = datasetScope + tmpFile["name"] = fileSpec.lfn + tmpFile["bytes"] = fileSpec.fsize + tmpFile["adler32"] = fileSpec.chksum + if fileSpec.fileAttributes is not None and "guid" in fileSpec.fileAttributes: + tmpFile["meta"] = {"guid": fileSpec.fileAttributes["guid"]} + else: + tmpLog.debug("File - {0} does not have a guid value".format(fileSpec.lfn)) + tmpLog.debug("Adding file {0} to fileList".format(fileSpec.lfn)) fileList.append(tmpFile) lfns.append(fileSpec.lfn) # get source RSE if not already set if srcRSE is None and fileSpec.objstoreID is not None: - ddm = self.dbInterface.get_cache('agis_ddmendpoints.json').data + ddm = self.dbInterface.get_cache("agis_ddmendpoints.json").data srcRSE = [x for x in ddm if ddm[x]["id"] == fileSpec.objstoreID][0] - tmpLog.debug('srcRSE - {0} defined from agis_ddmendpoints.json'.format(srcRSE)) + tmpLog.debug("srcRSE - {0} defined from agis_ddmendpoints.json".format(srcRSE)) except (IOError, os.error) as why: errors.append((srcURL, dstURL, str(why))) - else : - errors.append((srcURL, dstURL, 'Source file missing')) + else: + errors.append((srcURL, dstURL, "Source file missing")) ifile += 1 # test that srcRSE and dstRSE are defined - tmpLog.debug('srcRSE - {0} dstRSE - {1}'.format(srcRSE,dstRSE)) - errStr = '' + tmpLog.debug("srcRSE - {0} dstRSE - {1}".format(srcRSE, dstRSE)) + errStr = "" if srcRSE is None: - errStr = 'Source RSE is not defined ' + errStr = "Source RSE is not defined " if dstRSE is None: - errStr = errStr + ' Desitination RSE is not defined' - if (srcRSE is None) or (dstRSE is None) : - tmpLog.error(errStr) - return None,errStr + errStr = errStr + " Desitination RSE is not defined" + if (srcRSE is None) or (dstRSE is None): + tmpLog.error(errStr) + return None, errStr # test to see if there are any files to add dataset if len(fileList) == 0: - errStr = 'There are no files to add to database' + errStr = "There are no files to add to database" tmpLog.error(errStr) - return None,errStr + return None, errStr # print out the file list - tmpLog.debug('fileList - {0}'.format(fileList)) - + tmpLog.debug("fileList - {0}".format(fileList)) + # create the dataset 
and add files to it and create a transfer rule try: # register dataset rucioAPI = RucioClient() - tmpLog.debug('register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}' - .format(datasetScope, datasetName,srcRSE,(30*24*60*60))) + tmpLog.debug("register {0}:{1} rse = {2} meta=(hidden: True) lifetime = {3}".format(datasetScope, datasetName, srcRSE, (30 * 24 * 60 * 60))) try: - rucioAPI.add_dataset(datasetScope, datasetName, - meta={'hidden': True}, - lifetime=30 * 24 * 60 * 60, - rse=srcRSE - ) + rucioAPI.add_dataset(datasetScope, datasetName, meta={"hidden": True}, lifetime=30 * 24 * 60 * 60, rse=srcRSE) except DataIdentifierAlreadyExists: # ignore even if the dataset already exists pass except Exception: - errMsg = 'Could not create dataset {0}:{1} srcRSE - {2}'.format(datasetScope, - datasetName, - srcRSE) + errMsg = "Could not create dataset {0}:{1} srcRSE - {2}".format(datasetScope, datasetName, srcRSE) core_utils.dump_error_message(tmpLog) tmpLog.error(errMsg) - return None,errMsg + return None, errMsg # add files to dataset # add 500 files at a time numfiles = len(fileList) maxfiles = 500 - numslices = numfiles/maxfiles - if (numfiles%maxfiles) > 0 : - numslices = numslices + 1 + numslices = numfiles / maxfiles + if (numfiles % maxfiles) > 0: + numslices = numslices + 1 start = 0 - for i in range(numslices) : - try: - stop = start + maxfiles - if stop > numfiles : - stop = numfiles - - rucioAPI.add_files_to_datasets([{'scope': datasetScope, - 'name': datasetName, - 'dids': fileList[start:stop], - 'rse': srcRSE}], - ignore_duplicate=True) - start = stop - except FileAlreadyExists: - # ignore if files already exist - pass - except Exception: - errMsg = 'Could not add files to DS - {0}:{1} rse - {2} files - {3}'.format(datasetScope, - datasetName, - srcRSE, - fileList) - core_utils.dump_error_message(tmpLog) - tmpLog.error(errMsg) - return None,errMsg + for i in range(numslices): + try: + stop = start + maxfiles + if stop > numfiles: + stop = numfiles + + rucioAPI.add_files_to_datasets( + [{"scope": datasetScope, "name": datasetName, "dids": fileList[start:stop], "rse": srcRSE}], ignore_duplicate=True + ) + start = stop + except FileAlreadyExists: + # ignore if files already exist + pass + except Exception: + errMsg = "Could not add files to DS - {0}:{1} rse - {2} files - {3}".format(datasetScope, datasetName, srcRSE, fileList) + core_utils.dump_error_message(tmpLog) + tmpLog.error(errMsg) + return None, errMsg # add rule try: tmpDID = dict() - tmpDID['scope'] = datasetScope - tmpDID['name'] = datasetName - tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, - lifetime=30 * 24 * 60 * 60) + tmpDID["scope"] = datasetScope + tmpDID["name"] = datasetName + tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE, lifetime=30 * 24 * 60 * 60) ruleIDs = tmpRet[0] - tmpLog.debug('registered dataset {0}:{1} with rule {2}'.format(datasetScope, datasetName, - str(ruleIDs))) + tmpLog.debug("registered dataset {0}:{1} with rule {2}".format(datasetScope, datasetName, str(ruleIDs))) # group the output files together by the Rucio transfer rule - jobspec.set_groups_to_files({ruleIDs: {'lfns': lfns,'groupStatus': 'pending'}}) - msgStr = 'jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending'.format(ruleIDs,lfns) + jobspec.set_groups_to_files({ruleIDs: {"lfns": lfns, "groupStatus": "pending"}}) + msgStr = "jobspec.set_groups_to_files -Rucio rule - {0}, lfns - {1}, groupStatus - pending".format(ruleIDs, lfns) tmpLog.debug(msgStr) - tmpLog.debug('call 
self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)') - tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,'transferring') - tmpLog.debug('called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)') + tmpLog.debug("call self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,pending)") + tmpStat = self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True), ruleIDs, "transferring") + tmpLog.debug("called self.dbInterface.set_file_group(jobspec.get_output_file_specs(skip_done=True),ruleIDs,transferring)") tmpStat = True - tmpMsg = 'created Rucio rule successfully' + tmpMsg = "created Rucio rule successfully" except DuplicateRule: # ignore duplicated rule - tmpLog.debug('rule is already available') + tmpLog.debug("rule is already available") except Exception: - errMsg = 'Error creating rule for dataset {0}:{1}'.format(datasetScope, datasetName) + errMsg = "Error creating rule for dataset {0}:{1}".format(datasetScope, datasetName) core_utils.dump_error_message(tmpLog) tmpLog.debug(errMsg) - return None,errMsg + return None, errMsg # update file group status - self.dbInterface.update_file_group_status(ruleIDs, 'transferring') + self.dbInterface.update_file_group_status(ruleIDs, "transferring") except Exception: - core_utils.dump_error_message(tmpLog) - # treat as a temporary error - tmpStat = None - tmpMsg = 'failed to add a rule for {0}:{1}'.format(datasetScope, datasetName) + core_utils.dump_error_message(tmpLog) + # treat as a temporary error + tmpStat = None + tmpMsg = "failed to add a rule for {0}:{1}".format(datasetScope, datasetName) # Now test for any errors if errors: for error in errors: - tmpLog.debug('copy error source {0} destination {1} Reason {2}'.format(error[0],error[1],error[2])) + tmpLog.debug("copy error source {0} destination {1} Reason {2}".format(error[0], error[1], error[2])) raise Error(errors) - # otherwise we are OK - tmpLog.debug('stop') - return tmpStat,tmpMsg + # otherwise we are OK + tmpLog.debug("stop") + return tmpStat, tmpMsg # zip output files def zip_output(self, jobspec): # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(baseLogger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) diff --git a/pandaharvester/harvestersubmitter/act_submitter.py b/pandaharvester/harvestersubmitter/act_submitter.py index e6ddd4eb..9fc4175c 100644 --- a/pandaharvester/harvestersubmitter/act_submitter.py +++ b/pandaharvester/harvestersubmitter/act_submitter.py @@ -14,9 +14,11 @@ from act.atlas.aCTDBPanda import aCTDBPanda # logger -baseLogger = core_utils.setup_logger('act_submitter') +baseLogger = core_utils.setup_logger("act_submitter") # submitter for aCT + + class ACTSubmitter(PluginBase): # constructor def __init__(self, **kwarg): @@ -24,11 +26,10 @@ def __init__(self, **kwarg): self.hostname = socket.getfqdn() # Set up aCT DB connection - self.log = core_utils.make_logger(baseLogger, 'aCT submitter', method_name='__init__') + self.log = core_utils.make_logger(baseLogger, "aCT submitter", method_name="__init__") self.actDB = aCTDBPanda(self.log) # Credential dictionary role: proxy file - self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)], - list(harvester_config.credmanager.outCertFile))) + 
self.certs = dict(zip([r.split("=")[1] for r in list(harvester_config.credmanager.voms)], list(harvester_config.credmanager.outCertFile))) # Map of role to aCT proxyid self.proxymap = {} @@ -40,23 +41,21 @@ def __init__(self, **kwarg): uc.ProxyPath(str(proxy)) cred = arc.Credential(uc) dn = cred.GetIdentityName() - + actp = aCTProxy(self.log) - attr = '/atlas/Role='+role + attr = "/atlas/Role=" + role proxyid = actp.getProxyId(dn, attr) if not proxyid: raise Exception("Proxy with DN {0} and attribute {1} was not found in proxies table".format(dn, attr)) self.proxymap[role] = proxyid - # submit workers + def submit_workers(self, workspec_list): retList = [] for workSpec in workspec_list: - - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") queueconfigmapper = QueueConfigMapper() queueconfig = queueconfigmapper.get_queue(workSpec.computingSite) @@ -69,27 +68,29 @@ def submit_workers(self, workspec_list): jobSpec = jobSpec[0] tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map())) # Unified queues: take prodsourcelabel from job - prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel) + prodSourceLabel = jobSpec.jobParams.get("prodSourceLabel", prodSourceLabel) desc = {} # If we need to prefetch events, set aCT status waiting. # feed_events in act_messenger will fill events and release the job if queueconfig.prefetchEvents: - desc['pandastatus'] = 'waiting' - desc['actpandastatus'] = 'waiting' - desc['arcjobid'] = -1 # dummy id to prevent submission + desc["pandastatus"] = "waiting" + desc["actpandastatus"] = "waiting" + desc["arcjobid"] = -1 # dummy id to prevent submission else: - desc['pandastatus'] = 'sent' - desc['actpandastatus'] = 'sent' - desc['siteName'] = workSpec.computingSite - desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production'] - desc['prodSourceLabel'] = prodSourceLabel - desc['sendhb'] = 0 - metadata = {'harvesteraccesspoint': workSpec.get_access_point(), - 'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id), - 'harvesterid': harvester_config.master.harvester_id, - 'harvesterworkerid': workSpec.workerID} - desc['metadata'] = json.dumps(metadata) + desc["pandastatus"] = "sent" + desc["actpandastatus"] = "sent" + desc["siteName"] = workSpec.computingSite + desc["proxyid"] = self.proxymap["pilot" if prodSourceLabel in ["user", "panda"] else "production"] + desc["prodSourceLabel"] = prodSourceLabel + desc["sendhb"] = 0 + metadata = { + "harvesteraccesspoint": workSpec.get_access_point(), + "schedulerid": "harvester-{}".format(harvester_config.master.harvester_id), + "harvesterid": harvester_config.master.harvester_id, + "harvesterworkerid": workSpec.workerID, + } + desc["metadata"] = json.dumps(metadata) if jobSpec: # push mode: aCT takes the url-encoded job description (like it gets from panda server) @@ -98,32 +99,35 @@ def submit_workers(self, workspec_list): else: # pull mode: set pandaid (to workerid), prodsourcelabel, resource type and requirements pandaid = workSpec.workerID - actjobdesc = '&'.join(['PandaID={}'.format(pandaid), - 'prodSourceLabel={}'.format(prodSourceLabel), - 'resourceType={}'.format(workSpec.resourceType), - 'minRamCount={}'.format(workSpec.minRamCount), - 'coreCount={}'.format(workSpec.nCore), - 'logFile={}.pilot.log'.format(pandaid) - ]) + actjobdesc = "&".join( + [ + 
"PandaID={}".format(pandaid), + "prodSourceLabel={}".format(prodSourceLabel), + "resourceType={}".format(workSpec.resourceType), + "minRamCount={}".format(workSpec.minRamCount), + "coreCount={}".format(workSpec.nCore), + "logFile={}.pilot.log".format(pandaid), + ] + ) tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc))) try: - batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()'] + batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)["LAST_INSERT_ID()"] except Exception as e: result = (False, "Failed to insert job into aCT DB: {0}".format(str(e))) else: tmpLog.info("aCT batch id {0}".format(batchid)) workSpec.batchID = str(batchid) workSpec.submissionHost = self.hostname - workSpec.nativeStatus = desc['actpandastatus'] + workSpec.nativeStatus = desc["actpandastatus"] # Set log files in workSpec - today = time.strftime('%Y-%m-%d', time.gmtime()) - logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)]) - workSpec.set_log_file('batch_log', '{0}.log'.format(logurl)) - workSpec.set_log_file('stdout', '{0}.out'.format(logurl)) - workSpec.set_log_file('stderr', '{0}.err'.format(logurl)) - workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl)) - result = (True, '') + today = time.strftime("%Y-%m-%d", time.gmtime()) + logurl = "/".join([queueconfig.submitter.get("logBaseURL"), today, workSpec.computingSite, str(pandaid)]) + workSpec.set_log_file("batch_log", "{0}.log".format(logurl)) + workSpec.set_log_file("stdout", "{0}.out".format(logurl)) + workSpec.set_log_file("stderr", "{0}.err".format(logurl)) + workSpec.set_log_file("jdl", "{0}.jdl".format(logurl)) + result = (True, "") retList.append(result) return retList diff --git a/pandaharvester/harvestersubmitter/cloud_google_submitter.py b/pandaharvester/harvestersubmitter/cloud_google_submitter.py index 4a794a2e..432322c8 100644 --- a/pandaharvester/harvestersubmitter/cloud_google_submitter.py +++ b/pandaharvester/harvestersubmitter/cloud_google_submitter.py @@ -6,12 +6,13 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase + # from requests.exceptions import SSLError from pandaharvester.harvestercloud.googlecloud import compute, GoogleVM, ZONE, PROJECT from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper # setup base logger -base_logger = core_utils.setup_logger('google_submitter') +base_logger = core_utils.setup_logger("google_submitter") def wait_for_operation(project, zone, operation_name): @@ -23,16 +24,16 @@ def wait_for_operation(project, zone, operation_name): :param operation_name: :return: """ - tmp_log = core_utils.make_logger(base_logger, method_name='wait_for_operation') - tmp_log.debug('Waiting for operation to finish...') + tmp_log = core_utils.make_logger(base_logger, method_name="wait_for_operation") + tmp_log.debug("Waiting for operation to finish...") while True: result = compute.zoneOperations().get(project=project, zone=zone, operation=operation_name).execute() - if result['status'] == 'DONE': - if 'error' in result: - raise Exception(result['error']) - tmp_log.debug('Operation finished...') + if result["status"] == "DONE": + if "error" in result: + raise Exception(result["error"]) + tmp_log.debug("Operation finished...") return result time.sleep(1) @@ -47,13 +48,13 @@ def create_vm(work_spec, queue_config): """ work_spec.reset_changed_list() - tmp_log = core_utils.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), - 
method_name='submit_a_worker') + tmp_log = core_utils.make_logger(base_logger, "workerID={0}".format(work_spec.workerID), method_name="submit_a_worker") - tmp_log.debug('nCore={0} minRamCount={1} maxDiskCount={2} maxWalltime={0}'.format(work_spec.nCore, - work_spec.minRamCount, - work_spec.maxDiskCount, - work_spec.maxWalltime)) + tmp_log.debug( + "nCore={0} minRamCount={1} maxDiskCount={2} maxWalltime={0}".format( + work_spec.nCore, work_spec.minRamCount, work_spec.maxDiskCount, work_spec.maxWalltime + ) + ) try: vm = GoogleVM(work_spec, queue_config) @@ -64,22 +65,22 @@ def create_vm(work_spec, queue_config): zone = ZONE except Exception as e: - tmp_log.debug('VM preparation failed with: {0}'.format(e)) + tmp_log.debug("VM preparation failed with: {0}".format(e)) # there was some problem preparing the VM, usually related to interaction with GCE # since the VM was not submitted yet, we mark the worker as "missed" return (False, str(e)), work_spec.get_changed_attributes() try: - tmp_log.debug('Going to submit VM {0}'.format(vm.name)) + tmp_log.debug("Going to submit VM {0}".format(vm.name)) work_spec.batchID = vm.name operation = compute.instances().insert(project=PROJECT, zone=zone, body=vm.config).execute() # tmp_log.debug('Submitting VM {0}'.format(vm.name)) # wait_for_operation(PROJECT, ZONE, operation['name']) - tmp_log.debug('Submitted VM {0}'.format(vm.name)) + tmp_log.debug("Submitted VM {0}".format(vm.name)) - return (True, 'OK'), work_spec.get_changed_attributes() + return (True, "OK"), work_spec.get_changed_attributes() except Exception as e: - tmp_log.debug('GCE API exception: {0}'.format(e)) + tmp_log.debug("GCE API exception: {0}".format(e)) # Despite the exception we will consider the submission successful to set the worker as "submitted". # This is related to the GCE API reliability. We have observed that despite failures (time outs, SSL errors, etc) # in many cases the VMs still start and we don't want VMs that are not inventorized. 
If the VM submission failed @@ -93,7 +94,7 @@ class GoogleSubmitter(PluginBase): """ def __init__(self, **kwarg): - self.logBaseURL = 'http://localhost/test' + self.logBaseURL = "http://localhost/test" PluginBase.__init__(self, **kwarg) self.queue_config_mapper = QueueConfigMapper() @@ -104,12 +105,12 @@ def submit_workers(self, work_spec_list): :return: """ - tmp_log = self.make_logger(base_logger, method_name='submit_workers') - tmp_log.debug('start nWorkers={0}'.format(len(work_spec_list))) + tmp_log = self.make_logger(base_logger, method_name="submit_workers") + tmp_log.debug("start nWorkers={0}".format(len(work_spec_list))) ret_list = [] if not work_spec_list: - tmp_log.debug('empty work_spec_list') + tmp_log.debug("empty work_spec_list") return ret_list # we assume all work_specs in the list belong to the same queue @@ -130,13 +131,11 @@ def submit_workers(self, work_spec_list): ret_val, tmp_dict = tmp_val work_spec.set_attributes_with_dict(tmp_dict) - work_spec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, work_spec.batchID)) - work_spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, work_spec.batchID)) - work_spec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, work_spec.batchID)) + work_spec.set_log_file("batch_log", "{0}/{1}.log".format(self.logBaseURL, work_spec.batchID)) + work_spec.set_log_file("stdout", "{0}/{1}.out".format(self.logBaseURL, work_spec.batchID)) + work_spec.set_log_file("stderr", "{0}/{1}.err".format(self.logBaseURL, work_spec.batchID)) ret_list.append(ret_val) - tmp_log.debug('done') + tmp_log.debug("done") return ret_list - - diff --git a/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py b/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py index 1786ec4f..3545e1d8 100644 --- a/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py +++ b/pandaharvester/harvestersubmitter/cloud_openstack_submitter.py @@ -1,8 +1,9 @@ import os import tempfile + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess import random import uuid @@ -17,16 +18,16 @@ # setup base logger -baseLogger = core_utils.setup_logger('cloud_openstack_submitter') +baseLogger = core_utils.setup_logger("cloud_openstack_submitter") def _init_script_replace(string, **kwarg): new_string = string macro_map = { - '\$\(workerID\)': str(kwarg['workerID']), - '\$\(batchID\)': str(kwarg['batchID']), - '\$\(accessPoint\)': str(kwarg['accessPoint']), - } + "\$\(workerID\)": str(kwarg["workerID"]), + "\$\(batchID\)": str(kwarg["batchID"]), + "\$\(accessPoint\)": str(kwarg["accessPoint"]), + } for k, v in macro_map.items(): new_string = re.sub(k, v, new_string) return new_string @@ -35,14 +36,14 @@ def _init_script_replace(string, **kwarg): # make cloud initialization script def _make_init_script(workspec, template_str): # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_make_init_script') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_make_init_script") # make init tempfile - tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_init.sh', dir=workspec.get_access_point()) + tmpFile = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_init.sh", dir=workspec.get_access_point()) new_template_str = _init_script_replace(template_str, **workspec.__dict__) tmpFile.write(new_template_str) tmpFile.close() - tmpLog.debug('done') + tmpLog.debug("done") return 
tmpFile.name @@ -54,91 +55,86 @@ def __init__(self, **kwarg): self.nProcesses = 4 self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - def _submit_a_vm(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_submit_a_vm') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_submit_a_vm") # initial return values - tmpRetVal = (None, 'Nothing done') + tmpRetVal = (None, "Nothing done") # decide id - vm_name = 'harvester-vm_{0}'.format(str(uuid.uuid4())) + vm_name = "harvester-vm_{0}".format(str(uuid.uuid4())) # # decide image vm_image_id = self.vmImageID # decide flavor - #FIXME + # FIXME if workspec.nCore == 1: - vm_flavor_id = self.jobType_vmFlavor_map['SCORE'] + vm_flavor_id = self.jobType_vmFlavor_map["SCORE"] elif workspec.nCore == 8: - vm_flavor_id = self.jobType_vmFlavor_map['MCORE'] + vm_flavor_id = self.jobType_vmFlavor_map["MCORE"] else: - vm_flavor_id = self.jobType_vmFlavor_map['other'] + vm_flavor_id = self.jobType_vmFlavor_map["other"] # decide userdata with open(self.initScriptTemplate) as _f: template_str = _f.read() vm_userdata_file = _make_init_script(workspec, template_str) - vm_userdata = open(vm_userdata_file, 'r') + vm_userdata = open(vm_userdata_file, "r") # get image and flavor try: vm_image = self.vm_client.nova.glance.find_image(vm_image_id) vm_flavor = self.vm_client.nova.flavors.get(vm_flavor_id) except Exception as _e: - errStr = 'Failed to create a VM with name={0} ; {1}'.format(vm_name, _e) + errStr = "Failed to create a VM with name={0} ; {1}".format(vm_name, _e) tmpLog.error(errStr) tmpRetVal = (None, errStr) return tmpRetVal # create a VM try: - self.vm_client.nova.servers.create( name=vm_name, - image=vm_image, - flavor=vm_flavor, - userdata=vm_userdata, - **self.vmCreateAttributes) + self.vm_client.nova.servers.create(name=vm_name, image=vm_image, flavor=vm_flavor, userdata=vm_userdata, **self.vmCreateAttributes) except Exception as _e: - errStr = 'Failed to create a VM with name={0} ; {1}'.format(vm_name, _e) + errStr = "Failed to create a VM with name={0} ; {1}".format(vm_name, _e) tmpLog.error(errStr) tmpRetVal = (None, errStr) else: try: - vm_server = self.vm_client.nova.servers.list(search_opts={'name': vm_name}, limit=1)[0] + vm_server = self.vm_client.nova.servers.list(search_opts={"name": vm_name}, limit=1)[0] vm_id = vm_server.id except Exception as _e: - errStr = 'Failed to create a VM with name={0} ; {1}'.format(vm_name, _e) + errStr = "Failed to create a VM with name={0} ; {1}".format(vm_name, _e) tmpLog.error(errStr) tmpRetVal = (None, errStr) else: workspec.batchID = vm_id - tmpLog.info('Created a VM with name={vm_name} id={vm_id}'.format(vm_name=vm_name, vm_id=vm_id)) - tmpRetVal = (True, '') + tmpLog.info("Created a VM with name={vm_name} id={vm_id}".format(vm_name=vm_name, vm_id=vm_id)) + tmpRetVal = (True, "") vm_userdata.close() # return return tmpRetVal - # submit workers + def submit_workers(self, workspec_list): # set logger - tmpLog = self.make_logger(baseLogger, method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, method_name="submit_workers") nWorkers = len(workspec_list) - tmpLog.debug('start nWorkers={0}'.format(nWorkers)) + tmpLog.debug("start nWorkers={0}".format(nWorkers)) # exec with multi-thread with ThreadPoolExecutor(self.nProcesses) as thread_pool: retValList = thread_pool.map(self._submit_a_vm, workspec_list) - tmpLog.debug('{0} workers submitted'.format(nWorkers)) + 
tmpLog.debug("{0} workers submitted".format(nWorkers)) # return retList = list(retValList) - tmpLog.debug('done') + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/cobalt_submitter.py b/pandaharvester/harvestersubmitter/cobalt_submitter.py index e609bc15..9e706ef6 100644 --- a/pandaharvester/harvestersubmitter/cobalt_submitter.py +++ b/pandaharvester/harvestersubmitter/cobalt_submitter.py @@ -1,7 +1,8 @@ import tempfile + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess import os import stat @@ -10,7 +11,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('cobalt_submitter') +baseLogger = core_utils.setup_logger("cobalt_submitter") # submitter for Cobalt batch system @@ -31,8 +32,7 @@ def submit_workers(self, workspec_list): retStrList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") # set nCore workSpec.nCore = self.nCore # make batch script @@ -41,20 +41,16 @@ def submit_workers(self, workspec_list): # DPBcomStr = "qsub --cwd {0} {1}".format(workSpec.get_access_point(), batchFile) comStr = "qsub {0}".format(batchFile) # submit - tmpLog.debug('submit with {0}'.format(batchFile)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True) + tmpLog.debug("submit with {0}".format(batchFile)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) if retCode == 0: # extract batchID workSpec.batchID = stdOut.split()[-1] - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # set log files if self.uploadLog: if self.logBaseURL is None: @@ -63,15 +59,15 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL batchLog, stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) if batchLog is not None: - workSpec.set_log_file('batch_log', '{0}/{0}'.format(baseDir, batchLog)) + workSpec.set_log_file("batch_log", "{0}/{0}".format(baseDir, batchLog)) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) tmpRetVal = (False, errStr) retList.append(tmpRetVal) @@ -79,14 +75,11 @@ def submit_workers(self, workspec_list): # make batch script def make_batch_script(self, workspec): - tmpFile = tempfile.NamedTemporaryFile(mode='w+t',delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) - tmpFile.write(self.template.format(nNode=int(workspec.nCore / self.nCorePerNode), - accessPoint=workspec.accessPoint, - workerID=workspec.workerID) - ) + tmpFile = tempfile.NamedTemporaryFile(mode="w+t", delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) + 
tmpFile.write(self.template.format(nNode=int(workspec.nCore / self.nCorePerNode), accessPoint=workspec.accessPoint, workerID=workspec.workerID)) tmpFile.close() - # set execution bit on the temp file + # set execution bit on the temp file st = os.stat(tmpFile.name) os.chmod(tmpFile.name, st.st_mode | stat.S_IEXEC | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH) @@ -99,13 +92,13 @@ def get_log_file_names(self, batch_script, batch_id): stdErr = None with open(batch_script) as f: for line in f: - if not line.startswith('#COBALT'): + if not line.startswith("#COBALT"): continue items = line.split() - if '--debuglog' in items: - batchLog = items[-1].replace('$COBALT_JOBID', batch_id) - elif '-o' in items: - stdOut = items[-1].replace('$COBALT_JOBID', batch_id) - elif '-e' in items: - stdErr = items[-1].replace('$COBALT_JOBID', batch_id) + if "--debuglog" in items: + batchLog = items[-1].replace("$COBALT_JOBID", batch_id) + elif "-o" in items: + stdOut = items[-1].replace("$COBALT_JOBID", batch_id) + elif "-e" in items: + stdErr = items[-1].replace("$COBALT_JOBID", batch_id) return batchLog, stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/dummy_mcore_submitter.py b/pandaharvester/harvestersubmitter/dummy_mcore_submitter.py index cf233272..2dded8ca 100644 --- a/pandaharvester/harvestersubmitter/dummy_mcore_submitter.py +++ b/pandaharvester/harvestersubmitter/dummy_mcore_submitter.py @@ -1,8 +1,9 @@ import uuid import os + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from concurrent.futures import ProcessPoolExecutor as Pool @@ -12,35 +13,28 @@ from pandaharvester.harvestercore.work_spec import WorkSpec # setup base logger -baseLogger = core_utils.setup_logger('dummy_mcore_submitter') +baseLogger = core_utils.setup_logger("dummy_mcore_submitter") # submit a worker using subprocess def submit_a_worker(workspec): - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='submit_a_worker') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="submit_a_worker") workspec.reset_changed_list() if workspec.get_jobspec_list() is not None: - tmpLog.debug('aggregated nCore={0} minRamCount={1} maxDiskCount={2}'.format(workspec.nCore, - workspec.minRamCount, - workspec.maxDiskCount)) - tmpLog.debug('max maxWalltime={0}'.format(workspec.maxWalltime)) + tmpLog.debug("aggregated nCore={0} minRamCount={1} maxDiskCount={2}".format(workspec.nCore, workspec.minRamCount, workspec.maxDiskCount)) + tmpLog.debug("max maxWalltime={0}".format(workspec.maxWalltime)) for jobSpec in workspec.get_jobspec_list(): - tmpLog.debug('PandaID={0} nCore={1} RAM={2}'.format(jobSpec.PandaID, - jobSpec.jobParams['coreCount'], - jobSpec.jobParams['minRamCount'])) + tmpLog.debug("PandaID={0} nCore={1} RAM={2}".format(jobSpec.PandaID, jobSpec.jobParams["coreCount"], jobSpec.jobParams["minRamCount"])) for job in workspec.jobspec_list: - tmpLog.debug(" ".join([job.jobParams['transformation'], job.jobParams['jobPars']])) - workspec.batchID = 'batch_ID_{0}'.format(uuid.uuid4().hex) - workspec.queueName = 'batch_queue_name' - workspec.computingElement = 'CE_name' - f = open(os.path.join(workspec.accessPoint, 'status.txt'), 'w') + tmpLog.debug(" ".join([job.jobParams["transformation"], job.jobParams["jobPars"]])) + workspec.batchID = "batch_ID_{0}".format(uuid.uuid4().hex) + workspec.queueName = "batch_queue_name" + workspec.computingElement = "CE_name" + f = open(os.path.join(workspec.accessPoint, 
"status.txt"), "w") f.write(WorkSpec.ST_submitted) f.close() # fake submission - p = subprocess.Popen(['sleep', '3'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(["sleep", "3"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdoutStr, stderrStr = p.communicate() return (True, stdoutStr + stderrStr), workspec.get_changed_attributes() @@ -49,13 +43,13 @@ def submit_a_worker(workspec): class DummyMcoreSubmitter(PluginBase): # constructor def __init__(self, **kwarg): - self.logBaseURL = 'http://localhost/test' + self.logBaseURL = "http://localhost/test" PluginBase.__init__(self, **kwarg) # submit workers with multiple cores def submit_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, method_name='submit_workers') - tmpLog.debug('start nWorkers={0}'.format(len(workspec_list))) + tmpLog = self.make_logger(baseLogger, method_name="submit_workers") + tmpLog.debug("start nWorkers={0}".format(len(workspec_list))) with Pool() as pool: retValList = pool.map(submit_a_worker, workspec_list) # propagate changed attributes @@ -63,9 +57,9 @@ def submit_workers(self, workspec_list): for workSpec, tmpVal in zip(workspec_list, retValList): retVal, tmpDict = tmpVal workSpec.set_attributes_with_dict(tmpDict) - workSpec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, workSpec.batchID)) - workSpec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, workSpec.batchID)) - workSpec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, workSpec.batchID)) + workSpec.set_log_file("batch_log", "{0}/{1}.log".format(self.logBaseURL, workSpec.batchID)) + workSpec.set_log_file("stdout", "{0}/{1}.out".format(self.logBaseURL, workSpec.batchID)) + workSpec.set_log_file("stderr", "{0}/{1}.err".format(self.logBaseURL, workSpec.batchID)) retList.append(retVal) - tmpLog.debug('done') + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/dummy_singleton_submitter.py b/pandaharvester/harvestersubmitter/dummy_singleton_submitter.py index 10bef3b8..b7c73f21 100644 --- a/pandaharvester/harvestersubmitter/dummy_singleton_submitter.py +++ b/pandaharvester/harvestersubmitter/dummy_singleton_submitter.py @@ -4,7 +4,7 @@ from .dummy_submitter import DummySubmitter # setup base logger -baseLogger = core_utils.setup_logger('dummy_singleton_submitter') +baseLogger = core_utils.setup_logger("dummy_singleton_submitter") # dummy submitter with singleton @@ -14,7 +14,7 @@ class DummySingletonSubmitter(object): # constructor def __init__(self, **kwarg): - key = kwarg['queueName'] + key = kwarg["queueName"] cls = self.__class__ with cls.lock: if key not in cls.instances: diff --git a/pandaharvester/harvestersubmitter/dummy_submitter.py b/pandaharvester/harvestersubmitter/dummy_submitter.py index 90d31b34..674a244c 100644 --- a/pandaharvester/harvestersubmitter/dummy_submitter.py +++ b/pandaharvester/harvestersubmitter/dummy_submitter.py @@ -5,14 +5,14 @@ from pandaharvester.harvestercore.work_spec import WorkSpec # setup base logger -baseLogger = core_utils.setup_logger('dummy_submitter') +baseLogger = core_utils.setup_logger("dummy_submitter") # dummy submitter class DummySubmitter(PluginBase): # constructor def __init__(self, **kwarg): - self.logBaseURL = 'http://localhost/test' + self.logBaseURL = "http://localhost/test" PluginBase.__init__(self, **kwarg) # submit workers @@ -35,32 +35,28 @@ def submit_workers(self, workspec_list): False for permanent failures, None for temporary failures) and dialog message :rtype: [(bool, string),] 
""" - tmpLog = self.make_logger(baseLogger, method_name='submit_workers') - tmpLog.debug('start nWorkers={0}'.format(len(workspec_list))) + tmpLog = self.make_logger(baseLogger, method_name="submit_workers") + tmpLog.debug("start nWorkers={0}".format(len(workspec_list))) retList = [] for workSpec in workspec_list: - workSpec.batchID = 'batch_ID_{0}'.format(uuid.uuid4().hex) - workSpec.queueName = 'batch_queue_name' - workSpec.computingElement = 'CE_name' - workSpec.set_log_file('batch_log', '{0}/{1}.log'.format(self.logBaseURL, workSpec.batchID)) - workSpec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, workSpec.batchID)) - workSpec.set_log_file('stderr', '{0}/{1}.err'.format(self.logBaseURL, workSpec.batchID)) + workSpec.batchID = "batch_ID_{0}".format(uuid.uuid4().hex) + workSpec.queueName = "batch_queue_name" + workSpec.computingElement = "CE_name" + workSpec.set_log_file("batch_log", "{0}/{1}.log".format(self.logBaseURL, workSpec.batchID)) + workSpec.set_log_file("stdout", "{0}/{1}.out".format(self.logBaseURL, workSpec.batchID)) + workSpec.set_log_file("stderr", "{0}/{1}.err".format(self.logBaseURL, workSpec.batchID)) if workSpec.get_jobspec_list() is not None: - tmpLog.debug('aggregated nCore={0} minRamCount={1} maxDiskCount={2}'.format(workSpec.nCore, - workSpec.minRamCount, - workSpec.maxDiskCount)) - tmpLog.debug('max maxWalltime={0}'.format(workSpec.maxWalltime)) + tmpLog.debug("aggregated nCore={0} minRamCount={1} maxDiskCount={2}".format(workSpec.nCore, workSpec.minRamCount, workSpec.maxDiskCount)) + tmpLog.debug("max maxWalltime={0}".format(workSpec.maxWalltime)) for jobSpec in workSpec.get_jobspec_list(): - tmpLog.debug('PandaID={0} nCore={1} RAM={2}'.format(jobSpec.PandaID, - jobSpec.jobParams['coreCount'], - jobSpec.jobParams['minRamCount'])) + tmpLog.debug("PandaID={0} nCore={1} RAM={2}".format(jobSpec.PandaID, jobSpec.jobParams["coreCount"], jobSpec.jobParams["minRamCount"])) # using batchLog URL as pilot ID - jobSpec.set_one_attribute('pilotID', workSpec.workAttributes['batchLog']) + jobSpec.set_one_attribute("pilotID", workSpec.workAttributes["batchLog"]) for job in workSpec.jobspec_list: - tmpLog.debug(" ".join([job.jobParams['transformation'], job.jobParams['jobPars']])) - f = open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') + tmpLog.debug(" ".join([job.jobParams["transformation"], job.jobParams["jobPars"]])) + f = open(os.path.join(workSpec.accessPoint, "status.txt"), "w") f.write(WorkSpec.ST_submitted) f.close() - retList.append((True, '')) - tmpLog.debug('done') + retList.append((True, "")) + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/gitlab_submitter.py b/pandaharvester/harvestersubmitter/gitlab_submitter.py index 220f75ae..1fe114e1 100644 --- a/pandaharvester/harvestersubmitter/gitlab_submitter.py +++ b/pandaharvester/harvestersubmitter/gitlab_submitter.py @@ -7,7 +7,7 @@ from pandaharvester.harvestermisc.gitlab_utils import store_job_params # setup base logger -baseLogger = core_utils.setup_logger('gitlab_submitter') +baseLogger = core_utils.setup_logger("gitlab_submitter") # dummy submitter @@ -19,38 +19,37 @@ def __init__(self, **kwarg): # trigger pipeline jobs def submit_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, method_name='submit_workers') - tmpLog.debug('start nWorkers={0}'.format(len(workspec_list))) + tmpLog = self.make_logger(baseLogger, method_name="submit_workers") + tmpLog.debug("start nWorkers={0}".format(len(workspec_list))) retList = [] for workSpec in 
workspec_list: try: jobSpec = workSpec.get_jobspec_list()[0] - secrets = jobSpec.jobParams['secrets'] - params = json.loads(jobSpec.jobParams['jobPars']) - params['secrets'] = secrets + secrets = jobSpec.jobParams["secrets"] + params = json.loads(jobSpec.jobParams["jobPars"]) + params["secrets"] = secrets store_job_params(workSpec, params) - url = '{}/{}/trigger/pipeline'.format(params['project_api'], params['project_id']) - data = {'token': secrets[params['trigger_token']], - 'ref': params['ref']} + url = "{}/{}/trigger/pipeline".format(params["project_api"], params["project_id"]) + data = {"token": secrets[params["trigger_token"]], "ref": params["ref"]} try: - tmpLog.debug('trigger pipeline at {}'.format(url)) + tmpLog.debug("trigger pipeline at {}".format(url)) r = requests.post(url, data=data, timeout=self.timeout) response = r.json() - tmpLog.debug('got {}'.format(str(response))) + tmpLog.debug("got {}".format(str(response))) except Exception: err_str = core_utils.dump_error_message(tmpLog) retList.append((False, err_str)) continue - if response['status'] == 'created': - workSpec.batchID = '{} {}'.format(response['id'], response['project_id']) - tmpLog.debug('succeeded with {}'.format(workSpec.batchID)) - retList.append((True, '')) + if response["status"] == "created": + workSpec.batchID = "{} {}".format(response["id"], response["project_id"]) + tmpLog.debug("succeeded with {}".format(workSpec.batchID)) + retList.append((True, "")) else: - err_str = 'failed to trigger with {}'.format(response['status']) + err_str = "failed to trigger with {}".format(response["status"]) tmpLog.error(err_str) retList.append((False, err_str)) except Exception: err_str = core_utils.dump_error_message(tmpLog) retList.append((False, err_str)) - tmpLog.debug('done') + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/globus_compute_submitter.py b/pandaharvester/harvestersubmitter/globus_compute_submitter.py index a3568674..17db0343 100644 --- a/pandaharvester/harvestersubmitter/globus_compute_submitter.py +++ b/pandaharvester/harvestersubmitter/globus_compute_submitter.py @@ -16,11 +16,12 @@ from globus_compute_sdk import Client # logger -baseLogger = core_utils.setup_logger('globus_compute_submitter') +baseLogger = core_utils.setup_logger("globus_compute_submitter") def run_wrapper(base_path, data_path, func_str): import traceback + try: import json import os @@ -30,12 +31,12 @@ def run_wrapper(base_path, data_path, func_str): current_dir = os.getcwd() os.chdir(base_path) - os.environ['HARVESTER_WORKER_BASE_PATH'] = base_path - os.environ['HARVESTER_DATA_PATH'] = data_path - os.environ['PYTHONPATH'] = base_path + ":" + os.environ.get("PYTHONPATH", "") + os.environ["HARVESTER_WORKER_BASE_PATH"] = base_path + os.environ["HARVESTER_DATA_PATH"] = data_path + os.environ["PYTHONPATH"] = base_path + ":" + os.environ.get("PYTHONPATH", "") print("hostname: %s" % socket.gethostname()) print("current directory: %s" % os.getcwd()) - print("PYTHONPATH: %s" % os.environ['PYTHONPATH']) + print("PYTHONPATH: %s" % os.environ["PYTHONPATH"]) print("execute programe: %s" % str(func_str)) func_json = json.loads(func_str) @@ -59,7 +60,7 @@ def run_wrapper(base_path, data_path, func_str): print(ex) print(traceback.format_exc()) raise Exception(traceback.format_exc()) - except: + except BaseException: print("traceback") print(traceback.format_exc()) raise Exception(traceback.format_exc()) @@ -93,7 +94,7 @@ def get_job_data(self, workSpec, logger): func_args = {} for jobSpec in jobSpecs: # 
logger.info(jobSpec) - logger.debug(" ".join([jobSpec.jobParams['transformation'], jobSpec.jobParams['jobPars']])) + logger.debug(" ".join([jobSpec.jobParams["transformation"], jobSpec.jobParams["jobPars"]])) panda_id = jobSpec.PandaID func_arg = self.get_job_funcx_args(workSpec, jobSpec, logger) func_args[panda_id] = func_arg @@ -101,20 +102,20 @@ def get_job_data(self, workSpec, logger): def get_panda_argparser(self): if self.parser is None: - parser = argparse.ArgumentParser(description='PanDA argparser') - parser.add_argument('-j', type=str, required=False, default='', help='j') - parser.add_argument('--sourceURL', type=str, required=False, default='', help='source url') - parser.add_argument('-r', type=str, required=False, default='', help='directory') - parser.add_argument('-l', '--lib', required=False, action='store_true', default=False, help='library') - parser.add_argument('-i', '--input', type=str, required=False, default='', help='input') - parser.add_argument('-o', '--output', type=str, required=False, default='', help='output') - parser.add_argument('-p', '--program', type=str, required=False, default='', help='program') - parser.add_argument('-a', '--archive', type=str, required=False, default='', help='source archive file') + parser = argparse.ArgumentParser(description="PanDA argparser") + parser.add_argument("-j", type=str, required=False, default="", help="j") + parser.add_argument("--sourceURL", type=str, required=False, default="", help="source url") + parser.add_argument("-r", type=str, required=False, default="", help="directory") + parser.add_argument("-l", "--lib", required=False, action="store_true", default=False, help="library") + parser.add_argument("-i", "--input", type=str, required=False, default="", help="input") + parser.add_argument("-o", "--output", type=str, required=False, default="", help="output") + parser.add_argument("-p", "--program", type=str, required=False, default="", help="program") + parser.add_argument("-a", "--archive", type=str, required=False, default="", help="source archive file") self.parser = parser return self.parser def get_job_funcx_args(self, workSpec, jobSpec, logger): - job_pars = jobSpec.jobParams['jobPars'] + job_pars = jobSpec.jobParams["jobPars"] job_arguments = shlex.split(job_pars) parser = self.get_panda_argparser() job_args, _ = parser.parse_known_args(job_arguments) @@ -154,12 +155,13 @@ def download_source_codes(self, base_dir, source_url, source_file, logger): os.environ["PANDACACHE_URL"] = source_url logger.info("PANDACACHE_URL: %s" % (os.environ["PANDACACHE_URL"])) from pandaclient import Client + Client.baseURLCSRVSSL = source_url status, output = Client.getFile(archive_basename, output_path=full_output_filename) logger.info("Download archive file from pandacache status: %s, output: %s" % (status, output)) if status != 0: raise RuntimeError("Failed to download archive file from pandacache") - with tarfile.open(full_output_filename, 'r:gz') as f: + with tarfile.open(full_output_filename, "r:gz") as f: f.extractall(base_dir) logger.info("Extract %s to %s" % (full_output_filename, base_dir)) @@ -172,15 +174,13 @@ def submit_workers(self, workspec_list): self.gc_client = Client() self.submit_func_id = self.gc_client.register_function(run_wrapper) except Exception as ex: - tmpLog = self.make_logger(baseLogger, "init_gc_client", - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "init_gc_client", method_name="submit_workers") tmpLog.error("Failed to init gc client: %s" % str(ex)) 
tmpLog.error(traceback.format_exc()) for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") try: if self.gc_client is None or self.submit_func_id is None: errStr = "Globus Compute client is not initialized" @@ -201,8 +201,8 @@ def submit_workers(self, workspec_list): batch_ids.append(batch_id) workSpec.batchID = json.dumps(batch_ids) - tmpLog.debug('PanDAID={0}'.format([panda_id for panda_id in func_args])) - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + tmpLog.debug("PanDAID={0}".format([panda_id for panda_id in func_args])) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # batch_id = self.gc_client.run(base_path, data_path, job_script, endpoint_id=self.funcxEndpointId, function_id=self.submit_func_id) # workSpec.batchID = batch_id # tmpLog.debug('batchID={0}'.format(workSpec.batchID)) @@ -215,10 +215,10 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL stdOut, stdErr = self.get_log_file_names(workSpec.batchID) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") except Exception as ex: # failed errStr = str(ex) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 48436c15..e7e716da 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -22,48 +22,51 @@ from pandaharvester.harvestersubmitter import submitter_common # logger -baseLogger = core_utils.setup_logger('htcondor_submitter') +baseLogger = core_utils.setup_logger("htcondor_submitter") # Replace condor Macro from SDF file, return string def _condor_macro_replace(string, **kwarg): new_string = string macro_map = { - '\$\(Cluster\)': str(kwarg['ClusterId']), - '\$\(Process\)': str(kwarg['ProcId']), - } + "\$\(Cluster\)": str(kwarg["ClusterId"]), + "\$\(Process\)": str(kwarg["ProcId"]), + } for k, v in macro_map.items(): new_string = re.sub(k, v, new_string) return new_string + # submit a bag of workers + + def submit_bag_of_workers(data_list): # make logger - tmpLog = core_utils.make_logger(baseLogger, method_name='submit_bag_of_workers') + tmpLog = core_utils.make_logger(baseLogger, method_name="submit_bag_of_workers") # keep order of workers in data_list - workerIDs_list = [ data['workspec'].workerID for data in data_list ] + workerIDs_list = [data["workspec"].workerID for data in data_list] # initialization worker_retval_map = {} worker_data_map = {} host_jdl_list_workerid_map = {} # go for data in data_list: - workspec = data['workspec'] + workspec = data["workspec"] workerID = workspec.workerID worker_data_map[workerID] = data - to_submit = data['to_submit'] + to_submit = data["to_submit"] # no need to submit bad worker if not to_submit: - errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID) + errStr = "{0} not submitted due to incomplete data of the worker".format(workerID) tmpLog.warning(errStr) tmpRetVal = (None, errStr) # return tmpRetVal, workspec.get_changed_attributes() 
worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes()) # attributes try: - use_spool = data['use_spool'] + use_spool = data["use_spool"] except KeyError: - errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID) + errStr = "{0} not submitted due to incomplete data of the worker".format(workerID) tmpLog.warning(errStr) tmpRetVal = (None, errStr) # return tmpRetVal, workspec.get_changed_attributes() @@ -80,81 +83,100 @@ def submit_bag_of_workers(data_list): # loop over submissionHost for host, val_list in host_jdl_list_workerid_map.items(): # make jdl string of workers - jdl_list = [ val[1] for val in val_list ] + jdl_list = [val[1] for val in val_list] # condor job submit object - tmpLog.debug('submitting to submissionHost={0}'.format(host)) + tmpLog.debug("submitting to submissionHost={0}".format(host)) # submit try: condor_job_submit = CondorJobSubmit(id=host) batchIDs_list, ret_err_str = condor_job_submit.submit(jdl_list, use_spool=use_spool) except Exception as e: batchIDs_list = None - ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) # result if batchIDs_list: # submitted n_workers = len(val_list) - tmpLog.debug('submitted {0} workers to submissionHost={1}'.format(n_workers, host)) + tmpLog.debug("submitted {0} workers to submissionHost={1}".format(n_workers, host)) for val_i in range(n_workers): val = val_list[val_i] workspec = val[0] placeholder_map = val[2] # got batchID workspec.batchID = batchIDs_list[val_i] - tmpLog.debug('workerID={0} submissionHost={1} batchID={2}'.format( - workspec.workerID, workspec.submissionHost, workspec.batchID)) + tmpLog.debug("workerID={0} submissionHost={1} batchID={2}".format(workspec.workerID, workspec.submissionHost, workspec.batchID)) # get worker data data = worker_data_map[workspec.workerID] # set computingElement - ce_info_dict = data['ce_info_dict'] - workspec.computingElement = ce_info_dict.get('ce_endpoint', '') + ce_info_dict = data["ce_info_dict"] + workspec.computingElement = ce_info_dict.get("ce_endpoint", "") # set log - batch_log_dict = data['batch_log_dict'] + batch_log_dict = data["batch_log_dict"] (clusterid, procid) = get_job_id_tuple_from_batchid(workspec.batchID) - batch_log = _condor_macro_replace(batch_log_dict['batch_log'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) - batch_stdout = _condor_macro_replace(batch_log_dict['batch_stdout'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) - batch_stderr = _condor_macro_replace(batch_log_dict['batch_stderr'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) + batch_log = _condor_macro_replace(batch_log_dict["batch_log"], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) + batch_stdout = _condor_macro_replace(batch_log_dict["batch_stdout"], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) + batch_stderr = _condor_macro_replace(batch_log_dict["batch_stderr"], ClusterId=clusterid, ProcId=procid).format(**placeholder_map) try: - batch_jdl = '{0}.jdl'.format(batch_stderr[:-4]) + batch_jdl = "{0}.jdl".format(batch_stderr[:-4]) except Exception: batch_jdl = None - workspec.set_log_file('batch_log', batch_log) - workspec.set_log_file('stdout', batch_stdout) - workspec.set_log_file('stderr', batch_stderr) - workspec.set_log_file('jdl', batch_jdl) + workspec.set_log_file("batch_log", batch_log) + workspec.set_log_file("stdout", batch_stdout) + workspec.set_log_file("stderr", batch_stderr) + 
workspec.set_log_file("jdl", batch_jdl) if not workspec.get_jobspec_list(): - tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(workspec.workerID)) + tmpLog.debug("No jobspec associated in the worker of workerID={0}".format(workspec.workerID)) else: for jobSpec in workspec.get_jobspec_list(): # using batchLog and stdOut URL as pilotID and pilotLog - jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut']) - jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog']) - tmpLog.debug('Done set_log_file after submission of workerID={0}'.format(workspec.workerID)) - tmpRetVal = (True, '') + jobSpec.set_one_attribute("pilotID", workspec.workAttributes["stdOut"]) + jobSpec.set_one_attribute("pilotLog", workspec.workAttributes["batchLog"]) + tmpLog.debug("Done set_log_file after submission of workerID={0}".format(workspec.workerID)) + tmpRetVal = (True, "") worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes()) else: # failed - tmpLog.debug('failed to submit workers to submissionHost={0} ; {1}'.format(host, ret_err_str)) + tmpLog.debug("failed to submit workers to submissionHost={0} ; {1}".format(host, ret_err_str)) for val in val_list: workspec = val[0] - errStr = 'submission failed: {0}'.format(ret_err_str) + errStr = "submission failed: {0}".format(ret_err_str) tmpLog.error(errStr) tmpRetVal = (None, errStr) worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes()) # make return list - retValList = [ worker_retval_map[w_id] for w_id in workerIDs_list ] + retValList = [worker_retval_map[w_id] for w_id in workerIDs_list] return retValList + # make a condor jdl for a worker -def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file, - x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), - pilot_url=None, pilot_args='', is_unified_dispatch=False, - special_par='', harvester_queue_config=None, is_unified_queue=False, - pilot_version='unknown', python_version='unknown', prod_rc_permille=0, - token_dir=None, is_gpu_resource=False, **kwarg): + + +def make_a_jdl( + workspec, + template, + n_core_per_node, + log_dir, + panda_queue_name, + executable_file, + x509_user_proxy, + log_subdir=None, + ce_info_dict=dict(), + batch_log_dict=dict(), + pilot_url=None, + pilot_args="", + is_unified_dispatch=False, + special_par="", + harvester_queue_config=None, + is_unified_queue=False, + pilot_version="unknown", + python_version="unknown", + prod_rc_permille=0, + token_dir=None, + is_gpu_resource=False, + **kwarg +): # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='make_a_jdl') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="make_a_jdl") # Note: In workspec, unit of minRamCount and of maxDiskCount are both MB. # In HTCondor SDF, unit of request_memory is MB, and request_disk is KB. 
n_core_total = workspec.nCore if workspec.nCore else n_core_per_node @@ -166,124 +188,131 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e batch_log_dict = batch_log_dict.copy() # possible override by CRIC special_par if special_par: - special_par_attr_list = ['queue', 'maxWallTime', 'xcount', ] - _match_special_par_dict = { attr: re.search('\({attr}=([^)]+)\)'.format(attr=attr), special_par) \ - for attr in special_par_attr_list } + special_par_attr_list = [ + "queue", + "maxWallTime", + "xcount", + ] + _match_special_par_dict = {attr: re.search("\({attr}=([^)]+)\)".format(attr=attr), special_par) for attr in special_par_attr_list} for attr, _match in _match_special_par_dict.items(): if not _match: continue - elif attr == 'queue': - ce_info_dict['ce_queue_name'] = str(_match.group(1)) - elif attr == 'maxWallTime': + elif attr == "queue": + ce_info_dict["ce_queue_name"] = str(_match.group(1)) + elif attr == "maxWallTime": request_walltime = int(_match.group(1)) - elif attr == 'xcount': + elif attr == "xcount": n_core_total = int(_match.group(1)) - tmpLog.debug('job attributes override by CRIC special_par: {0}={1}'.format(attr, str(_match.group(1)))) + tmpLog.debug("job attributes override by CRIC special_par: {0}={1}".format(attr, str(_match.group(1)))) # derived job attributes - n_node = ceil(n_core_total/n_core_per_node) + n_node = ceil(n_core_total / n_core_per_node) request_ram_bytes = request_ram * 2**20 - request_ram_per_core = ceil(request_ram*n_node/n_core_total) - request_ram_bytes_per_core = ceil(request_ram_bytes*n_node/n_core_total) + request_ram_per_core = ceil(request_ram * n_node / n_core_total) + request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total) request_cputime = request_walltime * n_core_total - request_walltime_minute = ceil(request_walltime/60) - request_cputime_minute = ceil(request_cputime/60) + request_walltime_minute = ceil(request_walltime / 60) + request_cputime_minute = ceil(request_cputime / 60) # decide prodSourceLabel pilot_opt_dict = submitter_common.get_complicated_pilot_options( - pilot_type=workspec.pilotType, - pilot_url=pilot_url, - pilot_version=pilot_version, - prod_source_label=harvester_queue_config.get_source_label(workspec.jobType), - prod_rc_permille=prod_rc_permille) - prod_source_label = pilot_opt_dict['prod_source_label'] - pilot_type_opt = pilot_opt_dict['pilot_type_opt'] - pilot_url_str = pilot_opt_dict['pilot_url_str'] - pilot_debug_str = pilot_opt_dict['pilot_debug_str'] + pilot_type=workspec.pilotType, + pilot_url=pilot_url, + pilot_version=pilot_version, + prod_source_label=harvester_queue_config.get_source_label(workspec.jobType), + prod_rc_permille=prod_rc_permille, + ) + prod_source_label = pilot_opt_dict["prod_source_label"] + pilot_type_opt = pilot_opt_dict["pilot_type_opt"] + pilot_url_str = pilot_opt_dict["pilot_url_str"] + pilot_debug_str = pilot_opt_dict["pilot_debug_str"] # get token filename according to CE token_filename = None - if token_dir is not None and ce_info_dict.get('ce_endpoint'): - token_filename = endpoint_to_filename(ce_info_dict['ce_endpoint']) + if token_dir is not None and ce_info_dict.get("ce_endpoint"): + token_filename = endpoint_to_filename(ce_info_dict["ce_endpoint"]) token_path = None if token_dir is not None and token_filename is not None: token_path = os.path.join(token_dir, token_filename) else: - tmpLog.warning('token_path is None: site={0}, token_dir={1} , token_filename={2}'.format(panda_queue_name, token_dir, token_filename)) + 
tmpLog.warning("token_path is None: site={0}, token_dir={1} , token_filename={2}".format(panda_queue_name, token_dir, token_filename)) # open tmpfile as submit description file - tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point()) + tmpFile = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_submit.sdf", dir=workspec.get_access_point()) # placeholder map placeholder_map = { - 'sdfPath': tmpFile.name, - 'executableFile': executable_file, - 'nCorePerNode': n_core_per_node, - 'nCoreTotal': n_core_total, - 'nNode': n_node, - 'requestRam': request_ram, - 'requestRamBytes': request_ram_bytes, - 'requestRamPerCore': request_ram_per_core, - 'requestRamBytesPerCore': request_ram_bytes_per_core, - 'requestDisk': request_disk, - 'requestWalltime': request_walltime, - 'requestWalltimeMinute': request_walltime_minute, - 'requestCputime': request_cputime, - 'requestCputimeMinute': request_cputime_minute, - 'accessPoint': workspec.accessPoint, - 'harvesterID': harvester_config.master.harvester_id, - 'workerID': workspec.workerID, - 'computingSite': workspec.computingSite, - 'pandaQueueName': panda_queue_name, - 'x509UserProxy': x509_user_proxy, - 'ceEndpoint': ce_info_dict.get('ce_endpoint', ''), - 'ceHostname': ce_info_dict.get('ce_hostname', ''), - 'ceFlavour': ce_info_dict.get('ce_flavour', ''), - 'ceJobmanager': ce_info_dict.get('ce_jobmanager', ''), - 'ceQueueName': ce_info_dict.get('ce_queue_name', ''), - 'ceVersion': ce_info_dict.get('ce_version', ''), - 'logDir': log_dir, - 'logSubdir': log_subdir, - 'gtag': batch_log_dict.get('gtag', 'fake_GTAG_string'), - 'prodSourceLabel': prod_source_label, - 'jobType': workspec.jobType, - 'resourceType': submitter_common.get_resource_type(workspec.resourceType, is_unified_queue), - 'pilotResourceTypeOption': submitter_common.get_resource_type(workspec.resourceType, is_unified_queue, True), - 'ioIntensity': io_intensity, - 'pilotType': pilot_type_opt, - 'pilotUrlOption': pilot_url_str, - 'pilotVersion': pilot_version, - 'pilotPythonOption': submitter_common.get_python_version_option(python_version, prod_source_label), - 'pilotDebugOption': pilot_debug_str, - 'pilotArgs': pilot_args, - 'submissionHost': workspec.submissionHost, - 'submissionHostShort': workspec.submissionHost.split('.')[0], - 'ceARCGridType': ce_info_dict.get('ce_grid_type', 'arc'), - 'tokenDir': token_dir, - 'tokenFilename': token_filename, - 'tokenPath': token_path, - 'pilotJobLabel': submitter_common.get_joblabel(prod_source_label, is_unified_dispatch), - 'pilotJobType': submitter_common.get_pilot_job_type(workspec.jobType, is_unified_dispatch), - 'requestGpus': 1 if is_gpu_resource else 0, - 'requireGpus': is_gpu_resource, - } + "sdfPath": tmpFile.name, + "executableFile": executable_file, + "nCorePerNode": n_core_per_node, + "nCoreTotal": n_core_total, + "nNode": n_node, + "requestRam": request_ram, + "requestRamBytes": request_ram_bytes, + "requestRamPerCore": request_ram_per_core, + "requestRamBytesPerCore": request_ram_bytes_per_core, + "requestDisk": request_disk, + "requestWalltime": request_walltime, + "requestWalltimeMinute": request_walltime_minute, + "requestCputime": request_cputime, + "requestCputimeMinute": request_cputime_minute, + "accessPoint": workspec.accessPoint, + "harvesterID": harvester_config.master.harvester_id, + "workerID": workspec.workerID, + "computingSite": workspec.computingSite, + "pandaQueueName": panda_queue_name, + "x509UserProxy": x509_user_proxy, + "ceEndpoint": 
ce_info_dict.get("ce_endpoint", ""), + "ceHostname": ce_info_dict.get("ce_hostname", ""), + "ceFlavour": ce_info_dict.get("ce_flavour", ""), + "ceJobmanager": ce_info_dict.get("ce_jobmanager", ""), + "ceQueueName": ce_info_dict.get("ce_queue_name", ""), + "ceVersion": ce_info_dict.get("ce_version", ""), + "logDir": log_dir, + "logSubdir": log_subdir, + "gtag": batch_log_dict.get("gtag", "fake_GTAG_string"), + "prodSourceLabel": prod_source_label, + "jobType": workspec.jobType, + "resourceType": submitter_common.get_resource_type(workspec.resourceType, is_unified_queue), + "pilotResourceTypeOption": submitter_common.get_resource_type(workspec.resourceType, is_unified_queue, True), + "ioIntensity": io_intensity, + "pilotType": pilot_type_opt, + "pilotUrlOption": pilot_url_str, + "pilotVersion": pilot_version, + "pilotPythonOption": submitter_common.get_python_version_option(python_version, prod_source_label), + "pilotDebugOption": pilot_debug_str, + "pilotArgs": pilot_args, + "submissionHost": workspec.submissionHost, + "submissionHostShort": workspec.submissionHost.split(".")[0], + "ceARCGridType": ce_info_dict.get("ce_grid_type", "arc"), + "tokenDir": token_dir, + "tokenFilename": token_filename, + "tokenPath": token_path, + "pilotJobLabel": submitter_common.get_joblabel(prod_source_label, is_unified_dispatch), + "pilotJobType": submitter_common.get_pilot_job_type(workspec.jobType, is_unified_dispatch), + "requestGpus": 1 if is_gpu_resource else 0, + "requireGpus": is_gpu_resource, + } # fill in template string jdl_str = template.format(**placeholder_map) # save jdl to submit description file tmpFile.write(jdl_str) tmpFile.close() - tmpLog.debug('saved sdf at {0}'.format(tmpFile.name)) - tmpLog.debug('done') + tmpLog.debug("saved sdf at {0}".format(tmpFile.name)) + tmpLog.debug("done") return jdl_str, placeholder_map + # parse log, stdout, stderr filename + + def parse_batch_job_filename(value_str, file_dir, batchID, guess=False): _filename = os.path.basename(value_str) if guess: # guess file name before files really created; possibly containing condor macros return _filename else: - _sanitized_list = re.sub('\{(\w+)\}|\[(\w+)\]|\((\w+)\)|#(\w+)#|\$', '', _filename).split('.') + _sanitized_list = re.sub("\{(\w+)\}|\[(\w+)\]|\((\w+)\)|#(\w+)#|\$", "", _filename).split(".") _prefix = _sanitized_list[0] - _suffix = _sanitized_list[-1] if len(_sanitized_list) > 1 else '' + _suffix = _sanitized_list[-1] if len(_sanitized_list) > 1 else "" for _f in os.listdir(file_dir): - if re.match('{prefix}(.*)\.{batchID}\.(.*)\.{suffix}'.format(prefix=_prefix, suffix=_suffix, batchID=batchID), _f): + if re.match("{prefix}(.*)\.{batchID}\.(.*)\.{suffix}".format(prefix=_prefix, suffix=_suffix, batchID=batchID), _f): return _f return None @@ -292,10 +321,10 @@ def parse_batch_job_filename(value_str, file_dir, batchID, guess=False): class HTCondorSubmitter(PluginBase): # constructor def __init__(self, **kwarg): - tmpLog = core_utils.make_logger(baseLogger, method_name='__init__') + tmpLog = core_utils.make_logger(baseLogger, method_name="__init__") self.logBaseURL = None self.templateFile = None - if hasattr(self, 'useFQDN') and self.useFQDN: + if hasattr(self, "useFQDN") and self.useFQDN: self.hostname = socket.getfqdn() else: self.hostname = socket.gethostname().split(".")[0] @@ -316,7 +345,7 @@ def __init__(self, **kwarg): # condor log directory try: self.logDir - if '$hostname' in self.logDir or '${hostname}' in self.logDir: + if "$hostname" in self.logDir or "${hostname}" in self.logDir: self.logDir = 
self.logDir.replace("$hostname", self.hostname).replace("${hostname}", self.hostname) try: if not os.path.exists(self.logDir): @@ -324,26 +353,26 @@ def __init__(self, **kwarg): except Exception as ex: tmpLog.debug("Failed to create logDir(%s): %s" % (self.logDir, str(ex))) except AttributeError: - self.logDir = os.getenv('TMPDIR') or '/tmp' + self.logDir = os.getenv("TMPDIR") or "/tmp" # log base url try: self.logBaseURL - if '$hostname' in self.logBaseURL or '${hostname}' in self.logBaseURL: + if "$hostname" in self.logBaseURL or "${hostname}" in self.logBaseURL: self.logBaseURL = self.logBaseURL.replace("$hostname", self.hostname).replace("${hostname}", self.hostname) except AttributeError: self.logBaseURL = None - if self.logBaseURL and '${harvester_id}' in self.logBaseURL: + if self.logBaseURL and "${harvester_id}" in self.logBaseURL: self.logBaseURL = self.logBaseURL.replace("${harvester_id}", harvester_config.master.harvester_id) # Default x509 proxy for a queue try: self.x509UserProxy except AttributeError: - self.x509UserProxy = os.getenv('X509_USER_PROXY') + self.x509UserProxy = os.getenv("X509_USER_PROXY") # x509 proxy for analysis jobs in grandly unified queues try: self.x509UserProxyAnalysis except AttributeError: - self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL') + self.x509UserProxyAnalysis = os.getenv("X509_USER_PROXY_ANAL") # Default token directory for a queue try: self.tokenDir @@ -382,17 +411,17 @@ def __init__(self, **kwarg): try: self.CEtemplateDir except AttributeError: - self.CEtemplateDir = '' + self.CEtemplateDir = "" # remote condor schedd and pool name (collector) try: self.condorSchedd - if '$hostname' in self.condorSchedd or '${hostname}' in self.condorSchedd: + if "$hostname" in self.condorSchedd or "${hostname}" in self.condorSchedd: self.condorSchedd = self.condorSchedd.replace("$hostname", self.hostname).replace("${hostname}", self.hostname) except AttributeError: self.condorSchedd = None try: self.condorPool - if '$hostname' in self.condorPool or '${hostname}' in self.condorPool: + if "$hostname" in self.condorPool or "${hostname}" in self.condorPool: self.condorPool = self.condorPool.replace("$hostname", self.hostname).replace("${hostname}", self.hostname) except AttributeError: self.condorPool = None @@ -406,16 +435,16 @@ def __init__(self, **kwarg): self.condorSchedd = [] self.condorPool = [] self.condorHostWeight = [] - with open(self.condorHostConfig, 'r') as f: + with open(self.condorHostConfig, "r") as f: condor_host_config_map = json.load(f) for _schedd, _cm in condor_host_config_map.items(): - _pool = _cm['pool'] - _weight = int(_cm['weight']) + _pool = _cm["pool"] + _weight = int(_cm["weight"]) self.condorSchedd.append(_schedd) self.condorPool.append(_pool) self.condorHostWeight.append(_weight) except Exception as e: - tmpLog.error('error when parsing condorHostConfig json file; {0}: {1}'.format(e.__class__.__name__, e)) + tmpLog.error("error when parsing condorHostConfig json file; {0}: {1}".format(e.__class__.__name__, e)) raise else: if isinstance(self.condorSchedd, list): @@ -443,25 +472,25 @@ def __init__(self, **kwarg): except AttributeError: self.rcPilotRandomWeightPermille = 0 # submission to ARC CE's with nordugrid (gridftp) or arc (REST) grid type - self.submit_arc_grid_type = 'arc' + self.submit_arc_grid_type = "arc" try: - extra_plugin_configs = harvester_config.master.extraPluginConfigs['HTCondorSubmitter'] + extra_plugin_configs = harvester_config.master.extraPluginConfigs["HTCondorSubmitter"] except AttributeError: 
pass except KeyError: pass else: - if extra_plugin_configs.get('submit_arc_grid_type') == 'nordugrid': - self.submit_arc_grid_type = 'nordugrid' + if extra_plugin_configs.get("submit_arc_grid_type") == "nordugrid": + self.submit_arc_grid_type = "nordugrid" # record of information of CE statistics self.ceStatsLock = threading.Lock() self.ceStats = dict() # allowed associated parameters from CRIC self._allowed_cric_attrs = ( - 'pilot_url', - 'pilot_args', - 'unified_dispatch', - ) + "pilot_url", + "pilot_args", + "unified_dispatch", + ) # get CE statistics of a site def get_ce_statistics(self, site_name, n_new_workers, time_window=21600): @@ -478,17 +507,17 @@ def get_ce_statistics(self, site_name, n_new_workers, time_window=21600): # submit workers def submit_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, 'site={0}'.format(self.queueName), method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "site={0}".format(self.queueName), method_name="submit_workers") nWorkers = len(workspec_list) - tmpLog.debug('start nWorkers={0}'.format(nWorkers)) + tmpLog.debug("start nWorkers={0}".format(nWorkers)) # whether to submit any worker to_submit_any = True # get log subdirectory name from timestamp timeNow = datetime.datetime.utcnow() - log_subdir = timeNow.strftime('%y-%m-%d_%H') + log_subdir = timeNow.strftime("%y-%m-%d_%H") log_subdir_path = os.path.join(self.logDir, log_subdir) if self.condorSchedd is None or not self.useSpool: try: @@ -519,7 +548,7 @@ def submit_workers(self, workspec_list): if key in self._allowed_cric_attrs: if isinstance(val, str): # sanitized list the value - val = re.sub(r'[;$~`]*', '', val) + val = re.sub(r"[;$~`]*", "", val) associated_params_dict[key] = val else: panda_queues_dict = dict() @@ -527,14 +556,14 @@ def submit_workers(self, workspec_list): this_panda_queue_dict = dict() # get default information from queue info - n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 - is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore' - is_unified_dispatch = associated_params_dict.get('unified_dispatch', False) - pilot_url = associated_params_dict.get('pilot_url') - pilot_args = associated_params_dict.get('pilot_args', '') - pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current')) - python_version = str(this_panda_queue_dict.get('python_version', '2')) - is_gpu_resource = this_panda_queue_dict.get('resource_type', '') == 'gpu' + n_core_per_node_from_queue = this_panda_queue_dict.get("corecount", 1) if this_panda_queue_dict.get("corecount", 1) else 1 + is_unified_queue = this_panda_queue_dict.get("capability", "") == "ucore" + is_unified_dispatch = associated_params_dict.get("unified_dispatch", False) + pilot_url = associated_params_dict.get("pilot_url") + pilot_args = associated_params_dict.get("pilot_args", "") + pilot_version = str(this_panda_queue_dict.get("pilot_version", "current")) + python_version = str(this_panda_queue_dict.get("python_version", "2")) + is_gpu_resource = this_panda_queue_dict.get("resource_type", "") == "gpu" # get override requirements from queue configured try: @@ -543,7 +572,7 @@ def submit_workers(self, workspec_list): n_core_per_node = n_core_per_node_from_queue # deal with Condor schedd and central managers; make a random list the choose - n_bulks = ceil(nWorkers/self.minBulkToRamdomizedSchedd) + n_bulks = ceil(nWorkers / self.minBulkToRamdomizedSchedd) if isinstance(self.condorSchedd, list) and 
len(self.condorSchedd) > 0: orig_list = [] if isinstance(self.condorPool, list) and len(self.condorPool) > 0: @@ -560,62 +589,61 @@ def submit_workers(self, workspec_list): schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)] # deal with CE - special_par = '' + special_par = "" ce_weighting = None if self.useCRICGridCE: # If CRIC Grid CE mode used - tmpLog.debug('Using CRIC Grid CE mode...') - queues_from_queue_list = this_panda_queue_dict.get('queues', []) - special_par = this_panda_queue_dict.get('special_par', '') + tmpLog.debug("Using CRIC Grid CE mode...") + queues_from_queue_list = this_panda_queue_dict.get("queues", []) + special_par = this_panda_queue_dict.get("special_par", "") ce_auxilary_dict = {} for _queue_dict in queues_from_queue_list: - if not ( _queue_dict.get('ce_endpoint') - and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE' - and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce']) ): + if not ( + _queue_dict.get("ce_endpoint") + and str(_queue_dict.get("ce_state", "")).upper() == "ACTIVE" + and str(_queue_dict.get("ce_flavour", "")).lower() in set(["arc-ce", "cream-ce", "htcondor-ce"]) + ): continue ce_info_dict = _queue_dict.copy() # ignore protocol prefix in ce_endpoint for cream and condor CE # check protocol prefix for ARC CE (gridftp or REST) - _match_ce_endpoint = re.match('^(\w+)://(\w+)', ce_info_dict.get('ce_endpoint', '')) - ce_endpoint_prefix = '' + _match_ce_endpoint = re.match("^(\w+)://(\w+)", ce_info_dict.get("ce_endpoint", "")) + ce_endpoint_prefix = "" if _match_ce_endpoint: ce_endpoint_prefix = _match_ce_endpoint.group(1) - ce_endpoint_from_queue = re.sub('^\w+://', '', ce_info_dict.get('ce_endpoint', '')) - ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() - ce_version_str = str(ce_info_dict.get('ce_version', '')).lower() + ce_endpoint_from_queue = re.sub("^\w+://", "", ce_info_dict.get("ce_endpoint", "")) + ce_flavour_str = str(ce_info_dict.get("ce_flavour", "")).lower() + ce_version_str = str(ce_info_dict.get("ce_version", "")).lower() # grid type of htcondor grid universe to use; empty string as default - ce_info_dict['ce_grid_type'] = '' - if ce_flavour_str == 'arc-ce': - ce_info_dict['ce_grid_type'] = self.submit_arc_grid_type - ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue) - if ce_info_dict['ce_grid_type'] == 'arc': + ce_info_dict["ce_grid_type"] = "" + if ce_flavour_str == "arc-ce": + ce_info_dict["ce_grid_type"] = self.submit_arc_grid_type + ce_info_dict["ce_hostname"] = re.sub(":\w*", "", ce_endpoint_from_queue) + if ce_info_dict["ce_grid_type"] == "arc": default_port = None - if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue: + if ce_info_dict["ce_hostname"] == ce_endpoint_from_queue: # defaut port default_port = 443 else: # change port 2811 to 443 - ce_endpoint_from_queue = re.sub(r':2811$', ':443', ce_endpoint_from_queue) - ce_info_dict['ce_endpoint'] = '{0}{1}'.format(ce_endpoint_from_queue, - ':{0}'.format(default_port) if default_port is not None else '') + ce_endpoint_from_queue = re.sub(r":2811$", ":443", ce_endpoint_from_queue) + ce_info_dict["ce_endpoint"] = "{0}{1}".format(ce_endpoint_from_queue, ":{0}".format(default_port) if default_port is not None else "") else: - if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue: + if ce_info_dict["ce_hostname"] == ce_endpoint_from_queue: # add default port to ce_endpoint if missing default_port_map = { - 'cream-ce': 8443, - 'arc-ce': 2811, - 'htcondor-ce': 9619, - } + 
"cream-ce": 8443, + "arc-ce": 2811, + "htcondor-ce": 9619, + } if ce_flavour_str in default_port_map: default_port = default_port_map[ce_flavour_str] - ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port) - if ce_flavour_str == 'arc-ce': - ce_info_dict['ce_endpoint'] = '{0}'.format(ce_endpoint_from_queue) - tmpLog.debug('Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format( - pilot_version, ce_endpoint_from_queue, ce_flavour_str)) - ce_endpoint = ce_info_dict.get('ce_endpoint') - if ( ce_endpoint in ce_auxilary_dict - and str(ce_info_dict.get('ce_queue_name', '')).lower() == 'default' ): + ce_info_dict["ce_endpoint"] = "{0}:{1}".format(ce_endpoint_from_queue, default_port) + if ce_flavour_str == "arc-ce": + ce_info_dict["ce_endpoint"] = "{0}".format(ce_endpoint_from_queue) + tmpLog.debug('Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format(pilot_version, ce_endpoint_from_queue, ce_flavour_str)) + ce_endpoint = ce_info_dict.get("ce_endpoint") + if ce_endpoint in ce_auxilary_dict and str(ce_info_dict.get("ce_queue_name", "")).lower() == "default": pass else: ce_auxilary_dict[ce_endpoint] = ce_info_dict @@ -623,23 +651,22 @@ def submit_workers(self, workspec_list): n_qualified_ce = len(ce_auxilary_dict) if n_qualified_ce > 0: # Get CE weighting - tmpLog.debug('Get CE weighting') + tmpLog.debug("Get CE weighting") worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers) - is_slave_queue = (harvester_queue_config.runMode == 'slave') - ce_weighting = submitter_common.get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()), - worker_ce_all_tuple=worker_ce_all_tuple, - is_slave_queue=is_slave_queue) - stats_weighting_display_str = submitter_common.get_ce_stats_weighting_display( - ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting) - tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str)) + is_slave_queue = harvester_queue_config.runMode == "slave" + ce_weighting = submitter_common.get_ce_weighting( + ce_endpoint_list=list(ce_auxilary_dict.keys()), worker_ce_all_tuple=worker_ce_all_tuple, is_slave_queue=is_slave_queue + ) + stats_weighting_display_str = submitter_common.get_ce_stats_weighting_display(ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting) + tmpLog.debug("CE stats and weighting: {0}".format(stats_weighting_display_str)) else: - tmpLog.error('No valid CE endpoint found') + tmpLog.error("No valid CE endpoint found") to_submit_any = False def _handle_one_worker(workspec, to_submit=to_submit_any): # make logger - tmpLog = core_utils.make_logger(baseLogger, 'site={0} workerID={1}'.format(self.queueName, workspec.workerID), - method_name='_handle_one_worker') + tmpLog = core_utils.make_logger(baseLogger, "site={0} workerID={1}".format(self.queueName, workspec.workerID), method_name="_handle_one_worker") + def _choose_credential(workspec): """ Choose the credential based on the job type @@ -647,46 +674,48 @@ def _choose_credential(workspec): job_type = workspec.jobType proxy = self.x509UserProxy token_dir = self.tokenDir - if (not is_unified_dispatch and is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis')) \ - or self.useAnalysisCredentials: + if (not is_unified_dispatch and is_grandly_unified_queue and job_type in ("user", "panda", "analysis")) or self.useAnalysisCredentials: if self.x509UserProxyAnalysis: - tmpLog.debug('Taking analysis proxy') + tmpLog.debug("Taking analysis proxy") proxy = self.x509UserProxyAnalysis if self.tokenDirAnalysis: - 
tmpLog.debug('Taking analysis token_dir') + tmpLog.debug("Taking analysis token_dir") token_dir = self.tokenDirAnalysis else: - tmpLog.debug('Taking default proxy') + tmpLog.debug("Taking default proxy") if self.tokenDir: - tmpLog.debug('Taking default token_dir') + tmpLog.debug("Taking default token_dir") return proxy, token_dir + # initialize ce_info_dict = dict() batch_log_dict = dict() - data = {'workspec': workspec, - 'to_submit': to_submit,} + data = { + "workspec": workspec, + "to_submit": to_submit, + } if to_submit: sdf_template_file = None if self.useCRICGridCE: # choose a CE - tmpLog.info('choose a CE...') + tmpLog.info("choose a CE...") ce_chosen = submitter_common.choose_ce(ce_weighting) try: ce_info_dict = ce_auxilary_dict[ce_chosen].copy() except KeyError: - tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint') + tmpLog.info("Problem choosing CE with weighting. Choose an arbitrary CE endpoint") ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy() - ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() - tmpLog.debug('Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format( - pilot_version, ce_info_dict['ce_endpoint'], ce_flavour_str)) + ce_flavour_str = str(ce_info_dict.get("ce_flavour", "")).lower() + tmpLog.debug( + 'Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format(pilot_version, ce_info_dict["ce_endpoint"], ce_flavour_str) + ) if self.templateFile: sdf_template_file = self.templateFile elif os.path.isdir(self.CEtemplateDir) and ce_flavour_str: - sdf_suffix_str = '' - if ce_info_dict['ce_grid_type']: - sdf_suffix_str = '_{ce_grid_type}'.format(ce_grid_type=ce_info_dict['ce_grid_type']) - sdf_template_filename = '{ce_flavour_str}{sdf_suffix_str}.sdf'.format( - ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str) + sdf_suffix_str = "" + if ce_info_dict["ce_grid_type"]: + sdf_suffix_str = "_{ce_grid_type}".format(ce_grid_type=ce_info_dict["ce_grid_type"]) + sdf_template_filename = "{ce_flavour_str}{sdf_suffix_str}.sdf".format(ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str) sdf_template_file = os.path.join(self.CEtemplateDir, sdf_template_filename) else: if self.templateFile: @@ -695,19 +724,19 @@ def _choose_credential(workspec): # Manually define site condor schedd as ceHostname and central manager as ceEndpoint if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0: if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0: - ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint))) + ce_info_dict["ce_hostname"], ce_info_dict["ce_endpoint"] = random.choice(list(zip(self.ceHostname, self.ceEndpoint))) else: - ce_info_dict['ce_hostname'] = random.choice(self.ceHostname) - ce_info_dict['ce_endpoint'] = self.ceEndpoint + ce_info_dict["ce_hostname"] = random.choice(self.ceHostname) + ce_info_dict["ce_endpoint"] = self.ceEndpoint else: - ce_info_dict['ce_hostname'] = self.ceHostname - ce_info_dict['ce_endpoint'] = self.ceEndpoint + ce_info_dict["ce_hostname"] = self.ceHostname + ce_info_dict["ce_endpoint"] = self.ceEndpoint except AttributeError: pass try: # Manually define ceQueueName if self.ceQueueName: - ce_info_dict['ce_queue_name'] = self.ceQueueName + ce_info_dict["ce_queue_name"] = self.ceQueueName except AttributeError: pass # template for batch script @@ -716,19 +745,19 @@ def _choose_credential(workspec): sdf_template_raw = tmpFile.read() tmpFile.close() except 
AttributeError: - tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found') + tmpLog.error("No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found") to_submit = False return data else: # get batch_log, stdout, stderr filename, and remobe commented liness sdf_template_str_list = [] - for _line in sdf_template_raw.split('\n'): - if _line.startswith('#'): + for _line in sdf_template_raw.split("\n"): + if _line.startswith("#"): continue sdf_template_str_list.append(_line) - _match_batch_log = re.match('log = (.+)', _line) - _match_stdout = re.match('output = (.+)', _line) - _match_stderr = re.match('error = (.+)', _line) + _match_batch_log = re.match("log = (.+)", _line) + _match_stdout = re.match("output = (.+)", _line) + _match_stderr = re.match("error = (.+)", _line) if _match_batch_log: batch_log_value = _match_batch_log.group(1) continue @@ -738,21 +767,23 @@ def _choose_credential(workspec): if _match_stderr: stderr_value = _match_stderr.group(1) continue - sdf_template = '\n'.join(sdf_template_str_list) + sdf_template = "\n".join(sdf_template_str_list) # Choose from Condor schedd and central managers condor_schedd, condor_pool = random.choice(schedd_pool_choice_list) # set submissionHost if not condor_schedd and not condor_pool: - workspec.submissionHost = 'LOCAL' + workspec.submissionHost = "LOCAL" else: - workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool) - tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost)) + workspec.submissionHost = "{0},{1}".format(condor_schedd, condor_pool) + tmpLog.debug("set submissionHost={0}".format(workspec.submissionHost)) # Log Base URL - if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL: - schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?', - lambda matchobj: matchobj.group(1) if matchobj.group(1) else '', - condor_schedd) - log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL) + if self.logBaseURL and "[ScheddHostname]" in self.logBaseURL: + schedd_hostname = re.sub( + r"(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?", + lambda matchobj: matchobj.group(1) if matchobj.group(1) else "", + condor_schedd, + ) + log_base_url = re.sub(r"\[ScheddHostname\]", schedd_hostname, self.logBaseURL) else: log_base_url = self.logBaseURL # URLs for log files @@ -761,82 +792,83 @@ def _choose_credential(workspec): batchID = workspec.batchID guess = False else: - batchID = '' + batchID = "" guess = True batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) - batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename) - batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name) - batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename) - workspec.set_log_file('batch_log', batch_log) - workspec.set_log_file('stdout', batch_stdout) - workspec.set_log_file('stderr', batch_stderr) - batch_log_dict['batch_log'] = batch_log - batch_log_dict['batch_stdout'] = batch_stdout - batch_log_dict['batch_stderr'] = batch_stderr - batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] - 
tmpLog.debug('Done set_log_file before submission') - tmpLog.debug('Done jobspec attribute setting') + batch_log = "{0}/{1}/{2}".format(log_base_url, log_subdir, batch_log_filename) + batch_stdout = "{0}/{1}/{2}".format(log_base_url, log_subdir, stdout_path_file_name) + batch_stderr = "{0}/{1}/{2}".format(log_base_url, log_subdir, stderr_path_filename) + workspec.set_log_file("batch_log", batch_log) + workspec.set_log_file("stdout", batch_stdout) + workspec.set_log_file("stderr", batch_stderr) + batch_log_dict["batch_log"] = batch_log + batch_log_dict["batch_stdout"] = batch_stdout + batch_log_dict["batch_stderr"] = batch_stderr + batch_log_dict["gtag"] = workspec.workAttributes["stdOut"] + tmpLog.debug("Done set_log_file before submission") + tmpLog.debug("Done jobspec attribute setting") # choose the x509 certificate based on the type of job (analysis or production) proxy, token_dir = _choose_credential(workspec) # set data dict - data.update({ - 'workspec': workspec, - 'to_submit': to_submit, - 'template': sdf_template, - 'executable_file': self.executableFile, - 'log_dir': self.logDir, - 'log_subdir': log_subdir, - 'n_core_per_node': n_core_per_node, - 'panda_queue_name': panda_queue_name, - 'x509_user_proxy': proxy, - 'ce_info_dict': ce_info_dict, - 'batch_log_dict': batch_log_dict, - 'special_par': special_par, - 'harvester_queue_config': harvester_queue_config, - 'is_unified_queue': is_unified_queue, - 'condor_schedd': condor_schedd, - 'condor_pool': condor_pool, - 'use_spool': self.useSpool, - 'pilot_url': pilot_url, - 'pilot_args': pilot_args, - 'pilot_version': pilot_version, - 'python_version': python_version, - 'token_dir': token_dir, - 'is_unified_dispatch': is_unified_dispatch, - 'prod_rc_permille': self.rcPilotRandomWeightPermille, - 'is_gpu_resource': is_gpu_resource, - }) + data.update( + { + "workspec": workspec, + "to_submit": to_submit, + "template": sdf_template, + "executable_file": self.executableFile, + "log_dir": self.logDir, + "log_subdir": log_subdir, + "n_core_per_node": n_core_per_node, + "panda_queue_name": panda_queue_name, + "x509_user_proxy": proxy, + "ce_info_dict": ce_info_dict, + "batch_log_dict": batch_log_dict, + "special_par": special_par, + "harvester_queue_config": harvester_queue_config, + "is_unified_queue": is_unified_queue, + "condor_schedd": condor_schedd, + "condor_pool": condor_pool, + "use_spool": self.useSpool, + "pilot_url": pilot_url, + "pilot_args": pilot_args, + "pilot_version": pilot_version, + "python_version": python_version, + "token_dir": token_dir, + "is_unified_dispatch": is_unified_dispatch, + "prod_rc_permille": self.rcPilotRandomWeightPermille, + "is_gpu_resource": is_gpu_resource, + } + ) return data def _propagate_attributes(workspec, tmpVal): # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='_propagate_attributes') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="_propagate_attributes") (retVal, tmpDict) = tmpVal workspec.set_attributes_with_dict(tmpDict) - tmpLog.debug('Done workspec attributes propagation') + tmpLog.debug("Done workspec attributes propagation") return retVal - tmpLog.debug('finished preparing worker attributes') + tmpLog.debug("finished preparing worker attributes") # map(_handle_one_worker, workspec_list) with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool: dataIterator = thread_pool.map(_handle_one_worker, workspec_list) - tmpLog.debug('{0} workers handled'.format(nWorkers)) + 
tmpLog.debug("{0} workers handled".format(nWorkers)) # submit retValList = submit_bag_of_workers(list(dataIterator)) - tmpLog.debug('{0} workers submitted'.format(nWorkers)) + tmpLog.debug("{0} workers submitted".format(nWorkers)) # propagate changed attributes with ThreadPoolExecutor(self.nProcesses) as thread_pool: retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple), zip(workspec_list, retValList)) retList = list(retIterator) - tmpLog.debug('done') + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index db81b0db..5f479877 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -1,5 +1,6 @@ import os import traceback + try: from urllib import unquote # Python 2.X except ImportError: @@ -16,9 +17,11 @@ from pandaharvester.harvestersubmitter import submitter_common # logger -base_logger = core_utils.setup_logger('k8s_submitter') +base_logger = core_utils.setup_logger("k8s_submitter") # submitter for K8S + + class K8sSubmitter(PluginBase): # constructor def __init__(self, **kwarg): @@ -36,9 +39,7 @@ def __init__(self, **kwarg): self.k8s_client.create_or_patch_configmap_starter() # allowed associated parameters from CRIC - self._allowed_agis_attrs = ( - 'pilot_url', - ) + self._allowed_agis_attrs = ("pilot_url",) # number of processes try: @@ -53,15 +54,15 @@ def __init__(self, **kwarg): try: self.proxySecretPath except AttributeError: - if os.getenv('PROXY_SECRET_PATH'): - self.proxySecretPath = os.getenv('PROXY_SECRET_PATH') + if os.getenv("PROXY_SECRET_PATH"): + self.proxySecretPath = os.getenv("PROXY_SECRET_PATH") # analysis x509 proxy through k8s secrets: on GU queues try: self.proxySecretPathAnalysis except AttributeError: - if os.getenv('PROXY_SECRET_PATH_ANAL'): - self.proxySecretPath = os.getenv('PROXY_SECRET_PATH_ANAL') + if os.getenv("PROXY_SECRET_PATH_ANAL"): + self.proxySecretPath = os.getenv("PROXY_SECRET_PATH_ANAL") def _choose_proxy(self, workspec, is_grandly_unified_queue): """ @@ -70,7 +71,7 @@ def _choose_proxy(self, workspec, is_grandly_unified_queue): cert = None job_type = workspec.jobType - if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis'): + if is_grandly_unified_queue and job_type in ("user", "panda", "analysis"): if self.proxySecretPathAnalysis: cert = self.proxySecretPathAnalysis elif self.proxySecretPath: @@ -82,15 +83,15 @@ def _choose_proxy(self, workspec, is_grandly_unified_queue): return cert def submit_k8s_worker(self, work_spec): - tmp_log = self.make_logger(base_logger, 'queueName={0}'.format(self.queueName), method_name='submit_k8s_worker') + tmp_log = self.make_logger(base_logger, "queueName={0}".format(self.queueName), method_name="submit_k8s_worker") # get info from harvester queue config _queueConfigMapper = QueueConfigMapper() harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) # set the stdout log file - log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID) - work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name)) + log_file_name = "{0}_{1}.out".format(harvester_config.master.harvester_id, work_spec.workerID) + work_spec.set_log_file("stdout", "{0}/{1}".format(self.logBaseURL, log_file_name)) # TODO: consider if we want to upload the yaml file to PanDA cache yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file) @@ -101,15 +102,15 @@ 
def submit_k8s_worker(self, work_spec): is_grandly_unified_queue = self.panda_queues_dict.is_grandly_unified_queue(self.queueName) cert = self._choose_proxy(work_spec, is_grandly_unified_queue) if not cert: - err_str = 'No proxy specified in proxySecretPath. Not submitted' + err_str = "No proxy specified in proxySecretPath. Not submitted" tmp_return_value = (False, err_str) return tmp_return_value # get the walltime limit try: - max_time = this_panda_queue_dict['maxtime'] + max_time = this_panda_queue_dict["maxtime"] except Exception as e: - tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName)) + tmp_log.warning("Could not retrieve maxtime field for queue {0}".format(self.queueName)) max_time = None associated_params_dict = {} @@ -117,59 +118,57 @@ def submit_k8s_worker(self, work_spec): if key in self._allowed_agis_attrs: associated_params_dict[key] = val - pilot_url = associated_params_dict.get('pilot_url') - pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current')) - python_version = str(this_panda_queue_dict.get('python_version', '2')) + pilot_url = associated_params_dict.get("pilot_url") + pilot_version = str(this_panda_queue_dict.get("pilot_version", "current")) + python_version = str(this_panda_queue_dict.get("python_version", "2")) prod_source_label_tmp = harvester_queue_config.get_source_label(work_spec.jobType) - pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType, pilot_url, - pilot_version, prod_source_label_tmp) + pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType, pilot_url, pilot_version, prod_source_label_tmp) if pilot_opt_dict is None: prod_source_label = prod_source_label_tmp pilot_type = work_spec.pilotType - pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else '' + pilot_url_str = "--piloturl {0}".format(pilot_url) if pilot_url else "" else: - prod_source_label = pilot_opt_dict['prod_source_label'] - pilot_type = pilot_opt_dict['pilot_type_opt'] - pilot_url_str = pilot_opt_dict['pilot_url_str'] + prod_source_label = pilot_opt_dict["prod_source_label"] + pilot_type = pilot_opt_dict["pilot_type_opt"] + pilot_url_str = pilot_opt_dict["pilot_url_str"] pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label) host_image = self.panda_queues_dict.get_k8s_host_image(self.queueName) # submit the worker - rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label, - pilot_type, pilot_url_str, - pilot_python_option, pilot_version, - host_image, cert, max_time=max_time) + rsp, yaml_content_final = self.k8s_client.create_job_from_yaml( + yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str, pilot_python_option, pilot_version, host_image, cert, max_time=max_time + ) except Exception as _e: tmp_log.error(traceback.format_exc()) - err_str = 'Failed to create a JOB; {0}'.format(_e) + err_str = "Failed to create a JOB; {0}".format(_e) tmp_return_value = (False, err_str) else: - work_spec.batchID = yaml_content['metadata']['name'] - tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID)) - tmp_return_value = (True, '') + work_spec.batchID = yaml_content["metadata"]["name"] + tmp_log.debug("Created worker {0} with batchID={1}".format(work_spec.workerID, work_spec.batchID)) + tmp_return_value = (True, "") return tmp_return_value # submit workers def submit_workers(self, workspec_list): - tmp_log = 
self.make_logger(base_logger, 'queueName={0}'.format(self.queueName), method_name='submit_workers') + tmp_log = self.make_logger(base_logger, "queueName={0}".format(self.queueName), method_name="submit_workers") n_workers = len(workspec_list) - tmp_log.debug('start, n_workers={0}'.format(n_workers)) + tmp_log.debug("start, n_workers={0}".format(n_workers)) ret_list = list() if not workspec_list: - tmp_log.debug('empty workspec_list') + tmp_log.debug("empty workspec_list") return ret_list with ThreadPoolExecutor(self.nProcesses) as thread_pool: ret_val_list = thread_pool.map(self.submit_k8s_worker, workspec_list) - tmp_log.debug('{0} workers submitted'.format(n_workers)) + tmp_log.debug("{0} workers submitted".format(n_workers)) ret_list = list(ret_val_list) - tmp_log.debug('done') + tmp_log.debug("done") return ret_list diff --git a/pandaharvester/harvestersubmitter/lancium_submitter.py b/pandaharvester/harvestersubmitter/lancium_submitter.py index ec9c95db..c017edfd 100644 --- a/pandaharvester/harvestersubmitter/lancium_submitter.py +++ b/pandaharvester/harvestersubmitter/lancium_submitter.py @@ -12,12 +12,14 @@ from pandaharvester.harvestermisc.info_utils import PandaQueuesDict from pandaharvester.harvestermisc.lancium_utils import LanciumClient, SCRIPTS_PATH, get_job_name_from_workspec -base_logger = core_utils.setup_logger('lancium_submitter') -voms_lancium_path = '/voms/voms' -script_lancium_path = '/scripts/pilots_starter.py' -mount_path = 'input_files' -full_mount_path_secrets = '/jobDir/input_files/voms' +base_logger = core_utils.setup_logger("lancium_submitter") + +voms_lancium_path = "/voms/voms" +script_lancium_path = "/scripts/pilots_starter.py" +mount_path = "input_files" +full_mount_path_secrets = "/jobDir/input_files/voms" + class LanciumSubmitter(PluginBase): # constructor @@ -31,9 +33,7 @@ def __init__(self, **kwarg): self.panda_queues_dict = PandaQueuesDict() # allowed associated parameters from CRIC - self._allowed_agis_attrs = ( - 'pilot_url', - ) + self._allowed_agis_attrs = ("pilot_url",) # number of processes try: @@ -49,20 +49,19 @@ def __init__(self, **kwarg): # update or create the pilot starter executable self.upload_pilots_starter() - def upload_pilots_starter(self): - tmp_log = self.make_logger(base_logger, method_name='upload_pilots_starter') - tmp_log.debug('Start') + tmp_log = self.make_logger(base_logger, method_name="upload_pilots_starter") + tmp_log.debug("Start") try: - base_name = 'pilots_starter.py' + base_name = "pilots_starter.py" dir_name = os.path.dirname(__file__) - local_file = os.path.join(dir_name, '../harvestercloud/{0}'.format(base_name)) + local_file = os.path.join(dir_name, "../harvestercloud/{0}".format(base_name)) lancium_file = os.path.join(SCRIPTS_PATH, base_name) self.lancium_client.upload_file(local_file, lancium_file) - tmp_log.debug('Done') + tmp_log.debug("Done") except Exception: - tmp_log.error('Problem uploading proxy {0}. {1}'.format(local_file, traceback.format_exc())) + tmp_log.error("Problem uploading proxy {0}. 
{1}".format(local_file, traceback.format_exc())) def _choose_proxy(self, workspec): """ @@ -72,7 +71,7 @@ def _choose_proxy(self, workspec): job_type = workspec.jobType is_grandly_unified_queue = self.panda_queues_dict.is_grandly_unified_queue(self.queueName) - if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis'): + if is_grandly_unified_queue and job_type in ("user", "panda", "analysis"): if self.user_proxy: cert = self.user_proxy elif self.prod_proxy: @@ -83,61 +82,65 @@ def _choose_proxy(self, workspec): return cert - def _fill_params(self, workspec, container_image, cert, physical_cores, memory_gb, - maxwdir_prorated_gib, max_time, pilot_type, pilot_url_str, pilot_version, prod_source_label, pilot_python_option, - log_file_name): - + def _fill_params( + self, + workspec, + container_image, + cert, + physical_cores, + memory_gb, + maxwdir_prorated_gib, + max_time, + pilot_type, + pilot_url_str, + pilot_version, + prod_source_label, + pilot_python_option, + log_file_name, + ): lancium_job_name = get_job_name_from_workspec(workspec) # submit the worker - params = {'name': lancium_job_name, - 'command_line': 'python input_files/scripts/pilots_starter.py', - 'image': container_image, # 'harvester/centos7-singularity' - 'max_run_time': max_time, - 'resources': {'core_count': physical_cores, - 'memory': memory_gb, - 'scratch': int(maxwdir_prorated_gib) - }, - 'input_files': [ - {"source_type": "data", - "data": voms_lancium_path, - "name": mount_path - }, - {"source_type": "data", - "data": script_lancium_path, - "name": mount_path - } - ], - 'environment': ( - {'variable': 'pilotUrlOpt', 'value': pilot_url_str}, # pilotUrlOpt, stdout_name - {'variable': 'stdout_name', 'value': log_file_name}, - {'variable': 'PILOT_NOKILL', 'value': 'True'}, - {'variable': 'computingSite', 'value': self.queueName}, - {'variable': 'pandaQueueName', 'value': self.queueName}, - {'variable': 'resourceType', 'value': workspec.resourceType}, - {'variable': 'prodSourceLabel', 'value': prod_source_label}, - {'variable': 'pilotType', 'value': pilot_type}, - # {'variable': 'pythonOption', 'value': pilot_python_option}, - {'variable': 'pilotVersion', 'value': pilot_version}, - {'variable': 'jobType', 'value': prod_source_label}, - {'variable': 'proxySecretPath', 'value': os.path.join(full_mount_path_secrets, cert)}, - {'variable': 'workerID', 'value': str(workspec.workerID)}, - {'variable': 'pilotProxyCheck', 'value': 'False'}, - {'variable': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W}, - {'variable': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R}, - {'variable': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id}, - {'variable': 'HARVESTER_WORKER_ID', 'value': str(workspec.workerID)}, - {'variable': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}, - {'variable': 'submit_mode', 'value': 'PULL'}, - {'variable': 'TMPDIR', 'value': '/jobDir'}, - {'variable': 'HOME', 'value': '/jobDir'}, - # {'variable': 'K8S_JOB_ID', 'value': lancium_job_name}, - ) - } + params = { + "name": lancium_job_name, + "command_line": "python input_files/scripts/pilots_starter.py", + "image": container_image, # 'harvester/centos7-singularity' + "max_run_time": max_time, + "resources": {"core_count": physical_cores, "memory": memory_gb, "scratch": int(maxwdir_prorated_gib)}, + "input_files": [ + {"source_type": "data", "data": voms_lancium_path, "name": mount_path}, + {"source_type": "data", "data": script_lancium_path, "name": 
mount_path}, + ], + "environment": ( + {"variable": "pilotUrlOpt", "value": pilot_url_str}, # pilotUrlOpt, stdout_name + {"variable": "stdout_name", "value": log_file_name}, + {"variable": "PILOT_NOKILL", "value": "True"}, + {"variable": "computingSite", "value": self.queueName}, + {"variable": "pandaQueueName", "value": self.queueName}, + {"variable": "resourceType", "value": workspec.resourceType}, + {"variable": "prodSourceLabel", "value": prod_source_label}, + {"variable": "pilotType", "value": pilot_type}, + # {'variable': 'pythonOption', 'value': pilot_python_option}, + {"variable": "pilotVersion", "value": pilot_version}, + {"variable": "jobType", "value": prod_source_label}, + {"variable": "proxySecretPath", "value": os.path.join(full_mount_path_secrets, cert)}, + {"variable": "workerID", "value": str(workspec.workerID)}, + {"variable": "pilotProxyCheck", "value": "False"}, + {"variable": "logs_frontend_w", "value": harvester_config.pandacon.pandaCacheURL_W}, + {"variable": "logs_frontend_r", "value": harvester_config.pandacon.pandaCacheURL_R}, + {"variable": "PANDA_JSID", "value": "harvester-" + harvester_config.master.harvester_id}, + {"variable": "HARVESTER_WORKER_ID", "value": str(workspec.workerID)}, + {"variable": "HARVESTER_ID", "value": harvester_config.master.harvester_id}, + {"variable": "submit_mode", "value": "PULL"}, + {"variable": "TMPDIR", "value": "/jobDir"}, + {"variable": "HOME", "value": "/jobDir"}, + # {'variable': 'K8S_JOB_ID', 'value': lancium_job_name}, + ), + } return params def submit_lancium_worker(self, workspec): - tmp_log = self.make_logger(base_logger, 'queueName={0}'.format(self.queueName), method_name='submit_lancium_worker') + tmp_log = self.make_logger(base_logger, "queueName={0}".format(self.queueName), method_name="submit_lancium_worker") this_panda_queue_dict = self.panda_queues_dict.get(self.queueName, dict()) @@ -147,50 +150,60 @@ def submit_lancium_worker(self, workspec): harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) # set the stdout log file - log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, workspec.workerID) - workspec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name)) + log_file_name = "{0}_{1}.out".format(harvester_config.master.harvester_id, workspec.workerID) + workspec.set_log_file("stdout", "{0}/{1}".format(self.logBaseURL, log_file_name)) # choose the appropriate proxy cert = self._choose_proxy(workspec) if not cert: - err_str = 'No proxy specified in proxySecretPath. Not submitted' + err_str = "No proxy specified in proxySecretPath. 
Not submitted" tmp_return_value = (False, err_str) return tmp_return_value # set the container image - container_image = 'harvester/centos7-singularity' # harvester_queue_config.container_image + container_image = "harvester/centos7-singularity" # harvester_queue_config.container_image physical_cores = workspec.nCore / 2 # lancium uses hyperthreading, but expects job size in physical cores memory_gb = workspec.minRamCount / 2 / 1000 - maxwdir_prorated_gib = self.panda_queues_dict.get_prorated_maxwdir_GiB(workspec.computingSite, - workspec.nCore) - max_time = this_panda_queue_dict.get('maxtime', None) + maxwdir_prorated_gib = self.panda_queues_dict.get_prorated_maxwdir_GiB(workspec.computingSite, workspec.nCore) + max_time = this_panda_queue_dict.get("maxtime", None) associated_params_dict = {} for key, val in self.panda_queues_dict.get_harvester_params(self.queueName).items(): if key in self._allowed_agis_attrs: associated_params_dict[key] = val - pilot_url = associated_params_dict.get('pilot_url') - pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current')) - python_version = str(this_panda_queue_dict.get('python_version', '3')) + pilot_url = associated_params_dict.get("pilot_url") + pilot_version = str(this_panda_queue_dict.get("pilot_version", "current")) + python_version = str(this_panda_queue_dict.get("python_version", "3")) prod_source_label_tmp = harvester_queue_config.get_source_label(workspec.jobType) - pilot_opt_dict = submitter_common.get_complicated_pilot_options(workspec.pilotType, pilot_url, - pilot_version, prod_source_label_tmp) + pilot_opt_dict = submitter_common.get_complicated_pilot_options(workspec.pilotType, pilot_url, pilot_version, prod_source_label_tmp) if pilot_opt_dict is None: prod_source_label = prod_source_label_tmp pilot_type = workspec.pilotType - pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else '' + pilot_url_str = "--piloturl {0}".format(pilot_url) if pilot_url else "" else: - prod_source_label = pilot_opt_dict['prod_source_label'] - pilot_type = pilot_opt_dict['pilot_type_opt'] - pilot_url_str = pilot_opt_dict['pilot_url_str'] + prod_source_label = pilot_opt_dict["prod_source_label"] + pilot_type = pilot_opt_dict["pilot_type_opt"] + pilot_url_str = pilot_opt_dict["pilot_url_str"] pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label) - params = self._fill_params(workspec, container_image, cert, physical_cores, memory_gb, maxwdir_prorated_gib, - max_time, pilot_type, pilot_url_str, pilot_version, prod_source_label, - pilot_python_option, log_file_name) + params = self._fill_params( + workspec, + container_image, + cert, + physical_cores, + memory_gb, + maxwdir_prorated_gib, + max_time, + pilot_type, + pilot_url_str, + pilot_version, + prod_source_label, + pilot_python_option, + log_file_name, + ) return_code, return_str = self.lancium_client.submit_job(**params) if not return_code: @@ -198,33 +211,33 @@ def submit_lancium_worker(self, workspec): except Exception as _e: tmp_log.error(traceback.format_exc()) - err_str = 'Failed to create a worker; {0}'.format(_e) + err_str = "Failed to create a worker; {0}".format(_e) tmp_return_value = (False, err_str) else: workspec.batchID = return_str - tmp_log.debug('Created worker {0} with batchID={1}'.format(workspec.workerID, workspec.batchID)) - tmp_return_value = (True, '') + tmp_log.debug("Created worker {0} with batchID={1}".format(workspec.workerID, workspec.batchID)) + tmp_return_value = (True, "") return tmp_return_value # submit 
workers def submit_workers(self, workspec_list): - tmp_log = self.make_logger(base_logger, 'queueName={0}'.format(self.queueName), method_name='submit_workers') + tmp_log = self.make_logger(base_logger, "queueName={0}".format(self.queueName), method_name="submit_workers") n_workers = len(workspec_list) - tmp_log.debug('start, n_workers={0}'.format(n_workers)) + tmp_log.debug("start, n_workers={0}".format(n_workers)) ret_list = list() if not workspec_list: - tmp_log.debug('empty workspec_list') + tmp_log.debug("empty workspec_list") return ret_list with ThreadPoolExecutor(self.nProcesses) as thread_pool: ret_val_list = thread_pool.map(self.submit_lancium_worker, workspec_list) - tmp_log.debug('{0} workers submitted'.format(n_workers)) + tmp_log.debug("{0} workers submitted".format(n_workers)) ret_list = list(ret_val_list) - tmp_log.debug('done') + tmp_log.debug("done") return ret_list diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py index 479d150c..2d4e37f9 100644 --- a/pandaharvester/harvestersubmitter/lsf_submitter.py +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -1,16 +1,17 @@ import datetime import tempfile import re + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('lsf_submitter') +baseLogger = core_utils.setup_logger("lsf_submitter") # submitter for LSF batch system @@ -30,32 +31,27 @@ def submit_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") # make batch script batchFile = self.make_batch_script(workSpec) # command comStr = "bsub -L /bin/sh" # submit - tmpLog.debug('submit with {0} and LSF options file {1}'.format(comStr,batchFile)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=open(batchFile,'r')) + tmpLog.debug("submit with {0} and LSF options file {1}".format(comStr, batchFile)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=open(batchFile, "r")) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) - tmpLog.debug('stdOut={0}'.format(stdOut)) - tmpLog.debug('stdErr={0}'.format(stdErr)) + tmpLog.debug("retCode={0}".format(retCode)) + tmpLog.debug("stdOut={0}".format(stdOut)) + tmpLog.debug("stdErr={0}".format(stdErr)) if retCode == 0: # extract batchID - batchID = str(stdOut.split()[1],'utf-8') - result = re.sub('[^0-9]','', batchID) - tmpLog.debug('strip out non-numberic charactors from {0} - result {1}'.format(batchID,result)) + batchID = str(stdOut.split()[1], "utf-8") + result = re.sub("[^0-9]", "", batchID) + tmpLog.debug("strip out non-numberic charactors from {0} - result {1}".format(batchID, result)) workSpec.batchID = result - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # set log files if self.uploadLog: if self.logBaseURL is None: @@ -64,13 +60,13 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL stdOut, stdErr = self.get_log_file_names(batchFile, 
workSpec.batchID) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) tmpRetVal = (False, errStr) retList.append(tmpRetVal) @@ -78,36 +74,39 @@ def submit_workers(self, workspec_list): # make batch script def make_batch_script(self, workspec): - #if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: + # if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: # maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) # yodaWallClockLimit = workspec.maxWalltime / 60 - #else: + # else: # workspec.nCore = self.nCore # maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) # yodaWallClockLimit = self.maxWalltime / 60 # set number of nodes - Note Ultimately will need to something more sophisticated - if hasattr(self,'nGpuPerNode'): + if hasattr(self, "nGpuPerNode"): if int(self.nGpuPerNode) > 0: - numnodes = int(workspec.nJobs/self.nGpuPerNode) + numnodes = int(workspec.nJobs / self.nGpuPerNode) if numnodes <= 0: numnodes = 1 else: - if (workspec.nJobs % self.nGpuPerNode) != 0 : + if (workspec.nJobs % self.nGpuPerNode) != 0: numnodes += 1 else: - numnodes=workspec.nCore / self.nCorePerNode + numnodes = workspec.nCore / self.nCorePerNode - tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) - tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, - #localQueue=self.localQueue, - #projectName=self.projectName, - nNode=numnodes, - accessPoint=workspec.accessPoint, - #walltime=maxWalltime, - #yodaWallClockLimit=yodaWallClockLimit, - workerID=workspec.workerID) - ) + tmpFile = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) + tmpFile.write( + self.template.format( + nCorePerNode=self.nCorePerNode, + # localQueue=self.localQueue, + # projectName=self.projectName, + nNode=numnodes, + accessPoint=workspec.accessPoint, + # walltime=maxWalltime, + # yodaWallClockLimit=yodaWallClockLimit, + workerID=workspec.workerID, + ) + ) tmpFile.close() return tmpFile.name @@ -117,13 +116,13 @@ def get_log_file_names(self, batch_script, batch_id): stdErr = None with open(batch_script) as f: for line in f: - if not line.startswith('#BSUB'): + if not line.startswith("#BSUB"): continue items = line.split() - if '-o' in items: - #stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) - stdOut = items[-1].replace('%J', batch_id) - elif '-e' in items: - #stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) - stdErr = items[-1].replace('%J', batch_id) + if "-o" in items: + # stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdOut = items[-1].replace("%J", batch_id) + elif "-e" in items: + # stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdErr = items[-1].replace("%J", batch_id) return stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/pbs_submitter.py b/pandaharvester/harvestersubmitter/pbs_submitter.py index 2715bf5a..f8d56353 100644 --- a/pandaharvester/harvestersubmitter/pbs_submitter.py +++ b/pandaharvester/harvestersubmitter/pbs_submitter.py @@ -1,15 +1,16 @@ import datetime import tempfile + try: 
import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('pbs_submitter') +baseLogger = core_utils.setup_logger("pbs_submitter") # submitter for PBS batch system @@ -29,26 +30,22 @@ def submit_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") # make batch script batchFile = self.make_batch_script(workSpec) # command comStr = "qsub {0}".format(batchFile) # submit - tmpLog.debug('submit with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("submit with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) if retCode == 0: # extract batchID workSpec.batchID = stdOut.split()[-1] - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # set log files if self.uploadLog: if self.logBaseURL is None: @@ -57,13 +54,13 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) tmpRetVal = (False, errStr) retList.append(tmpRetVal) @@ -71,23 +68,26 @@ def submit_workers(self, workspec_list): # make batch script def make_batch_script(self, workspec): - if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: + if hasattr(self, "dynamicSizing") and self.dynamicSizing is True: maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) yodaWallClockLimit = workspec.maxWalltime / 60 else: workspec.nCore = self.nCore maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) yodaWallClockLimit = self.maxWalltime / 60 - tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) - tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, - localQueue=self.localQueue, - projectName=self.projectName, - nNode=workspec.nCore / self.nCorePerNode, - accessPoint=workspec.accessPoint, - walltime=maxWalltime, - yodaWallClockLimit=yodaWallClockLimit, - workerID=workspec.workerID) - ) + tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) + tmpFile.write( + self.template.format( + nCorePerNode=self.nCorePerNode, + localQueue=self.localQueue, + projectName=self.projectName, + nNode=workspec.nCore / self.nCorePerNode, + accessPoint=workspec.accessPoint, + walltime=maxWalltime, + yodaWallClockLimit=yodaWallClockLimit, + 
workerID=workspec.workerID, + ) + ) tmpFile.close() return tmpFile.name @@ -97,11 +97,11 @@ def get_log_file_names(self, batch_script, batch_id): stdErr = None with open(batch_script) as f: for line in f: - if not line.startswith('#PBS'): + if not line.startswith("#PBS"): continue items = line.split() - if '-o' in items: - stdOut = items[-1].replace('$SPBS_JOBID', batch_id) - elif '-e' in items: - stdErr = items[-1].replace('$PBS_JOBID', batch_id) + if "-o" in items: + stdOut = items[-1].replace("$SPBS_JOBID", batch_id) + elif "-e" in items: + stdErr = items[-1].replace("$PBS_JOBID", batch_id) return stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/saga_submitter.py b/pandaharvester/harvestersubmitter/saga_submitter.py index a02ccbe3..4923f14c 100644 --- a/pandaharvester/harvestersubmitter/saga_submitter.py +++ b/pandaharvester/harvestersubmitter/saga_submitter.py @@ -8,16 +8,17 @@ # setup base logger -baseLogger = core_utils.setup_logger('saga_submitter') +baseLogger = core_utils.setup_logger("saga_submitter") # SAGA submitter -class SAGASubmitter (PluginBase): + +class SAGASubmitter(PluginBase): # constructor # constructor define job service with particular adaptor (can be extended to support remote execution) def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - tmpLog = self.make_logger(baseLogger, method_name='__init__') + tmpLog = self.make_logger(baseLogger, method_name="__init__") tmpLog.info("[{0}] SAGA adaptor will be used".format(self.adaptor)) def workers_list(self): @@ -30,50 +31,45 @@ def workers_list(self): return workers def _get_executable(self, list_of_pandajobs): - ''' + """ Prepare command line to launch payload. TODO: In general will migrate to specific worker maker - :param list_of_pandajobs - list of job objects, which should be used: + :param list_of_pandajobs - list of job objects, which should be used: :return: string to execution which will be launched - ''' - executable_arr = ['module load python'] + """ + executable_arr = ["module load python"] for pj in list_of_pandajobs: - executable_arr.append('aprun -d 16 -n 1 ' + pj.jobParams['transformation'] - + ' ' + pj.jobParams['jobPars']) + executable_arr.append("aprun -d 16 -n 1 " + pj.jobParams["transformation"] + " " + pj.jobParams["jobPars"]) return executable_arr def _state_change_cb(self, src_obj, fire_on, value): + tmpLog = self.make_logger(baseLogger, method_name="_state_change_cb") - tmpLog = self.make_logger(baseLogger, method_name='_state_change_cb') - - #self._workSpec.status = self.status_translator(value) + # self._workSpec.status = self.status_translator(value) self._workSpec.set_status(self.status_translator(value)) - self._workSpec.force_update('status') + self._workSpec.force_update("status") try: tmpLog.debug("Created time: {}".format(src_obj.created)) - tmpLog.debug('src obj: {}'.format(src_obj)) - except: - tmpLog.debug('FAILED') - tmpLog.info('Worker with BatchID={0} workerID={2} change state to: {1}'.format(self._workSpec.batchID, - self._workSpec.status, - self._workSpec.workerID)) + tmpLog.debug("src obj: {}".format(src_obj)) + except BaseException: + tmpLog.debug("FAILED") + tmpLog.info("Worker with BatchID={0} workerID={2} change state to: {1}".format(self._workSpec.batchID, self._workSpec.status, self._workSpec.workerID)) # for compatibility with dummy monitor - f = open(os.path.join(self._workSpec.accessPoint, 'status.txt'), 'w') + f = open(os.path.join(self._workSpec.accessPoint, "status.txt"), "w") f.write(self._workSpec.status) f.close() return True def 
_execute(self, work_spec): - - tmpLog = self.make_logger(baseLogger, method_name='_execute') + tmpLog = self.make_logger(baseLogger, method_name="_execute") job_service = saga.job.Service(self.adaptor) - #sagadateformat_str = 'Tue Nov 7 11:31:10 2017' - #sagadateformat_str = '%a %b %d %H:%M:%S %Y' + # sagadateformat_str = 'Tue Nov 7 11:31:10 2017' + # sagadateformat_str = '%a %b %d %H:%M:%S %Y' try: os.chdir(work_spec.accessPoint) tmpLog.info("Walltime: {0} sec. {1} min.".format(work_spec.maxWalltime, work_spec.maxWalltime / 60)) @@ -98,11 +94,11 @@ def _execute(self, work_spec): jd.total_cpu_count = work_spec.nCore # one node with 16 cores for one job jd.queue = self.localqueue jd.working_directory = work_spec.accessPoint # working directory of task - uq_prefix = '{0:07}'.format(random.randint(0, 10000000)) - jd.output = os.path.join(work_spec.accessPoint, 'MPI_pilot_stdout_{0}'.format(uq_prefix)) - jd.error = os.path.join(work_spec.accessPoint, 'MPI_pilot_stderr_{0}'.format(uq_prefix)) - work_spec.set_log_file('stdout', jd.output) - work_spec.set_log_file('stderr', jd.error) + uq_prefix = "{0:07}".format(random.randint(0, 10000000)) + jd.output = os.path.join(work_spec.accessPoint, "MPI_pilot_stdout_{0}".format(uq_prefix)) + jd.error = os.path.join(work_spec.accessPoint, "MPI_pilot_stderr_{0}".format(uq_prefix)) + work_spec.set_log_file("stdout", jd.output) + work_spec.set_log_file("stderr", jd.error) # Create a new job from the job description. The initial state of # the job is 'New'. @@ -110,12 +106,12 @@ def _execute(self, work_spec): self._workSpec = work_spec task.run() - work_spec.batchID = task.id.split('-')[1][1:-1] #SAGA have own representation, but real batch id easy to extract + work_spec.batchID = task.id.split("-")[1][1:-1] # SAGA have own representation, but real batch id easy to extract tmpLog.info("Worker ID={0} with BatchID={1} submitted".format(work_spec.workerID, work_spec.batchID)) tmpLog.debug("SAGA status: {0}".format(task.state)) # for compatibility with dummy monitor - f = open(os.path.join(work_spec.accessPoint, 'status.txt'), 'w') + f = open(os.path.join(work_spec.accessPoint, "status.txt"), "w") f.write(self.status_translator(task.state)) f.close() @@ -145,17 +141,17 @@ def status_translator(saga_status): # submit workers def submit_workers(self, work_specs): - tmpLog = self.make_logger(baseLogger, method_name='submit_workers') - tmpLog.debug('start nWorkers={0}'.format(len(work_specs))) + tmpLog = self.make_logger(baseLogger, method_name="submit_workers") + tmpLog.debug("start nWorkers={0}".format(len(work_specs))) retList = [] for workSpec in work_specs: res = self._execute(workSpec) if res == 0: - retList.append((True, '')) + retList.append((True, "")) else: - retList.append((False, 'Failed to submit worker. Check logs')) + retList.append((False, "Failed to submit worker. 
Check logs")) - tmpLog.debug('done') + tmpLog.debug("done") return retList diff --git a/pandaharvester/harvestersubmitter/slurm_submitter.py b/pandaharvester/harvestersubmitter/slurm_submitter.py index 5d491c05..61da6046 100644 --- a/pandaharvester/harvestersubmitter/slurm_submitter.py +++ b/pandaharvester/harvestersubmitter/slurm_submitter.py @@ -17,7 +17,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('slurm_submitter') +baseLogger = core_utils.setup_logger("slurm_submitter") # submitter for SLURM batch system @@ -27,16 +27,15 @@ def __init__(self, **kwarg): self.uploadLog = False self.logBaseURL = None PluginBase.__init__(self, **kwarg) - if not hasattr(self, 'localQueueName'): - self.localQueueName = 'grid' + if not hasattr(self, "localQueueName"): + self.localQueueName = "grid" # submit workers def submit_workers(self, workspec_list): retList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") # set nCore workSpec.nCore = self.nCore # make batch script @@ -44,21 +43,18 @@ def submit_workers(self, workspec_list): # command comStr = "sbatch -D {0} {1}".format(workSpec.get_access_point(), batchFile) # submit - tmpLog.debug('submit with {0}'.format(batchFile)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("submit with {0}".format(batchFile)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: # extract batchID - workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*$', '{0}'.format(stdOut_str)).group(1) - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + workSpec.batchID = re.search("[^0-9]*([0-9]+)[^0-9]*$", "{0}".format(stdOut_str)).group(1) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # set log files if self.uploadLog: if self.logBaseURL is None: @@ -67,13 +63,13 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") else: # failed - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) tmpRetVal = (False, errStr) retList.append(tmpRetVal) @@ -86,7 +82,7 @@ def make_placeholder_map(self, workspec): this_panda_queue_dict = dict() # get default information from queue info - n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 + n_core_per_node_from_queue = this_panda_queue_dict.get("corecount", 1) if 
this_panda_queue_dict.get("corecount", 1) else 1 # get override requirements from queue configured try: @@ -100,7 +96,7 @@ def make_placeholder_map(self, workspec): request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0 n_node = ceil(n_core_total / n_core_per_node) - request_ram_bytes = request_ram * 2 ** 20 + request_ram_bytes = request_ram * 2**20 request_ram_per_core = ceil(request_ram * n_node / n_core_total) request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total) request_cputime = request_walltime * n_core_total @@ -108,30 +104,30 @@ def make_placeholder_map(self, workspec): request_cputime_minute = ceil(request_cputime / 60) placeholder_map = { - 'nCorePerNode': n_core_per_node, - 'nCoreTotal': n_core_total, - 'nNode': n_node, - 'requestRam': request_ram, - 'requestRamBytes': request_ram_bytes, - 'requestRamPerCore': request_ram_per_core, - 'requestRamBytesPerCore': request_ram_bytes_per_core, - 'requestDisk': request_disk, - 'requestWalltime': request_walltime, - 'requestWalltimeMinute': request_walltime_minute, - 'requestCputime': request_cputime, - 'requestCputimeMinute': request_cputime_minute, - 'accessPoint': workspec.accessPoint, - 'harvesterID': harvester_config.master.harvester_id, - 'workerID': workspec.workerID, - 'computingSite': workspec.computingSite, - 'pandaQueueName': panda_queue_name, - 'localQueueName': self.localQueueName, + "nCorePerNode": n_core_per_node, + "nCoreTotal": n_core_total, + "nNode": n_node, + "requestRam": request_ram, + "requestRamBytes": request_ram_bytes, + "requestRamPerCore": request_ram_per_core, + "requestRamBytesPerCore": request_ram_bytes_per_core, + "requestDisk": request_disk, + "requestWalltime": request_walltime, + "requestWalltimeMinute": request_walltime_minute, + "requestCputime": request_cputime, + "requestCputimeMinute": request_cputime_minute, + "accessPoint": workspec.accessPoint, + "harvesterID": harvester_config.master.harvester_id, + "workerID": workspec.workerID, + "computingSite": workspec.computingSite, + "pandaQueueName": panda_queue_name, + "localQueueName": self.localQueueName, # 'x509UserProxy': x509_user_proxy, - 'logDir': self.logDir, - 'logSubDir': os.path.join(self.logDir, timeNow.strftime('%y-%m-%d_%H')), - 'jobType': workspec.jobType + "logDir": self.logDir, + "logSubDir": os.path.join(self.logDir, timeNow.strftime("%y-%m-%d_%H")), + "jobType": workspec.jobType, } - for k in ['tokenDir', 'tokenName', 'tokenOrigin', 'submitMode']: + for k in ["tokenDir", "tokenName", "tokenOrigin", "submitMode"]: try: placeholder_map[k] = getattr(self, k) except Exception: @@ -143,7 +139,7 @@ def make_batch_script(self, workspec): # template for batch script with open(self.templateFile) as f: template = f.read() - tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) + tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) placeholder = self.make_placeholder_map(workspec) tmpFile.write(six.b(template.format_map(core_utils.SafeDict(placeholder)))) tmpFile.close() @@ -160,11 +156,11 @@ def get_log_file_names(self, batch_script, batch_id): stdErr = None with open(batch_script) as f: for line in f: - if not line.startswith('#SBATCH'): + if not line.startswith("#SBATCH"): continue items = line.split() - if '-o' in items: - stdOut = items[-1].replace('$SLURM_JOB_ID', batch_id) - elif '-e' in items: - stdErr = items[-1].replace('$SLURM_JOB_ID', batch_id) + if "-o" in items: + stdOut = 
items[-1].replace("$SLURM_JOB_ID", batch_id) + elif "-e" in items: + stdErr = items[-1].replace("$SLURM_JOB_ID", batch_id) return stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/slurm_submitter_jinja.py b/pandaharvester/harvestersubmitter/slurm_submitter_jinja.py index 531d7da4..b3a19a64 100644 --- a/pandaharvester/harvestersubmitter/slurm_submitter_jinja.py +++ b/pandaharvester/harvestersubmitter/slurm_submitter_jinja.py @@ -13,7 +13,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('slurm_submitter') +baseLogger = core_utils.setup_logger("slurm_submitter") # submitter for SLURM batch system @@ -30,8 +30,7 @@ def submit_workers(self, workspec_list): retStrList = [] for workSpec in workspec_list: # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), - method_name='submit_workers') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workSpec.workerID), method_name="submit_workers") # set nCore workSpec.nCore = self.nCore # make batch script @@ -39,21 +38,18 @@ def submit_workers(self, workspec_list): # command comStr = "sbatch -D {0} {1}".format(workSpec.get_access_point(), batchFile) # submit - tmpLog.debug('submit with {0}'.format(batchFile)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("submit with {0}".format(batchFile)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: # extract batchID - workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*', '{0}'.format(stdOut_str)).group(1) - tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + workSpec.batchID = re.search("[^0-9]*([0-9]+)[^0-9]*", "{0}".format(stdOut_str)).group(1) + tmpLog.debug("batchID={0}".format(workSpec.batchID)) # set log files if self.uploadLog: if self.logBaseURL is None: @@ -62,13 +58,13 @@ def submit_workers(self, workspec_list): baseDir = self.logBaseURL stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) if stdOut is not None: - workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + workSpec.set_log_file("stdout", "{0}/{1}".format(baseDir, stdOut)) if stdErr is not None: - workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) - tmpRetVal = (True, '') + workSpec.set_log_file("stderr", "{0}/{1}".format(baseDir, stdErr)) + tmpRetVal = (True, "") else: # failed - errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + errStr = "{0} {1}".format(stdOut_str, stdErr_str) tmpLog.error(errStr) tmpRetVal = (False, errStr) retList.append(tmpRetVal) @@ -81,37 +77,45 @@ def make_batch_script(self, workspec): self.template = tmpFile.read() tmpFile.close() del tmpFile - tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) - tmpFile.write(six.b(self.template.format(nCorePerNode=self.nCorePerNode, - nNode=workspec.nCore // self.nCorePerNode, - accessPoint=workspec.accessPoint, - workerID=workspec.workerID)) - ) + tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) + tmpFile.write( + 
six.b( + self.template.format( + nCorePerNode=self.nCorePerNode, nNode=workspec.nCore // self.nCorePerNode, accessPoint=workspec.accessPoint, workerID=workspec.workerID + ) + ) + ) tmpFile.close() return tmpFile.name - # make batch script + def make_batch_script_jinja(self, workspec): # template for batch script tmpFile = open(self.templateFile) self.template = tmpFile.read() tmpFile.close() del tmpFile - tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) + tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix="_submit.sh", dir=workspec.get_access_point()) tm = jinja2.Template(self.template) - tmpFile.write(six.b(tm.render(nCorePerNode=self.nCorePerNode, - nNode=workspec.nCore // self.nCorePerNode, - accessPoint=workspec.accessPoint, - workerID=workspec.workerID, - workspec=workspec))) + tmpFile.write( + six.b( + tm.render( + nCorePerNode=self.nCorePerNode, + nNode=workspec.nCore // self.nCorePerNode, + accessPoint=workspec.accessPoint, + workerID=workspec.workerID, + workspec=workspec, + ) + ) + ) - #tmpFile.write(six.b(self.template.format(nCorePerNode=self.nCorePerNode, + # tmpFile.write(six.b(self.template.format(nCorePerNode=self.nCorePerNode, # nNode=workspec.nCore // self.nCorePerNode, # accessPoint=workspec.accessPoint, # workerID=workspec.workerID)) # ) - #tmpFile.write(six.b(self.template.format(nCorePerNode=self.nCorePerNode, + # tmpFile.write(six.b(self.template.format(nCorePerNode=self.nCorePerNode, # nNode=workspec.nCore // self.nCorePerNode, # worker=workSpec, # submitter=self)) @@ -119,18 +123,18 @@ def make_batch_script_jinja(self, workspec): tmpFile.close() return tmpFile.name - # get log file names + def get_log_file_names(self, batch_script, batch_id): stdOut = None stdErr = None with open(batch_script) as f: for line in f: - if not line.startswith('#SBATCH'): + if not line.startswith("#SBATCH"): continue items = line.split() - if '-o' in items: - stdOut = items[-1].replace('$SLURM_JOB_ID', batch_id) - elif '-e' in items: - stdErr = items[-1].replace('$SLURM_JOB_ID', batch_id) + if "-o" in items: + stdOut = items[-1].replace("$SLURM_JOB_ID", batch_id) + elif "-e" in items: + stdErr = items[-1].replace("$SLURM_JOB_ID", batch_id) return stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/submitter_common.py b/pandaharvester/harvestersubmitter/submitter_common.py index 98a78336..4b3257f6 100644 --- a/pandaharvester/harvestersubmitter/submitter_common.py +++ b/pandaharvester/harvestersubmitter/submitter_common.py @@ -8,86 +8,102 @@ # Map "pilotType" (defined in harvester) to prodSourceLabel and pilotType option (defined in pilot, -i option) # and piloturl (pilot option --piloturl) for pilot 2 + + def get_complicated_pilot_options(pilot_type, pilot_url=None, pilot_version="", prod_source_label=None, prod_rc_permille=0): # for pilot 3 - is_pilot3 = True if pilot_version.startswith('3') else False + is_pilot3 = True if pilot_version.startswith("3") else False # basic map pt_psl_map = { - 'RC': { - 'prod_source_label': 'rc_test2', - 'pilot_type_opt': 'RC', - 'pilot_url_str': '--piloturl http://cern.ch/atlas-panda-pilot/pilot3-dev.tar.gz' if is_pilot3 \ - else '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz', - 'pilot_debug_str': '-d', - }, - 'ALRB': { - 'prod_source_label': 'rc_alrb', - 'pilot_type_opt': 'ALRB', - 'pilot_url_str': '', - 'pilot_debug_str': '', - }, - 'PT': { - 'prod_source_label': 'ptest', - 'pilot_type_opt': 'PR', - 'pilot_url_str': '--piloturl 
http://cern.ch/atlas-panda-pilot/pilot3-dev2.tar.gz' if is_pilot3 \ - else '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev2.tar.gz', - 'pilot_debug_str': '-d', - }, - 'PR': { - 'prod_source_label': prod_source_label, - 'pilot_type_opt': 'PR', - 'pilot_url_str': '', - 'pilot_debug_str': '', - }, - } + "RC": { + "prod_source_label": "rc_test2", + "pilot_type_opt": "RC", + "pilot_url_str": "--piloturl http://cern.ch/atlas-panda-pilot/pilot3-dev.tar.gz" + if is_pilot3 + else "--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz", + "pilot_debug_str": "-d", + }, + "ALRB": { + "prod_source_label": "rc_alrb", + "pilot_type_opt": "ALRB", + "pilot_url_str": "", + "pilot_debug_str": "", + }, + "PT": { + "prod_source_label": "ptest", + "pilot_type_opt": "PR", + "pilot_url_str": "--piloturl http://cern.ch/atlas-panda-pilot/pilot3-dev2.tar.gz" + if is_pilot3 + else "--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev2.tar.gz", + "pilot_debug_str": "-d", + }, + "PR": { + "prod_source_label": prod_source_label, + "pilot_type_opt": "PR", + "pilot_url_str": "", + "pilot_debug_str": "", + }, + } # get pilot option dict - pilot_opt_dict = pt_psl_map.get(pilot_type, pt_psl_map['PR']) + pilot_opt_dict = pt_psl_map.get(pilot_type, pt_psl_map["PR"]) if pilot_url: # overwrite with specified pilot_url - pilot_opt_dict['pilot_url_str'] = '--piloturl {0}'.format(pilot_url) - elif pilot_type == 'PR': + pilot_opt_dict["pilot_url_str"] = "--piloturl {0}".format(pilot_url) + elif pilot_type == "PR": # randomization of pilot url for PR (managed, user) pilot run some portion of RC version (not RC dev) pilot - prod_rc_pilot_url_str = '--piloturl http://pandaserver.cern.ch:25085/cache/pilot/pilot3-rc.tar.gz' - prod_rc_prob = min(max(prod_rc_permille/1000., 0), 1) + prod_rc_pilot_url_str = "--piloturl http://pandaserver.cern.ch:25085/cache/pilot/pilot3-rc.tar.gz" + prod_rc_prob = min(max(prod_rc_permille / 1000.0, 0), 1) lucky_number = random.random() if lucky_number < prod_rc_prob: - pilot_opt_dict['pilot_url_str'] = prod_rc_pilot_url_str + pilot_opt_dict["pilot_url_str"] = prod_rc_pilot_url_str # return pilot option dict return pilot_opt_dict + # get special flag of pilot wrapper about python version of pilot, and whether to run with python 3 if python version is "3" + + def get_python_version_option(python_version, prod_source_label): - option = '' - if python_version.startswith('3'): - option = '--pythonversion 3' + option = "" + if python_version.startswith("3"): + option = "--pythonversion 3" return option + # get pilot joblabel (-j) option, support unified dispatch + + def get_joblabel(prod_source_label, is_unified_dispatch=False): joblabel = prod_source_label - if is_unified_dispatch and prod_source_label in ['managed', 'user']: - joblabel = 'unified' + if is_unified_dispatch and prod_source_label in ["managed", "user"]: + joblabel = "unified" return joblabel + # get pilot job type (--job-type) option, support unified dispatch + + def get_pilot_job_type(job_type, is_unified_dispatch=False): pilot_job_type = job_type if is_unified_dispatch: - pilot_job_type = 'unified' + pilot_job_type = "unified" return pilot_job_type + # Parse resource type from string for Unified PanDA Queue + + def get_resource_type(string, is_unified_queue, is_pilot_option=False): string = str(string) if not is_unified_queue: - ret = '' - elif string in set(['SCORE', 'MCORE', 'SCORE_HIMEM', 'MCORE_HIMEM']): + ret = "" + elif string in set(["SCORE", "MCORE", "SCORE_HIMEM", "MCORE_HIMEM"]): if is_pilot_option: - ret = 
'--resource-type {0}'.format(string) + ret = "--resource-type {0}".format(string) else: ret = string else: - ret = '' + ret = "" return ret @@ -95,65 +111,62 @@ def get_resource_type(string, is_unified_queue, is_pilot_option=False): # CE stats related functions ############################# + # Compute weight of each CE according to worker stat, return tuple(dict, total weight score) def get_ce_weighting(ce_endpoint_list=[], worker_ce_all_tuple=None, is_slave_queue=False): - multiplier = 1000. + multiplier = 1000.0 n_ce = len(ce_endpoint_list) worker_limits_dict, worker_ce_stats_dict, worker_ce_backend_throughput_dict, time_window, n_new_workers = worker_ce_all_tuple N = float(n_ce) - Q = float(worker_limits_dict['nQueueLimitWorker']) - W = float(worker_limits_dict['maxWorkers']) - Q_good_init = float(sum(worker_ce_backend_throughput_dict[_ce][_st] - for _st in ('submitted', 'running', 'finished') - for _ce in worker_ce_backend_throughput_dict)) - Q_good_fin = float(sum(worker_ce_backend_throughput_dict[_ce][_st] - for _st in ('submitted',) - for _ce in worker_ce_backend_throughput_dict)) - thruput_avg = (log1p(Q_good_init) - log1p(Q_good_fin)) + Q = float(worker_limits_dict["nQueueLimitWorker"]) + W = float(worker_limits_dict["maxWorkers"]) + Q_good_init = float( + sum(worker_ce_backend_throughput_dict[_ce][_st] for _st in ("submitted", "running", "finished") for _ce in worker_ce_backend_throughput_dict) + ) + Q_good_fin = float(sum(worker_ce_backend_throughput_dict[_ce][_st] for _st in ("submitted",) for _ce in worker_ce_backend_throughput_dict)) + thruput_avg = log1p(Q_good_init) - log1p(Q_good_fin) n_new_workers = float(n_new_workers) # target number of queuing target_Q = Q + n_new_workers if is_slave_queue: # take total number of current queuing if slave queue - total_Q = sum(( float(worker_ce_stats_dict[_k]['submitted']) for _k in worker_ce_stats_dict )) + total_Q = sum((float(worker_ce_stats_dict[_k]["submitted"]) for _k in worker_ce_stats_dict)) target_Q = min(total_Q, Q) + n_new_workers def _get_thruput(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_backend_throughput_dict: - q_good_init = 0. - q_good_fin = 0. + q_good_init = 0.0 + q_good_fin = 0.0 else: - q_good_init = float(sum(worker_ce_backend_throughput_dict[_ce_endpoint][_st] - for _st in ('submitted', 'running', 'finished'))) - q_good_fin = float(sum(worker_ce_backend_throughput_dict[_ce_endpoint][_st] - for _st in ('submitted',))) - thruput = (log1p(q_good_init) - log1p(q_good_fin)) + q_good_init = float(sum(worker_ce_backend_throughput_dict[_ce_endpoint][_st] for _st in ("submitted", "running", "finished"))) + q_good_fin = float(sum(worker_ce_backend_throughput_dict[_ce_endpoint][_st] for _st in ("submitted",))) + thruput = log1p(q_good_init) - log1p(q_good_fin) return thruput def _get_thruput_adj_ratio(thruput): # inner function try: - thruput_adj_ratio = thruput/thruput_avg + 1/N + thruput_adj_ratio = thruput / thruput_avg + 1 / N except ZeroDivisionError: - if thruput == 0.: - thruput_adj_ratio = 1/N + if thruput == 0.0: + thruput_adj_ratio = 1 / N else: raise return thruput_adj_ratio - ce_base_weight_sum = sum((_get_thruput_adj_ratio(_get_thruput(_ce)) - for _ce in ce_endpoint_list)) + + ce_base_weight_sum = sum((_get_thruput_adj_ratio(_get_thruput(_ce)) for _ce in ce_endpoint_list)) def _get_init_weight(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_stats_dict: - q = 0. - r = 0. 
+ q = 0.0 + r = 0.0 else: - q = float(worker_ce_stats_dict[_ce_endpoint]['submitted']) - r = float(worker_ce_stats_dict[_ce_endpoint]['running']) + q = float(worker_ce_stats_dict[_ce_endpoint]["submitted"]) + r = float(worker_ce_stats_dict[_ce_endpoint]["running"]) # q_avg = sum(( float(worker_ce_stats_dict[_k]['submitted']) for _k in worker_ce_stats_dict )) / N # r_avg = sum(( float(worker_ce_stats_dict[_k]['running']) for _k in worker_ce_stats_dict )) / N - if ( _ce_endpoint in worker_ce_stats_dict and q > Q ): + if _ce_endpoint in worker_ce_stats_dict and q > Q: return float(0) - ce_base_weight_normalized = _get_thruput_adj_ratio(_get_thruput(_ce_endpoint))/ce_base_weight_sum + ce_base_weight_normalized = _get_thruput_adj_ratio(_get_thruput(_ce_endpoint)) / ce_base_weight_sum # target number of queuing of the CE q_expected = target_Q * ce_base_weight_normalized # weight by difference @@ -162,17 +175,18 @@ def _get_init_weight(_ce_endpoint): # inner function # _weight_r = 1 + N*r/R if r == 0: # Penalty for dead CE (no running worker) - ret = ret / (1 + log1p(q)**2) + ret = ret / (1 + log1p(q) ** 2) return ret + init_weight_iterator = map(_get_init_weight, ce_endpoint_list) sum_of_weights = sum(init_weight_iterator) total_score = multiplier * N try: regulator = total_score / sum_of_weights except ZeroDivisionError: - regulator = 1. + regulator = 1.0 ce_weight_dict = {_ce: _get_init_weight(_ce) * regulator for _ce in ce_endpoint_list} - ce_thruput_dict = {_ce: _get_thruput(_ce) * 86400. / time_window for _ce in ce_endpoint_list} + ce_thruput_dict = {_ce: _get_thruput(_ce) * 86400.0 / time_window for _ce in ce_endpoint_list} return total_score, ce_weight_dict, ce_thruput_dict, target_Q @@ -180,61 +194,64 @@ def _get_init_weight(_ce_endpoint): # inner function def choose_ce(weighting): total_score, ce_weight_dict, ce_thruput_dict, target_Q = weighting lucky_number = random.random() * total_score - cur = 0. 
+ cur = 0.0 ce_now = None for _ce, _w in ce_weight_dict.items(): - if _w == 0.: + if _w == 0.0: continue ce_now = _ce cur += _w if cur >= lucky_number: return _ce - if ce_weight_dict.get(ce_now, -1) > 0.: + if ce_weight_dict.get(ce_now, -1) > 0.0: return ce_now else: return None + # Get better string to display the statistics and weightng of CEs + + def get_ce_stats_weighting_display(ce_list, worker_ce_all_tuple, ce_weighting): worker_limits_dict, worker_ce_stats_dict, worker_ce_backend_throughput_dict, time_window, n_new_workers = worker_ce_all_tuple total_score, ce_weight_dict, ce_thruput_dict, target_Q = ce_weighting - worker_ce_stats_dict_sub_default = {'submitted': 0, 'running': 0} - worker_ce_backend_throughput_dict_sub_default = {'submitted': 0, 'running': 0, 'finished': 0} + worker_ce_stats_dict_sub_default = {"submitted": 0, "running": 0} + worker_ce_backend_throughput_dict_sub_default = {"submitted": 0, "running": 0, "finished": 0} general_dict = { - 'maxWorkers': int(worker_limits_dict.get('maxWorkers')), - 'nQueueLimitWorker': int(worker_limits_dict.get('nQueueLimitWorker')), - 'nNewWorkers': int(n_new_workers), - 'target_Q': int(target_Q), - 'history_time_window': int(time_window), - } + "maxWorkers": int(worker_limits_dict.get("maxWorkers")), + "nQueueLimitWorker": int(worker_limits_dict.get("nQueueLimitWorker")), + "nNewWorkers": int(n_new_workers), + "target_Q": int(target_Q), + "history_time_window": int(time_window), + } general_str = ( - 'maxWorkers={maxWorkers} ' - 'nQueueLimitWorker={nQueueLimitWorker} ' - 'nNewWorkers={nNewWorkers} ' - 'target_Q={target_Q} ' - 'hist_timeWindow={history_time_window} ' - ).format(**general_dict) + "maxWorkers={maxWorkers} " + "nQueueLimitWorker={nQueueLimitWorker} " + "nNewWorkers={nNewWorkers} " + "target_Q={target_Q} " + "hist_timeWindow={history_time_window} " + ).format(**general_dict) ce_str_list = [] for _ce in ce_list: schema_sub_dict = { - 'submitted_now': int(worker_ce_stats_dict.get(_ce, worker_ce_stats_dict_sub_default).get('submitted')), - 'running_now': int(worker_ce_stats_dict.get(_ce, worker_ce_stats_dict_sub_default).get('running')), - 'submitted_history': int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get('submitted')), - 'running_history': int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get('running')), - 'finished_history': int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get('finished')), - 'thruput_score': ce_thruput_dict.get(_ce), - 'weight_score': ce_weight_dict.get(_ce), - } + "submitted_now": int(worker_ce_stats_dict.get(_ce, worker_ce_stats_dict_sub_default).get("submitted")), + "running_now": int(worker_ce_stats_dict.get(_ce, worker_ce_stats_dict_sub_default).get("running")), + "submitted_history": int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get("submitted")), + "running_history": int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get("running")), + "finished_history": int(worker_ce_backend_throughput_dict.get(_ce, worker_ce_backend_throughput_dict_sub_default).get("finished")), + "thruput_score": ce_thruput_dict.get(_ce), + "weight_score": ce_weight_dict.get(_ce), + } ce_str = ( - '"{_ce}": ' - 'now_S={submitted_now} ' - 'now_R={running_now} ' - 'hist_S={submitted_history} ' - 'hist_R={running_history} ' - 'hist_F={finished_history} ' - 'T={thruput_score:.02f} ' - 
'W={weight_score:.03f} ' - ).format(_ce=_ce, **schema_sub_dict) + '"{_ce}": ' + "now_S={submitted_now} " + "now_R={running_now} " + "hist_S={submitted_history} " + "hist_R={running_history} " + "hist_F={finished_history} " + "T={thruput_score:.02f} " + "W={weight_score:.03f} " + ).format(_ce=_ce, **schema_sub_dict) ce_str_list.append(ce_str) - stats_weighting_display_str = general_str + ' ; ' + ' , '.join(ce_str_list) + stats_weighting_display_str = general_str + " ; " + " , ".join(ce_str_list) return stats_weighting_display_str diff --git a/pandaharvester/harvestersweeper/act_sweeper.py b/pandaharvester/harvestersweeper/act_sweeper.py index 179f93f6..5f10d265 100644 --- a/pandaharvester/harvestersweeper/act_sweeper.py +++ b/pandaharvester/harvestersweeper/act_sweeper.py @@ -8,7 +8,7 @@ from act.atlas.aCTDBPanda import aCTDBPanda # logger -baseLogger = core_utils.setup_logger('act_sweeper') +baseLogger = core_utils.setup_logger("act_sweeper") # plugin for aCT sweeper @@ -17,17 +17,17 @@ class ACTSweeper(PluginBase): def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - self.log = core_utils.make_logger(baseLogger, 'aCT sweeper', method_name='__init__') + self.log = core_utils.make_logger(baseLogger, "aCT sweeper", method_name="__init__") try: self.actDB = aCTDBPanda(self.log) except Exception as e: - self.log.error('Could not connect to aCT database: {0}'.format(str(e))) + self.log.error("Could not connect to aCT database: {0}".format(str(e))) self.actDB = None - # kill a worker + def kill_worker(self, workspec): - """ Mark aCT job as tobekilled. + """Mark aCT job as tobekilled. :param workspec: worker specification :type workspec: WorkSpec @@ -35,28 +35,27 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") if workspec.batchID is None: - tmpLog.info('workerID={0} has no batch ID so assume was not submitted - skipped'.format( - workspec.workerID)) - return True, '' + tmpLog.info("workerID={0} has no batch ID so assume was not submitted - skipped".format(workspec.workerID)) + return True, "" try: # Only kill jobs which are still active - self.actDB.updateJobs("id={0} AND actpandastatus IN ('sent', 'starting', 'running')".format(workspec.batchID), - {'actpandastatus': 'tobekilled', 'pandastatus': None}) + self.actDB.updateJobs( + "id={0} AND actpandastatus IN ('sent', 'starting', 'running')".format(workspec.batchID), {"actpandastatus": "tobekilled", "pandastatus": None} + ) except Exception as e: if self.actDB: - tmpLog.error('Failed to cancel job {0} in aCT: {1}'.format(workspec.batchID, str(e))) + tmpLog.error("Failed to cancel job {0} in aCT: {1}".format(workspec.batchID, str(e))) return False, str(e) - tmpLog.info('Job {0} cancelled in aCT'.format(workspec.batchID)) - return True, '' - + tmpLog.info("Job {0} cancelled in aCT".format(workspec.batchID)) + return True, "" # cleanup for a worker + def sweep_worker(self, workspec): """Clean up access point. aCT takes care of archiving its own jobs. 
@@ -66,13 +65,12 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmpLog = core_utils.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") # clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info('removed {0}'.format(workspec.accessPoint)) + tmpLog.info("removed {0}".format(workspec.accessPoint)) else: - tmpLog.info('access point {0} already removed.'.format(workspec.accessPoint)) + tmpLog.info("access point {0} already removed.".format(workspec.accessPoint)) # return - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/apfgrid_sweeper.py b/pandaharvester/harvestersweeper/apfgrid_sweeper.py index f9e07023..72a688c9 100644 --- a/pandaharvester/harvestersweeper/apfgrid_sweeper.py +++ b/pandaharvester/harvestersweeper/apfgrid_sweeper.py @@ -27,35 +27,28 @@ def __call__(self, *args, **kwargs): return self.__instance - # dummy plugin for sweeper class APFGridSweeper(object): __metaclass__ = APFGridSweeperSingleton - + STATUS_MAP = { - 1 : WorkSpec.ST_submitted, - 2 : WorkSpec.ST_running, - 3 : WorkSpec.ST_cancelled, - 4 : WorkSpec.ST_finished, - 5 : WorkSpec.ST_failed, - 6 : WorkSpec.ST_ready, - } - - JOBQUERYATTRIBUTES = ['match_apf_queue', - 'jobstatus', - 'workerid', - 'apf_queue', - 'clusterid', - 'procid' - ] - + 1: WorkSpec.ST_submitted, + 2: WorkSpec.ST_running, + 3: WorkSpec.ST_cancelled, + 4: WorkSpec.ST_finished, + 5: WorkSpec.ST_failed, + 6: WorkSpec.ST_ready, + } + + JOBQUERYATTRIBUTES = ["match_apf_queue", "jobstatus", "workerid", "apf_queue", "clusterid", "procid"] + # constructor def __init__(self, **kwarg): self.log = core_utils.make_logger(baseLogger) self.jobinfo = None self.allbyworkerid = {} - self.log.debug('APFGridSweeper initialized.') + self.log.debug("APFGridSweeper initialized.") def _updateJobInfo(self): self.log.debug("Getting job info from Condor...") @@ -64,15 +57,15 @@ def _updateJobInfo(self): self.jobinfo = out for jobad in self.jobinfo: try: - workerid = jobad['workerid'] - self.allbyworkerid[workerid]= jobad + workerid = jobad["workerid"] + self.allbyworkerid[workerid] = jobad except KeyError: # some non-harvester jobs may not have workerids, ignore them pass self.log.debug("All jobs indexed by worker_id. %d entries." % len(self.allbyworkerid)) - # kill a worker + def kill_worker(self, workspec): """Kill a single worker in a scheduling system like batch systems and computing elements. @@ -85,16 +78,16 @@ def kill_worker(self, workspec): self._updateJobInfo() try: jobad = self.allbyworkerid(workspec.workerID) - clusterid = jobad['clusterid'] - procid = jobad['procid'] + clusterid = jobad["clusterid"] + procid = jobad["procid"] killstr = "%s.%s" % (clusterid, procid) - self.log.debug("Killing condor job %s ..."% killstr) + self.log.debug("Killing condor job %s ..." 
% killstr) condorlib.condor_rm([killstr]) - self.log.debug("Killed condor job %s with workerid %s"% (killstr,workspec.workerID )) + self.log.debug("Killed condor job %s with workerid %s" % (killstr, workspec.workerID)) except KeyError: - self.log.warning("kill_worker called on non-existent workerid: %s" % workspec.workerID ) - - return True, '' + self.log.warning("kill_worker called on non-existent workerid: %s" % workspec.workerID) + + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -105,4 +98,4 @@ def sweep_worker(self, workspec): :return: A tuple of return code (True for success, False otherwise) and error dialog :rtype: (bool, string) """ - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/arc_sweeper.py b/pandaharvester/harvestersweeper/arc_sweeper.py index 3efb72d9..2a339fcc 100644 --- a/pandaharvester/harvestersweeper/arc_sweeper.py +++ b/pandaharvester/harvestersweeper/arc_sweeper.py @@ -8,19 +8,18 @@ # logger baselogger = core_utils.setup_logger() + class ARCSweeper(PluginBase): - '''Sweeper for killing and cleaning ARC jobs''' - + """Sweeper for killing and cleaning ARC jobs""" + # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - + # Credential dictionary role: proxy file - self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)], - list(harvester_config.credmanager.outCertFile))) + self.certs = dict(zip([r.split("=")[1] for r in list(harvester_config.credmanager.voms)], list(harvester_config.credmanager.outCertFile))) self.cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials) - def kill_worker(self, workspec): """Cancel the ARC job. @@ -29,7 +28,7 @@ def kill_worker(self, workspec): :return: A tuple of return code (True for success, False otherwise) and error dialog :rtype: (bool, string) """ - + # make logger arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) tmplog = arclog.log @@ -38,16 +37,16 @@ def kill_worker(self, workspec): if not job.JobID: # Job not submitted tmplog.info("Job was not submitted so cannot be cancelled") - return True, '' + return True, "" # Set certificate userconfig = arc.UserConfig(self.cred_type) try: userconfig.ProxyPath(str(self.certs[proxyrole])) - except: + except BaseException: # Log a warning and return True so that job can be cleaned tmplog.warning("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole)) - return True, '' + return True, "" job_supervisor = arc.JobSupervisor(userconfig, [job]) job_supervisor.Update() @@ -60,7 +59,7 @@ def kill_worker(self, workspec): # If longer than one hour since submission assume job never made it if job.SubmissionTime + arc.Period(3600) < arc.Time(): tmplog.warning("Assuming job is lost and marking as cancelled") - return True, '' + return True, "" # Job has not yet reached info system tmplog.warning("Job is not yet in info system so cannot be cancelled") @@ -68,11 +67,10 @@ def kill_worker(self, workspec): # Log a warning and return True so that job can be cleaned tmplog.warning("Job could not be cancelled") - return True, '' + return True, "" tmplog.info("Job cancelled successfully") - return True, '' - + return True, "" def sweep_worker(self, workspec): """Clean the ARC job @@ -82,8 +80,8 @@ def sweep_worker(self, workspec): :return: A tuple of return code (True for success, False otherwise) and error dialog :rtype: (bool, string) """ - - # make logger + + # make logger arclog = arc_utils.ARCLogger(baselogger, workspec.workerID) 
tmplog = arclog.log @@ -91,57 +89,63 @@ def sweep_worker(self, workspec): if not job.JobID: # Job not submitted tmplog.info("Job was not submitted so cannot be cleaned") - return True, '' + return True, "" # Set certificate userconfig = arc.UserConfig(self.cred_type) try: userconfig.ProxyPath(str(self.certs[proxyrole])) - except: + except BaseException: # Log a warning and return True so that job can be cleaned tmplog.warning("Job {0}: no proxy found with role {1}".format(job.JobID, proxyrole)) - return True, '' + return True, "" job_supervisor = arc.JobSupervisor(userconfig, [job]) job_supervisor.Update() job_supervisor.Clean() - + notcleaned = job_supervisor.GetIDsNotProcessed() if job.JobID in notcleaned: # Log a warning and return True so that job can be finished tmplog.warning("Job could not be cleaned") - return True, '' + return True, "" tmplog.info("Job cleaned successfully") - return True, '' - + return True, "" def test(jobid): - '''Kill a job''' + """Kill a job""" from pandaharvester.harvestercore.work_spec import WorkSpec import json + wspec = WorkSpec() wspec.batchID = jobid workAttributes = {"arcjob": {}} workAttributes["arcjob"]["JobID"] = wspec.batchID - workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(urlparse.urlparse(jobid).netloc, wspec.batchID) + workAttributes["arcjob"]["JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format( + urlparse.urlparse(jobid).netloc, wspec.batchID + ) workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng" jobmanagementurl = arc.URL(wspec.batchID) jobmanagementurl.ChangePath("/jobs") workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str() workAttributes["arcjob"]["JobManagementInterfaceName"] = "org.nordugrid.gridftpjob" - + wspec.workAttributes = workAttributes - print (wspec.workAttributes) + print(wspec.workAttributes) sweeper = ARCSweeper() - print (sweeper.kill_worker(wspec)) + print(sweeper.kill_worker(wspec)) + if __name__ == "__main__": - import time, sys, urlparse + import time + import sys + import urlparse + if len(sys.argv) != 2: - print ("Please give ARC job id") + print("Please give ARC job id") sys.exit(1) test(sys.argv[1]) diff --git a/pandaharvester/harvestersweeper/cloud_google_sweeper.py b/pandaharvester/harvestersweeper/cloud_google_sweeper.py index 09bca861..fd823462 100644 --- a/pandaharvester/harvestersweeper/cloud_google_sweeper.py +++ b/pandaharvester/harvestersweeper/cloud_google_sweeper.py @@ -4,12 +4,14 @@ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper import googleapiclient -base_logger = core_utils.setup_logger('google_sweeper') +base_logger = core_utils.setup_logger("google_sweeper") + class GoogleSweeper(PluginBase): """ Sweeper with kill/clean-up functions for Google Compute Engine """ + def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) self.queue_config_mapper = QueueConfigMapper() @@ -33,21 +35,21 @@ def kill_worker(self, work_spec): except AttributeError: zone = ZONE - base_logger.debug('Going to kill VM {0}'.format(vm_name)) + base_logger.debug("Going to kill VM {0}".format(vm_name)) compute.instances().delete(project=PROJECT, zone=zone, instance=vm_name).execute() - base_logger.debug('Killed VM {0}'.format(vm_name)) - return True, '' + base_logger.debug("Killed VM {0}".format(vm_name)) + return True, "" except googleapiclient.errors.HttpError as e: - if 'was not found' in e.content: + if "was not found" 
in e.content: # the VM was already killed or does not exist for any other reason - message = 'VM does not exist'.format(vm_name) + message = "VM does not exist".format(vm_name) base_logger.debug(message) return True, message else: # there was an issue killing the VM and it should be retried at another time - return False, 'Problems killing the VM: {0}'.format(e) + return False, "Problems killing the VM: {0}".format(e) except Exception as e: - return False, 'Problems killing the VM: {0}'.format(e) + return False, "Problems killing the VM: {0}".format(e) def sweep_worker(self, work_spec): """ diff --git a/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py b/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py index 9b589c63..da04f11b 100644 --- a/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py +++ b/pandaharvester/harvestersweeper/cloud_openstack_sweeper.py @@ -6,7 +6,7 @@ # setup base logger -baseLogger = core_utils.setup_logger('cloud_openstack_sweeper') +baseLogger = core_utils.setup_logger("cloud_openstack_sweeper") # Cloud Openstack submitter @@ -16,33 +16,33 @@ def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) self.vm_client = OS_SimpleClient(auth_config_json_file=self.authConfigFile) - # kill a worker + def kill_worker(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='kill_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") # initial return values - tmpRetVal = (None, 'Nothing done') + tmpRetVal = (None, "Nothing done") # kill vm vm_id = workspec.batchID try: self.vm_client.nova.servers.delete(vm_id) except Exception as _e: - errStr = 'Failed to delete a VM with id={0} ; {1}'.format(vm_id, _e) + errStr = "Failed to delete a VM with id={0} ; {1}".format(vm_id, _e) tmpLog.error(errStr) tmpRetVal = (False, errStr) else: - tmpLog.info('Deleted a VM with id={0}'.format(vm_id)) - tmpRetVal = (True, '') + tmpLog.info("Deleted a VM with id={0}".format(vm_id)) + tmpRetVal = (True, "") return tmpRetVal - # cleanup for a worker + def sweep_worker(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='sweep_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/cobalt_sweeper.py b/pandaharvester/harvestersweeper/cobalt_sweeper.py index c8da85d6..a3432f4d 100644 --- a/pandaharvester/harvestersweeper/cobalt_sweeper.py +++ b/pandaharvester/harvestersweeper/cobalt_sweeper.py @@ -1,26 +1,27 @@ - -#=== Imports ================================================== +# === Imports ================================================== from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestercore import core_utils import os + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess import shutil -#============================================================== +# ============================================================== -#=== Definitions ============================================== +# === Definitions ============================================== -## Logger -baseLogger = core_utils.setup_logger('cobalt_sweeper') +# Logger +baseLogger = core_utils.setup_logger("cobalt_sweeper") -#============================================================== +# 
============================================================== + +# === Functions ================================================ -#=== Functions ================================================ def _runShell(cmd): cmd = str(cmd) @@ -29,11 +30,14 @@ def _runShell(cmd): retCode = p.returncode return (retCode, stdOut, stdErr) -#============================================================== -#=== Classes ================================================== +# ============================================================== + +# === Classes ================================================== # dummy plugin for sweeper + + class CobaltSweeper(PluginBase): # constructor def __init__(self, **kwarg): @@ -49,23 +53,22 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ - ## Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + # Make logger + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") - ## Kill command - comStr = 'qdel {0}'.format(workspec.batchID) + # Kill command + comStr = "qdel {0}".format(workspec.batchID) (retCode, stdOut, stdErr) = _runShell(comStr) if retCode != 0: - ## Command failed + # Command failed errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr) tmpLog.error(errStr) return False, errStr else: - tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.workerID)) + tmpLog.info("Succeeded to kill workerID={0} batchID={1}".format(workspec.workerID, workspec.workerID)) - ## Return - return True, '' + # Return + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -77,17 +80,17 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ - ## Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + # Make logger + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") - ## Clean up worker directory + # Clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info(' removed {1}'.format(workspec.workerID, workspec.accessPoint)) + tmpLog.info(" removed {1}".format(workspec.workerID, workspec.accessPoint)) else: - tmpLog.info('access point already removed.') - ## Return - return True, '' + tmpLog.info("access point already removed.") + # Return + return True, "" + -#============================================================== +# ============================================================== diff --git a/pandaharvester/harvestersweeper/dummy_sweeper.py b/pandaharvester/harvestersweeper/dummy_sweeper.py index bcb72073..b2d93d20 100644 --- a/pandaharvester/harvestersweeper/dummy_sweeper.py +++ b/pandaharvester/harvestersweeper/dummy_sweeper.py @@ -16,7 +16,7 @@ def kill_worker(self, workspec): :return: A tuple of return code (True for success, False otherwise) and error dialog :rtype: (bool, string) """ - return True, '' + return True, "" # kill workers def kill_workers(self, workspec_list): @@ -28,7 +28,7 @@ def kill_workers(self, workspec_list): """ retList = [] for workspec in workspec_list: - retList.append((True, '')) + retList.append((True, "")) return retList # cleanup for a worker @@ -43,4 +43,4 @@ def sweep_worker(self, workspec): :return: A tuple of return code (True for success, False otherwise) and error dialog :rtype: (bool, string) """ - return True, '' + 
return True, "" diff --git a/pandaharvester/harvestersweeper/gitlab_sweeper.py b/pandaharvester/harvestersweeper/gitlab_sweeper.py index aae28006..5dbf4b4d 100644 --- a/pandaharvester/harvestersweeper/gitlab_sweeper.py +++ b/pandaharvester/harvestersweeper/gitlab_sweeper.py @@ -1,6 +1,7 @@ import os import shutil import requests + try: import subprocess32 as subprocess except ImportError: @@ -12,7 +13,7 @@ # logger -baseLogger = core_utils.setup_logger('gitlab_sweeper') +baseLogger = core_utils.setup_logger("gitlab_sweeper") # plugin for sweeper with Gitlab @@ -32,23 +33,20 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") params = get_job_params(workspec) - url = '{}/{}/pipelines/{}/cancel'.format(params['project_api'], params['project_id'], - workspec.batchID.split()[0]) + url = "{}/{}/pipelines/{}/cancel".format(params["project_api"], params["project_id"], workspec.batchID.split()[0]) try: - tmpLog.debug('cancel pipeline at {}'.format(url)) - r = requests.get(url, headers={'PRIVATE-TOKEN': params['secrets'][params['access_token']]}, - timeout=self.timeout) + tmpLog.debug("cancel pipeline at {}".format(url)) + r = requests.get(url, headers={"PRIVATE-TOKEN": params["secrets"][params["access_token"]]}, timeout=self.timeout) response = r.json() - tmpLog.debug('got {}'.format(str(response))) + tmpLog.debug("got {}".format(str(response))) except Exception: err_str = core_utils.dump_error_message(tmpLog) tmpLog.error(err_str) - tmpLog.debug('done') + tmpLog.debug("done") # return - return True, '' + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -60,13 +58,12 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") # clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info('removed {0}'.format(workspec.accessPoint)) + tmpLog.info("removed {0}".format(workspec.accessPoint)) else: - tmpLog.info('access point already removed.') + tmpLog.info("access point already removed.") # return - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/htcondor_sweeper.py b/pandaharvester/harvestersweeper/htcondor_sweeper.py index 30e0f710..5ab39eee 100644 --- a/pandaharvester/harvestersweeper/htcondor_sweeper.py +++ b/pandaharvester/harvestersweeper/htcondor_sweeper.py @@ -15,7 +15,7 @@ # Logger -baseLogger = core_utils.setup_logger('htcondor_sweeper') +baseLogger = core_utils.setup_logger("htcondor_sweeper") # sweeper for HTCONDOR batch system @@ -81,12 +81,12 @@ def __init__(self, **kwarg): # # Return # return True, '' - # kill workers + def kill_workers(self, workspec_list): # Make logger - tmpLog = self.make_logger(baseLogger, method_name='kill_workers') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, method_name="kill_workers") + tmpLog.debug("start") # Initialization all_job_ret_map = {} retList = [] @@ -97,36 +97,34 @@ def kill_workers(self, workspec_list): ret_map = condor_job_manage.remove(batchIDs_list) except Exception as e: ret_map = {} - ret_err_str = 'Exception {0}: 
{1}'.format(e.__class__.__name__, e) + ret_err_str = "Exception {0}: {1}".format(e.__class__.__name__, e) tmpLog.error(ret_err_str) all_job_ret_map.update(ret_map) # Fill return list for workspec in workspec_list: if workspec.batchID is None: - ret = (True, 'worker without batchID; skipped') + ret = (True, "worker without batchID; skipped") else: - ret = all_job_ret_map.get(condor_job_id_from_workspec(workspec), - (False, 'batch job not found in return map')) + ret = all_job_ret_map.get(condor_job_id_from_workspec(workspec), (False, "batch job not found in return map")) retList.append(ret) - tmpLog.debug('done') + tmpLog.debug("done") # Return return retList # cleanup for a worker def sweep_worker(self, workspec): # Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") + tmpLog.debug("start") # Clean up preparator base directory (staged-in files) try: preparatorBasePath = self.preparatorBasePath except AttributeError: - tmpLog.debug('No preparator base directory is configured. Skipped cleaning up preparator directory') + tmpLog.debug("No preparator base directory is configured. Skipped cleaning up preparator directory") else: if os.path.isdir(preparatorBasePath): if not workspec.get_jobspec_list(): - tmpLog.warning('No job PandaID found relate to workerID={0}. Skipped cleaning up preparator directory'.format(workspec.workerID)) + tmpLog.warning("No job PandaID found relate to workerID={0}. Skipped cleaning up preparator directory".format(workspec.workerID)) else: for jobspec in workspec.get_jobspec_list(): preparator_dir_for_cleanup = os.path.join(preparatorBasePath, str(jobspec.PandaID)) @@ -134,19 +132,21 @@ def sweep_worker(self, workspec): try: shutil.rmtree(preparator_dir_for_cleanup) except OSError as _err: - if 'No such file or directory' in _err.strerror: - tmpLog.debug('Found that {0} was already removed'.format(_err.filename)) + if "No such file or directory" in _err.strerror: + tmpLog.debug("Found that {0} was already removed".format(_err.filename)) pass - tmpLog.info('Succeeded to clean up preparator directory: Removed {0}'.format(preparator_dir_for_cleanup)) + tmpLog.info("Succeeded to clean up preparator directory: Removed {0}".format(preparator_dir_for_cleanup)) else: - errStr = 'Failed to clean up preparator directory: {0} does not exist or invalid to be cleaned up'.format(preparator_dir_for_cleanup) + errStr = "Failed to clean up preparator directory: {0} does not exist or invalid to be cleaned up".format( + preparator_dir_for_cleanup + ) tmpLog.error(errStr) return False, errStr else: - errStr = 'Configuration error: Preparator base directory {0} does not exist'.format(preparatorBasePath) + errStr = "Configuration error: Preparator base directory {0} does not exist".format(preparatorBasePath) tmpLog.error(errStr) return False, errStr - tmpLog.info('Succeeded to clean up everything about workerID={0}'.format(workspec.workerID)) - tmpLog.debug('done') + tmpLog.info("Succeeded to clean up everything about workerID={0}".format(workspec.workerID)) + tmpLog.debug("done") # Return - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index fe78346f..402065e6 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -4,7 +4,7 @@ from 
pandaharvester.harvestermisc.info_utils_k8s import PandaQueuesDictK8s # logger -base_logger = core_utils.setup_logger('k8s_sweeper') +base_logger = core_utils.setup_logger("k8s_sweeper") # sweeper for K8S @@ -20,39 +20,38 @@ def __init__(self, **kwarg): # kill workers def kill_workers(self, work_spec_list): - tmp_log = self.make_logger(base_logger, method_name='kill_workers') + tmp_log = self.make_logger(base_logger, method_name="kill_workers") ret_list = [] for work_spec in work_spec_list: - tmp_ret_val = (None, 'Nothing done') + tmp_ret_val = (None, "Nothing done") batch_id = work_spec.batchID worker_id = str(work_spec.workerID) if batch_id: # sometimes there are missed workers that were not submitted - # if push mode, delete the configmap - if work_spec.mapType != 'NoJob': + if work_spec.mapType != "NoJob": try: self.k8s_client.delete_config_map(worker_id) - tmp_log.debug('Deleted configmap {0}'.format(worker_id)) + tmp_log.debug("Deleted configmap {0}".format(worker_id)) except Exception as _e: - err_str = 'Failed to delete a CONFIGMAP with id={0} ; {1}'.format(worker_id, _e) + err_str = "Failed to delete a CONFIGMAP with id={0} ; {1}".format(worker_id, _e) tmp_log.error(err_str) tmp_ret_val = (False, err_str) else: - tmp_log.debug('No pandajob/configmap associated to worker {0}'.format(work_spec.workerID)) + tmp_log.debug("No pandajob/configmap associated to worker {0}".format(work_spec.workerID)) # delete the job try: self.k8s_client.delete_job(batch_id) - tmp_log.debug('Deleted JOB {0}'.format(batch_id)) + tmp_log.debug("Deleted JOB {0}".format(batch_id)) except Exception as _e: - err_str = 'Failed to delete a JOB with id={0} ; {1}'.format(batch_id, _e) + err_str = "Failed to delete a JOB with id={0} ; {1}".format(batch_id, _e) tmp_log.error(err_str) tmp_ret_val = (False, err_str) else: # the worker does not need be cleaned - tmp_ret_val = (True, '') + tmp_ret_val = (True, "") ret_list.append(tmp_ret_val) @@ -60,11 +59,11 @@ def kill_workers(self, work_spec_list): def sweep_worker(self, work_spec): # cleanup for a worker - tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), method_name='sweep_worker') + tmp_log = self.make_logger(base_logger, "workerID={0}".format(work_spec.workerID), method_name="sweep_worker") # retrieve and upload the logs to panda cache # batch_id = work_spec.batchID # log_content = self.k8s_client.retrieve_pod_log(batch_id) # nothing to do - return True, '' \ No newline at end of file + return True, "" diff --git a/pandaharvester/harvestersweeper/lancium_sweeper.py b/pandaharvester/harvestersweeper/lancium_sweeper.py index 3922ca34..c7f6155b 100644 --- a/pandaharvester/harvestersweeper/lancium_sweeper.py +++ b/pandaharvester/harvestersweeper/lancium_sweeper.py @@ -5,7 +5,7 @@ from pandaharvester.harvestermisc.lancium_utils import LanciumClient # logger -base_logger = core_utils.setup_logger('lancium_sweeper') +base_logger = core_utils.setup_logger("lancium_sweeper") # sweeper for Lancium @@ -18,37 +18,36 @@ def __init__(self, **kwarg): # kill workers def kill_workers(self, workspec_list): - tmp_log = self.make_logger(base_logger, method_name='kill_workers') - tmp_log.debug('Start') + tmp_log = self.make_logger(base_logger, method_name="kill_workers") + tmp_log.debug("Start") ret_list = [] for workspec in workspec_list: - tmp_log.debug('Running kill_worker for {0}'.format(workspec.workerID)) + tmp_log.debug("Running kill_worker for {0}".format(workspec.workerID)) tmp_ret_val = self.kill_worker(workspec) ret_list.append(tmp_ret_val) 
- tmp_log.debug('Done') + tmp_log.debug("Done") return ret_list def kill_worker(self, workspec): - tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(workspec.workerID), method_name='kill_worker') + tmp_log = self.make_logger(base_logger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") batch_id = workspec.batchID - tmp_log.debug('Running kill_worker') + tmp_log.debug("Running kill_worker") if batch_id: # sometimes there are missed workers that were not submitted try: self.lancium_client.delete_job(batch_id) - tmp_log.debug('Deleted job {0}'.format(batch_id)) - return True, '' + tmp_log.debug("Deleted job {0}".format(batch_id)) + return True, "" except Exception as _e: - err_str = 'Failed to delete a job with id={0} ; {1}'.format(batch_id, _e) + err_str = "Failed to delete a job with id={0} ; {1}".format(batch_id, _e) tmp_log.error(err_str) return False, err_str else: # the worker does not need be cleaned - tmp_log.debug('No action necessary, since no batch ID') - return True, '' + tmp_log.debug("No action necessary, since no batch ID") + return True, "" def sweep_worker(self, workspec): # cleanup for a worker - tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(workspec.workerID), method_name='sweep_worker') - tmp_log.debug('Returning kill_worker') + tmp_log = self.make_logger(base_logger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") + tmp_log.debug("Returning kill_worker") return self.kill_worker(workspec) - diff --git a/pandaharvester/harvestersweeper/lsf_sweeper.py b/pandaharvester/harvestersweeper/lsf_sweeper.py index 7c0c29d4..935f88f3 100644 --- a/pandaharvester/harvestersweeper/lsf_sweeper.py +++ b/pandaharvester/harvestersweeper/lsf_sweeper.py @@ -1,15 +1,16 @@ import os import shutil + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('lsf_sweeper') +baseLogger = core_utils.setup_logger("lsf_sweeper") # plugin for sweeper with LSF @@ -28,10 +29,9 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") # kill command - comStr = 'bkill {0}'.format(workspec.batchID) + comStr = "bkill {0}".format(workspec.batchID) # execute p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() @@ -42,9 +42,9 @@ def kill_worker(self, workspec): tmpLog.error(errStr) return False, errStr else: - tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.workerID)) + tmpLog.info("Succeeded to kill workerID={0} batchID={1}".format(workspec.workerID, workspec.workerID)) # return - return True, '' + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -56,13 +56,12 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") # clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - 
tmpLog.info('removed {0}'.format(workspec.accessPoint)) + tmpLog.info("removed {0}".format(workspec.accessPoint)) else: - tmpLog.info('access point already removed.') + tmpLog.info("access point already removed.") # return - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/pbs_sweeper.py b/pandaharvester/harvestersweeper/pbs_sweeper.py index e742e618..f3245ed4 100644 --- a/pandaharvester/harvestersweeper/pbs_sweeper.py +++ b/pandaharvester/harvestersweeper/pbs_sweeper.py @@ -1,15 +1,16 @@ import os import shutil + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('pbs_sweeper') +baseLogger = core_utils.setup_logger("pbs_sweeper") # plugin for sweeper with PBS @@ -28,10 +29,9 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") # kill command - comStr = 'qdel {0}'.format(workspec.batchID) + comStr = "qdel {0}".format(workspec.batchID) # execute p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() @@ -42,9 +42,9 @@ def kill_worker(self, workspec): tmpLog.error(errStr) return False, errStr else: - tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.workerID)) + tmpLog.info("Succeeded to kill workerID={0} batchID={1}".format(workspec.workerID, workspec.workerID)) # return - return True, '' + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -56,13 +56,12 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") # clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info('removed {0}'.format(workspec.accessPoint)) + tmpLog.info("removed {0}".format(workspec.accessPoint)) else: - tmpLog.info('access point already removed.') + tmpLog.info("access point already removed.") # return - return True, '' + return True, "" diff --git a/pandaharvester/harvestersweeper/saga_sweeper.py b/pandaharvester/harvestersweeper/saga_sweeper.py index 1ff3eb58..f87c3755 100644 --- a/pandaharvester/harvestersweeper/saga_sweeper.py +++ b/pandaharvester/harvestersweeper/saga_sweeper.py @@ -8,7 +8,7 @@ from pandaharvester.harvestersubmitter.saga_submitter import SAGASubmitter # logger -baseLogger = core_utils.setup_logger('saga_sweeper') +baseLogger = core_utils.setup_logger("saga_sweeper") # dummy plugin for sweeper @@ -16,7 +16,7 @@ class SAGASweeper(PluginBase): # constructor def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) - tmpLog = core_utils.make_logger(baseLogger, method_name='__init__') + tmpLog = core_utils.make_logger(baseLogger, method_name="__init__") tmpLog.info("[{0}] SAGA adaptor will be used".format(self.adaptor)) # kill a worker @@ -29,28 +29,23 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ job_service = saga.job.Service(self.adaptor) - tmpLog = 
self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') - tmpLog.info("[{0}] SAGA adaptor will be used to kill worker {1} with batchid {2}".format(self.adaptor, - workspec.workerID, - workspec.batchID)) - errStr = '' + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") + tmpLog.info("[{0}] SAGA adaptor will be used to kill worker {1} with batchid {2}".format(self.adaptor, workspec.workerID, workspec.batchID)) + errStr = "" if workspec.batchID: - saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workspec.batchID) + saga_submission_id = "[{0}]-[{1}]".format(self.adaptor, workspec.batchID) try: worker = job_service.get_job(saga_submission_id) - tmpLog.info( - 'SAGA State for submission with batchid: {0} is: {1}'.format(workspec.batchID, worker.state)) + tmpLog.info("SAGA State for submission with batchid: {0} is: {1}".format(workspec.batchID, worker.state)) harvester_job_state = SAGASubmitter.status_translator(worker.state) - tmpLog.info( - 'Worker state with batchid: {0} is: {1}'.format(workspec.batchID, harvester_job_state)) + tmpLog.info("Worker state with batchid: {0} is: {1}".format(workspec.batchID, harvester_job_state)) if worker.state in [saga.job.PENDING, saga.job.RUNNING]: worker.cancel() tmpLog.info("Worker {0} with batchid {1} canceled".format(workspec.workerID, workspec.batchID)) except saga.SagaException as ex: errStr = ex.get_message() - tmpLog.info('An exception occured during canceling of worker: {0}'.format(errStr)) + tmpLog.info("An exception occured during canceling of worker: {0}".format(errStr)) # probably 'failed' is not proper state in this case, 'undefined' looks a bit better # harvester_job_state = workspec.ST_failed @@ -69,15 +64,14 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ - ## Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + # Make logger + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") - ## Clean up worker directory + # Clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info(' removed {1}'.format(workspec.workerID, workspec.accessPoint)) + tmpLog.info(" removed {1}".format(workspec.workerID, workspec.accessPoint)) else: - tmpLog.info('access point already removed.') - ## Return - return True, '' + tmpLog.info("access point already removed.") + # Return + return True, "" diff --git a/pandaharvester/harvestersweeper/slurm_sweeper.py b/pandaharvester/harvestersweeper/slurm_sweeper.py index c0af0e36..86a89aee 100644 --- a/pandaharvester/harvestersweeper/slurm_sweeper.py +++ b/pandaharvester/harvestersweeper/slurm_sweeper.py @@ -1,5 +1,6 @@ import os import shutil + try: import subprocess32 as subprocess except ImportError: @@ -9,7 +10,7 @@ from pandaharvester.harvestersweeper.base_sweeper import BaseSweeper # logger -baseLogger = core_utils.setup_logger('slurm_sweeper') +baseLogger = core_utils.setup_logger("slurm_sweeper") # plugin for sweeper with SLURM @@ -28,10 +29,9 @@ def kill_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='kill_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="kill_worker") # kill command - comStr = 'scancel {0}'.format(workspec.batchID) + comStr = "scancel 
{0}".format(workspec.batchID) # execute p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() @@ -42,9 +42,9 @@ def kill_worker(self, workspec): tmpLog.error(errStr) return False, errStr else: - tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.workerID)) + tmpLog.info("Succeeded to kill workerID={0} batchID={1}".format(workspec.workerID, workspec.workerID)) # return - return True, '' + return True, "" # cleanup for a worker def sweep_worker(self, workspec): @@ -56,13 +56,12 @@ def sweep_worker(self, workspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmpLog = self.make_logger(baseLogger, "workerID={0}".format(workspec.workerID), method_name="sweep_worker") # clean up worker directory if os.path.exists(workspec.accessPoint): shutil.rmtree(workspec.accessPoint) - tmpLog.info('removed {0}'.format(workspec.accessPoint)) + tmpLog.info("removed {0}".format(workspec.accessPoint)) else: - tmpLog.info('access point already removed.') + tmpLog.info("access point already removed.") # return - return True, '' + return True, "" diff --git a/pandaharvester/harvestertest/basicTest.py b/pandaharvester/harvestertest/basicTest.py index 826915db..ca312f70 100644 --- a/pandaharvester/harvestertest/basicTest.py +++ b/pandaharvester/harvestertest/basicTest.py @@ -4,6 +4,7 @@ import datetime from future.utils import iteritems from pandaharvester.harvesterconfig import harvester_config + try: os.remove(harvester_config.db.database_filename) except Exception: @@ -14,10 +15,10 @@ from pandaharvester.harvestercore.communicator_pool import CommunicatorPool for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): + if loggerName.startswith("panda.log"): if len(loggerObj.handlers) == 0: continue - if loggerName.split('.')[-1] in ['db_proxy']: + if loggerName.split(".")[-1] in ["db_proxy"]: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) @@ -39,4 +40,4 @@ a = CommunicatorPool() -a.get_jobs('siteName', 'nodeName', 'prodSourceLabel', 'computingElement', 1, {}) +a.get_jobs("siteName", "nodeName", "prodSourceLabel", "computingElement", 1, {}) diff --git a/pandaharvester/harvestertest/cacherTest.py b/pandaharvester/harvestertest/cacherTest.py index db9e9e08..072990d4 100644 --- a/pandaharvester/harvestertest/cacherTest.py +++ b/pandaharvester/harvestertest/cacherTest.py @@ -7,10 +7,10 @@ from pandaharvester.harvestercore.communicator_pool import CommunicatorPool for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): + if loggerName.startswith("panda.log"): if len(loggerObj.handlers) == 0: continue - if loggerName.split('.')[-1] not in ['cacher']: + if loggerName.split(".")[-1] not in ["cacher"]: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) @@ -21,4 +21,3 @@ cacher = Cacher(communicator, single_mode=True) cacher.execute(force_update=True, skip_lock=True) - diff --git a/pandaharvester/harvestertest/check_log_db_proxy_pool.py b/pandaharvester/harvestertest/check_log_db_proxy_pool.py index fc8cbc2a..ab7b8e10 100644 --- a/pandaharvester/harvestertest/check_log_db_proxy_pool.py +++ b/pandaharvester/harvestertest/check_log_db_proxy_pool.py @@ 
-6,13 +6,13 @@ from pandalogger import logger_config -logdir = logger_config.daemon['logdir'] +logdir = logger_config.daemon["logdir"] # collect data data = dict() -with open(os.path.join(logdir, 'panda-db_proxy_pool.log')) as f: +with open(os.path.join(logdir, "panda-db_proxy_pool.log")) as f: for line in f: - m = re.search(']+)> release lock .* (\d+\.\d+) ', line) + m = re.search("]+)> release lock .* (\d+\.\d+) ", line) if m is not None: method = m.group(1) exeTime = float(m.group(2)) @@ -39,19 +39,18 @@ aData[ave].append(method) # show average -aveList = aData.keys() -aveList.sort() +aveList = sorted(aData.keys()) aveList.reverse() print -print ('Execution time summary : top {0}'.format(nS)) -print (' average max n_call method') -print ('-------------------------------------------------') +print("Execution time summary : top {0}".format(nS)) +print(" average max n_call method") +print("-------------------------------------------------") i = 0 escape = False for ave in aveList: for method in aData[ave]: - print ('{0:8.2f} {1:8.2f} {2:8d} {3}'.format(ave, max(data[method]), len(data[method]), method)) + print("{0:8.2f} {1:8.2f} {2:8d} {3}".format(ave, max(data[method]), len(data[method]), method)) i += 1 if i >= nS: escape = True @@ -60,19 +59,18 @@ break # show longest methods -longList = lData.keys() -longList.sort() +longList = sorted(lData.keys()) longList.reverse() print -print ('Long execution method : top {0}'.format(nL)) -print (' method time') -print ('-------------------------------------------------') +print("Long execution method : top {0}".format(nL)) +print(" method time") +print("-------------------------------------------------") i = 0 escape = False for val in longList: for method in lData[val]: - print (' {0:30} {1:8.2f}'.format(method, val)) + print(" {0:30} {1:8.2f}".format(method, val)) i += 1 if i >= nL: escape = True diff --git a/pandaharvester/harvestertest/cleanDN.py b/pandaharvester/harvestertest/cleanDN.py index 812cd690..e19062b1 100644 --- a/pandaharvester/harvestertest/cleanDN.py +++ b/pandaharvester/harvestertest/cleanDN.py @@ -1,50 +1,52 @@ import re import sys + try: import subprocess32 as subprocess -except: +except BaseException: import subprocess def clean_user_id(id): try: - up = re.compile('/(DC|O|OU|C|L)=[^\/]+') - username = up.sub('', id) - up2 = re.compile('/CN=[0-9]+') - username = up2.sub('', username) - up3 = re.compile(' [0-9]+') - username = up3.sub('', username) - up4 = re.compile('_[0-9]+') - username = up4.sub('', username) - username = username.replace('/CN=proxy', '') - username = username.replace('/CN=limited proxy', '') - username = username.replace('limited proxy', '') - username = re.sub('/CN=Robot:[^/]+', '', username) - pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)') + up = re.compile("/(DC|O|OU|C|L)=[^\/]+") + username = up.sub("", id) + up2 = re.compile("/CN=[0-9]+") + username = up2.sub("", username) + up3 = re.compile(" [0-9]+") + username = up3.sub("", username) + up4 = re.compile("_[0-9]+") + username = up4.sub("", username) + username = username.replace("/CN=proxy", "") + username = username.replace("/CN=limited proxy", "") + username = username.replace("limited proxy", "") + username = re.sub("/CN=Robot:[^/]+", "", username) + pat = re.compile(".*/CN=([^\/]+)/CN=([^\/]+)") mat = pat.match(username) if mat: username = mat.group(2) else: - username = username.replace('/CN=', '') - if username.lower().find('/email') > 0: - username = username[:username.lower().find('/email')] - pat = re.compile('.*(limited.*proxy).*') + 
username = username.replace("/CN=", "") + if username.lower().find("/email") > 0: + username = username[: username.lower().find("/email")] + pat = re.compile(".*(limited.*proxy).*") mat = pat.match(username) if mat: username = mat.group(1) - username = username.replace('(', '') - username = username.replace(')', '') - username = username.replace("'", '') + username = username.replace("(", "") + username = username.replace(")", "") + username = username.replace("'", "") return username - except: + except BaseException: return id + certFile = sys.argv[1] com = "openssl x509 -noout -subject -in" p = subprocess.Popen(com.split() + [certFile], stdout=subprocess.PIPE) out, err = p.communicate() -out = re.sub('^subject=', '', out) +out = re.sub("^subject=", "", out) out = out.strip() -print ('DN: "{0}"'.format(out)) -print ('extracted: "{0}"'.format(clean_user_id(out))) +print('DN: "{0}"'.format(out)) +print('extracted: "{0}"'.format(clean_user_id(out))) diff --git a/pandaharvester/harvestertest/container_auxpreparator_test.py b/pandaharvester/harvestertest/container_auxpreparator_test.py index 372d49bd..ece39c72 100644 --- a/pandaharvester/harvestertest/container_auxpreparator_test.py +++ b/pandaharvester/harvestertest/container_auxpreparator_test.py @@ -3,7 +3,7 @@ import time from pprint import pprint -#from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper +# from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper from pandaharvester.harvestercore.job_spec import JobSpec from pandaharvester.harvesterextractor.aux_extractor import AuxExtractor @@ -11,13 +11,15 @@ job_data = json.loads(job_data_json) -job_data["jobPars"] = '--inputEVNTFile=EVNT.21265061._000036.pool.root.1 --maxEvents=1000 --postInclude "default:RecJobTransforms/UseFrontier.py" --preExec "EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)" "EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True" --preInclude "EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py" --skipEvents=1000 --firstEvent=331001 --outputHITSFile=HITS.21265064._002580.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=352 --DBRelease="all:current" --conditionsTag "default:OFLCOND-MC16-SDR-14" --geometryVersion="default:ATLAS-R2-2016-01-00-01_VALIDATION" --runNumber=830011 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus' +job_data[ + "jobPars" +] = '--inputEVNTFile=EVNT.21265061._000036.pool.root.1 --maxEvents=1000 --postInclude "default:RecJobTransforms/UseFrontier.py" --preExec "EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)" "EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True" --preInclude "EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py" --skipEvents=1000 --firstEvent=331001 --outputHITSFile=HITS.21265064._002580.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=352 --DBRelease="all:current" --conditionsTag "default:OFLCOND-MC16-SDR-14" --geometryVersion="default:ATLAS-R2-2016-01-00-01_VALIDATION" --runNumber=830011 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus' jobSpec = JobSpec() jobSpec.convert_job_json(job_data) -#pprint(jobSpec.jobParams) +# pprint(jobSpec.jobParams) ae = AuxExtractor() print(ae.get_aux_inputs(jobSpec)) diff --git a/pandaharvester/harvestertest/credMangerTest.py 
b/pandaharvester/harvestertest/credMangerTest.py index 50b8b741..322bb633 100644 --- a/pandaharvester/harvestertest/credMangerTest.py +++ b/pandaharvester/harvestertest/credMangerTest.py @@ -24,12 +24,12 @@ def get_list(data): moduleNames = get_list(harvester_config.credmanager.moduleName) classNames = get_list(harvester_config.credmanager.className) # file names of original certificates -if hasattr(harvester_config.credmanager, 'inCertFile'): +if hasattr(harvester_config.credmanager, "inCertFile"): inCertFiles = get_list(harvester_config.credmanager.inCertFile) else: inCertFiles = get_list(harvester_config.credmanager.certFile) # file names of certificates to be generated -if hasattr(harvester_config.credmanager, 'outCertFile'): +if hasattr(harvester_config.credmanager, "outCertFile"): outCertFiles = get_list(harvester_config.credmanager.outCertFile) else: # use the file name of the certificate for panda connection as output name @@ -38,27 +38,26 @@ def get_list(data): vomses = get_list(harvester_config.credmanager.voms) # logger -_logger = core_utils.setup_logger('credManagerTest') +_logger = core_utils.setup_logger("credManagerTest") # get plugin(s) exeCores = [] -for moduleName, className, inCertFile, outCertFile, voms in \ - zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses): +for moduleName, className, inCertFile, outCertFile, voms in zip(moduleNames, classNames, inCertFiles, outCertFiles, vomses): pluginPar = {} - pluginPar['module'] = moduleName - pluginPar['name'] = className - pluginPar['inCertFile'] = inCertFile - pluginPar['outCertFile'] = outCertFile - pluginPar['voms'] = voms + pluginPar["module"] = moduleName + pluginPar["name"] = className + pluginPar["inCertFile"] = inCertFile + pluginPar["outCertFile"] = outCertFile + pluginPar["voms"] = voms exeCore = pluginFactory.get_plugin(pluginPar) exeCores.append(exeCore) # setup logger to write to screen also for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): + if loggerName.startswith("panda.log"): if len(loggerObj.handlers) == 0: continue - if loggerName.split('.')[-1] in ['db_proxy']: + if loggerName.split(".")[-1] in ["db_proxy"]: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) @@ -70,22 +69,20 @@ def get_list(data): if exeCore is None: continue # make logger - mainLog = core_utils.make_logger(_logger, "{0} {1}".format(exeCore.__class__.__name__, - exeCore.outCertFile), - method_name='execute') + mainLog = core_utils.make_logger(_logger, "{0} {1}".format(exeCore.__class__.__name__, exeCore.outCertFile), method_name="execute") # list the plugin name - mainLog.debug('plugin={0}'.format(exeCore.__class__.__name__)) + mainLog.debug("plugin={0}".format(exeCore.__class__.__name__)) # check credential - mainLog.debug('check credential') + mainLog.debug("check credential") isValid = exeCore.check_credential() if isValid: - mainLog.debug('valid') + mainLog.debug("valid") elif not isValid: # renew it if necessary - mainLog.debug('invalid') - mainLog.debug('renew credential') + mainLog.debug("invalid") + mainLog.debug("renew credential") tmpStat, tmpOut = exeCore.renew_credential() if not tmpStat: - mainLog.error('failed : {0}'.format(tmpOut)) + mainLog.error("failed : {0}".format(tmpOut)) continue - mainLog.debug('done') + mainLog.debug("done") diff --git a/pandaharvester/harvestertest/encryptForWatcher.py b/pandaharvester/harvestertest/encryptForWatcher.py index 51bd1df2..16cbb40e 100644 
--- a/pandaharvester/harvestertest/encryptForWatcher.py +++ b/pandaharvester/harvestertest/encryptForWatcher.py @@ -3,14 +3,14 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import harvester_config -if not hasattr(harvester_config.watcher, 'passphraseEnv'): - print ('ERROR: passphraseEnv is not defined in the watcher section of etc/panda/panda_harvester.cfg') +if not hasattr(harvester_config.watcher, "passphraseEnv"): + print("ERROR: passphraseEnv is not defined in the watcher section of etc/panda/panda_harvester.cfg") sys.exit(1) envName = harvester_config.watcher.passphraseEnv if envName not in os.environ: - print ("ERROR: env variable {0} is undefined in etc/sysconfig/panda_harvester".format(envName)) + print("ERROR: env variable {0} is undefined in etc/sysconfig/panda_harvester".format(envName)) sys.exit(1) key = os.environ[envName] @@ -18,11 +18,11 @@ cipher_text = core_utils.encrypt_string(key, secret) -print ("original: {0}".format(secret)) -print ("encrypted: {0}".format(cipher_text)) +print("original: {0}".format(secret)) +print("encrypted: {0}".format(cipher_text)) plain_text = core_utils.decrypt_string(key, cipher_text) -print ("decrypted: {0}".format(plain_text)) +print("decrypted: {0}".format(plain_text)) if secret != plain_text: - print ("ERROR: the encrypted string cannot be correctly decrypted") + print("ERROR: the encrypted string cannot be correctly decrypted") diff --git a/pandaharvester/harvestertest/further_testing_go_bulk_preparator-test.py b/pandaharvester/harvestertest/further_testing_go_bulk_preparator-test.py index 0afa6bad..927c1448 100644 --- a/pandaharvester/harvestertest/further_testing_go_bulk_preparator-test.py +++ b/pandaharvester/harvestertest/further_testing_go_bulk_preparator-test.py @@ -18,7 +18,7 @@ from pandaharvester.harvesterbody.cacher import Cacher from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestermisc import globus_utils from globus_sdk import TransferClient @@ -27,9 +27,9 @@ from globus_sdk import RefreshTokenAuthorizer -#initial variables -fileTableName = 'file_table' -queueName = 'ALCF_Theta' +# initial variables +fileTableName = "file_table" +queueName = "ALCF_Theta" job_id = 0 end_job_id = 1113 globus_sleep_time = 15 @@ -39,26 +39,25 @@ def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) if len(sys.argv) > 1: - queueName = sys.argv[1] + queueName = sys.argv[1] if len(sys.argv) > 2: - job_id = int(sys.argv[2]) -#if len(sys.argv) > 3: + job_id = int(sys.argv[2]) +# if len(sys.argv) > 3: # end_job_id = int(sys.argv[3]) -#if len(sys.argv) > 4: +# if len(sys.argv) > 4: # globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator -queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator' -queueConfig.preparator['name'] = 'GlobusBulkPreparator' +queueConfig.preparator["module"] = "pandaharvester.harvesterpreparator.go_bulk_preparator" +queueConfig.preparator["name"] = "GlobusBulkPreparator" modified_queueConfig_preparator = queueConfig.preparator 
pluginFactory = PluginFactory() @@ -66,20 +65,20 @@ def dump(obj): preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger -_logger = core_utils.setup_logger('further_testing_go_bulk_preparator') -tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_preparator') -tmpLog.debug('start') +_logger = core_utils.setup_logger("further_testing_go_bulk_preparator") +tmpLog = core_utils.make_logger(_logger, method_name="further_testing_go_bulk_preparator") +tmpLog.debug("start") for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) + # print "loggerName - {}".format(loggerName) + if loggerName.startswith("panda.log"): + if len(loggerObj.handlers) == 0: + continue + if loggerName.split(".")[-1] in ["db_proxy"]: + continue + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) + loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) tmpLog.debug(msgStr) @@ -88,7 +87,7 @@ def dump(obj): msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator) tmpLog.debug(msgStr) -scope = 'panda' +scope = "panda" proxy = DBProxy() communicator = CommunicatorPool() @@ -97,83 +96,80 @@ def dump(obj): tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__)) tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath) - + # get all jobs in table in a preparing substate -#tmpLog.debug('try to get all jobs in a preparing substate') -#jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None) -# get all jobs -if job_id > 0 : - tmpLog.debug('try to get job ID - {}'.format(job_id)) - jobSpec_list = [proxy.get_job(job_id)] -else : - tmpLog.debug('try to get all jobs') - jobSpec_list = proxy.get_jobs() +# tmpLog.debug('try to get all jobs in a preparing substate') +# jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None) +# get all jobs +if job_id > 0: + tmpLog.debug("try to get job ID - {}".format(job_id)) + jobSpec_list = [proxy.get_job(job_id)] +else: + tmpLog.debug("try to get all jobs") + jobSpec_list = proxy.get_jobs() -tmpLog.debug('got {0} jobs'.format(len(jobSpec_list))) +tmpLog.debug("got {0} jobs".format(len(jobSpec_list))) # loop over all found jobs -if len(jobSpec_list) > 0 : +if len(jobSpec_list) > 0: for jobSpec in jobSpec_list: # if user entered a job id check for it - if job_id > 0 : - if jobSpec.PandaID != job_id : - continue - tmpLog.debug(' PandaID = %d status = %s subStatus = %s lockedBy = %s' % - (jobSpec.PandaID,jobSpec.status,jobSpec.subStatus,jobSpec.lockedBy)) + if job_id > 0: + if jobSpec.PandaID != job_id: + continue + tmpLog.debug(" PandaID = %d status = %s subStatus = %s lockedBy = %s" % (jobSpec.PandaID, jobSpec.status, jobSpec.subStatus, jobSpec.lockedBy)) # get the transfer groups groups = jobSpec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('jobspec.get_groups_of_input_files(skip_ready=True) = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_input_files(skip_ready=True) = : {0}".format(groups)) groups = 
jobSpec.get_groups_of_input_files() - tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_input_files() = : {0}".format(groups)) # get the number of input files - tmpLog.debug('Number of input files - {}'.format(len(jobSpec.inFiles))) + tmpLog.debug("Number of input files - {}".format(len(jobSpec.inFiles))) # loop over the groups and get the number of files per group - for group in groups: - tmpLog.debug('file group id - {0} number of input files - {1}'.format(group,len(jobSpec.get_input_file_specs(group)))) + for group in groups: + tmpLog.debug("file group id - {0} number of input files - {1}".format(group, len(jobSpec.get_input_file_specs(group)))) inFiles = jobSpec.get_input_file_attributes(skip_ready=True) - tmpLog.debug('number of input files from get_input_file_attributes - {}'.format(len(inFiles))) - - + tmpLog.debug("number of input files from get_input_file_attributes - {}".format(len(inFiles))) lfns = inFiles.keys() - tmpLog.debug('number of input files from inFiles.keys() - {}'.format(len(lfns))) + tmpLog.debug("number of input files from inFiles.keys() - {}".format(len(lfns))) - tmpLog.debug('{}'.format(lfns)) + tmpLog.debug("{}".format(lfns)) for inLFN in inFiles.keys(): - lfns.append(inLFN) - tmpLog.debug('number of input files from append inFiles.keys() - {}'.format(len(lfns))) + lfns.append(inLFN) + tmpLog.debug("number of input files from append inFiles.keys() - {}".format(len(lfns))) sys.exit(0) # loop over groups keys to see if db is locked for key in groups: - locked = preparatorCore.dbInterface.get_object_lock(key,lock_interval=120) - if not locked: - tmpLog.debug('DB Already locked by another thread') - # now unlock db - unlocked = preparatorCore.dbInterface.release_object_lock(key) - if unlocked : - tmpLog.debug('unlocked db') - else: - tmpLog.debug(' Could not unlock db') + locked = preparatorCore.dbInterface.get_object_lock(key, lock_interval=120) + if not locked: + tmpLog.debug("DB Already locked by another thread") + # now unlock db + unlocked = preparatorCore.dbInterface.release_object_lock(key) + if unlocked: + tmpLog.debug("unlocked db") + else: + tmpLog.debug(" Could not unlock db") # print out jobSpec PandID msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) tmpLog.debug(msgStr) - #msgStr = "testing trigger_preparation" - #tmpLog.debug(msgStr) - #tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) - #if tmpStat: + # msgStr = "testing trigger_preparation" + # tmpLog.debug(msgStr) + # tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) + # if tmpStat: # msgStr = " OK " # tmpLog.debug(msgStr) - #elif tmpStat == None: + # elif tmpStat == None: # msgStr = " Temporary failure NG {0}".format(tmpOut) # tmpLog.debug(msgStr) - #elif not tmpStat: + # elif not tmpStat: # msgStr = " No Good {0}".format(tmpOut) # tmpLog.debug(msgStr) # sys.exit(1) @@ -184,23 +180,22 @@ def dump(obj): tmpLog.debug(msgStr) # modify dummy_transfer_id from groups of input files for key in groups: - preparatorCore.set_dummy_transfer_id_testing(key) - msgStr = "Revised dummy_transfer_id = {}".format(preparatorCore.get_dummy_transfer_id()) - tmpLog.debug(msgStr) - files = proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - tmpLog.debug("proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) - files = preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - 
tmpLog.debug("preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpLog.debug(msgStr) - tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - + preparatorCore.set_dummy_transfer_id_testing(key) + msgStr = "Revised dummy_transfer_id = {}".format(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug(msgStr) + files = proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug("proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) + files = preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug("preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpLog.debug(msgStr) + tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) diff --git a/pandaharvester/harvestertest/further_testing_go_bulk_preparator.py b/pandaharvester/harvestertest/further_testing_go_bulk_preparator.py index 93ea6763..e1b56dbb 100644 --- a/pandaharvester/harvestertest/further_testing_go_bulk_preparator.py +++ b/pandaharvester/harvestertest/further_testing_go_bulk_preparator.py @@ -18,7 +18,7 @@ from pandaharvester.harvesterbody.cacher import Cacher from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestermisc import globus_utils from globus_sdk import TransferClient @@ -27,9 +27,9 @@ from globus_sdk import RefreshTokenAuthorizer -#initial variables -fileTableName = 'file_table' -queueName = 'ALCF_Theta' +# initial variables +fileTableName = "file_table" +queueName = "ALCF_Theta" begin_job_id = 1111 end_job_id = 1113 globus_sleep_time = 15 @@ -39,26 +39,25 @@ def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) if len(sys.argv) > 1: - queueName = sys.argv[1] -#if len(sys.argv) > 2: + queueName = sys.argv[1] +# if len(sys.argv) > 2: # begin_job_id = int(sys.argv[2]) -#if len(sys.argv) > 3: +# if len(sys.argv) > 3: # end_job_id = int(sys.argv[3]) -#if len(sys.argv) > 4: +# if len(sys.argv) > 4: # globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator -queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator' -queueConfig.preparator['name'] = 'GlobusBulkPreparator' +queueConfig.preparator["module"] = "pandaharvester.harvesterpreparator.go_bulk_preparator" 
+queueConfig.preparator["name"] = "GlobusBulkPreparator" modified_queueConfig_preparator = queueConfig.preparator pluginFactory = PluginFactory() @@ -66,20 +65,20 @@ def dump(obj): preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger -_logger = core_utils.setup_logger('further_testing_go_bulk_preparator') -tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_preparator') -tmpLog.debug('start') +_logger = core_utils.setup_logger("further_testing_go_bulk_preparator") +tmpLog = core_utils.make_logger(_logger, method_name="further_testing_go_bulk_preparator") +tmpLog.debug("start") for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) + # print "loggerName - {}".format(loggerName) + if loggerName.startswith("panda.log"): + if len(loggerObj.handlers) == 0: + continue + if loggerName.split(".")[-1] in ["db_proxy"]: + continue + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) + loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) tmpLog.debug(msgStr) @@ -88,7 +87,7 @@ def dump(obj): msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator) tmpLog.debug(msgStr) -scope = 'panda' +scope = "panda" proxy = DBProxy() communicator = CommunicatorPool() @@ -97,43 +96,42 @@ def dump(obj): tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__)) tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath) - + # get all jobs in table in a preparing substate -tmpLog.debug('try to get all jobs in a preparing substate') -jobSpec_list = proxy.get_jobs_in_sub_status('preparing',2000,None,None,None,None,None,None) -tmpLog.debug('got {0} jobs'.format(len(jobSpec_list))) +tmpLog.debug("try to get all jobs in a preparing substate") +jobSpec_list = proxy.get_jobs_in_sub_status("preparing", 2000, None, None, None, None, None, None) +tmpLog.debug("got {0} jobs".format(len(jobSpec_list))) # loop over all found jobs -if len(jobSpec_list) > 0 : +if len(jobSpec_list) > 0: for jobSpec in jobSpec_list: - tmpLog.debug(' PandaID = %d status = %s subStatus = %s lockedBy = %s' % - (jobSpec.PandaID,jobSpec.status,jobSpec.subStatus,jobSpec.lockedBy)) + tmpLog.debug(" PandaID = %d status = %s subStatus = %s lockedBy = %s" % (jobSpec.PandaID, jobSpec.status, jobSpec.subStatus, jobSpec.lockedBy)) # get the transfer groups groups = jobSpec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_input_files() = : {0}".format(groups)) # loop over groups keys to see if db is locked for key in groups: - locked = preparatorCore.dbInterface.get_object_lock(key,lock_interval=120) - if not locked: - tmpLog.debug('DB Already locked by another thread') - # now unlock db - unlocked = preparatorCore.dbInterface.release_object_lock(key) - if unlocked : - tmpLog.debug('unlocked db') - else: - tmpLog.debug(' Could not unlock db') + locked = preparatorCore.dbInterface.get_object_lock(key, lock_interval=120) + if not locked: + tmpLog.debug("DB Already locked by another 
thread") + # now unlock db + unlocked = preparatorCore.dbInterface.release_object_lock(key) + if unlocked: + tmpLog.debug("unlocked db") + else: + tmpLog.debug(" Could not unlock db") # print out jobSpec PandID msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) tmpLog.debug(msgStr) - #msgStr = "testing trigger_preparation" - #tmpLog.debug(msgStr) - #tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) - #if tmpStat: + # msgStr = "testing trigger_preparation" + # tmpLog.debug(msgStr) + # tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) + # if tmpStat: # msgStr = " OK " # tmpLog.debug(msgStr) - #elif tmpStat == None: + # elif tmpStat == None: # msgStr = " Temporary failure NG {0}".format(tmpOut) # tmpLog.debug(msgStr) - #elif not tmpStat: + # elif not tmpStat: # msgStr = " No Good {0}".format(tmpOut) # tmpLog.debug(msgStr) # sys.exit(1) @@ -144,23 +142,22 @@ def dump(obj): tmpLog.debug(msgStr) # modify dummy_transfer_id from groups of input files for key in groups: - preparatorCore.set_dummy_transfer_id_testing(key) - msgStr = "Revised dummy_transfer_id = {}".format(preparatorCore.get_dummy_transfer_id()) - tmpLog.debug(msgStr) - files = proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - tmpLog.debug("proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) - files = preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) - tmpLog.debug("preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpLog.debug(msgStr) - tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - + preparatorCore.set_dummy_transfer_id_testing(key) + msgStr = "Revised dummy_transfer_id = {}".format(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug(msgStr) + files = proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug("proxy.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) + files = preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) + tmpLog.debug("preparatorCore.dbInterface.get_files_with_group_id(preparatorCore.get_dummy_transfer_id()) = {0}".format(files)) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpLog.debug(msgStr) + tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) diff --git a/pandaharvester/harvestertest/further_testing_go_bulk_stager.py b/pandaharvester/harvestertest/further_testing_go_bulk_stager.py index 92454585..fa52a912 100644 --- a/pandaharvester/harvestertest/further_testing_go_bulk_stager.py +++ b/pandaharvester/harvestertest/further_testing_go_bulk_stager.py @@ -18,7 +18,7 @@ from pandaharvester.harvesterbody.cacher import Cacher from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestercore.communicator_pool import 
CommunicatorPool -from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestermisc import globus_utils from globus_sdk import TransferClient @@ -27,9 +27,9 @@ from globus_sdk import RefreshTokenAuthorizer -#initial variables -fileTableName = 'file_table' -queueName = 'ALCF_Theta' +# initial variables +fileTableName = "file_table" +queueName = "ALCF_Theta" begin_job_id = 1111 end_job_id = 1113 globus_sleep_time = 15 @@ -39,26 +39,25 @@ def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) - + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) if len(sys.argv) > 1: - queueName = sys.argv[1] -#if len(sys.argv) > 2: + queueName = sys.argv[1] +# if len(sys.argv) > 2: # begin_job_id = int(sys.argv[2]) -#if len(sys.argv) > 3: +# if len(sys.argv) > 3: # end_job_id = int(sys.argv[3]) -#if len(sys.argv) > 4: +# if len(sys.argv) > 4: # globus_sleep_time = int(sys.argv[4]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_stager = queueConfig.stager -queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager' -queueConfig.stager['name'] = 'GlobusBulkStager' +queueConfig.stager["module"] = "pandaharvester.harvesterstager.go_bulk_stager" +queueConfig.stager["name"] = "GlobusBulkStager" modified_queueConfig_stager = queueConfig.stager pluginFactory = PluginFactory() @@ -66,20 +65,20 @@ def dump(obj): stagerCore = pluginFactory.get_plugin(queueConfig.stager) # logger -_logger = core_utils.setup_logger('further_testing_go_bulk_stager') -tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_stager') -tmpLog.debug('start') +_logger = core_utils.setup_logger("further_testing_go_bulk_stager") +tmpLog = core_utils.make_logger(_logger, method_name="further_testing_go_bulk_stager") +tmpLog.debug("start") for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) + # print "loggerName - {}".format(loggerName) + if loggerName.startswith("panda.log"): + if len(loggerObj.handlers) == 0: + continue + if loggerName.split(".")[-1] in ["db_proxy"]: + continue + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) + loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(stagerCore.__class__.__name__) tmpLog.debug(msgStr) @@ -88,7 +87,7 @@ def dump(obj): msgStr = "Modified queueConfig.stager = {}".format(modified_queueConfig_stager) tmpLog.debug(msgStr) -scope = 'panda' +scope = "panda" proxy = DBProxy() communicator = CommunicatorPool() @@ -97,43 +96,42 @@ def dump(obj): tmpLog.debug("plugin={0}".format(stagerCore.__class__.__name__)) tmpLog.debug("BasePath from stager configuration: %s " % stagerCore.basePath) - + # get all jobs in table in a preparing substate -tmpLog.debug('try to get all jobs in a transferring substate') -jobSpec_list = proxy.get_jobs_in_sub_status('transferring',2000,None,None,None,None,None,None) -tmpLog.debug('got {0} jobs'.format(len(jobSpec_list))) +tmpLog.debug("try to get all jobs in a 
transferring substate") +jobSpec_list = proxy.get_jobs_in_sub_status("transferring", 2000, None, None, None, None, None, None) +tmpLog.debug("got {0} jobs".format(len(jobSpec_list))) # loop over all found jobs -if len(jobSpec_list) > 0 : +if len(jobSpec_list) > 0: for jobSpec in jobSpec_list: - tmpLog.debug(' PandaID = %d status = %s subStatus = %s lockedBy = %s' % - (jobSpec.PandaID,jobSpec.status,jobSpec.subStatus,jobSpec.lockedBy)) + tmpLog.debug(" PandaID = %d status = %s subStatus = %s lockedBy = %s" % (jobSpec.PandaID, jobSpec.status, jobSpec.subStatus, jobSpec.lockedBy)) # get the transfer groups groups = jobSpec.get_groups_of_input_files(skip_ready=True) - tmpLog.debug('jobspec.get_groups_of_input_files() = : {0}'.format(groups)) + tmpLog.debug("jobspec.get_groups_of_input_files() = : {0}".format(groups)) # loop over groups keys to see if db is locked for key in groups: - locked = stagerCore.dbInterface.get_object_lock(key,lock_interval=120) - if not locked: - tmpLog.debug('DB Already locked by another thread') - # now unlock db - unlocked = stagerCore.dbInterface.release_object_lock(key) - if unlocked : - tmpLog.debug('unlocked db') - else: - tmpLog.debug(' Could not unlock db') + locked = stagerCore.dbInterface.get_object_lock(key, lock_interval=120) + if not locked: + tmpLog.debug("DB Already locked by another thread") + # now unlock db + unlocked = stagerCore.dbInterface.release_object_lock(key) + if unlocked: + tmpLog.debug("unlocked db") + else: + tmpLog.debug(" Could not unlock db") # print out jobSpec PandID msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) tmpLog.debug(msgStr) - #msgStr = "testing trigger_preparation" - #tmpLog.debug(msgStr) - #tmpStat, tmpOut = stagerCore.trigger_preparation(jobSpec) - #if tmpStat: + # msgStr = "testing trigger_preparation" + # tmpLog.debug(msgStr) + # tmpStat, tmpOut = stagerCore.trigger_preparation(jobSpec) + # if tmpStat: # msgStr = " OK " # tmpLog.debug(msgStr) - #elif tmpStat == None: + # elif tmpStat == None: # msgStr = " Temporary failure NG {0}".format(tmpOut) # tmpLog.debug(msgStr) - #elif not tmpStat: + # elif not tmpStat: # msgStr = " No Good {0}".format(tmpOut) # tmpLog.debug(msgStr) # sys.exit(1) @@ -144,23 +142,22 @@ def dump(obj): tmpLog.debug(msgStr) # modify dummy_transfer_id from groups of input files for key in groups: - stagerCore.set_dummy_transfer_id_testing(key) - msgStr = "Revised dummy_transfer_id = {}".format(stagerCore.get_dummy_transfer_id()) - tmpLog.debug(msgStr) - files = proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) - tmpLog.debug("Number proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) = {0}".format(len(files))) - files = stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) - tmpLog.debug("Number stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) = {0}".format(len(files))) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpLog.debug(msgStr) - tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " No Good {0}".format(tmpOut) - tmpLog.debug(msgStr) - + stagerCore.set_dummy_transfer_id_testing(key) + msgStr = "Revised dummy_transfer_id = {}".format(stagerCore.get_dummy_transfer_id()) + tmpLog.debug(msgStr) + files = 
proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) + tmpLog.debug("Number proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) = {0}".format(len(files))) + files = stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) + tmpLog.debug("Number stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) = {0}".format(len(files))) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpLog.debug(msgStr) + tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " No Good {0}".format(tmpOut) + tmpLog.debug(msgStr) diff --git a/pandaharvester/harvestertest/getEventRangesTest.py b/pandaharvester/harvestertest/getEventRangesTest.py index 7a5b742f..339caaf8 100644 --- a/pandaharvester/harvestertest/getEventRangesTest.py +++ b/pandaharvester/harvestertest/getEventRangesTest.py @@ -13,11 +13,7 @@ except Exception: n = 1 -data = {pandaid: {'pandaID': pandaid, - 'taskID': taskid, - 'jobsetID': jobsetid, - 'nRanges': n} - } +data = {pandaid: {"pandaID": pandaid, "taskID": taskid, "jobsetID": jobsetid, "nRanges": n}} a = CommunicatorPool() o = a.get_event_ranges(data, False, os.getcwd()) diff --git a/pandaharvester/harvestertest/getEvents.py b/pandaharvester/harvestertest/getEvents.py index ee23d353..55decc40 100644 --- a/pandaharvester/harvestertest/getEvents.py +++ b/pandaharvester/harvestertest/getEvents.py @@ -15,13 +15,13 @@ try: os.makedirs(accessPoint) -except: +except BaseException: pass node = {} -node['pandaID'] = jobSpec.PandaID -node['jobsetID'] = jobSpec.jobParams['jobsetID'] -node['taskID'] = jobSpec.taskID +node["pandaID"] = jobSpec.PandaID +node["jobsetID"] = jobSpec.jobParams["jobsetID"] +node["taskID"] = jobSpec.taskID a = CommunicatorPool() diff --git a/pandaharvester/harvestertest/getJob.py b/pandaharvester/harvestertest/getJob.py index 4c7dfeb8..e999bc5b 100644 --- a/pandaharvester/harvestertest/getJob.py +++ b/pandaharvester/harvestertest/getJob.py @@ -1,9 +1,10 @@ +from pandaharvester.harvestermessenger import shared_file_messenger +from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy import os import sys workerID = int(sys.argv[1]) -from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy proxy = DBProxy() workSpec = proxy.get_worker_with_id(workerID) @@ -12,10 +13,9 @@ try: os.makedirs(accessPoint) -except: +except BaseException: pass -from pandaharvester.harvestermessenger import shared_file_messenger -f = open(os.path.join(accessPoint, shared_file_messenger.jsonJobRequestFileName), 'w') +f = open(os.path.join(accessPoint, shared_file_messenger.jsonJobRequestFileName), "w") f.close() diff --git a/pandaharvester/harvestertest/getJobs.py b/pandaharvester/harvestertest/getJobs.py index ace22ad0..1aee06e0 100644 --- a/pandaharvester/harvestertest/getJobs.py +++ b/pandaharvester/harvestertest/getJobs.py @@ -10,10 +10,10 @@ from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): + if loggerName.startswith("panda.log"): if len(loggerObj.handlers) == 0: continue - if loggerName.split('.')[-1] in ['db_proxy']: + if loggerName.split(".")[-1] in ["db_proxy"]: continue stdoutHandler = 
logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) @@ -26,13 +26,10 @@ proxy = DBProxy() # get all jobs in table -print ('try to get all jobs') +print("try to get all jobs") alljobs = proxy.get_jobs() -print ('got {0} jobs'.format(len(alljobs))) +print("got {0} jobs".format(len(alljobs))) # loop over all found jobs -if len(alljobs) > 0 : +if len(alljobs) > 0: for jobSpec in alljobs: - print (' PandaID = %d status = %s subStatus = %s lockedBy = %s' % - (jobSpec.PandaID,jobSpec.status,jobSpec.subStatus,jobSpec.lockedBy)) - - + print(" PandaID = %d status = %s subStatus = %s lockedBy = %s" % (jobSpec.PandaID, jobSpec.status, jobSpec.subStatus, jobSpec.lockedBy)) diff --git a/pandaharvester/harvestertest/k8s_node_occupancy.py b/pandaharvester/harvestertest/k8s_node_occupancy.py index f380db79..64f2430c 100644 --- a/pandaharvester/harvestertest/k8s_node_occupancy.py +++ b/pandaharvester/harvestertest/k8s_node_occupancy.py @@ -1,8 +1,8 @@ from kubernetes import client, config from kubernetes.client.rest import ApiException -config.load_kube_config(config_file='/opt/harvester_k8s/kubeconf') -namespace = '' # namespace needs to be set or read from queue configuration +config.load_kube_config(config_file="/opt/harvester_k8s/kubeconf") +namespace = "" # namespace needs to be set or read from queue configuration corev1 = client.CoreV1Api() batchv1 = client.BatchV1Api() @@ -15,27 +15,27 @@ # get nodes and make a dictionary with the available ones nodes = corev1.list_node() for node in nodes.items: - print('Processing {0}'.format(node.metadata.name)) + print("Processing {0}".format(node.metadata.name)) for condition in node.status.conditions: - print('Condition: {0}, status: {1}'.format(condition.type, type(condition.status))) - if condition.type == 'Ready' and condition.status == 'Unknown': + print("Condition: {0}, status: {1}".format(condition.type, type(condition.status))) + if condition.type == "Ready" and condition.status == "Unknown": nodes_unav.append(node.metadata.name) - elif condition.type == 'Ready' and condition.status == 'True': + elif condition.type == "Ready" and condition.status == "True": nodes_av.setdefault(node.metadata.name, []) -print('Unavailable nodes: {0}'.format(len(nodes_unav))) -print('Available nodes: {0}'.format(len(nodes_av.keys()))) +print("Unavailable nodes: {0}".format(len(nodes_unav))) +print("Available nodes: {0}".format(len(nodes_av.keys()))) # get pods and pack them into the available nodes pods = corev1.list_namespaced_pod(namespace=namespace) for pod in pods.items: print(pod.metadata.name) cpus = 0 - if pod.status.phase == 'Running' and pod.spec.containers: + if pod.status.phase == "Running" and pod.spec.containers: for container in pod.spec.containers: print(container.resources.limits) - if container.resources.limits and 'cpu' in container.resources.limits: - cpus += int(container.resources.limits['cpu']) + if container.resources.limits and "cpu" in container.resources.limits: + cpus += int(container.resources.limits["cpu"]) try: nodes_av[pod.spec.node_name].append(cpus) except KeyError: @@ -43,4 +43,4 @@ for node in nodes_av: if not sum(nodes_av[node]): - print('{0}: occupied cpus {1}'.format(node, nodes_av[node])) + print("{0}: occupied cpus {1}".format(node, nodes_av[node])) diff --git a/pandaharvester/harvestertest/lancium/clean.py b/pandaharvester/harvestertest/lancium/clean.py index 7432048a..4e6a97b4 100644 --- a/pandaharvester/harvestertest/lancium/clean.py +++ b/pandaharvester/harvestertest/lancium/clean.py @@ 
-2,7 +2,7 @@ from lancium.api.Job import Job if len(sys.argv) != 2: - print('Pass the job id as argument') + print("Pass the job id as argument") return job_id = sys.argv[1] diff --git a/pandaharvester/harvestertest/lancium/constants.py b/pandaharvester/harvestertest/lancium/constants.py index 28e1deeb..d1fe82e9 100644 --- a/pandaharvester/harvestertest/lancium/constants.py +++ b/pandaharvester/harvestertest/lancium/constants.py @@ -1,9 +1,9 @@ # voms proxy definitions -voms_local_path = '/root/lancium/voms' -voms_lancium_path = '/secrets/test1' -voms_job_path = 'voms' +voms_local_path = "/root/lancium/voms" +voms_lancium_path = "/secrets/test1" +voms_job_path = "voms" # pilot starter script -script_local_path = '/root/lancium/pilots_starter.py' -script_lancium_path = '/scripts/pilots_starter.py' -script_job_path = 'pilots_starter.py' \ No newline at end of file +script_local_path = "/root/lancium/pilots_starter.py" +script_lancium_path = "/scripts/pilots_starter.py" +script_job_path = "pilots_starter.py" diff --git a/pandaharvester/harvestertest/lancium/file_uploads.py b/pandaharvester/harvestertest/lancium/file_uploads.py index 3621bb3c..c5b11c3f 100644 --- a/pandaharvester/harvestertest/lancium/file_uploads.py +++ b/pandaharvester/harvestertest/lancium/file_uploads.py @@ -2,6 +2,7 @@ from lancium.api.Data import Data from constants import voms_local_path, voms_lancium_path, script_local_path, script_lancium_path + def fake_callback(total_chunks, current_chunk): pass @@ -9,13 +10,13 @@ def fake_callback(total_chunks, current_chunk): # https://lancium.github.io/compute-api-docs/library/lancium/api/Data.html#Data.create # 1. Upload a fake voms proxy -data = Data().create(voms_lancium_path, 'file', source=os.path.abspath(voms_local_path), force=True) +data = Data().create(voms_lancium_path, "file", source=os.path.abspath(voms_local_path), force=True) data.upload(os.path.abspath(voms_local_path), fake_callback) ex = data.show(voms_lancium_path)[0] print(ex.__dict__) # 2. Upload the pilot starter -data = Data().create(script_lancium_path, 'file', source=os.path.abspath(script_local_path), force=True) +data = Data().create(script_lancium_path, "file", source=os.path.abspath(script_local_path), force=True) data.upload(os.path.abspath(script_local_path), fake_callback) ex = data.show(script_lancium_path)[0] -print(ex.__dict__) \ No newline at end of file +print(ex.__dict__) diff --git a/pandaharvester/harvestertest/lancium/submit.py b/pandaharvester/harvestertest/lancium/submit.py index 59621802..20cd5494 100644 --- a/pandaharvester/harvestertest/lancium/submit.py +++ b/pandaharvester/harvestertest/lancium/submit.py @@ -9,54 +9,46 @@ memory = 1 scratch = 20 -params = {'name': 'grid-job-{0}'.format(worker_id), - 'command_line': 'python voms/scripts/pilots_starter.py', - 'image': 'harvester/centos7-singularity', - 'resources': {'core_count': core_count, - 'memory': memory, - 'scratch': scratch - }, - 'input_files': [ - {"source_type": "data", - "data": voms_lancium_path, - "name": voms_job_path - }, - {"source_type": "data", - "data": script_lancium_path, - "name": script_job_path - } - ], - # 'output_files': RETRIEVE THE PILOT LOG AND STORE IT IN HARVESTER? 
- 'environment': ( - # pilotUrlOpt, stdout_name - {'variable': 'PILOT_NOKILL', 'value': 'True'}, - {'variable': 'computingSite', 'value': 'GOOGLE_EUW1'}, - {'variable': 'pandaQueueName', 'value': 'GOOGLE_EUW1'}, - {'variable': 'resourceType', 'value': 'SCORE'}, - {'variable': 'prodSourceLabel', 'value': 'managed'}, - {'variable': 'pilotType', 'value': 'PR'}, - #{'variable': 'pythonOption', 'value': '--pythonversion\ 3'}, - {'variable': 'pilotVersion', 'value': '3.5.0.31'}, - {'variable': 'jobType', 'value': 'managed'}, - {'variable': 'proxySecretPath', 'value': '/jobDir/voms/secrets/test1'}, - {'variable': 'workerID', 'value': '1'}, - {'variable': 'pilotProxyCheck', 'value': 'False'}, - {'variable': 'logs_frontend_w', 'value': 'https://aipanda047.cern.ch:25443/server/panda'}, - {'variable': 'logs_frontend_r', 'value': 'https://aipanda047.cern.ch:25443/cache'}, - {'variable': 'PANDA_JSID', 'value': 'harvester-CERN_central_k8s'}, - {'variable': 'HARVESTER_WORKER_ID', 'value': '21421931'}, - {'variable': 'HARVESTER_ID', 'value': 'CERN_central_k8s'}, - {'variable': 'submit_mode', 'value': 'PULL'}, - {'variable': 'TMPDIR', 'value': '/jobDir'}, - {'variable': 'HOME', 'value': '/jobDir'}, - {'variable': 'K8S_JOB_ID', 'value': 'grid-job-1'}, - ) - } +params = { + "name": "grid-job-{0}".format(worker_id), + "command_line": "python voms/scripts/pilots_starter.py", + "image": "harvester/centos7-singularity", + "resources": {"core_count": core_count, "memory": memory, "scratch": scratch}, + "input_files": [ + {"source_type": "data", "data": voms_lancium_path, "name": voms_job_path}, + {"source_type": "data", "data": script_lancium_path, "name": script_job_path}, + ], + # 'output_files': RETRIEVE THE PILOT LOG AND STORE IT IN HARVESTER? + "environment": ( + # pilotUrlOpt, stdout_name + {"variable": "PILOT_NOKILL", "value": "True"}, + {"variable": "computingSite", "value": "GOOGLE_EUW1"}, + {"variable": "pandaQueueName", "value": "GOOGLE_EUW1"}, + {"variable": "resourceType", "value": "SCORE"}, + {"variable": "prodSourceLabel", "value": "managed"}, + {"variable": "pilotType", "value": "PR"}, + # {'variable': 'pythonOption', 'value': '--pythonversion\ 3'}, + {"variable": "pilotVersion", "value": "3.5.0.31"}, + {"variable": "jobType", "value": "managed"}, + {"variable": "proxySecretPath", "value": "/jobDir/voms/secrets/test1"}, + {"variable": "workerID", "value": "1"}, + {"variable": "pilotProxyCheck", "value": "False"}, + {"variable": "logs_frontend_w", "value": "https://aipanda047.cern.ch:25443/server/panda"}, + {"variable": "logs_frontend_r", "value": "https://aipanda047.cern.ch:25443/cache"}, + {"variable": "PANDA_JSID", "value": "harvester-CERN_central_k8s"}, + {"variable": "HARVESTER_WORKER_ID", "value": "21421931"}, + {"variable": "HARVESTER_ID", "value": "CERN_central_k8s"}, + {"variable": "submit_mode", "value": "PULL"}, + {"variable": "TMPDIR", "value": "/jobDir"}, + {"variable": "HOME", "value": "/jobDir"}, + {"variable": "K8S_JOB_ID", "value": "grid-job-1"}, + ), +} # create the job job = Job().create(**params) -print('Created! name: {0}, id: {1}, status: {2}'.format(job.name, job.id, job.status)) +print("Created! name: {0}, id: {1}, status: {2}".format(job.name, job.id, job.status)) # submit the job job.submit() -print('Submitted! name: {0}, id: {1}, status: {2}'.format(job.name, job.id, job.status)) \ No newline at end of file +print("Submitted! 
name: {0}, id: {1}, status: {2}".format(job.name, job.id, job.status)) diff --git a/pandaharvester/harvestertest/monitorFifoTest.py b/pandaharvester/harvestertest/monitorFifoTest.py index c4484869..859e220f 100644 --- a/pandaharvester/harvestertest/monitorFifoTest.py +++ b/pandaharvester/harvestertest/monitorFifoTest.py @@ -18,31 +18,32 @@ mq = MonitorFIFO() -print('sleepTime', mq.config.sleepTime) +print("sleepTime", mq.config.sleepTime) + def single_thread_test(nObjects=3, protective=False): time_point = time.time() - print('clear') + print("clear") mq.fifo.clear() - print('size', mq.size()) + print("size", mq.size()) time_consumed = time.time() - time_point - print('Time consumed: ', time_consumed) + print("Time consumed: ", time_consumed) time_point = time.time() for i in range(nObjects): workspec = WorkSpec() workspec.workerID = i - data = {'random': [random.random(), random.random()]} + data = {"random": [random.random(), random.random()]} workspec.workAttributes = data # print('put') mq.put(workspec) # print('size', mq.size()) time_consumed = time.time() - time_point - print('Time consumed: {0} sec ; Avg: {1} obj/sec '.format(time_consumed, nObjects/time_consumed)) + print("Time consumed: {0} sec ; Avg: {1} obj/sec ".format(time_consumed, nObjects / time_consumed)) - print('size', mq.size()) + print("size", mq.size()) - print('peek') + print("peek") print(mq.peek()) time_point = time.time() @@ -52,21 +53,21 @@ def single_thread_test(nObjects=3, protective=False): # print(obj) # print('size', mq.size()) time_consumed = time.time() - time_point - print('Time consumed: {0} sec ; Avg: {1} obj/sec '.format(time_consumed, nObjects/time_consumed)) + print("Time consumed: {0} sec ; Avg: {1} obj/sec ".format(time_consumed, nObjects / time_consumed)) -print('Normal test') +print("Normal test") single_thread_test(nObjects=1000) -print('Protective test') +print("Protective test") single_thread_test(nObjects=1000, protective=True) mq.fifo.clear() time_point = time.time() -print('MonitorFIFO.populate') +print("MonitorFIFO.populate") mq.populate(seconds_ago=0, clear_fifo=True) time_consumed = time.time() - time_point -print('Time consumed: ', time_consumed) +print("Time consumed: ", time_consumed) # workspec1 = WorkSpec() # workspec1.workerID = 777 diff --git a/pandaharvester/harvestertest/read_shared_file_messenger_files.py b/pandaharvester/harvestertest/read_shared_file_messenger_files.py index 6af4f409..7f4eae86 100644 --- a/pandaharvester/harvestertest/read_shared_file_messenger_files.py +++ b/pandaharvester/harvestertest/read_shared_file_messenger_files.py @@ -21,42 +21,42 @@ from pandaharvester.harvesterconfig import harvester_config # list of shared_file_messenger files -file_list=[] +file_list = [] # json for worker attributes jsonAttrsFileName = harvester_config.payload_interaction.workerAttributesFile -file_list.append(("json for worker attributes",jsonAttrsFileName)) +file_list.append(("json for worker attributes", jsonAttrsFileName)) # json for job report jsonJobReport = harvester_config.payload_interaction.jobReportFile -file_list.append(("json for job report",jsonJobReport)) +file_list.append(("json for job report", jsonJobReport)) # json for outputs jsonOutputsFileName = harvester_config.payload_interaction.eventStatusDumpJsonFile -file_list.append(("json for outputs",jsonOutputsFileName)) +file_list.append(("json for outputs", jsonOutputsFileName)) # xml for outputs xmlOutputsBaseFileName = harvester_config.payload_interaction.eventStatusDumpXmlFile # json for job request 
jsonJobRequestFileName = harvester_config.payload_interaction.jobRequestFile -file_list.append(("json for job request",jsonJobRequestFileName)) +file_list.append(("json for job request", jsonJobRequestFileName)) # json for job spec jsonJobSpecFileName = harvester_config.payload_interaction.jobSpecFile -file_list.append(("json for job spec",jsonJobSpecFileName)) +file_list.append(("json for job spec", jsonJobSpecFileName)) # json for event request jsonEventsRequestFileName = harvester_config.payload_interaction.eventRequestFile -file_list.append(("json for event request",jsonEventsRequestFileName)) +file_list.append(("json for event request", jsonEventsRequestFileName)) # json to feed events jsonEventsFeedFileName = harvester_config.payload_interaction.eventRangesFile -file_list.append(("json to feed events",jsonEventsFeedFileName)) +file_list.append(("json to feed events", jsonEventsFeedFileName)) # json to update events jsonEventsUpdateFileName = harvester_config.payload_interaction.updateEventsFile -file_list.append(("json to update events",jsonEventsUpdateFileName)) +file_list.append(("json to update events", jsonEventsUpdateFileName)) access_point = sys.argv[1] @@ -64,17 +64,17 @@ # Now loop over all of the json files " for description, jsonFileName in file_list: - print ("{0} : {1}".format(description,jsonFileName)) + print("{0} : {1}".format(description, jsonFileName)) jsonFilePath = os.path.join(access_point, jsonFileName) - print ('looking for attributes file {0}'.format(jsonFilePath)) + print("looking for attributes file {0}".format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found - print ('not found') + print("not found") else: try: - with open(jsonFilePath) as data_file: + with open(jsonFilePath) as data_file: data = json.load(data_file) pprint(data) - except: - print ('failed to load {0}'.format(jsonFilePath)) + except BaseException: + print("failed to load {0}".format(jsonFilePath)) continue diff --git a/pandaharvester/harvestertest/renice.py b/pandaharvester/harvestertest/renice.py index 358bd09b..e770e0a4 100644 --- a/pandaharvester/harvestertest/renice.py +++ b/pandaharvester/harvestertest/renice.py @@ -1,22 +1,24 @@ import subprocess from pandaharvester.harvesterconfig import harvester_config -proc = subprocess.Popen("ps -l x -U {0}".format(harvester_config.master.uname), - shell=True, - stdout=subprocess.PIPE, - ) -stdoutList = proc.communicate()[0].split('\n') +proc = subprocess.Popen( + "ps -l x -U {0}".format(harvester_config.master.uname), + shell=True, + stdout=subprocess.PIPE, +) +stdoutList = proc.communicate()[0].split("\n") for line in stdoutList: try: items = line.split() - if len(items)< 6: + if len(items) < 6: continue pid = items[3] nice = int(items[7]) - if 'master.py' in line and nice>0: - reniceProc = subprocess.Popen("renice 0 {0}".format(pid), - shell=True, - stdout=subprocess.PIPE, - ) + if "master.py" in line and nice > 0: + reniceProc = subprocess.Popen( + "renice 0 {0}".format(pid), + shell=True, + stdout=subprocess.PIPE, + ) except Exception: pass diff --git a/pandaharvester/harvestertest/sshTunnelTest.py b/pandaharvester/harvestertest/sshTunnelTest.py index 1b634279..9d8a73ea 100644 --- a/pandaharvester/harvestertest/sshTunnelTest.py +++ b/pandaharvester/harvestertest/sshTunnelTest.py @@ -8,10 +8,8 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument('--queueName', action='store', dest='queueName', default=None, required=True, - help='the name of queue where harvester is installed') - 
parser.add_argument('--middleware', action='store', dest='middleware', default='rpc', - help='middleware to access the remote target machine') + parser.add_argument("--queueName", action="store", dest="queueName", default=None, required=True, help="the name of queue where harvester is installed") + parser.add_argument("--middleware", action="store", dest="middleware", default="rpc", help="middleware to access the remote target machine") options = parser.parse_args() # get queue @@ -19,25 +17,24 @@ def main(): qcm.load_data() queueConfig = qcm.get_queue(options.queueName) if queueConfig is None: - print ('ERROR: queue={0} not found in panda_queueconfig.json'.format(options.queueName)) + print("ERROR: queue={0} not found in panda_queueconfig.json".format(options.queueName)) sys.exit(1) # get middleware if not hasattr(queueConfig, options.middleware): - print ('ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json'.format(options.middleware, - options.queueName)) + print("ERROR: middleware={0} is not defined for {1} in panda_queueconfig.json".format(options.middleware, options.queueName)) sys.exit(1) middleware = getattr(queueConfig, options.middleware) # get ssh parameters - sshHost = middleware['remoteHost'] + sshHost = middleware["remoteHost"] try: - sshPort = middleware['remotePort'] + sshPort = middleware["remotePort"] except Exception: sshPort = 22 - sshUserName = middleware['sshUserName'] + sshUserName = middleware["sshUserName"] try: - sshPassword = middleware['sshPassword'] + sshPassword = middleware["sshPassword"] except Exception: sshPassword = None @@ -45,30 +42,37 @@ def main(): passPhrase = None if sshPassword is None: try: - privateKey = middleware['privateKey'] + privateKey = middleware["privateKey"] except Exception: - print ("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware)) + print("ERROR: set sshPassword or privateKey in middleware={0}".format(options.middleware)) sys.exit(1) try: - passPhrase = middleware['passPhrase'] + passPhrase = middleware["passPhrase"] except Exception: passPhrase = None try: - jumpHost = middleware['jumpHost'] + jumpHost = middleware["jumpHost"] except Exception: jumpHost = None try: - jumpPort = middleware['jumpPort'] + jumpPort = middleware["jumpPort"] except Exception: jumpPort = 22 # ssh - sshTunnelPool.make_tunnel_server(sshHost, sshPort, remote_bind_port=middleware['remoteBindPort'], - num_tunnels=1, ssh_username=sshUserName, ssh_password=sshPassword, - private_key=privateKey, pass_phrase=passPhrase, - jump_host=jumpHost, jump_port=jumpPort - ) + sshTunnelPool.make_tunnel_server( + sshHost, + sshPort, + remote_bind_port=middleware["remoteBindPort"], + num_tunnels=1, + ssh_username=sshUserName, + ssh_password=sshPassword, + private_key=privateKey, + pass_phrase=passPhrase, + jump_host=jumpHost, + jump_port=jumpPort, + ) ssh = sshTunnelPool.get_tunnel(sshHost, sshPort)[-1] return ssh @@ -76,6 +80,6 @@ def main(): if __name__ == "__main__": ssh = main() if ssh is None: - print ("ERROR: failed to make an SSH tunnel. See ssh_tunnel_pool.log for more details") + print("ERROR: failed to make an SSH tunnel. 
See ssh_tunnel_pool.log for more details") else: - print ("OK") + print("OK") diff --git a/pandaharvester/harvestertest/stageInTest.py b/pandaharvester/harvestertest/stageInTest.py index 40a5874f..05d7daba 100644 --- a/pandaharvester/harvestertest/stageInTest.py +++ b/pandaharvester/harvestertest/stageInTest.py @@ -1,3 +1,4 @@ +from pandaharvester.harvestercore.plugin_factory import PluginFactory import sys import time from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper @@ -10,55 +11,55 @@ queueConfig = queueConfigMapper.get_queue(queueName) jobSpec = JobSpec() -jobSpec.jobParams = {'inFiles': 'DAOD_STDM4.09596175._000008.pool.root.1', - 'scopeIn': 'mc15_13TeV', - 'fsize': '658906675', - 'GUID': '7e3776f9bb0af341b03e59d3de895a13', - 'checksum': 'ad:3734bdd9', - 'ddmEndPointIn': 'BNL-OSG2_DATADISK', - 'realDatasetsIn': 'mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00', - } +jobSpec.jobParams = { + "inFiles": "DAOD_STDM4.09596175._000008.pool.root.1", + "scopeIn": "mc15_13TeV", + "fsize": "658906675", + "GUID": "7e3776f9bb0af341b03e59d3de895a13", + "checksum": "ad:3734bdd9", + "ddmEndPointIn": "BNL-OSG2_DATADISK", + "realDatasetsIn": "mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00", +} jobSpec.computingSite = queueName -from pandaharvester.harvestercore.plugin_factory import PluginFactory pluginFactory = PluginFactory() # get plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) -print ("plugin={0}".format(preparatorCore.__class__.__name__)) +print("plugin={0}".format(preparatorCore.__class__.__name__)) -print ("testing stagein:") -print ("BasePath from preparator configuration: %s " % preparatorCore.basePath) +print("testing stagein:") +print("BasePath from preparator configuration: %s " % preparatorCore.basePath) preparatorCore.basePath = preparatorCore.basePath + "/testdata/" -print ("basePath redifuned for test data: %s " % preparatorCore.basePath) +print("basePath redifuned for test data: %s " % preparatorCore.basePath) tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) if tmpStat: - print (" OK") + print(" OK") else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) print -print ("testing status check") +print("testing status check") while True: tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) if tmpStat is True: - print (" OK") + print(" OK") break elif tmpStat is False: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) sys.exit(1) else: - print (" still running. sleep 1 min") + print(" still running. 
sleep 1 min") time.sleep(60) print -print ("checking path resolution") +print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print (" OK {0}".format(jobSpec.jobParams['inFilePaths'])) + print(" OK {0}".format(jobSpec.jobParams["inFilePaths"])) else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageInTest_GlobusOnline.py b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py index f3fb4b89..c0570dd8 100644 --- a/pandaharvester/harvestertest/stageInTest_GlobusOnline.py +++ b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py @@ -1,3 +1,4 @@ +from pandaharvester.harvestercore.plugin_factory import PluginFactory import sys import time from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper @@ -11,67 +12,65 @@ queueConfig = queueConfigMapper.get_queue(queueName) jobSpec = JobSpec() -new_file_data = {'scope': 'test', - 'lfn': 'TXT.19772875._044894.tar.gz.1', 'attemptNr': 0 } -new_file_spec = FileSpec(filetype='input', **new_file_data) +new_file_data = {"scope": "test", "lfn": "TXT.19772875._044894.tar.gz.1", "attemptNr": 0} +new_file_spec = FileSpec(filetype="input", **new_file_data) new_file_spec.attemptNr = 0 -new_file_spec.path = '/home/psvirin/harvester3' +new_file_spec.path = "/home/psvirin/harvester3" jobSpec.inFiles = {new_file_spec} jobSpec.outFiles = {} jobSpec.jobParams = { - 'inFiles': 'TXT.19772875._044894.tar.gz.1', - 'scopeIn': 'mc15_13TeV', - 'fsize': '658906675', - 'GUID': '7e3776f9bb0af341b03e59d3de895a13', - 'checksum': 'ad:3734bdd9', - 'ddmEndPointIn': 'BNL-OSG2_DATADISK', - 'realDatasetsIn': 'mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00', - } + "inFiles": "TXT.19772875._044894.tar.gz.1", + "scopeIn": "mc15_13TeV", + "fsize": "658906675", + "GUID": "7e3776f9bb0af341b03e59d3de895a13", + "checksum": "ad:3734bdd9", + "ddmEndPointIn": "BNL-OSG2_DATADISK", + "realDatasetsIn": "mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00", +} jobSpec.computingSite = queueName -jobSpec.PandaID='11111' +jobSpec.PandaID = "11111" -from pandaharvester.harvestercore.plugin_factory import PluginFactory pluginFactory = PluginFactory() # get plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) -print ("plugin={0}".format(preparatorCore.__class__.__name__)) +print("plugin={0}".format(preparatorCore.__class__.__name__)) print(jobSpec) -print ("testing stagein:") -print ("BasePath from preparator configuration: %s " % preparatorCore.basePath) +print("testing stagein:") +print("BasePath from preparator configuration: %s " % preparatorCore.basePath) preparatorCore.basePath = preparatorCore.basePath + "/testdata/" -print ("basePath redifuned for test data: %s " % preparatorCore.basePath) +print("basePath redifuned for test data: %s " % preparatorCore.basePath) tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) if tmpStat: - print (" OK") + print(" OK") else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) print -print ("testing status check") +print("testing status check") while True: tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) if tmpStat is True: - print (" OK") + print(" OK") break elif tmpStat is False: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) sys.exit(1) else: - print (" still running. 
sleep 1 min") + print(" still running. sleep 1 min") time.sleep(60) print -print ("checking path resolution") +print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print (" OK {0}".format(jobSpec.jobParams['inFilePaths'])) + print(" OK {0}".format(jobSpec.jobParams["inFilePaths"])) else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageInTest_dpb.py b/pandaharvester/harvestertest/stageInTest_dpb.py index d85919e0..01e59066 100644 --- a/pandaharvester/harvestertest/stageInTest_dpb.py +++ b/pandaharvester/harvestertest/stageInTest_dpb.py @@ -1,61 +1,62 @@ +from pandaharvester.harvestercore.plugin_factory import PluginFactory +from pandaharvester.harvestercore.job_spec import JobSpec +from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper import sys queueName = sys.argv[1] -from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) -from pandaharvester.harvestercore.job_spec import JobSpec jobSpec = JobSpec() jobSpec.computingSite = sys.argv[1] -jobSpec.jobParams = {'inFiles': 'EVNT.06820166._000001.pool.root.1', - 'scopeIn': 'mc15_13TeV', - 'fsize': '196196765', - 'GUID': 'B7F387CD-1F97-1C47-88BD-D8785442C49D', - 'checksum': 'ad:326e445d', - 'ddmEndPointIn': 'MWT2_DATADISK', - 'realDatasetsIn': 'mc15_13TeV:mc15_13TeV.301042.PowhegPythia8EvtGen_AZNLOCTEQ6L1_DYtautau_250M400.evgen.EVNT.e3649_tid06820166_00', - } +jobSpec.jobParams = { + "inFiles": "EVNT.06820166._000001.pool.root.1", + "scopeIn": "mc15_13TeV", + "fsize": "196196765", + "GUID": "B7F387CD-1F97-1C47-88BD-D8785442C49D", + "checksum": "ad:326e445d", + "ddmEndPointIn": "MWT2_DATADISK", + "realDatasetsIn": "mc15_13TeV:mc15_13TeV.301042.PowhegPythia8EvtGen_AZNLOCTEQ6L1_DYtautau_250M400.evgen.EVNT.e3649_tid06820166_00", +} -from pandaharvester.harvestercore.plugin_factory import PluginFactory pluginFactory = PluginFactory() # get plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) -print ("plugin={0}".format(preparatorCore.__class__.__name__)) +print("plugin={0}".format(preparatorCore.__class__.__name__)) -print ("testing preparation") +print("testing preparation") tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) if tmpStat: - print (" OK") + print(" OK") else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) print -print ("testing status check") +print("testing status check") while True: tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) if tmpStat is True: - print (" OK") + print(" OK") break elif tmpStat is False: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) sys.exit(1) else: - print (" still running. sleep 1 min") + print(" still running. 
sleep 1 min") time.sleep(60) print -print ("checking path resolution") +print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print (" OK {0}".format(jobSpec.jobParams['inFilePaths'])) + print(" OK {0}".format(jobSpec.jobParams["inFilePaths"])) else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py index 2957a70e..6237f2c4 100644 --- a/pandaharvester/harvestertest/stageInTest_globus.py +++ b/pandaharvester/harvestertest/stageInTest_globus.py @@ -18,7 +18,7 @@ from pandaharvester.harvesterbody.cacher import Cacher from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestermisc import globus_utils from globus_sdk import TransferClient @@ -26,29 +26,31 @@ from globus_sdk import NativeAppAuthClient from globus_sdk import RefreshTokenAuthorizer + def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print("obj.%s = %s" % (attr, getattr(obj, attr))) + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) + print(len(sys.argv)) -queueName = 'ALCF_Theta' +queueName = "ALCF_Theta" job_id = 1111 globus_sleep_time = 15 if len(sys.argv) > 1: - queueName = sys.argv[1] + queueName = sys.argv[1] if len(sys.argv) > 2: - job_id = int(sys.argv[2]) + job_id = int(sys.argv[2]) if len(sys.argv) > 3: - globus_sleep_time = int(sys.argv[3]) + globus_sleep_time = int(sys.argv[3]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator -queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_preparator' -queueConfig.preparator['name'] = 'GoPreparator' +queueConfig.preparator["module"] = "pandaharvester.harvesterpreparator.go_preparator" +queueConfig.preparator["name"] = "GoPreparator" modified_queueConfig_preparator = queueConfig.preparator pluginFactory = PluginFactory() @@ -56,20 +58,20 @@ def dump(obj): preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger -_logger = core_utils.setup_logger('stageInTest_go_preparator') -tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_preparator') -tmpLog.debug('start') +_logger = core_utils.setup_logger("stageInTest_go_preparator") +tmpLog = core_utils.make_logger(_logger, method_name="stageInTest_go_preparator") +tmpLog.debug("start") for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) + # print "loggerName - {}".format(loggerName) + if loggerName.startswith("panda.log"): + if len(loggerObj.handlers) == 0: + continue + if loggerName.split(".")[-1] in ["db_proxy"]: + continue + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) + loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) 
tmpLog.debug(msgStr) @@ -78,176 +80,171 @@ def dump(obj): msgStr = "Modified queueConfig.preparator = {}".format(modified_queueConfig_preparator) tmpLog.debug(msgStr) -scope = 'panda' +scope = "panda" proxy = DBProxy() communicator = CommunicatorPool() cacher = Cacher(communicator, single_mode=True) cacher.run() -Globus_srcPath = queueConfig.preparator['Globus_srcPath'] -srcEndpoint = queueConfig.preparator['srcEndpoint'] -basePath = queueConfig.preparator['basePath'] -Globus_dstPath = queueConfig.preparator['Globus_dstPath'] -dstEndpoint = queueConfig.preparator['dstEndpoint'] +Globus_srcPath = queueConfig.preparator["Globus_srcPath"] +srcEndpoint = queueConfig.preparator["srcEndpoint"] +basePath = queueConfig.preparator["basePath"] +Globus_dstPath = queueConfig.preparator["Globus_dstPath"] +dstEndpoint = queueConfig.preparator["dstEndpoint"] # need to get client_id and refresh_token from PanDA server via harvester cache mechanism -c_data = preparatorCore.dbInterface.get_cache('globus_secret') +c_data = preparatorCore.dbInterface.get_cache("globus_secret") client_id = None refresh_token = None -if (not c_data == None) and c_data.data['StatusCode'] == 0 : - client_id = c_data.data['publicKey'] # client_id - refresh_token = c_data.data['privateKey'] # refresh_token -else : - client_id = None - refresh_token = None - tc = None - errStr = 'failed to get Globus Client ID and Refresh Token' - tmpLog.error(errStr) - sys.exit(1) +if (c_data is not None) and c_data.data["StatusCode"] == 0: + client_id = c_data.data["publicKey"] # client_id + refresh_token = c_data.data["privateKey"] # refresh_token +else: + client_id = None + refresh_token = None + tc = None + errStr = "failed to get Globus Client ID and Refresh Token" + tmpLog.error(errStr) + sys.exit(1) # create Globus transfer client to send initial files to remote Globus source -tmpStat, tc = globus_utils.create_globus_transfer_client(tmpLog,client_id,refresh_token) +tmpStat, tc = globus_utils.create_globus_transfer_client(tmpLog, client_id, refresh_token) if not tmpStat: - tc = None - errStr = 'failed to create Globus Transfer Client' - tmpLog.error(errStr) - sys.exit(1) + tc = None + errStr = "failed to create Globus Transfer Client" + tmpLog.error(errStr) + sys.exit(1) try: - # We are sending test files from our destination machine to the source machine - # Test endpoints for activation - - tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog,tc,dstEndpoint) - tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog,tc,srcEndpoint) - if tmpStatsrc and tmpStatdst: - errStr = 'source Endpoint and destination Endpoint activated' - tmpLog.debug(errStr) - else: - errStr = '' - if not tmpStatsrc : - errStr += ' source Endpoint not activated ' - if not tmpStatdst : - errStr += ' destination Endpoint not activated ' - tmpLog.error(errStr) - sys.exit(2) - # We are sending test files from our destination machine to the source machine - # both endpoints activated now prepare to transfer data - tdata = TransferData(tc,dstEndpoint,srcEndpoint,sync_level="checksum") -except: - errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) - sys.exit(1) - -# create JobSpec + # We are sending test files from our destination machine to the source machine + # Test endpoints for activation - + tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(tmpLog, tc, dstEndpoint) + tmpStatdst, dstStr = globus_utils.check_endpoint_activation(tmpLog, tc, srcEndpoint) + if tmpStatsrc and tmpStatdst: + errStr = "source Endpoint and 
destination Endpoint activated" + tmpLog.debug(errStr) + else: + errStr = "" + if not tmpStatsrc: + errStr += " source Endpoint not activated " + if not tmpStatdst: + errStr += " destination Endpoint not activated " + tmpLog.error(errStr) + sys.exit(2) + # We are sending test files from our destination machine to the source machine + # both endpoints activated now prepare to transfer data + tdata = TransferData(tc, dstEndpoint, srcEndpoint, sync_level="checksum") +except BaseException: + errStat, errMsg = globus_utils.handle_globus_exception(tmpLog) + sys.exit(1) + +# create JobSpec jobSpec = JobSpec() jobSpec.jobParams = { - 'scopeLog': 'panda', - 'logFile': 'log', - } + "scopeLog": "panda", + "logFile": "log", +} jobSpec.computingSite = queueName jobSpec.PandaID = job_id jobSpec.modificationTime = datetime.datetime.now() -realDataset = 'panda.sgotest.' + uuid.uuid4().hex -ddmEndPointIn = 'BNL-OSG2_DATADISK' -inFiles_scope_str = '' -inFiles_str = '' -realDatasets_str = '' -realDatasetsIn_str = '' -ddmEndPointIn_str = '' -GUID_str = '' -fsize_str = '' -checksum_str = '' -scope_in_str = '' +realDataset = "panda.sgotest." + uuid.uuid4().hex +ddmEndPointIn = "BNL-OSG2_DATADISK" +inFiles_scope_str = "" +inFiles_str = "" +realDatasets_str = "" +realDatasetsIn_str = "" +ddmEndPointIn_str = "" +GUID_str = "" +fsize_str = "" +checksum_str = "" +scope_in_str = "" # create up 5 files for input for index in range(random.randint(1, 5)): - fileSpec = FileSpec() - assFileSpec = FileSpec() - fileSpec.fileType = 'input' - assFileSpec.lfn = 'panda.sgotest.' + uuid.uuid4().hex - fileSpec.lfn = assFileSpec.lfn - fileSpec.scope = 'panda' - inFiles_scope_str += 'panda,' - inFiles_str += fileSpec.lfn + ',' - realDatasets_str += realDataset + "," - realDatasetsIn_str += realDataset + "," - ddmEndPointIn_str += ddmEndPointIn + "," - # some dummy inputs - GUID_str += 'd82e8e5e301b77489fd4da04bcdd6565,' - fsize_str += '3084569129,' - checksum_str += 'ad:9f60d29f,' - scope_in_str += 'panda,' - # - assFileSpec.fileType = 'input' - assFileSpec.fsize = random.randint(10, 100) - # create source file - hash = hashlib.md5() - hash.update(('%s:%s' % (fileSpec.scope, fileSpec.lfn)).encode('utf-8')) - hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - fileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_dstPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - assFileSpec.path = fileSpec.path - fileSpec.add_associated_file(assFileSpec) - # now create the temporary file - tmpfile_path = "{mountPoint}/testdata/{lfn}".format(mountPoint=queueConfig.preparator['basePath'], - lfn=assFileSpec.lfn) - if not os.path.exists(os.path.dirname(tmpfile_path)): - tmpLog.debug("os.makedirs({})".format(os.path.dirname(tmpfile_path))) - os.makedirs(os.path.dirname(tmpfile_path)) - oFile = open(tmpfile_path, 'w') - oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) - oFile.close() - # location of destination file - destfile_path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_srcPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=fileSpec.lfn) - - # add to Globus transfer list - tdata.add_item(tmpfile_path,destfile_path) - #print "dump(fileSpec)" - #dump(fileSpec) - # add input file to jobSpec - jobSpec.add_in_file(fileSpec) - # - tmpLog.debug("source file to transfer - 
{}".format(tmpfile_path)) - tmpLog.debug("destination file to transfer - {}".format(destfile_path)) - #print "dump(jobSpec)" - #dump(jobSpec) + fileSpec = FileSpec() + assFileSpec = FileSpec() + fileSpec.fileType = "input" + assFileSpec.lfn = "panda.sgotest." + uuid.uuid4().hex + fileSpec.lfn = assFileSpec.lfn + fileSpec.scope = "panda" + inFiles_scope_str += "panda," + inFiles_str += fileSpec.lfn + "," + realDatasets_str += realDataset + "," + realDatasetsIn_str += realDataset + "," + ddmEndPointIn_str += ddmEndPointIn + "," + # some dummy inputs + GUID_str += "d82e8e5e301b77489fd4da04bcdd6565," + fsize_str += "3084569129," + checksum_str += "ad:9f60d29f," + scope_in_str += "panda," + # + assFileSpec.fileType = "input" + assFileSpec.fsize = random.randint(10, 100) + # create source file + hash = hashlib.md5() + hash.update(("%s:%s" % (fileSpec.scope, fileSpec.lfn)).encode("utf-8")) + hash_hex = hash.hexdigest() + correctedscope = "/".join(scope.split(".")) + fileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=queueConfig.preparator["Globus_dstPath"], scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + assFileSpec.path = fileSpec.path + fileSpec.add_associated_file(assFileSpec) + # now create the temporary file + tmpfile_path = "{mountPoint}/testdata/{lfn}".format(mountPoint=queueConfig.preparator["basePath"], lfn=assFileSpec.lfn) + if not os.path.exists(os.path.dirname(tmpfile_path)): + tmpLog.debug("os.makedirs({})".format(os.path.dirname(tmpfile_path))) + os.makedirs(os.path.dirname(tmpfile_path)) + oFile = open(tmpfile_path, "w") + oFile.write("".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) + oFile.close() + # location of destination file + destfile_path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=queueConfig.preparator["Globus_srcPath"], scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn + ) + + # add to Globus transfer list + tdata.add_item(tmpfile_path, destfile_path) + # print "dump(fileSpec)" + # dump(fileSpec) + # add input file to jobSpec + jobSpec.add_in_file(fileSpec) + # + tmpLog.debug("source file to transfer - {}".format(tmpfile_path)) + tmpLog.debug("destination file to transfer - {}".format(destfile_path)) + # print "dump(jobSpec)" + # dump(jobSpec) # remove final "," -realDatasetsIn_str=realDatasetsIn_str[:-1] +realDatasetsIn_str = realDatasetsIn_str[:-1] inFiles_str = inFiles_str[:-1] inFiles_scope_str = inFiles_scope_str[:-1] GUID_str = GUID_str[:-1] fsize_str = fsize_str[:-1] checksum_str = checksum_str[:-1] scope_in_str = scope_in_str[:-1] -jobSpec.jobParams['realDatasets'] = realDatasets_str -jobSpec.jobParams['ddmEndPointIn'] = ddmEndPointIn_str -jobSpec.jobParams['inFiles'] = inFiles_str -jobSpec.jobParams['GUID'] = GUID_str -jobSpec.jobParams['fsize'] = fsize_str -jobSpec.jobParams['checksum'] = checksum_str -jobSpec.jobParams['scopeIn'] = scope_in_str -jobSpec.jobParams['realDatasetsIn'] = realDatasetsIn_str -msgStr = "jobSpec.jobParams ={}".format(jobSpec.jobParams) +jobSpec.jobParams["realDatasets"] = realDatasets_str +jobSpec.jobParams["ddmEndPointIn"] = ddmEndPointIn_str +jobSpec.jobParams["inFiles"] = inFiles_str +jobSpec.jobParams["GUID"] = GUID_str +jobSpec.jobParams["fsize"] = fsize_str +jobSpec.jobParams["checksum"] = checksum_str +jobSpec.jobParams["scopeIn"] = scope_in_str +jobSpec.jobParams["realDatasetsIn"] = realDatasetsIn_str +msgStr = "jobSpec.jobParams ={}".format(jobSpec.jobParams) 
tmpLog.debug(msgStr) - -# transfer dummy files to Remote site for input + +# transfer dummy files to Remote site for input transfer_result = tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) -if transfer_result['code'] == "Accepted": - # succeeded - # set transfer ID which are used for later lookup - transferID = transfer_result['task_id'] - tmpLog.debug('done') +if transfer_result["code"] == "Accepted": + # succeeded + # set transfer ID which are used for later lookup + transferID = transfer_result["task_id"] + tmpLog.debug("done") else: - tmpLog.error('Failed to send intial files') - sys.exit(3) + tmpLog.error("Failed to send intial files") + sys.exit(3) print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) @@ -256,42 +253,42 @@ def dump(obj): maxloop = 5 iloop = 0 NotFound = True -while (iloop < maxloop) and NotFound : - # get transfer task - tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog,tc,transferID) - # return a temporary error when failed to get task - if not tmpStat: - errStr = 'failed to get transfer task' - tmpLog.error(errStr) - else: - # return a temporary error when task is missing - tmpLog.debug('transferTasks : {} '.format(transferTasks)) - if transferID not in transferTasks: - errStr = 'transfer task ID - {} is missing'.format(transferID) - tmpLog.error(errStr) - else: - # succeeded in finding a transfer task by tranferID - if transferTasks[transferID]['status'] == 'SUCCEEDED': - tmpLog.debug('transfer task {} succeeded'.format(transferID)) - NotFound = False - # failed - if transferTasks[transferID]['status'] == 'FAILED': - errStr = 'transfer task {} failed'.format(transferID) +while (iloop < maxloop) and NotFound: + # get transfer task + tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(tmpLog, tc, transferID) + # return a temporary error when failed to get task + if not tmpStat: + errStr = "failed to get transfer task" + tmpLog.error(errStr) + else: + # return a temporary error when task is missing + tmpLog.debug("transferTasks : {} ".format(transferTasks)) + if transferID not in transferTasks: + errStr = "transfer task ID - {} is missing".format(transferID) tmpLog.error(errStr) - # another status - tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) - tmpLog.debug(tmpStr) - if NotFound : - print("sleep {0} seconds".format(globus_sleep_time)) - time.sleep(globus_sleep_time) - ++iloop + else: + # succeeded in finding a transfer task by tranferID + if transferTasks[transferID]["status"] == "SUCCEEDED": + tmpLog.debug("transfer task {} succeeded".format(transferID)) + NotFound = False + # failed + if transferTasks[transferID]["status"] == "FAILED": + errStr = "transfer task {} failed".format(transferID) + tmpLog.error(errStr) + # another status + tmpStr = "transfer task {0} status: {1}".format(transferID, transferTasks[transferID]["status"]) + tmpLog.debug(tmpStr) + if NotFound: + print("sleep {0} seconds".format(globus_sleep_time)) + time.sleep(globus_sleep_time) + ++iloop -if NotFound : - errStr = 'transfer task ID - {} is missing'.format(transferID) - tmpLog.error(errStr) - sys.exit(1) +if NotFound: + errStr = "transfer task ID - {} is missing".format(transferID) + tmpLog.error(errStr) + sys.exit(1) -#dump(queueConfig) +# dump(queueConfig) print("plugin={0}".format(preparatorCore.__class__.__name__)) @@ -311,7 +308,7 @@ def dump(obj): print("testing status check") while True: tmpStat, tmpOut = 
preparatorCore.check_stage_in_status(jobSpec) - if tmpStat == True: + if tmpStat: print(" OK") break elif tmpStat == False: @@ -324,6 +321,6 @@ def dump(obj): print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print(" OK {0}".format(jobSpec.jobParams['inFilePaths'])) + print(" OK {0}".format(jobSpec.jobParams["inFilePaths"])) else: print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageOutTest.py b/pandaharvester/harvestertest/stageOutTest.py index 92bb8880..cd678125 100644 --- a/pandaharvester/harvestertest/stageOutTest.py +++ b/pandaharvester/harvestertest/stageOutTest.py @@ -10,11 +10,11 @@ from pandaharvester.harvestercore.plugin_factory import PluginFactory -file_prefix = 'panda.sgotest.' +file_prefix = "panda.sgotest." def exit_func(): - for f in os.listdir('.'): + for f in os.listdir("."): if f.startswith(file_prefix): os.remove(f) @@ -26,60 +26,61 @@ def exit_func(): queueConfig = queueConfigMapper.get_queue(queueName) fileSpec = FileSpec() -fileSpec.fileType = 'output' -fileSpec.lfn = file_prefix + uuid.uuid4().hex + '.gz' -fileSpec.fileAttributes = {'guid': str(uuid.uuid4())} -fileSpec.chksum = '0d439274' +fileSpec.fileType = "output" +fileSpec.lfn = file_prefix + uuid.uuid4().hex + ".gz" +fileSpec.fileAttributes = {"guid": str(uuid.uuid4())} +fileSpec.chksum = "0d439274" assFileSpec = FileSpec() assFileSpec.lfn = file_prefix + uuid.uuid4().hex -assFileSpec.fileType = 'es_output' +assFileSpec.fileType = "es_output" assFileSpec.fsize = random.randint(10, 100) -assFileSpec.path = os.getcwd() + '/' + assFileSpec.lfn -oFile = open(assFileSpec.lfn, 'w') -oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) +assFileSpec.path = os.getcwd() + "/" + assFileSpec.lfn +oFile = open(assFileSpec.lfn, "w") +oFile.write("".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) oFile.close() fileSpec.add_associated_file(assFileSpec) jobSpec = JobSpec() -jobSpec.jobParams = {'outFiles': fileSpec.lfn + ',log', - 'scopeOut': 'panda', - 'scopeLog': 'panda', - 'logFile': 'log', - 'realDatasets': 'panda.' + fileSpec.lfn, - 'ddmEndPointOut': 'BNL-OSG2_DATADISK', - } +jobSpec.jobParams = { + "outFiles": fileSpec.lfn + ",log", + "scopeOut": "panda", + "scopeLog": "panda", + "logFile": "log", + "realDatasets": "panda." 
+ fileSpec.lfn, + "ddmEndPointOut": "BNL-OSG2_DATADISK", +} jobSpec.add_out_file(fileSpec) pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) -print ("plugin={0}".format(stagerCore.__class__.__name__)) +print("plugin={0}".format(stagerCore.__class__.__name__)) -print ("testing zip") +print("testing zip") tmpStat, tmpOut = stagerCore.zip_output(jobSpec) if tmpStat: - print (" OK") + print(" OK") else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) -print () +print() -print ("testing stage-out") +print("testing stage-out") transferID = None tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec) if tmpStat: - if fileSpec.fileAttributes is None and 'transferID' in fileSpec.fileAttributes: - transferID = fileSpec.fileAttributes['transferID'] - print (" OK transferID={0}".format(transferID)) + if fileSpec.fileAttributes is None and "transferID" in fileSpec.fileAttributes: + transferID = fileSpec.fileAttributes["transferID"] + print(" OK transferID={0}".format(transferID)) else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) sys.exit(1) -print () +print() -print ("checking status for transferID={0}".format(transferID)) +print("checking status for transferID={0}".format(transferID)) tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) if tmpStat: - print (" OK") + print(" OK") else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageOutTest_go_bulk_stager.py b/pandaharvester/harvestertest/stageOutTest_go_bulk_stager.py index 1eb24bec..712c135e 100644 --- a/pandaharvester/harvestertest/stageOutTest_go_bulk_stager.py +++ b/pandaharvester/harvestertest/stageOutTest_go_bulk_stager.py @@ -18,11 +18,11 @@ from pandaharvester.harvesterbody.cacher import Cacher from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvestercore.communicator_pool import CommunicatorPool -from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore import core_utils -#initial variables -fileTableName = 'file_table' -queueName = 'ALCF_Theta' +# initial variables +fileTableName = "file_table" +queueName = "ALCF_Theta" begin_job_id = 1111 end_job_id = 1113 @@ -31,23 +31,23 @@ def dump(obj): - for attr in dir(obj): - if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) + for attr in dir(obj): + if hasattr(obj, attr): + print("obj.%s = %s" % (attr, getattr(obj, attr))) if len(sys.argv) > 1: - queueName = sys.argv[1] + queueName = sys.argv[1] if len(sys.argv) > 2: - begin_job_id = int(sys.argv[2]) + begin_job_id = int(sys.argv[2]) if len(sys.argv) > 3: - end_job_id = int(sys.argv[3]) + end_job_id = int(sys.argv[3]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_stager = queueConfig.stager -queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager' -queueConfig.stager['name'] = 'GlobusBulkStager' +queueConfig.stager["module"] = "pandaharvester.harvesterstager.go_bulk_stager" +queueConfig.stager["name"] = "GlobusBulkStager" modified_queueConfig_stager = queueConfig.stager pluginFactory = PluginFactory() @@ -55,20 +55,20 @@ def dump(obj): stagerCore = pluginFactory.get_plugin(queueConfig.stager) # logger -_logger = core_utils.setup_logger('stageOutTest_go_bulk_stager') -tmpLog = core_utils.make_logger(_logger, method_name='stageOutTest_go_bulk_stager') -tmpLog.debug('start') 
+_logger = core_utils.setup_logger("stageOutTest_go_bulk_stager") +tmpLog = core_utils.make_logger(_logger, method_name="stageOutTest_go_bulk_stager") +tmpLog.debug("start") for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): - #print "loggerName - {}".format(loggerName) - if loggerName.startswith('panda.log'): - if len(loggerObj.handlers) == 0: - continue - if loggerName.split('.')[-1] in ['db_proxy']: - continue - stdoutHandler = logging.StreamHandler(sys.stdout) - stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) - loggerObj.addHandler(stdoutHandler) + # print "loggerName - {}".format(loggerName) + if loggerName.startswith("panda.log"): + if len(loggerObj.handlers) == 0: + continue + if loggerName.split(".")[-1] in ["db_proxy"]: + continue + stdoutHandler = logging.StreamHandler(sys.stdout) + stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) + loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(stagerCore.__class__.__name__) tmpLog.debug(msgStr) @@ -77,7 +77,7 @@ def dump(obj): msgStr = "Modified queueConfig.stager = {}".format(modified_queueConfig_stager) tmpLog.debug(msgStr) -scope = 'panda' +scope = "panda" proxy = DBProxy() communicator = CommunicatorPool() @@ -85,183 +85,182 @@ def dump(obj): cacher.run() - # check if db lock exits -locked = stagerCore.dbInterface.get_object_lock('dummy_id_for_out_0',lock_interval=120) +locked = stagerCore.dbInterface.get_object_lock("dummy_id_for_out_0", lock_interval=120) if not locked: - tmpLog.debug('DB Already locked by another thread') + tmpLog.debug("DB Already locked by another thread") # now unlock db -unlocked = stagerCore.dbInterface.release_object_lock('dummy_id_for_out_0') -if unlocked : - tmpLog.debug('unlocked db') +unlocked = stagerCore.dbInterface.release_object_lock("dummy_id_for_out_0") +if unlocked: + tmpLog.debug("unlocked db") else: - tmpLog.debug(' Could not unlock db') + tmpLog.debug(" Could not unlock db") # loop over the job id's creating various JobSpecs jobSpec_list = [] -for job_id in range(begin_job_id,end_job_id+1): - jobSpec = JobSpec() - jobSpec.jobParams = { - 'scopeLog': 'panda', - 'logFile': 'log', - } - jobSpec.computingSite = queueName - jobSpec.PandaID = job_id - jobSpec.modificationTime = datetime.datetime.now() - realDataset = 'panda.sgotest.' + uuid.uuid4().hex - ddmEndPointOut = 'BNL-OSG2_DATADISK' - outFiles_scope_str = '' - outFiles_str = '' - realDatasets_str = '' - ddmEndPointOut_str = '' - # create up 5 files for output - for index in range(random.randint(1, 5)): - fileSpec = FileSpec() - assFileSpec = FileSpec() - fileSpec.fileType = 'es_output' - assFileSpec.lfn = 'panda.sgotest.' 
+ uuid.uuid4().hex - fileSpec.lfn = assFileSpec.lfn + '.gz' - fileSpec.scope = 'panda' - outFiles_scope_str += 'panda,' - outFiles_str += fileSpec.lfn + ',' - realDatasets_str += realDataset + "," - ddmEndPointOut_str += ddmEndPointOut + "," - assFileSpec.fileType = 'es_output' - assFileSpec.fsize = random.randint(10, 100) - # create source file - hash = hashlib.md5() - hash.update('%s:%s' % (scope, fileSpec.lfn)) - hash_hex = hash.hexdigest() - correctedscope = "/".join(scope.split('.')) - assFileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.stager['Globus_srcPath'], - scope=correctedscope, - hash1=hash_hex[0:2], - hash2=hash_hex[2:4], - lfn=assFileSpec.lfn) - if not os.path.exists(os.path.dirname(assFileSpec.path)): - tmpLog.debug("os.makedirs({})".format(os.path.dirname(assFileSpec.path))) - os.makedirs(os.path.dirname(assFileSpec.path)) - oFile = open(assFileSpec.path, 'w') - oFile.write(''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) - oFile.close() - fileSpec.path = assFileSpec.path + '.gz' - fileSpec.add_associated_file(assFileSpec) - #print "dump(fileSpec)" - #dump(fileSpec) - # add output file to jobSpec - jobSpec.add_out_file(fileSpec) - # - tmpLog.debug("file to transfer - {}".format(fileSpec.path)) - #print "dump(jobSpec)" - #dump(jobSpec) - # add log file info - outFiles_str += 'log' - realDatasets_str += 'log.'+ uuid.uuid4().hex - ddmEndPointOut_str += 'MWT2-UC_DATADISK' - # remove final "," - outFiles_scope_str = outFiles_scope_str[:-1] - jobSpec.jobParams['scopeOut'] = outFiles_scope_str - jobSpec.jobParams['outFiles'] = outFiles_str - jobSpec.jobParams['realDatasets'] = realDatasets_str - jobSpec.jobParams['ddmEndPointOut'] = ddmEndPointOut_str - msgStr = "jobSpec.jobParams ={}".format(jobSpec.jobParams) - tmpLog.debug(msgStr) - msgStr = "len(jobSpec.get_output_file_attributes()) = {0} type - {1}".format(len(jobSpec.get_output_file_attributes()),type(jobSpec.get_output_file_attributes())) - tmpLog.debug(msgStr) - for key, value in jobSpec.get_output_file_attributes().iteritems(): - msgStr = "output file attributes - pre DB {0} {1}".format(key,value) - tmpLog.debug(msgStr) - jobSpec_list.append(jobSpec) - +for job_id in range(begin_job_id, end_job_id + 1): + jobSpec = JobSpec() + jobSpec.jobParams = { + "scopeLog": "panda", + "logFile": "log", + } + jobSpec.computingSite = queueName + jobSpec.PandaID = job_id + jobSpec.modificationTime = datetime.datetime.now() + realDataset = "panda.sgotest." + uuid.uuid4().hex + ddmEndPointOut = "BNL-OSG2_DATADISK" + outFiles_scope_str = "" + outFiles_str = "" + realDatasets_str = "" + ddmEndPointOut_str = "" + # create up 5 files for output + for index in range(random.randint(1, 5)): + fileSpec = FileSpec() + assFileSpec = FileSpec() + fileSpec.fileType = "es_output" + assFileSpec.lfn = "panda.sgotest." 
+ uuid.uuid4().hex + fileSpec.lfn = assFileSpec.lfn + ".gz" + fileSpec.scope = "panda" + outFiles_scope_str += "panda," + outFiles_str += fileSpec.lfn + "," + realDatasets_str += realDataset + "," + ddmEndPointOut_str += ddmEndPointOut + "," + assFileSpec.fileType = "es_output" + assFileSpec.fsize = random.randint(10, 100) + # create source file + hash = hashlib.md5() + hash.update("%s:%s" % (scope, fileSpec.lfn)) + hash_hex = hash.hexdigest() + correctedscope = "/".join(scope.split(".")) + assFileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( + endPoint=queueConfig.stager["Globus_srcPath"], scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=assFileSpec.lfn + ) + if not os.path.exists(os.path.dirname(assFileSpec.path)): + tmpLog.debug("os.makedirs({})".format(os.path.dirname(assFileSpec.path))) + os.makedirs(os.path.dirname(assFileSpec.path)) + oFile = open(assFileSpec.path, "w") + oFile.write("".join(random.choice(string.ascii_uppercase + string.digits) for _ in range(assFileSpec.fsize))) + oFile.close() + fileSpec.path = assFileSpec.path + ".gz" + fileSpec.add_associated_file(assFileSpec) + # print "dump(fileSpec)" + # dump(fileSpec) + # add output file to jobSpec + jobSpec.add_out_file(fileSpec) + # + tmpLog.debug("file to transfer - {}".format(fileSpec.path)) + # print "dump(jobSpec)" + # dump(jobSpec) + # add log file info + outFiles_str += "log" + realDatasets_str += "log." + uuid.uuid4().hex + ddmEndPointOut_str += "MWT2-UC_DATADISK" + # remove final "," + outFiles_scope_str = outFiles_scope_str[:-1] + jobSpec.jobParams["scopeOut"] = outFiles_scope_str + jobSpec.jobParams["outFiles"] = outFiles_str + jobSpec.jobParams["realDatasets"] = realDatasets_str + jobSpec.jobParams["ddmEndPointOut"] = ddmEndPointOut_str + msgStr = "jobSpec.jobParams ={}".format(jobSpec.jobParams) + tmpLog.debug(msgStr) + msgStr = "len(jobSpec.get_output_file_attributes()) = {0} type - {1}".format( + len(jobSpec.get_output_file_attributes()), type(jobSpec.get_output_file_attributes()) + ) + tmpLog.debug(msgStr) + for key, value in jobSpec.get_output_file_attributes().iteritems(): + msgStr = "output file attributes - pre DB {0} {1}".format(key, value) + tmpLog.debug(msgStr) + jobSpec_list.append(jobSpec) + # now load into DB JobSpec's and output FileSpec's from jobSpec_list tmpStat = proxy.insert_jobs(jobSpec_list) if tmpStat: - msgStr = "OK Loaded jobs into DB" - tmpLog.debug(msgStr) + msgStr = "OK Loaded jobs into DB" + tmpLog.debug(msgStr) else: - msgStr = "NG Could not load jobs into DB" - tmpLog.debug(msgStr) + msgStr = "NG Could not load jobs into DB" + tmpLog.debug(msgStr) tmpStat = proxy.insert_files(jobSpec_list) if tmpStat: - msgStr = "OK Loaded files into DB" - tmpLog.debug(msgStr) + msgStr = "OK Loaded files into DB" + tmpLog.debug(msgStr) else: - msgStr = "NG Could not load files into DB" - tmpLog.debug(msgStr) + msgStr = "NG Could not load files into DB" + tmpLog.debug(msgStr) # Now loop over the jobSpec's for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - msgStr = "testing zip" - tmpStat, tmpOut = stagerCore.zip_output(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - else: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - msgStr = "testing trigger_stage_out" - tmpLog.debug(msgStr) - tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec) - if tmpStat: - msgStr = " OK " - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - 
tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - sys.exit(1) - print - # get the files with the group_id and print out - msgStr = "dummy_transfer_id = {}".format(stagerCore.get_dummy_transfer_id()) - files = proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) - files = stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpLog.debug(msgStr) - tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) + # print out jobSpec PandID + msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) + msgStr = "testing zip" + tmpStat, tmpOut = stagerCore.zip_output(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + else: + msgStr = " NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + msgStr = "testing trigger_stage_out" + tmpLog.debug(msgStr) + tmpStat, tmpOut = stagerCore.trigger_stage_out(jobSpec) + if tmpStat: + msgStr = " OK " + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + sys.exit(1) + print + # get the files with the group_id and print out + msgStr = "dummy_transfer_id = {}".format(stagerCore.get_dummy_transfer_id()) + files = proxy.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) + files = stagerCore.dbInterface.get_files_with_group_id(stagerCore.get_dummy_transfer_id()) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpLog.debug(msgStr) + tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " NG {0}".format(tmpOut) + tmpLog.debug(msgStr) # sleep for 10 minutes 1 second - + msgStr = "Sleep for 601 seconds" -#msgStr = "Sleep for 181 seconds" +# msgStr = "Sleep for 181 seconds" tmpLog.debug(msgStr) -#time.sleep(181) +# time.sleep(181) time.sleep(601) msgStr = "now check the jobs" tmpLog.debug(msgStr) for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - tmpLog.debug(msgStr) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) + # print out jobSpec PandID + msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) + tmpLog.debug(msgStr) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + +# sleep for 3 minutes -# sleep for 3 minutes - msgStr = "Sleep for 
180 seconds" tmpLog.debug(msgStr) time.sleep(180) @@ -269,17 +268,17 @@ def dump(obj): tmpLog.debug(msgStr) for jobSpec in jobSpec_list: - # print out jobSpec PandID - msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) - tmpLog.debug(msgStr) - msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" - tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) - if tmpStat: - msgStr = " OK" - tmpLog.debug(msgStr) - elif tmpStat == None: - msgStr = " Temporary failure NG {0}".format(tmpOut) - tmpLog.debug(msgStr) - elif not tmpStat: - msgStr = " NG {0}".format(tmpOut) - tmpLog.debug(msgStr) + # print out jobSpec PandID + msgStr = "jobSpec PandaID - {}".format(jobSpec.PandaID) + tmpLog.debug(msgStr) + msgStr = "checking status for transfer and perhaps ultimately triggering the transfer" + tmpStat, tmpOut = stagerCore.check_stage_out_status(jobSpec) + if tmpStat: + msgStr = " OK" + tmpLog.debug(msgStr) + elif tmpStat is None: + msgStr = " Temporary failure NG {0}".format(tmpOut) + tmpLog.debug(msgStr) + elif not tmpStat: + msgStr = " NG {0}".format(tmpOut) + tmpLog.debug(msgStr) diff --git a/pandaharvester/harvestertest/submitterTest.py b/pandaharvester/harvestertest/submitterTest.py index cc683e35..fe569ca0 100644 --- a/pandaharvester/harvestertest/submitterTest.py +++ b/pandaharvester/harvestertest/submitterTest.py @@ -14,7 +14,6 @@ signal_utils.set_suicide_handler(None) os.wait() else: - if len(sys.argv) not in (2, 4): print("Wrong number of parameters. You can either:") print(" - specify the queue name") @@ -25,27 +24,27 @@ queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) - if queueConfig.prodSourceLabel in ('user', 'managed'): + if queueConfig.prodSourceLabel in ("user", "managed"): jobType = queueConfig.prodSourceLabel else: - jobType = 'managed' # default, can be overwritten by parameters + jobType = "managed" # default, can be overwritten by parameters - resourceType = 'SCORE' # default, can be overwritten by parameters + resourceType = "SCORE" # default, can be overwritten by parameters if len(sys.argv) == 4: # jobType should be 'managed' or 'user'. If not specified will default to a production job - if sys.argv[2] in ('user', 'managed'): + if sys.argv[2] in ("user", "managed"): jobType = sys.argv[2] else: - print ('value for jobType not valid, defaulted to {0}'.format(jobType)) + print("value for jobType not valid, defaulted to {0}".format(jobType)) # resourceType should be 'SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'. 
If not specified defaults to single core - if sys.argv[3] in ('SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'): + if sys.argv[3] in ("SCORE", "SCORE_HIMEM", "MCORE", "MCORE_HIMEM"): resourceType = sys.argv[3] else: - print ('value for resourceType not valid, defaulted to {0}'.format(resourceType)) + print("value for resourceType not valid, defaulted to {0}".format(resourceType)) - print ('Running with queueName:{0}, jobType:{1}, resourceType:{2}'.format(queueName, jobType, resourceType)) + print("Running with queueName:{0}, jobType:{1}, resourceType:{2}".format(queueName, jobType, resourceType)) pluginFactory = PluginFactory() @@ -54,8 +53,7 @@ # get job jobSpecList = [] if queueConfig.mapType != WorkSpec.MT_NoJob: - jobs, errStr = com.get_jobs(queueConfig.queueName, 'nodeName', queueConfig.prodSourceLabel, - 'computingElement', 1, None) + jobs, errStr = com.get_jobs(queueConfig.queueName, "nodeName", queueConfig.prodSourceLabel, "computingElement", 1, None) if len(jobs) == 0: print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr)) sys.exit(0) @@ -66,14 +64,14 @@ # set input file paths inFiles = jobSpec.get_input_file_attributes() for inLFN, inFile in iteritems(inFiles): - inFile['path'] = '{0}/{1}'.format(os.getcwd(), inLFN) + inFile["path"] = "{0}/{1}".format(os.getcwd(), inLFN) jobSpec.set_input_file_paths(inFiles) jobSpecList.append(jobSpec) maker = pluginFactory.get_plugin(queueConfig.workerMaker) workSpec = maker.make_worker(jobSpecList, queueConfig, jobType, resourceType) - workSpec.accessPoint = queueConfig.messenger['accessPoint'] + workSpec.accessPoint = queueConfig.messenger["accessPoint"] workSpec.mapType = queueConfig.mapType workSpec.computingSite = queueConfig.queueName @@ -91,50 +89,50 @@ messenger.feed_jobs(workSpec, jobSpecList) jobSpec = jobSpecList[0] - if 'eventService' in jobSpec.jobParams: + if "eventService" in jobSpec.jobParams: workSpec.eventsRequest = WorkSpec.EV_useEvents if workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() - eventsRequestParams[jobSpec.PandaID] = {'pandaID': jobSpec.PandaID, - 'taskID': jobSpec.taskID, - 'jobsetID': jobSpec.jobParams['jobsetID'], - 'nRanges': jobSpec.jobParams['coreCount'], - } + eventsRequestParams[jobSpec.PandaID] = { + "pandaID": jobSpec.PandaID, + "taskID": jobSpec.taskID, + "jobsetID": jobSpec.jobParams["jobsetID"], + "nRanges": jobSpec.jobParams["coreCount"], + } workSpec.eventsRequestParams = eventsRequestParams tmpStat, events = com.get_event_ranges(workSpec.eventsRequestParams, False, os.getcwd()) # failed if tmpStat is False: - print ('failed to get events with {0}'.format(events)) + print("failed to get events with {0}".format(events)) sys.exit(0) tmpStat = messenger.feed_events(workSpec, events) if tmpStat is False: - print ('failed to feed events with {0}'.format(events)) + print("failed to feed events with {0}".format(events)) sys.exit(0) # get submitter plugin submitterCore = pluginFactory.get_plugin(queueConfig.submitter) - print ("testing submission with plugin={0}".format(submitterCore.__class__.__name__)) + print("testing submission with plugin={0}".format(submitterCore.__class__.__name__)) tmpRetList = submitterCore.submit_workers([workSpec]) tmpStat, tmpOut = tmpRetList[0] if tmpStat: - print (" OK batchID={0}".format(workSpec.batchID)) + print(" OK batchID={0}".format(workSpec.batchID)) else: - print (" NG {0}".format(tmpOut)) + print(" NG {0}".format(tmpOut)) sys.exit(1) - 
print ('') + print("") # get monitoring plug-in monCore = pluginFactory.get_plugin(queueConfig.monitor) - print ("testing monitoring for batchID={0} with plugin={1}".format(workSpec.batchID, - monCore.__class__.__name__)) + print("testing monitoring for batchID={0} with plugin={1}".format(workSpec.batchID, monCore.__class__.__name__)) tmpStat, tmpOut = monCore.check_workers([workSpec]) tmpOut = tmpOut[0] if tmpStat: - print (" OK workerStatus={0}".format(tmpOut[0])) + print(" OK workerStatus={0}".format(tmpOut[0])) else: - print (" NG {0}".format(tmpOut[1])) + print(" NG {0}".format(tmpOut[1])) sys.exit(1) diff --git a/pandaharvester/harvestertest/testCommunication.py b/pandaharvester/harvestertest/testCommunication.py index 890d2152..53de120d 100644 --- a/pandaharvester/harvestertest/testCommunication.py +++ b/pandaharvester/harvestertest/testCommunication.py @@ -5,10 +5,10 @@ from pandaharvester.harvestercore.communicator_pool import CommunicatorPool for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): - if loggerName.startswith('panda.log'): + if loggerName.startswith("panda.log"): if len(loggerObj.handlers) == 0: continue - if loggerName.split('.')[-1] in ['db_proxy']: + if loggerName.split(".")[-1] in ["db_proxy"]: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) @@ -16,4 +16,4 @@ a = CommunicatorPool() return_object = a.check_panda() -print (return_object) +print(return_object) diff --git a/pandaharvester/harvestertest/updateEventRangesTest.py b/pandaharvester/harvestertest/updateEventRangesTest.py index 50d13b9b..0037578a 100644 --- a/pandaharvester/harvestertest/updateEventRangesTest.py +++ b/pandaharvester/harvestertest/updateEventRangesTest.py @@ -7,23 +7,22 @@ from pandaharvester.harvestercore.communicator_pool import CommunicatorPool rID = sys.argv[1] -taskid = rID.split('-')[0] -pandaid = long(rID.split('-')[1]) +taskid = rID.split("-")[0] +pandaid = long(rID.split("-")[1]) job = JobSpec() job.PandaID = pandaid event = EventSpec() file = FileSpec() -file.status = 'finished' +file.status = "finished" file.objstoreID = 9575 file.pathConvention = 1000 -file.lfn = str(uuid.uuid4().hex) + '.zip' +file.lfn = str(uuid.uuid4().hex) + ".zip" file.fsize = 555 -file.chksum = '0d2a9dc9' +file.chksum = "0d2a9dc9" event.eventRangeID = rID -event.eventStatus = 'finished' -job.zipEventMap = {1: {'events':[event], - 'zip':file}} +event.eventStatus = "finished" +job.zipEventMap = {1: {"events": [event], "zip": file}} a = CommunicatorPool() diff --git a/pandaharvester/harvestertest/updateEvents.py b/pandaharvester/harvestertest/updateEvents.py index 1456baf4..7b7eadd5 100644 --- a/pandaharvester/harvestertest/updateEvents.py +++ b/pandaharvester/harvestertest/updateEvents.py @@ -17,26 +17,26 @@ try: os.makedirs(accessPoint) -except: +except BaseException: pass node = {} -node['eventRangeID'] = eventID -node['eventStatus'] = status +node["eventRangeID"] = eventID +node["eventStatus"] = status -f = open(os.path.join(accessPoint, shared_file_messenger.jsonEventsUpdateFileName), 'w') +f = open(os.path.join(accessPoint, shared_file_messenger.jsonEventsUpdateFileName), "w") json.dump([node], f) f.close() -if status == 'finished': - lfn = id + '.data' +if status == "finished": + lfn = id + ".data" data = {} - data['path'] = os.path.join(accessPoint, lfn) - data['type'] = 'output' - data['fsize'] = 10 * 1024 * 1024 - data['eventRangeID'] = eventID + data["path"] = os.path.join(accessPoint, lfn) + data["type"] = 
"output" + data["fsize"] = 10 * 1024 * 1024 + data["eventRangeID"] = eventID node = {} node[lfn] = data - f = open(os.path.join(accessPoint, shared_file_messenger.jsonOutputsFileName), 'w') + f = open(os.path.join(accessPoint, shared_file_messenger.jsonOutputsFileName), "w") json.dump(node, f) f.close() diff --git a/pandaharvester/harvestertest/updateJob.py b/pandaharvester/harvestertest/updateJob.py index 04e48fef..533be7be 100644 --- a/pandaharvester/harvestertest/updateJob.py +++ b/pandaharvester/harvestertest/updateJob.py @@ -13,9 +13,9 @@ try: os.makedirs(accessPoint) -except: +except BaseException: pass -f = open(os.path.join(accessPoint, 'status.txt'), 'w') +f = open(os.path.join(accessPoint, "status.txt"), "w") f.write(status) f.close() diff --git a/pandaharvester/harvestertest/watcherTest.py b/pandaharvester/harvestertest/watcherTest.py index 90a29046..39f7fc22 100644 --- a/pandaharvester/harvestertest/watcherTest.py +++ b/pandaharvester/harvestertest/watcherTest.py @@ -5,9 +5,8 @@ try: os.remove(watcher.lockFileName) -except: +except BaseException: pass watcher = Watcher(single_mode=True) watcher.run() - diff --git a/pandaharvester/harvestertest/worker_pandajob_dump.py b/pandaharvester/harvestertest/worker_pandajob_dump.py index f002d4de..f0685505 100755 --- a/pandaharvester/harvestertest/worker_pandajob_dump.py +++ b/pandaharvester/harvestertest/worker_pandajob_dump.py @@ -1,8 +1,14 @@ #!/usr/bin/env python -import os,sys,optparse,logging,sqlite3,datetime +import os +import sys +import optparse +import logging +import sqlite3 +import datetime + logger = logging.getLogger(__name__) -# jw_table columns: +# jw_table columns: # 0|PandaID|integer|0||0 # 1|workerID|integer|0||0 # 2|relationType|text|0||0 @@ -40,104 +46,112 @@ def main(): - ''' this script grabs the latest workers that have been added to the worker_table, finds their associated panda job ids from the jw_table, then presents how many jobs are in each state for that worker. It also shows the panda jobs which are in the fetched, preparing, and prepared states which have not yet been assigned to a worker. ''' - logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s:%(name)s:%(message)s') - - parser = optparse.OptionParser(description='') - parser.add_option('-d','--database-filename',dest='database_filename',help='The Harvester data base file name.') - parser.add_option('-t','--time-in-hours',dest='hours',help='this prints the workers last modified in the last N hours. last-n-workers and time-in-hours are mutually exclusive.',type='int') - parser.add_option('-n','--last-n-workers',dest='workers',help='this prints the last N workers created. 
last-n-workers and time-in-hours are mutually exclusive.',type='int') - options,args = parser.parse_args() - - - manditory_args = [ - 'database_filename', - - ] - - for man in manditory_args: - if options.__dict__[man] is None: - logger.error('Must specify option: ' + man) - parser.print_help() - sys.exit(-1) - - if options.hours and options.workers: - logger.error('can only specify time-in-hours or last-n-workers, not both') - parser.print_help() - sys.exit(-1) - elif not options.hours and not options.workers: - logger.error('must specify time-in-hours or last-n-workers') - parser.print_help() - sys.exit(-1) - - - conn = sqlite3.connect(options.database_filename) - - cursor = conn.cursor() - - if options.hours: - utcnow = datetime.datetime.utcnow() - datetime.timedelta(hours=options.hours) - utcnow_str = utcnow.strftime('%Y-%d-%m %H:%M:%S') - work_cmd = 'SELECT workerID,batchID,status FROM work_table WHERE modificationTime > "%s"' % utcnow_str - elif options.workers: - work_cmd = 'SELECT workerID,batchID,status FROM work_table ORDER BY workerID DESC LIMIT %s' % options.workers - cursor.execute(work_cmd) - - work_entries = cursor.fetchall() - - for work_entry in work_entries: - workerID,batchID,workerStatus = work_entry - - jobs_in_state = {} - jobs_in_substate = {} - - jw_cmd = 'SELECT * FROM jw_table WHERE workerID=%s' % workerID - - cursor.execute(jw_cmd) - jw_entries = cursor.fetchall() - - for jw_entry in jw_entries: - pandaID,workerID,relationType = jw_entry - - job_cmd = 'SELECT status,subStatus FROM job_table WHERE PandaID=%s' % pandaID - - cursor.execute(job_cmd) - job_info = cursor.fetchall()[0] - jobStatus,jobSubStatus = job_info - if jobStatus in jobs_in_state: - jobs_in_state[jobStatus] += 1 - else: - jobs_in_state[jobStatus] = 1 - if jobSubStatus in jobs_in_substate: - jobs_in_substate[jobSubStatus] += 1 - else: - jobs_in_substate[jobSubStatus] = 1 - #logger.info('pandaID: %s status: %s subStatus: %s',pandaID,status,subStatus) - string = 'job status = [' - for job_status,count in jobs_in_state.iteritems(): - string += ' %s(%s)' % (job_status,count) - string += '] subStatus = {' - for job_substatus,count in jobs_in_substate.iteritems(): - string += '%s(%s)' % (job_substatus,count) - string += '}' - logger.info('workerID: %s; batchID: %s; worker status: %s; %s',workerID,batchID,workerStatus,string) - - cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE subStatus="fetched"' - cursor.execute(cmd) - jobs = cursor.fetchall() - logger.info('panda jobs in fetched: %s',len(jobs)) - - cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE subStatus="preparing"' - cursor.execute(cmd) - jobs = cursor.fetchall() - logger.info('panda jobs in preparing: %s',len(jobs)) - - cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE subStatus="prepared"' - cursor.execute(cmd) - jobs = cursor.fetchall() - logger.info('panda jobs in prepared: %s',len(jobs)) - + """this script grabs the latest workers that have been added to the worker_table, finds their associated panda job ids from the jw_table, then presents how many jobs are in each state for that worker. 
It also shows the panda jobs which are in the fetched, preparing, and prepared states which have not yet been assigned to a worker.""" + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s:%(name)s:%(message)s") + + parser = optparse.OptionParser(description="") + parser.add_option("-d", "--database-filename", dest="database_filename", help="The Harvester data base file name.") + parser.add_option( + "-t", + "--time-in-hours", + dest="hours", + help="this prints the workers last modified in the last N hours. last-n-workers and time-in-hours are mutually exclusive.", + type="int", + ) + parser.add_option( + "-n", + "--last-n-workers", + dest="workers", + help="this prints the last N workers created. last-n-workers and time-in-hours are mutually exclusive.", + type="int", + ) + options, args = parser.parse_args() + + manditory_args = [ + "database_filename", + ] + + for man in manditory_args: + if options.__dict__[man] is None: + logger.error("Must specify option: " + man) + parser.print_help() + sys.exit(-1) + + if options.hours and options.workers: + logger.error("can only specify time-in-hours or last-n-workers, not both") + parser.print_help() + sys.exit(-1) + elif not options.hours and not options.workers: + logger.error("must specify time-in-hours or last-n-workers") + parser.print_help() + sys.exit(-1) + + conn = sqlite3.connect(options.database_filename) + + cursor = conn.cursor() + + if options.hours: + utcnow = datetime.datetime.utcnow() - datetime.timedelta(hours=options.hours) + utcnow_str = utcnow.strftime("%Y-%d-%m %H:%M:%S") + work_cmd = 'SELECT workerID,batchID,status FROM work_table WHERE modificationTime > "%s"' % utcnow_str + elif options.workers: + work_cmd = "SELECT workerID,batchID,status FROM work_table ORDER BY workerID DESC LIMIT %s" % options.workers + cursor.execute(work_cmd) + + work_entries = cursor.fetchall() + + for work_entry in work_entries: + workerID, batchID, workerStatus = work_entry + + jobs_in_state = {} + jobs_in_substate = {} + + jw_cmd = "SELECT * FROM jw_table WHERE workerID=%s" % workerID + + cursor.execute(jw_cmd) + jw_entries = cursor.fetchall() + + for jw_entry in jw_entries: + pandaID, workerID, relationType = jw_entry + + job_cmd = "SELECT status,subStatus FROM job_table WHERE PandaID=%s" % pandaID + + cursor.execute(job_cmd) + job_info = cursor.fetchall()[0] + jobStatus, jobSubStatus = job_info + if jobStatus in jobs_in_state: + jobs_in_state[jobStatus] += 1 + else: + jobs_in_state[jobStatus] = 1 + if jobSubStatus in jobs_in_substate: + jobs_in_substate[jobSubStatus] += 1 + else: + jobs_in_substate[jobSubStatus] = 1 + # logger.info('pandaID: %s status: %s subStatus: %s',pandaID,status,subStatus) + string = "job status = [" + for job_status, count in jobs_in_state.iteritems(): + string += " %s(%s)" % (job_status, count) + string += "] subStatus = {" + for job_substatus, count in jobs_in_substate.iteritems(): + string += "%s(%s)" % (job_substatus, count) + string += "}" + logger.info("workerID: %s; batchID: %s; worker status: %s; %s", workerID, batchID, workerStatus, string) + + cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE subStatus="fetched"' + cursor.execute(cmd) + jobs = cursor.fetchall() + logger.info("panda jobs in fetched: %s", len(jobs)) + + cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE subStatus="preparing"' + cursor.execute(cmd) + jobs = cursor.fetchall() + logger.info("panda jobs in preparing: %s", len(jobs)) + + cmd = 'SELECT PandaID,status,subStatus FROM job_table WHERE 
subStatus="prepared"' + cursor.execute(cmd) + jobs = cursor.fetchall() + logger.info("panda jobs in prepared: %s", len(jobs)) -if __name__ == "__main__": - main() +if __name__ == "__main__": + main() diff --git a/pandaharvester/harvesterthrottler/simple_throttler.py b/pandaharvester/harvesterthrottler/simple_throttler.py index 83de09d1..76c79be0 100644 --- a/pandaharvester/harvesterthrottler/simple_throttler.py +++ b/pandaharvester/harvesterthrottler/simple_throttler.py @@ -5,7 +5,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase # logger -baseLogger = core_utils.setup_logger('simple_throttler') +baseLogger = core_utils.setup_logger("simple_throttler") # simple throttler @@ -13,17 +13,16 @@ class SimpleThrottler(PluginBase): # constructor def __init__(self, **kwarg): # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied - self.logicType = 'OR' + self.logicType = "OR" PluginBase.__init__(self, **kwarg) self.dbProxy = DBProxy() # check if to be throttled def to_be_throttled(self, queue_config): - tmpLog = self.make_logger(baseLogger, 'computingSite={0}'.format(queue_config.queueName), - method_name='to_be_throttled') - tmpLog.debug('start') + tmpLog = self.make_logger(baseLogger, "computingSite={0}".format(queue_config.queueName), method_name="to_be_throttled") + tmpLog.debug("start") # set default return vale - if self.logicType == 'OR': + if self.logicType == "OR": retVal = False, "no rule was satisfied" else: retVal = True, "all rules were satisfied" @@ -33,43 +32,43 @@ def to_be_throttled(self, queue_config): timeNow = datetime.datetime.utcnow() for rule in self.rulesForMissed: # convert rule to criteria - if rule['level'] == 'site': + if rule["level"] == "site": criteria = dict() - criteria['siteName'] = queue_config.siteName - criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow']) + criteria["siteName"] = queue_config.siteName + criteria["timeLimit"] = timeNow - datetime.timedelta(minutes=rule["timeWindow"]) criteriaList.append(criteria) - maxMissedList.append(rule['maxMissed']) - elif rule['level'] == 'pq': + maxMissedList.append(rule["maxMissed"]) + elif rule["level"] == "pq": criteria = dict() - criteria['computingSite'] = queue_config.queueName - criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow']) + criteria["computingSite"] = queue_config.queueName + criteria["timeLimit"] = timeNow - datetime.timedelta(minutes=rule["timeWindow"]) criteriaList.append(criteria) - maxMissedList.append(rule['maxMissed']) - elif rule['level'] == 'ce': - elmName = 'computingElements' + maxMissedList.append(rule["maxMissed"]) + elif rule["level"] == "ce": + elmName = "computingElements" if elmName not in queue_config.submitter: - tmpLog.debug('skipped since {0} is undefined in submitter config'.format(elmName)) + tmpLog.debug("skipped since {0} is undefined in submitter config".format(elmName)) continue for ce in queue_config.submitter[elmName]: criteria = dict() - criteria['computingElement'] = ce - criteria['timeLimit'] = timeNow - datetime.timedelta(minutes=rule['timeWindow']) + criteria["computingElement"] = ce + criteria["timeLimit"] = timeNow - datetime.timedelta(minutes=rule["timeWindow"]) criteriaList.append(criteria) - maxMissedList.append(rule['maxMissed']) + maxMissedList.append(rule["maxMissed"]) # loop over all criteria for criteria, maxMissed in zip(criteriaList, maxMissedList): nMissed = self.dbProxy.get_num_missed_workers(queue_config.queueName, criteria) if nMissed > 
maxMissed: - if self.logicType == 'OR': - tmpMsg = 'logic={0} and '.format(self.logicType) - tmpMsg += 'nMissed={0} > maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria)) + if self.logicType == "OR": + tmpMsg = "logic={0} and ".format(self.logicType) + tmpMsg += "nMissed={0} > maxMissed={1} for {2}".format(nMissed, maxMissed, str(criteria)) retVal = True, tmpMsg break else: - if self.logicType == 'AND': - tmpMsg = 'logic={0} and '.format(self.logicType) - tmpMsg += 'nMissed={0} <= maxMissed={1} for {2}'.format(nMissed, maxMissed, str(criteria)) + if self.logicType == "AND": + tmpMsg = "logic={0} and ".format(self.logicType) + tmpMsg += "nMissed={0} <= maxMissed={1} for {2}".format(nMissed, maxMissed, str(criteria)) retVal = False, tmpMsg break - tmpLog.debug('ret={0} : {1}'.format(*retVal)) + tmpLog.debug("ret={0} : {1}".format(*retVal)) return retVal diff --git a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py index beb29294..69d51154 100644 --- a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py @@ -5,6 +5,7 @@ # dummy worker maker + class DummyDynamicWorkerMaker(BaseWorkerMaker): # constructor def __init__(self, **kwarg): @@ -21,20 +22,20 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec.maxWalltime = 0 for jobSpec in jobspec_list: try: - workSpec.nCore += jobSpec.jobParams['coreCount'] + workSpec.nCore += jobSpec.jobParams["coreCount"] except Exception: workSpec.nCore += 1 try: - workSpec.minRamCount += jobSpec.jobParams['minRamCount'] + workSpec.minRamCount += jobSpec.jobParams["minRamCount"] except Exception: pass try: - workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount'] + workSpec.maxDiskCount += jobSpec.jobParams["maxDiskCount"] except Exception: pass try: - if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"): - workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime']) + if jobSpec.jobParams["maxWalltime"] not in (None, "NULL"): + workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams["maxWalltime"]) else: workSpec.maxWalltime = queue_config.walltimeLimit except Exception: diff --git a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py index 7de1f4f5..a7adbf6a 100644 --- a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py @@ -5,31 +5,31 @@ # multijob worker maker. one job per node. 
aprun as executor (initially) # static parameters collected from queue config file # pilot -baseLogger = core_utils.setup_logger('multijob_workermaker') +baseLogger = core_utils.setup_logger("multijob_workermaker") class MultiJobWorkerMaker(BaseWorkerMaker): # constructor def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) - tmpLog = self.make_logger(baseLogger, method_name='__init__') + tmpLog = self.make_logger(baseLogger, method_name="__init__") tmpLog.info("Multijob workermaker") def _get_executable(self, queue_config): # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters exe_str = "" - tmpLog = self.make_logger(baseLogger, method_name='_get_executable') + tmpLog = self.make_logger(baseLogger, method_name="_get_executable") # prepare static enviroment env_str = "" if self.env not in (None, "NULL"): - env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", "))) + env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", "))) # prepare executor try: if self.executor == "aprun": # "aprun -n [number of required nodes/jobs] -d [number of cpu per node/job]" - for one multicore job per node - exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, queue_config.submitter['nCorePerNode']) + exe_str = self.executor + " -n {0} -d {1} ".format(self.nJobsPerWorker, queue_config.submitter["nCorePerNode"]) exe_str += self.pilot else: exe_str = self.executor + " " + self.pilot @@ -46,13 +46,12 @@ def _get_executable(self, queue_config): # make a worker from a job with a disk access point def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmpLog = self.make_logger(baseLogger, method_name='make_worker') + tmpLog = self.make_logger(baseLogger, method_name="make_worker") workSpec = WorkSpec() self.nJobsPerWorker = len(jobspec_list) tmpLog.info("Worker for {0} jobs will be prepared".format(self.nJobsPerWorker)) if self.nJobsPerWorker > 0: - - workSpec.nCore = int(queue_config.submitter['nCorePerNode']) * self.nJobsPerWorker + workSpec.nCore = int(queue_config.submitter["nCorePerNode"]) * self.nJobsPerWorker workSpec.minRamCount = 0 workSpec.maxDiskCount = 0 workSpec.maxWalltime = 0 @@ -61,11 +60,11 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog.debug("Wall time limit for worker: {0}".format(workSpec.maxWalltime)) for jobSpec in jobspec_list: try: - workSpec.minRamCount = max(workSpec.minRamCount, jobSpec.jobParams['minRamCount']) + workSpec.minRamCount = max(workSpec.minRamCount, jobSpec.jobParams["minRamCount"]) except Exception: pass try: - workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount'] + workSpec.maxDiskCount += jobSpec.jobParams["maxDiskCount"] except Exception: pass # try: we should not relay on job parameters yet (not relaible) diff --git a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py index c03c44cf..d0a632af 100644 --- a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py @@ -9,7 +9,7 @@ # dynamic parametrs from infrastructure through plugins -baseLogger = core_utils.setup_logger('multinode_workermaker') +baseLogger = core_utils.setup_logger("multinode_workermaker") class MultiNodeWorkerMaker(BaseWorkerMaker): @@ -18,7 +18,7 @@ def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) self.pluginFactory = PluginFactory() self.queue_config_mapper = 
QueueConfigMapper() - tmpLog = self.make_logger(baseLogger, method_name='__init__') + tmpLog = self.make_logger(baseLogger, method_name="__init__") tmpLog.info("Multinode workermaker: created.") tmpLog.debug("Queue name: {0}".format(self.queueName)) if self.mode == "static": @@ -32,12 +32,12 @@ def _get_executable(self): # return string which contain body of script for scheduler: specific enviroment setup, executor with parameters exe_str = "" - tmpLog = self.make_logger(baseLogger, method_name='_get_executable') + tmpLog = self.make_logger(baseLogger, method_name="_get_executable") # prepare static enviroment env_str = "" if self.env not in (None, "NULL"): - env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", "))) + env_str = "\n".join(map(lambda s: s.strip(), self.env.split(", "))) # prepare executor try: @@ -59,15 +59,13 @@ def _get_executable(self): # make a worker from jobs def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName), - method_name='make_worker') + tmpLog = core_utils.make_logger(baseLogger, "queue={0}".format(queue_config.queueName), method_name="make_worker") tmpLog.info("Multi node worker preparation started.") - tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit, - self.nNodes)) + tmpLog.info("Worker size: {0} jobs on {2} nodes for {1} sec.".format(self.nJobsPerWorker, self.walltimelimit, self.nNodes)) workSpec = WorkSpec() - workSpec.nCore = self.nNodes * queue_config.submitter['nCorePerNode'] + workSpec.nCore = self.nNodes * queue_config.submitter["nCorePerNode"] workSpec.minRamCount = 0 workSpec.maxDiskCount = 0 workSpec.maxWalltime = self.walltimelimit @@ -77,23 +75,21 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): # push case: we know the job and set the parameters of the job for jobSpec in jobspec_list: try: - workSpec.minRamCount += jobSpec.jobParams['minRamCount'] + workSpec.minRamCount += jobSpec.jobParams["minRamCount"] except Exception: pass try: - workSpec.maxDiskCount += jobSpec.jobParams['maxDiskCount'] + workSpec.maxDiskCount += jobSpec.jobParams["maxDiskCount"] except Exception: pass - #try: + # try: # if jobSpec.jobParams['maxWalltime'] not in (None, "NULL"): # workSpec.maxWalltime = max(int(queue_config.walltimeLimit), jobSpec.jobParams['maxWalltime']) # else: # workSpec.maxWalltime = queue_config.walltimeLimit - #except Exception: + # except Exception: # pass - tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. defined".format(self.nNodes, - workSpec.maxWalltime, - self.nJobsPerWorker)) + tmpLog.info("Worker for {0} nodes with {2} jobs with walltime {1} sec. 
defined".format(self.nNodes, workSpec.maxWalltime, self.nJobsPerWorker)) return workSpec @@ -120,8 +116,7 @@ def get_resources(self): """ Function to get resourcese and map them to number of jobs """ - tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(self.queueName), - method_name='get_resources') + tmpLog = core_utils.make_logger(baseLogger, "queue={0}".format(self.queueName), method_name="get_resources") njobs = 0 walltime = self.walltimelimit queue_config = self.queue_config_mapper.get_queue(self.queueName) @@ -133,4 +128,3 @@ def get_resources(self): nodes = self.nNodes return nodes, walltime - diff --git a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py index bc2d7dad..8736fa80 100644 --- a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py @@ -1,6 +1,7 @@ import datetime import math import traceback + try: import subprocess32 as subprocess except Exception: @@ -15,13 +16,13 @@ # simple backfill eventservice maker # logger -_logger = core_utils.setup_logger('simple_bf_worker_maker') +_logger = core_utils.setup_logger("simple_bf_worker_maker") class SimpleBackfillESWorkerMaker(BaseWorkerMaker): # constructor def __init__(self, **kwarg): - self.jobAttributesToUse = ['nCore', 'minRamCount', 'maxDiskCount', 'maxWalltime'] + self.jobAttributesToUse = ["nCore", "minRamCount", "maxDiskCount", "maxWalltime"] self.adjusters = None BaseWorkerMaker.__init__(self, **kwarg) self.init_adjusters_defaults() @@ -29,10 +30,9 @@ def __init__(self, **kwarg): # make a worker from jobs def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName), - method_name='make_worker') + tmpLog = self.make_logger(_logger, "queue={0}".format(queue_config.queueName), method_name="make_worker") - tmpLog.debug('jobspec_list: {0}'.format(jobspec_list)) + tmpLog.debug("jobspec_list: {0}".format(jobspec_list)) workSpec = WorkSpec() workSpec.creationTime = datetime.datetime.utcnow() @@ -40,9 +40,9 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): # get the queue configuration from the DB panda_queues_dict = PandaQueuesDict() queue_dict = panda_queues_dict.get(queue_config.queueName, {}) - workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1 - workSpec.maxWalltime = queue_dict.get('maxtime', 1) - workSpec.maxDiskCount = queue_dict.get('maxwdir', 1) + workSpec.minRamCount = queue_dict.get("maxrss", 1) or 1 + workSpec.maxWalltime = queue_dict.get("maxtime", 1) + workSpec.maxDiskCount = queue_dict.get("maxwdir", 1) # get info from jobs if len(jobspec_list) > 0: @@ -56,12 +56,12 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec.maxWalltime = maxWalltime # TODO: this needs to be improved with real resource types - if resource_type and resource_type != 'ANY': + if resource_type and resource_type != "ANY": workSpec.resourceType = resource_type elif workSpec.nCore == 1: - workSpec.resourceType = 'SCORE' + workSpec.resourceType = "SCORE" else: - workSpec.resourceType = 'MCORE' + workSpec.resourceType = "MCORE" return workSpec @@ -76,8 +76,7 @@ def get_num_workers_per_job(self, n_workers): # check number of ready resources def num_ready_resources(self): # make logger - tmpLog = self.make_logger(_logger, 'simple_bf_es_maker', - method_name='num_ready_resources') + tmpLog = 
self.make_logger(_logger, "simple_bf_es_maker", method_name="num_ready_resources") try: resources = self.get_bf_resources() @@ -102,26 +101,27 @@ def init_adjusters_defaults(self): "minCapacity": * , "maxCapacity": * }] """ - adj_defaults = {"minNodes": 1, - "maxNodes": 125, - "minWalltimeSeconds": 1800, - "maxWalltimeSeconds": 7200, - "nodesToDecrease": 1, - "walltimeSecondsToDecrease": 60} + adj_defaults = { + "minNodes": 1, + "maxNodes": 125, + "minWalltimeSeconds": 1800, + "maxWalltimeSeconds": 7200, + "nodesToDecrease": 1, + "walltimeSecondsToDecrease": 60, + } if self.adjusters: for adjuster in self.adjusters: for key, value in adj_defaults.items(): if key not in adjuster: adjuster[key] = value - adjuster['minCapacity'] = adjuster['minWalltimeSeconds'] * adjuster['minNodes'] - adjuster['maxCapacity'] = adjuster['maxWalltimeSeconds'] * adjuster['maxNodes'] - self.adjusters.sort(key=lambda my_dict: my_dict['minNodes']) + adjuster["minCapacity"] = adjuster["minWalltimeSeconds"] * adjuster["minNodes"] + adjuster["maxCapacity"] = adjuster["maxWalltimeSeconds"] * adjuster["maxNodes"] + self.adjusters.sort(key=lambda my_dict: my_dict["minNodes"]) # get backfill resources def get_bf_resources(self, blocking=True): # make logger - tmpLog = self.make_logger(_logger, 'simple_bf_es_maker', - method_name='get_bf_resources') + tmpLog = self.make_logger(_logger, "simple_bf_es_maker", method_name="get_bf_resources") resources = [] # command if blocking: @@ -129,15 +129,12 @@ def get_bf_resources(self, blocking=True): else: comStr = "showbf -p {0}".format(self.partition) # get backfill resources - tmpLog.debug('Get backfill resources with {0}'.format(comStr)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + tmpLog.debug("Get backfill resources with {0}".format(comStr)) + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode - tmpLog.debug('retCode={0}'.format(retCode)) + tmpLog.debug("retCode={0}".format(retCode)) if retCode == 0: # extract batchID tmpLog.debug("Available backfill resources for partition(%s):\n%s" % (self.partition, stdOut)) @@ -151,53 +148,52 @@ def get_bf_resources(self, blocking=True): if nodes < self.minNodes: continue walltime = items[3] - resources.append({'nodes': nodes, 'walltime': walltime}) - except: + resources.append({"nodes": nodes, "walltime": walltime}) + except BaseException: tmpLog.error("Failed to parse line: %s" % line) else: # failed - errStr = stdOut + ' ' + stdErr + errStr = stdOut + " " + stdErr tmpLog.error(errStr) tmpLog.info("Available backfill resources: %s" % resources) return resources def get_adjuster(self, nodes): for adj in self.adjusters: - if nodes >= adj['minNodes'] and nodes <= adj['maxNodes']: + if nodes >= adj["minNodes"] and nodes <= adj["maxNodes"]: return adj return None def adjust_resources(self, resources): # make logger - tmpLog = self.make_logger(_logger, 'simple_bf_es_maker', - method_name='adjust_resources') + tmpLog = self.make_logger(_logger, "simple_bf_es_maker", method_name="adjust_resources") ret_resources = [] for resource in resources: - if resource['nodes'] > self.maxNodes: + if resource["nodes"] > self.maxNodes: nodes = self.maxNodes else: - nodes = resource['nodes'] + nodes = resource["nodes"] adjuster = self.get_adjuster(nodes) if adjuster: - if (resource['nodes'] - adjuster['nodesToDecrease']) < nodes: - nodes = resource['nodes'] - 
adjuster['nodesToDecrease'] + if (resource["nodes"] - adjuster["nodesToDecrease"]) < nodes: + nodes = resource["nodes"] - adjuster["nodesToDecrease"] if nodes <= 0: continue - walltime = resource['walltime'] - if walltime == 'INFINITY': - walltime = adjuster['maxWalltimeSeconds'] - ret_resources.append({'nodes': nodes, 'walltime': walltime, 'nCore': nodes * self.nCorePerNode}) + walltime = resource["walltime"] + if walltime == "INFINITY": + walltime = adjuster["maxWalltimeSeconds"] + ret_resources.append({"nodes": nodes, "walltime": walltime, "nCore": nodes * self.nCorePerNode}) else: - h, m, s = walltime.split(':') + h, m, s = walltime.split(":") walltime = int(h) * 3600 + int(m) * 60 + int(s) - if walltime >= adjuster['minWalltimeSeconds'] and walltime < adjuster['maxWalltimeSeconds']: - walltime -= adjuster['walltimeSecondsToDecrease'] - ret_resources.append({'nodes': nodes, 'walltime': walltime, 'nCore': nodes * self.nCorePerNode}) - elif walltime >= adjuster['maxWalltimeSeconds']: - walltime = adjuster['maxWalltimeSeconds'] - adjuster['walltimeSecondsToDecrease'] - ret_resources.append({'nodes': nodes, 'walltime': walltime, 'nCore': nodes * self.nCorePerNode}) - ret_resources.sort(key=lambda my_dict: my_dict['nodes'] * my_dict['walltime'], reverse=True) + if walltime >= adjuster["minWalltimeSeconds"] and walltime < adjuster["maxWalltimeSeconds"]: + walltime -= adjuster["walltimeSecondsToDecrease"] + ret_resources.append({"nodes": nodes, "walltime": walltime, "nCore": nodes * self.nCorePerNode}) + elif walltime >= adjuster["maxWalltimeSeconds"]: + walltime = adjuster["maxWalltimeSeconds"] - adjuster["walltimeSecondsToDecrease"] + ret_resources.append({"nodes": nodes, "walltime": walltime, "nCore": nodes * self.nCorePerNode}) + ret_resources.sort(key=lambda my_dict: my_dict["nodes"] * my_dict["walltime"], reverse=True) tmpLog.info("Available backfill resources after adjusting: %s" % ret_resources) return ret_resources @@ -206,49 +202,48 @@ def get_dynamic_resource(self, queue_name, job_type, resource_type): if resources: resources = self.adjust_resources(resources) if resources: - return {'nNewWorkers': 1, 'resources': resources} + return {"nNewWorkers": 1, "resources": resources} return {} def get_needed_nodes_walltime(self, availNodes, availWalltime, neededCapacity): - tmpLog = self.make_logger(_logger, 'simple_bf_es_maker', - method_name='get_needed_nodes_walltime') + tmpLog = self.make_logger(_logger, "simple_bf_es_maker", method_name="get_needed_nodes_walltime") solutions = [] spareNodes = 1 # one Yoda node which doesn't process any events for adj in self.adjusters: - if availNodes < adj['minNodes']: + if availNodes < adj["minNodes"]: continue - solutionNodes = min(availNodes, adj['maxNodes']) - solutionWalltime = min(availWalltime, adj['maxWalltimeSeconds'] - adj['walltimeSecondsToDecrease']) + solutionNodes = min(availNodes, adj["maxNodes"]) + solutionWalltime = min(availWalltime, adj["maxWalltimeSeconds"] - adj["walltimeSecondsToDecrease"]) if neededCapacity >= (solutionNodes - spareNodes) * solutionWalltime: - solutions.append({'solutionNodes': solutionNodes, 'solutionWalltime': solutionWalltime}) + solutions.append({"solutionNodes": solutionNodes, "solutionWalltime": solutionWalltime}) else: solutionNodes = neededCapacity / solutionWalltime + spareNodes - if solutionNodes >= adj['minNodes']: - solutions.append({'solutionNodes': solutionNodes, 'solutionWalltime': solutionWalltime}) + if solutionNodes >= adj["minNodes"]: + solutions.append({"solutionNodes": solutionNodes, 
"solutionWalltime": solutionWalltime}) else: - solutionNodes = adj['minNodes'] + solutionNodes = adj["minNodes"] requiredWalltime = neededCapacity / (solutionNodes - spareNodes) - if requiredWalltime >= adj['minWalltimeSeconds']: + if requiredWalltime >= adj["minWalltimeSeconds"]: # walltime can be bigger than the requiredWalltime, will exit automatically - solutions.append({'solutionNodes': solutionNodes, 'solutionWalltime': solutionWalltime}) + solutions.append({"solutionNodes": solutionNodes, "solutionWalltime": solutionWalltime}) def solution_compare(x, y): - if x['solutionWalltime'] - y['solutionWalltime'] != 0: - return x['solutionWalltime'] - y['solutionWalltime'] + if x["solutionWalltime"] - y["solutionWalltime"] != 0: + return x["solutionWalltime"] - y["solutionWalltime"] else: - return x['solutionNodes'] - y['solutionNodes'] + return x["solutionNodes"] - y["solutionNodes"] + solutions.sort(cmp=solution_compare, reverse=True) tmpLog.info("Available solutions: %s" % solutions) if solutions: - return solutions[0]['solutionNodes'], solutions[0]['solutionWalltime'] + return solutions[0]["solutionNodes"], solutions[0]["solutionWalltime"] else: None # calculate needed cores and maxwalltime def calculate_worker_requirements(self, nRemainingEvents): - tmpLog = self.make_logger(_logger, 'simple_bf_es_maker', - method_name='calculate_worker_requirements') - if not hasattr(self, 'nSecondsPerEvent') or self.nSecondsPerEvent < 100: + tmpLog = self.make_logger(_logger, "simple_bf_es_maker", method_name="calculate_worker_requirements") + if not hasattr(self, "nSecondsPerEvent") or self.nSecondsPerEvent < 100: tmpLog.warn("nSecondsPerEvent is not set, will use default value 480 seconds(8 minutes)") nSecondsPerEvent = 480 else: @@ -259,21 +254,21 @@ def calculate_worker_requirements(self, nRemainingEvents): if self.dyn_resources: resource = self.dyn_resources.pop(0) tmpLog.debug("Selected dynamic resources: %s" % resource) - walltime = resource['walltime'] + walltime = resource["walltime"] if nRemainingEvents <= 0: - if resource['nodes'] < self.defaultNodes: - nCore = resource['nodes'] * self.nCorePerNode + if resource["nodes"] < self.defaultNodes: + nCore = resource["nodes"] * self.nCorePerNode else: tmpLog.warn("nRemainingEvents is not correctly propagated or delayed, will not submit big jobs, shrink number of nodes to default") nCore = self.defaultNodes * self.nCorePerNode else: neededCapacity = nRemainingEvents * nSecondsPerEvent * 1.0 / self.nCorePerNode - tmpLog.info("nRemainingEvents: %s, nSecondsPerEvent: %s, nCorePerNode: %s, neededCapacity(nodes*walltime): %s" % (nRemainingEvents, - nSecondsPerEvent, - self.nCorePerNode, - neededCapacity)) + tmpLog.info( + "nRemainingEvents: %s, nSecondsPerEvent: %s, nCorePerNode: %s, neededCapacity(nodes*walltime): %s" + % (nRemainingEvents, nSecondsPerEvent, self.nCorePerNode, neededCapacity) + ) - neededNodes, neededWalltime = self.get_needed_nodes_walltime(resource['nodes'], walltime, neededCapacity) + neededNodes, neededWalltime = self.get_needed_nodes_walltime(resource["nodes"], walltime, neededCapacity) tmpLog.info("neededNodes: %s, neededWalltime: %s" % (neededNodes, neededWalltime)) neededNodes = int(math.ceil(neededNodes)) walltime = int(neededWalltime) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index e0f73974..97a37a03 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py 
@@ -12,27 +12,26 @@ import datetime # logger -_logger = core_utils.setup_logger('simple_worker_maker') +_logger = core_utils.setup_logger("simple_worker_maker") # simple maker class SimpleWorkerMaker(BaseWorkerMaker): # constructor def __init__(self, **kwarg): - self.jobAttributesToUse = ['nCore', 'minRamCount', 'maxDiskCount', 'maxWalltime', 'ioIntensity'] + self.jobAttributesToUse = ["nCore", "minRamCount", "maxDiskCount", "maxWalltime", "ioIntensity"] BaseWorkerMaker.__init__(self, **kwarg) self.rt_mapper = ResourceTypeMapper() def get_job_core_and_memory(self, queue_dict, job_spec): + job_memory = job_spec.jobParams.get("minRamCount", 0) or 0 + job_corecount = job_spec.jobParams.get("coreCount", 1) or 1 - job_memory = job_spec.jobParams.get('minRamCount', 0) or 0 - job_corecount = job_spec.jobParams.get('coreCount', 1) or 1 - - is_ucore = queue_dict.get('capability', '') == 'ucore' + is_ucore = queue_dict.get("capability", "") == "ucore" if not job_memory and is_ucore: - site_maxrss = queue_dict.get('maxrss', 0) or 0 - site_corecount = queue_dict.get('corecount', 1) or 1 + site_maxrss = queue_dict.get("maxrss", 0) or 0 + site_corecount = queue_dict.get("corecount", 1) or 1 if job_corecount == 1: job_memory = int(math.ceil(site_maxrss / site_corecount)) @@ -42,52 +41,50 @@ def get_job_core_and_memory(self, queue_dict, job_spec): return job_corecount, job_memory def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None): - - queue_type = queue_dict.get('type', None) + queue_type = queue_dict.get("type", None) # 1. get prodSourceLabel from job (PUSH) - if job_spec and 'prodSourceLabel' in job_spec.jobParams: - job_type_final = job_spec.jobParams['prodSourceLabel'] + if job_spec and "prodSourceLabel" in job_spec.jobParams: + job_type_final = job_spec.jobParams["prodSourceLabel"] # 2. get prodSourceLabel from the specified job_type (PULL UPS) elif job_type: job_type_final = job_type if tmp_prodsourcelabel: - if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda', 'managed'): + if queue_type != "analysis" and tmp_prodsourcelabel not in ("user", "panda", "managed"): # for production, unified or other types of queues we need to run neutral prodsourcelabels # with production proxy since they can't be distinguished and can fail - job_type_final = 'managed' + job_type_final = "managed" # 3. 
convert the prodSourcelabel from the queue configuration or leave it empty (PULL) else: # map CRIC types to PanDA types - if queue_type == 'analysis': - job_type_final = 'user' - elif queue_type == 'production': - job_type_final = 'managed' + if queue_type == "analysis": + job_type_final = "user" + elif queue_type == "production": + job_type_final = "managed" else: job_type_final = None return job_type_final def capability_to_rtype(self, capability): - if capability == 'score': - return 'SCORE' - elif capability == 'himem': - return 'SCORE_HIMEM' - elif capability == 'mcore': - return 'MCORE' - elif capability == 'mcorehimem': - return 'MCORE_HIMEM' - else: - return None + if capability == "score": + return "SCORE" + elif capability == "himem": + return "SCORE_HIMEM" + elif capability == "mcore": + return "MCORE" + elif capability == "mcorehimem": + return "MCORE_HIMEM" + else: + return None # make a worker from jobs def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmpLog = self.make_logger(_logger, 'queue={0}:{1}:{2}'.format(queue_config.queueName, job_type, resource_type), - method_name='make_worker') + tmpLog = self.make_logger(_logger, "queue={0}:{1}:{2}".format(queue_config.queueName, job_type, resource_type), method_name="make_worker") - tmpLog.debug('jobspec_list: {0}'.format(jobspec_list)) + tmpLog.debug("jobspec_list: {0}".format(jobspec_list)) workSpec = WorkSpec() workSpec.creationTime = datetime.datetime.utcnow() @@ -97,23 +94,22 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): queue_dict = panda_queues_dict.get(queue_config.queueName, {}) associated_params_dict = panda_queues_dict.get_harvester_params(queue_config.queueName) - is_ucore = queue_dict.get('capability', '') == 'ucore' + is_ucore = queue_dict.get("capability", "") == "ucore" # case of traditional (non-ucore) queue: look at the queue configuration if not is_ucore: - workSpec.nCore = queue_dict.get('corecount', 1) or 1 - workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1 + workSpec.nCore = queue_dict.get("corecount", 1) or 1 + workSpec.minRamCount = queue_dict.get("maxrss", 1) or 1 # case of unified queue: look at the job & resource type and queue configuration else: - catchall = queue_dict.get('catchall', '') - if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified', - 'Taiwan-LCG2-HPC_Unified', 'DESY-ZN_UCORE'): + catchall = queue_dict.get("catchall", "") + if "useMaxRam" in catchall or queue_config.queueName in ("Taiwan-LCG2-HPC2_Unified", "Taiwan-LCG2-HPC_Unified", "DESY-ZN_UCORE"): # temporary hack to debug killed workers in Taiwan queues - site_corecount = queue_dict.get('corecount', 1) or 1 - site_maxrss = queue_dict.get('maxrss', 1) or 1 + site_corecount = queue_dict.get("corecount", 1) or 1 + site_maxrss = queue_dict.get("maxrss", 1) or 1 # some cases need to overwrite those values - if 'SCORE' in resource_type: + if "SCORE" in resource_type: # the usual pilot streaming use case workSpec.nCore = 1 workSpec.minRamCount = int(math.ceil(site_maxrss / site_corecount)) @@ -122,17 +118,18 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec.nCore = site_corecount workSpec.minRamCount = site_maxrss else: - if not len(jobspec_list) and resource_type not in ['SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM']: + if not len(jobspec_list) and resource_type not in ["SCORE", "SCORE_HIMEM", "MCORE", "MCORE_HIMEM"]: # some testing PQs have ucore + pure pull, need to default to SCORE - 
tmpLog.warning('Invalid resource type "{resource_type}" (perhaps due to ucore with pure pull); default to SCORE'.format(resource_type=resource_type)) - resource_type = 'SCORE' - workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(resource_type, - queue_dict) + tmpLog.warning( + 'Invalid resource type "{resource_type}" (perhaps due to ucore with pure pull); default to SCORE'.format(resource_type=resource_type) + ) + resource_type = "SCORE" + workSpec.nCore, workSpec.minRamCount = self.rt_mapper.calculate_worker_requirements(resource_type, queue_dict) # parameters that are independent on traditional vs unified - workSpec.maxWalltime = queue_dict.get('maxtime', 1) - workSpec.maxDiskCount = queue_dict.get('maxwdir', 1) - walltimeLimit_default = getattr(queue_config, 'walltimeLimit', 0) + workSpec.maxWalltime = queue_dict.get("maxtime", 1) + workSpec.maxDiskCount = queue_dict.get("maxwdir", 1) + walltimeLimit_default = getattr(queue_config, "walltimeLimit", 0) if len(jobspec_list) > 0: # get info from jobs @@ -146,27 +143,27 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): nCore += job_corecount minRamCount += job_memory try: - maxDiskCount += jobSpec.jobParams['maxDiskCount'] + maxDiskCount += jobSpec.jobParams["maxDiskCount"] except Exception: pass try: - maxWalltime += jobSpec.jobParams['maxWalltime'] + maxWalltime += jobSpec.jobParams["maxWalltime"] except Exception: pass try: - ioIntensity += jobSpec.jobParams['ioIntensity'] + ioIntensity += jobSpec.jobParams["ioIntensity"] except Exception: pass # fill in worker attributes - if (nCore > 0 and 'nCore' in self.jobAttributesToUse) or is_ucore: + if (nCore > 0 and "nCore" in self.jobAttributesToUse) or is_ucore: workSpec.nCore = nCore - if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) or is_ucore: + if (minRamCount > 0 and "minRamCount" in self.jobAttributesToUse) or is_ucore: workSpec.minRamCount = minRamCount - if maxDiskCount > 0 and ('maxDiskCount' in self.jobAttributesToUse or associated_params_dict.get('job_maxdiskcount') is True): + if maxDiskCount > 0 and ("maxDiskCount" in self.jobAttributesToUse or associated_params_dict.get("job_maxdiskcount") is True): workSpec.maxDiskCount = maxDiskCount - if maxWalltime > 0 and ('maxWalltime' in self.jobAttributesToUse or associated_params_dict.get('job_maxwalltime') is True): + if maxWalltime > 0 and ("maxWalltime" in self.jobAttributesToUse or associated_params_dict.get("job_maxwalltime") is True): workSpec.maxWalltime = maxWalltime - if ioIntensity > 0 and ('ioIntensity' in self.jobAttributesToUse or associated_params_dict.get('job_iointensity') is True): + if ioIntensity > 0 and ("ioIntensity" in self.jobAttributesToUse or associated_params_dict.get("job_iointensity") is True): workSpec.ioIntensity = ioIntensity workSpec.pilotType = jobspec_list[0].get_pilot_type() @@ -175,32 +172,35 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): else: # when no job # randomize pilot type with weighting - pdpm = getattr(queue_config, 'prodSourceLabelRandomWeightsPermille', {}) - choice_list = core_utils.make_choice_list(pdpm=pdpm, default='managed') + pdpm = getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) + choice_list = core_utils.make_choice_list(pdpm=pdpm, default="managed") tmp_prodsourcelabel = random.choice(choice_list) fake_job = JobSpec() fake_job.jobParams = {} - fake_job.jobParams['prodSourceLabel'] = tmp_prodsourcelabel + fake_job.jobParams["prodSourceLabel"] = 
tmp_prodsourcelabel workSpec.pilotType = fake_job.get_pilot_type() del fake_job - if workSpec.pilotType in ['RC', 'ALRB', 'PT']: - tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType)) + if workSpec.pilotType in ["RC", "ALRB", "PT"]: + tmpLog.info("a worker has pilotType={0}".format(workSpec.pilotType)) workSpec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) - tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})' - .format(workSpec.jobType, job_type, queue_dict.get('type', None), tmp_prodsourcelabel)) + tmpLog.debug( + "get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})".format( + workSpec.jobType, job_type, queue_dict.get("type", None), tmp_prodsourcelabel + ) + ) # retrieve queue resource type - capability = queue_dict.get('capability', '') + capability = queue_dict.get("capability", "") queue_rtype = self.capability_to_rtype(capability) - if resource_type and resource_type != 'ANY': + if resource_type and resource_type != "ANY": workSpec.resourceType = resource_type elif queue_rtype: workSpec.resourceType = queue_rtype elif workSpec.nCore == 1: - workSpec.resourceType = 'SCORE' + workSpec.resourceType = "SCORE" else: - workSpec.resourceType = 'MCORE' + workSpec.resourceType = "MCORE" return workSpec diff --git a/pandaharvester/harvesterzipper/base_zipper.py b/pandaharvester/harvesterzipper/base_zipper.py index 11a4f95e..37cccf1a 100644 --- a/pandaharvester/harvesterzipper/base_zipper.py +++ b/pandaharvester/harvesterzipper/base_zipper.py @@ -27,7 +27,7 @@ def __init__(self, **kwarg): # zip output files def simple_zip_output(self, jobspec, tmp_log): - tmp_log.debug('start') + tmp_log.debug("start") self.zip_tmp_log = tmp_log self.zip_jobSpec = jobspec argDictList = [] @@ -44,17 +44,17 @@ def simple_zip_output(self, jobspec, tmp_log): zipDir = self.zipDir zipPath = os.path.join(zipDir, fileSpec.lfn) argDict = dict() - argDict['zipPath'] = zipPath - argDict['associatedFiles'] = [] + argDict["zipPath"] = zipPath + argDict["associatedFiles"] = [] for assFileSpec in fileSpec.associatedFiles: if os.path.exists(assFileSpec.path): - argDict['associatedFiles'].append(assFileSpec.path) + argDict["associatedFiles"].append(assFileSpec.path) else: - assFileSpec.status = 'failed' + assFileSpec.status = "failed" argDictList.append(argDict) # parallel execution try: - if hasattr(harvester_config, 'zipper'): + if hasattr(harvester_config, "zipper"): nThreadsForZip = harvester_config.zipper.nThreadsForZip else: nThreadsForZip = harvester_config.stager.nThreadsForZip @@ -67,52 +67,49 @@ def simple_zip_output(self, jobspec, tmp_log): tmpRet, errMsg, fileInfo = retVal if tmpRet is True: # set path - fileSpec.path = fileInfo['path'] - fileSpec.fsize = fileInfo['fsize'] - fileSpec.chksum = fileInfo['chksum'] - msgStr = 'fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}' \ - .format(fileSpec.path, fileSpec.fsize, fileSpec.chksum) + fileSpec.path = fileInfo["path"] + fileSpec.fsize = fileInfo["fsize"] + fileSpec.chksum = fileInfo["chksum"] + msgStr = "fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}".format( + fileSpec.path, fileSpec.fsize, fileSpec.chksum + ) tmp_log.debug(msgStr) else: - tmp_log.error('got {0} with {1} when zipping {2}'.format(tmpRet, errMsg, fileSpec.lfn)) - return tmpRet, 'failed to zip with {0}'.format(errMsg) + tmp_log.error("got {0} with {1} when zipping {2}".format(tmpRet, 
errMsg, fileSpec.lfn)) + return tmpRet, "failed to zip with {0}".format(errMsg) except Exception: errMsg = core_utils.dump_error_message(tmp_log) - return False, 'failed to zip with {0}'.format(errMsg) - tmp_log.debug('done') - return True, '' + return False, "failed to zip with {0}".format(errMsg) + tmp_log.debug("done") + return True, "" # make one zip file def make_one_zip(self, arg_dict): try: - zipPath = arg_dict['zipPath'] + zipPath = arg_dict["zipPath"] lfn = os.path.basename(zipPath) - self.zip_tmp_log.debug('{0} start zipPath={1} with {2} files'.format(lfn, zipPath, - len(arg_dict['associatedFiles']))) + self.zip_tmp_log.debug("{0} start zipPath={1} with {2} files".format(lfn, zipPath, len(arg_dict["associatedFiles"]))) # make zip if doesn't exist if not os.path.exists(zipPath): # tmp file names - tmpZipPath = zipPath + '.' + str(uuid.uuid4()) - tmpZipPathIn = tmpZipPath + '.in' + tmpZipPath = zipPath + "." + str(uuid.uuid4()) + tmpZipPathIn = tmpZipPath + ".in" with open(tmpZipPathIn, "w") as f: - for associatedFile in arg_dict['associatedFiles']: + for associatedFile in arg_dict["associatedFiles"]: f.write("{0}\n".format(associatedFile)) # make command - com = 'tar -c -f {0} -T {1} '.format(tmpZipPath, tmpZipPathIn) + com = "tar -c -f {0} -T {1} ".format(tmpZipPath, tmpZipPathIn) com += "--transform 's/.*\///' " # execute - p = subprocess.Popen(com, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen(com, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p.communicate() retCode = p.returncode if retCode != 0: - msgStr = 'failed to make zip for {0} with {1}:{2}'.format(lfn, stdOut, stdErr) + msgStr = "failed to make zip for {0} with {1}:{2}".format(lfn, stdOut, stdErr) self.zip_tmp_log.error(msgStr) return None, msgStr, {} # avoid overwriting - lockName = 'zip.lock.{0}'.format(lfn) + lockName = "zip.lock.{0}".format(lfn) lockInterval = 60 tmpStat = False # get lock @@ -123,7 +120,7 @@ def make_one_zip(self, arg_dict): time.sleep(1) # failed to lock if not tmpStat: - msgStr = 'failed to lock for {0}'.format(lfn) + msgStr = "failed to lock for {0}".format(lfn) self.zip_tmp_log.error(msgStr) return None, msgStr if not os.path.exists(zipPath): @@ -132,127 +129,114 @@ def make_one_zip(self, arg_dict): self.dbInterface.release_object_lock(lockName) # make return fileInfo = dict() - fileInfo['path'] = zipPath + fileInfo["path"] = zipPath # get size statInfo = os.stat(zipPath) - fileInfo['fsize'] = statInfo.st_size - fileInfo['chksum'] = core_utils.calc_adler32(zipPath) + fileInfo["fsize"] = statInfo.st_size + fileInfo["chksum"] = core_utils.calc_adler32(zipPath) except Exception: errMsg = core_utils.dump_error_message(self.zip_tmp_log) - return False, 'failed to zip with {0}'.format(errMsg) - self.zip_tmp_log.debug('{0} done'.format(lfn)) - return True, '', fileInfo + return False, "failed to zip with {0}".format(errMsg) + self.zip_tmp_log.debug("{0} done".format(lfn)) + return True, "", fileInfo # zip output files; file operations are done on remote side with ssh def ssh_zip_output(self, jobspec, tmp_log): - tmp_log.debug('start') + tmp_log.debug("start") self.zip_tmp_log = tmp_log self.zip_jobSpec = jobspec argDictList = [] outFiles_list = list(jobspec.outFiles) try: try: - if hasattr(harvester_config, 'zipper'): + if hasattr(harvester_config, "zipper"): nThreadsForZip = harvester_config.zipper.nThreadsForZip else: nThreadsForZip = harvester_config.stager.nThreadsForZip except Exception: nThreadsForZip = 
multiprocessing.cpu_count() # check associate file existence + def _check_assfile_existence(fileSpec): - in_data = '\\n'.join(['{0}'.format(assFileSpec.path) for assFileSpec in fileSpec.associatedFiles]) - com1 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - fileop_script=self.fileop_script, - suffix='_check-exist.tmp', - dir=os.path.dirname(next(iter(fileSpec.associatedFiles)).path), - data=in_data, - ) + in_data = "\\n".join(["{0}".format(assFileSpec.path) for assFileSpec in fileSpec.associatedFiles]) + com1 = ( + "ssh " + "-o StrictHostKeyChecking=no " + "-i {sshkey} " + "{userhost} " + '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "' + ).format( + sshkey=self.sshkey, + userhost=self.userhost, + fileop_script=self.fileop_script, + suffix="_check-exist.tmp", + dir=os.path.dirname(next(iter(fileSpec.associatedFiles)).path), + data=in_data, + ) # execute - p1 = subprocess.Popen(com1, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p1 = subprocess.Popen(com1, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p1.communicate() retCode = p1.returncode if retCode != 0: - msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr) + msgStr = "failed to make tmpargfile remotely with {0}:{1}".format(stdOut, stdErr) tmp_log.error(msgStr) - return False, 'failed to zip with {0}'.format(msgStr) + return False, "failed to zip with {0}".format(msgStr) stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() - tmpargfile_name = stdOut_str.strip('\n') + tmpargfile_name = stdOut_str.strip("\n") del p1, stdOut, stdErr # record set existence_set = set() # make command - com2 = ( 'ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"cat {arg_file} | xargs -I%% sh -c \' test -f %% && echo T || echo F \' " ' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - arg_file=tmpargfile_name, - ) + com2 = ( + "ssh " + "-o StrictHostKeyChecking=no " + "-i {sshkey} " + "{userhost} " + "\"cat {arg_file} | xargs -I%% sh -c ' test -f %% && echo T || echo F ' \" " + ).format( + sshkey=self.sshkey, + userhost=self.userhost, + arg_file=tmpargfile_name, + ) # execute - p2 = subprocess.Popen(com2, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p2 = subprocess.Popen(com2, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p2.communicate() retCode = p2.returncode if retCode != 0: - msgStr = 'failed to existence of associate files with {0}:{1}'.format(stdOut, stdErr) + msgStr = "failed to existence of associate files with {0}:{1}".format(stdOut, stdErr) tmp_log.error(msgStr) else: try: stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() - ret_list = stdOut_str.strip('\n').split('\n') + ret_list = stdOut_str.strip("\n").split("\n") if len(fileSpec.associatedFiles) == len(ret_list): - for (assFileSpec, retVal) in zip(fileSpec.associatedFiles, ret_list): - if retVal == 'T': + for assFileSpec, retVal in zip(fileSpec.associatedFiles, ret_list): + if retVal == "T": existence_set.add(assFileSpec.path) else: - msgStr = 'returned number of files inconsistent! Skipped...' + msgStr = "returned number of files inconsistent! Skipped..." 
tmp_log.error(msgStr) except Exception: core_utils.dump_error_message(tmp_log) del p2, stdOut, stdErr, com2 # delete tmpargfile - com3 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"{fileop_script} remove_file {file_path} "' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - fileop_script=self.fileop_script, - file_path=tmpargfile_name, - ) + com3 = ("ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"{fileop_script} remove_file {file_path} "').format( + sshkey=self.sshkey, + userhost=self.userhost, + fileop_script=self.fileop_script, + file_path=tmpargfile_name, + ) # execute - p3 = subprocess.Popen(com3, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p3 = subprocess.Popen(com3, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p3.communicate() retCode = p3.returncode if retCode != 0: - msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr) + msgStr = "failed to delete tmpargfile remotely with {0}:{1}".format(stdOut, stdErr) tmp_log.error(msgStr) del p3, stdOut, stdErr gc.collect() return existence_set + # parallel execution of check existence with Pool(max_workers=nThreadsForZip) as pool: existence_set_list = pool.map(_check_assfile_existence, outFiles_list) @@ -269,14 +253,14 @@ def _check_assfile_existence(fileSpec): zipDir = self.zipDir zipPath = os.path.join(zipDir, fileSpec.lfn) argDict = dict() - argDict['zipPath'] = zipPath - argDict['associatedFiles'] = [] + argDict["zipPath"] = zipPath + argDict["associatedFiles"] = [] # check existence of files for assFileSpec in fileSpec.associatedFiles: if assFileSpec.path in existence_set: - argDict['associatedFiles'].append(assFileSpec.path) + argDict["associatedFiles"].append(assFileSpec.path) else: - assFileSpec.status = 'failed' + assFileSpec.status = "failed" # append argDictList.append(argDict) # parallel execution of zip @@ -287,110 +271,91 @@ def _check_assfile_existence(fileSpec): tmpRet, errMsg, fileInfo = retVal if tmpRet is True: # set path - fileSpec.path = fileInfo['path'] - fileSpec.fsize = fileInfo['fsize'] - fileSpec.chksum = fileInfo['chksum'] - msgStr = 'fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}' \ - .format(fileSpec.path, fileSpec.fsize, fileSpec.chksum) + fileSpec.path = fileInfo["path"] + fileSpec.fsize = fileInfo["fsize"] + fileSpec.chksum = fileInfo["chksum"] + msgStr = "fileSpec.path - {0}, fileSpec.fsize - {1}, fileSpec.chksum(adler32) - {2}".format( + fileSpec.path, fileSpec.fsize, fileSpec.chksum + ) tmp_log.debug(msgStr) else: - tmp_log.error('got {0} with {1} when zipping {2}'.format(tmpRet, errMsg, fileSpec.lfn)) - return tmpRet, 'failed to zip with {0}'.format(errMsg) + tmp_log.error("got {0} with {1} when zipping {2}".format(tmpRet, errMsg, fileSpec.lfn)) + return tmpRet, "failed to zip with {0}".format(errMsg) except Exception: errMsg = core_utils.dump_error_message(tmp_log) - return False, 'failed to zip with {0}'.format(errMsg) - tmp_log.debug('done') - return True, '' + return False, "failed to zip with {0}".format(errMsg) + tmp_log.debug("done") + return True, "" # make one zip file; file operations are done on remote side with ssh def ssh_make_one_zip(self, arg_dict): try: - zipPath = arg_dict['zipPath'] + zipPath = arg_dict["zipPath"] lfn = os.path.basename(zipPath) - self.zip_tmp_log.debug('{0} start zipPath={1} with {2} files'.format(lfn, zipPath, - len(arg_dict['associatedFiles']))) - in_data = 
'\\n'.join(['{0}'.format(path) for path in arg_dict['associatedFiles']]) - com0 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - fileop_script=self.fileop_script, - suffix='_tar-name.tmp', - dir=os.path.dirname(zipPath), - data=in_data, - ) + self.zip_tmp_log.debug("{0} start zipPath={1} with {2} files".format(lfn, zipPath, len(arg_dict["associatedFiles"]))) + in_data = "\\n".join(["{0}".format(path) for path in arg_dict["associatedFiles"]]) + com0 = ( + "ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"{fileop_script} write_tmpfile --suffix {suffix} --dir {dir} \\"{data}\\" "' + ).format( + sshkey=self.sshkey, + userhost=self.userhost, + fileop_script=self.fileop_script, + suffix="_tar-name.tmp", + dir=os.path.dirname(zipPath), + data=in_data, + ) # execute - p0 = subprocess.Popen(com0, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p0 = subprocess.Popen(com0, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p0.communicate() retCode = p0.returncode if retCode != 0: - msgStr = 'failed to make tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr) + msgStr = "failed to make tmpargfile remotely with {0}:{1}".format(stdOut, stdErr) tmp_log.error(msgStr) - return False, 'failed to zip with {0}'.format(msgStr) + return False, "failed to zip with {0}".format(msgStr) stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() - tmpargfile_name = stdOut_str.strip('\n') + tmpargfile_name = stdOut_str.strip("\n") del p0, stdOut, stdErr # tmp zip file names - tmpZipPath = zipPath + '.' + str(uuid.uuid4()) - com1 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"test -f {tmpZipPath} || tar -cf {tmpZipPath} -T {arg_file} --transform \'s;.*/;;\' "' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - tmpZipPath=tmpZipPath, - arg_file=tmpargfile_name, - ) + tmpZipPath = zipPath + "." 
+ str(uuid.uuid4()) + com1 = ( + "ssh " + "-o StrictHostKeyChecking=no " + "-i {sshkey} " + "{userhost} " + "\"test -f {tmpZipPath} || tar -cf {tmpZipPath} -T {arg_file} --transform 's;.*/;;' \"" + ).format( + sshkey=self.sshkey, + userhost=self.userhost, + tmpZipPath=tmpZipPath, + arg_file=tmpargfile_name, + ) # execute - p1 = subprocess.Popen(com1, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p1 = subprocess.Popen(com1, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p1.communicate() retCode = p1.returncode if retCode != 0: - msgStr = 'failed to make zip for {0} with {1}:{2}'.format(lfn, stdOut, stdErr) + msgStr = "failed to make zip for {0} with {1}:{2}".format(lfn, stdOut, stdErr) self.zip_tmp_log.error(msgStr) return None, msgStr, {} del p1, stdOut, stdErr # delete tmpargfile - com1a = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"{fileop_script} remove_file {file_path} "' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - fileop_script=self.fileop_script, - file_path=tmpargfile_name, - ) + com1a = ("ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"{fileop_script} remove_file {file_path} "').format( + sshkey=self.sshkey, + userhost=self.userhost, + fileop_script=self.fileop_script, + file_path=tmpargfile_name, + ) # execute - p1a = subprocess.Popen(com1a, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p1a = subprocess.Popen(com1a, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p1a.communicate() retCode = p1a.returncode if retCode != 0: - msgStr = 'failed to delete tmpargfile remotely with {0}:{1}'.format(stdOut, stdErr) + msgStr = "failed to delete tmpargfile remotely with {0}:{1}".format(stdOut, stdErr) tmp_log.error(msgStr) del p1a, stdOut, stdErr gc.collect() # avoid overwriting - lockName = 'zip.lock.{0}'.format(lfn) + lockName = "zip.lock.{0}".format(lfn) lockInterval = 60 tmpStat = False # get lock @@ -401,26 +366,17 @@ def ssh_make_one_zip(self, arg_dict): time.sleep(1) # failed to lock if not tmpStat: - msgStr = 'failed to lock for {0}'.format(lfn) + msgStr = "failed to lock for {0}".format(lfn) self.zip_tmp_log.error(msgStr) return None, msgStr, {} # rename to be zipPath - com2 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"test -f {zipPath} || mv {tmpZipPath} {zipPath}"' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - zipPath=zipPath, - tmpZipPath=tmpZipPath, - ) - p2 = subprocess.Popen(com2, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + com2 = ("ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"test -f {zipPath} || mv {tmpZipPath} {zipPath}"').format( + sshkey=self.sshkey, + userhost=self.userhost, + zipPath=zipPath, + tmpZipPath=tmpZipPath, + ) + p2 = subprocess.Popen(com2, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p2.communicate() del p2 gc.collect() @@ -428,66 +384,48 @@ def ssh_make_one_zip(self, arg_dict): self.dbInterface.release_object_lock(lockName) # make return fileInfo = dict() - fileInfo['path'] = zipPath + fileInfo["path"] = zipPath # get size - com3 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"stat -c %s {zipPath}"' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - zipPath=zipPath, - ) - p3 = subprocess.Popen(com3, - shell=True, - close_fds=True, - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + com3 = ("ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"stat -c %s {zipPath}"').format( + sshkey=self.sshkey, + userhost=self.userhost, + zipPath=zipPath, + ) + p3 = subprocess.Popen(com3, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p3.communicate() retCode = p3.returncode if retCode != 0: - msgStr = 'failed to get file size of {0} with {1}:{2}'.format(zipPath, stdOut, stdErr) + msgStr = "failed to get file size of {0} with {1}:{2}".format(zipPath, stdOut, stdErr) self.zip_tmp_log.error(msgStr) return None, msgStr, {} else: stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() - file_size = int(stdOut_str.strip('\n')) - fileInfo['fsize'] = file_size + file_size = int(stdOut_str.strip("\n")) + fileInfo["fsize"] = file_size del p3, stdOut, stdErr gc.collect() # get checksum - com4 = ('ssh ' - '-o StrictHostKeyChecking=no ' - '-i {sshkey} ' - '{userhost} ' - '"{fileop_script} adler32 {zipPath}"' - ).format( - sshkey=self.sshkey, - userhost=self.userhost, - fileop_script=self.fileop_script, - zipPath=zipPath, - ) - p4 = subprocess.Popen(com4, - shell=True, - close_fds=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + com4 = ("ssh " "-o StrictHostKeyChecking=no " "-i {sshkey} " "{userhost} " '"{fileop_script} adler32 {zipPath}"').format( + sshkey=self.sshkey, + userhost=self.userhost, + fileop_script=self.fileop_script, + zipPath=zipPath, + ) + p4 = subprocess.Popen(com4, shell=True, close_fds=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdOut, stdErr = p4.communicate() retCode = p4.returncode if retCode != 0: - msgStr = 'failed to get file adler32 of {0} with {1}:{2}'.format(zipPath, stdOut, stdErr) + msgStr = "failed to get file adler32 of {0} with {1}:{2}".format(zipPath, stdOut, stdErr) self.zip_tmp_log.error(msgStr) return None, msgStr, {} else: stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() - file_chksum = stdOut_str.strip('\n') - fileInfo['chksum'] = file_chksum + file_chksum = stdOut_str.strip("\n") + fileInfo["chksum"] = file_chksum del p4, stdOut, stdErr gc.collect() except Exception: errMsg = core_utils.dump_error_message(self.zip_tmp_log) - return False, 'failed to zip with {0}'.format(errMsg) - self.zip_tmp_log.debug('{0} done'.format(lfn)) - return True, '', fileInfo + return False, "failed to zip with {0}".format(errMsg) + self.zip_tmp_log.debug("{0} done".format(lfn)) + return True, "", fileInfo diff --git a/pandaharvester/harvesterzipper/dummy_zipper.py b/pandaharvester/harvesterzipper/dummy_zipper.py index 6a04af03..43e5a130 100644 --- a/pandaharvester/harvesterzipper/dummy_zipper.py +++ b/pandaharvester/harvesterzipper/dummy_zipper.py @@ -4,7 +4,7 @@ from .base_zipper import BaseZipper # logger -_logger = core_utils.setup_logger('dummy_zipper') +_logger = core_utils.setup_logger("dummy_zipper") # dummy plugin for zipper @@ -27,8 +27,7 @@ def zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) # asynchronous zip output @@ -46,19 +45,15 @@ def async_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - 
method_name='async_zip_output') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="async_zip_output") + tmpLog.debug("start") # set some ID which can be used for lookup in post_zip_output() groupID = str(uuid.uuid4()) lfns = [] for fileSpec in jobspec.outFiles: lfns.append(fileSpec.lfn) - jobspec.set_groups_to_files({groupID: {'lfns': lfns, - 'groupStatus': 'zipping'} - } - ) - return True, '' + jobspec.set_groups_to_files({groupID: {"lfns": lfns, "groupStatus": "zipping"}}) + return True, "" # post zipping def post_zip_output(self, jobspec): @@ -72,16 +67,15 @@ def post_zip_output(self, jobspec): :rtype: (bool, string) """ # make logger - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='post_zip_output') - tmpLog.debug('start') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="post_zip_output") + tmpLog.debug("start") # get groups for lookup groups = jobspec.get_groups_of_output_files() # do something with groupIDs pass # update file attributes for fileSpec in jobspec.outFiles: - fileSpec.path = '/path/to/zip' + fileSpec.path = "/path/to/zip" fileSpec.fsize = 12345 - fileSpec.chksum = '66bb0985' - return True, '' \ No newline at end of file + fileSpec.chksum = "66bb0985" + return True, "" diff --git a/pandaharvester/harvesterzipper/simple_zipper.py b/pandaharvester/harvesterzipper/simple_zipper.py index 8d037f05..56ac37ef 100644 --- a/pandaharvester/harvesterzipper/simple_zipper.py +++ b/pandaharvester/harvesterzipper/simple_zipper.py @@ -2,7 +2,7 @@ from .base_zipper import BaseZipper # logger -_logger = core_utils.setup_logger('simple_zipper') +_logger = core_utils.setup_logger("simple_zipper") # simple plugin for zipper @@ -13,17 +13,15 @@ def __init__(self, **kwarg): # zip output files def zip_output(self, jobspec): - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.simple_zip_output(jobspec, tmpLog) # asynchronous zip output def async_zip_output(self, jobspec): - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # not really asynchronous as two staged zipping is not implemented in this plugin return self.simple_zip_output(jobspec, tmpLog) # post zipping def post_zip_output(self, jobspec): - return True, '' \ No newline at end of file + return True, "" diff --git a/pandaharvester/harvesterzipper/ssh_zipper.py b/pandaharvester/harvesterzipper/ssh_zipper.py index 854d1f5d..43c10492 100644 --- a/pandaharvester/harvesterzipper/ssh_zipper.py +++ b/pandaharvester/harvesterzipper/ssh_zipper.py @@ -2,7 +2,7 @@ from .base_zipper import BaseZipper # logger -_logger = core_utils.setup_logger('ssh_zipper') +_logger = core_utils.setup_logger("ssh_zipper") # ssh plugin for zipper @@ -13,17 +13,15 @@ def __init__(self, **kwarg): # zip output files def zip_output(self, jobspec): - tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") return self.ssh_zip_output(jobspec, tmpLog) # asynchronous zip output def async_zip_output(self, jobspec): - tmpLog = self.make_logger(_logger, 
'PandaID={0}'.format(jobspec.PandaID), - method_name='zip_output') + tmpLog = self.make_logger(_logger, "PandaID={0}".format(jobspec.PandaID), method_name="zip_output") # not really asynchronous as two staged zipping is not implemented in this plugin return self.ssh_zip_output(jobspec, tmpLog) # post zipping def post_zip_output(self, jobspec): - return True, '' + return True, "" diff --git a/setup.py b/setup.py index 9783e337..f89a4981 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ from setuptools import setup, find_packages from pandaharvester import panda_pkg_info -sys.path.insert(0, '.') +sys.path.insert(0, ".") # get release version release_version = panda_pkg_info.release_version @@ -15,64 +15,75 @@ setup( name="pandaharvester", version=release_version, - description='Harvester Package', - long_description='''This package contains Harvester components''', - license='GPL', - author='Panda Team', - author_email='atlas-adc-panda@cern.ch', - url='https://github.com/PanDAWMS/panda-harvester/wiki', - python_requires='>=2.7', + description="Harvester Package", + long_description="""This package contains Harvester components""", + license="GPL", + author="Panda Team", + author_email="atlas-adc-panda@cern.ch", + url="https://github.com/PanDAWMS/panda-harvester/wiki", + python_requires=">=2.7", packages=find_packages(), - install_requires=['requests', - 'python-daemon', - 'future', - 'futures; python_version == "2.*"', - 'pycryptodomex', - 'panda-common', - 'pyjwt', - 'subprocess32; python_version == "2.*"', - 'rpyc', - 'paramiko', - 'pexpect', - 'psutil >= 5.4.8', - 'scandir; python_version < "3.5"', - 'panda-pilot >= 2.7.2.1', - 'six', - ], - + install_requires=[ + "requests", + "python-daemon", + "future", + 'futures; python_version == "2.*"', + "pycryptodomex", + "panda-common", + "pyjwt", + 'subprocess32; python_version == "2.*"', + "rpyc", + "paramiko", + "pexpect", + "psutil >= 5.4.8", + 'scandir; python_version < "3.5"', + "panda-pilot >= 2.7.2.1", + "six", + ], # optional pip dependencies extras_require={ - 'kubernetes': ['kubernetes', 'pyyaml'], - 'mysql': ['mysqlclient'], - 'atlasgrid': ['uWSGI >= 2.0.20', 'htcondor >= 9.2.0', 'mysqlclient >= 2.0.3'], + "kubernetes": ["kubernetes", "pyyaml"], + "mysql": ["mysqlclient"], + "atlasgrid": ["uWSGI >= 2.0.20", "htcondor >= 9.2.0", "mysqlclient >= 2.0.3"], }, - data_files=[ # config and cron files - ('etc/panda', ['templates/panda_harvester.cfg.rpmnew.template', - 'templates/logrotate.d/panda_harvester', - 'templates/panda_harvester-httpd.conf.rpmnew.template', - 'templates/panda_supervisord.cfg.rpmnew.template', - 'templates/panda_harvester-uwsgi.ini.rpmnew.template', - ] - ), + ( + "etc/panda", + [ + "templates/panda_harvester.cfg.rpmnew.template", + "templates/logrotate.d/panda_harvester", + "templates/panda_harvester-httpd.conf.rpmnew.template", + "templates/panda_supervisord.cfg.rpmnew.template", + "templates/panda_harvester-uwsgi.ini.rpmnew.template", + ], + ), # sysconfig - ('etc/sysconfig', ['templates/sysconfig/panda_harvester.rpmnew.template', - ] - ), + ( + "etc/sysconfig", + [ + "templates/sysconfig/panda_harvester.rpmnew.template", + ], + ), # init script - ('etc/rc.d/init.d', ['templates/init.d/panda_harvester.rpmnew.template', - 'templates/init.d/panda_harvester-apachectl.rpmnew.template', - 'templates/init.d/panda_harvester-uwsgi.rpmnew.template', - ] - ), + ( + "etc/rc.d/init.d", + [ + "templates/init.d/panda_harvester.rpmnew.template", + "templates/init.d/panda_harvester-apachectl.rpmnew.template", + 
"templates/init.d/panda_harvester-uwsgi.rpmnew.template", + ], + ), # admin tool - ('local/bin', ['templates/harvester-admin.rpmnew.template', - ] - ), - ], - - scripts=['templates/panda_jedi-renice', - 'templates/panda_harvester-sqlite3backup', - ] - ) + ( + "local/bin", + [ + "templates/harvester-admin.rpmnew.template", + ], + ), + ], + scripts=[ + "templates/panda_jedi-renice", + "templates/panda_harvester-sqlite3backup", + ], +) From fd9b5130e51b7fa1abd4fe398a9262a802d7a0ca Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 29 Sep 2023 14:58:53 +0200 Subject: [PATCH 5/7] merge git hook pre-commit --- git_hooks/commit_timestamp | 9 --------- git_hooks/pre-commit | 26 ++++++++++++++++++++++++++ pandaharvester/commit_timestamp.py | 2 +- 3 files changed, 27 insertions(+), 10 deletions(-) delete mode 100755 git_hooks/commit_timestamp create mode 100755 git_hooks/pre-commit diff --git a/git_hooks/commit_timestamp b/git_hooks/commit_timestamp deleted file mode 100755 index 5aa37044..00000000 --- a/git_hooks/commit_timestamp +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -DATE=`date -u '+%d-%m-%Y %H:%M:%S'` -NAME=`git config --global user.name` -BRANCH=`git rev-parse --abbrev-ref HEAD` -FILE=pandaharvester/commit_timestamp.py -echo timestamp = \"$DATE on $BRANCH \(by $NAME\)\" > $FILE -git add $FILE -exit 0 \ No newline at end of file diff --git a/git_hooks/pre-commit b/git_hooks/pre-commit new file mode 100755 index 00000000..56f3b01a --- /dev/null +++ b/git_hooks/pre-commit @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# start templated +INSTALL_PYTHON=/usr/bin/python3 +ARGS=(hook-impl --config=.pre-commit-config.yaml --hook-type=pre-commit) +# end templated + +HERE="$(cd "$(dirname "$0")" && pwd)" +ARGS+=(--hook-dir "$HERE" -- "$@") + +if [ -x "$INSTALL_PYTHON" ]; then + exec "$INSTALL_PYTHON" -mpre_commit "${ARGS[@]}" +elif command -v pre-commit > /dev/null; then + exec pre-commit "${ARGS[@]}" +else + echo '`pre-commit` not found. Skipped...' 
1>&2 +fi + + +# commit timestamp +DATE=`date -u '+%d-%m-%Y %H:%M:%S'` +NAME=`git config --global user.name` +BRANCH=`git rev-parse --abbrev-ref HEAD` +FILE=pandaharvester/commit_timestamp.py +echo timestamp = \"$DATE on $BRANCH \(by $NAME\)\" > $FILE +git add $FILE \ No newline at end of file diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 87a1c3b8..eb87e0a3 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "18-07-2023 14:53:35 on flin (by mightqxc)" +timestamp = "29-09-2023 12:55:52 on flin (by mightqxc)" From 7c22c9129c19e9dcf20c4aa7755a01e140ed2670 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 29 Sep 2023 14:59:57 +0200 Subject: [PATCH 6/7] v0.3.0 --- pandaharvester/panda_pkg_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index c51de986..3d521944 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.2.35" +release_version = "0.3.0" From 3257f4b5e2572cf680c3fb55a805a6719c922c5e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 29 Sep 2023 15:31:07 +0200 Subject: [PATCH 7/7] multiple git pre-commit hooks --- git_hooks/pre-commit | 36 +++++++-------------- git_hooks/pre-commit.d/a10-run_pre-commit | 17 ++++++++++ git_hooks/pre-commit.d/a99-commit_timestamp | 10 ++++++ pandaharvester/commit_timestamp.py | 2 +- 4 files changed, 40 insertions(+), 25 deletions(-) create mode 100644 git_hooks/pre-commit.d/a10-run_pre-commit create mode 100644 git_hooks/pre-commit.d/a99-commit_timestamp diff --git a/git_hooks/pre-commit b/git_hooks/pre-commit index 56f3b01a..5b812d0c 100755 --- a/git_hooks/pre-commit +++ b/git_hooks/pre-commit @@ -1,26 +1,14 @@ #!/usr/bin/env bash -# start templated -INSTALL_PYTHON=/usr/bin/python3 -ARGS=(hook-impl --config=.pre-commit-config.yaml --hook-type=pre-commit) -# end templated - -HERE="$(cd "$(dirname "$0")" && pwd)" -ARGS+=(--hook-dir "$HERE" -- "$@") - -if [ -x "$INSTALL_PYTHON" ]; then - exec "$INSTALL_PYTHON" -mpre_commit "${ARGS[@]}" -elif command -v pre-commit > /dev/null; then - exec pre-commit "${ARGS[@]}" -else - echo '`pre-commit` not found. Skipped...' 1>&2 -fi - - -# commit timestamp -DATE=`date -u '+%d-%m-%Y %H:%M:%S'` -NAME=`git config --global user.name` -BRANCH=`git rev-parse --abbrev-ref HEAD` -FILE=pandaharvester/commit_timestamp.py -echo timestamp = \"$DATE on $BRANCH \(by $NAME\)\" > $FILE -git add $FILE \ No newline at end of file +basedir="$(dirname $0)/pre-commit.d" + +for hook in $(ls -1 $basedir); do + bash $basedir/$hook + RESULT=$? + if [ $RESULT != 0 ]; then + echo "$hook returned non-zero: $RESULT, abort commit" + exit $RESULT + fi +done + +exit 0 diff --git a/git_hooks/pre-commit.d/a10-run_pre-commit b/git_hooks/pre-commit.d/a10-run_pre-commit new file mode 100644 index 00000000..8fa56265 --- /dev/null +++ b/git_hooks/pre-commit.d/a10-run_pre-commit @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# start templated +INSTALL_PYTHON=/usr/bin/python3 +ARGS=(hook-impl --config=.pre-commit-config.yaml --hook-type=pre-commit) +# end templated + +HERE="$(cd "$(dirname "$0")" && pwd)" +ARGS+=(--hook-dir "$HERE" -- "$@") + +if [ -x "$INSTALL_PYTHON" ]; then + exec "$INSTALL_PYTHON" -mpre_commit "${ARGS[@]}" +elif command -v pre-commit > /dev/null; then + exec pre-commit "${ARGS[@]}" +else + echo '`pre-commit` not found. Skipped...' 
1>&2 +fi diff --git a/git_hooks/pre-commit.d/a99-commit_timestamp b/git_hooks/pre-commit.d/a99-commit_timestamp new file mode 100644 index 00000000..847d0847 --- /dev/null +++ b/git_hooks/pre-commit.d/a99-commit_timestamp @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +DATE=`date -u '+%d-%m-%Y %H:%M:%S'` +NAME=`git config --global user.name` +BRANCH=`git rev-parse --abbrev-ref HEAD` +FILE=pandaharvester/commit_timestamp.py +echo timestamp = \"$DATE on $BRANCH \(by $NAME\)\" > $FILE +git add $FILE + +exit 0 diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index eb87e0a3..db21c9bb 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "29-09-2023 12:55:52 on flin (by mightqxc)" +timestamp = "29-09-2023 13:31:07 on flin (by mightqxc)"
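
For reference: with PATCH 7 applied, the repository carries a dispatching git_hooks/pre-commit that runs every script under git_hooks/pre-commit.d/ in order (a10-run_pre-commit for the pre-commit framework configured by .pre-commit-config.yaml, then a99-commit_timestamp to refresh pandaharvester/commit_timestamp.py). A minimal sketch of how a clone might activate these hooks; the paths come from the patches above, while the commands themselves are illustrative and assume git >= 2.9 (for core.hooksPath) and a pip-installable pre-commit:

# run from the repository root of a clone
pip install pre-commit                 # framework referenced by .pre-commit-config.yaml
git config core.hooksPath git_hooks    # git >= 2.9: git now runs git_hooks/pre-commit on commit
# older git: copy the dispatcher and its hook directory into .git/hooks instead
# cp -r git_hooks/pre-commit git_hooks/pre-commit.d .git/hooks/

As shown in a10-run_pre-commit above, the dispatcher first tries /usr/bin/python3 -mpre_commit and then falls back to a pre-commit executable found on PATH, so either installation route works; if neither is available it prints "pre-commit not found. Skipped..." and the remaining hooks still run.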