From a9b2059d76bb82b183403ac4f5198936d16e11af Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Fri, 25 Jun 2021 07:56:40 -0500 Subject: [PATCH 01/22] first draft of a controller-worker wrapper for heat --- heat/cw4heat/__init__.py | 256 ++++++++++++++++++++++++++++++++ heat/cw4heat/arrayapi.py | 288 ++++++++++++++++++++++++++++++++++++ heat/cw4heat/distributor.py | 175 ++++++++++++++++++++++ 3 files changed, 719 insertions(+) create mode 100644 heat/cw4heat/__init__.py create mode 100644 heat/cw4heat/arrayapi.py create mode 100644 heat/cw4heat/distributor.py diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py new file mode 100644 index 0000000000..28698df781 --- /dev/null +++ b/heat/cw4heat/__init__.py @@ -0,0 +1,256 @@ +# MIT License + +# Copyright (c) 2021 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +############################################################################### +# This provides a wrapper around SPMD-based HeAT +# (github.com/helmholtz-analytics/heat) to operate in controller-worker mode. + +# The goal is to provide a compliant implementation of the array API +# (github.com/data-apis/arra-api). + +# Returned array (DNDArray) objects are handles/futures only. Their content is +# available through __int__ etc., through __partitioned__ or heat(). Notice: this +# allows for delayed execution and optimizations of the workflow/task-graph and +# communication. + +# For a function/method of the array-API that is executed on the controller +# process, this wrapper generates the equivalent source code to be executed on +# the worker processes. The code is then sent to each remote worker and +# executed there. + +# It's up to the distribution layer (e.g. distributor) to make sure the code is +# executed in the right order on each process/worker so that collective +# communication in HeAT can operate correctly without dead-locks. + +# To allow workflow optimizations array dependences and to avoid +# pickle-dependencies to the array inputs we separate scalar/non-array arguments +# from array arguments. For this we assume that array arguments never occur +# after non-array arguments. Each function.task handles and passes array-typed +# and non-array-types arguments separately. +############################################################################### + +from . 
import distributor +from .arrayapi import ( + aa_attributes, + aa_tlfuncs, + aa_datatypes, + aa_constants, + aa_methods_s, + aa_methods_a, + aa_inplace_operators, + aa_reflected_operators, +) + +# just in case we find another SPMD/MPI implementation of numpy... +import heat as impl +from heat import DNDarray as dndarray +impl_str = "impl" +dndarray_str = "impl.DNDarray" + +def init(): + ''' + Initialize distribution engine. + For now we assume all ranks (controller and workers) are started through mpirun, + workers will never leave distributor.start() and so this function. + Call this as the very first thing in your program. For now it is recommended + to start your program with + + import heat.cw4heat as ht + ht.init() + + Also call fini() before exiting. + ''' + distributor.init() + distributor.start() + + +def fini(): + ''' + Finalize/shutdown distribution engine. + When called on controller, workers will sys.exit from init(). + ''' + distributor.fini() + + +class _Task: + 'A work item, executing functions provided as code.' + def __init__(self, func, args, kwargs, unwrap='*'): + self._func = func + self._args = args + self._kwargs = kwargs + self._unwrap = unwrap + + def run(self, deps): + if deps: + return eval(f"{self._func}({self._unwrap}deps, *self._args, **self._kwargs)") + else: + return eval(f"{self._func}(*self._args, **self._kwargs)") + + +class _PropertyTask: + 'A work item, executing class properties provided as code.' + def __init__(self, func): + self._func = func + + def run(self, deps): + return eval(f"deps[0].{self._func}") + + +def _submit(name, args, kwargs, unwrap='*'): + ''' + Create a _Task and submit, return PManager/Future. + ''' + scalar_args = tuple(x for x in args if not isinstance(x, DDParray)) + deps = [x._handle.getId() for x in args if isinstance(x, DDParray)] + return distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps) + + +def _submitProperty(name, self): + ''' + Create a _PropertyTask (property) and submit, return PManager/Future. + ''' + t = _PropertyTask(name) + try: + res = distributor.submitPP(t, [self._handle.getId()]) + except: + assert False + return res + + +# setitem has scalar arg key before array arg value +# we need to provide a function accepting the inverse order +def _setitem_normalized(self, value, key): + self.__setitem__(key, value) + + +####################################################################### +# Our array is just a wrapper. Actual array is stored as a handle to +# allow delayed execution. +####################################################################### +class DDParray: + ''' + Shallow wrapper class representing a distributed array. + It will be filled dynamically from lists extracted from the array-API. + All functionality is delegated to the underlying implementation, + executed in tasks. + ''' + + ####################################################################### + # first define methods/properties which need special care. + ####################################################################### + + def __init__(self, handle): + 'Do not use this array. Use creator functions instead.' + self._handle = handle + + def heat(self): + ''' + Return heat native array. + With delayed execution, triggers computation as needed and blocks until array is available. + ''' + return self._handle.get() + + def __getitem__(self, key): + 'Return item/slice as array.' 
+ return DDParray(_submit(f'{dndarray_str}.__getitem__', (self, key), {})) + + # bring args in the order we can process and feed into normal process + # using global normalized version + def __setitem__(self, key, value): + 'set item/slice to given value' + _submit(f'_setitem_normalized', (self, value, key), {}) + + @property + def T(self): + return DDParray(_submitProperty('T', self)) + + + ####################################################################### + # Now we add methods/properties through the standard process. + ####################################################################### + + # dynamically generate class methods from list of methods in array-API + # we simply make lambdas which submit appropriate Tasks + # FIXME: aa_inplace_operators,others? + fixme_afuncs = ['squeeze', 'astype', 'balance',] + for method in aa_methods_a + aa_reflected_operators + fixme_afuncs: + if method not in ['__getitem__', '__setitem__'] and hasattr(dndarray, method): + exec(f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))") + + for method in aa_methods_s: + if hasattr(dndarray, method): + exec(f"{method} = lambda self, *args, **kwargs: _submit('{dndarray_str}.{method}', (self, *args), kwargs).get()") + + for attr in aa_attributes: + if attr != 'T' and hasattr(dndarray, attr): + exec(f"{attr} = property(lambda self: self._handle.get().{attr})") + + def __getattr__(self, attr): + # attributes are special + if not attr in aa_attributes: + raise Exception(f"unknown method/attribute {attr} requested") + + +####################################################################### +# first define top-level functions which need special care. +####################################################################### + +# np.concatenate accepts a list of arrays (not individual arrays) +# so we let the task not unwrap the list of deps +def concatenate(*args, **kwargs): + return DDParray(_submit(f'{impl_str}.concatenate', *args, kwargs, unwrap='')) + + +####################################################################### +# first define top-level functions through the standard process. 
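+# The loops below generate these functions dynamically; each generated
+# function is roughly equivalent to this hand-written version
+# (illustration only, shown here for "ones"):
+#
+#   def ones(*args, **kwargs):
+#       return DDParray(_submit("impl.ones", args, kwargs))
+#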
+####################################################################### +# - creating arrays +# - elementswise operations +# - statistical operations +# (lists taken from list of methods in array-API) +# Again, we simply make lambdas which submit appropriate Tasks + +fixme_funcs = ['load_csv'] +for func in aa_tlfuncs + fixme_funcs: + exec(f"{func} = lambda *args, **kwargs: DDParray(_submit('{impl_str}.{func}', args, kwargs))") + + +def concatenate(*args, **kwargs): + return DDParray(_submit(f'{impl_str}.concatenate', *args, kwargs, unwrap='')) + + +# Here we data types and constants +for attr in aa_datatypes + aa_constants: + if hasattr(impl, attr): + exec(f"{attr} = {impl_str}.{attr}") + else: + print(f"{impl.__name__} has no {attr}") + + +####################################################################### +# quick hack to provide random features +####################################################################### +class random: + for method, obj in impl.random.__dict__.items(): + if callable(obj): + exec(f"{method} = staticmethod(lambda *args, **kwargs: DDParray(_submit('{impl_str}.random.{method}', args, kwargs)))") diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py new file mode 100644 index 0000000000..aaf18a3177 --- /dev/null +++ b/heat/cw4heat/arrayapi.py @@ -0,0 +1,288 @@ +__all__ = ['aa_creators', 'aa_attributes', 'aa_methods', 'aa_elementwises', 'aa_statisticals', + 'aa_inplace_operators', 'aa_reflected_operators', 'aa_datatypes', 'aa_datatype_functions', + 'aa_searching', 'aa_sorting', 'aa_set', 'aa_utility', 'aa_constants', + 'aa_arraydir', 'aa_tldir', 'aa_tlfuncs', 'aa_arrayfuncs', 'aa_methods_s', 'aa_methods_a'] + +aa_creators = [ + 'arange', #(start, /, stop=None, step=1, *, dtype=None, device=None) + 'asarray', #(obj, /, *, dtype=None, device=None, copy=None) + 'empty', #(shape, *, dtype=None, device=None) + 'empty_like', #(x, /, *, dtype=None, device=None) + 'eye', #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + 'from_dlpack', #(x, /) + 'full', #(shape, fill_value, *, dtype=None, device=None) + 'full_like', #(x, /, fill_value, *, dtype=None, device=None) + 'linspace', #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) + 'meshgrid', #(*arrays, indexing=’xy’) + 'ones', #(shape, *, dtype=None, device=None) + 'ones_like', #(x, /, *, dtype=None, device=None) + 'zeros', #(shape, *, dtype=None, device=None) + 'zeros_like', #(x, /, *, dtype=None, device=None) +] + +aa_attributes = [ + 'dtype', + 'device', + 'ndim', + 'shape', + 'size', + 'T', +] + +aa_inplace_operators = [ + '__iadd__', + '__isub__', + '__imul__', + '__itruediv__', + '__iflowdiv__', + '__ipow__', + '__imatmul__', + '__imod__', + '__iand__', + '__ior__', + '__ixor__', + '__ilshift__', + '__irshift__', +] + +aa_reflected_operators = [ + '__radd__', + '__rsub__', + '__rmul__', + '__rtruediv__', + '__rflowdiv__', + '__rpow__', + '__rmatmul__', + '__rmod__', + '__rand__', + '__ror__', + '__rxor__', + '__rlshift__', + '__rrshift__', +] + +aa_datatypes = [ + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + 'uint32', + 'uint64', + 'float32', + 'float64', +] + +aa_datatype_functions = [ + 'broadcast_arrays', #(*arrays) + 'broadcast_to', #(x, /, shape) + 'can_cast', #(from_, to, /) + 'finfo', #(type, /) + 'iinfo', #(type, /) + 'result_type', #(*arrays_and_dtypes) +] + +aa_methods = [ + '__abs__', #(self, /) + '__add__', #(self, other, /) + '__and__', #(self, other, /) + '__array_namespace__', #(self, /, *, api_version=None) + '__bool__', #(self, 
/) + '__dlpack__', #(self, /, *, stream=None) + '__dlpack_device__', #(self, /) + '__eq__', #(self, other, /) + '__float__', #(self, /) + '__floordiv__', #(self, other, /) + '__ge__', #(self, other, /) + '__getitem__', #(self, key, /) + '__gt__', #(self, other, /) + '__int__', #(self, /) + '__invert__', #(self, /) + '__le__', #(self, other, /) + '__len__', #(self, /) + '__lshift__', #(self, other, /) + '__lt__', #(self, other, /) + '__matmul__', #(self, other, /) + '__mod__', #(self, other, /) + '__mul__', #(self, other, /) + '__ne__', #(self, other, /) + '__neg__', #(self, /) + '__or__', #(self, other, /) + '__pos__', #(self, /) + '__pow__', #(self, other, /) + '__rshift__', #(self, other, /) + '__setitem__', #(self, key, value, /) + '__sub__', #(self, other, /) + '__truediv__', #(self, other, /) + '__xor__', #(self, other, /) +] + +aa_creators = [ + 'arange', #(start, /, stop=None, step=1, *, dtype=None, device=None) + 'asarray', #(obj, /, *, dtype=None, device=None, copy=None) + 'empty', #(shape, *, dtype=None, device=None) + 'empty_like', #(x, /, *, dtype=None, device=None) + 'eye', #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + 'from_dlpack', #(x, /) + 'full', #(shape, fill_value, *, dtype=None, device=None) + 'full_like', #(x, /, fill_value, *, dtype=None, device=None) + 'linspace', #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) + 'meshgrid', #(*arrays, indexing=’xy’) + 'ones', #(shape, *, dtype=None, device=None) + 'ones_like', #(x, /, *, dtype=None, device=None) + 'zeros', #(shape, *, dtype=None, device=None) + 'zeros_like', #(x, /, *, dtype=None, device=None) +] + +aa_attributes = [ + 'dtype', + 'device', + 'ndim', + 'shape', + 'size', + 'T', +] + +aa_methods_a = [ + '__abs__', #(self, /) + '__add__', #(self, other, /) + '__floordiv__', #(self, other, /) + '__invert__', #(self, /) + '__lshift__', #(self, other, /) + '__matmul__', #(self, other, /) + '__mod__', #(self, other, /) + '__mul__', #(self, other, /) + '__neg__', #(self, /) + '__pos__', #(self, /) + '__pow__', #(self, other, /) + '__rshift__', #(self, other, /) + '__sub__', #(self, other, /) + '__truediv__', #(self, other, /) + '__getitem__', #(self, key, /) + '__setitem__', #(self, key, value, /) + '__eq__', #(self, other, /) + '__ge__', #(self, other, /) + '__gt__', #(self, other, /) + '__le__', #(self, other, /) + '__lt__', #(self, other, /) + '__ne__', #(self, other, /) + '__and__', #(self, other, /) + '__or__', #(self, other, /) + '__xor__', #(self, other, /) +] + +aa_methods_s = [ + '__array_namespace__', #(self, /, *, api_version=None) + '__bool__', #(self, /) + '__dlpack__', #(self, /, *, stream=None) + '__dlpack_device__', #(self, /) + '__float__', #(self, /) + '__int__', #(self, /) + '__len__', #(self, /) +] + +aa_methods = aa_methods_s + aa_methods_a + +aa_elementwises = [ + 'abs', #(x, /) + 'acos', #(x, /) + 'acosh', #(x, /) + 'add', #(x1, x2, /) + 'asin', #(x, /) + 'asinh', #(x, /) + 'atan', #(x, /) + 'atan2', #(x1, x2, /) + 'atanh', #(x, /) + 'bitwise_and', #(x1, x2, /) + 'bitwise_left_shift', #(x1, x2, /) + 'bitwise_invert', #(x, /) + 'bitwise_or', #(x1, x2, /) + 'bitwise_right_shift', #(x1, x2, /) + 'bitwise_xor', #(x1, x2, /) + 'ceil', #(x, /) + 'cos', #(x, /) + 'cosh', #(x, /) + 'divide', #(x1, x2, /) + 'equal', #(x1, x2, /) + 'exp', #(x, /) + 'expm1', #(x, /) + 'floor', #(x, /) + 'floor_divide', #(x1, x2, /) + 'greater', #(x1, x2, /) + 'greater_equal', #(x1, x2, /) + 'isfinite', #(x, /) + 'isinf', #(x, /) + 'isnan', #(x, /) + 'less', #(x1, x2, /) + 'less_equal', #(x1, x2, 
/) + 'log', #(x, /) + 'log1p', #(x, /) + 'log2', #(x, /) + 'log10', #(x, /) + 'logaddexp', #(x1, x2) + 'logical_and', #(x1, x2, /) + 'logical_not', #(x, /) + 'logical_or', #(x1, x2, /) + 'logical_xor', #(x1, x2, /) + 'multiply', #(x1, x2, /) + 'negative', #(x, /) + 'not_equal', #(x1, x2, /) + 'positive', #(x, /) + 'pow', #(x1, x2, /) + 'remainder', #(x1, x2, /) + 'round', #(x, /) + 'sign', #(x, /) + 'sin', #(x, /) + 'sinh', #(x, /) + 'square', #(x, /) + 'sqrt', #(x, /) + 'subtract', #(x1, x2, /) + 'tan', #(x, /) + 'tanh', #(x, /) + 'trunc', #(x, /) +] + +aa_statisticals = [ + 'max', #(x, /, *, axis=None, keepdims=False) + 'mean', #(x, /, *, axis=None, keepdims=False) + 'min', #(x, /, *, axis=None, keepdims=False) + 'prod', #(x, /, *, axis=None, keepdims=False) + 'std', #(x, /, *, axis=None, correction=0.0, keepdims=False) + 'sum', #(x, /, *, axis=None, keepdims=False) + 'var', #(x, /, *, axis=None, correction=0.0, keepdims=False) +] + +aa_searching = [ + 'argmax', + 'argmin', + 'nonzero', + 'where', +] + +aa_sorting = [ + 'argsort', + 'sort', +] + +aa_set = [ + 'unique', +] + +aa_utility = [ + 'all', + 'any', +] + +aa_constants = [ + 'e', + 'inf', + 'nan', + 'pi', +] + +aa_tlfuncs = aa_creators + aa_elementwises + aa_statisticals + aa_datatype_functions + aa_searching + aa_sorting + aa_set + aa_utility +aa_tldir = aa_tlfuncs + aa_datatypes + aa_constants +aa_arrayfuncs = aa_methods + aa_inplace_operators + aa_reflected_operators +aa_arraydir = aa_attributes + aa_arrayfuncs diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py new file mode 100644 index 0000000000..5153c231cf --- /dev/null +++ b/heat/cw4heat/distributor.py @@ -0,0 +1,175 @@ +# MIT License + +# Copyright (c) 2021 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +############################################################################### +# Distribution engine. +# - schedules same tasks on all workers +# - handles dependences seperately +# This currently is a very simple eagerly executing machinery. +# We can make this better over time. A low hanging fruit seems might +# be to delay distribution until go() is called. This would allow aggregating +# multiple distribution messages into one. +# +# Dependent objects have a unique identifier, assigned when a handle to it is +# created. We assume that all workers execute handle-creation in identical order. +# Such dependences are assumed to be global entities, e.g. each worker holds +# a handle/reference to it (e.g. 
a heat.DNDarray). The local handles
+# exist on each rank, stored in a worker-local dictionary. This allows identifying
+# dependences through simple integers.
+#
+# Notice: mpi4py does not provide ibcast, so we cannot overlap. This makes the
+# above aggregation particularly promising. Another option would be to write
+# this in C/C++ and use ibcast.
+###############################################################################
+
+
+import sys
+from mpi4py import MPI
+_comm = MPI.COMM_WORLD
+
+# define identifiers
+END = 0
+TASK = 1
+GO = 2
+
+
+def init():
+    'Init distributor'
+    pass
+
+
+def start():
+    '''
+    Start distribution engine.
+    Controller inits and returns.
+    Workers enter recv-loop and exit program when fini is called.
+    '''
+    if _comm.rank != 0:
+        done = False
+        header = None
+        rtask = None
+        while not done:
+            # wait in bcast for work
+            header = _comm.bcast(header, 0)
+            # then see what we need to do
+            if header[0] == END:
+                done = True
+                break
+            elif header[0] == TASK:
+                header[1].submit()
+            elif header[0] == GO:
+                # no delayed execution for now -> nothing to do
+                pass
+            else:
+                raise Exception("Worker received unknown tag")
+        sys.exit()
+
+
+def fini():
+    'Control sends end-tag. Workers will sys.exit'
+    header = [END]
+    header = _comm.bcast(header, 0)
+
+
+def go():
+    'Trigger execution of all tasks that are still in flight'
+    header = [GO]
+    header = _comm.bcast(header, 0)
+
+
+def submitPP(task, deps, in_order=True):
+    '''
+    Submit a process-parallel task and return a handle/future.
+    '''
+    rtask = _RemoteTask(task, deps)
+    header = [TASK, rtask]
+    _, rtask = _comm.bcast(header, 0)
+    return rtask.submit()
+
+
+class Handle:
+    '''
+    A future representing an object that will be available eventually.
+    get() will return None as long as the value is not available.
+    '''
+
+    # this defines the next free and globally unique identifier
+    _nextId = 1
+
+    def __init__(self):
+        '''
+        Initialize handle.
+        We assume all workers create handles to objects in identical order.
+        This allows us to assign a simple integer as the unique id.
+        '''
+        self._obj = None
+        self._id = Handle._nextId
+        Handle._nextId += 1
+
+    def set(self, obj):
+        'Make object available.'
+        self._obj = obj
+
+    def getId(self):
+        'Return future/handle id'
+        return self._id
+
+    def get(self):
+        'Return object or None'
+        return self._obj
+
+
+class _RemoteTask:
+    '''
+    A task which is executed remotely on a worker.
+    It accepts a task with a run-method that it will execute at some point.
+    It also accepts dependences explicitly and so allows creating
+    task-graphs etc.
+
+    We keep a static dictionary mapping globally unique identifiers to dependent
+    global objects (like heat.DNDarrays). This keeps the objects alive and allows
+    communicating through simple integers.
+    '''
+
+    def __init__(self, task, deps, inorder=True):
+        self._depIds = deps
+        self._task = task
+        self._inorder = inorder
+
+    # here we store objects that are input dependences to tasks
+    s_pms = {}
+
+    def submit(self):
+        '''
+        Submit task to local task scheduler.
+        For now we execute eagerly, this is much simpler to implement.
+        Later, we might consider lazy evaluation, task-graph-optimizations etc.
+        FIXME: We currently assign a new id and store the result even when there is no result
+        or the result is not a global object.
+ ''' + deps = [_RemoteTask.s_pms[i] for i in self._depIds] + res = self._task.run(deps) + hndl = Handle() + hndl.set(res) + _RemoteTask.s_pms[hndl.getId()] = res + return hndl From 634398815f3b262f87ab2e659cf1b706b2639848 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Fri, 25 Jun 2021 10:13:48 -0500 Subject: [PATCH 02/22] auto-init and auto-fini --- heat/cw4heat/__init__.py | 18 +++++++++--------- heat/cw4heat/distributor.py | 5 +++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 28698df781..1057e2438a 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -49,6 +49,7 @@ # and non-array-types arguments separately. ############################################################################### +import atexit from . import distributor from .arrayapi import ( aa_attributes, @@ -69,16 +70,9 @@ def init(): ''' - Initialize distribution engine. + Initialize distribution engine. Automatically when when importing cw4heat. For now we assume all ranks (controller and workers) are started through mpirun, workers will never leave distributor.start() and so this function. - Call this as the very first thing in your program. For now it is recommended - to start your program with - - import heat.cw4heat as ht - ht.init() - - Also call fini() before exiting. ''' distributor.init() distributor.start() @@ -86,7 +80,7 @@ def init(): def fini(): ''' - Finalize/shutdown distribution engine. + Finalize/shutdown distribution engine. Automatically called at exit. When called on controller, workers will sys.exit from init(). ''' distributor.fini() @@ -254,3 +248,9 @@ class random: for method, obj in impl.random.__dict__.items(): if callable(obj): exec(f"{method} = staticmethod(lambda *args, **kwargs: DDParray(_submit('{impl_str}.random.{method}', args, kwargs)))") + + +####################################################################### +####################################################################### +atexit.register(fini) +init() diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 5153c231cf..8a1e88f6b6 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -87,8 +87,9 @@ def start(): def fini(): 'Control sends end-tag. Workers will sys.exit' - header = [END] - header = _comm.bcast(header, 0) + if _comm.rank == 0: + header = [END] + header = _comm.bcast(header, 0) def go(): From 7c780ccdd445f2577f67415cb1efe92277b07535 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Mon, 5 Jul 2021 03:38:13 -0500 Subject: [PATCH 03/22] using dealyed execution; using double-quotes --- heat/cw4heat/__init__.py | 65 +++--- heat/cw4heat/arrayapi.py | 454 ++++++++++++++++++------------------ heat/cw4heat/distributor.py | 149 ++++++++---- 3 files changed, 365 insertions(+), 303 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 1057e2438a..3b434dcb8c 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -69,26 +69,26 @@ dndarray_str = "impl.DNDarray" def init(): - ''' + """ Initialize distribution engine. Automatically when when importing cw4heat. For now we assume all ranks (controller and workers) are started through mpirun, workers will never leave distributor.start() and so this function. - ''' + """ distributor.init() distributor.start() def fini(): - ''' + """ Finalize/shutdown distribution engine. Automatically called at exit. When called on controller, workers will sys.exit from init(). 
- ''' + """ distributor.fini() class _Task: - 'A work item, executing functions provided as code.' - def __init__(self, func, args, kwargs, unwrap='*'): + "A work item, executing functions provided as code." + def __init__(self, func, args, kwargs, unwrap="*"): self._func = func self._args = args self._kwargs = kwargs @@ -102,7 +102,7 @@ def run(self, deps): class _PropertyTask: - 'A work item, executing class properties provided as code.' + "A work item, executing class properties provided as code." def __init__(self, func): self._func = func @@ -110,19 +110,19 @@ def run(self, deps): return eval(f"deps[0].{self._func}") -def _submit(name, args, kwargs, unwrap='*'): - ''' +def _submit(name, args, kwargs, unwrap="*", numout=1): + """ Create a _Task and submit, return PManager/Future. - ''' + """ scalar_args = tuple(x for x in args if not isinstance(x, DDParray)) deps = [x._handle.getId() for x in args if isinstance(x, DDParray)] - return distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps) + return distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps, numout) def _submitProperty(name, self): - ''' + """ Create a _PropertyTask (property) and submit, return PManager/Future. - ''' + """ t = _PropertyTask(name) try: res = distributor.submitPP(t, [self._handle.getId()]) @@ -142,41 +142,41 @@ def _setitem_normalized(self, value, key): # allow delayed execution. ####################################################################### class DDParray: - ''' + """ Shallow wrapper class representing a distributed array. It will be filled dynamically from lists extracted from the array-API. All functionality is delegated to the underlying implementation, executed in tasks. - ''' + """ ####################################################################### # first define methods/properties which need special care. ####################################################################### def __init__(self, handle): - 'Do not use this array. Use creator functions instead.' + "Do not use this array. Use creator functions instead." self._handle = handle def heat(self): - ''' + """ Return heat native array. With delayed execution, triggers computation as needed and blocks until array is available. - ''' + """ return self._handle.get() def __getitem__(self, key): - 'Return item/slice as array.' - return DDParray(_submit(f'{dndarray_str}.__getitem__', (self, key), {})) + "Return item/slice as array." + return DDParray(_submit(f"{dndarray_str}.__getitem__", (self, key), {})) # bring args in the order we can process and feed into normal process # using global normalized version def __setitem__(self, key, value): - 'set item/slice to given value' - _submit(f'_setitem_normalized', (self, value, key), {}) + "set item/slice to given value" + _submit(f"_setitem_normalized", (self, value, key), {}) @property def T(self): - return DDParray(_submitProperty('T', self)) + return DDParray(_submitProperty("T", self)) ####################################################################### @@ -186,9 +186,9 @@ def T(self): # dynamically generate class methods from list of methods in array-API # we simply make lambdas which submit appropriate Tasks # FIXME: aa_inplace_operators,others? 
- fixme_afuncs = ['squeeze', 'astype', 'balance',] + fixme_afuncs = ["squeeze", "astype", "balance", "resplit",] for method in aa_methods_a + aa_reflected_operators + fixme_afuncs: - if method not in ['__getitem__', '__setitem__'] and hasattr(dndarray, method): + if method not in ["__getitem__", "__setitem__"] and hasattr(dndarray, method): exec(f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))") for method in aa_methods_s: @@ -196,7 +196,7 @@ def T(self): exec(f"{method} = lambda self, *args, **kwargs: _submit('{dndarray_str}.{method}', (self, *args), kwargs).get()") for attr in aa_attributes: - if attr != 'T' and hasattr(dndarray, attr): + if attr != "T" and hasattr(dndarray, attr): exec(f"{attr} = property(lambda self: self._handle.get().{attr})") def __getattr__(self, attr): @@ -212,7 +212,7 @@ def __getattr__(self, attr): # np.concatenate accepts a list of arrays (not individual arrays) # so we let the task not unwrap the list of deps def concatenate(*args, **kwargs): - return DDParray(_submit(f'{impl_str}.concatenate', *args, kwargs, unwrap='')) + return DDParray(_submit(f"{impl_str}.concatenate", *args, kwargs, unwrap="")) ####################################################################### @@ -224,13 +224,16 @@ def concatenate(*args, **kwargs): # (lists taken from list of methods in array-API) # Again, we simply make lambdas which submit appropriate Tasks -fixme_funcs = ['load_csv'] +fixme_funcs = ["load_csv", "array", "triu"] for func in aa_tlfuncs + fixme_funcs: - exec(f"{func} = lambda *args, **kwargs: DDParray(_submit('{impl_str}.{func}', args, kwargs))") + if func == "meshgrid": + exec(f"{func} = lambda *args, **kwargs: list(DDParray(x) for x in _submit('{impl_str}.{func}', args, kwargs, numout=len(args)))") + else: + exec(f"{func} = lambda *args, **kwargs: DDParray(_submit('{impl_str}.{func}', args, kwargs))") -def concatenate(*args, **kwargs): - return DDParray(_submit(f'{impl_str}.concatenate', *args, kwargs, unwrap='')) +for func in ["concatenate", "hstack",]: + exec(f"{func} = lambda *args, **kwargs: DDParray(_submit(f'{impl_str}.{func}', *args, kwargs, unwrap=''))") # Here we data types and constants diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py index aaf18a3177..5203e9e4c8 100644 --- a/heat/cw4heat/arrayapi.py +++ b/heat/cw4heat/arrayapi.py @@ -1,285 +1,285 @@ -__all__ = ['aa_creators', 'aa_attributes', 'aa_methods', 'aa_elementwises', 'aa_statisticals', - 'aa_inplace_operators', 'aa_reflected_operators', 'aa_datatypes', 'aa_datatype_functions', - 'aa_searching', 'aa_sorting', 'aa_set', 'aa_utility', 'aa_constants', - 'aa_arraydir', 'aa_tldir', 'aa_tlfuncs', 'aa_arrayfuncs', 'aa_methods_s', 'aa_methods_a'] +__all__ = ["aa_creators", "aa_attributes", "aa_methods", "aa_elementwises", "aa_statisticals", + "aa_inplace_operators", "aa_reflected_operators", "aa_datatypes", "aa_datatype_functions", + "aa_searching", "aa_sorting", "aa_set", "aa_utility", "aa_constants", + "aa_arraydir", "aa_tldir", "aa_tlfuncs", "aa_arrayfuncs", "aa_methods_s", "aa_methods_a"] aa_creators = [ - 'arange', #(start, /, stop=None, step=1, *, dtype=None, device=None) - 'asarray', #(obj, /, *, dtype=None, device=None, copy=None) - 'empty', #(shape, *, dtype=None, device=None) - 'empty_like', #(x, /, *, dtype=None, device=None) - 'eye', #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) - 'from_dlpack', #(x, /) - 'full', #(shape, fill_value, *, dtype=None, device=None) - 'full_like', #(x, /, fill_value, *, 
dtype=None, device=None) - 'linspace', #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) - 'meshgrid', #(*arrays, indexing=’xy’) - 'ones', #(shape, *, dtype=None, device=None) - 'ones_like', #(x, /, *, dtype=None, device=None) - 'zeros', #(shape, *, dtype=None, device=None) - 'zeros_like', #(x, /, *, dtype=None, device=None) + "arange", #(start, /, stop=None, step=1, *, dtype=None, device=None) + "asarray", #(obj, /, *, dtype=None, device=None, copy=None) + "empty", #(shape, *, dtype=None, device=None) + "empty_like", #(x, /, *, dtype=None, device=None) + "eye", #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + "from_dlpack", #(x, /) + "full", #(shape, fill_value, *, dtype=None, device=None) + "full_like", #(x, /, fill_value, *, dtype=None, device=None) + "linspace", #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) + "meshgrid", #(*arrays, indexing=’xy’) + "ones", #(shape, *, dtype=None, device=None) + "ones_like", #(x, /, *, dtype=None, device=None) + "zeros", #(shape, *, dtype=None, device=None) + "zeros_like", #(x, /, *, dtype=None, device=None) ] aa_attributes = [ - 'dtype', - 'device', - 'ndim', - 'shape', - 'size', - 'T', + "dtype", + "device", + "ndim", + "shape", + "size", + "T", ] aa_inplace_operators = [ - '__iadd__', - '__isub__', - '__imul__', - '__itruediv__', - '__iflowdiv__', - '__ipow__', - '__imatmul__', - '__imod__', - '__iand__', - '__ior__', - '__ixor__', - '__ilshift__', - '__irshift__', + "__iadd__", + "__isub__", + "__imul__", + "__itruediv__", + "__iflowdiv__", + "__ipow__", + "__imatmul__", + "__imod__", + "__iand__", + "__ior__", + "__ixor__", + "__ilshift__", + "__irshift__", ] aa_reflected_operators = [ - '__radd__', - '__rsub__', - '__rmul__', - '__rtruediv__', - '__rflowdiv__', - '__rpow__', - '__rmatmul__', - '__rmod__', - '__rand__', - '__ror__', - '__rxor__', - '__rlshift__', - '__rrshift__', + "__radd__", + "__rsub__", + "__rmul__", + "__rtruediv__", + "__rflowdiv__", + "__rpow__", + "__rmatmul__", + "__rmod__", + "__rand__", + "__ror__", + "__rxor__", + "__rlshift__", + "__rrshift__", ] aa_datatypes = [ - 'bool', - 'int8', - 'int16', - 'int32', - 'int64', - 'uint8', - 'uint16', - 'uint32', - 'uint64', - 'float32', - 'float64', + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", ] aa_datatype_functions = [ - 'broadcast_arrays', #(*arrays) - 'broadcast_to', #(x, /, shape) - 'can_cast', #(from_, to, /) - 'finfo', #(type, /) - 'iinfo', #(type, /) - 'result_type', #(*arrays_and_dtypes) + "broadcast_arrays", #(*arrays) + "broadcast_to", #(x, /, shape) + "can_cast", #(from_, to, /) + "finfo", #(type, /) + "iinfo", #(type, /) + "result_type", #(*arrays_and_dtypes) ] aa_methods = [ - '__abs__', #(self, /) - '__add__', #(self, other, /) - '__and__', #(self, other, /) - '__array_namespace__', #(self, /, *, api_version=None) - '__bool__', #(self, /) - '__dlpack__', #(self, /, *, stream=None) - '__dlpack_device__', #(self, /) - '__eq__', #(self, other, /) - '__float__', #(self, /) - '__floordiv__', #(self, other, /) - '__ge__', #(self, other, /) - '__getitem__', #(self, key, /) - '__gt__', #(self, other, /) - '__int__', #(self, /) - '__invert__', #(self, /) - '__le__', #(self, other, /) - '__len__', #(self, /) - '__lshift__', #(self, other, /) - '__lt__', #(self, other, /) - '__matmul__', #(self, other, /) - '__mod__', #(self, other, /) - '__mul__', #(self, other, /) - '__ne__', #(self, other, /) - '__neg__', #(self, /) - '__or__', #(self, other, /) - 
'__pos__', #(self, /) - '__pow__', #(self, other, /) - '__rshift__', #(self, other, /) - '__setitem__', #(self, key, value, /) - '__sub__', #(self, other, /) - '__truediv__', #(self, other, /) - '__xor__', #(self, other, /) + "__abs__", #(self, /) + "__add__", #(self, other, /) + "__and__", #(self, other, /) + "__array_namespace__", #(self, /, *, api_version=None) + "__bool__", #(self, /) + "__dlpack__", #(self, /, *, stream=None) + "__dlpack_device__", #(self, /) + "__eq__", #(self, other, /) + "__float__", #(self, /) + "__floordiv__", #(self, other, /) + "__ge__", #(self, other, /) + "__getitem__", #(self, key, /) + "__gt__", #(self, other, /) + "__int__", #(self, /) + "__invert__", #(self, /) + "__le__", #(self, other, /) + "__len__", #(self, /) + "__lshift__", #(self, other, /) + "__lt__", #(self, other, /) + "__matmul__", #(self, other, /) + "__mod__", #(self, other, /) + "__mul__", #(self, other, /) + "__ne__", #(self, other, /) + "__neg__", #(self, /) + "__or__", #(self, other, /) + "__pos__", #(self, /) + "__pow__", #(self, other, /) + "__rshift__", #(self, other, /) + "__setitem__", #(self, key, value, /) + "__sub__", #(self, other, /) + "__truediv__", #(self, other, /) + "__xor__", #(self, other, /) ] aa_creators = [ - 'arange', #(start, /, stop=None, step=1, *, dtype=None, device=None) - 'asarray', #(obj, /, *, dtype=None, device=None, copy=None) - 'empty', #(shape, *, dtype=None, device=None) - 'empty_like', #(x, /, *, dtype=None, device=None) - 'eye', #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) - 'from_dlpack', #(x, /) - 'full', #(shape, fill_value, *, dtype=None, device=None) - 'full_like', #(x, /, fill_value, *, dtype=None, device=None) - 'linspace', #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) - 'meshgrid', #(*arrays, indexing=’xy’) - 'ones', #(shape, *, dtype=None, device=None) - 'ones_like', #(x, /, *, dtype=None, device=None) - 'zeros', #(shape, *, dtype=None, device=None) - 'zeros_like', #(x, /, *, dtype=None, device=None) + "arange", #(start, /, stop=None, step=1, *, dtype=None, device=None) + "asarray", #(obj, /, *, dtype=None, device=None, copy=None) + "empty", #(shape, *, dtype=None, device=None) + "empty_like", #(x, /, *, dtype=None, device=None) + "eye", #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + "from_dlpack", #(x, /) + "full", #(shape, fill_value, *, dtype=None, device=None) + "full_like", #(x, /, fill_value, *, dtype=None, device=None) + "linspace", #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) + "meshgrid", #(*arrays, indexing=’xy’) + "ones", #(shape, *, dtype=None, device=None) + "ones_like", #(x, /, *, dtype=None, device=None) + "zeros", #(shape, *, dtype=None, device=None) + "zeros_like", #(x, /, *, dtype=None, device=None) ] aa_attributes = [ - 'dtype', - 'device', - 'ndim', - 'shape', - 'size', - 'T', + "dtype", + "device", + "ndim", + "shape", + "size", + "T", ] aa_methods_a = [ - '__abs__', #(self, /) - '__add__', #(self, other, /) - '__floordiv__', #(self, other, /) - '__invert__', #(self, /) - '__lshift__', #(self, other, /) - '__matmul__', #(self, other, /) - '__mod__', #(self, other, /) - '__mul__', #(self, other, /) - '__neg__', #(self, /) - '__pos__', #(self, /) - '__pow__', #(self, other, /) - '__rshift__', #(self, other, /) - '__sub__', #(self, other, /) - '__truediv__', #(self, other, /) - '__getitem__', #(self, key, /) - '__setitem__', #(self, key, value, /) - '__eq__', #(self, other, /) - '__ge__', #(self, other, /) - '__gt__', #(self, other, /) - '__le__', #(self, 
other, /) - '__lt__', #(self, other, /) - '__ne__', #(self, other, /) - '__and__', #(self, other, /) - '__or__', #(self, other, /) - '__xor__', #(self, other, /) + "__abs__", #(self, /) + "__add__", #(self, other, /) + "__floordiv__", #(self, other, /) + "__invert__", #(self, /) + "__lshift__", #(self, other, /) + "__matmul__", #(self, other, /) + "__mod__", #(self, other, /) + "__mul__", #(self, other, /) + "__neg__", #(self, /) + "__pos__", #(self, /) + "__pow__", #(self, other, /) + "__rshift__", #(self, other, /) + "__sub__", #(self, other, /) + "__truediv__", #(self, other, /) + "__getitem__", #(self, key, /) + "__setitem__", #(self, key, value, /) + "__eq__", #(self, other, /) + "__ge__", #(self, other, /) + "__gt__", #(self, other, /) + "__le__", #(self, other, /) + "__lt__", #(self, other, /) + "__ne__", #(self, other, /) + "__and__", #(self, other, /) + "__or__", #(self, other, /) + "__xor__", #(self, other, /) ] aa_methods_s = [ - '__array_namespace__', #(self, /, *, api_version=None) - '__bool__', #(self, /) - '__dlpack__', #(self, /, *, stream=None) - '__dlpack_device__', #(self, /) - '__float__', #(self, /) - '__int__', #(self, /) - '__len__', #(self, /) + "__array_namespace__", #(self, /, *, api_version=None) + "__bool__", #(self, /) + "__dlpack__", #(self, /, *, stream=None) + "__dlpack_device__", #(self, /) + "__float__", #(self, /) + "__int__", #(self, /) + "__len__", #(self, /) ] aa_methods = aa_methods_s + aa_methods_a aa_elementwises = [ - 'abs', #(x, /) - 'acos', #(x, /) - 'acosh', #(x, /) - 'add', #(x1, x2, /) - 'asin', #(x, /) - 'asinh', #(x, /) - 'atan', #(x, /) - 'atan2', #(x1, x2, /) - 'atanh', #(x, /) - 'bitwise_and', #(x1, x2, /) - 'bitwise_left_shift', #(x1, x2, /) - 'bitwise_invert', #(x, /) - 'bitwise_or', #(x1, x2, /) - 'bitwise_right_shift', #(x1, x2, /) - 'bitwise_xor', #(x1, x2, /) - 'ceil', #(x, /) - 'cos', #(x, /) - 'cosh', #(x, /) - 'divide', #(x1, x2, /) - 'equal', #(x1, x2, /) - 'exp', #(x, /) - 'expm1', #(x, /) - 'floor', #(x, /) - 'floor_divide', #(x1, x2, /) - 'greater', #(x1, x2, /) - 'greater_equal', #(x1, x2, /) - 'isfinite', #(x, /) - 'isinf', #(x, /) - 'isnan', #(x, /) - 'less', #(x1, x2, /) - 'less_equal', #(x1, x2, /) - 'log', #(x, /) - 'log1p', #(x, /) - 'log2', #(x, /) - 'log10', #(x, /) - 'logaddexp', #(x1, x2) - 'logical_and', #(x1, x2, /) - 'logical_not', #(x, /) - 'logical_or', #(x1, x2, /) - 'logical_xor', #(x1, x2, /) - 'multiply', #(x1, x2, /) - 'negative', #(x, /) - 'not_equal', #(x1, x2, /) - 'positive', #(x, /) - 'pow', #(x1, x2, /) - 'remainder', #(x1, x2, /) - 'round', #(x, /) - 'sign', #(x, /) - 'sin', #(x, /) - 'sinh', #(x, /) - 'square', #(x, /) - 'sqrt', #(x, /) - 'subtract', #(x1, x2, /) - 'tan', #(x, /) - 'tanh', #(x, /) - 'trunc', #(x, /) + "abs", #(x, /) + "acos", #(x, /) + "acosh", #(x, /) + "add", #(x1, x2, /) + "asin", #(x, /) + "asinh", #(x, /) + "atan", #(x, /) + "atan2", #(x1, x2, /) + "atanh", #(x, /) + "bitwise_and", #(x1, x2, /) + "bitwise_left_shift", #(x1, x2, /) + "bitwise_invert", #(x, /) + "bitwise_or", #(x1, x2, /) + "bitwise_right_shift", #(x1, x2, /) + "bitwise_xor", #(x1, x2, /) + "ceil", #(x, /) + "cos", #(x, /) + "cosh", #(x, /) + "divide", #(x1, x2, /) + "equal", #(x1, x2, /) + "exp", #(x, /) + "expm1", #(x, /) + "floor", #(x, /) + "floor_divide", #(x1, x2, /) + "greater", #(x1, x2, /) + "greater_equal", #(x1, x2, /) + "isfinite", #(x, /) + "isinf", #(x, /) + "isnan", #(x, /) + "less", #(x1, x2, /) + "less_equal", #(x1, x2, /) + "log", #(x, /) + "log1p", #(x, /) + "log2", #(x, /) + "log10", #(x, /) 
+    "logaddexp", #(x1, x2)
+    "logical_and", #(x1, x2, /)
+    "logical_not", #(x, /)
+    "logical_or", #(x1, x2, /)
+    "logical_xor", #(x1, x2, /)
+    "multiply", #(x1, x2, /)
+    "negative", #(x, /)
+    "not_equal", #(x1, x2, /)
+    "positive", #(x, /)
+    "pow", #(x1, x2, /)
+    "remainder", #(x1, x2, /)
+    "round", #(x, /)
+    "sign", #(x, /)
+    "sin", #(x, /)
+    "sinh", #(x, /)
+    "square", #(x, /)
+    "sqrt", #(x, /)
+    "subtract", #(x1, x2, /)
+    "tan", #(x, /)
+    "tanh", #(x, /)
+    "trunc", #(x, /)
 ]
 
 aa_statisticals = [
-    'max', #(x, /, *, axis=None, keepdims=False)
-    'mean', #(x, /, *, axis=None, keepdims=False)
-    'min', #(x, /, *, axis=None, keepdims=False)
-    'prod', #(x, /, *, axis=None, keepdims=False)
-    'std', #(x, /, *, axis=None, correction=0.0, keepdims=False)
-    'sum', #(x, /, *, axis=None, keepdims=False)
-    'var', #(x, /, *, axis=None, correction=0.0, keepdims=False)
+    "max", #(x, /, *, axis=None, keepdims=False)
+    "mean", #(x, /, *, axis=None, keepdims=False)
+    "min", #(x, /, *, axis=None, keepdims=False)
+    "prod", #(x, /, *, axis=None, keepdims=False)
+    "std", #(x, /, *, axis=None, correction=0.0, keepdims=False)
+    "sum", #(x, /, *, axis=None, keepdims=False)
+    "var", #(x, /, *, axis=None, correction=0.0, keepdims=False)
 ]
 
 aa_searching = [
-    'argmax',
-    'argmin',
-    'nonzero',
-    'where',
+    "argmax",
+    "argmin",
+    "nonzero",
+    "where",
 ]
 
 aa_sorting = [
-    'argsort',
-    'sort',
+    "argsort",
+    "sort",
 ]
 
 aa_set = [
-    'unique',
+    "unique",
 ]
 
 aa_utility = [
-    'all',
-    'any',
+    "all",
+    "any",
 ]
 
 aa_constants = [
-    'e',
-    'inf',
-    'nan',
-    'pi',
+    "e",
+    "inf",
+    "nan",
+    "pi",
 ]
 
 aa_tlfuncs = aa_creators + aa_elementwises + aa_statisticals + aa_datatype_functions + aa_searching + aa_sorting + aa_set + aa_utility
diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py
index 8a1e88f6b6..54f1137b19 100644
--- a/heat/cw4heat/distributor.py
+++ b/heat/cw4heat/distributor.py
@@ -25,10 +25,17 @@
 # Distribution engine.
 # - schedules same tasks on all workers
 # - handles dependences seperately
-# This currently is a very simple eagerly executing machinery.
-# We can make this better over time. A low hanging fruit seems might
-# be to delay distribution until go() is called. This would allow aggregating
-# multiple distribution messages into one.
+#
+# When tasks are submitted on the root rank they are pushed onto a queue and a
+# handle/future is returned. When computation is requested by calling go()
+# all tasks on the queue are sent to workers and executed on all ranks
+# sequentially.
+#
+# We store tasks in the same order as they are submitted on the root rank.
+# For any valid program this must be a legal ordering, so there is no need to
+# check if dependent objects are ready when a task is executed. A more
+# sophisticated scheduler could potentially try to parallelize. It remains to
+# be investigated if this would be a profitable feature, though.
 #
 # Dependent objects have a unique identifier, assigned when a handle to it is
 # created. We assume that all workers execute handle-creation in identical order.
@@ -43,8 +50,10 @@
 ###############################################################################
 
 
-import sys
 from mpi4py import MPI
+import sys
+from collections import deque
+
 _comm = MPI.COMM_WORLD
 
 # define identifiers
 END = 0
 TASK = 1
 GO = 2
 
 
+class _TaskQueue:
+    """
+    A task queue; each rank holds one for queuing up local tasks.
+    We currently disallow submitting tasks on non-root ranks.
+    Non-root ranks get their TaskQueue set in the recv-loop of start().
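+
+    Controller-side sketch of the intended flow (illustration only):
+
+        h = submitPP(task, deps)  # queue a _RemoteTask, get its Handle
+        v = h.get()               # triggers go(): broadcast queue, run tasks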
+    """
+
+    def __init__(self):
+        # here we store all tasks that have not been executed yet
+        self._taskQueue = deque()
+
+    def submit(self, rtask):
+        """
+        Submit a task to the queue. Will not run it.
+        """
+        assert _comm.rank == 0
+        self._taskQueue.append(rtask)
+        return rtask._handle
+
+    def go(self):
+        """
+        Run all tasks in the queue.
+        We assume tasks were submitted in a valid order, i.e. in an order that
+        guarantees no task depends on another task that is behind it in the queue.
+        """
+        while len(self._taskQueue):
+            self._taskQueue.popleft().go()
+
+
+# Our queue of tasks.
+_tQueue = _TaskQueue()
+
+
 def init():
-    'Init distributor'
+    """
+    Init distributor.
+    """
     pass
 
 
 def start():
-    '''
+    """
     Start distribution engine.
     Controller inits and returns.
     Workers enter recv-loop and exit program when fini is called.
-    '''
+    """
     if _comm.rank != 0:
         done = False
         header = None
         rtask = None
         while not done:
             # wait in bcast for work
             header = _comm.bcast(header, 0)
             # then see what we need to do
             if header[0] == END:
@@ -76,72 +119,79 @@ def start():
                 done = True
                 break
             elif header[0] == TASK:
-                header[1].submit()
+                _tQueue._taskQueue = header[1]
             elif header[0] == GO:
                 # no delayed execution for now -> nothing to do
-                pass
+                _tQueue.go()
             else:
                 raise Exception("Worker received unknown tag")
         sys.exit()
-    
+
 
 def fini():
-    'Control sends end-tag. Workers will sys.exit'
+    """
+    Controller sends end-tag. Workers will sys.exit.
+    """
     if _comm.rank == 0:
         header = [END]
         header = _comm.bcast(header, 0)
 
 
 def go():
-    'Trigger execution of all tasks that are still in flight'
+    """
+    Trigger execution of all tasks that are still in flight.
+    """
+    assert _comm.rank == 0
+    header = [TASK, _tQueue._taskQueue ]
+    _, _ = _comm.bcast(header, 0)
     header = [GO]
-    header = _comm.bcast(header, 0)
+    _ = _comm.bcast(header, 0)
+    _tQueue.go()
 
 
-def submitPP(task, deps, in_order=True):
-    '''
+def submitPP(task, deps, numout=1):
+    """
     Submit a process-parallel task and return a handle/future.
-    '''
-    rtask = _RemoteTask(task, deps)
-    header = [TASK, rtask]
-    _, rtask = _comm.bcast(header, 0)
-    return rtask.submit()
+    """
+    rtask = _RemoteTask(task, deps, numout)
+    return _tQueue.submit(rtask)
 
 
 class Handle:
-    '''
+    """
     A future representing an object that will be available eventually.
     get() will return None as long as the value is not available.
-    '''
+    """
 
     # this defines the next free and globally unique identifier
     _nextId = 1
 
     def __init__(self):
-        '''
+        """
         Initialize handle.
         We assume all workers create handles to objects in identical order.
         This allows us to assign a simple integer as the unique id.
-        '''
+        """
         self._obj = None
         self._id = Handle._nextId
         Handle._nextId += 1
 
     def set(self, obj):
-        'Make object available.'
+        "Make object available."
         self._obj = obj
 
     def getId(self):
-        'Return future/handle id'
+        "Return future/handle id"
         return self._id
 
     def get(self):
-        'Return object or None'
+        "Return object or None"
+        go()
         return self._obj
-    
+
 
 class _RemoteTask:
-    '''
+    """
     A task which is executed remotely on a worker.
     It accepts a task with a run-method that it will execute at some point.
     It also accepts dependences explicitly and so allows creating
     task-graphs etc.
 
     We keep a static dictionary mapping globally unique identifiers to dependent
     global objects (like heat.DNDarrays). This keeps the objects alive and allows
     communicating through simple integers.
- ''' + """ - def __init__(self, task, deps, inorder=True): + def __init__(self, task, deps, numout): self._depIds = deps self._task = task - self._inorder = inorder + self._nOut = numout + # FIXME: We currently assign a new id and store the result even when there is no result + # or the result is not a global object. + if self._nOut == 1: + self._handle = Handle() + else: + self._handle = tuple(Handle() for _ in range(self._nOut)) + # here we store objects that are input dependences to tasks s_pms = {} - def submit(self): - ''' - Submit task to local task scheduler. - For now we execute eagerly, this is much simpler to implement. - Later, we might consider lazy evaluation, task-graph-optimizations etc. - FIXME: We currently assign a new id and store the result even when there is no result - or the result is not a global object. - ''' + def go(self): + """ + Actually run the task. + """ deps = [_RemoteTask.s_pms[i] for i in self._depIds] res = self._task.run(deps) - hndl = Handle() - hndl.set(res) - _RemoteTask.s_pms[hndl.getId()] = res - return hndl + if self._nOut == 1: + self._handle.set(res) + _RemoteTask.s_pms[self._handle.getId()] = res + else: + i = 0 + for h in self._handle: + h.set(res[i]) + _RemoteTask.s_pms[h.getId()] = res[i] + i += 1 + return self._handle From 04919dc714810e699622d6690d6ee91f7d43bc99 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Mon, 5 Jul 2021 04:09:41 -0500 Subject: [PATCH 04/22] making code flake8-, black-, and pydocstyle-compliant --- heat/cw4heat/__init__.py | 121 +++++++----- heat/cw4heat/arrayapi.py | 366 ++++++++++++++++++++---------------- heat/cw4heat/distributor.py | 73 +++---- 3 files changed, 318 insertions(+), 242 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 3b434dcb8c..ba033b75dc 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -22,31 +22,33 @@ ############################################################################### -# This provides a wrapper around SPMD-based HeAT -# (github.com/helmholtz-analytics/heat) to operate in controller-worker mode. - -# The goal is to provide a compliant implementation of the array API -# (github.com/data-apis/arra-api). - -# Returned array (DNDArray) objects are handles/futures only. Their content is -# available through __int__ etc., through __partitioned__ or heat(). Notice: this -# allows for delayed execution and optimizations of the workflow/task-graph and -# communication. - -# For a function/method of the array-API that is executed on the controller -# process, this wrapper generates the equivalent source code to be executed on -# the worker processes. The code is then sent to each remote worker and -# executed there. - -# It's up to the distribution layer (e.g. distributor) to make sure the code is -# executed in the right order on each process/worker so that collective -# communication in HeAT can operate correctly without dead-locks. - -# To allow workflow optimizations array dependences and to avoid -# pickle-dependencies to the array inputs we separate scalar/non-array arguments -# from array arguments. For this we assume that array arguments never occur -# after non-array arguments. Each function.task handles and passes array-typed -# and non-array-types arguments separately. +""" +This provides a wrapper around SPMD-based HeAT +(github.com/helmholtz-analytics/heat) to operate in controller-worker mode. + +The goal is to provide a compliant implementation of the array API +(github.com/data-apis/arra-api). 
+ +Returned array (DNDArray) objects are handles/futures only. Their content is +available through __int__ etc., through __partitioned__ or heat(). Notice: this +allows for delayed execution and optimizations of the workflow/task-graph and +communication. + +For a function/method of the array-API that is executed on the controller +process, this wrapper generates the equivalent source code to be executed on +the worker processes. The code is then sent to each remote worker and +executed there. + +It's up to the distribution layer (e.g. distributor) to make sure the code is +executed in the right order on each process/worker so that collective +communication in HeAT can operate correctly without dead-locks. + +To allow workflow optimizations array dependences and to avoid +pickle-dependencies to the array inputs we separate scalar/non-array arguments +from array arguments. For this we assume that array arguments never occur +after non-array arguments. Each function.task handles and passes array-typed +and non-array-types arguments separately. +""" ############################################################################### import atexit @@ -65,9 +67,11 @@ # just in case we find another SPMD/MPI implementation of numpy... import heat as impl from heat import DNDarray as dndarray + impl_str = "impl" dndarray_str = "impl.DNDarray" + def init(): """ Initialize distribution engine. Automatically when when importing cw4heat. @@ -87,7 +91,10 @@ def fini(): class _Task: - "A work item, executing functions provided as code." + """ + A work item, executing functions provided as code. + """ + def __init__(self, func, args, kwargs, unwrap="*"): self._func = func self._args = args @@ -102,7 +109,10 @@ def run(self, deps): class _PropertyTask: - "A work item, executing class properties provided as code." + """ + A work item, executing class properties provided as code. + """ + def __init__(self, func): self._func = func @@ -126,7 +136,7 @@ def _submitProperty(name, self): t = _PropertyTask(name) try: res = distributor.submitPP(t, [self._handle.getId()]) - except: + except Exception: assert False return res @@ -154,7 +164,9 @@ class DDParray: ####################################################################### def __init__(self, handle): - "Do not use this array. Use creator functions instead." + """ + Do not use this array. Use creator functions instead. + """ self._handle = handle def heat(self): @@ -165,20 +177,26 @@ def heat(self): return self._handle.get() def __getitem__(self, key): - "Return item/slice as array." + """ + Return item/slice as array. + """ return DDParray(_submit(f"{dndarray_str}.__getitem__", (self, key), {})) # bring args in the order we can process and feed into normal process # using global normalized version def __setitem__(self, key, value): - "set item/slice to given value" - _submit(f"_setitem_normalized", (self, value, key), {}) + """ + Set item/slice to given value. + """ + _submit("_setitem_normalized", (self, value, key), {}) @property def T(self): + """ + Transpose. + """ return DDParray(_submitProperty("T", self)) - ####################################################################### # Now we add methods/properties through the standard process. ####################################################################### @@ -186,14 +204,18 @@ def T(self): # dynamically generate class methods from list of methods in array-API # we simply make lambdas which submit appropriate Tasks # FIXME: aa_inplace_operators,others? 
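+    # DNDarray methods that are not covered by the array-API lists in arrayapi.py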
- fixme_afuncs = ["squeeze", "astype", "balance", "resplit",] + fixme_afuncs = ["squeeze", "astype", "balance", "resplit"] for method in aa_methods_a + aa_reflected_operators + fixme_afuncs: if method not in ["__getitem__", "__setitem__"] and hasattr(dndarray, method): - exec(f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))") + exec( + f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" + ) for method in aa_methods_s: if hasattr(dndarray, method): - exec(f"{method} = lambda self, *args, **kwargs: _submit('{dndarray_str}.{method}', (self, *args), kwargs).get()") + exec( + f"{method} = lambda self, *args, **kwargs: _submit('{dndarray_str}.{method}', (self, *args), kwargs).get()" + ) for attr in aa_attributes: if attr != "T" and hasattr(dndarray, attr): @@ -201,7 +223,7 @@ def T(self): def __getattr__(self, attr): # attributes are special - if not attr in aa_attributes: + if attr not in aa_attributes: raise Exception(f"unknown method/attribute {attr} requested") @@ -212,6 +234,9 @@ def __getattr__(self, attr): # np.concatenate accepts a list of arrays (not individual arrays) # so we let the task not unwrap the list of deps def concatenate(*args, **kwargs): + """ + Wrapper for impl.concatenate. + """ return DDParray(_submit(f"{impl_str}.concatenate", *args, kwargs, unwrap="")) @@ -227,13 +252,19 @@ def concatenate(*args, **kwargs): fixme_funcs = ["load_csv", "array", "triu"] for func in aa_tlfuncs + fixme_funcs: if func == "meshgrid": - exec(f"{func} = lambda *args, **kwargs: list(DDParray(x) for x in _submit('{impl_str}.{func}', args, kwargs, numout=len(args)))") + exec( + f"{func} = lambda *args, **kwargs: list(DDParray(x) for x in _submit('{impl_str}.{func}', args, kwargs, numout=len(args)))" + ) else: - exec(f"{func} = lambda *args, **kwargs: DDParray(_submit('{impl_str}.{func}', args, kwargs))") + exec( + f"{func} = lambda *args, **kwargs: DDParray(_submit('{impl_str}.{func}', args, kwargs))" + ) -for func in ["concatenate", "hstack",]: - exec(f"{func} = lambda *args, **kwargs: DDParray(_submit(f'{impl_str}.{func}', *args, kwargs, unwrap=''))") +for func in ["concatenate", "hstack"]: + exec( + f"{func} = lambda *args, **kwargs: DDParray(_submit(f'{impl_str}.{func}', *args, kwargs, unwrap=''))" + ) # Here we data types and constants @@ -248,9 +279,15 @@ def concatenate(*args, **kwargs): # quick hack to provide random features ####################################################################### class random: + """ + Wrapper class for random. 
+ """ + for method, obj in impl.random.__dict__.items(): if callable(obj): - exec(f"{method} = staticmethod(lambda *args, **kwargs: DDParray(_submit('{impl_str}.random.{method}', args, kwargs)))") + exec( + f"{method} = staticmethod(lambda *args, **kwargs: DDParray(_submit('{impl_str}.random.{method}', args, kwargs)))" + ) ####################################################################### diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py index 5203e9e4c8..d1ac6fc85b 100644 --- a/heat/cw4heat/arrayapi.py +++ b/heat/cw4heat/arrayapi.py @@ -1,23 +1,46 @@ -__all__ = ["aa_creators", "aa_attributes", "aa_methods", "aa_elementwises", "aa_statisticals", - "aa_inplace_operators", "aa_reflected_operators", "aa_datatypes", "aa_datatype_functions", - "aa_searching", "aa_sorting", "aa_set", "aa_utility", "aa_constants", - "aa_arraydir", "aa_tldir", "aa_tlfuncs", "aa_arrayfuncs", "aa_methods_s", "aa_methods_a"] +""" +Functions, data-types etc. defined by Array API. +See https://data-apis.org/array-api/latest +""" + +__all__ = [ + "aa_creators", + "aa_attributes", + "aa_methods", + "aa_elementwises", + "aa_statisticals", + "aa_inplace_operators", + "aa_reflected_operators", + "aa_datatypes", + "aa_datatype_functions", + "aa_searching", + "aa_sorting", + "aa_set", + "aa_utility", + "aa_constants", + "aa_arraydir", + "aa_tldir", + "aa_tlfuncs", + "aa_arrayfuncs", + "aa_methods_s", + "aa_methods_a", +] aa_creators = [ - "arange", #(start, /, stop=None, step=1, *, dtype=None, device=None) - "asarray", #(obj, /, *, dtype=None, device=None, copy=None) - "empty", #(shape, *, dtype=None, device=None) - "empty_like", #(x, /, *, dtype=None, device=None) - "eye", #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) - "from_dlpack", #(x, /) - "full", #(shape, fill_value, *, dtype=None, device=None) - "full_like", #(x, /, fill_value, *, dtype=None, device=None) - "linspace", #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) - "meshgrid", #(*arrays, indexing=’xy’) - "ones", #(shape, *, dtype=None, device=None) - "ones_like", #(x, /, *, dtype=None, device=None) - "zeros", #(shape, *, dtype=None, device=None) - "zeros_like", #(x, /, *, dtype=None, device=None) + "arange", # (start, /, stop=None, step=1, *, dtype=None, device=None) + "asarray", # (obj, /, *, dtype=None, device=None, copy=None) + "empty", # (shape, *, dtype=None, device=None) + "empty_like", # (x, /, *, dtype=None, device=None) + "eye", # (n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + "from_dlpack", # (x, /) + "full", # (shape, fill_value, *, dtype=None, device=None) + "full_like", # (x, /, fill_value, *, dtype=None, device=None) + "linspace", # (start, stop, /, num, *, dtype=None, device=None, endpoint=True) + "meshgrid", # (*arrays, indexing=’xy’) + "ones", # (shape, *, dtype=None, device=None) + "ones_like", # (x, /, *, dtype=None, device=None) + "zeros", # (shape, *, dtype=None, device=None) + "zeros_like", # (x, /, *, dtype=None, device=None) ] aa_attributes = [ @@ -76,64 +99,64 @@ ] aa_datatype_functions = [ - "broadcast_arrays", #(*arrays) - "broadcast_to", #(x, /, shape) - "can_cast", #(from_, to, /) - "finfo", #(type, /) - "iinfo", #(type, /) - "result_type", #(*arrays_and_dtypes) + "broadcast_arrays", # (*arrays) + "broadcast_to", # (x, /, shape) + "can_cast", # (from_, to, /) + "finfo", # (type, /) + "iinfo", # (type, /) + "result_type", # (*arrays_and_dtypes) ] - + aa_methods = [ - "__abs__", #(self, /) - "__add__", #(self, other, /) - "__and__", #(self, other, /) - "__array_namespace__", 
#(self, /, *, api_version=None) - "__bool__", #(self, /) - "__dlpack__", #(self, /, *, stream=None) - "__dlpack_device__", #(self, /) - "__eq__", #(self, other, /) - "__float__", #(self, /) - "__floordiv__", #(self, other, /) - "__ge__", #(self, other, /) - "__getitem__", #(self, key, /) - "__gt__", #(self, other, /) - "__int__", #(self, /) - "__invert__", #(self, /) - "__le__", #(self, other, /) - "__len__", #(self, /) - "__lshift__", #(self, other, /) - "__lt__", #(self, other, /) - "__matmul__", #(self, other, /) - "__mod__", #(self, other, /) - "__mul__", #(self, other, /) - "__ne__", #(self, other, /) - "__neg__", #(self, /) - "__or__", #(self, other, /) - "__pos__", #(self, /) - "__pow__", #(self, other, /) - "__rshift__", #(self, other, /) - "__setitem__", #(self, key, value, /) - "__sub__", #(self, other, /) - "__truediv__", #(self, other, /) - "__xor__", #(self, other, /) + "__abs__", # (self, /) + "__add__", # (self, other, /) + "__and__", # (self, other, /) + "__array_namespace__", # (self, /, *, api_version=None) + "__bool__", # (self, /) + "__dlpack__", # (self, /, *, stream=None) + "__dlpack_device__", # (self, /) + "__eq__", # (self, other, /) + "__float__", # (self, /) + "__floordiv__", # (self, other, /) + "__ge__", # (self, other, /) + "__getitem__", # (self, key, /) + "__gt__", # (self, other, /) + "__int__", # (self, /) + "__invert__", # (self, /) + "__le__", # (self, other, /) + "__len__", # (self, /) + "__lshift__", # (self, other, /) + "__lt__", # (self, other, /) + "__matmul__", # (self, other, /) + "__mod__", # (self, other, /) + "__mul__", # (self, other, /) + "__ne__", # (self, other, /) + "__neg__", # (self, /) + "__or__", # (self, other, /) + "__pos__", # (self, /) + "__pow__", # (self, other, /) + "__rshift__", # (self, other, /) + "__setitem__", # (self, key, value, /) + "__sub__", # (self, other, /) + "__truediv__", # (self, other, /) + "__xor__", # (self, other, /) ] aa_creators = [ - "arange", #(start, /, stop=None, step=1, *, dtype=None, device=None) - "asarray", #(obj, /, *, dtype=None, device=None, copy=None) - "empty", #(shape, *, dtype=None, device=None) - "empty_like", #(x, /, *, dtype=None, device=None) - "eye", #(n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) - "from_dlpack", #(x, /) - "full", #(shape, fill_value, *, dtype=None, device=None) - "full_like", #(x, /, fill_value, *, dtype=None, device=None) - "linspace", #(start, stop, /, num, *, dtype=None, device=None, endpoint=True) - "meshgrid", #(*arrays, indexing=’xy’) - "ones", #(shape, *, dtype=None, device=None) - "ones_like", #(x, /, *, dtype=None, device=None) - "zeros", #(shape, *, dtype=None, device=None) - "zeros_like", #(x, /, *, dtype=None, device=None) + "arange", # (start, /, stop=None, step=1, *, dtype=None, device=None) + "asarray", # (obj, /, *, dtype=None, device=None, copy=None) + "empty", # (shape, *, dtype=None, device=None) + "empty_like", # (x, /, *, dtype=None, device=None) + "eye", # (n_rows, n_cols=None, /, *, k=0, dtype=None, device=None) + "from_dlpack", # (x, /) + "full", # (shape, fill_value, *, dtype=None, device=None) + "full_like", # (x, /, fill_value, *, dtype=None, device=None) + "linspace", # (start, stop, /, num, *, dtype=None, device=None, endpoint=True) + "meshgrid", # (*arrays, indexing=’xy’) + "ones", # (shape, *, dtype=None, device=None) + "ones_like", # (x, /, *, dtype=None, device=None) + "zeros", # (shape, *, dtype=None, device=None) + "zeros_like", # (x, /, *, dtype=None, device=None) ] aa_attributes = [ @@ -146,112 +169,112 @@ ] aa_methods_a 
= [ - "__abs__", #(self, /) - "__add__", #(self, other, /) - "__floordiv__", #(self, other, /) - "__invert__", #(self, /) - "__lshift__", #(self, other, /) - "__matmul__", #(self, other, /) - "__mod__", #(self, other, /) - "__mul__", #(self, other, /) - "__neg__", #(self, /) - "__pos__", #(self, /) - "__pow__", #(self, other, /) - "__rshift__", #(self, other, /) - "__sub__", #(self, other, /) - "__truediv__", #(self, other, /) - "__getitem__", #(self, key, /) - "__setitem__", #(self, key, value, /) - "__eq__", #(self, other, /) - "__ge__", #(self, other, /) - "__gt__", #(self, other, /) - "__le__", #(self, other, /) - "__lt__", #(self, other, /) - "__ne__", #(self, other, /) - "__and__", #(self, other, /) - "__or__", #(self, other, /) - "__xor__", #(self, other, /) + "__abs__", # (self, /) + "__add__", # (self, other, /) + "__floordiv__", # (self, other, /) + "__invert__", # (self, /) + "__lshift__", # (self, other, /) + "__matmul__", # (self, other, /) + "__mod__", # (self, other, /) + "__mul__", # (self, other, /) + "__neg__", # (self, /) + "__pos__", # (self, /) + "__pow__", # (self, other, /) + "__rshift__", # (self, other, /) + "__sub__", # (self, other, /) + "__truediv__", # (self, other, /) + "__getitem__", # (self, key, /) + "__setitem__", # (self, key, value, /) + "__eq__", # (self, other, /) + "__ge__", # (self, other, /) + "__gt__", # (self, other, /) + "__le__", # (self, other, /) + "__lt__", # (self, other, /) + "__ne__", # (self, other, /) + "__and__", # (self, other, /) + "__or__", # (self, other, /) + "__xor__", # (self, other, /) ] aa_methods_s = [ - "__array_namespace__", #(self, /, *, api_version=None) - "__bool__", #(self, /) - "__dlpack__", #(self, /, *, stream=None) - "__dlpack_device__", #(self, /) - "__float__", #(self, /) - "__int__", #(self, /) - "__len__", #(self, /) + "__array_namespace__", # (self, /, *, api_version=None) + "__bool__", # (self, /) + "__dlpack__", # (self, /, *, stream=None) + "__dlpack_device__", # (self, /) + "__float__", # (self, /) + "__int__", # (self, /) + "__len__", # (self, /) ] aa_methods = aa_methods_s + aa_methods_a aa_elementwises = [ - "abs", #(x, /) - "acos", #(x, /) - "acosh", #(x, /) - "add", #(x1, x2, /) - "asin", #(x, /) - "asinh", #(x, /) - "atan", #(x, /) - "atan2", #(x1, x2, /) - "atanh", #(x, /) - "bitwise_and", #(x1, x2, /) - "bitwise_left_shift", #(x1, x2, /) - "bitwise_invert", #(x, /) - "bitwise_or", #(x1, x2, /) - "bitwise_right_shift", #(x1, x2, /) - "bitwise_xor", #(x1, x2, /) - "ceil", #(x, /) - "cos", #(x, /) - "cosh", #(x, /) - "divide", #(x1, x2, /) - "equal", #(x1, x2, /) - "exp", #(x, /) - "expm1", #(x, /) - "floor", #(x, /) - "floor_divide", #(x1, x2, /) - "greater", #(x1, x2, /) - "greater_equal", #(x1, x2, /) - "isfinite", #(x, /) - "isinf", #(x, /) - "isnan", #(x, /) - "less", #(x1, x2, /) - "less_equal", #(x1, x2, /) - "log", #(x, /) - "log1p", #(x, /) - "log2", #(x, /) - "log10", #(x, /) - "logaddexp", #(x1, x2) - "logical_and", #(x1, x2, /) - "logical_not", #(x, /) - "logical_or", #(x1, x2, /) - "logical_xor", #(x1, x2, /) - "multiply", #(x1, x2, /) - "negative", #(x, /) - "not_equal", #(x1, x2, /) - "positive", #(x, /) - "pow", #(x1, x2, /) - "remainder", #(x1, x2, /) - "round", #(x, /) - "sign", #(x, /) - "sin", #(x, /) - "sinh", #(x, /) - "square", #(x, /) - "sqrt", #(x, /) - "subtract", #(x1, x2, /) - "tan", #(x, /) - "tanh", #(x, /) - "trunc", #(x, /) + "abs", # (x, /) + "acos", # (x, /) + "acosh", # (x, /) + "add", # (x1, x2, /) + "asin", # (x, /) + "asinh", # (x, /) + "atan", # (x, /) + "atan2", # 
(x1, x2, /) + "atanh", # (x, /) + "bitwise_and", # (x1, x2, /) + "bitwise_left_shift", # (x1, x2, /) + "bitwise_invert", # (x, /) + "bitwise_or", # (x1, x2, /) + "bitwise_right_shift", # (x1, x2, /) + "bitwise_xor", # (x1, x2, /) + "ceil", # (x, /) + "cos", # (x, /) + "cosh", # (x, /) + "divide", # (x1, x2, /) + "equal", # (x1, x2, /) + "exp", # (x, /) + "expm1", # (x, /) + "floor", # (x, /) + "floor_divide", # (x1, x2, /) + "greater", # (x1, x2, /) + "greater_equal", # (x1, x2, /) + "isfinite", # (x, /) + "isinf", # (x, /) + "isnan", # (x, /) + "less", # (x1, x2, /) + "less_equal", # (x1, x2, /) + "log", # (x, /) + "log1p", # (x, /) + "log2", # (x, /) + "log10", # (x, /) + "logaddexp", # (x1, x2) + "logical_and", # (x1, x2, /) + "logical_not", # (x, /) + "logical_or", # (x1, x2, /) + "logical_xor", # (x1, x2, /) + "multiply", # (x1, x2, /) + "negative", # (x, /) + "not_equal", # (x1, x2, /) + "positive", # (x, /) + "pow", # (x1, x2, /) + "remainder", # (x1, x2, /) + "round", # (x, /) + "sign", # (x, /) + "sin", # (x, /) + "sinh", # (x, /) + "square", # (x, /) + "sqrt", # (x, /) + "subtract", # (x1, x2, /) + "tan", # (x, /) + "tanh", # (x, /) + "trunc", # (x, /) ] aa_statisticals = [ - "max", #(x, /, *, axis=None, keepdims=False) - "mean", #(x, /, *, axis=None, keepdims=False) - "min", #(x, /, *, axis=None, keepdims=False) - "prod", #(x, /, *, axis=None, keepdims=False) - "std", #(x, /, *, axis=None, correction=0.0, keepdims=False) - "sum", #(x, /, *, axis=None, keepdims=False) - "var", #(x, /, *, axis=None, correction=0.0, keepdims=False) + "max", # (x, /, *, axis=None, keepdims=False) + "mean", # (x, /, *, axis=None, keepdims=False) + "min", # (x, /, *, axis=None, keepdims=False) + "prod", # (x, /, *, axis=None, keepdims=False) + "std", # (x, /, *, axis=None, correction=0.0, keepdims=False) + "sum", # (x, /, *, axis=None, keepdims=False) + "var", # (x, /, *, axis=None, correction=0.0, keepdims=False) ] aa_searching = [ @@ -282,7 +305,16 @@ "pi", ] -aa_tlfuncs = aa_creators + aa_elementwises + aa_statisticals + aa_datatype_functions + aa_searching + aa_sorting + aa_set + aa_utility +aa_tlfuncs = ( + aa_creators + + aa_elementwises + + aa_statisticals + + aa_datatype_functions + + aa_searching + + aa_sorting + + aa_set + + aa_utility +) aa_tldir = aa_tlfuncs + aa_datatypes + aa_constants aa_arrayfuncs = aa_methods + aa_inplace_operators + aa_reflected_operators aa_arraydir = aa_attributes + aa_arrayfuncs diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 54f1137b19..f2cdd549c0 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -22,31 +22,33 @@ ############################################################################### -# Distribution engine. -# - schedules same tasks on all workers -# - handles dependences seperately -# -# Whe tasks are submitted on root rank they are pushed on a queue and a -# handle/future is returned. When computation is requested by calling go() -# all tasks on the queue are sent to workers and executed on all ranks -# sequentially. -# -# We store tasks in the same order as they are submitted on the root rank. -# For any valid program this must be a legal ordering there is no need to check -# if dependent objects are ready when a task is executed. A more sophisticated -# scheduler could potentially try to parallelize. It remains to be invistigated -# if this would be a profitable feature, though. -# -# Dependent objects have a unique identifier, assigned when a handle to it is -# created. 
We assume that all workers execute handle-creation in identical order.
-# Such dependences are assumed to be global entities, e.g. each worker holds
-# a handle/reference to it (e.g. like a heat.DNDarray). The local handles
-# exist on each, stored in a worker-local dictionary. Thsi allows identifying
-# dependences through simple integers.
-#
-# Notice, mpi4py does not provide ibcast, so we cannot overlap. This makes the
-# above aggregation particularly promising. Another option woujld be to write
-# this in C/C++ and use ibcast.
+"""
+Distribution engine.
+ - schedules same tasks on all workers
+ - handles dependences separately
+
+When tasks are submitted on root rank they are pushed on a queue and a
+handle/future is returned. When computation is requested by calling go()
+all tasks on the queue are sent to workers and executed on all ranks
+sequentially.
+
+We store tasks in the same order as they are submitted on the root rank.
+For any valid program this must be a legal ordering, so there is no need to check
+if dependent objects are ready when a task is executed. A more sophisticated
+scheduler could potentially try to parallelize. It remains to be investigated
+if this would be a profitable feature, though.
+
+Dependent objects have a unique identifier, assigned when a handle to it is
+created. We assume that all workers execute handle-creation in identical order.
+Such dependences are assumed to be global entities, e.g. each worker holds
+a handle/reference to it (e.g. like a heat.DNDarray). The local handles
+exist on each, stored in a worker-local dictionary. This allows identifying
+dependences through simple integers.
+
+Notice, mpi4py does not provide ibcast, so we cannot overlap. This makes the
+above aggregation particularly promising. Another option would be to write
+this in C/C++ and use ibcast.
+"""
 ###############################################################################
@@ -68,9 +70,10 @@ class _TaskQueue:
     We currently dissallow submitting tasks by on-root ranks.
     Non-root ranks get their TaskQueue set in the recv-lop if init().
     """
+
     def __init__(self):
        # here we store all tasks that have not been executed yet
-        self._taskQueue = deque() 
+        self._taskQueue = deque()

     def submit(self, rtask):
         """
@@ -110,8 +113,7 @@ def start():
     if _comm.rank != 0:
         done = False
         header = None
-        rtask = None
-        while(not done):
+        while not done:
             # wait in bcast for work
             header = _comm.bcast(header, 0)
             # then see what we need to do
@@ -142,7 +144,7 @@ def go():
     Trigger execution of all tasks which are still in flight.
     """
     assert _comm.rank == 0
-    header = [TASK, _tQueue._taskQueue ]
+    header = [TASK, _tQueue._taskQueue]
     _, _ = _comm.bcast(header, 0)
     header = [GO]
     _ = _comm.bcast(header, 0)
@@ -177,15 +179,21 @@ def __init__(self):
         Handle._nextId += 1

     def set(self, obj):
-        "Make object available."
+        """
+        Make object available.
+ """ self._obj = obj def getId(self): - "Return future/handle id" + """ + Return future/handle id + """ return self._id def get(self): - "Return object or None" + """ + Return object or None + """ go() return self._obj @@ -213,7 +221,6 @@ def __init__(self, task, deps, numout): else: self._handle = tuple(Handle() for _ in range(self._nOut)) - # here we store objects that are input dependences to tasks s_pms = {} From 533194abdd081892b1504000739e60f7d0e8fb46 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Mon, 5 Jul 2021 05:01:10 -0500 Subject: [PATCH 05/22] serving picky black --- heat/cw4heat/arrayapi.py | 46 ++++++---------------------------------- 1 file changed, 7 insertions(+), 39 deletions(-) diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py index d1ac6fc85b..785da687c1 100644 --- a/heat/cw4heat/arrayapi.py +++ b/heat/cw4heat/arrayapi.py @@ -43,14 +43,7 @@ "zeros_like", # (x, /, *, dtype=None, device=None) ] -aa_attributes = [ - "dtype", - "device", - "ndim", - "shape", - "size", - "T", -] +aa_attributes = ["dtype", "device", "ndim", "shape", "size", "T"] aa_inplace_operators = [ "__iadd__", @@ -159,14 +152,7 @@ "zeros_like", # (x, /, *, dtype=None, device=None) ] -aa_attributes = [ - "dtype", - "device", - "ndim", - "shape", - "size", - "T", -] +aa_attributes = ["dtype", "device", "ndim", "shape", "size", "T"] aa_methods_a = [ "__abs__", # (self, /) @@ -277,33 +263,15 @@ "var", # (x, /, *, axis=None, correction=0.0, keepdims=False) ] -aa_searching = [ - "argmax", - "argmin", - "nonzero", - "where", -] +aa_searching = ["argmax", "argmin", "nonzero", "where"] -aa_sorting = [ - "argsort", - "sort", -] +aa_sorting = ["argsort", "sort"] -aa_set = [ - "unique", -] +aa_set = ["unique"] -aa_utility = [ - "all", - "any", -] +aa_utility = ["all", "any"] -aa_constants = [ - "e", - "inf", - "nan", - "pi", -] +aa_constants = ["e", "inf", "nan", "pi"] aa_tlfuncs = ( aa_creators From ee414d9253769c684a0fdf2c42d9b4adfcd68233 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 7 Jul 2021 05:24:32 -0500 Subject: [PATCH 06/22] first cut for supporting ray actors. Controller no longer a worker. --- heat/cw4heat/__init__.py | 99 ++++++++++++++--------- heat/cw4heat/distributor.py | 155 ++++++++++++++++++++++-------------- heat/cw4heat/ray_runner.py | 137 +++++++++++++++++++++++++++++++ 3 files changed, 293 insertions(+), 98 deletions(-) create mode 100644 heat/cw4heat/ray_runner.py diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index ba033b75dc..e679972a80 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -51,8 +51,10 @@ """ ############################################################################### +from mpi4py import MPI +from os import getenv, getpid import atexit -from . import distributor +from .distributor import Distributor from .arrayapi import ( aa_attributes, aa_tlfuncs, @@ -71,6 +73,10 @@ impl_str = "impl" dndarray_str = "impl.DNDarray" +_distributor = None +_comm = None +_fini = None + def init(): """ @@ -78,8 +84,31 @@ def init(): For now we assume all ranks (controller and workers) are started through mpirun, workers will never leave distributor.start() and so this function. 
""" - distributor.init() - distributor.start() + global _distributor + global _comm + global _fini + + if _distributor is not None: + return + + _launcher = getenv("CW4H_LAUNCHER", default="mpi").lower() + + def _setComm(c): + return impl.use_comm(impl.MPICommunication(c.Create(c.group.Excl([0])))) + + # atexit.register(fini) + if _launcher == "ray": + from .ray_runner import init as ray_init, fini as ray_fini + + _comm, _distributor, _futures = ray_init(_setComm) + _distributor.start(initImpl=_setComm) + _fini = ray_fini + elif _launcher == "mpi": + _comm = MPI.COMM_WORLD + _distributor = Distributor(_comm) + _distributor.start(initImpl=_setComm) + else: + raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") def fini(): @@ -87,7 +116,9 @@ def fini(): Finalize/shutdown distribution engine. Automatically called at exit. When called on controller, workers will sys.exit from init(). """ - distributor.fini() + _distributor.fini() + if _fini: + _fini() class _Task: @@ -126,7 +157,7 @@ def _submit(name, args, kwargs, unwrap="*", numout=1): """ scalar_args = tuple(x for x in args if not isinstance(x, DDParray)) deps = [x._handle.getId() for x in args if isinstance(x, DDParray)] - return distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps, numout) + return _distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps, numout) def _submitProperty(name, self): @@ -135,7 +166,7 @@ def _submitProperty(name, self): """ t = _PropertyTask(name) try: - res = distributor.submitPP(t, [self._handle.getId()]) + res = _distributor.submitPP(t, [self._handle.getId()]) except Exception: assert False return res @@ -168,13 +199,14 @@ def __init__(self, handle): Do not use this array. Use creator functions instead. """ self._handle = handle + self._attributes = None - def heat(self): - """ - Return heat native array. - With delayed execution, triggers computation as needed and blocks until array is available. - """ - return self._handle.get() + # def heat(self): + # """ + # Return heat native array. + # With delayed execution, triggers computation as needed and blocks until array is available. + # """ + # return _distributor.get(self._handle) def __getitem__(self, key): """ @@ -211,40 +243,31 @@ def T(self): f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" ) - for method in aa_methods_s: + for method in aa_methods_s + ["__str__"]: if hasattr(dndarray, method): exec( - f"{method} = lambda self, *args, **kwargs: _submit('{dndarray_str}.{method}', (self, *args), kwargs).get()" + f"{method} = lambda self, *args, **kwargs: _distributor.get(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" ) - for attr in aa_attributes: - if attr != "T" and hasattr(dndarray, attr): - exec(f"{attr} = property(lambda self: self._handle.get().{attr})") - def __getattr__(self, attr): - # attributes are special - if attr not in aa_attributes: - raise Exception(f"unknown method/attribute {attr} requested") - - -####################################################################### -# first define top-level functions which need special care. -####################################################################### - -# np.concatenate accepts a list of arrays (not individual arrays) -# so we let the task not unwrap the list of deps -def concatenate(*args, **kwargs): - """ - Wrapper for impl.concatenate. 
- """ - return DDParray(_submit(f"{impl_str}.concatenate", *args, kwargs, unwrap="")) + """ + Get attributes. + Caches attributes from workers, so we communicate only once. + """ + if self._attributes is None: + self._attributes = _distributor.get( + _submit( + "(lambda a: {x: getattr(a, x) for x in aa_attributes if x != 'T'})", (self,), {} + ) + ) + return self._attributes[attr] ####################################################################### # first define top-level functions through the standard process. ####################################################################### # - creating arrays -# - elementswise operations +# - elementwise operations # - statistical operations # (lists taken from list of methods in array-API) # Again, we simply make lambdas which submit appropriate Tasks @@ -261,13 +284,15 @@ def concatenate(*args, **kwargs): ) +# np.concatenate/hstack accept a list of arrays (not individual arrays) +# so we let the task not unwrap the list of deps for func in ["concatenate", "hstack"]: exec( f"{func} = lambda *args, **kwargs: DDParray(_submit(f'{impl_str}.{func}', *args, kwargs, unwrap=''))" ) -# Here we data types and constants +# Here we define data types and constants for attr in aa_datatypes + aa_constants: if hasattr(impl, attr): exec(f"{attr} = {impl_str}.{attr}") @@ -292,5 +317,3 @@ class random: ####################################################################### ####################################################################### -atexit.register(fini) -init() diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index f2cdd549c0..21d3358698 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -56,12 +56,12 @@ import sys from collections import deque -_comm = MPI.COMM_WORLD # define identifiers END = 0 TASK = 1 GO = 2 +GET = 3 class _TaskQueue: @@ -79,7 +79,6 @@ def submit(self, rtask): """ Sumbit a task to queue. Will not run it. """ - assert _comm.rank == 0 self._taskQueue.append(rtask) return rtask._handle @@ -89,74 +88,107 @@ def go(self): We assume tasks were submitted in in a valid order, e.g. in an order that guarntees no task is dependent on another task that is behind it in the queue. """ + print("Executing tasks", len(self._taskQueue), flush=True) while len(self._taskQueue): self._taskQueue.popleft().go() + def len(self): + return len(self._taskQueue) -# Our queue of tasks. -_tQueue = _TaskQueue() + def clear(self): + self._taskQueue.clear() -def init(): +class Distributor: """ - Init distributor. + Instances of this class distribute work from controller to workers. + Work-items are treated as dependent tasks. """ - pass + def __init__(self, comm=MPI.COMM_WORLD): + """ + Init distributor, optionally accepts MPI communicator. + """ + self._comm = comm + # Our queue of tasks. + self._tQueue = _TaskQueue() -def start(): - """ - Start distribution engine. - Controller inits and returns. - Workers enter recv-loop and exit program when fini si called. - """ - if _comm.rank != 0: - done = False - header = None - while not done: - # wait in bcast for work - header = _comm.bcast(header, 0) - # then see what we need to do - if header[0] == END: - done = True - break - elif header[0] == TASK: - _tQueue._taskQueue = header[1] - elif header[0] == GO: - # no delayed execution for now -> nothing to do - _tQueue.go() - else: - raise Exception("Worker received unknown tag") - sys.exit() - - -def fini(): - """ - Control sends end-tag. Workers will sys.exit. 
- """ - if _comm.rank == 0: - header = [END] - header = _comm.bcast(header, 0) - - -def go(): - """ - Trigger execution of all tasks which are still in flight. - """ - assert _comm.rank == 0 - header = [TASK, _tQueue._taskQueue] - _, _ = _comm.bcast(header, 0) - header = [GO] - _ = _comm.bcast(header, 0) - _tQueue.go() - + def start(self, doExit=True, initImpl=None): + """ + Start distribution engine. + Controller inits and returns. + Workers enter recv-loop and exit program when fini is called. + """ + if initImpl: + initImpl(self._comm) + if self._comm.rank != 0: + done = False + header = None + while not done: + # wait in bcast for work + header = self._comm.bcast(header, 0) + # then see what we need to do + if header[0] == END: + done = True + break + elif header[0] == TASK: + self._tQueue._taskQueue = header[1] + elif header[0] == GO: + self._tQueue.go() + elif header[0] == GET: + if self._comm.rank == 1: + val = _RemoteTask.getVal(header[1]) + self._comm.send(val, dest=0, tag=GET) + else: + raise Exception("Worker received unknown tag") + self._comm.Barrier() + MPI.Finalize() + if doExit: + sys.exit() + + def fini(self): + """ + Control sends end-tag. Workers will sys.exit. + """ + if MPI.Is_initialized() and self._comm.rank == 0: + header = [END] + header = self._comm.bcast(header, 0) + self._comm.Barrier() + MPI.Finalize() -def submitPP(task, deps, numout=1): - """ - Submit a process-parallel task and return a handle/future. - """ - rtask = _RemoteTask(task, deps, numout) - return _tQueue.submit(rtask) + def go(self): + """ + Trigger execution of all tasks which are still in flight. + """ + assert self._comm.rank == 0 + if self._tQueue.len(): + header = [TASK, self._tQueue._taskQueue] + _, _ = self._comm.bcast(header, 0) + header = [GO] + _ = self._comm.bcast(header, 0) + self._tQueue.clear() + + def get(self, handle): + """ + Get actualy value from handle. + Requires communication. + We get the value from worker 0 (rank 1 in global comm). + Does not work for arrays (yet). + """ + assert self._comm.rank == 0 + self.go() + header = [GET, handle.getId()] + _ = self._comm.bcast(header, 0) + val = self._comm.recv(source=1, tag=GET) + handle.set(val) + return val + + def submitPP(self, task, deps, numout=1): + """ + Submit a process-parallel task and return a handle/future. + """ + rtask = _RemoteTask(task, deps, numout) + return self._tQueue.submit(rtask) class Handle: @@ -194,7 +226,6 @@ def get(self): """ Return object or None """ - go() return self._obj @@ -240,3 +271,7 @@ def go(self): _RemoteTask.s_pms[h.getId()] = res[i] i += 1 return self._handle + + @staticmethod + def getVal(id): + return _RemoteTask.s_pms[id] diff --git a/heat/cw4heat/ray_runner.py b/heat/cw4heat/ray_runner.py new file mode 100644 index 0000000000..f318c25fec --- /dev/null +++ b/heat/cw4heat/ray_runner.py @@ -0,0 +1,137 @@ +# =============================================================================== +# Copyright 2014-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +""" +A Ray backend for HeAT controller-worker wrapper. + +1. Init() nitializes actors + - one for each node in the existing ray cluster + - actors connect through MPI +2. Start actors + - actors will sit in recv-loop and wait for work +3. fini() kills all actors. + - Make sure you let distributor end recv-loop before calling this. +""" + +from mpi4py import MPI +import ray +from ray.services import get_node_ip_address as getIP +from .distributor import Distributor +import os + +_actors = {} + + +@ray.remote +class RayActor: + """ + A ray actor which connects to other actors and controller through MPI. + """ + + def __init__(self, node): + self.node = node + self._commWorld = MPI.COMM_SELF + self._distributor = None + print("Actor up", flush=True) + + def connect(self, port, nWorkers): + """ + Let nWorkers-many processes connect to controller process. + """ + print("Actor connecting", flush=True) + # workers go here + # initial connect + intercomm = self._commWorld.Connect(port) + # merge communicator + self._commWorld = intercomm.Merge(1) + intercomm.Disconnect() + rank = self._commWorld.rank + print(f"Yey, rank {rank} connected!") + # collectively accept connections from all (other) clients + for i in range(rank, nWorkers): + # connect to next worker (collectively) + intercomm = self._commWorld.Accept(port) + # merge communicators + self._commWorld = intercomm.Merge(0) + intercomm.Disconnect() + # setup our distributor + assert self._distributor is None + self._distributor = Distributor(self._commWorld) + return None + + def start(self, initImpl=None): + """ + Enter receive-loop as provided by distributor. + """ + print("actor.start", self._distributor, flush=True) + self._distributor.start(doExit=False, initImpl=initImpl) + print("Actor done!") + + +def _initActors(initImpl=None): + """ + Initalize our (SPMD) actors, one per node in ray cluster and make them + connect through MPI. + Controller (calling process) gets connection config and then + passes it to init function on each actor. + """ + global _actors + if not ray.is_initialized(): + ray.init(address="auto") + # first create one actor per node in the ray cluster + for node in ray.cluster_resources(): + if "node" in node: + name = node.split(":")[-1] + print(os.getpid(), "starting", name, flush=True) + _actors[name] = RayActor.options(resources={node: 1}).remote( + name + ) # runtime_env={"I_MPI_FABRICS": "ofi"} + nw = len(_actors) # number of workers + print(nw, flush=True) + comm = MPI.COMM_SELF + # Get Port for MPI connections + port = MPI.Open_port(MPI.INFO_NULL) + # make all actors connect + x = [_actors[a].connect.remote(port, nw) for a in _actors] + for i in range(nw): + # connect to next worker (collectively) + intercomm = comm.Accept(port) + # merge communicators + comm = intercomm.Merge(0) + intercomm.Disconnect() + print("Connected", i, flush=True) + # wait for connections to be established + r = ray.get(x) + print("All connected", r, _actors, flush=True) + x = [_actors[a].start.remote(initImpl) for a in _actors] + print("All started", flush=True) + # setup our distributor + return (comm, Distributor(comm), x) + + +def _finiActors(): + """ + Finalize Ray Actors: killing actor processes. 
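The port-based bootstrap used by connect() and _initActors() is the standard MPI dynamic-process dance; both sides in isolation (a sketch, assuming the MPI implementation supports Open_port/Accept/Connect; error handling omitted):

    from mpi4py import MPI

    def controller_accept(n_workers):
        # controller: open a port, absorb each worker into a growing comm
        comm = MPI.COMM_SELF
        port = MPI.Open_port(MPI.INFO_NULL)
        for _ in range(n_workers):
            inter = comm.Accept(port)   # collective over the current comm
            comm = inter.Merge(False)   # controller keeps the low ranks
            inter.Disconnect()
        return comm, port

    def worker_join(port, n_workers):
        # worker: connect once, then help accept everyone arriving later
        inter = MPI.COMM_SELF.Connect(port)
        comm = inter.Merge(True)        # appended after the existing ranks
        inter.Disconnect()
        for _ in range(comm.rank, n_workers):
            inter = comm.Accept(port)
            comm = inter.Merge(False)
            inter.Disconnect()
        return comm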
+ """ + global _actors + if ray.is_initialized(): + print("Killing actors") + for a in _actors.values(): + ray.kill(a) + + +init = _initActors +fini = _finiActors From c0c8c1658aa04da91a506a7a50d98ea11f9b293e Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Fri, 9 Jul 2021 03:23:55 -0500 Subject: [PATCH 07/22] adding __partitioned__ --- heat/core/dndarray.py | 92 +++++++++++++++++++++++++++++++++++++ heat/cw4heat/__init__.py | 52 +++++++++++++++++++-- heat/cw4heat/distributor.py | 38 +++++++++++---- 3 files changed, 168 insertions(+), 14 deletions(-) diff --git a/heat/core/dndarray.py b/heat/core/dndarray.py index a7edcf7e7b..fe3cbd289d 100644 --- a/heat/core/dndarray.py +++ b/heat/core/dndarray.py @@ -611,6 +611,98 @@ def create_lshape_map(self, force_check: bool = True) -> torch.Tensor: self.__lshape_map = lshape_map return lshape_map + def create_partition_interface(self, no_data=False): + """ + Create a partition interface in line with the DPPY proposal. This is subject to change. + The intention of this to facilitate the usage of a general format for the referencing of + distributed datasets. + An example of the output and shape is shown below. + __partitioned__ = { + 'shape': (27, 3, 2), + 'partition_tiling': (4, 1, 1), + 'partitions': { + (0, 0, 0): { + 'start': (0, 0, 0), + 'shape': (7, 3, 2), + 'data': tensor([...], dtype=torch.int32), + 'location': 0, + 'dtype': torch.int32, + 'device': 'cpu' + }, + (1, 0, 0): { + 'start': (7, 0, 0), + 'shape': (7, 3, 2), + 'data': None, + 'location': 1, + 'dtype': torch.int32, + 'device': 'cpu' + }, + (2, 0, 0): { + 'start': (14, 0, 0), + 'shape': (7, 3, 2), + 'data': None, + 'location': 2, + 'dtype': torch.int32, + 'device': 'cpu' + }, + (3, 0, 0): { + 'start': (21, 0, 0), + 'shape': (6, 3, 2), + 'data': None, + 'location': 3, + 'dtype': torch.int32, + 'device': 'cpu' + } + }, + 'locals': [(rank, 0, 0)], + } + Returns + ------- + dictionary containing the partition interface as shown above. + """ + # sp = + lshape_map = self.create_lshape_map() + start_idx_map = torch.zeros_like(lshape_map) + + part_tiling = [1] * self.ndim + lcls = [0] * self.ndim + + z = torch.tensor([0], device=self.device.torch_device, dtype=self.dtype.torch_type()) + if self.split is not None: + starts = torch.cat((z, torch.cumsum(lshape_map[:, self.split], dim=0)[:-1]), dim=0) + lcls[self.split] = self.comm.rank + part_tiling[self.split] = self.comm.size + else: + starts = torch.zeros(self.ndim, dtype=torch.int, device=self.device.torch_device) + + start_idx_map[:, self.split] = starts + + partitions = {} + base_key = [0] * self.ndim + for r in range(self.comm.size): + if self.split is not None: + base_key[self.split] = r + dat = None if no_data or r != self.comm.rank else self.larray + else: + dat = self.larray + + partitions[tuple(base_key)] = { + "start": tuple(start_idx_map[r].tolist()), + "shape": tuple(lshape_map[r].tolist()), + "data": dat, + "location": r, + "dtype": self.dtype.torch_type(), + "device": self.device.torch_device, + } + + partition_dict = { + "shape": self.gshape, + "partition_tiling": tuple(part_tiling), + "partitions": partitions, + "locals": [tuple(lcls)], + } + return partition_dict + def __float__(self) -> DNDarray: """ Float scalar casting. 
diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index e679972a80..41a4000bf2 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -53,6 +53,7 @@ from mpi4py import MPI from os import getenv, getpid +from collections import namedtuple import atexit from .distributor import Distributor from .arrayapi import ( @@ -78,7 +79,11 @@ _fini = None -def init(): +def _setComm(c): + return impl.use_comm(impl.MPICommunication(c.Create(c.group.Excl([0])))) + + +def init(doStart=True): """ Initialize distribution engine. Automatically when when importing cw4heat. For now we assume all ranks (controller and workers) are started through mpirun, @@ -93,9 +98,6 @@ def init(): _launcher = getenv("CW4H_LAUNCHER", default="mpi").lower() - def _setComm(c): - return impl.use_comm(impl.MPICommunication(c.Create(c.group.Excl([0])))) - # atexit.register(fini) if _launcher == "ray": from .ray_runner import init as ray_init, fini as ray_fini @@ -106,11 +108,21 @@ def _setComm(c): elif _launcher == "mpi": _comm = MPI.COMM_WORLD _distributor = Distributor(_comm) - _distributor.start(initImpl=_setComm) + if doStart: + _distributor.start(initImpl=_setComm) else: raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") +def asController(): + """ + Enter controller-worker region. + Rank 0 becomes controller, all others act as workers. + """ + init(False) + return _distributor.start(initImpl=_setComm, doExit=False) + + def fini(): """ Finalize/shutdown distribution engine. Automatically called at exit. @@ -249,6 +261,36 @@ def T(self): f"{method} = lambda self, *args, **kwargs: _distributor.get(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" ) + partRef = namedtuple("partRef", ("id", "rank")) + + # @property + def __partitioned__(self): + """ + Return partitioning meta data. + """ + + def getPartForRef(pref): + """ + Return actual partition data for given partRef. + """ + # FIXME Ray + # only supported on root rank right now + # Notice: HeAT does not use COMM_WORLD, we have to translate to global rank + assert MPI.COMM_WORLD.rank == 0 + return _distributor.getPart(pref, "larray") + + parts = _distributor.get( + _submit(f"{dndarray_str}.create_partition_interface", (self, True), {}) + ) + # Provide all data as handle/reference + for _, p in parts["partitions"].items(): + p["data"] = self.partRef(self._handle._id, p["location"] + 1) + # set getter + parts["get"] = getPartForRef + # remove SPMD local key + del parts["locals"] + return parts + def __getattr__(self, attr): """ Get attributes. 
diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 21d3358698..82137a2e47 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -62,6 +62,7 @@ TASK = 1 GO = 2 GET = 3 +GETPART = 4 class _TaskQueue: @@ -121,30 +122,38 @@ def start(self, doExit=True, initImpl=None): """ if initImpl: initImpl(self._comm) - if self._comm.rank != 0: + if self._comm.rank == 0: + return True + else: done = False header = None while not done: # wait in bcast for work header = self._comm.bcast(header, 0) # then see what we need to do - if header[0] == END: - done = True - break - elif header[0] == TASK: + if header[0] == TASK: self._tQueue._taskQueue = header[1] - elif header[0] == GO: - self._tQueue.go() elif header[0] == GET: if self._comm.rank == 1: val = _RemoteTask.getVal(header[1]) self._comm.send(val, dest=0, tag=GET) + elif header[0] == GO: + self._tQueue.go() + elif header[0] == GETPART: + if self._comm.rank == header[1]: + val = _RemoteTask.getVal(header[2]) + attr = getattr(val, header[3]) + self._comm.send(attr, dest=0, tag=GETPART) + elif header[0] == END: + done = True + break else: raise Exception("Worker received unknown tag") self._comm.Barrier() - MPI.Finalize() + # MPI.Finalize() if doExit: sys.exit() + return False def fini(self): """ @@ -154,7 +163,7 @@ def fini(self): header = [END] header = self._comm.bcast(header, 0) self._comm.Barrier() - MPI.Finalize() + # MPI.Finalize() def go(self): """ @@ -183,6 +192,17 @@ def get(self, handle): handle.set(val) return val + def getPart(self, handle, attr): + """ + Get local raw partition data for given handle. + """ + assert self._comm.rank == 0 + self.go() + header = [GETPART, handle.rank, handle.id, attr] + _ = self._comm.bcast(header, 0) + val = self._comm.recv(source=handle.rank, tag=GETPART) + return val + def submitPP(self, task, deps, numout=1): """ Submit a process-parallel task and return a handle/future. From dbc3056b9c3f6d46ef715585ceb0ca58935324c6 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Fri, 9 Jul 2021 08:04:02 -0500 Subject: [PATCH 08/22] demoing cw/region (MPI backend) --- heat/cw4heat/__init__.py | 82 +++++++++++++++++++++++----------- heat/cw4heat/distributor.py | 30 ++++++------- heat/cw4heat/examples/tcw4h.py | 39 ++++++++++++++++ heat/cw4heat/ray_runner.py | 26 +++++------ 4 files changed, 121 insertions(+), 56 deletions(-) create mode 100644 heat/cw4heat/examples/tcw4h.py diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 41a4000bf2..3989a68c0f 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -53,7 +53,6 @@ from mpi4py import MPI from os import getenv, getpid -from collections import namedtuple import atexit from .distributor import Distributor from .arrayapi import ( @@ -80,10 +79,11 @@ def _setComm(c): - return impl.use_comm(impl.MPICommunication(c.Create(c.group.Excl([0])))) + # return impl.use_comm(impl.MPICommunication(c.Create(c.group.Excl([0])))) + return impl.use_comm(impl.MPICommunication(c)) -def init(doStart=True): +def init(doStart=True, ctxt=False): """ Initialize distribution engine. Automatically when when importing cw4heat. For now we assume all ranks (controller and workers) are started through mpirun, @@ -100,6 +100,7 @@ def init(doStart=True): # atexit.register(fini) if _launcher == "ray": + assert ctxt is False, "Controller-worker context is useless with ray launcher." 
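The choice between the two launch modes is driven entirely by the environment; a hypothetical recipe for each (sketch, not normative):

    # SPMD start, rank 0 becomes the controller:
    #   CW4H_LAUNCHER=mpi mpirun -n 4 python app.py
    # Ray cluster, workers are actors that join via MPI ports:
    #   CW4H_LAUNCHER=ray python app.py
    import os

    os.environ.setdefault("CW4H_LAUNCHER", "mpi")
    import heat.cw4heat as ht

    ht.init()
    # ... array work ...
    ht.fini()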
from .ray_runner import init as ray_init, fini as ray_fini _comm, _distributor, _futures = ray_init(_setComm) @@ -114,15 +115,6 @@ def init(doStart=True): raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") -def asController(): - """ - Enter controller-worker region. - Rank 0 becomes controller, all others act as workers. - """ - init(False) - return _distributor.start(initImpl=_setComm, doExit=False) - - def fini(): """ Finalize/shutdown distribution engine. Automatically called at exit. @@ -133,6 +125,40 @@ def fini(): _fini() +class cw4h: + """ + Contextmanager to establish controller-worker regions within SPMD runs. + Not that useful for HeAT, but demonstrates the concept. + + >>> import heat.cw4heat as ht + >>> with ht.cw4h() as cw: + >>> if cw.controller(): + >>> a = ht.arange(8) + """ + + def __init__(self): + init(False, True) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + if _comm.rank == 0: + fini() + + def controller(self): + """ + Sends non root ranks/workers into reicv-loop and lets root rank execute + the code block protected as controller. + Non-root workers will not finish until self gets deleted. + """ + if _comm.rank == 0: + return True + else: + _distributor.start(doExit=False, initImpl=_setComm) + return False + + class _Task: """ A work item, executing functions provided as code. @@ -190,6 +216,14 @@ def _setitem_normalized(self, value, key): self.__setitem__(key, value) +def _getPartForRef(pref): + """ + Return actual partition data for given partRef. + """ + # FIXME Ray + return _distributor.getPart(pref, "larray") + + ####################################################################### # Our array is just a wrapper. Actual array is stored as a handle to # allow delayed execution. @@ -261,32 +295,28 @@ def T(self): f"{method} = lambda self, *args, **kwargs: _distributor.get(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" ) - partRef = namedtuple("partRef", ("id", "rank")) + class partRef: + """ + Handle used in __partitioned__. Identifies one chunk of a distributed array. + """ + + def __init__(self, id_, rank_): + self.id = id_ + self.rank = rank_ # @property def __partitioned__(self): """ Return partitioning meta data. """ - - def getPartForRef(pref): - """ - Return actual partition data for given partRef. - """ - # FIXME Ray - # only supported on root rank right now - # Notice: HeAT does not use COMM_WORLD, we have to translate to global rank - assert MPI.COMM_WORLD.rank == 0 - return _distributor.getPart(pref, "larray") - parts = _distributor.get( _submit(f"{dndarray_str}.create_partition_interface", (self, True), {}) ) # Provide all data as handle/reference for _, p in parts["partitions"].items(): - p["data"] = self.partRef(self._handle._id, p["location"] + 1) + p["data"] = self.partRef(self._handle._id, p["location"]) # set getter - parts["get"] = getPartForRef + parts["get"] = _getPartForRef # remove SPMD local key del parts["locals"] return parts diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 82137a2e47..0971d1f071 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -89,7 +89,6 @@ def go(self): We assume tasks were submitted in in a valid order, e.g. in an order that guarntees no task is dependent on another task that is behind it in the queue. 
""" - print("Executing tasks", len(self._taskQueue), flush=True) while len(self._taskQueue): self._taskQueue.popleft().go() @@ -134,9 +133,8 @@ def start(self, doExit=True, initImpl=None): if header[0] == TASK: self._tQueue._taskQueue = header[1] elif header[0] == GET: - if self._comm.rank == 1: - val = _RemoteTask.getVal(header[1]) - self._comm.send(val, dest=0, tag=GET) + # We do not support arrays yet, scalars do not need communication + assert False elif header[0] == GO: self._tQueue.go() elif header[0] == GETPART: @@ -146,10 +144,10 @@ def start(self, doExit=True, initImpl=None): self._comm.send(attr, dest=0, tag=GETPART) elif header[0] == END: done = True + self._comm.Barrier() break else: raise Exception("Worker received unknown tag") - self._comm.Barrier() # MPI.Finalize() if doExit: sys.exit() @@ -175,7 +173,7 @@ def go(self): _, _ = self._comm.bcast(header, 0) header = [GO] _ = self._comm.bcast(header, 0) - self._tQueue.clear() + self._tQueue.go() def get(self, handle): """ @@ -186,21 +184,21 @@ def get(self, handle): """ assert self._comm.rank == 0 self.go() - header = [GET, handle.getId()] - _ = self._comm.bcast(header, 0) - val = self._comm.recv(source=1, tag=GET) - handle.set(val) - return val + return handle.get() def getPart(self, handle, attr): """ Get local raw partition data for given handle. """ - assert self._comm.rank == 0 - self.go() - header = [GETPART, handle.rank, handle.id, attr] - _ = self._comm.bcast(header, 0) - val = self._comm.recv(source=handle.rank, tag=GETPART) + if handle.rank == self._comm.rank: + val = _RemoteTask.getVal(handle.id) + val = getattr(val, attr) + else: + # FIXME what if left CW-context (SPMD mode) ? + assert self._comm.rank == 0 + header = [GETPART, handle.rank, handle.id, attr] + _ = self._comm.bcast(header, 0) + val = self._comm.recv(source=handle.rank, tag=GETPART) return val def submitPP(self, task, deps, numout=1): diff --git a/heat/cw4heat/examples/tcw4h.py b/heat/cw4heat/examples/tcw4h.py new file mode 100644 index 0000000000..90ed7e5bc7 --- /dev/null +++ b/heat/cw4heat/examples/tcw4h.py @@ -0,0 +1,39 @@ +from mpi4py import MPI + +comm = MPI.COMM_WORLD + +import heat.cw4heat as ht + + +with ht.cw4h() as cw: + if cw.controller(): + a = ht.arange(8, split=0) + b = ht.ones(8, split=0) + c = a @ b + assert hasattr(c, "__partitioned__") + p = a.__partitioned__() + print(c.shape, c, p) + for k, v in p["partitions"].items(): + print(k, p["get"](v["data"])) + +print("hello") + +with ht.cw4h() as cw: + if cw.controller(): + a = ht.arange(8, split=0) + b = ht.ones(8, split=0) + c = a @ b + assert hasattr(c, "__partitioned__") + p = a.__partitioned__() + print(c.shape, c, p) + for k, v in p["partitions"].items(): + print(k, p["get"](v["data"])) + else: + p = None + +p = comm.bcast(p, 0) +for v in p["partitions"].values(): + if v["location"] == comm.rank: + print("My part:", p["get"](v["data"])) + +print("bye") diff --git a/heat/cw4heat/ray_runner.py b/heat/cw4heat/ray_runner.py index f318c25fec..4d7758220a 100644 --- a/heat/cw4heat/ray_runner.py +++ b/heat/cw4heat/ray_runner.py @@ -31,6 +31,7 @@ from ray.services import get_node_ip_address as getIP from .distributor import Distributor import os +from os import getenv, getpid _actors = {} @@ -45,13 +46,11 @@ def __init__(self, node): self.node = node self._commWorld = MPI.COMM_SELF self._distributor = None - print("Actor up", flush=True) def connect(self, port, nWorkers): """ Let nWorkers-many processes connect to controller process. 
""" - print("Actor connecting", flush=True) # workers go here # initial connect intercomm = self._commWorld.Connect(port) @@ -59,7 +58,6 @@ def connect(self, port, nWorkers): self._commWorld = intercomm.Merge(1) intercomm.Disconnect() rank = self._commWorld.rank - print(f"Yey, rank {rank} connected!") # collectively accept connections from all (other) clients for i in range(rank, nWorkers): # connect to next worker (collectively) @@ -76,9 +74,7 @@ def start(self, initImpl=None): """ Enter receive-loop as provided by distributor. """ - print("actor.start", self._distributor, flush=True) self._distributor.start(doExit=False, initImpl=initImpl) - print("Actor done!") def _initActors(initImpl=None): @@ -91,16 +87,20 @@ def _initActors(initImpl=None): global _actors if not ray.is_initialized(): ray.init(address="auto") + ppn = int(getenv("CW4H_PPN", default="1")) + assert ppn >= 1 + my_ip = getIP() # first create one actor per node in the ray cluster for node in ray.cluster_resources(): if "node" in node: name = node.split(":")[-1] - print(os.getpid(), "starting", name, flush=True) - _actors[name] = RayActor.options(resources={node: 1}).remote( - name - ) # runtime_env={"I_MPI_FABRICS": "ofi"} + _ppn = ppn - 1 if name == my_ip else ppn + if _ppn >= 1: + for i in range(_ppn): + _actors[name] = RayActor.options(resources={node: 1}).remote( + name + ) # runtime_env={"I_MPI_FABRICS": "ofi"} nw = len(_actors) # number of workers - print(nw, flush=True) comm = MPI.COMM_SELF # Get Port for MPI connections port = MPI.Open_port(MPI.INFO_NULL) @@ -112,12 +112,10 @@ def _initActors(initImpl=None): # merge communicators comm = intercomm.Merge(0) intercomm.Disconnect() - print("Connected", i, flush=True) # wait for connections to be established - r = ray.get(x) - print("All connected", r, _actors, flush=True) + _ = ray.get(x) x = [_actors[a].start.remote(initImpl) for a in _actors] - print("All started", flush=True) + print("All actors started", flush=True) # setup our distributor return (comm, Distributor(comm), x) From e8a70117fbb1a61928ae34650d5991003def9a2c Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Thu, 22 Jul 2021 05:41:10 -0500 Subject: [PATCH 09/22] refactoring ray_runner and let it create ray ObjRefs in __partitioned__ --- heat/cw4heat/__init__.py | 151 ++++++++++++++++++--------------- heat/cw4heat/distributor.py | 17 ++++ heat/cw4heat/examples/t1.py | 17 ++++ heat/cw4heat/examples/tcw4h.py | 9 +- heat/cw4heat/ray_runner.py | 132 +++++++++++++++++----------- 5 files changed, 205 insertions(+), 121 deletions(-) create mode 100644 heat/cw4heat/examples/t1.py diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 3989a68c0f..1e5c441402 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -73,9 +73,26 @@ impl_str = "impl" dndarray_str = "impl.DNDarray" -_distributor = None -_comm = None -_fini = None +_runner = None + + +class _partRef: + """ + Handle used in __partitioned__. Identifies one chunk of a distributed array. + """ + + def __init__(self, id_, rank_): + self.id = id_ + self.rank = rank_ + + +def _getPartForRef(pref): + """ + Return actual partition data for given _partRef. + """ + # FIXME Ray + ret = _runner.distributor.getPart(pref, "larray") + return ret def _setComm(c): @@ -89,11 +106,9 @@ def init(doStart=True, ctxt=False): For now we assume all ranks (controller and workers) are started through mpirun, workers will never leave distributor.start() and so this function. 
""" - global _distributor - global _comm - global _fini + global _runner - if _distributor is not None: + if _runner is not None: return _launcher = getenv("CW4H_LAUNCHER", default="mpi").lower() @@ -101,16 +116,28 @@ def init(doStart=True, ctxt=False): # atexit.register(fini) if _launcher == "ray": assert ctxt is False, "Controller-worker context is useless with ray launcher." - from .ray_runner import init as ray_init, fini as ray_fini + from .ray_runner import init as ray_init - _comm, _distributor, _futures = ray_init(_setComm) - _distributor.start(initImpl=_setComm) - _fini = ray_fini + _runner = ray_init(_setComm) + _runner.distributor.start(initImpl=_setComm) elif _launcher == "mpi": - _comm = MPI.COMM_WORLD - _distributor = Distributor(_comm) + + class MPIRunner: + def __init__(self, dist, comm): + self.comm = comm + self.distributor = dist + self.publish = lambda id, distributor: [ + (i, _partRef(id, i)) for i in range(self.comm.size) + ] + self.get = _getPartForRef + + def fini(self): + pass + + c = MPI.COMM_WORLD + _runner = MPIRunner(Distributor(c), c) if doStart: - _distributor.start(initImpl=_setComm) + _runner.distributor.start(initImpl=_setComm) else: raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") @@ -120,9 +147,10 @@ def fini(): Finalize/shutdown distribution engine. Automatically called at exit. When called on controller, workers will sys.exit from init(). """ - _distributor.fini() - if _fini: - _fini() + global _runner + _runner.distributor.fini() + if _runner: + _runner.fini() class cw4h: @@ -143,7 +171,7 @@ def __enter__(self): return self def __exit__(self, exc_type, exc_value, exc_traceback): - if _comm.rank == 0: + if _runner.comm.rank == 0: fini() def controller(self): @@ -152,10 +180,10 @@ def controller(self): the code block protected as controller. Non-root workers will not finish until self gets deleted. """ - if _comm.rank == 0: + if _runner.comm.rank == 0: return True else: - _distributor.start(doExit=False, initImpl=_setComm) + _runner.distributor.start(doExit=False, initImpl=_setComm) return False @@ -195,7 +223,9 @@ def _submit(name, args, kwargs, unwrap="*", numout=1): """ scalar_args = tuple(x for x in args if not isinstance(x, DDParray)) deps = [x._handle.getId() for x in args if isinstance(x, DDParray)] - return _distributor.submitPP(_Task(name, scalar_args, kwargs, unwrap=unwrap), deps, numout) + return _runner.distributor.submitPP( + _Task(name, scalar_args, kwargs, unwrap=unwrap), deps, numout + ) def _submitProperty(name, self): @@ -204,7 +234,7 @@ def _submitProperty(name, self): """ t = _PropertyTask(name) try: - res = _distributor.submitPP(t, [self._handle.getId()]) + res = _runner.distributor.submitPP(t, [self._handle.getId()]) except Exception: assert False return res @@ -216,14 +246,6 @@ def _setitem_normalized(self, value, key): self.__setitem__(key, value) -def _getPartForRef(pref): - """ - Return actual partition data for given partRef. - """ - # FIXME Ray - return _distributor.getPart(pref, "larray") - - ####################################################################### # Our array is just a wrapper. Actual array is stored as a handle to # allow delayed execution. @@ -252,7 +274,7 @@ def __init__(self, handle): # Return heat native array. # With delayed execution, triggers computation as needed and blocks until array is available. 
# """ - # return _distributor.get(self._handle) + # return _runner.distributor.get(self._handle) def __getitem__(self, key): """ @@ -275,48 +297,23 @@ def T(self): """ return DDParray(_submitProperty("T", self)) - ####################################################################### - # Now we add methods/properties through the standard process. - ####################################################################### - - # dynamically generate class methods from list of methods in array-API - # we simply make lambdas which submit appropriate Tasks - # FIXME: aa_inplace_operators,others? - fixme_afuncs = ["squeeze", "astype", "balance", "resplit"] - for method in aa_methods_a + aa_reflected_operators + fixme_afuncs: - if method not in ["__getitem__", "__setitem__"] and hasattr(dndarray, method): - exec( - f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" - ) - - for method in aa_methods_s + ["__str__"]: - if hasattr(dndarray, method): - exec( - f"{method} = lambda self, *args, **kwargs: _distributor.get(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" - ) - - class partRef: - """ - Handle used in __partitioned__. Identifies one chunk of a distributed array. - """ - - def __init__(self, id_, rank_): - self.id = id_ - self.rank = rank_ - - # @property + @property def __partitioned__(self): """ Return partitioning meta data. """ - parts = _distributor.get( + global _runner + + parts = _runner.distributor.get( _submit(f"{dndarray_str}.create_partition_interface", (self, True), {}) ) # Provide all data as handle/reference - for _, p in parts["partitions"].items(): - p["data"] = self.partRef(self._handle._id, p["location"]) + futures = _runner.publish(self._handle._id, _runner.distributor) + for i, p in enumerate(parts["partitions"].values()): + p["location"] = futures[i][0] + p["data"] = futures[i][1] # set getter - parts["get"] = _getPartForRef + parts["get"] = _runner.get # remove SPMD local key del parts["locals"] return parts @@ -327,13 +324,33 @@ def __getattr__(self, attr): Caches attributes from workers, so we communicate only once. """ if self._attributes is None: - self._attributes = _distributor.get( + self._attributes = _runner.distributor.get( _submit( "(lambda a: {x: getattr(a, x) for x in aa_attributes if x != 'T'})", (self,), {} ) ) return self._attributes[attr] + ####################################################################### + # Now we add methods/properties through the standard process. + ####################################################################### + + # dynamically generate class methods from list of methods in array-API + # we simply make lambdas which submit appropriate Tasks + # FIXME: aa_inplace_operators,others? + fixme_afuncs = ["squeeze", "astype", "balance", "resplit", "reshape"] + for method in aa_methods_a + aa_reflected_operators + fixme_afuncs: + if method not in ["__getitem__", "__setitem__"] and hasattr(dndarray, method): + exec( + f"{method} = lambda self, *args, **kwargs: DDParray(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" + ) + + for method in aa_methods_s + ["__str__"]: + if hasattr(dndarray, method): + exec( + f"{method} = lambda self, *args, **kwargs: _runner.distributor.get(_submit('{dndarray_str}.{method}', (self, *args), kwargs))" + ) + ####################################################################### # first define top-level functions through the standard process. 
@@ -344,7 +361,7 @@ def __getattr__(self, attr): # (lists taken from list of methods in array-API) # Again, we simply make lambdas which submit appropriate Tasks -fixme_funcs = ["load_csv", "array", "triu"] +fixme_funcs = ["load_csv", "array", "triu", "copy", "repeat"] for func in aa_tlfuncs + fixme_funcs: if func == "meshgrid": exec( diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 0971d1f071..961652bf7f 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -63,6 +63,7 @@ GO = 2 GET = 3 GETPART = 4 +PUBPART = 5 class _TaskQueue: @@ -124,6 +125,7 @@ def start(self, doExit=True, initImpl=None): if self._comm.rank == 0: return True else: + print("Entering worker loop", flush=True) done = False header = None while not done: @@ -142,6 +144,10 @@ def start(self, doExit=True, initImpl=None): val = _RemoteTask.getVal(header[2]) attr = getattr(val, header[3]) self._comm.send(attr, dest=0, tag=GETPART) + elif header[0] == PUBPART: + val = _RemoteTask.getVal(header[1]) + attr = header[3](getattr(val, header[2])) + self._comm.gather(attr, root=0) elif header[0] == END: done = True self._comm.Barrier() @@ -201,6 +207,16 @@ def getPart(self, handle, attr): val = self._comm.recv(source=handle.rank, tag=GETPART) return val + def publishParts(self, id, attr, publish): + """ + Publish array's attribute for each partition and gather handles on root. + """ + assert self._comm.rank == 0 + header = [PUBPART, id, attr, publish] + _ = self._comm.bcast(header, 0) + val = publish(getattr(_RemoteTask.getVal(id), attr)) + return self._comm.gather(val, root=0) + def submitPP(self, task, deps, numout=1): """ Submit a process-parallel task and return a handle/future. @@ -277,6 +293,7 @@ def go(self): """ Actually run the task. 
""" + # print(self._task._func) deps = [_RemoteTask.s_pms[i] for i in self._depIds] res = self._task.run(deps) if self._nOut == 1: diff --git a/heat/cw4heat/examples/t1.py b/heat/cw4heat/examples/t1.py new file mode 100644 index 0000000000..b9080ddd76 --- /dev/null +++ b/heat/cw4heat/examples/t1.py @@ -0,0 +1,17 @@ +import pickle +import heat.cw4heat as ht + +ht.init() + +a = ht.arange(8, split=0) +b = ht.ones(8, split=0) +c = a @ b +# assert hasattr(c, "__partitioned__") +print(type(c)) +p = a.__partitioned__() +print(a.shape, a, p) +for k, v in p["partitions"].items(): + print(33) + print(k, p["get"](v["data"])) +print("kkkkkk") +ht.fini() diff --git a/heat/cw4heat/examples/tcw4h.py b/heat/cw4heat/examples/tcw4h.py index 90ed7e5bc7..9ecf540e50 100644 --- a/heat/cw4heat/examples/tcw4h.py +++ b/heat/cw4heat/examples/tcw4h.py @@ -10,8 +10,9 @@ a = ht.arange(8, split=0) b = ht.ones(8, split=0) c = a @ b - assert hasattr(c, "__partitioned__") - p = a.__partitioned__() + # assert hasattr(c, "__partitioned__") + print(type(c)) + p = c.__partitioned__() print(c.shape, c, p) for k, v in p["partitions"].items(): print(k, p["get"](v["data"])) @@ -23,8 +24,8 @@ a = ht.arange(8, split=0) b = ht.ones(8, split=0) c = a @ b - assert hasattr(c, "__partitioned__") - p = a.__partitioned__() + # assert hasattr(c, "__partitioned__") + p = c.__partitioned__() print(c.shape, c, p) for k, v in p["partitions"].items(): print(k, p["get"](v["data"])) diff --git a/heat/cw4heat/ray_runner.py b/heat/cw4heat/ray_runner.py index 4d7758220a..de3854de91 100644 --- a/heat/cw4heat/ray_runner.py +++ b/heat/cw4heat/ray_runner.py @@ -28,13 +28,12 @@ from mpi4py import MPI import ray +import ray.cloudpickle from ray.services import get_node_ip_address as getIP from .distributor import Distributor import os from os import getenv, getpid -_actors = {} - @ray.remote class RayActor: @@ -77,59 +76,92 @@ def start(self, initImpl=None): self._distributor.start(doExit=False, initImpl=initImpl) -def _initActors(initImpl=None): +def _pub(x): + return ray.cloudpickle.dumps((getIP(), ray.put(x))) + + +def _ray_publish(id, distributor): """ - Initalize our (SPMD) actors, one per node in ray cluster and make them - connect through MPI. - Controller (calling process) gets connection config and then - passes it to init function on each actor. + Return ray ObjRef for obj to be used in ray. 
""" - global _actors - if not ray.is_initialized(): - ray.init(address="auto") - ppn = int(getenv("CW4H_PPN", default="1")) - assert ppn >= 1 - my_ip = getIP() - # first create one actor per node in the ray cluster - for node in ray.cluster_resources(): - if "node" in node: - name = node.split(":")[-1] - _ppn = ppn - 1 if name == my_ip else ppn - if _ppn >= 1: - for i in range(_ppn): - _actors[name] = RayActor.options(resources={node: 1}).remote( - name - ) # runtime_env={"I_MPI_FABRICS": "ofi"} - nw = len(_actors) # number of workers - comm = MPI.COMM_SELF - # Get Port for MPI connections - port = MPI.Open_port(MPI.INFO_NULL) - # make all actors connect - x = [_actors[a].connect.remote(port, nw) for a in _actors] - for i in range(nw): - # connect to next worker (collectively) - intercomm = comm.Accept(port) - # merge communicators - comm = intercomm.Merge(0) - intercomm.Disconnect() - # wait for connections to be established - _ = ray.get(x) - x = [_actors[a].start.remote(initImpl) for a in _actors] - print("All actors started", flush=True) - # setup our distributor - return (comm, Distributor(comm), x) + vals = distributor.publishParts(id, "larray", _pub) + return [ray.cloudpickle.loads(x) for x in vals] + +def _ray_get(x): + return ray.get(x) -def _finiActors(): + +class RayRunner: """ - Finalize Ray Actors: killing actor processes. + Using ray to launch ranks by using ray actors. """ - global _actors - if ray.is_initialized(): - print("Killing actors") - for a in _actors.values(): - ray.kill(a) + + def __init__(self, initImpl=None): + """ + Initalize our (SPMD) actors, one per node in ray cluster and make them + connect through MPI. + Controller (calling process) gets connection config and then + passes it to init function on each actor. + """ + self.publish = _ray_publish + self.get = _ray_get + self._actors = {} + self._init(initImpl) + + def fini(self): + """ + Finalize Ray Actors: killing actor processes. + """ + if ray.is_initialized(): + print("Killing actors") + if self._handles: + ray.get(self._handles) + if self._actors: + for a in self._actors.values(): + ray.kill(a) + + def _init(self, initImpl=None): + if not ray.is_initialized(): + ray.init(address="auto") + ppn = int(getenv("CW4H_PPN", default="1")) + assert ppn >= 1 + my_ip = getIP() + # first create one actor per node in the ray cluster + for node in ray.cluster_resources(): + if "node" in node: + name = node.split(":")[-1] + _ppn = ppn - 1 if name == my_ip else ppn + if _ppn >= 1: + for i in range(_ppn): + self._actors[f"{name}{i}"] = RayActor.options(resources={node: 1}).remote( + name + ) # runtime_env={"I_MPI_FABRICS": "ofi"} + nw = len(self._actors) # number of workers + self.comm = MPI.COMM_SELF + # Get Port for MPI connections + port = MPI.Open_port(MPI.INFO_NULL) + # make all actors connect + x = [a.connect.remote(port, nw) for a in self._actors.values()] + for i in range(nw): + # connect to next worker (collectively) + intercomm = self.comm.Accept(port) + # merge communicators + self.comm = intercomm.Merge(0) + intercomm.Disconnect() + # wait for connections to be established + _ = ray.get(x) + self._handles = [a.start.remote(initImpl) for a in self._actors.values()] + print("All actors started", flush=True) + # setup our distributor + self.distributor = Distributor(self.comm) + + return self -init = _initActors -fini = _finiActors +def init(initImpl=None): + """ + Return a Ray Runner. + Ray runner will launch actors and connect them throuh MPI. 
+ """ + return RayRunner(initImpl) From 0ea1705c8c588fa16e979f7b35849eac57af76a0 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Thu, 22 Jul 2021 05:47:57 -0500 Subject: [PATCH 10/22] making location a list --- heat/cw4heat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 1e5c441402..780f33e3f1 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -310,7 +310,7 @@ def __partitioned__(self): # Provide all data as handle/reference futures = _runner.publish(self._handle._id, _runner.distributor) for i, p in enumerate(parts["partitions"].values()): - p["location"] = futures[i][0] + p["location"] = [futures[i][0]] p["data"] = futures[i][1] # set getter parts["get"] = _runner.get From 53fc1587cb192dfc34937d091a0ac39047f1e590 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 18 Aug 2021 06:18:03 -0500 Subject: [PATCH 11/22] fixes --- heat/cw4heat/__init__.py | 3 +++ heat/cw4heat/arrayapi.py | 35 ----------------------------------- heat/cw4heat/distributor.py | 25 +++++++++++++++++-------- 3 files changed, 20 insertions(+), 43 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 780f33e3f1..1d07413f73 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -135,9 +135,12 @@ def fini(self): pass c = MPI.COMM_WORLD + if c.size <= 1: + raise Exception("At least 2 ranks required for cw4heat") _runner = MPIRunner(Distributor(c), c) if doStart: _runner.distributor.start(initImpl=_setComm) + atexit.register(fini) else: raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py index 785da687c1..40766c6111 100644 --- a/heat/cw4heat/arrayapi.py +++ b/heat/cw4heat/arrayapi.py @@ -100,41 +100,6 @@ "result_type", # (*arrays_and_dtypes) ] -aa_methods = [ - "__abs__", # (self, /) - "__add__", # (self, other, /) - "__and__", # (self, other, /) - "__array_namespace__", # (self, /, *, api_version=None) - "__bool__", # (self, /) - "__dlpack__", # (self, /, *, stream=None) - "__dlpack_device__", # (self, /) - "__eq__", # (self, other, /) - "__float__", # (self, /) - "__floordiv__", # (self, other, /) - "__ge__", # (self, other, /) - "__getitem__", # (self, key, /) - "__gt__", # (self, other, /) - "__int__", # (self, /) - "__invert__", # (self, /) - "__le__", # (self, other, /) - "__len__", # (self, /) - "__lshift__", # (self, other, /) - "__lt__", # (self, other, /) - "__matmul__", # (self, other, /) - "__mod__", # (self, other, /) - "__mul__", # (self, other, /) - "__ne__", # (self, other, /) - "__neg__", # (self, /) - "__or__", # (self, other, /) - "__pos__", # (self, /) - "__pow__", # (self, other, /) - "__rshift__", # (self, other, /) - "__setitem__", # (self, key, value, /) - "__sub__", # (self, other, /) - "__truediv__", # (self, other, /) - "__xor__", # (self, other, /) -] - aa_creators = [ "arange", # (start, /, stop=None, step=1, *, dtype=None, device=None) "asarray", # (obj, /, *, dtype=None, device=None, copy=None) diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 961652bf7f..45007810a9 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -154,7 +154,7 @@ def start(self, doExit=True, initImpl=None): break else: raise Exception("Worker received unknown tag") - # MPI.Finalize() + MPI.Finalize() if doExit: sys.exit() return False @@ -262,6 +262,18 @@ def get(self): """ return self._obj + def 
__getstate__(self):
+        # we do not pickle the actual object
+        return {"_id": self._id}
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._obj = None
+
+
+# here we store objects that are input dependencies to tasks
+_s_pms = {}
+
 
 class _RemoteTask:
     """
@@ -286,27 +298,24 @@ def __init__(self, task, deps, numout):
         else:
             self._handle = tuple(Handle() for _ in range(self._nOut))
 
-    # here we store objects that are input dependences to tasks
-    s_pms = {}
-
     def go(self):
         """
         Actually run the task.
         """
         # print(self._task._func)
-        deps = [_RemoteTask.s_pms[i] for i in self._depIds]
+        deps = [_s_pms[i] for i in self._depIds]
         res = self._task.run(deps)
         if self._nOut == 1:
             self._handle.set(res)
-            _RemoteTask.s_pms[self._handle.getId()] = res
+            _s_pms[self._handle.getId()] = res
         else:
             i = 0
             for h in self._handle:
                 h.set(res[i])
-                _RemoteTask.s_pms[h.getId()] = res[i]
+                _s_pms[h.getId()] = res[i]
                 i += 1
         return self._handle
 
     @staticmethod
     def getVal(id):
-        return _RemoteTask.s_pms[id]
+        return _s_pms[id]

From 40f7f444cf47e0568be7ae9085654098842a2af6 Mon Sep 17 00:00:00 2001
From: Frank Schlimbach
Date: Wed, 18 Aug 2021 06:40:08 -0500
Subject: [PATCH 12/22] adding reset()

---
 heat/cw4heat/__init__.py    |  8 ++++++++
 heat/cw4heat/distributor.py | 39 +++++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py
index 1d07413f73..2ead4e216e 100644
--- a/heat/cw4heat/__init__.py
+++ b/heat/cw4heat/__init__.py
@@ -156,6 +156,14 @@ def fini():
     _runner.fini()
 
 
+def reset():
+    """
+    Reset all internal state.
+    Distributed objects created before calling reset cannot be used afterwards.
+    """
+    _runner.distributor.reset()
+
+
 class cw4h:
     """
     Contextmanager to establish controller-worker regions within SPMD runs.
diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py
index 45007810a9..7fe8550f83 100644
--- a/heat/cw4heat/distributor.py
+++ b/heat/cw4heat/distributor.py
@@ -64,6 +64,10 @@
 GET = 3
 GETPART = 4
 PUBPART = 5
+RESET = 6
+
+# here we store objects that are input dependencies to tasks
+_s_pms = {}
 
 
 class _TaskQueue:
@@ -148,6 +152,11 @@ def start(self, doExit=True, initImpl=None):
                     val = _RemoteTask.getVal(header[1])
                     attr = header[3](getattr(val, header[2]))
                     self._comm.gather(attr, root=0)
+                elif header[0] == RESET:
+                    print("reset", flush=True)
+                    _RemoteTask.reset()
+                    self._tQueue.clear()
+                    Handle.reset()
                 elif header[0] == END:
                     done = True
                     self._comm.Barrier()
@@ -159,9 +168,20 @@ def start(self, doExit=True, initImpl=None):
             sys.exit()
         return False
 
+    def reset(self):
+        """
+        Reset task queues.
+        """
+        assert self._comm.rank == 0
+        header = [RESET]
+        header = self._comm.bcast(header, 0)
+        _RemoteTask.reset()
+        self._tQueue.clear()
+        Handle.reset()
+
     def fini(self):
         """
-        Control sends end-tag. Workers will sys.exit.
+        Controller sends end-tag. Workers will sys.exit.
         """
         if MPI.Is_initialized() and self._comm.rank == 0:
             header = [END]
             header = self._comm.bcast(header, 0)
@@ -270,9 +290,12 @@ def __setstate__(self, state):
         self.__dict__.update(state)
         self._obj = None
 
-
-# here we store objects that are input dependencies to tasks
-_s_pms = {}
+    @staticmethod
+    def reset():
+        """
+        Reset internal state.
+        """
+        Handle._nextId = 1
 
 
 class _RemoteTask:
@@ -319,3 +342,11 @@ def go(self):
     @staticmethod
     def getVal(id):
         return _s_pms[id]
+
+    @staticmethod
+    def reset():
+        """
+        Reset internal state.
+ """ + global _s_pms + _s_pms = {} From e7e439864a7a09677fbe37e8832bc651eb1de5ac Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 18 Aug 2021 06:42:37 -0500 Subject: [PATCH 13/22] using clear() --- heat/cw4heat/distributor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 7fe8550f83..dc8713c3e7 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -349,4 +349,4 @@ def reset(): Reset internal state. """ global _s_pms - _s_pms = {} + _s_pms.clear() From d659c2ebddeae915e1635f68b7c9e97d77259e12 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Fri, 20 Aug 2021 06:32:04 -0500 Subject: [PATCH 14/22] adding dot --- heat/cw4heat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 2ead4e216e..668ac0b85a 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -372,7 +372,7 @@ def __getattr__(self, attr): # (lists taken from list of methods in array-API) # Again, we simply make lambdas which submit appropriate Tasks -fixme_funcs = ["load_csv", "array", "triu", "copy", "repeat"] +fixme_funcs = ["load_csv", "array", "triu", "copy", "repeat", "dot"] for func in aa_tlfuncs + fixme_funcs: if func == "meshgrid": exec( From 1b0a4bf4ed69c9792eec36ec17704773579b3070 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 1 Sep 2021 04:34:28 -0500 Subject: [PATCH 15/22] quick workaround to have __localop in cw4heat --- heat/cw4heat/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 668ac0b85a..1ded5dcc56 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -135,8 +135,8 @@ def fini(self): pass c = MPI.COMM_WORLD - if c.size <= 1: - raise Exception("At least 2 ranks required for cw4heat") + # if c.size <= 1: + # raise Exception("At least 2 ranks required for cw4heat") _runner = MPIRunner(Distributor(c), c) if doStart: _runner.distributor.start(initImpl=_setComm) @@ -392,6 +392,14 @@ def __getattr__(self, attr): ) +def __local_op_normalized(a, f): + return impl.core._operations.__local_op(f, a) + + +def __local_op(*args, **kwargs): + return DDParray(_submit("__local_op_normalized", args, kwargs)) + + # Here we define data types and constants for attr in aa_datatypes + aa_constants: if hasattr(impl, attr): From 06be72ea796fe84534d3a58d5f5030874fabdd9b Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 1 Sep 2021 09:45:32 -0500 Subject: [PATCH 16/22] fixed GC --- heat/cw4heat/__init__.py | 7 +++++++ heat/cw4heat/distributor.py | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 1ded5dcc56..6b5e038a44 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -257,6 +257,10 @@ def _setitem_normalized(self, value, key): self.__setitem__(key, value) +def _release(hdl): + hdl._release() + + ####################################################################### # Our array is just a wrapper. Actual array is stored as a handle to # allow delayed execution. @@ -287,6 +291,9 @@ def __init__(self, handle): # """ # return _runner.distributor.get(self._handle) + def __del__(self): + _submit("_release", (self._handle,), {}) + def __getitem__(self, key): """ Return item/slice as array. 
diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index dc8713c3e7..abbaa05fee 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -153,7 +153,6 @@ def start(self, doExit=True, initImpl=None): attr = header[3](getattr(val, header[2])) self._comm.gather(attr, root=0) elif header[0] == RESET: - print("reset", flush=True) _RemoteTask.reset() self._tQueue.clear() Handle.reset() @@ -290,6 +289,13 @@ def __setstate__(self, state): self.__dict__.update(state) self._obj = None + def _release(self): + """ + Release handle from dict to make it available for GC. + """ + global _s_pms + del _s_pms[self._id] + @staticmethod def reset(): """ From df6a193526c4ed266f5789b5f9936ebbd364955a Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 1 Sep 2021 11:08:53 -0500 Subject: [PATCH 17/22] fixing GC issues --- heat/cw4heat/__init__.py | 2 +- heat/cw4heat/distributor.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 6b5e038a44..5a8ce3fd4f 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -292,7 +292,7 @@ def __init__(self, handle): # return _runner.distributor.get(self._handle) def __del__(self): - _submit("_release", (self._handle,), {}) + _submit("_release", (self._handle,), {}, numout=0) def __getitem__(self, key): """ diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index abbaa05fee..1188215427 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -155,7 +155,7 @@ def start(self, doExit=True, initImpl=None): elif header[0] == RESET: _RemoteTask.reset() self._tQueue.clear() - Handle.reset() + # Handle._reset() elif header[0] == END: done = True self._comm.Barrier() @@ -176,7 +176,7 @@ def reset(self): header = self._comm.bcast(header, 0) _RemoteTask.reset() self._tQueue.clear() - Handle.reset() + # Handle.reset() def fini(self): """ @@ -294,10 +294,11 @@ def _release(self): Release handle from dict to make it available for GC. """ global _s_pms - del _s_pms[self._id] + if self._id in _s_pms: + del _s_pms[self._id] @staticmethod - def reset(): + def _reset(): """ Reset internal state. """ @@ -324,8 +325,10 @@ def __init__(self, task, deps, numout): # or the result is not a global object. 
if self._nOut == 1: self._handle = Handle() - else: + elif self._nOut > 0: self._handle = tuple(Handle() for _ in range(self._nOut)) + else: + self._handle = None def go(self): """ @@ -337,7 +340,7 @@ def go(self): if self._nOut == 1: self._handle.set(res) _s_pms[self._handle.getId()] = res - else: + elif self._nOut > 0: i = 0 for h in self._handle: h.set(res[i]) From 9a68af9788c5de1a69580a6da08baa8535535508 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Thu, 2 Sep 2021 05:13:40 -0500 Subject: [PATCH 18/22] quick hack to have random.normal --- heat/cw4heat/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 5a8ce3fd4f..6ee5cfa15a 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -418,6 +418,18 @@ def __local_op(*args, **kwargs): ####################################################################### # quick hack to provide random features ####################################################################### + +if not hasattr(impl.random, "normal"): + import torch + + def _normal(mean, std, size): + ret = impl.empty(size) + torch.normal(mean, std, ret.lshape, out=ret.larray) + return ret + + impl.random.normal = _normal + + class random: """ Wrapper class for random. From b9c9315ee561ea436cccc15ec25ca7da631f7208 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Tue, 21 Sep 2021 04:16:45 -0700 Subject: [PATCH 19/22] allow barrier after go --- heat/cw4heat/__init__.py | 10 +++++++++- heat/cw4heat/distributor.py | 10 +++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 6ee5cfa15a..0c26eae199 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -120,6 +120,7 @@ def init(doStart=True, ctxt=False): _runner = ray_init(_setComm) _runner.distributor.start(initImpl=_setComm) + atexit.register(fini) elif _launcher == "mpi": class MPIRunner: @@ -134,7 +135,7 @@ def __init__(self, dist, comm): def fini(self): pass - c = MPI.COMM_WORLD + c = MPI.COMM_WORLD.Dup() # if c.size <= 1: # raise Exception("At least 2 ranks required for cw4heat") _runner = MPIRunner(Distributor(c), c) @@ -164,6 +165,13 @@ def reset(): _runner.distributor.reset() +def sync(): + """ + Trigger all computation. + """ + _runner.distributor.go(True) + + class cw4h: """ Contextmanager to establish controller-worker regions within SPMD runs. diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index 1188215427..f263bbee93 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -143,6 +143,8 @@ def start(self, doExit=True, initImpl=None): assert False elif header[0] == GO: self._tQueue.go() + if header[1]: + self._comm.Barrier() elif header[0] == GETPART: if self._comm.rank == header[1]: val = _RemoteTask.getVal(header[2]) @@ -188,7 +190,7 @@ def fini(self): self._comm.Barrier() # MPI.Finalize() - def go(self): + def go(self, barrier=False): """ Trigger execution of all tasks which are still in flight. """ @@ -196,9 +198,11 @@ def go(self): if self._tQueue.len(): header = [TASK, self._tQueue._taskQueue] _, _ = self._comm.bcast(header, 0) - header = [GO] + header = [GO, barrier] _ = self._comm.bcast(header, 0) self._tQueue.go() + if barrier: + self._comm.Barrier() def get(self, handle): """ @@ -334,7 +338,7 @@ def go(self): """ Actually run the task. 
""" - # print(self._task._func) + #print(self._task._func) deps = [_s_pms[i] for i in self._depIds] res = self._task.run(deps) if self._nOut == 1: From af939d57c84431d9fb670f751d01c3d299e66355 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Wed, 22 Sep 2021 09:43:57 -0500 Subject: [PATCH 20/22] allow spmd mode in cw4heat --- heat/cw4heat/__init__.py | 22 ++++++----- heat/cw4heat/arrayapi.py | 12 ++++++ heat/cw4heat/distributor.py | 73 ++++++++++++++++++++++++++----------- heat/cw4heat/ray_runner.py | 2 +- setup.py | 2 +- 5 files changed, 78 insertions(+), 33 deletions(-) diff --git a/heat/cw4heat/__init__.py b/heat/cw4heat/__init__.py index 0c26eae199..2e92f4e874 100644 --- a/heat/cw4heat/__init__.py +++ b/heat/cw4heat/__init__.py @@ -111,8 +111,8 @@ def init(doStart=True, ctxt=False): if _runner is not None: return - _launcher = getenv("CW4H_LAUNCHER", default="mpi").lower() - + _launcher = getenv("CW4H_LAUNCHER", default="spmd").lower() + print("launcher:", _launcher) # atexit.register(fini) if _launcher == "ray": assert ctxt is False, "Controller-worker context is useless with ray launcher." @@ -121,7 +121,8 @@ def init(doStart=True, ctxt=False): _runner = ray_init(_setComm) _runner.distributor.start(initImpl=_setComm) atexit.register(fini) - elif _launcher == "mpi": + else: + c = MPI.COMM_WORLD.Dup() class MPIRunner: def __init__(self, dist, comm): @@ -135,15 +136,18 @@ def __init__(self, dist, comm): def fini(self): pass - c = MPI.COMM_WORLD.Dup() - # if c.size <= 1: - # raise Exception("At least 2 ranks required for cw4heat") - _runner = MPIRunner(Distributor(c), c) + _runner = MPIRunner(Distributor(c, _launcher == "spmd"), c) + + if _launcher == "spmd": + _runner.publish = None + elif _launcher != "mpi": + raise Exception( + f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', 'spmd', or 'ray'." + ) + if doStart: _runner.distributor.start(initImpl=_setComm) atexit.register(fini) - else: - raise Exception(f"unknown launcher {_launcher}. CW4H_LAUNCHER must be 'mpi', or 'ray'.") def fini(): diff --git a/heat/cw4heat/arrayapi.py b/heat/cw4heat/arrayapi.py index 40766c6111..827d65b265 100644 --- a/heat/cw4heat/arrayapi.py +++ b/heat/cw4heat/arrayapi.py @@ -24,6 +24,7 @@ "aa_arrayfuncs", "aa_methods_s", "aa_methods_a", + "aa_manips", ] aa_creators = [ @@ -159,6 +160,16 @@ aa_methods = aa_methods_s + aa_methods_a +aa_manips = [ + "concat", # (arrays, /, *, axis=0) + "expand_dims", # (x, /, *, axis) + "flip", # (x, /, *, axis=None) + "reshape", # (x, /, shape) + "roll", # (x, /, shift, *, axis=None) + "squeeze", # (x, /, axis) + "stack", # (arrays, /, *, axis=0) +] + aa_elementwises = [ "abs", # (x, /) "acos", # (x, /) @@ -247,6 +258,7 @@ + aa_sorting + aa_set + aa_utility + + aa_manips ) aa_tldir = aa_tlfuncs + aa_datatypes + aa_constants aa_arrayfuncs = aa_methods + aa_inplace_operators + aa_reflected_operators diff --git a/heat/cw4heat/distributor.py b/heat/cw4heat/distributor.py index f263bbee93..054df4e066 100644 --- a/heat/cw4heat/distributor.py +++ b/heat/cw4heat/distributor.py @@ -110,22 +110,36 @@ class Distributor: Work-items are treated as dependent tasks. """ - def __init__(self, comm=MPI.COMM_WORLD): + def __init__(self, comm, spmd=True): """ Init distributor, optionally accepts MPI communicator. """ self._comm = comm + self._spmd = spmd # Our queue of tasks. 
        self._tQueue = _TaskQueue()
 
-    def start(self, doExit=True, initImpl=None):
+        self.start = self._start if spmd else self._cw_start
+        self.reset = self._reset if spmd else self._cw_reset
+        self.fini = self._fini if spmd else self._cw_fini
+        self.go = self._go if spmd else self._cw_go
+        self.get = self._get if spmd else self._cw_get
+        self.getPart = self._getPart if spmd else self._cw_getPart
+        if not spmd:
+            self.publishParts = self._cw_publishParts
+
+    def _start(self, doExit=True, initImpl=None):
+        if initImpl:
+            initImpl(self._comm)
+
+    def _cw_start(self, doExit=True, initImpl=None):
         """
         Start distribution engine.
         Controller inits and returns.
         Workers enter recv-loop and exit program when fini is called.
         """
-        if initImpl:
-            initImpl(self._comm)
+        self._start(doExit, initImpl)
+
         if self._comm.rank == 0:
             return True
         else:
@@ -169,28 +183,38 @@ def start(self, doExit=True, initImpl=None):
             sys.exit()
         return False
 
-    def reset(self):
+    def _reset(self):
+        _RemoteTask.reset()
+        self._tQueue.clear()
+        # Handle.reset()
+
+    def _cw_reset(self):
         """
         Reset task queues.
         """
         assert self._comm.rank == 0
         header = [RESET]
         header = self._comm.bcast(header, 0)
-        _RemoteTask.reset()
-        self._tQueue.clear()
-        # Handle.reset()
+        self._reset()
 
-    def fini(self):
+    def _fini(self):
+        self._comm.Barrier()
+
+    def _cw_fini(self):
         """
         Controller sends end-tag. Workers will sys.exit.
         """
         if MPI.Is_initialized() and self._comm.rank == 0:
             header = [END]
             header = self._comm.bcast(header, 0)
+        self._fini()
+
+    def _go(self, barrier=False):
+        self._tQueue.go()
+        if barrier:
             self._comm.Barrier()
-        # MPI.Finalize()
 
-    def go(self, barrier=False):
+    def _cw_go(self, barrier=False):
         """
         Trigger execution of all tasks which are still in flight.
         """
@@ -200,11 +224,13 @@ def go(self, barrier=False):
             _, _ = self._comm.bcast(header, 0)
             header = [GO, barrier]
             _ = self._comm.bcast(header, 0)
-        self._tQueue.go()
-        if barrier:
-            self._comm.Barrier()
+        self._go(barrier)
 
-    def get(self, handle):
+    def _get(self, handle):
+        self.go()
+        return handle.get()
+
+    def _cw_get(self, handle):
         """
         Get actual value from handle.
         Requires communication.
         Does not work for arrays (yet).
         """
         assert self._comm.rank == 0
-        self.go()
-        return handle.get()
+        return self._get(handle)
+
+    def _getPart(self, handle, attr):
+        assert handle.rank == self._comm.rank
+        val = _RemoteTask.getVal(handle.id)
+        return getattr(val, attr)
 
-    def getPart(self, handle, attr):
+    def _cw_getPart(self, handle, attr):
         """
         Get local raw partition data for given handle.
         """
         if handle.rank == self._comm.rank:
-            val = _RemoteTask.getVal(handle.id)
-            val = getattr(val, attr)
+            val = self._getPart(handle, attr)
         else:
             # FIXME what if left CW-context (SPMD mode) ?
             assert self._comm.rank == 0
@@ -230,7 +259,7 @@ def getPart(self, handle, attr):
             val = self._comm.recv(source=handle.rank, tag=GETPART)
         return val
 
-    def publishParts(self, id, attr, publish):
+    def _cw_publishParts(self, id, attr, publish):
         """
         Publish array's attribute for each partition and gather handles on root.
         """
@@ -338,7 +367,7 @@ def go(self):
         """
         Actually run the task. 
""" - #print(self._task._func) + # print(self._task._func) deps = [_s_pms[i] for i in self._depIds] res = self._task.run(deps) if self._nOut == 1: diff --git a/heat/cw4heat/ray_runner.py b/heat/cw4heat/ray_runner.py index de3854de91..a3de73456d 100644 --- a/heat/cw4heat/ray_runner.py +++ b/heat/cw4heat/ray_runner.py @@ -154,7 +154,7 @@ def _init(self, initImpl=None): self._handles = [a.start.remote(initImpl) for a in self._actors.values()] print("All actors started", flush=True) # setup our distributor - self.distributor = Distributor(self.comm) + self.distributor = Distributor(self.comm, False) return self diff --git a/setup.py b/setup.py index 740f6660f7..00f4e6c414 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ install_requires=[ "mpi4py>=3.0.0", "numpy>=1.13.0", - "torch>=1.7.0, <1.9", + "torch>=1.7.0, <=1.9", "scipy>=0.14.0", "pillow>=6.0.0", "torchvision>=0.8.0", From b7afb5706ea2cb0b23473e01b09e387014752ece Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Jun 2022 07:30:17 +0000 Subject: [PATCH 21/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index e32a87a384..aef16d1152 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -25,7 +25,7 @@ body: id: logs attributes: label: Error message or erroneous outcome - description: Please copy and paste your error. + description: Please copy and paste your error. render: shell - type: dropdown id: version From 68dcc20e913904debf10883e9a63a18ddc2e26df Mon Sep 17 00:00:00 2001 From: Claudia Comito <39374113+ClaudiaComito@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:42:20 +0200 Subject: [PATCH 22/22] Add type hints to `create_partition_interface` --- heat/core/dndarray.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/heat/core/dndarray.py b/heat/core/dndarray.py index 42932abd23..fb43d1c96c 100644 --- a/heat/core/dndarray.py +++ b/heat/core/dndarray.py @@ -9,7 +9,7 @@ from inspect import stack from mpi4py import MPI from pathlib import Path -from typing import List, Union, Tuple, TypeVar, Optional +from typing import List, Union, Tuple, TypeVar, Optional, Dict warnings.simplefilter("always", ResourceWarning) @@ -599,7 +599,7 @@ def create_lshape_map(self, force_check: bool = False) -> torch.Tensor: self.__lshape_map = lshape_map return lshape_map.clone() - def create_partition_interface(self, no_data=False): + def create_partition_interface(self, no_data: bool = False) -> Dict: """ Create a partition interface in line with the DPPY proposal. This is subject to change. The intention of this to facilitate the usage of a general format for the referencing of @@ -644,9 +644,6 @@ def create_partition_interface(self, no_data=False): }, 'locals': [(rank, 0, 0)], } - Returns - ------- - dictionary containing the partition interface as shown above. """ # sp = lshape_map = self.create_lshape_map()