Add the ability to compute additional columns for plottable data

girder · Sep 6, 2024 · commit d165889 (1 parent: 80970f0)
Show file tree

Hide file tree

Showing 6 changed files with 137 additions and 20 deletions.
diff --git a/girder_annotation/girder_large_image_annotation/rest/annotation.py b/girder_annotation/girder_large_image_annotation/rest/annotation.py
@@ -661,19 +661,29 @@ def getItemPlottableElements(self, item, annotations, adjacentItems, sources=Non
         .param('sources', 'An optional comma separated list that can contain '
                'folder, item, annotation, annotationelement, datafile.',
                required=False)
+        .jsonParam(
+            'compute', 'A dictionary with keys "columns": a list of columns '
+            'to include in the computation; if unspecified or an empty list, '
+            'no computation is done, "function": a string with the name of '
+            'the function, such as umap, "params": additional parameters to '
+            'pass to the function.  If none of the requiredKeys are '
+            'compute.(x|y|z), the computation will not be performed.  Only '
+            'rows which have all selected columns present will be included in '
+            'the computation.',
+            paramType='formData', requireObject=True, required=False)
         .errorResponse('ID was invalid.')
         .errorResponse('Read access was denied for the item.', 403),
     )
     @access.public(cookie=True, scope=TokenScope.DATA_READ)
     def getItemPlottableData(
-            self, item, keys, adjacentItems, annotations, requiredKeys, sources=None):
+            self, item, keys, adjacentItems, annotations, requiredKeys, sources=None, compute=None):
         user = self.getCurrentUser()
         if adjacentItems != '__all__':
             adjacentItems = str(adjacentItems).lower() == 'true'
         sources = sources or None
         data = utils.PlottableItemData(
             user, item, annotations=annotations, adjacentItems=adjacentItems,
-            sources=sources)
+            sources=sources, compute=compute)
         return data.data(keys, requiredKeys)
 
     def getFolderAnnotations(self, id, recurse, user, limit=False, offset=False, sort=False,

diff --git a/girder_annotation/girder_large_image_annotation/utils/__init__.py b/girder_annotation/girder_large_image_annotation/utils/__init__.py
@@ -29,6 +29,7 @@
     'application/x-xls': 'read_excel',
 }
 scanDatafileRecords = 50
+scanAnnotationElements = 5000
 
 
 @functools.lru_cache(maxsize=100)
@@ -393,7 +394,8 @@ class PlottableItemData:
     maxDistinct = 20
     allowedTypes = (str, bool, int, float)
 
-    def __init__(self, user, item, annotations=None, adjacentItems=False, sources=None):
+    def __init__(self, user, item, annotations=None, adjacentItems=False,
+                 sources=None, compute=None):
         """
         Get plottable data associated with an item.
 
@@ -408,15 +410,32 @@ def __init__(self, user, item, annotations=None, adjacentItems=False, sources=No
         :param sources: None for all, or a string with a comma-separated list
             or a list of strings; when a list, the options are folder, item,
             annotation, datafile.
+        :param compute: None for none, or a dictionary with keys "columns": a
+            list of columns to include in the computation; if unspecified or an
+            empty list, no computation is done, "function": a string with the
+            name of the function, such as umap, "params": additional parameters
+            to pass to the function.  If none of the requiredKeys are
+            compute.(x|y|z), the computation will not be performed.  Only rows
+            which have all selected columns present will be included in the
+            computation.
         """
         self.user = user
         self._columns = None
         self._datacolumns = None
         self._data = None
+        self._compute = None
+        try:
+            if len(compute['columns']):
+                self._compute = {'function': 'umap', 'params': {
+                    'random_state': 1, 'n_jobs': 1}}
+                self._compute.update(compute)
+        except Exception:
+            pass
         if sources and not isinstance(sources, (list, tuple)):
             sources = sources.split(',')
         self._sources = tuple(sources) if sources else None
-        if self._sources and 'annotation' not in self._sources:
+        if (self._sources and 'annotation' not in self._sources and
+                'annotationelement' not in self._sources):
             annotations = None
         self._fullScan = adjacentItems == '__all__'
         self._findItems(item, adjacentItems)
@@ -559,7 +578,11 @@ def _findDataFiles(self):  # noqa
         'bbox.y0': 'Bounding Box Low Y',
         'bbox.x1': 'Bounding Box High X',
         'bbox.y1': 'Bounding Box High Y',
+        'compute.x': 'Dimension Reduction X',
+        'compute.y': 'Dimension Reduction Y',
+        'compute.z': 'Dimension Reduction Z',
     }
+    computeColumns = {'compute.x', 'compute.y', 'compute.z'}
 
     def itemNameIDSelector(self, isName, selector):
         """
@@ -1068,6 +1091,78 @@ def _getColumnsFromDataFiles(self, columns):
                     countsPerDataFile[dfidx] = count - startcount
         return count
 
+    def _computeFunction(self, rows):
+        if self._compute['function'] == 'umap':
+            import umap
+
+            logger.info(f'Calling umap on {len(rows)} rows')
+            reducer = umap.UMAP(**self._compute['params'])
+            self._computed = reducer.fit_transform(list(rows.values()))
+            logger.info('Called umap')
+            return True
+
+    def _getColumnsFromCompute(self, columns):  # noqa
+        """
+        Collect columns and data from compute actions.
+        """
+
+        def computeGetData(record):
+            return {}
+
+        def computeLength(record, data):
+            return len(self._computed)
+
+        def computeSelector(key):
+            axis = ord(key[-1:]) - ord('x')
+
+            def computeSelectorAxis(record, data, row):
+                return self._computed[row][axis]
+
+            return computeSelectorAxis
+
+        if not self._datacolumns:
+            for key in self.computeColumns:
+                title = self.commonColumns[key]
+                self._ensureColumn(
+                    columns, key, title, 'compute', computeGetData,
+                    computeSelector(key), computeLength)
+                columns[key]['count'] = 1
+                columns[key]['min'] = columns[key]['max'] = 0
+            return 0
+        if self._compute is None or not len(self._requiredColumns & self.computeColumns):
+            return 0
+        compcol = {
+            key for key, col in columns.items()
+            if col['type'] == 'number' and col.get('min') is not None
+        } & set(self._compute['columns'])
+        if not len(compcol):
+            return 0
+        rows = {}
+        cols = sorted({col for col in self._compute['columns'] if col in self._datacolumns})
+        for kidx, key in enumerate(cols):
+            for row, value in self._datacolumns[key].items():
+                if not kidx:
+                    rows[row] = [value]
+                elif row in rows and len(rows[row]) == kidx:
+                    rows[row].append(value)
+        rows = {k: row for k, row in rows.items() if len(row) == len(cols)}
+        if not len(rows):
+            return 0
+        if not self._computeFunction(rows):
+            return 0
+        for key in self.computeColumns:
+            if key in self._requiredColumns and key in self._datacolumns:
+                title = self.commonColumns[key]
+                self._ensureColumn(
+                    columns, key, title, 'compute', computeGetData,
+                    computeSelector(key), computeLength)
+                cidx = ord(key[-1:]) - ord('x')
+                for ridx, row in enumerate(rows):
+                    self._datacolumns[key][row] = float(self._computed[ridx][cidx])
+                columns[key]['count'] = len(rows)
+                columns[key]['min'] = columns[key]['max'] = 0
+        return len(rows)
+
     def _getColumns(self):
         """
         Get a sorted list of plottable columns with some metadata for each.
@@ -1086,6 +1181,7 @@ def _getColumns(self):
                     count += self._collectColumns(columns, [item], 'item', first=False)
         count += self._getColumnsFromAnnotations(columns)
         count += self._getColumnsFromDataFiles(columns)
+        count += self._getColumnsFromCompute(columns)
         for result in columns.values():
             if len(result['distinct']) <= self.maxDistinct:
                 result['distinct'] = sorted(result['distinct'])
@@ -1095,7 +1191,9 @@ def _getColumns(self):
             if result['type'] != 'number' or result['min'] is None:
                 result.pop('min', None)
                 result.pop('max', None)
-        prefixOrder = {'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3, 'bbox': 4}
+        prefixOrder = {
+            'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3,
+            'bbox': 4, 'compute': 5}
         columns = sorted(columns.values(), key=lambda x: (
             prefixOrder.get(x['key'].split('.', 1)[0], len(prefixOrder)), x['key']))
         return columns
@@ -1168,7 +1266,7 @@ def _collectData(self, rows, colsout):
             rows = [row for ridx, row in enumerate(rows) if rows[ridx] not in discard]
         return data, rows
 
-    def data(self, columns, requiredColumns=None):
+    def data(self, columns, requiredColumns=None):  # noqa
         """
         Get plottable data.
 
@@ -1182,8 +1280,14 @@ def data(self, columns, requiredColumns=None):
             columns = columns.split(',')
         if not isinstance(requiredColumns, list):
             requiredColumns = requiredColumns.split(',') if requiredColumns is not None else []
-        requiredColumns = set(requiredColumns)
+        specifiedReqColumns = set(requiredColumns)
         self._requiredColumns = set(requiredColumns)
+        if self._compute:
+            if ('compute.z' in specifiedReqColumns and
+                    self._compute['function'] == 'umap' and
+                    'n_components' not in self._compute['params']):
+                self._compute['params']['n_components'] = 3
+            self._requiredColumns.update(self._compute['columns'])
         with self._dataLock:
             self._datacolumns = {c: {} for c in columns}
             rows = set()
@@ -1201,7 +1305,7 @@ def data(self, columns, requiredColumns=None):
         for cidx, col in enumerate(colsout):
             colkey = col['key']
             numrows = len(data)
-            if colkey in requiredColumns:
+            if colkey in specifiedReqColumns:
                 data = [row for row in data if row[cidx] is not None]
             if len(data) < numrows:
                 logger.info(f'Reduced row count from {numrows} to {len(data)} '
@@ -1210,7 +1314,7 @@ def data(self, columns, requiredColumns=None):
         for cidx, col in enumerate(colsout):
             colkey = col['key']
             numrows = len(data)
-            if colkey in self._requiredColumns and colkey not in requiredColumns:
+            if colkey in self._requiredColumns and colkey not in specifiedReqColumns:
                 subdata = [row for row in subdata if row[cidx] is not None]
         if len(subdata) and len(subdata) < len(data):
             logger.info(f'Reduced row count from {len(data)} to {len(subdata)} '

diff --git a/girder_annotation/setup.py b/girder_annotation/setup.py
@@ -57,6 +57,9 @@ def prerelease_local_scheme(version):
         'orjson',
     ],
     extras_require={
+        'compute': [
+            'umap-learn',
+        ],
         'tasks': [
             f'girder-large-image[tasks]{limit_version}',
         ],

diff --git a/girder_annotation/test_annotation/test_annotations.py b/girder_annotation/test_annotation/test_annotations.py
@@ -937,71 +937,71 @@ def testPlottableDataMultipleItems(admin):
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, sources='item')
     col = plottable.columns
-    assert len(col) == 3
+    assert len(col) == 6
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 3
     assert len(data['data']) == 1
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, sources='item', adjacentItems=True)
     col = plottable.columns
-    assert len(col) == 3
+    assert len(col) == 6
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 3
     assert len(data['data']) == 2
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, sources='item', adjacentItems='__all__')
     col = plottable.columns
-    assert len(col) == 4
+    assert len(col) == 7
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 4
     assert len(data['data']) == 2
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1)
     col = plottable.columns
-    assert len(col) == 4
+    assert len(col) == 7
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 4
     assert len(data['data']) == 3
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, adjacentItems=True)
     col = plottable.columns
-    assert len(col) == 4
+    assert len(col) == 7
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 4
     assert len(data['data']) == 4
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, annotations=[str(annot1a['_id']), str(annot1c['_id'])])
     col = plottable.columns
-    assert len(col) == 14
+    assert len(col) == 17
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 14
     assert len(data['data']) == 6
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, annotations=[str(annot1a['_id']), str(annot1c['_id'])], adjacentItems=True)
     col = plottable.columns
-    assert len(col) == 14
+    assert len(col) == 17
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 14
     assert len(data['data']) == 8
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, annotations='__all__')
     col = plottable.columns
-    assert len(col) == 14
+    assert len(col) == 17
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 14
     assert len(data['data']) == 8
 
     plottable = girder_large_image_annotation.utils.PlottableItemData(
         admin, item1, annotations='__all__', adjacentItems=True)
     col = plottable.columns
-    assert len(col) == 14
+    assert len(col) == 17
     data = plottable.data([c['key'] for c in col])
     assert len(data['columns']) == 14
     assert len(data['data']) == 12
diff --git a/girder_annotation/test_annotation/test_annotations_rest.py b/girder_annotation/test_annotation/test_annotations_rest.py
@@ -866,7 +866,7 @@ def testPlottableEndpoints(self, server, admin):
             },
         )
         assert utilities.respStatus(resp) == 200
-        assert len(resp.json) == 2
+        assert len(resp.json) == 5
 
         resp = server.request(
             path=f'/annotation/item/{itemSrc["_id"]}/plot/list',

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -26,7 +26,7 @@ girder-jobs>=3.0.3
 # Girder and worker dependencies are already installed above
 -e utilities/tasks[girder]
 -e girder/.
--e girder_annotation/.
+-e girder_annotation/.[compute]
 
 # Extras from main setup.py
 pylibmc>=1.5.1