From f86f44787e8d64d2f7ad43277e61020dd0ccc42f Mon Sep 17 00:00:00 2001 From: David Manthey Date: Fri, 6 Sep 2024 13:54:33 -0400 Subject: [PATCH] Add the ability to compute additional columns for plottable data --- CHANGELOG.md | 4 + .../rest/annotation.py | 14 ++- .../utils/__init__.py | 118 ++++++++++++++++-- girder_annotation/setup.py | 3 + .../test_annotation/test_annotations.py | 20 +-- .../test_annotation/test_annotations_rest.py | 2 +- requirements-dev.txt | 2 +- 7 files changed, 142 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a9c7b559..bdc741950 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## 1.29.8 +### Improvements + +- Add the option to compute additional columns for plottable data ([#1626](../../pull/1626)) + ### Bug Fixes - Fix scaling small images in the multi source with bicubic smoothing ([#1627](../../pull/1627)) diff --git a/girder_annotation/girder_large_image_annotation/rest/annotation.py b/girder_annotation/girder_large_image_annotation/rest/annotation.py index d2d233ccb..49744ef0f 100644 --- a/girder_annotation/girder_large_image_annotation/rest/annotation.py +++ b/girder_annotation/girder_large_image_annotation/rest/annotation.py @@ -661,19 +661,29 @@ def getItemPlottableElements(self, item, annotations, adjacentItems, sources=Non .param('sources', 'An optional comma separated list that can contain ' 'folder, item, annotation, annotationelement, datafile.', required=False) + .jsonParam( + 'compute', 'A dictionary with keys "columns": a list of columns ' + 'to include in the computation; if unspecified or an empty list, ' + 'no computation is done, "function": a string with the name of ' + 'the function, such as umap, "params": additional parameters to ' + 'pass to the function. If none of the requiredKeys are ' + 'compute.(x|y|z), the computation will not be performed. Only ' + 'rows which have all selected columns present will be included in ' + 'the computation.', + paramType='formData', requireObject=True, required=False) .errorResponse('ID was invalid.') .errorResponse('Read access was denied for the item.', 403), ) @access.public(cookie=True, scope=TokenScope.DATA_READ) def getItemPlottableData( - self, item, keys, adjacentItems, annotations, requiredKeys, sources=None): + self, item, keys, adjacentItems, annotations, requiredKeys, sources=None, compute=None): user = self.getCurrentUser() if adjacentItems != '__all__': adjacentItems = str(adjacentItems).lower() == 'true' sources = sources or None data = utils.PlottableItemData( user, item, annotations=annotations, adjacentItems=adjacentItems, - sources=sources) + sources=sources, compute=compute) return data.data(keys, requiredKeys) def getFolderAnnotations(self, id, recurse, user, limit=False, offset=False, sort=False, diff --git a/girder_annotation/girder_large_image_annotation/utils/__init__.py b/girder_annotation/girder_large_image_annotation/utils/__init__.py index 539eeb860..71b783185 100644 --- a/girder_annotation/girder_large_image_annotation/utils/__init__.py +++ b/girder_annotation/girder_large_image_annotation/utils/__init__.py @@ -29,6 +29,7 @@ 'application/x-xls': 'read_excel', } scanDatafileRecords = 50 +scanAnnotationElements = 5000 @functools.lru_cache(maxsize=100) @@ -393,7 +394,8 @@ class PlottableItemData: maxDistinct = 20 allowedTypes = (str, bool, int, float) - def __init__(self, user, item, annotations=None, adjacentItems=False, sources=None): + def __init__(self, user, item, annotations=None, adjacentItems=False, + sources=None, compute=None): """ Get plottable data associated with an item. @@ -408,15 +410,32 @@ def __init__(self, user, item, annotations=None, adjacentItems=False, sources=No :param sources: None for all, or a string with a comma-separated list or a list of strings; when a list, the options are folder, item, annotation, datafile. + :param compute: None for none, or a dictionary with keys "columns": a + list of columns to include in the computation; if unspecified or an + empty list, no computation is done, "function": a string with the + name of the function, such as umap, "params": additional parameters + to pass to the function. If none of the requiredKeys are + compute.(x|y|z), the computation will not be performed. Only rows + which have all selected columns present will be included in the + computation. """ self.user = user self._columns = None self._datacolumns = None self._data = None + self._compute = None + try: + if len(compute['columns']): + self._compute = {'function': 'umap', 'params': { + 'random_state': 1, 'n_jobs': 1}} + self._compute.update(compute) + except Exception: + pass if sources and not isinstance(sources, (list, tuple)): sources = sources.split(',') self._sources = tuple(sources) if sources else None - if self._sources and 'annotation' not in self._sources: + if (self._sources and 'annotation' not in self._sources and + 'annotationelement' not in self._sources): annotations = None self._fullScan = adjacentItems == '__all__' self._findItems(item, adjacentItems) @@ -559,7 +578,11 @@ def _findDataFiles(self): # noqa 'bbox.y0': 'Bounding Box Low Y', 'bbox.x1': 'Bounding Box High X', 'bbox.y1': 'Bounding Box High Y', + 'compute.x': 'Dimension Reduction X', + 'compute.y': 'Dimension Reduction Y', + 'compute.z': 'Dimension Reduction Z', } + computeColumns = {'compute.x', 'compute.y', 'compute.z'} def itemNameIDSelector(self, isName, selector): """ @@ -1068,6 +1091,78 @@ def _getColumnsFromDataFiles(self, columns): countsPerDataFile[dfidx] = count - startcount return count + def _computeFunction(self, rows): + if self._compute['function'] == 'umap': + import umap + + logger.info(f'Calling umap on {len(rows)} rows') + reducer = umap.UMAP(**self._compute['params']) + self._computed = reducer.fit_transform(list(rows.values())) + logger.info('Called umap') + return True + + def _getColumnsFromCompute(self, columns): # noqa + """ + Collect columns and data from compute actions. + """ + + def computeGetData(record): + return {} + + def computeLength(record, data): + return len(self._computed) + + def computeSelector(key): + axis = ord(key[-1:]) - ord('x') + + def computeSelectorAxis(record, data, row): + return self._computed[row][axis] + + return computeSelectorAxis + + if not self._datacolumns: + for key in self.computeColumns: + title = self.commonColumns[key] + self._ensureColumn( + columns, key, title, 'compute', computeGetData, + computeSelector(key), computeLength) + columns[key]['count'] = 1 + columns[key]['min'] = columns[key]['max'] = 0 + return 0 + if self._compute is None or not len(self._requiredColumns & self.computeColumns): + return 0 + compcol = { + key for key, col in columns.items() + if col['type'] == 'number' and col.get('min') is not None + } & set(self._compute['columns']) + if not len(compcol): + return 0 + rows = {} + cols = sorted({col for col in self._compute['columns'] if col in self._datacolumns}) + for kidx, key in enumerate(cols): + for row, value in self._datacolumns[key].items(): + if not kidx: + rows[row] = [value] + elif row in rows and len(rows[row]) == kidx: + rows[row].append(value) + rows = {k: row for k, row in rows.items() if len(row) == len(cols)} + if not len(rows): + return 0 + if not self._computeFunction(rows): + return 0 + for key in self.computeColumns: + if key in self._requiredColumns and key in self._datacolumns: + title = self.commonColumns[key] + self._ensureColumn( + columns, key, title, 'compute', computeGetData, + computeSelector(key), computeLength) + cidx = ord(key[-1:]) - ord('x') + for ridx, row in enumerate(rows): + self._datacolumns[key][row] = float(self._computed[ridx][cidx]) + columns[key]['count'] = len(rows) + columns[key]['min'] = columns[key]['max'] = 0 + return len(rows) + def _getColumns(self): """ Get a sorted list of plottable columns with some metadata for each. @@ -1086,6 +1181,7 @@ def _getColumns(self): count += self._collectColumns(columns, [item], 'item', first=False) count += self._getColumnsFromAnnotations(columns) count += self._getColumnsFromDataFiles(columns) + count += self._getColumnsFromCompute(columns) for result in columns.values(): if len(result['distinct']) <= self.maxDistinct: result['distinct'] = sorted(result['distinct']) @@ -1095,7 +1191,9 @@ def _getColumns(self): if result['type'] != 'number' or result['min'] is None: result.pop('min', None) result.pop('max', None) - prefixOrder = {'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3, 'bbox': 4} + prefixOrder = { + 'item': 0, 'annotation': 1, 'annotationelement': 2, 'data': 3, + 'bbox': 4, 'compute': 5} columns = sorted(columns.values(), key=lambda x: ( prefixOrder.get(x['key'].split('.', 1)[0], len(prefixOrder)), x['key'])) return columns @@ -1168,7 +1266,7 @@ def _collectData(self, rows, colsout): rows = [row for ridx, row in enumerate(rows) if rows[ridx] not in discard] return data, rows - def data(self, columns, requiredColumns=None): + def data(self, columns, requiredColumns=None): # noqa """ Get plottable data. @@ -1182,8 +1280,14 @@ def data(self, columns, requiredColumns=None): columns = columns.split(',') if not isinstance(requiredColumns, list): requiredColumns = requiredColumns.split(',') if requiredColumns is not None else [] - requiredColumns = set(requiredColumns) + specifiedReqColumns = set(requiredColumns) self._requiredColumns = set(requiredColumns) + if self._compute: + if ('compute.z' in specifiedReqColumns and + self._compute['function'] == 'umap' and + 'n_components' not in self._compute['params']): + self._compute['params']['n_components'] = 3 + self._requiredColumns.update(self._compute['columns']) with self._dataLock: self._datacolumns = {c: {} for c in columns} rows = set() @@ -1201,7 +1305,7 @@ def data(self, columns, requiredColumns=None): for cidx, col in enumerate(colsout): colkey = col['key'] numrows = len(data) - if colkey in requiredColumns: + if colkey in specifiedReqColumns: data = [row for row in data if row[cidx] is not None] if len(data) < numrows: logger.info(f'Reduced row count from {numrows} to {len(data)} ' @@ -1210,7 +1314,7 @@ def data(self, columns, requiredColumns=None): for cidx, col in enumerate(colsout): colkey = col['key'] numrows = len(data) - if colkey in self._requiredColumns and colkey not in requiredColumns: + if colkey in self._requiredColumns and colkey not in specifiedReqColumns: subdata = [row for row in subdata if row[cidx] is not None] if len(subdata) and len(subdata) < len(data): logger.info(f'Reduced row count from {len(data)} to {len(subdata)} ' diff --git a/girder_annotation/setup.py b/girder_annotation/setup.py index a6d099ee5..143dc5751 100644 --- a/girder_annotation/setup.py +++ b/girder_annotation/setup.py @@ -57,6 +57,9 @@ def prerelease_local_scheme(version): 'orjson', ], extras_require={ + 'compute': [ + 'umap-learn', + ], 'tasks': [ f'girder-large-image[tasks]{limit_version}', ], diff --git a/girder_annotation/test_annotation/test_annotations.py b/girder_annotation/test_annotation/test_annotations.py index a8458fc4c..1e7c7b74e 100644 --- a/girder_annotation/test_annotation/test_annotations.py +++ b/girder_annotation/test_annotation/test_annotations.py @@ -783,7 +783,7 @@ def testPlottableDataAccess(admin): plottable = girder_large_image_annotation.utils.PlottableItemData(admin, item) col = plottable.columns # Also contains item id, name, and description - assert len(col) == 12 + assert len(col) == 15 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 12 @@ -937,7 +937,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, sources='item') col = plottable.columns - assert len(col) == 3 + assert len(col) == 6 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 3 assert len(data['data']) == 1 @@ -945,7 +945,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, sources='item', adjacentItems=True) col = plottable.columns - assert len(col) == 3 + assert len(col) == 6 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 3 assert len(data['data']) == 2 @@ -953,7 +953,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, sources='item', adjacentItems='__all__') col = plottable.columns - assert len(col) == 4 + assert len(col) == 7 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 4 assert len(data['data']) == 2 @@ -961,7 +961,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1) col = plottable.columns - assert len(col) == 4 + assert len(col) == 7 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 4 assert len(data['data']) == 3 @@ -969,7 +969,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, adjacentItems=True) col = plottable.columns - assert len(col) == 4 + assert len(col) == 7 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 4 assert len(data['data']) == 4 @@ -977,7 +977,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, annotations=[str(annot1a['_id']), str(annot1c['_id'])]) col = plottable.columns - assert len(col) == 14 + assert len(col) == 17 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 14 assert len(data['data']) == 6 @@ -985,7 +985,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, annotations=[str(annot1a['_id']), str(annot1c['_id'])], adjacentItems=True) col = plottable.columns - assert len(col) == 14 + assert len(col) == 17 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 14 assert len(data['data']) == 8 @@ -993,7 +993,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, annotations='__all__') col = plottable.columns - assert len(col) == 14 + assert len(col) == 17 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 14 assert len(data['data']) == 8 @@ -1001,7 +1001,7 @@ def testPlottableDataMultipleItems(admin): plottable = girder_large_image_annotation.utils.PlottableItemData( admin, item1, annotations='__all__', adjacentItems=True) col = plottable.columns - assert len(col) == 14 + assert len(col) == 17 data = plottable.data([c['key'] for c in col]) assert len(data['columns']) == 14 assert len(data['data']) == 12 diff --git a/girder_annotation/test_annotation/test_annotations_rest.py b/girder_annotation/test_annotation/test_annotations_rest.py index bc3bd9f67..49e930f63 100644 --- a/girder_annotation/test_annotation/test_annotations_rest.py +++ b/girder_annotation/test_annotation/test_annotations_rest.py @@ -866,7 +866,7 @@ def testPlottableEndpoints(self, server, admin): }, ) assert utilities.respStatus(resp) == 200 - assert len(resp.json) == 2 + assert len(resp.json) == 5 resp = server.request( path=f'/annotation/item/{itemSrc["_id"]}/plot/list', diff --git a/requirements-dev.txt b/requirements-dev.txt index aabe62500..ec4de8471 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -26,7 +26,7 @@ girder-jobs>=3.0.3 # Girder and worker dependencies are already installed above -e utilities/tasks[girder] -e girder/. --e girder_annotation/. +-e girder_annotation/.[compute] # Extras from main setup.py pylibmc>=1.5.1