diff --git a/docs/tutorials/api_usage.ipynb b/docs/tutorials/api_usage.ipynb index c7923f0..873f360 100644 --- a/docs/tutorials/api_usage.ipynb +++ b/docs/tutorials/api_usage.ipynb @@ -306,10 +306,12 @@ "name": "stderr", "output_type": "stream", "text": [ + "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/cellpose/resnet_torch.py:280: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " state_dict = torch.load(filename, map_location=torch.device(\"cpu\"))\n", "\u001b[33;20m[WARNING] (sopa._settings)\u001b[0m Running without parallelization backend can be slow. Consider using a backend, e.g. via `sopa.settings.parallelization_backend = 'dask'`, or `export SOPA_PARALLELIZATION_BACKEND=dask`.\n", - "100%|██████████| 4/4 [00:30<00:00, 7.59s/it]\n", + "100%|██████████| 4/4 [00:28<00:00, 7.07s/it]\n", "\u001b[36;20m[INFO] (sopa.segmentation._stainings)\u001b[0m Found 314 total cells\n", - "Resolving conflicts: 100%|██████████| 100/100 [00:00<00:00, 7499.60it/s]\n", + "Resolving conflicts: 100%|██████████| 100/100 [00:00<00:00, 7575.87it/s]\n", "\u001b[36;20m[INFO] (sopa.segmentation._stainings)\u001b[0m Added 280 cell boundaries in sdata['cellpose_boundaries']\n" ] } @@ -530,7 +532,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 106.15 ms\n" + "[########################################] | 100% Completed | 106.00 ms\n" ] }, { @@ -544,7 +546,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 101.54 ms\n" + "[########################################] | 100% Completed | 106.00 ms\n" ] }, { @@ -572,19 +574,19 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "AnnData object with n_obs × n_vars = 267 × 5\n", - " obs: 'baysor_area', 'avg_assignment_confidence', 'avg_confidence', 'cluster', 'density', 'elongation', 'lifespan', 'max_cluster_frac', 'n_transcripts', 'x', 'y', 'region', 'cell_id', 'slide', 'area'\n", + "AnnData object with n_obs × n_vars = 280 × 6\n", + " obs: 'region', 'slide', 'cell_id', 'area'\n", " uns: 'sopa_attrs', 'spatialdata_attrs'\n", - " obsm: 'spatial', 'intensities'" + " obsm: 'intensities', 'spatial'" ] }, - "execution_count": 15, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -640,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -689,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -698,7 +700,7 @@ "text": [ "\u001b[36;20m[INFO] 
(sopa.io.explorer.table)\u001b[0m Writing table with 6 columns\n", "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing 3 cell categories: region, slide, cell_type\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.shapes)\u001b[0m Writing 372 cell polygons\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.shapes)\u001b[0m Writing 280 cell polygons\n", "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m Writing 40000 transcripts\n", "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 0: 40000 transcripts\n", "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 1: 10000 transcripts\n", @@ -738,7 +740,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -747,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -763,12 +765,16 @@ "output_type": "stream", "text": [ "Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers). Got range [0.0..1.9530949634755863].\n", + "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/anndata/_core/anndata.py:430: FutureWarning: The dtype argument is deprecated and will be removed in late 2024.\n", + " warnings.warn(\n", "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n", "/Users/quentinblampey/dev/_external/spatialdata/src/spatialdata/_core/_elements.py:106: UserWarning: Key `misc` already exists. Overwriting it in-memory.\n", " self._check_key(key, self.keys(), self._shared_keys)\n", "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/spatialdata_plot/pl/render.py:494: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored\n", " _cax = ax.scatter(\n", + "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/anndata/_core/anndata.py:430: FutureWarning: The dtype argument is deprecated and will be removed in late 2024.\n", + " warnings.warn(\n", "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n", "/Users/quentinblampey/dev/_external/spatialdata/src/spatialdata/_core/_elements.py:106: UserWarning: Key `transcripts` already exists. 
Overwriting it in-memory.\n", diff --git a/docs/tutorials/xenium_explorer/explorer.ipynb b/docs/tutorials/xenium_explorer/explorer.ipynb index 6ed37e5..fe1c5bb 100644 --- a/docs/tutorials/xenium_explorer/explorer.ipynb +++ b/docs/tutorials/xenium_explorer/explorer.ipynb @@ -53,7 +53,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 259.68 ms\n" + "[########################################] | 100% Completed | 249.06 ms\n" ] }, { @@ -67,7 +67,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% Completed | 101.64 ms\n" + "[########################################] | 100% Completed | 106.16 ms\n" ] }, { @@ -86,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -131,7 +131,7 @@ } ], "source": [ - "sopa.io.write(explorer_path, sdata)" + "sopa.io.explorer.write(explorer_path, sdata)" ] }, { @@ -428,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -446,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -471,7 +471,7 @@ " transcripts (Points)" ] }, - "execution_count": 17, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -489,7 +489,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -503,28 +503,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ ] | 0% Completed | 267.45 ms\n" + "[########################################] | 100% Completed | 101.67 ms\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36;20m[INFO] (sopa.aggregation.channels)\u001b[0m Averaging channels intensity over 4 cells with expansion expand_radius_ratio=0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 105.89 ms\n" ] }, { - "ename": "ValueError", - "evalue": "'index_left' and 'index_right' cannot be names in the frames being joined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43msopa\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moverlay_segmentation\u001b[49m\u001b[43m(\u001b[49m\u001b[43msdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshapes_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkey_added\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/dev/sopa/sopa/aggregation/overlay.py:73\u001b[0m, in \u001b[0;36moverlay_segmentation\u001b[0;34m(sdata, shapes_key, gene_column, area_ratio_threshold, image_key)\u001b[0m\n\u001b[1;32m 70\u001b[0m table_crop \u001b[38;5;241m=\u001b[39m old_table[\u001b[38;5;241m~\u001b[39mnp\u001b[38;5;241m.\u001b[39misin(old_table\u001b[38;5;241m.\u001b[39mobs[instance_key], gdf_join\u001b[38;5;241m.\u001b[39mindex)]\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m 71\u001b[0m table_crop\u001b[38;5;241m.\u001b[39mobs[SopaKeys\u001b[38;5;241m.\u001b[39mCELL_OVERLAY_KEY] \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m---> 73\u001b[0m \u001b[43maggr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgene_column\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgene_column\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maverage_intensities\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maverage_intensities\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 74\u001b[0m aggr\u001b[38;5;241m.\u001b[39mtable\u001b[38;5;241m.\u001b[39mobs[SopaKeys\u001b[38;5;241m.\u001b[39mCELL_OVERLAY_KEY] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 76\u001b[0m aggr\u001b[38;5;241m.\u001b[39mtable \u001b[38;5;241m=\u001b[39m anndata\u001b[38;5;241m.\u001b[39mconcat([table_crop, aggr\u001b[38;5;241m.\u001b[39mtable], uns_merge\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfirst\u001b[39m\u001b[38;5;124m\"\u001b[39m, join\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mouter\u001b[39m\u001b[38;5;124m\"\u001b[39m, fill_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n", - "File \u001b[0;32m~/dev/sopa/sopa/aggregation/aggregation.py:141\u001b[0m, in \u001b[0;36mAggregator.compute_table\u001b[0;34m(self, gene_column, average_intensities, expand_radius_ratio, points_key, min_transcripts, min_intensity_ratio)\u001b[0m\n\u001b[1;32m 139\u001b[0m log\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msdata.table is already existing. Transcripts are not count again.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 141\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtable \u001b[38;5;241m=\u001b[39m \u001b[43mcount_transcripts\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 142\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgene_column\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshapes_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshapes_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpoints_key\u001b[49m\n\u001b[1;32m 143\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbins_key \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtable \u001b[38;5;241m=\u001b[39m aggregate_bins(\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msdata, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshapes_key, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbins_key, expand_radius_ratio\u001b[38;5;241m=\u001b[39mexpand_radius_ratio \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;241m0.2\u001b[39m\n\u001b[1;32m 147\u001b[0m )\n", - "File \u001b[0;32m~/dev/sopa/sopa/aggregation/transcripts.py:49\u001b[0m, in \u001b[0;36mcount_transcripts\u001b[0;34m(sdata, gene_column, shapes_key, points_key, geo_df)\u001b[0m\n\u001b[1;32m 46\u001b[0m geo_df \u001b[38;5;241m=\u001b[39m to_intrinsic(sdata, geo_df, points_key)\n\u001b[1;32m 48\u001b[0m 
log\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAggregating transcripts over \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(geo_df)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cells\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 49\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_count_transcripts_aligned\u001b[49m\u001b[43m(\u001b[49m\u001b[43mgeo_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpoints\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgene_column\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/dev/sopa/sopa/aggregation/transcripts.py:78\u001b[0m, in \u001b[0;36m_count_transcripts_aligned\u001b[0;34m(geo_df, points, value_key)\u001b[0m\n\u001b[1;32m 72\u001b[0m X_partitions \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ProgressBar():\n\u001b[1;32m 75\u001b[0m \u001b[43mpoints\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_partitions\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartial\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_add_coo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mX_partitions\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgeo_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgene_column\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalue_key\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgene_names\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgene_names\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mmeta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m---> 78\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m X_partition \u001b[38;5;129;01min\u001b[39;00m X_partitions:\n\u001b[1;32m 81\u001b[0m adata\u001b[38;5;241m.\u001b[39mX \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m X_partition\n", - "File \u001b[0;32m~/mambaforge/envs/sopa/lib/python3.10/site-packages/dask/base.py:372\u001b[0m, in \u001b[0;36mDaskMethodsMixin.compute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcompute\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 349\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Compute this dask collection\u001b[39;00m\n\u001b[1;32m 350\u001b[0m \n\u001b[1;32m 351\u001b[0m \u001b[38;5;124;03m This turns a lazy Dask collection into its in-memory equivalent.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 370\u001b[0m \u001b[38;5;124;03m dask.compute\u001b[39;00m\n\u001b[1;32m 371\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 372\u001b[0m (result,) \u001b[38;5;241m=\u001b[39m \u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraverse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 373\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "File \u001b[0;32m~/mambaforge/envs/sopa/lib/python3.10/site-packages/dask/base.py:660\u001b[0m, in \u001b[0;36mcompute\u001b[0;34m(traverse, optimize_graph, scheduler, get, *args, **kwargs)\u001b[0m\n\u001b[1;32m 657\u001b[0m postcomputes\u001b[38;5;241m.\u001b[39mappend(x\u001b[38;5;241m.\u001b[39m__dask_postcompute__())\n\u001b[1;32m 659\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m shorten_traceback():\n\u001b[0;32m--> 660\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mschedule\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdsk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 662\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m repack([f(r, \u001b[38;5;241m*\u001b[39ma) \u001b[38;5;28;01mfor\u001b[39;00m r, (f, a) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(results, postcomputes)])\n", - "File \u001b[0;32m~/dev/sopa/sopa/aggregation/transcripts.py:95\u001b[0m, in \u001b[0;36m_add_coo\u001b[0;34m(X_partitions, geo_df, partition, gene_column, gene_names)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_add_coo\u001b[39m(\n\u001b[1;32m 88\u001b[0m X_partitions: \u001b[38;5;28mlist\u001b[39m[coo_matrix],\n\u001b[1;32m 89\u001b[0m geo_df: gpd\u001b[38;5;241m.\u001b[39mGeoDataFrame,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 92\u001b[0m gene_names: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 93\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 94\u001b[0m points_gdf \u001b[38;5;241m=\u001b[39m gpd\u001b[38;5;241m.\u001b[39mGeoDataFrame(partition, geometry\u001b[38;5;241m=\u001b[39mgpd\u001b[38;5;241m.\u001b[39mpoints_from_xy(partition[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mx\u001b[39m\u001b[38;5;124m\"\u001b[39m], partition[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m]))\n\u001b[0;32m---> 95\u001b[0m joined \u001b[38;5;241m=\u001b[39m \u001b[43mgeo_df\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpoints_gdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m cells_indices, column_indices \u001b[38;5;241m=\u001b[39m joined\u001b[38;5;241m.\u001b[39mindex, joined[gene_column]\u001b[38;5;241m.\u001b[39mcat\u001b[38;5;241m.\u001b[39mcodes\n\u001b[1;32m 98\u001b[0m cells_indices \u001b[38;5;241m=\u001b[39m cells_indices[column_indices \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m]\n", - "File \u001b[0;32m~/mambaforge/envs/sopa/lib/python3.10/site-packages/geopandas/geodataframe.py:2195\u001b[0m, in \u001b[0;36mGeoDataFrame.sjoin\u001b[0;34m(self, df, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2121\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msjoin\u001b[39m(\u001b[38;5;28mself\u001b[39m, df, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 2122\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Spatial join of two GeoDataFrames.\u001b[39;00m\n\u001b[1;32m 2123\u001b[0m \n\u001b[1;32m 2124\u001b[0m \u001b[38;5;124;03m See the User Guide page :doc:`../../user_guide/mergingdata` for details.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2193\u001b[0m \u001b[38;5;124;03m sjoin : 
equivalent top-level function\u001b[39;00m\n\u001b[1;32m 2194\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 2195\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mgeopandas\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mleft_df\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mright_df\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/sopa/lib/python3.10/site-packages/geopandas/tools/sjoin.py:119\u001b[0m, in \u001b[0;36msjoin\u001b[0;34m(left_df, right_df, how, predicate, lsuffix, rsuffix, **kwargs)\u001b[0m\n\u001b[1;32m 116\u001b[0m first \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28miter\u001b[39m(kwargs\u001b[38;5;241m.\u001b[39mkeys()))\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msjoin() got an unexpected keyword argument \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfirst\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 119\u001b[0m \u001b[43m_basic_checks\u001b[49m\u001b[43m(\u001b[49m\u001b[43mleft_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mright_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlsuffix\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrsuffix\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 121\u001b[0m indices \u001b[38;5;241m=\u001b[39m _geom_predicate_query(left_df, right_df, predicate)\n\u001b[1;32m 123\u001b[0m joined \u001b[38;5;241m=\u001b[39m _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)\n", - "File \u001b[0;32m~/mambaforge/envs/sopa/lib/python3.10/site-packages/geopandas/tools/sjoin.py:172\u001b[0m, in \u001b[0;36m_basic_checks\u001b[0;34m(left_df, right_df, how, lsuffix, rsuffix)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# due to GH 352\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(left_df\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin([index_left, index_right])) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28many\u001b[39m(\n\u001b[1;32m 170\u001b[0m right_df\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39misin([index_left, index_right])\n\u001b[1;32m 171\u001b[0m ):\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m and \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m cannot be names in the frames being\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 174\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m joined\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(index_left, index_right)\n\u001b[1;32m 175\u001b[0m )\n", - 
"\u001b[0;31mValueError\u001b[0m: 'index_left' and 'index_right' cannot be names in the frames being joined" + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/quentinblampey/dev/_external/spatialdata/src/spatialdata/_core/_elements.py:96: UserWarning: Key `large_cells` already exists. Overwriting it in-memory.\n", + " self._check_key(key, self.keys(), self._shared_keys)\n", + "/Users/quentinblampey/mambaforge/envs/sopa/lib/python3.10/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n", + " utils.warn_names_duplicates(\"obs\")\n", + "/Users/quentinblampey/dev/_external/spatialdata/src/spatialdata/_core/_elements.py:116: UserWarning: Key `table` already exists. Overwriting it in-memory.\n", + " self._check_key(key, self.keys(), self._shared_keys)\n" ] } ], @@ -536,37 +541,39 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, we have a new table (the old table is also kept), and we have new shapes called `'cellpose_boundaries+large_cells'`." + "Now, we have a new table (the old table is also kept), and we have new shapes called `'cellpose_boundaries_overlay_large_cells'`." ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "SpatialData object with:\n", + "SpatialData object\n", "├── Images\n", - "│ └── 'image': SpatialImage[cyx] (4, 2048, 2048)\n", + "│ ├── 'he_image': DataTree[cyx] (3, 1024, 1024), (3, 512, 512), (3, 256, 256)\n", + "│ └── 'image': DataArray[cyx] (4, 2048, 2048)\n", "├── Points\n", - "│ └── 'transcripts': DataFrame with shape: (, 4) (3D points)\n", + "│ ├── 'misc': DataFrame with shape: (, 2) (2D points)\n", + "│ └── 'transcripts': DataFrame with shape: (, 5) (2D points)\n", "├── Shapes\n", "│ ├── 'cellpose_boundaries': GeoDataFrame shape: (400, 1) (2D shapes)\n", - "│ ├── 'cellpose_boundaries+large_cells': GeoDataFrame shape: (380, 1) (2D shapes)\n", + "│ ├── 'cellpose_boundaries_overlay_large_cells': GeoDataFrame shape: (380, 1) (2D shapes)\n", "│ └── 'large_cells': GeoDataFrame shape: (4, 1) (2D shapes)\n", "└── Tables\n", - " ├── 'old_table': AnnData (400, 5)\n", - " └── 'table': AnnData (380, 5)\n", + " ├── 'old_table': AnnData (400, 6)\n", + " └── 'table': AnnData (380, 6)\n", "with coordinate systems:\n", - "▸ 'global', with elements:\n", - " image (Images), transcripts (Points), cellpose_boundaries (Shapes), cellpose_boundaries+large_cells (Shapes), large_cells (Shapes)\n", - "▸ 'microns', with elements:\n", + " ▸ 'global', with elements:\n", + " he_image (Images), image (Images), misc (Points), transcripts (Points), cellpose_boundaries (Shapes), cellpose_boundaries_overlay_large_cells (Shapes), large_cells (Shapes)\n", + " ▸ 'microns', with elements:\n", " transcripts (Points)" ] }, - "execution_count": 23, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -586,15 +593,15 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing table with 5 columns\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing 4 cell categories: region, slide, leiden, lasso\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing table with 6 columns\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing 2 cell categories: region, slide\n", 
"\u001b[36;20m[INFO] (sopa.io.explorer.shapes)\u001b[0m Writing 380 cell polygons\n", "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m Saved files in the following directory: tuto.explorer\n", "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m You can open the experiment with 'open tuto.explorer/experiment.xenium'\n" @@ -602,10 +609,10 @@ } ], "source": [ - "sopa.io.write(\n", + "sopa.io.explorer.write(\n", " explorer_path,\n", " sdata,\n", - " shapes_key=\"cellpose_boundaries+large_cells\",\n", + " shapes_key=\"cellpose_boundaries_overlay_large_cells\",\n", " gene_column=\"genes\",\n", " mode=\"-it\",\n", ")" diff --git a/mkdocs.yml b/mkdocs.yml index 6699f12..8b70b72 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,7 @@ nav: - Readers: api/readers.md - Patches: api/patches.md - Segmentation: api/segmentation.md + - Aggregation: api/aggregation.md - Utils: api/utils.md - Spatial operations: api/spatial.md - Misc: api/misc.md diff --git a/sopa/aggregation/aggregation.py b/sopa/aggregation/aggregation.py index 169bc91..acd9701 100644 --- a/sopa/aggregation/aggregation.py +++ b/sopa/aggregation/aggregation.py @@ -7,7 +7,7 @@ from anndata import AnnData from scipy.sparse import csr_matrix from spatialdata import SpatialData -from spatialdata.models import PointsModel, TableModel +from spatialdata.models import TableModel from .._constants import ATTRS_KEY, SopaAttrs, SopaKeys from ..io.explorer.utils import str_cell_id @@ -28,33 +28,52 @@ def aggregate( aggregate_channels: bool = True, image_key: str | None = None, points_key: str | None = None, + gene_column: str | None = None, shapes_key: str | None = None, bins_key: str | None = None, - min_transcripts: int = 0, expand_radius_ratio: float | None = None, + min_transcripts: int = 0, min_intensity_ratio: float = 0.1, ): + """Aggregate gene counts and/or channel intensities over a `SpatialData` object to create an `AnnData` table (saved in `sdata["table"]`). + + !!! info + The main arguments are `sdata`, `aggregate_genes`, and `aggregate_channels`. The rest of the arguments are optional and will be inferred from the data if not provided. + + - If channels are aggregated and not genes, then `sdata['table'].X` will contain the mean channel intensities per cell. + - If genes are aggregated and not channels, then `sdata['table'].X` will contain the gene counts per cell. + - If both genes and channels are aggregated, then `sdata['table'].X` will contain the gene counts per cell and `sdata['table'].obsm['intensities']` will contain the mean channel intensities per cell. + + Args: + sdata: A `SpatialData` object + aggregate_genes: Whether to aggregate gene counts. If None, it will be inferred from the data. + aggregate_channels: Whether to aggregate channel intensities inside cells. + image_key: Key of `sdata` with the image channels to be averaged. By default, uses the segmentation image. + points_key: Key of `sdata` with the points dataframe representing the transcripts. + gene_column: Key of `sdata[points_key]` with the gene names. + shapes_key: Key of `sdata` with the shapes corresponding to the cells boundaries. + bins_key: Key of `sdata` with the table corresponding to the bin-by-gene table of gene counts (e.g., for Visium HD data). + expand_radius_ratio: Ratio to expand the cells polygons for channels averaging. By default, the value will be inferred depending on whether we aggregate bins or not. + min_transcripts: Min number of transcripts to keep a cell. 
+ min_intensity_ratio: Min ratio of the 90th quantile of the mean channel intensity to keep a cell. + """ bins_key = bins_key or sdata.attrs.get(SopaAttrs.BINS_TABLE) - aggr = Aggregator(sdata, image_key=image_key, shapes_key=shapes_key, bins_key=bins_key) - - points_key, gene_column = None, None + points_key = None if aggregate_genes or (aggregate_genes is None and bins_key is None and sdata.points): - points_key, points = get_spatial_element( + points_key, _ = get_spatial_element( sdata.points, key=points_key or sdata.attrs.get(SopaAttrs.TRANSCRIPTS), return_key=True ) - gene_column = points.attrs.get(ATTRS_KEY, {}).get(PointsModel.FEATURE_KEY) - assert ( - aggregate_genes is None or gene_column is not None - ), f"No gene column found in sdata['{points_key}'].attrs['{ATTRS_KEY}']['{PointsModel.FEATURE_KEY}']" + + aggr = Aggregator(sdata, image_key=image_key, shapes_key=shapes_key, bins_key=bins_key, points_key=points_key) aggr.compute_table( - gene_column=gene_column, - average_intensities=aggregate_channels, + aggregate_genes=aggregate_genes, + aggregate_channels=aggregate_channels, expand_radius_ratio=expand_radius_ratio, min_transcripts=min_transcripts, min_intensity_ratio=min_intensity_ratio, - points_key=points_key, + gene_column=gene_column, ) @@ -69,6 +88,7 @@ def __init__( image_key: str | None = None, shapes_key: str | None = None, bins_key: str | None = None, + points_key: str | None = None, ): """ Args: @@ -76,17 +96,14 @@ def __init__( image_key: Key of `sdata` with the image to be averaged. If only one image, this does not have to be provided shapes_key: Key of `sdata` with the shapes corresponding to the cells boundaries bins_key: Key of `sdata` with the table corresponding to the bins table of gene counts (e.g., for Visium HD data) + points_key: Key of `sdata` with the points dataframe representing the transcripts """ self.sdata = sdata self.bins_key = bins_key + self.points_key = points_key self.image_key, self.image = get_spatial_image(sdata, image_key, return_key=True) - - if shapes_key is None: - self.shapes_key, self.geo_df = get_boundaries(sdata, return_key=True) - else: - self.shapes_key = shapes_key - self.geo_df = self.sdata[shapes_key] + self.shapes_key, self.geo_df = get_boundaries(sdata, return_key=True, key=shapes_key) self.table = None if SopaKeys.TABLE in self.sdata.tables and self.sdata[SopaKeys.TABLE].n_obs == len(self.geo_df): @@ -102,56 +119,44 @@ def filter_cells(self, where_filter: np.ndarray): self.table = self.table[~where_filter] def update_table(self, *args, **kwargs): - log.warning("'update_table' is deprecated, use 'compute_table' instead") + log.warning("'update_table' is deprecated and will be removed in future versions, use 'compute_table' instead") self.compute_table(*args, **kwargs) def compute_table( self, + aggregate_genes: bool | None = None, + aggregate_channels: bool = True, gene_column: str | None = None, - average_intensities: bool = True, expand_radius_ratio: float | None = None, - points_key: str | None = None, min_transcripts: int = 0, min_intensity_ratio: float = 0, + average_intensities: bool = True, # deprecated argument + points_key: str | None = None, # deprecated argument ): - """Perform aggregation and update the spatialdata table - - Args: - gene_column: Column key of the transcript dataframe containing the gene names - average_intensities: Whether to average the channels intensities inside cells polygons - expand_radius_ratio: Cells polygons will be expanded by `expand_radius_ratio * mean_radius` for channels averaging 
**only**. This help better aggregate boundary stainings - min_transcripts: Minimum amount of transcript to keep a cell - min_intensity_ratio: Cells whose mean channel intensity is less than `min_intensity_ratio * quantile_90` will be filtered - """ - does_count = ( - (self.table is not None and isinstance(self.table.X, csr_matrix)) - or gene_column is not None - or self.bins_key is not None + aggregate_genes, aggregate_channels = self._legacy_arguments( + points_key, gene_column, aggregate_genes, aggregate_channels, average_intensities ) assert ( - average_intensities or does_count - ), "You must choose at least one aggregation: transcripts or fluorescence intensities" - assert ( - gene_column is None or self.bins_key is None - ), "Can't count transcripts and aggregate bins at the same time" + aggregate_genes or aggregate_channels + ), "At least one of `aggregate_genes` or `aggregate_channels` must be True" - if gene_column is not None: - if self.table is not None: - log.warning("sdata.table is already existing. Transcripts are not count again.") - else: + if aggregate_genes: + if self.bins_key is not None: + self.table = aggregate_bins( + self.sdata, self.shapes_key, self.bins_key, expand_radius_ratio=expand_radius_ratio or 0.2 + ) + elif self.table is None: self.table = count_transcripts( self.sdata, gene_column, shapes_key=self.shapes_key, points_key=points_key ) - elif self.bins_key is not None: - self.table = aggregate_bins( - self.sdata, self.shapes_key, self.bins_key, expand_radius_ratio=expand_radius_ratio or 0.2 - ) + else: + log.warning("sdata.table is already existing. Transcripts are not count again.") - if does_count and min_transcripts > 0: - self.filter_cells(self.table.X.sum(axis=1) < min_transcripts) + if min_transcripts > 0: + self.filter_cells(self.table.X.sum(axis=1) < min_transcripts) - if average_intensities: + if aggregate_channels: mean_intensities = average_channels( self.sdata, image_key=self.image_key, @@ -166,29 +171,59 @@ def compute_table( self.filter_cells(where_filter) mean_intensities = mean_intensities[~where_filter] - if not does_count: + if aggregate_genes: + self.table.obsm[SopaKeys.INTENSITIES_OBSM] = pd.DataFrame( + mean_intensities, + columns=self.image.coords["c"].values.astype(str), + index=self.table.obs_names, + ) + else: self.table = AnnData( mean_intensities, dtype=mean_intensities.dtype, var=pd.DataFrame(index=self.image.coords["c"].values.astype(str)), obs=pd.DataFrame(index=self.geo_df.index), ) - else: - self.table.obsm[SopaKeys.INTENSITIES_OBSM] = pd.DataFrame( - mean_intensities, - columns=self.image.coords["c"].values.astype(str), - index=self.table.obs_names, - ) self.sdata.shapes[self.shapes_key] = self.geo_df self.table.uns[SopaKeys.UNS_KEY] = { - SopaKeys.UNS_HAS_TRANSCRIPTS: does_count, - SopaKeys.UNS_HAS_INTENSITIES: average_intensities, + SopaKeys.UNS_HAS_TRANSCRIPTS: aggregate_genes, + SopaKeys.UNS_HAS_INTENSITIES: aggregate_channels, } self.add_standardized_table() + def _legacy_arguments( + self, + points_key: str | None, + gene_column: str | None, + aggregate_genes: bool | None, + aggregate_channels: bool, + average_intensities: bool, + ) -> tuple[bool, bool]: + if points_key is not None: + log.warning( + "`points_key` in `compute_table` is deprecated and will be removed in future versions, provide it in the constructor instead" + ) + self.points_key = points_key + + if aggregate_genes is None: + aggregate_genes = ( + (self.table is not None and isinstance(self.table.X, csr_matrix)) + or gene_column is not None + or 
self.bins_key is not None + or self.points_key is not None + ) + + if not average_intensities: + log.warning( + "`average_intensities=False` is deprecated and will be removed in future versions, use `aggregate_channels=False` instead" + ) + return aggregate_genes, False + + return aggregate_genes, aggregate_channels + def add_standardized_table(self): self.table.obs_names = list(map(str_cell_id, range(self.table.n_obs))) self.geo_df.index = list(self.table.obs_names) diff --git a/sopa/aggregation/overlay.py b/sopa/aggregation/overlay.py index b3dcf53..abfb0e1 100644 --- a/sopa/aggregation/overlay.py +++ b/sopa/aggregation/overlay.py @@ -10,8 +10,8 @@ from shapely import Polygon from spatialdata import SpatialData -from .._constants import ATTRS_KEY, SopaAttrs, SopaKeys -from ..utils import get_spatial_element, to_intrinsic +from .._constants import SopaAttrs, SopaKeys +from ..utils import get_feature_key, get_spatial_element, to_intrinsic from . import Aggregator log = logging.getLogger(__name__) @@ -33,17 +33,20 @@ def overlay_segmentation( area_ratio_threshold: Threshold between 0 and 1. For each original cell overlapping with a new cell, we compute the overlap-area/cell-area, if above the threshold the cell is removed. image_key: Optional key of the original image """ - average_intensities = False + aggregate_genes, aggregate_channels = False, False - if "table" in sdata.tables and SopaKeys.UNS_KEY in sdata.tables["table"].uns: - sopa_attrs = sdata.tables["table"].uns[SopaKeys.UNS_KEY] + assert SopaKeys.TABLE in sdata.tables, "No table found in the SpatialData object" - if sopa_attrs[SopaKeys.UNS_HAS_TRANSCRIPTS]: - if gene_column is None: - points = get_spatial_element(sdata.points, key=sdata.attrs.get(SopaAttrs.TRANSCRIPTS)) - gene_column = points.attrs.get(ATTRS_KEY, {}).get("feature_key") - assert gene_column is not None, "Need 'gene_column' argument to count transcripts" - average_intensities = sopa_attrs[SopaKeys.UNS_HAS_INTENSITIES] + assert SopaKeys.UNS_KEY in sdata.tables["table"].uns, "It seems the table was not aggregated using `sopa.aggregate`" + + sopa_attrs = sdata.tables["table"].uns[SopaKeys.UNS_KEY] + + aggregate_genes = sopa_attrs[SopaKeys.UNS_HAS_TRANSCRIPTS] + aggregate_channels = sopa_attrs[SopaKeys.UNS_HAS_INTENSITIES] + + if aggregate_genes and gene_column is None: + points = get_spatial_element(sdata.points, key=sdata.attrs.get(SopaAttrs.TRANSCRIPTS)) + gene_column = get_feature_key(points, raise_error=True) aggr = Aggregator(sdata, image_key=image_key, shapes_key=shapes_key) @@ -61,7 +64,7 @@ def overlay_segmentation( old_geo_df = aggr.sdata[old_shapes_key] geo_df = to_intrinsic(aggr.sdata, aggr.geo_df, old_geo_df) - geo_df.index.name = "index_right" # to reuse the index name later + geo_df.index.name = None gdf_join = gpd.sjoin(old_geo_df, geo_df) gdf_join["geometry_right"] = gdf_join["index_right"].map(lambda i: geo_df.geometry.iloc[i]) gdf_join["overlap_ratio"] = gdf_join.apply(_overlap_area_ratio, axis=1) @@ -70,13 +73,13 @@ def overlay_segmentation( table_crop = old_table[~np.isin(old_table.obs[instance_key], gdf_join.index)].copy() table_crop.obs[SopaKeys.CELL_OVERLAY_KEY] = False - aggr.compute_table(gene_column=gene_column, average_intensities=average_intensities) + aggr.compute_table(aggregate_channels=aggregate_channels, aggregate_genes=aggregate_genes, gene_column=gene_column) aggr.table.obs[SopaKeys.CELL_OVERLAY_KEY] = True aggr.table = anndata.concat([table_crop, aggr.table], uns_merge="first", join="outer", fill_value=0) _fillna(aggr.table.obs) - 
aggr.shapes_key = f"{old_shapes_key}+{aggr.shapes_key}" + aggr.shapes_key = f"{old_shapes_key}_overlay_{aggr.shapes_key}" geo_df_cropped = old_geo_df.loc[~old_geo_df.index.isin(gdf_join.index)] aggr.geo_df = pd.concat([geo_df_cropped, geo_df], join="outer", axis=0) aggr.geo_df.attrs = old_geo_df.attrs diff --git a/sopa/aggregation/transcripts.py b/sopa/aggregation/transcripts.py index c104e52..1c3192f 100644 --- a/sopa/aggregation/transcripts.py +++ b/sopa/aggregation/transcripts.py @@ -13,16 +13,16 @@ from spatialdata import SpatialData from .._constants import SopaAttrs -from ..utils import get_boundaries, get_spatial_element, to_intrinsic +from ..utils import get_boundaries, get_feature_key, get_spatial_element, to_intrinsic log = logging.getLogger(__name__) def count_transcripts( sdata: SpatialData, - gene_column: str, - shapes_key: str = None, - points_key: str = None, + gene_column: str | None = None, + shapes_key: str | None = None, + points_key: str | None = None, geo_df: gpd.GeoDataFrame | None = None, ) -> AnnData: """Counts transcripts per cell. @@ -45,6 +45,8 @@ def count_transcripts( geo_df = get_boundaries(sdata, key=shapes_key) geo_df = to_intrinsic(sdata, geo_df, points_key) + gene_column = gene_column or get_feature_key(points, raise_error=True) + log.info(f"Aggregating transcripts over {len(geo_df)} cells") return _count_transcripts_aligned(geo_df, points, gene_column) diff --git a/sopa/io/explorer/converter.py b/sopa/io/explorer/converter.py index 0c6d7ac..62aa067 100644 --- a/sopa/io/explorer/converter.py +++ b/sopa/io/explorer/converter.py @@ -11,6 +11,7 @@ from ..._constants import ATTRS_KEY, SopaAttrs, SopaKeys from ...utils import ( get_boundaries, + get_feature_key, get_spatial_element, get_spatial_image, to_intrinsic, @@ -144,7 +145,7 @@ def write( df = get_spatial_element(sdata.points, key=points_key or sdata.attrs.get(SopaAttrs.TRANSCRIPTS)) if _should_save(mode, "t") and df is not None: - gene_column = gene_column or df.attrs[ATTRS_KEY].get("feature_key") + gene_column = gene_column or get_feature_key(df) if gene_column is not None: df = to_intrinsic(sdata, df, image_key) write_transcripts(path, df, gene_column, pixel_size=pixel_size) diff --git a/sopa/io/reader/cosmx.py b/sopa/io/reader/cosmx.py index abfdc38..0b2f3ca 100644 --- a/sopa/io/reader/cosmx.py +++ b/sopa/io/reader/cosmx.py @@ -39,6 +39,8 @@ def cosmx( - `*_tx_file.csv.gz` or `*_tx_file.csv`: Transcripts location and names - If `read_proteins` is `True`, all the images under the nested `ProteinImages` directories will be read + These files must be exported as flat files in AtomX. That is: within a study, click on "Export" and then select files from the "Flat CSV Files" section (transcripts flat and FOV position flat). + Args: path: Path to the root directory containing *Nanostring* files. dataset_id: Optional name of the dataset (needs to be provided if not infered). 
@@ -90,7 +92,7 @@ def cosmx(
     parsed_image = Image2DModel.parse(image, dims=("c", "y", "x"), c_coords=c_coords, **image_models_kwargs)
 
     if read_proteins:
-        return SpatialData(images={image_name: parsed_image})
+        return SpatialData(images={image_name: parsed_image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})
 
     ### Read transcripts
     transcripts_data = _read_transcripts_csv(path, dataset_id)
diff --git a/sopa/io/reader/utils.py b/sopa/io/reader/utils.py
index a949caa..14dad35 100644
--- a/sopa/io/reader/utils.py
+++ b/sopa/io/reader/utils.py
@@ -162,4 +162,4 @@ def ome_tif(path: Path, as_image: bool = False) -> DataArray | SpatialData:
         **image_models_kwargs,
     )
 
-    return SpatialData(images={image_name: image})
+    return SpatialData(images={image_name: image}, attrs={SopaAttrs.CELL_SEGMENTATION: image_name})
diff --git a/sopa/io/reader/wsi.py b/sopa/io/reader/wsi.py
index 762093c..d1bc561 100644
--- a/sopa/io/reader/wsi.py
+++ b/sopa/io/reader/wsi.py
@@ -98,7 +98,7 @@ def wsi_autoscale(path: str | Path, image_model_kwargs: dict | None = None) -> S
     )
     multiscale_image.attrs["metadata"] = tiff_metadata
 
-    return SpatialData(images={image_name: multiscale_image})
+    return SpatialData(images={image_name: multiscale_image}, attrs={SopaAttrs.TISSUE_SEGMENTATION: image_name})
 
 
 def _default_image_models_kwargs(image_models_kwargs: dict | None) -> dict:
diff --git a/sopa/segmentation/methods/_baysor.py b/sopa/segmentation/methods/_baysor.py
index 012377b..1ebc516 100644
--- a/sopa/segmentation/methods/_baysor.py
+++ b/sopa/segmentation/methods/_baysor.py
@@ -7,8 +7,8 @@
 from spatialdata import SpatialData
 
 from ... import settings
-from ..._constants import ATTRS_KEY, SopaAttrs, SopaFiles, SopaKeys
-from ...utils import get_transcripts_patches_dirs
+from ..._constants import SopaAttrs, SopaFiles, SopaKeys
+from ...utils import get_feature_key, get_transcripts_patches_dirs
 from .._transcripts import resolve
 
 log = logging.getLogger(__name__)
@@ -164,10 +164,7 @@ def _get_default_config(sdata: SpatialData) -> dict:
         points_key
     ), f"Transcripts key not found in sdata.attrs['{SopaAttrs.TRANSCRIPTS}'], baysor config can't be inferred."
 
-    feature_key = sdata[points_key].attrs[ATTRS_KEY].get("feature_key")
-    assert (
-        feature_key
-    ), f"Feature key not found in sdata['{points_key}'].attrs['{ATTRS_KEY}'], baysor config can't be inferred."
+    feature_key = get_feature_key(sdata[points_key], raise_error=True)
 
     return {
         "data": {
diff --git a/sopa/utils/__init__.py b/sopa/utils/__init__.py
index 867d152..25e8171 100644
--- a/sopa/utils/__init__.py
+++ b/sopa/utils/__init__.py
@@ -8,6 +8,7 @@
     get_intensities,
     add_spatial_element,
     get_transcripts_patches_dirs,
+    get_feature_key,
 )
 from .annotation import preprocess_fluo, higher_z_score, tangram_annotate
 from .image import (
diff --git a/sopa/utils/data.py b/sopa/utils/data.py
index 7d8b85f..aee9ceb 100644
--- a/sopa/utils/data.py
+++ b/sopa/utils/data.py
@@ -278,4 +278,8 @@ def blobs(
     genes = pd.Series(np.random.choice(list("abcdef"), size=len(points))).astype("category")
     points["genes"] = dd.from_pandas(genes, npartitions=points.npartitions)
 
-    return SpatialData(images={"blob_image": image}, points={"blob_transcripts": points})
+    return SpatialData(
+        images={"blob_image": image},
+        points={"blob_transcripts": points},
+        attrs={SopaAttrs.CELL_SEGMENTATION: "blob_image", SopaAttrs.TRANSCRIPTS: "blob_transcripts"},
+    )
diff --git a/sopa/utils/utils.py b/sopa/utils/utils.py
index 870bf2b..687b8a7 100644
--- a/sopa/utils/utils.py
+++ b/sopa/utils/utils.py
@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 
+import dask.dataframe as dd
 import geopandas as gpd
 import pandas as pd
 import spatialdata
@@ -17,7 +18,7 @@
 from xarray import DataArray
 
 from .. import settings
-from .._constants import SopaAttrs, SopaFiles, SopaKeys
+from .._constants import ATTRS_KEY, SopaAttrs, SopaFiles, SopaKeys
 
 log = logging.getLogger(__name__)
 
@@ -93,6 +94,18 @@ def to_intrinsic(sdata: SpatialData, element: SpatialElement | str, element_cs:
     return spatialdata.transform(element, transformation=transformation, maintain_positioning=True)
 
 
+def get_feature_key(points: dd.DataFrame, raise_error: bool = False) -> str | None:
+    """Get the feature key of a transcript dataframe"""
+    feature_key = points.attrs.get(ATTRS_KEY, {}).get("feature_key")
+
+    if raise_error and feature_key is None:
+        raise ValueError(
+            f"No gene column name found in points.attrs['{ATTRS_KEY}']['feature_key']. Provide the `gene_column` argument."
+        )
+
+    return feature_key
+
+
 def get_intensities(sdata: SpatialData) -> pd.DataFrame | None:
     """Gets the intensity dataframe of shape `n_obs x n_channels`"""
     assert SopaKeys.TABLE in sdata.tables, f"No '{SopaKeys.TABLE}' found in sdata.tables"
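
Note on the refactored aggregation API above: the sketch below shows how the new `sopa.aggregate` entry point is meant to be called, based solely on the signature and docstring in this diff. The `sdata` object is assumed to already contain an image, a transcripts dataframe, and cell boundaries from a previous segmentation step; `min_transcripts=10` is an illustrative value, not a recommended default.

import sopa

# Keys (image, transcripts, shapes) are inferred from `sdata.attrs` when not given.
sopa.aggregate(
    sdata,
    aggregate_genes=True,      # sdata["table"].X will contain gene counts per cell
    aggregate_channels=True,   # sdata["table"].obsm["intensities"] will contain mean channel intensities
    min_transcripts=10,        # example value: drop cells with fewer than 10 transcripts
)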
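
The `get_feature_key` helper introduced in `sopa/utils/utils.py` centralizes the `points.attrs[ATTRS_KEY]["feature_key"]` lookup that was previously duplicated in `overlay.py`, `converter.py`, and `_baysor.py`. A minimal sketch of the new call pattern, mirroring the `overlay_segmentation` code above:

from sopa._constants import SopaAttrs
from sopa.utils import get_feature_key, get_spatial_element

# Resolve the transcripts dataframe registered in sdata.attrs, then read its
# gene column; with raise_error=True, a missing key raises a ValueError
# instead of returning None.
points = get_spatial_element(sdata.points, key=sdata.attrs.get(SopaAttrs.TRANSCRIPTS))
gene_column = get_feature_key(points, raise_error=True)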
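
Finally, the tutorial changes above rename the combined shapes produced by `overlay_segmentation` from a `+` separator to `_overlay_`. A hedged sketch of the updated workflow, with keys and arguments taken from the `explorer.ipynb` cells in this diff:

# Overlay a custom segmentation (sdata["large_cells"]) on top of the cellpose
# boundaries; this re-aggregates the table over the merged shapes.
sopa.overlay_segmentation(sdata, shapes_key="large_cells")

# Export to the Xenium Explorer under the new combined shapes key.
sopa.io.explorer.write(
    explorer_path,
    sdata,
    shapes_key="cellpose_boundaries_overlay_large_cells",
    gene_column="genes",
    mode="-it",  # same mode flag as in the tutorial cell above
)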