diff --git a/src/analysis/selfbpe/pairwise_merge.ipynb b/src/analysis/selfbpe/pairwise_merge.ipynb index 3c8e756..7693a30 100644 --- a/src/analysis/selfbpe/pairwise_merge.ipynb +++ b/src/analysis/selfbpe/pairwise_merge.ipynb @@ -73,7 +73,7 @@ "\n", "for tutorial_name, tutorials in data.items():\n", " for t in tutorials:\n", - " calls.extend(t['relevant_API'])\n", + " calls.extend([a.strip() for a in t['relevant_API'] if a.strip()])\n", " calls.append(SENTINEL)\n", "\n", "print(len(calls))\n", @@ -135,82 +135,6 @@ "c0.most_common(10)" ] }, - { - "cell_type": "markdown", - "id": "47b1be5c-dcc8-4765-8ccb-1e0a53ec2897", - "metadata": {}, - "source": [ - "We'll merge consecutive calls that occur more than some threshold." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "866945a4-dc47-46c0-9bf9-a33423b9cddd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n", - "Merging scanpy.pp.neighbors -> scanpy.tl.umap of frequency 6 >= 5\n" - ] - } - ], - "source": [ - "def merge(tokens: [str], counts: Counter, threshold: int, verbose=False):\n", - " merged = []\n", - " i = 0\n", - " while i < len(tokens) - 1:\n", - " w1 = tokens[i]\n", - " w2 = tokens[i+1]\n", - " c = counts[(w1, w2)]\n", - " if c >= threshold:\n", - " m = '{} -> {}'.format(w1, w2)\n", - " if verbose:\n", - " print('Merging {} of frequency {} >= {}'.format(m, c, threshold))\n", - " merged.append(m)\n", - " i += 2\n", - " else:\n", - " merged.append(w1)\n", - " i += 1\n", - " if i < len(tokens):\n", - " merged.append(tokens[i])\n", - " return merged\n", - "\n", - "r1 = merge(r0, c0, threshold=5, verbose=True)" - ] - }, { "cell_type": "markdown", "id": "ab2dd9f6-9d54-4a89-9a4f-3515e1f1af27", @@ -229,122 +153,42 @@ "name": "stdout", "output_type": "stream", "text": [ - "---------- Round 1 ----------\n", - "Top 5 pairs\n", - "('scanpy.tl.rank_genes_groups', 'scanpy.pl.rank_genes_groups') occurred 8 times\n", - "('scanpy.pp.pca', 'scanpy.pp.neighbors') occurred 6 times\n", - "('scanpy.pp.neighbors', 'scanpy.tl.umap') occurred 6 times\n", - "('scanpy.pl.umap', 'scanpy.pl.diffmap') occurred 6 times\n", - "('scanpy.pp.filter_cells', 'scanpy.pp.filter_genes') occurred 5 times\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n", - "Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n", - "Merging scanpy.tl.leiden -> scanpy.pl.umap of frequency 4 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n", - "Merging scanpy.tl.leiden -> scanpy.pl.umap of frequency 4 >= 3\n", - "Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n", - "Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n", - "Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n", - "Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n", - "Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n", - "Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n", - "Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n", - "Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n", - "Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n", - "Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n", - "Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n", - "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n", - "Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n", - "Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n", - "Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n", - "Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n", - "Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n", - "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n", - "Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.tl.umap -> scanpy.tl.leiden of frequency 3 >= 3\n", - "Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n", - "Merging scanpy.tl.umap -> scanpy.tl.leiden of frequency 3 >= 3\n", - "Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n", - "Merging scanpy.pp.neighbors -> scanpy.tl.umap of frequency 6 >= 3\n", - "---------- Round 2 ----------\n", - "Top 5 pairs\n", - "('scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups', 'scanpy.pl.rank_genes_groups_violin') occurred 4 times\n", - "('scanpy.pp.pca -> scanpy.pp.neighbors', 'scanpy.tl.umap') occurred 3 times\n", - "('scanpy.tl.dpt -> scanpy.pl.umap', 'scanpy.pl.diffmap') occurred 3 times\n", - "('scanpy.logging.print_versions', 'scanpy.settings.set_figure_params') occurred 2 times\n", - "('scanpy.tl.umap', 'scanpy.pl.umap') occurred 2 times\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n", - "Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n", - "Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n", - "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n", - "---------- Round 3 ----------\n", - "Top 5 pairs\n", - "('scanpy.logging.print_versions', 'scanpy.settings.set_figure_params') occurred 2 times\n", - "('scanpy.external.pp.bbknn', 'scanpy.tl.umap') occurred 2 times\n", - "('scanpy.tl.draw_graph', 'scanpy.pl.draw_graph') occurred 2 times\n", - "('scanpy.logging.print_header', 'scanpy.settings.set_figure_params') occurred 2 times\n", - "('scanpy.pp.calculate_qc_metrics', 'scanpy.pl.violin') occurred 2 times\n", - "Stopping after Round 3\n" + "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of freq 8 >= 3\n", + "Merging scanpy.pp.pca -> scanpy.pp.neighbors of freq 6 >= 3\n", + "Merging scanpy.pl.umap -> scanpy.pl.diffmap of freq 6 >= 3\n", + "Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of freq 5 >= 3\n", + "Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of freq 5 >= 3\n", + "Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of freq 5 >= 3\n", + "Merging scanpy.tl.leiden -> scanpy.pl.umap of freq 4 >= 3\n", + "Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of freq 4 >= 3\n", + "Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of freq 4 >= 3\n", + "Merging scanpy.read -> scanpy.tl.louvain of freq 4 >= 3\n", + "Merging scanpy.tl.paga -> scanpy.pl.paga_compare of freq 4 >= 3\n", + "Merging scanpy.tl.pca -> scanpy.pp.neighbors of freq 4 >= 3\n", + "Merging scanpy.tl.paga -> scanpy.pl.paga of freq 3 >= 3\n", + "Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of freq 3 >= 3\n", + "Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of freq 3 >= 3\n", + "Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of freq 3 >= 3\n" ] } ], "source": [ "def iterative_merge(tokens, threshold, verbose=False):\n", - " before = tokens[:]\n", + " tokens = tokens.copy()\n", " rounds = 0\n", " while True:\n", - " rounds += 1\n", - " counts = count_bigrams(before)\n", - " if verbose:\n", - " print('-' * 10, 'Round {}'.format(rounds), '-' * 10)\n", - " print('Top 5 pairs')\n", - " for k, v in counts.most_common(5):\n", - " print('{} occurred {} times'.format(k, v))\n", - " after = merge(before, counts, threshold=threshold, verbose=verbose)\n", - " if len(after) == len(before):\n", + " counts = count_bigrams(tokens)\n", + " (w1, w2), v = counts.most_common(1)[0]\n", + " if v < threshold:\n", " break\n", - " before = after\n", - " if verbose:\n", - " print('Stopping after Round {}'.format(rounds))\n", - " return after\n", + " cat = '___'.join(tokens)\n", + " before = '{}___{}'.format(w1, w2)\n", + " after = '{} -> {}'.format(w1, w2)\n", + " cat = cat.replace(before, after)\n", + " tokens = cat.split('___')\n", + " if verbose:\n", + " print('Merging {} of freq {} >= {}'.format(after, v, threshold))\n", + " return tokens\n", "\n", "merged = iterative_merge(calls, threshold=3, verbose=True)" ] @@ -1373,8 +1217,8 @@ "\n", "AFTER\n", "\n", - "scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n", - "scanpy.pl.rank_genes_groups\n", + "scanpy.pl.umap\n", + "scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups\n", "\n", "---------- Tutorial 84 ----------\n", "BEFORE\n", @@ -1442,8 +1286,7 @@ "\n", "AFTER\n", "\n", - "scanpy.tl.paga\n", - "scanpy.pl.paga_path\n", + "scanpy.tl.paga -> scanpy.pl.paga_path\n", "\n", "---------- Tutorial 90 ----------\n", "BEFORE\n", @@ -1552,8 +1395,8 @@ "\n", "AFTER\n", "\n", - "scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n", - "scanpy.pl.rank_genes_groups\n", + "scanpy.pl.umap\n", + "scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups\n", "\n", "---------- Tutorial 100 ----------\n", "BEFORE\n", @@ -1846,7 +1689,8 @@ "\n", "AFTER\n", "\n", - "scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n", + "scanpy.pl.umap\n", + "scanpy.tl.rank_genes_groups\n", "\n", "---------- Tutorial 126 ----------\n", "BEFORE\n", @@ -2165,7 +2009,8 @@ "\n", "AFTER\n", "\n", - "scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n", + "scanpy.pl.umap\n", + "scanpy.tl.rank_genes_groups\n", "\n", "---------- Tutorial 154 ----------\n", "BEFORE\n", @@ -2312,9 +2157,8 @@ "\n", "scanpy.pp.normalize_total -> scanpy.pp.log1p\n", "scanpy.pp.highly_variable_genes\n", - "scanpy.pp.pca -> scanpy.pp.neighbors\n", - "scanpy.tl.umap -> scanpy.tl.leiden\n", - "scanpy.pl.umap\n", + "scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap\n", + "scanpy.tl.leiden -> scanpy.pl.umap\n", "\n", "---------- Tutorial 166 ----------\n", "BEFORE\n", @@ -2336,8 +2180,7 @@ "\n", "AFTER\n", "\n", - "scanpy.tl.rank_genes_groups\n", - "scanpy.pl.rank_genes_groups_heatmap\n", + "scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups_heatmap\n", "scanpy.pl.spatial\n", "\n", "---------- Tutorial 168 ----------\n", @@ -2376,9 +2219,8 @@ "AFTER\n", "\n", "scanpy.pp.normalize_per_cell -> scanpy.pp.log1p\n", - "scanpy.pp.pca -> scanpy.pp.neighbors\n", - "scanpy.tl.umap -> scanpy.tl.leiden\n", - "scanpy.pl.umap\n", + "scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap\n", + "scanpy.tl.leiden -> scanpy.pl.umap\n", "scanpy.pl.embedding\n", "\n", "---------- Tutorial 171 ----------\n", @@ -2422,7 +2264,8 @@ "AFTER\n", "\n", "scanpy.concat\n", - "scanpy.pp.neighbors -> scanpy.tl.umap\n", + "scanpy.pp.neighbors\n", + "scanpy.tl.umap\n", "scanpy.tl.leiden\n", "\n", "---------- Tutorial 174 ----------\n",