Skip to content

Commit

Permalink
priority merge
Browse files Browse the repository at this point in the history
  • Loading branch information
vzhong committed Feb 2, 2024
1 parent ad6b189 commit e1a4fd3
Showing 1 changed file with 45 additions and 202 deletions.
247 changes: 45 additions & 202 deletions src/analysis/selfbpe/pairwise_merge.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"\n",
"for tutorial_name, tutorials in data.items():\n",
" for t in tutorials:\n",
" calls.extend(t['relevant_API'])\n",
" calls.extend([a.strip() for a in t['relevant_API'] if a.strip()])\n",
" calls.append(SENTINEL)\n",
"\n",
"print(len(calls))\n",
Expand Down Expand Up @@ -135,82 +135,6 @@
"c0.most_common(10)"
]
},
{
"cell_type": "markdown",
"id": "47b1be5c-dcc8-4765-8ccb-1e0a53ec2897",
"metadata": {},
"source": [
"We'll merge consecutive calls that occur more than some threshold."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "866945a4-dc47-46c0-9bf9-a33423b9cddd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 5\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5\n",
"Merging scanpy.pp.neighbors -> scanpy.tl.umap of frequency 6 >= 5\n"
]
}
],
"source": [
"def merge(tokens: [str], counts: Counter, threshold: int, verbose=False):\n",
" merged = []\n",
" i = 0\n",
" while i < len(tokens) - 1:\n",
" w1 = tokens[i]\n",
" w2 = tokens[i+1]\n",
" c = counts[(w1, w2)]\n",
" if c >= threshold:\n",
" m = '{} -> {}'.format(w1, w2)\n",
" if verbose:\n",
" print('Merging {} of frequency {} >= {}'.format(m, c, threshold))\n",
" merged.append(m)\n",
" i += 2\n",
" else:\n",
" merged.append(w1)\n",
" i += 1\n",
" if i < len(tokens):\n",
" merged.append(tokens[i])\n",
" return merged\n",
"\n",
"r1 = merge(r0, c0, threshold=5, verbose=True)"
]
},
{
"cell_type": "markdown",
"id": "ab2dd9f6-9d54-4a89-9a4f-3515e1f1af27",
Expand All @@ -229,122 +153,42 @@
"name": "stdout",
"output_type": "stream",
"text": [
"---------- Round 1 ----------\n",
"Top 5 pairs\n",
"('scanpy.tl.rank_genes_groups', 'scanpy.pl.rank_genes_groups') occurred 8 times\n",
"('scanpy.pp.pca', 'scanpy.pp.neighbors') occurred 6 times\n",
"('scanpy.pp.neighbors', 'scanpy.tl.umap') occurred 6 times\n",
"('scanpy.pl.umap', 'scanpy.pl.diffmap') occurred 6 times\n",
"('scanpy.pp.filter_cells', 'scanpy.pp.filter_genes') occurred 5 times\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n",
"Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n",
"Merging scanpy.tl.leiden -> scanpy.pl.umap of frequency 4 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n",
"Merging scanpy.tl.leiden -> scanpy.pl.umap of frequency 4 >= 3\n",
"Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3\n",
"Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n",
"Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n",
"Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n",
"Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n",
"Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n",
"Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n",
"Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n",
"Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n",
"Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n",
"Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of frequency 6 >= 3\n",
"Merging scanpy.read -> scanpy.tl.louvain of frequency 4 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.tl.rank_genes_groups of frequency 4 >= 3\n",
"Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of frequency 3 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga_compare of frequency 4 >= 3\n",
"Merging scanpy.tl.pca -> scanpy.pp.neighbors of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3\n",
"Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.tl.umap -> scanpy.tl.leiden of frequency 3 >= 3\n",
"Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of frequency 4 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3\n",
"Merging scanpy.tl.umap -> scanpy.tl.leiden of frequency 3 >= 3\n",
"Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3\n",
"Merging scanpy.pp.neighbors -> scanpy.tl.umap of frequency 6 >= 3\n",
"---------- Round 2 ----------\n",
"Top 5 pairs\n",
"('scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups', 'scanpy.pl.rank_genes_groups_violin') occurred 4 times\n",
"('scanpy.pp.pca -> scanpy.pp.neighbors', 'scanpy.tl.umap') occurred 3 times\n",
"('scanpy.tl.dpt -> scanpy.pl.umap', 'scanpy.pl.diffmap') occurred 3 times\n",
"('scanpy.logging.print_versions', 'scanpy.settings.set_figure_params') occurred 2 times\n",
"('scanpy.tl.umap', 'scanpy.pl.umap') occurred 2 times\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of frequency 3 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of frequency 4 >= 3\n",
"---------- Round 3 ----------\n",
"Top 5 pairs\n",
"('scanpy.logging.print_versions', 'scanpy.settings.set_figure_params') occurred 2 times\n",
"('scanpy.external.pp.bbknn', 'scanpy.tl.umap') occurred 2 times\n",
"('scanpy.tl.draw_graph', 'scanpy.pl.draw_graph') occurred 2 times\n",
"('scanpy.logging.print_header', 'scanpy.settings.set_figure_params') occurred 2 times\n",
"('scanpy.pp.calculate_qc_metrics', 'scanpy.pl.violin') occurred 2 times\n",
"Stopping after Round 3\n"
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of freq 8 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors of freq 6 >= 3\n",
"Merging scanpy.pl.umap -> scanpy.pl.diffmap of freq 6 >= 3\n",
"Merging scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap of freq 5 >= 3\n",
"Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of freq 5 >= 3\n",
"Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of freq 5 >= 3\n",
"Merging scanpy.tl.leiden -> scanpy.pl.umap of freq 4 >= 3\n",
"Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups -> scanpy.pl.rank_genes_groups_violin of freq 4 >= 3\n",
"Merging scanpy.pp.normalize_per_cell -> scanpy.pp.log1p of freq 4 >= 3\n",
"Merging scanpy.read -> scanpy.tl.louvain of freq 4 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga_compare of freq 4 >= 3\n",
"Merging scanpy.tl.pca -> scanpy.pp.neighbors of freq 4 >= 3\n",
"Merging scanpy.tl.paga -> scanpy.pl.paga of freq 3 >= 3\n",
"Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of freq 3 >= 3\n",
"Merging scanpy.pl.stacked_violin -> scanpy.pl.matrixplot of freq 3 >= 3\n",
"Merging scanpy.tl.dpt -> scanpy.pl.umap -> scanpy.pl.diffmap of freq 3 >= 3\n"
]
}
],
"source": [
"def iterative_merge(tokens, threshold, verbose=False):\n",
" before = tokens[:]\n",
" tokens = tokens.copy()\n",
" rounds = 0\n",
" while True:\n",
" rounds += 1\n",
" counts = count_bigrams(before)\n",
" if verbose:\n",
" print('-' * 10, 'Round {}'.format(rounds), '-' * 10)\n",
" print('Top 5 pairs')\n",
" for k, v in counts.most_common(5):\n",
" print('{} occurred {} times'.format(k, v))\n",
" after = merge(before, counts, threshold=threshold, verbose=verbose)\n",
" if len(after) == len(before):\n",
" counts = count_bigrams(tokens)\n",
" (w1, w2), v = counts.most_common(1)[0]\n",
" if v < threshold:\n",
" break\n",
" before = after\n",
" if verbose:\n",
" print('Stopping after Round {}'.format(rounds))\n",
" return after\n",
" cat = '___'.join(tokens)\n",
" before = '{}___{}'.format(w1, w2)\n",
" after = '{} -> {}'.format(w1, w2)\n",
" cat = cat.replace(before, after)\n",
" tokens = cat.split('___')\n",
" if verbose:\n",
" print('Merging {} of freq {} >= {}'.format(after, v, threshold))\n",
" return tokens\n",
"\n",
"merged = iterative_merge(calls, threshold=3, verbose=True)"
]
Expand Down Expand Up @@ -1373,8 +1217,8 @@
"\n",
"AFTER\n",
"\n",
"scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n",
"scanpy.pl.rank_genes_groups\n",
"scanpy.pl.umap\n",
"scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups\n",
"\n",
"---------- Tutorial 84 ----------\n",
"BEFORE\n",
Expand Down Expand Up @@ -1442,8 +1286,7 @@
"\n",
"AFTER\n",
"\n",
"scanpy.tl.paga\n",
"scanpy.pl.paga_path\n",
"scanpy.tl.paga -> scanpy.pl.paga_path\n",
"\n",
"---------- Tutorial 90 ----------\n",
"BEFORE\n",
Expand Down Expand Up @@ -1552,8 +1395,8 @@
"\n",
"AFTER\n",
"\n",
"scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n",
"scanpy.pl.rank_genes_groups\n",
"scanpy.pl.umap\n",
"scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups\n",
"\n",
"---------- Tutorial 100 ----------\n",
"BEFORE\n",
Expand Down Expand Up @@ -1846,7 +1689,8 @@
"\n",
"AFTER\n",
"\n",
"scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n",
"scanpy.pl.umap\n",
"scanpy.tl.rank_genes_groups\n",
"\n",
"---------- Tutorial 126 ----------\n",
"BEFORE\n",
Expand Down Expand Up @@ -2165,7 +2009,8 @@
"\n",
"AFTER\n",
"\n",
"scanpy.pl.umap -> scanpy.tl.rank_genes_groups\n",
"scanpy.pl.umap\n",
"scanpy.tl.rank_genes_groups\n",
"\n",
"---------- Tutorial 154 ----------\n",
"BEFORE\n",
Expand Down Expand Up @@ -2312,9 +2157,8 @@
"\n",
"scanpy.pp.normalize_total -> scanpy.pp.log1p\n",
"scanpy.pp.highly_variable_genes\n",
"scanpy.pp.pca -> scanpy.pp.neighbors\n",
"scanpy.tl.umap -> scanpy.tl.leiden\n",
"scanpy.pl.umap\n",
"scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap\n",
"scanpy.tl.leiden -> scanpy.pl.umap\n",
"\n",
"---------- Tutorial 166 ----------\n",
"BEFORE\n",
Expand All @@ -2336,8 +2180,7 @@
"\n",
"AFTER\n",
"\n",
"scanpy.tl.rank_genes_groups\n",
"scanpy.pl.rank_genes_groups_heatmap\n",
"scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups_heatmap\n",
"scanpy.pl.spatial\n",
"\n",
"---------- Tutorial 168 ----------\n",
Expand Down Expand Up @@ -2376,9 +2219,8 @@
"AFTER\n",
"\n",
"scanpy.pp.normalize_per_cell -> scanpy.pp.log1p\n",
"scanpy.pp.pca -> scanpy.pp.neighbors\n",
"scanpy.tl.umap -> scanpy.tl.leiden\n",
"scanpy.pl.umap\n",
"scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap\n",
"scanpy.tl.leiden -> scanpy.pl.umap\n",
"scanpy.pl.embedding\n",
"\n",
"---------- Tutorial 171 ----------\n",
Expand Down Expand Up @@ -2422,7 +2264,8 @@
"AFTER\n",
"\n",
"scanpy.concat\n",
"scanpy.pp.neighbors -> scanpy.tl.umap\n",
"scanpy.pp.neighbors\n",
"scanpy.tl.umap\n",
"scanpy.tl.leiden\n",
"\n",
"---------- Tutorial 174 ----------\n",
Expand Down

0 comments on commit e1a4fd3

Please sign in to comment.