From 14b6ee9b0449efe4b6829ecc955e1c2b19d2627f Mon Sep 17 00:00:00 2001 From: Jean Neiverth Date: Tue, 12 Nov 2024 13:19:11 -0300 Subject: [PATCH] quickfix: rename notebooks to correct numerical order --- notebooks/07-remove-bot-or-not-noises.ipynb | 199 ------ ...ynb => 08-separate-sybils-in-groups.ipynb} | 0 notebooks/09-remove-bot-or-not-noises.ipynb | 594 ++++++++++++++++++ 3 files changed, 594 insertions(+), 199 deletions(-) delete mode 100644 notebooks/07-remove-bot-or-not-noises.ipynb rename notebooks/{07-separate-sybils-in-groups.ipynb => 08-separate-sybils-in-groups.ipynb} (100%) create mode 100644 notebooks/09-remove-bot-or-not-noises.ipynb diff --git a/notebooks/07-remove-bot-or-not-noises.ipynb b/notebooks/07-remove-bot-or-not-noises.ipynb deleted file mode 100644 index 520aef5..0000000 --- a/notebooks/07-remove-bot-or-not-noises.ipynb +++ /dev/null @@ -1,199 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Remove bot-or-not noises\n", - "\n", - "### Used files\n", - "- bot_or_not_without_info\n", - "- sybilscar_results\n", - "\n", - "### Summary:\n", - "1. Load necessary data\n", - "2. Apply logic to add new column \"is_noisy\"\n", - "3. Check bot label changes from sybilscar\n", - "4. Save new bot_or_not_without_noises" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. 
Load necessary data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import polars as pl\n", - "import os\n", - "pl.Config.set_fmt_str_lengths(400)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_PATH = os.getenv(\"DATA_PATH\", \"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bot_or_not = pl.read_parquet(f\"{DATA_PATH}/interim/bot_or_not_without_info.parquet\")\n", - "bot_or_not\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sybilscar_result = pl.read_parquet(f\"{DATA_PATH}/../farcaster-social-graph-api/farcaster_social_graph_api/data/sybil_scar_results.parquet\")\n", - "sybilscar_result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fnames = pl.read_parquet(f\"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet\")\n", - "last_fnames = fnames[[\"fid\",\"updated_at\"]].group_by(\"fid\").max()\n", - "last_fnames = last_fnames.join(fnames,on=[\"fid\",\"updated_at\"],how=\"left\",coalesce=True)[[\"fid\",\"fname\"]]\n", - "# will be used in \"3. Check bot label changes from sybilscar\"\n", - "last_fnames" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. 
Apply logic to add new column \"is_noisy\"\n", - "\n", - "For now, we are considering a sample noisy if sybil scar result (threshold p < 0.5) is different than bot_or_not\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = bot_or_not.join(sybilscar_result,on=\"fid\",coalesce=True,how=\"left\")\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check that there are indexes in bot_or_not that are outside the sybilscar result\n", - "df.filter(pl.col(\"posterior\").is_null())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.with_columns([\n", - " pl.when(pl.col(\"posterior\").is_null())\n", - " .then(pl.col(\"bot\"))\n", - " .otherwise(pl.col(\"bot\") != (pl.col(\"posterior\") < 0.5 ))\n", - " .alias(\"is_noisy\")\n", - "])\n", - "\n", - "display(df)\n", - "print(\"number of noisy elements: \",df[\"is_noisy\"].sum())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Check bot label changes from sybilscar" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bot_or_not_with_fnames = df.join(last_fnames[[\"fid\",\"fname\"]],on=\"fid\",how=\"left\", coalesce=True)\n", - "bot_or_not_with_fnames.filter(pl.col(\"is_noisy\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After manual inspection of the changed labels (noisy values), it is possible to check that ~70% of the changes make sense" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. 
Save new bot_or_not_without_noises" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Filter and remove unnecessary columns\n", - "bot_or_not_without_noises = df.filter(~pl.col(\"is_noisy\"))[[\"fid\",\"bot\"]]\n", - "bot_or_not_without_noises" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bot_or_not_without_noises.write_parquet(f\"{DATA_PATH}/interim/bot_or_not_without_noises.parquet\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "farcaster-social-graph-notebooks-_gupmy54-py3.13", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/07-separate-sybils-in-groups.ipynb b/notebooks/08-separate-sybils-in-groups.ipynb similarity index 100% rename from notebooks/07-separate-sybils-in-groups.ipynb rename to notebooks/08-separate-sybils-in-groups.ipynb diff --git a/notebooks/09-remove-bot-or-not-noises.ipynb b/notebooks/09-remove-bot-or-not-noises.ipynb new file mode 100644 index 0000000..a1c7fc7 --- /dev/null +++ b/notebooks/09-remove-bot-or-not-noises.ipynb @@ -0,0 +1,594 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Remove bot-or-not noises\n", + "\n", + "### Used files\n", + "- bot_or_not_without_info\n", + "- sybilscar_results\n", + "\n", + "### Summary:\n", + "1. Load necessary data\n", + "2. Apply logic to add new column \"is_noisy\"\n", + "3. Check bot label changes from sybilscar\n", + "4. Save new bot_or_not_without_noises" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Load necessary data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "polars.config.Config" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "import os\n", + "pl.Config.set_fmt_str_lengths(400)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_PATH = os.getenv(\"DATA_PATH\", \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (12_065, 2)
fidbot
i64bool
446097false
3false
8false
12false
2false
327500true
428200true
469138false
278549true
446821false
" + ], + "text/plain": [ + "shape: (12_065, 2)\n", + "┌────────┬───────┐\n", + "│ fid ┆ bot │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ bool │\n", + "╞════════╪═══════╡\n", + "│ 446097 ┆ false │\n", + "│ 3 ┆ false │\n", + "│ 8 ┆ false │\n", + "│ 12 ┆ false │\n", + "│ 2 ┆ false │\n", + "│ … ┆ … │\n", + "│ 327500 ┆ true │\n", + "│ 428200 ┆ true │\n", + "│ 469138 ┆ false │\n", + "│ 278549 ┆ true │\n", + "│ 446821 ┆ false │\n", + "└────────┴───────┘" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bot_or_not = pl.read_parquet(f\"{DATA_PATH}/interim/bot_or_not_without_info.parquet\")\n", + "bot_or_not\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (375_366, 3)
fid_indexposteriorfid
i64f64i64
1983060.0362936
470550.0690195
3268430.0551357
1201890.0429013
2973870.344896818125
1007251.0466914
162590.7863574
1284031.0720296
612380.0727956
3009220.7721503
" + ], + "text/plain": [ + "shape: (375_366, 3)\n", + "┌───────────┬───────────┬────────┐\n", + "│ fid_index ┆ posterior ┆ fid │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ f64 ┆ i64 │\n", + "╞═══════════╪═══════════╪════════╡\n", + "│ 198306 ┆ 0.0 ┆ 362936 │\n", + "│ 47055 ┆ 0.0 ┆ 690195 │\n", + "│ 326843 ┆ 0.0 ┆ 551357 │\n", + "│ 120189 ┆ 0.0 ┆ 429013 │\n", + "│ 297387 ┆ 0.344896 ┆ 818125 │\n", + "│ … ┆ … ┆ … │\n", + "│ 100725 ┆ 1.0 ┆ 466914 │\n", + "│ 16259 ┆ 0.7 ┆ 863574 │\n", + "│ 128403 ┆ 1.0 ┆ 720296 │\n", + "│ 61238 ┆ 0.0 ┆ 727956 │\n", + "│ 300922 ┆ 0.7 ┆ 721503 │\n", + "└───────────┴───────────┴────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sybilscar_result = pl.read_parquet(f\"{DATA_PATH}/../farcaster-social-graph-api/farcaster_social_graph_api/data/sybil_scar_results.parquet\")\n", + "sybilscar_result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (682_489, 2)
fidfname
i64str
606810"webfan"
291006"elawgrrl"
863985"hardiewalingvo"
481618"ericnam"
847339"maria0425"
339354"rakos"
836647"americans"
860644"dogavehayat"
492446"fainiguez"
728363"simpleearwig"
" + ], + "text/plain": [ + "shape: (682_489, 2)\n", + "┌────────┬────────────────┐\n", + "│ fid ┆ fname │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞════════╪════════════════╡\n", + "│ 606810 ┆ webfan │\n", + "│ 291006 ┆ elawgrrl │\n", + "│ 863985 ┆ hardiewalingvo │\n", + "│ 481618 ┆ ericnam │\n", + "│ 847339 ┆ maria0425 │\n", + "│ … ┆ … │\n", + "│ 339354 ┆ rakos │\n", + "│ 836647 ┆ americans │\n", + "│ 860644 ┆ dogavehayat │\n", + "│ 492446 ┆ fainiguez │\n", + "│ 728363 ┆ simpleearwig │\n", + "└────────┴────────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fnames = pl.read_parquet(f\"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet\")\n", + "last_fnames = fnames[[\"fid\",\"updated_at\"]].group_by(\"fid\").max()\n", + "last_fnames = last_fnames.join(fnames,on=[\"fid\",\"updated_at\"],how=\"left\",coalesce=True)[[\"fid\",\"fname\"]]\n", + "# will be used in \"3. Check bot label changes from sybilscar\"\n", + "last_fnames" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Apply logic to add new column \"is_noisy\"\n", + "\n", + "For now, we are considering a sample noisy if sybil scar result (threshold p < 0.5) is different than bot_or_not\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (12_065, 4)
fidbotfid_indexposterior
i64booli64f64
446097false1639751.0
3false81291.0
8false2558721.0
12false434931.0
2false2483401.0
327500true1699660.0
428200true723880.0
469138false1058410.0
278549true223770.0
446821false2103281.0
" + ], + "text/plain": [ + "shape: (12_065, 4)\n", + "┌────────┬───────┬───────────┬───────────┐\n", + "│ fid ┆ bot ┆ fid_index ┆ posterior │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ i64 ┆ f64 │\n", + "╞════════╪═══════╪═══════════╪═══════════╡\n", + "│ 446097 ┆ false ┆ 163975 ┆ 1.0 │\n", + "│ 3 ┆ false ┆ 8129 ┆ 1.0 │\n", + "│ 8 ┆ false ┆ 255872 ┆ 1.0 │\n", + "│ 12 ┆ false ┆ 43493 ┆ 1.0 │\n", + "│ 2 ┆ false ┆ 248340 ┆ 1.0 │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ 327500 ┆ true ┆ 169966 ┆ 0.0 │\n", + "│ 428200 ┆ true ┆ 72388 ┆ 0.0 │\n", + "│ 469138 ┆ false ┆ 105841 ┆ 0.0 │\n", + "│ 278549 ┆ true ┆ 22377 ┆ 0.0 │\n", + "│ 446821 ┆ false ┆ 210328 ┆ 1.0 │\n", + "└────────┴───────┴───────────┴───────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bot_or_not.join(sybilscar_result,on=\"fid\",coalesce=True,how=\"left\")\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (79, 4)
fidbotfid_indexposterior
i64booli64f64
2348falsenullnull
12144falsenullnull
12775falsenullnull
191322falsenullnull
194515falsenullnull
854040falsenullnull
854041falsenullnull
854043falsenullnull
854923falsenullnull
856646falsenullnull
" + ], + "text/plain": [ + "shape: (79, 4)\n", + "┌────────┬───────┬───────────┬───────────┐\n", + "│ fid ┆ bot ┆ fid_index ┆ posterior │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ i64 ┆ f64 │\n", + "╞════════╪═══════╪═══════════╪═══════════╡\n", + "│ 2348 ┆ false ┆ null ┆ null │\n", + "│ 12144 ┆ false ┆ null ┆ null │\n", + "│ 12775 ┆ false ┆ null ┆ null │\n", + "│ 191322 ┆ false ┆ null ┆ null │\n", + "│ 194515 ┆ false ┆ null ┆ null │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ 854040 ┆ false ┆ null ┆ null │\n", + "│ 854041 ┆ false ┆ null ┆ null │\n", + "│ 854043 ┆ false ┆ null ┆ null │\n", + "│ 854923 ┆ false ┆ null ┆ null │\n", + "│ 856646 ┆ false ┆ null ┆ null │\n", + "└────────┴───────┴───────────┴───────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check that there are indexes in bot_or_not that are outside the sybilscar result\n", + "df.filter(pl.col(\"posterior\").is_null())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (12_065, 5)
fidbotfid_indexposterioris_noisy
i64booli64f64bool
446097false1639751.0false
3false81291.0false
8false2558721.0false
12false434931.0false
2false2483401.0false
327500true1699660.0false
428200true723880.0false
469138false1058410.0true
278549true223770.0false
446821false2103281.0false
" + ], + "text/plain": [ + "shape: (12_065, 5)\n", + "┌────────┬───────┬───────────┬───────────┬──────────┐\n", + "│ fid ┆ bot ┆ fid_index ┆ posterior ┆ is_noisy │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ i64 ┆ f64 ┆ bool │\n", + "╞════════╪═══════╪═══════════╪═══════════╪══════════╡\n", + "│ 446097 ┆ false ┆ 163975 ┆ 1.0 ┆ false │\n", + "│ 3 ┆ false ┆ 8129 ┆ 1.0 ┆ false │\n", + "│ 8 ┆ false ┆ 255872 ┆ 1.0 ┆ false │\n", + "│ 12 ┆ false ┆ 43493 ┆ 1.0 ┆ false │\n", + "│ 2 ┆ false ┆ 248340 ┆ 1.0 ┆ false │\n", + "│ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 327500 ┆ true ┆ 169966 ┆ 0.0 ┆ false │\n", + "│ 428200 ┆ true ┆ 72388 ┆ 0.0 ┆ false │\n", + "│ 469138 ┆ false ┆ 105841 ┆ 0.0 ┆ true │\n", + "│ 278549 ┆ true ┆ 22377 ┆ 0.0 ┆ false │\n", + "│ 446821 ┆ false ┆ 210328 ┆ 1.0 ┆ false │\n", + "└────────┴───────┴───────────┴───────────┴──────────┘" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of noisy elements: 3946\n" + ] + } + ], + "source": [ + "df = df.with_columns([\n", + " pl.when(pl.col(\"posterior\").is_null())\n", + " .then(pl.col(\"bot\"))\n", + " .otherwise(pl.col(\"bot\") != (pl.col(\"posterior\") < 0.5 ))\n", + " .alias(\"is_noisy\")\n", + "])\n", + "\n", + "display(df)\n", + "print(\"number of noisy elements: \",df[\"is_noisy\"].sum())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Check bot label changes from sybilscar" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (3_946, 6)
fidbotfid_indexposterioris_noisyfname
i64booli64f64boolstr
1731true1498461.0true"fayiz"
1771true3059791.0true"ruslan"
2183true482531.0true"djo"
2247false2725360.0true"papeclaus"
2278false922650.0true"versadchikov"
390605false3678760.0true"siatoshi"
810027false3502220.0true"naqu"
287794true524601.0true"jenny1"
423036true2836871.0true"sheva7.eth"
469138false1058410.0true"noormuhammad"
" + ], + "text/plain": [ + "shape: (3_946, 6)\n", + "┌────────┬───────┬───────────┬───────────┬──────────┬──────────────┐\n", + "│ fid ┆ bot ┆ fid_index ┆ posterior ┆ is_noisy ┆ fname │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ i64 ┆ f64 ┆ bool ┆ str │\n", + "╞════════╪═══════╪═══════════╪═══════════╪══════════╪══════════════╡\n", + "│ 1731 ┆ true ┆ 149846 ┆ 1.0 ┆ true ┆ fayiz │\n", + "│ 1771 ┆ true ┆ 305979 ┆ 1.0 ┆ true ┆ ruslan │\n", + "│ 2183 ┆ true ┆ 48253 ┆ 1.0 ┆ true ┆ djo │\n", + "│ 2247 ┆ false ┆ 272536 ┆ 0.0 ┆ true ┆ papeclaus │\n", + "│ 2278 ┆ false ┆ 92265 ┆ 0.0 ┆ true ┆ versadchikov │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 390605 ┆ false ┆ 367876 ┆ 0.0 ┆ true ┆ siatoshi │\n", + "│ 810027 ┆ false ┆ 350222 ┆ 0.0 ┆ true ┆ naqu │\n", + "│ 287794 ┆ true ┆ 52460 ┆ 1.0 ┆ true ┆ jenny1 │\n", + "│ 423036 ┆ true ┆ 283687 ┆ 1.0 ┆ true ┆ sheva7.eth │\n", + "│ 469138 ┆ false ┆ 105841 ┆ 0.0 ┆ true ┆ noormuhammad │\n", + "└────────┴───────┴───────────┴───────────┴──────────┴──────────────┘" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bot_or_not_with_fnames = df.join(last_fnames[[\"fid\",\"fname\"]],on=\"fid\",how=\"left\", coalesce=True)\n", + "bot_or_not_with_fnames.filter(pl.col(\"is_noisy\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 6)
fidbotfid_indexposterioris_noisyfname
i64booli64f64boolstr
415001false101150.0true"parviz8998"
826255false1780040.0true"austilicious123"
472997true1619831.0true"jinkyo"
843895false1264760.0true"escalord92"
473155false1999300.0true"amircyber"
324605false883630.0true"babaika.eth"
513102false3115580.0true"zach19"
2864true3516491.0true"launch"
507710false3056210.0true"cryptobeauty"
322511false3052190.0true"lukichka"
" + ], + "text/plain": [ + "shape: (10, 6)\n", + "┌────────┬───────┬───────────┬───────────┬──────────┬─────────────────┐\n", + "│ fid ┆ bot ┆ fid_index ┆ posterior ┆ is_noisy ┆ fname │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ bool ┆ i64 ┆ f64 ┆ bool ┆ str │\n", + "╞════════╪═══════╪═══════════╪═══════════╪══════════╪═════════════════╡\n", + "│ 415001 ┆ false ┆ 10115 ┆ 0.0 ┆ true ┆ parviz8998 │\n", + "│ 826255 ┆ false ┆ 178004 ┆ 0.0 ┆ true ┆ austilicious123 │\n", + "│ 472997 ┆ true ┆ 161983 ┆ 1.0 ┆ true ┆ jinkyo │\n", + "│ 843895 ┆ false ┆ 126476 ┆ 0.0 ┆ true ┆ escalord92 │\n", + "│ 473155 ┆ false ┆ 199930 ┆ 0.0 ┆ true ┆ amircyber │\n", + "│ 324605 ┆ false ┆ 88363 ┆ 0.0 ┆ true ┆ babaika.eth │\n", + "│ 513102 ┆ false ┆ 311558 ┆ 0.0 ┆ true ┆ zach19 │\n", + "│ 2864 ┆ true ┆ 351649 ┆ 1.0 ┆ true ┆ launch │\n", + "│ 507710 ┆ false ┆ 305621 ┆ 0.0 ┆ true ┆ cryptobeauty │\n", + "│ 322511 ┆ false ┆ 305219 ┆ 0.0 ┆ true ┆ lukichka │\n", + "└────────┴───────┴───────────┴───────────┴──────────┴─────────────────┘" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bot_or_not_with_fnames.filter(pl.col(\"is_noisy\")).sample(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| **fname** | **Bot or Not label** | **SybilSCAR label** | **inspection result** |\n", + "|-----------------|------------------|-----------------|-------------------|\n", + "| fayiz | bot | human | human |\n", + "| ruslan | bot | human | bot |\n", + "| djo | bot | human | bot |\n", + "| papeclaus | human | bot | bot |\n", + "| versadchikov | human | bot | bot |\n", + "| siatoshi | human | bot | bot |\n", + "| naqu | human | bot | bot |\n", + "| jenny1 | bot | human | bot |\n", + "| sheva7.eth | bot | human | bot |\n", + "| noormuhammad | human | bot | bot |\n", + "| parviz8998 | human | bot | bot |\n", + "| austilicious123 | human | bot | bot |\n", + "| jinkyo | bot | human | human |\n", + "| escalord92 
| human | bot | bot |\n", + "| amircyber | human | bot | bot |\n", + "| babaika.eth | human | bot | bot |\n", + "| zach19 | human | bot | bot |\n", + "| launch | bot | human | bot |\n", + "| cryptobeauty | human | bot | bot |\n", + "| lukichka | human | bot | bot |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Manual inspection of the changed labels (noisy values) in the table above indicates that ~70% of the changes make sense" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Save new bot_or_not_without_noises" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (8_119, 2)
fidbot
i64bool
446097false
3false
8false
12false
2false
280179true
327500true
428200true
278549true
446821false
" + ], + "text/plain": [ + "shape: (8_119, 2)\n", + "┌────────┬───────┐\n", + "│ fid ┆ bot │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ bool │\n", + "╞════════╪═══════╡\n", + "│ 446097 ┆ false │\n", + "│ 3 ┆ false │\n", + "│ 8 ┆ false │\n", + "│ 12 ┆ false │\n", + "│ 2 ┆ false │\n", + "│ … ┆ … │\n", + "│ 280179 ┆ true │\n", + "│ 327500 ┆ true │\n", + "│ 428200 ┆ true │\n", + "│ 278549 ┆ true │\n", + "│ 446821 ┆ false │\n", + "└────────┴───────┘" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter and remove unnecessary columns\n", + "bot_or_not_without_noises = df.filter(~pl.col(\"is_noisy\"))[[\"fid\",\"bot\"]]\n", + "bot_or_not_without_noises" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "bot_or_not_without_noises.write_parquet(f\"{DATA_PATH}/interim/bot_or_not_without_noises.parquet\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "farcaster-social-graph-notebooks-_gupmy54-py3.13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}