Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allan/web3chat #5

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions data_scrape_protocols.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install beautifulsoup4"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GfJ1NTeA263q",
"outputId": "53671623-d791-427c-b7ed-3cdabfb6172b"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (4.6.3)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import os, requests\n",
"from bs4 import BeautifulSoup"
],
"metadata": {
"id": "Ccp5Qdx72-Dk"
},
"execution_count": 127,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Scrape"
],
"metadata": {
"id": "aNMFnKSJ2Ak7"
}
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"id": "Gz2oTQLg12HJ"
},
"outputs": [],
"source": [
"class Scraper:\n",
"    \"\"\"Collect links under one URL prefix and extract page text.\n",
"\n",
"    Attributes:\n",
"        parent_url: prefix an absolute link must start with to be collected.\n",
"        all_urls: set of every matching link found so far.\n",
"        timeout: seconds to wait for each HTTP response.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, parent_url, timeout=30):\n",
"        self.parent_url = parent_url\n",
"        self.all_urls = set()\n",
"        self.timeout = timeout\n",
"\n",
"    def get_all_urls(self, url):\n",
"        \"\"\"Crawl from ``url`` and record every link starting with ``parent_url``.\n",
"\n",
"        Links are stored in ``self.all_urls`` exactly as written in the page\n",
"        (fragments included). The crawl is iterative and fetches each page at\n",
"        most once per call, so pages that link to each other cannot cause\n",
"        unbounded recursion.\n",
"        \"\"\"\n",
"        pending = [url]\n",
"        crawled = set()\n",
"        while pending:\n",
"            current = pending.pop()\n",
"            # The fragment is never sent to the server, so URLs that differ\n",
"            # only after '#' are the same page and need a single fetch.\n",
"            page_key = current.split('#', 1)[0]\n",
"            if page_key in crawled:\n",
"                continue\n",
"            crawled.add(page_key)\n",
"            response = requests.get(current, timeout=self.timeout)\n",
"            soup = BeautifulSoup(response.content, 'html.parser')\n",
"            # href=True skips anchors that have no href attribute, which\n",
"            # would otherwise raise KeyError on link['href'].\n",
"            for link in soup.find_all('a', href=True):\n",
"                href = link['href']\n",
"                if not href.startswith(self.parent_url):\n",
"                    continue\n",
"                self.all_urls.add(href)\n",
"                if href.split('#', 1)[0] not in crawled:\n",
"                    pending.append(href)\n",
"\n",
"    def scrape(self, url):\n",
"        \"\"\"Download ``url`` and return its parsed ``BeautifulSoup`` tree.\"\"\"\n",
"        page = requests.get(url, timeout=self.timeout)\n",
"        return BeautifulSoup(page.content, 'html.parser')\n",
"\n",
"    def preprocess(self, soup):\n",
"        \"\"\"Return the text of header, title, heading and paragraph tags, one per line.\"\"\"\n",
"        tags = soup.find_all(['header', 'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])\n",
"        return ''.join(str(tag.text) + '\\n' for tag in tags)"
]
},
{
"cell_type": "code",
"source": [
"def save_txt(protocol, doc, url):\n",
"    \"\"\"Write ``doc`` to ``<protocol>-documentation/<url with '/' replaced by '_'>.txt``.\n",
"\n",
"    The output folder is created when missing, and the file is written as\n",
"    UTF-8 so non-ASCII text does not depend on the platform default encoding.\n",
"    \"\"\"\n",
"    out_dir = f\"{protocol}-documentation\"\n",
"    os.makedirs(out_dir, exist_ok=True)\n",
"    file_name = '_'.join(url.split('/')) + '.txt'\n",
"    with open(f\"{out_dir}/{file_name}\", 'w', encoding='utf-8') as f:\n",
"        f.write(doc)"
],
"metadata": {
"id": "xcEQYzVBjJ12"
},
"execution_count": 129,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Documentation site to crawl and the folder its pages are written to.\n",
"protocol = 'lido'\n",
"parent_url = \"https://docs.lido.fi/\"\n",
"os.makedirs(f\"{protocol}-documentation\", exist_ok=True)\n",
"\n",
"# Discover the links first, then fetch each page and store its text.\n",
"scraper = Scraper(parent_url)\n",
"scraper.get_all_urls(parent_url)\n",
"all_urls = scraper.all_urls\n",
"for page_url in all_urls:\n",
"    page_text = scraper.preprocess(scraper.scrape(page_url))\n",
"    save_txt(protocol, page_text, page_url)"
],
"metadata": {
"id": "JN_XBeltgoQk"
},
"execution_count": 130,
"outputs": []
},
{
"cell_type": "code",
"source": [
"all_urls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "V-ZCtCFmkdIp",
"outputId": "66f36fc7-b9f3-4055-f1d9-933f2c821685"
},
"execution_count": 131,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'https://docs.lido.fi/contracts/lido-oracle/#add-calculation-of-staker-rewards-apr',\n",
" 'https://docs.lido.fi/guides/steth-integration-guide',\n",
" 'https://docs.lido.fi/token-guides/steth-on-aave-caveats'}"
]
},
"metadata": {},
"execution_count": 131
}
]
}
]
}
219 changes: 219 additions & 0 deletions faiss_indexing.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8-jL1OaHigT6"
},
"outputs": [],
"source": [
"!pip install langchain\n",
"!pip install faiss-gpu\n",
"!pip install openai"
]
},
{
"cell_type": "code",
"source": [
"import os \n",
"import json\n",
"import pickle\n",
"import faiss\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.faiss import FAISS"
],
"metadata": {
"id": "J1eMVndBi1Pm"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from getpass import getpass\n",
"\n",
"# Never store API keys in the notebook: read the key from the environment,\n",
"# or prompt for it so it is kept out of the saved file.\n",
"# NOTE(review): the key previously hardcoded in this cell should be treated\n",
"# as leaked and revoked.\n",
"openai_api_key = os.environ.get(\"OPENAI_API_KEY\") or getpass(\"OpenAI API key: \")"
],
"metadata": {
"id": "X06QitqDlCS3"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"embedding_function = OpenAIEmbeddings(openai_api_key=openai_api_key)"
],
"metadata": {
"id": "4jPBBBT7k_b_"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### corpus --> chunks"
],
"metadata": {
"id": "xvjXK9944TdS"
}
},
{
"cell_type": "code",
"source": [
"# Folder of scraped text files to index. An empty string makes os.listdir fail.\n",
"# NOTE(review): assumes the folder written by data_scrape_protocols.ipynb\n",
"# (\"<protocol>-documentation\" with protocol 'lido'); adjust if the corpus lives elsewhere.\n",
"dir_path = \"lido-documentation\"\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = []"
],
"metadata": {
"id": "7yx-aTi84m01"
},
"execution_count": 61,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Rebuild the chunk list from scratch so re-running this cell does not append duplicates.\n",
"texts = []\n",
"# os.listdir order is arbitrary; sorting keeps the chunk order reproducible between runs.\n",
"for f_name in sorted(os.listdir(dir_path)):\n",
"    f_path = os.path.join(dir_path, f_name)\n",
"    # Skip sub-directories, which open() cannot read.\n",
"    if not os.path.isfile(f_path):\n",
"        continue\n",
"    # Read as UTF-8 rather than the platform default encoding.\n",
"    with open(f_path, encoding='utf-8') as f:\n",
"        corpus = f.read()\n",
"    texts.extend(text_splitter.split_text(corpus))"
],
"metadata": {
"id": "n6ZnWkKxxhXZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### corpus --> vectorDB"
],
"metadata": {
"id": "Nm7Wpb1C48Eu"
}
},
{
"cell_type": "code",
"source": [
"docsearch = FAISS.from_texts(texts, embedding_function)"
],
"metadata": {
"id": "q_jeL3XG42jH"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Save the files"
],
"metadata": {
"id": "iOj4TAIa4c8w"
}
},
{
"cell_type": "code",
"source": [
"save_dir_path = 'faiss'\n",
"os.makedirs(save_dir_path, exist_ok=True)"
],
"metadata": {
"id": "uhtow6q-5oMv"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Persist the three parts of the vector store under save_dir_path.\n",
"\n",
"# 1) the index_to_docstore_id mapping, as JSON\n",
"id_map_file = f'{save_dir_path}/index_to_docstore_id.json'\n",
"with open(id_map_file, 'w') as out:\n",
"    json.dump(docsearch.index_to_docstore_id, out)\n",
"\n",
"# 2) the docstore object, pickled\n",
"docstore_file = f'{save_dir_path}/doctore.pkl'\n",
"with open(docstore_file, 'wb') as out:\n",
"    pickle.dump(docsearch.docstore, out)\n",
"\n",
"# 3) the FAISS index itself\n",
"faiss.write_index(docsearch.index, f'{save_dir_path}/index')"
],
"metadata": {
"id": "lv7BHKcDjEFN"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### Reinit FAISS"
],
"metadata": {
"id": "LpZUTK_D4gPK"
}
},
{
"cell_type": "code",
"source": [
"save_dir_path = 'faiss'"
],
"metadata": {
"id": "ChNUp78aCa2W"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load the three artifacts stored under save_dir_path.\n",
"\n",
"# JSON object keys are always strings, so convert them back to ints.\n",
"with open(f'{save_dir_path}/index_to_docstore_id.json', 'r') as src:\n",
"    index_to_docstore_id = {int(key): value for key, value in json.load(src).items()}\n",
"\n",
"# NOTE: pickle.load can execute arbitrary code; only load a docstore file you created yourself.\n",
"with open(f'{save_dir_path}/doctore.pkl', 'rb') as src:\n",
"    docstore = pickle.load(src)\n",
"\n",
"# the FAISS index\n",
"index = faiss.read_index(f'{save_dir_path}/index')"
],
"metadata": {
"id": "6enReGBJ_zzc"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"docsearch = FAISS(embedding_function.embed_query, index, docstore, index_to_docstore_id)"
],
"metadata": {
"id": "uWDKlWaX9Yvc"
},
"execution_count": 10,
"outputs": []
}
]
}
Loading