diff --git a/examples/anthropic/rate_limit_haiku.py b/examples/anthropic/rate_limit_haiku.py new file mode 100644 index 00000000..a01bff44 --- /dev/null +++ b/examples/anthropic/rate_limit_haiku.py @@ -0,0 +1,48 @@ +""" +Basic example of scraping pipeline using SmartScraper while setting an API rate limit. +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + + +# required environment variables in .env +# ANTHROPIC_API_KEY +load_dotenv() + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.getenv("ANTHROPIC_API_KEY"), + "model": "anthropic/claude-3-haiku-20240307", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""Don't say anything else. Output JSON only. List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + # also accepts a string with the already downloaded HTML code + source="https://www.hmhco.com/event", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/azure/rate_limit_azure.py b/examples/azure/rate_limit_azure.py new file mode 100644 index 00000000..cfd05f1f --- /dev/null +++ b/examples/azure/rate_limit_azure.py @@ -0,0 +1,57 @@ +""" +Basic example of scraping pipeline using SmartScraper with a 
custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + + +# required environment variables in .env +# AZURE_OPENAI_ENDPOINT +# AZURE_OPENAI_CHAT_DEPLOYMENT_NAME +# MODEL_NAME +# AZURE_OPENAI_KEY +# OPENAI_API_TYPE +# AZURE_OPENAI_API_VERSION +# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "api_key": os.environ["AZURE_OPENAI_KEY"], + "model": "azure_openai/gpt-3.5-turbo", + "rate_limit": { + "requests_per_second": 1 + }, + }, + "verbose": True, + "headless": False +} + +smart_scraper_graph = SmartScraperGraph( + prompt="""List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, + event_end_date, event_end_time, location, event_mode, event_category, + third_party_redirect, no_of_days, + time_in_hours, hosted_or_attending, refreshments_type, + registration_available, registration_link""", + # also accepts a string with the already downloaded HTML code + source="https://www.hmhco.com/event", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/bedrock/rate_limit_bedrock.py b/examples/bedrock/rate_limit_bedrock.py new file mode 100644 index 00000000..79a76a3e --- /dev/null +++ b/examples/bedrock/rate_limit_bedrock.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from 
scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "client": "client_name", + "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + "temperature": 0.0, + "rate_limit": { + "requests_per_second": 1 + }, + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/deepseek/rate_limit_deepseek.py b/examples/deepseek/rate_limit_deepseek.py new file mode 100644 index 00000000..36278452 --- /dev/null +++ b/examples/deepseek/rate_limit_deepseek.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +deepseek_key = os.getenv("DEEPSEEK_APIKEY") + +graph_config = { + "llm": { + "model": "deepseek/deepseek-chat", + "api_key": deepseek_key, + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, +} + +# 
************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/ernie/rate_limit_ernie.py b/examples/ernie/rate_limit_ernie.py new file mode 100644 index 00000000..41314e87 --- /dev/null +++ b/examples/ernie/rate_limit_ernie.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +graph_config = { + "llm": { + "model": "ernie/ernie-bot-turbo", + "ernie_client_id": "", + "ernie_client_secret": "", + "temperature": 0.1, + "rate_limit": { + "requests_per_second": 1 + }, + }, + "library": "beautifulsoup" +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config, +) + +result = smart_scraper_graph.run() +print(result) + +# 
************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/fireworks/rate_limit_fireworks.py b/examples/fireworks/rate_limit_fireworks.py new file mode 100644 index 00000000..b19cb770 --- /dev/null +++ b/examples/fireworks/rate_limit_fireworks.py @@ -0,0 +1,50 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os, json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +fireworks_api_key = os.getenv("FIREWORKS_APIKEY") + +graph_config = { + "llm": { + "api_key": fireworks_api_key, + "model": "fireworks/accounts/fireworks/models/mixtral-8x7b-instruct", + "rate_limit": { + "requests_per_second": 1 + }, + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config, +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_genai/rate_limit_gemini.py b/examples/google_genai/rate_limit_gemini.py new file mode 100644 
index 00000000..f4e68f69 --- /dev/null +++ b/examples/google_genai/rate_limit_gemini.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import SmartScraperGraph +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_genai/gemini-pro", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/google_vertexai/rate_limit_gemini.py b/examples/google_vertexai/rate_limit_gemini.py new file mode 100644 index 00000000..c5f15a35 --- /dev/null +++ b/examples/google_vertexai/rate_limit_gemini.py @@ -0,0 +1,47 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.utils import prettify_exec_info +from scrapegraphai.graphs import SmartScraperGraph +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# 
************************************************ + +gemini_key = os.getenv("GOOGLE_APIKEY") + +graph_config = { + "llm": { + "api_key": gemini_key, + "model": "google_vertexai/gemini-1.5-pro", + "rate_limit": { + "requests_per_second": 1 + } + }, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the news with their description.", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/groq/rate_limit_groq.py b/examples/groq/rate_limit_groq.py new file mode 100644 index 00000000..976127be --- /dev/null +++ b/examples/groq/rate_limit_groq.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +groq_key = os.getenv("GROQ_APIKEY") + +graph_config = { + "llm": { + "model": "groq/gemma-7b-it", + "api_key": groq_key, + "temperature": 0, + "rate_limit": { + "requests_per_second": 1 + } + }, + "headless": False +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also 
accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/mistral/rate_limit_mistral.py b/examples/mistral/rate_limit_mistral.py new file mode 100644 index 00000000..fbd65a1a --- /dev/null +++ b/examples/mistral/rate_limit_mistral.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("MISTRAL_API_KEY"), + "model": "mistralai/open-mistral-nemo", + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/nemotron/rate_limit_nemotron.py 
b/examples/nemotron/rate_limit_nemotron.py new file mode 100644 index 00000000..8b1a5eb4 --- /dev/null +++ b/examples/nemotron/rate_limit_nemotron.py @@ -0,0 +1,46 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os, json +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +from dotenv import load_dotenv +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("NEMOTRON_KEY"), + "model": "nvidia/meta/llama3-70b-instruct", + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="Extract me the python code inside the page", + source="https://www.exploit-db.com/exploits/51447", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/oneapi/rate_limit_oneapi.py b/examples/oneapi/rate_limit_oneapi.py new file mode 100644 index 00000000..64a170f7 --- /dev/null +++ b/examples/oneapi/rate_limit_oneapi.py @@ -0,0 +1,41 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + 
+graph_config = { + "llm": { + "api_key": "***************************", + "model": "oneapi/qwen-turbo", + "base_url": "http://127.0.0.1:3000/v1", # set the OneAPI URL + "rate_limit": { + "requests_per_second": 1 + } + } +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the titles", + # also accepts a string with the already downloaded HTML code + source="https://www.wired.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/openai/rate_limit_openai.py b/examples/openai/rate_limit_openai.py new file mode 100644 index 00000000..9455e798 --- /dev/null +++ b/examples/openai/rate_limit_openai.py @@ -0,0 +1,48 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +import json +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + + +graph_config = { + "llm": { + "api_key": os.getenv("OPENAI_API_KEY"), + "model": "openai/gpt-4o", + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, + "headless": False, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me what does the company do, the name and a contact email.", + 
source="https://scrapegraphai.com/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(json.dumps(result, indent=4)) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/examples/together/rate_limit_together.py b/examples/together/rate_limit_together.py new file mode 100644 index 00000000..072f8557 --- /dev/null +++ b/examples/together/rate_limit_together.py @@ -0,0 +1,49 @@ +""" +Basic example of scraping pipeline using SmartScraper with a custom rate limit +""" + +import os +from dotenv import load_dotenv +from scrapegraphai.graphs import SmartScraperGraph +from scrapegraphai.utils import prettify_exec_info + +load_dotenv() + + +# ************************************************ +# Define the configuration for the graph +# ************************************************ + +together_key = os.getenv("TOGETHER_APIKEY") + +graph_config = { + "llm": { + "model": "togetherai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + "api_key": together_key, + "rate_limit": { + "requests_per_second": 1 + } + }, + "verbose": True, +} + +# ************************************************ +# Create the SmartScraperGraph instance and run it +# ************************************************ + +smart_scraper_graph = SmartScraperGraph( + prompt="List me all the projects with their description.", + # also accepts a string with the already downloaded HTML code + source="https://perinim.github.io/projects/", + config=graph_config +) + +result = smart_scraper_graph.run() +print(result) + +# ************************************************ +# Get graph execution info +# ************************************************ + +graph_exec_info = smart_scraper_graph.get_execution_info() +print(prettify_exec_info(graph_exec_info)) diff --git a/scrapegraphai/graphs/abstract_graph.py 
b/scrapegraphai/graphs/abstract_graph.py index 1a4f1e6a..ae5cc496 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -8,6 +8,7 @@ import warnings from pydantic import BaseModel from langchain.chat_models import init_chat_model +from langchain_core.rate_limiters import InMemoryRateLimiter from ..helpers import models_tokens from ..models import ( OneApi, @@ -119,6 +120,17 @@ def _create_llm(self, llm_config: dict) -> object: llm_defaults = {"temperature": 0, "streaming": False} llm_params = {**llm_defaults, **llm_config} + rate_limit_params = llm_params.pop("rate_limit", {}) + + if rate_limit_params: + requests_per_second = rate_limit_params.get("requests_per_second") + max_retries = rate_limit_params.get("max_retries") + if requests_per_second is not None: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + llm_params["rate_limiter"] = InMemoryRateLimiter(requests_per_second=requests_per_second) + if max_retries is not None: + llm_params["max_retries"] = max_retries if "model_instance" in llm_params: try: diff --git a/tests/graphs/abstract_graph_test.py b/tests/graphs/abstract_graph_test.py index 54349d22..642868fb 100644 --- a/tests/graphs/abstract_graph_test.py +++ b/tests/graphs/abstract_graph_test.py @@ -83,3 +83,17 @@ def test_create_llm_unknown_provider(self): with pytest.raises(ValueError): TestGraph("Test prompt", {"llm": {"model": "unknown_provider/model"}}) + @pytest.mark.parametrize("llm_config, expected_model", [ + ({"model": "openai/gpt-3.5-turbo", "openai_api_key": "sk-randomtest001", "rate_limit": {"requests_per_second": 1}}, ChatOpenAI), + ({"model": "azure_openai/gpt-3.5-turbo", "api_key": "random-api-key", "api_version": "no version", "azure_endpoint": "https://www.example.com/", "rate_limit": {"requests_per_second": 1}}, AzureChatOpenAI), + ({"model": "google_genai/gemini-pro", "google_api_key": "google-key-test", "rate_limit": {"requests_per_second": 1}}, ChatGoogleGenerativeAI), 
+ ({"model": "ollama/llama2", "rate_limit": {"requests_per_second": 1}}, ChatOllama), + ({"model": "oneapi/qwen-turbo", "api_key": "oneapi-api-key", "rate_limit": {"requests_per_second": 1}}, OneApi), + ({"model": "deepseek/deepseek-coder", "api_key": "deepseek-api-key", "rate_limit": {"requests_per_second": 1}}, DeepSeek), + ({"model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", "region_name": "IDK", "rate_limit": {"requests_per_second": 1}}, ChatBedrock), + ]) + + + def test_create_llm_with_rate_limit(self, llm_config, expected_model): + graph = TestGraph("Test prompt", {"llm": llm_config}) + assert isinstance(graph.llm_model, expected_model) \ No newline at end of file