diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_haiku.py
index 19eebf09..c9e7a875 100644
--- a/examples/anthropic/search_graph_schema_haiku.py
+++ b/examples/anthropic/search_graph_schema_haiku.py
@@ -3,13 +3,12 @@
 """
 
 import os
+from typing import List
 from dotenv import load_dotenv
-load_dotenv()
-
+from pydantic import BaseModel, Field
 from scrapegraphai.graphs import SearchGraph
-from pydantic import BaseModel, Field
-from typing import List
+load_dotenv()
 
 # ************************************************
 # Define the output schema for the graph
diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py
index 5a9006b2..d0816bf5 100644
--- a/examples/azure/smart_scraper_schema_azure.py
+++ b/examples/azure/smart_scraper_schema_azure.py
@@ -2,7 +2,8 @@
 Basic example of scraping pipeline using SmartScraper with schema
 """
 
-import os, json
+import os
+import json
 from typing import List
 from pydantic import BaseModel, Field
 from dotenv import load_dotenv
diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py
index 5fcff433..088e2eed 100644
--- a/examples/local_models/smart_scraper_schema_ollama.py
+++ b/examples/local_models/smart_scraper_schema_ollama.py
@@ -19,7 +19,7 @@ class Projects(BaseModel):
 
 graph_config = {
     "llm": {
-        "model": "ollama/llama3",
+        "model": "ollama/llama3.1",
         "temperature": 0,
         "format": "json",  # Ollama needs the format to be specified explicitly
         # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
index 6771b817..4299ec29 100644
--- a/examples/openai/smart_scraper_openai.py
+++ b/examples/openai/smart_scraper_openai.py
@@ -2,7 +2,8 @@
 Basic example of scraping pipeline using SmartScraper
 """
 
-import os, json
+import os
+import json
 from scrapegraphai.graphs import SmartScraperGraph
 from scrapegraphai.utils import prettify_exec_info
 from dotenv import load_dotenv
diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py
index d9e1bd1c..828a9b0a 100644
--- a/examples/openai/smart_scraper_schema_openai.py
+++ b/examples/openai/smart_scraper_schema_openai.py
@@ -30,7 +30,7 @@ class Projects(BaseModel):
 graph_config = {
     "llm": {
         "api_key":openai_key,
-        "model": "gpt-4o",
+        "model": "gpt-4o-mini",
     },
     "verbose": True,
     "headless": False,
diff --git a/pyproject.toml b/pyproject.toml
index 24ef4673..6faaa3e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,12 +14,12 @@ authors = [
 ]
 
 dependencies = [
-    "langchain>=0.2.10",
+    "langchain>=0.2.14",
    "langchain-fireworks>=0.1.3",
    "langchain_community>=0.2.9",
    "langchain-google-genai>=1.0.7",
    "langchain-google-vertexai>=1.0.7",
-    "langchain-openai>=0.1.17",
+    "langchain-openai>=0.1.22",
    "langchain-groq>=0.1.3",
    "langchain-aws>=0.1.3",
    "langchain-anthropic>=0.1.11",
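
Note on the example edits above: imports are normalized to one module per line, load_dotenv() runs after all imports, and the schema examples move to cheaper defaults (gpt-4o-mini for OpenAI, ollama/llama3.1 locally). A minimal sketch of the pattern those examples share; the prompt, URL, and schema fields here are illustrative placeholders, not taken from this patch:

    import os
    from typing import List
    from dotenv import load_dotenv
    from pydantic import BaseModel, Field
    from scrapegraphai.graphs import SmartScraperGraph

    load_dotenv()

    class Project(BaseModel):
        title: str = Field(description="Project title")
        description: str = Field(description="One-line project summary")

    class Projects(BaseModel):
        projects: List[Project]

    graph_config = {
        "llm": {
            "api_key": os.getenv("OPENAI_API_KEY"),
            "model": "gpt-4o-mini",  # the cheaper default this patch switches to
        },
        "verbose": True,
        "headless": False,
    }

    # schema= asks the pipeline to coerce its answer into the Pydantic model
    scraper = SmartScraperGraph(
        prompt="List me all the projects with their descriptions",
        source="https://example.com/projects",
        config=graph_config,
        schema=Projects,
    )
    print(scraper.run())

Passing schema= is what later routes a run through the structured-output path touched in generate_answer_node.py below.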
diff --git a/requirements-dev.lock b/requirements-dev.lock
index ffcd2d40..64af8ee8 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -255,7 +255,7 @@ jsonschema-specifications==2023.12.1
   # via jsonschema
 kiwisolver==1.4.5
   # via matplotlib
-langchain==0.2.12
+langchain==0.2.14
   # via langchain-community
   # via scrapegraphai
 langchain-anthropic==0.1.22
@@ -264,7 +264,7 @@ langchain-aws==0.1.16
   # via scrapegraphai
 langchain-community==0.2.11
   # via scrapegraphai
-langchain-core==0.2.29
+langchain-core==0.2.33
   # via langchain
   # via langchain-anthropic
   # via langchain-aws
@@ -292,7 +292,7 @@ langchain-mistralai==0.1.12
   # via scrapegraphai
 langchain-nvidia-ai-endpoints==0.2.1
   # via scrapegraphai
-langchain-openai==0.1.21
+langchain-openai==0.1.22
   # via scrapegraphai
 langchain-text-splitters==0.2.2
   # via langchain
diff --git a/requirements.lock b/requirements.lock
index f449a7b7..1d80e1bf 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -178,6 +178,7 @@ jinja2==3.1.4
   # via torch
 jiter==0.5.0
   # via anthropic
+  # via openai
 jmespath==1.0.1
   # via boto3
   # via botocore
@@ -187,7 +188,7 @@ jsonpatch==1.33
   # via langchain-core
 jsonpointer==3.0.0
   # via jsonpatch
-langchain==0.2.11
+langchain==0.2.14
   # via langchain-community
   # via scrapegraphai
 langchain-anthropic==0.1.20
@@ -196,7 +197,7 @@ langchain-aws==0.1.12
   # via scrapegraphai
 langchain-community==0.2.10
   # via scrapegraphai
-langchain-core==0.2.28
+langchain-core==0.2.33
   # via langchain
   # via langchain-anthropic
   # via langchain-aws
@@ -224,7 +225,7 @@ langchain-mistralai==0.1.12
   # via scrapegraphai
 langchain-nvidia-ai-endpoints==0.1.7
   # via scrapegraphai
-langchain-openai==0.1.17
+langchain-openai==0.1.22
   # via scrapegraphai
 langchain-text-splitters==0.2.2
   # via langchain
@@ -264,7 +265,7 @@ numpy==1.26.4
   # via sentence-transformers
   # via shapely
   # via transformers
-openai==1.37.0
+openai==1.41.0
   # via langchain-fireworks
   # via langchain-openai
 orjson==3.10.6
diff --git a/requirements.txt b/requirements.txt
index 754eab61..21c2fd3b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-langchain>=0.2.10
+langchain>=0.2.14
 langchain-fireworks>=0.1.3
 langchain_community>=0.2.9
 langchain-google-genai>=1.0.7
 langchain-google-vertexai>=1.0.7
-langchain-openai>=0.1.17
+langchain-openai>=0.1.22
 langchain-groq>=0.1.3
 langchain-aws>=0.1.3
 langchain-anthropic>=0.1.11
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
index 43820159..e0b936ae 100644
--- a/scrapegraphai/helpers/models_tokens.py
+++ b/scrapegraphai/helpers/models_tokens.py
@@ -43,7 +43,7 @@
         "gpt-4-32k-0613": 32768,
         "gpt-4o": 128000,
         "gpt-4o-mini":128000,
-        "chatgpt-4o-latest":128000
+        "chatgpt-4o-latest": 128000
     },
     "google_genai": {
         "gemini-pro": 128000,
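
models_tokens maps provider, then model name, to context-window size, and the scraping nodes read it to budget chunks. A quick lookup sketch, assuming the OpenAI entries above live under an "openai" key, as the neighboring "google_genai" block suggests:

    from scrapegraphai.helpers.models_tokens import models_tokens

    # The entry whose spacing the patch normalizes; a 128k-token window
    window = models_tokens["openai"]["chatgpt-4o-latest"]
    print(window)  # 128000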
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
index 068d4023..970a6790 100644
--- a/scrapegraphai/nodes/generate_answer_node.py
+++ b/scrapegraphai/nodes/generate_answer_node.py
@@ -5,7 +5,12 @@
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
-from langchain_openai import ChatOpenAI
+from langchain_openai import ChatOpenAI, AzureChatOpenAI
+from langchain_mistralai import ChatMistralAI
+from langchain_anthropic import ChatAnthropic
+from langchain_groq import ChatGroq
+from langchain_fireworks import ChatFireworks
+from langchain_google_vertexai import ChatVertexAI
 from langchain_community.chat_models import ChatOllama
 from tqdm import tqdm
 from ..utils.logging import get_logger
@@ -88,12 +93,19 @@ def execute(self, state: dict) -> dict:
 
         # Initialize the output parser
         if self.node_config.get("schema", None) is not None:
             output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
+
+            # Use built-in structured output for providers that allow it
+            if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI, ChatAnthropic, ChatFireworks, ChatGroq, ChatVertexAI)):
+                self.llm_model = self.llm_model.with_structured_output(
+                    schema=self.node_config["schema"],
+                    method="json_schema")
+
         else:
             output_parser = JsonOutputParser()
 
         format_instructions = output_parser.get_format_instructions()
 
-        if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper:
+        if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper:
             template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD
             template_chunks_prompt = TEMPLATE_CHUNKS_MD
             template_merge_prompt = TEMPLATE_MERGE_MD
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
index f7a60bc2..dbecdbf9 100644
--- a/scrapegraphai/nodes/parse_node.py
+++ b/scrapegraphai/nodes/parse_node.py
@@ -5,7 +5,6 @@
 from semchunk import chunk
 from langchain_community.document_transformers import Html2TextTransformer
 from langchain_core.documents import Document
-from ..utils.logging import get_logger
 from .base_node import BaseNode
 
 class ParseNode(BaseNode):
@@ -78,16 +77,18 @@ def execute(self, state: dict) -> dict:
         else:
             docs_transformed = docs_transformed[0]
 
+        # Adapt the chunk size, leaving room for the reply, the prompt and the schema
+        chunk_size = self.node_config.get("chunk_size", 4096)
+        chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
+
         if isinstance(docs_transformed, Document):
-            chunks = chunk(text=docs_transformed.page_content,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
+            chunks = chunk(text=docs_transformed.page_content,
+                           chunk_size=chunk_size,
                            token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
-            chunks = chunk(text=docs_transformed,
-                           chunk_size=self.node_config.get("chunk_size", 4096)-250,
+            chunks = chunk(text=docs_transformed,
+                           chunk_size=chunk_size,
                            token_counter=lambda text: len(text.split()),
                            memoize=False)
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
index c5263efe..c5e5fbbb 100644
--- a/scrapegraphai/utils/token_calculator.py
+++ b/scrapegraphai/utils/token_calculator.py
@@ -1,5 +1,5 @@
 """
-Module for truncatinh in chunks the messages
+Module for truncating in chunks the messages
 """
 from typing import List
 import tiktoken
@@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]
     """
     encoding = tiktoken.get_encoding(encoding_name)
 
-    max_tokens = models_tokens[model] - 500
+    max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
     encoded_text = encoding.encode(text)
 
     chunks = [encoded_text[i:i + max_tokens]
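
Two behavioral notes on the node changes above. First, when a schema is configured, GenerateAnswerNode now hands JSON conformance to providers with native structured output instead of relying solely on JsonOutputParser. A standalone sketch of that provider-side path, assuming a langchain-openai new enough to accept method="json_schema" (which the >=0.1.22 bump above is meant to guarantee); the model and fields are illustrative:

    from pydantic import BaseModel, Field
    from langchain_openai import ChatOpenAI

    class Project(BaseModel):
        title: str = Field(description="Project title")
        description: str = Field(description="One-line summary")

    llm = ChatOpenAI(model="gpt-4o-mini")  # needs OPENAI_API_KEY in the environment
    # invoke() now returns a Project instance rather than free-form text
    structured_llm = llm.with_structured_output(schema=Project, method="json_schema")
    print(structured_llm.invoke("Summarize the ScrapeGraphAI project in one line."))

Second, parse_node.py and token_calculator.py now share one budgeting rule: reserve a flat 500 tokens for the reply, prompt, and schema, and never claim more than 90% of the window. The flat reserve binds on small windows, the percentage cap on large ones:

    def usable_tokens(window: int, reserve: int = 500) -> int:
        # Same rule as the patch: flat reserve or a 90% ceiling, whichever is stricter
        return min(window - reserve, int(window * 0.9))

    print(usable_tokens(4096))    # min(3596, 3686)     -> 3596
    print(usable_tokens(128000))  # min(127500, 115200) -> 115200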