From c2c2e6582f600424c7e092bf0c65b8054ffcaa61 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 17 Aug 2024 13:30:44 +0200 Subject: [PATCH 1/6] Update generate_answer_node.py --- scrapegraphai/nodes/generate_answer_node.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index d01b50d2..94962310 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -88,19 +88,22 @@ def execute(self, state: dict) -> dict: # Initialize the output parser if self.node_config.get("schema", None) is not None: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) + if isinstance(self.llm_model, ChatOpenAI): + self.llm_model = self.llm_model.with_structured_output(self.node_config["schema"]) + else: output_parser = JsonOutputParser() format_instructions = output_parser.get_format_instructions() + template_no_chunks_prompt = template_no_chunks + template_chunks_prompt = template_chunks + template_merge_prompt = template_merge + if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: template_no_chunks_prompt = template_no_chunks_md template_chunks_prompt = template_chunks_md template_merge_prompt = template_merge_md - else: - template_no_chunks_prompt = template_no_chunks - template_chunks_prompt = template_chunks - template_merge_prompt = template_merge if self.additional_info is not None: template_no_chunks_prompt = self.additional_info + template_no_chunks_prompt From 391d051a5e9efcc4b591b2f70f0eb119f1718f58 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 17 Aug 2024 14:39:19 +0200 Subject: [PATCH 2/6] updated --- examples/openai/smart_scraper_schema_openai.py | 2 +- scrapegraphai/graphs/abstract_graph.py | 2 +- scrapegraphai/helpers/models_tokens.py | 1 + scrapegraphai/nodes/generate_answer_node.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index d9e1bd1c..4decac7e 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -30,7 +30,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key":openai_key, - "model": "gpt-4o", + "model": "chatgpt-4o-latest", }, "verbose": True, "headless": False, diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index 01ff0b0a..ae1e90b2 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -147,7 +147,7 @@ def handle_model(model_name, provider, token_key, default_token=8192): warnings.simplefilter("ignore") return init_chat_model(**llm_params) - known_models = ["gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"] + known_models = ["chatgpt","gpt","openai", "azure_openai", "google_genai", "ollama", "oneapi", "nvidia", "groq", "google_vertexai", "bedrock", "mistralai", "hugging_face", "deepseek", "ernie", "fireworks"] if llm_params["model"].split("/")[0] not in known_models and llm_params["model"].split("-")[0] not in known_models: raise ValueError(f"Model '{llm_params['model']}' is not supported") diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 791bcf72..7dbb5620 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -39,6 +39,7 @@ "gpt-4-32k-0613": 32768, "gpt-4o": 128000, "gpt-4o-mini":128000, + "chatgpt-4o-latest": 128000 }, "google_genai": { "gemini-pro": 128000, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 94962310..8aa01e0a 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -89,7 +89,7 @@ def execute(self, state: dict) -> dict: if self.node_config.get("schema", None) is not None: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) if isinstance(self.llm_model, ChatOpenAI): - self.llm_model = self.llm_model.with_structured_output(self.node_config["schema"]) + self.llm_model = self.llm_model.with_structured_output(self.node_config["schema"], method="json_mode") else: output_parser = JsonOutputParser() From ebdb74967dfd35f32d62b2539c7efc804c029bfb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 17 Aug 2024 15:00:44 +0200 Subject: [PATCH 3/6] add new model for gpt --- examples/openai/smart_scraper_schema_openai.py | 4 ++-- pyproject.toml | 4 ++-- requirements-dev.lock | 7 +++---- requirements.lock | 10 +++++----- scrapegraphai/nodes/generate_answer_node.py | 6 ++++-- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index 4decac7e..5a13e9ae 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -5,7 +5,7 @@ import os, json from typing import List from dotenv import load_dotenv -from pydantic import BaseModel, Field +from langchain_core.pydantic_v1 import BaseModel, Field from scrapegraphai.graphs import SmartScraperGraph load_dotenv() @@ -30,7 +30,7 @@ class Projects(BaseModel): graph_config = { "llm": { "api_key":openai_key, - "model": "chatgpt-4o-latest", + "model": "gpt-4o-mini", }, "verbose": True, "headless": False, diff --git a/pyproject.toml b/pyproject.toml index f6843503..cb3e6be2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,12 +14,12 @@ authors = [ ] dependencies = [ - "langchain>=0.2.10", + "langchain==0.2.14", "langchain-fireworks>=0.1.3", "langchain_community>=0.2.9", "langchain-google-genai>=1.0.7", "langchain-google-vertexai>=1.0.7", - "langchain-openai>=0.1.17", + "langchain-openai==0.1.22", "langchain-groq>=0.1.3", "langchain-aws>=0.1.3", "langchain-anthropic>=0.1.11", diff --git a/requirements-dev.lock b/requirements-dev.lock index ffcd2d40..44cc64de 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -179,7 +179,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -255,7 +254,7 @@ jsonschema-specifications==2023.12.1 # via jsonschema kiwisolver==1.4.5 # via matplotlib -langchain==0.2.12 +langchain==0.2.14 # via langchain-community # via scrapegraphai langchain-anthropic==0.1.22 @@ -264,7 +263,7 @@ langchain-aws==0.1.16 # via scrapegraphai langchain-community==0.2.11 # via scrapegraphai -langchain-core==0.2.29 +langchain-core==0.2.33 # via langchain # via langchain-anthropic # via langchain-aws @@ -292,7 +291,7 @@ langchain-mistralai==0.1.12 # via scrapegraphai langchain-nvidia-ai-endpoints==0.2.1 # via scrapegraphai -langchain-openai==0.1.21 +langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 # via langchain diff --git a/requirements.lock b/requirements.lock index f449a7b7..1812ab21 100644 --- a/requirements.lock +++ b/requirements.lock @@ -133,7 +133,6 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright - # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 @@ -178,6 +177,7 @@ jinja2==3.1.4 # via torch jiter==0.5.0 # via anthropic + # via openai jmespath==1.0.1 # via boto3 # via botocore @@ -187,7 +187,7 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain==0.2.11 +langchain==0.2.14 # via langchain-community # via scrapegraphai langchain-anthropic==0.1.20 @@ -196,7 +196,7 @@ langchain-aws==0.1.12 # via scrapegraphai langchain-community==0.2.10 # via scrapegraphai -langchain-core==0.2.28 +langchain-core==0.2.33 # via langchain # via langchain-anthropic # via langchain-aws @@ -224,7 +224,7 @@ langchain-mistralai==0.1.12 # via scrapegraphai langchain-nvidia-ai-endpoints==0.1.7 # via scrapegraphai -langchain-openai==0.1.17 +langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 # via langchain @@ -264,7 +264,7 @@ numpy==1.26.4 # via sentence-transformers # via shapely # via transformers -openai==1.37.0 +openai==1.41.0 # via langchain-fireworks # via langchain-openai orjson==3.10.6 diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 8aa01e0a..f307425e 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -88,8 +88,10 @@ def execute(self, state: dict) -> dict: # Initialize the output parser if self.node_config.get("schema", None) is not None: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) - if isinstance(self.llm_model, ChatOpenAI): - self.llm_model = self.llm_model.with_structured_output(self.node_config["schema"], method="json_mode") + if isinstance(self.llm_model, ChatOpenAI) and (self.llm_model.model_name=="gpt-4o-mini" or self.llm_model.model_name=="gpt-4o-2024-08-06"): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="json_schema") else: output_parser = JsonOutputParser() From 683bf57d895d8f6847fdd64e8936ffa1aa91926a Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 19 Aug 2024 11:33:09 +0200 Subject: [PATCH 4/6] fix(ParseNode): leave room for LLM reply in context window --- pyproject.toml | 4 ++-- requirements-dev.lock | 1 + requirements.lock | 1 + requirements.txt | 4 ++-- scrapegraphai/nodes/parse_node.py | 11 ++++++----- scrapegraphai/utils/token_calculator.py | 4 ++-- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cb3e6be2..53972f17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,12 +14,12 @@ authors = [ ] dependencies = [ - "langchain==0.2.14", + "langchain>=0.2.14", "langchain-fireworks>=0.1.3", "langchain_community>=0.2.9", "langchain-google-genai>=1.0.7", "langchain-google-vertexai>=1.0.7", - "langchain-openai==0.1.22", + "langchain-openai>=0.1.22", "langchain-groq>=0.1.3", "langchain-aws>=0.1.3", "langchain-anthropic>=0.1.11", diff --git a/requirements-dev.lock b/requirements-dev.lock index 44cc64de..64af8ee8 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -179,6 +179,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.lock b/requirements.lock index 1812ab21..1d80e1bf 100644 --- a/requirements.lock +++ b/requirements.lock @@ -133,6 +133,7 @@ graphviz==0.20.3 # via scrapegraphai greenlet==3.0.3 # via playwright + # via sqlalchemy groq==0.9.0 # via langchain-groq grpc-google-iam-v1==0.13.1 diff --git a/requirements.txt b/requirements.txt index 754eab61..21c2fd3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -langchain>=0.2.10 +langchain>=0.2.14 langchain-fireworks>=0.1.3 langchain_community>=0.2.9 langchain-google-genai>=1.0.7 langchain-google-vertexai>=1.0.7 -langchain-openai>=0.1.17 +langchain-openai>=0.1.22 langchain-groq>=0.1.3 langchain-aws>=0.1.3 langchain-anthropic>=0.1.11 diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index db7f8518..8c536bad 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -6,7 +6,6 @@ from semchunk import chunk from langchain_community.document_transformers import Html2TextTransformer from langchain_core.documents import Document -from ..utils.logging import get_logger from .base_node import BaseNode class ParseNode(BaseNode): @@ -79,16 +78,18 @@ def execute(self, state: dict) -> dict: else: docs_transformed = docs_transformed[0] + # Adapt the chunk size, leaving room for the reply, the prompt and the schema + chunk_size = self.node_config.get("chunk_size", 4096) + chunk_size = min(chunk_size - 500, int(chunk_size * 0.9)) + if isinstance(docs_transformed, Document): - chunks = chunk(text=docs_transformed.page_content, - chunk_size=self.node_config.get("chunk_size", 4096)-250, + chunk_size=chunk_size, token_counter=lambda text: len(text.split()), memoize=False) else: - chunks = chunk(text=docs_transformed, - chunk_size=self.node_config.get("chunk_size", 4096)-250, + chunk_size=chunk_size, token_counter=lambda text: len(text.split()), memoize=False) diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py index c5263efe..c5e5fbbb 100644 --- a/scrapegraphai/utils/token_calculator.py +++ b/scrapegraphai/utils/token_calculator.py @@ -1,5 +1,5 @@ """ -Module for truncatinh in chunks the messages +Module for truncating in chunks the messages """ from typing import List import tiktoken @@ -27,7 +27,7 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str] """ encoding = tiktoken.get_encoding(encoding_name) - max_tokens = models_tokens[model] - 500 + max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9)) encoded_text = encoding.encode(text) chunks = [encoded_text[i:i + max_tokens] From d29338b7c2ef0b13535a2e4edae4a4aab08f1825 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 19 Aug 2024 13:45:37 +0200 Subject: [PATCH 5/6] feat(GenerateAnswerNode): built-in structured output through LangChain Co-Authored-By: Marco Vinciguerra <88108002+VinciGit00@users.noreply.github.com> --- scrapegraphai/nodes/generate_answer_node.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index fdaacbfe..970a6790 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -5,7 +5,12 @@ from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel -from langchain_openai import ChatOpenAI +from langchain_openai import ChatOpenAI, AzureChatOpenAI +from langchain_mistralai import ChatMistralAI +from langchain_anthropic import ChatAnthropic +from langchain_groq import ChatGroq +from langchain_fireworks import ChatFireworks +from langchain_google_vertexai import ChatVertexAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm from ..utils.logging import get_logger @@ -88,7 +93,9 @@ def execute(self, state: dict) -> dict: # Initialize the output parser if self.node_config.get("schema", None) is not None: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) - if isinstance(self.llm_model, ChatOpenAI) and (self.llm_model.model_name=="gpt-4o-mini" or self.llm_model.model_name=="gpt-4o-2024-08-06"): + + # Use built-in structured output for providers that allow it + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI, ChatAnthropic, ChatFireworks, ChatGroq, ChatVertexAI)): self.llm_model = self.llm_model.with_structured_output( schema = self.node_config["schema"], method="json_schema") @@ -98,7 +105,7 @@ def execute(self, state: dict) -> dict: format_instructions = output_parser.get_format_instructions() - if isinstance(self.llm_model, ChatOpenAI) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: + if isinstance(self.llm_model, (ChatOpenAI, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator or self.is_md_scraper: template_no_chunks_prompt = TEMPLATE_NO_CHUNKS_MD template_chunks_prompt = TEMPLATE_CHUNKS_MD template_merge_prompt = TEMPLATE_MERGE_MD From 7d2fc672c8c3c05b0f0beac46316ce16c16bcd02 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 19 Aug 2024 14:15:54 +0200 Subject: [PATCH 6/6] feat: add structured output format --- examples/anthropic/search_graph_schema_haiku.py | 7 +++---- examples/azure/smart_scraper_schema_azure.py | 3 ++- examples/local_models/smart_scraper_schema_ollama.py | 2 +- examples/openai/smart_scraper_openai.py | 3 ++- examples/openai/smart_scraper_schema_openai.py | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/anthropic/search_graph_schema_haiku.py b/examples/anthropic/search_graph_schema_haiku.py index 19eebf09..c9e7a875 100644 --- a/examples/anthropic/search_graph_schema_haiku.py +++ b/examples/anthropic/search_graph_schema_haiku.py @@ -3,13 +3,12 @@ """ import os +from typing import List from dotenv import load_dotenv -load_dotenv() - +from pydantic import BaseModel, Field from scrapegraphai.graphs import SearchGraph -from pydantic import BaseModel, Field -from typing import List +load_dotenv() # ************************************************ # Define the output schema for the graph diff --git a/examples/azure/smart_scraper_schema_azure.py b/examples/azure/smart_scraper_schema_azure.py index 5a9006b2..d0816bf5 100644 --- a/examples/azure/smart_scraper_schema_azure.py +++ b/examples/azure/smart_scraper_schema_azure.py @@ -2,7 +2,8 @@ Basic example of scraping pipeline using SmartScraper with schema """ -import os, json +import os +import json from typing import List from pydantic import BaseModel, Field from dotenv import load_dotenv diff --git a/examples/local_models/smart_scraper_schema_ollama.py b/examples/local_models/smart_scraper_schema_ollama.py index 5fcff433..088e2eed 100644 --- a/examples/local_models/smart_scraper_schema_ollama.py +++ b/examples/local_models/smart_scraper_schema_ollama.py @@ -19,7 +19,7 @@ class Projects(BaseModel): graph_config = { "llm": { - "model": "ollama/llama3", + "model": "ollama/llama3.1", "temperature": 0, "format": "json", # Ollama needs the format to be specified explicitly # "base_url": "http://localhost:11434", # set ollama URL arbitrarily diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py index 6771b817..4299ec29 100644 --- a/examples/openai/smart_scraper_openai.py +++ b/examples/openai/smart_scraper_openai.py @@ -2,7 +2,8 @@ Basic example of scraping pipeline using SmartScraper """ -import os, json +import os +import json from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from dotenv import load_dotenv diff --git a/examples/openai/smart_scraper_schema_openai.py b/examples/openai/smart_scraper_schema_openai.py index 5a13e9ae..828a9b0a 100644 --- a/examples/openai/smart_scraper_schema_openai.py +++ b/examples/openai/smart_scraper_schema_openai.py @@ -5,7 +5,7 @@ import os, json from typing import List from dotenv import load_dotenv -from langchain_core.pydantic_v1 import BaseModel, Field +from pydantic import BaseModel, Field from scrapegraphai.graphs import SmartScraperGraph load_dotenv()