From 5c16ee985b11948c6a8c1dbfd051d458fa193973 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:44:05 +0200 Subject: [PATCH 1/3] fix(docloaders): BrowserBase dynamic import --- scrapegraphai/docloaders/browser_base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/docloaders/browser_base.py b/scrapegraphai/docloaders/browser_base.py index 9b60f36f..318c9f38 100644 --- a/scrapegraphai/docloaders/browser_base.py +++ b/scrapegraphai/docloaders/browser_base.py @@ -2,7 +2,6 @@ browserbase integration module """ from typing import List -from browserbase import Browserbase def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[str]: """ @@ -41,6 +40,12 @@ def browser_base_fetch(api_key: str, project_id: str, link: List[str]) -> List[s object: The result of the loading operation. """ + try: + from browserbase import Browserbase + except ImportError: + raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.") + + browserbase = Browserbase(api_key=api_key, project_id=project_id) result = [] From 83e71df2e2cb3b6bfba11f8879d5c4917a3e1837 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:54:57 +0200 Subject: [PATCH 2/3] fix: set up dynamic imports correctly --- pyproject.toml | 1 + requirements-dev.lock | 149 ------------------- requirements.lock | 154 -------------------- requirements.txt | 14 +- scrapegraphai/nodes/fetch_node.py | 5 +- scrapegraphai/nodes/generate_answer_node.py | 21 ++- scrapegraphai/nodes/rag_node.py | 70 ++++----- 7 files changed, 56 insertions(+), 358 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3616c032..5afe841f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ docs = ["sphinx==6.0", "furo==2024.5.6"] # Group 1: Other Language Models other-language-models = [ + "langchain-google-vertexai>=1.0.7", "langchain-fireworks>=0.1.3", "langchain-groq>=0.1.3", "langchain-anthropic>=0.1.11", diff --git a/requirements-dev.lock b/requirements-dev.lock index 64af8ee8..04ca69d9 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -15,8 +15,6 @@ aiohappyeyeballs==2.3.5 aiohttp==3.10.3 # via langchain # via langchain-community - # via langchain-fireworks - # via langchain-nvidia-ai-endpoints aiosignal==1.3.1 # via aiohttp alabaster==0.7.16 @@ -25,11 +23,7 @@ altair==5.4.0 # via streamlit annotated-types==0.7.0 # via pydantic -anthropic==0.33.0 - # via langchain-anthropic anyio==4.4.0 - # via anthropic - # via groq # via httpx # via openai # via starlette @@ -55,8 +49,6 @@ boto3==1.34.158 botocore==1.34.158 # via boto3 # via s3transfer -browserbase==0.3.0 - # via scrapegraphai burr==0.22.1 # via scrapegraphai cachetools==5.4.0 @@ -78,17 +70,11 @@ cycler==0.12.1 # via matplotlib dataclasses-json==0.6.7 # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic dill==0.3.8 # via multiprocess # via pylint distro==1.9.0 - # via anthropic - # via groq # via openai -docstring-parser==0.16 - # via google-cloud-aiplatform docutils==0.19 # via sphinx exceptiongroup==1.2.2 @@ -102,10 +88,6 @@ fastapi-pagination==0.12.26 # via burr filelock==3.15.4 # via huggingface-hub - # via torch - # via transformers -fireworks-ai==0.15.0 - # via langchain-fireworks fonttools==4.53.1 # via matplotlib free-proxy==1.1.1 @@ -115,7 +97,6 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub - # via torch furo==2024.5.6 # via scrapegraphai gitdb==4.0.11 @@ -129,11 +110,6 @@ google-ai-generativelanguage==0.6.6 google-api-core==2.19.1 # via google-ai-generativelanguage # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage # via google-generativeai google-api-python-client==2.140.0 # via google-generativeai @@ -142,52 +118,21 @@ google-auth==2.33.0 # via google-api-core # via google-api-python-client # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-cloud-aiplatform==1.61.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.5 - # via google-cloud-aiplatform -google-cloud-storage==2.18.2 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media google-generativeai==0.7.2 # via langchain-google-genai -google-resumable-media==2.7.2 - # via google-cloud-bigquery - # via google-cloud-storage googleapis-common-protos==1.63.2 # via google-api-core - # via grpc-google-iam-v1 # via grpcio-status graphviz==0.20.3 # via burr - # via scrapegraphai greenlet==3.0.3 # via playwright # via sqlalchemy -groq==0.9.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager grpcio==1.65.4 # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 # via grpcio-status grpcio-status==1.62.3 # via google-api-core @@ -202,20 +147,12 @@ httplib2==0.22.0 # via google-api-python-client # via google-auth-httplib2 httpx==0.27.0 - # via anthropic - # via browserbase - # via fireworks-ai - # via groq # via langchain-mistralai # via openai httpx-sse==0.4.0 - # via fireworks-ai # via langchain-mistralai huggingface-hub==0.24.5 - # via langchain-huggingface - # via sentence-transformers # via tokenizers - # via transformers idna==3.7 # via anyio # via httpx @@ -236,15 +173,11 @@ jinja2==3.1.4 # via burr # via pydeck # via sphinx - # via torch jiter==0.5.0 - # via anthropic # via openai jmespath==1.0.1 # via boto3 # via botocore -joblib==1.4.2 - # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -258,40 +191,22 @@ kiwisolver==1.4.5 langchain==0.2.14 # via langchain-community # via scrapegraphai -langchain-anthropic==0.1.22 - # via scrapegraphai langchain-aws==0.1.16 # via scrapegraphai langchain-community==0.2.11 # via scrapegraphai langchain-core==0.2.33 # via langchain - # via langchain-anthropic # via langchain-aws # via langchain-community - # via langchain-fireworks # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-huggingface # via langchain-mistralai - # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters -langchain-fireworks==0.1.7 - # via scrapegraphai langchain-google-genai==1.0.8 # via scrapegraphai -langchain-google-vertexai==1.0.8 - # via scrapegraphai -langchain-groq==0.1.9 - # via scrapegraphai -langchain-huggingface==0.0.3 - # via scrapegraphai langchain-mistralai==0.1.12 # via scrapegraphai -langchain-nvidia-ai-endpoints==0.2.1 - # via scrapegraphai langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 @@ -320,8 +235,6 @@ minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk -mpmath==1.3.0 - # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -331,8 +244,6 @@ mypy-extensions==1.0.0 # via typing-inspect narwhals==1.3.0 # via altair -networkx==3.2.1 - # via torch numpy==1.26.4 # via contourpy # via faiss-cpu @@ -343,24 +254,16 @@ numpy==1.26.4 # via pandas # via pyarrow # via pydeck - # via scikit-learn - # via scipy - # via sentence-transformers # via sf-hamilton - # via shapely # via streamlit - # via transformers openai==1.40.3 # via burr - # via langchain-fireworks # via langchain-openai orjson==3.10.7 # via langsmith packaging==24.1 # via altair # via faiss-cpu - # via google-cloud-aiplatform - # via google-cloud-bigquery # via huggingface-hub # via langchain-core # via marshmallow @@ -368,21 +271,16 @@ packaging==24.1 # via pytest # via sphinx # via streamlit - # via transformers pandas==2.2.2 # via scrapegraphai # via sf-hamilton # via streamlit pillow==10.4.0 - # via fireworks-ai - # via langchain-nvidia-ai-endpoints # via matplotlib - # via sentence-transformers # via streamlit platformdirs==4.2.2 # via pylint playwright==1.45.1 - # via browserbase # via scrapegraphai # via undetected-playwright pluggy==1.5.0 @@ -390,16 +288,11 @@ pluggy==1.5.0 proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager protobuf==4.25.4 # via google-ai-generativelanguage # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager # via google-generativeai # via googleapis-common-protos - # via grpc-google-iam-v1 # via grpcio-status # via proto-plus # via streamlit @@ -411,15 +304,10 @@ pyasn1==0.6.0 pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 - # via anthropic - # via browserbase # via burr # via fastapi # via fastapi-pagination - # via fireworks-ai - # via google-cloud-aiplatform # via google-generativeai - # via groq # via langchain # via langchain-core # via langsmith @@ -444,7 +332,6 @@ pytest==8.0.0 pytest-mock==3.14.0 python-dateutil==2.9.0.post0 # via botocore - # via google-cloud-bigquery # via matplotlib # via pandas python-dotenv==1.0.1 @@ -456,28 +343,22 @@ pyyaml==6.0.2 # via langchain # via langchain-community # via langchain-core - # via transformers referencing==0.35.1 # via jsonschema # via jsonschema-specifications regex==2024.7.24 # via tiktoken - # via transformers requests==2.32.3 # via burr # via free-proxy # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage # via huggingface-hub # via langchain # via langchain-community - # via langchain-fireworks # via langsmith # via sphinx # via streamlit # via tiktoken - # via transformers rich==13.7.1 # via streamlit rpds-py==0.20.0 @@ -487,29 +368,16 @@ rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 -safetensors==0.4.4 - # via transformers -scikit-learn==1.5.1 - # via sentence-transformers -scipy==1.13.1 - # via scikit-learn - # via sentence-transformers semchunk==2.2.0 # via scrapegraphai -sentence-transformers==3.0.1 - # via langchain-huggingface sf-hamilton==1.73.1 # via burr -shapely==2.0.5 - # via google-cloud-aiplatform six==1.16.0 # via python-dateutil smmap==5.0.1 # via gitdb sniffio==1.3.1 - # via anthropic # via anyio - # via groq # via httpx # via openai snowballstemmer==2.2.0 @@ -541,23 +409,16 @@ starlette==0.37.2 # via fastapi streamlit==1.37.1 # via burr -sympy==1.13.2 - # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core # via streamlit -threadpoolctl==3.5.0 - # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 - # via anthropic - # via langchain-huggingface # via langchain-mistralai - # via transformers toml==0.10.2 # via streamlit tomli==2.0.1 @@ -565,8 +426,6 @@ tomli==2.0.1 # via pytest tomlkit==0.13.0 # via pylint -torch==2.2.2 - # via sentence-transformers tornado==6.4.1 # via streamlit tqdm==4.66.5 @@ -576,20 +435,13 @@ tqdm==4.66.5 # via openai # via scrapegraphai # via semchunk - # via sentence-transformers - # via transformers -transformers==4.44.0 - # via langchain-huggingface - # via sentence-transformers typing-extensions==4.12.2 # via altair - # via anthropic # via anyio # via astroid # via fastapi # via fastapi-pagination # via google-generativeai - # via groq # via huggingface-hub # via langchain-core # via openai @@ -601,7 +453,6 @@ typing-extensions==4.12.2 # via sqlalchemy # via starlette # via streamlit - # via torch # via typing-inspect # via uvicorn typing-inspect==0.9.0 diff --git a/requirements.lock b/requirements.lock index 1d80e1bf..f3cb5626 100644 --- a/requirements.lock +++ b/requirements.lock @@ -11,17 +11,11 @@ aiohttp==3.9.5 # via langchain # via langchain-community - # via langchain-fireworks - # via langchain-nvidia-ai-endpoints aiosignal==1.3.1 # via aiohttp annotated-types==0.7.0 # via pydantic -anthropic==0.31.2 - # via langchain-anthropic anyio==4.4.0 - # via anthropic - # via groq # via httpx # via openai async-timeout==4.0.3 @@ -37,8 +31,6 @@ boto3==1.34.146 botocore==1.34.146 # via boto3 # via s3transfer -browserbase==0.3.0 - # via scrapegraphai cachetools==5.4.0 # via google-auth certifi==2024.7.4 @@ -49,26 +41,16 @@ charset-normalizer==3.3.2 # via requests dataclasses-json==0.6.7 # via langchain-community -defusedxml==0.7.1 - # via langchain-anthropic dill==0.3.8 # via multiprocess distro==1.9.0 - # via anthropic - # via groq # via openai -docstring-parser==0.16 - # via google-cloud-aiplatform exceptiongroup==1.2.2 # via anyio faiss-cpu==1.8.0.post1 # via scrapegraphai filelock==3.15.4 # via huggingface-hub - # via torch - # via transformers -fireworks-ai==0.14.0 - # via langchain-fireworks free-proxy==1.1.1 # via scrapegraphai frozenlist==1.4.1 @@ -76,7 +58,6 @@ frozenlist==1.4.1 # via aiosignal fsspec==2024.6.1 # via huggingface-hub - # via torch google==3.0.0 # via scrapegraphai google-ai-generativelanguage==0.6.6 @@ -84,11 +65,6 @@ google-ai-generativelanguage==0.6.6 google-api-core==2.19.1 # via google-ai-generativelanguage # via google-api-python-client - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage # via google-generativeai google-api-python-client==2.137.0 # via google-generativeai @@ -97,51 +73,19 @@ google-auth==2.32.0 # via google-api-core # via google-api-python-client # via google-auth-httplib2 - # via google-cloud-aiplatform - # via google-cloud-bigquery - # via google-cloud-core - # via google-cloud-resource-manager - # via google-cloud-storage # via google-generativeai google-auth-httplib2==0.2.0 # via google-api-python-client -google-cloud-aiplatform==1.59.0 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via google-cloud-bigquery - # via google-cloud-storage -google-cloud-resource-manager==1.12.4 - # via google-cloud-aiplatform -google-cloud-storage==2.18.0 - # via google-cloud-aiplatform - # via langchain-google-vertexai -google-crc32c==1.5.0 - # via google-cloud-storage - # via google-resumable-media google-generativeai==0.7.2 # via langchain-google-genai -google-resumable-media==2.7.1 - # via google-cloud-bigquery - # via google-cloud-storage googleapis-common-protos==1.63.2 # via google-api-core - # via grpc-google-iam-v1 # via grpcio-status -graphviz==0.20.3 - # via scrapegraphai greenlet==3.0.3 # via playwright # via sqlalchemy -groq==0.9.0 - # via langchain-groq -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager grpcio==1.65.1 # via google-api-core - # via googleapis-common-protos - # via grpc-google-iam-v1 # via grpcio-status grpcio-status==1.62.2 # via google-api-core @@ -155,35 +99,22 @@ httplib2==0.22.0 # via google-api-python-client # via google-auth-httplib2 httpx==0.27.0 - # via anthropic - # via browserbase - # via fireworks-ai - # via groq # via langchain-mistralai # via openai httpx-sse==0.4.0 - # via fireworks-ai # via langchain-mistralai huggingface-hub==0.24.1 - # via langchain-huggingface - # via sentence-transformers # via tokenizers - # via transformers idna==3.7 # via anyio # via httpx # via requests # via yarl -jinja2==3.1.4 - # via torch jiter==0.5.0 - # via anthropic # via openai jmespath==1.0.1 # via boto3 # via botocore -joblib==1.4.2 - # via scikit-learn jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 @@ -191,40 +122,22 @@ jsonpointer==3.0.0 langchain==0.2.14 # via langchain-community # via scrapegraphai -langchain-anthropic==0.1.20 - # via scrapegraphai langchain-aws==0.1.12 # via scrapegraphai langchain-community==0.2.10 # via scrapegraphai langchain-core==0.2.33 # via langchain - # via langchain-anthropic # via langchain-aws # via langchain-community - # via langchain-fireworks # via langchain-google-genai - # via langchain-google-vertexai - # via langchain-groq - # via langchain-huggingface # via langchain-mistralai - # via langchain-nvidia-ai-endpoints # via langchain-openai # via langchain-text-splitters -langchain-fireworks==0.1.5 - # via scrapegraphai langchain-google-genai==1.0.8 # via scrapegraphai -langchain-google-vertexai==1.0.7 - # via scrapegraphai -langchain-groq==0.1.6 - # via scrapegraphai -langchain-huggingface==0.0.3 - # via scrapegraphai langchain-mistralai==0.1.12 # via scrapegraphai -langchain-nvidia-ai-endpoints==0.1.7 - # via scrapegraphai langchain-openai==0.1.22 # via scrapegraphai langchain-text-splitters==0.2.2 @@ -235,16 +148,12 @@ langsmith==0.1.93 # via langchain-core lxml==5.2.2 # via free-proxy -markupsafe==2.1.5 - # via jinja2 marshmallow==3.21.3 # via dataclasses-json minify-html==0.15.0 # via scrapegraphai mpire==2.10.2 # via semchunk -mpmath==1.3.0 - # via sympy multidict==6.0.5 # via aiohttp # via yarl @@ -252,55 +161,34 @@ multiprocess==0.70.16 # via mpire mypy-extensions==1.0.0 # via typing-inspect -networkx==3.2.1 - # via torch numpy==1.26.4 # via faiss-cpu # via langchain # via langchain-aws # via langchain-community # via pandas - # via scikit-learn - # via scipy - # via sentence-transformers - # via shapely - # via transformers openai==1.41.0 - # via langchain-fireworks # via langchain-openai orjson==3.10.6 # via langsmith packaging==24.1 # via faiss-cpu - # via google-cloud-aiplatform - # via google-cloud-bigquery # via huggingface-hub # via langchain-core # via marshmallow - # via transformers pandas==2.2.2 # via scrapegraphai -pillow==10.4.0 - # via fireworks-ai - # via langchain-nvidia-ai-endpoints - # via sentence-transformers playwright==1.45.1 - # via browserbase # via scrapegraphai # via undetected-playwright proto-plus==1.24.0 # via google-ai-generativelanguage # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager protobuf==4.25.3 # via google-ai-generativelanguage # via google-api-core - # via google-cloud-aiplatform - # via google-cloud-resource-manager # via google-generativeai # via googleapis-common-protos - # via grpc-google-iam-v1 # via grpcio-status # via proto-plus pyasn1==0.6.0 @@ -309,12 +197,7 @@ pyasn1==0.6.0 pyasn1-modules==0.4.0 # via google-auth pydantic==2.8.2 - # via anthropic - # via browserbase - # via fireworks-ai - # via google-cloud-aiplatform # via google-generativeai - # via groq # via langchain # via langchain-core # via langsmith @@ -329,7 +212,6 @@ pyparsing==3.1.2 # via httplib2 python-dateutil==2.9.0.post0 # via botocore - # via google-cloud-bigquery # via pandas python-dotenv==1.0.1 # via scrapegraphai @@ -340,45 +222,26 @@ pyyaml==6.0.1 # via langchain # via langchain-community # via langchain-core - # via transformers regex==2024.5.15 # via tiktoken - # via transformers requests==2.32.3 # via free-proxy # via google-api-core - # via google-cloud-bigquery - # via google-cloud-storage # via huggingface-hub # via langchain # via langchain-community - # via langchain-fireworks # via langsmith # via tiktoken - # via transformers rsa==4.9 # via google-auth s3transfer==0.10.2 # via boto3 -safetensors==0.4.3 - # via transformers -scikit-learn==1.5.1 - # via sentence-transformers -scipy==1.13.1 - # via scikit-learn - # via sentence-transformers semchunk==2.2.0 # via scrapegraphai -sentence-transformers==3.0.1 - # via langchain-huggingface -shapely==2.0.5 - # via google-cloud-aiplatform six==1.16.0 # via python-dateutil sniffio==1.3.1 - # via anthropic # via anyio - # via groq # via httpx # via openai soupsieve==2.5 @@ -386,24 +249,15 @@ soupsieve==2.5 sqlalchemy==2.0.31 # via langchain # via langchain-community -sympy==1.13.1 - # via torch tenacity==8.5.0 # via langchain # via langchain-community # via langchain-core -threadpoolctl==3.5.0 - # via scikit-learn tiktoken==0.7.0 # via langchain-openai # via scrapegraphai tokenizers==0.19.1 - # via anthropic - # via langchain-huggingface # via langchain-mistralai - # via transformers -torch==2.2.2 - # via sentence-transformers tqdm==4.66.4 # via google-generativeai # via huggingface-hub @@ -411,16 +265,9 @@ tqdm==4.66.4 # via openai # via scrapegraphai # via semchunk - # via sentence-transformers - # via transformers -transformers==4.43.3 - # via langchain-huggingface - # via sentence-transformers typing-extensions==4.12.2 - # via anthropic # via anyio # via google-generativeai - # via groq # via huggingface-hub # via langchain-core # via openai @@ -428,7 +275,6 @@ typing-extensions==4.12.2 # via pydantic-core # via pyee # via sqlalchemy - # via torch # via typing-inspect typing-inspect==0.9.0 # via dataclasses-json diff --git a/requirements.txt b/requirements.txt index 21c2fd3b..80cb0767 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,9 @@ langchain>=0.2.14 -langchain-fireworks>=0.1.3 -langchain_community>=0.2.9 langchain-google-genai>=1.0.7 -langchain-google-vertexai>=1.0.7 langchain-openai>=0.1.22 -langchain-groq>=0.1.3 -langchain-aws>=0.1.3 -langchain-anthropic>=0.1.11 langchain-mistralai>=0.1.12 -langchain-huggingface>=0.0.3 -langchain-nvidia-ai-endpoints>=0.1.6 +langchain_community>=0.2.9 +langchain-aws>=0.1.3 html2text>=2024.2.26 faiss-cpu>=1.8.0 beautifulsoup4>=4.12.3 @@ -17,11 +11,9 @@ pandas>=2.2.2 python-dotenv>=1.0.1 tiktoken>=0.7 tqdm>=4.66.4 -graphviz>=0.20.3 minify-html>=0.15.0 free-proxy>=1.1.1 playwright>=1.43.0 -google>=3.0.0 undetected-playwright>=0.3.0 +google>=3.0.0 semchunk>=1.0.1 -browserbase>=0.3.0 diff --git a/scrapegraphai/nodes/fetch_node.py b/scrapegraphai/nodes/fetch_node.py index 4119ee9a..f015278d 100644 --- a/scrapegraphai/nodes/fetch_node.py +++ b/scrapegraphai/nodes/fetch_node.py @@ -268,7 +268,10 @@ def handle_web_source(self, state, source): loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.browser_base is not None: - from ..docloaders.browser_base import browser_base_fetch + try: + from ..docloaders.browser_base import browser_base_fetch + except ImportError: + raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.") data = browser_base_fetch(self.browser_base.get("api_key"), self.browser_base.get("project_id"), [source]) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 970a6790..966a758f 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,16 +1,13 @@ """ GenerateAnswerNode Module """ +from sys import modules from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_mistralai import ChatMistralAI -from langchain_anthropic import ChatAnthropic -from langchain_groq import ChatGroq -from langchain_fireworks import ChatFireworks -from langchain_google_vertexai import ChatVertexAI from langchain_community.chat_models import ChatOllama from tqdm import tqdm from ..utils.logging import get_logger @@ -95,10 +92,18 @@ def execute(self, state: dict) -> dict: output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"]) # Use built-in structured output for providers that allow it - if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI, ChatAnthropic, ChatFireworks, ChatGroq, ChatVertexAI)): - self.llm_model = self.llm_model.with_structured_output( - schema = self.node_config["schema"], - method="json_schema") + optional_modules = {"langchain_anthropic", "langchain_fireworks", "langchain_groq", "langchain_google_vertexai"} + if all(key in modules for key in optional_modules): + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI, ChatAnthropic, ChatFireworks, ChatGroq, ChatVertexAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="json_schema") + else: + if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)): + self.llm_model = self.llm_model.with_structured_output( + schema = self.node_config["schema"], + method="json_schema") + else: output_parser = JsonOutputParser() diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py index 868044a0..974fa772 100644 --- a/scrapegraphai/nodes/rag_node.py +++ b/scrapegraphai/nodes/rag_node.py @@ -2,6 +2,7 @@ RAGNode Module """ import os +import sys from typing import List, Optional from langchain.docstore.document import Document from langchain.retrievers import ContextualCompressionRetriever @@ -13,18 +14,15 @@ from langchain_community.vectorstores import FAISS from langchain_community.chat_models import ChatOllama from langchain_aws import BedrockEmbeddings, ChatBedrock -from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings from langchain_community.embeddings import OllamaEmbeddings from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI -from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings -from langchain_fireworks import FireworksEmbeddings, ChatFireworks from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI -from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA from ..utils.logging import get_logger from .base_node import BaseNode from ..helpers import models_tokens from ..models import DeepSeek +optional_modules = {"langchain_anthropic", "langchain_fireworks", "langchain_groq", "langchain_google_vertexai"} class RAGNode(BaseNode): """ @@ -163,6 +161,7 @@ def _create_default_embedder(self, llm_config=None) -> object: Raises: ValueError: If the model is not supported. """ + if isinstance(self.llm_model, ChatGoogleGenerativeAI): return GoogleGenerativeAIEmbeddings( google_api_key=llm_config["api_key"], model="models/embedding-001" @@ -172,28 +171,28 @@ def _create_default_embedder(self, llm_config=None) -> object: base_url=self.llm_model.openai_api_base) elif isinstance(self.llm_model, DeepSeek): return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key) - elif isinstance(self.llm_model, ChatVertexAI): - return VertexAIEmbeddings() elif isinstance(self.llm_model, AzureOpenAIEmbeddings): return self.llm_model elif isinstance(self.llm_model, AzureChatOpenAI): return AzureOpenAIEmbeddings() - elif isinstance(self.llm_model, ChatFireworks): - return FireworksEmbeddings(model=self.llm_model.model_name) - elif isinstance(self.llm_model, ChatNVIDIA): - return NVIDIAEmbeddings(model=self.llm_model.model_name) elif isinstance(self.llm_model, ChatOllama): # unwrap the kwargs from the model whihc is a dict params = self.llm_model._lc_kwargs # remove streaming and temperature params.pop("streaming", None) params.pop("temperature", None) - return OllamaEmbeddings(**params) - elif isinstance(self.llm_model, ChatHuggingFace): - return HuggingFaceEmbeddings(model=self.llm_model.model) elif isinstance(self.llm_model, ChatBedrock): return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id) + elif all(key in sys.modules for key in optional_modules): + if isinstance(self.llm_model, ChatFireworks): + return FireworksEmbeddings(model=self.llm_model.model_name) + if isinstance(self.llm_model, ChatNVIDIA): + return NVIDIAEmbeddings(model=self.llm_model.model_name) + if isinstance(self.llm_model, ChatHuggingFace): + return HuggingFaceEmbeddings(model=self.llm_model.model) + if isinstance(self.llm_model, ChatVertexAI): + return VertexAIEmbeddings() else: raise ValueError("Embedding Model missing or not supported") @@ -218,14 +217,6 @@ def _create_embedder(self, embedder_config: dict) -> object: return OpenAIEmbeddings(api_key=embedder_params["api_key"]) if "azure" in embedder_params["model"]: return AzureOpenAIEmbeddings() - if "nvidia" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["nvidia"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return NVIDIAEmbeddings(model=embedder_params["model"], - nvidia_api_key=embedder_params["api_key"]) if "ollama" in embedder_params["model"]: embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) try: @@ -233,20 +224,6 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return OllamaEmbeddings(**embedder_params) - if "hugging_face" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["hugging_face"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return HuggingFaceEmbeddings(model=embedder_params["model"]) - if "fireworks" in embedder_params["model"]: - embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) - try: - models_tokens["fireworks"][embedder_params["model"]] - except KeyError as exc: - raise KeyError("Model not supported") from exc - return FireworksEmbeddings(model=embedder_params["model"]) if "gemini" in embedder_params["model"]: try: models_tokens["gemini"][embedder_params["model"]] @@ -261,5 +238,28 @@ def _create_embedder(self, embedder_config: dict) -> object: except KeyError as exc: raise KeyError("Model not supported") from exc return BedrockEmbeddings(client=client, model_id=embedder_params["model"]) + if all(key in sys.modules for key in optional_modules): + if "hugging_face" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["hugging_face"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return HuggingFaceEmbeddings(model=embedder_params["model"]) + if "fireworks" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["fireworks"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return FireworksEmbeddings(model=embedder_params["model"]) + if "nvidia" in embedder_params["model"]: + embedder_params["model"] = "/".join(embedder_params["model"].split("/")[1:]) + try: + models_tokens["nvidia"][embedder_params["model"]] + except KeyError as exc: + raise KeyError("Model not supported") from exc + return NVIDIAEmbeddings(model=embedder_params["model"], + nvidia_api_key=embedder_params["api_key"]) raise ValueError("Model provided by the configuration not supported") From 7789663338a89d27fde322ae282ce07ccca16845 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:17:51 +0200 Subject: [PATCH 3/3] fix(BurrBrige): dynamic imports --- scrapegraphai/integrations/burr_bridge.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scrapegraphai/integrations/burr_bridge.py b/scrapegraphai/integrations/burr_bridge.py index d1fe566f..e5eb3c6a 100644 --- a/scrapegraphai/integrations/burr_bridge.py +++ b/scrapegraphai/integrations/burr_bridge.py @@ -11,12 +11,13 @@ try: import burr + from burr import tracking + from burr.core import Application, ApplicationBuilder, State, Action, default, ApplicationContext + from burr.lifecycle import PostRunStepHook, PreRunStepHook except ImportError: raise ImportError("burr package is not installed. Please install it with 'pip install scrapegraphai[burr]'") -from burr import tracking -from burr.core import Application, ApplicationBuilder, State, Action, default, ApplicationContext -from burr.lifecycle import PostRunStepHook, PreRunStepHook + class PrintLnHook(PostRunStepHook, PreRunStepHook):