
Commit

Merge pull request #780 from ScrapeGraphAI/pre/beta

Pre/beta

VinciGit00 authored Nov 1, 2024
2 parents ea2ff50 + 7e3598d commit 9f0ba35
Showing 9 changed files with 348 additions and 259 deletions.
43 changes: 43 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,30 @@
## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31)


### Features

* update generate answer ([7172b32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7172b32a0f37f547edccab7bd09406e73c9ec5b2))

## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30)


### Features

* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952))
* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c))


### Bug Fixes

* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)


### CI

* **release:** 1.27.0-beta.11 [skip ci] ([3b2cadc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2))
* **release:** 1.27.0-beta.12 [skip ci] ([62369e3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62369e3e2886eb8cc09f6ef64865140a87a28b60))
* **release:** 1.27.0-beta.13 [skip ci] ([deed355](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/deed355551d01d92dde11f8c0b373bdd43f8b8cf)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26)


@@ -13,6 +40,7 @@
* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887))



### Bug Fixes

* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422))
@@ -44,6 +72,21 @@
* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)


### Features

* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c))

## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27)


### Features

* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952))

## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25)

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,7 +1,8 @@
@@ -1,7 +1,8 @@
[project]
name = "scrapegraphai"

-version = "1.27.0"
+version = "1.28.0b2"
Expand Down
15 changes: 9 additions & 6 deletions scrapegraphai/graphs/abstract_graph.py
@@ -152,12 +152,15 @@ def _create_llm(self, llm_config: dict) -> object:
raise ValueError(f"""Provider {llm_params['model_provider']} is not supported.
If possible, try to use a model instance instead.""")

-        try:
-            self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
-        except KeyError:
-            print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found,
-                  using default token size (8192)""")
-            self.model_token = 8192
+        if "model_tokens" not in llm_params:
+            try:
+                self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
+            except KeyError:
+                print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found,
+                      using default token size (8192)""")
+                self.model_token = 8192
+        else:
+            self.model_token = llm_params["model_tokens"]

try:
if llm_params["model_provider"] not in \
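The hunk above changes how `AbstractGraph` picks a context size: an explicit `model_tokens` entry in the llm config now wins over the `models_tokens` table lookup. A minimal standalone sketch of that resolution order (the one-entry table below is a stand-in for `scrapegraphai.helpers.models_tokens`, not the real data):

```python
# Stand-in for scrapegraphai.helpers.models_tokens (real table is much larger).
models_tokens = {"openai": {"gpt-4o-mini": 128000}}

def resolve_model_token(llm_params: dict, default: int = 8192) -> int:
    """Pick the context size: explicit override, then table lookup, then default."""
    if "model_tokens" in llm_params:
        # New behaviour in this diff: user-supplied value takes precedence.
        return llm_params["model_tokens"]
    try:
        return models_tokens[llm_params["model_provider"]][llm_params["model"]]
    except KeyError:
        # Unknown model: fall back to the 8192 default used by AbstractGraph.
        return default

print(resolve_model_token({"model_provider": "openai", "model": "gpt-4o-mini"}))   # 128000
print(resolve_model_token({"model_provider": "openai", "model": "my-finetune",
                           "model_tokens": 32000}))                                # 32000
print(resolve_model_token({"model_provider": "openai", "model": "unknown"}))       # 8192
```

This is what closes #768: custom or fine-tuned models absent from the table no longer force the 8192 default.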
254 changes: 149 additions & 105 deletions scrapegraphai/graphs/base_graph.py
@@ -98,21 +98,116 @@ def _set_conditional_node_edges(self):
            except:
                node.false_node_name = None

-    def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
-        """
-        Executes the graph by traversing nodes starting from the
-        entry point using the standard method.
-
-        Args:
-            initial_state (dict): The initial state to pass to the entry point node.
-
-        Returns:
-            Tuple[dict, list]: A tuple containing the final state and a list of execution info.
-        """
+    def _get_node_by_name(self, node_name: str):
+        """Returns a node instance by its name."""
+        return next(node for node in self.nodes if node.node_name == node_name)
+
+    def _update_source_info(self, current_node, state):
+        """Updates source type and source information from FetchNode."""
+        source_type = None
+        source = []
+        prompt = None
+
+        if current_node.__class__.__name__ == "FetchNode":
+            source_type = list(state.keys())[1]
+            if state.get("user_prompt", None):
+                prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None
+
+            if source_type == "local_dir":
+                source_type = "html_dir"
+            elif source_type == "url":
+                if isinstance(state[source_type], list):
+                    source.extend(url for url in state[source_type] if isinstance(url, str))
+                elif isinstance(state[source_type], str):
+                    source.append(state[source_type])
+
+        return source_type, source, prompt
+
+    def _get_model_info(self, current_node):
+        """Extracts LLM and embedder model information from the node."""
+        llm_model = None
+        llm_model_name = None
+        embedder_model = None
+
+        if hasattr(current_node, "llm_model"):
+            llm_model = current_node.llm_model
+            if hasattr(llm_model, "model_name"):
+                llm_model_name = llm_model.model_name
+            elif hasattr(llm_model, "model"):
+                llm_model_name = llm_model.model
+            elif hasattr(llm_model, "model_id"):
+                llm_model_name = llm_model.model_id
+
+        if hasattr(current_node, "embedder_model"):
+            embedder_model = current_node.embedder_model
+            if hasattr(embedder_model, "model_name"):
+                embedder_model = embedder_model.model_name
+            elif hasattr(embedder_model, "model"):
+                embedder_model = embedder_model.model
+
+        return llm_model, llm_model_name, embedder_model
+
+    def _get_schema(self, current_node):
+        """Extracts schema information from the node configuration."""
+        if not hasattr(current_node, "node_config"):
+            return None
+
+        if not isinstance(current_node.node_config, dict):
+            return None
+
+        schema_config = current_node.node_config.get("schema")
+        if not schema_config or isinstance(schema_config, dict):
+            return None
+
+        try:
+            return schema_config.schema()
+        except Exception:
+            return None
+
+    def _execute_node(self, current_node, state, llm_model, llm_model_name):
+        """Executes a single node and returns execution information."""
+        curr_time = time.time()
+
+        with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
+            result = current_node.execute(state)
+            node_exec_time = time.time() - curr_time
+
+            cb_data = None
+            if cb is not None:
+                cb_data = {
+                    "node_name": current_node.node_name,
+                    "total_tokens": cb.total_tokens,
+                    "prompt_tokens": cb.prompt_tokens,
+                    "completion_tokens": cb.completion_tokens,
+                    "successful_requests": cb.successful_requests,
+                    "total_cost_USD": cb.total_cost,
+                    "exec_time": node_exec_time,
+                }
+
+        return result, node_exec_time, cb_data
+
+    def _get_next_node(self, current_node, result):
+        """Determines the next node to execute based on current node type and result."""
+        if current_node.node_type == "conditional_node":
+            node_names = {node.node_name for node in self.nodes}
+            if result in node_names:
+                return result
+            elif result is None:
+                return None
+            raise ValueError(
+                f"Conditional Node returned a node name '{result}' that does not exist in the graph"
+            )
+
+        return self.edges.get(current_node.node_name)
+
+    def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
+        """
+        Executes the graph by traversing nodes starting from the entry point using the standard method.
+        """
        current_node_name = self.entry_point
        state = initial_state

-        # variables for tracking execution info
+        # Tracking variables
        total_exec_time = 0.0
        exec_info = []
        cb_total = {
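The refactor extracts the traversal's branching logic into `_get_next_node`: a conditional node's result picks the next node (or ends the run), while ordinary nodes follow the edge map. A standalone sketch of that dispatch, using stub objects in place of ScrapeGraphAI's node classes:

```python
from types import SimpleNamespace

class MiniGraph:
    """Stub graph carrying just enough state to exercise _get_next_node."""
    def __init__(self, nodes, edges):
        self.nodes = nodes    # objects with .node_name / .node_type
        self.edges = edges    # mapping: node_name -> next node_name

    def _get_next_node(self, current_node, result):
        if current_node.node_type == "conditional_node":
            node_names = {node.node_name for node in self.nodes}
            if result in node_names:
                return result          # branch chosen by the conditional node
            elif result is None:
                return None            # conditional node ends the traversal
            raise ValueError(
                f"Conditional Node returned a node name '{result}' that does not exist in the graph"
            )
        return self.edges.get(current_node.node_name)  # linear edge, or None at the end

fetch = SimpleNamespace(node_name="fetch", node_type="node")
cond = SimpleNamespace(node_name="cond", node_type="conditional_node")
answer = SimpleNamespace(node_name="answer", node_type="node")

g = MiniGraph([fetch, cond, answer], {"fetch": "cond"})
print(g._get_next_node(fetch, result=None))     # cond   (follows the edge map)
print(g._get_next_node(cond, result="answer"))  # answer (valid branch target)
print(g._get_next_node(cond, result=None))      # None   (graph terminates)
```

Note `self.edges.get(...)` also replaces the old `elif current_node_name in self.edges: ... else: None` pair with a single lookup that returns `None` at the last node.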
@@ -134,104 +229,51 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
        schema = None

        while current_node_name:
-            curr_time = time.time()
-            current_node = next(node for node in self.nodes if node.node_name == current_node_name)
-
-            if current_node.__class__.__name__ == "FetchNode":
-                source_type = list(state.keys())[1]
-                if state.get("user_prompt", None):
-                    prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None
-
-                if source_type == "local_dir":
-                    source_type = "html_dir"
-                elif source_type == "url":
-                    if isinstance(state[source_type], list):
-                        for url in state[source_type]:
-                            if isinstance(url, str):
-                                source.append(url)
-                    elif isinstance(state[source_type], str):
-                        source.append(state[source_type])
-
-            if hasattr(current_node, "llm_model") and llm_model is None:
-                llm_model = current_node.llm_model
-                if hasattr(llm_model, "model_name"):
-                    llm_model_name = llm_model.model_name
-                elif hasattr(llm_model, "model"):
-                    llm_model_name = llm_model.model
-                elif hasattr(llm_model, "model_id"):
-                    llm_model_name = llm_model.model_id
-
-            if hasattr(current_node, "embedder_model") and embedder_model is None:
-                embedder_model = current_node.embedder_model
-                if hasattr(embedder_model, "model_name"):
-                    embedder_model = embedder_model.model_name
-                elif hasattr(embedder_model, "model"):
-                    embedder_model = embedder_model.model
-
-            if hasattr(current_node, "node_config"):
-                if isinstance(current_node.node_config, dict):
-                    if current_node.node_config.get("schema", None) and schema is None:
-                        if not isinstance(current_node.node_config["schema"], dict):
-                            try:
-                                schema = current_node.node_config["schema"].schema()
-                            except Exception as e:
-                                schema = None
-
-            with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
-                try:
-                    result = current_node.execute(state)
-                except Exception as e:
-                    error_node = current_node.node_name
-                    graph_execution_time = time.time() - start_time
-                    log_graph_execution(
-                        graph_name=self.graph_name,
-                        source=source,
-                        prompt=prompt,
-                        schema=schema,
-                        llm_model=llm_model_name,
-                        embedder_model=embedder_model,
-                        source_type=source_type,
-                        execution_time=graph_execution_time,
-                        error_node=error_node,
-                        exception=str(e)
-                    )
-                    raise e
-            node_exec_time = time.time() - curr_time
-
-            if cb is not None:
-                cb_data = {
-                    "node_name": current_node.node_name,
-                    "total_tokens": cb.total_tokens,
-                    "prompt_tokens": cb.prompt_tokens,
-                    "completion_tokens": cb.completion_tokens,
-                    "successful_requests": cb.successful_requests,
-                    "total_cost_USD": cb.total_cost,
-                    "exec_time": node_exec_time,
-                }
-
-                exec_info.append(cb_data)
-
-                cb_total["total_tokens"] += cb_data["total_tokens"]
-                cb_total["prompt_tokens"] += cb_data["prompt_tokens"]
-                cb_total["completion_tokens"] += cb_data["completion_tokens"]
-                cb_total["successful_requests"] += cb_data["successful_requests"]
-                cb_total["total_cost_USD"] += cb_data["total_cost_USD"]
-
-            if current_node.node_type == "conditional_node":
-                node_names = {node.node_name for node in self.nodes}
-                if result in node_names:
-                    current_node_name = result
-                elif result is None:
-                    current_node_name = None
-                else:
-                    raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph")
-
-            elif current_node_name in self.edges:
-                current_node_name = self.edges[current_node_name]
-            else:
-                current_node_name = None
+            current_node = self._get_node_by_name(current_node_name)
+
+            # Update source information if needed
+            if source_type is None:
+                source_type, source, prompt = self._update_source_info(current_node, state)
+
+            # Get model information if needed
+            if llm_model is None:
+                llm_model, llm_model_name, embedder_model = self._get_model_info(current_node)
+
+            # Get schema if needed
+            if schema is None:
+                schema = self._get_schema(current_node)
+
+            try:
+                result, node_exec_time, cb_data = self._execute_node(
+                    current_node, state, llm_model, llm_model_name
+                )
+                total_exec_time += node_exec_time
+
+                if cb_data:
+                    exec_info.append(cb_data)
+                    for key in cb_total:
+                        cb_total[key] += cb_data[key]
+
+                current_node_name = self._get_next_node(current_node, result)
+
+            except Exception as e:
+                error_node = current_node.node_name
+                graph_execution_time = time.time() - start_time
+                log_graph_execution(
+                    graph_name=self.graph_name,
+                    source=source,
+                    prompt=prompt,
+                    schema=schema,
+                    llm_model=llm_model_name,
+                    embedder_model=embedder_model,
+                    source_type=source_type,
+                    execution_time=graph_execution_time,
+                    error_node=error_node,
+                    exception=str(e)
+                )
+                raise e

# Add total results to execution info
exec_info.append({
"node_name": "TOTAL RESULT",
"total_tokens": cb_total["total_tokens"],
@@ -242,6 +284,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
"exec_time": total_exec_time,
})

# Log final execution results
graph_execution_time = time.time() - start_time
response = state.get("answer", None) if source_type == "url" else None
content = state.get("parsed_doc", None) if response is not None else None
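The refactor also compacts the telemetry bookkeeping: per-node callback records (`cb_data`) are summed key-by-key into `cb_total` with a single loop, and a `TOTAL RESULT` row is appended to the execution info. A self-contained sketch of that accumulation, with invented numbers:

```python
# Keys mirror the cb_total dict in _execute_standard; node records are made up.
cb_total = {
    "total_tokens": 0,
    "prompt_tokens": 0,
    "completion_tokens": 0,
    "successful_requests": 0,
    "total_cost_USD": 0.0,
}

node_records = [
    {"node_name": "fetch", "total_tokens": 0, "prompt_tokens": 0,
     "completion_tokens": 0, "successful_requests": 0, "total_cost_USD": 0.0,
     "exec_time": 0.4},
    {"node_name": "generate_answer", "total_tokens": 1500, "prompt_tokens": 1200,
     "completion_tokens": 300, "successful_requests": 1, "total_cost_USD": 0.002,
     "exec_time": 2.1},
]

exec_info = []
total_exec_time = 0.0
for cb_data in node_records:
    exec_info.append(cb_data)
    total_exec_time += cb_data["exec_time"]
    for key in cb_total:                  # the compact accumulation loop from the diff
        cb_total[key] += cb_data[key]

# Final summary row, as appended at the end of _execute_standard.
exec_info.append({"node_name": "TOTAL RESULT", **cb_total, "exec_time": total_exec_time})
print(exec_info[-1]["total_tokens"])      # 1500
```

The loop relies on `cb_total`'s keys being a subset of every `cb_data` record's keys, which `_execute_node` guarantees by always building the full dict.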
@@ -300,3 +343,4 @@ def append_node(self, node):
self.raw_edges.append((last_node, node))
self.nodes.append(node)
self.edges = self._create_edges({e for e in self.raw_edges})

4 changes: 3 additions & 1 deletion scrapegraphai/helpers/models_tokens.py
@@ -80,10 +80,12 @@
    "llama3.2:1b": 128000,
    "scrapegraph": 8192,
    "mistral": 8192,
+    "mistral-small": 128000,
+    "mistral-openorca": 32000,
+    "mistral-large": 128000,
    "grok-1": 8192,
    "llava": 4096,
    "mixtral:8x22b-instruct": 65536,
-    "mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
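The `models_tokens` additions register context-window sizes for the new mistral models ("add new mistral models" in the changelog). One way such an entry gets used downstream is to size text chunks against the model's token budget; the sketch below is an illustrative, assumed heuristic (4 chars per token, a reserved margin, and the `"ollama"` provider key), not the library's actual chunking code:

```python
# Tiny stand-in for the updated models_tokens table (provider key assumed).
models_tokens = {"ollama": {"mistral-small": 128000, "mistral-large": 128000}}

def chunk_for_model(text: str, provider: str, model: str,
                    chars_per_token: int = 4, reserve: int = 2000) -> list:
    """Split text into chunks that fit under the model's context window."""
    budget_tokens = models_tokens[provider][model] - reserve  # leave room for prompt/answer
    max_chars = budget_tokens * chars_per_token               # rough chars-per-token estimate
    return [text[i:i + max_chars] for i in range(0, len(text), max_chars)]

doc = "x" * 1_000_000
chunks = chunk_for_model(doc, "ollama", "mistral-small")
print(len(chunks))   # 2 chunks: (128000 - 2000) * 4 = 504000 chars each at most
```

A model missing from the table would raise `KeyError` here; in the library that case falls back to the 8192 default (or the new `model_tokens` override) as shown in the abstract_graph.py hunk above.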