From 691417089014b5b0b64a1b26687cbb0cba693952 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 26 Oct 2024 16:07:43 +0200 Subject: [PATCH 01/10] feat: add new mistral models --- scrapegraphai/helpers/models_tokens.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 113d1636..8f367ceb 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -80,10 +80,12 @@ "llama3.2:1b": 128000, "scrapegraph": 8192, "mistral": 8192, + "mistral-small": 128000, + "mistral-openorca": 32000, + "mistral-large": 128000, "grok-1": 8192, "llava": 4096, "mixtral:8x22b-instruct": 65536, - "mistral-openorca": 32000, "nomic-embed-text": 8192, "nous-hermes2:34b": 4096, "orca-mini": 2048, From 3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Sun, 27 Oct 2024 09:23:55 +0000 Subject: [PATCH 02/10] ci(release): 1.27.0-beta.11 [skip ci] ## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27) ### Features * add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58aba1fb..4632cf2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27) + + +### Features + +* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) + ## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25) diff --git a/pyproject.toml b/pyproject.toml index be705469..26de4336 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b10" +version = "1.27.0b11" From 12a6c18f6ac205b744d1de92e217cfc2dfc3486c Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 28 Oct 2024 09:58:03 +0100 Subject: [PATCH 03/10] feat: refactoring of the base_graph --- scrapegraphai/graphs/base_graph.py | 254 +++++++++++++++++------------ 1 file changed, 149 insertions(+), 105 deletions(-) diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 74135108..18a16ba3 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -98,21 +98,116 @@ def _set_conditional_node_edges(self): except: node.false_node_name = None - def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: - """ - Executes the graph by traversing nodes starting from the - entry point using the standard method. + def _get_node_by_name(self, node_name: str): + """Returns a node instance by its name.""" + return next(node for node in self.nodes if node.node_name == node_name) - Args: - initial_state (dict): The initial state to pass to the entry point node. + def _update_source_info(self, current_node, state): + """Updates source type and source information from FetchNode.""" + source_type = None + source = [] + prompt = None + + if current_node.__class__.__name__ == "FetchNode": + source_type = list(state.keys())[1] + if state.get("user_prompt", None): + prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None + + if source_type == "local_dir": + source_type = "html_dir" + elif source_type == "url": + if isinstance(state[source_type], list): + source.extend(url for url in state[source_type] if isinstance(url, str)) + elif isinstance(state[source_type], str): + source.append(state[source_type]) + + return source_type, source, prompt + + def _get_model_info(self, current_node): + """Extracts LLM and embedder model information from the node.""" + llm_model = None + llm_model_name = None + embedder_model = None - Returns: - Tuple[dict, list]: A tuple containing the final state and a list of execution info. + if hasattr(current_node, "llm_model"): + llm_model = current_node.llm_model + if hasattr(llm_model, "model_name"): + llm_model_name = llm_model.model_name + elif hasattr(llm_model, "model"): + llm_model_name = llm_model.model + elif hasattr(llm_model, "model_id"): + llm_model_name = llm_model.model_id + + if hasattr(current_node, "embedder_model"): + embedder_model = current_node.embedder_model + if hasattr(embedder_model, "model_name"): + embedder_model = embedder_model.model_name + elif hasattr(embedder_model, "model"): + embedder_model = embedder_model.model + + return llm_model, llm_model_name, embedder_model + + def _get_schema(self, current_node): + """Extracts schema information from the node configuration.""" + if not hasattr(current_node, "node_config"): + return None + + if not isinstance(current_node.node_config, dict): + return None + + schema_config = current_node.node_config.get("schema") + if not schema_config or isinstance(schema_config, dict): + return None + + try: + return schema_config.schema() + except Exception: + return None + + def _execute_node(self, current_node, state, llm_model, llm_model_name): + """Executes a single node and returns execution information.""" + curr_time = time.time() + + with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb: + result = current_node.execute(state) + node_exec_time = time.time() - curr_time + + cb_data = None + if cb is not None: + cb_data = { + "node_name": current_node.node_name, + "total_tokens": cb.total_tokens, + "prompt_tokens": cb.prompt_tokens, + "completion_tokens": cb.completion_tokens, + "successful_requests": cb.successful_requests, + "total_cost_USD": cb.total_cost, + "exec_time": node_exec_time, + } + + return result, node_exec_time, cb_data + + def _get_next_node(self, current_node, result): + """Determines the next node to execute based on current node type and result.""" + if current_node.node_type == "conditional_node": + node_names = {node.node_name for node in self.nodes} + if result in node_names: + return result + elif result is None: + return None + raise ValueError( + f"Conditional Node returned a node name '{result}' that does not exist in the graph" + ) + + return self.edges.get(current_node.node_name) + + def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: + """ + Executes the graph by traversing nodes starting from the entry point using the standard method. """ current_node_name = self.entry_point state = initial_state - - # variables for tracking execution info + + # Tracking variables total_exec_time = 0.0 exec_info = [] cb_total = { @@ -134,104 +229,51 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: schema = None while current_node_name: - curr_time = time.time() - current_node = next(node for node in self.nodes if node.node_name == current_node_name) - - if current_node.__class__.__name__ == "FetchNode": - source_type = list(state.keys())[1] - if state.get("user_prompt", None): - prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None - - if source_type == "local_dir": - source_type = "html_dir" - elif source_type == "url": - if isinstance(state[source_type], list): - for url in state[source_type]: - if isinstance(url, str): - source.append(url) - elif isinstance(state[source_type], str): - source.append(state[source_type]) - - if hasattr(current_node, "llm_model") and llm_model is None: - llm_model = current_node.llm_model - if hasattr(llm_model, "model_name"): - llm_model_name = llm_model.model_name - elif hasattr(llm_model, "model"): - llm_model_name = llm_model.model - elif hasattr(llm_model, "model_id"): - llm_model_name = llm_model.model_id - - if hasattr(current_node, "embedder_model") and embedder_model is None: - embedder_model = current_node.embedder_model - if hasattr(embedder_model, "model_name"): - embedder_model = embedder_model.model_name - elif hasattr(embedder_model, "model"): - embedder_model = embedder_model.model - - if hasattr(current_node, "node_config"): - if isinstance(current_node.node_config,dict): - if current_node.node_config.get("schema", None) and schema is None: - if not isinstance(current_node.node_config["schema"], dict): - try: - schema = current_node.node_config["schema"].schema() - except Exception as e: - schema = None - - with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb: - try: - result = current_node.execute(state) - except Exception as e: - error_node = current_node.node_name - graph_execution_time = time.time() - start_time - log_graph_execution( - graph_name=self.graph_name, - source=source, - prompt=prompt, - schema=schema, - llm_model=llm_model_name, - embedder_model=embedder_model, - source_type=source_type, - execution_time=graph_execution_time, - error_node=error_node, - exception=str(e) - ) - raise e - node_exec_time = time.time() - curr_time + current_node = self._get_node_by_name(current_node_name) + + # Update source information if needed + if source_type is None: + source_type, source, prompt = self._update_source_info(current_node, state) + + # Get model information if needed + if llm_model is None: + llm_model, llm_model_name, embedder_model = self._get_model_info(current_node) + + # Get schema if needed + if schema is None: + schema = self._get_schema(current_node) + + try: + result, node_exec_time, cb_data = self._execute_node( + current_node, state, llm_model, llm_model_name + ) total_exec_time += node_exec_time - if cb is not None: - cb_data = { - "node_name": current_node.node_name, - "total_tokens": cb.total_tokens, - "prompt_tokens": cb.prompt_tokens, - "completion_tokens": cb.completion_tokens, - "successful_requests": cb.successful_requests, - "total_cost_USD": cb.total_cost, - "exec_time": node_exec_time, - } - + if cb_data: exec_info.append(cb_data) - - cb_total["total_tokens"] += cb_data["total_tokens"] - cb_total["prompt_tokens"] += cb_data["prompt_tokens"] - cb_total["completion_tokens"] += cb_data["completion_tokens"] - cb_total["successful_requests"] += cb_data["successful_requests"] - cb_total["total_cost_USD"] += cb_data["total_cost_USD"] - - if current_node.node_type == "conditional_node": - node_names = {node.node_name for node in self.nodes} - if result in node_names: - current_node_name = result - elif result is None: - current_node_name = None - else: - raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph") - - elif current_node_name in self.edges: - current_node_name = self.edges[current_node_name] - else: - current_node_name = None - + for key in cb_total: + cb_total[key] += cb_data[key] + + current_node_name = self._get_next_node(current_node, result) + + except Exception as e: + error_node = current_node.node_name + graph_execution_time = time.time() - start_time + log_graph_execution( + graph_name=self.graph_name, + source=source, + prompt=prompt, + schema=schema, + llm_model=llm_model_name, + embedder_model=embedder_model, + source_type=source_type, + execution_time=graph_execution_time, + error_node=error_node, + exception=str(e) + ) + raise e + + # Add total results to execution info exec_info.append({ "node_name": "TOTAL RESULT", "total_tokens": cb_total["total_tokens"], @@ -242,6 +284,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: "exec_time": total_exec_time, }) + # Log final execution results graph_execution_time = time.time() - start_time response = state.get("answer", None) if source_type == "url" else None content = state.get("parsed_doc", None) if response is not None else None @@ -300,3 +343,4 @@ def append_node(self, node): self.raw_edges.append((last_node, node)) self.nodes.append(node) self.edges = self._create_edges({e for e in self.raw_edges}) + From 62369e3e2886eb8cc09f6ef64865140a87a28b60 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Mon, 28 Oct 2024 08:59:32 +0000 Subject: [PATCH 04/10] ci(release): 1.27.0-beta.12 [skip ci] ## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28) ### Features * refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4632cf2c..3b60dc63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28) + + +### Features + +* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) + ## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27) diff --git a/pyproject.toml b/pyproject.toml index 26de4336..adb44df9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b11" +version = "1.27.0b12" From f79f399ee0d660f162e0cb96d9faba48ecdc88b2 Mon Sep 17 00:00:00 2001 From: Federico Aguzzi <62149513+f-aguzzi@users.noreply.github.com> Date: Mon, 28 Oct 2024 14:51:39 +0100 Subject: [PATCH 05/10] fix(AbstractGraph): manually select model tokens closes #768 --- scrapegraphai/graphs/abstract_graph.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index c693400c..bd8244f0 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -152,12 +152,15 @@ def _create_llm(self, llm_config: dict) -> object: raise ValueError(f"""Provider {llm_params['model_provider']} is not supported. If possible, try to use a model instance instead.""") - try: - self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]] - except KeyError: - print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found, - using default token size (8192)""") - self.model_token = 8192 + if "model_tokens" not in llm_params: + try: + self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]] + except KeyError: + print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found, + using default token size (8192)""") + self.model_token = 8192 + else: + self.model_token = llm_params["model_tokens"] try: if llm_params["model_provider"] not in \ From 827f7260ad3c586ae34db728f00a758808d45e4e Mon Sep 17 00:00:00 2001 From: Umut CAN <78921017+C1N-S4@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:40:32 +0300 Subject: [PATCH 06/10] This commit focuses on optimizing the utility modules in the codebase for better performance and maintainability. Key improvements include: - More efficient HTML processing with combined regex operations and optimized tag handling - Enhanced deep copy functionality with better type handling and optimized recursion - Refactored web search with improved error handling and modular helper functions The changes maintain all existing functionality while improving code quality, performance, and maintainability. Documentation and type hints have been enhanced throughout. Optimize utils modules for better performance and maintainability - Improve HTML cleanup and minification: - Combine regex operations for better performance - Add better error handling for HTML processing - Optimize tag removal and attribute filtering - Enhance deep copy functionality: - Add special case handling for primitive types - Improve type checking and error handling - Optimize recursive copying for collections - Refactor web search functionality: - Add input validation and error handling - Split search logic into separate helper functions - Improve proxy handling and configuration - Add better timeout and error management - Optimize URL filtering and processing Technical improvements: - Better type hints and documentation - More efficient data structures - Improved error handling and validation - Reduced code duplication - Better separation of concerns No breaking changes - all existing functionality maintained --- scrapegraphai/utils/cleanup_html.py | 19 +-- scrapegraphai/utils/copy.py | 74 +++++------- scrapegraphai/utils/research_web.py | 178 ++++++++++++++-------------- 3 files changed, 127 insertions(+), 144 deletions(-) diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 2ec3b140..9b00f61c 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -60,13 +60,18 @@ def minify_html(html): """ minify_html function """ - html = re.sub(r'', '', html, flags=re.DOTALL) - - html = re.sub(r'>\s+<', '><', html) - html = re.sub(r'\s+>', '>', html) - html = re.sub(r'<\s+', '<', html) - html = re.sub(r'\s+', ' ', html) - html = re.sub(r'\s*=\s*', '=', html) + # Combine multiple regex operations into one for better performance + patterns = [ + (r'', '', re.DOTALL), + (r'>\s+<', '><', 0), + (r'\s+>', '>', 0), + (r'<\s+', '<', 0), + (r'\s+', ' ', 0), + (r'\s*=\s*', '=', 0) + ] + + for pattern, repl, flags in patterns: + html = re.sub(pattern, repl, html, flags=flags) return html.strip() diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py index a35370ab..2ec7cee2 100644 --- a/scrapegraphai/utils/copy.py +++ b/scrapegraphai/utils/copy.py @@ -30,56 +30,38 @@ def is_boto3_client(obj): def safe_deepcopy(obj: Any) -> Any: """ - Attempts to create a deep copy of the object using `copy.deepcopy` - whenever possible. If that fails, it falls back to custom deep copy - logic. If that also fails, it raises a `DeepCopyError`. - + Safely create a deep copy of an object, handling special cases. + Args: - obj (Any): The object to be copied, which can be of any type. - + obj: Object to copy + Returns: - Any: A deep copy of the object if possible; otherwise, a shallow - copy if deep copying fails; if neither is possible, the original - object is returned. + Deep copy of the object + Raises: - DeepCopyError: If the object cannot be deep-copied or shallow-copied. + DeepCopyError: If object cannot be deep copied """ - try: - - return copy.deepcopy(obj) - except (TypeError, AttributeError) as e: - + # Handle special cases first + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + + if isinstance(obj, (list, set)): + return type(obj)(safe_deepcopy(v) for v in obj) + if isinstance(obj, dict): - new_obj = {} - - for k, v in obj.items(): - new_obj[k] = safe_deepcopy(v) - return new_obj - - elif isinstance(obj, list): - new_obj = [] - - for v in obj: - new_obj.append(safe_deepcopy(v)) - return new_obj - - elif isinstance(obj, tuple): - new_obj = tuple(safe_deepcopy(v) for v in obj) - - return new_obj - - elif isinstance(obj, frozenset): - new_obj = frozenset(safe_deepcopy(v) for v in obj) - return new_obj - - elif is_boto3_client(obj): + return {k: safe_deepcopy(v) for k, v in obj.items()} + + if isinstance(obj, tuple): + return tuple(safe_deepcopy(v) for v in obj) + + if isinstance(obj, frozenset): + return frozenset(safe_deepcopy(v) for v in obj) + + if is_boto3_client(obj): return obj - - else: - try: - return copy.copy(obj) - except (TypeError, AttributeError): - raise DeepCopyError( - f"Cannot deep copy the object of type {type(obj)}" - ) from e + + return copy.copy(obj) + + except Exception as e: + raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index af351ad4..86f9f5f3 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -9,101 +9,97 @@ from bs4 import BeautifulSoup def search_on_web(query: str, search_engine: str = "Google", - max_results: int = 10, port: int = 8080, + max_results: int = 10, port: int = 8080, timeout: int = 10, proxy: str | dict = None) -> List[str]: + """Search web function with improved error handling and validation""" + + # Input validation + if not query or not isinstance(query, str): + raise ValueError("Query must be a non-empty string") + + search_engine = search_engine.lower() + valid_engines = {"google", "duckduckgo", "bing", "searxng"} + if search_engine not in valid_engines: + raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") + + # Format proxy once + formatted_proxy = None + if proxy: + formatted_proxy = format_proxy(proxy) + + try: + results = [] + if search_engine == "google": + results = list(google_search(query, num_results=max_results, proxy=formatted_proxy)) + + elif search_engine == "duckduckgo": + research = DuckDuckGoSearchResults(max_results=max_results) + res = research.run(query) + results = re.findall(r'https?://[^\s,\]]+', res) + + elif search_engine == "bing": + results = _search_bing(query, max_results, timeout, formatted_proxy) + + elif search_engine == "searxng": + results = _search_searxng(query, max_results, port, timeout) + + return filter_pdf_links(results) + + except requests.Timeout: + raise TimeoutError(f"Search request timed out after {timeout} seconds") + except requests.RequestException as e: + raise RuntimeError(f"Search request failed: {str(e)}") + +def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]: + """Helper function for Bing search""" + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + search_url = f"https://www.bing.com/search?q={query}" + + proxies = {"http": proxy, "https": proxy} if proxy else None + response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)] + +def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: + """Helper function for SearXNG search""" + url = f"http://localhost:{port}" + params = { + "q": query, + "format": "json", + "engines": "google,duckduckgo,brave,qwant,bing" + } + response = requests.get(url, params=params, timeout=timeout) + response.raise_for_status() + return [result['url'] for result in response.json().get("results", [])[:max_results]] + +def format_proxy(proxy): + if isinstance(proxy, dict): + server = proxy.get('server') + username = proxy.get('username') + password = proxy.get('password') + + if all([username, password, server]): + proxy_url = f"http://{username}:{password}@{server}" + return proxy_url + else: + raise ValueError("Proxy dictionary is missing required fields.") + elif isinstance(proxy, str): + return proxy # "https://username:password@ip:port" + else: + raise TypeError("Proxy should be a dictionary or a string.") + +def filter_pdf_links(links: List[str]) -> List[str]: """ - Searches the web for a given query using specified search - engine options and filters out PDF links. + Filters out any links that point to PDF files. Args: - query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, - options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. - max_results (int, optional): The maximum number of search results to return. - port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. - timeout (int, optional): The number of seconds to wait - for a response from a request. Default is 10 seconds. - proxy (dict or string, optional): The proxy server to use for the request. Default is None. + links (List[str]): A list of URLs as strings. Returns: - List[str]: A list of URLs as strings that are the search results, excluding any PDF links. - - Raises: - ValueError: If the search engine specified is not supported. - requests.exceptions.Timeout: If the request times out. - - Example: - >>> search_on_web("example query", search_engine="Google", max_results=5) - ['http://example.com', 'http://example.org', ...] + List[str]: A list of URLs excluding any that end with '.pdf'. """ - - def format_proxy(proxy): - if isinstance(proxy, dict): - server = proxy.get('server') - username = proxy.get('username') - password = proxy.get('password') - - if all([username, password, server]): - proxy_url = f"http://{username}:{password}@{server}" - return proxy_url - else: - raise ValueError("Proxy dictionary is missing required fields.") - elif isinstance(proxy, str): - return proxy # "https://username:password@ip:port" - else: - raise TypeError("Proxy should be a dictionary or a string.") - - def filter_pdf_links(links: List[str]) -> List[str]: - """ - Filters out any links that point to PDF files. - - Args: - links (List[str]): A list of URLs as strings. - - Returns: - List[str]: A list of URLs excluding any that end with '.pdf'. - """ - return [link for link in links if not link.lower().endswith('.pdf')] - - if proxy: - proxy = format_proxy(proxy) - - if search_engine.lower() == "google": - res = [] - for url in google_search(query, num_results=max_results, proxy=proxy): - res.append(url) - return filter_pdf_links(res) - - elif search_engine.lower() == "duckduckgo": - research = DuckDuckGoSearchResults(max_results=max_results) - res = research.run(query) - links = re.findall(r'https?://[^\s,\]]+', res) - return filter_pdf_links(links) - - elif search_engine.lower() == "bing": - headers = { - "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64) - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36""" - } - search_url = f"https://www.bing.com/search?q={query}" - response = requests.get(search_url, headers=headers, timeout=timeout) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - search_results = [] - for result in soup.find_all('li', class_='b_algo', limit=max_results): - link = result.find('a')['href'] - search_results.append(link) - return filter_pdf_links(search_results) - - elif search_engine.lower() == "searxng": - url = f"http://localhost:{port}" - params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"} - response = requests.get(url, params=params, timeout=timeout) - data = response.json() - limited_results = [result['url'] for result in data["results"][:max_results]] - return filter_pdf_links(limited_results) - - else: - raise ValueError("""The only search engines available are - DuckDuckGo, Google, Bing, or SearXNG""") + return [link for link in links if not link.lower().endswith('.pdf')] From deed355551d01d92dde11f8c0b373bdd43f8b8cf Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Tue, 29 Oct 2024 08:02:25 +0000 Subject: [PATCH 07/10] ci(release): 1.27.0-beta.13 [skip ci] ## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29) ### Bug Fixes * **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b60dc63..7be1274a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.27.0-beta.13](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.12...v1.27.0-beta.13) (2024-10-29) + + +### Bug Fixes + +* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + ## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28) diff --git a/pyproject.toml b/pyproject.toml index adb44df9..8a6d894a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "scrapegraphai" -version = "1.27.0b12" +version = "1.27.0b13" From 7172b32a0f37f547edccab7bd09406e73c9ec5b2 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Wed, 30 Oct 2024 08:59:13 +0100 Subject: [PATCH 08/10] feat: update generate answer --- scrapegraphai/nodes/generate_answer_node.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 40f7182d..30058ec5 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -2,6 +2,7 @@ GenerateAnswerNode Module """ from typing import List, Optional +from json.decoder import JSONDecodeError from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel @@ -121,9 +122,21 @@ def execute(self, state: dict) -> dict: partial_variables={"context": doc, "format_instructions": format_instructions} ) chain = prompt | self.llm_model + raw_response = str((prompt | self.llm_model).invoke({"question": user_prompt})) + if output_parser: - chain = chain | output_parser - answer = chain.invoke({"question": user_prompt}) + try: + answer = output_parser.parse(raw_response) + except JSONDecodeError: + lines = raw_response.split('\n') + if lines[0].strip().startswith('```'): + lines = lines[1:] + if lines[-1].strip().endswith('```'): + lines = lines[:-1] + cleaned_response = '\n'.join(lines) + answer = output_parser.parse(cleaned_response) + else: + answer = raw_response state.update({self.output[0]: answer}) return state From 8cbe582ea99945ea6543f4c2000298acaa3d75c8 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Wed, 30 Oct 2024 08:02:00 +0000 Subject: [PATCH 09/10] ci(release): 1.28.0-beta.1 [skip ci] ## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30) ### Features * add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) * refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) ### Bug Fixes * **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) ### CI * **release:** 1.27.0-beta.11 [skip ci] ([3b2cadc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2)) * **release:** 1.27.0-beta.12 [skip ci] ([62369e3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62369e3e2886eb8cc09f6ef64865140a87a28b60)) * **release:** 1.27.0-beta.13 [skip ci] ([deed355](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/deed355551d01d92dde11f8c0b373bdd43f8b8cf)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) --- CHANGELOG.md | 20 ++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5292abd4..bf085283 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,23 @@ +## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30) + + +### Features + +* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) +* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) + + +### Bug Fixes + +* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + + +### CI + +* **release:** 1.27.0-beta.11 [skip ci] ([3b2cadc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2)) +* **release:** 1.27.0-beta.12 [skip ci] ([62369e3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62369e3e2886eb8cc09f6ef64865140a87a28b60)) +* **release:** 1.27.0-beta.13 [skip ci] ([deed355](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/deed355551d01d92dde11f8c0b373bdd43f8b8cf)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + ## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26) diff --git a/pyproject.toml b/pyproject.toml index 810f871b..f1d4fb0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.27.0b13" +version = "1.28.0b1" From 7e3598ddfacb2440df7b06e95b265b1b37cb4ea3 Mon Sep 17 00:00:00 2001 From: semantic-release-bot Date: Thu, 31 Oct 2024 09:21:45 +0000 Subject: [PATCH 10/10] ci(release): 1.28.0-beta.2 [skip ci] ## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31) ### Features * update generate answer ([7172b32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7172b32a0f37f547edccab7bd09406e73c9ec5b2)) --- CHANGELOG.md | 7 +++++++ pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf085283..2ee28236 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31) + + +### Features + +* update generate answer ([7172b32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7172b32a0f37f547edccab7bd09406e73c9ec5b2)) + ## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30) diff --git a/pyproject.toml b/pyproject.toml index f1d4fb0d..dad622d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scrapegraphai" -version = "1.28.0b1" +version = "1.28.0b2"