diff --git a/CHANGELOG.md b/CHANGELOG.md index 90550351..2ee28236 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,30 @@ +## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31) + + +### Features + +* update generate answer ([7172b32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7172b32a0f37f547edccab7bd09406e73c9ec5b2)) + +## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30) + + +### Features + +* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) +* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) + + +### Bug Fixes + +* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + + +### CI + +* **release:** 1.27.0-beta.11 [skip ci] ([3b2cadc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2)) +* **release:** 1.27.0-beta.12 [skip ci] ([62369e3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62369e3e2886eb8cc09f6ef64865140a87a28b60)) +* **release:** 1.27.0-beta.13 [skip ci] ([deed355](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/deed355551d01d92dde11f8c0b373bdd43f8b8cf)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + ## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26) @@ -13,6 +40,7 @@ * refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887)) + ### Bug Fixes * fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422)) @@ -44,6 +72,21 @@ * **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c)) * **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee)) * **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f)) +* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768) + +## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28) + + +### Features + +* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c)) + +## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27) + + +### Features + +* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952)) ## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25) diff --git a/pyproject.toml b/pyproject.toml index ad67d0af..dad622d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,8 @@ [project] name = "scrapegraphai" -version = "1.27.0" + +version = "1.28.0b2" diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py index c693400c..bd8244f0 100644 --- a/scrapegraphai/graphs/abstract_graph.py +++ b/scrapegraphai/graphs/abstract_graph.py @@ -152,12 +152,15 @@ def _create_llm(self, llm_config: dict) -> object: raise ValueError(f"""Provider {llm_params['model_provider']} is not supported. If possible, try to use a model instance instead.""") - try: - self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]] - except KeyError: - print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found, - using default token size (8192)""") - self.model_token = 8192 + if "model_tokens" not in llm_params: + try: + self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]] + except KeyError: + print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found, + using default token size (8192)""") + self.model_token = 8192 + else: + self.model_token = llm_params["model_tokens"] try: if llm_params["model_provider"] not in \ diff --git a/scrapegraphai/graphs/base_graph.py b/scrapegraphai/graphs/base_graph.py index 74135108..18a16ba3 100644 --- a/scrapegraphai/graphs/base_graph.py +++ b/scrapegraphai/graphs/base_graph.py @@ -98,21 +98,116 @@ def _set_conditional_node_edges(self): except: node.false_node_name = None - def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: - """ - Executes the graph by traversing nodes starting from the - entry point using the standard method. + def _get_node_by_name(self, node_name: str): + """Returns a node instance by its name.""" + return next(node for node in self.nodes if node.node_name == node_name) - Args: - initial_state (dict): The initial state to pass to the entry point node. + def _update_source_info(self, current_node, state): + """Updates source type and source information from FetchNode.""" + source_type = None + source = [] + prompt = None + + if current_node.__class__.__name__ == "FetchNode": + source_type = list(state.keys())[1] + if state.get("user_prompt", None): + prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None + + if source_type == "local_dir": + source_type = "html_dir" + elif source_type == "url": + if isinstance(state[source_type], list): + source.extend(url for url in state[source_type] if isinstance(url, str)) + elif isinstance(state[source_type], str): + source.append(state[source_type]) + + return source_type, source, prompt + + def _get_model_info(self, current_node): + """Extracts LLM and embedder model information from the node.""" + llm_model = None + llm_model_name = None + embedder_model = None - Returns: - Tuple[dict, list]: A tuple containing the final state and a list of execution info. + if hasattr(current_node, "llm_model"): + llm_model = current_node.llm_model + if hasattr(llm_model, "model_name"): + llm_model_name = llm_model.model_name + elif hasattr(llm_model, "model"): + llm_model_name = llm_model.model + elif hasattr(llm_model, "model_id"): + llm_model_name = llm_model.model_id + + if hasattr(current_node, "embedder_model"): + embedder_model = current_node.embedder_model + if hasattr(embedder_model, "model_name"): + embedder_model = embedder_model.model_name + elif hasattr(embedder_model, "model"): + embedder_model = embedder_model.model + + return llm_model, llm_model_name, embedder_model + + def _get_schema(self, current_node): + """Extracts schema information from the node configuration.""" + if not hasattr(current_node, "node_config"): + return None + + if not isinstance(current_node.node_config, dict): + return None + + schema_config = current_node.node_config.get("schema") + if not schema_config or isinstance(schema_config, dict): + return None + + try: + return schema_config.schema() + except Exception: + return None + + def _execute_node(self, current_node, state, llm_model, llm_model_name): + """Executes a single node and returns execution information.""" + curr_time = time.time() + + with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb: + result = current_node.execute(state) + node_exec_time = time.time() - curr_time + + cb_data = None + if cb is not None: + cb_data = { + "node_name": current_node.node_name, + "total_tokens": cb.total_tokens, + "prompt_tokens": cb.prompt_tokens, + "completion_tokens": cb.completion_tokens, + "successful_requests": cb.successful_requests, + "total_cost_USD": cb.total_cost, + "exec_time": node_exec_time, + } + + return result, node_exec_time, cb_data + + def _get_next_node(self, current_node, result): + """Determines the next node to execute based on current node type and result.""" + if current_node.node_type == "conditional_node": + node_names = {node.node_name for node in self.nodes} + if result in node_names: + return result + elif result is None: + return None + raise ValueError( + f"Conditional Node returned a node name '{result}' that does not exist in the graph" + ) + + return self.edges.get(current_node.node_name) + + def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: + """ + Executes the graph by traversing nodes starting from the entry point using the standard method. """ current_node_name = self.entry_point state = initial_state - - # variables for tracking execution info + + # Tracking variables total_exec_time = 0.0 exec_info = [] cb_total = { @@ -134,104 +229,51 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: schema = None while current_node_name: - curr_time = time.time() - current_node = next(node for node in self.nodes if node.node_name == current_node_name) - - if current_node.__class__.__name__ == "FetchNode": - source_type = list(state.keys())[1] - if state.get("user_prompt", None): - prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None - - if source_type == "local_dir": - source_type = "html_dir" - elif source_type == "url": - if isinstance(state[source_type], list): - for url in state[source_type]: - if isinstance(url, str): - source.append(url) - elif isinstance(state[source_type], str): - source.append(state[source_type]) - - if hasattr(current_node, "llm_model") and llm_model is None: - llm_model = current_node.llm_model - if hasattr(llm_model, "model_name"): - llm_model_name = llm_model.model_name - elif hasattr(llm_model, "model"): - llm_model_name = llm_model.model - elif hasattr(llm_model, "model_id"): - llm_model_name = llm_model.model_id - - if hasattr(current_node, "embedder_model") and embedder_model is None: - embedder_model = current_node.embedder_model - if hasattr(embedder_model, "model_name"): - embedder_model = embedder_model.model_name - elif hasattr(embedder_model, "model"): - embedder_model = embedder_model.model - - if hasattr(current_node, "node_config"): - if isinstance(current_node.node_config,dict): - if current_node.node_config.get("schema", None) and schema is None: - if not isinstance(current_node.node_config["schema"], dict): - try: - schema = current_node.node_config["schema"].schema() - except Exception as e: - schema = None - - with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb: - try: - result = current_node.execute(state) - except Exception as e: - error_node = current_node.node_name - graph_execution_time = time.time() - start_time - log_graph_execution( - graph_name=self.graph_name, - source=source, - prompt=prompt, - schema=schema, - llm_model=llm_model_name, - embedder_model=embedder_model, - source_type=source_type, - execution_time=graph_execution_time, - error_node=error_node, - exception=str(e) - ) - raise e - node_exec_time = time.time() - curr_time + current_node = self._get_node_by_name(current_node_name) + + # Update source information if needed + if source_type is None: + source_type, source, prompt = self._update_source_info(current_node, state) + + # Get model information if needed + if llm_model is None: + llm_model, llm_model_name, embedder_model = self._get_model_info(current_node) + + # Get schema if needed + if schema is None: + schema = self._get_schema(current_node) + + try: + result, node_exec_time, cb_data = self._execute_node( + current_node, state, llm_model, llm_model_name + ) total_exec_time += node_exec_time - if cb is not None: - cb_data = { - "node_name": current_node.node_name, - "total_tokens": cb.total_tokens, - "prompt_tokens": cb.prompt_tokens, - "completion_tokens": cb.completion_tokens, - "successful_requests": cb.successful_requests, - "total_cost_USD": cb.total_cost, - "exec_time": node_exec_time, - } - + if cb_data: exec_info.append(cb_data) - - cb_total["total_tokens"] += cb_data["total_tokens"] - cb_total["prompt_tokens"] += cb_data["prompt_tokens"] - cb_total["completion_tokens"] += cb_data["completion_tokens"] - cb_total["successful_requests"] += cb_data["successful_requests"] - cb_total["total_cost_USD"] += cb_data["total_cost_USD"] - - if current_node.node_type == "conditional_node": - node_names = {node.node_name for node in self.nodes} - if result in node_names: - current_node_name = result - elif result is None: - current_node_name = None - else: - raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph") - - elif current_node_name in self.edges: - current_node_name = self.edges[current_node_name] - else: - current_node_name = None - + for key in cb_total: + cb_total[key] += cb_data[key] + + current_node_name = self._get_next_node(current_node, result) + + except Exception as e: + error_node = current_node.node_name + graph_execution_time = time.time() - start_time + log_graph_execution( + graph_name=self.graph_name, + source=source, + prompt=prompt, + schema=schema, + llm_model=llm_model_name, + embedder_model=embedder_model, + source_type=source_type, + execution_time=graph_execution_time, + error_node=error_node, + exception=str(e) + ) + raise e + + # Add total results to execution info exec_info.append({ "node_name": "TOTAL RESULT", "total_tokens": cb_total["total_tokens"], @@ -242,6 +284,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]: "exec_time": total_exec_time, }) + # Log final execution results graph_execution_time = time.time() - start_time response = state.get("answer", None) if source_type == "url" else None content = state.get("parsed_doc", None) if response is not None else None @@ -300,3 +343,4 @@ def append_node(self, node): self.raw_edges.append((last_node, node)) self.nodes.append(node) self.edges = self._create_edges({e for e in self.raw_edges}) + diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py index 113d1636..8f367ceb 100644 --- a/scrapegraphai/helpers/models_tokens.py +++ b/scrapegraphai/helpers/models_tokens.py @@ -80,10 +80,12 @@ "llama3.2:1b": 128000, "scrapegraph": 8192, "mistral": 8192, + "mistral-small": 128000, + "mistral-openorca": 32000, + "mistral-large": 128000, "grok-1": 8192, "llava": 4096, "mixtral:8x22b-instruct": 65536, - "mistral-openorca": 32000, "nomic-embed-text": 8192, "nous-hermes2:34b": 4096, "orca-mini": 2048, diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 40f7182d..30058ec5 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -2,6 +2,7 @@ GenerateAnswerNode Module """ from typing import List, Optional +from json.decoder import JSONDecodeError from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser from langchain_core.runnables import RunnableParallel @@ -121,9 +122,21 @@ def execute(self, state: dict) -> dict: partial_variables={"context": doc, "format_instructions": format_instructions} ) chain = prompt | self.llm_model + raw_response = str((prompt | self.llm_model).invoke({"question": user_prompt})) + if output_parser: - chain = chain | output_parser - answer = chain.invoke({"question": user_prompt}) + try: + answer = output_parser.parse(raw_response) + except JSONDecodeError: + lines = raw_response.split('\n') + if lines[0].strip().startswith('```'): + lines = lines[1:] + if lines[-1].strip().endswith('```'): + lines = lines[:-1] + cleaned_response = '\n'.join(lines) + answer = output_parser.parse(cleaned_response) + else: + answer = raw_response state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/utils/cleanup_html.py b/scrapegraphai/utils/cleanup_html.py index 2ec3b140..9b00f61c 100644 --- a/scrapegraphai/utils/cleanup_html.py +++ b/scrapegraphai/utils/cleanup_html.py @@ -60,13 +60,18 @@ def minify_html(html): """ minify_html function """ - html = re.sub(r'', '', html, flags=re.DOTALL) - - html = re.sub(r'>\s+<', '><', html) - html = re.sub(r'\s+>', '>', html) - html = re.sub(r'<\s+', '<', html) - html = re.sub(r'\s+', ' ', html) - html = re.sub(r'\s*=\s*', '=', html) + # Combine multiple regex operations into one for better performance + patterns = [ + (r'', '', re.DOTALL), + (r'>\s+<', '><', 0), + (r'\s+>', '>', 0), + (r'<\s+', '<', 0), + (r'\s+', ' ', 0), + (r'\s*=\s*', '=', 0) + ] + + for pattern, repl, flags in patterns: + html = re.sub(pattern, repl, html, flags=flags) return html.strip() diff --git a/scrapegraphai/utils/copy.py b/scrapegraphai/utils/copy.py index a35370ab..2ec7cee2 100644 --- a/scrapegraphai/utils/copy.py +++ b/scrapegraphai/utils/copy.py @@ -30,56 +30,38 @@ def is_boto3_client(obj): def safe_deepcopy(obj: Any) -> Any: """ - Attempts to create a deep copy of the object using `copy.deepcopy` - whenever possible. If that fails, it falls back to custom deep copy - logic. If that also fails, it raises a `DeepCopyError`. - + Safely create a deep copy of an object, handling special cases. + Args: - obj (Any): The object to be copied, which can be of any type. - + obj: Object to copy + Returns: - Any: A deep copy of the object if possible; otherwise, a shallow - copy if deep copying fails; if neither is possible, the original - object is returned. + Deep copy of the object + Raises: - DeepCopyError: If the object cannot be deep-copied or shallow-copied. + DeepCopyError: If object cannot be deep copied """ - try: - - return copy.deepcopy(obj) - except (TypeError, AttributeError) as e: - + # Handle special cases first + if obj is None or isinstance(obj, (str, int, float, bool)): + return obj + + if isinstance(obj, (list, set)): + return type(obj)(safe_deepcopy(v) for v in obj) + if isinstance(obj, dict): - new_obj = {} - - for k, v in obj.items(): - new_obj[k] = safe_deepcopy(v) - return new_obj - - elif isinstance(obj, list): - new_obj = [] - - for v in obj: - new_obj.append(safe_deepcopy(v)) - return new_obj - - elif isinstance(obj, tuple): - new_obj = tuple(safe_deepcopy(v) for v in obj) - - return new_obj - - elif isinstance(obj, frozenset): - new_obj = frozenset(safe_deepcopy(v) for v in obj) - return new_obj - - elif is_boto3_client(obj): + return {k: safe_deepcopy(v) for k, v in obj.items()} + + if isinstance(obj, tuple): + return tuple(safe_deepcopy(v) for v in obj) + + if isinstance(obj, frozenset): + return frozenset(safe_deepcopy(v) for v in obj) + + if is_boto3_client(obj): return obj - - else: - try: - return copy.copy(obj) - except (TypeError, AttributeError): - raise DeepCopyError( - f"Cannot deep copy the object of type {type(obj)}" - ) from e + + return copy.copy(obj) + + except Exception as e: + raise DeepCopyError(f"Cannot deep copy object of type {type(obj)}") from e diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py index af351ad4..86f9f5f3 100644 --- a/scrapegraphai/utils/research_web.py +++ b/scrapegraphai/utils/research_web.py @@ -9,101 +9,97 @@ from bs4 import BeautifulSoup def search_on_web(query: str, search_engine: str = "Google", - max_results: int = 10, port: int = 8080, + max_results: int = 10, port: int = 8080, timeout: int = 10, proxy: str | dict = None) -> List[str]: + """Search web function with improved error handling and validation""" + + # Input validation + if not query or not isinstance(query, str): + raise ValueError("Query must be a non-empty string") + + search_engine = search_engine.lower() + valid_engines = {"google", "duckduckgo", "bing", "searxng"} + if search_engine not in valid_engines: + raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") + + # Format proxy once + formatted_proxy = None + if proxy: + formatted_proxy = format_proxy(proxy) + + try: + results = [] + if search_engine == "google": + results = list(google_search(query, num_results=max_results, proxy=formatted_proxy)) + + elif search_engine == "duckduckgo": + research = DuckDuckGoSearchResults(max_results=max_results) + res = research.run(query) + results = re.findall(r'https?://[^\s,\]]+', res) + + elif search_engine == "bing": + results = _search_bing(query, max_results, timeout, formatted_proxy) + + elif search_engine == "searxng": + results = _search_searxng(query, max_results, port, timeout) + + return filter_pdf_links(results) + + except requests.Timeout: + raise TimeoutError(f"Search request timed out after {timeout} seconds") + except requests.RequestException as e: + raise RuntimeError(f"Search request failed: {str(e)}") + +def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]: + """Helper function for Bing search""" + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + search_url = f"https://www.bing.com/search?q={query}" + + proxies = {"http": proxy, "https": proxy} if proxy else None + response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)] + +def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]: + """Helper function for SearXNG search""" + url = f"http://localhost:{port}" + params = { + "q": query, + "format": "json", + "engines": "google,duckduckgo,brave,qwant,bing" + } + response = requests.get(url, params=params, timeout=timeout) + response.raise_for_status() + return [result['url'] for result in response.json().get("results", [])[:max_results]] + +def format_proxy(proxy): + if isinstance(proxy, dict): + server = proxy.get('server') + username = proxy.get('username') + password = proxy.get('password') + + if all([username, password, server]): + proxy_url = f"http://{username}:{password}@{server}" + return proxy_url + else: + raise ValueError("Proxy dictionary is missing required fields.") + elif isinstance(proxy, str): + return proxy # "https://username:password@ip:port" + else: + raise TypeError("Proxy should be a dictionary or a string.") + +def filter_pdf_links(links: List[str]) -> List[str]: """ - Searches the web for a given query using specified search - engine options and filters out PDF links. + Filters out any links that point to PDF files. Args: - query (str): The search query to find on the internet. - search_engine (str, optional): Specifies the search engine to use, - options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'. - max_results (int, optional): The maximum number of search results to return. - port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080. - timeout (int, optional): The number of seconds to wait - for a response from a request. Default is 10 seconds. - proxy (dict or string, optional): The proxy server to use for the request. Default is None. + links (List[str]): A list of URLs as strings. Returns: - List[str]: A list of URLs as strings that are the search results, excluding any PDF links. - - Raises: - ValueError: If the search engine specified is not supported. - requests.exceptions.Timeout: If the request times out. - - Example: - >>> search_on_web("example query", search_engine="Google", max_results=5) - ['http://example.com', 'http://example.org', ...] + List[str]: A list of URLs excluding any that end with '.pdf'. """ - - def format_proxy(proxy): - if isinstance(proxy, dict): - server = proxy.get('server') - username = proxy.get('username') - password = proxy.get('password') - - if all([username, password, server]): - proxy_url = f"http://{username}:{password}@{server}" - return proxy_url - else: - raise ValueError("Proxy dictionary is missing required fields.") - elif isinstance(proxy, str): - return proxy # "https://username:password@ip:port" - else: - raise TypeError("Proxy should be a dictionary or a string.") - - def filter_pdf_links(links: List[str]) -> List[str]: - """ - Filters out any links that point to PDF files. - - Args: - links (List[str]): A list of URLs as strings. - - Returns: - List[str]: A list of URLs excluding any that end with '.pdf'. - """ - return [link for link in links if not link.lower().endswith('.pdf')] - - if proxy: - proxy = format_proxy(proxy) - - if search_engine.lower() == "google": - res = [] - for url in google_search(query, num_results=max_results, proxy=proxy): - res.append(url) - return filter_pdf_links(res) - - elif search_engine.lower() == "duckduckgo": - research = DuckDuckGoSearchResults(max_results=max_results) - res = research.run(query) - links = re.findall(r'https?://[^\s,\]]+', res) - return filter_pdf_links(links) - - elif search_engine.lower() == "bing": - headers = { - "User-Agent": """Mozilla/5.0 (Windows NT 10.0; Win64; x64) - AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36""" - } - search_url = f"https://www.bing.com/search?q={query}" - response = requests.get(search_url, headers=headers, timeout=timeout) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - - search_results = [] - for result in soup.find_all('li', class_='b_algo', limit=max_results): - link = result.find('a')['href'] - search_results.append(link) - return filter_pdf_links(search_results) - - elif search_engine.lower() == "searxng": - url = f"http://localhost:{port}" - params = {"q": query, "format": "json", "engines": "google,duckduckgo,brave,qwant,bing"} - response = requests.get(url, params=params, timeout=timeout) - data = response.json() - limited_results = [result['url'] for result in data["results"][:max_results]] - return filter_pdf_links(limited_results) - - else: - raise ValueError("""The only search engines available are - DuckDuckGo, Google, Bing, or SearXNG""") + return [link for link in links if not link.lower().endswith('.pdf')]