Merge pull request #825 from ScrapeGraphAI/revert-to-1.19
feat: revert search function
Showing 2 changed files with 57 additions and 121 deletions.
@@ -1,129 +1,73 @@

Removed (the implementation being reverted):

```python
""" | ||
research_web module | ||
Research_web module | ||
""" | ||
import re | ||
from typing import List | ||
from langchain_community.tools import DuckDuckGoSearchResults | ||
from googlesearch import search as google_search | ||
import requests | ||
from bs4 import BeautifulSoup | ||
import json | ||
|
||
def search_on_web(query: str, search_engine: str = "Google", | ||
max_results: int = 10, port: int = 8080, | ||
timeout: int = 10, proxy: str | dict = None, | ||
serper_api_key: str = None) -> List[str]: | ||
"""Search web function with improved error handling and validation""" | ||
|
||
# Input validation | ||
if not query or not isinstance(query, str): | ||
raise ValueError("Query must be a non-empty string") | ||
|
||
search_engine = search_engine.lower() | ||
valid_engines = {"google", "duckduckgo", "bing", "searxng", "serper"} | ||
if search_engine not in valid_engines: | ||
raise ValueError(f"Search engine must be one of: {', '.join(valid_engines)}") | ||
    # Format proxy once
    formatted_proxy = None
    if proxy:
        formatted_proxy = format_proxy(proxy)

    try:
        results = []
        if search_engine == "google":
            results = list(google_search(query, num_results=max_results, proxy=formatted_proxy))

        elif search_engine == "duckduckgo":
            research = DuckDuckGoSearchResults(max_results=max_results)
            res = research.run(query)
            results = re.findall(r'https?://[^\s,\]]+', res)

        elif search_engine == "bing":
            results = _search_bing(query, max_results, timeout, formatted_proxy)

        elif search_engine == "searxng":
            results = _search_searxng(query, max_results, port, timeout)

        elif search_engine == "serper":
            results = _search_serper(query, max_results, serper_api_key, timeout)

        return filter_pdf_links(results)

    except requests.Timeout:
        raise TimeoutError(f"Search request timed out after {timeout} seconds")
    except requests.RequestException as e:
        raise RuntimeError(f"Search request failed: {str(e)}")
```
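For illustration, a minimal sketch of how callers would hit the entry point and validation above; the import path is an assumption based on the module name, and the queries are placeholders:

```python
# Hypothetical usage of the removed implementation above;
# the import path is an assumption based on the module name.
from scrapegraphai.utils.research_web import search_on_web

urls = search_on_web("web scraping frameworks",
                     search_engine="DuckDuckGo", max_results=5)
print(urls)  # URL strings, with PDF links stripped by filter_pdf_links()

try:
    search_on_web("query", search_engine="yahoo")
except ValueError as err:
    print(err)  # "Search engine must be one of: ..."
```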
The removed helper functions:

```python
def _search_bing(query: str, max_results: int, timeout: int, proxy: str = None) -> List[str]:
    """Helper function for Bing search"""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    search_url = f"https://www.bing.com/search?q={query}"

    proxies = {"http": proxy, "https": proxy} if proxy else None
    response = requests.get(search_url, headers=headers, timeout=timeout, proxies=proxies)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    return [result.find('a')['href'] for result in soup.find_all('li', class_='b_algo', limit=max_results)]


def _search_searxng(query: str, max_results: int, port: int, timeout: int) -> List[str]:
    """Helper function for SearXNG search"""
    url = f"http://localhost:{port}"
    params = {
        "q": query,
        "format": "json",
        "engines": "google,duckduckgo,brave,qwant,bing"
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return [result['url'] for result in response.json().get("results", [])[:max_results]]


def _search_serper(query: str, max_results: int, serper_api_key: str, timeout: int) -> List[str]:
    """Helper function for serper api"""
    if not serper_api_key:
        raise ValueError("API key is required for serper api.")

    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": max_results
    })
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    response.raise_for_status()
    return [result.get("link") for result in response.json().get("organic", [])]


def format_proxy(proxy):
    if isinstance(proxy, dict):
        server = proxy.get('server')
        username = proxy.get('username')
        password = proxy.get('password')

        if all([username, password, server]):
            proxy_url = f"http://{username}:{password}@{server}"
            return proxy_url
        else:
            raise ValueError("Proxy dictionary is missing required fields.")
    elif isinstance(proxy, str):
        return proxy  # "https://username:password@ip:port"
    else:
        raise TypeError("Proxy should be a dictionary or a string.")


def filter_pdf_links(links: List[str]) -> List[str]:
    """
    Filters out any links that point to PDF files.

    Args:
        links (List[str]): A list of URLs as strings.

    Returns:
        List[str]: A list of URLs excluding any that end with '.pdf'.
    """
    return [link for link in links if not link.lower().endswith('.pdf')]
```
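The revert also drops proxy support and the Serper backend. A sketch of the calls that only this removed version accepts; the proxy address and API key are placeholders:

```python
# Only the removed implementation accepts these arguments;
# the proxy address and API key below are placeholders.
proxy = {"server": "203.0.113.5:3128", "username": "user", "password": "pass"}

bing_urls = search_on_web("web scraping", search_engine="Bing",
                          proxy=proxy, timeout=15)

serper_urls = search_on_web("web scraping", search_engine="Serper",
                            serper_api_key="YOUR_SERPER_KEY", max_results=5)
```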
Added (the restored 1.19 implementation):

```python
"""
Research_web module
"""
import re
from typing import List
from langchain_community.tools import DuckDuckGoSearchResults
from googlesearch import search as google_search
import requests
from bs4 import BeautifulSoup


def search_on_web(query: str, search_engine: str = "Google",
                  max_results: int = 10, port: int = 8080) -> List[str]:
    """
    Searches the web for a given query using specified search engine options.

    Args:
        query (str): The search query to find on the internet.
        search_engine (str, optional): Specifies the search engine to use,
            options include 'Google', 'DuckDuckGo', 'Bing', or 'SearXNG'. Default is 'Google'.
        max_results (int, optional): The maximum number of search results to return.
        port (int, optional): The port number to use when searching with 'SearXNG'. Default is 8080.

    Returns:
        List[str]: A list of URLs as strings that are the search results.

    Raises:
        ValueError: If the search engine specified is not supported.

    Example:
        >>> search_on_web("example query", search_engine="Google", max_results=5)
        ['http://example.com', 'http://example.org', ...]
    """

    if search_engine.lower() == "google":
        res = []
        for url in google_search(query, num_results=max_results):
            res.append(url)
        return res

    elif search_engine.lower() == "duckduckgo":
        research = DuckDuckGoSearchResults(max_results=max_results)
        res = research.run(query)
        links = re.findall(r'https?://[^\s,\]]+', res)
        return links

    elif search_engine.lower() == "bing":
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        search_url = f"https://www.bing.com/search?q={query}"
        response = requests.get(search_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        search_results = []
        for result in soup.find_all('li', class_='b_algo', limit=max_results):
            link = result.find('a')['href']
            search_results.append(link)
        return search_results

    elif search_engine.lower() == "searxng":
        url = f"http://localhost:{port}"
        params = {"q": query, "format": "json"}

        # Send the GET request to the server
        response = requests.get(url, params=params)

        data = response.json()
        limited_results = data["results"][:max_results]
        return limited_results

    else:
        raise ValueError("The only search engines available are DuckDuckGo, Google, Bing, or SearXNG")
```
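Two consequences of the restored code are worth flagging: `timeout`, `proxy`, and `serper_api_key` are no longer accepted, and the SearXNG branch returns SearXNG's raw result objects rather than URL strings, despite the `List[str]` annotation. A usage sketch under the assumption that a SearXNG instance is listening on the default port 8080:

```python
# Usage of the restored function; assumes a local SearXNG
# instance on port 8080 (the function's default).
google_urls = search_on_web("web scraping", search_engine="Google", max_results=5)
print(google_urls)  # a list of URL strings

searx_hits = search_on_web("web scraping", search_engine="SearXNG")
print(searx_hits[0]["url"])  # SearXNG results are dicts with keys like "url" and "title"
```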