From 01530ee6a99c86426aab1be11ec3b3b86ca640ac Mon Sep 17 00:00:00 2001
From: He Pengfei <31441162+PengfeiHePower@users.noreply.github.com>
Date: Tue, 13 Aug 2024 18:44:21 -0700
Subject: [PATCH] Add Wiki retrieval service (#324)

---------

Co-authored-by: DavdGao
---
 README.md                               |   1 +
 README_ZH.md                            |   1 +
 src/agentscope/service/__init__.py      |   7 ++
 src/agentscope/service/web/wikipedia.py | 161 ++++++++++++++++++++++++
 tests/wiki_test.py                      | 112 +++++++++++++++++
 5 files changed, 282 insertions(+)
 create mode 100644 src/agentscope/service/web/wikipedia.py
 create mode 100644 tests/wiki_test.py

diff --git a/README.md b/README.md
index a6822c344..0a2f1a6fb 100644
--- a/README.md
+++ b/README.md
@@ -148,6 +148,7 @@ the following libraries.
 - File Operation
 - Text Processing
 - Multi Modality
+- Wikipedia search and retrieval
 
 **Example Applications**
 
diff --git a/README_ZH.md b/README_ZH.md
index 8c89ff927..3d3e5c5d9 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -138,6 +138,7 @@ AgentScope支持使用以下库快速部署本地模型服务。
 - 文件操作
 - 文本处理
 - 多模态生成
+- 维基百科搜索
 
 **样例应用**
 
diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py
index b7a2471aa..2e74d4dec 100644
--- a/src/agentscope/service/__init__.py
+++ b/src/agentscope/service/__init__.py
@@ -51,6 +51,11 @@
 from .web.web_digest import digest_webpage, load_web, parse_html_to_text
 from .web.download import download_from_url
 
+from .web.wikipedia import (
+    wikipedia_search,
+    wikipedia_search_categories,
+)
+
 
 def get_help() -> None:
     """Get help message."""
@@ -80,6 +85,8 @@ def get_help() -> None:
     "bing_search",
     "google_search",
     "arxiv_search",
+    "wikipedia_search",
+    "wikipedia_search_categories",
     "query_mysql",
     "query_sqlite",
     "query_mongodb",
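With the exports above, the two new functions become part of AgentScope's public service API. A minimal import check, sketched under the assumption that this branch is installed (e.g. with "pip install -e ."):

    from agentscope.service import (
        ServiceExecStatus,
        ServiceResponse,
        wikipedia_search,
        wikipedia_search_categories,
    )

    # Both helpers return a ServiceResponse whose `status` is a
    # ServiceExecStatus member and whose `content` carries the payload.
    print(wikipedia_search.__doc__.splitlines()[0])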
diff --git a/src/agentscope/service/web/wikipedia.py b/src/agentscope/service/web/wikipedia.py
new file mode 100644
index 000000000..ea10a8f18
--- /dev/null
+++ b/src/agentscope/service/web/wikipedia.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+"""
+Search content from Wikipedia
+"""
+import requests
+
+from ..service_response import (
+    ServiceResponse,
+    ServiceExecStatus,
+)
+
+
+def wikipedia_search_categories(
+    query: str,
+    max_members: int = 1000,
+) -> ServiceResponse:
+    """Retrieve the member pages of a given Wikipedia:Category page.
+
+    Args:
+        query (str):
+            The category to search for, without the "Category:" prefix
+        max_members (int):
+            The maximum number of members to output
+
+    Returns:
+        `ServiceResponse`: A response that contains the execution status and
+        returned content. In the returned content, the meanings of keys:
+        - "pageid": unique page ID for the member
+        - "ns": namespace for the member
+        - "title": title of the member
+
+    Example:
+
+        .. code-block:: python
+
+            members = wikipedia_search_categories(
+                "Machine_learning",
+                max_members=10
+            )
+            print(members)
+
+        It returns contents:
+
+        .. code-block:: python
+
+            {
+                'status': <ServiceExecStatus.SUCCESS: 1>,
+                'content': [
+                    {
+                        'pageid': 67911196,
+                        'ns': 0,
+                        'title': 'Bayesian learning mechanisms'
+                    },
+                    {
+                        'pageid': 233488,
+                        'ns': 0,
+                        'title': 'Machine learning'
+                    },
+                    # ...
+                ]
+            }
+
+    """
+    url = "https://en.wikipedia.org/w/api.php"
+    limit_per_request: int = 500
+    params = {
+        "action": "query",
+        "list": "categorymembers",
+        "cmtitle": f"Category:{query}",
+        "cmlimit": limit_per_request,  # Maximum number of results per request
+        "format": "json",
+    }
+
+    members = []
+    total_fetched = 0
+
+    try:
+        while total_fetched < max_members:
+            response = requests.get(url, params=params, timeout=20)
+            response.raise_for_status()
+
+            data = response.json()
+
+            batch_members = data["query"]["categorymembers"]
+            members.extend(batch_members)
+            total_fetched += len(batch_members)
+
+            # Check if there is a continuation token
+            if "continue" in data and total_fetched < max_members:
+                params["cmcontinue"] = data["continue"]["cmcontinue"]
+            else:
+                break
+
+    except Exception as e:
+        return ServiceResponse(
+            status=ServiceExecStatus.ERROR,
+            content=str(e),
+        )
+
+    # If more members were fetched than max_members, trim the list
+    if len(members) > max_members:
+        members = members[:max_members]
+
+    if len(members) > 0:
+        return ServiceResponse(ServiceExecStatus.SUCCESS, members)
+
+    return ServiceResponse(ServiceExecStatus.ERROR, members)
+
+
+def wikipedia_search(  # pylint: disable=C0301
+    query: str,
+) -> ServiceResponse:
+    """Search the given query in Wikipedia. Note the returned text may be related entities rather than the exact page, which means you should adjust your query as needed and search again.
+
+    Note the returned text may be too long for some LLMs; it's recommended to
+    summarize the returned text first.
+
+    Args:
+        query (`str`):
+            The query to search for in Wikipedia.
+
+    Returns:
+        `ServiceResponse`: A response that contains the execution status and
+        returned content.
+    """  # noqa
+
+    url = "https://en.wikipedia.org/w/api.php"
+    params = {
+        "action": "query",
+        "titles": query,
+        "prop": "extracts",
+        "explaintext": True,
+        "format": "json",
+    }
+    try:
+        response = requests.get(url, params=params, timeout=20)
+        response.raise_for_status()
+        data = response.json()
+
+        # Combine into a text
+        text = []
+        for page in data["query"]["pages"].values():
+            if "extract" in page:
+                text.append(page["extract"])
+            else:
+                return ServiceResponse(
+                    status=ServiceExecStatus.ERROR,
+                    content="No content found",
+                )
+
+        content = "\n".join(text)
+        return ServiceResponse(
+            status=ServiceExecStatus.SUCCESS,
+            content=content,
+        )
+
+    except Exception as e:
+        return ServiceResponse(
+            status=ServiceExecStatus.ERROR,
+            content=str(e),
+        )
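A brief usage sketch of the two service functions added above. The query strings are only illustrative, the calls hit the live MediaWiki API, and the truncation at the end stands in for the summarization step the docstring recommends:

    from agentscope.service import (
        ServiceExecStatus,
        wikipedia_search,
        wikipedia_search_categories,
    )

    # List up to ten members of the "Machine_learning" category.
    members = wikipedia_search_categories("Machine_learning", max_members=10)
    if members.status == ServiceExecStatus.SUCCESS:
        for member in members.content:
            print(member["pageid"], member["title"])

    # Fetch the plain-text extract of a page. Extracts can be very long, so a
    # caller would normally summarize or truncate them before prompting an LLM.
    page = wikipedia_search("Machine learning")
    if page.status == ServiceExecStatus.SUCCESS:
        print(page.content[:300])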
"Test" + limit_per_request = 500 + params = { + "action": "query", + "list": "categorymembers", + "cmtitle": f"Category:{test_entity}", + "cmlimit": limit_per_request, + "format": "json", + } + + results = wikipedia_search_categories(query=test_entity) + + mock_get.assert_called_once_with( + "https://en.wikipedia.org/w/api.php", + params=params, + timeout=20, + ) + + self.assertEqual( + results, + expected_result, + ) + + @patch("agentscope.utils.common.requests.get") + def test_wikipedia_search( + self, + mock_get: MagicMock, + ) -> None: + """Test get_page_content_by_paragraph""" + + # Mock responses for extract query + mock_response = Mock() + mock_dict = { + "query": { + "pages": { + "20": { + "pageid": 20, + "title": "Test", + "extract": "This is the first paragraph.", + }, + "21": { + "pageid": 30, + "title": "Test", + "extract": "This is the second paragraph.", + }, + }, + }, + } + + mock_response.json.return_value = mock_dict + mock_get.return_value = mock_response + + expected_response = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=( + "This is the first paragraph.\n" + "This is the second paragraph." + ), + ) + + response = wikipedia_search("Test") + + self.assertEqual(expected_response, response)