diff --git a/README.md b/README.md index cfa933a..ab6173d 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,6 @@ request = fpdsRequest( LAST_MOD_DATE="[2022/01/01, 2022/05/01]", AGENCY_CODE="7504" ) - data = asyncio.run(request.data()) ``` diff --git a/src/fpds/core/parser.py b/src/fpds/core/parser.py index d6504f9..51123e5 100644 --- a/src/fpds/core/parser.py +++ b/src/fpds/core/parser.py @@ -2,7 +2,7 @@ Base classes for FPDS XML elements. author: derek663@gmail.com -last_updated: 01/20/2024 +last_updated: 07/13/2024 """ import asyncio @@ -17,6 +17,7 @@ from aiohttp import ClientSession +from fpds.core import FPDS_ENTRY from fpds.core.mixins import fpdsMixin from fpds.core.xml import fpdsXML from fpds.utilities import validate_kwarg @@ -74,9 +75,6 @@ def __init__( for kwarg, value in self.kwargs.items(): self.kwargs[kwarg] = validate_kwarg(kwarg=kwarg, string=value) - def __call__(self) -> List[Dict[str, Union[str, float]]]: - return self.process_records() - def __str__(self) -> str: # pragma: no cover """String representation of `fpdsRequest`.""" kwargs_str = " ".join([f"{key}={value}" for key, value in self.kwargs.items()]) @@ -93,7 +91,7 @@ def search_params(self) -> str: return " ".join(_params) @property - def max_pages(self) -> int: + def page_count(self) -> int: """Total number of FPDS pages contained in request.""" return len(self.links) @@ -119,7 +117,7 @@ async def convert(self, session: ClientSession, link: str) -> fpdsXML: xml = fpdsXML(content=self.convert_to_lxml_tree(content)) return xml - async def fetch(self): + async def fetch(self) -> List[fpdsXML]: self.create_request_links() semaphore = Semaphore(self.thread_count) @@ -128,9 +126,11 @@ async def fetch(self): tasks = [self.convert(session, link) for link in self.links] return await asyncio.gather(*tasks) - def page_index(self): + def page_index(self) -> Optional[int]: """Converts `page` to index integer.""" - idx = 0 if self.page == 1 else self.page - 1 + idx = None + if self.page: + idx = 0 
if self.page == 1 else self.page - 1 return idx def create_request_links(self) -> None: @@ -146,22 +146,23 @@ def create_request_links(self) -> None: self.links = links if self.page: - if self.page > self.max_pages: - raise ValueError(f"Max response page count is {self.max_pages}!") - self.links = [links[self.page_index()]] + idx = self.page_index() + if idx is not None: + if self.page > self.page_count: + raise ValueError(f"Max response page count is {self.page_count}!") + self.links = [links[idx]] @staticmethod - def _jsonify(entry): + def _jsonify(entry) -> List[FPDS_ENTRY]: """Wrapper around `jsonify` method for avoiding pickle issue.""" return entry.jsonify() - async def data(self) -> List[Dict[str, Union[str, float]]]: + async def data(self) -> List[FPDS_ENTRY]: num_processes = multiprocessing.cpu_count() data = await self.fetch() # for parallel processing with ProcessPoolExecutor(max_workers=num_processes) as pool: - results = pool.map(self._jsonify, data) + results = list(pool.map(self._jsonify, data)) - data = list(chain.from_iterable(results)) - return data + return list(chain.from_iterable(results)) diff --git a/src/fpds/core/xml.py b/src/fpds/core/xml.py index e56e500..1cc32b8 100644 --- a/src/fpds/core/xml.py +++ b/src/fpds/core/xml.py @@ -2,7 +2,7 @@ XML classes for parsing FPDS content. author: derek663@gmail.com -last_updated: 06/05/2024 +last_updated: 07/13/2024 """ import re @@ -109,8 +109,10 @@ def namespace_dict(self) -> Dict[str, str]: return namespace_dict @property - def total_record_count(self) -> int: - """Total number of records across all pagination links.""" + def lower_limit(self) -> int: + """Lower limit of record count (i.e. if 40, it means there is a total of + 40-49 records). + """ last_link = self.tree.find(".//ns0:link[@rel='last']", self.namespace_dict) if isinstance(last_link, Element): # length of last_link should always be 1 @@ -126,8 +128,8 @@ def pagination_links(self, params: str) -> List[str]: total record count value. 
""" resp_size = self.response_size - offset = 0 if self.total_record_count < 10 else resp_size - page_range = list(range(0, self.total_record_count + offset, resp_size)) + offset = 0 if self.lower_limit < 10 else resp_size + page_range = list(range(0, self.lower_limit + offset, resp_size)) page_links = [] for num in page_range: link = f"{self.url_base}&q={params}&start={num}" diff --git a/tests/test_xml.py b/tests/test_xml.py index 36e748c..a7accc9 100644 --- a/tests/test_xml.py +++ b/tests/test_xml.py @@ -35,17 +35,17 @@ def test_namespace_dict(self): namespace_dict = self._class.namespace_dict self.assertEqual(namespace_dict, TEST_NAMESPACE_DICT) - def test_total_record_count(self): - total = self._class.total_record_count + def test_lower_limit(self): + total = self._class.lower_limit self.assertEqual(total, 20) - def test_total_record_count_truncated_response(self): + def test_lower_limit_count_truncated_response(self): """A truncated response won't have a `last` link tag. This test ensures that if the response size is less than 10 that the - `total_record_count` property is still generated correctly. + `lower_limit` property is still generated correctly. """ _class = fpdsXML(TRUNCATED_RESPONSE_DATA_BYTES) - total = _class.total_record_count + total = _class.lower_limit self.assertEqual(total, 1) def test_pagination_links(self): @@ -58,8 +58,8 @@ def test_get_atom_feed_entries(self): entry_types = set([type(entry) for entry in entries]) self.assertEqual(len(entry_types), 1) - def test_jsonified_entries(self): - entries = self._class.jsonified_entries() + def test_jsonify(self): + entries = self._class.jsonify() self.assertEqual(len(entries), 10)