From 36edc0616109a2fc0c08a43c714e6315e46b9f26 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Fri, 30 Jun 2023 18:02:47 +0200 Subject: [PATCH 01/13] create projects script --- bblocks/import_tools/world_bank_projects.py | 203 ++++++++++++++++++ .../test_world_bank_projects.py | 88 ++++++++ 2 files changed, 291 insertions(+) create mode 100644 bblocks/import_tools/world_bank_projects.py create mode 100644 tests/test_import_tools/test_world_bank_projects.py diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py new file mode 100644 index 0000000..7f9495f --- /dev/null +++ b/bblocks/import_tools/world_bank_projects.py @@ -0,0 +1,203 @@ +"""World Bank Projects Database Importer""" + +import pandas as pd +import numpy as np +import requests +import json +from dataclasses import dataclass + +from bblocks.logger import logger +from bblocks.import_tools.common import ImportData +from bblocks.config import BBPaths + + +class EmptyDataException(Exception): + """Exception raised when the API response does not contain any data.""" + + pass + + +BASE_API_URL = "http://search.worldbank.org/api/v2/projects" + + +class QueryAPI: + """Helper class for querying the World Bank Projects API""" + + def __init__( + self, response_format: str = 'json', max_rows_per_response: int = 500, + start_date: str | None = None, end_date: str | None = None + ): + """Initialize QueryAPI object""" + + self.response_format = response_format + self.max_rows_per_response = max_rows_per_response + self.start_date = start_date + self.end_date = end_date + + self._params = { + 'format': self.response_format, + 'rows': self.max_rows_per_response, + # 'os': 0, # offset + 'strdate': self.start_date, + 'enddate': self.end_date + } + + self._check_params() + + self.response_data = {} # initialize response_data as empty dict + + def _check_params(self) -> None: + """Check parameters""" + + # if end_date is before start_date, raise error + if self._params['strdate'] is not None and self._params['enddate'] is not None: + if self._params['enddate'] < self._params['strdate']: + raise ValueError("end date must be after start date") + + # if max_rows is greater than 1000, raise error + if self._params['rows'] > 1000: + raise ValueError("max_rows must be less than or equal to 1000") + + # if dates are None, drop them from params + if self._params['strdate'] is None: + # drop start_date from params + self._params.pop('strdate') + + if self._params['enddate'] is None: + # drop end_date from params + self._params.pop('enddate') + + def _request(self) -> dict: + """Single request to API. Returns the rsponse json.""" + + try: + response = requests.get(BASE_API_URL, params=self._params) + response.raise_for_status() + data = response.json()['projects'] # keep only the projects data + + return data + + except Exception as e: + raise Exception(f"Failed to get data: {e}") + + def request_data(self) -> 'QueryAPI': + """Request data from API + + This method will request all the data from the API + and store it in the response_data attribute. + It will automatically determine the request to make + based on the offset and number of rows parameters. + + Returns: + 'QueryAPI' to allow chaining of methods + """ + + self._params['os'] = 0 # reset offset to 0 + + while True: + + # request data + data = self._request() + + # if there are no more projects, break + if len(data) == 0: + break + + # add data to response_data + self.response_data.update(data) + + # update offset + self._params['os'] += self._params['rows'] + + # Log if no data was returned from API + if len(self.response_data) == 0: + raise EmptyDataException("No data was returned from API") + + return self + + def get_data(self) -> dict[dict]: + """Get the data, or request it if it hasn't been requested yet""" + + if len(self.response_data) == 0: + self.request_data() + + return self.response_data + + +fields = { + 'id': 'id', + 'regionname': 'region', + 'project_name': 'project name', + 'countryshortname': 'country', + 'projectstatusdisplay': 'project status', + + + + + 'curr_total_commitment': 'total commitment', + 'curr_ibrd_commitment': 'IBRD commitment', + 'curr_ida_commitment': 'IDA commitment', + +} + + +# df = pd.DataFrame.from_dict(proj._raw_data, orient='index') + + +@dataclass +class WorldBankProjects(ImportData): + """World Bank Projects Database Importer""" + + start_date: str | None = None + end_date: str | None = None + + @property + def _path(self): + """Generate path based on version""" + + start_date = f'_{self.start_date}' if self.start_date is not None else '' + end_date = f'_{self.end_date}' if self.end_date is not None else '' + + return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json" + + def load_data(self, project_codes: str | list = 'all') -> ImportData: + """ """ + + # if file does not exist, download it and save it as a json file + if not self._path.exists(): + with open(self._path, 'w') as file: + data = (QueryAPI(start_date=self.start_date, end_date=self.end_date) + .request_data() + .get_data() + ) + json.dump(data, file) + logger.info(f"Successfully downloaded World Bank Projects") + + with open(self._path, "r") as file: + self._raw_data = json.load(file) + + if project_codes == 'all': + self._data = self._raw_data + + if isinstance(project_codes, str): + project_codes = [project_codes] + + if isinstance(project_codes, list): + self._data = {k: v for k, v in self._raw_data.items() + if k in project_codes} + + if self._data == {}: + raise ValueError("No projects found with the given project codes") + logger.info(f"Successfully loaded World Bank Projects") + + return self + + def update_data(self, reload: bool = True) -> ImportData: + """ """ + + pass + + def get_data(self): + """ """ + + print('test') diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py new file mode 100644 index 0000000..51d017d --- /dev/null +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -0,0 +1,88 @@ +"""Tests for the world_bank_projects module.""" + +import pytest +import pandas as pd +import numpy as np +import requests +from unittest.mock import Mock, patch, MagicMock + +from bblocks.import_tools import world_bank_projects + + +class TestQueryAPI: + """Test QueryAPI class.""" + + def test_init(self): + """Test initialization of QueryAPI object.""" + + # test that error is raised if end_date is before start_date + with pytest.raises(ValueError): + world_bank_projects.QueryAPI(start_date='2020-01-01', end_date='2019-01-01') + + # test that error is raised if max_rows_per_response is greater than 1000 + with pytest.raises(ValueError): + world_bank_projects.QueryAPI(max_rows_per_response=1001) + + # test that start_date is dropped if end_date is None + assert 'strdate' not in world_bank_projects.QueryAPI(end_date='2020-01-01', + start_date=None)._params + + # test that end_date is dropped if start_date is None + assert 'enddate' not in world_bank_projects.QueryAPI(start_date='2020-01-01', + end_date=None)._params + + def test_request(self): + """ """ + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}} + + with patch("requests.get", return_value=mock_response) as mock_get: + assert world_bank_projects.QueryAPI()._request() == {'P1234': {'name': 'Test Project'}} + + def test_request_error(self): + """Test that error is raised if request fails.""" + + with patch("requests.get") as mock_get: + mock_get.return_value.raise_for_status.side_effect = ( + requests.exceptions.HTTPError + ) + mock_get.return_value.status_code = 404 + mock_get.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}} + + with pytest.raises(Exception): + world_bank_projects.QueryAPI()._request() + + def test_request_data_no_data(self): + """Test request_data method.""" + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = {'projects': {}} # test that empty response is handled + + with pytest.raises(world_bank_projects.EmptyDataException): + with patch("requests.get", return_value=mock_response) as mock_get: + obj = world_bank_projects.QueryAPI() + obj.request_data() + + def test_request_data(self): + """Test request_data method.""" + + # Mocking the requests.get function + mocked_get = MagicMock(side_effect=[ + Mock(json=MagicMock(return_value={'projects': {'P1': {'name': 'Test Project 1'}, + 'P2': {'name': 'Test Project 2'} + } + })), + Mock(json=MagicMock(return_value={'projects':{'P3': {'name': 'Test Project 3'}}})), + Mock(json=MagicMock(return_value={'projects': {}})) + ]) + + with patch("bblocks.import_tools.world_bank_projects.requests.get", mocked_get): + obj = world_bank_projects.QueryAPI() + obj.request_data() + + assert obj.response_data == {'P1': {'name': 'Test Project 1'}, + 'P2': {'name': 'Test Project 2'}, + 'P3': {'name': 'Test Project 3'} + } From 45676580c02713599f5e525bfd6aedb9d03ddc49 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Tue, 4 Jul 2023 16:47:51 +0200 Subject: [PATCH 02/13] update script --- bblocks/import_tools/world_bank_projects.py | 209 ++++++++++++++---- .../test_world_bank_projects.py | 91 ++++++++ 2 files changed, 262 insertions(+), 38 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index 7f9495f..e30a48a 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -9,6 +9,7 @@ from bblocks.logger import logger from bblocks.import_tools.common import ImportData from bblocks.config import BBPaths +from bblocks.cleaning_tools import clean class EmptyDataException(Exception): @@ -26,7 +27,7 @@ class QueryAPI: def __init__( self, response_format: str = 'json', max_rows_per_response: int = 500, start_date: str | None = None, end_date: str | None = None - ): + ): """Initialize QueryAPI object""" self.response_format = response_format @@ -124,26 +125,100 @@ def get_data(self) -> dict[dict]: return self.response_data -fields = { - 'id': 'id', - 'regionname': 'region', +def clean_theme(data: dict) -> list[dict]: + """Clean theme data from a nested list to a dataframe + If there are no themes, an empty dataframe will be returned + + Args: + data: data from API + + Returns: + dict with theme names and percentages + """ + + # if there are no themes, return an empty dataframe + if 'theme_list' not in data.keys(): + # return [{'project ID': proj_id}] + return [] + + theme_list = [] + proj_id = data['id'] + for theme1 in data['theme_list']: + + # get first theme + name = theme1['name'] + theme_list.append({'project ID': proj_id, + 'theme1': name, + 'percent': theme1['percent']}) + + # get 2nd theme + if 'theme2' in theme1.keys(): + for theme2 in theme1['theme2']: + name_2 = theme2['name'] + theme_list.append({'project ID': proj_id, + 'theme1': name, + 'theme2': name_2, + 'percent': theme2['percent']}) + + # get 3rd theme + if 'theme3' in theme2.keys(): + for theme3 in theme2['theme3']: + name_3 = theme3['name'] + theme_list.append({'project ID': proj_id, + 'theme1': name, + 'theme2': name_2, + 'theme3': name_3, + 'percent': theme3['percent']}) + return theme_list + + +def clean_sector(sector_series: pd.Series) -> pd.Series: + """Format sector data from a nested list to a string separating sectors by ' | ' + If there are no sectors, np.nan will be placed in the series row + + Args: + sector_series: series of sector data + """ + + return (sector_series + .apply(lambda x: ' | '.join([item['Name'] for item in x])if isinstance(x, list) else np.nan) + ) + + +general_fields = { # general info + 'id': 'project ID', 'project_name': 'project name', 'countryshortname': 'country', - 'projectstatusdisplay': 'project status', - - - - - 'curr_total_commitment': 'total commitment', - 'curr_ibrd_commitment': 'IBRD commitment', - 'curr_ida_commitment': 'IDA commitment', - + 'regionname': 'region name', + 'url': 'url', + 'teamleadname': 'team leader', + 'status': 'status', + 'envassesmentcategorycode': 'environmental assesment category', + + # dates + 'approvalfy': 'fiscal year', + 'boardapprovaldate': 'board approval date', + 'closingdate': 'closing date', + 'p2a_updated_date': 'update date', + + # lending + 'lendinginstr': 'lending instrument', + 'borrower': 'borrower', + 'impagency': 'implementing agency', + 'lendprojectcost': 'project cost', + 'totalcommamt': 'total commitment', + 'grantamt': 'grant amount', + 'idacommamt': 'IDA commitment amount', + 'ibrdcommamt': 'IBRD commitment amount', + 'curr_total_commitment': 'current total IBRD and IDA commitment', + 'curr_ibrd_commitment': 'current IBRD commitment', + 'curr_ida_commitment': 'current IDA commitment', + + # sectors + 'sector': 'sectors', } -# df = pd.DataFrame.from_dict(proj._raw_data, orient='index') - - @dataclass class WorldBankProjects(ImportData): """World Bank Projects Database Importer""" @@ -160,44 +235,102 @@ def _path(self): return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json" + def _format_data(self): + """Cleaning and formatting""" + + # create dataframe for general data + + numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt', + 'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment', + 'curr_ida_commitment'] + + self._data['general_data'] = (pd + .DataFrame.from_dict(self._raw_data, orient='index') + .reset_index(drop=True) + .loc[:, general_fields.keys()] + # change fiscal year to int + .assign(approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int)) + # change numeric columns to float + .pipe(clean.clean_numeric_series,series_columns=numeric_cols) + .assign(#format dates + boardapprovaldate = lambda d: clean.to_date_column(d['boardapprovaldate']), + closingdate = lambda d: clean.to_date_column(d['closingdate']), + p2a_updated_date =lambda d: clean.to_date_column(d['p2a_updated_date']), + #format sectors + sector = lambda d: clean_sector(d['sector']) + ) + # rename columns + .rename(columns=general_fields) + ) + + theme_data = [] + for _, proj_data in self._raw_data.items(): + theme_data.extend(clean_theme(proj_data)) + + self._data['theme_data'] = (pd.DataFrame(theme_data) + .assign(percent=lambda d: clean.clean_numeric_series(d['percent'], to=float)) + ) + + def _download(self) -> None: + """Download data from World Bank Projects API and save it as a json file""" + + with open(self._path, 'w') as file: + data = (QueryAPI(start_date=self.start_date, end_date=self.end_date) + .request_data() + .get_data() + ) + json.dump(data, file) + logger.info(f"Successfully downloaded World Bank Projects") + def load_data(self, project_codes: str | list = 'all') -> ImportData: """ """ # if file does not exist, download it and save it as a json file if not self._path.exists(): - with open(self._path, 'w') as file: - data = (QueryAPI(start_date=self.start_date, end_date=self.end_date) - .request_data() - .get_data() - ) - json.dump(data, file) - logger.info(f"Successfully downloaded World Bank Projects") + self._download() + # load data from json file with open(self._path, "r") as file: self._raw_data = json.load(file) - if project_codes == 'all': - self._data = self._raw_data + if self._raw_data is None: + raise EmptyDataException("No data was retrieved") - if isinstance(project_codes, str): - project_codes = [project_codes] + # format data + self._format_data() + return self - if isinstance(project_codes, list): - self._data = {k: v for k, v in self._raw_data.items() - if k in project_codes} + def update_data(self, reload: bool = True) -> ImportData: + """ """ - if self._data == {}: - raise ValueError("No projects found with the given project codes") - logger.info(f"Successfully loaded World Bank Projects") + self._download() + if reload: + self.load_data() return self - def update_data(self, reload: bool = True) -> ImportData: + def get_data( + self, project_codes: str | list = 'all', + data_type: str = 'general', + **kwargs + ) -> pd.DataFrame: """ """ - pass + if data_type == 'general': + df = self._data['general_data'] + elif data_type == 'theme': + df = self._data['theme_data'] + else: + raise ValueError("data_type must be either 'general' or 'theme'") - def get_data(self): - """ """ + if project_codes != 'all': + if isinstance(project_codes, str): + project_codes = [project_codes] + df = df[df['project ID'].isin(project_codes)] + + return df + + def get_json(self) -> dict: + """Return the raw data as a dictionary""" - print('test') + return self._raw_data diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py index 51d017d..b994a27 100644 --- a/tests/test_import_tools/test_world_bank_projects.py +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -86,3 +86,94 @@ def test_request_data(self): 'P2': {'name': 'Test Project 2'}, 'P3': {'name': 'Test Project 3'} } + + +def test_clean_theme(): + """Test clean_theme function.""" + + test_data_dict = {'id': 'P1234', + 'theme_list': + [{'name': 'Environment and Natural Resource Management', + 'code': '80', + 'seqno': '14', + 'percent': '34', + 'theme2': [ + {'name': 'Energy', + 'code': '86', + 'seqno': '18', + 'percent': '13', + 'theme3': [ + { 'name': 'Energy Efficiency', + 'code': '861', + 'seqno': '34', + 'percent': '13'}, + {'name': 'Energy Policies & Reform', + 'code': '862', + 'seqno': '35', + 'percent': '13'}] + }, + {'name': 'Environmental policies and institutions', + 'code': '84', + 'seqno': '17', + 'percent': '13'}, + + {'name': 'Environmental Health and Pollution Management', + 'code': '82', + 'seqno': '16', + 'percent': '13', + 'theme3': [ + {'name': 'Air quality management', + 'code': '821', + 'seqno': '33', + 'percent': '13'}]}, + {'name': 'Climate change', + 'code': '81', + 'seqno': '15', + 'percent': '34', + 'theme3': [ + {'name': 'Adaptation', + 'code': '812', + 'seqno': '32', + 'percent': '8'}, + {'name': 'Mitigation', 'code': '811', 'seqno': '31', 'percent': '26'}]}]} + ] + } + + formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': '34'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': '13'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': '34'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': '8'}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': '26'}] + + assert world_bank_projects.clean_theme(test_data_dict) == formatted + + +def test_clean_theme_no_theme(): + """Test clean_theme function with no theme.""" + + test_data_dict = {'id': 'P1234' + } + assert world_bank_projects.clean_theme(test_data_dict) == [] + + +def test_clean_sector(): + """Test clean_sector function.""" + + test_series = pd.Series({0: [{'Name': 'Public Administration - Transportation'}, {'Name': 'Ports/Waterways'}], + 1: [{'Name': 'Public Administration - Agriculture, Fishing & Forestry'}, {'Name': 'Agricultural Extension, Research, and Other Support Activities'}, {'Name': 'Other Agriculture, Fishing and Forestry'}, {'Name': 'Irrigation and Drainage'}, {'Name': 'Agricultural markets, commercialization and agri-business'}], + 2: np.nan, + 3: np.nan}) + + expected = pd.Series({0: 'Public Administration - Transportation | Ports/Waterways', + 1: 'Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business', + 2: np.nan, + 3: np.nan}) + + assert world_bank_projects.clean_sector(test_series).equals(expected) + + From 1e7c65c605c92eb8a3abc0ca828de406cbb1bdf5 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Tue, 4 Jul 2023 17:19:12 +0200 Subject: [PATCH 03/13] update --- bblocks/import_tools/world_bank_projects.py | 123 +++++++++++++----- .../test_world_bank_projects.py | 20 +-- 2 files changed, 97 insertions(+), 46 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index e30a48a..3d3ede9 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -125,18 +125,18 @@ def get_data(self) -> dict[dict]: return self.response_data -def clean_theme(data: dict) -> list[dict]: - """Clean theme data from a nested list to a dataframe - If there are no themes, an empty dataframe will be returned +def clean_theme(data: dict) -> list[dict] | list: + """Clean theme data from a nested list to a list of dictionaries with theme names and percentages + If there are no themes, an empty list will be returned Args: data: data from API Returns: - dict with theme names and percentages + list of dictionaries with theme names and percentages """ - # if there are no themes, return an empty dataframe + # if there are no themes, return an empty list if 'theme_list' not in data.keys(): # return [{'project ID': proj_id}] return [] @@ -149,7 +149,7 @@ def clean_theme(data: dict) -> list[dict]: name = theme1['name'] theme_list.append({'project ID': proj_id, 'theme1': name, - 'percent': theme1['percent']}) + 'percent': clean.clean_number(theme1['percent'])}) # get 2nd theme if 'theme2' in theme1.keys(): @@ -158,7 +158,7 @@ def clean_theme(data: dict) -> list[dict]: theme_list.append({'project ID': proj_id, 'theme1': name, 'theme2': name_2, - 'percent': theme2['percent']}) + 'percent': clean.clean_number(theme2['percent'])}) # get 3rd theme if 'theme3' in theme2.keys(): @@ -168,7 +168,7 @@ def clean_theme(data: dict) -> list[dict]: 'theme1': name, 'theme2': name_2, 'theme3': name_3, - 'percent': theme3['percent']}) + 'percent': clean.clean_number(theme3['percent'])}) return theme_list @@ -178,10 +178,14 @@ def clean_sector(sector_series: pd.Series) -> pd.Series: Args: sector_series: series of sector data + + Returns: + series of sector data as a string separated by ' | ' """ return (sector_series - .apply(lambda x: ' | '.join([item['Name'] for item in x])if isinstance(x, list) else np.nan) + .apply(lambda x: ' | '.join([item['Name'] + for item in x]) if isinstance(x, list) else np.nan) ) @@ -221,7 +225,22 @@ def clean_sector(sector_series: pd.Series) -> pd.Series: @dataclass class WorldBankProjects(ImportData): - """World Bank Projects Database Importer""" + """World Bank Projects Database Importer + + This object will import the World Bank Projects database from the World Bank API. + To use, create an instance of the class. Optionally, you can specify the start and end dates + of the data to import. If no dates are specified, all data will be imported. + To import the data, call the load_data method. If the data has already downloaded, it will + be loaded to the object from disk, otherwise it will be downloaded from the API. + To retrieve the data, call the get_data method. You can specify the type of data to retrieve, + either 'general' or 'theme'. If no type is specified, 'general' data will be returned. + To update the data, call the update_data method. This will download the data from the API. if 'reload' is + set to True, the data will be reloaded to the object. + + Parameters: + start_date: start date of data to import, in DD-MM-YYYY format + end_date: end date of data to import, in DD-MM-YYYY format + """ start_date: str | None = None end_date: str | None = None @@ -235,10 +254,8 @@ def _path(self): return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json" - def _format_data(self): - """Cleaning and formatting""" - - # create dataframe for general data + def _format_general_data(self) -> None: + """Clean and format general data and store it in _data attribute with key 'general_data'""" numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt', 'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment', @@ -248,28 +265,30 @@ def _format_data(self): .DataFrame.from_dict(self._raw_data, orient='index') .reset_index(drop=True) .loc[:, general_fields.keys()] - # change fiscal year to int - .assign(approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int)) - # change numeric columns to float - .pipe(clean.clean_numeric_series,series_columns=numeric_cols) - .assign(#format dates - boardapprovaldate = lambda d: clean.to_date_column(d['boardapprovaldate']), - closingdate = lambda d: clean.to_date_column(d['closingdate']), - p2a_updated_date =lambda d: clean.to_date_column(d['p2a_updated_date']), - #format sectors - sector = lambda d: clean_sector(d['sector']) - ) - # rename columns + # change fiscal year to int + .assign( + approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int)) + # change numeric columns to float + .pipe(clean.clean_numeric_series, series_columns=numeric_cols) + .assign( # format dates + boardapprovaldate=lambda d: clean.to_date_column(d['boardapprovaldate']), + closingdate=lambda d: clean.to_date_column(d['closingdate']), + p2a_updated_date=lambda d: clean.to_date_column(d['p2a_updated_date']), + # format sectors + sector=lambda d: clean_sector(d['sector']) + ) + # rename columns .rename(columns=general_fields) ) + def _format_theme_data(self) -> None: + """Format theme data and store it as a dataframe in _data attribute with key 'theme_data'""" + theme_data = [] for _, proj_data in self._raw_data.items(): theme_data.extend(clean_theme(proj_data)) - self._data['theme_data'] = (pd.DataFrame(theme_data) - .assign(percent=lambda d: clean.clean_numeric_series(d['percent'], to=float)) - ) + self._data['theme_data'] = pd.DataFrame(theme_data) def _download(self) -> None: """Download data from World Bank Projects API and save it as a json file""" @@ -280,10 +299,20 @@ def _download(self) -> None: .get_data() ) json.dump(data, file) - logger.info(f"Successfully downloaded World Bank Projects") - def load_data(self, project_codes: str | list = 'all') -> ImportData: - """ """ + logger.info(f"Successfully downloaded World Bank Projects") + + def load_data(self) -> ImportData: + """Load data to the object + + This method will load the World Bank Project data to the object. + If the data has already downloaded, it will be loaded to the object from disk, + otherwise it will be downloaded from the API and saved as a json file and loaded + to the object. + + returns: + object with loaded data + """ # if file does not exist, download it and save it as a json file if not self._path.exists(): @@ -296,12 +325,25 @@ def load_data(self, project_codes: str | list = 'all') -> ImportData: if self._raw_data is None: raise EmptyDataException("No data was retrieved") - # format data - self._format_data() + # set data + self._format_general_data() + self._format_theme_data() + + logger.info(f"Successfully loaded World Bank Projects") return self def update_data(self, reload: bool = True) -> ImportData: - """ """ + """Force update of data + + This method will download the data from the API. + If 'reload' is set to True, the data will be reloaded to the object. + + Args: + reload: if True, reload data to object after downloading it + + returns: + object with updated data + """ self._download() if reload: @@ -314,7 +356,16 @@ def get_data( data_type: str = 'general', **kwargs ) -> pd.DataFrame: - """ """ + """Get the data as a dataframe + + Get the the general data or the theme data for World Bank Projects as a dataframe. + Optionally, you can specify the project codes to retrieve data for. If no project codes + are specified, data for all projects will be returned. + + Args: + project_codes: project codes to retrieve data for. If 'all', data for all projects will be returned + data_type: type of data to retrieve. Either 'general' or 'theme' + """ if data_type == 'general': df = self._data['general_data'] diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py index b994a27..2baffc8 100644 --- a/tests/test_import_tools/test_world_bank_projects.py +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -139,16 +139,16 @@ def test_clean_theme(): ] } - formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': '34'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': '13'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': '34'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': '8'}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': '26'}] + formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': 34}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': 13}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': 34}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': 8}, + {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': 26}] assert world_bank_projects.clean_theme(test_data_dict) == formatted From 2caa2fb608ebf4910d5444b656a580628f2661c2 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Tue, 4 Jul 2023 17:23:50 +0200 Subject: [PATCH 04/13] ignore json downloads --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1b6675b..d126c3c 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ ENV/ # downloaded files /bblocks/.raw_data/*.csv /bblocks/.raw_data/*.feather +/bblocks/.raw_data/*.json # Sphinx documentation docs/_build/ From 700cb9c6bb3ad5849f97bb323efff3518a7e370c Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Tue, 4 Jul 2023 17:26:37 +0200 Subject: [PATCH 05/13] formatting --- bblocks/import_tools/world_bank_projects.py | 240 +++++++------- .../test_world_bank_projects.py | 307 +++++++++++++----- 2 files changed, 348 insertions(+), 199 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index 3d3ede9..33c88db 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -25,8 +25,11 @@ class QueryAPI: """Helper class for querying the World Bank Projects API""" def __init__( - self, response_format: str = 'json', max_rows_per_response: int = 500, - start_date: str | None = None, end_date: str | None = None + self, + response_format: str = "json", + max_rows_per_response: int = 500, + start_date: str | None = None, + end_date: str | None = None, ): """Initialize QueryAPI object""" @@ -36,11 +39,11 @@ def __init__( self.end_date = end_date self._params = { - 'format': self.response_format, - 'rows': self.max_rows_per_response, + "format": self.response_format, + "rows": self.max_rows_per_response, # 'os': 0, # offset - 'strdate': self.start_date, - 'enddate': self.end_date + "strdate": self.start_date, + "enddate": self.end_date, } self._check_params() @@ -51,22 +54,22 @@ def _check_params(self) -> None: """Check parameters""" # if end_date is before start_date, raise error - if self._params['strdate'] is not None and self._params['enddate'] is not None: - if self._params['enddate'] < self._params['strdate']: + if self._params["strdate"] is not None and self._params["enddate"] is not None: + if self._params["enddate"] < self._params["strdate"]: raise ValueError("end date must be after start date") # if max_rows is greater than 1000, raise error - if self._params['rows'] > 1000: + if self._params["rows"] > 1000: raise ValueError("max_rows must be less than or equal to 1000") # if dates are None, drop them from params - if self._params['strdate'] is None: + if self._params["strdate"] is None: # drop start_date from params - self._params.pop('strdate') + self._params.pop("strdate") - if self._params['enddate'] is None: + if self._params["enddate"] is None: # drop end_date from params - self._params.pop('enddate') + self._params.pop("enddate") def _request(self) -> dict: """Single request to API. Returns the rsponse json.""" @@ -74,14 +77,14 @@ def _request(self) -> dict: try: response = requests.get(BASE_API_URL, params=self._params) response.raise_for_status() - data = response.json()['projects'] # keep only the projects data + data = response.json()["projects"] # keep only the projects data return data except Exception as e: raise Exception(f"Failed to get data: {e}") - def request_data(self) -> 'QueryAPI': + def request_data(self) -> "QueryAPI": """Request data from API This method will request all the data from the API @@ -93,7 +96,7 @@ def request_data(self) -> 'QueryAPI': 'QueryAPI' to allow chaining of methods """ - self._params['os'] = 0 # reset offset to 0 + self._params["os"] = 0 # reset offset to 0 while True: @@ -108,7 +111,7 @@ def request_data(self) -> 'QueryAPI': self.response_data.update(data) # update offset - self._params['os'] += self._params['rows'] + self._params["os"] += self._params["rows"] # Log if no data was returned from API if len(self.response_data) == 0: @@ -137,38 +140,50 @@ def clean_theme(data: dict) -> list[dict] | list: """ # if there are no themes, return an empty list - if 'theme_list' not in data.keys(): + if "theme_list" not in data.keys(): # return [{'project ID': proj_id}] return [] theme_list = [] - proj_id = data['id'] - for theme1 in data['theme_list']: + proj_id = data["id"] + for theme1 in data["theme_list"]: # get first theme - name = theme1['name'] - theme_list.append({'project ID': proj_id, - 'theme1': name, - 'percent': clean.clean_number(theme1['percent'])}) + name = theme1["name"] + theme_list.append( + { + "project ID": proj_id, + "theme1": name, + "percent": clean.clean_number(theme1["percent"]), + } + ) # get 2nd theme - if 'theme2' in theme1.keys(): - for theme2 in theme1['theme2']: - name_2 = theme2['name'] - theme_list.append({'project ID': proj_id, - 'theme1': name, - 'theme2': name_2, - 'percent': clean.clean_number(theme2['percent'])}) + if "theme2" in theme1.keys(): + for theme2 in theme1["theme2"]: + name_2 = theme2["name"] + theme_list.append( + { + "project ID": proj_id, + "theme1": name, + "theme2": name_2, + "percent": clean.clean_number(theme2["percent"]), + } + ) # get 3rd theme - if 'theme3' in theme2.keys(): - for theme3 in theme2['theme3']: - name_3 = theme3['name'] - theme_list.append({'project ID': proj_id, - 'theme1': name, - 'theme2': name_2, - 'theme3': name_3, - 'percent': clean.clean_number(theme3['percent'])}) + if "theme3" in theme2.keys(): + for theme3 in theme2["theme3"]: + name_3 = theme3["name"] + theme_list.append( + { + "project ID": proj_id, + "theme1": name, + "theme2": name_2, + "theme3": name_3, + "percent": clean.clean_number(theme3["percent"]), + } + ) return theme_list @@ -183,43 +198,41 @@ def clean_sector(sector_series: pd.Series) -> pd.Series: series of sector data as a string separated by ' | ' """ - return (sector_series - .apply(lambda x: ' | '.join([item['Name'] - for item in x]) if isinstance(x, list) else np.nan) - ) + return sector_series.apply( + lambda x: " | ".join([item["Name"] for item in x]) + if isinstance(x, list) + else np.nan + ) general_fields = { # general info - 'id': 'project ID', - 'project_name': 'project name', - 'countryshortname': 'country', - 'regionname': 'region name', - 'url': 'url', - 'teamleadname': 'team leader', - 'status': 'status', - 'envassesmentcategorycode': 'environmental assesment category', - + "id": "project ID", + "project_name": "project name", + "countryshortname": "country", + "regionname": "region name", + "url": "url", + "teamleadname": "team leader", + "status": "status", + "envassesmentcategorycode": "environmental assesment category", # dates - 'approvalfy': 'fiscal year', - 'boardapprovaldate': 'board approval date', - 'closingdate': 'closing date', - 'p2a_updated_date': 'update date', - + "approvalfy": "fiscal year", + "boardapprovaldate": "board approval date", + "closingdate": "closing date", + "p2a_updated_date": "update date", # lending - 'lendinginstr': 'lending instrument', - 'borrower': 'borrower', - 'impagency': 'implementing agency', - 'lendprojectcost': 'project cost', - 'totalcommamt': 'total commitment', - 'grantamt': 'grant amount', - 'idacommamt': 'IDA commitment amount', - 'ibrdcommamt': 'IBRD commitment amount', - 'curr_total_commitment': 'current total IBRD and IDA commitment', - 'curr_ibrd_commitment': 'current IBRD commitment', - 'curr_ida_commitment': 'current IDA commitment', - + "lendinginstr": "lending instrument", + "borrower": "borrower", + "impagency": "implementing agency", + "lendprojectcost": "project cost", + "totalcommamt": "total commitment", + "grantamt": "grant amount", + "idacommamt": "IDA commitment amount", + "ibrdcommamt": "IBRD commitment amount", + "curr_total_commitment": "current total IBRD and IDA commitment", + "curr_ibrd_commitment": "current IBRD commitment", + "curr_ida_commitment": "current IDA commitment", # sectors - 'sector': 'sectors', + "sector": "sectors", } @@ -249,37 +262,47 @@ class WorldBankProjects(ImportData): def _path(self): """Generate path based on version""" - start_date = f'_{self.start_date}' if self.start_date is not None else '' - end_date = f'_{self.end_date}' if self.end_date is not None else '' + start_date = f"_{self.start_date}" if self.start_date is not None else "" + end_date = f"_{self.end_date}" if self.end_date is not None else "" return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json" def _format_general_data(self) -> None: """Clean and format general data and store it in _data attribute with key 'general_data'""" - numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt', - 'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment', - 'curr_ida_commitment'] - - self._data['general_data'] = (pd - .DataFrame.from_dict(self._raw_data, orient='index') - .reset_index(drop=True) - .loc[:, general_fields.keys()] - # change fiscal year to int - .assign( - approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int)) - # change numeric columns to float - .pipe(clean.clean_numeric_series, series_columns=numeric_cols) - .assign( # format dates - boardapprovaldate=lambda d: clean.to_date_column(d['boardapprovaldate']), - closingdate=lambda d: clean.to_date_column(d['closingdate']), - p2a_updated_date=lambda d: clean.to_date_column(d['p2a_updated_date']), - # format sectors - sector=lambda d: clean_sector(d['sector']) + numeric_cols = [ + "lendprojectcost", + "totalcommamt", + "grantamt", + "idacommamt", + "ibrdcommamt", + "curr_total_commitment", + "curr_ibrd_commitment", + "curr_ida_commitment", + ] + + self._data["general_data"] = ( + pd.DataFrame.from_dict(self._raw_data, orient="index") + .reset_index(drop=True) + .loc[:, general_fields.keys()] + # change fiscal year to int + .assign( + approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int) + ) + # change numeric columns to float + .pipe(clean.clean_numeric_series, series_columns=numeric_cols) + .assign( # format dates + boardapprovaldate=lambda d: clean.to_date_column( + d["boardapprovaldate"] + ), + closingdate=lambda d: clean.to_date_column(d["closingdate"]), + p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]), + # format sectors + sector=lambda d: clean_sector(d["sector"]), + ) + # rename columns + .rename(columns=general_fields) ) - # rename columns - .rename(columns=general_fields) - ) def _format_theme_data(self) -> None: """Format theme data and store it as a dataframe in _data attribute with key 'theme_data'""" @@ -288,16 +311,17 @@ def _format_theme_data(self) -> None: for _, proj_data in self._raw_data.items(): theme_data.extend(clean_theme(proj_data)) - self._data['theme_data'] = pd.DataFrame(theme_data) + self._data["theme_data"] = pd.DataFrame(theme_data) def _download(self) -> None: """Download data from World Bank Projects API and save it as a json file""" - with open(self._path, 'w') as file: - data = (QueryAPI(start_date=self.start_date, end_date=self.end_date) - .request_data() - .get_data() - ) + with open(self._path, "w") as file: + data = ( + QueryAPI(start_date=self.start_date, end_date=self.end_date) + .request_data() + .get_data() + ) json.dump(data, file) logger.info(f"Successfully downloaded World Bank Projects") @@ -352,9 +376,7 @@ def update_data(self, reload: bool = True) -> ImportData: return self def get_data( - self, project_codes: str | list = 'all', - data_type: str = 'general', - **kwargs + self, project_codes: str | list = "all", data_type: str = "general", **kwargs ) -> pd.DataFrame: """Get the data as a dataframe @@ -367,17 +389,17 @@ def get_data( data_type: type of data to retrieve. Either 'general' or 'theme' """ - if data_type == 'general': - df = self._data['general_data'] - elif data_type == 'theme': - df = self._data['theme_data'] + if data_type == "general": + df = self._data["general_data"] + elif data_type == "theme": + df = self._data["theme_data"] else: raise ValueError("data_type must be either 'general' or 'theme'") - if project_codes != 'all': + if project_codes != "all": if isinstance(project_codes, str): project_codes = [project_codes] - df = df[df['project ID'].isin(project_codes)] + df = df[df["project ID"].isin(project_codes)] return df diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py index 2baffc8..0531b39 100644 --- a/tests/test_import_tools/test_world_bank_projects.py +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -17,28 +17,40 @@ def test_init(self): # test that error is raised if end_date is before start_date with pytest.raises(ValueError): - world_bank_projects.QueryAPI(start_date='2020-01-01', end_date='2019-01-01') + world_bank_projects.QueryAPI(start_date="2020-01-01", end_date="2019-01-01") # test that error is raised if max_rows_per_response is greater than 1000 with pytest.raises(ValueError): world_bank_projects.QueryAPI(max_rows_per_response=1001) # test that start_date is dropped if end_date is None - assert 'strdate' not in world_bank_projects.QueryAPI(end_date='2020-01-01', - start_date=None)._params + assert ( + "strdate" + not in world_bank_projects.QueryAPI( + end_date="2020-01-01", start_date=None + )._params + ) # test that end_date is dropped if start_date is None - assert 'enddate' not in world_bank_projects.QueryAPI(start_date='2020-01-01', - end_date=None)._params + assert ( + "enddate" + not in world_bank_projects.QueryAPI( + start_date="2020-01-01", end_date=None + )._params + ) def test_request(self): """ """ mock_response = Mock() mock_response.status_code = 200 - mock_response.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}} + mock_response.json.return_value = { + "projects": {"P1234": {"name": "Test Project"}} + } with patch("requests.get", return_value=mock_response) as mock_get: - assert world_bank_projects.QueryAPI()._request() == {'P1234': {'name': 'Test Project'}} + assert world_bank_projects.QueryAPI()._request() == { + "P1234": {"name": "Test Project"} + } def test_request_error(self): """Test that error is raised if request fails.""" @@ -48,7 +60,9 @@ def test_request_error(self): requests.exceptions.HTTPError ) mock_get.return_value.status_code = 404 - mock_get.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}} + mock_get.json.return_value = { + "projects": {"P1234": {"name": "Test Project"}} + } with pytest.raises(Exception): world_bank_projects.QueryAPI()._request() @@ -58,7 +72,9 @@ def test_request_data_no_data(self): mock_response = Mock() mock_response.status_code = 200 - mock_response.json.return_value = {'projects': {}} # test that empty response is handled + mock_response.json.return_value = { + "projects": {} + } # test that empty response is handled with pytest.raises(world_bank_projects.EmptyDataException): with patch("requests.get", return_value=mock_response) as mock_get: @@ -69,86 +85,181 @@ def test_request_data(self): """Test request_data method.""" # Mocking the requests.get function - mocked_get = MagicMock(side_effect=[ - Mock(json=MagicMock(return_value={'projects': {'P1': {'name': 'Test Project 1'}, - 'P2': {'name': 'Test Project 2'} - } - })), - Mock(json=MagicMock(return_value={'projects':{'P3': {'name': 'Test Project 3'}}})), - Mock(json=MagicMock(return_value={'projects': {}})) - ]) + mocked_get = MagicMock( + side_effect=[ + Mock( + json=MagicMock( + return_value={ + "projects": { + "P1": {"name": "Test Project 1"}, + "P2": {"name": "Test Project 2"}, + } + } + ) + ), + Mock( + json=MagicMock( + return_value={"projects": {"P3": {"name": "Test Project 3"}}} + ) + ), + Mock(json=MagicMock(return_value={"projects": {}})), + ] + ) with patch("bblocks.import_tools.world_bank_projects.requests.get", mocked_get): obj = world_bank_projects.QueryAPI() obj.request_data() - assert obj.response_data == {'P1': {'name': 'Test Project 1'}, - 'P2': {'name': 'Test Project 2'}, - 'P3': {'name': 'Test Project 3'} - } + assert obj.response_data == { + "P1": {"name": "Test Project 1"}, + "P2": {"name": "Test Project 2"}, + "P3": {"name": "Test Project 3"}, + } def test_clean_theme(): """Test clean_theme function.""" - test_data_dict = {'id': 'P1234', - 'theme_list': - [{'name': 'Environment and Natural Resource Management', - 'code': '80', - 'seqno': '14', - 'percent': '34', - 'theme2': [ - {'name': 'Energy', - 'code': '86', - 'seqno': '18', - 'percent': '13', - 'theme3': [ - { 'name': 'Energy Efficiency', - 'code': '861', - 'seqno': '34', - 'percent': '13'}, - {'name': 'Energy Policies & Reform', - 'code': '862', - 'seqno': '35', - 'percent': '13'}] - }, - {'name': 'Environmental policies and institutions', - 'code': '84', - 'seqno': '17', - 'percent': '13'}, - - {'name': 'Environmental Health and Pollution Management', - 'code': '82', - 'seqno': '16', - 'percent': '13', - 'theme3': [ - {'name': 'Air quality management', - 'code': '821', - 'seqno': '33', - 'percent': '13'}]}, - {'name': 'Climate change', - 'code': '81', - 'seqno': '15', - 'percent': '34', - 'theme3': [ - {'name': 'Adaptation', - 'code': '812', - 'seqno': '32', - 'percent': '8'}, - {'name': 'Mitigation', 'code': '811', 'seqno': '31', 'percent': '26'}]}]} - ] + test_data_dict = { + "id": "P1234", + "theme_list": [ + { + "name": "Environment and Natural Resource Management", + "code": "80", + "seqno": "14", + "percent": "34", + "theme2": [ + { + "name": "Energy", + "code": "86", + "seqno": "18", + "percent": "13", + "theme3": [ + { + "name": "Energy Efficiency", + "code": "861", + "seqno": "34", + "percent": "13", + }, + { + "name": "Energy Policies & Reform", + "code": "862", + "seqno": "35", + "percent": "13", + }, + ], + }, + { + "name": "Environmental policies and institutions", + "code": "84", + "seqno": "17", + "percent": "13", + }, + { + "name": "Environmental Health and Pollution Management", + "code": "82", + "seqno": "16", + "percent": "13", + "theme3": [ + { + "name": "Air quality management", + "code": "821", + "seqno": "33", + "percent": "13", + } + ], + }, + { + "name": "Climate change", + "code": "81", + "seqno": "15", + "percent": "34", + "theme3": [ + { + "name": "Adaptation", + "code": "812", + "seqno": "32", + "percent": "8", + }, + { + "name": "Mitigation", + "code": "811", + "seqno": "31", + "percent": "26", + }, + ], + }, + ], + } + ], } - formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': 34}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': 13}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': 34}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': 8}, - {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': 26}] + formatted = [ + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "percent": 34, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Energy", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Energy", + "theme3": "Energy Efficiency", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Energy", + "theme3": "Energy Policies & Reform", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Environmental policies and institutions", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Environmental Health and Pollution Management", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Environmental Health and Pollution Management", + "theme3": "Air quality management", + "percent": 13, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Climate change", + "percent": 34, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Climate change", + "theme3": "Adaptation", + "percent": 8, + }, + { + "project ID": "P1234", + "theme1": "Environment and Natural Resource Management", + "theme2": "Climate change", + "theme3": "Mitigation", + "percent": 26, + }, + ] assert world_bank_projects.clean_theme(test_data_dict) == formatted @@ -156,24 +267,40 @@ def test_clean_theme(): def test_clean_theme_no_theme(): """Test clean_theme function with no theme.""" - test_data_dict = {'id': 'P1234' - } + test_data_dict = {"id": "P1234"} assert world_bank_projects.clean_theme(test_data_dict) == [] def test_clean_sector(): """Test clean_sector function.""" - test_series = pd.Series({0: [{'Name': 'Public Administration - Transportation'}, {'Name': 'Ports/Waterways'}], - 1: [{'Name': 'Public Administration - Agriculture, Fishing & Forestry'}, {'Name': 'Agricultural Extension, Research, and Other Support Activities'}, {'Name': 'Other Agriculture, Fishing and Forestry'}, {'Name': 'Irrigation and Drainage'}, {'Name': 'Agricultural markets, commercialization and agri-business'}], - 2: np.nan, - 3: np.nan}) - - expected = pd.Series({0: 'Public Administration - Transportation | Ports/Waterways', - 1: 'Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business', - 2: np.nan, - 3: np.nan}) + test_series = pd.Series( + { + 0: [ + {"Name": "Public Administration - Transportation"}, + {"Name": "Ports/Waterways"}, + ], + 1: [ + {"Name": "Public Administration - Agriculture, Fishing & Forestry"}, + { + "Name": "Agricultural Extension, Research, and Other Support Activities" + }, + {"Name": "Other Agriculture, Fishing and Forestry"}, + {"Name": "Irrigation and Drainage"}, + {"Name": "Agricultural markets, commercialization and agri-business"}, + ], + 2: np.nan, + 3: np.nan, + } + ) + + expected = pd.Series( + { + 0: "Public Administration - Transportation | Ports/Waterways", + 1: "Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business", + 2: np.nan, + 3: np.nan, + } + ) assert world_bank_projects.clean_sector(test_series).equals(expected) - - From 022a9d878b672778835a99d6b9cea7d3ede1c50c Mon Sep 17 00:00:00 2001 From: Jorge Rivera Date: Thu, 6 Jul 2023 10:14:31 +0200 Subject: [PATCH 06/13] minor formatting tweaks Including https for api call --- bblocks/import_tools/world_bank_projects.py | 36 +++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index 33c88db..c436cf0 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -18,7 +18,7 @@ class EmptyDataException(Exception): pass -BASE_API_URL = "http://search.worldbank.org/api/v2/projects" +BASE_API_URL = "https://search.worldbank.org/api/v2/projects" class QueryAPI: @@ -53,7 +53,7 @@ def __init__( def _check_params(self) -> None: """Check parameters""" - # if end_date is before start_date, raise error + # if end_date is before start_date, raise error. if self._params["strdate"] is not None and self._params["enddate"] is not None: if self._params["enddate"] < self._params["strdate"]: raise ValueError("end date must be after start date") @@ -72,7 +72,7 @@ def _check_params(self) -> None: self._params.pop("enddate") def _request(self) -> dict: - """Single request to API. Returns the rsponse json.""" + """Single request to API. Returns the response json.""" try: response = requests.get(BASE_API_URL, params=self._params) @@ -120,7 +120,7 @@ def request_data(self) -> "QueryAPI": return self def get_data(self) -> dict[dict]: - """Get the data, or request it if it hasn't been requested yet""" + """Get the data, or request it if it hasn't been requested yet.""" if len(self.response_data) == 0: self.request_data() @@ -129,8 +129,9 @@ def get_data(self) -> dict[dict]: def clean_theme(data: dict) -> list[dict] | list: - """Clean theme data from a nested list to a list of dictionaries with theme names and percentages - If there are no themes, an empty list will be returned + """Clean theme data from a nested list to a list of dictionaries with theme names and + percentages. + If there are no themes, an empty list will be returned. Args: data: data from API @@ -189,7 +190,7 @@ def clean_theme(data: dict) -> list[dict] | list: def clean_sector(sector_series: pd.Series) -> pd.Series: """Format sector data from a nested list to a string separating sectors by ' | ' - If there are no sectors, np.nan will be placed in the series row + If there are no sectors, np.nan will be placed in the series row. Args: sector_series: series of sector data @@ -247,12 +248,12 @@ class WorldBankProjects(ImportData): be loaded to the object from disk, otherwise it will be downloaded from the API. To retrieve the data, call the get_data method. You can specify the type of data to retrieve, either 'general' or 'theme'. If no type is specified, 'general' data will be returned. - To update the data, call the update_data method. This will download the data from the API. if 'reload' is - set to True, the data will be reloaded to the object. + To update the data, call the update_data method. This will download the data from the API. + If 'reload' is set to True, the data will be reloaded to the object. Parameters: start_date: start date of data to import, in DD-MM-YYYY format - end_date: end date of data to import, in DD-MM-YYYY format + end_date: end date of data to import, in DD-MM-YYYY format. """ start_date: str | None = None @@ -285,7 +286,7 @@ def _format_general_data(self) -> None: pd.DataFrame.from_dict(self._raw_data, orient="index") .reset_index(drop=True) .loc[:, general_fields.keys()] - # change fiscal year to int + # change the fiscal year to int .assign( approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int) ) @@ -314,7 +315,7 @@ def _format_theme_data(self) -> None: self._data["theme_data"] = pd.DataFrame(theme_data) def _download(self) -> None: - """Download data from World Bank Projects API and save it as a json file""" + """Download data from World Bank Projects API and save it as a json file.""" with open(self._path, "w") as file: data = ( @@ -334,7 +335,7 @@ def load_data(self) -> ImportData: otherwise it will be downloaded from the API and saved as a json file and loaded to the object. - returns: + Returns: object with loaded data """ @@ -363,9 +364,9 @@ def update_data(self, reload: bool = True) -> ImportData: If 'reload' is set to True, the data will be reloaded to the object. Args: - reload: if True, reload data to object after downloading it + reload: if True, reload data to object after downloading it. - returns: + Returns: object with updated data """ @@ -380,12 +381,13 @@ def get_data( ) -> pd.DataFrame: """Get the data as a dataframe - Get the the general data or the theme data for World Bank Projects as a dataframe. + Get the general data, or the theme data for World Bank Projects as a dataframe. Optionally, you can specify the project codes to retrieve data for. If no project codes are specified, data for all projects will be returned. Args: - project_codes: project codes to retrieve data for. If 'all', data for all projects will be returned + project_codes: project codes to retrieve data for. If 'all', data for all projects + will be returned data_type: type of data to retrieve. Either 'general' or 'theme' """ From 88b1547a9582c9bc5c178205d9b827b2f3148df4 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Fri, 7 Jul 2023 12:35:30 +0200 Subject: [PATCH 07/13] add sectors functionality --- bblocks/import_tools/world_bank_projects.py | 66 ++++++++++++++------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index c436cf0..e357d77 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -5,6 +5,7 @@ import requests import json from dataclasses import dataclass +import re from bblocks.logger import logger from bblocks.import_tools.common import ImportData @@ -26,24 +27,24 @@ class QueryAPI: def __init__( self, - response_format: str = "json", max_rows_per_response: int = 500, start_date: str | None = None, end_date: str | None = None, ): """Initialize QueryAPI object""" - self.response_format = response_format self.max_rows_per_response = max_rows_per_response self.start_date = start_date self.end_date = end_date self._params = { - "format": self.response_format, + "format": 'json', "rows": self.max_rows_per_response, # 'os': 0, # offset "strdate": self.start_date, "enddate": self.end_date, + "fl": "*", + 'apilang': 'en' } self._check_params() @@ -188,22 +189,33 @@ def clean_theme(data: dict) -> list[dict] | list: return theme_list -def clean_sector(sector_series: pd.Series) -> pd.Series: - """Format sector data from a nested list to a string separating sectors by ' | ' - If there are no sectors, np.nan will be placed in the series row. +def _get_sector_percentages(d: dict) -> dict: + """ """ - Args: - sector_series: series of sector data + sectors_dict = {} # empty dict to store sector data as {sector_name: percent} - Returns: - series of sector data as a string separated by ' | ' - """ + sector_names = [v['Name'] for v in d['sector']] # get list of sector names + sectors = {key: value for key, value in d.items() if re.search(r'^sector\d+$', key)} # get sectors fields which should contain percentages + + # get available sector percentages + for _, v in sectors.items(): + if isinstance(v, dict): + sectors_dict[v['Name']] = v['Percent'] + + # check if there are missing sectors from the dict + if (len(sectors_dict)== len(sectors)-1) and (sum(sectors_dict.values())<100): + + # loop through all the available sectors + for s in sector_names: + + # if a sectors has not been picked up it must be the missing sector + if s not in sectors_dict.keys(): + sectors_dict[s] = 100 - sum(sectors_dict.values()) - return sector_series.apply( - lambda x: " | ".join([item["Name"] for item in x]) - if isinstance(x, list) - else np.nan - ) + if sum(sectors_dict.values())!=100: + raise ValueError("Sector percentages don't add up to 100%") + + return sectors_dict general_fields = { # general info @@ -232,8 +244,6 @@ def clean_sector(sector_series: pd.Series) -> pd.Series: "curr_total_commitment": "current total IBRD and IDA commitment", "curr_ibrd_commitment": "current IBRD commitment", "curr_ida_commitment": "current IDA commitment", - # sectors - "sector": "sectors", } @@ -298,8 +308,6 @@ def _format_general_data(self) -> None: ), closingdate=lambda d: clean.to_date_column(d["closingdate"]), p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]), - # format sectors - sector=lambda d: clean_sector(d["sector"]), ) # rename columns .rename(columns=general_fields) @@ -314,6 +322,23 @@ def _format_theme_data(self) -> None: self._data["theme_data"] = pd.DataFrame(theme_data) + def _format_sector_data(self) -> None: + """Format sector data and store it as a dataframe in _data attribute + with key 'sector_data'""" + + sector_data = [] + for _, proj_data in self._raw_data.items(): + if 'sector' in proj_data.keys(): + proj_id = proj_data['id'] + + sectors = _get_sector_percentages(proj_data) + sector_data.extend([{'project ID': proj_id, + 'sector': s, + 'percent': p} + for s, p in sectors.items()]) + + self._data["sector_data"] = pd.DataFrame(sector_data) + def _download(self) -> None: """Download data from World Bank Projects API and save it as a json file.""" @@ -353,6 +378,7 @@ def load_data(self) -> ImportData: # set data self._format_general_data() self._format_theme_data() + self._format_sector_data() logger.info(f"Successfully loaded World Bank Projects") return self From c2e9f2f5b3e599c7afc532a0691e0ce41836e3a7 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Fri, 7 Jul 2023 12:40:23 +0200 Subject: [PATCH 08/13] Update world_bank_projects.py --- bblocks/import_tools/world_bank_projects.py | 24 +++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index e357d77..6d5e71a 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -190,12 +190,25 @@ def clean_theme(data: dict) -> list[dict] | list: def _get_sector_percentages(d: dict) -> dict: - """ """ + """Get sector percentages from a project dictionary + + the function first finds all available sectors + It then finds all fields from the json starting with 'sector' and ending with a number + and gets a dictionary of the sector name and percentage + If there are any sectors missing from the dict and the total percentage is less than 100 + the missing sector is added with the remaining percentage. + If the total is still not 100, it will raise an error to indicate a + problem with the data. + + args: + d: project dictionary + """ sectors_dict = {} # empty dict to store sector data as {sector_name: percent} - sector_names = [v['Name'] for v in d['sector']] # get list of sector names - sectors = {key: value for key, value in d.items() if re.search(r'^sector\d+$', key)} # get sectors fields which should contain percentages + + # get sectors fields which should contain percentages + sectors = {key: value for key, value in d.items() if re.search(r"^sector\d+$", key)} # get available sector percentages for _, v in sectors.items(): @@ -203,16 +216,15 @@ def _get_sector_percentages(d: dict) -> dict: sectors_dict[v['Name']] = v['Percent'] # check if there are missing sectors from the dict - if (len(sectors_dict)== len(sectors)-1) and (sum(sectors_dict.values())<100): + if (len(sectors_dict) == len(sectors)-1) and (sum(sectors_dict.values())<100): # loop through all the available sectors for s in sector_names: - # if a sectors has not been picked up it must be the missing sector if s not in sectors_dict.keys(): sectors_dict[s] = 100 - sum(sectors_dict.values()) - if sum(sectors_dict.values())!=100: + if sum(sectors_dict.values()) != 100: raise ValueError("Sector percentages don't add up to 100%") return sectors_dict From 37662bb4c8d6475ae32669fffb3de9c2125044b2 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Fri, 7 Jul 2023 12:55:54 +0200 Subject: [PATCH 09/13] test sector --- bblocks/import_tools/world_bank_projects.py | 4 +- .../test_world_bank_projects.py | 84 ++++++++++--------- 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index 6d5e71a..a103a76 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -189,7 +189,7 @@ def clean_theme(data: dict) -> list[dict] | list: return theme_list -def _get_sector_percentages(d: dict) -> dict: +def _get_sector_data(d: dict) -> dict: """Get sector percentages from a project dictionary the function first finds all available sectors @@ -343,7 +343,7 @@ def _format_sector_data(self) -> None: if 'sector' in proj_data.keys(): proj_id = proj_data['id'] - sectors = _get_sector_percentages(proj_data) + sectors = _get_sector_data(proj_data) sector_data.extend([{'project ID': proj_id, 'sector': s, 'percent': p} diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py index 0531b39..64339ee 100644 --- a/tests/test_import_tools/test_world_bank_projects.py +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -25,18 +25,18 @@ def test_init(self): # test that start_date is dropped if end_date is None assert ( - "strdate" - not in world_bank_projects.QueryAPI( - end_date="2020-01-01", start_date=None - )._params + "strdate" + not in world_bank_projects.QueryAPI( + end_date="2020-01-01", start_date=None + )._params ) # test that end_date is dropped if start_date is None assert ( - "enddate" - not in world_bank_projects.QueryAPI( - start_date="2020-01-01", end_date=None - )._params + "enddate" + not in world_bank_projects.QueryAPI( + start_date="2020-01-01", end_date=None + )._params ) def test_request(self): @@ -271,36 +271,44 @@ def test_clean_theme_no_theme(): assert world_bank_projects.clean_theme(test_data_dict) == [] -def test_clean_sector(): - """Test clean_sector function.""" +def test_get_sector_data(): + """test the get_sector_data function.""" - test_series = pd.Series( - { - 0: [ - {"Name": "Public Administration - Transportation"}, - {"Name": "Ports/Waterways"}, - ], - 1: [ - {"Name": "Public Administration - Agriculture, Fishing & Forestry"}, - { - "Name": "Agricultural Extension, Research, and Other Support Activities" - }, - {"Name": "Other Agriculture, Fishing and Forestry"}, - {"Name": "Irrigation and Drainage"}, - {"Name": "Agricultural markets, commercialization and agri-business"}, - ], - 2: np.nan, - 3: np.nan, - } - ) + d = {'id': 'P1', + 'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'}, + {'Name': 'Agricultural extension and research', 'code': 'AX'} + ], + 'sector1': {'Name': 'Agriculture, fishing, and forestry', + 'Percent': 50}, + 'sector2': {'Name': 'Agricultural extension and research', + 'Percent': 50} + } + + expected = {'Agriculture, fishing, and forestry': 50, + 'Agricultural extension and research': 50} + + assert world_bank_projects._get_sector_data(d) == expected + +def test_get_sector_data_missing_sector(): + """Test the get_sector_data function with missing sector.""" + + d = {'id': 'P2', + 'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'}, + {'Name': 'Agricultural extension and research', 'code': 'AX'}, + {'Name': 'Missing sector', 'code': 'XX'} + ], + 'sector1': {'Name': 'Agriculture, fishing, and forestry', + 'Percent': 40}, + 'sector2': {'Name': 'Agricultural extension and research', + 'Percent': 50}, + 'sector3': 'Missing sector', + } + + expected = {'Agriculture, fishing, and forestry': 40, + 'Agricultural extension and research': 50, + 'Missing sector': 10 + } + + assert world_bank_projects._get_sector_data(d) == expected - expected = pd.Series( - { - 0: "Public Administration - Transportation | Ports/Waterways", - 1: "Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business", - 2: np.nan, - 3: np.nan, - } - ) - assert world_bank_projects.clean_sector(test_series).equals(expected) From 9dd269d45175150f8c90b1c837ac99f112e28834 Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Fri, 7 Jul 2023 15:09:51 +0200 Subject: [PATCH 10/13] update --- bblocks/import_tools/world_bank_projects.py | 68 ++++++++++----- .../test_world_bank_projects.py | 82 ++++++++++--------- 2 files changed, 94 insertions(+), 56 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index a103a76..55d85e9 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -1,7 +1,6 @@ """World Bank Projects Database Importer""" import pandas as pd -import numpy as np import requests import json from dataclasses import dataclass @@ -30,21 +29,31 @@ def __init__( max_rows_per_response: int = 500, start_date: str | None = None, end_date: str | None = None, + fields: list[str] | str = "*", ): - """Initialize QueryAPI object""" + """Initialize QueryAPI object + + Args: + max_rows_per_response: maximum number of rows to return per request. + Must be less than or equal to 1000. + start_date: start date of projects to return. Format: YYYY-MM-DD + end_date: end date of projects to return. Format: YYYY-MM-DD + fields: fields to return. Can be a list of strings or a single string. + By default, all fields are returned ('*'). + """ self.max_rows_per_response = max_rows_per_response self.start_date = start_date self.end_date = end_date + self.fields = fields self._params = { - "format": 'json', + "format": "json", "rows": self.max_rows_per_response, # 'os': 0, # offset "strdate": self.start_date, "enddate": self.end_date, - "fl": "*", - 'apilang': 'en' + "fl": self.fields, } self._check_params() @@ -118,6 +127,7 @@ def request_data(self) -> "QueryAPI": if len(self.response_data) == 0: raise EmptyDataException("No data was returned from API") + logger.info(f"Retrieved {len(self.response_data)} projects from API") return self def get_data(self) -> dict[dict]: @@ -204,8 +214,8 @@ def _get_sector_data(d: dict) -> dict: d: project dictionary """ - sectors_dict = {} # empty dict to store sector data as {sector_name: percent} - sector_names = [v['Name'] for v in d['sector']] # get list of sector names + sectors_dict = {} # empty dict to store sector data as {sector_name: percent} + sector_names = [v["Name"] for v in d["sector"]] # get list of sector names # get sectors fields which should contain percentages sectors = {key: value for key, value in d.items() if re.search(r"^sector\d+$", key)} @@ -213,10 +223,10 @@ def _get_sector_data(d: dict) -> dict: # get available sector percentages for _, v in sectors.items(): if isinstance(v, dict): - sectors_dict[v['Name']] = v['Percent'] + sectors_dict[v["Name"]] = v["Percent"] # check if there are missing sectors from the dict - if (len(sectors_dict) == len(sectors)-1) and (sum(sectors_dict.values())<100): + if (len(sectors_dict) == len(sectors) - 1) and (sum(sectors_dict.values()) < 100): # loop through all the available sectors for s in sector_names: @@ -238,7 +248,11 @@ def _get_sector_data(d: dict) -> dict: "url": "url", "teamleadname": "team leader", "status": "status", + "last_stage_reached_name": "last stage reached", + "pdo": "project development objective", + "cons_serv_reqd_ind": "consulting services required", "envassesmentcategorycode": "environmental assesment category", + "esrc_ovrl_risk_rate": "environmental and social risk", # dates "approvalfy": "fiscal year", "boardapprovaldate": "board approval date", @@ -246,6 +260,7 @@ def _get_sector_data(d: dict) -> dict: "p2a_updated_date": "update date", # lending "lendinginstr": "lending instrument", + "projectfinancialtype": "financing type", "borrower": "borrower", "impagency": "implementing agency", "lendprojectcost": "project cost", @@ -253,6 +268,7 @@ def _get_sector_data(d: dict) -> dict: "grantamt": "grant amount", "idacommamt": "IDA commitment amount", "ibrdcommamt": "IBRD commitment amount", + "curr_project_cost": "current project cost", "curr_total_commitment": "current total IBRD and IDA commitment", "curr_ibrd_commitment": "current IBRD commitment", "curr_ida_commitment": "current IDA commitment", @@ -274,8 +290,8 @@ class WorldBankProjects(ImportData): If 'reload' is set to True, the data will be reloaded to the object. Parameters: - start_date: start date of data to import, in DD-MM-YYYY format - end_date: end date of data to import, in DD-MM-YYYY format. + start_date: start date of data to import, in YYYY-MM-DD format + end_date: end date of data to import, in YYYY-MM-DD format. """ start_date: str | None = None @@ -302,6 +318,7 @@ def _format_general_data(self) -> None: "curr_total_commitment", "curr_ibrd_commitment", "curr_ida_commitment", + "curr_project_cost", ] self._data["general_data"] = ( @@ -340,20 +357,24 @@ def _format_sector_data(self) -> None: sector_data = [] for _, proj_data in self._raw_data.items(): - if 'sector' in proj_data.keys(): - proj_id = proj_data['id'] + if "sector" in proj_data.keys(): + proj_id = proj_data["id"] sectors = _get_sector_data(proj_data) - sector_data.extend([{'project ID': proj_id, - 'sector': s, - 'percent': p} - for s, p in sectors.items()]) + sector_data.extend( + [ + {"project ID": proj_id, "sector": s, "percent": p} + for s, p in sectors.items() + ] + ) self._data["sector_data"] = pd.DataFrame(sector_data) def _download(self) -> None: """Download data from World Bank Projects API and save it as a json file.""" + logger.info(f"Starting download of World Bank Projects") + with open(self._path, "w") as file: data = ( QueryAPI(start_date=self.start_date, end_date=self.end_date) @@ -426,15 +447,24 @@ def get_data( Args: project_codes: project codes to retrieve data for. If 'all', data for all projects will be returned - data_type: type of data to retrieve. Either 'general' or 'theme' + data_type: type of data to retrieve. Either 'general', 'sector' or 'theme' + + Returns: + dataframe with the requested data """ + # check if data has been loaded + if len(self._data) == 0: + raise EmptyDataException("Data has not been loaded. Run load_data() first.") + if data_type == "general": df = self._data["general_data"] elif data_type == "theme": df = self._data["theme_data"] + elif data_type == "sector": + df = self._data["sector_data"] else: - raise ValueError("data_type must be either 'general' or 'theme'") + raise ValueError("data_type must be either 'general', 'theme' or 'sector'") if project_codes != "all": if isinstance(project_codes, str): diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py index 64339ee..08564e9 100644 --- a/tests/test_import_tools/test_world_bank_projects.py +++ b/tests/test_import_tools/test_world_bank_projects.py @@ -1,8 +1,6 @@ """Tests for the world_bank_projects module.""" import pytest -import pandas as pd -import numpy as np import requests from unittest.mock import Mock, patch, MagicMock @@ -25,18 +23,18 @@ def test_init(self): # test that start_date is dropped if end_date is None assert ( - "strdate" - not in world_bank_projects.QueryAPI( - end_date="2020-01-01", start_date=None - )._params + "strdate" + not in world_bank_projects.QueryAPI( + end_date="2020-01-01", start_date=None + )._params ) # test that end_date is dropped if start_date is None assert ( - "enddate" - not in world_bank_projects.QueryAPI( - start_date="2020-01-01", end_date=None - )._params + "enddate" + not in world_bank_projects.QueryAPI( + start_date="2020-01-01", end_date=None + )._params ) def test_request(self): @@ -274,41 +272,51 @@ def test_clean_theme_no_theme(): def test_get_sector_data(): """test the get_sector_data function.""" - d = {'id': 'P1', - 'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'}, - {'Name': 'Agricultural extension and research', 'code': 'AX'} - ], - 'sector1': {'Name': 'Agriculture, fishing, and forestry', - 'Percent': 50}, - 'sector2': {'Name': 'Agricultural extension and research', - 'Percent': 50} - } + d = { + "id": "P1", + "sector": [ + {"Name": "Agriculture, fishing, and forestry", "code": "BX"}, + {"Name": "Agricultural extension and research", "code": "AX"}, + ], + "sector1": {"Name": "Agriculture, fishing, and forestry", "Percent": 50}, + "sector2": {"Name": "Agricultural extension and research", "Percent": 50}, + } - expected = {'Agriculture, fishing, and forestry': 50, - 'Agricultural extension and research': 50} + expected = { + "Agriculture, fishing, and forestry": 50, + "Agricultural extension and research": 50, + } assert world_bank_projects._get_sector_data(d) == expected + def test_get_sector_data_missing_sector(): """Test the get_sector_data function with missing sector.""" - d = {'id': 'P2', - 'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'}, - {'Name': 'Agricultural extension and research', 'code': 'AX'}, - {'Name': 'Missing sector', 'code': 'XX'} - ], - 'sector1': {'Name': 'Agriculture, fishing, and forestry', - 'Percent': 40}, - 'sector2': {'Name': 'Agricultural extension and research', - 'Percent': 50}, - 'sector3': 'Missing sector', - } - - expected = {'Agriculture, fishing, and forestry': 40, - 'Agricultural extension and research': 50, - 'Missing sector': 10 - } + d = { + "id": "P2", + "sector": [ + {"Name": "Agriculture, fishing, and forestry", "code": "BX"}, + {"Name": "Agricultural extension and research", "code": "AX"}, + {"Name": "Missing sector", "code": "XX"}, + ], + "sector1": {"Name": "Agriculture, fishing, and forestry", "Percent": 40}, + "sector2": {"Name": "Agricultural extension and research", "Percent": 50}, + "sector3": "Missing sector", + } + + expected = { + "Agriculture, fishing, and forestry": 40, + "Agricultural extension and research": 50, + "Missing sector": 10, + } assert world_bank_projects._get_sector_data(d) == expected +def test_get_data_no_data_loaded(): + """Test the get_data function with no data loaded.""" + + with pytest.raises(world_bank_projects.EmptyDataException): + proj = world_bank_projects.WorldBankProjects() + proj.get_data() From d51dd852b514d7c1b14400a8492d94106303b69a Mon Sep 17 00:00:00 2001 From: Jorge Rivera Date: Mon, 10 Jul 2023 16:00:34 +0200 Subject: [PATCH 11/13] Update world_bank_projects.py --- bblocks/import_tools/world_bank_projects.py | 141 +++++++++++++------- 1 file changed, 95 insertions(+), 46 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index 55d85e9..f6221c0 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -45,7 +45,7 @@ def __init__( self.max_rows_per_response = max_rows_per_response self.start_date = start_date self.end_date = end_date - self.fields = fields + self.fields = list(set(fields)) if isinstance(fields, list) else [fields] self._params = { "format": "json", @@ -109,7 +109,6 @@ def request_data(self) -> "QueryAPI": self._params["os"] = 0 # reset offset to 0 while True: - # request data data = self._request() @@ -139,6 +138,58 @@ def get_data(self) -> dict[dict]: return self.response_data +def _append_theme_to_list( + proj_id: str, theme_list: list[dict], theme_names: list[str], theme: dict +) -> None: + """Appends a theme to the theme_list. + + Args: + proj_id: The project ID. + theme_list: The list of theme dictionaries to append to. + theme_names): The names of the parent themes. + theme: The theme to append. + """ + new_theme = { + "project ID": proj_id, + **{f"theme{idx + 1}": name for idx, name in enumerate(theme_names)}, + "percent": clean.clean_number(theme["percent"]), + } + theme_list.append(new_theme) + + +def _parse_themes( + proj_id: str, + theme_list: list[dict], + theme_names: list[str], + theme: dict, + theme_level: int, +) -> None: + """Recursive function to handle nested themes. + + Args: + proj_id (str): The project ID. + theme_list (list[dict]): The list of theme dictionaries to append to. + theme_names (list[str]): The names of the parent themes. + theme (dict): The current theme. + theme_level (int): The current level of theme nesting. + """ + # Append the current theme to the list + _append_theme_to_list( + proj_id=proj_id, theme_list=theme_list, theme_names=theme_names, theme=theme + ) + + # Recursively call this function for each nested theme + nested_theme_key = f"theme{theme_level + 1}" + for nested_theme in theme.get(nested_theme_key, []): + _parse_themes( + proj_id=proj_id, + theme_list=theme_list, + theme_names=theme_names + [nested_theme["name"]], + theme=nested_theme, + theme_level=theme_level + 1, + ) + + def clean_theme(data: dict) -> list[dict] | list: """Clean theme data from a nested list to a list of dictionaries with theme names and percentages. @@ -152,50 +203,15 @@ def clean_theme(data: dict) -> list[dict] | list: """ # if there are no themes, return an empty list - if "theme_list" not in data.keys(): + if "theme_list" not in data: # return [{'project ID': proj_id}] return [] theme_list = [] proj_id = data["id"] for theme1 in data["theme_list"]: + _parse_themes(proj_id, theme_list, [theme1["name"]], theme1, 1) - # get first theme - name = theme1["name"] - theme_list.append( - { - "project ID": proj_id, - "theme1": name, - "percent": clean.clean_number(theme1["percent"]), - } - ) - - # get 2nd theme - if "theme2" in theme1.keys(): - for theme2 in theme1["theme2"]: - name_2 = theme2["name"] - theme_list.append( - { - "project ID": proj_id, - "theme1": name, - "theme2": name_2, - "percent": clean.clean_number(theme2["percent"]), - } - ) - - # get 3rd theme - if "theme3" in theme2.keys(): - for theme3 in theme2["theme3"]: - name_3 = theme3["name"] - theme_list.append( - { - "project ID": proj_id, - "theme1": name, - "theme2": name_2, - "theme3": name_3, - "percent": clean.clean_number(theme3["percent"]), - } - ) return theme_list @@ -227,11 +243,10 @@ def _get_sector_data(d: dict) -> dict: # check if there are missing sectors from the dict if (len(sectors_dict) == len(sectors) - 1) and (sum(sectors_dict.values()) < 100): - # loop through all the available sectors for s in sector_names: # if a sectors has not been picked up it must be the missing sector - if s not in sectors_dict.keys(): + if s not in sectors_dict: sectors_dict[s] = 100 - sum(sectors_dict.values()) if sum(sectors_dict.values()) != 100: @@ -240,7 +255,7 @@ def _get_sector_data(d: dict) -> dict: return sectors_dict -general_fields = { # general info +GENERAL_FIELDS = { # general info "id": "project ID", "project_name": "project name", "countryshortname": "country", @@ -253,6 +268,9 @@ def _get_sector_data(d: dict) -> dict: "cons_serv_reqd_ind": "consulting services required", "envassesmentcategorycode": "environmental assesment category", "esrc_ovrl_risk_rate": "environmental and social risk", + "transactiontype:": "transaction type", + "financier_loan": "financier loan", + "interestandcharges": "interest and charges", # dates "approvalfy": "fiscal year", "boardapprovaldate": "board approval date", @@ -261,6 +279,8 @@ def _get_sector_data(d: dict) -> dict: # lending "lendinginstr": "lending instrument", "projectfinancialtype": "financing type", + "loantype": "loan type", + "loantypedesc": "loan type description", "borrower": "borrower", "impagency": "implementing agency", "lendprojectcost": "project cost", @@ -272,6 +292,31 @@ def _get_sector_data(d: dict) -> dict: "curr_total_commitment": "current total IBRD and IDA commitment", "curr_ibrd_commitment": "current IBRD commitment", "curr_ida_commitment": "current IDA commitment", + "reapayment": "repayment", +} + +OTHER_FIELDS = { + "projectstatusdisplay": "project status display", + "sector1": "sector1", + "sector2": "sector2", + "sector3": "sector3", + "sector4": "sector4", + "sector5": "sector5", + "sector6": "sector6", + "sector7": "sector7", + "sector8": "sector8", + "sector": "sector", + "theme1": "theme1", + "theme2": "theme2", + "theme3": "theme3", + "theme4": "theme4", + "theme5": "theme5", + "fiscal_year": "fiscal year", + "fiscalyear": "fiscal year", + "fiscalyear_budget": "fiscal year budget", + "project_abstract": "project abstract", + "sectorlist": "sectorlist", + "theme_list": "theme_list", } @@ -324,7 +369,7 @@ def _format_general_data(self) -> None: self._data["general_data"] = ( pd.DataFrame.from_dict(self._raw_data, orient="index") .reset_index(drop=True) - .loc[:, general_fields.keys()] + .filter(list(GENERAL_FIELDS), axis=1) # change the fiscal year to int .assign( approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int) @@ -339,7 +384,7 @@ def _format_general_data(self) -> None: p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]), ) # rename columns - .rename(columns=general_fields) + .rename(columns=GENERAL_FIELDS) ) def _format_theme_data(self) -> None: @@ -357,7 +402,7 @@ def _format_sector_data(self) -> None: sector_data = [] for _, proj_data in self._raw_data.items(): - if "sector" in proj_data.keys(): + if "sector" in proj_data: proj_id = proj_data["id"] sectors = _get_sector_data(proj_data) @@ -377,7 +422,11 @@ def _download(self) -> None: with open(self._path, "w") as file: data = ( - QueryAPI(start_date=self.start_date, end_date=self.end_date) + QueryAPI( + start_date=self.start_date, + end_date=self.end_date, + fields=list(GENERAL_FIELDS) + list(OTHER_FIELDS), + ) .request_data() .get_data() ) From 06530c3d659059c1a09bf5ff141092d72cddb40f Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Wed, 19 Jul 2023 17:26:02 +0200 Subject: [PATCH 12/13] Update world_bank_projects.py --- bblocks/import_tools/world_bank_projects.py | 53 ++++++++++++++++----- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index f6221c0..a00829f 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -292,7 +292,7 @@ def _get_sector_data(d: dict) -> dict: "curr_total_commitment": "current total IBRD and IDA commitment", "curr_ibrd_commitment": "current IBRD commitment", "curr_ida_commitment": "current IDA commitment", - "reapayment": "repayment", + "repayment": "repayment", } OTHER_FIELDS = { @@ -351,7 +351,7 @@ def _path(self): return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json" - def _format_general_data(self) -> None: + def _format_general_data(self, additional_fields: list = None) -> None: """Clean and format general data and store it in _data attribute with key 'general_data'""" numeric_cols = [ @@ -369,7 +369,7 @@ def _format_general_data(self) -> None: self._data["general_data"] = ( pd.DataFrame.from_dict(self._raw_data, orient="index") .reset_index(drop=True) - .filter(list(GENERAL_FIELDS), axis=1) + .filter(list(GENERAL_FIELDS) + additional_fields, axis=1) # change the fiscal year to int .assign( approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int) @@ -415,17 +415,24 @@ def _format_sector_data(self) -> None: self._data["sector_data"] = pd.DataFrame(sector_data) - def _download(self) -> None: + def _download(self, additional_fields: list | None = None) -> None: """Download data from World Bank Projects API and save it as a json file.""" logger.info(f"Starting download of World Bank Projects") + if additional_fields is None: + additional_fields = [] + if isinstance(additional_fields, str): + additional_fields = [additional_fields] + with open(self._path, "w") as file: data = ( QueryAPI( start_date=self.start_date, end_date=self.end_date, - fields=list(GENERAL_FIELDS) + list(OTHER_FIELDS), + fields=list(GENERAL_FIELDS) + + list(OTHER_FIELDS) + + additional_fields, ) .request_data() .get_data() @@ -434,7 +441,7 @@ def _download(self) -> None: logger.info(f"Successfully downloaded World Bank Projects") - def load_data(self) -> ImportData: + def load_data(self, *, additional_fields: str | list = None) -> ImportData: """Load data to the object This method will load the World Bank Project data to the object. @@ -442,13 +449,34 @@ def load_data(self) -> ImportData: otherwise it will be downloaded from the API and saved as a json file and loaded to the object. + Args: + additional_fields: additional fields to download from the API. If the data has + already been downloaded, the additional fields may not be loaded if they do not + exist in the downloaded file. To force download of data with additional fields, + use the update_data method passing the additional fields as argument + Returns: object with loaded data """ + # check if additional fields is a string or None and convert to list + if additional_fields is None: + additional_fields = [] + if isinstance(additional_fields, str): + additional_fields = [additional_fields] + # if file does not exist, download it and save it as a json file if not self._path.exists(): - self._download() + self._download(additional_fields=additional_fields) + + # if file exists and additional fields are passed, log warning + else: + if not additional_fields: + logger.warning( + "Data already exists in disk. The additional fields may not be " + "loaded. To force download of data with additional fields, use the" + " update_data method passing the additional fields as argument" + ) # load data from json file with open(self._path, "r") as file: @@ -458,14 +486,16 @@ def load_data(self) -> ImportData: raise EmptyDataException("No data was retrieved") # set data - self._format_general_data() + self._format_general_data(additional_fields=additional_fields) self._format_theme_data() self._format_sector_data() logger.info(f"Successfully loaded World Bank Projects") return self - def update_data(self, reload: bool = True) -> ImportData: + def update_data( + self, reload: bool = True, *, additional_fields: str | list = None + ) -> ImportData: """Force update of data This method will download the data from the API. @@ -473,14 +503,15 @@ def update_data(self, reload: bool = True) -> ImportData: Args: reload: if True, reload data to object after downloading it. + additional_fields: additional fields to download Returns: object with updated data """ - self._download() + self._download(additional_fields=additional_fields) if reload: - self.load_data() + self.load_data(additional_fields=additional_fields) return self From 436ae1f2759bc635bd2a0f4101b3582855c8c7be Mon Sep 17 00:00:00 2001 From: Luca Picci Date: Thu, 20 Jul 2023 16:27:00 +0200 Subject: [PATCH 13/13] update --- CHANGELOG.md | 5 +++++ bblocks/import_tools/world_bank_projects.py | 23 ++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ab9b1e..76b618f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +[1.2.0] - 2023-07-20 +-------------------- +- Added new feature: `world_bank_projects` module in `import_tools` with an object + to extract data from the World Bank Projects database. + [1.1.1] - 2023-07-06 -------------------- - Updated requirements diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py index a00829f..45a88d5 100644 --- a/bblocks/import_tools/world_bank_projects.py +++ b/bblocks/import_tools/world_bank_projects.py @@ -394,7 +394,10 @@ def _format_theme_data(self) -> None: for _, proj_data in self._raw_data.items(): theme_data.extend(clean_theme(proj_data)) - self._data["theme_data"] = pd.DataFrame(theme_data) + self._data["theme_data"] = pd.DataFrame(theme_data).filter( + ["project ID", "theme1", "theme2", "theme3", "theme4", "theme5", "percent"], + axis=1, + ) def _format_sector_data(self) -> None: """Format sector data and store it as a dataframe in _data attribute @@ -459,6 +462,15 @@ def load_data(self, *, additional_fields: str | list = None) -> ImportData: object with loaded data """ + # if additional fields are set but the data is read from disk, log a warning + if self._path.exists() and additional_fields is not None: + logger.warning( + "Data already exists in disk. The additional fields might not be " + "loaded if they do not exist in the downloaded data. To force download " + "of data with additional fields, use the update_data method passing the " + "additional fields as argument" + ) + # check if additional fields is a string or None and convert to list if additional_fields is None: additional_fields = [] @@ -469,15 +481,6 @@ def load_data(self, *, additional_fields: str | list = None) -> ImportData: if not self._path.exists(): self._download(additional_fields=additional_fields) - # if file exists and additional fields are passed, log warning - else: - if not additional_fields: - logger.warning( - "Data already exists in disk. The additional fields may not be " - "loaded. To force download of data with additional fields, use the" - " update_data method passing the additional fields as argument" - ) - # load data from json file with open(self._path, "r") as file: self._raw_data = json.load(file)