From 36edc0616109a2fc0c08a43c714e6315e46b9f26 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Fri, 30 Jun 2023 18:02:47 +0200
Subject: [PATCH 01/13] create projects script

---
 bblocks/import_tools/world_bank_projects.py   | 203 ++++++++++++++++++
 .../test_world_bank_projects.py               |  88 ++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 bblocks/import_tools/world_bank_projects.py
 create mode 100644 tests/test_import_tools/test_world_bank_projects.py

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
new file mode 100644
index 0000000..7f9495f
--- /dev/null
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -0,0 +1,203 @@
+"""World Bank Projects Database Importer"""
+
+import pandas as pd
+import numpy as np
+import requests
+import json
+from dataclasses import dataclass
+
+from bblocks.logger import logger
+from bblocks.import_tools.common import ImportData
+from bblocks.config import BBPaths
+
+
+class EmptyDataException(Exception):
+    """Exception raised when the API response does not contain any data."""
+
+    pass
+
+
+BASE_API_URL = "http://search.worldbank.org/api/v2/projects"
+
+
+class QueryAPI:
+    """Helper class for querying the World Bank Projects API"""
+
+    def __init__(
+            self, response_format: str = 'json', max_rows_per_response: int = 500,
+            start_date: str | None = None, end_date: str | None = None
+            ):
+        """Initialize QueryAPI object"""
+
+        self.response_format = response_format
+        self.max_rows_per_response = max_rows_per_response
+        self.start_date = start_date
+        self.end_date = end_date
+
+        self._params = {
+            'format': self.response_format,
+            'rows': self.max_rows_per_response,
+            # 'os': 0, # offset
+            'strdate': self.start_date,
+            'enddate': self.end_date
+        }
+
+        self._check_params()
+
+        self.response_data = {}  # initialize response_data as empty dict
+
+    def _check_params(self) -> None:
+        """Validate the query parameters and drop the date filters that are None.
+        Raises ValueError if enddate < strdate or if rows is greater than 1000."""
+        # raise if end_date is before start_date; NOTE(review): `<` compares the values as passed in - confirm date format
+        if self._params['strdate'] is not None and self._params['enddate'] is not None:
+            if self._params['enddate'] < self._params['strdate']:
+                raise ValueError("end date must be after start date")
+
+        # if max_rows is greater than 1000, raise error
+        if self._params['rows'] > 1000:
+            raise ValueError("max_rows must be less than or equal to 1000")
+
+        # if dates are None, drop them from params
+        if self._params['strdate'] is None:
+            # drop start_date from params
+            self._params.pop('strdate')
+
+        if self._params['enddate'] is None:
+            # drop end_date from params
+            self._params.pop('enddate')
+
+    def _request(self) -> dict:
+        """Make a single GET request and return the 'projects' entry of the response json.
+        Raises Exception (chained to the original error) if the request or parsing fails."""
+        try:
+            # a timeout keeps an unresponsive server from blocking the caller forever
+            response = requests.get(BASE_API_URL, params=self._params, timeout=60)
+            response.raise_for_status()
+            data = response.json()['projects']  # keep only the projects data
+            return data
+
+        except Exception as e:
+            raise Exception(f"Failed to get data: {e}") from e
+
+    def request_data(self) -> 'QueryAPI':
+        """Request data from API
+
+        Pages through the API (offset 'os' advanced by 'rows' per request) until an
+        empty page is returned, merging every page into the response_data attribute.
+
+        Raises:
+            EmptyDataException: if response_data is still empty after paging.
+        Returns:
+            'QueryAPI' to allow chaining of methods
+        """
+
+        self._params['os'] = 0  # reset offset to 0
+
+        while True:
+
+            # request data
+            data = self._request()
+
+            # if there are no more projects, break
+            if len(data) == 0:
+                break
+
+            # add data to response_data
+            self.response_data.update(data)
+
+            # update offset
+            self._params['os'] += self._params['rows']
+
+        # raise if no data was returned from API
+        if len(self.response_data) == 0:
+            raise EmptyDataException("No data was returned from API")
+
+        return self
+
+    def get_data(self) -> dict[str, dict]:
+        """Get the data, or request it if it hasn't been requested yet.
+        Returns the response_data attribute (filled by request_data when it is empty)."""
+        if len(self.response_data) == 0:
+            self.request_data()
+
+        return self.response_data
+
+
+fields = {
+    'id': 'id',
+    'regionname': 'region',
+    'project_name': 'project name',
+    'countryshortname': 'country',
+    'projectstatusdisplay': 'project status',
+
+
+
+
+    'curr_total_commitment': 'total commitment',
+    'curr_ibrd_commitment': 'IBRD commitment',
+    'curr_ida_commitment': 'IDA commitment',
+
+}
+
+
+# df = pd.DataFrame.from_dict(proj._raw_data, orient='index')
+
+
+@dataclass
+class WorldBankProjects(ImportData):
+    """World Bank Projects Database Importer"""
+
+    start_date: str | None = None
+    end_date: str | None = None
+
+    @property
+    def _path(self):
+        """Generate path based on version"""
+
+        start_date = f'_{self.start_date}' if self.start_date is not None else ''
+        end_date = f'_{self.end_date}' if self.end_date is not None else ''
+
+        return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json"
+
+    def load_data(self, project_codes: str | list = 'all') -> ImportData:
+        """ """
+
+        # if file does not exist, download it and save it as a json file
+        if not self._path.exists():
+            with open(self._path, 'w') as file:
+                data = (QueryAPI(start_date=self.start_date, end_date=self.end_date)
+                        .request_data()
+                        .get_data()
+                        )
+                json.dump(data, file)
+                logger.info(f"Successfully downloaded World Bank Projects")
+
+        with open(self._path, "r") as file:
+            self._raw_data = json.load(file)
+
+        if project_codes == 'all':
+            self._data = self._raw_data
+
+        if isinstance(project_codes, str):
+            project_codes = [project_codes]
+
+        if isinstance(project_codes, list):
+            self._data = {k: v for k, v in self._raw_data.items()
+                          if k in project_codes}
+
+        if self._data == {}:
+            raise ValueError("No projects found with the given project codes")
+        logger.info(f"Successfully loaded World Bank Projects")
+
+        return self
+
+    def update_data(self, reload: bool = True) -> ImportData:
+        """ """
+
+        pass
+
+    def get_data(self):
+        """ """
+
+        print('test')
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
new file mode 100644
index 0000000..51d017d
--- /dev/null
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -0,0 +1,88 @@
+"""Tests for the world_bank_projects module."""
+
+import pytest
+import pandas as pd
+import numpy as np
+import requests
+from unittest.mock import Mock, patch, MagicMock
+
+from bblocks.import_tools import world_bank_projects
+
+
+class TestQueryAPI:
+    """Test QueryAPI class."""
+
+    def test_init(self):
+        """Test initialization of QueryAPI object."""
+
+        # test that error is raised if end_date is before start_date
+        with pytest.raises(ValueError):
+            world_bank_projects.QueryAPI(start_date='2020-01-01', end_date='2019-01-01')
+
+        # test that error is raised if max_rows_per_response is greater than 1000
+        with pytest.raises(ValueError):
+            world_bank_projects.QueryAPI(max_rows_per_response=1001)
+
+        # test that 'strdate' is dropped from the params if start_date is None
+        assert 'strdate' not in world_bank_projects.QueryAPI(end_date='2020-01-01',
+                                                             start_date=None)._params
+
+        # test that 'enddate' is dropped from the params if end_date is None
+        assert 'enddate' not in world_bank_projects.QueryAPI(start_date='2020-01-01',
+                                                             end_date=None)._params
+
+    def test_request(self):
+        """Test that _request returns the 'projects' entry of the response json."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}}
+
+        with patch("requests.get", return_value=mock_response) as mock_get:
+            assert world_bank_projects.QueryAPI()._request() == {'P1234': {'name': 'Test Project'}}
+
+    def test_request_error(self):
+        """Test that an HTTP error from raise_for_status makes _request raise, even with a valid json body."""
+        # everything is configured on mock_get.return_value: that is the response object _request receives
+        with patch("requests.get") as mock_get:
+            mock_get.return_value.raise_for_status.side_effect = (
+                requests.exceptions.HTTPError
+            )
+            mock_get.return_value.status_code = 404
+            mock_get.return_value.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}}
+
+            with pytest.raises(Exception):
+                world_bank_projects.QueryAPI()._request()
+
+    def test_request_data_no_data(self):
+        """Test request_data method."""
+
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {'projects': {}}  # test that empty response is handled
+
+        with pytest.raises(world_bank_projects.EmptyDataException):
+            with patch("requests.get", return_value=mock_response) as mock_get:
+                obj = world_bank_projects.QueryAPI()
+                obj.request_data()
+
+    def test_request_data(self):
+        """Test request_data method."""
+
+        # Mocking the requests.get function
+        mocked_get = MagicMock(side_effect=[
+            Mock(json=MagicMock(return_value={'projects': {'P1': {'name': 'Test Project 1'},
+                                                           'P2': {'name': 'Test Project 2'}
+                                                           }
+                                              })),
+            Mock(json=MagicMock(return_value={'projects':{'P3': {'name': 'Test Project 3'}}})),
+            Mock(json=MagicMock(return_value={'projects': {}}))
+        ])
+
+        with patch("bblocks.import_tools.world_bank_projects.requests.get", mocked_get):
+            obj = world_bank_projects.QueryAPI()
+            obj.request_data()
+
+            assert obj.response_data == {'P1': {'name': 'Test Project 1'},
+                                         'P2': {'name': 'Test Project 2'},
+                                         'P3': {'name': 'Test Project 3'}
+                                         }

From 45676580c02713599f5e525bfd6aedb9d03ddc49 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Tue, 4 Jul 2023 16:47:51 +0200
Subject: [PATCH 02/13] update script

---
 bblocks/import_tools/world_bank_projects.py   | 209 ++++++++++++++----
 .../test_world_bank_projects.py               |  91 ++++++++
 2 files changed, 262 insertions(+), 38 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index 7f9495f..e30a48a 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -9,6 +9,7 @@
 from bblocks.logger import logger
 from bblocks.import_tools.common import ImportData
 from bblocks.config import BBPaths
+from bblocks.cleaning_tools import clean
 
 
 class EmptyDataException(Exception):
@@ -26,7 +27,7 @@ class QueryAPI:
     def __init__(
             self, response_format: str = 'json', max_rows_per_response: int = 500,
             start_date: str | None = None, end_date: str | None = None
-            ):
+    ):
         """Initialize QueryAPI object"""
 
         self.response_format = response_format
@@ -124,26 +125,100 @@ def get_data(self) -> dict[dict]:
         return self.response_data
 
 
-fields = {
-    'id': 'id',
-    'regionname': 'region',
+def clean_theme(data: dict) -> list[dict]:
+    """Clean theme data from a nested list to a dataframe
+    If there are no themes, an empty dataframe will be returned
+
+    Args:
+        data: data from API
+
+    Returns:
+        dict with theme names and percentages
+    """
+
+    # if there are no themes, return an empty dataframe
+    if 'theme_list' not in data.keys():
+        # return [{'project ID': proj_id}]
+        return []
+
+    theme_list = []
+    proj_id = data['id']
+    for theme1 in data['theme_list']:
+
+        # get first theme
+        name = theme1['name']
+        theme_list.append({'project ID': proj_id,
+                           'theme1': name,
+                           'percent': theme1['percent']})
+
+        # get 2nd theme
+        if 'theme2' in theme1.keys():
+            for theme2 in theme1['theme2']:
+                name_2 = theme2['name']
+                theme_list.append({'project ID': proj_id,
+                                   'theme1': name,
+                                   'theme2': name_2,
+                                   'percent': theme2['percent']})
+
+                # get 3rd theme
+                if 'theme3' in theme2.keys():
+                    for theme3 in theme2['theme3']:
+                        name_3 = theme3['name']
+                        theme_list.append({'project ID': proj_id,
+                                           'theme1': name,
+                                           'theme2': name_2,
+                                           'theme3': name_3,
+                                           'percent': theme3['percent']})
+    return theme_list
+
+
+def clean_sector(sector_series: pd.Series) -> pd.Series:
+    """Format sector data from a nested list to a string separating sectors by ' | '
+    If there are no sectors, np.nan will be placed in the series row
+
+    Args:
+        sector_series: series of sector data
+    """
+
+    return (sector_series
+            .apply(lambda x: ' | '.join([item['Name'] for item in x])if isinstance(x, list) else np.nan)
+            )
+
+
+general_fields = {  # general info
+    'id': 'project ID',
     'project_name': 'project name',
     'countryshortname': 'country',
-    'projectstatusdisplay': 'project status',
-
-
-
-
-    'curr_total_commitment': 'total commitment',
-    'curr_ibrd_commitment': 'IBRD commitment',
-    'curr_ida_commitment': 'IDA commitment',
-
+    'regionname': 'region name',
+    'url': 'url',
+    'teamleadname': 'team leader',
+    'status': 'status',
+    'envassesmentcategorycode': 'environmental assesment category',
+
+    # dates
+    'approvalfy': 'fiscal year',
+    'boardapprovaldate': 'board approval date',
+    'closingdate': 'closing date',
+    'p2a_updated_date': 'update date',
+
+    # lending
+    'lendinginstr': 'lending instrument',
+    'borrower': 'borrower',
+    'impagency': 'implementing agency',
+    'lendprojectcost': 'project cost',
+    'totalcommamt': 'total commitment',
+    'grantamt': 'grant amount',
+    'idacommamt': 'IDA commitment amount',
+    'ibrdcommamt': 'IBRD commitment amount',
+    'curr_total_commitment': 'current total IBRD and IDA commitment',
+    'curr_ibrd_commitment': 'current IBRD commitment',
+    'curr_ida_commitment': 'current IDA commitment',
+
+    # sectors
+    'sector': 'sectors',
 }
 
 
-# df = pd.DataFrame.from_dict(proj._raw_data, orient='index')
-
-
 @dataclass
 class WorldBankProjects(ImportData):
     """World Bank Projects Database Importer"""
@@ -160,44 +235,102 @@ def _path(self):
 
         return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json"
 
+    def _format_data(self):
+        """Cleaning and formatting"""
+
+        # create dataframe for general data
+
+        numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt',
+                        'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment',
+                        'curr_ida_commitment']
+
+        self._data['general_data'] = (pd
+                                      .DataFrame.from_dict(self._raw_data, orient='index')
+                                      .reset_index(drop=True)
+                                      .loc[:, general_fields.keys()]
+        # change fiscal year to int
+                                      .assign(approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int))
+        # change numeric columns to float
+                                      .pipe(clean.clean_numeric_series,series_columns=numeric_cols)
+                                      .assign(#format dates
+                                              boardapprovaldate = lambda d: clean.to_date_column(d['boardapprovaldate']),
+                                              closingdate = lambda d: clean.to_date_column(d['closingdate']),
+                                              p2a_updated_date =lambda d: clean.to_date_column(d['p2a_updated_date']),
+                                              #format sectors
+                                              sector = lambda d: clean_sector(d['sector'])
+                                             )
+        # rename columns
+                                      .rename(columns=general_fields)
+                                      )
+
+        theme_data = []
+        for _, proj_data in self._raw_data.items():
+            theme_data.extend(clean_theme(proj_data))
+
+        self._data['theme_data'] = (pd.DataFrame(theme_data)
+                                    .assign(percent=lambda d: clean.clean_numeric_series(d['percent'], to=float))
+                                    )
+
+    def _download(self) -> None:
+        """Download data from World Bank Projects API and save it as a json file.
+        The API is queried before the file is opened, so a failed request (which
+        raises from QueryAPI) cannot leave an empty, unparseable file at _path."""
+        data = (QueryAPI(start_date=self.start_date, end_date=self.end_date)
+                .request_data()
+                .get_data())
+        with open(self._path, 'w') as file:
+            json.dump(data, file)
+        logger.info(f"Successfully downloaded World Bank Projects")
+
     def load_data(self, project_codes: str | list = 'all') -> ImportData:
         """ """
 
         # if file does not exist, download it and save it as a json file
         if not self._path.exists():
-            with open(self._path, 'w') as file:
-                data = (QueryAPI(start_date=self.start_date, end_date=self.end_date)
-                        .request_data()
-                        .get_data()
-                        )
-                json.dump(data, file)
-                logger.info(f"Successfully downloaded World Bank Projects")
+            self._download()
 
+        # load data from json file
         with open(self._path, "r") as file:
             self._raw_data = json.load(file)
 
-        if project_codes == 'all':
-            self._data = self._raw_data
+        if self._raw_data is None:
+            raise EmptyDataException("No data was retrieved")
 
-        if isinstance(project_codes, str):
-            project_codes = [project_codes]
+        # format data
+        self._format_data()
+        return self
 
-        if isinstance(project_codes, list):
-            self._data = {k: v for k, v in self._raw_data.items()
-                          if k in project_codes}
+    def update_data(self, reload: bool = True) -> ImportData:
+        """ """
 
-        if self._data == {}:
-            raise ValueError("No projects found with the given project codes")
-        logger.info(f"Successfully loaded World Bank Projects")
+        self._download()
+        if reload:
+            self.load_data()
 
         return self
 
-    def update_data(self, reload: bool = True) -> ImportData:
+    def get_data(
+            self, project_codes: str | list = 'all',
+            data_type: str = 'general',
+            **kwargs
+    ) -> pd.DataFrame:
         """ """
 
-        pass
+        if data_type == 'general':
+            df = self._data['general_data']
+        elif data_type == 'theme':
+            df = self._data['theme_data']
+        else:
+            raise ValueError("data_type must be either 'general' or 'theme'")
 
-    def get_data(self):
-        """ """
+        if project_codes != 'all':
+            if isinstance(project_codes, str):
+                project_codes = [project_codes]
+            df = df[df['project ID'].isin(project_codes)]
+
+        return df
+
+    def get_json(self) -> dict:
+        """Return the raw data as a dictionary"""
 
-        print('test')
+        return self._raw_data
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
index 51d017d..b994a27 100644
--- a/tests/test_import_tools/test_world_bank_projects.py
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -86,3 +86,94 @@ def test_request_data(self):
                                          'P2': {'name': 'Test Project 2'},
                                          'P3': {'name': 'Test Project 3'}
                                          }
+
+
+def test_clean_theme():
+    """Test clean_theme function."""
+
+    test_data_dict = {'id': 'P1234',
+                      'theme_list':
+        [{'name': 'Environment and Natural Resource Management',
+          'code': '80',
+          'seqno': '14',
+          'percent': '34',
+          'theme2': [
+              {'name': 'Energy',
+               'code': '86',
+               'seqno': '18',
+               'percent': '13',
+               'theme3': [
+                   { 'name': 'Energy Efficiency',
+                     'code': '861',
+                     'seqno': '34',
+                     'percent': '13'},
+                   {'name': 'Energy Policies & Reform',
+                    'code': '862',
+                    'seqno': '35',
+                    'percent': '13'}]
+               },
+              {'name': 'Environmental policies and institutions',
+               'code': '84',
+               'seqno': '17',
+               'percent': '13'},
+
+              {'name': 'Environmental Health and Pollution Management',
+               'code': '82',
+               'seqno': '16',
+               'percent': '13',
+               'theme3': [
+                   {'name': 'Air quality management',
+                    'code': '821',
+                    'seqno': '33',
+                    'percent': '13'}]},
+              {'name': 'Climate change',
+               'code': '81',
+               'seqno': '15',
+               'percent': '34',
+               'theme3': [
+                   {'name': 'Adaptation',
+                    'code': '812',
+                    'seqno': '32',
+                    'percent': '8'},
+                   {'name': 'Mitigation', 'code': '811', 'seqno': '31', 'percent': '26'}]}]}
+         ]
+    }
+
+    formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': '34'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': '13'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': '34'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': '8'},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': '26'}]
+
+    assert world_bank_projects.clean_theme(test_data_dict) == formatted
+
+
+def test_clean_theme_no_theme():
+    """Test clean_theme function with no theme."""
+
+    test_data_dict = {'id': 'P1234'
+                      }
+    assert world_bank_projects.clean_theme(test_data_dict) == []
+
+
+def test_clean_sector():
+    """Test clean_sector function."""
+
+    test_series = pd.Series({0: [{'Name': 'Public Administration - Transportation'}, {'Name': 'Ports/Waterways'}],
+                             1: [{'Name': 'Public Administration - Agriculture, Fishing & Forestry'}, {'Name': 'Agricultural Extension, Research, and Other Support Activities'}, {'Name': 'Other Agriculture, Fishing and Forestry'}, {'Name': 'Irrigation and Drainage'}, {'Name': 'Agricultural markets, commercialization and agri-business'}],
+                             2: np.nan,
+                             3: np.nan})
+
+    expected = pd.Series({0: 'Public Administration - Transportation | Ports/Waterways',
+                          1: 'Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business',
+                          2: np.nan,
+                          3: np.nan})
+
+    assert world_bank_projects.clean_sector(test_series).equals(expected)
+
+

From 1e7c65c605c92eb8a3abc0ca828de406cbb1bdf5 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Tue, 4 Jul 2023 17:19:12 +0200
Subject: [PATCH 03/13] update

---
 bblocks/import_tools/world_bank_projects.py   | 123 +++++++++++++-----
 .../test_world_bank_projects.py               |  20 +--
 2 files changed, 97 insertions(+), 46 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index e30a48a..3d3ede9 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -125,18 +125,18 @@ def get_data(self) -> dict[dict]:
         return self.response_data
 
 
-def clean_theme(data: dict) -> list[dict]:
-    """Clean theme data from a nested list to a dataframe
-    If there are no themes, an empty dataframe will be returned
+def clean_theme(data: dict) -> list[dict] | list:
+    """Clean theme data from a nested list to a list of dictionaries with theme names and percentages
+    If there are no themes, an empty list will be returned
 
     Args:
         data: data from API
 
     Returns:
-        dict with theme names and percentages
+        list of dictionaries with theme names and percentages
     """
 
-    # if there are no themes, return an empty dataframe
+    # if there are no themes, return an empty list
     if 'theme_list' not in data.keys():
         # return [{'project ID': proj_id}]
         return []
@@ -149,7 +149,7 @@ def clean_theme(data: dict) -> list[dict]:
         name = theme1['name']
         theme_list.append({'project ID': proj_id,
                            'theme1': name,
-                           'percent': theme1['percent']})
+                           'percent': clean.clean_number(theme1['percent'])})
 
         # get 2nd theme
         if 'theme2' in theme1.keys():
@@ -158,7 +158,7 @@ def clean_theme(data: dict) -> list[dict]:
                 theme_list.append({'project ID': proj_id,
                                    'theme1': name,
                                    'theme2': name_2,
-                                   'percent': theme2['percent']})
+                                   'percent': clean.clean_number(theme2['percent'])})
 
                 # get 3rd theme
                 if 'theme3' in theme2.keys():
@@ -168,7 +168,7 @@ def clean_theme(data: dict) -> list[dict]:
                                            'theme1': name,
                                            'theme2': name_2,
                                            'theme3': name_3,
-                                           'percent': theme3['percent']})
+                                           'percent': clean.clean_number(theme3['percent'])})
     return theme_list
 
 
@@ -178,10 +178,14 @@ def clean_sector(sector_series: pd.Series) -> pd.Series:
 
     Args:
         sector_series: series of sector data
+
+    Returns:
+        series of sector data as a string separated by ' | '
     """
 
     return (sector_series
-            .apply(lambda x: ' | '.join([item['Name'] for item in x])if isinstance(x, list) else np.nan)
+            .apply(lambda x: ' | '.join([item['Name']
+                                         for item in x]) if isinstance(x, list) else np.nan)
             )
 
 
@@ -221,7 +225,22 @@ def clean_sector(sector_series: pd.Series) -> pd.Series:
 
 @dataclass
 class WorldBankProjects(ImportData):
-    """World Bank Projects Database Importer"""
+    """World Bank Projects Database Importer
+
+    This object will import the World Bank Projects database from the World Bank API.
+    To use, create an instance of the class. Optionally, you can specify the start and end dates
+    of the data to import. If no dates are specified, all data will be imported.
+    To import the data, call the load_data method. If the data has already been downloaded, it will
+    be loaded to the object from disk, otherwise it will be downloaded from the API.
+    To retrieve the data, call the get_data method. You can specify the type of data to retrieve,
+    either 'general' or 'theme'. If no type is specified, 'general' data will be returned.
+    To update the data, call the update_data method. This will download the data from the API. If 'reload' is
+    set to True, the data will be reloaded to the object.
+
+    Parameters:
+        start_date: start date of data to import, in DD-MM-YYYY format
+        end_date: end date of data to import, in DD-MM-YYYY format
+    """
 
     start_date: str | None = None
     end_date: str | None = None
@@ -235,10 +254,8 @@ def _path(self):
 
         return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json"
 
-    def _format_data(self):
-        """Cleaning and formatting"""
-
-        # create dataframe for general data
+    def _format_general_data(self) -> None:
+        """Clean and format general data and store it in _data attribute with key 'general_data'"""
 
         numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt',
                         'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment',
@@ -248,28 +265,30 @@ def _format_data(self):
                                       .DataFrame.from_dict(self._raw_data, orient='index')
                                       .reset_index(drop=True)
                                       .loc[:, general_fields.keys()]
-        # change fiscal year to int
-                                      .assign(approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int))
-        # change numeric columns to float
-                                      .pipe(clean.clean_numeric_series,series_columns=numeric_cols)
-                                      .assign(#format dates
-                                              boardapprovaldate = lambda d: clean.to_date_column(d['boardapprovaldate']),
-                                              closingdate = lambda d: clean.to_date_column(d['closingdate']),
-                                              p2a_updated_date =lambda d: clean.to_date_column(d['p2a_updated_date']),
-                                              #format sectors
-                                              sector = lambda d: clean_sector(d['sector'])
-                                             )
-        # rename columns
+                                      # change fiscal year to int
+                                      .assign(
+            approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int))
+                                      # change numeric columns to float
+                                      .pipe(clean.clean_numeric_series, series_columns=numeric_cols)
+                                      .assign(  # format dates
+            boardapprovaldate=lambda d: clean.to_date_column(d['boardapprovaldate']),
+            closingdate=lambda d: clean.to_date_column(d['closingdate']),
+            p2a_updated_date=lambda d: clean.to_date_column(d['p2a_updated_date']),
+            # format sectors
+            sector=lambda d: clean_sector(d['sector'])
+        )
+                                      # rename columns
                                       .rename(columns=general_fields)
                                       )
 
+    def _format_theme_data(self) -> None:
+        """Format theme data and store it as a dataframe in _data attribute with key 'theme_data'"""
+
         theme_data = []
         for _, proj_data in self._raw_data.items():
             theme_data.extend(clean_theme(proj_data))
 
-        self._data['theme_data'] = (pd.DataFrame(theme_data)
-                                    .assign(percent=lambda d: clean.clean_numeric_series(d['percent'], to=float))
-                                    )
+        self._data['theme_data'] = pd.DataFrame(theme_data)
 
     def _download(self) -> None:
         """Download data from World Bank Projects API and save it as a json file"""
@@ -280,10 +299,20 @@ def _download(self) -> None:
                     .get_data()
                     )
             json.dump(data, file)
-            logger.info(f"Successfully downloaded World Bank Projects")
 
-    def load_data(self, project_codes: str | list = 'all') -> ImportData:
-        """ """
+        logger.info(f"Successfully downloaded World Bank Projects")
+
+    def load_data(self) -> ImportData:
+        """Load data to the object
+
+        This method will load the World Bank Project data to the object.
+        If the data has already been downloaded, it will be loaded to the object from disk;
+        otherwise it will be downloaded from the API, saved as a json file, and loaded
+        to the object.
+
+        returns:
+            object with loaded data
+        """
 
         # if file does not exist, download it and save it as a json file
         if not self._path.exists():
@@ -296,12 +325,25 @@ def load_data(self, project_codes: str | list = 'all') -> ImportData:
         if self._raw_data is None:
             raise EmptyDataException("No data was retrieved")
 
-        # format data
-        self._format_data()
+        # set data
+        self._format_general_data()
+        self._format_theme_data()
+
+        logger.info(f"Successfully loaded World Bank Projects")
         return self
 
     def update_data(self, reload: bool = True) -> ImportData:
-        """ """
+        """Force update of data
+
+        This method will download the data from the API.
+        If 'reload' is set to True, the data will be reloaded to the object.
+
+        Args:
+            reload: if True, reload data to object after downloading it
+
+        returns:
+            object with updated data
+        """
 
         self._download()
         if reload:
@@ -314,7 +356,16 @@ def get_data(
             data_type: str = 'general',
             **kwargs
     ) -> pd.DataFrame:
-        """ """
+        """Get the data as a dataframe
+
+        Get the general data or the theme data for World Bank Projects as a dataframe.
+        Optionally, you can specify the project codes to retrieve data for. If no project codes
+        are specified, data for all projects will be returned.
+
+        Args:
+            project_codes: project codes to retrieve data for. If 'all', data for all projects will be returned
+            data_type: type of data to retrieve. Either 'general' or 'theme'
+        """
 
         if data_type == 'general':
             df = self._data['general_data']
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
index b994a27..2baffc8 100644
--- a/tests/test_import_tools/test_world_bank_projects.py
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -139,16 +139,16 @@ def test_clean_theme():
          ]
     }
 
-    formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': '34'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': '13'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': '34'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': '8'},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': '26'}]
+    formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': 34},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': 13},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': 34},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': 8},
+                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': 26}]
 
     assert world_bank_projects.clean_theme(test_data_dict) == formatted
 

From 2caa2fb608ebf4910d5444b656a580628f2661c2 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Tue, 4 Jul 2023 17:23:50 +0200
Subject: [PATCH 04/13] ignore json downloads

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 1b6675b..d126c3c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -107,6 +107,7 @@ ENV/
 # downloaded files
 /bblocks/.raw_data/*.csv
 /bblocks/.raw_data/*.feather
+/bblocks/.raw_data/*.json
 
 # Sphinx documentation
 docs/_build/

From 700cb9c6bb3ad5849f97bb323efff3518a7e370c Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Tue, 4 Jul 2023 17:26:37 +0200
Subject: [PATCH 05/13] formatting

---
 bblocks/import_tools/world_bank_projects.py   | 240 +++++++-------
 .../test_world_bank_projects.py               | 307 +++++++++++++-----
 2 files changed, 348 insertions(+), 199 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index 3d3ede9..33c88db 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -25,8 +25,11 @@ class QueryAPI:
     """Helper class for querying the World Bank Projects API"""
 
     def __init__(
-            self, response_format: str = 'json', max_rows_per_response: int = 500,
-            start_date: str | None = None, end_date: str | None = None
+        self,
+        response_format: str = "json",
+        max_rows_per_response: int = 500,
+        start_date: str | None = None,
+        end_date: str | None = None,
     ):
         """Initialize QueryAPI object"""
 
@@ -36,11 +39,11 @@ def __init__(
         self.end_date = end_date
 
         self._params = {
-            'format': self.response_format,
-            'rows': self.max_rows_per_response,
+            "format": self.response_format,
+            "rows": self.max_rows_per_response,
             # 'os': 0, # offset
-            'strdate': self.start_date,
-            'enddate': self.end_date
+            "strdate": self.start_date,
+            "enddate": self.end_date,
         }
 
         self._check_params()
@@ -51,22 +54,22 @@ def _check_params(self) -> None:
         """Check parameters"""
 
         # if end_date is before start_date, raise error
-        if self._params['strdate'] is not None and self._params['enddate'] is not None:
-            if self._params['enddate'] < self._params['strdate']:
+        if self._params["strdate"] is not None and self._params["enddate"] is not None:
+            if self._params["enddate"] < self._params["strdate"]:
                 raise ValueError("end date must be after start date")
 
         # if max_rows is greater than 1000, raise error
-        if self._params['rows'] > 1000:
+        if self._params["rows"] > 1000:
             raise ValueError("max_rows must be less than or equal to 1000")
 
         # if dates are None, drop them from params
-        if self._params['strdate'] is None:
+        if self._params["strdate"] is None:
             # drop start_date from params
-            self._params.pop('strdate')
+            self._params.pop("strdate")
 
-        if self._params['enddate'] is None:
+        if self._params["enddate"] is None:
             # drop end_date from params
-            self._params.pop('enddate')
+            self._params.pop("enddate")
 
     def _request(self) -> dict:
         """Single request to API. Returns the rsponse json."""
@@ -74,14 +77,14 @@ def _request(self) -> dict:
         try:
             response = requests.get(BASE_API_URL, params=self._params)
             response.raise_for_status()
-            data = response.json()['projects']  # keep only the projects data
+            data = response.json()["projects"]  # keep only the projects data
 
             return data
 
         except Exception as e:
             raise Exception(f"Failed to get data: {e}")
 
-    def request_data(self) -> 'QueryAPI':
+    def request_data(self) -> "QueryAPI":
         """Request data from API
 
         This method will request all the data from the API
@@ -93,7 +96,7 @@ def request_data(self) -> 'QueryAPI':
             'QueryAPI' to allow chaining of methods
         """
 
-        self._params['os'] = 0  # reset offset to 0
+        self._params["os"] = 0  # reset offset to 0
 
         while True:
 
@@ -108,7 +111,7 @@ def request_data(self) -> 'QueryAPI':
             self.response_data.update(data)
 
             # update offset
-            self._params['os'] += self._params['rows']
+            self._params["os"] += self._params["rows"]
 
         # Log if no data was returned from API
         if len(self.response_data) == 0:
@@ -137,38 +140,50 @@ def clean_theme(data: dict) -> list[dict] | list:
     """
 
     # if there are no themes, return an empty list
-    if 'theme_list' not in data.keys():
+    if "theme_list" not in data.keys():
         # return [{'project ID': proj_id}]
         return []
 
     theme_list = []
-    proj_id = data['id']
-    for theme1 in data['theme_list']:
+    proj_id = data["id"]
+    for theme1 in data["theme_list"]:
 
         # get first theme
-        name = theme1['name']
-        theme_list.append({'project ID': proj_id,
-                           'theme1': name,
-                           'percent': clean.clean_number(theme1['percent'])})
+        name = theme1["name"]
+        theme_list.append(
+            {
+                "project ID": proj_id,
+                "theme1": name,
+                "percent": clean.clean_number(theme1["percent"]),
+            }
+        )
 
         # get 2nd theme
-        if 'theme2' in theme1.keys():
-            for theme2 in theme1['theme2']:
-                name_2 = theme2['name']
-                theme_list.append({'project ID': proj_id,
-                                   'theme1': name,
-                                   'theme2': name_2,
-                                   'percent': clean.clean_number(theme2['percent'])})
+        if "theme2" in theme1.keys():
+            for theme2 in theme1["theme2"]:
+                name_2 = theme2["name"]
+                theme_list.append(
+                    {
+                        "project ID": proj_id,
+                        "theme1": name,
+                        "theme2": name_2,
+                        "percent": clean.clean_number(theme2["percent"]),
+                    }
+                )
 
                 # get 3rd theme
-                if 'theme3' in theme2.keys():
-                    for theme3 in theme2['theme3']:
-                        name_3 = theme3['name']
-                        theme_list.append({'project ID': proj_id,
-                                           'theme1': name,
-                                           'theme2': name_2,
-                                           'theme3': name_3,
-                                           'percent': clean.clean_number(theme3['percent'])})
+                if "theme3" in theme2.keys():
+                    for theme3 in theme2["theme3"]:
+                        name_3 = theme3["name"]
+                        theme_list.append(
+                            {
+                                "project ID": proj_id,
+                                "theme1": name,
+                                "theme2": name_2,
+                                "theme3": name_3,
+                                "percent": clean.clean_number(theme3["percent"]),
+                            }
+                        )
     return theme_list
 
 
@@ -183,43 +198,41 @@ def clean_sector(sector_series: pd.Series) -> pd.Series:
         series of sector data as a string separated by ' | '
     """
 
-    return (sector_series
-            .apply(lambda x: ' | '.join([item['Name']
-                                         for item in x]) if isinstance(x, list) else np.nan)
-            )
+    return sector_series.apply(
+        lambda x: " | ".join([item["Name"] for item in x])
+        if isinstance(x, list)
+        else np.nan
+    )
 
 
 general_fields = {  # general info
-    'id': 'project ID',
-    'project_name': 'project name',
-    'countryshortname': 'country',
-    'regionname': 'region name',
-    'url': 'url',
-    'teamleadname': 'team leader',
-    'status': 'status',
-    'envassesmentcategorycode': 'environmental assesment category',
-
+    "id": "project ID",
+    "project_name": "project name",
+    "countryshortname": "country",
+    "regionname": "region name",
+    "url": "url",
+    "teamleadname": "team leader",
+    "status": "status",
+    "envassesmentcategorycode": "environmental assesment category",
     # dates
-    'approvalfy': 'fiscal year',
-    'boardapprovaldate': 'board approval date',
-    'closingdate': 'closing date',
-    'p2a_updated_date': 'update date',
-
+    "approvalfy": "fiscal year",
+    "boardapprovaldate": "board approval date",
+    "closingdate": "closing date",
+    "p2a_updated_date": "update date",
     # lending
-    'lendinginstr': 'lending instrument',
-    'borrower': 'borrower',
-    'impagency': 'implementing agency',
-    'lendprojectcost': 'project cost',
-    'totalcommamt': 'total commitment',
-    'grantamt': 'grant amount',
-    'idacommamt': 'IDA commitment amount',
-    'ibrdcommamt': 'IBRD commitment amount',
-    'curr_total_commitment': 'current total IBRD and IDA commitment',
-    'curr_ibrd_commitment': 'current IBRD commitment',
-    'curr_ida_commitment': 'current IDA commitment',
-
+    "lendinginstr": "lending instrument",
+    "borrower": "borrower",
+    "impagency": "implementing agency",
+    "lendprojectcost": "project cost",
+    "totalcommamt": "total commitment",
+    "grantamt": "grant amount",
+    "idacommamt": "IDA commitment amount",
+    "ibrdcommamt": "IBRD commitment amount",
+    "curr_total_commitment": "current total IBRD and IDA commitment",
+    "curr_ibrd_commitment": "current IBRD commitment",
+    "curr_ida_commitment": "current IDA commitment",
     # sectors
-    'sector': 'sectors',
+    "sector": "sectors",
 }
 
 
@@ -249,37 +262,47 @@ class WorldBankProjects(ImportData):
     def _path(self):
         """Generate path based on version"""
 
-        start_date = f'_{self.start_date}' if self.start_date is not None else ''
-        end_date = f'_{self.end_date}' if self.end_date is not None else ''
+        start_date = f"_{self.start_date}" if self.start_date is not None else ""
+        end_date = f"_{self.end_date}" if self.end_date is not None else ""
 
         return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json"
 
     def _format_general_data(self) -> None:
         """Clean and format general data and store it in _data attribute with key 'general_data'"""
 
-        numeric_cols = ['lendprojectcost', 'totalcommamt', 'grantamt', 'idacommamt',
-                        'ibrdcommamt', 'curr_total_commitment', 'curr_ibrd_commitment',
-                        'curr_ida_commitment']
-
-        self._data['general_data'] = (pd
-                                      .DataFrame.from_dict(self._raw_data, orient='index')
-                                      .reset_index(drop=True)
-                                      .loc[:, general_fields.keys()]
-                                      # change fiscal year to int
-                                      .assign(
-            approvalfy=lambda d: clean.clean_numeric_series(d['approvalfy'], to=int))
-                                      # change numeric columns to float
-                                      .pipe(clean.clean_numeric_series, series_columns=numeric_cols)
-                                      .assign(  # format dates
-            boardapprovaldate=lambda d: clean.to_date_column(d['boardapprovaldate']),
-            closingdate=lambda d: clean.to_date_column(d['closingdate']),
-            p2a_updated_date=lambda d: clean.to_date_column(d['p2a_updated_date']),
-            # format sectors
-            sector=lambda d: clean_sector(d['sector'])
+        numeric_cols = [
+            "lendprojectcost",
+            "totalcommamt",
+            "grantamt",
+            "idacommamt",
+            "ibrdcommamt",
+            "curr_total_commitment",
+            "curr_ibrd_commitment",
+            "curr_ida_commitment",
+        ]
+
+        self._data["general_data"] = (
+            pd.DataFrame.from_dict(self._raw_data, orient="index")
+            .reset_index(drop=True)
+            .loc[:, general_fields.keys()]
+            # change fiscal year to int
+            .assign(
+                approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int)
+            )
+            # change numeric columns to float
+            .pipe(clean.clean_numeric_series, series_columns=numeric_cols)
+            .assign(  # format dates
+                boardapprovaldate=lambda d: clean.to_date_column(
+                    d["boardapprovaldate"]
+                ),
+                closingdate=lambda d: clean.to_date_column(d["closingdate"]),
+                p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]),
+                # format sectors
+                sector=lambda d: clean_sector(d["sector"]),
+            )
+            # rename columns
+            .rename(columns=general_fields)
         )
-                                      # rename columns
-                                      .rename(columns=general_fields)
-                                      )
 
     def _format_theme_data(self) -> None:
         """Format theme data and store it as a dataframe in _data attribute with key 'theme_data'"""
@@ -288,16 +311,17 @@ def _format_theme_data(self) -> None:
         for _, proj_data in self._raw_data.items():
             theme_data.extend(clean_theme(proj_data))
 
-        self._data['theme_data'] = pd.DataFrame(theme_data)
+        self._data["theme_data"] = pd.DataFrame(theme_data)
 
     def _download(self) -> None:
         """Download data from World Bank Projects API and save it as a json file"""
 
-        with open(self._path, 'w') as file:
-            data = (QueryAPI(start_date=self.start_date, end_date=self.end_date)
-                    .request_data()
-                    .get_data()
-                    )
+        with open(self._path, "w") as file:
+            data = (
+                QueryAPI(start_date=self.start_date, end_date=self.end_date)
+                .request_data()
+                .get_data()
+            )
             json.dump(data, file)
 
         logger.info(f"Successfully downloaded World Bank Projects")
@@ -352,9 +376,7 @@ def update_data(self, reload: bool = True) -> ImportData:
         return self
 
     def get_data(
-            self, project_codes: str | list = 'all',
-            data_type: str = 'general',
-            **kwargs
+        self, project_codes: str | list = "all", data_type: str = "general", **kwargs
     ) -> pd.DataFrame:
         """Get the data as a dataframe
 
@@ -367,17 +389,17 @@ def get_data(
             data_type: type of data to retrieve. Either 'general' or 'theme'
         """
 
-        if data_type == 'general':
-            df = self._data['general_data']
-        elif data_type == 'theme':
-            df = self._data['theme_data']
+        if data_type == "general":
+            df = self._data["general_data"]
+        elif data_type == "theme":
+            df = self._data["theme_data"]
         else:
             raise ValueError("data_type must be either 'general' or 'theme'")
 
-        if project_codes != 'all':
+        if project_codes != "all":
             if isinstance(project_codes, str):
                 project_codes = [project_codes]
-            df = df[df['project ID'].isin(project_codes)]
+            df = df[df["project ID"].isin(project_codes)]
 
         return df
 
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
index 2baffc8..0531b39 100644
--- a/tests/test_import_tools/test_world_bank_projects.py
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -17,28 +17,40 @@ def test_init(self):
 
         # test that error is raised if end_date is before start_date
         with pytest.raises(ValueError):
-            world_bank_projects.QueryAPI(start_date='2020-01-01', end_date='2019-01-01')
+            world_bank_projects.QueryAPI(start_date="2020-01-01", end_date="2019-01-01")
 
         # test that error is raised if max_rows_per_response is greater than 1000
         with pytest.raises(ValueError):
             world_bank_projects.QueryAPI(max_rows_per_response=1001)
 
         # test that start_date is dropped if end_date is None
-        assert 'strdate' not in world_bank_projects.QueryAPI(end_date='2020-01-01',
-                                                             start_date=None)._params
+        assert (
+            "strdate"
+            not in world_bank_projects.QueryAPI(
+                end_date="2020-01-01", start_date=None
+            )._params
+        )
 
         # test that end_date is dropped if start_date is None
-        assert 'enddate' not in world_bank_projects.QueryAPI(start_date='2020-01-01',
-                                                             end_date=None)._params
+        assert (
+            "enddate"
+            not in world_bank_projects.QueryAPI(
+                start_date="2020-01-01", end_date=None
+            )._params
+        )
 
     def test_request(self):
         """ """
         mock_response = Mock()
         mock_response.status_code = 200
-        mock_response.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}}
+        mock_response.json.return_value = {
+            "projects": {"P1234": {"name": "Test Project"}}
+        }
 
         with patch("requests.get", return_value=mock_response) as mock_get:
-            assert world_bank_projects.QueryAPI()._request() == {'P1234': {'name': 'Test Project'}}
+            assert world_bank_projects.QueryAPI()._request() == {
+                "P1234": {"name": "Test Project"}
+            }
 
     def test_request_error(self):
         """Test that error is raised if request fails."""
@@ -48,7 +60,9 @@ def test_request_error(self):
                 requests.exceptions.HTTPError
             )
             mock_get.return_value.status_code = 404
-            mock_get.json.return_value = {'projects': {'P1234': {'name': 'Test Project'}}}
+            mock_get.json.return_value = {
+                "projects": {"P1234": {"name": "Test Project"}}
+            }
 
             with pytest.raises(Exception):
                 world_bank_projects.QueryAPI()._request()
@@ -58,7 +72,9 @@ def test_request_data_no_data(self):
 
         mock_response = Mock()
         mock_response.status_code = 200
-        mock_response.json.return_value = {'projects': {}}  # test that empty response is handled
+        mock_response.json.return_value = {
+            "projects": {}
+        }  # test that empty response is handled
 
         with pytest.raises(world_bank_projects.EmptyDataException):
             with patch("requests.get", return_value=mock_response) as mock_get:
@@ -69,86 +85,181 @@ def test_request_data(self):
         """Test request_data method."""
 
         # Mocking the requests.get function
-        mocked_get = MagicMock(side_effect=[
-            Mock(json=MagicMock(return_value={'projects': {'P1': {'name': 'Test Project 1'},
-                                                           'P2': {'name': 'Test Project 2'}
-                                                           }
-                                              })),
-            Mock(json=MagicMock(return_value={'projects':{'P3': {'name': 'Test Project 3'}}})),
-            Mock(json=MagicMock(return_value={'projects': {}}))
-        ])
+        mocked_get = MagicMock(
+            side_effect=[
+                Mock(
+                    json=MagicMock(
+                        return_value={
+                            "projects": {
+                                "P1": {"name": "Test Project 1"},
+                                "P2": {"name": "Test Project 2"},
+                            }
+                        }
+                    )
+                ),
+                Mock(
+                    json=MagicMock(
+                        return_value={"projects": {"P3": {"name": "Test Project 3"}}}
+                    )
+                ),
+                Mock(json=MagicMock(return_value={"projects": {}})),
+            ]
+        )
 
         with patch("bblocks.import_tools.world_bank_projects.requests.get", mocked_get):
             obj = world_bank_projects.QueryAPI()
             obj.request_data()
 
-            assert obj.response_data == {'P1': {'name': 'Test Project 1'},
-                                         'P2': {'name': 'Test Project 2'},
-                                         'P3': {'name': 'Test Project 3'}
-                                         }
+            assert obj.response_data == {
+                "P1": {"name": "Test Project 1"},
+                "P2": {"name": "Test Project 2"},
+                "P3": {"name": "Test Project 3"},
+            }
 
 
 def test_clean_theme():
     """Test clean_theme function."""
 
-    test_data_dict = {'id': 'P1234',
-                      'theme_list':
-        [{'name': 'Environment and Natural Resource Management',
-          'code': '80',
-          'seqno': '14',
-          'percent': '34',
-          'theme2': [
-              {'name': 'Energy',
-               'code': '86',
-               'seqno': '18',
-               'percent': '13',
-               'theme3': [
-                   { 'name': 'Energy Efficiency',
-                     'code': '861',
-                     'seqno': '34',
-                     'percent': '13'},
-                   {'name': 'Energy Policies & Reform',
-                    'code': '862',
-                    'seqno': '35',
-                    'percent': '13'}]
-               },
-              {'name': 'Environmental policies and institutions',
-               'code': '84',
-               'seqno': '17',
-               'percent': '13'},
-
-              {'name': 'Environmental Health and Pollution Management',
-               'code': '82',
-               'seqno': '16',
-               'percent': '13',
-               'theme3': [
-                   {'name': 'Air quality management',
-                    'code': '821',
-                    'seqno': '33',
-                    'percent': '13'}]},
-              {'name': 'Climate change',
-               'code': '81',
-               'seqno': '15',
-               'percent': '34',
-               'theme3': [
-                   {'name': 'Adaptation',
-                    'code': '812',
-                    'seqno': '32',
-                    'percent': '8'},
-                   {'name': 'Mitigation', 'code': '811', 'seqno': '31', 'percent': '26'}]}]}
-         ]
+    test_data_dict = {
+        "id": "P1234",
+        "theme_list": [
+            {
+                "name": "Environment and Natural Resource Management",
+                "code": "80",
+                "seqno": "14",
+                "percent": "34",
+                "theme2": [
+                    {
+                        "name": "Energy",
+                        "code": "86",
+                        "seqno": "18",
+                        "percent": "13",
+                        "theme3": [
+                            {
+                                "name": "Energy Efficiency",
+                                "code": "861",
+                                "seqno": "34",
+                                "percent": "13",
+                            },
+                            {
+                                "name": "Energy Policies & Reform",
+                                "code": "862",
+                                "seqno": "35",
+                                "percent": "13",
+                            },
+                        ],
+                    },
+                    {
+                        "name": "Environmental policies and institutions",
+                        "code": "84",
+                        "seqno": "17",
+                        "percent": "13",
+                    },
+                    {
+                        "name": "Environmental Health and Pollution Management",
+                        "code": "82",
+                        "seqno": "16",
+                        "percent": "13",
+                        "theme3": [
+                            {
+                                "name": "Air quality management",
+                                "code": "821",
+                                "seqno": "33",
+                                "percent": "13",
+                            }
+                        ],
+                    },
+                    {
+                        "name": "Climate change",
+                        "code": "81",
+                        "seqno": "15",
+                        "percent": "34",
+                        "theme3": [
+                            {
+                                "name": "Adaptation",
+                                "code": "812",
+                                "seqno": "32",
+                                "percent": "8",
+                            },
+                            {
+                                "name": "Mitigation",
+                                "code": "811",
+                                "seqno": "31",
+                                "percent": "26",
+                            },
+                        ],
+                    },
+                ],
+            }
+        ],
     }
 
-    formatted = [{'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'percent': 34},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Efficiency', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Energy', 'theme3': 'Energy Policies & Reform', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental policies and institutions', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Environmental Health and Pollution Management', 'theme3': 'Air quality management', 'percent': 13},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'percent': 34},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Adaptation', 'percent': 8},
-                 {'project ID': 'P1234', 'theme1': 'Environment and Natural Resource Management', 'theme2': 'Climate change', 'theme3': 'Mitigation', 'percent': 26}]
+    formatted = [
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "percent": 34,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Energy",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Energy",
+            "theme3": "Energy Efficiency",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Energy",
+            "theme3": "Energy Policies & Reform",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Environmental policies and institutions",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Environmental Health and Pollution Management",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Environmental Health and Pollution Management",
+            "theme3": "Air quality management",
+            "percent": 13,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Climate change",
+            "percent": 34,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Climate change",
+            "theme3": "Adaptation",
+            "percent": 8,
+        },
+        {
+            "project ID": "P1234",
+            "theme1": "Environment and Natural Resource Management",
+            "theme2": "Climate change",
+            "theme3": "Mitigation",
+            "percent": 26,
+        },
+    ]
 
     assert world_bank_projects.clean_theme(test_data_dict) == formatted
 
@@ -156,24 +267,40 @@ def test_clean_theme():
 def test_clean_theme_no_theme():
     """Test clean_theme function with no theme."""
 
-    test_data_dict = {'id': 'P1234'
-                      }
+    test_data_dict = {"id": "P1234"}
     assert world_bank_projects.clean_theme(test_data_dict) == []
 
 
 def test_clean_sector():
     """Test clean_sector function."""
 
-    test_series = pd.Series({0: [{'Name': 'Public Administration - Transportation'}, {'Name': 'Ports/Waterways'}],
-                             1: [{'Name': 'Public Administration - Agriculture, Fishing & Forestry'}, {'Name': 'Agricultural Extension, Research, and Other Support Activities'}, {'Name': 'Other Agriculture, Fishing and Forestry'}, {'Name': 'Irrigation and Drainage'}, {'Name': 'Agricultural markets, commercialization and agri-business'}],
-                             2: np.nan,
-                             3: np.nan})
-
-    expected = pd.Series({0: 'Public Administration - Transportation | Ports/Waterways',
-                          1: 'Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business',
-                          2: np.nan,
-                          3: np.nan})
+    test_series = pd.Series(
+        {
+            0: [
+                {"Name": "Public Administration - Transportation"},
+                {"Name": "Ports/Waterways"},
+            ],
+            1: [
+                {"Name": "Public Administration - Agriculture, Fishing & Forestry"},
+                {
+                    "Name": "Agricultural Extension, Research, and Other Support Activities"
+                },
+                {"Name": "Other Agriculture, Fishing and Forestry"},
+                {"Name": "Irrigation and Drainage"},
+                {"Name": "Agricultural markets, commercialization and agri-business"},
+            ],
+            2: np.nan,
+            3: np.nan,
+        }
+    )
+
+    expected = pd.Series(
+        {
+            0: "Public Administration - Transportation | Ports/Waterways",
+            1: "Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business",
+            2: np.nan,
+            3: np.nan,
+        }
+    )
 
     assert world_bank_projects.clean_sector(test_series).equals(expected)
-
-

From 022a9d878b672778835a99d6b9cea7d3ede1c50c Mon Sep 17 00:00:00 2001
From: Jorge Rivera <jorge.rivera@one.org>
Date: Thu, 6 Jul 2023 10:14:31 +0200
Subject: [PATCH 06/13] minor formatting tweaks

Also switches the API base URL from http to https.
---
 bblocks/import_tools/world_bank_projects.py | 36 +++++++++++----------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index 33c88db..c436cf0 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -18,7 +18,7 @@ class EmptyDataException(Exception):
     pass
 
 
-BASE_API_URL = "http://search.worldbank.org/api/v2/projects"
+BASE_API_URL = "https://search.worldbank.org/api/v2/projects"
 
 
 class QueryAPI:
@@ -53,7 +53,7 @@ def __init__(
     def _check_params(self) -> None:
         """Check parameters"""
 
-        # if end_date is before start_date, raise error
+        # if end_date is before start_date, raise error.
         if self._params["strdate"] is not None and self._params["enddate"] is not None:
             if self._params["enddate"] < self._params["strdate"]:
                 raise ValueError("end date must be after start date")
@@ -72,7 +72,7 @@ def _check_params(self) -> None:
             self._params.pop("enddate")
 
     def _request(self) -> dict:
-        """Single request to API. Returns the rsponse json."""
+        """Single request to API. Returns the response json."""
 
         try:
             response = requests.get(BASE_API_URL, params=self._params)
@@ -120,7 +120,7 @@ def request_data(self) -> "QueryAPI":
         return self
 
     def get_data(self) -> dict[dict]:
-        """Get the data, or request it if it hasn't been requested yet"""
+        """Get the data, or request it if it hasn't been requested yet."""
 
         if len(self.response_data) == 0:
             self.request_data()
@@ -129,8 +129,9 @@ def get_data(self) -> dict[dict]:
 
 
 def clean_theme(data: dict) -> list[dict] | list:
-    """Clean theme data from a nested list to a list of dictionaries with theme names and percentages
-    If there are no themes, an empty list will be returned
+    """Clean theme data from a nested list to a list of dictionaries with theme names and
+    percentages.
+    If there are no themes, an empty list will be returned.
 
     Args:
         data: data from API
@@ -189,7 +190,7 @@ def clean_theme(data: dict) -> list[dict] | list:
 
 def clean_sector(sector_series: pd.Series) -> pd.Series:
     """Format sector data from a nested list to a string separating sectors by ' | '
-    If there are no sectors, np.nan will be placed in the series row
+    If there are no sectors, np.nan will be placed in the series row.
 
     Args:
         sector_series: series of sector data
@@ -247,12 +248,12 @@ class WorldBankProjects(ImportData):
     be loaded to the object from disk, otherwise it will be downloaded from the API.
     To retrieve the data, call the get_data method. You can specify the type of data to retrieve,
     either 'general' or 'theme'. If no type is specified, 'general' data will be returned.
-    To update the data, call the update_data method. This will download the data from the API. if 'reload' is
-    set to True, the data will be reloaded to the object.
+    To update the data, call the update_data method. This will download the data from the API.
+    If 'reload' is set to True, the data will be reloaded to the object.
 
     Parameters:
         start_date: start date of data to import, in DD-MM-YYYY format
-        end_date: end date of data to import, in DD-MM-YYYY format
+        end_date: end date of data to import, in DD-MM-YYYY format.
     """
 
     start_date: str | None = None
@@ -285,7 +286,7 @@ def _format_general_data(self) -> None:
             pd.DataFrame.from_dict(self._raw_data, orient="index")
             .reset_index(drop=True)
             .loc[:, general_fields.keys()]
-            # change fiscal year to int
+            # change the fiscal year to int
             .assign(
                 approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int)
             )
@@ -314,7 +315,7 @@ def _format_theme_data(self) -> None:
         self._data["theme_data"] = pd.DataFrame(theme_data)
 
     def _download(self) -> None:
-        """Download data from World Bank Projects API and save it as a json file"""
+        """Download data from World Bank Projects API and save it as a json file."""
 
         with open(self._path, "w") as file:
             data = (
@@ -334,7 +335,7 @@ def load_data(self) -> ImportData:
         otherwise it will be downloaded from the API and saved as a json file and  loaded
         to the object.
 
-        returns:
+        Returns:
             object with loaded data
         """
 
@@ -363,9 +364,9 @@ def update_data(self, reload: bool = True) -> ImportData:
         If 'reload' is set to True, the data will be reloaded to the object.
 
         Args:
-            reload: if True, reload data to object after downloading it
+            reload: if True, reload data to object after downloading it.
 
-        returns:
+        Returns:
             object with updated data
         """
 
@@ -380,12 +381,13 @@ def get_data(
     ) -> pd.DataFrame:
         """Get the data as a dataframe
 
-        Get the the general data or the theme data for World Bank Projects as a dataframe.
+        Get the general data, or the theme data for World Bank Projects as a dataframe.
         Optionally, you can specify the project codes to retrieve data for. If no project codes
         are specified, data for all projects will be returned.
 
         Args:
-            project_codes: project codes to retrieve data for. If 'all', data for all projects will be returned
+            project_codes: project codes to retrieve data for. If 'all', data for all projects
+            will be returned
             data_type: type of data to retrieve. Either 'general' or 'theme'
         """
 

From 88b1547a9582c9bc5c178205d9b827b2f3148df4 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Fri, 7 Jul 2023 12:35:30 +0200
Subject: [PATCH 07/13] add sectors functionality

---
 bblocks/import_tools/world_bank_projects.py | 66 ++++++++++++++-------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index c436cf0..e357d77 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -5,6 +5,7 @@
 import requests
 import json
 from dataclasses import dataclass
+import re
 
 from bblocks.logger import logger
 from bblocks.import_tools.common import ImportData
@@ -26,24 +27,24 @@ class QueryAPI:
 
     def __init__(
         self,
-        response_format: str = "json",
         max_rows_per_response: int = 500,
         start_date: str | None = None,
         end_date: str | None = None,
     ):
         """Initialize QueryAPI object"""
 
-        self.response_format = response_format
         self.max_rows_per_response = max_rows_per_response
         self.start_date = start_date
         self.end_date = end_date
 
         self._params = {
-            "format": self.response_format,
+            "format": 'json',
             "rows": self.max_rows_per_response,
             # 'os': 0, # offset
             "strdate": self.start_date,
             "enddate": self.end_date,
+            "fl": "*",
+            'apilang': 'en'
         }
 
         self._check_params()
@@ -188,22 +189,33 @@ def clean_theme(data: dict) -> list[dict] | list:
     return theme_list
 
 
-def clean_sector(sector_series: pd.Series) -> pd.Series:
-    """Format sector data from a nested list to a string separating sectors by ' | '
-    If there are no sectors, np.nan will be placed in the series row.
+def _get_sector_percentages(d: dict) -> dict:
+    """ """
 
-    Args:
-        sector_series: series of sector data
+    sectors_dict = {} # empty dict to store sector data as {sector_name: percent}
 
-    Returns:
-        series of sector data as a string separated by ' | '
-    """
+    sector_names = [v['Name'] for v in d['sector']] # get list of sector names
+    sectors = {key: value for key, value in d.items() if re.search(r'^sector\d+$', key)} # get sectors fields which should contain percentages
+
+    # get available sector percentages
+    for _, v in sectors.items():
+        if isinstance(v, dict):
+            sectors_dict[v['Name']] = v['Percent']
+
+    # check if there are missing sectors from the dict
+    if (len(sectors_dict)== len(sectors)-1) and (sum(sectors_dict.values())<100):
+
+        # loop through all the available sectors
+        for s in sector_names:
+
+            # if a sectors has not been picked up it must be the missing sector
+            if s not in sectors_dict.keys():
+                sectors_dict[s] = 100 - sum(sectors_dict.values())
 
-    return sector_series.apply(
-        lambda x: " | ".join([item["Name"] for item in x])
-        if isinstance(x, list)
-        else np.nan
-    )
+    if sum(sectors_dict.values())!=100:
+        raise ValueError("Sector percentages don't add up to 100%")
+
+    return sectors_dict
 
 
 general_fields = {  # general info
@@ -232,8 +244,6 @@ def clean_sector(sector_series: pd.Series) -> pd.Series:
     "curr_total_commitment": "current total IBRD and IDA commitment",
     "curr_ibrd_commitment": "current IBRD commitment",
     "curr_ida_commitment": "current IDA commitment",
-    # sectors
-    "sector": "sectors",
 }
 
 
@@ -298,8 +308,6 @@ def _format_general_data(self) -> None:
                 ),
                 closingdate=lambda d: clean.to_date_column(d["closingdate"]),
                 p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]),
-                # format sectors
-                sector=lambda d: clean_sector(d["sector"]),
             )
             # rename columns
             .rename(columns=general_fields)
@@ -314,6 +322,23 @@ def _format_theme_data(self) -> None:
 
         self._data["theme_data"] = pd.DataFrame(theme_data)
 
+    def _format_sector_data(self) -> None:
+        """Format sector data and store it as a dataframe in _data attribute
+        with key 'sector_data'"""
+
+        sector_data = []
+        for _, proj_data in self._raw_data.items():
+            if 'sector' in proj_data.keys():
+                proj_id = proj_data['id']
+
+                sectors = _get_sector_percentages(proj_data)
+                sector_data.extend([{'project ID': proj_id,
+                                     'sector': s,
+                                     'percent': p}
+                                    for s, p in sectors.items()])
+
+        self._data["sector_data"] = pd.DataFrame(sector_data)
+
     def _download(self) -> None:
         """Download data from World Bank Projects API and save it as a json file."""
 
@@ -353,6 +378,7 @@ def load_data(self) -> ImportData:
         # set data
         self._format_general_data()
         self._format_theme_data()
+        self._format_sector_data()
 
         logger.info(f"Successfully loaded World Bank Projects")
         return self

From c2e9f2f5b3e599c7afc532a0691e0ce41836e3a7 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Fri, 7 Jul 2023 12:40:23 +0200
Subject: [PATCH 08/13] Update world_bank_projects.py

---
 bblocks/import_tools/world_bank_projects.py | 24 +++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index e357d77..6d5e71a 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -190,12 +190,25 @@ def clean_theme(data: dict) -> list[dict] | list:
 
 
 def _get_sector_percentages(d: dict) -> dict:
-    """ """
+    """Get sector percentages from a project dictionary
+
+    the function first finds all available sectors
+    It then finds all  fields from the json starting with 'sector' and ending with a number
+    and gets a dictionary of the sector name and percentage
+    If there are any sectors missing from the dict and the total percentage is less than 100
+    the missing sector is added with the remaining percentage.
+    If the total is still not 100, it will raise an error to indicate a
+    problem with the data.
+
+    args:
+        d: project dictionary
+    """
 
     sectors_dict = {} # empty dict to store sector data as {sector_name: percent}
-
     sector_names = [v['Name'] for v in d['sector']] # get list of sector names
-    sectors = {key: value for key, value in d.items() if re.search(r'^sector\d+$', key)} # get sectors fields which should contain percentages
+
+    # get sectors fields which should contain percentages
+    sectors = {key: value for key, value in d.items() if re.search(r"^sector\d+$", key)}
 
     # get available sector percentages
     for _, v in sectors.items():
@@ -203,16 +216,15 @@ def _get_sector_percentages(d: dict) -> dict:
             sectors_dict[v['Name']] = v['Percent']
 
     # check if there are missing sectors from the dict
-    if (len(sectors_dict)== len(sectors)-1) and (sum(sectors_dict.values())<100):
+    if (len(sectors_dict) == len(sectors)-1) and (sum(sectors_dict.values())<100):
 
         # loop through all the available sectors
         for s in sector_names:
-
             # if a sectors has not been picked up it must be the missing sector
             if s not in sectors_dict.keys():
                 sectors_dict[s] = 100 - sum(sectors_dict.values())
 
-    if sum(sectors_dict.values())!=100:
+    if sum(sectors_dict.values()) != 100:
         raise ValueError("Sector percentages don't add up to 100%")
 
     return sectors_dict

From 37662bb4c8d6475ae32669fffb3de9c2125044b2 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Fri, 7 Jul 2023 12:55:54 +0200
Subject: [PATCH 09/13] test sector

---
 bblocks/import_tools/world_bank_projects.py   |  4 +-
 .../test_world_bank_projects.py               | 84 ++++++++++---------
 2 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index 6d5e71a..a103a76 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -189,7 +189,7 @@ def clean_theme(data: dict) -> list[dict] | list:
     return theme_list
 
 
-def _get_sector_percentages(d: dict) -> dict:
+def _get_sector_data(d: dict) -> dict:
     """Get sector percentages from a project dictionary
 
     the function first finds all available sectors
@@ -343,7 +343,7 @@ def _format_sector_data(self) -> None:
             if 'sector' in proj_data.keys():
                 proj_id = proj_data['id']
 
-                sectors = _get_sector_percentages(proj_data)
+                sectors = _get_sector_data(proj_data)
                 sector_data.extend([{'project ID': proj_id,
                                      'sector': s,
                                      'percent': p}
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
index 0531b39..64339ee 100644
--- a/tests/test_import_tools/test_world_bank_projects.py
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -25,18 +25,18 @@ def test_init(self):
 
         # test that start_date is dropped if end_date is None
         assert (
-            "strdate"
-            not in world_bank_projects.QueryAPI(
-                end_date="2020-01-01", start_date=None
-            )._params
+                "strdate"
+                not in world_bank_projects.QueryAPI(
+            end_date="2020-01-01", start_date=None
+        )._params
         )
 
         # test that end_date is dropped if start_date is None
         assert (
-            "enddate"
-            not in world_bank_projects.QueryAPI(
-                start_date="2020-01-01", end_date=None
-            )._params
+                "enddate"
+                not in world_bank_projects.QueryAPI(
+            start_date="2020-01-01", end_date=None
+        )._params
         )
 
     def test_request(self):
@@ -271,36 +271,44 @@ def test_clean_theme_no_theme():
     assert world_bank_projects.clean_theme(test_data_dict) == []
 
 
-def test_clean_sector():
-    """Test clean_sector function."""
+def test_get_sector_data():
+    """test the get_sector_data function."""
 
-    test_series = pd.Series(
-        {
-            0: [
-                {"Name": "Public Administration - Transportation"},
-                {"Name": "Ports/Waterways"},
-            ],
-            1: [
-                {"Name": "Public Administration - Agriculture, Fishing & Forestry"},
-                {
-                    "Name": "Agricultural Extension, Research, and Other Support Activities"
-                },
-                {"Name": "Other Agriculture, Fishing and Forestry"},
-                {"Name": "Irrigation and Drainage"},
-                {"Name": "Agricultural markets, commercialization and agri-business"},
-            ],
-            2: np.nan,
-            3: np.nan,
-        }
-    )
+    d = {'id': 'P1',
+         'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'},
+                    {'Name': 'Agricultural extension and research', 'code': 'AX'}
+                    ],
+         'sector1': {'Name': 'Agriculture, fishing, and forestry',
+                     'Percent': 50},
+         'sector2': {'Name': 'Agricultural extension and research',
+                     'Percent': 50}
+         }
+
+    expected = {'Agriculture, fishing, and forestry': 50,
+                'Agricultural extension and research': 50}
+
+    assert world_bank_projects._get_sector_data(d) == expected
+
+def test_get_sector_data_missing_sector():
+    """Test the get_sector_data function with missing sector."""
+
+    d = {'id': 'P2',
+         'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'},
+                    {'Name': 'Agricultural extension and research', 'code': 'AX'},
+                    {'Name': 'Missing sector', 'code': 'XX'}
+                    ],
+         'sector1': {'Name': 'Agriculture, fishing, and forestry',
+                     'Percent': 40},
+         'sector2': {'Name': 'Agricultural extension and research',
+                     'Percent': 50},
+         'sector3': 'Missing sector',
+         }
+
+    expected = {'Agriculture, fishing, and forestry': 40,
+                'Agricultural extension and research': 50,
+                'Missing sector': 10
+                }
+
+    assert world_bank_projects._get_sector_data(d) == expected
 
-    expected = pd.Series(
-        {
-            0: "Public Administration - Transportation | Ports/Waterways",
-            1: "Public Administration - Agriculture, Fishing & Forestry | Agricultural Extension, Research, and Other Support Activities | Other Agriculture, Fishing and Forestry | Irrigation and Drainage | Agricultural markets, commercialization and agri-business",
-            2: np.nan,
-            3: np.nan,
-        }
-    )
 
-    assert world_bank_projects.clean_sector(test_series).equals(expected)

From 9dd269d45175150f8c90b1c837ac99f112e28834 Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Fri, 7 Jul 2023 15:09:51 +0200
Subject: [PATCH 10/13] update

---
 bblocks/import_tools/world_bank_projects.py   | 68 ++++++++++-----
 .../test_world_bank_projects.py               | 82 ++++++++++---------
 2 files changed, 94 insertions(+), 56 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index a103a76..55d85e9 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -1,7 +1,6 @@
 """World Bank Projects Database Importer"""
 
 import pandas as pd
-import numpy as np
 import requests
 import json
 from dataclasses import dataclass
@@ -30,21 +29,31 @@ def __init__(
         max_rows_per_response: int = 500,
         start_date: str | None = None,
         end_date: str | None = None,
+        fields: list[str] | str = "*",
     ):
-        """Initialize QueryAPI object"""
+        """Initialize QueryAPI object
+
+        Args:
+            max_rows_per_response: maximum number of rows to return per request.
+                                Must be less than or equal to 1000.
+            start_date: start date of projects to return. Format: YYYY-MM-DD
+            end_date: end date of projects to return. Format: YYYY-MM-DD
+            fields: fields to return. Can be a list of strings or a single string.
+                    By default, all fields are returned ('*').
+        """
 
         self.max_rows_per_response = max_rows_per_response
         self.start_date = start_date
         self.end_date = end_date
+        self.fields = fields
 
         self._params = {
-            "format": 'json',
+            "format": "json",
             "rows": self.max_rows_per_response,
             # 'os': 0, # offset
             "strdate": self.start_date,
             "enddate": self.end_date,
-            "fl": "*",
-            'apilang': 'en'
+            "fl": self.fields,
         }
 
         self._check_params()
@@ -118,6 +127,7 @@ def request_data(self) -> "QueryAPI":
         if len(self.response_data) == 0:
             raise EmptyDataException("No data was returned from API")
 
+        logger.info(f"Retrieved {len(self.response_data)} projects from API")
         return self
 
     def get_data(self) -> dict[dict]:
@@ -204,8 +214,8 @@ def _get_sector_data(d: dict) -> dict:
         d: project dictionary
     """
 
-    sectors_dict = {} # empty dict to store sector data as {sector_name: percent}
-    sector_names = [v['Name'] for v in d['sector']] # get list of sector names
+    sectors_dict = {}  # empty dict to store sector data as {sector_name: percent}
+    sector_names = [v["Name"] for v in d["sector"]]  # get list of sector names
 
     # get sectors fields which should contain percentages
     sectors = {key: value for key, value in d.items() if re.search(r"^sector\d+$", key)}
@@ -213,10 +223,10 @@ def _get_sector_data(d: dict) -> dict:
     # get available sector percentages
     for _, v in sectors.items():
         if isinstance(v, dict):
-            sectors_dict[v['Name']] = v['Percent']
+            sectors_dict[v["Name"]] = v["Percent"]
 
     # check if there are missing sectors from the dict
-    if (len(sectors_dict) == len(sectors)-1) and (sum(sectors_dict.values())<100):
+    if (len(sectors_dict) == len(sectors) - 1) and (sum(sectors_dict.values()) < 100):
 
         # loop through all the available sectors
         for s in sector_names:
@@ -238,7 +248,11 @@ def _get_sector_data(d: dict) -> dict:
     "url": "url",
     "teamleadname": "team leader",
     "status": "status",
+    "last_stage_reached_name": "last stage reached",
+    "pdo": "project development objective",
+    "cons_serv_reqd_ind": "consulting services required",
     "envassesmentcategorycode": "environmental assesment category",
+    "esrc_ovrl_risk_rate": "environmental and social risk",
     # dates
     "approvalfy": "fiscal year",
     "boardapprovaldate": "board approval date",
@@ -246,6 +260,7 @@ def _get_sector_data(d: dict) -> dict:
     "p2a_updated_date": "update date",
     # lending
     "lendinginstr": "lending instrument",
+    "projectfinancialtype": "financing type",
     "borrower": "borrower",
     "impagency": "implementing agency",
     "lendprojectcost": "project cost",
@@ -253,6 +268,7 @@ def _get_sector_data(d: dict) -> dict:
     "grantamt": "grant amount",
     "idacommamt": "IDA commitment amount",
     "ibrdcommamt": "IBRD commitment amount",
+    "curr_project_cost": "current project cost",
     "curr_total_commitment": "current total IBRD and IDA commitment",
     "curr_ibrd_commitment": "current IBRD commitment",
     "curr_ida_commitment": "current IDA commitment",
@@ -274,8 +290,8 @@ class WorldBankProjects(ImportData):
     If 'reload' is set to True, the data will be reloaded to the object.
 
     Parameters:
-        start_date: start date of data to import, in DD-MM-YYYY format
-        end_date: end date of data to import, in DD-MM-YYYY format.
+        start_date: start date of data to import, in YYYY-MM-DD format
+        end_date: end date of data to import, in YYYY-MM-DD format.
     """
 
     start_date: str | None = None
@@ -302,6 +318,7 @@ def _format_general_data(self) -> None:
             "curr_total_commitment",
             "curr_ibrd_commitment",
             "curr_ida_commitment",
+            "curr_project_cost",
         ]
 
         self._data["general_data"] = (
@@ -340,20 +357,24 @@ def _format_sector_data(self) -> None:
 
         sector_data = []
         for _, proj_data in self._raw_data.items():
-            if 'sector' in proj_data.keys():
-                proj_id = proj_data['id']
+            if "sector" in proj_data.keys():
+                proj_id = proj_data["id"]
 
                 sectors = _get_sector_data(proj_data)
-                sector_data.extend([{'project ID': proj_id,
-                                     'sector': s,
-                                     'percent': p}
-                                    for s, p in sectors.items()])
+                sector_data.extend(
+                    [
+                        {"project ID": proj_id, "sector": s, "percent": p}
+                        for s, p in sectors.items()
+                    ]
+                )
 
         self._data["sector_data"] = pd.DataFrame(sector_data)
 
     def _download(self) -> None:
         """Download data from World Bank Projects API and save it as a json file."""
 
+        logger.info(f"Starting download of World Bank Projects")
+
         with open(self._path, "w") as file:
             data = (
                 QueryAPI(start_date=self.start_date, end_date=self.end_date)
@@ -426,15 +447,24 @@ def get_data(
         Args:
             project_codes: project codes to retrieve data for. If 'all', data for all projects
             will be returned
-            data_type: type of data to retrieve. Either 'general' or 'theme'
+            data_type: type of data to retrieve. Either 'general', 'sector' or 'theme'
+
+        Returns:
+            dataframe with the requested data
         """
 
+        # check if data has been loaded
+        if len(self._data) == 0:
+            raise EmptyDataException("Data has not been loaded. Run load_data() first.")
+
         if data_type == "general":
             df = self._data["general_data"]
         elif data_type == "theme":
             df = self._data["theme_data"]
+        elif data_type == "sector":
+            df = self._data["sector_data"]
         else:
-            raise ValueError("data_type must be either 'general' or 'theme'")
+            raise ValueError("data_type must be either 'general', 'theme' or 'sector'")
 
         if project_codes != "all":
             if isinstance(project_codes, str):
diff --git a/tests/test_import_tools/test_world_bank_projects.py b/tests/test_import_tools/test_world_bank_projects.py
index 64339ee..08564e9 100644
--- a/tests/test_import_tools/test_world_bank_projects.py
+++ b/tests/test_import_tools/test_world_bank_projects.py
@@ -1,8 +1,6 @@
 """Tests for the world_bank_projects module."""
 
 import pytest
-import pandas as pd
-import numpy as np
 import requests
 from unittest.mock import Mock, patch, MagicMock
 
@@ -25,18 +23,18 @@ def test_init(self):
 
         # test that start_date is dropped if end_date is None
         assert (
-                "strdate"
-                not in world_bank_projects.QueryAPI(
-            end_date="2020-01-01", start_date=None
-        )._params
+            "strdate"
+            not in world_bank_projects.QueryAPI(
+                end_date="2020-01-01", start_date=None
+            )._params
         )
 
         # test that end_date is dropped if start_date is None
         assert (
-                "enddate"
-                not in world_bank_projects.QueryAPI(
-            start_date="2020-01-01", end_date=None
-        )._params
+            "enddate"
+            not in world_bank_projects.QueryAPI(
+                start_date="2020-01-01", end_date=None
+            )._params
         )
 
     def test_request(self):
@@ -274,41 +272,51 @@ def test_clean_theme_no_theme():
 def test_get_sector_data():
     """test the get_sector_data function."""
 
-    d = {'id': 'P1',
-         'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'},
-                    {'Name': 'Agricultural extension and research', 'code': 'AX'}
-                    ],
-         'sector1': {'Name': 'Agriculture, fishing, and forestry',
-                     'Percent': 50},
-         'sector2': {'Name': 'Agricultural extension and research',
-                     'Percent': 50}
-         }
+    d = {
+        "id": "P1",
+        "sector": [
+            {"Name": "Agriculture, fishing, and forestry", "code": "BX"},
+            {"Name": "Agricultural extension and research", "code": "AX"},
+        ],
+        "sector1": {"Name": "Agriculture, fishing, and forestry", "Percent": 50},
+        "sector2": {"Name": "Agricultural extension and research", "Percent": 50},
+    }
 
-    expected = {'Agriculture, fishing, and forestry': 50,
-                'Agricultural extension and research': 50}
+    expected = {
+        "Agriculture, fishing, and forestry": 50,
+        "Agricultural extension and research": 50,
+    }
 
     assert world_bank_projects._get_sector_data(d) == expected
 
+
 def test_get_sector_data_missing_sector():
     """Test the get_sector_data function with missing sector."""
 
-    d = {'id': 'P2',
-         'sector': [{'Name': 'Agriculture, fishing, and forestry', 'code': 'BX'},
-                    {'Name': 'Agricultural extension and research', 'code': 'AX'},
-                    {'Name': 'Missing sector', 'code': 'XX'}
-                    ],
-         'sector1': {'Name': 'Agriculture, fishing, and forestry',
-                     'Percent': 40},
-         'sector2': {'Name': 'Agricultural extension and research',
-                     'Percent': 50},
-         'sector3': 'Missing sector',
-         }
-
-    expected = {'Agriculture, fishing, and forestry': 40,
-                'Agricultural extension and research': 50,
-                'Missing sector': 10
-                }
+    d = {
+        "id": "P2",
+        "sector": [
+            {"Name": "Agriculture, fishing, and forestry", "code": "BX"},
+            {"Name": "Agricultural extension and research", "code": "AX"},
+            {"Name": "Missing sector", "code": "XX"},
+        ],
+        "sector1": {"Name": "Agriculture, fishing, and forestry", "Percent": 40},
+        "sector2": {"Name": "Agricultural extension and research", "Percent": 50},
+        "sector3": "Missing sector",
+    }
+
+    expected = {
+        "Agriculture, fishing, and forestry": 40,
+        "Agricultural extension and research": 50,
+        "Missing sector": 10,
+    }
 
     assert world_bank_projects._get_sector_data(d) == expected
 
 
+def test_get_data_no_data_loaded():
+    """Test the get_data function with no data loaded."""
+
+    with pytest.raises(world_bank_projects.EmptyDataException):
+        proj = world_bank_projects.WorldBankProjects()
+        proj.get_data()

From d51dd852b514d7c1b14400a8492d94106303b69a Mon Sep 17 00:00:00 2001
From: Jorge Rivera <jorge.rivera@one.org>
Date: Mon, 10 Jul 2023 16:00:34 +0200
Subject: [PATCH 11/13] Update world_bank_projects.py

---
 bblocks/import_tools/world_bank_projects.py | 141 +++++++++++++-------
 1 file changed, 95 insertions(+), 46 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index 55d85e9..f6221c0 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -45,7 +45,7 @@ def __init__(
         self.max_rows_per_response = max_rows_per_response
         self.start_date = start_date
         self.end_date = end_date
-        self.fields = fields
+        self.fields = list(set(fields)) if isinstance(fields, list) else [fields]
 
         self._params = {
             "format": "json",
@@ -109,7 +109,6 @@ def request_data(self) -> "QueryAPI":
         self._params["os"] = 0  # reset offset to 0
 
         while True:
-
             # request data
             data = self._request()
 
@@ -139,6 +138,58 @@ def get_data(self) -> dict[dict]:
         return self.response_data
 
 
+def _append_theme_to_list(
+    proj_id: str, theme_list: list[dict], theme_names: list[str], theme: dict
+) -> None:
+    """Appends a theme to the theme_list.
+
+    Args:
+        proj_id: The project ID.
+        theme_list: The list of theme dictionaries to append to.
+        theme_names: The names of the parent themes.
+        theme: The theme to append.
+    """
+    new_theme = {
+        "project ID": proj_id,
+        **{f"theme{idx + 1}": name for idx, name in enumerate(theme_names)},
+        "percent": clean.clean_number(theme["percent"]),
+    }
+    theme_list.append(new_theme)
+
+
+def _parse_themes(
+    proj_id: str,
+    theme_list: list[dict],
+    theme_names: list[str],
+    theme: dict,
+    theme_level: int,
+) -> None:
+    """Recursive function to handle nested themes.
+
+    Args:
+        proj_id (str): The project ID.
+        theme_list (list[dict]): The list of theme dictionaries to append to.
+        theme_names (list[str]): The names of the parent themes.
+        theme (dict): The current theme.
+        theme_level (int): The current level of theme nesting.
+    """
+    # Append the current theme to the list
+    _append_theme_to_list(
+        proj_id=proj_id, theme_list=theme_list, theme_names=theme_names, theme=theme
+    )
+
+    # Recursively call this function for each nested theme
+    nested_theme_key = f"theme{theme_level + 1}"
+    for nested_theme in theme.get(nested_theme_key, []):
+        _parse_themes(
+            proj_id=proj_id,
+            theme_list=theme_list,
+            theme_names=theme_names + [nested_theme["name"]],
+            theme=nested_theme,
+            theme_level=theme_level + 1,
+        )
+
+
 def clean_theme(data: dict) -> list[dict] | list:
     """Clean theme data from a nested list to a list of dictionaries with theme names and
     percentages.
@@ -152,50 +203,15 @@ def clean_theme(data: dict) -> list[dict] | list:
     """
 
     # if there are no themes, return an empty list
-    if "theme_list" not in data.keys():
+    if "theme_list" not in data:
         # return [{'project ID': proj_id}]
         return []
 
     theme_list = []
     proj_id = data["id"]
     for theme1 in data["theme_list"]:
+        _parse_themes(proj_id, theme_list, [theme1["name"]], theme1, 1)
 
-        # get first theme
-        name = theme1["name"]
-        theme_list.append(
-            {
-                "project ID": proj_id,
-                "theme1": name,
-                "percent": clean.clean_number(theme1["percent"]),
-            }
-        )
-
-        # get 2nd theme
-        if "theme2" in theme1.keys():
-            for theme2 in theme1["theme2"]:
-                name_2 = theme2["name"]
-                theme_list.append(
-                    {
-                        "project ID": proj_id,
-                        "theme1": name,
-                        "theme2": name_2,
-                        "percent": clean.clean_number(theme2["percent"]),
-                    }
-                )
-
-                # get 3rd theme
-                if "theme3" in theme2.keys():
-                    for theme3 in theme2["theme3"]:
-                        name_3 = theme3["name"]
-                        theme_list.append(
-                            {
-                                "project ID": proj_id,
-                                "theme1": name,
-                                "theme2": name_2,
-                                "theme3": name_3,
-                                "percent": clean.clean_number(theme3["percent"]),
-                            }
-                        )
     return theme_list
 
 
@@ -227,11 +243,10 @@ def _get_sector_data(d: dict) -> dict:
 
     # check if there are missing sectors from the dict
     if (len(sectors_dict) == len(sectors) - 1) and (sum(sectors_dict.values()) < 100):
-
         # loop through all the available sectors
         for s in sector_names:
             # if a sectors has not been picked up it must be the missing sector
-            if s not in sectors_dict.keys():
+            if s not in sectors_dict:
                 sectors_dict[s] = 100 - sum(sectors_dict.values())
 
     if sum(sectors_dict.values()) != 100:
@@ -240,7 +255,7 @@ def _get_sector_data(d: dict) -> dict:
     return sectors_dict
 
 
-general_fields = {  # general info
+GENERAL_FIELDS = {  # general info
     "id": "project ID",
     "project_name": "project name",
     "countryshortname": "country",
@@ -253,6 +268,9 @@ def _get_sector_data(d: dict) -> dict:
     "cons_serv_reqd_ind": "consulting services required",
     "envassesmentcategorycode": "environmental assesment category",
     "esrc_ovrl_risk_rate": "environmental and social risk",
+    "transactiontype:": "transaction type",
+    "financier_loan": "financier loan",
+    "interestandcharges": "interest and charges",
     # dates
     "approvalfy": "fiscal year",
     "boardapprovaldate": "board approval date",
@@ -261,6 +279,8 @@ def _get_sector_data(d: dict) -> dict:
     # lending
     "lendinginstr": "lending instrument",
     "projectfinancialtype": "financing type",
+    "loantype": "loan type",
+    "loantypedesc": "loan type description",
     "borrower": "borrower",
     "impagency": "implementing agency",
     "lendprojectcost": "project cost",
@@ -272,6 +292,31 @@ def _get_sector_data(d: dict) -> dict:
     "curr_total_commitment": "current total IBRD and IDA commitment",
     "curr_ibrd_commitment": "current IBRD commitment",
     "curr_ida_commitment": "current IDA commitment",
+    "reapayment": "repayment",
+}
+
+OTHER_FIELDS = {
+    "projectstatusdisplay": "project status display",
+    "sector1": "sector1",
+    "sector2": "sector2",
+    "sector3": "sector3",
+    "sector4": "sector4",
+    "sector5": "sector5",
+    "sector6": "sector6",
+    "sector7": "sector7",
+    "sector8": "sector8",
+    "sector": "sector",
+    "theme1": "theme1",
+    "theme2": "theme2",
+    "theme3": "theme3",
+    "theme4": "theme4",
+    "theme5": "theme5",
+    "fiscal_year": "fiscal year",
+    "fiscalyear": "fiscal year",
+    "fiscalyear_budget": "fiscal year budget",
+    "project_abstract": "project abstract",
+    "sectorlist": "sectorlist",
+    "theme_list": "theme_list",
 }
 
 
@@ -324,7 +369,7 @@ def _format_general_data(self) -> None:
         self._data["general_data"] = (
             pd.DataFrame.from_dict(self._raw_data, orient="index")
             .reset_index(drop=True)
-            .loc[:, general_fields.keys()]
+            .filter(list(GENERAL_FIELDS), axis=1)
             # change the fiscal year to int
             .assign(
                 approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int)
@@ -339,7 +384,7 @@ def _format_general_data(self) -> None:
                 p2a_updated_date=lambda d: clean.to_date_column(d["p2a_updated_date"]),
             )
             # rename columns
-            .rename(columns=general_fields)
+            .rename(columns=GENERAL_FIELDS)
         )
 
     def _format_theme_data(self) -> None:
@@ -357,7 +402,7 @@ def _format_sector_data(self) -> None:
 
         sector_data = []
         for _, proj_data in self._raw_data.items():
-            if "sector" in proj_data.keys():
+            if "sector" in proj_data:
                 proj_id = proj_data["id"]
 
                 sectors = _get_sector_data(proj_data)
@@ -377,7 +422,11 @@ def _download(self) -> None:
 
         with open(self._path, "w") as file:
             data = (
-                QueryAPI(start_date=self.start_date, end_date=self.end_date)
+                QueryAPI(
+                    start_date=self.start_date,
+                    end_date=self.end_date,
+                    fields=list(GENERAL_FIELDS) + list(OTHER_FIELDS),
+                )
                 .request_data()
                 .get_data()
             )

From 06530c3d659059c1a09bf5ff141092d72cddb40f Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Wed, 19 Jul 2023 17:26:02 +0200
Subject: [PATCH 12/13] Update world_bank_projects.py

---
 bblocks/import_tools/world_bank_projects.py | 53 ++++++++++++++++-----
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index f6221c0..a00829f 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -292,7 +292,7 @@ def _get_sector_data(d: dict) -> dict:
     "curr_total_commitment": "current total IBRD and IDA commitment",
     "curr_ibrd_commitment": "current IBRD commitment",
     "curr_ida_commitment": "current IDA commitment",
-    "reapayment": "repayment",
+    "repayment": "repayment",
 }
 
 OTHER_FIELDS = {
@@ -351,7 +351,7 @@ def _path(self):
 
         return BBPaths.raw_data / f"world_bank_projects{start_date}{end_date}.json"
 
-    def _format_general_data(self) -> None:
+    def _format_general_data(self, additional_fields: list = None) -> None:
         """Clean and format general data and store it in _data attribute with key 'general_data'"""
 
         numeric_cols = [
@@ -369,7 +369,7 @@ def _format_general_data(self) -> None:
         self._data["general_data"] = (
             pd.DataFrame.from_dict(self._raw_data, orient="index")
             .reset_index(drop=True)
-            .filter(list(GENERAL_FIELDS), axis=1)
+            .filter(list(GENERAL_FIELDS) + additional_fields, axis=1)
             # change the fiscal year to int
             .assign(
                 approvalfy=lambda d: clean.clean_numeric_series(d["approvalfy"], to=int)
@@ -415,17 +415,24 @@ def _format_sector_data(self) -> None:
 
         self._data["sector_data"] = pd.DataFrame(sector_data)
 
-    def _download(self) -> None:
+    def _download(self, additional_fields: list | None = None) -> None:
         """Download data from World Bank Projects API and save it as a json file."""
 
         logger.info(f"Starting download of World Bank Projects")
 
+        if additional_fields is None:
+            additional_fields = []
+        if isinstance(additional_fields, str):
+            additional_fields = [additional_fields]
+
         with open(self._path, "w") as file:
             data = (
                 QueryAPI(
                     start_date=self.start_date,
                     end_date=self.end_date,
-                    fields=list(GENERAL_FIELDS) + list(OTHER_FIELDS),
+                    fields=list(GENERAL_FIELDS)
+                    + list(OTHER_FIELDS)
+                    + additional_fields,
                 )
                 .request_data()
                 .get_data()
@@ -434,7 +441,7 @@ def _download(self) -> None:
 
         logger.info(f"Successfully downloaded World Bank Projects")
 
-    def load_data(self) -> ImportData:
+    def load_data(self, *, additional_fields: str | list = None) -> ImportData:
         """Load data to the object
 
         This method will load the World Bank Project data to the object.
@@ -442,13 +449,34 @@ def load_data(self) -> ImportData:
         otherwise it will be downloaded from the API and saved as a json file and  loaded
         to the object.
 
+        Args:
+            additional_fields: additional fields to download from the API. If the data has
+                already been downloaded, the additional fields may not be loaded if they do not
+                exist in the downloaded file. To force download of data with additional fields,
+                use the update_data method, passing the additional fields as an argument.
+
         Returns:
             object with loaded data
         """
 
+        # check if additional fields is a string or None and convert to list
+        if additional_fields is None:
+            additional_fields = []
+        if isinstance(additional_fields, str):
+            additional_fields = [additional_fields]
+
         # if file does not exist, download it and save it as a json file
         if not self._path.exists():
-            self._download()
+            self._download(additional_fields=additional_fields)
+
+        # if file exists and additional fields are passed, log warning
+        else:
+            if not additional_fields:
+                logger.warning(
+                    "Data already exists in disk. The additional fields may not be "
+                    "loaded. To force download of data with additional fields, use the"
+                    " update_data method passing the additional fields as argument"
+                )
 
         # load data from json file
         with open(self._path, "r") as file:
@@ -458,14 +486,16 @@ def load_data(self) -> ImportData:
             raise EmptyDataException("No data was retrieved")
 
         # set data
-        self._format_general_data()
+        self._format_general_data(additional_fields=additional_fields)
         self._format_theme_data()
         self._format_sector_data()
 
         logger.info(f"Successfully loaded World Bank Projects")
         return self
 
-    def update_data(self, reload: bool = True) -> ImportData:
+    def update_data(
+        self, reload: bool = True, *, additional_fields: str | list = None
+    ) -> ImportData:
         """Force update of data
 
         This method will download the data from the API.
@@ -473,14 +503,15 @@ def update_data(self, reload: bool = True) -> ImportData:
 
         Args:
             reload: if True, reload data to object after downloading it.
+            additional_fields: additional fields to download
 
         Returns:
             object with updated data
         """
 
-        self._download()
+        self._download(additional_fields=additional_fields)
         if reload:
-            self.load_data()
+            self.load_data(additional_fields=additional_fields)
 
         return self
 

From 436ae1f2759bc635bd2a0f4101b3582855c8c7be Mon Sep 17 00:00:00 2001
From: Luca Picci <lpicci96@gmail.com>
Date: Thu, 20 Jul 2023 16:27:00 +0200
Subject: [PATCH 13/13] update

---
 CHANGELOG.md                                |  5 +++++
 bblocks/import_tools/world_bank_projects.py | 23 ++++++++++++---------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ab9b1e..76b618f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+[1.2.0] - 2023-07-20
+--------------------
+- Added new feature: `world_bank_projects` module in `import_tools` with an object 
+  to extract data from the World Bank Projects database.
+
 [1.1.1] - 2023-07-06
 --------------------
 - Updated requirements
diff --git a/bblocks/import_tools/world_bank_projects.py b/bblocks/import_tools/world_bank_projects.py
index a00829f..45a88d5 100644
--- a/bblocks/import_tools/world_bank_projects.py
+++ b/bblocks/import_tools/world_bank_projects.py
@@ -394,7 +394,10 @@ def _format_theme_data(self) -> None:
         for _, proj_data in self._raw_data.items():
             theme_data.extend(clean_theme(proj_data))
 
-        self._data["theme_data"] = pd.DataFrame(theme_data)
+        self._data["theme_data"] = pd.DataFrame(theme_data).filter(
+            ["project ID", "theme1", "theme2", "theme3", "theme4", "theme5", "percent"],
+            axis=1,
+        )
 
     def _format_sector_data(self) -> None:
         """Format sector data and store it as a dataframe in _data attribute
@@ -459,6 +462,15 @@ def load_data(self, *, additional_fields: str | list = None) -> ImportData:
             object with loaded data
         """
 
+        # if additional fields are set but the data is read from disk, log a warning
+        if self._path.exists() and additional_fields is not None:
+            logger.warning(
+                "Data already exists in disk. The additional fields might not be "
+                "loaded if they do not exist in the downloaded data. To force download "
+                "of data with additional fields, use the update_data method passing the "
+                "additional fields as argument"
+            )
+
         # check if additional fields is a string or None and convert to list
         if additional_fields is None:
             additional_fields = []
@@ -469,15 +481,6 @@ def load_data(self, *, additional_fields: str | list = None) -> ImportData:
         if not self._path.exists():
             self._download(additional_fields=additional_fields)
 
-        # if file exists and additional fields are passed, log warning
-        else:
-            if not additional_fields:
-                logger.warning(
-                    "Data already exists in disk. The additional fields may not be "
-                    "loaded. To force download of data with additional fields, use the"
-                    " update_data method passing the additional fields as argument"
-                )
-
         # load data from json file
         with open(self._path, "r") as file:
             self._raw_data = json.load(file)