Skip to content

Commit

Permalink
Merge pull request #13 from zenodallavalle/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
zenodallavalle authored Mar 7, 2023
2 parents e8c867f + 878659b commit 4265d8a
Show file tree
Hide file tree
Showing 9 changed files with 905 additions and 820 deletions.
5 changes: 2 additions & 3 deletions docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,8 @@ Or you can use italy-geopop to get data for your `pd.Series <https://pandas.pyda
Smart functionalities
-----------------------

For region and province, you can also use the ``smart_from_region`` and ``smart_from_province`` methods.
Those methods will try to guess the region or province from the input data and will return the data for the region or province only
if the match is not ambiguous.
``smart_from_municipality``, ``smart_from_region`` and ``smart_from_province`` methods are also available.
These methods try to identify the municipality, region or province mentioned in the input data and return the data only if the match is unequivocal.

.. code-block:: python
:lineno-start: 9
Expand Down
2 changes: 1 addition & 1 deletion italy_geopop/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.0b0"
__version__ = "0.3.0"
5 changes: 3 additions & 2 deletions italy_geopop/_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from functools import wraps
import pandas as pd
from typing import Any, Callable, Iterable
import re


def handle_return_cols(return_df, return_cols) -> pd.DataFrame:
Expand All @@ -26,7 +27,7 @@ def wrapper(*args, **kwargs) -> Any:


def match_single_word(words: Iterable[str], text: str) -> str | None:
"""return the word, taken from a list of words, that is found in text only if it's the only match.
"""return the word, taken from a list of words, that is found in text only if it's the only match. Word is searched as "exact word match".
:param words: a list or iterable of words to be searched into text.
:type words: Iterable[str]
Expand All @@ -38,7 +39,7 @@ def match_single_word(words: Iterable[str], text: str) -> str | None:
n_matches = 0
match = None
for word in words:
if word in text:
if re.search(r'\b{}\b'.format(word), text):
n_matches += 1
match = word
if n_matches == 1:
Expand Down
37 changes: 36 additions & 1 deletion italy_geopop/pandas_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,43 @@ def get_data(x) -> pd.Series:

return handle_return_cols(self._obj.apply(get_data), return_cols)

def smart_from_municipality(self, return_cols=None) -> pd.DataFrame | pd.Series:
    """Resolve municipalities like ``from_municipality``, but also from free text.

    The plain ``from_municipality`` lookup is run first. Every entry whose
    ``municipality`` is still null afterwards is stripped, lowercased and
    scanned with ``match_single_word`` against the string-indexed municipality
    table; data is filled in only when the match is unequivocal.

    .. code-block:: python
        :linenos:

        >>> s = pd.Series(["Comune di Abano Terme", "Comune di Airasca", "Comune di Milano o di Verona?", 1001])
        >>> s.italy_geopop.smart_from_municipality(return_cols='municipality')
        0    Abano Terme
        1        Airasca
        2            NaN
        3          Agliè
        Name: municipality, dtype: object

    :param return_cols: forwarded unchanged to ``handle_return_cols``.
    :return: the looked-up data, post-processed by ``handle_return_cols``.
    :rtype: pd.DataFrame | pd.Series
    """
    municipality_dfs = self._generate_municipality_dfs(
        self.italy_geopop_df, include_geometry=self.include_geometry
    )
    by_name = municipality_dfs[0]

    # First pass: the regular lookup; remember which inputs it left unresolved.
    exact = self.from_municipality()
    unresolved_mask = exact.municipality.isnull()
    unresolved = self._obj[exact.loc[unresolved_mask].index].copy()

    fallback_row = self._generate_empty_serie(by_name.columns.to_list())
    rows_by_name = {label: row for label, row in by_name.iterrows()}
    candidate_names = rows_by_name.keys()

    def _lookup(value):
        text = str(value).strip().lower()
        found = match_single_word(candidate_names, text)
        # ``found`` is None unless exactly one name matched -> fallback row.
        return rows_by_name.get(found, fallback_row)

    # Memoise so repeated input values are only scanned once.
    lookup = cache(_lookup)

    # Second pass: fill only the gaps left by the first pass.
    filled = exact.fillna(unresolved.apply(lookup))

    return handle_return_cols(filled, return_cols=return_cols)

def smart_from_province(self, return_cols=None) -> pd.DataFrame | pd.Series:
"""Same as ``from_provinces`` but can understand more complex text. Values are returned only if match is unequivocal.
"""Same as ``from_province`` but can understand more complex text. Values are returned only if match is unequivocal.
.. code-block:: python
Expand Down
Loading

0 comments on commit 4265d8a

Please sign in to comment.