Skip to content

Commit

Permalink
fix: NA county_code is valid, not Nan
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessandroLorenzi committed May 4, 2023
1 parent 3e3235b commit 59022aa
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 3 deletions.
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"."
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
48 changes: 45 additions & 3 deletions pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,27 @@
"ZA",
]

# Strings that should be parsed as missing values when the GeoNames data is
# read with ``pd.read_csv(..., na_values=NA_VALUES)``.
#
# The bare string "NA" is deliberately left out: it is a real county_code
# (Napoli, Italy) and must be kept as text rather than turned into NaN.
#
# NOTE(review): pandas appends ``na_values`` to its built-in defaults (which
# include "NA") unless ``keep_default_na=False`` is passed as well -- confirm
# that every read_csv call using this list also sets that flag.
NA_VALUES = [
    # empty field
    "",
    # "#"-prefixed markers
    "#N/A", "#N/A N/A", "#NA",
    # signed / prefixed not-a-number spellings
    "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
    # textual markers ("NA" intentionally absent, see above)
    "<NA>", "N/A", "NULL", "NaN", "n/a", "nan", "null",
]


@contextlib.contextmanager
def _open_extract_url(url: str, country: str) -> Any:
Expand Down Expand Up @@ -231,22 +252,41 @@ def _get_data(country: str) -> Tuple[str, pd.DataFrame]:

data_path = os.path.join(STORAGE_DIR, country.upper() + ".txt")
if os.path.exists(data_path):
data = pd.read_csv(data_path, dtype={"postal_code": str})
data = pd.read_csv(
data_path, dtype={"postal_code": str}, na_values=NA_VALUES
)
else:
download_urls = [
val.format(country=country) for val in DOWNLOAD_URL
]
with _open_extract_cycle_url(download_urls, country) as fh:
# TODO: remove this debugging note once the tests pass.
# In the raw download, county_code (zero-based column 6) holds the string "NA":
# (Pdb) tmp = os.open("/tmp/it.txt")
# (Pdb) tmp.write(str(fh.read()))
# ...
# # cat /tmp/it.txt | sed 's/\\n/\n/g' | grep 80023
# IT\t80023\tCaivano\tCampania\t04\tNapoli\tNA\t\t\t40.9575\t14.3059\t4
# IT\t80023\tPascarola\tCampania\t04\tNapoli\tNA\t\t\t40.9764\t14.305\

# breakpoint()
data = pd.read_csv(
fh,
sep="\t",
header=None,
names=DATA_FIELDS,
dtype={"postal_code": str},
na_values=NA_VALUES,
)
# breakpoint()
# TODO: remove this debugging note once the tests pass.
# After parsing, county_code has become NaN instead of the string "NA":
# (Pdb) data[data["postal_code"]=="80023"]
# country_code postal_code place_name state_name state_code county_name county_code community_name community_code latitude longitude accuracy
# 2363 IT 80023 Caivano Campania 4 Napoli NaN NaN NaN 40.9575 14.3059 4.0
# 2364 IT 80023 Pascarola Campania 4 Napoli NaN NaN NaN 40.9764 14.3050 4.0
os.makedirs(STORAGE_DIR, exist_ok=True)
data.to_csv(data_path, index=None)

return data_path, data

def _index_postal_codes(self) -> pd.DataFrame:
Expand All @@ -255,7 +295,9 @@ def _index_postal_codes(self) -> pd.DataFrame:

if os.path.exists(data_path_unique):
data_unique = pd.read_csv(
data_path_unique, dtype={"postal_code": str}
data_path_unique,
dtype={"postal_code": str},
na_values=NA_VALUES,
)
else:
# group together places with the same postal code
Expand Down
8 changes: 8 additions & 0 deletions test_pgeocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,14 @@ def test_query_location_exact():
assert res["state_name"].unique().tolist() == ["Île-de-France"]


def test_location_naples():
    """Check that the county code "NA" for Napoli is kept as a string.

    Regression test for https://github.com/symerio/pgeocode/issues/73
    """
    geocoder = Nominatim("it")
    matches = geocoder.query_location("Napoli")

    county_names = matches["county_name"].unique().tolist()
    assert county_names == ["Napoli"]

    # "NA" must come back as text, not as a missing value.
    county_codes = matches["county_code"].unique().tolist()
    assert county_codes == ["NA"]


def test_query_location_fuzzy():
pytest.importorskip("thefuzz")
nomi = Nominatim("fr")
Expand Down

0 comments on commit 59022aa

Please sign in to comment.