
Merge pull request #68 from fccoelho/optimize_SIA_Download
Optimize SIA download
fccoelho authored Feb 3, 2022
2 parents 5691758 + 26133ea commit bf86aca
Showing 6 changed files with 415 additions and 22 deletions.
1 change: 1 addition & 0 deletions environment.yaml
@@ -27,6 +27,7 @@ dependencies:
  - pytest
  - pytest-cov
  - pre-commit
+  - numba
  - pip:
      - geobr
      - facets-overview
302 changes: 302 additions & 0 deletions pysus/Notebooks/Analyzing SIA.ipynb

Large diffs are not rendered by default.

79 changes: 57 additions & 22 deletions pysus/online_data/SIA.py
@@ -12,12 +12,13 @@
from datetime import date
from ftplib import FTP
from typing import Dict, List, Optional, Tuple, Union
+from pprint import pprint

import pandas as pd
from dbfread import DBF

from pysus.online_data import CACHEPATH
-from pysus.utilities.readdbc import read_dbc
+from pysus.utilities.readdbc import read_dbc, read_dbc_dbf, dbc2dbf

group_dict: Dict[str, Tuple[str, int, int]] = {
"PA": ("Produção Ambulatorial", 7, 1994),
@@ -35,13 +36,15 @@
"PS": ("RAAS Psicossocial", 1, 2008),
}

+def show_datatypes():
+    pprint(group_dict)

def download(
    state: str,
    year: int,
    month: int,
    cache: bool = True,
    group: Union[str, List[str]] = ["PA", "BI"],
) -> Union[Optional[pd.DataFrame], Tuple[Optional[pd.DataFrame], ...]]:
"""
    Download SIASUS records for a given state, year and month and return a dataframe
@@ -111,13 +114,13 @@ def download(
else:
try:
df = _fetch_file(fname, ftp, ftype)
-            if cache:  # saves to cache
+            if cache and df is not None:  # save to cache only when a dataframe was returned
df.to_parquet(cachefile)
except Exception as e:
df = None
print(e)

-        dfs.append(df)
+        if df is not None:
+            dfs.append(df)

if len(dfs) == 1:
return dfs[0]
@@ -133,18 +136,50 @@ def _fetch_file(fname, ftp, ftype):
:param ftype: file type: DBF|DBC
:return: pandas dataframe
"""
-    print(f"Downloading {fname}...")
-    try:
-        ftp.retrbinary(f"RETR {fname}", open(fname, "wb").write)
-    except:
-        try:
-            ftp.retrbinary(f"RETR {fname.lower()}", open(fname, "wb").write)
-        except:
-            raise Exception(f"File {fname} not available")
-    if ftype == "DBC":
-        df = read_dbc(fname, encoding="iso-8859-1")
-    elif ftype == "DBF":
-        dbf = DBF(fname, encoding="iso-8859-1")
-        df = pd.DataFrame(list(dbf))

+    fnames = check_file_split(fname, ftp)
+    multiples = len(fnames) > 1
+
+    if multiples:
+        download_multiples(fnames, ftp)
+        print(
+            f"This download is split into the following files: {fnames}\n"
+            f"They have been downloaded in {CACHEPATH}.\n"
+            f"To load them, use the pysus.utilities.read_dbc_dbf function."
+        )
+        return
+    # fetch the single file from the FTP server, then read it into a dataframe
+    with open(fname, "wb") as fobj:
+        ftp.retrbinary(f"RETR {fname}", fobj.write)
+    df = read_dbc_dbf(fname)
+
+    os.unlink(fname)
+    return df


+def download_multiples(fnames, ftp):
+    for fn in fnames:
+        fnfull = os.path.join(CACHEPATH, fn)
+        print(f"Downloading {fn}...")
+        try:
+            with open(fnfull, "wb") as fobj:
+                ftp.retrbinary(f"RETR {fn}", fobj.write)
+            dbc2dbf(fnfull, fnfull.replace('.dbc', '.dbf'))
+            os.unlink(fnfull)
+        except Exception as exc:
+            raise Exception(f"Retrieval of file {fn} failed with the following error:\n {exc}") from exc


+def check_file_split(fname: str, ftp: FTP) -> list:
+    """
+    Check for split filenames. When a file is too large, it is split into
+    multiple files with suffixes a, b, c, ...
+    :param fname: filename
+    :param ftp: FTP connection
+    :return: list of part filenames (empty if the file is not split)
+    """
+    files = []
+    flist = ftp.nlst()
+    if fname not in flist:
+        nm, ext = fname.split('.')
+        for suffix in ['a', 'b', 'c', 'd']:
+            if f'{nm}{suffix}.{ext}' in flist:
+                files.append(f'{nm}{suffix}.{ext}')
+
+    return files
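
A minimal usage sketch of the new download flow (the state/year/month values below are illustrative, and a live connection to ftp.datasus.gov.br is assumed):

from pysus.online_data.SIA import download, show_datatypes

# List the available SIA/SIASUS file groups (PA, BI, PS, ...)
show_datatypes()

# A small state/month returns a single DataFrame (or a tuple, one per group)
df = download("AC", 2020, 6, group=["PA"])
if df is not None:
    print(df.shape)

# Very large downloads (e.g. SP) are split on the FTP server; the parts are
# downloaded and converted to DBF files under CACHEPATH rather than returned.
download("SP", 2020, 12, group=["PA"])
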
1 change: 1 addition & 0 deletions pysus/online_data/vaccine.py
@@ -38,6 +38,7 @@ def download_covid(uf=None):
"loading from cache. Returning an iterator of Dataframes in chunks of 5000."
)
return pd.read_csv(tempfile, chunksize=5000)

auth = HTTPBasicAuth(user, pwd)
data_gen = elasticsearch_fetch(url, auth, query)

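For reference, a consumer sketch for the chunked return path (a hedged example: it assumes the dataset is already cached, so that download_covid returns the chunked iterator; the uf value is illustrative):

from pysus.online_data.vaccine import download_covid

# When served from cache, download_covid returns pd.read_csv(..., chunksize=5000),
# i.e. an iterator of DataFrames rather than a single frame
total_rows = 0
for chunk in download_covid(uf="BA"):
    total_rows += len(chunk)
print(f"Total records: {total_rows}")
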
24 changes: 24 additions & 0 deletions pysus/tests/test_SIA.py
@@ -0,0 +1,24 @@
import unittest
from ftplib import FTP
import pandas as pd
from pysus.online_data.SIA import download, check_file_split

class SIATestCase(unittest.TestCase):
    def test_check_split_filenames(self):
        ftp = FTP("ftp.datasus.gov.br")
        ftp.login()
        ftp.cwd("/dissemin/publicos/SIASUS/200801_/Dados")
        names = check_file_split('PASP2012.dbc', ftp)
        assert len(names) == 3
        assert 'PASP2012b.dbc' in names

    def test_download_large_PA(self):
        res = download('SP', 2020, 12, group=['PA'])
        if isinstance(res, pd.DataFrame):
            assert not res.empty


if __name__ == '__main__':
unittest.main()
30 changes: 30 additions & 0 deletions pysus/utilities/readdbc.py
@@ -4,6 +4,9 @@
license: GPL V3 or Later
"""
import os
+import csv
+import gzip
+from tqdm import tqdm
from io import BytesIO
from tempfile import NamedTemporaryFile

@@ -78,3 +81,30 @@ def read_dbc_geopandas(filename, encoding="utf-8"):
os.unlink(tf.name)

return df

+def read_dbc_dbf(filename: str):
+    if filename.endswith(('dbc', 'DBC')):
+        df = read_dbc(filename, encoding='iso-8859-1')
+    elif filename.endswith(('DBF', 'dbf')):
+        dbf = DBF(filename, encoding='iso-8859-1')
+        df = pd.DataFrame(list(dbf))
+    else:
+        raise ValueError(f'Unknown file type for {filename}; expected a .dbc or .dbf file')
+    return df

+def dbf_to_csvgz(filename: str, encoding: str = 'iso-8859-1'):
+    """
+    Streams a DBF file into a gzipped CSV file, saved on the same path
+    but with a .csv.gz extension.
+    :param filename: path to the dbf file
+    :param encoding: character encoding of the DBF file
+    """
+    data = DBF(filename, encoding=encoding, raw=False)
+    fn = os.path.splitext(filename)[0] + '.csv.gz'
+
+    with gzip.open(fn, 'wt') as gzf:
+        csvwriter = None
+        for d in tqdm(data, desc='Converting'):
+            if csvwriter is None:  # build the writer from the first record's field names
+                csvwriter = csv.DictWriter(gzf, fieldnames=d.keys())
+                csvwriter.writeheader()
+            csvwriter.writerow(d)
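
A sketch of post-processing with the new readdbc helpers (the PASP2012 filename prefix is illustrative; actual part names depend on what was downloaded):

import os
import pandas as pd
from pysus.online_data import CACHEPATH
from pysus.utilities.readdbc import read_dbc_dbf, dbf_to_csvgz

# Concatenate every DBF part of a split download into one DataFrame
parts = sorted(
    os.path.join(CACHEPATH, f)
    for f in os.listdir(CACHEPATH)
    if f.startswith("PASP2012") and f.endswith(".dbf")
)
df = pd.concat((read_dbc_dbf(p) for p in parts), ignore_index=True)

# Alternatively, stream one part to a compressed CSV without holding it in memory
dbf_to_csvgz(parts[0])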

