diff --git a/environment.yaml b/environment.yaml index a9899d46..20543743 100644 --- a/environment.yaml +++ b/environment.yaml @@ -27,6 +27,7 @@ dependencies: - pytest - pytest-cov - pre-commit + - numba - pip: - geobr - facets-overview diff --git a/pysus/Notebooks/Analyzing SIA.ipynb b/pysus/Notebooks/Analyzing SIA.ipynb new file mode 100644 index 00000000..cde70703 --- /dev/null +++ b/pysus/Notebooks/Analyzing SIA.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.9/dist-packages/geopandas/_compat.py:111: UserWarning: The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.10.0-CAPI-1.16.0). Conversions between both will be slow.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from ftplib import FTP\n", + "import os\n", + "import pandas as pd\n", + "from pysus.online_data.SIA import download, show_datatypes\n", + "from pysus.utilities.readdbc import dbf_to_csvgz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Dataset types\n", + "The SIA Information system contains multiple types of datasets we can download with PySUS. These are:\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'AB': ('APAC de Cirurgia Bariátrica', 1, 2008),\n", + " 'ACF': ('APAC de Confecção de Fístula', 1, 2008),\n", + " 'AD': ('APAC de Laudos Diversos', 1, 2008),\n", + " 'AM': ('APAC de Medicamentos', 1, 2008),\n", + " 'AMP': ('APAC de Acompanhamento Multiprofissional', 1, 2008),\n", + " 'AN': ('APAC de Nefrologia', 1, 2008),\n", + " 'AQ': ('APAC de Quimioterapia', 1, 2008),\n", + " 'AR': ('APAC de Radioterapia', 1, 2008),\n", + " 'ATD': ('APAC de Tratamento Dialítico', 1, 2008),\n", + " 'BI': ('Boletim de Produção Ambulatorial individualizado', 1, 2008),\n", + " 'PA': ('Produção Ambulatorial', 7, 1994),\n", + " 'PS': ('RAAS Psicossocial', 1, 2008),\n", + " 'SAD': ('RAAS de Atenção Domiciliar', 1, 2008)}\n" + ] + } + ], + "source": [ + "show_datatypes()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading PASP2012a.dbc...\n", + "Downloading PASP2012b.dbc...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Success\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", + "Input \u001B[0;32mIn [4]\u001B[0m, in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0m dfSP \u001B[38;5;241m=\u001B[39m \u001B[43mdownload\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mSP\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m2020\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m 
\u001B[49m\u001B[38;5;241;43m12\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgroup\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mPA\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m~/Documentos/Software_projects/PySUS/pysus/online_data/SIA.py:116\u001B[0m, in \u001B[0;36mdownload\u001B[0;34m(state, year, month, cache, group)\u001B[0m\n\u001B[1;32m 114\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 115\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 116\u001B[0m df \u001B[38;5;241m=\u001B[39m \u001B[43m_fetch_file\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfname\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mftp\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mftype\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 117\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m cache \u001B[38;5;129;01mand\u001B[39;00m df: \u001B[38;5;66;03m# saves to cache if df is not None\u001B[39;00m\n\u001B[1;32m 118\u001B[0m df\u001B[38;5;241m.\u001B[39mto_parquet(cachefile)\n", + "File \u001B[0;32m~/Documentos/Software_projects/PySUS/pysus/online_data/SIA.py:149\u001B[0m, in \u001B[0;36m_fetch_file\u001B[0;34m(fname, ftp, ftype)\u001B[0m\n\u001B[1;32m 147\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDownloading \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfn\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m...\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 148\u001B[0m fobj \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mopen\u001B[39m(fnfull, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mwb\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 149\u001B[0m \u001B[43mftp\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mretrbinary\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43mf\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mRETR \u001B[39;49m\u001B[38;5;132;43;01m{\u001B[39;49;00m\u001B[43mfn\u001B[49m\u001B[38;5;132;43;01m}\u001B[39;49;00m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mfobj\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mwrite\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 150\u001B[0m dbc2dbf(fnfull, fnfull\u001B[38;5;241m.\u001B[39mreplace(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m.dbc\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m.dbf\u001B[39m\u001B[38;5;124m'\u001B[39m))\n\u001B[1;32m 151\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m exc:\n", + "File \u001B[0;32m/usr/lib/python3.9/ftplib.py:445\u001B[0m, in \u001B[0;36mFTP.retrbinary\u001B[0;34m(self, cmd, callback, blocksize, rest)\u001B[0m\n\u001B[1;32m 443\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m _SSLSocket \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(conn, _SSLSocket):\n\u001B[1;32m 444\u001B[0m conn\u001B[38;5;241m.\u001B[39munwrap()\n\u001B[0;32m--> 445\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mvoidresp\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n", + "File \u001B[0;32m/usr/lib/python3.9/ftplib.py:259\u001B[0m, in \u001B[0;36mFTP.voidresp\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 
257\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mvoidresp\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 258\u001B[0m \u001B[38;5;124;03m\"\"\"Expect a response beginning with '2'.\"\"\"\u001B[39;00m\n\u001B[0;32m--> 259\u001B[0m resp \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetresp\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 260\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m resp[:\u001B[38;5;241m1\u001B[39m] \u001B[38;5;241m!=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m2\u001B[39m\u001B[38;5;124m'\u001B[39m:\n\u001B[1;32m 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m error_reply(resp)\n", + "File \u001B[0;32m/usr/lib/python3.9/ftplib.py:244\u001B[0m, in \u001B[0;36mFTP.getresp\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 243\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgetresp\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[0;32m--> 244\u001B[0m resp \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetmultiline\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 245\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdebugging:\n\u001B[1;32m 246\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m*resp*\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39msanitize(resp))\n", + "File \u001B[0;32m/usr/lib/python3.9/ftplib.py:230\u001B[0m, in \u001B[0;36mFTP.getmultiline\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 229\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgetmultiline\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[0;32m--> 230\u001B[0m line \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetline\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 231\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m line[\u001B[38;5;241m3\u001B[39m:\u001B[38;5;241m4\u001B[39m] \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m-\u001B[39m\u001B[38;5;124m'\u001B[39m:\n\u001B[1;32m 232\u001B[0m code \u001B[38;5;241m=\u001B[39m line[:\u001B[38;5;241m3\u001B[39m]\n", + "File \u001B[0;32m/usr/lib/python3.9/ftplib.py:212\u001B[0m, in \u001B[0;36mFTP.getline\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 211\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mgetline\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[0;32m--> 212\u001B[0m line \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfile\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mreadline\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmaxline\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m+\u001B[39;49m\u001B[43m \u001B[49m\u001B[38;5;241;43m1\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 213\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(line) \u001B[38;5;241m>\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmaxline:\n\u001B[1;32m 214\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m Error(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mgot more than \u001B[39m\u001B[38;5;132;01m%d\u001B[39;00m\u001B[38;5;124m bytes\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m%\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmaxline)\n", + "File 
\u001B[0;32m/usr/lib/python3.9/socket.py:704\u001B[0m, in \u001B[0;36mSocketIO.readinto\u001B[0;34m(self, b)\u001B[0m\n\u001B[1;32m 702\u001B[0m \u001B[38;5;28;01mwhile\u001B[39;00m \u001B[38;5;28;01mTrue\u001B[39;00m:\n\u001B[1;32m 703\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 704\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_sock\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrecv_into\u001B[49m\u001B[43m(\u001B[49m\u001B[43mb\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 705\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m timeout:\n\u001B[1;32m 706\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_timeout_occurred \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mTrue\u001B[39;00m\n", + "\u001B[0;31mKeyboardInterrupt\u001B[0m: " + ] + } + ], + "source": [ + "dfSP = download('SP', 2020, 12, group=['PA'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'dfSP' is not defined", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", + "Input \u001B[0;32mIn [5]\u001B[0m, in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mdfSP\u001B[49m\u001B[38;5;241m.\u001B[39mhead()\n", + "\u001B[0;31mNameError\u001B[0m: name 'dfSP' is not defined" + ] + } + ], + "source": [ + "dfSP.head()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "When a download is split in multiple files as in the case above, PySUS downloads the dbfs directly to the cache path, Without loading the dataframe to memory." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Converting: 2641548it [07:10, 6138.62it/s]\n" + ] + } + ], + "source": [ + "from pysus.online_data import CACHEPATH\n", + "import os\n", + "\n", + "dbf_to_csvgz(os.path.join(CACHEPATH, 'PASP2012a.dbf'))" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The number of lines in this CSV is still very large and loading it entirely into memory is not a good Idea.\n", + "\n", + "But now that it is on a CSV file you can load just a limited number of lines from it as shown below:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": " PA_CODUNI PA_GESTAO PA_CONDIC PA_UFMUN PA_REGCT PA_INCOUT PA_INCURG \\\n0 2080273 350000 EP 354780 7101 0 0 \n1 2080273 350000 EP 354780 7101 0 0 \n2 2080273 350000 EP 354780 7101 0 0 \n3 2080338 350000 EP 351880 7101 0 0 \n4 2080338 350000 EP 351880 7101 0 0 \n5 2080273 350000 EP 354780 7101 0 0 \n6 2080273 350000 EP 354780 7101 0 0 \n7 2080273 350000 EP 354780 7101 0 0 \n8 2090236 350000 EP 350550 7101 0 0 \n9 2090236 350000 EP 350550 7101 0 0 \n\n PA_TPUPS PA_TIPPRE PA_MN_IND ... PA_CODOCO PA_FLQT PA_FLER PA_ETNIA \\\n0 5 0 M ... 1 K 0 NaN \n1 5 0 M ... 1 K 0 NaN \n2 5 0 M ... 1 K 0 NaN \n3 5 0 M ... 1 K 0 NaN \n4 5 0 M ... 1 K 0 NaN \n5 5 0 M ... 1 K 0 NaN \n6 5 0 M ... 1 K 0 NaN \n7 5 0 M ... 1 K 0 NaN \n8 7 0 I ... 1 K 0 NaN \n9 7 0 I ... 
1 K 0 NaN \n\n PA_VL_CF PA_VL_CL PA_VL_INC PA_SRV_C PA_INE PA_NAT_JUR \n0 0.0 0.0 0.0 NaN NaN 1023 \n1 0.0 0.0 0.0 NaN NaN 1023 \n2 0.0 0.0 0.0 NaN NaN 1023 \n3 0.0 0.0 0.0 121001.0 NaN 1023 \n4 0.0 0.0 0.0 121001.0 NaN 1023 \n5 0.0 0.0 0.0 NaN NaN 1023 \n6 0.0 0.0 0.0 NaN NaN 1023 \n7 0.0 0.0 0.0 NaN NaN 1023 \n8 0.0 0.0 0.0 120002.0 NaN 3069 \n9 0.0 0.0 0.0 120002.0 NaN 3069 \n\n[10 rows x 60 columns]", + "text/html": "
[HTML table rendering omitted: same 10 rows × 60 columns as the text/plain output above]
" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn = os.path.join(CACHEPATH, 'PASP2012a.csv.gz')\n", + "df = pd.read_csv(fn, nrows=10)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Another approach, if you need to analyze the entire dataset, is to process it in chunks." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "chunks = pd.read_csv(fn, iterator=True, chunksize=1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": " PA_CODUNI PA_GESTAO PA_CONDIC PA_UFMUN PA_REGCT PA_INCOUT PA_INCURG \\\n0 2080273 350000 EP 354780 7101 0 0 \n1 2080273 350000 EP 354780 7101 0 0 \n2 2080273 350000 EP 354780 7101 0 0 \n3 2080338 350000 EP 351880 7101 0 0 \n4 2080338 350000 EP 351880 7101 0 0 \n.. ... ... ... ... ... ... ... \n995 2705982 350000 EP 351620 7101 0 0 \n996 2716305 350000 EP 354980 7103 0 0 \n997 2705982 350000 EP 351620 7101 0 0 \n998 2705982 350000 EP 351620 7101 0 0 \n999 2705982 350000 EP 351620 7103 0 0 \n\n PA_TPUPS PA_TIPPRE PA_MN_IND ... PA_CODOCO PA_FLQT PA_FLER \\\n0 5 0 M ... 1 K 0 \n1 5 0 M ... 1 K 0 \n2 5 0 M ... 1 K 0 \n3 5 0 M ... 1 K 0 \n4 5 0 M ... 1 K 0 \n.. ... ... ... ... ... ... ... \n995 5 0 I ... 1 R 0 \n996 80 0 M ... 1 K 0 \n997 5 0 I ... 1 K 0 \n998 5 0 I ... 1 K 0 \n999 5 0 I ... 1 R 0 \n\n PA_ETNIA PA_VL_CF PA_VL_CL PA_VL_INC PA_SRV_C PA_INE PA_NAT_JUR \n0 NaN 0.0 0.0 0.0 NaN NaN 1023 \n1 NaN 0.0 0.0 0.0 NaN NaN 1023 \n2 NaN 0.0 0.0 0.0 NaN NaN 1023 \n3 NaN 0.0 0.0 0.0 121001.0 NaN 1023 \n4 NaN 0.0 0.0 0.0 121001.0 NaN 1023 \n.. ... ... ... ... ... ... ... \n995 NaN 0.0 0.0 0.0 NaN NaN 3069 \n996 NaN 0.0 0.0 0.0 145003.0 NaN 1023 \n997 NaN 0.0 0.0 0.0 NaN NaN 3069 \n998 NaN 0.0 0.0 0.0 NaN NaN 3069 \n999 NaN 0.0 0.0 0.0 NaN NaN 3069 \n\n[1000 rows x 60 columns]", + "text/html": "
[HTML table rendering omitted: same 1000 rows × 60 columns as the text/plain output above]
" + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for df in chunks:\n", + " break\n", + "\n", + "df\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Now that you have converted the `.dbf` to a `.csv.gz` file, you can safely delete the dbf file to save storage space on your computer.\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/pysus/online_data/SIA.py b/pysus/online_data/SIA.py index 864d0263..47f9f97c 100644 --- a/pysus/online_data/SIA.py +++ b/pysus/online_data/SIA.py @@ -12,12 +12,13 @@ from datetime import date from ftplib import FTP from typing import Dict, List, Optional, Tuple, Union +from pprint import pprint import pandas as pd from dbfread import DBF from pysus.online_data import CACHEPATH -from pysus.utilities.readdbc import read_dbc +from pysus.utilities.readdbc import read_dbc, read_dbc_dbf, dbc2dbf group_dict: Dict[str, Tuple[str, int, int]] = { "PA": ("Produção Ambulatorial", 7, 1994), @@ -35,13 +36,15 @@ "PS": ("RAAS Psicossocial", 1, 2008), } +def show_datatypes(): + pprint(group_dict) def download( - state: str, - year: int, - month: int, - cache: bool = True, - group: Union[str, List[str]] = ["PA", "BI"], + state: str, + year: int, + month: int, + cache: bool = True, + group: Union[str, List[str]] = ["PA", "BI"], ) -> Union[Optional[pd.DataFrame], Tuple[Optional[pd.DataFrame], ...]]: """ Download SIASUS records for state year and month and returns dataframe @@ -111,13 +114,13 @@ def download( else: try: df = _fetch_file(fname, ftp, ftype) - if cache: # saves to cache + if cache and df: # saves to cache if df is not None df.to_parquet(cachefile) except Exception as e: df = None print(e) - - dfs.append(df) + if df is not None: + dfs.append(df) if len(dfs) == 1: return dfs[0] @@ -133,18 +136,50 @@ def _fetch_file(fname, ftp, ftype): :param ftype: file type: DBF|DBC :return: pandas dataframe """ - print(f"Downloading {fname}...") - try: - ftp.retrbinary(f"RETR {fname}", open(fname, "wb").write) - except: - try: - ftp.retrbinary(f"RETR {fname.lower()}", open(fname, "wb").write) - except: - raise Exception(f"File {fname} not available") - if ftype == "DBC": - df = read_dbc(fname, encoding="iso-8859-1") - elif ftype == "DBF": - dbf = DBF(fname, encoding="iso-8859-1") - df = pd.DataFrame(list(dbf)) + + multiples = False + fnames = check_file_split(fname, ftp) + + multiples = len(fnames) > 1 + + if multiples: + download_multiples(fnames, ftp) + print(f"This download is split into the following files: {fnames}\n" + f"They have been downloaded in {CACHEPATH}.\n" + f"To load them, use the pysus.utilities.read_dbc_dbf function.") + return + df = read_dbc_dbf(fname) + os.unlink(fname) return df + + +def download_multiples(fnames, ftp): + for fn in fnames: + fnfull = os.path.join(CACHEPATH, fn) + print(f"Downloading {fn}...") + fobj = open(fnfull, 
"wb") + try: + ftp.retrbinary(f"RETR {fn}", fobj.write) + dbc2dbf(fnfull, fnfull.replace('.dbc', '.dbf')) + os.unlink(fnfull) + except Exception as exc: + raise Exception(f"Retrieval of file {fn} failed with the following error:\n {exc}") + + +def check_file_split(fname: str, ftp: FTP) -> list: + """ + Check for split filenames. Sometimes when files are too large, they are split into multiple files ending in a, b, c, ... + :param fname: filename + :param ftp: ftp conection + :return: list + """ + files = [] + flist = ftp.nlst() + if fname not in flist: + for l in ['a', 'b', 'c', 'd']: + nm, ext = fname.split('.') + if f'{nm}{l}.{ext}' in flist: + files.append(f'{nm}{l}.{ext}') + + return files diff --git a/pysus/online_data/vaccine.py b/pysus/online_data/vaccine.py index 0cfb6b4a..17fa4b6a 100644 --- a/pysus/online_data/vaccine.py +++ b/pysus/online_data/vaccine.py @@ -38,6 +38,7 @@ def download_covid(uf=None): "loading from cache. Returning an iterator of Dataframes in chunks of 5000." ) return pd.read_csv(tempfile, chunksize=5000) + auth = HTTPBasicAuth(user, pwd) data_gen = elasticsearch_fetch(url, auth, query) diff --git a/pysus/tests/test_SIA.py b/pysus/tests/test_SIA.py new file mode 100644 index 00000000..2ba0b20d --- /dev/null +++ b/pysus/tests/test_SIA.py @@ -0,0 +1,24 @@ +import unittest +from ftplib import FTP +import pandas as pd +from pysus.online_data.SIA import download, check_file_split + +class SIATestCase(unittest.TestCase): + def test_check_split_filenames(self): + ftp = FTP("ftp.datasus.gov.br") + ftp.login() + ftp.cwd("/dissemin/publicos/SIASUS/200801_/Dados") + names = check_file_split('PASP2012.dbc', ftp) + assert len(names) == 3 + assert 'PASP2012b.dbc' in names + + def test_download_large_PA(self): + res = download('SP', 2020, 12, group=['PA']) + if isinstance(res, pd.DataFrame): + assert not res.empty + else: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/pysus/utilities/readdbc.py b/pysus/utilities/readdbc.py index e9bb1a71..0f5f7d88 100644 --- a/pysus/utilities/readdbc.py +++ b/pysus/utilities/readdbc.py @@ -4,6 +4,9 @@ license: GPL V3 or Later """ import os +import csv +import gzip +from tqdm import tqdm from io import BytesIO from tempfile import NamedTemporaryFile @@ -78,3 +81,30 @@ def read_dbc_geopandas(filename, encoding="utf-8"): os.unlink(tf.name) return df + +def read_dbc_dbf(filename: str): + if filename.endswith(('dbc', 'DBC')): + df = read_dbc(filename, encoding="iso-8859-1") + elif filename.endswith(("DBF", "dbf")): + dbf = DBF(filename, encoding="iso-8859-1") + df = pd.DataFrame(list(dbf)) + return df + +def dbf_to_csvgz(filename: str, encoding: str='iso-8859-1'): + """ + Streams a dbf file to gzipped CSV file. The Gzipped csv will be saved on the same path but with a csv.gz extension. + :param filename: path to the dbf file + """ + data = DBF(filename, encoding=encoding, raw=False) + fn = os.path.splitext(filename)[0] + '.csv.gz' + + with gzip.open(fn, 'wt') as gzf: + for i, d in tqdm(enumerate(data), desc='Converting',): + if i == 0: + csvwriter = csv.DictWriter(gzf, fieldnames=d.keys()) + csvwriter.writeheader() + csvwriter.writerow(d) + else: + csvwriter.writerow(d) + +