diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e43a8f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,146 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+**__pycache__
+
+# PyCharm
+.idea/
+
+# RStudio project files
+**.Rproj.user/
+**.Rproj.user*
+**.Rproj
+**.Rhistory
+
+# MacOS
+.DS_Store
+
+
+# All Data Files
+*.csv
\ No newline at end of file
diff --git a/README.MD b/README.MD
new file mode 100644
index 0000000..fcacb47
--- /dev/null
+++ b/README.MD
@@ -0,0 +1,33 @@
+# dataio
+
+DataIO is a Python package designed to simplify data access within the Data Science Innovation Hub at ARTPARK.
+
+## Installation
+
+You can install DataIO using [Poetry](https://python-poetry.org/) and Git SSH by running the following command:
+
+```bash
+poetry add git+ssh://git@github.com/dsih-artpark/dataio.git#v0.1.0
+```
+
+## Usage
+```python
+from dataio.download import fetch_data_documentation, download_dataset_v2

+# Example usage of fetch_data_documentation to get metadata
+metadata, datadict = fetch_data_documentation(dsid="your_dataset_id")
+
+# Example usage of download_dataset_v2 to download the dataset
+download_dataset_v2(dsid="your_dataset_id", data_state="standardised")
+```
+The ```download_dataset_v2``` function offers more advanced functionality for downloading specific files based on your needs; a short sketch is given under Advanced usage below. Read about it [here][download_dataset_v2].
+
+## Version
+Current version: v0.1.0
+dataio is in limited alpha release.
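+
+## Advanced usage
+
+A minimal sketch of the filtering options accepted by ```download_dataset_v2``` (the dataset ID and filter values below are illustrative placeholders; see the function's docstring for the full list of parameters):
+
+```python
+from dataio.download import download_dataset_v2
+
+# Download only the CSV files whose names contain "2023",
+# and fetch the metadata and data dictionary alongside the data.
+download_dataset_v2(
+    dsid="your_dataset_id",
+    data_state="standardised",
+    contains_any="2023",
+    suffixes=".csv",
+    fetch_docs=True,
+    verbose=True,
+)
+```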
+ + + + + +[download_dataset_v2]: src/dataio/download/__init__.py#L212 \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..57d3a57 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,302 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "boto3" +version = "1.34.64" +description = "The AWS SDK for Python" +optional = false +python-versions = ">= 3.8" +files = [ + {file = "boto3-1.34.64-py3-none-any.whl", hash = "sha256:8c6fbd3d45399a4e4685010117fb2dc52fc6afdab5a9460957d463ae0c2cc55d"}, + {file = "boto3-1.34.64.tar.gz", hash = "sha256:e5d681f443645e6953ed0727bf756bf16d85efefcb69cf051d04a070ce65e545"}, +] + +[package.dependencies] +botocore = ">=1.34.64,<1.35.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.10.0,<0.11.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.34.64" +description = "Low-level, data-driven core of boto 3." +optional = false +python-versions = ">= 3.8" +files = [ + {file = "botocore-1.34.64-py3-none-any.whl", hash = "sha256:0ab760908749fe82325698591c49755a5bb20307d85a419aca9cc74e783b9407"}, + {file = "botocore-1.34.64.tar.gz", hash = "sha256:084f8c45216d62dc1add2350e236a2d5283526aacd0681e9818b37a6a5e5438b"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} + +[package.extras] +crt = ["awscrt (==0.19.19)"] + +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + +[[package]] +name = "numpy" +version = "1.26.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, + {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4"}, + {file = "numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a"}, + {file = "numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2"}, + {file = "numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07"}, + {file = "numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71"}, + {file = "numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e"}, + {file = "numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a"}, + {file = "numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a"}, + {file = "numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20"}, + {file = "numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218"}, + {file = "numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b"}, + {file = "numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a"}, + {file = "numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0"}, + {file = "numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110"}, + {file = "numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c"}, + {file = "numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764"}, + {file = "numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd"}, + {file = "numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c"}, + {file = "numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6"}, + {file = "numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30"}, + {file = 
"numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c"}, + {file = "numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0"}, + {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, +] + +[[package]] +name = "pandas" +version = "2.2.1" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9"}, + {file = "pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7"}, + {file = "pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc"}, + {file = 
"pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89"}, + {file = "pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be"}, + {file = "pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab"}, + {file = "pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.23.2,<2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0,<2", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = 
["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash 
= "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = 
"sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, +] + +[[package]] +name = "s3transfer" +version = "0.10.1" +description = "An Amazon S3 Transfer Manager" +optional = false +python-versions = ">= 3.8" +files = [ + {file = "s3transfer-0.10.1-py3-none-any.whl", hash = "sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d"}, + {file = "s3transfer-0.10.1.tar.gz", hash = "sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19"}, +] + +[package.dependencies] +botocore = ">=1.33.2,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + +[[package]] +name = "urllib3" +version = "2.2.1" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, + {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "174508b276a1ebc5f46de89f366a0e24e7609e1fa39f36f35e35437eac2b53d6" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1b6b4c2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.poetry] +name = "dataio" +version = "0.1.0" +description = "" +authors = ["Sai Sneha "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.11" +boto3 = "^1.34.64" +pandas = "^2.2.1" +pyyaml = "^6.0.1" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/src/dataio/__init__.py b/src/dataio/__init__.py new file mode 100644 index 0000000..59168aa --- /dev/null +++ b/src/dataio/__init__.py @@ -0,0 +1,6 @@ +import dataio.download # noqa: F401 +import dataio.upload # noqa: F401 +import importlib.metadata + + +__version__ = importlib.metadata.version("dataio") diff --git a/src/dataio/download/__init__.py b/src/dataio/download/__init__.py new file mode 100644 index 0000000..b5b9a58 --- /dev/null +++ b/src/dataio/download/__init__.py @@ -0,0 +1,443 @@ +import os +import warnings +import boto3 +from tempfile import NamedTemporaryFile +import requests +import yaml +from typing import Tuple, Dict, Optional, Union +import pkg_resources +import platform + + +def download_file_from_URI(URI: str, path: str = None, temp: bool = False): + """Downloads a file from a URI. + + Parameters: + URI (str): The URI from which to download the file. + path (str, optional): The path to save the file. If None and temp is False, an error is raised. + temp (bool, optional): If True, the file will be downloaded temporarily. Default is False. + + Raises: + ValueError: If path is None and temp is False, or if no extension is present in the URI, + or if the provided path does not exist, or if the URI is invalid. + Returns: + tuple: A tuple containing the full path if successful, or None if not, a boolean indicating success or failure, + and an Exception object if the download fails. If the download is successful, the third element of the tuple + will be None. + """ + # Check if URI is valid + if not URI.startswith("s3://"): + raise ValueError("Invalid URI. URI should start with 's3://'.") + + # Find the index of the first "/" after "s3://" + first_slash_index = URI.find("/", 5) + + # Check if there are characters between "s3://" and the next "/" + if first_slash_index == -1 or first_slash_index == 5: + raise ValueError("Invalid URI. URI should contain characters between 's3://' and the subsequent '/'.") + + # Check if there are characters after the third "/" + if first_slash_index == len(URI) - 1: + raise ValueError("Invalid URI. 
URI should contain characters after the subsequent '/'.") + + if path is None: + if not temp: + raise ValueError("Either temp must be True or a path must be provided.") + else: + if temp: + warnings.warn("Since path is provided, a temporary directory will not be created.") + + if not os.path.exists(path): + raise ValueError("The provided path does not exist.") + + # Infer file extension from URI + parts = URI.split('.') + if len(parts) < 2 or not parts[-1]: + raise ValueError("No extension found in the URI.") + ext = parts[-1] + + # If path is provided and temp is False, append filename to path + if path is not None: + filename = URI.split("/")[-1] + path = os.path.join(path, filename) + + # Create a named temporary file if needed + if path is None and temp: + with NamedTemporaryFile(suffix='.' + ext, delete=False) as temp_file: + path = temp_file.name + + client = boto3.client('s3') + bucket = URI.split("/")[2] + key = '/'.join(URI.split("/")[3:]) + + try: + client.download_file(Bucket=bucket, Key=key, Filename=path) + return path, True, None + except Exception as e: + return None, False, e + + +def fetch_data_documentation(*, dsid: str, + gh_urls: Optional[Dict[str, str]] = None, + repo_info: Optional[Dict[str, str]] = None, + default: bool = False, + binary: bool = False) -> Tuple[Dict, Dict]: + """ + Fetches metadata and data dictionary for a given dataset ID (DSID) from a GitHub repository. + + Parameters: + dsid (str): Dataset ID. + gh_urls (dict, optional): Dictionary containing custom GitHub URLs. + repo_info (dict, optional): Dictionary containing owner, repo, branch, etc. information. + default (bool): If True, suppress warnings about missing keys or no input provided. + binary (bool): If True, returns binary content of metadata and data dictionary. + + Returns: + Tuple[bytes, bytes]: A tuple containing binary content of metadata and data dictionary. + + Raises: + ValueError: If metadata or data dictionary files are not found for the specified dataset ID. + TypeError: If dsid is not a string. + """ + + # Validate repo_info dictionary + if repo_info is not None: + # Check for unexpected keys in repo_info + unexpected_keys = set(repo_info.keys()) - {'owner', 'repo', 'branch', 'catalogue_path', 'datadict_fname', 'metadata_fname'} + if unexpected_keys: + warnings.warn(f"Ignoring unexpected keys in repo_info: {unexpected_keys}", UserWarning) + + # Check for missing keys in repo_info + missing_keys = {'owner', 'repo', 'branch', 'catalogue_path', 'datadict_fname', 'metadata_fname'} - set(repo_info.keys()) + if missing_keys and not default: + warnings.warn(f"Missing keys in repo_info, using default values for: {missing_keys}", UserWarning) + elif missing_keys and default: + warnings.warn(f"Missing keys in repo_info, using default values for: {missing_keys}", UserWarning, stacklevel=2) + # Issue warning if custom values not provided for repo_info and default values are used. 
+ elif not default: + warnings.warn("No custom values provided for repo_info, using default values", UserWarning) + + # Validate gh_urls dictionary + if gh_urls is not None: + # Check for unexpected keys in gh_urls + unexpected_keys = set(gh_urls.keys()) - {'api_base_url', 'raw_base_url'} + if unexpected_keys: + warnings.warn(f"Ignoring unexpected keys in gh_urls: {unexpected_keys}", UserWarning) + + # Check for missing keys in gh_urls + missing_keys = {'api_base_url', 'raw_base_url'} - set(gh_urls.keys()) + if missing_keys and not default: + warnings.warn(f"Missing keys in gh_urls, using default values for: {missing_keys}", UserWarning) + elif missing_keys and default: + warnings.warn(f"Missing keys in gh_urls, using default values for: {missing_keys}", UserWarning, stacklevel=2) + + # Issue warning if custom values not provided for gh_urls and default values are used. + elif not default: + warnings.warn("No custom values provided for gh_urls, using default values", UserWarning) + + # Set default GitHub URLs if not provided + gh_urls = gh_urls or {} + gh_api_base_url = gh_urls.get('api_base_url', "https://api.github.com/repos/") + gh_raw_base_url = gh_urls.get('raw_base_url', "https://raw.githubusercontent.com/") + + # Set default repository information if not provided + repo_info = repo_info or {} + owner = repo_info.get('owner', "dsih-artpark") + repo = repo_info.get('repo', "data-documentation") + branch = repo_info.get('branch', "production") + catalogue_path = repo_info.get('catalogue_path', "info") + datadict_fname = repo_info.get('datadict_fname', "datadictionary.yaml") + metadata_fname = repo_info.get('metadata_fname', "metadata.yaml") + + # Construct URL to fetch the tree of files + tree_url = f"{gh_api_base_url}{owner}/{repo}/git/trees/{branch}?recursive=1" + + # Make request to GitHub tree API endpoint + response = requests.get(tree_url) + + # Check status code of the response + if response.status_code == 200: + tree = response.json().get('tree', []) + elif response.status_code == 404: + raise ValueError("Resource not found. Please check if the repository or branch exists.") + elif response.status_code == 422: + raise ValueError("Validation failed or the endpoint has been spammed.") + else: + raise ValueError("Unknown error occurred while fetching tree data from GitHub.") + + # Construct path prefix based on dataset ID + dsid_path_prefix = f"{catalogue_path}/{dsid[0:2]}/{dsid}-" + + # Find data dictionary file in the tree + gh_datadict_path = None + for file_info in tree: + if file_info['path'].startswith(dsid_path_prefix) and file_info['path'].endswith(datadict_fname): + gh_datadict_path = file_info['path'] + break + + # Raise error if data dictionary file not found + if not gh_datadict_path: + raise ValueError(f"Data dictionary file not found for dataset ID '{dsid}'.") + + # Construct paths for metadata files + gh_metadata_path = gh_datadict_path.replace(datadict_fname, metadata_fname) + + # Construct URLs to fetch raw content of metadata and data dictionary files + gh_raw_metadata_url = f"{gh_raw_base_url}{owner}/{repo}/{branch}/{gh_metadata_path}" + gh_raw_datadict_url = f"{gh_raw_base_url}{owner}/{repo}/{branch}/{gh_datadict_path}" + + # Retrieve and parse metadata + raw_metadata_response = requests.get(gh_raw_metadata_url) + if raw_metadata_response.status_code == 404: + raise ValueError(f"Metadata file not found for dataset ID '{dsid}'.") + elif raw_metadata_response.status_code != 200: + raise ValueError(f"Failed to retrieve metadata for dataset ID '{dsid}'. 
Request failed.") + + # Retrieve and parse data dictionary + raw_datadict_response = requests.get(gh_raw_datadict_url) + if raw_datadict_response.status_code == 404: + raise ValueError(f"Data dictionary file not found for dataset ID '{dsid}'.") + elif raw_datadict_response.status_code != 200: + raise ValueError(f"Failed to retrieve data dictionary for dataset ID '{dsid}'. Request failed.") + + if binary: + return raw_metadata_response.content, raw_datadict_response.content + else: + metadata = yaml.safe_load(raw_metadata_response.content.decode('utf-8')) + datadict = yaml.safe_load(raw_datadict_response.content.decode('utf-8')) + + return metadata, datadict + + +def download_dataset_v2(*, + dsid: str, + data_state: str = "standardised", + contains_all: Union[str, list, None] = None, + contains_any: Union[str, list, None] = None, + suffixes: Union[str, list, None] = None, + datadir: str = "data", + update=True, + clean=False, + fetch_docs=False, + check_for_expected_files=False, + expected_file_list=[None], + verbose=False + ): + """ + Downloads files associated with a dataset from an S3 bucket and optionally fetches metadata and data dictionary. + + Parameters: + dsid (str): Dataset ID. + data_state (str, optional): State of the dataset. Defaults to "standardised". + contains_all (str, list, optional): List of substrings that must be present in the file names. + contains_any (str, list, optional): List of substrings of which at least one must be present in the file names. + suffixes (str, list, optional): List of file suffixes. + datadir (str, optional): Directory to download files to. Defaults to "data". + update (bool, optional): If True, checks for local file modifications and updates them if necessary. + clean (bool, optional): If True, deletes extraneous files in the datadir. + fetch_docs (bool, optional): If True, fetches metadata and data dictionary. + check_for_expected_files (bool, optional): If True, checks for expected files in the datadir. + expected_file_list (list, optional): List of expected files to check for. + verbose (bool, optional): If True, prints verbose output. + + Raises: + ValueError: If no files meet the specified criteria or if the dataset is not found in the S3 bucket. + TypeError: If dsid or data_state is not a string. + + Returns: + None + """ + + with open(pkg_resources.resource_filename(__name__, 'settings.yaml'), 'r') as f: + settings = yaml.safe_load(f) + + if not isinstance(dsid, str): + raise TypeError("dsid must be a string.") + if not isinstance(data_state, str): + raise TypeError("data_state must be a string.") + + Bucket = settings["data_state_buckets"].get(data_state) + if Bucket is None: + raise ValueError(f"{data_state} is not a valid data state. 
Must be one of {str(settings['data_state_buckets'].keys())}")
+
+    # Initialize the S3 client
+    client = boto3.client('s3')
+    listobjv2_paginator = client.get_paginator('list_objects_v2')
+
+    # Get the common prefixes (folders) from the bucket
+    dsid_names = {}
+    for prefix in listobjv2_paginator.paginate(Bucket=Bucket, Delimiter='/').search('CommonPrefixes'):
+        folder = prefix.get('Prefix')
+        dsid_names[folder.split("-")[0]] = folder
+
+    # Determine the prefix for the specified dsid
+    dsid_name = dsid_names.get(dsid)
+    if dsid_name is None:
+        raise ValueError(f"Dataset {dsid} not found in specified state {data_state} on Bucket.")
+
+    # List objects in the dsid prefix
+    listobjv2_files = listobjv2_paginator.paginate(Bucket=Bucket, Prefix=dsid_name)
+
+    # Collect files found by iterating through all tranches
+    files_found = []
+    for tranch in listobjv2_files:
+        files_found += [item['Key'] for item in tranch['Contents'] if not item['Key'].endswith("/")]
+
+    # Filter files based on contains_any criteria and build the dictionary
+    if contains_any is not None:
+        if isinstance(contains_any, str):
+            contains_any = [contains_any]
+        elif not isinstance(contains_any, list):
+            raise TypeError("contains_any must be a string, list, or None.")
+
+        # Initialize set to store files containing any item from contains_any
+        files_containing_any = set()
+        firstLoop = True
+        for item in contains_any:
+            files_containing_this_item = [file for file in files_found if item in file]
+            if firstLoop:
+                files_containing_any = set(files_containing_this_item)
+                firstLoop = False
+            else:
+                if files_containing_any.isdisjoint(files_containing_this_item):
+                    files_containing_any.update(files_containing_this_item)
+                else:
+                    repeats = set.intersection(files_containing_any, files_containing_this_item)
+                    raise ValueError(f"A file cannot contain more than one item from the contains_any list: {repeats}")
+    else:
+        files_containing_any = set(files_found)
+
+    # Filter files based on contains_all criteria
+    if contains_all is not None:
+        if isinstance(contains_all, str):
+            contains_all = [contains_all]
+        elif not isinstance(contains_all, list):
+            raise TypeError("contains_all must be a string, list, or None.")
+
+        # Initialising set to store files containing all items from contains_all
+        files_containing_all = set()
+        firstLoop = True
+        for item in contains_all:
+            files_containing_this_item = set([file for file in files_found if item in file])
+            # Set needs to be initialised on the first pass, then intersected on each subsequent pass
+            if firstLoop:
+                files_containing_all = files_containing_this_item
+                firstLoop = False
+            else:
+                files_containing_all = files_containing_all.intersection(files_containing_this_item)
+    else:
+        files_containing_all = set(files_found)
+
+    # Filter files based on suffixes criteria
+    if suffixes is not None:
+        if isinstance(suffixes, str):
+            suffixes = [suffixes]
+        elif not isinstance(suffixes, list):
+            raise TypeError("suffixes must be a string, list, or None.")
+
+        files_with_suffixes = set()
+        first_loop = True
+        for suffix in suffixes:
+            files_with_this_suffix = set([file for file in files_found if file.endswith(suffix)])
+            if first_loop:
+                files_with_suffixes = files_with_this_suffix
+                first_loop = False
+            else:
+                files_with_suffixes = files_with_suffixes.intersection(files_with_this_suffix)
+    else:
+        files_with_suffixes = set(files_found)
+
+    # Get the intersection of files_containing_any, files_containing_all, and files_with_suffixes
+    files_to_download = files_containing_any.intersection(files_containing_all, files_with_suffixes)
+
+    # Check if the intersection is empty
+    if not files_to_download:
+        raise ValueError("No files meet specified criteria.")
+
+    # Check if datadir is a string
+    if not isinstance(datadir, str):
+        raise ValueError(f"{datadir} is not a string.")
+
+    # Download all files
+    for file_path in files_to_download:
+        # Construct the full destination path
+        destination_path = os.path.join(datadir, file_path)
+
+        if platform.system() != 'Windows' and update:
+            warnings.warn("Due to limitations in UNIX systems, update will not check to ensure that you've not " +
+                          f"changed files locally. Prune local dir '{datadir}' " +
+                          "if you have made changes, or set 'update' to False.", Warning)
+
+        # Check if the file exists locally and if update is enabled
+        if update and os.path.exists(destination_path):
+            # Get the last modified time of the local file
+            local_last_modified_time = os.path.getmtime(destination_path)
+
+            # Get the creation time of the local file
+            local_creation_time = os.path.getctime(destination_path)
+
+            # Get the last modified time of the file on S3
+            response = client.head_object(Bucket=Bucket, Key=file_path)
+            s3_last_modified_time = response['LastModified'].timestamp()
+
+            # Compare the last modified time with the creation time
+            if local_last_modified_time > local_creation_time:
+                # Redownload the file from S3 if it has been modified locally
+                client.download_file(Bucket=Bucket, Key=file_path, Filename=destination_path)
+                if verbose:
+                    print(f"Local file '{destination_path}' has been modified since last download. Redownloading...")
+
+            elif s3_last_modified_time > local_creation_time:
+                # Download the file from S3 if it has been updated since download
+                client.download_file(Bucket=Bucket, Key=file_path, Filename=destination_path)
+                if verbose:
+                    print(f"File '{file_path}' has been updated on S3. Redownloading...")
+            elif verbose:
+                print(f"File '{file_path}' is up to date with S3. Ignoring...")
+        else:
+            # Create the directory structure if it doesn't exist
+            directory = os.path.dirname(destination_path)
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+            # Download the file from S3
+            client.download_file(Bucket=Bucket, Key=file_path, Filename=destination_path)
+            if verbose:
+                print(f"File '{file_path}' has been downloaded from S3.")
+
+    # Prune the folder to remove extraneous elements
+    if clean:
+        exception_fnames = ["datadictionary.yaml", "metadata.yaml"]
+        for i in range(len(exception_fnames)):
+            exception_fnames[i] = os.path.join(dsid_name, exception_fnames[i])
+        if verbose:
+            print(exception_fnames)
+        datadir_prefix_length = len(os.path.join(datadir)) + 1
+        for root, dirs, files in os.walk(os.path.join(datadir, dsid_name)):
+            for file in files:
+                file_path = os.path.join(root, file)
+                file_path_relative = file_path[datadir_prefix_length:]
+                if file_path_relative not in files_found and file_path_relative not in exception_fnames:
+                    if verbose:
+                        warnings.warn(f"Deleting extraneous file: {file_path}")
+                    os.remove(file_path)
+
+    # If requested, fetch all relevant documentation
+    if fetch_docs:
+        metadata, datadict = fetch_data_documentation(dsid=dsid, default=True, binary=True)
+
+        if metadata is not None:
+            metadata_file_path = os.path.join(datadir, dsid_name, "metadata.yaml")
+
+            # Ensure that the directory exists
+            os.makedirs(os.path.dirname(metadata_file_path), exist_ok=True)
+
+            # Write the metadata to the metadata YAML file
+            with open(metadata_file_path, 'wb') as file:
+                file.write(metadata)
+        if datadict is not None:
+            datadict_file_path = os.path.join(datadir, dsid_name, "datadictionary.yaml")
+
+            # Ensure that the directory exists
+            os.makedirs(os.path.dirname(datadict_file_path), exist_ok=True)
+
+            # Write the data dictionary to the datadictionary YAML file
+            with open(datadict_file_path, 'wb') as file:
+                file.write(datadict)
diff --git a/src/dataio/download/settings.yaml b/src/dataio/download/settings.yaml
new file mode 100644
index 0000000..a026d83
--- /dev/null
+++ b/src/dataio/download/settings.yaml
@@ -0,0 +1,4 @@
+data_state_buckets:
+  raw: "dsih-artpark-01-raw-data"
+  preprocessed: "dsih-artpark-01-preprocessed-data"
+  standardised: "dsih-artpark-01-standardised-data"
\ No newline at end of file
diff --git a/src/dataio/upload/__init__.py b/src/dataio/upload/__init__.py
new file mode 100644
index 0000000..3f4488d
--- /dev/null
+++ b/src/dataio/upload/__init__.py
@@ -0,0 +1,15 @@
+import boto3
+
+
+def upload_file_to_URI(URI, file):
+    """Uploads a local file to an S3 URI.
+
+    Parameters:
+        URI (str): Destination S3 URI of the form "s3://bucket-name/path/to/key".
+        file: An open file object (or any object with a name attribute) pointing to the local file to upload.
+    """
+
+    client = boto3.client('s3')
+
+    URI = URI.removeprefix("s3://")
+    Bucket = URI.split("/")[0]
+    Key = URI.removeprefix(Bucket + "/")
+
+    client.upload_file(Filename=file.name,
+                       Bucket=Bucket,
+                       Key=Key
+                       )
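+
+
+# A usage sketch (the bucket, prefix, and local filename below are illustrative placeholders):
+#
+#     with open("local_data.csv", "rb") as f:
+#         upload_file_to_URI("s3://my-bucket/datasets/local_data.csv", f)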