diff --git a/intermediate/remote-data.ipynb b/intermediate/remote-data.ipynb index 7f414473..9eaf6710 100644 --- a/intermediate/remote-data.ipynb +++ b/intermediate/remote-data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fe17af6b-6678-4ee6-866c-fd1c53787c15", + "id": "0", "metadata": {}, "source": [ "# Access Patterns to Remote Data with *fsspec*\n", @@ -20,534 +20,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "6c527e9e-cf5f-46e8-bfb4-72301ee51037", + "execution_count": null, + "id": "1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 8MB\n",
-       "Dimensions:  (lat: 89, lon: 180, time: 128)\n",
-       "Coordinates:\n",
-       "  * lat      (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n",
-       "  * lon      (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n",
-       "  * time     (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n",
-       "Data variables:\n",
-       "    sst      (time, lat, lon) float32 8MB ...\n",
-       "Attributes: (12/37)\n",
-       "    climatology:               Climatology is based on 1971-2000 SST, Xue, Y....\n",
-       "    description:               In situ data: ICOADS2.5 before 2007 and NCEP i...\n",
-       "    keywords_vocabulary:       NASA Global Change Master Directory (GCMD) Sci...\n",
-       "    keywords:                  Earth Science > Oceans > Ocean Temperature > S...\n",
-       "    instrument:                Conventional thermometers\n",
-       "    source_comment:            SSTs were observed by conventional thermometer...\n",
-       "    ...                        ...\n",
-       "    creator_url_original:      https://www.ncei.noaa.gov\n",
-       "    license:                   No constraints on data access or use\n",
-       "    comment:                   SSTs were observed by conventional thermometer...\n",
-       "    summary:                   ERSST.v5 is developed based on v4 after revisi...\n",
-       "    dataset_title:             NOAA Extended Reconstructed SST V5\n",
-       "    data_modified:             2020-09-07
" - ], - "text/plain": [ - " Size: 8MB\n", - "Dimensions: (lat: 89, lon: 180, time: 128)\n", - "Coordinates:\n", - " * lat (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n", - " * lon (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n", - " * time (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n", - "Data variables:\n", - " sst (time, lat, lon) float32 8MB ...\n", - "Attributes: (12/37)\n", - " climatology: Climatology is based on 1971-2000 SST, Xue, Y....\n", - " description: In situ data: ICOADS2.5 before 2007 and NCEP i...\n", - " keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sci...\n", - " keywords: Earth Science > Oceans > Ocean Temperature > S...\n", - " instrument: Conventional thermometers\n", - " source_comment: SSTs were observed by conventional thermometer...\n", - " ... ...\n", - " creator_url_original: https://www.ncei.noaa.gov\n", - " license: No constraints on data access or use\n", - " comment: SSTs were observed by conventional thermometer...\n", - " summary: ERSST.v5 is developed based on v4 after revisi...\n", - " dataset_title: NOAA Extended Reconstructed SST V5\n", - " data_modified: 2020-09-07" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import xarray as xr\n", "\n", @@ -557,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "5a44116d-49b4-4e87-b505-7dca4914ab66", + "id": "2", "metadata": {}, "source": [ "### xarray backends under the hood\n", @@ -571,131 +47,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "01271135-40fe-49b6-8746-e5061130ccbf", + "execution_count": null, + "id": "3", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
def open_dataset() at xarray/backends/api.py:391 \n",
-       "def guess_engine() at xarray/backends/plugins.py:147 \n",
-       "def guess_can_open() at xarray/backends/netCDF4_.py:607 \n",
-       "def is_remote_uri() at xarray/core/utils.py:641 \n",
-       "def try_read_magic_number_from_path() at xarray/core/utils.py:664 \n",
-       "def read_magic_number_from_file() at xarray/core/utils.py:650 \n",
-       "def get_backend() at xarray/backends/plugins.py:200 \n",
-       "def open_dataset() at xarray/backends/netCDF4_.py:624 \n",
-       "def is_remote_uri() at xarray/core/utils.py:641 \n",
-       "def open() at xarray/backends/netCDF4_.py:361\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{k}{def} \\PY{n+nf}{open\\PYZus{}dataset}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{api}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{391} \n", - "\\PY{k}{def} \\PY{n+nf}{guess\\PYZus{}engine}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{plugins}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{147} \n", - "\\PY{k}{def} \\PY{n+nf}{guess\\PYZus{}can\\PYZus{}open}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{netCDF4\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{607} \n", - "\\PY{k}{def} \\PY{n+nf}{is\\PYZus{}remote\\PYZus{}uri}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{core}\\PY{o}{/}\\PY{n}{utils}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{641} \n", - "\\PY{k}{def} \\PY{n+nf}{try\\PYZus{}read\\PYZus{}magic\\PYZus{}number\\PYZus{}from\\PYZus{}path}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{core}\\PY{o}{/}\\PY{n}{utils}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{664} \n", - "\\PY{k}{def} \\PY{n+nf}{read\\PYZus{}magic\\PYZus{}number\\PYZus{}from\\PYZus{}file}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{core}\\PY{o}{/}\\PY{n}{utils}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{650} \n", - "\\PY{k}{def} \\PY{n+nf}{get\\PYZus{}backend}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{plugins}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{200} \n", - "\\PY{k}{def} \\PY{n+nf}{open\\PYZus{}dataset}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{netCDF4\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{624} \n", - "\\PY{k}{def} \\PY{n+nf}{is\\PYZus{}remote\\PYZus{}uri}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{core}\\PY{o}{/}\\PY{n}{utils}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{641} \n", - "\\PY{k}{def} \\PY{n+nf}{open}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{netCDF4\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{361}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "def open_dataset() at xarray/backends/api.py:391 \n", - "def guess_engine() at xarray/backends/plugins.py:147 \n", - "def guess_can_open() at xarray/backends/netCDF4_.py:607 \n", - "def is_remote_uri() at xarray/core/utils.py:641 \n", - "def try_read_magic_number_from_path() at xarray/core/utils.py:664 \n", - "def read_magic_number_from_file() at xarray/core/utils.py:650 \n", - "def get_backend() at xarray/backends/plugins.py:200 \n", - "def open_dataset() at xarray/backends/netCDF4_.py:624 \n", - "def is_remote_uri() at xarray/core/utils.py:641 \n", - "def open() at xarray/backends/netCDF4_.py:361" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import sys\n", "from IPython.display import Code\n", @@ -704,6 +59,7 @@ "tracing_output = []\n", "_match_pattern = \"xarray\"\n", "\n", + "\n", "def trace_calls(frame, event, arg):\n", " if event == 'call':\n", " code = frame.f_code\n", @@ -714,18 +70,19 @@ " tracing_output.append(f\"def {func_name}() at {func_file}:{func_line}\")\n", " return trace_calls\n", "\n", + "\n", "# we enable tracing and call open_dataset()\n", "sys.settrace(trace_calls)\n", "ds = xr.open_dataset(\"../data/sst.mnmean.nc\")\n", "sys.settrace(None)\n", "\n", - "# Print the trace with some syntax highlighting \n", - "Code(\" \\n\".join(tracing_output[0:10]), language='python') " + "# Print the trace with some syntax highlighting\n", + "Code(\" \\n\".join(tracing_output[0:10]), language='python')" ] }, { "cell_type": "markdown", - "id": "b2d90714-544e-4653-a391-204f94e767b5", + "id": "4", "metadata": {}, "source": [ "#### **What are we seeing?** \n", @@ -740,131 +97,10 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "c7376766-ea7e-4546-a082-5da4fc733ea8", + "execution_count": null, + "id": "5", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
def open_dataset() at xarray/backends/api.py:391 \n",
-       "def get_backend() at xarray/backends/plugins.py:200 \n",
-       "def open_dataset() at xarray/backends/h5netcdf_.py:383 \n",
-       "def is_remote_uri() at xarray/core/utils.py:641 \n",
-       "def open() at xarray/backends/h5netcdf_.py:135 \n",
-       "def increment() at xarray/backends/file_manager.py:307 \n",
-       "def ds() at xarray/backends/h5netcdf_.py:193 \n",
-       "def acquire_context() at xarray/backends/file_manager.py:196 \n",
-       "def acquire_context() at xarray/backends/file_manager.py:196 \n",
-       "def find_root_and_group() at xarray/backends/common.py:141\n",
-       "
\n" - ], - "text/latex": [ - "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", - "\\PY{k}{def} \\PY{n+nf}{open\\PYZus{}dataset}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{api}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{391} \n", - "\\PY{k}{def} \\PY{n+nf}{get\\PYZus{}backend}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{plugins}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{200} \n", - "\\PY{k}{def} \\PY{n+nf}{open\\PYZus{}dataset}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{h5netcdf\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{383} \n", - "\\PY{k}{def} \\PY{n+nf}{is\\PYZus{}remote\\PYZus{}uri}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{core}\\PY{o}{/}\\PY{n}{utils}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{641} \n", - "\\PY{k}{def} \\PY{n+nf}{open}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{h5netcdf\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{135} \n", - "\\PY{k}{def} \\PY{n+nf}{increment}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{file\\PYZus{}manager}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{307} \n", - "\\PY{k}{def} \\PY{n+nf}{ds}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{h5netcdf\\PYZus{}}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{193} \n", - "\\PY{k}{def} \\PY{n+nf}{acquire\\PYZus{}context}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{file\\PYZus{}manager}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{196} \n", - "\\PY{k}{def} \\PY{n+nf}{acquire\\PYZus{}context}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{file\\PYZus{}manager}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{196} \n", - "\\PY{k}{def} \\PY{n+nf}{find\\PYZus{}root\\PYZus{}and\\PYZus{}group}\\PY{p}{(}\\PY{p}{)} \\PY{n}{at} \\PY{n}{xarray}\\PY{o}{/}\\PY{n}{backends}\\PY{o}{/}\\PY{n}{common}\\PY{o}{.}\\PY{n}{py}\\PY{p}{:}\\PY{l+m+mi}{141}\n", - "\\end{Verbatim}\n" - ], - "text/plain": [ - "def open_dataset() at xarray/backends/api.py:391 \n", - "def get_backend() at xarray/backends/plugins.py:200 \n", - "def open_dataset() at xarray/backends/h5netcdf_.py:383 \n", - "def is_remote_uri() at xarray/core/utils.py:641 \n", - "def open() at xarray/backends/h5netcdf_.py:135 \n", - "def increment() at xarray/backends/file_manager.py:307 \n", - "def ds() at xarray/backends/h5netcdf_.py:193 \n", - "def acquire_context() at xarray/backends/file_manager.py:196 \n", - "def acquire_context() at xarray/backends/file_manager.py:196 \n", - "def find_root_and_group() at xarray/backends/common.py:141" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "tracing_output = []\n", "\n", @@ -873,12 +109,12 @@ "sys.settrace(None)\n", "\n", "# Print the top 10 calls to public methods\n", - "Code(\" \\n\".join(tracing_output[0:10]), language='python') " + "Code(\" \\n\".join(tracing_output[0:10]), language='python')" ] }, { "cell_type": "markdown", - "id": "9104e9b6-75cc-4e07-9424-e6c1bcb5b203", + "id": "6", "metadata": {}, "source": [ "> It is important to note that there are overlaps between the pre-installed backends in xarray. Many of these backends support the same formats (e.g., NetCDF-4), and xarray uses them in a specific order unless a particular backend is specified. For example, when we request the h5netcdf engine, xarray will not attempt to guess the backend. However, it will still check if the URI is remote, which will involve some calls to a context manager. By examining the call stack, we can observe the use of a file handler and a cache, which are crucial for efficiently accessing remote files." @@ -886,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "e124c821-0efc-4c82-83ab-dda94cd16754", + "id": "7", "metadata": {}, "source": [ "### Supported file formats by backend\n", @@ -903,37 +139,10 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "59a19602-1ad4-4d94-9448-4ddd7aad1631", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'netcdf4': \n", - " Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using netCDF4 in Xarray\n", - " Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.NetCDF4BackendEntrypoint.html,\n", - " 'h5netcdf': \n", - " Open netCDF (.nc, .nc4 and .cdf) and most HDF5 files using h5netcdf in Xarray\n", - " Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.H5netcdfBackendEntrypoint.html,\n", - " 'scipy': \n", - " Open netCDF files (.nc, .nc4, .cdf and .gz) using scipy in Xarray\n", - " Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.ScipyBackendEntrypoint.html,\n", - " 'store': \n", - " Open AbstractDataStore instances in Xarray\n", - " Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.StoreBackendEntrypoint.html,\n", - " 'zarr': \n", - " Open zarr files (.zarr) using zarr in Xarray\n", - " Learn more at https://docs.xarray.dev/en/stable/generated/xarray.backends.ZarrBackendEntrypoint.html}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], "source": [ "# Listing which backends we have available, if we install more they should show up here.\n", "xr.backends.list_engines()" @@ -941,7 +150,7 @@ }, { "cell_type": "markdown", - "id": "986c6a11-d4a7-4969-b2ac-e3200820fa97", + "id": "9", "metadata": {}, "source": [ "### Trying to access a file on cloud storage (AWS S3)\n", @@ -951,26 +160,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "c15211e6-5d8c-4ac6-96ff-bdd2dadf721d", + "execution_count": null, + "id": "10", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Errno -72] NetCDF: Malformed or inaccessible DAP2 DDS or DAP4 DMR response: 's3://its-live-data/test-space/sample-data/sst.mnmean.nc'\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "syntax error, unexpected WORD_WORD, expecting SCAN_ATTR or SCAN_DATASET or SCAN_ERROR\n", - "context: PermanentRedirectThe bucket you are attempting to access must be addressed using the specified endpoint. Please send all future requests to this endpoint.its-live-data.s3-us-west-2.amazonaws.comits-live-dataGX7XFKRVXPYZTN17a3ZB3OGB5i+GxN3KGCSw5jduxO8incl2vvRhsXFJ414Zzv5akhouRQWCznGRDdoj4iEV3vNcex4=\n" - ] - } - ], + "outputs": [], "source": [ "try:\n", " ds = xr.open_dataset(\"s3://its-live-data/test-space/sample-data/sst.mnmean.nc\")\n", @@ -980,7 +173,7 @@ }, { "cell_type": "markdown", - "id": "d6372a94-50e7-458d-a764-6ba5570e164e", + "id": "11", "metadata": {}, "source": [ "xarray iterated through the registered backends and netcdf4 returned a `\"yes, I can open that extension\"` see: [netCDF4_.py#L618 ](https://github.com/pydata/xarray/blob/6c2d8c3389afe049ccbfd1393e9a81dd5c759f78/xarray/backends/netCDF4_.py#L618). However, **the backend doesn't know how to \"talk\" to a remote store** and thus it fails to open our file.\n", @@ -989,7 +182,7 @@ }, { "cell_type": "markdown", - "id": "f16f114c-41a0-4c33-b33d-685d2382b1a3", + "id": "12", "metadata": {}, "source": [ "## Supported format + Read from Buffers = Remote access \n", @@ -1028,7 +221,7 @@ }, { "cell_type": "markdown", - "id": "bc0b8b57-3a60-4d35-ba96-7daf4abd830f", + "id": "13", "metadata": {}, "source": [ "## Remote Access and File Caching\n", @@ -1046,541 +239,17 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "4e028466-0562-4cb6-84bd-ef8551509653", + "execution_count": null, + "id": "14", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 8MB\n",
-       "Dimensions:  (lat: 89, lon: 180, time: 128)\n",
-       "Coordinates:\n",
-       "  * lat      (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n",
-       "  * lon      (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n",
-       "  * time     (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n",
-       "Data variables:\n",
-       "    sst      (time, lat, lon) float32 8MB ...\n",
-       "Attributes: (12/37)\n",
-       "    climatology:               Climatology is based on 1971-2000 SST, Xue, Y....\n",
-       "    description:               In situ data: ICOADS2.5 before 2007 and NCEP i...\n",
-       "    keywords_vocabulary:       NASA Global Change Master Directory (GCMD) Sci...\n",
-       "    keywords:                  Earth Science > Oceans > Ocean Temperature > S...\n",
-       "    instrument:                Conventional thermometers\n",
-       "    source_comment:            SSTs were observed by conventional thermometer...\n",
-       "    ...                        ...\n",
-       "    creator_url_original:      https://www.ncei.noaa.gov\n",
-       "    license:                   No constraints on data access or use\n",
-       "    comment:                   SSTs were observed by conventional thermometer...\n",
-       "    summary:                   ERSST.v5 is developed based on v4 after revisi...\n",
-       "    dataset_title:             NOAA Extended Reconstructed SST V5\n",
-       "    data_modified:             2020-09-07
" - ], - "text/plain": [ - " Size: 8MB\n", - "Dimensions: (lat: 89, lon: 180, time: 128)\n", - "Coordinates:\n", - " * lat (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n", - " * lon (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n", - " * time (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n", - "Data variables:\n", - " sst (time, lat, lon) float32 8MB ...\n", - "Attributes: (12/37)\n", - " climatology: Climatology is based on 1971-2000 SST, Xue, Y....\n", - " description: In situ data: ICOADS2.5 before 2007 and NCEP i...\n", - " keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sci...\n", - " keywords: Earth Science > Oceans > Ocean Temperature > S...\n", - " instrument: Conventional thermometers\n", - " source_comment: SSTs were observed by conventional thermometer...\n", - " ... ...\n", - " creator_url_original: https://www.ncei.noaa.gov\n", - " license: No constraints on data access or use\n", - " comment: SSTs were observed by conventional thermometer...\n", - " summary: ERSST.v5 is developed based on v4 after revisi...\n", - " dataset_title: NOAA Extended Reconstructed SST V5\n", - " data_modified: 2020-09-07" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import fsspec\n", "\n", "\n", "uri = \"https://its-live-data.s3-us-west-2.amazonaws.com/test-space/sample-data/sst.mnmean.nc\"\n", "# we prepend the cache type to the URI, this is called protocol chaining in fsspec-speak\n", - "file = fsspec.open_local(f\"simplecache::{uri}\", filecache={'cache_storage':'/tmp/fsspec_cache'})\n", + "file = fsspec.open_local(f\"simplecache::{uri}\", filecache={'cache_storage': '/tmp/fsspec_cache'})\n", "\n", "ds = xr.open_dataset(file, engine=\"netcdf4\")\n", "ds" @@ -1588,7 +257,7 @@ }, { "cell_type": "markdown", - "id": "3fde50d4-7523-4c4e-af33-d2588a2b6670", + "id": "15", "metadata": {}, "source": [ "#### block cache + `open()`\n", @@ -1600,566 +269,10 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "f7198585-b995-47b2-9ea0-4e4ccf056df0", + "execution_count": null, + "id": "16", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 239 ms, sys: 37.3 ms, total: 276 ms\n", - "Wall time: 1.03 s\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 8MB\n",
-       "Dimensions:  (lat: 89, lon: 180, time: 128)\n",
-       "Coordinates:\n",
-       "  * lat      (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n",
-       "  * lon      (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n",
-       "  * time     (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n",
-       "Data variables:\n",
-       "    sst      (time, lat, lon) float32 8MB -1.8 -1.8 -1.8 -1.8 ... nan nan nan\n",
-       "Attributes: (12/37)\n",
-       "    climatology:               Climatology is based on 1971-2000 SST, Xue, Y....\n",
-       "    description:               In situ data: ICOADS2.5 before 2007 and NCEP i...\n",
-       "    keywords_vocabulary:       NASA Global Change Master Directory (GCMD) Sci...\n",
-       "    keywords:                  Earth Science > Oceans > Ocean Temperature > S...\n",
-       "    instrument:                Conventional thermometers\n",
-       "    source_comment:            SSTs were observed by conventional thermometer...\n",
-       "    ...                        ...\n",
-       "    creator_url_original:      https://www.ncei.noaa.gov\n",
-       "    license:                   No constraints on data access or use\n",
-       "    comment:                   SSTs were observed by conventional thermometer...\n",
-       "    summary:                   ERSST.v5 is developed based on v4 after revisi...\n",
-       "    dataset_title:             NOAA Extended Reconstructed SST V5\n",
-       "    data_modified:             2020-09-07
" - ], - "text/plain": [ - " Size: 8MB\n", - "Dimensions: (lat: 89, lon: 180, time: 128)\n", - "Coordinates:\n", - " * lat (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n", - " * lon (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n", - " * time (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n", - "Data variables:\n", - " sst (time, lat, lon) float32 8MB -1.8 -1.8 -1.8 -1.8 ... nan nan nan\n", - "Attributes: (12/37)\n", - " climatology: Climatology is based on 1971-2000 SST, Xue, Y....\n", - " description: In situ data: ICOADS2.5 before 2007 and NCEP i...\n", - " keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sci...\n", - " keywords: Earth Science > Oceans > Ocean Temperature > S...\n", - " instrument: Conventional thermometers\n", - " source_comment: SSTs were observed by conventional thermometer...\n", - " ... ...\n", - " creator_url_original: https://www.ncei.noaa.gov\n", - " license: No constraints on data access or use\n", - " comment: SSTs were observed by conventional thermometer...\n", - " summary: ERSST.v5 is developed based on v4 after revisi...\n", - " dataset_title: NOAA Extended Reconstructed SST V5\n", - " data_modified: 2020-09-07" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "uri = \"https://its-live-data.s3-us-west-2.amazonaws.com/test-space/sample-data/sst.mnmean.nc\"\n", @@ -2167,8 +280,10 @@ "fs = fsspec.filesystem('http')\n", "\n", "fsspec_caching = {\n", - " \"cache_type\": \"blockcache\", # block cache stores blocks of fixed size and uses eviction using a LRU strategy. \n", - " \"block_size\": 8*1024*1024 # size in bytes per block, adjust depends on the file size but the recommended size is in the MB \n", + " \"cache_type\": \"blockcache\", # block cache stores blocks of fixed size and uses eviction using a LRU strategy.\n", + " \"block_size\": 8\n", + " * 1024\n", + " * 1024, # size in bytes per block, adjust depends on the file size but the recommended size is in the MB\n", "}\n", "\n", "# Note that if we use a context, we'll close the file after the block so operations on xarray may fail if we don't load our data arrays.\n", @@ -2180,7 +295,7 @@ }, { "cell_type": "markdown", - "id": "02283d10-0314-42a9-9671-2197a14c795f", + "id": "17", "metadata": {}, "source": [ "### Reading data from cloud storage\n", @@ -2192,542 +307,10 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "af6915ca-0bb0-47d8-b7b7-b3c4f0e48dd1", + "execution_count": null, + "id": "18", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 436 ms, sys: 53.5 ms, total: 490 ms\n", - "Wall time: 2.37 s\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset> Size: 8MB\n",
-       "Dimensions:  (lat: 89, lon: 180, time: 128)\n",
-       "Coordinates:\n",
-       "  * lat      (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n",
-       "  * lon      (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n",
-       "  * time     (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n",
-       "Data variables:\n",
-       "    sst      (time, lat, lon) float32 8MB ...\n",
-       "Attributes: (12/37)\n",
-       "    climatology:               Climatology is based on 1971-2000 SST, Xue, Y....\n",
-       "    description:               In situ data: ICOADS2.5 before 2007 and NCEP i...\n",
-       "    keywords_vocabulary:       NASA Global Change Master Directory (GCMD) Sci...\n",
-       "    keywords:                  Earth Science > Oceans > Ocean Temperature > S...\n",
-       "    instrument:                Conventional thermometers\n",
-       "    source_comment:            SSTs were observed by conventional thermometer...\n",
-       "    ...                        ...\n",
-       "    creator_url_original:      https://www.ncei.noaa.gov\n",
-       "    license:                   No constraints on data access or use\n",
-       "    comment:                   SSTs were observed by conventional thermometer...\n",
-       "    summary:                   ERSST.v5 is developed based on v4 after revisi...\n",
-       "    dataset_title:             NOAA Extended Reconstructed SST V5\n",
-       "    data_modified:             2020-09-07
" - ], - "text/plain": [ - " Size: 8MB\n", - "Dimensions: (lat: 89, lon: 180, time: 128)\n", - "Coordinates:\n", - " * lat (lat) float32 356B 88.0 86.0 84.0 82.0 ... -82.0 -84.0 -86.0 -88.0\n", - " * lon (lon) float32 720B 0.0 2.0 4.0 6.0 8.0 ... 352.0 354.0 356.0 358.0\n", - " * time (time) datetime64[ns] 1kB 2010-01-01 2010-02-01 ... 2020-08-01\n", - "Data variables:\n", - " sst (time, lat, lon) float32 8MB ...\n", - "Attributes: (12/37)\n", - " climatology: Climatology is based on 1971-2000 SST, Xue, Y....\n", - " description: In situ data: ICOADS2.5 before 2007 and NCEP i...\n", - " keywords_vocabulary: NASA Global Change Master Directory (GCMD) Sci...\n", - " keywords: Earth Science > Oceans > Ocean Temperature > S...\n", - " instrument: Conventional thermometers\n", - " source_comment: SSTs were observed by conventional thermometer...\n", - " ... ...\n", - " creator_url_original: https://www.ncei.noaa.gov\n", - " license: No constraints on data access or use\n", - " comment: SSTs were observed by conventional thermometer...\n", - " summary: ERSST.v5 is developed based on v4 after revisi...\n", - " dataset_title: NOAA Extended Reconstructed SST V5\n", - " data_modified: 2020-09-07" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "uri = \"s3://its-live-data/test-space/sample-data/sst.mnmean.nc\"\n", @@ -2736,8 +319,10 @@ "fs = fsspec.filesystem('s3', anon=True)\n", "\n", "fsspec_caching = {\n", - " \"cache_type\": \"blockcache\", # block cache stores blocks of fixed size and uses eviction using a LRU strategy. \n", - " \"block_size\": 8*1024*1024 # size in bytes per block, adjust depends on the file size but the recommended size is in the MB \n", + " \"cache_type\": \"blockcache\", # block cache stores blocks of fixed size and uses eviction using a LRU strategy.\n", + " \"block_size\": 8\n", + " * 1024\n", + " * 1024, # size in bytes per block, adjust depends on the file size but the recommended size is in the MB\n", "}\n", "\n", "# we are not using a context, we can use ds until we manually close it.\n", @@ -2747,7 +332,7 @@ }, { "cell_type": "markdown", - "id": "70a744c8-f1fb-4824-a681-e9fa50108ee1", + "id": "19", "metadata": {}, "source": [ "## Key Takeaways\n", @@ -2774,11 +359,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -2788,8 +368,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" + "pygments_lexer": "ipython3" } }, "nbformat": 4,