From c88c254b79f8097c1d3ca4c7cb5e4c5bcc1b93bb Mon Sep 17 00:00:00 2001 From: iammeghana Date: Mon, 19 Aug 2024 12:26:39 -0400 Subject: [PATCH] added utility functions to amsr --- book/chapters/amsr.ipynb | 381 ++++++++++++++++++++------------------- 1 file changed, 193 insertions(+), 188 deletions(-) diff --git a/book/chapters/amsr.ipynb b/book/chapters/amsr.ipynb index 50e98bc..24b8763 100644 --- a/book/chapters/amsr.ipynb +++ b/book/chapters/amsr.ipynb @@ -1353,100 +1353,12 @@ "Lets breakdown each step involved in feature extraction. \n" ] }, - { - "cell_type": "markdown", - "id": "e4266f6f", - "metadata": {}, - "source": [ - "### 3.6.3.1 Importing required python libraries to run the script\n", - "\n", - "- **Importing Libraries**: Essential libraries are imported for handling files, processing large datasets, and performing complex calculations.\n", - " - `os`, `shutil`, `subprocess`: For file handling, copying, and executing shell commands.\n", - " - `csv`, `h5py`, `numpy`, `pandas`: For reading/writing files, handling HDF5 datasets, numerical computations, and data manipulation.\n", - " - `dask`, `xarray`: To manage and process large datasets efficiently using parallel computing." - ] - }, - { - "cell_type": "code", - "execution_count": 248, - "id": "1345cc04", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import csv\n", - "import h5py\n", - "import shutil\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime\n", - "import dask\n", - "import dask.dataframe as dd\n", - "import dask.delayed as delayed\n", - "import dask.bag as db\n", - "import xarray as xr\n", - "import subprocess\n", - "\n", - "\n", - "# For demonstration purposes, we're using one week of data for training.\n", - "# The training period is set from December 24, 2022, to December 31, 2022.\n", - "train_start_date = \"2022-12-24\"\n", - "train_end_date = \"2022-12-31\"\n", - "\n", - "work_dir = \"../data/gridmet_test_run\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a8c9530", - "metadata": {}, - "source": [ - "### 3.6.3.2 Function to Copy .he5 Files from Source to Destination Directory\n", - "\n", - "The goal here is to copy all `.he5` files from a specified source directory to a destination directory.\n", - "\n", - "- `source_dir`: The directory where the `.he5` files are originally located.\n", - "- `destination_dir`: The target directory where the `.he5` files will be copied.\n", - "- `os.walk`: A function that traverses the directory tree, accessing all subdirectories and files.\n", - "- `shutil.copy`: A method used to copy the files from the source to the destination.\n", - "\n", - "The code specifically looks for files with the `.he5` extension to identify the relevant files for copying." 
- ] - }, - { - "cell_type": "code", - "execution_count": 249, - "id": "0f8c76b3", - "metadata": {}, - "outputs": [], - "source": [ - "def copy_he5_files(source_dir, destination_dir):\n", - " '''\n", - " Copy .he5 files from the source directory to the destination directory.\n", - "\n", - " Args:\n", - " source_dir (str): The source directory containing .he5 files to copy.\n", - " destination_dir (str): The destination directory where .he5 files will be copied.\n", - "\n", - " Returns:\n", - " None\n", - " '''\n", - " # Get a list of all subdirectories and files in the source directory\n", - " for root, dirs, files in os.walk(source_dir):\n", - " for file in files:\n", - " if file.endswith('.he5'):\n", - " # Get the absolute path of the source file\n", - " source_file_path = os.path.join(root, file)\n", - " # Copy the file to the destination directory\n", - " shutil.copy(source_file_path, destination_dir)" - ] - }, { "cell_type": "markdown", "id": "22e1f179", "metadata": {}, "source": [ - "### 3.6.3.2 Finding the Closest Grid Cell Index for Given Latitude and Longitude\n", + "### 3.6.3.1 Finding the Closest Grid Cell Index for Given Latitude and Longitude\n", "\n", "- `target_latitude`, `target_longitude`: The coordinates of the specific location you want to match to a grid cell.\n", "- `lat_grid`, `lon_grid`: Arrays representing the grid of latitude and longitude values across a region.\n", @@ -1495,7 +1407,7 @@ "id": "580125e7", "metadata": {}, "source": [ - "### 3.6.3.3 Function to Map SNOTEL Stations to AMSR Grid Coordinates and Create a CSV Mapper\n", + "### 3.6.3.2 Function to Map SNOTEL Stations to AMSR Grid Coordinates and Create a CSV Mapper\n", "\n", "Next we map SNOTEL station locations to the nearest AMSR grid cells and save this mapping as a CSV file.\n", "\n", @@ -1584,7 +1496,7 @@ "id": "e97be1bd", "metadata": {}, "source": [ - "### 3.6.3.4 Extracting and Saving AMSR Snow Data to CSV\n", + "### 3.6.3.3 Extracting and Saving AMSR Snow Data to CSV\n", "\n", "Next, we extract `snow water equivalent (SWE)` data from AMSR files for a range of dates, match it to specific locations (such as SNOTEL stations), and save the processed data into a CSV file. \n", "\n", @@ -1697,7 +1609,7 @@ "id": "971336a0", "metadata": {}, "source": [ - "### 3.6.3.5 Running the AMSR Data Extraction Process\n", + "### 3.6.3.4 Running the AMSR Data Extraction Process\n", "\n", "Here we extract and save AMSR snow data for a specified range of dates, linking it to SNOTEL stations, and storing the results in a CSV file.\n", "\n", @@ -1997,49 +1909,12 @@ "It provides a streamlined and automated pipeline for handling AMSR data, from initial download and grid alignment to final data processing and analysis.\n" ] }, - { - "cell_type": "markdown", - "id": "1a0828a7", - "metadata": {}, - "source": [ - "### 3.6.4.1 Importing Libraries and Setting Up for Snow Data Processing\n", - "\n", - "- `KDTree`: From `scipy.spatial`, used for performing efficient nearest-neighbor searches in spatial datasets.\n", - "- `plot_all_variables_in_one_csv`: A custom function from `convert_results_to_images`, used for visualizing processed data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 357, - "id": "1fc38737", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import h5py\n", - "import subprocess\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime\n", - "from scipy.spatial import KDTree\n", - "import time\n", - "from datetime import datetime, timedelta, date\n", - "import warnings\n", - "import sys\n", - "# from convert_results_to_images import plot_all_variables_in_one_c\n", - "\n", - "homedir = os.path.expanduser('~')\n", - "work_dir = \"../data/gridmet_test_run\"\n", - "test_start_date = \"2024-07-18\"\n", - "western_us_coords = \"../data/dem_file.tif.csv\"" - ] - }, { "cell_type": "markdown", "id": "8b0c18cd", "metadata": {}, "source": [ - "### 3.6.4.2 Find Closest Point in a Grid\n", + "### 3.6.4.1 Find Closest Point in a Grid\n", "\n", "- `target_latitude` (float): The latitude of the target point.\n", "- `target_longitude` (float): The longitude of the target point.\n", @@ -2066,62 +1941,12 @@ " return lat_idx, lon_idx, lat_grid[lat_idx, lon_idx], lon_grid[lat_idx, lon_idx]" ] }, - { - "cell_type": "markdown", - "id": "d009d825", - "metadata": {}, - "source": [ - "### 3.6.4.3 Identify Binary File\n", - "\n", - "Here we determine whether a given file is a binary file or a text file.\n", - "\n", - "- We attempt to open the file in binary mode (`'rb'`) and read a chunk of bytes (1024 bytes).\n", - "- And the we check for null bytes (`b'\\x00'`), which are common in binary files. If a null byte is found, then it is binary file.\n", - "- Next, we check for a high percentage of non-printable ASCII characters by converting the byte chunk to characters and filtering out non-printable ones. If the chunk has no printable characters, the file is considered binary.\n", - "- If neither of the above conditions are met, the function assumes the file is a text file.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 359, - "id": "408f81be", - "metadata": {}, - "outputs": [], - "source": [ - "def is_binary(file_path):\n", - " try:\n", - " with open(file_path, 'rb') as file:\n", - " # Read a chunk of bytes from the file\n", - " chunk = file.read(1024)\n", - "\n", - " # Check for null bytes, a common indicator of binary data\n", - " if b'\\x00' in chunk:\n", - " return True\n", - "\n", - " # Check for a high percentage of non-printable ASCII characters\n", - " text_characters = \"\".join(chr(byte) for byte in chunk if 32 <= byte <= 126)\n", - " if not text_characters:\n", - " return True\n", - "\n", - " # If none of the binary indicators are found, assume it's a text file\n", - " return False\n", - "\n", - " except FileNotFoundError:\n", - " print(f\"File '{file_path}' not found.\")\n", - " return False\n", - " except Exception as e:\n", - " print(f\"An error occurred: {e}\")\n", - " return False\n", - " " - ] - }, { "cell_type": "markdown", "id": "b7d58f9e", "metadata": {}, "source": [ - "### 3.6.4.4 Find the closest grid point indices for a target latitude and longitude using KDTree\n", + "### 3.6.4.2 Find the closest grid point indices for a target latitude and longitude using KDTree\n", "Here we find the closest grid point indices to a given target latitude and longitude using a KDTree for efficient spatial searching.\n", "\n", "- `lat_idx` (int): The index of the closest latitude in the grid.\n", @@ -2181,7 +2006,7 @@ "id": "502abe49", "metadata": {}, "source": [ - "### 3.6.4.5 Find the closest grid point indices for a target latitude and 
longitude.\n", + "### 3.6.4.3 Find the closest grid point indices for a target latitude and longitude.\n", "\n", "Here we find the grid point in a latitude-longitude array that is closest to a given target latitude and longitude.\n", "\n", @@ -2228,9 +2053,9 @@ "id": "de7ff44d", "metadata": {}, "source": [ - "### 3.6.4.6 Preparing the AMSR to GridMET Mapper\n", + "### 3.6.4.4 Preparing the AMSR to GridMET Mapper\n", "\n", - "The goal here is to create a mapping between AMSR grid data and GridMET grid points, saving the results to a CSV file. In `3.6.3.3`, we mapped SNOTEL stations to AMSR grid coordinates. Here, we map the gridMET grid to AMSR coordinates.\n", + "The goal here is to create a mapping between AMSR grid data and GridMET grid points, saving the results to a CSV file. \n", "\n", "- `target_csv_path`: The file path where the mapping between AMSR and GridMET grid points will be saved as a CSV file.\n", "- `target_amsr_hdf_path`: The path where the AMSR data file is stored or will be downloaded to if it doesn’t exist.- `western_us_coords`: A CSV file containing the latitude and longitude of GridMET grid points for the western U.S.\n", @@ -2318,7 +2143,7 @@ "id": "ac65dccd", "metadata": {}, "source": [ - "### 3.6.4.7 Downloading and Converting AMSR Snow Data to DEM Format\n", + "### 3.6.4.5 Downloading and Converting AMSR Snow Data to DEM Format\n", "\n", "Here we automate the downloading, conversion, and saving of AMSR data aligned with a DEM grid.\n", "And also adds a cumulative sum column to a DataFrame, useful for tracking cumulative metrics over time.\n", @@ -2448,7 +2273,7 @@ "id": "d9c56467", "metadata": {}, "source": [ - "### 3.6.4.8 Aggregate Cumulative AMSR Snow Data and Export to CSV\n", + "### 3.6.4.6 Aggregate Cumulative AMSR Snow Data and Export to CSV\n", "\n", "The goal of this code is to calculate the `cumulative Snow Water Equivalent (SWE)` values from AMSR data over a specific period, filling any gaps in the data, and saving the cumulative results into a CSV file. This is particularly useful for analyzing long-term snow accumulation trends.\n", "\n", @@ -2594,7 +2419,7 @@ "id": "2de48c38", "metadata": {}, "source": [ - "### 3.6.4.9 Interpolate Missing Values and Calculate Cumulative SWE In-Place for AMSR Data\n", + "### 3.6.4.7 Interpolate Missing Values and Calculate Cumulative SWE In-Place for AMSR Data\n", "\n", "Here we aim to ensure that any missing or anomalous data points within a specific column are handled appropriately through interpolation, and then a cumulative sum is calculated. \n", "\n", @@ -2666,7 +2491,7 @@ "id": "564e4d15", "metadata": {}, "source": [ - "### 3.6.4.10 Running the AMSR Data Extraction Process\n", + "### 3.6.4.8 Running the AMSR Data Extraction Process\n", "This script is to handle the entire workflow, from data preparation to the generation of cumulative time series data.\n", "\n", "- `prepare_amsr_grid_mapper()`: It maps the AMSR grid to the gridMET grid, preparing the necessary data for further processing.\n", @@ -2767,6 +2592,186 @@ "get_cumulative_amsr_data(force=False)\n", "input_time_series_file = f'{work_dir}/testing_ready_amsr_{test_start_date}_cumulative.csv_gap_filled.csv'\n" ] + }, + { + "cell_type": "markdown", + "id": "826941f6", + "metadata": {}, + "source": [ + "## Utility Functions for Feature Extraction\n", + "\n", + "The following functions are categorized as utility functions. These functions are not central to the main discussion but play a supportive role by providing necessary functionality. 
They can be referenced as needed throughout the chapter to keep the main code examples focused.\n", + "\n", + "### 1. Importing Required Python Libraries to Run the Script\n", + "\n", + "- **Importing Libraries**: Essential libraries are imported for handling files, processing large datasets, and performing complex calculations.\n", + "  - `os`, `shutil`, `subprocess`: For file handling, copying, and executing shell commands.\n", + "  - `csv`, `h5py`, `numpy`, `pandas`: For reading/writing files, handling HDF5 datasets, numerical computations, and data manipulation.\n", + "  - `dask`, `xarray`: To manage and process large datasets efficiently using parallel computing.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "134fc5cc", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import csv\n", + "import h5py\n", + "import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import dask\n", + "import dask.dataframe as dd\n", + "import dask.delayed as delayed\n", + "import dask.bag as db\n", + "import xarray as xr\n", + "import subprocess\n", + "\n", + "\n", + "# For demonstration purposes, we're using one week of data for training.\n", + "# The training period is set from December 24, 2022, to December 31, 2022.\n", + "train_start_date = \"2022-12-24\"\n", + "train_end_date = \"2022-12-31\"\n", + "\n", + "work_dir = \"../data/gridmet_test_run\"" + ] + }, + { + "cell_type": "markdown", + "id": "36257e94", + "metadata": {}, + "source": [ + "### 2. Function to Copy .he5 Files from Source to Destination Directory\n", + "\n", + "The goal here is to copy all `.he5` files from a specified source directory to a destination directory.\n", + "\n", + "- `source_dir`: The directory where the `.he5` files are originally located.\n", + "- `destination_dir`: The target directory where the `.he5` files will be copied.\n", + "- `os.walk`: A function that traverses the directory tree, accessing all subdirectories and files.\n", + "- `shutil.copy`: A method used to copy the files from the source to the destination.\n", + "\n", + "The code specifically looks for files with the `.he5` extension to identify the relevant files for copying. A brief usage sketch follows the function definition below." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4c7711af", + "metadata": {}, + "outputs": [], + "source": [ + "def copy_he5_files(source_dir, destination_dir):\n", + "    '''\n", + "    Copy .he5 files from the source directory to the destination directory.\n", + "\n", + "    Args:\n", + "        source_dir (str): The source directory containing .he5 files to copy.\n", + "        destination_dir (str): The destination directory where .he5 files will be copied.\n", + "\n", + "    Returns:\n", + "        None\n", + "    '''\n", + "    # Get a list of all subdirectories and files in the source directory\n", + "    for root, dirs, files in os.walk(source_dir):\n", + "        for file in files:\n", + "            if file.endswith('.he5'):\n", + "                # Get the absolute path of the source file\n", + "                source_file_path = os.path.join(root, file)\n", + "                # Copy the file to the destination directory\n", + "                shutil.copy(source_file_path, destination_dir)" + ] + },
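+ { + "cell_type": "markdown", + "id": "b3f1a2c9", + "metadata": {}, + "source": [ + "The next cell is a minimal usage sketch for `copy_he5_files`, not part of the original pipeline. The `amsr_raw` and `amsr_copy` folder names are hypothetical placeholders; substitute the directories used in your own setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9d2e4f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothetical source and destination directories, for demonstration only.\n", + "source_dir = f\"{work_dir}/amsr_raw\"\n", + "destination_dir = f\"{work_dir}/amsr_copy\"\n", + "\n", + "# shutil.copy expects the destination directory to already exist.\n", + "os.makedirs(destination_dir, exist_ok=True)\n", + "\n", + "copy_he5_files(source_dir, destination_dir)" + ] + },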
+ { + "cell_type": "markdown", + "id": "aab13cdf", + "metadata": {}, + "source": [ + "## Utility Functions for Data Processing and Analysis Pipeline\n", + "\n", + "### 1. Library Imports and Setup for Snow Data\n", + "\n", + "- `KDTree`: From `scipy.spatial`, used for performing efficient nearest-neighbor searches in spatial datasets.\n", + "- `plot_all_variables_in_one_csv`: A custom function from `convert_results_to_images`, used for visualizing processed data; its import is left commented out below and can be enabled when that module is available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6d98378", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import h5py\n", + "import subprocess\n", + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.spatial import KDTree\n", + "import time\n", + "from datetime import datetime, timedelta, date\n", + "import warnings\n", + "import sys\n", + "# from convert_results_to_images import plot_all_variables_in_one_csv\n", + "\n", + "homedir = os.path.expanduser('~')\n", + "work_dir = \"../data/gridmet_test_run\"\n", + "test_start_date = \"2024-07-18\"\n", + "western_us_coords = \"../data/dem_file.tif.csv\"" + ] + }, + { + "cell_type": "markdown", + "id": "2550b08d", + "metadata": {}, + "source": [ + "### 2. Identifying Binary Files\n", + "\n", + "Here we determine whether a given file is a binary file or a text file.\n", + "\n", + "- We attempt to open the file in binary mode (`'rb'`) and read a chunk of 1024 bytes.\n", + "- Then we check for null bytes (`b'\x00'`), which are common in binary files. If a null byte is found, the file is treated as binary.\n", + "- Next, we keep only the printable ASCII characters in the chunk. If none of the bytes are printable, the file is considered binary.\n", + "- If neither of the above conditions is met, the function assumes the file is a text file. A small self-check example follows the function below." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "7004281e", + "metadata": {}, + "outputs": [], + "source": [ + "def is_binary(file_path):\n", + "    try:\n", + "        with open(file_path, 'rb') as file:\n", + "            # Read a chunk of bytes from the file\n", + "            chunk = file.read(1024)\n", + "\n", + "            # Check for null bytes, a common indicator of binary data\n", + "            if b'\x00' in chunk:\n", + "                return True\n", + "\n", + "            # Keep only the printable ASCII characters; if none remain, treat the file as binary\n", + "            text_characters = \"\".join(chr(byte) for byte in chunk if 32 <= byte <= 126)\n", + "            if not text_characters:\n", + "                return True\n", + "\n", + "            # If none of the binary indicators are found, assume it's a text file\n", + "            return False\n", + "\n", + "    except FileNotFoundError:\n", + "        print(f\"File '{file_path}' not found.\")\n", + "        return False\n", + "    except Exception as e:\n", + "        print(f\"An error occurred: {e}\")\n", + "        return False\n", + "    " + ] + },
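+ { + "cell_type": "markdown", + "id": "f2a8c1d3", + "metadata": {}, + "source": [ + "A minimal sanity check for `is_binary`, assuming the cells above have been run. It writes two tiny demo files with hypothetical names into `work_dir` and confirms that each is classified as expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7e9b4a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothetical demo file names, used only for this self-check.\n", + "text_path = f\"{work_dir}/demo_text.txt\"\n", + "binary_path = f\"{work_dir}/demo_binary.dat\"\n", + "\n", + "# A small plain-text file: printable ASCII, no null bytes.\n", + "with open(text_path, 'w') as f:\n", + "    f.write('swe,date,station')\n", + "\n", + "# A small binary file: the null byte marks it as binary.\n", + "with open(binary_path, 'wb') as f:\n", + "    f.write(b'\x00\x01\x02\x03')\n", + "\n", + "print(is_binary(text_path))   # expected: False\n", + "print(is_binary(binary_path))  # expected: True" + ] + } } ], "metadata": {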