From c88c254b79f8097c1d3ca4c7cb5e4c5bcc1b93bb Mon Sep 17 00:00:00 2001 From: iammeghana Date: Mon, 19 Aug 2024 12:26:39 -0400 Subject: [PATCH] added utility functions to amsr --- book/chapters/amsr.ipynb | 381 ++++++++++++++++++++------------------- 1 file changed, 193 insertions(+), 188 deletions(-) diff --git a/book/chapters/amsr.ipynb b/book/chapters/amsr.ipynb index 50e98bc..24b8763 100644 --- a/book/chapters/amsr.ipynb +++ b/book/chapters/amsr.ipynb @@ -1353,100 +1353,12 @@ "Lets breakdown each step involved in feature extraction. \n" ] }, - { - "cell_type": "markdown", - "id": "e4266f6f", - "metadata": {}, - "source": [ - "### 3.6.3.1 Importing required python libraries to run the script\n", - "\n", - "- **Importing Libraries**: Essential libraries are imported for handling files, processing large datasets, and performing complex calculations.\n", - " - `os`, `shutil`, `subprocess`: For file handling, copying, and executing shell commands.\n", - " - `csv`, `h5py`, `numpy`, `pandas`: For reading/writing files, handling HDF5 datasets, numerical computations, and data manipulation.\n", - " - `dask`, `xarray`: To manage and process large datasets efficiently using parallel computing." - ] - }, - { - "cell_type": "code", - "execution_count": 248, - "id": "1345cc04", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import csv\n", - "import h5py\n", - "import shutil\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime\n", - "import dask\n", - "import dask.dataframe as dd\n", - "import dask.delayed as delayed\n", - "import dask.bag as db\n", - "import xarray as xr\n", - "import subprocess\n", - "\n", - "\n", - "# For demonstration purposes, we're using one week of data for training.\n", - "# The training period is set from December 24, 2022, to December 31, 2022.\n", - "train_start_date = \"2022-12-24\"\n", - "train_end_date = \"2022-12-31\"\n", - "\n", - "work_dir = \"../data/gridmet_test_run\"" - ] - }, - { - "cell_type": "markdown", - "id": "8a8c9530", - "metadata": {}, - "source": [ - "### 3.6.3.2 Function to Copy .he5 Files from Source to Destination Directory\n", - "\n", - "The goal here is to copy all `.he5` files from a specified source directory to a destination directory.\n", - "\n", - "- `source_dir`: The directory where the `.he5` files are originally located.\n", - "- `destination_dir`: The target directory where the `.he5` files will be copied.\n", - "- `os.walk`: A function that traverses the directory tree, accessing all subdirectories and files.\n", - "- `shutil.copy`: A method used to copy the files from the source to the destination.\n", - "\n", - "The code specifically looks for files with the `.he5` extension to identify the relevant files for copying." 
- ] - }, - { - "cell_type": "code", - "execution_count": 249, - "id": "0f8c76b3", - "metadata": {}, - "outputs": [], - "source": [ - "def copy_he5_files(source_dir, destination_dir):\n", - " '''\n", - " Copy .he5 files from the source directory to the destination directory.\n", - "\n", - " Args:\n", - " source_dir (str): The source directory containing .he5 files to copy.\n", - " destination_dir (str): The destination directory where .he5 files will be copied.\n", - "\n", - " Returns:\n", - " None\n", - " '''\n", - " # Get a list of all subdirectories and files in the source directory\n", - " for root, dirs, files in os.walk(source_dir):\n", - " for file in files:\n", - " if file.endswith('.he5'):\n", - " # Get the absolute path of the source file\n", - " source_file_path = os.path.join(root, file)\n", - " # Copy the file to the destination directory\n", - " shutil.copy(source_file_path, destination_dir)" - ] - }, { "cell_type": "markdown", "id": "22e1f179", "metadata": {}, "source": [ - "### 3.6.3.2 Finding the Closest Grid Cell Index for Given Latitude and Longitude\n", + "### 3.6.3.1 Finding the Closest Grid Cell Index for Given Latitude and Longitude\n", "\n", "- `target_latitude`, `target_longitude`: The coordinates of the specific location you want to match to a grid cell.\n", "- `lat_grid`, `lon_grid`: Arrays representing the grid of latitude and longitude values across a region.\n", @@ -1495,7 +1407,7 @@ "id": "580125e7", "metadata": {}, "source": [ - "### 3.6.3.3 Function to Map SNOTEL Stations to AMSR Grid Coordinates and Create a CSV Mapper\n", + "### 3.6.3.2 Function to Map SNOTEL Stations to AMSR Grid Coordinates and Create a CSV Mapper\n", "\n", "Next we map SNOTEL station locations to the nearest AMSR grid cells and save this mapping as a CSV file.\n", "\n", @@ -1584,7 +1496,7 @@ "id": "e97be1bd", "metadata": {}, "source": [ - "### 3.6.3.4 Extracting and Saving AMSR Snow Data to CSV\n", + "### 3.6.3.3 Extracting and Saving AMSR Snow Data to CSV\n", "\n", "Next, we extract `snow water equivalent (SWE)` data from AMSR files for a range of dates, match it to specific locations (such as SNOTEL stations), and save the processed data into a CSV file. \n", "\n", @@ -1697,7 +1609,7 @@ "id": "971336a0", "metadata": {}, "source": [ - "### 3.6.3.5 Running the AMSR Data Extraction Process\n", + "### 3.6.3.4 Running the AMSR Data Extraction Process\n", "\n", "Here we extract and save AMSR snow data for a specified range of dates, linking it to SNOTEL stations, and storing the results in a CSV file.\n", "\n", @@ -1997,49 +1909,12 @@ "It provides a streamlined and automated pipeline for handling AMSR data, from initial download and grid alignment to final data processing and analysis.\n" ] }, - { - "cell_type": "markdown", - "id": "1a0828a7", - "metadata": {}, - "source": [ - "### 3.6.4.1 Importing Libraries and Setting Up for Snow Data Processing\n", - "\n", - "- `KDTree`: From `scipy.spatial`, used for performing efficient nearest-neighbor searches in spatial datasets.\n", - "- `plot_all_variables_in_one_csv`: A custom function from `convert_results_to_images`, used for visualizing processed data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 357, - "id": "1fc38737", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import h5py\n", - "import subprocess\n", - "import pandas as pd\n", - "import numpy as np\n", - "from datetime import datetime\n", - "from scipy.spatial import KDTree\n", - "import time\n", - "from datetime import datetime, timedelta, date\n", - "import warnings\n", - "import sys\n", - "# from convert_results_to_images import plot_all_variables_in_one_c\n", - "\n", - "homedir = os.path.expanduser('~')\n", - "work_dir = \"../data/gridmet_test_run\"\n", - "test_start_date = \"2024-07-18\"\n", - "western_us_coords = \"../data/dem_file.tif.csv\"" - ] - }, { "cell_type": "markdown", "id": "8b0c18cd", "metadata": {}, "source": [ - "### 3.6.4.2 Find Closest Point in a Grid\n", + "### 3.6.4.1 Find Closest Point in a Grid\n", "\n", "- `target_latitude` (float): The latitude of the target point.\n", "- `target_longitude` (float): The longitude of the target point.\n", @@ -2066,62 +1941,12 @@ " return lat_idx, lon_idx, lat_grid[lat_idx, lon_idx], lon_grid[lat_idx, lon_idx]" ] }, - { - "cell_type": "markdown", - "id": "d009d825", - "metadata": {}, - "source": [ - "### 3.6.4.3 Identify Binary File\n", - "\n", - "Here we determine whether a given file is a binary file or a text file.\n", - "\n", - "- We attempt to open the file in binary mode (`'rb'`) and read a chunk of bytes (1024 bytes).\n", - "- And the we check for null bytes (`b'\\x00'`), which are common in binary files. If a null byte is found, then it is binary file.\n", - "- Next, we check for a high percentage of non-printable ASCII characters by converting the byte chunk to characters and filtering out non-printable ones. If the chunk has no printable characters, the file is considered binary.\n", - "- If neither of the above conditions are met, the function assumes the file is a text file.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 359, - "id": "408f81be", - "metadata": {}, - "outputs": [], - "source": [ - "def is_binary(file_path):\n", - " try:\n", - " with open(file_path, 'rb') as file:\n", - " # Read a chunk of bytes from the file\n", - " chunk = file.read(1024)\n", - "\n", - " # Check for null bytes, a common indicator of binary data\n", - " if b'\\x00' in chunk:\n", - " return True\n", - "\n", - " # Check for a high percentage of non-printable ASCII characters\n", - " text_characters = \"\".join(chr(byte) for byte in chunk if 32 <= byte <= 126)\n", - " if not text_characters:\n", - " return True\n", - "\n", - " # If none of the binary indicators are found, assume it's a text file\n", - " return False\n", - "\n", - " except FileNotFoundError:\n", - " print(f\"File '{file_path}' not found.\")\n", - " return False\n", - " except Exception as e:\n", - " print(f\"An error occurred: {e}\")\n", - " return False\n", - " " - ] - }, { "cell_type": "markdown", "id": "b7d58f9e", "metadata": {}, "source": [ - "### 3.6.4.4 Find the closest grid point indices for a target latitude and longitude using KDTree\n", + "### 3.6.4.2 Find the closest grid point indices for a target latitude and longitude using KDTree\n", "Here we find the closest grid point indices to a given target latitude and longitude using a KDTree for efficient spatial searching.\n", "\n", "- `lat_idx` (int): The index of the closest latitude in the grid.\n", @@ -2181,7 +2006,7 @@ "id": "502abe49", "metadata": {}, "source": [ - "### 3.6.4.5 Find the closest grid point indices for a target latitude and 
longitude.\n", + "### 3.6.4.3 Find the closest grid point indices for a target latitude and longitude.\n", "\n", "Here we find the grid point in a latitude-longitude array that is closest to a given target latitude and longitude.\n", "\n", @@ -2228,9 +2053,9 @@ "id": "de7ff44d", "metadata": {}, "source": [ - "### 3.6.4.6 Preparing the AMSR to GridMET Mapper\n", + "### 3.6.4.4 Preparing the AMSR to GridMET Mapper\n", "\n", - "The goal here is to create a mapping between AMSR grid data and GridMET grid points, saving the results to a CSV file. In `3.6.3.3`, we mapped SNOTEL stations to AMSR grid coordinates. Here, we map the gridMET grid to AMSR coordinates.\n", + "The goal here is to create a mapping between AMSR grid data and GridMET grid points, saving the results to a CSV file. \n", "\n", "- `target_csv_path`: The file path where the mapping between AMSR and GridMET grid points will be saved as a CSV file.\n", "- `target_amsr_hdf_path`: The path where the AMSR data file is stored or will be downloaded to if it doesn’t exist.- `western_us_coords`: A CSV file containing the latitude and longitude of GridMET grid points for the western U.S.\n", @@ -2318,7 +2143,7 @@ "id": "ac65dccd", "metadata": {}, "source": [ - "### 3.6.4.7 Downloading and Converting AMSR Snow Data to DEM Format\n", + "### 3.6.4.5 Downloading and Converting AMSR Snow Data to DEM Format\n", "\n", "Here we automate the downloading, conversion, and saving of AMSR data aligned with a DEM grid.\n", "And also adds a cumulative sum column to a DataFrame, useful for tracking cumulative metrics over time.\n", @@ -2448,7 +2273,7 @@ "id": "d9c56467", "metadata": {}, "source": [ - "### 3.6.4.8 Aggregate Cumulative AMSR Snow Data and Export to CSV\n", + "### 3.6.4.6 Aggregate Cumulative AMSR Snow Data and Export to CSV\n", "\n", "The goal of this code is to calculate the `cumulative Snow Water Equivalent (SWE)` values from AMSR data over a specific period, filling any gaps in the data, and saving the cumulative results into a CSV file. This is particularly useful for analyzing long-term snow accumulation trends.\n", "\n", @@ -2594,7 +2419,7 @@ "id": "2de48c38", "metadata": {}, "source": [ - "### 3.6.4.9 Interpolate Missing Values and Calculate Cumulative SWE In-Place for AMSR Data\n", + "### 3.6.4.7 Interpolate Missing Values and Calculate Cumulative SWE In-Place for AMSR Data\n", "\n", "Here we aim to ensure that any missing or anomalous data points within a specific column are handled appropriately through interpolation, and then a cumulative sum is calculated. \n", "\n", @@ -2666,7 +2491,7 @@ "id": "564e4d15", "metadata": {}, "source": [ - "### 3.6.4.10 Running the AMSR Data Extraction Process\n", + "### 3.6.4.8 Running the AMSR Data Extraction Process\n", "This script is to handle the entire workflow, from data preparation to the generation of cumulative time series data.\n", "\n", "- `prepare_amsr_grid_mapper()`: It maps the AMSR grid to the gridMET grid, preparing the necessary data for further processing.\n", @@ -2767,6 +2592,186 @@ "get_cumulative_amsr_data(force=False)\n", "input_time_series_file = f'{work_dir}/testing_ready_amsr_{test_start_date}_cumulative.csv_gap_filled.csv'\n" ] + }, + { + "cell_type": "markdown", + "id": "826941f6", + "metadata": {}, + "source": [ + "## Utility Functions for Feature Extraction\n", + "\n", + "The following functions are categorized as utility functions. These functions are not central to the main discussion but play a supportive role by providing necessary functionality. 
They can be referenced as needed throughout the chapter to keep the main code examples focused.\n", + "\n", + "### 1. Importing Required Python Libraries to Run the Script\n", + "\n", + "- **Importing Libraries**: Essential libraries are imported for handling files, processing large datasets, and performing complex calculations.\n", + "  - `os`, `shutil`, `subprocess`: For file handling, copying, and executing shell commands.\n", + "  - `csv`, `h5py`, `numpy`, `pandas`: For reading/writing files, handling HDF5 datasets, numerical computations, and data manipulation.\n", + "  - `dask`, `xarray`: To manage and process large datasets efficiently using parallel computing.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "134fc5cc", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import csv\n", + "import h5py\n", + "import shutil\n", + "import numpy as np\n", + "import pandas as pd\n", + "from datetime import datetime\n", + "import dask\n", + "import dask.dataframe as dd\n", + "import dask.delayed as delayed\n", + "import dask.bag as db\n", + "import xarray as xr\n", + "import subprocess\n", + "\n", + "\n", + "# For demonstration purposes, we're using one week of data for training.\n", + "# The training period is set from December 24, 2022, to December 31, 2022.\n", + "train_start_date = \"2022-12-24\"\n", + "train_end_date = \"2022-12-31\"\n", + "\n", + "work_dir = \"../data/gridmet_test_run\"" + ] + }, + { + "cell_type": "markdown", + "id": "36257e94", + "metadata": {}, + "source": [ + "### 2. Function to Copy .he5 Files from Source to Destination Directory\n", + "\n", + "The goal here is to copy all `.he5` files from a specified source directory to a destination directory.\n", + "\n", + "- `source_dir`: The directory where the `.he5` files are originally located.\n", + "- `destination_dir`: The target directory where the `.he5` files will be copied.\n", + "- `os.walk`: A function that traverses the directory tree, accessing all subdirectories and files.\n", + "- `shutil.copy`: A method used to copy the files from the source to the destination.\n", + "\n", + "The code specifically looks for files with the `.he5` extension to identify the relevant files for copying. A brief usage sketch follows the function definition below." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4c7711af", + "metadata": {}, + "outputs": [], + "source": [ + "def copy_he5_files(source_dir, destination_dir):\n", + "    '''\n", + "    Copy .he5 files from the source directory to the destination directory.\n", + "\n", + "    Args:\n", + "        source_dir (str): The source directory containing .he5 files to copy.\n", + "        destination_dir (str): The destination directory where .he5 files will be copied.\n", + "\n", + "    Returns:\n", + "        None\n", + "    '''\n", + "    # Get a list of all subdirectories and files in the source directory\n", + "    for root, dirs, files in os.walk(source_dir):\n", + "        for file in files:\n", + "            if file.endswith('.he5'):\n", + "                # Get the absolute path of the source file\n", + "                source_file_path = os.path.join(root, file)\n", + "                # Copy the file to the destination directory\n", + "                shutil.copy(source_file_path, destination_dir)" + ] + },
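+ { + "cell_type": "markdown", + "id": "b3f1a2c9", + "metadata": {}, + "source": [ + "The next cell is a minimal usage sketch for `copy_he5_files`, not part of the original pipeline. The `amsr_raw` and `amsr_copy` folder names are hypothetical placeholders; substitute the directories used in your own setup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9d2e4f1", + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothetical source and destination directories, for demonstration only.\n", + "source_dir = f\"{work_dir}/amsr_raw\"\n", + "destination_dir = f\"{work_dir}/amsr_copy\"\n", + "\n", + "# shutil.copy expects the destination directory to already exist.\n", + "os.makedirs(destination_dir, exist_ok=True)\n", + "\n", + "copy_he5_files(source_dir, destination_dir)" + ] + },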
+ { + "cell_type": "markdown", + "id": "aab13cdf", + "metadata": {}, + "source": [ + "## Utility Functions for Data Processing and Analysis Pipeline\n", + "\n", + "### 1. Library Imports and Setup for Snow Data\n", + "\n", + "- `KDTree`: From `scipy.spatial`, used for performing efficient nearest-neighbor searches in spatial datasets.\n", + "- `plot_all_variables_in_one_csv`: A custom function from `convert_results_to_images`, used for visualizing processed data; its import is left commented out below and can be enabled when that module is available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6d98378", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import h5py\n", + "import subprocess\n", + "import pandas as pd\n", + "import numpy as np\n", + "from scipy.spatial import KDTree\n", + "import time\n", + "from datetime import datetime, timedelta, date\n", + "import warnings\n", + "import sys\n", + "# from convert_results_to_images import plot_all_variables_in_one_csv\n", + "\n", + "homedir = os.path.expanduser('~')\n", + "work_dir = \"../data/gridmet_test_run\"\n", + "test_start_date = \"2024-07-18\"\n", + "western_us_coords = \"../data/dem_file.tif.csv\"" + ] + }, + { + "cell_type": "markdown", + "id": "2550b08d", + "metadata": {}, + "source": [ + "### 2. Identifying Binary Files\n", + "\n", + "Here we determine whether a given file is a binary file or a text file.\n", + "\n", + "- We attempt to open the file in binary mode (`'rb'`) and read a chunk of 1024 bytes.\n", + "- Then we check for null bytes (`b'\x00'`), which are common in binary files. If a null byte is found, the file is treated as binary.\n", + "- Next, we keep only the printable ASCII characters in the chunk. If none of the bytes are printable, the file is considered binary.\n", + "- If neither of the above conditions is met, the function assumes the file is a text file. A small self-check example follows the function below." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "7004281e", + "metadata": {}, + "outputs": [], + "source": [ + "def is_binary(file_path):\n", + "    try:\n", + "        with open(file_path, 'rb') as file:\n", + "            # Read a chunk of bytes from the file\n", + "            chunk = file.read(1024)\n", + "\n", + "            # Check for null bytes, a common indicator of binary data\n", + "            if b'\x00' in chunk:\n", + "                return True\n", + "\n", + "            # Keep only the printable ASCII characters; if none remain, treat the file as binary\n", + "            text_characters = \"\".join(chr(byte) for byte in chunk if 32 <= byte <= 126)\n", + "            if not text_characters:\n", + "                return True\n", + "\n", + "            # If none of the binary indicators are found, assume it's a text file\n", + "            return False\n", + "\n", + "    except FileNotFoundError:\n", + "        print(f\"File '{file_path}' not found.\")\n", + "        return False\n", + "    except Exception as e:\n", + "        print(f\"An error occurred: {e}\")\n", + "        return False\n", + "    " + ] + },
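+ { + "cell_type": "markdown", + "id": "f2a8c1d3", + "metadata": {}, + "source": [ + "A minimal sanity check for `is_binary`, assuming the cells above have been run. It writes two tiny demo files with hypothetical names into `work_dir` and confirms that each is classified as expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7e9b4a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Hypothetical demo file names, used only for this self-check.\n", + "text_path = f\"{work_dir}/demo_text.txt\"\n", + "binary_path = f\"{work_dir}/demo_binary.dat\"\n", + "\n", + "# A small plain-text file: printable ASCII, no null bytes.\n", + "with open(text_path, 'w') as f:\n", + "    f.write('swe,date,station')\n", + "\n", + "# A small binary file: the null byte marks it as binary.\n", + "with open(binary_path, 'wb') as f:\n", + "    f.write(b'\x00\x01\x02\x03')\n", + "\n", + "print(is_binary(text_path))   # expected: False\n", + "print(is_binary(binary_path))  # expected: True" + ] + } } ], "metadata": {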