From 9e52c4bd19d68c25fc6af8eb3e81b97a4086b018 Mon Sep 17 00:00:00 2001 From: Jordan Laser Date: Thu, 11 Jan 2024 09:38:00 -0700 Subject: [PATCH] update readme and process defaults --- README.md | 25 ++++++++++++------- forcingprocessor/README.md | 8 +++--- .../src/forcingprocessor/forcingprocessor.py | 2 +- python/configure-datastream.py | 9 ++++--- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 646ea8c5..c6cc7be8 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,13 @@ The datastream automates the process of collecting and formatting input data for NextGen, orchestrating the NextGen run through NextGen In a Box (NGIAB), and handling outputs. In it's current implementation, the datastream is a shell script that orchestrates each step in the process. ## Install -If you'd like to just run the stream, clone this repo. The stream will handle initialization and installation of the datastream tools. To utilize the individual tools in the stream, see their respective readme's for installation instructions. +If you'd like to run the stream, clone this repo and execute the command below. The stream will handle initialization and installation of the datastream tools. To utilize the individual tools in the stream, see their respective readme's for installation instructions. 
## Run it ``` /ngen-datastream/scripts/stream.sh /ngen-datastream/configs/conf_datastream_daily.json ``` requires `jq` and `wget` -also requires `pip install pytz` ### Example `conf_datastream.json` ``` { "globals" : { "start_date" : "", "end_date" : "", "data_dir" : "ngen-datastream-test", "resource_dir" : "datastream-resources-dev", "relative_to" : "/ngen-datastream/data" - "subset_id" : "" + }, + + "subset" : { + "id_type" : "", + "id" : "", + "version" : "" } } ``` @@ -32,9 +36,11 @@ also requires `pip install pytz` | start_time | Start simulation time (YYYYMMDDHHMM) | :white_check_mark: | | end_time | End simulation time (YYYYMMDDHHMM) | :white_check_mark: | | data_dir | Name used in constructing the parent directory of the datastream. Must not exist prior to datastream run | :white_check_mark: | -| resource_dir | Folder name that contains the datastream resources. If not provided, datastream will create this folder with default options | | +| resource_dir | Folder name that contains the datastream resources. If not provided, datastream will create this folder with [default options](#datastream-resources-defaults) | | | relative_path | Absolute path to be prepended to any other path given in configuration file | | -| subset_id | catchment id to subset. If not provided, the geopackage in the resource_dir will define the spatial domain in entirety | | +| id_type | id type corresponding to "id" [See hfsubset for options](https://github.com/LynkerIntel/hfsubset) | | +| id | catchment id to subset. If not provided, spatial domain is set to CONUS [See hfsubset for options](https://github.com/LynkerIntel/hfsubset) | | +| version | hydrofabric version [See hfsubset for options](https://github.com/LynkerIntel/hfsubset) | | ## NextGen Datastream Directory Stucture ``` @@ -61,7 +67,7 @@ datastream-configs/ ├── conf_nwmurl.json ``` ### `datastream-resources/` -Copied into `data_dir` if user supplied, generated if not. 
Holds the data files required to perform computations required by the datastream. The user can supply this directory by pointing the configuration file to `resource_dir`. If not given by the user, datastream will generate this folder with these [defaults](#resource_dir). If the user executes the stream in this way, there is no control over the spatial domain. This option is intended for demonstration purposes only. +Copied into `data_dir` if user supplied, generated with defaults if not. Holds the data files required to perform computations required by the datastream. The user can supply this directory by pointing the configuration file to `resource_dir`. If not given by the user, datastream will generate this folder with these [defaults](#resource_dir). If the user executes the stream in this way, there is no control over the spatial domain. ``` datastream-resources/ │ ├── ngen-configs/ | ├── .gpkg | ├── .json | ├── .nc ``` -`ngen-configs/` holds all non-hydrofabric configuration files for NextGen (`realizion.json`,`config.ini`) +#### `ngen-configs/` holds all non-hydrofabric configuration files for NextGen (`realization.json`, `config.ini`) -`datastream-resources/` Defaults +#### `datastream-resources/` Defaults ``` GRID_FILE_DEFAULT="https://ngenresourcesdev.s3.us-east-2.amazonaws.com/nwm.t00z.short_range.forcing.f001.conus.nc" NGEN_CONF_DEFAULT="https://ngenresourcesdev.s3.us-east-2.amazonaws.com/ngen-run-pass/configs/config.ini" NGEN_REAL_DEFAULT="https://ngenresourcesdev.s3.us-east-2.amazonaws.com/ngen-run-pass/configs/realization.json" -GEOPACKAGE_DEFAULT="https://lynker-spatial.s3.amazonaws.com/v20.1/gpkg/nextgen_01.gpkg" +WEIGHTS_DEFAULT="https://ngenresourcesdev.s3.us-east-2.amazonaws.com/weights_conus_v21.json" +GEOPACKAGE_DEFAULT="https://lynker-spatial.s3.amazonaws.com/v20.1/conus.gpkg" ``` ### Useful Hacks diff --git a/forcingprocessor/README.md b/forcingprocessor/README.md index 87533587..b108c698 100644 --- a/forcingprocessor/README.md +++ b/forcingprocessor/README.md @@ 
-59,8 +59,8 @@ See the docker README for example run commands from the container. |-------------------|--------------------------------|----------| | verbose | Get print statements, defaults to false | :white_check_mark: | | collect_stats | Collect forcing metadata, defaults to true | :white_check_mark: | -| proc_process | Number of data processing threads, defaults to 80% available cores | | -| write_process | Number of writing threads, defaults to 100% available cores | | +| proc_process | Number of data processing processes, defaults to 50% available cores | | +| write_process | Number of writing processes, defaults to 100% available cores | | | nfile_chunk | Number of files to process each write, defaults to 1000000. Only set this if experiencing memory constraints due to large number of nwm forcing files | | ## nwm_file @@ -91,7 +91,7 @@ In order to retrieve forcing data from a NWM grid for a given catchment, the ind python weight_generator.py ``` -The weight generator will input an example NWM forcing netcdf to reference the NWM grid, a geopackage that contains all of the catchments the user wants weights for, and a file name for the weight file. Subsetted geopackages can be made with [hfsubset](https://github.com/LynkerIntel/hfsubset). Python based subsetting tools are available [here](https://github.com/CIROH-UA/ngen-datastream/tree/main/subsetting), but plans exist to deprecate this as functionality is built out in hfsubset. +The weight generator will input an example NWM forcing netcdf to reference the NWM grid, a geopackage that contains all of the catchments the user wants weights for, and a file name for the weight file. Subsetted geopackages can be made with [hfsubset](https://github.com/LynkerIntel/hfsubset). ## Run Notes -This tool is CPU, memory, and I/O intensive. For the best performance, run with `proc_threads` equal to than half of available cores and `write_threads` equal to the number of available cores. 
Best to experiment with your resources to find out what works best. These options default to 80% and 100% available cores respectively. +This tool is CPU, memory, and I/O intensive. For the best performance, run with `proc_process` equal to half of available cores and `write_process` equal to the number of available cores. Best to experiment with your resources to find out what works best. These options default to 50% and 100% available cores respectively. diff --git a/forcingprocessor/src/forcingprocessor/forcingprocessor.py b/forcingprocessor/src/forcingprocessor/forcingprocessor.py index 06ec92cb..dbb32dc5 100644 --- a/forcingprocessor/src/forcingprocessor/forcingprocessor.py +++ b/forcingprocessor/src/forcingprocessor/forcingprocessor.py @@ -461,7 +461,7 @@ def prep_ngen_data(conf): write_process = conf["run"].get("write_process",None) nfile_chunk = conf["run"].get("nfile_chunk",None) - if proc_process is None: proc_process = int(os.cpu_count() * 0.8) + if proc_process is None: proc_process = int(os.cpu_count() * 0.5) if write_process is None: write_process = os.cpu_count() if nfile_chunk is None: nfile_chunk = 100000 diff --git a/python/configure-datastream.py b/python/configure-datastream.py index 6d881465..29c2f7f3 100644 --- a/python/configure-datastream.py +++ b/python/configure-datastream.py @@ -28,8 +28,10 @@ def create_ds_confs_daily(conf, today, tomorrow): "output_file_type" : "csv", }, "run" : { - "verbose" : True, - "collect_stats" : True + "verbose" : True, + "collect_stats" : True, + "proc_process" : int(os.cpu_count() * 0.8), + "write_process" : os.cpu_count() } } @@ -43,7 +45,8 @@ def create_ds_confs_daily(conf, today, tomorrow): "meminput" : 0, "urlbaseinput" : 7, "fcst_cycle" : [0], - "lead_time" : [x+1 for x in range(24)] + "lead_time" : [1] + # "lead_time" : [x+1 for x in range(24)] } conf['forcingprcoessor'] = fp_conf