Feature output thresholded tiles (#11)

* Replaced the per pixel script with a script that creates per pixel tiles for the full model extent as well as per hectare tiles for forest extent (((TCD2000>30 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations), and per pixel tiles for forest extent. Tested on a few tiles locally and ready to try to run on r5d machine. Also, added new step to run_full_model.py but haven't tried testing it from that context or running it in a full model run yet. * Fine-tuning the number of processors to use for supplementary output creation. * Fine-tuning the number of processors to use for supplementary output creation. * Had to make a separate tile_id_list_outer list for the area, tcd, mangrove, etc. tiles being downloaded so it didn't conflict with the list of model output tiles being analyzed. * Had to make a separate tile_id_list_outer list for the area, tcd, mangrove, etc. tiles being downloaded so it didn't conflict with the list of model output tiles being analyzed. * Further fixing tile_id_list because the command line argument of "all" kept getting overwritten in the for loop. * Updated readme file to reflect changes from creating supplementary outputs, as well as general clarifications and corrections.
wri · Feb 11, 2021 · a51a51c · a51a51c
1 parent 786a766
commit a51a51c
Show file tree

Hide file tree

Showing 10 changed files with 521 additions and 370 deletions.
diff --git a/analyses/aggregate_results_to_4_km.py b/analyses/aggregate_results_to_4_km.py
@@ -192,11 +192,7 @@ def aggregate(tile, thresh, sensit_type):
         # Applies the tree cover density threshold to the 30x30m pixels
         if thresh > 0:
 
-            # in_window = np.ma.masked_where(tcd_window < thresh, in_window)
-            # in_window = in_window.filled(0)
-
-            # in_window = np.where((tcd_window > thresh), in_window, 0)
-            # in_window = np.where((tcd_window > thresh) | (gain_window == 1), in_window, 0)
+            # QCed this line before publication and then again afterwards in response to question from Lena Schulte-Uebbing at Wageningen Uni.
             in_window = np.where((tcd_window > thresh) | (gain_window == 1) | (mangrove_window != 0), in_window, 0)
 
         # Calculates the per-pixel value from the input tile value (/ha to /pixel)

diff --git a/analyses/create_supplementary_outputs.py b/analyses/create_supplementary_outputs.py
@@ -0,0 +1,148 @@
+'''
+Script to create three supplementary tiled outputs for each main model output (gross emissions, gross removals, net flux),
+which are already in per hectare values for full model extent:
+1. per pixel values for full model extent (all pixels included in model extent)
+2. per hectare values for forest extent (within the model extent, pixels that have TCD>30 OR Hansen gain OR mangrove biomass)
+3. per pixel values for forest extent
+The forest extent outputs are for sharing with partners because they limit the model to just the relevant pixels
+(those within forests).
+Forest extent is defined in the methods section of Harris et al. 2021 Nature Climate Change.
+It is roughly implemented in mp_model_extent.py but using TCD>0 rather thant TCD>30. Here, the TCD>30 requirement
+is implemented instead as a subset of the full model extent pixels.
+Forest extent is: ((TCD2000>30 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations.
+The WHRC AGB2000 and pre-2000 plantations conditions were set in mp_model_extent.py, so they don't show up here.
+'''
+
+import numpy as np
+from subprocess import Popen, PIPE, STDOUT, check_call
+import os
+import rasterio
+from rasterio.transform import from_origin
+import datetime
+import sys
+sys.path.append('../')
+import constants_and_names as cn
+import universal_util as uu
+
+def create_supplementary_outputs(tile_id, input_pattern, output_patterns, sensit_type):
+
+    # start time
+    start = datetime.datetime.now()
+
+    # Extracts the tile id, tile type, and bounding box for the tile
+    tile_id = uu.get_tile_id(tile_id)
+
+    # Names of inputs
+    focal_tile = '{0}_{1}.tif'.format(tile_id, input_pattern)
+    pixel_area = '{0}_{1}.tif'.format(cn.pattern_pixel_area, tile_id)
+    tcd = '{0}_{1}.tif'.format(cn.pattern_tcd, tile_id)
+    gain = '{0}_{1}.tif'.format(cn.pattern_gain, tile_id)
+    mangrove = '{0}_{1}.tif'.format(tile_id, cn.pattern_mangrove_biomass_2000)
+
+    # Names of outputs.
+    # Requires that output patterns be listed in main script in the correct order for here
+    # (currently, per pixel full extent, per hectare forest extent, per pixel forest extent).
+    per_pixel_full_extent = '{0}_{1}.tif'.format(tile_id, output_patterns[0])
+    per_hectare_forest_extent = '{0}_{1}.tif'.format(tile_id, output_patterns[1])
+    per_pixel_forest_extent = '{0}_{1}.tif'.format(tile_id, output_patterns[2])
+
+    # Opens input tiles for rasterio
+    in_src = rasterio.open(focal_tile)
+    # Grabs metadata about the tif, like its location/projection/cellsize
+    kwargs = in_src.meta
+    # Grabs the windows of the tile (stripes) so we can iterate over the entire tif without running out of memory
+    windows = in_src.block_windows(1)
+
+    pixel_area_src = rasterio.open(pixel_area)
+    tcd_src = rasterio.open(tcd)
+    gain_src = rasterio.open(gain)
+
+    try:
+        mangrove_src = rasterio.open(mangrove)
+        uu.print_log("    Mangrove tile found for {}".format(tile_id))
+    except:
+        uu.print_log("    No mangrove tile found for {}".format(tile_id))
+
+    uu.print_log("  Creating outputs for {}...".format(focal_tile))
+
+    kwargs.update(
+        driver='GTiff',
+        count=1,
+        compress='lzw',
+        nodata=0,
+        dtype='float32'
+    )
+
+    # Opens output tiles, giving them the arguments of the input tiles
+    per_pixel_full_extent_dst = rasterio.open(per_pixel_full_extent, 'w', **kwargs)
+    per_hectare_forest_extent_dst = rasterio.open(per_hectare_forest_extent, 'w', **kwargs)
+    per_pixel_forest_extent_dst = rasterio.open(per_pixel_forest_extent, 'w', **kwargs)
+
+    # Adds metadata tags to the output rasters
+
+    uu.add_rasterio_tags(per_pixel_full_extent_dst, sensit_type)
+    per_pixel_full_extent_dst.update_tags(
+        units='Mg CO2e/pixel over model duration (2001-20{})'.format(cn.loss_years))
+    per_pixel_full_extent_dst.update_tags(
+        source='per hectare full model extent tile')
+    per_pixel_full_extent_dst.update_tags(
+        extent='Full model extent: ((TCD2000>0 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations')
+
+    uu.add_rasterio_tags(per_hectare_forest_extent_dst, sensit_type)
+    per_hectare_forest_extent_dst.update_tags(
+        units='Mg CO2e/hectare over model duration (2001-20{})'.format(cn.loss_years))
+    per_hectare_forest_extent_dst.update_tags(
+        source='per hectare full model extent tile')
+    per_hectare_forest_extent_dst.update_tags(
+        extent='Forest extent: ((TCD2000>30 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations')
+
+    uu.add_rasterio_tags(per_pixel_forest_extent_dst, sensit_type)
+    per_pixel_forest_extent_dst.update_tags(
+        units='Mg CO2e/pixel over model duration (2001-20{})'.format(cn.loss_years))
+    per_pixel_forest_extent_dst.update_tags(
+        source='per hectare forest model extent tile')
+    per_pixel_forest_extent_dst.update_tags(
+        extent='Forest extent: ((TCD2000>30 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations')
+
+    if "net_flux" in focal_tile:
+        per_pixel_full_extent_dst.update_tags(
+            scale='Negative values are net sinks. Positive values are net sources.')
+        per_hectare_forest_extent_dst.update_tags(
+            scale='Negative values are net sinks. Positive values are net sources.')
+        per_pixel_forest_extent_dst.update_tags(
+            scale='Negative values are net sinks. Positive values are net sources.')
+
+
+    # Iterates across the windows of the input tiles
+    for idx, window in windows:
+
+        # Creates windows for each input tile
+        in_window = in_src.read(1, window=window)
+        pixel_area_window = pixel_area_src.read(1, window=window)
+        tcd_window = tcd_src.read(1, window=window)
+        gain_window = gain_src.read(1, window=window)
+
+        try:
+            mangrove_window = mangrove_src.read(1, window=window)
+        except:
+            mangrove_window = np.zeros((window.height, window.width), dtype='uint8')
+
+        # Output window for per pixel full extent raster
+        dst_window_per_pixel_full_extent = in_window * pixel_area_window / cn.m2_per_ha
+
+        # Output window for per hectare forest extent raster
+        # QCed this line before publication and then again afterwards in response to question from Lena Schulte-Uebbing at Wageningen Uni.
+        dst_window_per_hectare_forest_extent = np.where((tcd_window > cn.canopy_threshold) | (gain_window == 1) | (mangrove_window != 0), in_window, 0)
+
+        # Output window for per pixel forest extent raster
+        dst_window_per_pixel_forest_extent = dst_window_per_hectare_forest_extent * pixel_area_window / cn.m2_per_ha
+
+        # Writes arrays to output raster
+        per_pixel_full_extent_dst.write_band(1, dst_window_per_pixel_full_extent, window=window)
+        per_hectare_forest_extent_dst.write_band(1, dst_window_per_hectare_forest_extent, window=window)
+        per_pixel_forest_extent_dst.write_band(1, dst_window_per_pixel_forest_extent, window=window)
+
+    uu.print_log("  Output tiles created for {}...".format(tile_id))
+
+    # Prints information about the tile that was just processed
+    uu.end_of_fx_summary(start, tile_id, output_patterns[0])
diff --git a/analyses/mp_create_supplementary_outputs.py b/analyses/mp_create_supplementary_outputs.py
@@ -0,0 +1,200 @@
+'''
+Script to create three supplementary tiled outputs for each main model output (gross emissions, gross removals, net flux),
+which are already in per hectare values for full model extent:
+1. per pixel values for full model extent (all pixels included in model extent)
+2. per hectare values for forest extent (within the model extent, pixels that have TCD>30 OR Hansen gain OR mangrove biomass)
+3. per pixel values for forest extent
+The forest extent outputs are for sharing with partners because they limit the model to just the relevant pixels
+(those within forests).
+Forest extent is defined in the methods section of Harris et al. 2021 Nature Climate Change.
+It is roughly implemented in mp_model_extent.py but using TCD>0 rather thant TCD>30. Here, the TCD>30 requirement
+is implemented instead as a subset of the full model extent pixels.
+Forest extent is: ((TCD2000>30 AND WHRC AGB2000>0) OR Hansen gain=1 OR mangrove AGB2000>0) NOT IN pre-2000 plantations.
+The WHRC AGB2000 and pre-2000 plantations conditions were set in mp_model_extent.py, so they don't show up here.
+'''
+
+
+import multiprocessing
+from subprocess import Popen, PIPE, STDOUT, check_call
+from functools import partial
+import datetime
+import argparse
+import os
+import glob
+import sys
+sys.path.append('../')
+import constants_and_names as cn
+import universal_util as uu
+sys.path.append(os.path.join(cn.docker_app,'analyses'))
+import create_supplementary_outputs
+
+def mp_create_supplementary_outputs(sensit_type, tile_id_list, run_date = None):
+
+    os.chdir(cn.docker_base_dir)
+
+    # If a full model run is specified, the correct set of tiles for the particular script is listed
+    if tile_id_list == 'all':
+        # List of tiles to run in the model
+        tile_id_list_outer = uu.tile_list_s3(cn.net_flux_dir, sensit_type)
+
+    uu.print_log(tile_id_list_outer)
+    uu.print_log("There are {} tiles to process".format(str(len(tile_id_list_outer))) + "\n")
+
+
+    # Files to download for this script
+    download_dict = {
+        cn.cumul_gain_AGCO2_BGCO2_all_types_dir: [cn.pattern_cumul_gain_AGCO2_BGCO2_all_types],
+        cn.gross_emis_all_gases_all_drivers_biomass_soil_dir: [cn.pattern_gross_emis_all_gases_all_drivers_biomass_soil],
+        cn.net_flux_dir: [cn.pattern_net_flux]
+    }
+
+    # List of output directories and output file name patterns.
+    # Outputs must be in the same order as the download dictionary above, and then follow the same order for all outputs.
+    # Currently, it's: per pixel full extent, per hectare forest extent, per pixel forest extent.
+    output_dir_list = [
+                        cn.cumul_gain_AGCO2_BGCO2_all_types_per_pixel_full_extent_dir,
+                        cn.cumul_gain_AGCO2_BGCO2_all_types_forest_extent_dir,
+                        cn.cumul_gain_AGCO2_BGCO2_all_types_per_pixel_forest_extent_dir,
+                        cn.gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_full_extent_dir,
+                        cn.gross_emis_all_gases_all_drivers_biomass_soil_forest_extent_dir,
+                        cn.gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_forest_extent_dir,
+                        cn.net_flux_per_pixel_full_extent_dir,
+                        cn.net_flux_forest_extent_dir,
+                        cn.net_flux_per_pixel_forest_extent_dir]
+    output_pattern_list = [
+                            cn.pattern_cumul_gain_AGCO2_BGCO2_all_types_per_pixel_full_extent,
+                            cn.pattern_cumul_gain_AGCO2_BGCO2_all_types_forest_extent,
+                            cn.pattern_cumul_gain_AGCO2_BGCO2_all_types_per_pixel_forest_extent,
+                            cn.pattern_gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_full_extent,
+                            cn.pattern_gross_emis_all_gases_all_drivers_biomass_soil_forest_extent,
+                            cn.pattern_gross_emis_all_gases_all_drivers_biomass_soil_per_pixel_forest_extent,
+                            cn.pattern_net_flux_per_pixel_full_extent,
+                            cn.pattern_net_flux_forest_extent,
+                            cn.pattern_net_flux_per_pixel_forest_extent
+    ]
+
+
+    # Pixel area tiles-- necessary for calculating per pixel values
+    uu.s3_flexible_download(cn.pixel_area_dir, cn.pattern_pixel_area, cn.docker_base_dir, sensit_type, tile_id_list_outer)
+    # Tree cover density, Hansen gain, and mangrove biomass tiles-- necessary for masking to forest extent
+    uu.s3_flexible_download(cn.tcd_dir, cn.pattern_tcd, cn.docker_base_dir, sensit_type, tile_id_list_outer)
+    uu.s3_flexible_download(cn.gain_dir, cn.pattern_gain, cn.docker_base_dir, sensit_type, tile_id_list_outer)
+    uu.s3_flexible_download(cn.mangrove_biomass_2000_dir, cn.pattern_mangrove_biomass_2000, cn.docker_base_dir, sensit_type, tile_id_list_outer)
+
+    uu.print_log("Model outputs to process are:", download_dict)
+
+    # If the model run isn't the standard one, the output directory is changed
+    if sensit_type != 'std':
+        uu.print_log("Changing output directory and file name pattern based on sensitivity analysis")
+        output_dir_list = uu.alter_dirs(sensit_type, output_dir_list)
+
+    # A date can optionally be provided by the full model script or a run of this script.
+    # This replaces the date in constants_and_names.
+    if run_date is not None:
+        output_dir_list = uu.replace_output_dir_date(output_dir_list, run_date)
+
+
+    # Iterates through input tile sets
+    for key, values in download_dict.items():
+
+        # Sets the directory and pattern for the input being processed
+        input_dir = key
+        input_pattern = values[0]
+
+        # If a full model run is specified, the correct set of tiles for the particular script is listed.
+        # A new list is named so that tile_id_list stays as the command line argument.
+        if tile_id_list == 'all':
+            # List of tiles to run in the model
+            tile_id_list_input = uu.tile_list_s3(input_dir, sensit_type)
+        else:
+            tile_id_list_input = tile_id_list
+
+        uu.print_log(tile_id_list_input)
+        uu.print_log("There are {} tiles to process".format(str(len(tile_id_list_input))) + "\n")
+
+        uu.print_log("Downloading tiles from", input_dir)
+        uu.s3_flexible_download(input_dir, input_pattern, cn.docker_base_dir, sensit_type, tile_id_list_input)
+
+        # Blank list of output patterns, populated below
+        output_patterns = []
+
+        # Matches the output patterns with the input pattern.
+        # This requires that the output patterns be grouped by input pattern and be in the order described in
+        # the comment above.
+        if "gross_removals" in input_pattern:
+            output_patterns = output_pattern_list[0:3]
+        elif "gross_emis" in input_pattern:
+            output_patterns = output_pattern_list[3:6]
+        elif "net_flux" in input_pattern:
+            output_patterns = output_pattern_list[6:9]
+        else:
+            uu.exception_log("No output patterns found for input pattern. Please check.")
+
+        uu.print_log("Input pattern:", input_pattern)
+        uu.print_log("Output patterns:", output_patterns)
+
+        # Gross removals: 20 processors = >740 GB peak; 15 = 570 GB peak; 17 = 660 GB peak; 18 = 670 GB peak
+        # Gross emissions: 17 processors = 660 GB peak; 18 = 710 GB peak
+        if cn.count == 96:
+            processes = 18
+        else:
+            processes = 2
+        uu.print_log("Creating derivative outputs for {0} with {1} processors...".format(input_pattern, processes))
+        pool = multiprocessing.Pool(processes)
+        pool.map(partial(create_supplementary_outputs.create_supplementary_outputs, input_pattern=input_pattern,
+                         output_patterns=output_patterns, sensit_type=sensit_type), tile_id_list_input)
+        pool.close()
+        pool.join()
+
+        # # For single processor use
+        # for tile_id in tile_id_list_input:
+        #     create_supplementary_outputs.create_supplementary_outputs(tile_id, input_pattern, output_patterns, sensit_type)
+
+        # Checks the two forest extent output tiles created from each input tile for whether there is data in them.
+        # Because the extent is restricted in the forest extent pixels, some tiles with pixels in the full extent
+        # version may not have pixels in the forest extent version.
+        for output_pattern in output_patterns[1:3]:
+            if cn.count <= 2:  # For local tests
+                processes = 1
+                uu.print_log("Checking for empty tiles of {0} pattern with {1} processors using light function...".format(output_pattern, processes))
+                pool = multiprocessing.Pool(processes)
+                pool.map(partial(uu.check_and_delete_if_empty_light, output_pattern=output_pattern), tile_id_list_input)
+                pool.close()
+                pool.join()
+            else:
+                processes = 55  # 50 processors = 560 GB peak for gross removals; 55 = XXX GB peak
+                uu.print_log("Checking for empty tiles of {0} pattern with {1} processors...".format(output_pattern, processes))
+                pool = multiprocessing.Pool(processes)
+                pool.map(partial(uu.check_and_delete_if_empty, output_pattern=output_pattern), tile_id_list_input)
+                pool.close()
+                pool.join()
+
+    # Uploads output tiles to s3
+    for i in range(0, len(output_dir_list)):
+        uu.upload_final_set(output_dir_list[i], output_pattern_list[i])
+
+
+if __name__ == '__main__':
+
+    # The argument for what kind of model run is being done: standard conditions or a sensitivity analysis run
+    parser = argparse.ArgumentParser(
+        description='Create tiles of the number of years of carbon gain for mangrove forests')
+    parser.add_argument('--model-type', '-t', required=True,
+                        help='{}'.format(cn.model_type_arg_help))
+    parser.add_argument('--tile_id_list', '-l', required=True,
+                        help='List of tile ids to use in the model. Should be of form 00N_110E or 00N_110E,00N_120E or all.')
+    parser.add_argument('--run-date', '-d', required=False,
+                        help='Date of run. Must be format YYYYMMDD.')
+    args = parser.parse_args()
+    sensit_type = args.model_type
+    tile_id_list = args.tile_id_list
+    run_date = args.run_date
+
+    # Create the output log
+    uu.initiate_log(tile_id_list=tile_id_list, sensit_type=sensit_type, run_date=run_date)
+
+    # Checks whether the sensitivity analysis and tile_id_list arguments are valid
+    uu.check_sensit_type(sensit_type)
+    tile_id_list = uu.tile_id_list_check(tile_id_list)
+
+    mp_create_supplementary_outputs(sensit_type=sensit_type, tile_id_list=tile_id_list, run_date=run_date)