diff --git a/build.sbt b/build.sbt index 4055d0c5..617590de 100644 --- a/build.sbt +++ b/build.sbt @@ -184,8 +184,8 @@ sparkS3JarFolder := "s3://wri-users/dgibbs/geotrellis/jars" sparkS3LogUri := Some("s3://wri-users/dgibbs/geotrellis/logs") sparkSubnetId := Some("subnet-8c2b5ea1") sparkSecurityGroupIds := Seq("sg-00ca15563a40c5687", "sg-6c6a5911") -//sparkInstanceCount := 201 // 201 for carbonflux and carbon_sensitivity -sparkInstanceCount := 10 // 201 for carbonflux and carbon_sensitivity +sparkInstanceCount := 201 // 201 for carbonflux and carbon_sensitivity +//sparkInstanceCount := 10 // for running test areas in EMR sparkMasterType := "r4.2xlarge" sparkCoreType := "r4.2xlarge" sparkMasterEbsSize := Some(10) @@ -198,6 +198,8 @@ sparkInstanceRole := "EMR_EC2_DefaultRole" sparkJobFlowInstancesConfig := sparkJobFlowInstancesConfig.value.withEc2KeyName( "dgibbs_wri" ) + +// For carbonflux runs sparkEmrBootstrap := List( BootstrapAction( "Install GDAL 3.1.2 dependencies", @@ -205,6 +207,16 @@ sparkEmrBootstrap := List( "3.1.2" ) ) + +//// For other runs +//sparkEmrBootstrap := List( +// BootstrapAction( +// "Install GDAL 3.8.3 dependencies", +// "s3://gfw-pipelines/geotrellis/bootstrap/gdal-3.8.3.sh", +// "3.8.3" +// ) +//) + sparkRunJobFlowRequest := sparkRunJobFlowRequest.value .withTags(new Tag("Project", "Global Forest Watch")) .withTags(new Tag("Job", "Carbon Flux Analysis Geotrellis")) diff --git a/src/main/scala/org/globalforestwatch/summarystats/carbonflux/CarbonFluxExport.scala b/src/main/scala/org/globalforestwatch/summarystats/carbonflux/CarbonFluxExport.scala index 0cc3225a..fe0c43c7 100644 --- a/src/main/scala/org/globalforestwatch/summarystats/carbonflux/CarbonFluxExport.scala +++ b/src/main/scala/org/globalforestwatch/summarystats/carbonflux/CarbonFluxExport.scala @@ -63,8 +63,8 @@ object CarbonFluxExport extends SummaryExport { val adm2ApiDF = df .transform(CarbonFluxDF.aggSummary(List("iso", "adm1", "adm2"))) -// .coalesce(200) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(250) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv adm2ApiDF.write @@ -73,8 +73,8 @@ object CarbonFluxExport extends SummaryExport { val adm1ApiDF = adm2ApiDF .transform(CarbonFluxDF.aggSummary2(List("iso", "adm1"))) -// .coalesce(60) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(80) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv adm1ApiDF.write @@ -83,8 +83,8 @@ object CarbonFluxExport extends SummaryExport { val isoApiDF = adm1ApiDF .transform(CarbonFluxDF.aggSummary2(List("iso"))) -// .coalesce(32) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(40) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv isoApiDF.write @@ -101,8 +101,8 @@ object CarbonFluxExport extends SummaryExport { .filter($"umd_tree_cover_loss__year".isNotNull && ($"umd_tree_cover_loss__ha" > 0 || $"gfw_full_extent_gross_emissions_biomass_soil__Mg_CO2e" > 0)) .transform(CarbonFluxDF.aggChange(List("iso", "adm1", "adm2"))) -// .coalesce(350) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(350) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv adm2ApiDF.write @@ -111,8 +111,8 @@ object CarbonFluxExport extends SummaryExport { val adm1ApiDF = adm2ApiDF .transform(CarbonFluxDF.aggChange(List("iso", "adm1"))) -// .coalesce(200) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(200) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv adm1ApiDF.write @@ -121,8 +121,8 @@ object CarbonFluxExport extends SummaryExport { val isoApiDF = adm1ApiDF .transform(CarbonFluxDF.aggChange(List("iso"))) -// .coalesce(70) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues - .coalesce(1) // for local testing: produces one output csv + .coalesce(70) // this should result in an avg file size of 50MB. We try to keep filesize small due to memory issues +// .coalesce(1) // for local testing: produces one output csv isoApiDF.write