diff --git a/.github/workflows/scoring.yaml b/.github/workflows/scoring.yaml index c0b95407cf..765d56a80b 100644 --- a/.github/workflows/scoring.yaml +++ b/.github/workflows/scoring.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - #schedule: - # - cron: '0 0 * * *' + schedule: + - cron: '0 0 */3 * *' workflow_dispatch: name: scoring @@ -19,6 +19,8 @@ jobs: #container: eco4cast/rocker-neon4cast:latest steps: - uses: actions/checkout@v3 + with: + ref: prod - name: Install shell: Rscript {0} diff --git a/scoring/build_score_inventory.R b/scoring/build_score_inventory.R index fba2215c1a..399354fe4e 100644 --- a/scoring/build_score_inventory.R +++ b/scoring/build_score_inventory.R @@ -10,6 +10,7 @@ inventory_df <- arrow::open_dataset(s3) |> mutate(reference_date = lubridate::as_date(reference_datetime), date = lubridate::as_date(datetime), pub_date = lubridate::as_date(pub_datetime)) |> + filter(project_id == config$project_id) |> distinct(duration, model_id, site_id, reference_date, variable, date, project_id, pub_date) |> collect() |> mutate(path = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}"), @@ -18,8 +19,7 @@ inventory_df <- arrow::open_dataset(s3) |> sites <- readr::read_csv(config$site_table, show_col_types = FALSE) |> - select(field_site_id, latitude, longitude) |> - rename(site_id = field_site_id) + select(site_id, latitude, longitude) inventory_df <- dplyr::left_join(inventory_df, sites, by = "site_id") diff --git a/scoring/delete_scores.R b/scoring/delete_scores.R deleted file mode 100644 index ea06c01a8b..0000000000 --- a/scoring/delete_scores.R +++ /dev/null @@ -1,34 +0,0 @@ -df <- aws.s3::get_bucket_df(bucket = "bio230121-bucket01", - prefix = "vera4cast/scores/", - region = "renc", - base_url = "osn.xsede.org", - key = Sys.getenv("OSN_KEY"), - secret = Sys.getenv("OSN_SECRET")) - -for(i in 1:nrow(df)){ - - aws.s3::delete_object(object = df$Key[i], - bucket = "bio230121-bucket01", - region = "renc", - base_url = "osn.xsede.org", - key = Sys.getenv("OSN_KEY"), - secret = Sys.getenv("OSN_SECRET")) -} - -df <- aws.s3::get_bucket_df(bucket = "bio230121-bucket01", - prefix = "vera4cast/prov/", - region = "renc", - base_url = "osn.xsede.org", - key = Sys.getenv("OSN_KEY"), - secret = Sys.getenv("OSN_SECRET")) - -for(i in 1:nrow(df)){ - - aws.s3::delete_object(object = df$Key[i], - bucket = "bio230121-bucket01", - region = "renc", - base_url = "osn.xsede.org", - key = Sys.getenv("OSN_KEY"), - secret = Sys.getenv("OSN_SECRET")) -} - diff --git a/scoring/scoring.R b/scoring/scoring.R index 7eef5c4f8b..5a02d0ce29 100644 --- a/scoring/scoring.R +++ b/scoring/scoring.R @@ -81,7 +81,7 @@ furrr::future_walk(1:nrow(variable_duration), function(k, variable_duration, con schema = arrow::schema( project_id = arrow::string(), site_id = arrow::string(), - datetime = arrow::timestamp(unit = "ns", timezone = "UTC"), + datetime = arrow::timestamp(unit = "ns"), # timezone = "UTC"), duration = arrow::string(), #depth_m = arrow::float(), #project_specific variable = arrow::string(), @@ -96,8 +96,26 @@ furrr::future_walk(1:nrow(variable_duration), function(k, variable_duration, con curr_duration <- duration curr_project_id <- project_id - groupings <- arrow::open_dataset(s3_inv) |> - dplyr::filter(variable == curr_variable, duration == curr_duration) |> + groupings <- arrow::open_dataset(s3_inv, + schema = arrow::schema( + duration = arrow::string(), + model_id = arrow::string(), + site_id = arrow::string(), + reference_date = arrow::date32(), + variable = arrow::string(), + date = arrow::date32(), + project_id = arrow::string(), + pub_date = arrow::date32(), + path = arrow::string(), + path_full = arrow::string(), + path_summaries = arrow::string(), + endpoint = arrow::string(), + latitude = arrow::float(), + longitude = arrow::float(), + )) |> + dplyr::filter(variable == curr_variable, + duration == curr_duration, + project_id == curr_project_id) |> dplyr::select(-site_id) |> dplyr::collect() |> dplyr::distinct() |>