Feat/13 add json schema #16

Merged
merged 25 commits into from
Feb 1, 2024
Commits
57f0f5d
feat(app): #13 Add json schema for reexports
AlexAxthelm Jan 30, 2024
b278289
feat(package): #13 Change metadata schema to expect array
AlexAxthelm Jan 31, 2024
4187df0
feat(app): #13 Add metadata validation to directory processing
AlexAxthelm Jan 31, 2024
b43443b
feat(app): #14 Add metadata file export
AlexAxthelm Jan 31, 2024
f7296ec
test(app): #14 Add test checking for exported metadata file
AlexAxthelm Jan 31, 2024
fc8e4e2
test(app): #14 Add test for validity of output files
AlexAxthelm Jan 31, 2024
000ce0f
feat(app): #13 factor validation and serialization to function
AlexAxthelm Jan 31, 2024
1165b93
feat(app): Add schema validation to reexport_portfolio()
AlexAxthelm Jan 31, 2024
a7207d9
feat(app): #13 Add schema validation to portfolio exports
AlexAxthelm Jan 31, 2024
359e084
chore(app): linting
AlexAxthelm Jan 31, 2024
fc25d14
feat(app): Add system info to exported metadata
AlexAxthelm Jan 31, 2024
87ec644
add $id to schema
AlexAxthelm Jan 31, 2024
2c94503
Use more informative field name (md5 vs digest)
AlexAxthelm Jan 31, 2024
5322ac7
Add length validation to arrays
AlexAxthelm Jan 31, 2024
56cd079
fix logging string
AlexAxthelm Jan 31, 2024
0006c1d
Disallow additional properties
AlexAxthelm Jan 31, 2024
861967b
add system_info to json schema
AlexAxthelm Jan 31, 2024
bb34d9f
Define minimum lengths for input entities
AlexAxthelm Jan 31, 2024
b8c6018
Make package name expectation explicit
AlexAxthelm Jan 31, 2024
b966a22
Version JSON Schema
AlexAxthelm Jan 31, 2024
309b8b4
Linting
AlexAxthelm Jan 31, 2024
7a55ea3
Add missing system dependencies
AlexAxthelm Jan 31, 2024
245e532
Simplify package splitting for dependencies
AlexAxthelm Jan 31, 2024
acf0d8e
Resolve package version issue
AlexAxthelm Jan 31, 2024
f0f153b
Update README
AlexAxthelm Jan 31, 2024
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -27,11 +27,13 @@ RoxygenNote: 7.3.1
Imports:
digest,
dplyr,
jsonvalidate (>= 1.4.0),
logger,
pacta.portfolio.import,
uuid
Remotes:
RMI-PACTA/pacta.portfolio.import
RMI-PACTA/pacta.portfolio.import,
ropensci/jsonvalidate
Suggests:
testthat (>= 3.0.0),
withr
3 changes: 3 additions & 0 deletions Dockerfile
@@ -14,7 +14,10 @@ LABEL org.opencontainers.image.authors=""

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
libcurl4-openssl-dev=7.81.* \
libicu-dev=70.* \
libnode-dev=12.22.* \
libssl-dev=3.0.* \
&& chmod -R a+rwX /root \
&& rm -rf /var/lib/apt/lists/*

18 changes: 14 additions & 4 deletions R/export_portfolio.R
@@ -2,7 +2,8 @@
export_portfolio <- function(
portfolio_data,
group_data,
output_directory
output_directory,
validate = TRUE
) {

logger::log_trace("cleaning and rearranging data prior to export")
@@ -53,23 +54,32 @@ export_portfolio <- function(
)
logger::log_debug("Portfolio data written to file: ", output_filepath)

output_digest <- digest::digest(
output_md5 <- digest::digest(
object = output_filepath,
file = TRUE,
algo = "md5",
serialize = FALSE
)
logger::log_trace("Portfolio data digest: ", output_digest)
logger::log_trace("Portfolio data digest: ", output_md5)

portfolio_metadata <- c(
list(
output_digest = output_digest,
output_md5 = output_md5,
output_filename = output_filename,
output_rows = output_rows
),
as.list(group_data)
)

if (validate) {
logger::log_trace("Validating output.")
schema_serialize(
object = list(portfolio_metadata),
reference = "#/items/properties/portfolios"
)
} else {
logger::log_trace("Skipping JSON validation.")
}

return(portfolio_metadata)
}
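
As an aside, the MD5 fingerprint above hashes the exported file on disk rather than the in-memory object. A standalone sketch of that `digest::digest()` call, using a throwaway temp file:

```r
# Hash a file's contents with MD5, mirroring the call in export_portfolio().
tmp <- tempfile(fileext = ".csv")
writeLines(c("a,b", "1,2"), tmp)
digest::digest(
  object = tmp,      # path to the file to hash
  file = TRUE,       # hash the file contents, not the path string
  algo = "md5",
  serialize = FALSE
)
```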
22 changes: 21 additions & 1 deletion R/process_directory.R
@@ -1,7 +1,8 @@
#' @export
process_directory <- function(
input_directory = "/mnt/input",
output_directory = "/mnt/output"
output_directory = "/mnt/output",
validate = TRUE
) {
# Get the list of files in the directory
files <- list.files(input_directory, full.names = TRUE)
@@ -15,5 +16,24 @@ process_directory <- function(
all_summaries <- c(all_summaries, list(portfolio_summary))
}
logger::log_info("Done processing directory.")

logger::log_info("Preparing metadata JSON file.")
if (validate) {
logger::log_info("Validating output.")
summaries_json <- schema_serialize(all_summaries)
} else {
logger::log_warn("Skipping JSON validation.")
summaries_json <- jsonlite::toJSON(
x = all_summaries,
pretty = TRUE,
auto_unbox = TRUE
)
}

metadata_path <- file.path(output_directory, "processed_portfolios.json")
logger::log_info("Writing metadata JSON to file \"", metadata_path, "\".")
writeLines(summaries_json, metadata_path)
logger::log_debug("Metadata JSON file written.")

return(all_summaries)
}
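
For reference, a minimal usage sketch of the new `validate` argument (the local paths are hypothetical; the defaults match the container mounts `/mnt/input` and `/mnt/output`):

```r
# Process a directory of portfolio CSVs; skip JSON Schema validation.
library(workflow.portfolio.parsing)

summaries <- process_directory(
  input_directory = "input",    # hypothetical local path
  output_directory = "output",  # hypothetical local path
  validate = FALSE
)
# "output/processed_portfolios.json" now holds the serialized summaries.
```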
18 changes: 15 additions & 3 deletions R/reexport_portfolio.R
@@ -1,7 +1,8 @@
#' @export
reexport_portfolio <- function(
input_filepath,
output_directory
output_directory,
validate = TRUE
) {

if (length(input_filepath) > 1L) {
@@ -18,7 +19,7 @@ reexport_portfolio <- function(
logger::log_debug("Portfolio data read.")

logger::log_trace("Indentifying portfolio metadata.")
input_digest <- digest::digest(
input_md5 <- digest::digest(
object = input_filepath,
file = TRUE,
algo = "md5",
@@ -29,7 +30,8 @@

file_summary <- list(
input_filename = input_filename,
input_digest = input_digest
input_md5 = input_md5,
system_info = get_system_info()
)

# read_portfolio_csv returns NA if it cannot process a portfolio
@@ -87,6 +89,16 @@
file_summary[["portfolios"]] <- NULL
}

if (validate) {
logger::log_trace("Validating output.")
schema_serialize(
object = file_summary,
reference = "#/items"
)
} else {
logger::log_trace("Skipping JSON validation.")
}

logger::log_info("Finished processing file: ", input_filepath)
return(file_summary)
}
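
A similar sketch for calling the updated `reexport_portfolio()` on a single file (the input path is hypothetical):

```r
# Re-export one portfolio file and inspect the returned summary.
file_summary <- reexport_portfolio(
  input_filepath = "input/simple.csv",  # hypothetical path
  output_directory = "output",
  validate = TRUE  # checks the summary against the "#/items" schema reference
)
file_summary[["input_md5"]]   # MD5 hash of the input file
file_summary[["portfolios"]]  # metadata for exported portfolios, if any
```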
39 changes: 39 additions & 0 deletions R/schema_serialize.R
@@ -0,0 +1,39 @@
schema_serialize <- function(
object,
schema_file = system.file(
"extdata", "schema", "parsedPortfolio_0-1-0.json",
package = "workflow.portfolio.parsing"
),
reference = NULL
) {
sch <- jsonvalidate::json_schema[["new"]](
schema = readLines(schema_file),
strict = TRUE,
engine = "ajv",
reference = reference
)
json <- sch[["serialise"]](object)
json_is_valid <- sch[["validate"]](json, verbose = TRUE)
if (json_is_valid) {
logger::log_trace("JSON is valid.")
} else {
json_errors <- attributes(json_is_valid)[["errors"]]
logger::log_warn(
"object could not be validated against ",
"JSON schema: \"", schema_file, "\",",
" reference: \"", reference, "\"."
)
logger::log_trace(
logger::skip_formatter(paste("JSON string: ", json))
)
logger::log_trace("Validation errors:")
for (i in seq(from = 1L, to = nrow(json_errors), by = 1L)) {
logger::log_trace(
"instancePath: ", json_errors[i, "instancePath"],
" message: ", json_errors[i, "message"]
)
}
warning("Object could not be validated against schema.")
}
return(json)
}
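
For illustration, here is the same `jsonvalidate` flow written out by hand, validating a single portfolio-metadata object against the `#/items/properties/portfolios` reference (field values borrowed from the README example below):

```r
schema_file <- system.file(
  "extdata", "schema", "parsedPortfolio_0-1-0.json",
  package = "workflow.portfolio.parsing"
)
sch <- jsonvalidate::json_schema$new(
  schema = readLines(schema_file),
  strict = TRUE,
  engine = "ajv",
  reference = "#/items/properties/portfolios"
)
portfolio_meta <- list(
  output_md5 = "0f51946d64ef6ee4daca1a6969317cba",
  output_filename = "be1e7db9-3d7c-4978-9c1c-4eba4ad2cff5.csv",
  output_rows = 1L
)
json <- sch$serialise(list(portfolio_meta))
sch$validate(json, verbose = TRUE)  # TRUE when the object matches the schema
```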
45 changes: 45 additions & 0 deletions R/system_info.R
@@ -0,0 +1,45 @@
get_system_info <- function() {
logger::log_trace("Getting system information")
package <- getPackageName()
version <- as.character(packageVersion(package))
logger::log_trace("Package: ", package, " version: ", version)
raw_deps <- trimws(
strsplit(
x = packageDescription(package)[["Imports"]],
split = ",",
fixed = TRUE
)[[1L]]
)
deps <- trimws(
gsub(
x = raw_deps,
pattern = "\\s+\\(.*\\)",
replacement = ""
)
)
deps_version <- as.list(
lapply(
X = deps,
FUN = function(x) {
list(
package = x,
version = as.character(packageVersion(x))
)
}
)
)

return(
list(
timestamp = format(
Sys.time(),
format = "%Y-%m-%dT%H:%M:%SZ",
tz = "UTC"
),
package = package,
packageVersion = version,
RVersion = as.character(getRversion()),
dependencies = deps_version
)
)
}
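
A standalone illustration of the dependency-splitting logic above, applied to a literal `Imports` string:

```r
# Split a DESCRIPTION Imports field into bare package names,
# dropping version constraints such as "(>= 1.4.0)".
imports <- "digest,\n  dplyr,\n  jsonvalidate (>= 1.4.0)"
raw_deps <- trimws(strsplit(x = imports, split = ",", fixed = TRUE)[[1L]])
deps <- trimws(gsub(x = raw_deps, pattern = "\\s+\\(.*\\)", replacement = ""))
deps
#> [1] "digest"       "dplyr"        "jsonvalidate"
```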
105 changes: 104 additions & 1 deletion README.md
@@ -2,4 +2,107 @@

[![Project Status: WIP – Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)

The docker image defines by this repo accepts a directory of portfolios (mounted to `/mnt/input/`) and exports sanitized versions of those portfolios ready for further processing via PACTA (in `/mnt/outputs`)
The docker image defined by this repo accepts a directory of portfolios (mounted to `/mnt/input/`, see [Inputs](#inputs)) and exports sanitized versions of those portfolios, ready for further processing via PACTA, to `/mnt/output` (see [Outputs](#outputs)).

## Docker Image

The intended method for invoking this workflow is with a Docker container defined by the image in the [Dockerfile](Dockerfile).
GitHub Actions builds the official image, which is available at: `ghcr.io/rmi-pacta/workflow.portfolio.parsing:main`

Running the workflow from a docker image requires mounting an input and output directory:

```sh

# note that the input mount can have a readonly connection
# You can set logging verbosity via the LOG_LEVEL envvar

docker run --rm \
--mount type=bind,source="$(pwd)"/input,target=/mnt/input,readonly \
--mount type=bind,source="$(pwd)"/output,target=/mnt/output \
--env LOG_LEVEL=TRACE \
ghcr.io/rmi-pacta/workflow.portfolio.parsing:pr16

```

The container will process any files in the `input` directory, and export any valid portfolios along with a metadata file (see [Outputs](#outputs), below).

## Metadata file (`processed_portfolios.json`)

Along with portfolios (in a standardized `csv` format), the parser exports a metadata file about the parsed inputs, and the exported portfolio files.
The file is in JSON format, and validates against a [JSON Schema in this repository](inst/extdata/schema/parsedPortfolio_0-1-0.json).

The file is an array of objects; the top-level objects in the array correspond to the input files, and the exported files are described in an array under the `portfolios` key of each input-file object.
> Review comment (Member): Typo: "opbjects"

A simple example of the output file:

```jsonc
[
{
"input_filename": "simple.csv",
"input_md5": "8e84d71c0f3892e34e0d9342cfc91a4d",
"system_info": {
"timestamp": "2024-01-31T19:11:56Z",
"package": "workflow.portfolio.parsing",
"packageVersion": "0.0.0.9000",
"RVersion": "4.3.2",
"dependencies": [
{
"package": "digest",
"version": "0.6.33"
},
// ... array elided
// Review comment (Member): what does "elided" mean?
]
},
"input_entries": 1,
"group_cols": [],
"subportfolios_count": 1,
"portfolios": [
{
"output_md5": "0f51946d64ef6ee4daca1a6969317cba",
"output_filename": "be1e7db9-3d7c-4978-9c1c-4eba4ad2cff5.csv",
"output_rows": 1
}
]
}
]
```

Note that an input file object may have an `errors` key (mutually exclusive with the `portfolios` key) or a `warnings` key (not exclusive with `portfolios` or `errors`), which indicates a processing error (or warning).
The `errors` object will be an array with messages which are suitable for presentation to end users.

Here is an example `jq` query to see a simple mapping between input and output files:

```sh

cat output/processed_portfolios.json | jq '
.[] |
[{
input_file: .input_filename,
output_file: .portfolios[].output_filename
}]
'

```

## R Package

This repo defines an R Package, `{workflow.portfolio.parsing}`.
The R package structure allows for easy management of dependencies, tests, and access to package files (such as the [JSON Schema](inst/extdata/schema/parsedPortfolio_0-1-0.json)).
Because using the R package locally is not the intended primary use case, running it locally (beyond development) is technically unsupported, but should not pose any issues.

The package exports several functions, but the main entrypoint is `process_directory()`. When called with default arguments, it works as intended for use with the docker image from this repo.

## Inputs

This workflow reads files from a directory (by convention mounted in docker container as `/mnt/input`).
The files must be plain csv files (though they do not need to have a `csv` file extension), parsable by [`pacta.portfolio.import::read_portfolio_csv()`](https://rmi-pacta.github.io/pacta.portfolio.import/reference/read_portfolio_csv.html).
The workflow will attempt to parse every file in that directory, but will emit warnings for files that cannot be parsed as portfolios.
The workflow will not recurse into subdirectories.
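
A sketch of checking a single file the way the workflow does, assuming (per the code comments in this PR) that `read_portfolio_csv()` returns `NA` when it cannot parse a file; the path is hypothetical:

```r
portfolio <- pacta.portfolio.import::read_portfolio_csv("input/simple.csv")
if (all(is.na(portfolio))) {
  warning("File could not be parsed as a portfolio CSV.")
}
```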

## Outputs

This workflow writes files to a directory (by convention mounted in docker container as `/mnt/output`):

- `*.csv`: csv files that contain portfolio data, with columns and column names standardized, 1 portfolio per file.
- `processed_portfolios.json`: A JSON file with metadata about the files, including the input file, as well as file hashes and summary information for both input and output files.
This file validates against the JSON Schema defined in this repository ([found here](inst/extdata/schema/parsedPortfolio_0-1-0.json)).
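
One way a consumer might re-check the metadata file against the schema shipped with the package (a sketch using `jsonvalidate`; the output path is hypothetical):

```r
schema <- system.file(
  "extdata", "schema", "parsedPortfolio_0-1-0.json",
  package = "workflow.portfolio.parsing"
)
jsonvalidate::json_validate(
  json = "output/processed_portfolios.json",  # hypothetical path
  schema = schema,
  engine = "ajv",
  verbose = TRUE
)
```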