Merge branch 'main' into chore/prepare-for-dev-release
AlexAxthelm authored Feb 2, 2024
2 parents 76e3fa0 + 1b904ea commit 27f9eb6
Showing 12 changed files with 446 additions and 30 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
@@ -27,11 +27,13 @@ RoxygenNote: 7.3.1
 Imports:
     digest,
     dplyr,
+    jsonvalidate (>= 1.4.0),
     logger,
     pacta.portfolio.import,
     uuid
 Remotes:
-    RMI-PACTA/pacta.portfolio.import
+    RMI-PACTA/pacta.portfolio.import,
+    ropensci/jsonvalidate
 Suggests:
     testthat (>= 3.0.0),
     withr
3 changes: 3 additions & 0 deletions Dockerfile
@@ -14,7 +14,10 @@ LABEL org.opencontainers.image.authors=""
 
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
+    libcurl4-openssl-dev=7.81.* \
     libicu-dev=70.* \
+    libnode-dev=12.22.* \
+    libssl-dev=3.0.* \
     && chmod -R a+rwX /root \
     && rm -rf /var/lib/apt/lists/*
18 changes: 14 additions & 4 deletions R/export_portfolio.R
@@ -2,7 +2,8 @@
 export_portfolio <- function(
   portfolio_data,
   group_data,
-  output_directory
+  output_directory,
+  validate = TRUE
 ) {
 
   logger::log_trace("cleaning and rearranging data prior to export")
@@ -53,23 +54,32 @@ export_portfolio <- function(
   )
   logger::log_debug("Portfolio data written to file: ", output_filepath)
 
-  output_digest <- digest::digest(
+  output_md5 <- digest::digest(
     object = output_filepath,
     file = TRUE,
     algo = "md5",
     serialize = FALSE
   )
-  logger::log_trace("Portfolio data digest: ", output_digest)
+  logger::log_trace("Portfolio data digest: ", output_md5)
 
   portfolio_metadata <- c(
     list(
-      output_digest = output_digest,
+      output_md5 = output_md5,
       output_filename = output_filename,
       output_rows = output_rows
     ),
     as.list(group_data)
   )
 
+  if (validate) {
+    logger::log_trace("Validating output.")
+    schema_serialize(
+      object = list(portfolio_metadata),
+      reference = "#/items/properties/portfolios"
+    )
+  } else {
+    logger::log_trace("Skipping JSON validation.")
+  }
 
   return(portfolio_metadata)
 }
22 changes: 21 additions & 1 deletion R/process_directory.R
@@ -1,7 +1,8 @@
 #' @export
 process_directory <- function(
   input_directory = "/mnt/input",
-  output_directory = "/mnt/output"
+  output_directory = "/mnt/output",
+  validate = TRUE
 ) {
   # Get the list of files in the directory
   files <- list.files(input_directory, full.names = TRUE)
@@ -15,5 +16,24 @@ process_directory <- function(
     all_summaries <- c(all_summaries, list(portfolio_summary))
   }
   logger::log_info("Done processing directory.")
+
+  logger::log_info("Preparing metadata JSON file.")
+  if (validate) {
+    logger::log_info("Validating output.")
+    summaries_json <- schema_serialize(all_summaries)
+  } else {
+    logger::log_warn("Skipping JSON validation.")
+    summaries_json <- jsonlite::toJSON(
+      x = all_summaries,
+      pretty = TRUE,
+      auto_unbox = TRUE
+    )
+  }
+
+  metadata_path <- file.path(output_directory, "processed_portfolios.json")
+  logger::log_info("Writing metadata JSON to file \"", metadata_path, "\".")
+  writeLines(summaries_json, metadata_path)
+  logger::log_debug("Metadata JSON file written.")
+
   return(invisible(all_summaries))
 }
18 changes: 15 additions & 3 deletions R/reexport_portfolio.R
@@ -1,7 +1,8 @@
 #' @export
 reexport_portfolio <- function(
   input_filepath,
-  output_directory
+  output_directory,
+  validate = TRUE
 ) {
 
   if (length(input_filepath) > 1L) {
@@ -18,7 +19,7 @@ reexport_portfolio <- function(
   logger::log_debug("Portfolio data read.")
 
   logger::log_trace("Identifying portfolio metadata.")
-  input_digest <- digest::digest(
+  input_md5 <- digest::digest(
     object = input_filepath,
     file = TRUE,
     algo = "md5",
@@ -29,7 +30,8 @@ reexport_portfolio <- function(
 
   file_summary <- list(
     input_filename = input_filename,
-    input_digest = input_digest
+    input_md5 = input_md5,
+    system_info = get_system_info()
   )
 
   # read_portfolio_csv returns NA if it cannot process a portfolio
@@ -87,6 +89,16 @@ reexport_portfolio <- function(
     file_summary[["portfolios"]] <- NULL
   }
 
+  if (validate) {
+    logger::log_trace("Validating output.")
+    schema_serialize(
+      object = file_summary,
+      reference = "#/items"
+    )
+  } else {
+    logger::log_trace("Skipping JSON validation.")
+  }
+
   logger::log_info("Finished processing file: ", input_filepath)
   return(file_summary)
 }
39 changes: 39 additions & 0 deletions R/schema_serialize.R
@@ -0,0 +1,39 @@
+schema_serialize <- function(
+  object,
+  schema_file = system.file(
+    "extdata", "schema", "parsedPortfolio_0-1-0.json",
+    package = "workflow.portfolio.parsing"
+  ),
+  reference = NULL
+) {
+  sch <- jsonvalidate::json_schema[["new"]](
+    schema = readLines(schema_file),
+    strict = TRUE,
+    engine = "ajv",
+    reference = reference
+  )
+  json <- sch[["serialise"]](object)
+  json_is_valid <- sch[["validate"]](json, verbose = TRUE)
+  if (json_is_valid) {
+    logger::log_trace("JSON is valid.")
+  } else {
+    json_errors <- attributes(json_is_valid)[["errors"]]
+    logger::log_warn(
+      "object could not be validated against ",
+      "JSON schema: \"", schema_file, "\",",
+      " reference: \"", reference, "\"."
+    )
+    logger::log_trace(
+      logger::skip_formatter(paste("JSON string: ", json))
+    )
+    logger::log_trace("Validation errors:")
+    for (i in seq(from = 1L, to = nrow(json_errors), by = 1L)) {
+      logger::log_trace(
+        "instancePath: ", json_errors[i, "instancePath"],
+        " message: ", json_errors[i, "message"]
+      )
+    }
+    warning("Object could not be validated against schema.")
+  }
+  return(json)
+}
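
A sketch of the calling convention used by the new validation call sites (values are hypothetical, borrowed from the README example below; `schema_serialize()` is internal, so this assumes the package has been loaded for development, e.g. via `devtools::load_all()`):

```r
# Validate a single exported-portfolio summary against the packaged schema;
# returns the serialised JSON and warns if validation fails.
json <- schema_serialize(
  object = list(
    list(
      output_md5 = "0f51946d64ef6ee4daca1a6969317cba",
      output_filename = "be1e7db9-3d7c-4978-9c1c-4eba4ad2cff5.csv",
      output_rows = 1L
    )
  ),
  reference = "#/items/properties/portfolios"
)
```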
45 changes: 45 additions & 0 deletions R/system_info.R
@@ -0,0 +1,45 @@
+get_system_info <- function() {
+  logger::log_trace("Getting system information")
+  package <- getPackageName()
+  version <- as.character(packageVersion(package))
+  logger::log_trace("Package: ", package, " version: ", version)
+  raw_deps <- trimws(
+    strsplit(
+      x = packageDescription(package)[["Imports"]],
+      split = ",",
+      fixed = TRUE
+    )[[1L]]
+  )
+  deps <- trimws(
+    gsub(
+      x = raw_deps,
+      pattern = "\\s+\\(.*\\)",
+      replacement = ""
+    )
+  )
+  deps_version <- as.list(
+    lapply(
+      X = deps,
+      FUN = function(x) {
+        list(
+          package = x,
+          version = as.character(packageVersion(x))
+        )
+      }
+    )
+  )
+
+  return(
+    list(
+      timestamp = format(
+        Sys.time(),
+        format = "%Y-%m-%dT%H:%M:%SZ",
+        tz = "UTC"
+      ),
+      package = package,
+      packageVersion = version,
+      RVersion = as.character(getRversion()),
+      dependencies = deps_version
+    )
+  )
+}
105 changes: 104 additions & 1 deletion README.md
@@ -5,4 +5,107 @@
[![codecov](https://codecov.io/gh/RMI-PACTA/workflow.portfolio.parsing/graph/badge.svg?token=ewpls5qPVK)](https://codecov.io/gh/RMI-PACTA/workflow.portfolio.parsing)
<!-- badges: end -->

The Docker image defined by this repo accepts a directory of portfolios (mounted to `/mnt/input/`, see [Inputs](#inputs)) and exports sanitized versions of those portfolios, ready for further processing via PACTA, to `/mnt/output` (see [Outputs](#outputs)).

## Docker Image

The intended method for invoking this workflow is with a Docker container defined by the image in the [Dockerfile](Dockerfile).
GitHub Actions builds the official image, which is available at `ghcr.io/rmi-pacta/workflow.portfolio.parsing:main`.

Running the workflow from the Docker image requires mounting an input and an output directory:

```sh
# note that the input mount can be read-only
# you can set logging verbosity via the LOG_LEVEL envvar
docker run --rm \
  --mount type=bind,source="$(pwd)"/input,target=/mnt/input,readonly \
  --mount type=bind,source="$(pwd)"/output,target=/mnt/output \
  --env LOG_LEVEL=TRACE \
  ghcr.io/rmi-pacta/workflow.portfolio.parsing:main
```

The container will process any files in the `input` directory, and export any valid portfolios along with a metadata file (see [Outputs](#outputs), below).

## Metadata file (`processed_portfolios.json`)

Along with portfolios (in a standardized `csv` format), the parser exports a metadata file describing the parsed inputs and the exported portfolio files.
The file is in JSON format, and validates against a [JSON Schema in this repository](inst/extdata/schema/parsedPortfolio_0-1-0.json).

The file is an array of objects. Each top-level object in the array corresponds to an input file, and the files exported from it are described in an array under the `portfolios` key of that object.

A simple example of the output file:

```jsonc
[
  {
    "input_filename": "simple.csv",
    "input_md5": "8e84d71c0f3892e34e0d9342cfc91a4d",
    "system_info": {
      "timestamp": "2024-01-31T19:11:56Z",
      "package": "workflow.portfolio.parsing",
      "packageVersion": "0.0.0.9000",
      "RVersion": "4.3.2",
      "dependencies": [
        {
          "package": "digest",
          "version": "0.6.33"
        },
        // ... array elided
      ]
    },
    "input_entries": 1,
    "group_cols": [],
    "subportfolios_count": 1,
    "portfolios": [
      {
        "output_md5": "0f51946d64ef6ee4daca1a6969317cba",
        "output_filename": "be1e7db9-3d7c-4978-9c1c-4eba4ad2cff5.csv",
        "output_rows": 1
      }
    ]
  }
]
```

Note that an input file object may have an `errors` key (mutually exclusive with the `portfolios` key), or a `warnings` key (not exclusive with `portfolios` or `errors`), indicating a processing error or warning.
The `errors` object is an array of messages suitable for presentation to end users.
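
A `jq` query along the following lines (a sketch; it assumes at least one input file failed and therefore carries an `errors` array) surfaces those messages per input file:

```sh
cat output/processed_portfolios.json | jq '
  .[] |
  select(.errors != null) |
  {input_file: .input_filename, errors: .errors}
'
```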

Here is an example `jq` query to see a simple mapping between input and output files:

```sh
cat output/processed_portfolios.json | jq '
  .[] |
  [{
    input_file: .input_filename,
    output_file: .portfolios[].output_filename
  }]
'
```

## R Package

This repo defines an R Package, `{workflow.portfolio.parsing}`.
The R package structure allows for easy management of dependencies, tests, and access to package files (such as the [JSON Schema](inst/extdata/schema/parsedPortfolio_0-1-0.json)).
Because local use of the R package is not the primary use case, running it locally (beyond development) is technically unsupported, but should not pose any issues.

The package exports several functions, but the main entrypoint is `process_directory()`. When called with default arguments, it works as intended for the Docker image defined by this repo; a local invocation is sketched below.
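
A minimal sketch of a local run, mirroring the container defaults (the relative paths here are illustrative; inside the container they default to `/mnt/input` and `/mnt/output`):

```r
# assumes the package and its dependencies are installed locally
library(workflow.portfolio.parsing)

process_directory(
  input_directory = "input",   # directory containing portfolio CSVs
  output_directory = "output", # writable directory for exported files
  validate = TRUE              # validate metadata against the packaged JSON Schema
)
```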

## Inputs

This workflow reads files from a directory (by convention mounted in the Docker container as `/mnt/input`).
The files must be plain CSV files (though they do not need to have a `.csv` file extension), parsable by [`pacta.portfolio.import::read_portfolio_csv()`](https://rmi-pacta.github.io/pacta.portfolio.import/reference/read_portfolio_csv.html).
The workflow will attempt to parse any other files in that directory, but will emit warnings.
The workflow will not recurse into subdirectories.
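
For illustration only, a minimal input file could be created as below. The column names are an assumption based on common PACTA portfolio CSVs, not a specification; the `read_portfolio_csv()` documentation is authoritative.

```sh
# hypothetical minimal portfolio file; column names are illustrative
mkdir -p input
cat > input/simple.csv <<'CSV'
portfolio_name,isin,market_value,currency
Example Portfolio,US0378331005,1000.00,USD
CSV
```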

## Outputs

This workflow writes files to a directory (by convention mounted in the Docker container as `/mnt/output`):

- `*.csv`: CSV files containing portfolio data, with standardized columns and column names, one portfolio per file.
- `processed_portfolios.json`: a JSON file with metadata about the processed files, including file hashes and summary information for both input and output files.
  This file validates against the JSON Schema defined in this repository ([found here](inst/extdata/schema/parsedPortfolio_0-1-0.json)).
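
For orientation, after a run over a single valid portfolio the output directory holds the exported CSV (its name is a generated UUID; the one below is taken from the metadata example above) plus the metadata file:

```sh
ls output/
# be1e7db9-3d7c-4978-9c1c-4eba4ad2cff5.csv
# processed_portfolios.json
```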