From b8435111cef59a9973099f5ccdd6e8b19338b4a1 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 25 Sep 2023 16:56:23 +0200 Subject: [PATCH 01/34] Add Dockerfile for Azure Container Instances --- Dockerfile.azure | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Dockerfile.azure diff --git a/Dockerfile.azure b/Dockerfile.azure new file mode 100644 index 0000000..1c76dac --- /dev/null +++ b/Dockerfile.azure @@ -0,0 +1,26 @@ +FROM --platform=linux/amd64 rocker/tidyverse + +# install system dependencies for R packages +RUN apt-get update \ + && [ $(which google-chrome) ] || apt-get install -y gnupg curl \ + && [ $(which google-chrome) ] || curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ + && [ $(which google-chrome) ] || DEBIAN_FRONTEND='noninteractive' apt-get install -y /tmp/google-chrome.deb \ + && apt-get install -y libcurl4-openssl-dev libssl-dev make libicu-dev libxml2-dev \ + zlib1g-dev libfontconfig1-dev libfreetype6-dev libfribidi-dev libharfbuzz-dev libjpeg-dev \ + libpng-dev libtiff-dev pandoc git libgit2-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'install.packages(c("pak", "renv"))' + +COPY . /workflow.data.preparation + +WORKDIR /workflow.data.preparation + +RUN Rscript -e '\ + readRenviron(".env"); \ + non_cran_pkg_deps <- c("RMI-PACTA/pacta.scenario.preparation", "RMI-PACTA/pacta.data.preparation", "RMI-PACTA/pacta.data.scraping"); \ + cran_pkg_deps <- setdiff(renv::dependencies()$Package, basename(non_cran_pkg_deps)); \ + pak::pkg_install(pkg = c(non_cran_pkg_deps, cran_pkg_deps)); \ + ' + +CMD Rscript run_pacta_data_preparation.R From c99da36d680fbef0d22ac66872dd3b95ee509ef6 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 25 Sep 2023 19:00:15 +0200 Subject: [PATCH 02/34] Pin package and FROM versions --- Dockerfile.azure | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/Dockerfile.azure b/Dockerfile.azure index 1c76dac..6b1958f 100644 --- a/Dockerfile.azure +++ b/Dockerfile.azure @@ -1,14 +1,28 @@ -FROM --platform=linux/amd64 rocker/tidyverse +FROM rocker/tidyverse:4.3.1 # install system dependencies for R packages -RUN apt-get update \ - && [ $(which google-chrome) ] || apt-get install -y gnupg curl \ - && [ $(which google-chrome) ] || curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ - && [ $(which google-chrome) ] || DEBIAN_FRONTEND='noninteractive' apt-get install -y /tmp/google-chrome.deb \ - && apt-get install -y libcurl4-openssl-dev libssl-dev make libicu-dev libxml2-dev \ - zlib1g-dev libfontconfig1-dev libfreetype6-dev libfribidi-dev libharfbuzz-dev libjpeg-dev \ - libpng-dev libtiff-dev pandoc git libgit2-dev \ - && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install --no-install-recommends -y \ + curl=7.81.* \ + git=1:2.34.* \ + gnupg=2.2.* \ + libcurl4-openssl-dev=7.81.* \ + libfontconfig1-dev=2.13.* \ + libfreetype6-dev=2.11.* \ + libfribidi-dev=1.0.* \ + libgit2-dev=1.1.* \ + libharfbuzz-dev=2.7.* \ + libicu-dev=70.1-* \ + libjpeg-dev=8c-* \ + libpng-dev=1.6.* \ + libssl-dev=3.0.* \ + libtiff-dev=4.3.* \ + libxml2-dev=2.9.* \ + make=4.3-* \ + pandoc=2.9.2.* \ + zlib1g-dev=1:1.2.* \ + # && curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ + # && DEBIAN_FRONTEND='noninteractive' apt-get install -y /tmp/google-chrome.deb \ + && rm -rf /var/lib/apt/lists/* 
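# Aside on the pin syntax used above (a hedged sketch, not part of the patch):
# a pin like "curl=7.81.*" accepts any Ubuntu 22.04 (jammy) build of upstream
# 7.81 -- rocker/tidyverse:4.3.1 is jammy-based. To check which pinned
# candidates the base image can actually see, one can run, for example:
#   apt-cache policy curl
#   apt-cache madison libcurl4-openssl-dev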
RUN Rscript -e 'install.packages(c("pak", "renv"))' From 797d7114951833b565c0ca0ffe2f5f1a2008b13f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 25 Sep 2023 19:17:54 +0200 Subject: [PATCH 03/34] Separate Chrome Installation --- Dockerfile.azure | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Dockerfile.azure b/Dockerfile.azure index 6b1958f..87b5714 100644 --- a/Dockerfile.azure +++ b/Dockerfile.azure @@ -20,8 +20,12 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ make=4.3-* \ pandoc=2.9.2.* \ zlib1g-dev=1:1.2.* \ - # && curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ - # && DEBIAN_FRONTEND='noninteractive' apt-get install -y /tmp/google-chrome.deb \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \ + && apt-get update \ + && DEBIAN_FRONTEND='noninteractive' apt-get install --no-install-recommends -y /tmp/google-chrome.deb \ + && rm /tmp/google-chrome.deb \ && rm -rf /var/lib/apt/lists/* RUN Rscript -e 'install.packages(c("pak", "renv"))' From 7683c6bf76f12520cad6fa3e40687a61d1d294fc Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Mon, 25 Sep 2023 20:15:22 +0200 Subject: [PATCH 04/34] Set working directory before copy, limit copy --- Dockerfile.azure | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile.azure b/Dockerfile.azure index 87b5714..22a29dc 100644 --- a/Dockerfile.azure +++ b/Dockerfile.azure @@ -28,11 +28,12 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog && rm /tmp/google-chrome.deb \ && rm -rf /var/lib/apt/lists/* +WORKDIR /workflow.data.preparation + RUN Rscript -e 'install.packages(c("pak", "renv"))' -COPY . 
/workflow.data.preparation
+COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
-WORKDIR /workflow.data.preparation
RUN Rscript -e '\
readRenviron(".env"); \
non_cran_pkg_deps <- c("RMI-PACTA/pacta.scenario.preparation", "RMI-PACTA/pacta.data.preparation", "RMI-PACTA/pacta.data.scraping"); \
cran_pkg_deps <- setdiff(renv::dependencies()$Package, basename(non_cran_pkg_deps)); \
pak::pkg_install(pkg = c(non_cran_pkg_deps, cran_pkg_deps)); \
'
From 815110b15e877eac21bc769470f0f38e1a3f63e1 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Mon, 25 Sep 2023 22:11:43 +0200
Subject: [PATCH 05/34] Use docker build secrets to pass github auth
---
Dockerfile.azure | 13 ++++---------
README.md | 24 ++++++++++++++++++++++++
install_dependencies.R | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+), 9 deletions(-)
create mode 100644 install_dependencies.R
diff --git a/Dockerfile.azure b/Dockerfile.azure
index 22a29dc..f9fa672 100644
--- a/Dockerfile.azure
+++ b/Dockerfile.azure
@@ -30,16 +30,11 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog
WORKDIR /workflow.data.preparation
-RUN Rscript -e 'install.packages(c("pak", "renv"))'
-
-COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
+COPY ./install_dependencies.R install_dependencies.R
+RUN --mount=type=secret,id=github_pat \
+    Rscript install_dependencies.R
COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
CMD Rscript run_pacta_data_preparation.R
diff --git a/README.md b/README.md
index 8399005..9463f81 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
# workflow.data.preparation
Running the workflow requires a file `.env` to exist in the root directory, that looks like...
+
``` sh
HOST_INPUTS_PATH=/PATH/TO/AR_YYYYQQ
HOST_OUTPUTS_PATH=/PATH/TO/YYYYQQ_pacta_analysis_inputs_YYYY-MM-DD/YYYYQQ
@@ -23,3 +24,26 @@
R_CONFIG_ACTIVE=YYYYQQ
Run `docker-compose up` from the root directory, and docker will build the image (if necessary), and then run the data.prep process given the specified options in the .env file.
Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
+
+## Docker image for Azure Container Instance
+
+`Dockerfile.azure` is intended to be built and run as an Azure Container Instance.
+
+Please note that this Dockerfile is intended to be built using [buildkit](https://docs.docker.com/build/buildkit/), since it relies on passing secrets.
+
+To build this image, create a file containing the _value_ of the GitHub PAT (with access to necessary repos), and build using buildkit:
+
+```sh
+# docker buildx build
+# is equivalent to
+# DOCKER_BUILDKIT=1 docker build
+
+# Note that path to secretfile must be an absolute path
+# or use $(pwd) if in working dir
+
+docker buildx build \
+ --secret id=gh_pat,src=/path/to/secretfile \
+ --tag workflow.data.preparation_aci \
+ -f Dockerfile.azure .
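# A hedged note on what the --secret flag does here: buildkit mounts the
# secret at /run/secrets/<id> only for the RUN step that declares it, which
# is how the Dockerfile above can run
#   RUN --mount=type=secret,id=github_pat Rscript install_dependencies.R
# without the PAT ever being written into an image layer.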
+
+```
diff --git a/install_dependencies.R b/install_dependencies.R
new file mode 100644
index 0000000..cc8b373
--- /dev/null
+++ b/install_dependencies.R
@@ -0,0 +1,40 @@
+dependencies <- c(
+  "DBI",
+  "RSQLite",
+  "config",
+  "dplyr",
+  "readr",
+  "rlang",
+  "rlog",
+  # "stats", # base package, do not update
+  "stringr",
+  "tidyr"
+)
+
+github_dependencies <- c(
+  "RMI-PACTA/pacta.data.preparation",
+  "RMI-PACTA/pacta.data.scraping",
+  "RMI-PACTA/pacta.scenario.preparation"
+)
+
+# get github_pat from docker build secrets
+github_pat <- readLines("/run/secrets/github_pat")
+message("github_pat: ", github_pat)
+if (!nzchar(github_pat)) {
+  stop("github_pat secret is empty. Is it being passed in build secrets?")
+}
+
+install.packages(
+  pkgs = dependencies,
+  repos = "https://packagemanager.posit.co/cran/__linux__/jammy/2023-08-31",
+  dependencies = c("Depends", "Imports", "LinkingTo")
+)
+
+# remotes available as part of rocker/tidyverse
+remotes::install_github(
+  repo = github_dependencies,
+  auth_token = github_pat,
+  dependencies = c("Depends", "Imports", "LinkingTo"),
+  repos = "https://packagemanager.posit.co/cran/__linux__/jammy/2023-08-31",
+  upgrade = "always"
+)
From 658ac323ff3254b89aa514d76bc642b6420c4982 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Mon, 25 Sep 2023 22:29:18 +0200
Subject: [PATCH 06/34] Update documentation, and do not leak secrets
---
README.md | 20 ++++++++++++++------
install_dependencies.R | 1 -
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 9463f81..87cee57 100644
--- a/README.md
+++ b/README.md
@@ -31,19 +31,27 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
Please note that this Dockerfile is intended to be built using [buildkit](https://docs.docker.com/build/buildkit/), since it relies on passing secrets.
-To build this image, create a file containing the _value_ of the GitHub PAT (with access to necessary repos), and build using buildkit:
+To build this image, create a file containing the _value_ of the GitHub PAT (with access to necessary repos), and build using buildkit.
+(note `docker buildx build` is equivalent to `DOCKER_BUILDKIT=1 docker build`)
+
+If your installed docker engine (found by running `docker version`) is > 20.10.0, then the secret can be read from your local `GITHUB_PAT` envvar (must be `export`ed).
```sh
-# docker buildx build
-# is equivalent to
-# DOCKER_BUILDKIT=1 docker build
+docker buildx build \
+ --secret id=github_pat,env=GITHUB_PAT \
+ --progress=plain \
+ --tag workflow.data.preparation_aci \
+ -f Dockerfile.azure .
+```
+
+For older docker versions that support buildkit, you can write the _value_ of the token to a file, and specify the absolute path to that file instead.
+```sh
# Note that path to secretfile must be an absolute path
# or use $(pwd) if in working dir
docker buildx build \
- --secret id=gh_pat,src=/path/to/secretfile \
+ --secret id=github_pat,src=/path/to/secretfile \
 --tag workflow.data.preparation_aci \
 -f Dockerfile.azure .
```
diff --git a/install_dependencies.R b/install_dependencies.R
index cc8b373..3a51f59 100644
--- a/install_dependencies.R
+++ b/install_dependencies.R
@@ -19,7 +19,6 @@ github_dependencies <- c(
# get github_pat from docker build secrets
github_pat <- readLines("/run/secrets/github_pat")
-message("github_pat: ", github_pat)
if (!nzchar(github_pat)) {
stop("github_pat secret is empty. Is it being passed in build secrets?")
}
From e65debc3069028188fedebe73cd7bd3c2f433dc7 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Mon, 25 Sep 2023 22:34:13 +0200
Subject: [PATCH 07/34] Update buildkit information
---
README.md | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 87cee57..f21718f 100644
--- a/README.md
+++ b/README.md
@@ -32,12 +32,16 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image.
Please note that this Dockerfile is intended to be built using [buildkit](https://docs.docker.com/build/buildkit/), since it relies on passing secrets.
To build this image, create a file containing the _value_ of the GitHub PAT (with access to necessary repos), and build using buildkit.
-(note `docker buildx build` is equivalent to `DOCKER_BUILDKIT=1 docker build`)
+
+Up-to-date installations of docker on MacOS and Windows likely already have buildkit enabled.
+It is possible to check your docker configuration for `buildkit: true`.
+If it is not enabled on your system, then you can either prefix the commands below with `DOCKER_BUILDKIT=1` (i.e. `DOCKER_BUILDKIT=1 docker build`), or replace the `docker build` commands below with `docker buildx build` (either works).
If your installed docker engine (found by running `docker version`) is > 20.10.0, then the secret can be read from your local `GITHUB_PAT` envvar (must be `export`ed).
```sh
-docker buildx build \
+# must be built with buildkit
+docker build \
 --secret id=github_pat,env=GITHUB_PAT \
 --progress=plain \
 --tag workflow.data.preparation_aci \
 -f Dockerfile.azure .
```
For older docker versions that support buildkit, you can write the _value_ of the token to a file, and specify the absolute path to that file instead.
```sh
# Note that path to secretfile must be an absolute path
# or use $(pwd) if in working dir
-docker buildx build \
+# must be built with buildkit
+docker build \
 --secret id=github_pat,src=/path/to/secretfile \
 --tag workflow.data.preparation_aci \
 -f Dockerfile.azure .
From 7dea5e5f290bfdbb8e39f3ce6597cd9c3cab035d Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Tue, 3 Oct 2023 08:57:54 +0200
Subject: [PATCH 08/34] Add deploy ARM Template
---
.gitignore | 1 +
Dockerfile.azure | 2 +-
README.md | 39 +++++
deploy/azure-deploy.json | 184 ++++++++++++++++++++
deploy/example-azure-deploy.parameters.json | 52 ++++++
5 files changed, 277 insertions(+), 1 deletion(-)
create mode 100644 deploy/azure-deploy.json
create mode 100644 deploy/example-azure-deploy.parameters.json
diff --git a/.gitignore b/.gitignore
index 6d9b786..67ad48f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
.Ruserdata
.env
.DS_Store
+*parameters.json
diff --git a/Dockerfile.azure b/Dockerfile.azure
index f9fa672..64be17e 100644
--- a/Dockerfile.azure
+++ b/Dockerfile.azure
@@ -37,4 +37,4 @@ RUN --mount=type=secret,id=github_pat \
Rscript install_dependencies.R
COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
-CMD Rscript run_pacta_data_preparation.R
+CMD ["Rscript", "run_pacta_data_preparation.R"]
diff --git a/README.md b/README.md
index f21718f..ceff9f4 100644
--- a/README.md
+++ b/README.md
@@ -60,3 +60,42 @@ docker build \
 --secret id=github_pat,src=/path/to/secretfile \
 --tag workflow.data.preparation_aci \
 -f Dockerfile.azure .
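# Sketch of a pre-flight check for the buildkit requirement described above
# (assumes a stock docker CLI; the daemon.json path varies by install):
#   docker version --format '{{.Server.Version}}'
#   cat /etc/docker/daemon.json   # look for "features": { "buildkit": true }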
```
+
+The image then needs to be pushed to a registry, for use with `azure-deploy.json`.
+
+### Deploy process
+
+#### Prerequisites
+
+[Containers ARM Schema](https://learn.microsoft.com/en-us/azure/templates/microsoft.containerinstance/containergroups?pivots=deployment-language-arm-template#resource-format)
+
+[secrets](https://learn.microsoft.com/en-us/azure/container-apps/manage-secrets?tabs=azure-portal)
+
+- Azure Key Vault: the deploy process reads secrets from an Azure Key Vault. The essential values referenced in the ARM template are:
+  - Storage Account Key for raw data storage (`rawdata-storageAccountKey`)
+  - Storage Account Key for "input" data storage (`dataprepinputs-storageAccountKey`)
+  - Storage Account Key for "output" data storage (`dataprepoutputs-storageAccountKey`)
+  - Username for FactSet database (`factset-database-user`)
+  - Password for FactSet database (`factset-database-password`)
+Note that the Storage account keys are passed as parameters via `azure-deploy.parameters.json`, while the database credentials are used by the application itself, and are __freely readable__ if accessing the container (via `exec`, for example).
+
+To get the storage keys:
+
+```sh
+# replace these values with storage account name and resource group appropriate to your deployment
+ACI_PERS_STORAGE_ACCOUNT_NAME="pactadata"
+ACI_PERS_RESOURCE_GROUP="pacta-data"
+
+STORAGE_KEY=$(az storage account keys list --resource-group "$ACI_PERS_RESOURCE_GROUP" --account-name "$ACI_PERS_STORAGE_ACCOUNT_NAME" --query "[0].value" --output tsv)
+echo "$STORAGE_KEY"
+```
+
+#### Deploy
+
+```sh
+# change this value as needed.
+RESOURCEGROUP="myResourceGroup"
+
+az deployment group create --resource-group "$RESOURCEGROUP" --template-file azure-deploy.json --parameters @azure-deploy.parameters.json
+
+```
diff --git a/deploy/azure-deploy.json b/deploy/azure-deploy.json
new file mode 100644
index 0000000..5b9222b
--- /dev/null
+++ b/deploy/azure-deploy.json
@@ -0,0 +1,184 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+  "contentVersion": "0.0.4",
+  "parameters": {
+    "location": {
+      "type": "string",
+      "defaultValue": "[resourceGroup().location]",
+      "metadata": {
+        "description": "Location for all resources."
+      }
+    },
+    "identity": {
+      "type": "string",
+      "metadata": {
+        "description": "The ID of the user assigned identity to use for the container group."
+      }
+    },
+    "serviceprincipal": {
+      "type": "string",
+      "metadata": {
+        "description": "The ID of the service principal to use for the container group."
+      }
+    },
+    "containerGroupName": {
+      "type": "string",
+      "metadata": {
+        "description": "The name of the container group."
+      }
+    },
+    "restartPolicy": {
+      "type": "string",
+      "defaultValue": "OnFailure",
+      "allowedValues": [
+        "Always",
+        "Never",
+        "OnFailure"
+      ],
+      "metadata": {
+        "description": "The behavior of Azure runtime if container has stopped."
+      }
+    },
+    "rawdata-storageaccountkey": {
+      "type": "securestring",
+      "metadata": {
+        "description": "The storage account key for the rawdata storage account."
+      }
+    },
+    "dataprepinputs-storageaccountkey": {
+      "type": "securestring",
+      "metadata": {
+        "description": "The storage account key for the data-prep inputs storage account."
+      }
+    },
+    "dataprepoutputs-storageaccountkey": {
+      "type": "securestring",
+      "metadata": {
+        "description": "The storage account key for the data-prep outputs storage account."
+ } + }, + "factset-database-user": { + "type": "securestring", + "metadata": { + "description": "The storage account key for the rawdata storage account." + } + }, + "factset-database-password": { + "type": "securestring", + "metadata": { + "description": "The storage account key for the rawdata storage account." + } + }, + "starttime": { + "type": "string", + "defaultValue": "[utcNow()]", + "metadata": { + "description": "The time to start the container group." + } + } + }, + "variables": { + "azurecontainerregistry": "transitionmonitordockerregistry.azurecr.io" + }, + "functions": [], + "resources": [ + { + "type": "Microsoft.ContainerInstance/containerGroups", + "apiVersion": "2021-09-01", + "name": "[parameters('containerGroupName')]", + "location": "[parameters('location')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[parameters('identity')]": {} + } + }, + "properties": { + "containers": [ + { + "name": "data-prep", + "properties": { + "image": "[concat(variables('azurecontainerregistry'),'/workflow.data.preparation_aci:latest')]", + "ports": [], + "resources": { + "requests": { + "cpu": 1, + "memoryInGB": 1 + } + }, + "environmentVariables": [ + { + "name": "R_DATABASE_USER", + "secureValue": "factset-database-user" + }, + { + "name": "R_DATABASE_PASSWORD", + "secureValue": "factset-database-password" + }, + { + "name": "DEPLOY_START_TIME", + "value": "[parameters('starttime')]" + } + ], + "volumeMounts": [ + { + "name": "rawdatavolume", + "mountPath": "/mnt/rawdata/" + }, + { + "name": "inputsvolume", + "mountPath": "/mnt/inputs/" + }, + { + "name": "outputsvolume", + "mountPath": "/mnt/outputs/" + } + ], + "command": [ + "tail", "-f", "/dev/null" + ] + } + } + ], + "imageRegistryCredentials": [ + { + "server": "[variables('azurecontainerregistry')]", + "identity": "[parameters('identity')]" + } + ], + "restartPolicy": "[parameters('restartPolicy')]", + "osType": "Linux", + "volumes": [ + { + "name": "rawdatavolume", + "azureFile": { + "shareName": "rawdata", + "readOnly": true, + "storageAccountName": "pactarawdata", + "storageAccountKey": "[parameters('rawdata-storageaccountkey')]" + } + }, + { + "name": "inputsvolume", + "azureFile": { + "shareName": "data-prep-inputs", + "readOnly": false, + "storageAccountName": "pactadata", + "storageAccountKey": "[parameters('dataprepinputs-storageaccountkey')]" + } + }, + { + "name": "outputsvolume", + "azureFile": { + "shareName": "data-prep-outputs", + "readOnly": false, + "storageAccountName": "pactadata", + "storageAccountKey": "[parameters('dataprepoutputs-storageaccountkey')]" + } + } + ] + } + } + ], + "outputs": {} +} diff --git a/deploy/example-azure-deploy.parameters.json b/deploy/example-azure-deploy.parameters.json new file mode 100644 index 0000000..17016e0 --- /dev/null +++ b/deploy/example-azure-deploy.parameters.json @@ -0,0 +1,52 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentParameters.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "identity": { + "value": "/subscriptions//resourcegroups//providers/Microsoft.ManagedIdentity/userAssignedIdentities/" + }, + "serviceprincipal": { + "value": "" + }, + "rawdata-storageaccountkey": { + "reference": { + "keyVault": { + "id": "/subscriptions//resourceGroups//providers/Microsoft.KeyVault/vaults/" + }, + "secretName": "rawdata-storageaccountkey" + } + }, + "dataprepinputs-storageaccountkey": { + "reference": { + "keyVault": { + "id": 
"/subscriptions//resourceGroups//providers/Microsoft.KeyVault/vaults/" + }, + "secretName": "dataprepinputs-storageaccountkey" + } + }, + "dataprepoutputs-storageaccountkey": { + "reference": { + "keyVault": { + "id": "/subscriptions//resourceGroups//providers/Microsoft.KeyVault/vaults/" + }, + "secretName": "dataprepoutputs-storageaccountkey" + } + }, + "factset-database-user": { + "reference": { + "keyVault": { + "id": "/subscriptions//resourceGroups//providers/Microsoft.KeyVault/vaults/" + }, + "secretName": "factset-database-user" + } + }, + "factset-database-password": { + "reference": { + "keyVault": { + "id": "/subscriptions//resourceGroups//providers/Microsoft.KeyVault/vaults/" + }, + "secretName": "factset-database-password" + } + } + } +} From 239ef62afca99229e778d4f9defc17f996ae7a49 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 3 Oct 2023 08:57:54 +0200 Subject: [PATCH 09/34] Add infrastructure to copy files from rawdata to inputs --- Dockerfile.azure | 2 + README.md | 23 ++++++++++- config.yml | 7 ++++ copy_raw_data.R | 88 ++++++++++++++++++++++++++++++++++++++++ deploy/azure-deploy.json | 20 +++++++-- 5 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 copy_raw_data.R diff --git a/Dockerfile.azure b/Dockerfile.azure index 64be17e..58f5a0e 100644 --- a/Dockerfile.azure +++ b/Dockerfile.azure @@ -36,5 +36,7 @@ RUN --mount=type=secret,id=github_pat \ Rscript install_dependencies.R COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R +COPY ./config.yml config.yml +COPY ./copy_raw_data.R copy_raw_data.R CMD ["Rscript", "run_pacta_data_preparation.R"] diff --git a/README.md b/README.md index ceff9f4..14707f1 100644 --- a/README.md +++ b/README.md @@ -40,12 +40,14 @@ If it is not enabled on your system, then you can either `export DOCKER_BUILDKIT If your installed docker engine (found by running `docker version`) is > 20.10.0, then the secret can read from your local `GITHUB_PAT` envvar (must be `export`ed). ```sh + # must be built with buildkit docker build \ --secret id=github_pat,env=GITHUB_PAT \ --progress=plain \ --tag workflow.data.preparation_aci \ -f Dockerfile.azure . + ``` For older docker versions that support buildkit, you can write the _value_ of the token to a file, and specifiy the absolute path to that file instead. @@ -56,9 +58,10 @@ For older docker versions that support buildkit, you can write the _value_ of th # must be built with buildkit docker build \ - --secret id=github_pat,src=/path/to/secretfile \ + --secret id=github_pat,src=$(pwd)/secretfile \ --tag workflow.data.preparation_aci \ -f Dockerfile.azure . 
+ ``` The image then needs to be pushed to a registry, for use with `azure-deploy.json` @@ -99,3 +102,21 @@ RESOURCEGROUP="myResourceGroup" az deployment group create --resource-group "$RESOURCEGROUP" --template-file azure-deploy.json --parameters @azure-deploy.parameters.json ``` + +### Helpful tips + +To attach to the container and execute commands interactively (for debugging) + +```sh + +az container exec --resource-group "$RESOURCEGROUP" --name "" --container-name "data-prep" --exec-command "/bin/bash" + +``` + +To start a long-running process (to allow for attaching and debugging), add this to `properties` for the container: + +```json + "command": [ + "tail", "-f", "/dev/null" + ] +``` diff --git a/config.yml b/config.yml index ef60be7..0646759 100644 --- a/config.yml +++ b/config.yml @@ -1,3 +1,4 @@ +--- default: data_prep_inputs_path: "/inputs" data_prep_outputs_path: "/outputs" @@ -92,3 +93,9 @@ default: scenario_sources_list: ["GECO2022", "IPR2021", "ISF2021", "WEO2022"] scenario_raw_data_to_include: ["geco_2022", "ipr_2021", "isf_2021", "weo_2022"] global_aggregate_scenario_sources_list: ["WEO2022"] + +2022Q4_CICD: + inherits: 2022Q4 + raw_data_path: !expr file.path("/mnt", "rawdata") + data_prep_inputs_path: !expr file.path("/mnt", "inputs", Sys.etenv( "DEPLOY_START_TIME")) + data_prep_outputs_path: !expr file.path( "/mnt", "outputs", Sys.getenv( "DEPLOY_START_TIME")) diff --git a/copy_raw_data.R b/copy_raw_data.R new file mode 100644 index 0000000..42f0ee9 --- /dev/null +++ b/copy_raw_data.R @@ -0,0 +1,88 @@ +# Load config +library(rlog) + +log_info("Loading config") +cfg <- config::get() + +masterdata_path <- file.path( + cfg[["raw_data_path"]], + "AssetImpact", + "Masterdata", + cfg[["pacta_financial_timestamp"]] +) + +masterdata_debt <- file.path( + masterdata_path, + cfg[["masterdata_debt_filename"]] +) + +masterdata_ownership <- file.path( + masterdata_path, + cfg[["masterdata_ownership_filename"]] +) + +ar_fs_bridge <- file.path( + cfg[["raw_data_path"]], + "AssetImpact", + "FactSet_Bridge", + cfg[["ar_company_id__factset_entity_id_filename"]] +) + +files_to_copy <- c( + masterdata_debt, + masterdata_ownership, + ar_fs_bridge +) + +missing_files <- !file.exists(files_to_copy) +if (any(missing_files)) { + log_error("The following files are missing:") + log_error(files_to_copy[missing_files]) + stop("Please ensure the config points to extant files.") +} + +if (!dir.exists(cfg[["data_prep_inputs_path"]])) { + log_info("Creating data_prep_inputs_path") + dir.create(cfg[["data_prep_inputs_path"]]) +} else { + log_warn("data_prep_inputs_path already exists") +} + +if (!dir.exists(cfg[["data_prep_outputs_path"]])) { + log_info("Creating data_prep_outputs_path") + dir.create(cfg[["data_prep_outputs_path"]]) +} else { + log_warn("data_prep_outputs_path already exists") +} + +log_info("Copying files") +for (x in files_to_copy) { + destination <- file.path(cfg[["data_prep_inputs_path"]], basename(x)) + log_debug(sprintf("Copying %s to %s", x, destination)) + copy_success <- file.copy( + from = x, + to = destination, + overwrite = FALSE + ) + if (!copy_success) { + log_error(sprintf("Failed to copy %s to %s", x, destination)) + stop("Please ensure the config points to extant files.") + } + + source_md5 <- digest::digest( + object = x, + algo = "md5", + file = TRUE + ) + destination_md5 <- digest::digest( + object = destination, + algo = "md5", + file = TRUE + ) + if (source_md5 != destination_md5) { + log_error(sprintf("MD5 mismatch for %s", basename(x))) + stop("MD5 mismatch.") + 
} + +} +log_info("Files copied.") diff --git a/deploy/azure-deploy.json b/deploy/azure-deploy.json index 5b9222b..ebde161 100644 --- a/deploy/azure-deploy.json +++ b/deploy/azure-deploy.json @@ -109,16 +109,30 @@ "environmentVariables": [ { "name": "R_DATABASE_USER", - "secureValue": "factset-database-user" + "secureValue": "[parameters('factset-database-user')]" }, { "name": "R_DATABASE_PASSWORD", - "secureValue": "factset-database-password" + "secureValue": "[parameters('factset-database-password')]" }, { "name": "DEPLOY_START_TIME", "value": "[parameters('starttime')]" - } + }, + { + "name": "R_CONFIG_ACTIVE", + "value": "2022Q4_CICD" + }, + { + "name": "R_CONFIG_FILE", + "value": "/workflow.data.preparation/config.yml" + }, + { + "name": "LOG_LEVEL", + "value": "INFO" + "metadata": { + "description": "The log level for the container. See {rlog} docs." + } ], "volumeMounts": [ { From 6f90760078a8d9ea396e756715df0d2506c57305 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 3 Oct 2023 09:17:41 +0200 Subject: [PATCH 10/34] Rearrange files --- Dockerfile.azure => ACI/Dockerfile.ACI | 0 {deploy => ACI}/azure-deploy.json | 0 copy_raw_data.R => ACI/copy_raw_data.R | 0 {deploy => ACI}/example-azure-deploy.parameters.json | 0 install_dependencies.R => ACI/install_dependencies.R | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename Dockerfile.azure => ACI/Dockerfile.ACI (100%) rename {deploy => ACI}/azure-deploy.json (100%) rename copy_raw_data.R => ACI/copy_raw_data.R (100%) rename {deploy => ACI}/example-azure-deploy.parameters.json (100%) rename install_dependencies.R => ACI/install_dependencies.R (100%) diff --git a/Dockerfile.azure b/ACI/Dockerfile.ACI similarity index 100% rename from Dockerfile.azure rename to ACI/Dockerfile.ACI diff --git a/deploy/azure-deploy.json b/ACI/azure-deploy.json similarity index 100% rename from deploy/azure-deploy.json rename to ACI/azure-deploy.json diff --git a/copy_raw_data.R b/ACI/copy_raw_data.R similarity index 100% rename from copy_raw_data.R rename to ACI/copy_raw_data.R diff --git a/deploy/example-azure-deploy.parameters.json b/ACI/example-azure-deploy.parameters.json similarity index 100% rename from deploy/example-azure-deploy.parameters.json rename to ACI/example-azure-deploy.parameters.json diff --git a/install_dependencies.R b/ACI/install_dependencies.R similarity index 100% rename from install_dependencies.R rename to ACI/install_dependencies.R From 1e2b176aad06561baa4a6adbfe3d0626396c6bc3 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Tue, 14 Nov 2023 18:57:35 +0100 Subject: [PATCH 11/34] WIP: deploy works until factset pull --- ACI/Dockerfile.ACI | 8 +++++--- ACI/azure-deploy.json | 19 ++++++++++++------- ACI/copy_files_and_run_data_prep.sh | 12 ++++++++++++ ACI/copy_raw_data.R | 2 +- README.md | 12 ++++++++---- config.yml | 2 +- run_pacta_data_preparation.R | 16 +++++++++++++--- 7 files changed, 52 insertions(+), 19 deletions(-) create mode 100644 ACI/copy_files_and_run_data_prep.sh diff --git a/ACI/Dockerfile.ACI b/ACI/Dockerfile.ACI index 58f5a0e..d94da15 100644 --- a/ACI/Dockerfile.ACI +++ b/ACI/Dockerfile.ACI @@ -30,13 +30,15 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog WORKDIR /workflow.data.preparation -COPY ./install_dependencies.R install_dependencies.R +COPY ./ACI/install_dependencies.R install_dependencies.R RUN --mount=type=secret,id=github_pat \ Rscript install_dependencies.R COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R COPY ./config.yml config.yml -COPY 
./copy_raw_data.R copy_raw_data.R +COPY ./ACI/copy_raw_data.R copy_raw_data.R -CMD ["Rscript", "run_pacta_data_preparation.R"] +COPY ./ACI/copy_files_and_run_data_prep.sh /usr/local/bin/copy_files_and_run_data_prep + +CMD ["copy_files_and_run_data_prep"] diff --git a/ACI/azure-deploy.json b/ACI/azure-deploy.json index ebde161..31144f8 100644 --- a/ACI/azure-deploy.json +++ b/ACI/azure-deploy.json @@ -93,6 +93,16 @@ "[parameters('identity')]": {} } }, + "metadata": { + "data-prep environmentVariables description": { + "R_DATABASE_USER": "The username for the database.", + "R_DATABASE_PASSWORD": "The password for the database.", + "DEPLOY_START_TIME": "The time the container was deployed.", + "R_CONFIG_ACTIVE": "The active config for the container.", + "R_CONFIG_FILE": "The config file for the container.", + "LOG_LEVEL": "The log level for the container. See {rlog} docs." + } + }, "properties": { "containers": [ { @@ -129,10 +139,8 @@ }, { "name": "LOG_LEVEL", - "value": "INFO" - "metadata": { - "description": "The log level for the container. See {rlog} docs." - } + "value": "DEBUG" + } ], "volumeMounts": [ { @@ -147,9 +155,6 @@ "name": "outputsvolume", "mountPath": "/mnt/outputs/" } - ], - "command": [ - "tail", "-f", "/dev/null" ] } } diff --git a/ACI/copy_files_and_run_data_prep.sh b/ACI/copy_files_and_run_data_prep.sh new file mode 100644 index 0000000..95ccd9e --- /dev/null +++ b/ACI/copy_files_and_run_data_prep.sh @@ -0,0 +1,12 @@ +#! /bin/sh +set -e + +inputs_dir="/mnt/inputs" + +Rscript /workflow.data.preparation/copy_raw_data.R 2>&1 | \ + tee "$inputs_dir/$DEPLOY_START_TIME-copy.log" + +Rscript /workflow.data.preparation/run_pacta_data_preparation.R 2>&1 | \ + tee "$inputs_dir/$DEPLOY_START_TIME-prep.log" + +exit 0 diff --git a/ACI/copy_raw_data.R b/ACI/copy_raw_data.R index 42f0ee9..1f8c296 100644 --- a/ACI/copy_raw_data.R +++ b/ACI/copy_raw_data.R @@ -66,7 +66,7 @@ for (x in files_to_copy) { ) if (!copy_success) { log_error(sprintf("Failed to copy %s to %s", x, destination)) - stop("Please ensure the config points to extant files.") + stop("File copy error") } source_md5 <- digest::digest( diff --git a/README.md b/README.md index 14707f1..d7fbab0 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Use `docker-compose build --no-cache` to force a rebuild of the Docker image. ## Docker image for Azure Container Instance -`Dockerfile.azure` is intended to be built and run as an Azure Container Instance. +`Dockerfile.ACI` is intended to be built and run as an Azure Container Instance. Please note that this Dockerfile is intended to be built using [buildkit](https://docs.docker.com/build/buildkit/), since it relies on passing secrets. @@ -42,11 +42,12 @@ If your installed docker engine (found by running `docker version`) is > 20.10.0 ```sh # must be built with buildkit +# run from repo root docker build \ --secret id=github_pat,env=GITHUB_PAT \ --progress=plain \ --tag workflow.data.preparation_aci \ - -f Dockerfile.azure . + -f ACI/Dockerfile.ACI . ``` @@ -57,10 +58,12 @@ For older docker versions that support buildkit, you can write the _value_ of th # or use $(pwd) if in working dir # must be built with buildkit +# run from repo root docker build \ --secret id=github_pat,src=$(pwd)/secretfile \ + --progress=plain \ --tag workflow.data.preparation_aci \ - -f Dockerfile.azure . + -f ACI/Dockerfile.ACI . ``` @@ -99,7 +102,8 @@ echo "$STORAGE_KEY" # change this value as needed. 
RESOURCEGROUP="myResourceGroup" -az deployment group create --resource-group "$RESOURCEGROUP" --template-file azure-deploy.json --parameters @azure-deploy.parameters.json +# run from repo root +az deployment group create --resource-group "$RESOURCEGROUP" --template-file ACI/azure-deploy.json --parameters @ACI/azure-deploy.parameters.json ``` diff --git a/config.yml b/config.yml index 0646759..8e8280d 100644 --- a/config.yml +++ b/config.yml @@ -97,5 +97,5 @@ default: 2022Q4_CICD: inherits: 2022Q4 raw_data_path: !expr file.path("/mnt", "rawdata") - data_prep_inputs_path: !expr file.path("/mnt", "inputs", Sys.etenv( "DEPLOY_START_TIME")) + data_prep_inputs_path: !expr file.path("/mnt", "inputs", Sys.getenv( "DEPLOY_START_TIME")) data_prep_outputs_path: !expr file.path( "/mnt", "outputs", Sys.getenv( "DEPLOY_START_TIME")) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index dd85364..f7f3e35 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -21,11 +21,21 @@ suppressPackageStartupMessages({ # config ----------------------------------------------------------------------- -readRenviron(".env") +# if any essential envvars are missing, read the .env file. +# These should be set already as part of an ACI deployment. +if (any( + !nzchar(c( + Sys.getenv("R_DATABASE_USER"), + Sys.getenv("R_DATABASE_PASSWORD"), + Sys.getenv("R_CONFIG_ACTIVE") + )) +)) { + readRenviron(".env") +} config <- config::get( - file = "config.yml", + file = Sys.getenv("R_CONFIG_FILE", "config.yml"), config = Sys.getenv("R_CONFIG_ACTIVE"), use_parent = FALSE ) @@ -167,7 +177,7 @@ log_info("Scraping index regions... ") index_regions <- pacta.data.scraping::get_index_regions() -# pull factset data ------------------------------------------------------------ + pull factset data ------------------------------------------------------------ if (update_factset) { log_info("Fetching financial data... ") From 9257d1a70a4b21a80349394768126ddc16ff8da8 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Wed, 17 Jan 2024 14:13:44 +0100 Subject: [PATCH 12/34] Use DESCRIPTION for dependency management --- .gitignore | 1 + ACI/.gitignore | 2 ++ ACI/Dockerfile.ACI | 13 +++++++++++-- ACI/azure-deploy.json | 35 +++++++++++++---------------------- ACI/docker-compose.yml | 23 +++++++++++++++++++++++ DESCRIPTION | 31 +++++++++++++++++++++++++++++++ 6 files changed, 81 insertions(+), 24 deletions(-) create mode 100644 ACI/.gitignore create mode 100644 ACI/docker-compose.yml create mode 100644 DESCRIPTION diff --git a/.gitignore b/.gitignore index 67ad48f..5c18a1a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ .env .DS_Store *parameters.json +github_pat.txt diff --git a/ACI/.gitignore b/ACI/.gitignore new file mode 100644 index 0000000..f72f5fd --- /dev/null +++ b/ACI/.gitignore @@ -0,0 +1,2 @@ +inputs/ +outputs/ diff --git a/ACI/Dockerfile.ACI b/ACI/Dockerfile.ACI index d94da15..8df6b5d 100644 --- a/ACI/Dockerfile.ACI +++ b/ACI/Dockerfile.ACI @@ -30,10 +30,19 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog WORKDIR /workflow.data.preparation -COPY ./ACI/install_dependencies.R install_dependencies.R +# Install R dependencies +COPY DESCRIPTION /workflow.portfolio.parser/DESCRIPTION +# install pak, find dependencises from DESCRIPTION, and install them. 
RUN --mount=type=secret,id=github_pat \
-  Rscript install_dependencies.R
+  Rscript -e "\
+    Sys.setenv(GITHUB_PAT = readLines('/run/secrets/github_pat')); \
+    install.packages('pak'); \
+    deps <- pak::local_deps(root = '/workflow.portfolio.parser'); \
+    pkg_deps <- deps[!deps[['direct']], 'ref']; \
+    pak::pak(pkg_deps); \
+    Sys.unsetenv('GITHUB_PAT') \
+  "
COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
COPY ./config.yml config.yml
diff --git a/ACI/azure-deploy.json b/ACI/azure-deploy.json
index 31144f8..9143358 100644
--- a/ACI/azure-deploy.json
+++ b/ACI/azure-deploy.json
@@ -57,18 +57,6 @@
        "description": "The storage account key for the data-prep outputs storage account."
      }
    },
-    "factset-database-user": {
-      "type": "securestring",
-      "metadata": {
-        "description": "The username for the FactSet database."
-      }
-    },
-    "factset-database-password": {
-      "type": "securestring",
-      "metadata": {
-        "description": "The password for the FactSet database."
-      }
-    },
    "starttime": {
      "type": "string",
      "defaultValue": "[utcNow()]",
      "metadata": {
        "description": "The time to start the container group."
      }
    }
  },
  "variables": {
    "azurecontainerregistry": "transitionmonitordockerregistry.azurecr.io"
  },
  "functions": [],
  "resources": [
    {
      "type": "Microsoft.ContainerInstance/containerGroups",
      "apiVersion": "2021-09-01",
      "name": "[parameters('containerGroupName')]",
      "location": "[parameters('location')]",
      "identity": {
        "type": "UserAssigned",
        "userAssignedIdentities": {
          "[parameters('identity')]": {}
        }
      },
      "metadata": {
        "data-prep environmentVariables description": {
-          "R_DATABASE_USER": "The username for the database.",
-          "R_DATABASE_PASSWORD": "The password for the database.",
          "DEPLOY_START_TIME": "The time the container was deployed.",
          "R_CONFIG_ACTIVE": "The active config for the container.",
          "R_CONFIG_FILE": "The config file for the container.",
          "LOG_LEVEL": "The log level for the container. See {rlog} docs."
        }
      },
      "properties": {
        "containers": [
          {
            "name": "data-prep",
            "properties": {
              "image": "[concat(variables('azurecontainerregistry'),'/workflow.data.preparation_aci:latest')]",
              "ports": [],
              "resources": {
                "requests": {
                  "cpu": 1,
                  "memoryInGB": 1
                }
              },
              "environmentVariables": [
                {
                  "name": "DEPLOY_START_TIME",
                  "value": "[parameters('starttime')]"
                },
                {
                  "name": "R_CONFIG_ACTIVE",
                  "value": "2022Q4_CICD"
                },
                {
                  "name": "R_CONFIG_FILE",
                  "value": "/workflow.data.preparation/config.yml"
                },
                {
                  "name": "LOG_LEVEL",
                  "value": "DEBUG"
                }
              ],
              "volumeMounts": [
+                {
+                  "name": "factset-extracted",
+                  "mountPath": "/mnt/factset-extracted/"
+                },
                {
                  "name": "rawdatavolume",
                  "mountPath": "/mnt/rawdata/"
                },
                {
                  "name": "inputsvolume",
                  "mountPath": "/mnt/inputs/"
                },
                {
                  "name": "outputsvolume",
                  "mountPath": "/mnt/outputs/"
                }
              ]
            }
          }
        ],
        "restartPolicy": "[parameters('restartPolicy')]",
        "osType": "Linux",
        "volumes": [
+          {
+            "name": "factset-extracted",
+            "azureFile": {
+              "shareName": "factset-extracted",
+              "readOnly": true,
+              "storageAccountName": "pactarawdata",
+              "storageAccountKey": "[parameters('rawdata-storageaccountkey')]"
+            }
+          },
          {
            "name": "rawdatavolume",
            "azureFile": {
              "shareName": "rawdata",
              "readOnly": true,
              "storageAccountName": "pactarawdata",
              "storageAccountKey": "[parameters('rawdata-storageaccountkey')]"
            }
          },
diff --git a/ACI/docker-compose.yml b/ACI/docker-compose.yml
new file mode 100644
index 0000000..ef5b715
--- /dev/null
+++ b/ACI/docker-compose.yml
@@ -0,0 +1,23 @@
+version: "3.2"
+
+services:
+  workflow.data.preparation_aci:
+    build:
+      context: ..
+      dockerfile: ACI/Dockerfile.ACI
+    secrets:
+      - github_pat
+    volumes:
+      - type: bind
+        source: ./inputs/factset-extracted
+        target: /mnt/factset-extracted
+      - type: bind
+        source: ./inputs/rawdata
+        target: /mnt/rawdata
+      - type: bind
+        source: ./outputs
+        target: /outputs
+
+secrets:
+  github_pat:
+    file: ./github_pat.txt
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..eb5938b
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,31 @@
+Package: workflow.data.preparation
+Title: What the Package Does (One Line, Title Case)
+Version: 0.0.0.9000
+Authors@R:
+  person("First", "Last", , "first.last@example.com", role = c("aut", "cre"),
+    comment = c(ORCID = "YOUR-ORCID-ID"))
+Description: What the package does (one paragraph).
+License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
+  license
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.2.3
+Imports:
+  DBI,
+  RSQLite
+  RSQLite,
+  config,
+  dplyr,
+  pacta.data.preparation,
+  pacta.data.scraping,
+  pacta.scenario.preparation,
+  readr,
+  rlang,
+  rlog,
+  stats,
+  stringr,
+  tidyr
+Remotes:
+  RMI-PACTA/pacta.data.preparation,
+  RMI-PACTA/pacta.data.scraping,
+  RMI-PACTA/pacta.scenario.preparation
From f8f6dec542fdd803608afa07185d2f4d719c0e38 Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Wed, 17 Jan 2024 14:51:15 +0100
Subject: [PATCH 13/34] Resolve dependency installation
---
ACI/Dockerfile.ACI | 12 +++++++++---
ACI/copy_files_and_run_data_prep.sh | 0
DESCRIPTION | 1 -
3 files changed, 9 insertions(+), 4 deletions(-)
mode change 100644 => 100755 ACI/copy_files_and_run_data_prep.sh
diff --git a/ACI/Dockerfile.ACI b/ACI/Dockerfile.ACI
index 8df6b5d..7ed09cc 100644
--- a/ACI/Dockerfile.ACI
+++ b/ACI/Dockerfile.ACI
@@ -30,18 +30,24 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog
WORKDIR /workflow.data.preparation
+# set frozen CRAN repo
+ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30"
+ARG R_HOME="/usr/local/lib/R"
+RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site"
+
# Install R dependencies
-COPY DESCRIPTION /workflow.portfolio.parser/DESCRIPTION
+COPY DESCRIPTION DESCRIPTION
# install pak, find dependencies from DESCRIPTION, and install them.
RUN --mount=type=secret,id=github_pat \
  Rscript -e "\
    Sys.setenv(GITHUB_PAT = readLines('/run/secrets/github_pat')); \
    install.packages('pak'); \
-    deps <- pak::local_deps(root = '/workflow.portfolio.parser'); \
+    deps <- pak::local_deps(root = '.'); \
    pkg_deps <- deps[!deps[['direct']], 'ref']; \
+    cat(pkg_deps); \
    pak::pak(pkg_deps); \
-    Sys.unsetenv('GITHUB_PAT') \
+    Sys.unsetenv('GITHUB_PAT'); \
  "
COPY ./run_pacta_data_preparation.R run_pacta_data_preparation.R
diff --git a/ACI/copy_files_and_run_data_prep.sh b/ACI/copy_files_and_run_data_prep.sh
old mode 100644
new mode 100755
diff --git a/DESCRIPTION b/DESCRIPTION
index eb5938b..953e9da 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -12,7 +12,6 @@ Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Imports:
  DBI,
-  RSQLite
  RSQLite,
  config,
From 1f862ecd880322b21c6fc4b226e96484f758e23e Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Thu, 18 Jan 2024 19:19:44 +0100
Subject: [PATCH 14/34] Wrap up file copy step
---
ACI/.gitignore | 1 +
ACI/copy_files_and_run_data_prep.sh | 1 +
ACI/copy_raw_data.R | 109 ++++++++++++++++++++--------
ACI/docker-compose.yml | 9 +++
DESCRIPTION | 3 +-
config.yml | 8 +-
6 files changed, 97 insertions(+), 34 deletions(-)
diff --git a/ACI/.gitignore b/ACI/.gitignore
index f72f5fd..14badde 100644
--- a/ACI/.gitignore
+++ b/ACI/.gitignore
@@ -1,2 +1,3 @@
inputs/
outputs/
+dataprep_inputs/
diff --git a/ACI/copy_files_and_run_data_prep.sh b/ACI/copy_files_and_run_data_prep.sh
index 95ccd9e..395ffc1 100755
--- a/ACI/copy_files_and_run_data_prep.sh
+++ b/ACI/copy_files_and_run_data_prep.sh
@@ -3,6 +3,7 @@ set -e
inputs_dir="/mnt/inputs"
+# copy raw data, then run normal data prep script
Rscript /workflow.data.preparation/copy_raw_data.R 2>&1 | \
  tee "$inputs_dir/$DEPLOY_START_TIME-copy.log"
diff --git a/ACI/copy_raw_data.R b/ACI/copy_raw_data.R
index 1f8c296..d3bd1f8 100644
--- a/ACI/copy_raw_data.R
+++ b/ACI/copy_raw_data.R
@@ -1,8 +1,29 @@
-# Load config
-library(rlog)
+logger::log_threshold(Sys.getenv("LOG_LEVEL", "INFO")) +logger::log_formatter(logger::formatter_glue) -log_info("Loading config") +# Check value and format of $DEPLOY_START_TIME +deploy_start_time <- Sys.getenv("DEPLOY_START_TIME", "") +time_pattern <- "^[[:digit:]]{8}T[[:digit:]]{6}Z$" +if (grepl(x = deploy_start_time, pattern = time_pattern)) { + logger::log_debug("DEPLOY_START_TIME: ", deploy_start_time) + logger::log_trace("DEPLOY_START_TIME format is correct. ({time_pattern})") +} else if (nchar(deploy_start_time) == 0L) { + logger::log_error( + "Environment variable $DEPLOY_START_TIME not set or is empty" + ) + stop("Environment variable DEPLOY_START_TIME not set") +} else { + logger::log_warn(" + Environment variable $DEPLOY_START_TIME is not in the expected format. \\ + Expected format: '{time_pattern}'. \\ + Actual value: '{deploy_start_time}'. \\ + This variable is used to ensure consistency in accessing datasets. \\ + ") +} + +logger::log_info("Loading config: ", Sys.getenv("R_CONFIG_ACTIVE", "default")) cfg <- config::get() +logger::log_trace("Config loaded.") masterdata_path <- file.path( cfg[["raw_data_path"]], @@ -10,16 +31,19 @@ masterdata_path <- file.path( "Masterdata", cfg[["pacta_financial_timestamp"]] ) +logger::log_trace("masterdata_path: ", masterdata_path) masterdata_debt <- file.path( masterdata_path, cfg[["masterdata_debt_filename"]] ) +logger::log_trace("masterdata_debt file: ", masterdata_debt) masterdata_ownership <- file.path( masterdata_path, cfg[["masterdata_ownership_filename"]] ) +logger::log_trace("masterdata_ownership file: ", masterdata_ownership) ar_fs_bridge <- file.path( cfg[["raw_data_path"]], @@ -27,62 +51,87 @@ ar_fs_bridge <- file.path( "FactSet_Bridge", cfg[["ar_company_id__factset_entity_id_filename"]] ) +logger::log_trace("ar_fs_bridge file: ", ar_fs_bridge) + +factset_files <- list.files( + path = file.path( + cfg[["factset-extracted_path"]], + cfg[["factset_dataset"]] + ), + include.dirs = FALSE, + full.names = TRUE +) +logger::log_trace("factset_file: {factset_files}") files_to_copy <- c( masterdata_debt, masterdata_ownership, - ar_fs_bridge + ar_fs_bridge#, + # factset_files ) missing_files <- !file.exists(files_to_copy) if (any(missing_files)) { - log_error("The following files are missing:") - log_error(files_to_copy[missing_files]) - stop("Please ensure the config points to extant files.") + logger::log_error("The following files are missing:") + logger::log_error("{files_to_copy[missing_files]}") + stop("Please ensure the config points to existing files.") } -if (!dir.exists(cfg[["data_prep_inputs_path"]])) { - log_info("Creating data_prep_inputs_path") - dir.create(cfg[["data_prep_inputs_path"]]) +if (dir.exists(cfg[["data_prep_inputs_path"]])) { + logger::log_warn("data_prep_inputs_path already exists") } else { - log_warn("data_prep_inputs_path already exists") + logger::log_debug( + "Creating data_prep_inputs_path: {cfg[['data_prep_inputs_path']]}}" + ) + dir.create(cfg[["data_prep_inputs_path"]]) } +logger::log_info( + "copying files to data_prep_inputs_path: {cfg[['data_prep_inputs_path']]}}" +) -if (!dir.exists(cfg[["data_prep_outputs_path"]])) { - log_info("Creating data_prep_outputs_path") - dir.create(cfg[["data_prep_outputs_path"]]) -} else { - log_warn("data_prep_outputs_path already exists") -} +logger::log_info("Copying files") +for (source_file in files_to_copy) { -log_info("Copying files") -for (x in files_to_copy) { - destination <- file.path(cfg[["data_prep_inputs_path"]], basename(x)) - log_debug(sprintf("Copying %s 
to %s", x, destination)) + destination_file <- file.path( + cfg[["data_prep_inputs_path"]], + basename(source_file) + ) + if (file.exists(destination_file)) { + logger::log_warn( + "Destination file already exists: {destination_file}." + ) + } + logger::log_debug("Copying: {source_file} -> {destination_file}") copy_success <- file.copy( - from = x, - to = destination, + from = source_file, + to = destination_file, overwrite = FALSE ) - if (!copy_success) { - log_error(sprintf("Failed to copy %s to %s", x, destination)) + if (copy_success) { + logger::log_trace("Copy success") + } else { + logger::log_error("Failed to copy {source_file} to {destination_file}") stop("File copy error") } source_md5 <- digest::digest( - object = x, + object = source_file, algo = "md5", file = TRUE ) destination_md5 <- digest::digest( - object = destination, + object = destination_file, algo = "md5", file = TRUE ) - if (source_md5 != destination_md5) { - log_error(sprintf("MD5 mismatch for %s", basename(x))) + if (identical(source_md5, destination_md5)) { + logger::log_trace("MD5 match: {unique(source_md5, destination_md5)}") + } else { + logger::log_error(sprintf("MD5 mismatch for %s", basename(source_file))) + logger::log_error("Source MD5: {source_md5} {source_file}") + logger::log_error("Destination MD5: {destination_md5} {destination_file}") stop("MD5 mismatch.") } } -log_info("Files copied.") +logger::log_info("Done copying files") diff --git a/ACI/docker-compose.yml b/ACI/docker-compose.yml index ef5b715..e67dd58 100644 --- a/ACI/docker-compose.yml +++ b/ACI/docker-compose.yml @@ -2,12 +2,21 @@ version: "3.2" services: workflow.data.preparation_aci: + # stdin_open: true + # tty: true + # command: ["R"] build: context: .. dockerfile: ACI/Dockerfile.ACI secrets: - github_pat + environment: + LOG_LEVEL: TRACE + R_CONFIG_ACTIVE: 2022Q4_CICD volumes: + - type: bind + source: ./dataprep_inputs + target: /mnt/dataprep_inputs - type: bind source: ./inputs/factset-extracted target: /mnt/factset-extracted diff --git a/DESCRIPTION b/DESCRIPTION index 953e9da..a6cfe52 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,13 +14,14 @@ Imports: DBI, RSQLite, config, + digest, dplyr, + logger, pacta.data.preparation, pacta.data.scraping, pacta.scenario.preparation, readr, rlang, - rlog, stats, stringr, tidyr diff --git a/config.yml b/config.yml index 8e8280d..fd04823 100644 --- a/config.yml +++ b/config.yml @@ -96,6 +96,8 @@ default: 2022Q4_CICD: inherits: 2022Q4 - raw_data_path: !expr file.path("/mnt", "rawdata") - data_prep_inputs_path: !expr file.path("/mnt", "inputs", Sys.getenv( "DEPLOY_START_TIME")) - data_prep_outputs_path: !expr file.path( "/mnt", "outputs", Sys.getenv( "DEPLOY_START_TIME")) + raw_data_path: "/mnt/rawdata" + data_prep_inputs_path: !expr file.path("/mnt", "dataprep_inputs", Sys.getenv("DEPLOY_START_TIME")) + data_prep_outputs_path: !expr file.path("/mnt", "outputs", Sys.getenv("DEPLOY_START_TIME")) + factset-extracted_path: "/mnt/factset-extracted" + factset_dataset: "factset-pacta_timestamp-20221231T000000Z_pulled-20231221T195325Z" From 4d2dcdb5167ca316931b407ae043f15d036930d9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 20:26:00 +0100 Subject: [PATCH 15/34] copy FS files --- ACI/copy_raw_data.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ACI/copy_raw_data.R b/ACI/copy_raw_data.R index d3bd1f8..160f871 100644 --- a/ACI/copy_raw_data.R +++ b/ACI/copy_raw_data.R @@ -66,8 +66,8 @@ logger::log_trace("factset_file: {factset_files}") files_to_copy <- c( 
  masterdata_debt,
  masterdata_ownership,
-  ar_fs_bridge#,
-  # factset_files
+  ar_fs_bridge,
+  factset_files
)
From b977903144faa378d681d1fc01aa5c79e633985a Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Thu, 18 Jan 2024 21:48:38 +0100
Subject: [PATCH 16/34] convert from `{rlog}` to `{logger}`
---
run_pacta_data_preparation.R | 101 +++++++++++++++++------------------
1 file changed, 50 insertions(+), 51 deletions(-)
diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R
index f453093..a539a06 100644
--- a/run_pacta_data_preparation.R
+++ b/run_pacta_data_preparation.R
@@ -1,3 +1,6 @@
+logger::log_threshold(Sys.getenv("LOG_LEVEL", "INFO"))
+logger::log_formatter(logger::formatter_glue)
+
# necessary packages -----------------------------------------------------------
suppressPackageStartupMessages({
@@ -12,10 +15,6 @@ suppressPackageStartupMessages({
  library(RSQLite)
  library(stringr)
  library(tidyr)
-
-  # used for logging
-  library(rlog)
-  if (interactive()) Sys.setenv("LOG_LEVEL" = "ERROR")
})
@@ -101,7 +100,7 @@ relevant_years <- sort(
    market_share_target_reference_year:(market_share_target_reference_year + time_horizon)
  )
)
-log_info(
+logger::log_info(
  paste(
    "Full time horizon set to:",
    paste0(relevant_years, collapse = ", ")
@@ -133,10 +132,10 @@ if (!update_factset) {
# pre-flight -------------------------------------------------------------------
-log_info("Fetching pre-flight data... ")
+logger::log_info("Fetching pre-flight data... ")
-log_info("Preparing scenario data... ")
+logger::log_info("Preparing scenario data... ")
scenario_raw_data <- bind_rows(scenario_raw_data_to_include)
# scenario values will be linearly interpolated for each group below
@@ -163,14 +162,14 @@ pacta.scenario.preparation::scenario_regions %>%
# web scraping -----------------------------------------------------------------
if (update_currencies) {
-  log_info("Fetching currency data... ")
+  logger::log_info("Fetching currency data... ")
  pacta.data.scraping::get_currency_exchange_rates(
    quarter = imf_quarter_timestamp
  ) %>%
    saveRDS(currencies_data_path)
}
-log_info("Scraping index regions... ")
+logger::log_info("Scraping index regions... ")
index_regions <- pacta.data.scraping::get_index_regions()
# pull factset data ------------------------------------------------------------
if (update_factset) {
-  log_info("Fetching financial data... ")
+  logger::log_info("Fetching financial data... ")
  pacta.data.preparation::get_factset_financial_data(
    data_timestamp = factset_data_timestamp,
    dbname = dbname,
    host = host,
    port = port,
    username = username
  ) %>%
    saveRDS(factset_financial_data_path)
-  log_info("Fetching entity info data... ")
+  logger::log_info("Fetching entity info data... ")
  pacta.data.preparation::get_factset_entity_info(
    dbname = dbname,
    host = host,
    port = port,
    username = username
  ) %>%
    saveRDS(factset_entity_info_path)
-  log_info("Fetching entity financing data... ")
+  logger::log_info("Fetching entity financing data... ")
  pacta.data.preparation::get_factset_entity_financing_data(
    data_timestamp = factset_data_timestamp,
    dbname = dbname,
    host = host,
    port = port,
    username = username
  ) %>%
    saveRDS(factset_entity_financing_data_path)
-  log_info("Fetching fund data... ")
+  logger::log_info("Fetching fund data... 
") pacta.data.preparation::get_factset_fund_data( data_timestamp = factset_data_timestamp, dbname = dbname, @@ -217,7 +216,7 @@ if (update_factset) { ) %>% saveRDS(factset_fund_data_path) - log_info("Fetching fund ISINs... ") + logger::log_info("Fetching fund ISINs... ") pacta.data.preparation::get_factset_isin_to_fund_table( dbname = dbname, host = host, @@ -226,7 +225,7 @@ if (update_factset) { ) %>% saveRDS(factset_isin_to_fund_table_path) - log_info("Fetching ISS emissions data... ") + logger::log_info("Fetching ISS emissions data... ") pacta.data.preparation::get_factset_iss_emissions_data( year = iss_emissions_year, dbname = dbname, @@ -237,12 +236,12 @@ if (update_factset) { saveRDS(factset_iss_emissions_data_path) } -log_info("Pre-flight data prepared.") +logger::log_info("Pre-flight data prepared.") # intermediary files ----------------------------------------------------------- -log_info("Preparing scenario data... ") +logger::log_info("Preparing scenario data... ") scenario_regions <- readr::read_csv(scenario_regions_path, na = "", show_col_types = FALSE) @@ -286,12 +285,12 @@ scenarios_long <- scenario_raw %>% ) ) -log_info("Scenario data prepared.") +logger::log_info("Scenario data prepared.") # currency data output --------------------------------------------------------- -log_info("Saving currencies.rds... ") +logger::log_info("Saving currencies.rds... ") readRDS(currencies_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "currencies.rds")) @@ -299,22 +298,22 @@ readRDS(currencies_data_path) %>% # financial data output -------------------------------------------------------- -log_info("Preparing financial data... ") +logger::log_info("Preparing financial data... ") # read raw FactSet financial data, filter to unique rows, merge AR company_id, # merge PACTA sectors from AR data -log_info("Formatting and saving financial_data.rds... ") +logger::log_info("Formatting and saving financial_data.rds... ") readRDS(factset_financial_data_path) %>% pacta.data.preparation::prepare_financial_data(factset_issue_code_bridge) %>% saveRDS(file.path(data_prep_outputs_path, "financial_data.rds")) -log_info("Formatting and saving entity_financing.rds... ") +logger::log_info("Formatting and saving entity_financing.rds... ") readRDS(factset_entity_financing_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "entity_financing.rds")) -log_info("Formatting and saving entity_info.rds... ") +logger::log_info("Formatting and saving entity_info.rds... ") factset_entity_id__ar_company_id <- readr::read_csv(ar_company_id__factset_entity_id_path, col_types = "c") %>% @@ -327,12 +326,12 @@ readRDS(factset_entity_info_path) %>% pacta.data.preparation::prepare_entity_info(factset_entity_id__ar_company_id) %>% saveRDS(file.path(data_prep_outputs_path, "entity_info.rds")) -log_info("Financial data prepared.") +logger::log_info("Financial data prepared.") # ABCD data output ------------------------------------------------------------- -log_info("Preparing ABCD... ") +logger::log_info("Preparing ABCD... ") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -351,7 +350,7 @@ ar_company_id__credit_parent_ar_company_id <- rm(entity_info) -log_info("Formatting and saving masterdata_ownership_datastore.rds... ") +logger::log_info("Formatting and saving masterdata_ownership_datastore.rds... 
") readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% pacta.data.preparation::prepare_masterdata( @@ -362,7 +361,7 @@ readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% saveRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) -log_info("Formatting and saving masterdata_debt_datastore.rds... ") +logger::log_info("Formatting and saving masterdata_debt_datastore.rds... ") masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE) @@ -401,12 +400,12 @@ rm(company_id__creditor_company_id) rm(ar_company_id__country_of_domicile) rm(ar_company_id__credit_parent_ar_company_id) -log_info("ABCD prepared.") +logger::log_info("ABCD prepared.") # abcd_flags ------------------------------------------------------------------- -log_info("Preparing ABCD flags... ") +logger::log_info("Preparing ABCD flags... ") financial_data <- readRDS(file.path(data_prep_outputs_path, "financial_data.rds")) entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -421,7 +420,7 @@ factset_entity_id__security_mapped_sector <- select(factset_entity_id, security_mapped_sector) -log_info("Formatting and saving abcd_flags_equity.rds... ") +logger::log_info("Formatting and saving abcd_flags_equity.rds... ") ar_company_id__sectors_with_assets__ownership <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% @@ -446,7 +445,7 @@ financial_data %>% saveRDS(file.path(data_prep_outputs_path, "abcd_flags_equity.rds")) -log_info("Formatting and saving abcd_flags_bonds.rds... ") +logger::log_info("Formatting and saving abcd_flags_bonds.rds... ") ar_company_id__sectors_with_assets__debt <- readRDS(file.path(data_prep_outputs_path, "masterdata_debt_datastore.rds")) %>% @@ -484,12 +483,12 @@ rm(financial_data) rm(entity_info) rm(factset_entity_id__ar_company_id) rm(factset_entity_id__security_mapped_sector) -log_info("ABCD flags prepared.") +logger::log_info("ABCD flags prepared.") # fund data output ------------------------------------------------------------- -log_info("Preparing fund data... ") +logger::log_info("Preparing fund data... ") fund_data <- readRDS(factset_fund_data_path) @@ -517,14 +516,14 @@ fund_data %>% saveRDS(file.path(data_prep_outputs_path, "fund_data.rds")) -log_info("Saving total_fund_list.rds... ") +logger::log_info("Saving total_fund_list.rds... ") fund_data %>% select(factset_fund_id) %>% distinct() %>% saveRDS(file.path(data_prep_outputs_path, "total_fund_list.rds")) -log_info("Saving isin_to_fund_table.rds... ") +logger::log_info("Saving isin_to_fund_table.rds... ") isin_to_fund_table <- readRDS(factset_isin_to_fund_table_path) @@ -555,7 +554,7 @@ isin_to_fund_table %>% rm(fund_data) rm(isin_to_fund_table) -log_info("Fund data prepared.") +logger::log_info("Fund data prepared.") # emission data output --------------------------------------------------------- @@ -571,7 +570,7 @@ iss_company_emissions <- ) %>% mutate(icc_total_emissions_units = "tCO2e") # units are defined in the ISS/FactSet documentation (see #144) -log_info("Formatting and saving iss_entity_emission_intensities.rds... ") +logger::log_info("Formatting and saving iss_entity_emission_intensities.rds... ") iss_entity_emission_intensities <- readRDS(factset_entity_financing_data_path) %>% @@ -612,7 +611,7 @@ saveRDS( ) -log_info("Formatting and saving iss_average_sector_emission_intensities.rds... 
") +logger::log_info("Formatting and saving iss_average_sector_emission_intensities.rds... ") factset_entity_info <- readRDS(factset_entity_info_path) @@ -641,12 +640,12 @@ rm(iss_company_emissions) rm(iss_entity_emission_intensities) rm(factset_entity_info) -log_info("Emissions data prepared.") +logger::log_info("Emissions data prepared.") # combined ABCD and scenarios output ------------------------------------------- -log_info("Preparing combined ABCD scenario output... ") +logger::log_info("Preparing combined ABCD scenario output... ") masterdata_ownership_datastore <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% @@ -655,7 +654,7 @@ masterdata_ownership_datastore <- for (scenario_source in unique(scenarios_long$scenario_source)) { filename <- paste0("equity_abcd_scenario_", scenario_source, ".rds") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - log_info(paste0("Formatting and saving ", filename, "... ")) + logger::log_info(paste0("Formatting and saving ", filename, "... ")) pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_ownership_datastore, scenario_data = scenarios_long_source, @@ -673,7 +672,7 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { saveRDS(file.path(data_prep_outputs_path, filename)) } -log_info("Formatting and saving equity_abcd_scenario.rds... ") +logger::log_info("Formatting and saving equity_abcd_scenario.rds... ") list.files( data_prep_outputs_path, pattern = "^equity_abcd_scenario_", @@ -691,7 +690,7 @@ masterdata_debt_datastore <- for (scenario_source in unique(scenarios_long$scenario_source)) { filename <- paste0("bonds_abcd_scenario_", scenario_source, ".rds") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - log_info(paste0("Formatting and saving ", filename, "... ")) + logger::log_info(paste0("Formatting and saving ", filename, "... ")) pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_debt_datastore, scenario_data = scenarios_long_source, @@ -709,7 +708,7 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { saveRDS(file.path(data_prep_outputs_path, filename)) } -log_info("Formatting and saving bonds_abcd_scenario.rds... ") +logger::log_info("Formatting and saving bonds_abcd_scenario.rds... ") list.files( data_prep_outputs_path, pattern = "^bonds_abcd_scenario_", @@ -719,14 +718,14 @@ list.files( bind_rows() %>% saveRDS(file.path(data_prep_outputs_path, "bonds_abcd_scenario.rds")) -log_info("Combined ABCD scenario output prepared.") +logger::log_info("Combined ABCD scenario output prepared.") # export SQLite versions of relevant files ------------------------------------- if (export_sqlite_files) { # entity_info - log_info("Formatting and saving entity_info.sqlite... ") + logger::log_info("Formatting and saving entity_info.sqlite... ") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -750,7 +749,7 @@ if (export_sqlite_files) { rm(entity_info) # equity_abcd_scenario - log_info("Formatting and saving equity_abcd_scenario.sqlite... ") + logger::log_info("Formatting and saving equity_abcd_scenario.sqlite... ") equity_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "equity_abcd_scenario.rds")) @@ -780,7 +779,7 @@ if (export_sqlite_files) { rm(equity_abcd_scenario) # bonds_abcd_scenario - log_info("Formatting and saving bonds_abcd_scenario.sqlite... 
") + logger::log_info("Formatting and saving bonds_abcd_scenario.sqlite... ") bonds_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "bonds_abcd_scenario.rds")) @@ -813,7 +812,7 @@ if (export_sqlite_files) { # manifests of input and output file ------------------------------------------- -log_info("Formatting and saving manifest.json... ") +logger::log_info("Formatting and saving manifest.json... ") ent_entity_affiliates_last_update <- readRDS(factset_entity_info_path) %>% @@ -895,7 +894,7 @@ pacta.data.preparation::write_manifest( # copy in NEWs.md files from relevant PACTA packages --------------------------- -log_info("Copying NEW.md files from relevant PACTA packages... ") +logger::log_info("Copying NEW.md files from relevant PACTA packages... ") # `pacta_packages` defined above to add NEWS text to manifest for (pkg_name in pacta_packages) { @@ -908,4 +907,4 @@ for (pkg_name in pacta_packages) { # ------------------------------------------------------------------------------ -log_info("PACTA Data Preparation Complete.") +logger::log_info("PACTA Data Preparation Complete.") From 845ca1a2e6bc274dbfd7a64f20599861b89744b2 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 23:36:15 +0100 Subject: [PATCH 17/34] Clean logging strings --- run_pacta_data_preparation.R | 100 ++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index a539a06..bc78343 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -7,7 +7,6 @@ suppressPackageStartupMessages({ library(pacta.data.preparation) library(pacta.data.scraping) library(pacta.scenario.preparation) - library(DBI) library(dplyr) library(readr) @@ -101,10 +100,7 @@ relevant_years <- sort( ) ) logger::log_info( - paste( - "Full time horizon set to:", - paste0(relevant_years, collapse = ", ") - ) + "Full time horizon set to: {paste0(relevant_years, collapse = ', ')}." ) scenario_raw_data_to_include <- lapply(scenario_raw_data_to_include, get, envir = asNamespace("pacta.scenario.preparation")) @@ -132,10 +128,10 @@ if (!update_factset) { # pre-flight ------------------------------------------------------------------- -logger::log_info("Fetching pre-flight data... ") +logger::log_info("Fetching pre-flight data.") -logger::log_info("Preparing scenario data... ") +logger::log_info("Preparing scenario data.") scenario_raw_data <- bind_rows(scenario_raw_data_to_include) # scenario values will be linearly interpolated for each group below @@ -162,22 +158,22 @@ pacta.scenario.preparation::scenario_regions %>% # web scraping ----------------------------------------------------------------- if (update_currencies) { - logger::log_info("Fetching currency data... ") + logger::log_info("Fetching currency data.") pacta.data.scraping::get_currency_exchange_rates( quarter = imf_quarter_timestamp ) %>% saveRDS(currencies_data_path) } -logger::log_info("Scraping index regions... ") - +logger::log_info("Scraping index regions.") index_regions <- pacta.data.scraping::get_index_regions() pull factset data ------------------------------------------------------------ if (update_factset) { - logger::log_info("Fetching financial data... 
") + + logger::log_info("Fetching financial data.") pacta.data.preparation::get_factset_financial_data( data_timestamp = factset_data_timestamp, dbname = dbname, @@ -187,7 +183,7 @@ if (update_factset) { ) %>% saveRDS(factset_financial_data_path) - logger::log_info("Fetching entity info data... ") + logger::log_info("Fetching entity info data.") pacta.data.preparation::get_factset_entity_info( dbname = dbname, host = host, @@ -196,7 +192,7 @@ if (update_factset) { ) %>% saveRDS(factset_entity_info_path) - logger::log_info("Fetching entity financing data... ") + logger::log_info("Fetching entity financing data.") pacta.data.preparation::get_factset_entity_financing_data( data_timestamp = factset_data_timestamp, dbname = dbname, @@ -206,7 +202,7 @@ if (update_factset) { ) %>% saveRDS(factset_entity_financing_data_path) - logger::log_info("Fetching fund data... ") + logger::log_info("Fetching fund data.") pacta.data.preparation::get_factset_fund_data( data_timestamp = factset_data_timestamp, dbname = dbname, @@ -216,7 +212,7 @@ if (update_factset) { ) %>% saveRDS(factset_fund_data_path) - logger::log_info("Fetching fund ISINs... ") + logger::log_info("Fetching fund ISINs.") pacta.data.preparation::get_factset_isin_to_fund_table( dbname = dbname, host = host, @@ -225,7 +221,7 @@ if (update_factset) { ) %>% saveRDS(factset_isin_to_fund_table_path) - logger::log_info("Fetching ISS emissions data... ") + logger::log_info("Fetching ISS emissions data.") pacta.data.preparation::get_factset_iss_emissions_data( year = iss_emissions_year, dbname = dbname, @@ -241,7 +237,7 @@ logger::log_info("Pre-flight data prepared.") # intermediary files ----------------------------------------------------------- -logger::log_info("Preparing scenario data... ") +logger::log_info("Preparing scenario data.") scenario_regions <- readr::read_csv(scenario_regions_path, na = "", show_col_types = FALSE) @@ -290,38 +286,33 @@ logger::log_info("Scenario data prepared.") # currency data output --------------------------------------------------------- -logger::log_info("Saving currencies.rds... ") - +logger::log_info("Saving file: \"currencies.rds\".") readRDS(currencies_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "currencies.rds")) # financial data output -------------------------------------------------------- -logger::log_info("Preparing financial data... ") +logger::log_info("Preparing financial data.") # read raw FactSet financial data, filter to unique rows, merge AR company_id, # merge PACTA sectors from AR data -logger::log_info("Formatting and saving financial_data.rds... ") - +logger::log_info("Formatting and saving file: \"financial_data.rds\".") readRDS(factset_financial_data_path) %>% pacta.data.preparation::prepare_financial_data(factset_issue_code_bridge) %>% saveRDS(file.path(data_prep_outputs_path, "financial_data.rds")) -logger::log_info("Formatting and saving entity_financing.rds... ") - +logger::log_info("Formatting and saving file: \"entity_financing.rds\".") readRDS(factset_entity_financing_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "entity_financing.rds")) -logger::log_info("Formatting and saving entity_info.rds... 
") - +logger::log_info("Formatting and saving file: \"entity_info.rds\".") factset_entity_id__ar_company_id <- readr::read_csv(ar_company_id__factset_entity_id_path, col_types = "c") %>% select( factset_entity_id = "factset_id", ar_company_id = "company_id" ) - readRDS(factset_entity_info_path) %>% pacta.data.preparation::prepare_entity_info(factset_entity_id__ar_company_id) %>% saveRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -331,7 +322,7 @@ logger::log_info("Financial data prepared.") # ABCD data output ------------------------------------------------------------- -logger::log_info("Preparing ABCD... ") +logger::log_info("Preparing ABCD.") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -350,8 +341,9 @@ ar_company_id__credit_parent_ar_company_id <- rm(entity_info) -logger::log_info("Formatting and saving masterdata_ownership_datastore.rds... ") - +logger::log_info( + "Formatting and saving file: \"masterdata_ownership_datastore.rds\"." +) readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% pacta.data.preparation::prepare_masterdata( ar_company_id__country_of_domicile, @@ -361,7 +353,9 @@ readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% saveRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) -logger::log_info("Formatting and saving masterdata_debt_datastore.rds... ") +logger::log_info( + "Formatting and saving file: \"masterdata_debt_datastore.rds\"." +) masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE) @@ -405,7 +399,7 @@ logger::log_info("ABCD prepared.") # abcd_flags ------------------------------------------------------------------- -logger::log_info("Preparing ABCD flags... ") +logger::log_info("Preparing ABCD flags.") financial_data <- readRDS(file.path(data_prep_outputs_path, "financial_data.rds")) entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -420,7 +414,7 @@ factset_entity_id__security_mapped_sector <- select(factset_entity_id, security_mapped_sector) -logger::log_info("Formatting and saving abcd_flags_equity.rds... ") +logger::log_info("Formatting and saving file: \"abcd_flags_equity.rds\".") ar_company_id__sectors_with_assets__ownership <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% @@ -445,7 +439,7 @@ financial_data %>% saveRDS(file.path(data_prep_outputs_path, "abcd_flags_equity.rds")) -logger::log_info("Formatting and saving abcd_flags_bonds.rds... ") +logger::log_info("Formatting and saving file: \"abcd_flags_bonds.rds\".") ar_company_id__sectors_with_assets__debt <- readRDS(file.path(data_prep_outputs_path, "masterdata_debt_datastore.rds")) %>% @@ -488,7 +482,7 @@ logger::log_info("ABCD flags prepared.") # fund data output ------------------------------------------------------------- -logger::log_info("Preparing fund data... ") +logger::log_info("Preparing fund data.") fund_data <- readRDS(factset_fund_data_path) @@ -516,14 +510,14 @@ fund_data %>% saveRDS(file.path(data_prep_outputs_path, "fund_data.rds")) -logger::log_info("Saving total_fund_list.rds... ") +logger::log_info("Saving file: \"total_fund_list.rds\".") fund_data %>% select(factset_fund_id) %>% distinct() %>% saveRDS(file.path(data_prep_outputs_path, "total_fund_list.rds")) -logger::log_info("Saving isin_to_fund_table.rds... 
") +logger::log_info("Saving file: \"isin_to_fund_table.rds\".") isin_to_fund_table <- readRDS(factset_isin_to_fund_table_path) @@ -570,7 +564,9 @@ iss_company_emissions <- ) %>% mutate(icc_total_emissions_units = "tCO2e") # units are defined in the ISS/FactSet documentation (see #144) -logger::log_info("Formatting and saving iss_entity_emission_intensities.rds... ") +logger::log_info( + "Formatting and saving file: \"iss_entity_emission_intensities.rds\"." +) iss_entity_emission_intensities <- readRDS(factset_entity_financing_data_path) %>% @@ -611,7 +607,9 @@ saveRDS( ) -logger::log_info("Formatting and saving iss_average_sector_emission_intensities.rds... ") +logger::log_info( + "Formatting and saving file: \"iss_average_sector_emission_intensities.rds\"." +) factset_entity_info <- readRDS(factset_entity_info_path) @@ -645,7 +643,7 @@ logger::log_info("Emissions data prepared.") # combined ABCD and scenarios output ------------------------------------------- -logger::log_info("Preparing combined ABCD scenario output... ") +logger::log_info("Preparing combined ABCD scenario output.") masterdata_ownership_datastore <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% @@ -654,7 +652,7 @@ masterdata_ownership_datastore <- for (scenario_source in unique(scenarios_long$scenario_source)) { filename <- paste0("equity_abcd_scenario_", scenario_source, ".rds") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - logger::log_info(paste0("Formatting and saving ", filename, "... ")) + logger::log_info("Formatting and saving file: \"{filename}\".") pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_ownership_datastore, scenario_data = scenarios_long_source, @@ -672,7 +670,7 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { saveRDS(file.path(data_prep_outputs_path, filename)) } -logger::log_info("Formatting and saving equity_abcd_scenario.rds... ") +logger::log_info("Formatting and saving file: \"equity_abcd_scenario.rds\".") list.files( data_prep_outputs_path, pattern = "^equity_abcd_scenario_", @@ -690,7 +688,7 @@ masterdata_debt_datastore <- for (scenario_source in unique(scenarios_long$scenario_source)) { filename <- paste0("bonds_abcd_scenario_", scenario_source, ".rds") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - logger::log_info(paste0("Formatting and saving ", filename, "... ")) + logger::log_info("Formatting and saving file: \"{filename}\".") pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_debt_datastore, scenario_data = scenarios_long_source, @@ -708,7 +706,7 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { saveRDS(file.path(data_prep_outputs_path, filename)) } -logger::log_info("Formatting and saving bonds_abcd_scenario.rds... ") +logger::log_info("Formatting and saving file: \"bonds_abcd_scenario.rds\".") list.files( data_prep_outputs_path, pattern = "^bonds_abcd_scenario_", @@ -725,7 +723,7 @@ logger::log_info("Combined ABCD scenario output prepared.") if (export_sqlite_files) { # entity_info - logger::log_info("Formatting and saving entity_info.sqlite... ") + logger::log_info("Formatting and saving file: \"entity_info.sqlite\".") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) @@ -749,7 +747,9 @@ if (export_sqlite_files) { rm(entity_info) # equity_abcd_scenario - logger::log_info("Formatting and saving equity_abcd_scenario.sqlite... 
") + logger::log_info( + "Formatting and saving file: \"equity_abcd_scenario.sqlite\"." + ) equity_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "equity_abcd_scenario.rds")) @@ -779,7 +779,9 @@ if (export_sqlite_files) { rm(equity_abcd_scenario) # bonds_abcd_scenario - logger::log_info("Formatting and saving bonds_abcd_scenario.sqlite... ") + logger::log_info( + "Formatting and saving file: \"bonds_abcd_scenario.sqlite\"." + ) bonds_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "bonds_abcd_scenario.rds")) @@ -812,7 +814,7 @@ if (export_sqlite_files) { # manifests of input and output file ------------------------------------------- -logger::log_info("Formatting and saving manifest.json... ") +logger::log_info("Formatting and saving file: \"manifest.json\".") ent_entity_affiliates_last_update <- readRDS(factset_entity_info_path) %>% @@ -894,7 +896,7 @@ pacta.data.preparation::write_manifest( # copy in NEWs.md files from relevant PACTA packages --------------------------- -logger::log_info("Copying NEW.md files from relevant PACTA packages... ") +logger::log_info("Copying NEWS.md files from relevant PACTA packages.") # `pacta_packages` defined above to add NEWS text to manifest for (pkg_name in pacta_packages) { From 6d81834f5e65166c8306f6eff003a7d7ba1607c6 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 23:45:50 +0100 Subject: [PATCH 18/34] add {glue} to dependencies --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index a6cfe52..53460e8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,7 @@ Imports: config, digest, dplyr, + glue, logger, pacta.data.preparation, pacta.data.scraping, From 264d9dc3eff8e07ba05be0352a20ddc55796e192 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 23:56:38 +0100 Subject: [PATCH 19/34] Disable readr progress bar --- .lintr | 3 +++ ACI/Dockerfile.ACI | 2 +- ACI/RProfile.site | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 .lintr create mode 100644 ACI/RProfile.site diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..3ee3055 --- /dev/null +++ b/.lintr @@ -0,0 +1,3 @@ +linters: linters_with_defaults( + line_length_linter = NULL + ) diff --git a/ACI/Dockerfile.ACI b/ACI/Dockerfile.ACI index 7ed09cc..b431287 100644 --- a/ACI/Dockerfile.ACI +++ b/ACI/Dockerfile.ACI @@ -33,7 +33,7 @@ WORKDIR /workflow.data.preparation # set frozen CRAN repo ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" ARG R_HOME="/usr/local/lib/R" -RUN echo "options(repos = c(CRAN = '$CRAN_REPO'), pkg.sysreqs = FALSE)" >> "${R_HOME}/etc/Rprofile.site" +COPY ./ACI/Rprofile.site "${R_HOME}/etc/Rprofile.site" # Install R dependencies COPY DESCRIPTION DESCRIPTION diff --git a/ACI/RProfile.site b/ACI/RProfile.site new file mode 100644 index 0000000..3bca408 --- /dev/null +++ b/ACI/RProfile.site @@ -0,0 +1,5 @@ +options( + pkg.sysreqs = FALSE, + readr.show_progress = FALSE, + repos = c(CRAN = '$CRAN_REPO') +) From 879faac12dd973c3f6987173ef8137c31ed42dbc Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 23:57:06 +0100 Subject: [PATCH 20/34] Don't update factset data on CICD runs --- config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yml b/config.yml index 316c891..d4943cb 100644 --- a/config.yml +++ b/config.yml @@ -103,3 +103,4 @@ default: data_prep_outputs_path: !expr file.path("/mnt", "outputs", Sys.getenv("DEPLOY_START_TIME")) factset-extracted_path: "/mnt/factset-extracted" 
factset_dataset: "factset-pacta_timestamp-20221231T000000Z_pulled-20231221T195325Z" + update_factset: false From 7d0c0d30e37644791d6d40744a9c182bbff5b8db Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Thu, 18 Jan 2024 23:59:20 +0100 Subject: [PATCH 21/34] Ensure output path exists --- run_pacta_data_preparation.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index bc78343..a82cde1 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -68,6 +68,11 @@ scenario_geographies_list <- config$scenario_geographies_list global_aggregate_scenario_sources_list <- config$global_aggregate_scenario_sources_list global_aggregate_sector_list <- config$global_aggregate_sector_list +#ensure data_prep_outputs_path exists +if (!dir.exists(data_prep_outputs_path)) { + dir.create(data_prep_outputs_path) +} +logger::log_info("Data prep outputs path: {data_prep_outputs_path}") # input filepaths -------------------------------------------------------------- @@ -169,7 +174,7 @@ logger::log_info("Scraping index regions.") index_regions <- pacta.data.scraping::get_index_regions() - pull factset data ------------------------------------------------------------ +# pull factset data ------------------------------------------------------------ if (update_factset) { From 65b792c74c02f17ac93b53946881e37f0ffd9844 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 00:07:22 +0100 Subject: [PATCH 22/34] fix bad mount path --- ACI/docker-compose.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ACI/docker-compose.yml b/ACI/docker-compose.yml index e67dd58..7af4c73 100644 --- a/ACI/docker-compose.yml +++ b/ACI/docker-compose.yml @@ -2,9 +2,9 @@ version: "3.2" services: workflow.data.preparation_aci: - # stdin_open: true - # tty: true - # command: ["R"] + stdin_open: true + tty: true + command: ["sh"] build: context: .. 
dockerfile: ACI/Dockerfile.ACI @@ -25,7 +25,7 @@ services: target: /mnt/rawdata - type: bind source: ./outputs - target: /outputs + target: /mnt/outputs secrets: github_pat: From 5b10fa7774fc558cf21eb8ac74deacee086b5f12 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 10:52:12 +0100 Subject: [PATCH 23/34] disable object_name_linter --- .lintr | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.lintr b/.lintr index 3ee3055..52ff41d 100644 --- a/.lintr +++ b/.lintr @@ -1,3 +1,4 @@ linters: linters_with_defaults( - line_length_linter = NULL + line_length_linter = NULL, + object_length_linter = NULL ) From d830af0a6dc1bf09cd6664a14f91501ebda07892 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 10:52:48 +0100 Subject: [PATCH 24/34] improve creating output directory --- run_pacta_data_preparation.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index a82cde1..d76ad50 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -69,7 +69,10 @@ global_aggregate_scenario_sources_list <- config$global_aggregate_scenario_sourc global_aggregate_sector_list <- config$global_aggregate_sector_list #ensure data_prep_outputs_path exists -if (!dir.exists(data_prep_outputs_path)) { +if (dir.exists(data_prep_outputs_path)) { + logger::log_warn("Data prep outputs path already exists.") +} else { + logger::log_debug("Creating data prep outputs path.") dir.create(data_prep_outputs_path) } logger::log_info("Data prep outputs path: {data_prep_outputs_path}") @@ -105,7 +108,7 @@ relevant_years <- sort( ) ) logger::log_info( - "Full time horizon set to: {paste0(relevant_years, collapse = ', ')}." + "Full time horizon set to: {paste0(relevant_years, collapse = ', ')}." 
) scenario_raw_data_to_include <- lapply(scenario_raw_data_to_include, get, envir = asNamespace("pacta.scenario.preparation")) @@ -273,8 +276,8 @@ scenarios_long <- scenario_raw %>% by = c( scenario_source = "source", scenario_geography = "scenario_geography_source" - ) - ) %>% + ) + ) %>% select(-"scenario_geography") %>% rename(scenario_geography = "scenario_geography_pacta") %>% filter( From 9387905167a0119b032fce5aeaef666d4e354ee5 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 13:56:52 +0100 Subject: [PATCH 25/34] prefer `seq` over `x:y` --- run_pacta_data_preparation.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index d76ad50..08b7bf4 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -104,7 +104,11 @@ factset_iss_emissions_data_path <- file.path(data_prep_inputs_path, "factset_iss relevant_years <- sort( unique( - market_share_target_reference_year:(market_share_target_reference_year + time_horizon) + seq( + from = market_share_target_reference_year, + to = (market_share_target_reference_year + time_horizon), + by = 1L + ) ) ) logger::log_info( From ea8dfb8f576bf249cc3d6b7d1609682ff551b27a Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 16:42:26 +0100 Subject: [PATCH 26/34] Add DEBUG and TRACE logging --- run_pacta_data_preparation.R | 282 +++++++++++++++++++++++++++-------- 1 file changed, 218 insertions(+), 64 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 08b7bf4..a568897 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -3,6 +3,7 @@ logger::log_formatter(logger::formatter_glue) # necessary packages ----------------------------------------------------------- +logger::log_debug("Loading necessary packages.") suppressPackageStartupMessages({ library(pacta.data.preparation) library(pacta.data.scraping) @@ -15,12 +16,13 @@ suppressPackageStartupMessages({ library(stringr) library(tidyr) }) - +logger::log_trace("Necessary packages loaded.") # config ----------------------------------------------------------------------- # if any essential envvars are missing, read the .env file. # These should be set already as part of an ACI deployment. 
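As a minimal standalone sketch of the guard this hunk instruments (variable names match the script; illustrative only, not part of the patch):

# Sys.getenv() returns "" for unset variables, so nzchar() flags the gaps;
# the .env fallback only fires when a required value is missing.
required <- c("R_DATABASE_USER", "R_DATABASE_PASSWORD", "R_CONFIG_ACTIVE")
if (any(!nzchar(Sys.getenv(required)))) {
  readRenviron(".env")
}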
+logger::log_debug("Checking for missing envvars.") if (any( !nzchar(c( Sys.getenv("R_DATABASE_USER"), @@ -31,13 +33,16 @@ if (any( readRenviron(".env") } +logger::log_debug("Loading config.") config <- config::get( file = Sys.getenv("R_CONFIG_FILE", "config.yml"), config = Sys.getenv("R_CONFIG_ACTIVE"), use_parent = FALSE ) +logger::log_trace("Config loaded.") +logger::log_debug("Setting config values as R objects.") data_prep_inputs_path <- config$data_prep_inputs_path data_prep_outputs_path <- config$data_prep_outputs_path masterdata_ownership_filename <- config$masterdata_ownership_filename @@ -67,8 +72,10 @@ tech_exclude <- config$tech_exclude scenario_geographies_list <- config$scenario_geographies_list global_aggregate_scenario_sources_list <- config$global_aggregate_scenario_sources_list global_aggregate_sector_list <- config$global_aggregate_sector_list +logger::log_trace("Config values set as R objects.") #ensure data_prep_outputs_path exists +logger::log_debug("Checking data prep outputs path.") if (dir.exists(data_prep_outputs_path)) { logger::log_warn("Data prep outputs path already exists.") } else { @@ -79,13 +86,16 @@ logger::log_info("Data prep outputs path: {data_prep_outputs_path}") # input filepaths -------------------------------------------------------------- +logger::log_debug("Setting input filepaths.") masterdata_ownership_path <- file.path(data_prep_inputs_path, masterdata_ownership_filename) +logger::log_trace("Masterdata ownership path: {masterdata_ownership_path}") masterdata_debt_path <- file.path(data_prep_inputs_path, masterdata_debt_filename) +logger::log_trace("Masterdata debt path: {masterdata_debt_path}") ar_company_id__factset_entity_id_path <- file.path(data_prep_inputs_path, ar_company_id__factset_entity_id_filename) - +logger::log_trace("AR company ID to FactSet entity ID path: {ar_company_id__factset_entity_id_path}") # pre-flight filepaths --------------------------------------------------------- @@ -99,7 +109,6 @@ factset_fund_data_path <- file.path(data_prep_inputs_path, "factset_fund_data.rd factset_isin_to_fund_table_path <- file.path(data_prep_inputs_path, "factset_isin_to_fund_table.rds") factset_iss_emissions_data_path <- file.path(data_prep_inputs_path, "factset_iss_emissions.rds") - # computed options ------------------------------------------------------------- relevant_years <- sort( @@ -115,38 +124,55 @@ logger::log_info( "Full time horizon set to: {paste0(relevant_years, collapse = ', ')}." ) +logger::log_debug("Getting scenario data.") +logger::log_trace("Scenario data to include: {scenario_raw_data_to_include}") scenario_raw_data_to_include <- lapply(scenario_raw_data_to_include, get, envir = asNamespace("pacta.scenario.preparation")) - +logger::log_trace("Scenario data retrieved.") # check that everything is ready to go ----------------------------------------- +logger::log_debug("Checking that AI files exist.") stopifnot(file.exists(masterdata_ownership_path)) stopifnot(file.exists(masterdata_debt_path)) stopifnot(file.exists(ar_company_id__factset_entity_id_path)) +logger::log_trace("AI files exist.") -if (!update_currencies) { +if (update_currencies) { + logger::log_debug( + "update_currencies is TRUE. Skipping preflight check for currency file." + ) +} else { + logger::log_debug("Checking that currencies file exist.") stopifnot(file.exists(currencies_data_path)) + logger::log_trace("Currencies file exist.") } -if (!update_factset) { +if (update_factset) { + logger::log_debug( + "update_factset is TRUE. 
Skipping preflight check for FactSet files."
+  )
+} else {
+  logger::log_debug("Checking that FactSet files exist.")
   stopifnot(file.exists(factset_financial_data_path))
   stopifnot(file.exists(factset_entity_info_path))
   stopifnot(file.exists(factset_entity_financing_data_path))
   stopifnot(file.exists(factset_fund_data_path))
   stopifnot(file.exists(factset_isin_to_fund_table_path))
   stopifnot(file.exists(factset_iss_emissions_data_path))
+  logger::log_trace("FactSet files exist.")
 }
 
-
 # pre-flight -------------------------------------------------------------------
 
 logger::log_info("Fetching pre-flight data.")
 
-
 logger::log_info("Preparing scenario data.")
 
+logger::log_debug("Binding raw scenario data.")
 scenario_raw_data <- bind_rows(scenario_raw_data_to_include)
+logger::log_trace("Raw scenario data bound.")
 
 # scenario values will be linearly interpolated for each group below
+logger::log_debug("Setting interpolation groups.")
 interpolation_groups <- c(
   "source",
   "scenario",
@@ -156,30 +182,52 @@ interpolation_groups <- c(
   "indicator",
   "units"
 )
+logger::log_trace("Interpolation groups set: {interpolation_groups}")
 
+logger::log_debug("
+  Preparing and writing scenario raw data to intermediary file: \\
+  \"{scenarios_analysis_input_path}\".
+")
 scenario_raw_data %>%
   pacta.scenario.preparation::interpolate_yearly(!!!rlang::syms(interpolation_groups)) %>%
   filter(.data$year >= .env$market_share_target_reference_year) %>%
   pacta.scenario.preparation::add_market_share_columns(reference_year = market_share_target_reference_year) %>%
   pacta.scenario.preparation::format_p4i(green_techs) %>%
   write_csv(scenarios_analysis_input_path, na = "")
+logger::log_trace(
+  "Scenario raw data written: \"{scenarios_analysis_input_path}\"."
+)
 
+logger::log_debug("
+  Preparing and writing scenario regions to intermediary file: \\
+  \"{scenario_regions_path}\".
+")
 pacta.scenario.preparation::scenario_regions %>%
   write_csv(scenario_regions_path, na = "")
+logger::log_trace(
+  "Scenario regions written: \"{scenario_regions_path}\"."
+)
 
 # web scraping -----------------------------------------------------------------
 
 if (update_currencies) {
-  logger::log_info("Fetching currency data.")
+  logger::log_info("Fetching and writing currency data to intermediary file: \\
+    \"{currencies_data_path}\".
+  ")
   pacta.data.scraping::get_currency_exchange_rates(
     quarter = imf_quarter_timestamp
   ) %>%
     saveRDS(currencies_data_path)
+  logger::log_trace(
+    "Currency data written: \"{currencies_data_path}\"."
+  )
+} else {
+  logger::log_info("Skipping currency data update.")
 }
 
 logger::log_info("Scraping index regions.")
 index_regions <- pacta.data.scraping::get_index_regions()
-
+logger::log_trace("Index regions scraped.")
 
 # pull factset data ------------------------------------------------------------
 
 if (update_factset) {
 
@@ -246,13 +294,18 @@ if (update_factset) {
 
 logger::log_info("Pre-flight data prepared.")
 
-
 # intermediary files -----------------------------------------------------------
 
 logger::log_info("Preparing scenario data.")
 
+logger::log_debug("
+  Reading scenario regions from intermediary file: \\
+  \"{scenario_regions_path}\". 
+") scenario_regions <- readr::read_csv(scenario_regions_path, na = "", show_col_types = FALSE) +logger::log_trace("Scenario regions read.") +logger::log_debug("preparing factset_issue_code_bridge.") factset_issue_code_bridge <- pacta.data.preparation::factset_issue_code_bridge %>% select(issue_type_code, asset_type) %>% @@ -265,15 +318,23 @@ factset_issue_code_bridge <- TRUE ~ "Others" ) ) +logger::log_trace("factset_issue_code_bridge prepared.") +logger::log_debug("preparing factset_industry_map_bridge.") factset_industry_map_bridge <- pacta.data.preparation::factset_industry_map_bridge %>% select(factset_industry_code, pacta_sector) +logger::log_trace("factset_industry_map_bridge prepared.") # scenarios_analysisinput_inputs +logger::log_debug("Reading raw scenario data from intermediary file: \\ + \"{scenarios_analysis_input_path}\". +") scenario_raw <- readr::read_csv(scenarios_analysis_input_path, show_col_types = FALSE) +logger::log_trace("Raw scenario data read.") # filter for relevant scenario data +logger::log_debug("Filtering raw scenario data and joining geography bridge.") scenarios_long <- scenario_raw %>% inner_join( pacta.scenario.preparation::scenario_source_pacta_geography_bridge, @@ -292,16 +353,16 @@ scenarios_long <- scenario_raw %>% c(.env$relevant_years, .env$market_share_target_reference_year + 10) ) ) +logger::log_trace("Raw scenario data filtered and geography bridge joined.") logger::log_info("Scenario data prepared.") - # currency data output --------------------------------------------------------- -logger::log_info("Saving file: \"currencies.rds\".") +logger::log_info("Exporting file: \"currencies.rds\".") readRDS(currencies_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "currencies.rds")) - +logger::log_debug("Currency data exported.") # financial data output -------------------------------------------------------- @@ -309,52 +370,62 @@ logger::log_info("Preparing financial data.") # read raw FactSet financial data, filter to unique rows, merge AR company_id, # merge PACTA sectors from AR data -logger::log_info("Formatting and saving file: \"financial_data.rds\".") +logger::log_info("Formatting and exporting file: \"financial_data.rds\".") readRDS(factset_financial_data_path) %>% pacta.data.preparation::prepare_financial_data(factset_issue_code_bridge) %>% saveRDS(file.path(data_prep_outputs_path, "financial_data.rds")) +logger::log_debug("Financial data exported.") -logger::log_info("Formatting and saving file: \"entity_financing.rds\".") +logger::log_info("Formatting and exporting file: \"entity_financing.rds\".") readRDS(factset_entity_financing_data_path) %>% saveRDS(file.path(data_prep_outputs_path, "entity_financing.rds")) +logger::log_debug("Entity financing data exported.") -logger::log_info("Formatting and saving file: \"entity_info.rds\".") +logger::log_debug("Reading AR company ID to FactSet entity ID mapping.") factset_entity_id__ar_company_id <- readr::read_csv(ar_company_id__factset_entity_id_path, col_types = "c") %>% select( factset_entity_id = "factset_id", ar_company_id = "company_id" ) +logger::log_trace("AR company ID to FactSet entity ID mapping read.") +logger::log_info("Formatting and exporting file: \"entity_info.rds\".") readRDS(factset_entity_info_path) %>% pacta.data.preparation::prepare_entity_info(factset_entity_id__ar_company_id) %>% saveRDS(file.path(data_prep_outputs_path, "entity_info.rds")) +logger::log_debug("Entity info data exported.") logger::log_info("Financial data prepared.") - # ABCD data output 
------------------------------------------------------------- -logger::log_info("Preparing ABCD.") +logger::log_info("Preparing Asset Based Company Data (ABCD).") +logger::log_debug("Reading entity info.") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) +logger::log_trace("Entity info read.") +logger::log_debug("Preparing AR company ID to country of domicile mapping.") ar_company_id__country_of_domicile <- entity_info %>% select("ar_company_id", "country_of_domicile") %>% filter(!is.na(.data$ar_company_id)) %>% distinct() +logger::log_trace("AR company ID to country of domicile mapping prepared.") +logger::log_debug("Preparing AR company ID to credit parent mapping.") ar_company_id__credit_parent_ar_company_id <- entity_info %>% select("ar_company_id", "credit_parent_ar_company_id") %>% filter(!is.na(.data$ar_company_id)) %>% distinct() +logger::log_trace("AR company ID to credit parent mapping prepared.") +log_trace("removing entity_info to clear memory.") rm(entity_info) - logger::log_info( - "Formatting and saving file: \"masterdata_ownership_datastore.rds\"." + "Formatting and exporting file: \"masterdata_ownership_datastore.rds\"." ) readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% pacta.data.preparation::prepare_masterdata( @@ -363,20 +434,23 @@ readr::read_csv(masterdata_ownership_path, na = "", show_col_types = FALSE) %>% zero_emission_factor_techs ) %>% saveRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) +logger::log_debug("Masterdata ownership exported.") - -logger::log_info( - "Formatting and saving file: \"masterdata_debt_datastore.rds\"." -) - +logger::log_debug("Reading masterdata debt.") masterdata_debt <- readr::read_csv(masterdata_debt_path, na = "", show_col_types = FALSE) +logger::log_trace("Masterdata debt read.") +logger::log_debug("Preparing AR company ID to creditor company ID mapping.") company_id__creditor_company_id <- masterdata_debt %>% select("company_id", "creditor_company_id") %>% distinct() %>% mutate(across(.cols = dplyr::everything(), .fns = as.character)) +logger::log_trace("AR company ID to creditor company ID mapping prepared.") +logger::log_info( + "Formatting and saving file: \"masterdata_debt_datastore.rds\"." 
+) masterdata_debt %>% pacta.data.preparation::prepare_masterdata( ar_company_id__country_of_domicile, @@ -399,35 +473,42 @@ masterdata_debt %>% .groups = "drop" ) %>% saveRDS(file.path(data_prep_outputs_path, "masterdata_debt_datastore.rds")) +logger::log_debug("Masterdata debt exported.") +logger::log_trace("removing objects to clear memory.") rm(masterdata_debt) rm(company_id__creditor_company_id) - rm(ar_company_id__country_of_domicile) rm(ar_company_id__credit_parent_ar_company_id) logger::log_info("ABCD prepared.") - # abcd_flags ------------------------------------------------------------------- logger::log_info("Preparing ABCD flags.") + +logger::log_debug("Reading financial data.") financial_data <- readRDS(file.path(data_prep_outputs_path, "financial_data.rds")) +logger::log_trace("Financial data read.") +logger::log_debug("Reading entity info.") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) +logger::log_trace("Entity info read.") +logger::log_debug("Preparing AR company ID to FactSet entity ID mapping.") factset_entity_id__ar_company_id <- entity_info %>% select(factset_entity_id, ar_company_id) %>% filter(!is.na(ar_company_id)) +logger::log_trace("AR company ID to FactSet entity ID mapping prepared.") +logger::log_debug("Preparing FactSet entity ID to security sector mapping.") factset_entity_id__security_mapped_sector <- entity_info %>% select(factset_entity_id, security_mapped_sector) +logger::log_trace("FactSet entity ID to security sector mapping prepared.") - -logger::log_info("Formatting and saving file: \"abcd_flags_equity.rds\".") - +logger::log_debug("Preparing AR Ownership company ID to sector mapping.") ar_company_id__sectors_with_assets__ownership <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% filter(year %in% relevant_years) %>% @@ -435,7 +516,9 @@ ar_company_id__sectors_with_assets__ownership <- distinct() %>% group_by(ar_company_id) %>% summarise(sectors_with_assets = paste(unique(ald_sector), collapse = " + ")) +logger::log_trace("AR ownership company ID to sector mapping prepared.") +logger::log_info("Formatting and exporting file: \"abcd_flags_equity.rds\".") financial_data %>% left_join(factset_entity_id__ar_company_id, by = "factset_entity_id") %>% left_join(factset_entity_id__security_mapped_sector, by = "factset_entity_id") %>% @@ -449,10 +532,9 @@ financial_data %>% sectors_with_assets ) %>% saveRDS(file.path(data_prep_outputs_path, "abcd_flags_equity.rds")) +logger::log_debug("Equity ABCD flags exported.") - -logger::log_info("Formatting and saving file: \"abcd_flags_bonds.rds\".") - +logger::log_debug("Preparing AR Debt company ID to sector mapping.") ar_company_id__sectors_with_assets__debt <- readRDS(file.path(data_prep_outputs_path, "masterdata_debt_datastore.rds")) %>% filter(year %in% relevant_years) %>% @@ -460,7 +542,9 @@ ar_company_id__sectors_with_assets__debt <- distinct() %>% group_by(ar_company_id) %>% summarise(sectors_with_assets = paste(unique(ald_sector), collapse = " + ")) +logger::log_trace("AR debt company ID to sector mapping prepared.") +logger::log_info("Formatting and exporting file: \"abcd_flags_bonds.rds\".") financial_data %>% left_join(factset_entity_id__ar_company_id, by = "factset_entity_id") %>% left_join(factset_entity_id__security_mapped_sector, by = "factset_entity_id") %>% @@ -483,29 +567,37 @@ financial_data %>% ) %>% ungroup() %>% saveRDS(file.path(data_prep_outputs_path, "abcd_flags_bonds.rds")) +logger::log_debug("Bonds ABCD flags exported.") - 
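For reference, a toy sketch (invented data, not from the patch) of how the sectors_with_assets flag built above collapses each company's sectors into a single " + "-separated string:

library(dplyr)
toy <- data.frame(
  ar_company_id = c("C1", "C1", "C2"),
  ald_sector = c("Power", "Coal", "Automotive")
)
toy %>%
  group_by(ar_company_id) %>%
  summarise(sectors_with_assets = paste(unique(ald_sector), collapse = " + "))
# C1 -> "Power + Coal"; C2 -> "Automotive"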
+logger::log_trace("removing objects to clear memory.") rm(financial_data) rm(entity_info) rm(factset_entity_id__ar_company_id) rm(factset_entity_id__security_mapped_sector) logger::log_info("ABCD flags prepared.") - # fund data output ------------------------------------------------------------- logger::log_info("Preparing fund data.") +logger::log_debug("Reading fund data.") fund_data <- readRDS(factset_fund_data_path) +logger::log_trace("Fund data read.") +logger::log_debug(" + Filtering fund data to include funds with reported holdings appoximately \\ + equal to reported market value + ") # remove funds above the threshold fund_data <- fund_data %>% group_by(factset_fund_id, fund_reported_mv) %>% filter((fund_reported_mv[[1]] - sum(holding_reported_mv)) / fund_reported_mv[[1]] > -1e-5) %>% ungroup() +logger::log_trace("Fund data filtered.") # build MISSINGWEIGHT for under and over +logger::log_debug("Building MISSINGWEIGHT for under and over.") fund_missing_mv <- fund_data %>% group_by(factset_fund_id, fund_reported_mv) %>% @@ -516,20 +608,20 @@ fund_missing_mv <- ) %>% ungroup() %>% filter(holding_reported_mv != 0) +logger::log_trace("MISSINGWEIGHT built.") +logger::log_info("Preparing and exporting file: \"fund_data.rds\".") fund_data %>% bind_rows(fund_missing_mv) %>% saveRDS(file.path(data_prep_outputs_path, "fund_data.rds")) +logger::log_debug("Fund data exported.") - -logger::log_info("Saving file: \"total_fund_list.rds\".") +logger::log_info("Preparing and exporting file: \"total_fund_list.rds\".") fund_data %>% select(factset_fund_id) %>% distinct() %>% saveRDS(file.path(data_prep_outputs_path, "total_fund_list.rds")) - - -logger::log_info("Saving file: \"isin_to_fund_table.rds\".") +logger::log_debug("Total fund list exported.") isin_to_fund_table <- readRDS(factset_isin_to_fund_table_path) @@ -553,20 +645,24 @@ isin_to_fund_table <- ungroup() %>% select(-n, -has_fund_data) +logger::log_info("Exporting file: \"isin_to_fund_table.rds\".") isin_to_fund_table %>% saveRDS(file.path(data_prep_outputs_path, "isin_to_fund_table.rds")) +logger::log_debug("ISIN to fund table exported.") +logger::log_info("Fund data prepared.") +logger::log_trace("removing objects to clear memory.") rm(fund_data) rm(isin_to_fund_table) -logger::log_info("Fund data prepared.") - - # emission data output --------------------------------------------------------- +logger::log_debug("Reading currencies data.") currencies <- readRDS(file.path(data_prep_outputs_path, "currencies.rds")) +logger::log_trace("Currencies data read.") +logger::log_debug("Preparing ISS company emissions data.") iss_company_emissions <- readRDS(factset_iss_emissions_data_path) %>% group_by(factset_entity_id) %>% @@ -575,11 +671,9 @@ iss_company_emissions <- .groups = "drop" ) %>% mutate(icc_total_emissions_units = "tCO2e") # units are defined in the ISS/FactSet documentation (see #144) +logger::log_trace("ISS company emissions data prepared.") -logger::log_info( - "Formatting and saving file: \"iss_entity_emission_intensities.rds\"." -) - +logger::log_debug("Preparing ISS entity emission intensities.") iss_entity_emission_intensities <- readRDS(factset_entity_financing_data_path) %>% left_join(currencies, by = "currency") %>% @@ -612,19 +706,24 @@ iss_entity_emission_intensities <- ff_debt, units = paste0(icc_total_emissions_units, " / ", "$ USD") ) +logger::log_trace("ISS entity emission intensities prepared.") +logger::log_info( + "Formatting and exporting file: \"iss_entity_emission_intensities.rds\"." 
+) saveRDS( select(iss_entity_emission_intensities, -c("ff_mkt_val", "ff_debt")), file.path(data_prep_outputs_path, "iss_entity_emission_intensities.rds") ) +logger::log_debug("ISS entity emission intensities exported.") +logger::log_debug("Reading entity info.") +factset_entity_info <- readRDS(factset_entity_info_path) +logger::log_trace("Entity info read.") logger::log_info( - "Formatting and saving file: \"iss_average_sector_emission_intensities.rds\"." + "Formatting and exporting file: \"iss_average_sector_emission_intensities.rds\"." ) - -factset_entity_info <- readRDS(factset_entity_info_path) - iss_entity_emission_intensities %>% inner_join(factset_entity_info, by = "factset_entity_id") %>% group_by(sector_code, factset_sector_desc, units) %>% @@ -643,8 +742,9 @@ iss_entity_emission_intensities %>% ) %>% ungroup() %>% saveRDS(file.path(data_prep_outputs_path, "iss_average_sector_emission_intensities.rds")) +logger::log_debug("ISS average sector emission intensities exported.") - +logger::log_trace("removing objects to clear memory.") rm(currencies) rm(iss_company_emissions) rm(iss_entity_emission_intensities) @@ -652,19 +752,27 @@ rm(factset_entity_info) logger::log_info("Emissions data prepared.") - # combined ABCD and scenarios output ------------------------------------------- logger::log_info("Preparing combined ABCD scenario output.") +logger::log_debug("Reading masterdata ownership, filtering to relevant years.") masterdata_ownership_datastore <- readRDS(file.path(data_prep_outputs_path, "masterdata_ownership_datastore.rds")) %>% filter(year %in% relevant_years) +logger::log_trace("Masterdata ownership read and filtered.") +logger::log_debug("Preparing individual equity scenario ABCD files.") for (scenario_source in unique(scenarios_long$scenario_source)) { + logger::log_debug( + "Preparing equity ABCD scenario output for source: \"{scenario_source}\"." + ) filename <- paste0("equity_abcd_scenario_", scenario_source, ".rds") + logger::log_trace("Filtering scenario data: \"{scenario_source}\".") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - logger::log_info("Formatting and saving file: \"{filename}\".") + logger::log_info( + "Formatting and exporting scenario ABCD file: \"{filename}\"." 
+ ) pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_ownership_datastore, scenario_data = scenarios_long_source, @@ -680,7 +788,11 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { index_regions = index_regions ) %>% saveRDS(file.path(data_prep_outputs_path, filename)) + logger::log_debug( + "equity ABCD scenario output for source: \"{scenario_source}\" exported" + ) } +logger::log_debug("Individual equity scenario ABCD files prepared.") logger::log_info("Formatting and saving file: \"equity_abcd_scenario.rds\".") list.files( @@ -691,16 +803,24 @@ list.files( lapply(readRDS) %>% bind_rows() %>% saveRDS(file.path(data_prep_outputs_path, "equity_abcd_scenario.rds")) +logger::log_debug("Equity ABCD scenario output prepared.") - +logger::log_debug("Reading masterdata debt, filtering to relevant years.") masterdata_debt_datastore <- readRDS(file.path(data_prep_outputs_path, "masterdata_debt_datastore.rds")) %>% filter(year %in% relevant_years) +logger::log_trace("Masterdata debt read and filtered.") +logger::log_debug("Preparing individual bonds scenario ABCD files.") for (scenario_source in unique(scenarios_long$scenario_source)) { + logger::log_debug( + "Preparing bonds ABCD scenario output for source: \"{scenario_source}\"." + ) filename <- paste0("bonds_abcd_scenario_", scenario_source, ".rds") scenarios_long_source <- filter(scenarios_long, .data$scenario_source == .env$scenario_source) - logger::log_info("Formatting and saving file: \"{filename}\".") + logger::log_info( + "Formatting and exporting scenario ABCD file: \"{filename}\"." + ) pacta.data.preparation::dataprep_abcd_scen_connection( abcd_data = masterdata_debt_datastore, scenario_data = scenarios_long_source, @@ -716,6 +836,9 @@ for (scenario_source in unique(scenarios_long$scenario_source)) { index_regions = index_regions ) %>% saveRDS(file.path(data_prep_outputs_path, filename)) + logger::log_debug( + "Bonds ABCD scenario output for source: \"{scenario_source}\" exported" + ) } logger::log_info("Formatting and saving file: \"bonds_abcd_scenario.rds\".") @@ -727,25 +850,32 @@ list.files( lapply(readRDS) %>% bind_rows() %>% saveRDS(file.path(data_prep_outputs_path, "bonds_abcd_scenario.rds")) +logger::log_debug("Bonds ABCD scenario output prepared.") logger::log_info("Combined ABCD scenario output prepared.") - # export SQLite versions of relevant files ------------------------------------- if (export_sqlite_files) { + logger::log_info("Exporting SQLite versions of relevant files.") + # entity_info logger::log_info("Formatting and saving file: \"entity_info.sqlite\".") + logger::log_debug("Reading entity info.") entity_info <- readRDS(file.path(data_prep_outputs_path, "entity_info.rds")) + logger::log_trace("Entity info read.") + logger::log_debug("Establishing SQLite connection.") con <- DBI::dbConnect( drv = RSQLite::SQLite(), dbname = file.path(data_prep_outputs_path, "entity_info.sqlite") ) RSQLite::sqliteSetBusyHandler(con, 3000L) + logger::log_trace("SQLite connection established.") + logger::log_debug("Writing entity info to SQLite file.") dplyr::copy_to( dest = con, df = entity_info, @@ -754,7 +884,9 @@ if (export_sqlite_files) { temporary = FALSE, indexes = list("factset_entity_id") ) + logger::log_trace("Entity info written to SQLite file.") + logger::log_debug("Closing SQLite connection and freeing memory") DBI::dbDisconnect(con) rm(entity_info) @@ -762,16 +894,20 @@ if (export_sqlite_files) { logger::log_info( "Formatting and saving file: 
\"equity_abcd_scenario.sqlite\"." ) - + logger::log_debug("Reading equity ABCD scenario data.") equity_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "equity_abcd_scenario.rds")) + logger::log_trace("Equity ABCD scenario data read.") + logger::log_debug("Establishing SQLite connection.") con <- DBI::dbConnect( drv = RSQLite::SQLite(), dbname = file.path(data_prep_outputs_path, "equity_abcd_scenario.sqlite") ) RSQLite::sqliteSetBusyHandler(con, 3000L) + logger::log_trace("SQLite connection established.") + logger::log_debug("Writing equity ABCD scenario data to SQLite file.") dplyr::copy_to( dest = con, df = equity_abcd_scenario, @@ -786,7 +922,9 @@ if (export_sqlite_files) { "ald_sector" ) ) + logger::log_trace("Equity ABCD scenario data written to SQLite file.") + logger::log_debug("Closing SQLite connection and freeing memory") DBI::dbDisconnect(con) rm(equity_abcd_scenario) @@ -795,15 +933,20 @@ if (export_sqlite_files) { "Formatting and saving file: \"bonds_abcd_scenario.sqlite\"." ) + logger::log_debug("Reading bonds ABCD scenario data.") bonds_abcd_scenario <- readRDS(file.path(data_prep_outputs_path, "bonds_abcd_scenario.rds")) + logger::log_trace("Bonds ABCD scenario data read.") + logger::log_debug("Establishing SQLite connection.") con <- DBI::dbConnect( drv = RSQLite::SQLite(), dbname = file.path(data_prep_outputs_path, "bonds_abcd_scenario.sqlite") ) RSQLite::sqliteSetBusyHandler(con, 3000L) + logger::log_trace("SQLite connection established.") + logger::log_debug("Writing bonds ABCD scenario data to SQLite file.") dplyr::copy_to( dest = con, df = bonds_abcd_scenario, @@ -818,23 +961,30 @@ if (export_sqlite_files) { "ald_sector" ) ) + logger::log_trace("Bonds ABCD scenario data written to SQLite file.") + logger::log_debug("Closing SQLite connection and freeing memory") DBI::dbDisconnect(con) rm(bonds_abcd_scenario) +} else { + logger::log_info("Skipping SQLite file export.") } - # manifests of input and output file ------------------------------------------- logger::log_info("Formatting and saving file: \"manifest.json\".") +# get the last update date of the ent_entity_affiliates table +logger::log_debug("Reading ent_entity_affiliates last update.") ent_entity_affiliates_last_update <- readRDS(factset_entity_info_path) %>% filter(!is.na(ent_entity_affiliates_last_update)) %>% pull(ent_entity_affiliates_last_update) %>% unique() +logger::log_trace("ent_entity_affiliates last update read.") # include PACTA packages NEWS.md test in the parameters to export +logger::log_debug("Reading NEWS.md files from relevant PACTA packages.") pacta_packages <- c("pacta.data.preparation", "pacta.scenario.preparation") package_news <- vapply( @@ -846,7 +996,9 @@ package_news <- FUN.VALUE = list(1), USE.NAMES = TRUE ) +logger::log_trace("NEWS.md files read.") +logger::log_debug("Preparing metadata parameters.") parameters <- list( input_filepaths = list( @@ -897,28 +1049,30 @@ parameters <- update_factset = update_factset, package_news = package_news ) +logger::log_trace("Metadata parameters prepared.") +logger::log_debug("Writing manifest file.") pacta.data.preparation::write_manifest( path = file.path(data_prep_outputs_path, "manifest.json"), parameters = parameters, data_prep_inputs_path = data_prep_inputs_path, data_prep_outputs_path = data_prep_outputs_path ) +logger::log_trace("Manifest file written.") - -# copy in NEWs.md files from relevant PACTA packages --------------------------- +# copy in NEWS.md files from relevant PACTA packages --------------------------- 
logger::log_info("Copying NEWS.md files from relevant PACTA packages.") - # `pacta_packages` defined above to add NEWS text to manifest for (pkg_name in pacta_packages) { + logger::log_debug("Copying NEWS.md file from package: \"{pkg_name}\".") file.copy( system.file("NEWS.md", package = pkg_name), to = file.path(data_prep_outputs_path, paste0(pkg_name, "-NEWS.md")) ) + logger::log_trace("NEWS.md file copied.") } - # ------------------------------------------------------------------------------ logger::log_info("PACTA Data Preparation Complete.") From 4222390d148ea503cdcdcd2420869451f2fd5636 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 16:58:40 +0100 Subject: [PATCH 27/34] remove unused file --- ACI/install_dependencies.R | 39 -------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 ACI/install_dependencies.R diff --git a/ACI/install_dependencies.R b/ACI/install_dependencies.R deleted file mode 100644 index 3a51f59..0000000 --- a/ACI/install_dependencies.R +++ /dev/null @@ -1,39 +0,0 @@ -dependencies <- c( - "DBI", - "RSQLite", - "config", - "dplyr", - "readr", - "rlang", - "rlog", - # "stats", # base package, do not update - "stringr", - "tidyr" -) - -github_dependencies <- c( - "RMI-PACTA/pacta.data.preparation", - "RMI-PACTA/pacta.data.scraping", - "RMI-PACTA/pacta.scenario.preparation" -) - -# get github_pat from docker build secrets -github_pat <- readLines("/run/secrets/github_pat") -if (!nzchar(github_pat)) { - stop("github_pat secret is empty. Is it being passed in build secrets?") -} - -install.packages( - pkgs = dependencies, - repos = "https://packagemanager.posit.co/cran/__linux__/jammy/2023-08-31", - dependencies = c("Depends", "Imports", "LinkingTo") -) - -# remotes available as part of rocker/tidyverse -remotes::install_github( - repo = github_dependencies, - auth_token = github_pat, - dependencies = c("Depends", "Imports", "LinkingTo"), - repos = "https://packagemanager.posit.co/cran/__linux__/jammy/2023-08-31", - upgrade = "always" -) From 4466032403878b4fdfb5fdd798090a5cc3fa6d88 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 18:03:41 +0100 Subject: [PATCH 28/34] don't check for missing envvars, just read .env --- run_pacta_data_preparation.R | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index a568897..46bfa2e 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -22,16 +22,8 @@ logger::log_trace("Necessary packages loaded.") # if any essential envvars are missing, read the .env file. # These should be set already as part of an ACI deployment. 
-logger::log_debug("Checking for missing envvars.") -if (any( - !nzchar(c( - Sys.getenv("R_DATABASE_USER"), - Sys.getenv("R_DATABASE_PASSWORD"), - Sys.getenv("R_CONFIG_ACTIVE") - )) -)) { - readRenviron(".env") -} +logger::log_debug("Reading .env file.") +readRenviron(".env") logger::log_debug("Loading config.") config <- From 4d2dee16fb8125bea96bfced0a73a8b41f02fdf9 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Fri, 19 Jan 2024 18:35:30 +0100 Subject: [PATCH 29/34] Add pak options to not update sysreqs db --- ACI/RProfile.site | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ACI/RProfile.site b/ACI/RProfile.site index 3bca408..9e29c1d 100644 --- a/ACI/RProfile.site +++ b/ACI/RProfile.site @@ -1,5 +1,7 @@ options( pkg.sysreqs = FALSE, + pkg.sysreqs_db_update = FALSE, + pkg.sysreqs_update = FALSE, readr.show_progress = FALSE, repos = c(CRAN = '$CRAN_REPO') ) From 676a778531ad03c8b5cab0524a08c833604d298b Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 20 Jan 2024 18:04:21 +0100 Subject: [PATCH 30/34] Add tar to pack up files --- run_pacta_data_preparation.R | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 46bfa2e..75bd82e 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -413,7 +413,7 @@ ar_company_id__credit_parent_ar_company_id <- distinct() logger::log_trace("AR company ID to credit parent mapping prepared.") -log_trace("removing entity_info to clear memory.") +logger::log_trace("removing entity_info to clear memory.") rm(entity_info) logger::log_info( @@ -1067,4 +1067,29 @@ for (pkg_name in pacta_packages) { # ------------------------------------------------------------------------------ +# Create tar file if requested +if (create_tar) { + logger::log_info("Creating tar file.") + tar_file_path <- file.path( + data_prep_outputs_path, + paste0(basename(data_prep_outputs_path), ".tar.gz") + ) + logger::log_trace("Tar file path: \"{tar_file_path}\".") + system2( + command = "tar", + args = c( + "--create", + "--exclude-backups", + "--exclude-vcs", + "--gzip", + "--verbose", + "-C", dirname(data_prep_outputs_path), + paste0("--file=", tar_file_path), + basename(data_prep_outputs_path) + ) + ) + logger::log_info("Tar file created at ", tar_file_path) +} + + logger::log_info("PACTA Data Preparation Complete.") From 1f0cd0047c0ea0cc96afed3d7287a4faddb67b61 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 20 Jan 2024 18:08:22 +0100 Subject: [PATCH 31/34] Allow option to not create tar --- config.yml | 3 +++ run_pacta_data_preparation.R | 1 + 2 files changed, 4 insertions(+) diff --git a/config.yml b/config.yml index d4943cb..01e4ca6 100644 --- a/config.yml +++ b/config.yml @@ -26,6 +26,7 @@ default: scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["ETP2020", "GECO2021", "IPR2021", "ISF2021", "WEO2021"] global_aggregate_sector_list: ["Power"] + create_tar: true 2021Q4: @@ -47,6 +48,7 @@ default: scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["ETP2020", "GECO2021", "IPR2021", "ISF2021", "WEO2021"] global_aggregate_sector_list: ["Power"] + create_tar: true 2021Q4_dev_vm: inherits: 2021Q4 @@ -81,6 +83,7 @@ default: scenario_geographies_list: ["Global", "NonOECD", "OECD"] global_aggregate_scenario_sources_list: ["ETP2020", "GECO2021", "IPR2021", "ISF2021", "WEO2021"] global_aggregate_sector_list: ["Power"] + create_tar: true 2022Q4: dbname: 
"fds_20230705" diff --git a/run_pacta_data_preparation.R b/run_pacta_data_preparation.R index 75bd82e..12bf0c3 100644 --- a/run_pacta_data_preparation.R +++ b/run_pacta_data_preparation.R @@ -64,6 +64,7 @@ tech_exclude <- config$tech_exclude scenario_geographies_list <- config$scenario_geographies_list global_aggregate_scenario_sources_list <- config$global_aggregate_scenario_sources_list global_aggregate_sector_list <- config$global_aggregate_sector_list +create_tar <- config$create_tar logger::log_trace("Config values set as R objects.") #ensure data_prep_outputs_path exists From 75d70b1cdb817d3d857220fdd6d553671e6dc813 Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sat, 20 Jan 2024 18:09:28 +0100 Subject: [PATCH 32/34] Set CRAN Repo in Rprofile.site --- ACI/Dockerfile.ACI | 6 ++---- ACI/RProfile.site | 4 +++- ACI/copy_files_and_run_data_prep.sh | 2 +- ACI/docker-compose.yml | 3 --- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ACI/Dockerfile.ACI b/ACI/Dockerfile.ACI index b431287..a7d7d34 100644 --- a/ACI/Dockerfile.ACI +++ b/ACI/Dockerfile.ACI @@ -30,10 +30,8 @@ RUN curl -fsSL -o /tmp/google-chrome.deb https://dl.google.com/linux/direct/goog WORKDIR /workflow.data.preparation -# set frozen CRAN repo -ARG CRAN_REPO="https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" -ARG R_HOME="/usr/local/lib/R" -COPY ./ACI/Rprofile.site "${R_HOME}/etc/Rprofile.site" +# set frozen CRAN repo and other R options() +COPY ./ACI/Rprofile.site "/usr/local/lib/R/etc/Rprofile.site" # Install R dependencies COPY DESCRIPTION DESCRIPTION diff --git a/ACI/RProfile.site b/ACI/RProfile.site index 9e29c1d..852d498 100644 --- a/ACI/RProfile.site +++ b/ACI/RProfile.site @@ -3,5 +3,7 @@ options( pkg.sysreqs_db_update = FALSE, pkg.sysreqs_update = FALSE, readr.show_progress = FALSE, - repos = c(CRAN = '$CRAN_REPO') + repos = c( + CRAN = "https://packagemanager.posit.co/cran/__linux__/jammy/2023-10-30" + ) ) diff --git a/ACI/copy_files_and_run_data_prep.sh b/ACI/copy_files_and_run_data_prep.sh index 395ffc1..1624c91 100755 --- a/ACI/copy_files_and_run_data_prep.sh +++ b/ACI/copy_files_and_run_data_prep.sh @@ -1,7 +1,7 @@ #! /bin/sh set -e -inputs_dir="/mnt/inputs" +inputs_dir="/mnt/dataprep_inputs" # copy raw data, then run normal data prep script Rscript /workflow.data.preparation/copy_raw_data.R 2>&1 | \ diff --git a/ACI/docker-compose.yml b/ACI/docker-compose.yml index 7af4c73..da45c48 100644 --- a/ACI/docker-compose.yml +++ b/ACI/docker-compose.yml @@ -2,9 +2,6 @@ version: "3.2" services: workflow.data.preparation_aci: - stdin_open: true - tty: true - command: ["sh"] build: context: .. dockerfile: ACI/Dockerfile.ACI From 468f1ff079ac21002435a431ba4018453d808c0f Mon Sep 17 00:00:00 2001 From: Alex Axthelm Date: Sun, 21 Jan 2024 12:04:10 +0100 Subject: [PATCH 33/34] Increase memory available --- ACI/azure-deploy.json | 34 +++++++++++------------------ ACI/copy_files_and_run_data_prep.sh | 3 +++ 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/ACI/azure-deploy.json b/ACI/azure-deploy.json index 9143358..05f4955 100644 --- a/ACI/azure-deploy.json +++ b/ACI/azure-deploy.json @@ -1,6 +1,6 @@ { "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", - "contentVersion": "0.0.4", + "contentVersion": "0.0.0.5", "parameters": { "location": { "type": "string", @@ -15,12 +15,6 @@ "description": "The ID of the user assigned identity to use for the container group." 
 }
 },
-    "serviceprincipal": {
-      "type": "string",
-      "metadata": {
-        "description": "The ID of the service principal to use for the container group."
-      }
-    },
     "containerGroupName": {
       "type": "string",
       "metadata": {
@@ -45,12 +39,6 @@
         "description": "The storage account key for the rawdata storage account."
       }
     },
-    "dataprepinputs-storageaccountkey": {
-      "type": "securestring",
-      "metadata": {
-        "description": "The storage account key for the rawdata storage account."
-      }
-    },
     "dataprepoutputs-storageaccountkey": {
       "type": "securestring",
       "metadata": {
@@ -99,7 +87,11 @@
           "resources": {
             "requests": {
               "cpu": 1,
-              "memoryInGB": 1
+              "memoryInGB": 32,
+              gpu: {
+                "count": 1,
+                "sku": "K80"
+              }
             }
           },
           "environmentVariables": [
@@ -117,7 +109,7 @@
             },
             {
               "name": "LOG_LEVEL",
-              "value": "DEBUG"
+              "value": "TRACE"
             }
           ],
           "volumeMounts": [
@@ -130,8 +122,8 @@
               "mountPath": "/mnt/rawdata/"
             },
             {
-              "name": "inputsvolume",
-              "mountPath": "/mnt/inputs/"
+              "name": "dataprepinputsvolume",
+              "mountPath": "/mnt/dataprep_inputs"
             },
             {
               "name": "outputsvolume",
@@ -169,12 +161,12 @@
             }
           },
           {
-            "name": "inputsvolume",
+            "name": "dataprepinputsvolume",
             "azureFile": {
-              "shareName": "data-prep-inputs",
+              "shareName": "dataprep-inputs",
              "readOnly": false,
-              "storageAccountName": "pactadata",
-              "storageAccountKey": "[parameters('dataprepinputs-storageaccountkey')]"
+              "storageAccountName": "pactarawdata",
+              "storageAccountKey": "[parameters('rawdata-storageaccountkey')]"
             }
           },
           {
diff --git a/ACI/copy_files_and_run_data_prep.sh b/ACI/copy_files_and_run_data_prep.sh
index 1624c91..bcc07d7 100755
--- a/ACI/copy_files_and_run_data_prep.sh
+++ b/ACI/copy_files_and_run_data_prep.sh
@@ -1,6 +1,9 @@
 #! /bin/sh
 set -e
 
+# check memory available
+free -m | cat
+
 inputs_dir="/mnt/dataprep_inputs"
 
 # copy raw data, then run normal data prep script

From acc9c0fa0fd66fe3f536c2d07ba37b381125b77c Mon Sep 17 00:00:00 2001
From: Alex Axthelm
Date: Sun, 21 Jan 2024 12:33:54 +0100
Subject: [PATCH 34/34] Change to supported GPU, update docs

---
 ACI/azure-deploy.json | 4 ++--
 README.md             | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ACI/azure-deploy.json b/ACI/azure-deploy.json
index 05f4955..bd06cd8 100644
--- a/ACI/azure-deploy.json
+++ b/ACI/azure-deploy.json
@@ -88,9 +88,9 @@
             "requests": {
               "cpu": 1,
               "memoryInGB": 32,
-              gpu: {
+              "gpu": {
                 "count": 1,
-                "sku": "K80"
+                "sku": "V100"
               }
             }
           },
diff --git a/README.md b/README.md
index d7fbab0..f4dd990 100644
--- a/README.md
+++ b/README.md
@@ -60,9 +60,9 @@ For older docker versions that support buildkit, you can write the _value_ of th
 # must be built with buildkit
 # run from repo root
 docker build \
-  --secret id=github_pat,src=$(pwd)/secretfile \
+  --secret id=github_pat,src=$(pwd)/ACI/github_pat.txt \
   --progress=plain \
-  --tag workflow.data.preparation_aci \
+  --tag transitionmonitordockerregistry.azurecr.io/workflow.data.preparation_aci \
   -f ACI/Dockerfile.ACI .
 ```
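
For reference, a deployment of the container group from the ARM template patched above would typically go through the Azure CLI. The sketch below is illustrative only: `myResourceGroup` and the two `*_KEY` shell variables are placeholders rather than names defined in this repository, and the template's remaining parameters (such as `location` and `identity`) must also be supplied.

```sh
# Minimal sketch: deploy ACI/azure-deploy.json with the Azure CLI.
# "myResourceGroup", $RAWDATA_KEY, and $OUTPUTS_KEY are assumed placeholders,
# not values defined in this repository.
az deployment group create \
  --resource-group myResourceGroup \
  --template-file ACI/azure-deploy.json \
  --parameters \
    containerGroupName=workflow-data-preparation \
    rawdata-storageaccountkey="$RAWDATA_KEY" \
    dataprepoutputs-storageaccountkey="$OUTPUTS_KEY"
```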