Skip to content

Commit

Permalink
initial df-helper commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mwes committed Dec 20, 2017
1 parent 23262de commit 1880dac
Show file tree
Hide file tree
Showing 24 changed files with 760 additions and 0 deletions.
2 changes: 2 additions & 0 deletions reactors/df-helper/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
test-data-cache
.dirty
4 changes: 4 additions & 0 deletions reactors/df-helper/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
test-data-cache/*
localtest/*
df-helper-0.1.0/localtest/*
.dirty
32 changes: 32 additions & 0 deletions reactors/df-helper/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Pin to a specific major version, not latest
# xenial = 16.04 LTS
# trusty = 14.04 LTS

FROM sd2e/base:ubuntu16

# Install Python 2 and pip, then upgrade pip, in a single layer.
#  - The apt package index is removed in the same layer that downloaded it;
#    `apt-get clean` alone leaves /var/lib/apt/lists in the image.
#  - pip 21.0 dropped Python 2.7 support, so the upgrade is capped below it.
#    An uncapped `--upgrade pip` can install a release that cannot run here.
#  - `python -m pip` avoids depending on the distro-provided pip wrapper
#    script while pip itself is being replaced.
RUN apt-get update && \
    apt-get install -y python python-pip && \
    rm -rf /var/lib/apt/lists/* && \
    python -m pip install --no-cache-dir --upgrade "pip<21"

# Customizing 101
#
# 1. Try to avoid working in / (unless that's your intent)
# 2. Do ADD and COPY operations as late as possible as
#    they invalidate the Docker cache on downstream layers
# 3. Import archive files from GitHub using tagged releases
# 4. Clean up your build directories when done
# 5. Put scripts and other assets in relatively standard places
# 6. Don't actually have the default ENTRYPOINT or
#    CMD do work. Enlist it for debugging instead.

WORKDIR /root

# COPY (not ADD) for plain local files. COPY creates /opt/scripts when it
# does not exist, so no separate `RUN mkdir` layer is needed.
COPY src /opt/scripts

# After much discussion, it seems that a standard is for
# the default code in the container to at least return help/usage.
# Exec form: python is started directly instead of through `/bin/sh -c`.
CMD ["python", "/opt/scripts/df-helper.py", "--help"]
25 changes: 25 additions & 0 deletions reactors/df-helper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

# df-helper

Helper code to support common tasks related to post-ETL dataframe generation.

For now, this does very little: it retrieves manifests and queries for a plan given a manifest.

It will be expanded or shared among dataframe-producing code for each ETL'd data type.

$ python df-helper.py --manifest <input_manifest>

# Provisioning test data

1. Edit `test-data.tsv` to suit your needs
2. Run `./import-test-data.sh`. A copy of the files and directories specified will be downloaded to `test-data-cache`
3. Run `./stage-test-data.sh <app-directory> <override-localtest>`
4. Change into `<app-directory>`. Edit `tester.sh` if needed to refer to a specific file or files in `localtest`. Run your tests!
5. To deploy, run `./clean-test-data.sh <app-directory>` then run `./deploy.sh <app-directory> <app-json>`

## Source management

The application's local `.gitignore` has been updated to ignore the contents of `test-data-cache` and `<app-directory>/localtest`. If you change any of the default paths, you will need to update the `.gitignore` as well.

7 changes: 7 additions & 0 deletions reactors/df-helper/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
#
# Build the df-helper container image, tagged with the contents of
# df-helper-0.1.0/VERSION. Paths are relative to the current directory,
# which is also used as the Docker build context.

# Stop on the first failure (for example a missing VERSION file) instead
# of going on to build with an empty tag.
set -euo pipefail

version=$(cat df-helper-0.1.0/VERSION)

# An empty VERSION file would produce the invalid reference "sd2e/df-helper:"
if [ -z "${version}" ]
then
    echo "Error: df-helper-0.1.0/VERSION is empty." >&2
    exit 1
fi

CONTAINER_IMAGE="sd2e/df-helper:${version}"

docker build -t "${CONTAINER_IMAGE}" .
21 changes: 21 additions & 0 deletions reactors/df-helper/clean-test-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
#
# Remove staged test data from an app bundle and clear the .dirty markers.
#
# Usage: clean-test-data.sh <app> [<destpath>]
#   <app>       Path to app bundle
#   <destpath>  Directory inside the bundle that holds the test data
#               (default: localtest)

# Path to app bundle
APP=$1
DEST=${2-localtest}

TEST_DATA_CACHE=${TEST_DATA_CACHE:-test-data-cache}

if [ ! -d "$APP/$DEST" ]
then
    echo "Can't find or access $APP/$DEST. Re-run $0 <app> <destpath>"
    # Stop here: carrying on would run `rm -rf` against a path that was
    # never validated (with an empty <app> that path is /<destpath>/*).
    exit 1
fi

# Delete the contents of $APP/$DEST, then every .dirty marker found
# below the current directory.
function clean_data() {

    # ${APP:?} aborts rather than expanding to "/$DEST/*" if APP is empty
    rm -rf "${APP:?}/${DEST}"/*
    find . -name .dirty -exec rm {} \;

}

clean_data "${APP}"
90 changes: 90 additions & 0 deletions reactors/df-helper/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/usr/bin/env bash
#
# Deploy an app bundle: optionally build and push its container image,
# upload the bundle to its deployment system, and register the app.
#
# Usage: deploy.sh <app> [<jsonfile>]
#   <app>       Path to app bundle
#   <jsonfile>  Application JSON; when omitted, a file matching *app.json
#               is searched for inside <app>
#
# Environment switches (any non-empty value): NODOCKER skips the image
# build, NOFILEOPS skips the file upload, NOUPDATE skips registration.

# Path to app bundle
APP=$1
APPJSON=$2

if [ -z "${APP}" ]
then
    echo "No app bundle given. Re-run $0 <app> <jsonfile>"
    exit 1
fi

if [ -z "${APPJSON}" ]
then
    APPJSON=$(find "${APP}" -type f -name "*app.json")
fi

if [ -z "${APPJSON}" ]
then
    echo "Can't find an application JSON. Re-run $0 <app> <jsonfile>"
    # This is a failure, so report it to the caller as one
    exit 1
fi

# stage-test-data.sh marks localtest with a hidden file to indicate that it's a bad idea to deploy it
if [[ -n $(find "${APP}" -name ".dirty" ) ]]
then
    echo "Error: A localtest directory inside $APP is marked as dirty. Please run clean-test-data.sh $APP <local-test-path> before deploying."
    exit 1
fi

# Build and push the container image unless NODOCKER is set
if [ -z "$NODOCKER" ]
then
    if [ ! -f "./Dockerfile" ]
    then
        echo "Warning: ./Dockerfile not found. Image not built."
    else
        # Push only when the build succeeded. A failure in either step is
        # reported but, as before, does not stop the rest of the deployment.
        if ! { bash build.sh && bash push.sh; }
        then
            echo "Warning: build.sh or push.sh failed. Image not built or not pushed."
        fi
    fi
fi

# Agave application management
# The values below are read from the application JSON with jq.
_APPNAME=$(jq -r .name "${APPJSON}")
_APPVERS=$(jq -r .version "${APPJSON}")
_APPID="${_APPNAME}-${_APPVERS}"

# username/apps/app-0.1.0
_DEPPATH=$(jq -r .deploymentPath "${APPJSON}")
# username/apps/app-0.1.0 -> username/apps
_DEPAPPSPATH=$(dirname "${_DEPPATH}")
# "// empty" yields an empty string rather than the literal "null" when
# the key is absent, so the emptiness test on _DEPSYS further down works.
_DEPSYS=$(jq -r '.deploymentSystem // empty' "${APPJSON}")

# username/apps -> username
_USERDIRECTORY=$(dirname "${_DEPAPPSPATH}")
# username/apps -> apps
_APPDIRECTORY=$(basename "${_DEPAPPSPATH}")

# Deploy application assets
#
# Policies
# ATOMIC - Try to rename original DEPPATH on DEPSYS
# DESTRUCTIVE - Delete DEPPATH on DEPSYS (Yolo)
# INPLACE (default) - One-way sync to DEPSYS/DEPPATH
#
# NOTE(review): the commands below delete DEPPATH before uploading, which
# matches DESTRUCTIVE rather than the INPLACE default listed above.

# Options shared by every files-* call. An array keeps each option a
# separate word while still allowing the values to be quoted.
_FILEOPTS=()
if [ -n "$_DEPSYS" ]
then
    _FILEOPTS+=(-S "${_DEPSYS}")
fi

# Upload the bundle unless NOFILEOPS is set
if [ -z "$NOFILEOPS" ]
then
    set -x
    # DESTRUCTIVE
    files-delete "${_FILEOPTS[@]}" "${_DEPPATH}"
    # files-mkdir -S system -N apps username
    files-mkdir "${_FILEOPTS[@]}" -N "${_APPDIRECTORY}" "${_USERDIRECTORY}"
    files-upload -q "${_FILEOPTS[@]}" -F "${APP}" "${_DEPAPPSPATH}/"
    set +x
fi

# Register the application unless NOUPDATE is set
if [ -z "$NOUPDATE" ]
then
    set -x
    # Quoted so a path containing spaces is passed as a single argument
    apps-addupdate -F "${APPJSON}"
    set +x
fi

# Sleep 1 then check its status

1 change: 1 addition & 0 deletions reactors/df-helper/df-helper-0.1.0/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
latest
119 changes: 119 additions & 0 deletions reactors/df-helper/df-helper-0.1.0/_util/container_exec.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@

function container_exec() {

    # Run a command inside a container image, using Singularity when it is
    # on PATH and Docker otherwise.
    #
    # Usage: container_exec <image> <command> [params...]
    #
    # Environment read:
    #   DEBUG      non-empty: write .container_exec.<pid>.log/.env in the
    #              current directory and trace the docker invocation
    #   OPTS       extra `docker run` options (word-split)
    #   ENVFILE    file passed to `docker run --env-file`
    #   STOCKYARD  preferred parent of the default .singularity directory
    #   SINGULARITY_PULLFOLDER, SINGULARITY_CACHEDIR  used as-is when set
    #
    # Side effects on the calling shell: sets the setgid and group rwx bits
    # on the current directory, sets umask 002, and sets and exports the two
    # SINGULARITY_* variables.
    #
    # Returns the exit status of the container run, or 1 when neither
    # engine is available.

    # [TODO] Check for existence of docker or singularity executable
    # [TODO] Enable honoring a DEBUG global
    # [TODO] Figure out how to accept more optional arguments (env-file, etc)
    # [TODO] Better error handling and reporting
    # [TODO] Handle "urllib2.URLError: <urlopen error [Errno -3] Temporary failure in name resolution>"

    local CONTAINER_IMAGE=$1
    shift
    local COMMAND=$1
    shift
    # COMMAND and PARAMS are deliberately expanded unquoted further down, so
    # a caller may pass several words in a single argument.
    local PARAMS="$*"

    # A little logging to help with the edge cases
    if [ -n "$DEBUG" ]
    then
        local _PID=$$
        echo $CONTAINER_IMAGE > .container_exec.${_PID}.log
        echo $COMMAND >> .container_exec.${_PID}.log
        echo $PARAMS >> .container_exec.${_PID}.log
        echo $PWD >> .container_exec.${_PID}.log
        echo $(ls $PWD) >> .container_exec.${_PID}.log
        env > .container_exec.${_PID}.env
    fi

    # Detect container engine; singularity wins when both are installed
    if command -v singularity > /dev/null 2>&1
    then
        _CONTAINER_ENGINE="singularity"
    elif command -v docker > /dev/null 2>&1
    then
        _CONTAINER_ENGINE="docker"
    fi

    # Default both Singularity directories to $STOCKYARD/.singularity, or
    # $HOME/.singularity when STOCKYARD is unset or empty. They are exported
    # because a plain shell assignment is not visible to the singularity
    # child process.
    local _singularity_dir="${STOCKYARD:-$HOME}/.singularity"
    : "${SINGULARITY_PULLFOLDER:=${_singularity_dir}}"
    : "${SINGULARITY_CACHEDIR:=${_singularity_dir}}"
    export SINGULARITY_PULLFOLDER SINGULARITY_CACHEDIR

    local _GID
    _GID=$(id -g)
    chmod g+rwxs .
    umask 002

    local _status=0

    if [[ "$_CONTAINER_ENGINE" == "docker" ]]
    then
        #local OPTS="--network=none --cpus=1.0000 --memory=1G --device-read-iops=/dev/sda:1500 --device-read-iops=/dev/sda:1500"

        # Set group ownership on all files making them readable by archive process
        # Options are built in a local array, leaving the caller's OPTS
        # untouched: appending to the global would repeat the -v mount on
        # every later call in the same shell.
        local -a _run_opts=($OPTS --rm "--user=0:${_GID}" -v "$PWD:/home:rw" -w /home)
        if [ -n "$ENVFILE" ]
        then
            _run_opts+=(--env-file "${ENVFILE}")
        fi
        if [ -n "$DEBUG" ]
        then
            set -x
        fi
        # Capture the status right away; the DEBUG check that follows
        # would otherwise replace it with 0.
        docker run "${_run_opts[@]}" ${CONTAINER_IMAGE} ${COMMAND} ${PARAMS} || _status=$?
        if [ -n "$DEBUG" ]
        then
            set +x
        fi
    elif [[ "$_CONTAINER_ENGINE" == "singularity" ]]
    then
        # [TODO] Detect and deal if an .img has been passed it (rare)
        singularity exec docker://${CONTAINER_IMAGE} ${COMMAND} ${PARAMS} || _status=$?
    else
        echo "_CONTAINER_ENGINE needs to be 'docker' or 'singularity' [$_CONTAINER_ENGINE]"
        _status=1
    fi

    return $_status

}


function count_logical_cores() {

    # Print the number of logical CPU cores.
    #   Darwin: value of sysctl hw.logicalcpu
    #   Linux:  number of "processor" entries in /proc/cpuinfo
    # Prints 4 on any other platform, or when detection does not produce
    # a positive integer.

    local _default_cores=4
    local _count_cores=""

    case "$(uname)" in
        Darwin)
            _count_cores=$(sysctl -n hw.logicalcpu 2> /dev/null)
            ;;
        Linux)
            # Anchored so only lines that start with "processor" are counted
            _count_cores=$(grep -c '^processor' /proc/cpuinfo 2> /dev/null)
            ;;
    esac

    # grep -c prints 0 when nothing matched and a failed probe prints
    # nothing at all; neither is a usable core count.
    if ! [[ "$_count_cores" =~ ^[1-9][0-9]*$ ]]
    then
        _count_cores=$_default_cores
    fi

    echo "$_count_cores"

}

function auto_maxthreads() {

    # Print a suggested thread count: one less than the value reported by
    # count_logical_cores, but never less than 1.

    local hwcore
    hwcore=$(count_logical_cores)
    hwcore=$((hwcore-1))
    # A single-core host would otherwise yield 0 threads
    if [ "$hwcore" -lt 1 ]
    then
        hwcore=1
    fi
    echo "$hwcore"

}
42 changes: 42 additions & 0 deletions reactors/df-helper/df-helper-0.1.0/app-maverick-mweston.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"checkpointable": false,
"name": "df-helper-mweston",
"executionSystem": "hpc-tacc-maverick-mweston",
"executionType": "HPC",
"deploymentPath": "mweston/apps/df-helper-0.1.0",
"deploymentSystem": "data-sd2e-projects-users",
"helpURI": "https://sd2e.org/develop/",
"label": "Dataframe helper [TACC mweston]",
"longDescription": "",
"modules": ["load tacc-singularity/2.3.1"],
"ontology": ["http://edamontology.org/topic_3520"],
"parallelism": "SERIAL",
"shortDescription": "Dataframe helper",
"tags": ["df-helper", "docker://index.docker.io/sd2e/df-helper:latest"],
"templatePath": "runner-template.sh",
"testPath": "tester.sh",
"version": "0.1.0",
"inputs": [{
"id": "manifestFile",
"details": {
"label": "manifest file",
"showAttribute": false,
"attribute": "--manifest "
},
"semantics": {
"minCardinality": 1,
"maxCardinality": 1,
"ontology": [
"http://edamontology.org/format_1929",
"http://edamontology.org/format_2332",
"http://edamontology.org/format_3245"
]
},
"value": {
"default": "agave://data-sd2e-community/sample/biofab/yeast-gates_q0/3/manifest/107795-manifest.json",
"required": true,
"visible": true
}
}],
"defaultMaxRunTime": "00:30:00"
}
42 changes: 42 additions & 0 deletions reactors/df-helper/df-helper-0.1.0/app-maverick.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"checkpointable": false,
"name": "df-helper-maverick",
"executionSystem": "hpc-tacc-maverick",
"executionType": "HPC",
"deploymentPath": "sd2eadm/apps/df-helper-0.1.0",
"deploymentSystem": "data-sd2e-projects-users",
"helpURI": "https://sd2e.org/develop/",
"label": "Dataframe helper [TACC Maverick]",
"longDescription": "",
"modules": ["load tacc-singularity/2.3.1"],
"ontology": ["http://edamontology.org/topic_3520"],
"parallelism": "SERIAL",
"shortDescription": "Dataframe helper",
"tags": ["df-helper", "docker://index.docker.io/sd2e/df-helper:latest"],
"templatePath": "runner-template.sh",
"testPath": "tester.sh",
"version": "0.1.0",
"inputs": [{
"id": "manifestFile",
"details": {
"label": "manifest file",
"showAttribute": false,
"attribute": "--manifest "
},
"semantics": {
"minCardinality": 1,
"maxCardinality": 1,
"ontology": [
"http://edamontology.org/format_1929",
"http://edamontology.org/format_2332",
"http://edamontology.org/format_3245"
]
},
"value": {
"default": "agave://data-sd2e-community/sample/biofab/yeast-gates_q0/3/manifest/107795-manifest.json",
"required": true,
"visible": true
}
}],
"defaultMaxRunTime": "00:30:00"
}
Loading

0 comments on commit 1880dac

Please sign in to comment.