diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..18c196d8 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,40 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "staging service devcontainer", + "dockerComposeFile": [ + "docker-compose.yml" + ], + // The 'service' property is the name of the service for the container that VS Code should + // use. Update this value and .devcontainer/docker-compose.yml to the real service name. + "service": "staging_service", + // "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached", + // The optional 'workspaceFolder' property is the path VS Code should open by default when + // connected. This is typically a file mount in .devcontainer/docker-compose.yml + "workspaceFolder": "/workspace", + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "tamasfe.even-better-toml", + "mhutchie.git-graph", + "ms-python.black-formatter", + "ms-python.flake8", + "mikoz.autoflake-extension", + "ms-azuretools.vscode-docker", + "DavidAnson.vscode-markdownlint", + "matangover.mypy" + ] + } + } + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "devcontainer" +} \ No newline at end of file diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml new file mode 100644 index 00000000..01401422 --- /dev/null +++ b/.devcontainer/docker-compose.yml @@ -0,0 +1,29 @@ +version: '3.6' +networks: + kbase-dev: + name: kbase-dev +services: + staging_service: + build: + context: .. + target: dev + + # image: kbase/staging_server_devcontainer:dev + + container_name: staging_server_devcontainer + dns: 8.8.8.8 + volumes: + - ..:/workspace + networks: + - kbase-dev + # Set some default environment variables, which enable the + # service to work inside the container without too much fussing. + env: + KB_DEPLOYMENT_CONFIG=./deployment/conf/local.cfg + FILE_LIFETIME=90 + # Required for a devcontainer -- keeps the container running. + # Don't worry, our main interaction with the container is through the + # VSC terminal, which for a devcontainer opens a shell within the + # container. 
+ command: /bin/sh -c "while sleep 1000; do :; done" + \ No newline at end of file diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 262a872e..d5063df2 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -1,26 +1,12 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - name: Run Staging Service Tests - on: [pull_request] - jobs: build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.11 - uses: actions/setup-python@v2 - with: - python-version: 3.11.4 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - - name: Test with pytest - run: | - bash run_tests.sh + - name: Checkout the repo + uses: actions/checkout@v3 + + - name: Run the tests + run: ./development/scripts/run scripts/run_tests.sh \ No newline at end of file diff --git a/.gitignore b/.gitignore index a3c1517e..743a6a0f 100644 --- a/.gitignore +++ b/.gitignore @@ -107,4 +107,10 @@ data/ .DS_Store .virtualenvs/ test.env -run_tests_single.sh \ No newline at end of file +run_tests_single.sh +_attic + +# IDEs + +# jetbrains +.idea diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 00000000..eb5393e3 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,10 @@ +MD013: + # Matches black line length for python code. + line_length: 88 + # some long examples would break the line length and are awkward to rearrange + code_blocks: false + +# We don't want the annoying warnings when there are duplicated non-sibling headers as +# many types of docs have a pattern for headings that is repeated throughout the document. 
+MD024: + siblings_only: true diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index b891d07f..00000000 --- a/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -sudo: required -services: - - docker -language: python -python: "3.6" -install: - - pip install -r requirements.txt -script: - - ./run_tests.sh - - make -after_success: - - IMAGE_NAME=kbase/staging_service ./build/push2dockerhub.sh -env: - global: - - secure: "mH9OrGjZYV021EfQPjx0hnwi7/UlGVNTnE6kC81O1/9/lOWihnDKBb/GXuWyj7hbiCZgA2ofDblkm+bIZQxU/+40k8ZrQyk4xe9Bm3jM9MCZ7y6r3g3HZnKXGbzdQTACLoJwdHRWEnYWcBC+aEnDsJjm/uh+gFE3WFXABHO5nOBVBJ1vnZoqczPnLIqOtTWwybV58eYxXOm7J6GvxrTWxdqoEOKsLcjdbm9dej/i9xsxHr6x0I/fiB8mFsIgl1QS9WobpTjs1AQghShz8oFO4YI70mJF4b9tU0FbfF1Wm6K4fe69ff65RwUhT9o8ZgwBt/mzrDFTLMI9f/3J8v6Bu5UxQ1yEPc9QP78IWW/g2G20WxjLzmbKogsGZkSyPEz1TLq8CgEGvU5IOoDSW1NjQJr9XQaE8A1ILDEfi1xVG/d6o8uhEMEiloveRa8iKdNFoGv5O0fbS8RC6KVs9SPP0nkApmv29MhWf0injEROZNXiDJr6Da4HiU+RkDwdM8rbIOS0586t/czVIRremmon5Xb9NDZapLbqdIX/zd//IC6WGM0ytX36t8FgU0Du+V+KfaAc7XP3ff+GLUL/sxMGGjdl+gCCdUCrTPCl2+D9P54/HRB5Q5xKNnO/OVou9WOe1HanthH5n2QGelzLisU2OxCiWl/iZ15pl+CGntF9+Ek=" - - secure: "sHLRR/xLA6fiPkJB6IolHUaJxNJJYcfSKG3MmRAU7v0LxxoqUkneBfk9jyk2GdDdtOR+RocOQmwie0Vf6ik5bIxcs/Bse1tR0+yhmg9EpgClokfgZN8815yNVclGkVFWZS7BhRjm7mt0K3Vo7Of0mKikp5L/tYmwQy1hesJ+mTa83igeQKpFU3QgkRJB2ALgSI54ruhO88x+9HtLqkhN8KqHj5ykGzS7ShagFywflFDVFWJoBK76NideU0gTU7+2dNV0+EPmnRWs9QoAh5LSUDa8v7fo+I2WHFCOLfiKzaNMc85+Gc72qyxvkpWl2UCJpiFVK1X4DFGvHYU9FzM0W9VuhrBAa3h1XFncYk52WzKNGVvQnOZE965iNrl/xSh2oTUOfyVIfX5JuldEWffU/vhAjvnqfsoq17geRxLyqbRx/4ZvNq6jf3Ef6kLwKlnPzcw85zZs2jsy3qC5UGJf0xqchWfsqxACvmGhVyv+KxHJxaFq48UHp6Ezb/X2DweD6/6WPfwXI6cOwXy6Uj3uvkYSsKE7HJu3K9aMeh158ZMrW/JeyH5dCRAv1qeAyLN0yOw/lPUapdrbB9nXkLbHIOhV8b6A2cOtkkuM6HXXBxICJ5BIHmVzO+4W7VXueXfVjAVY2+xs0ouFsPIOINfR5hgsglncDY+r/wU8LrUHxHg=" \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c8d57bf4..52301c69 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,34 +1,78 @@ -FROM python:3.11.4-slim-buster +FROM python:3.11.5-slim-bookworm AS base # ----------------------------------------- -RUN mkdir -p /kb/deployment/lib RUN apt-get update && \ - apt-get install -y --no-install-recommends zip=3.0-11+b1 unzip=6.0-23+deb10u3 bzip2=1.0.6-9.2~deb10u2 libmagic-dev=1:5.35-4+deb10u2 htop=2.2.0-1+b1 wget=1.20.1-1.1 && \ + apt-get install -y --no-install-recommends zip=3.0-13 unzip=6.0-28 bzip2=1.0.8-5+b1 libmagic-dev=1:5.44-3 && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -COPY ./requirements.txt /requirements.txt -RUN python -m pip install "pip==23.1.2" && pip install -r /requirements.txt && rm /requirements.txt +COPY requirements.txt /kb/requirements.txt +RUN python -m pip install "pip==23.2.1" && pip install -r /kb/requirements.txt && rm -r /kb +# The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to +# the end +LABEL org.label-schema.build-date=$BUILD_DATE \ + org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.schema-version="1.1.8" \ + us.kbase.vcs-branch=$BRANCH \ + maintainer="Steve Chan sychan@lbl.gov" + +# +# Dev Layer +# Used in devcontainer, and as base for tools +# +FROM base AS dev +# Install OS dependencies required by or nice-to-have in a development image +RUN apt-get update && \ + apt-get install -y --no-install-recommends htop=3.2.2-2 wget=1.21.3-1+b2 git=1:2.39.2-1.1 openssh-client=1:9.2p1-2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +# Install Python dependencies require by development tools 
(cli tools and devcontainer) +COPY ./requirements_dev.txt /kb/requirements_dev.txt +RUN pip install -r /kb/requirements_dev.txt && rm -r /kb +WORKDIR /kb/module +# Note - entrypoint defined in docker compose file, and /kb/module is volume mounted by +# the devcontainer and the tools + +# +# Prod layer +# +FROM base AS prod + +# Install globus configuration into expected location. +# TODO: point to location for documentation of this. COPY ./globus.cfg /etc/globus.cfg RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log +# We expect this to run on port 3000 +# TODO: this is weird, kbase services usually run at port 5000. +EXPOSE 3000 + +# We keep the entire repo in /kb/module; for why, I know not. +# TODO: could someone add a comment here explaining why? COPY ./ /kb/module -RUN cp -r /kb/module/staging_service /kb/deployment/lib -RUN cp -r /kb/module/deployment /kb +WORKDIR /kb/module +# Otherwise, the service is installed in /kb/deployment (!) +# RUN mkdir -p /kb/deployment/lib +# RUN cp -r /kb/module/staging_service /kb/deployment/lib -EXPOSE 3000 +# +# Here we copy all of the required runtime components that need +# to be in the image. +# -WORKDIR /kb/deployment/lib +# This contains the entrypoint +COPY ./deployment/bin /kb/deployment/bin +# This contains the CI deployment +# TODO: why is it copied to the codebase, though? +COPY ./deployment/conf/deployment.cfg /kb/deployment/conf/deployment.cfg -# The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to -# the end -LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.schema-version="1.1.8" \ - us.kbase.vcs-branch=$BRANCH \ - maintainer="Steve Chan sychan@lbl.gov" +# Configuration for mapping file extensions to importers +COPY ./deployment/conf/supported_apps_w_extensions.json /kb/deployment/conf/supported_apps_w_extensions.json + +# The service code. +COPY ./staging_service /kb/deployment/lib/staging_service -ENTRYPOINT ["/kb/deployment/bin/entrypoint.sh"] +ENTRYPOINT ["/kb/module/deployment/bin/entrypoint.sh"] diff --git a/README.md b/README.md index 67aae27d..8313446e 100644 --- a/README.md +++ b/README.md @@ -1,1237 +1,67 @@ -# staging_service +# KBase Staging Service -In order to setup local development, you must have docker installed and if you -want to run it locally you must have python 3.11.4 or greater installed. +## Background -## setup +The staging service provides an API to a shared filesystem designed primarily to +allow KBase users to upload raw data files for usage by KBase apps within a Narrative. +The API allows for uploading, decompressing and downloading files. In addition it +provides some utility for Globus transfer integration into the filesystem. -make a folder called /data as well as inside that /bulk and inside that a folder -for any usernames you wish it to work with +The `staging_service` is a core KBase service designed to be run within the KBase +infrastructure. The runtime requires access to other KBase services as well as a +suitable filesystem. Internal libraries implement the interfaces to these services, +which are all open source and documented in their respective repos. In a break from the +traditional KBase service, it is not based on the KBase SDK, but rather `aiohttp`. 
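
To give a feel for the framework, here is a minimal, purely illustrative `aiohttp` sketch. The handler below is hypothetical and is not the service's actual code (see `staging_service/app.py` for the real routes), although the `/test-service` path, its fixed text response, and port 3000 match the documented API.

```python
# Purely illustrative: a minimal aiohttp application in the style this service uses.
# The handler below is hypothetical and is not the service's actual code.
from aiohttp import web

routes = web.RouteTableDef()


@routes.get("/test-service")
async def test_service(request: web.Request) -> web.Response:
    # Mirror the documented fixed-text response of the real /test-service endpoint.
    return web.Response(text="This is just a test. This is only a test.")


def create_app() -> web.Application:
    app = web.Application()
    app.add_routes(routes)
    return app


if __name__ == "__main__":
    # The service is documented as listening on port 3000.
    web.run_app(create_app(), port=3000)
```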
-- data - - bulk - - username1 - - username2 +For more about how data import works from the user perspective, please [refer to the KBase +documentation](https://docs.kbase.us/getting-started/narrative/add-data). -if you want to run locally you must install requirements.txt for python3 +## Installation -## running +The service is designed to be packaged into a Docker image and run as a container. -to run locally run /deployment/bin/entrypoint.sh +For deployment, the image is built by GitHub actions and stored in GitHub Container +Registry. -to run inside docker run /run_in_docker.sh +For local development and evaluation, it may be run locally from within a devcontainer, +or from a locally-built image. -## tests +> TODO: provide barebones instructions here -- to test use `./run_tests.sh` -- requires python 3.11.4 or higher -- requires installation on mac of libmagic: `brew install libmagic` or `sudo port install libmagic`. +## Usage -## debugging +There are three basic usage scenarios - development, local deployment, and production +deployment. -Included configurations for the Visual Studio Code debugger for python that -mirror what is in the entrypoint.sh and testing configuration to run locally in -the debugger, set breakpoints and if you open the project in VSCode the debugger -should be good to go. The provided configurations can run locally and run tests -locally +Development has [its own documentation](./docs/development/inde.md). -## development +Production deployment is out of scope (though it can be larged deduced from local +deployment). -When releasing a new version: - -- Update the release notes -- Update the version in [staging_service/app.py](staging_service/app.py).VERSION - -## expected command line utilities - -to run locally you will need all of these utils on your system: tar, unzip, zip, -gzip, bzip2, md5sum, head, tail, wc - -in the docker container all of these should be available - -## API - -all paths should be specified treating the user's home directory as root - -### Test Service - -**URL** : `ci.kbase.us/services/staging_service/test-service` - -**local URL** : `localhost:3000/test-service` - -**Method** : `GET` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -This is just a test. This is only a test. -``` - -### Test Auth - -**URL** : `ci.kbase.us/services/staging_service/test-auth` - -**local URL** : `localhost:3000/test-auth` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -I'm authenticated as -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... 
-``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -### File Lifetime - -**URL** : `ci.kbase.us/services/staging_service/file-lifetime` -**local URL** : `localhost:3000/file-lifetime` - -**Method** : `GET` - -#### Success Response - -**Code** : `200 OK` - -**Content example** -number of days a file will be held for in staging service before being deleted -this is not actually handled by the server but is expected to be performed by a -cron job which shares the env variable read here - -``` -90 -``` - -### List Directory - -defaults to not show hidden dotfiles - -**URL** : `ci.kbase.us/services/staging_service/list/{path to directory}` - -**URL -** : `ci.kbase.us/services/staging_service/list/{path to directory}?showHidden={True/False}` - -**local URL** : `localhost:3000/list/{path to directory}` - -**local URL -** : `localhost:3000/list/{path to directory}?showHidden={True/False}` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -```json -[ - { - "name": "testFolder", - "path": "nixonpjoshua/testFolder", - "mtime": 1510949575000, - "size": 96, - "isFolder": true - }, - { - "name": "testfile", - "path": "nixonpjoshua/testfile", - "mtime": 1510949629000, - "size": 335, - "isFolder": false - } -] -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `404 Not Found` - -**Content** : - -``` -path / does not exist -``` - -### Download file - -**URL** : `ci.kbase.us/services/staging_service/download/{path to file}` - -**URL** : `ci.kbase.us/services/staging_service/download/{path to file}` - -**local URL** : `localhost:3000/download/{path to file}` - -**local URL** : `localhost:3000/download/{path to file}` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` -**Content** : `` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... 
-``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `400 Bad Request` - -**Content** : - -``` -/ is a directory not a file -``` - -**Code** : `404 Not Found` - -**Content** : - -``` -path / does not exist -``` - -### Search files and folders - -defaults to not show hidden dotfiles - -**URL** : `ci.kbase.us/services/staging_service/search/{search query}` - -**URL -** : `ci.kbase.us/services/staging_service/search/{search query}?showHidden={True/False}` - -**local URL** : `localhost:3000/search/{search query}` - -**local URL** : `localhost:3000/search/?showHidden={True/False}` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -```json -[ - { - "name": "testfile", - "path": "nixonpjoshua/testfile", - "mtime": 1510949629000, - "size": 335, - "isFolder": false - }, - { - "name": "testFolder", - "path": "nixonpjoshua/testFolder", - "mtime": 1510949575000, - "size": 96, - "isFolder": true - }, - { - "name": "testinnerFile", - "path": "nixonpjoshua/testFolder/testinnerFile", - "mtime": 1510949575000, - "size": 0, - "isFolder": false - } -] -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -### File and Folder Metadata - -**URL -** : `ci.kbase.us/services/staging_service/metadata/{path to file or folder}` - -**local URL** : `localhost:3000/metadata/{path to file or folder}` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -```json -{ - "name": "testFolder", - "path": "nixonpjoshua/testFolder", - "mtime": 1510949575000, - "size": 96, - "isFolder": true -} -``` - -```json -{ - "md5": "73cf08ad9d78d3fc826f0f265139de33", - "lineCount": "13", - "head": "there is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom", - "tail": "there is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom", - "name": "testFile", - "path": "nixonpjoshua/testFile", - "mtime": 1510949629000, - "size": 335, - "isFolder": false -} -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `404 Not Found` - -**Content** : - -``` -path / does not exist -``` - -### Upload File - -**URL** : `ci.kbase.us/services/staging_service/upload` - -**local URL** : `localhost:3000/upload` - -**Method** : `POST` - -**Headers** : `Authorization: ` - -**Body constraints** - -first element in request body should be - -destPath: {path file should end up in} - -second element in request body should be multipart file data - -uploads: {multipart file} - -Files starting with whitespace or a '.' 
are not allowed - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -```json -[ - { - "name": "fasciculatum_supercontig.fasta", - "path": "nixonpjoshua/fasciculatum_supercontig.fasta", - "mtime": 1510950061000, - "size": 31536508, - "isFolder": false - } -] -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -### Define/Create UPA for file which has been imported - -**URL -** : `ci.kbase.us/services/staging_service/define-upa/{path to imported file}` - -**local URL** : `localhost:3000/define-upa/{path to imported file}` - -**Method** : `POST` - -**Headers** : `Authorization: ` - -**Body constraints** - -first element in request body should be - -UPA: {the actual UPA of imported file} - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -successfully update UPA for file -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -must provide UPA field in body -``` - -### Delete file or folder (will delete things contained in folder) - -**URL** : `ci.kbase.us/services/staging_service/delete/{path to file or folder}` - -**local URL** : `localhost:3000/delete/{path to file or folder}` - -**Method** : `DELETE` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -successfully deleted UPA -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `404 Not Found` - -**Content** - -``` -could not delete -``` - -**Code** : `403 Forbidden` - -**Content** - -``` -cannot delete home directory -``` - -``` -cannot delete protected file -``` - -### Move/rename a file or folder - -**URL** : `ci.kbase.us/services/staging_service/mv/{path to file or folder}` - -**local URL** : `localhost:3000/mv/{path to file or folder}` - -**Method** : `PATCH` - -**Headers** : `Authorization: ` - -**Body constraints** - -first element in request body should be - -newPath : {the new location/name for file or folder} - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -successfully moved to -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... 
-``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -must provide newPath field in body -``` - -**Code** : `403 Forbidden` - -**Content** - -``` -cannot rename home or move directory -``` - -``` -cannot rename or move protected file -``` - -**Code**: `409 Conflict` - -**Content** - -``` - already exists -``` - -### Decompress various archive formats - -supported archive formats are: -.zip, .ZIP, .tar.gz, .tgz, .tar.bz, .tar.bz2, .tar, .gz, .bz2, .bzip2 -**URL** : `ci.kbase.us/services/staging_service/decompress/{path to archive` - -**local URL** : `localhost:3000/decompress/{path to archive}` - -**Method** : `PATCH` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -successfully decompressed -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -Must supply token -``` - -**Code** : `400 Bad Request` - -**Content** - -``` -cannot decompress a file -``` - -### Add Globus ACL - -After authenticating at this endpoint, AUTH is queried to get your filepath and -globus id file for -linking to globus. - -**URL** : `ci.kbase.us/services/staging_service/add-acl` - -**local URL** : `localhost:3000/add-acl` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -{ - "success": true, - "principal": "KBase-Example-59436z4-z0b6-z49f-zc5c-zbd455f97c39", - "path": "/username/", - "permissions": "rw" -} -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... -``` - -**Condition** : If issue with Globus API or ACL Already Exists - -**Code** : `500 Internal Server Error` - -**Content** - -``` -{ - 'success': False, - 'error_type': 'TransferAPIError', - 'error': "Can't create ACL rule; it already exists", - 'error_code': 'Exists', 'shared_directory_basename': '/username/' -} -``` - -### Remove Globus ACL - -After authenticating at this endpoint, AUTH is queried to get your filepath and -globus id file for -linking to globus. - -**URL** : `ci.kbase.us/services/staging_service/remove-acl` - -**local URL** : `localhost:3000/remove-acl` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -{ - "message": "{\n \"DATA_TYPE\": \"result\",\n \"code\": \"Deleted\", - "message\": \"Access rule 'KBASE-examplex766ada0-x8aa-x1e8-xc7b-xa1d4c5c824a' deleted successfully\", - "request_id\": \"x2KFzfop05\",\n \"resource\": \"/endpoint/KBaseExample2a-5e5b-11e6-8309-22000b97daec/access/KBaseExample-ada0-d8aa-11e8-8c7b-0a1d4c5c824a\"}", - "Success": true -} -``` - -#### Error Response - -**Condition** : if authentication is incorrect - -**Code** : `401 Unauthorized` - -**Content** : - -``` -Error Connecting to auth service ... 
-``` - -**Condition** : If issue with Globus API or ACL Already Exists - -**Code** : `500 Internal Server Error` - -**Content** +Local deployment is as easy as running +```shell +./scripts/run-dev-server.sh ``` -{ - 'success': False, - 'error_type': 'TransferAPIError', - 'error': "Can't create ACL rule; it already exists", - 'error_code': 'Exists', 'shared_directory_basename': '/username/' -} -``` - -### Parse bulk specifications - -This endpoint parses one or more bulk specification files in the staging area -into a data -structure (close to) ready for insertion into the Narrative bulk import or -analysis cell. - -It can parse `.tsv`, `.csv`, and Excel (`.xls` and `.xlsx`) files. Templates for -the currently -supported data types are available in -the [templates](./import_specifications/templates) -directory of this repo. See -the [README.md](./import_specifications/templates/README.md) file -for instructions on template usage. - -See the [import specification ADR document](./docs/import_specifications.ADR.md) -for design -details. - -**URL** : `ci.kbase.us/services/staging_service/bulk_specification` - -**local URL** : `localhost:3000/bulk_specification` - -**Method** : `GET` - -**Headers** : `Authorization: ` - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -GET bulk_specification/?files=file1.[,file2.,...] -``` - -`` is one of `csv`, `tsv`, `xls`, or `xlsx`. - -Reponse: - -``` -{ - "types": { - : [ - {: , : , ...}, - {: , : , ...}, - ... - ], - : [ - {: , : , ...}, - ... - ], - ... - }, - "files": { - : {"file": "/file1.", "tab": "tabname"}, - : {"file": "/file2.", "tab": null}, - ... - } -} -``` - -- `` is a data type ID from - the [Mappings.py](./staging_service/autodetect/Mappings.py) file and the Narrative staging area configuration file - it is a shared namespace between the staging service and Narrative to specify bulk applications, and has a 1:1 mapping to an app. It is determined by the first header line from the templates. -- `` is the ID of an input parameter from a `KB-SDK` app's `spec.json` file. These are determined by the second header line from the templates and will differ by the data type. -- `` is the user-provided value for the input for a given `spec.json` ID and import or analysis instance, where an import/analysis instance is effectively a row in the data file. Each data file row is provided in order for each type. Each row is provided in a mapping of `spec.json` ID to the data for the row. Lines > 3 in the templates are user-provided data, and each line corresponds to a single import or analysis. - -#### Error Response - -Error responses are of the general form: - -``` -{ - "errors": [ - {"type": , - ... other fields depending on the error code ... - }, - ... 
- ] -} -``` - -Existing error codes are currently: - -- `cannot_find_file` if an input file cannot be found -- `cannot_parse_file` if an input file cannot be parsed -- `incorrect_column_count` if the column count is not as expected - - For Excel files, this may mean there is a non-empty cell outside the bounds of the data area -- `multiple_specifications_for_data_type` if more than one tab or file per data type is submitted -- `no_files_provided` if no files were provided -- `unexpected_error` if some other error occurs - -The HTTP code returned will be, in order of precedence: - -- 400 if any error other than `cannot_find_file` or `unexpected_error` occurs -- 404 if at least one error is `cannot_find_file` but there are no 400-type errors -- 500 if all errors are `unexpected_error` - -The per error type data structures are: - -##### `cannot_find_file` - -``` -{ - "type": "cannot_find_file", - "file": -} -``` - -##### `cannot_parse_file` - -``` -{ - "type": "cannot_parse_file", - "file": , - "tab": , - "message": -} -``` - -##### `incorrect_column_count` - -``` -{ - "type": "incorrect_column_count", - "file": , - "tab": , - "message": -} -``` - -##### `multiple_specifications_for_data_type` - -``` -{ - "type": "multiple_specifications_for_data_type", - "file_1": , - "tab_1": , - "file_2": , - "tab_2": , - "message": -} -``` - -##### `no_files_provided` - -``` -{ - "type": "no_files_provided" -} -``` - -##### `unexpected_error` - -``` -{ - "type": "unexpected_error", - "file": - "message": -} -``` - -### Write bulk specifications - -This endpoint is the reverse of the parse bulk specifications endpoint - it takes a similar data structure to that which the parse endpoint returns and writes bulk specification templates. - -**URL** : `ci.kbase.us/services/staging_service/write_bulk_specification` - -**local URL** : `localhost:3000/write_bulk_specification` - -**Method** : `POST` - -**Headers** : - -- `Authorization: ` -- `Content-Type: Application/JSON` - -#### Success Response -**Code** : `200 OK` +from within the devcontainer. -**Content example** - -``` -POST write_bulk_specification/ -{ - "output_directory": , - "output_file_type": , - "types": { - : { - "order_and_display: [ - [, ], - [, ], - ... - ], - "data": [ - {: , : , ...}, - {: , : , ...} - ... - ] - }, - : { - "order_and_display: [ - [, ], - ... - ], - "data": [ - {: , : , ...}, - ... - ] - }, - ... - } -} -``` - -- `output_directory` specifies where the output files should be written in the user's staging area. -- `output_file_type` specifies the format of the output files. -- `` is a data type ID from the [Mappings.py](./staging_service/autodetect/Mappings.py) file and the Narrative staging area configuration file - it is a shared namespace between the staging service and Narrative to specify bulk applications, and has a 1:1 mapping to an app. It is included in the first header line in the templates. -- `order_and_display` determines the ordering of the columns in the written templates, as well as mapping the spec.json ID of the parameter to the human-readable name of the parameter in the display.yml file. -- `` is the ID of an input parameter from a `KB-SDK` app's `spec.json` file. These are written to the second header line from the import templates and will differ by the data type. -- `data` contains any data to be written to the file as example data, and is analogous to the data structure returned from the parse endpoint. To specify that no data should be written to the template provide an empty list. 
-- `` is the value for the input for a given `spec.json` ID and import or analysis instance, where an import/analysis instance is effectively a row in the data file. Each data file row is provided in order for each type. Each row is provided in a mapping of `spec.json` ID to the data for the row. Lines > 3 in the templates are user-provided data, and each line corresponds to a single import or analysis. - -Reponse: - -``` -{ - "output_file_type": , - "files": { - : , - ... - : , - } -} -``` - -- `output_file_type` has the same definition as above. -- `files` contains a mapping of each provided data type to the output template file for that type. In the case of Excel, all the file paths will be the same since the data types are all written to different tabs in the same file. - -#### Error Response - -Method specific errors have the form: - -``` -{"error": } -``` - -The error code in this case will be a 4XX error. - -The AioHTTP server may also return built in errors that are not in JSON format - -an example of -this is overly large (> 1MB) request bodies. - -### Get Importer Mappings - -This endpoint returns: - -1) a mapping between a list of files and predicted importer apps, and -2) a file information list that includes the input file names split between the - file prefix and the file suffix, if any, that was used to determine the file -> importer - mapping, and a list of file types based on the file suffix. If a file has a suffix that does not - match any mapping (e.g. `.sys`), the suffix will be `null`, the prefix the entire - file name, and the file type list empty. - -For example, - -- if we pass in nothing we get a response with no mappings -- if we pass in a list of files, such as ["file1.fasta", "file2.fq", "None"], we would get back a response that maps to Fasta Importers and FastQ Importers, with a weight of 0 to 1 which represents the probability that this is the correct importer for you. -- for files for which there is no predicted app, the return is a null value -- this endpoint is used to power the dropdowns for the staging service window in the Narrative - -**URL** : `ci.kbase.us/services/staging_service/importer_mappings` - -**local URL** : `localhost:3000/importer_mappings` - -**Method** : `POST` - -**Headers** : Not Required - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -data = {"file_list": ["file1.txt", "file2.zip", "file3.gff3.gz"]} - async with AppClient(config, username) as cli: - resp = await cli.post( - "importer_mappings/", data=data - ) -``` - -Response: - -``` -{ - "mappings": [ - null, - [{ - "id": "decompress", - "title": "decompress/unpack", - "app_weight": 1, - }], - [{ - "app_weight": 1, - "id": "gff_genome", - "title": "GFF/FASTA Genome", - }, - { - "app_weight": 1, - "id": "gff_metagenome", - "title": "GFF/FASTA MetaGenome", - }] - ], - "fileinfo": [ - {"prefix": "file1.txt", "suffix": null, "file_ext_type": []}, - {"prefix": "file2", "suffix": "zip", "file_ext_type": ["CompressedFileFormatArchive"]}, - {"prefix": "file3", "suffix": "gff3.gz", "file_ext_type": ["GFF"]} - ] -} -``` - -#### Error Response - -**Code** : `400 Bad Request` - -**Content** - -``` -must provide file_list field -``` - -### Get importer filetypes - -This endpoint returns information about the file types associated with data -types and the file -extensions for those file types. It is primarily of use for creating UI elements -describing -which file extensions may be selected when performing bulk file selections. 
- -**URL** : `ci.kbase.us/services/staging_service/importer_filetypes` - -**local URL** : `localhost:3000/importer_filetypes` - -**Method** : `GET` - -**Headers** : Not Required - -#### Success Response - -**Code** : `200 OK` - -**Content example** - -``` -GET importer_filetypes/ -``` - -Response: - -``` -{ - "datatype_to_filetype": { - : [, ... ], - ... - : [, ... ], - }, - "filetype_to_extensions": { - : [, ..., ], - ... - : [, ..., ], - } -} -``` +## API -- `` is a data type ID from the [Mappings.py](./staging_service/autodetect/Mappings.py) file and the Narrative staging area configuration file - it is a shared namespace between the staging service and Narrative to specify bulk applications, and has a 1:1 mapping to an import app. It is included in the first header line in the templates. -- `` is a file type like `FASTA` or `GENBANK`. The supported file types are listed below. -- `` is a file extension like `*.fa` or `*.gbk`. +See [the API documentation](./docs/api.md). -## Autodetect App and File Type IDs +## Contributing -### App type IDs +This repo is realistically only open to contributions from users within the KBase +organization. The contribution model is Pull Requests make from branches off of main. So +clearly one needs to have the ability to push up branches and create PRs. -These are the currently supported upload app type IDs: +Forks are not supported at this time. -``` -fastq_reads_interleaved -fastq_reads_noninterleaved -sra_reads -genbank_genome -gff_genome -gff_metagenome -expression_matrix -media -fba_model -assembly -phenotype_set -sample_set -metabolic_annotation -metabolic_annotation_bulk -escher_map -decompress -``` +## Development -Note that decompress is only returned when no other file type can be detected -from the file extension. +For development support, see [the development docs](./docs/development.md) -### File type IDs +## License -These are the currently supported file type IDs. These are primarily useful for -apps that take -two different file types, like GFF/FASTA genomes. - -``` -FASTA -FASTQ -SRA -GFF -GENBANK -SBML -JSON -TSV -CSV -EXCEL -CompressedFileFormatArchive -``` +See [the KBase License](./LICENSE.md) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 8114a293..0acd6fee 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,83 +2,102 @@ ## Unreleased -- update to Python 3.11.4 -- run black on all service and test files, fix most linting complaints (pylint, +- update to Python 3.11.5 +- image base is python 3.11.5 / debian bookworm +- pin all image dependencies to those currently supported by the debian bookworm +- update all Python dependencies to latest versions +- run black on all service and test files, fix most linting complaints (pylint, sonarlint) +- add markdownlint plugin to devcontainer, config file to repo, and fix all linting + errors in markdown files. ## Version 1.3.6 -- Fixed a bug that would cause NaN and Inf values in xSV to be returned as JSON barewords, which could cause some JSON parsers to fail. They are now returned as strings. -- Changed the Excel parser to not consider NaN and Inf as missing values to maintain consistency with the xSV parsers +- Fixed a bug that would cause NaN and Inf values in xSV to be returned as JSON + barewords, which could cause some JSON parsers to fail. They are now returned as + strings. 
+- Changed the Excel parser to not consider NaN and Inf as missing values to maintain + consistency with the xSV parsers ## Version 1.3.5 -- Fixed a bug that under some circumstances could cause incomplete file metadata to be returned. +- Fixed a bug that under some circumstances could cause incomplete file metadata to be returned. ## Version 1.3.4 -- Alter the behavior of the bulk specification file writers to return an error if the input `types` parameter is empty. -- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the first header of a file had trailing separators. This occurs if a csv/tsv file is opened and saved by Excel. +- Alter the behavior of the bulk specification file writers to return an error if the + input `types` parameter is empty. +- Fixed a bug in the csv/tsv bulk specification parser that would case a failure if the + first header of a file had trailing separators. This occurs if a csv/tsv file is + opened and saved by Excel. ## Version 1.3.3 -- Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry for each empty line in the file. +- Fixed a bug in the csv/tsv bulk specification parser that would include an empty entry + for each empty line in the file. ## Version 1.3.2 -- Add `write_bulk_specification` endpoint for writing bulk specifications -- Add `import_filetypes` endpoint for getting datatype -> filetype -> extension mappings -- Fixed a bug in the csv/tsv bulk specification parser that would throw an error on any empty lines in the file, even at the end. The parser now ignores empty lines the same way the Excel parser does. -- Fixed a bug in the `bulk_specification` endpoint where a missing header item in a `*.?sv` file would cause it to be replaced with a strange name. -- Fixed a bug in the `bulk_specification` endpoint where a missing header item in an Excel file would cause a duplicate header error rather than a missing header error. -- As part of the two fixes above, some error message text has changed due to the rewrite of the parsers. +- Add `write_bulk_specification` endpoint for writing bulk specifications +- Add `import_filetypes` endpoint for getting datatype -> filetype -> extension mappings +- Fixed a bug in the csv/tsv bulk specification parser that would throw an error on any + empty lines in the file, even at the end. The parser now ignores empty lines the same + way the Excel parser does. +- Fixed a bug in the `bulk_specification` endpoint where a missing header item in a + `*.?sv` file would cause it to be replaced with a strange name. +- Fixed a bug in the `bulk_specification` endpoint where a missing header item in an + Excel file would cause a duplicate header error rather than a missing header error. +- As part of the two fixes above, some error message text has changed due to the rewrite + of the parsers. ## Version 1.3.1 -- added the `files` key to the returned data from the `bulk_specification` endpoint. +- added the `files` key to the returned data from the `bulk_specification` endpoint. ## Version 1.3.0 -- Update to Python 3.9 -- Add `bulk_specification` endpoint for parsing import specifications +- Update to Python 3.9 +- Add `bulk_specification` endpoint for parsing import specifications ## Version 1.2.0 -- BACKWARDS INCOMPATIBILITY: remove the unused `apps` key from the importer mappings endpoint. -- added a `fileinfo` field to the return of the importer mappings endpoint that includes the file prefix, suffix, and file type(s), if any. 
-- reverted change to expose dotfiles in the api by default -- attempting to upload a dotfile will now cause an error +- BACKWARDS INCOMPATIBILITY: remove the unused `apps` key from the importer mappings endpoint. +- added a `fileinfo` field to the return of the importer mappings endpoint that includes + the file prefix, suffix, and file type(s), if any. +- reverted change to expose dotfiles in the api by default +- attempting to upload a dotfile will now cause an error ## Version 1.1.9 -- Added support for Genbank `*.gb` and `*.gbff` extensions -- Added support for gzipped Reads, Assemblies, Genbank Files, and GFF files. +- Added support for Genbank `*.gb` and `*.gbff` extensions +- Added support for gzipped Reads, Assemblies, Genbank Files, and GFF files. ## Version 1.1.8 -- Added new endpoint `importer-mappings/` for getting a mapping of importers for file names -- Updated endpoint to use GET and query_string -- Ran black -- BUGFIX: Change head/tail functionality to return 1024 chars to avoid bad cases with really large one line files -- Update FASTQ/SRA to use their own APP Uis +- Added new endpoint `importer-mappings/` for getting a mapping of importers for file names +- Updated endpoint to use GET and query_string +- Ran black +- BUGFIX: Change head/tail functionality to return 1024 chars to avoid bad cases with + really large one line files +- Update FASTQ/SRA to use their own APP Uis ## Version 1.1.7 -- Add a file name check to void user uploading files with name starting with space -- Update list endpoint so that it only hide .globus_id file by default +- Add a file name check to void user uploading files with name starting with space +- Update list endpoint so that it only hide .globus_id file by default ## Version 1.1.3 -- Add a add-acl-concierge endpoint -- Added configs for that endpoint -- Added options to dockerfile/docker-compose +- Add a add-acl-concierge endpoint +- Added configs for that endpoint +- Added options to dockerfile/docker-compose ## Version 1.1.2 -- Added capability to check 'kbase_session_backup' cookie -- Added a `add-acl` and `remove-acl` endpoint for globus endpoint access -- Change logging to STDOUT +- Added capability to check 'kbase_session_backup' cookie +- Added a `add-acl` and `remove-acl` endpoint for globus endpoint access +- Change logging to STDOUT ### Version 1.1.0 -- Added a `download` endpoint for files +- Added a `download` endpoint for files diff --git a/deployment/bin/entrypoint.sh b/deployment/bin/entrypoint.sh index 787b2446..083f9eab 100755 --- a/deployment/bin/entrypoint.sh +++ b/deployment/bin/entrypoint.sh @@ -1,19 +1,10 @@ -#!/bin/bash -# if you change this file and the places it looks for local paths: -# please also update the launch.json for vscode accordingly -# the things that must be kept in sync are KB_DEPLOYMENT_CONFIG and PYTHONPATH +#!/usr/bin/env bash -#top section for local running -DIR="$( cd "$( dirname "$0" )" && pwd )" -if [ -d "$DIR/../../staging_service" ]; then - PYTHONPATH="$DIR/../../staging_service" - export KB_DEPLOYMENT_CONFIG="$DIR/../conf/local.cfg" - export FILE_LIFETIME="90" -fi +# +# This is the production entrypoint, whose sole job is to start the service. +# The service starts via staging_service/__main__.py which is why the python +# invocation below references the staging_service directory. 
+# -#bottom section for running inside docker -if [ -d "kb/deployment/lib/staging_service" ]; then - PYTHONPATH="kb/deployment/lib/staging_service" - # environment variable for KB_DEPLOYMENT_CONFIG set in docker-compose.yml -fi -python3 -m staging_service \ No newline at end of file +export PYTHONPATH="/kb/deployment/lib" +python -m staging_service \ No newline at end of file diff --git a/deployment/conf/local.cfg b/deployment/conf/local.cfg index b0033d2b..17771ccd 100644 --- a/deployment/conf/local.cfg +++ b/deployment/conf/local.cfg @@ -2,4 +2,5 @@ META_DIR = ./data/metadata/ DATA_DIR = ./data/bulk/ AUTH_URL = https://ci.kbase.us/services/auth/api/V2/token -CONCIERGE_PATH = /kbaseconcierge \ No newline at end of file +CONCIERGE_PATH = /kbaseconcierge +FILE_EXTENSION_MAPPINGS = ./deployment/conf/supported_apps_w_extensions.json \ No newline at end of file diff --git a/development/scripts/README.md b/development/scripts/README.md new file mode 100644 index 00000000..44542f6f --- /dev/null +++ b/development/scripts/README.md @@ -0,0 +1,35 @@ +# Scripts + +## `generate-data-files.py` + +A small tool, an example function really, to generate files for manual testing of upload and download. I use it by copy/pasting into a Python REPL session in the directory in which I want to generate files, and just run the function with the required parameters. + +## `list-uploads.sh` + +Handy for listing the contents of the `./data` directory while developing the `/upload` endpoint of the server locally. It simply lists the directory every 0.5s. + +## `run` + +Runs the Docker container located in /development/tools/runner with docker compose. This is a "runner container" designed to run things inside a container which is very similar to the service container. It is used to run tests, and can be used to run code quality tools also like black and mypy (not yet used in this project.) + +The advantage of using `run` is that one does not have to arrange the precise version of Python and OS dependencies in a virtual environment. + +Here are some usages of it: + +To run mypy against the codebase: + +```shell +./development/run mypy staging_service +``` + +To run black against the codebase: + +```shell +./development/run black staging_service +``` + +To run black against the codebase and apply formatting fixes: + +```shell +./development/run black staging_service --fix +``` \ No newline at end of file diff --git a/development/scripts/generate-data-files.py b/development/scripts/generate-data-files.py new file mode 100644 index 00000000..ceebe63d --- /dev/null +++ b/development/scripts/generate-data-files.py @@ -0,0 +1,15 @@ +import io + + +def generate_null_file(size: int, name: str): + """ + Generate a file full of 0 bytes, for testing different + file size scenarios. + + e.g. generate_null_file(5100000000, "5.1g.out") + """ + with open(name, "wb") as out: + bw = io.BufferedWriter(out, 10000000) + for _ in range(size): + bw.write(b"\0") + bw.flush() diff --git a/development/scripts/generate-random-strings.py b/development/scripts/generate-random-strings.py new file mode 100644 index 00000000..ae2e7bda --- /dev/null +++ b/development/scripts/generate-random-strings.py @@ -0,0 +1,24 @@ +""" +Generate a set of text files for testing filled with random ascii characters. 
+""" +import random +import string + + +def make_random_string(string_length: int) -> str: + """ + Generate a string of random ascii letters of the given length + """ + possible_letters = string.ascii_letters + # ignore the Sonarcloud and bandit warning below triggered by codacy; + # this is just for generating data for testing, either formal or informa; + # security is not an issue. + return "".join(random.choice(possible_letters) for _ in range(string_length)) # nosec NOSONAR + + +if __name__ == "__main__": + for file_length in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]: + filename = f"sample-text-file-{file_length}" + print(f"generating {filename}") + with open(f"sample-text-file-{file_length}", "w", encoding="utf-8") as out: + out.write(make_random_string(file_length)) diff --git a/development/scripts/list-uploads.sh b/development/scripts/list-uploads.sh new file mode 100755 index 00000000..10f87f8e --- /dev/null +++ b/development/scripts/list-uploads.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# +# A handy script to display the contents of the data directory in a +# terminal window, refreshing every 0.5 seconds and excluding +# any macOS .DS_Store Desktop Services files. +# +while : +do + clear + find data -type f \( ! -name ".DS_Store" \) -ls + sleep 0.5 +done \ No newline at end of file diff --git a/development/scripts/run b/development/scripts/run new file mode 100755 index 00000000..dce2653f --- /dev/null +++ b/development/scripts/run @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +PROJECT_DIRECTORY=${DIR:-$PWD} +echo "Project Directory: ${PROJECT_DIRECTORY}" +docker compose \ + -f "${PROJECT_DIRECTORY}/development/tools/runner/docker-compose.yml" \ + --project-name tools \ + run --rm runner "${@:1}" \ No newline at end of file diff --git a/development/scripts/run-dev-server.sh b/development/scripts/run-dev-server.sh new file mode 100755 index 00000000..6c2f6d65 --- /dev/null +++ b/development/scripts/run-dev-server.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Running server in development..." +docker compose \ + -f development/docker-compose-kbase-ui.yml \ + --project-directory "${PWD}" \ + run --rm staging_service \ No newline at end of file diff --git a/development/tools/runner/docker-compose.yml b/development/tools/runner/docker-compose.yml new file mode 100644 index 00000000..9d48d9d3 --- /dev/null +++ b/development/tools/runner/docker-compose.yml @@ -0,0 +1,10 @@ +version: '3.8' +services: + runner: + build: + context: ../../.. + target: dev + volumes: + - ../../..:/kb/module + entrypoint: + - /kb/module/development/tools/runner/entrypoint.sh diff --git a/development/tools/runner/entrypoint.sh b/development/tools/runner/entrypoint.sh new file mode 100755 index 00000000..5939162f --- /dev/null +++ b/development/tools/runner/entrypoint.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -e + +# Nice to set this up for all future python code being run. +# We set the root of the repo as the python path. +export PYTHONPATH="${PWD}" + +# +# This execs whatever is provided as a COMMAND to the container. By default, as established +# in the image via the Dockerfile, it is scripts/start-server.sh +# This mechanism, though, allows us to run anything inside the container. We use this +# for running tests, mypy, black, etc. through docker compose. +# Note that surrounding $@ with double quote causes the expansion (which starts with +# parameter 1 not 0) to result in each parameter being retained without splitting. +# Sort of like "$@" -> "$1" "$2" "$3" .. 
+# +exec "$@" diff --git a/docker-compose.yml b/docker-compose.yml index f14b742c..ba0f0576 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,16 +1,37 @@ # WARNING this file does not sync up with the deployment automatically +# +# This compose file is for running the service locally. +# +# Local (host) directories are overlaid into the container in order to facilitate +# live coding of source while exercising the service api live. +# The file locations in the container should be those expected by the service. +# +# It is probably more efficacious to use a devcontainer, e.g. with VSC, though, +# so that the development UI can recognize installed libraries, etc. +# +# The "kbase-dev" network allows the container to interact with KBase user interfaces +# via the docker network. In this development scenario, kbase-ui provides a proxy so +# that kbase-ui and the narrative can communicate with the local service over +# https://ci.kbase.us and provide a natural co-development experience. +# version: "3.1" +networks: + kbase-dev: + name: kbase-dev services: staging_service: - image: kbase/staging_service:develop + # image: kbase/staging_service:develop + build: + context: . + networks: + - kbase-dev ports: - "3000:3000" # The following mount assumes the user dirs are under /data/bulk/{username} # it further assumes that there is a pre-existing /data/metadata directory volumes: - - "./data:/kb/deployment/lib/src/data" - - "./:/staging_service" - + - ./data:/kb/deployment/lib/src/data + - ./staging_service:/kb/deployment/lib/staging_service environment: - - KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg - - FILE_LIFETIME="90" + - KB_DEPLOYMENT_CONFIG=/kb/module/deployment/conf/deployment.cfg + - FILE_LIFETIME=90 diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 00000000..aa84aef3 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,1425 @@ + +# API + +- all paths should be specified treating the user's home directory as root + +- The url base, in an actual deployment, will be: + - `https://ci.kbase.us` for CI + - `https://next.kbase.us` for Next + - `https://appdev.kbase.us` for Appdev + - `https://kbase.us` for Production + +- The base path in all environments is `/services/staging_service` + +- All api paths will be suffixed to the url base and base path. For example: + + `https://ci.kbase.us/services/staging_service/file-lifetime` + will invoke the + `file-lifetime` endpoint, which returns the number of days a file will be retained. + +- For local usage (i.e. spinning the service up locally) + - the base url is `http://localhost:3000` + - the base path is not required + - e.g. `http://localhost:3000/file-lifetime` + +- For endpoints that require authorization, the `Authorization` header must be supplied, + with the value a KBase auth token. + +## Test Service + +`GET /test-service` + +Returns a fixed text response. Used to determine if the service is running? + +> TODO: we probably don't need this endpoint + +### Success Response + +`200 OK` + +#### Example + +```text +This is just a test. This is only a test. 
+``` + +## Test Auth + +`GET /test-auth` + +Returns a text response indicating that the + +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +`Content-Type: text/plain` + +#### Example + +```text +I'm authenticated as +``` + +### Error Responses + +The [common responses for an authorized request](#common-authorization-error-responses) + +## File Lifetime + +`GET /file-lifetime` + +Number of days a file will be held for in staging service before being deleted. + +This is not actually handled by the server but is expected to be performed by a cron job +which shares the env variable read here. + +### Success Response + +`200 OK` + +#### Content + +`text/plain` + +#### Example + +```text +90 +``` + +## List Directory + +`GET /list/{path to directory}?[showHidden={True/False}]` + +Returns a JSON Array containing entries for each file and folder within the provided directory. + +Defaults to not show hidden dotfiles. + +### Headers + +`Authorization: ` + +### Success Response + +`200 OK` + +### Example + +```json +[ + { + "name": "testFolder", + "path": "nixonpjoshua/testFolder", + "mtime": 1510949575000, + "size": 96, + "isFolder": true + }, + { + "name": "testfile", + "path": "nixonpjoshua/testfile", + "mtime": 1510949629000, + "size": 335, + "isFolder": false + } +] +``` + +### Error Responses + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `404 Not Found` + +Results if the requested path is not found for the user associated with the +Authorization. + +##### Content + +`text/plain` + +```text +path / does not exist +``` + +## Download file + +`GET /download/{path to file}` + +Returns the request file. + +### Headers + +- `Authorization: ` + +### Success Response + +#### `200 OK` + +##### Content + +`application/octet-stream` + +The contents of the file are returned as the entire body of the response + +### Error Responses + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `400 Bad Request` + +Returned if the requested file is a directory, not a file. + +##### Content + +`text/plain` + +```text +/ is a directory not a file +``` + +#### `404 Not Found` + +The file does not exist + +##### Content + +`text/plain` + +```text +path / does not exist +``` + +## Search files and folders + +`GET /search/{search query}?[showHidden={True/False}]` + +Returns a list of files for the user identified by the Authorization, and matching the +given query text + +defaults to not show hidden dotfiles + +> TODO: explain how the search works! 
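
As a hedged illustration of invoking this endpoint, the Python sketch below issues the request with the `requests` library. The base URL, token, and query text are placeholders; nothing is assumed about the matching semantics beyond what is documented above.

```python
# Hypothetical client call to the search endpoint; base URL, token, and query
# text are placeholders and not taken from the repository.
import requests

BASE_URL = "https://ci.kbase.us/services/staging_service"  # or http://localhost:3000 locally
TOKEN = "<KBase auth token>"  # placeholder

resp = requests.get(
    f"{BASE_URL}/search/testfile",
    params={"showHidden": "False"},
    headers={"Authorization": TOKEN},
    timeout=30,
)
resp.raise_for_status()
for entry in resp.json():
    print(entry["path"], entry["size"], entry["isFolder"])
```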
+ +### Headers + +- `Authorization: ` + +### Success Response + +#### `200 OK` + +##### Content + +`application/json` + +```json +[ + { + "name": "testfile", + "path": "nixonpjoshua/testfile", + "mtime": 1510949629000, + "size": 335, + "isFolder": false + }, + { + "name": "testFolder", + "path": "nixonpjoshua/testFolder", + "mtime": 1510949575000, + "size": 96, + "isFolder": true + }, + { + "name": "testinnerFile", + "path": "nixonpjoshua/testFolder/testinnerFile", + "mtime": 1510949575000, + "size": 0, + "isFolder": false + } +] +``` + +### Error Responses + +The [common responses for an authorized request](#common-authorization-error-responses) + +## File and Folder Metadata + +`GET /metadata/{path to file or folder}` + +### Headers + +- `Authorization: ` + +### Success Response + +#### `200 OK` + +If the file or directory is found, a Metadata object is returned + +##### Content + +`application/json` + +##### Examples + +```json +{ + "name": "testFolder", + "path": "nixonpjoshua/testFolder", + "mtime": 1510949575000, + "size": 96, + "isFolder": true +} +``` + +```json +{ + "md5": "73cf08ad9d78d3fc826f0f265139de33", + "lineCount": "13", + "head": "there is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom", + "tail": "there is stuff in this file\nthere is stuff in this file\nthere is stuff in this file\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom\nstuff at the bottom", + "name": "testFile", + "path": "nixonpjoshua/testFile", + "mtime": 1510949629000, + "size": 335, + "isFolder": false +} +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `404 Not Found` + +##### Content + +`text/plain` + +```text +path / does not exist +``` + +## Upload File + +`POST /upload` + +This most important endpoint is designed to receive one or more files in a `POST` +request sending a `multipart/form-data` body. + +The body format is described in more detail below. + +The response is a JSON object containing file metadata. + +> This might seem a bit of an odd choice, and does complicate the API. It was chosen, I +> believe because the Narrative front-end component handling the submission of file +> uploads supports `multipart/form-data`, or it may be because historically the first +> front-end implementation was a form itself, as HTML forms will send as +> `mutlipart/form-data` if a file control is used to select a file. +> +> The current front-end implementation does not require this, although it does support +> it, so I would suggest that in the future we refactor to be a simple binary body, with +> the filename specified in the url search (query param). + +### Headers + +- `Authorization: ` +- `Content-Type: multipart/form-data` + +### Body + +The multipart form-data format supports sending of multiple form fields, each with their +own metadata as well as body. + +The service requires that two fields be present in the specified order: + +- `destPath` - path file should end up in +- `uploads` - the file itself. + +Filenames starting with whitespace or a '.' are not allowed + +The `destPath` field contains the path at which the file should be created. 
It will +probably have been set based on the path of the file chosen for upload in the Narrative +interface (although the service API is agnostic about the how and why.) + +The `uploads` field will contain a binary representation of the file. The file binary +content will be literally copied into the destination file, with no encoding or validation. + +#### Example + +```text +------WebKitFormBoundaryA0xgUu1fi1whAcSB +Content-Disposition: form-data; name="destPath" + +/ +------WebKitFormBoundaryA0xgUu1fi1whAcSB +Content-Disposition: form-data; name="uploads"; filename="foo.csv" +Content-Type: text/markdown + +[binary content excluded] +------WebKitFormBoundaryA0xgUu1fi1whAcSB-- +``` + +In this example, the `destPath` is `/`, indicating that the file will be created in the +root directory of the user's staging area. + +The `uploads` field would contain the file's binary content. Note that the field +metadata includes the "filename" property indicating that the target file +name is `"foo.csv"`. + +### Success Response + +`200 OK` + +A successful response will return a JSON object containing a metadata description of the +file. + +#### Content + +`application/json` + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "array", + "items": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "path": { + "type": "string" + }, + "mtime": { + "type": "integer" + }, + "size": { + "type": "integer" + }, + "isFolder": { + "type": "boolean" + } + }, + "required": [ + "name", + "path", + "mtime", + "size", + "isFolder" + ] + } + ] +} +``` + +#### Example + +```json +[ + { + "name": "fasciculatum_supercontig.fasta", + "path": "nixonpjoshua/fasciculatum_supercontig.fasta", + "mtime": 1510950061000, + "size": 31536508, + "isFolder": false + } +] +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +## Define/Create UPA for file which has been imported + +`POST /define-upa/{path to imported file}` + +### Headers + +- `Authorization: ` +- `Content-Type: application/json` + +### Body + +The POST request body contains an object whose sole (??) property contains the `upa` to +associate with the provided file. 
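+
+A complete request might therefore look like the following sketch, in which both the
+file path and the UPA value are hypothetical:
+
+```shell
+curl -X POST \
+  -H "Authorization: $KBASE_AUTH_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{"upa": "123.4.5"}' \
+  "http://localhost:3000/define-upa/myfolder/myfile.fasta"
+```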
+ +#### Schema + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "upa": { + "type": "string", + "description": "the actual UPA of imported file" + } + }, + "required": [ + "upa" + ] +} +``` + +#### Example + +```json +{ + "upa": "123.4.5" +} +``` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: text/plain` + +#### Body + +```text +successfully update UPA for file +``` + +### Error Responses + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `400 Bad Request` + +UPA missing + +##### Content + +`text/plain` + +```text +must provide UPA field in body +``` + +## Delete file or folder (will delete things contained in folder) + +`DELETE /delete/{path to file or folder}` + +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: text/plain` + +#### Body + +```text +successfully deleted UPA +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `403 Forbidden` + +##### Headers + +- `Content-Type: text/plain` + +##### Content + +```text +cannot delete home directory +``` + +```text +cannot delete protected file +``` + +or other file access errors. + +#### `404 Not Found` + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text +could not delete +``` + +## Move/rename a file or folder + +`PATCH /mv/{path to file or folder}` + +### Headers + +- `Authorization: ` +- `Content-Type: application/json` + +### Body + +#### Schema + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "newPath": { + "type": "string", + "description": "the new location/name for file or folder" + } + }, + "required": [ + "newPath" + ] +} +``` + +#### Example + +```json +{ + "newPath": "/foo/bar" +} +``` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: text/plain` + +#### Body + +```text +successfully moved to +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `400 Bad Request` + +If the newPath field is missing in the content body + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text +must provide newPath field in body +``` + +#### `403 Forbidden` + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text +cannot rename home or move directory +``` + +```text +cannot rename or move protected file +``` + +#### `409 Conflict` + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text + already exists +``` + +## Decompress various archive formats + +`PATCH /decompress/{path to archive}` + +supported archive formats are: +.zip, .ZIP, .tar.gz, .tgz, .tar.bz, .tar.bz2, .tar, .gz, .bz2, .bzip2 + +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: text/plain` + +#### Body + +```text +successfully decompressed +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `400 Bad Request` + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text +cannot decompress a file +``` + +## Add Globus ACL + +`GET /add-acl` + +After authenticating at this endpoint, AUTH is queried to get your filepath and globus +id file for linking to globus. 
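+
+A request sketch against a local instance (token variable assumed):
+
+```shell
+curl -H "Authorization: $KBASE_AUTH_TOKEN" "http://localhost:3000/add-acl"
+```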
+ +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: application/json` + +#### Body + +##### Schema + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "principal": { + "type": "string" + }, + "path": { + "type": "string" + }, + "permissions": { + "type": "string" + } + }, + "required": [ + "success", + "principal", + "path", + "permissions" + ] +} +``` + +##### Example + +```json +{ + "success": true, + "principal": "KBase-Example-59436z4-z0b6-z49f-zc5c-zbd455f97c39", + "path": "/username/", + "permissions": "rw" +} +``` + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `500 Internal Server Error` + +If issue with Globus API or ACL Already Exists + +##### Headers + +- `Content-Type: application/json` + +##### Body + +###### Schema + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "error_type": { + "type": "string" + }, + "error": { + "type": "string" + }, + "error_code": { + "type": "string" + }, + "shared_directory_basename": { + "type": "string" + } + }, + "required": [ + "success", + "error_type", + "error", + "error_code", + "shared_directory_basename" + ] +} +``` + +###### Content (example) + +```json +{ + "success": false, + "error_type": "TransferAPIError", + "error": "Can't create ACL rule; it already exists", + "error_code": "Exists", + "shared_directory_basename": "/username/" +} +``` + +## Remove Globus ACL + +`GET /remove-acl` + +After authenticating at this endpoint, AUTH is queried to get your filepath and globus +id file for linking to globus. + +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: application/json` + +#### Body + +##### Schema + +##### Content (example) + +```json +{ + "message": "... message elided...", + "Success": true +} +``` + +Note that the "message" is that returned by the globus api, and out of scope for +documentation here. + +> TODO: we should provide our own message, or return the globus response data, but not +> return the globus response data (a json object) into a string and call it a message! + +### Error Response + +The [common responses for an authorized request](#common-authorization-error-responses) + +#### `500 Internal Server Error` + +An issue with Globus API or ACL Already Exists + +##### Headers + +- `Content-Type: application/json` + +##### Body + +####### Schema + +```json +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "success": { + "type": "boolean" + }, + "error_type": { + "type": "string" + }, + "error": { + "type": "string" + }, + "error_code": { + "type": "string" + }, + "shared_directory_basename": { + "type": "string" + } + }, + "required": [ + "success", + "error_type", + "error", + "error_code", + "shared_directory_basename" + ] +} +``` + +####### Content (example) + +```json +{ + "success": false, + "error_type": "TansferAPIError", + "error": "Can't create ACL rule; it already exists", + "error_code": "Exists", + "shared_directory_basename": "/username/" +} +``` + +## Parse bulk specifications + +`GET /bulk_specification/?files=file1.[,file2.,...]` + +where `` is one of `csv`, `tsv`, `xls`, or `xlsx`. 
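+
+For example, a request naming two hypothetical specification files in the CI
+environment might look like:
+
+```shell
+curl -H "Authorization: $KBASE_AUTH_TOKEN" \
+  "https://ci.kbase.us/services/staging_service/bulk_specification/?files=genomes.csv,samples.xlsx"
+```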
+ +This endpoint parses one or more bulk specification files in the staging area into a data +structure (close to) ready for insertion into the Narrative bulk import or analysis cell. + +It can parse `.tsv`, `.csv`, and Excel (`.xls` and `.xlsx`) files. Templates for the +currently supported data types are available in the +[templates](./import_specifications/templates) directory of this repo. See the +[README.md](./import_specifications/templates/README.md) file for instructions on +template usage. + +See the [import specification ADR document](./docs/import_specifications.ADR.md) for design +details. + +### Headers + +- `Authorization: ` + +### Success Response + +`200 OK` + +#### Headers + +#### Body + +##### Schema + +> TODO + +##### Content (example) + +```text +{ + "types": { + : [ + {: , : , ...}, + {: , : , ...}, + ... + ], + : [ + {: , : , ...}, + ... + ], + ... + }, + "files": { + : {"file": "/file1.", "tab": "tabname"}, + : {"file": "/file2.", "tab": null}, + ... + } +} +``` + +- `` is a data type ID from the [Mappings.py](./staging_service/autodetect/Mappings.py) + file and the Narrative staging area configuration file - it is a shared namespace + between the staging service and Narrative to specify bulk applications, and has a + 1:1 mapping to an app. It is determined by the first header line from the templates. +- `` is the ID of an input parameter from a `KB-SDK` app's `spec.json` file. + These are determined by the second header line from the templates and will differ + by the data type. +- `` is the user-provided value for the input for a given + `spec.json` + ID and import or analysis instance, where an import/analysis instance is effectively a + row in the data file. Each data file row is provided in order for each type. Each row is + provided in a mapping of `spec.json` ID to the data for the row. Lines > 3 in the + templates are user-provided data, and each line corresponds to a single import or analysis. + +### Error Response + +Error responses are of the general form: + +```json +{ + "errors": [ + {"type": , + ... other fields depending on the error code ... + }, + ... 
+ ] +} +``` + +Existing error codes are currently: + +- `cannot_find_file` if an input file cannot be found +- `cannot_parse_file` if an input file cannot be parsed +- `incorrect_column_count` if the column count is not as expected + - For Excel files, this may mean there is a non-empty cell outside the bounds of the + data area +- `multiple_specifications_for_data_type` if more than one tab or file per data type is submitted +- `no_files_provided` if no files were provided +- `unexpected_error` if some other error occurs + +The HTTP code returned will be, in order of precedence: + +- 400 if any error other than `cannot_find_file` or `unexpected_error` occurs +- 404 if at least one error is `cannot_find_file` but there are no 400-type errors +- 500 if all errors are `unexpected_error` + +The per error type data structures are: + +#### `cannot_find_file` + +```json +{ + "type": "cannot_find_file", + "file": +} +``` + +#### `cannot_parse_file` + +```json +{ + "type": "cannot_parse_file", + "file": , + "tab": , + "message": +} +``` + +#### `incorrect_column_count` + +```json +{ + "type": "incorrect_column_count", + "file": , + "tab": , + "message": +} +``` + +#### `multiple_specifications_for_data_type` + +```json +{ + "type": "multiple_specifications_for_data_type", + "file_1": , + "tab_1": , + "file_2": , + "tab_2": , + "message": +} +``` + +#### `no_files_provided` + +```json +{ + "type": "no_files_provided" +} +``` + +#### `unexpected_error` + +```json +{ + "type": "unexpected_error", + "file": + "message": +} +``` + +## Write bulk specifications + +`POST /write_bulk_specification` + +This endpoint is the reverse of the parse bulk specifications endpoint - it takes a similar +data structure to that which the parse endpoint returns and writes bulk specification templates. + +### Headers + +- `Authorization: ` +- `Content-Type: application/json` + +#### Body + +##### Schema + +> TODO + +##### Content (example) + +```json +{ + "output_directory": , + "output_file_type": , + "types": { + : { + "order_and_display: [ + [, ], + [, ], + ... + ], + "data": [ + {: , : , ...}, + {: , : , ...} + ... + ] + }, + : { + "order_and_display: [ + [, ], + ... + ], + "data": [ + {: , : , ...}, + ... + ] + }, + ... + } +} +``` + +- `output_directory` specifies where the output files should be written in the user's + staging area. +- `output_file_type` specifies the format of the output files. +- `` is a data type ID from the [Mappings.py](./staging_service/autodetect/Mappings.py) + file and the Narrative staging area configuration file - it is a shared namespace + between the staging service and Narrative to specify bulk applications, and has a + 1:1 mapping to an app. It is included in the first header line in the templates. +- `order_and_display` determines the ordering of the columns in the written templates, + as well as mapping the spec.json ID of the parameter to the human readable name of the + parameter in the display.yml file. +- `` is the ID of an input parameter from a `KB-SDK` app's `spec.json` file. + These are written to the second header line from the import templates and will differ + by the data type. +- `data` contains any data to be written to the file as example data, and is analogous + to the data structure returned from the parse endpoint. To specify that no data + should be written to the template provide an empty list. +- `` is the value for the input for a given `spec.json` ID + and import or analysis instance, where an import/analysis instance is effectively a row + in the data file. 
Each data file row is provided in order for each type. Each row is + provided in a mapping of `spec.json` ID to the data for the row. Lines > 3 in the + templates are user-provided data, and each line corresponds to a single import or analysis. + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: application/json` + +#### Body + +##### Schema + +> TODO + +##### Content (example) + +```json +{ + "output_file_type": , + "files": { + : , + ... + : , + } +} +``` + +- `output_file_type` has the same definition as above. +- `files` contains a mapping of each provided data type to the output template file for + that type. In the case of Excel, all the file paths will be the same since the data + types are all written to different tabs in the same file. + +### Error Response + +Method specific errors have the form: + +```json +{"error": ""} +``` + +The error code in this case will be a 4XX error. + +The AioHTTP server may also return built in errors that are not in JSON format - an +example of this is overly large (> 1MB) request bodies. + +## Get Importer Mappings + +`POST /importer_mappings` + +This endpoint returns: + +1) a mapping between a list of files and predicted importer apps, and +2) a file information list that includes the input file names split between the file + prefix and the file suffix, if any, that was used to determine the file -> importer + mapping, and a list of file types based on the file suffix. If a file has a suffix + that does not match any mapping (e.g. `.sys`), the suffix will be `null`, the prefix + the entire file name, and the file type list empty. + +For example, + +- if we pass in nothing we get a response with no mappings +- if we pass in a list of files, such as ["file1.fasta", "file2.fq", "None"], we would + get back a response that maps to Fasta Importers and FastQ Importers, with a weight + of 0 to 1 which represents the probability that this is the correct importer for you. +- for files for which there is no predicted app, the return is a null value +- this endpoint is used to power the dropdowns for the staging service window in the Narrative + +### Headers + +none + +### Success Response + +`200 OK` + +#### Headers + +- `Content-Type: application/json` + +#### Body + +##### Schema + +> TODO + +##### Content (example) + +```python +data = {"file_list": ["file1.txt", "file2.zip", "file3.gff3.gz"]} + async with AppClient(config, username) as cli: + resp = await cli.post( + "importer_mappings/", data=data + ) +``` + +```json +{ + "mappings": [ + null, + [{ + "id": "decompress", + "title": "decompress/unpack", + "app_weight": 1, + }], + [{ + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }] + ], + "fileinfo": [ + {"prefix": "file1.txt", "suffix": null, "file_ext_type": []}, + {"prefix": "file2", "suffix": "zip", "file_ext_type": ["CompressedFileFormatArchive"]}, + {"prefix": "file3", "suffix": "gff3.gz", "file_ext_type": ["GFF"]} + ] +} +``` + +### Error Responses + +> TODO: hopefully the typical 401/400 auth errors are returned + +#### `400 Bad Request` + +##### Headers + +- `Content-Type: text/plain` + +##### Body + +```text +must provide file_list field +``` + +## Get importer filetypes + +`GET /importer_filetypes` + +This endpoint returns information about the file types associated with data types and +the file extensions for those file types. 
It is primarily of use for creating UI +elements describing which file extensions may be selected when performing bulk file +selections. + +### Headers + +none + +### Success Response + +`200 OK` + +#### Headers + +#### Body + +##### Schema + +> TODO + +##### Content (example) + +```json +{ + "datatype_to_filetype": { + : [, ... ], + ... + : [, ... ], + }, + "filetype_to_extensions": { + : [, ..., ], + ... + : [, ..., ], + } +} +``` + +- `` is a data type ID from the [Mappings.py](./staging_service/autodetect/Mappings.py) + file and the Narrative staging area configuration file - it is a shared namespace + between the staging service and Narrative to specify bulk applications, and has a + 1:1 mapping to an import app. It is included in the first header line in the templates. +- `` is a file type like `FASTA` or `GENBANK`. The supported file types are + listed below. +- `` is a file extension like `*.fa` or `*.gbk`. + +## Autodetect App and File Type IDs + +### App type IDs + +These are the currently supported upload app type IDs: + +```text +fastq_reads_interleaved +fastq_reads_noninterleaved +sra_reads +genbank_genome +gff_genome +gff_metagenome +expression_matrix +media +fba_model +assembly +phenotype_set +sample_set +metabolic_annotation +metabolic_annotation_bulk +escher_map +decompress +``` + +Note that decompress is only returned when no other file type can be detected from the file +extension. + +### File type IDs + +These are the currently supported file type IDs. These are primarily useful for apps +that take two different file types, like GFF/FASTA genomes. + +```text +FASTA +FASTQ +SRA +GFF +GENBANK +SBML +JSON +TSV +CSV +EXCEL +CompressedFileFormatArchive +``` + +## Common Authorization Error Responses + +### `401 Unauthorized` + +Authentication is incorrect + +#### Content + +`text/plain` + +```text +Error Connecting to auth service ... +``` + +### `400 Bad Request` + +Results if the `Authorization` header field is absent or empty + +> TODO: this should be a 403 + +#### Content + +`text/plain` + +```text +Must supply token +``` diff --git a/docs/development/development-tools.md b/docs/development/development-tools.md new file mode 100644 index 00000000..84ca5c27 --- /dev/null +++ b/docs/development/development-tools.md @@ -0,0 +1,119 @@ +# Development Tools + +This document provides an overview of two different but related development approaches - +devcontainers (via Visual Studio Code), and command line tools via a Docker container. + +## Visual Studio Code devcontainer + +One of your best friends for development may be Visual Studio Code (VSC) and a devcontainer. +This repo contains a VSC devcontainer all ready for use. + +> At the time of writing, devcontainer support for PyCharm is not mature enough to use; +> too many features are missing. However, in the near future this should also be a very +> productive development environment. + +A devcontainer provides a stable, reproducible development platform. For this repo, it +is based on the Python 3.11 debian image, the same one used for deployment. In fact, the +devcontainer should reflect the same OS and Python environment as the deployment and the +development tools images. This feature is important for reproducibility, and providing +the least surprise! + +There is also support for locally hosted development via VSC, but this developer (EAP) +does not do this style of Python development. + +### Getting Started + +1. Ensure you have docker running. + +2. 
Open the project in VSC + + There are a few ways to open a project in VSC: + + - from the repo directory `code .` + - from VSC menu: + - File > New Window + - From the new window: + - Click on Explorer in the left-nav, then click on Open Folder + - or + - Select: File > Open Folder + - Select the folder containing the repo (project) + - Click Open + +3. Start the devcontainer. + + - press the keys `Shift` - `Command` - `P` (macOS) to open the command palette + - start typing "Dev Container: Reopen Folder in Container" + - when you see it appear, click on it + - the image will build, the container will start + +4. Open a terminal + + If a terminal is not already open, open the built-in terminal with `Control` - `~` + (that is a tilde) + +5. Depending on the docker installation, you may need to grant access. If you don't see + git things enabled, click on the Git tool, and follow the instructions. + +Now you should treat this just like a local environment. The Python dependencies will +already be installed. If you have changes to them, however, you may simply make the +change to the requirements file and then reinstall them from the VSC devcontainer +terminal. + +### Running Tools + +All of the tools available for running from the host via docker are also directly +available within the devcontainer. I still often run them from a host terminal anyway, as +I can control the position and display of the native terminal a bit better, and it is +also decoupled from any issues VSC may have. YMMV. + +Here are the common ones: + +- `mypy staging_service` +- `black staging_service` +- `isort staging_service` +- `./scripts/run_tests.sh` + +### Running the server + +It is very efficacious to develop the server as it is running, and to exercise new or +changed endpoints directly. This can be through a local host and port, or via kbase-ui +integration and an official kbase environment host. + +In either case, the startup of the service is the same: + +```shell +FILE_LIFETIME="90" KB_DEPLOYMENT_CONFIG="${PWD}/deployment/conf/local.cfg" PYTHONPATH="${PWD}/staging_service" python -m staging_service +``` + +The `FILE_LIFETIME` environment variable is required, and sets the file retention policy. + +We ensure that the configuration is available via `KB_DEPLOYMENT_CONFIG`. In our case we +are using a configuration file already prepared for local usage. If you inspect it, +you'll find that all directory references are relative to the current directory. We are +referencing the `local.cfg` local configuration where it is stored in the codebase. + +We also ensure that the `staging_service` is included in the `PYTHONPATH`. + +Finally, we invoke the service simply by starting the bare aiohttp server invocation +located in `staging_service/__main__.py`. + +As a quick sanity test once the service is running, try this from a host terminal: + +```shell +curl http://localhost:3000/test-service +``` + +> TODO: The service should have a `/status` endpoint, not `/test-service`. + +## Host Tools + +The code checking tools can all be run from the host via the `run` script. This script +actually runs the tools in a Docker container which is very similar to the devcontainer +and service Docker configurations. 
+
+To run the tools:
+
+- `./development/scripts/run mypy staging_service`
+- `./development/scripts/run black staging_service`
+- `./development/scripts/run isort staging_service`
+- `./development/scripts/run ./scripts/run_tests.sh`
diff --git a/docs/development/index.md b/docs/development/index.md
new file mode 100644
index 00000000..d65a9c39
--- /dev/null
+++ b/docs/development/index.md
@@ -0,0 +1,5 @@
+# Development
+
+- [Original docs from readme](./original-doc.md)
+- [Development Tools](./development-tools.md)
+- [Maintaining the Dockerfiles](./maintaining-dockerfile.md)
diff --git a/docs/development/maintaining-dockerfile.md b/docs/development/maintaining-dockerfile.md
new file mode 100644
index 00000000..1d67e389
--- /dev/null
+++ b/docs/development/maintaining-dockerfile.md
@@ -0,0 +1,44 @@
+# Maintaining Dockerfile
+
+## Base Image
+
+The Base Image should be an official Python distribution. We should prefer the most
+recent release, when possible.
+
+## OS version
+
+The OS should be Debian, as Debian/Ubuntu are widely used at KBase. The version should
+be the current stable. The LTS version, which is often several versions behind stable,
+does not seem to be supported by recent Python distributions. That is, if we want to get
+recent Python versions we probably can't use LTS, and conversely if we wished to use the
+LTS Debian we wouldn't be able to use the most recent Python releases.
+
+It is more important to have a current Python than a current OS. The OS really isn't
+much of a concern to the service, as long as the required OS-level dependencies are
+available. Python itself takes care of the interface to the OS, and that is all we are
+concerned with.
+
+## OS dependencies
+
+The OS dependencies are indicated in the Dockerfile as exact versions. This ensures that
+images are consistent, even as the base image evolves over time. We should keep an eye
+on this, as there are reports from some Linux distros (e.g. Alpine) that package
+retention is not permanent, and older packages may eventually be dropped, meaning that a
+future build may actually fail if it pins package versions. There is some controversy
+over this, with distro maintainers complaining that they don't have infinite storage
+space for all package versions they have distributed.
+
+## Dockerfile Design
+
+The Dockerfile serves at least three purposes. Its design reflects this by utilizing a
+multi-stage build. Multi-stage builds are primarily for creating internal build layers
+that can be omitted from the final image. However, in this case we use this design to
+create a base image with most OS and Python dependencies installed, a dev stage which
+has development dependencies (OS and Python) installed, and finally a production stage
+which adds all runtime files and the entrypoint.
+
+The dev stage is used by the devcontainer and tool docker-compose files to run a
+container which is all ready for development, just needing the repo to be volume mounted
+at `/kb/module`.
+
+The production deploy image can be exercised with the top level `docker-compose.yml`.
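+
+For example, a minimal sketch, assuming Docker Compose v2 (`docker compose`) is
+available on the host:
+
+```shell
+# Build the image and start the service defined in the top level docker-compose.yml.
+docker compose up --build
+```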
diff --git a/docs/development/original-doc.md b/docs/development/original-doc.md new file mode 100644 index 00000000..51206820 --- /dev/null +++ b/docs/development/original-doc.md @@ -0,0 +1,68 @@ +# staging_service + +In order to setup local development, you must have docker installed and if you want to +run it locally you must have python 3.11.5 or greater installed + +## setup + +make a folder called /data as well as inside that /bulk and inside that a folder for any +usernames you wish it to work with + +data + -bulk + -username + -username + +if you want to run locally you must install requirements.txt for python3 + +## running + +to run locally run `/deployment/bin/entrypoint.sh` + +to run inside docker run `./scripts/run_in_docker.sh` + +## tests + +### Run on host + +- to test use `./scripts/run_tests.sh` +- requires python 3.11.5 or higher +- requires installation on mac of libmagic `brew install libmagic` or `sudo port install + libmagic` + +### Run in container + +You can also run tests in a container which uses the same base image and uses the same +dependencies. (This container can also run other python tasks.) + +```shell +./development/scripts/run scripts/run_tests.sh +``` + +To run tests in individual test file you may supply the path to it. By default, the +tests run against `tests/`. + +```shell +./development/scripts/run scripts/run_tests.sh tests/test_app.py +``` + +## debugging + +Included configurations for the Visual Studio Code debugger for python that mirror what +is in the entrypoint.sh and testing configuration to run locally in the debugger, set +breakpoints and if you open the project in VSCode the debugger should be good to go. The +provided configurations can run locally and run tests locally + +## development + +When releasing a new version: + +- Update the release notes +- Update the version in [staging_service/app.py](staging_service/app.py).VERSION + +## expected command line utilities + +to run locally you will need all of these utils on your system: tar, unzip, zip, gzip, +bzip2, md5sum, head, tail, wc + +in the docker container all of these should be available diff --git a/docs/import_specifications.ADR.md b/docs/import_specifications.ADR.md index dbd04386..44eed525 100644 --- a/docs/import_specifications.ADR.md +++ b/docs/import_specifications.ADR.md @@ -1,31 +1,32 @@ # Import Specifications Architecture Design Record -This document specifies the design for handling import specifications in the Staging Service (StS). -An import specification is an Excel, CSV, or TSV file that contains instructions for how -to import one or more files in the staging area to KBase as KBase data types. +This document specifies the design for handling import specifications in the Staging +Service (StS). An import specification is an Excel, CSV, or TSV file that contains +instructions for how to import one or more files in the staging area to KBase as KBase +data types. 
## Resources -* [The original strategy document for this approach](https://docs.google.com/document/d/1ocmZVBlTzAh_cdZaWGRwIbAuH-mcPRZFdhcwRhAfzxM/edit) - * Readable for everyone that has access to the KBase Google Docs folder - * The document contains short descriptions of future projects not included in this work, +- [The original strategy document for this approach](https://docs.google.com/document/d/1ocmZVBlTzAh_cdZaWGRwIbAuH-mcPRZFdhcwRhAfzxM/edit) + - Readable for everyone that has access to the KBase Google Docs folder + - The document contains short descriptions of future projects not included in this work, such as generating template files on the fly and rich templates. ## Front end changes -The design introduces a new StS data type, `import_specification`. The FE's current -behavior is to display any data types returned from the StS in the file dropdown, but silently -ignore user-selected files for which the selected data type is unknown to the narrative, a bug. -The FE will be updated to ignore unknown data types returned from the StS, allowing for phased, -non-lockstep upgrades. This work is not included in this project, but will be in a future FE -project. +The design introduces a new StS data type, `import_specification`. The FE's current +behavior is to display any data types returned from the StS in the file dropdown, but +silently ignore user-selected files for which the selected data type is unknown to the +narrative, a bug. The FE will be updated to ignore unknown data types returned from the +StS, allowing for phased, non-lockstep upgrades. This work is not included in this +project, but will be in a future FE project. ## Import specification input files -Input file formats may be Excel, CSV, or TSV. An example CSV file structure for GFF/FASTA imports -is below: +Input file formats may be Excel, CSV, or TSV. An example CSV file structure for +GFF/FASTA imports is below: -``` +```csv Data type: gff_metagenome; Columns: 7; Version: 1 fasta_file, gff_file, genome_name, source, release, genetic_code, generate_missing_genes FASTA File Path, GFF3 File Path, Metagenome Object Name, Source of metagenome, Release or Version of the Source Data, Genetic Code for protein translation, Spoof Genes for parentless CDS @@ -35,66 +36,72 @@ mygenome2.fa, mygenome2.gff3, mygenomeobject2, yermumspoo, 30456, 11, 1 ``` The file, by row, is: -1. The data type, in this case `gff_metagenome`, the column count, and the version, in this case 1. - * The data type is from the list in the + +1. The data type, in this case `gff_metagenome`, the column count, and the version, in + this case 1. + - The data type is from the list in the [Mappings.py](https://github.com/kbase/staging_service/blob/master/staging_service/autodetect/Mappings.py) - file in the StS. The Narrative is expected to understand these types and map them to - importer apps. - * The column count allows for error checking row sizes. This particularly important for + file in the StS. The Narrative is expected to understand these types and map them + to importer apps. + - The column count allows for error checking row sizes. This particularly important for the Excel parser, which uses `pandas` under the hood. `pandas` will silently pad data and headers out to match the longest row in the sheet, so the column count is required to detect errors. 
- * The version allows us to update the file format and increment the version, + - The version allows us to update the file format and increment the version, allowing backwards compatibility - the staging service can process the file appropriately depending on the version number. -2. The IDs of the app inputs from the `spec.json` file. +2. The IDs of the app inputs from the `spec.json` file. 3. The corresponding human readable names of the app inputs from the `display.yaml` file. 4. (and beyond) Import specifications. Each line corresponds to a single import. -For Excel files, the first two rows may be hidden in any provided templates. Additionally, -Excel files may contain multiple data types, one per tab. Empty tabs will be ignored, and tabs -that don't match the expected structure will be treated as an error. +For Excel files, the first two rows may be hidden in any provided templates. +Additionally, Excel files may contain multiple data types, one per tab. Empty tabs will +be ignored, and tabs that don't match the expected structure will be treated as an +error. -Parsing will be on a "best effort" basis given the underlying `pandas` technology. We have -decided not to include type information per column in the spec, which means that the type -will be determined by `pandas`. Pandas specifies a single type per column, which means that -if a single `string` value is accidentally inserted into a column that is otherwise -full of `float`s, `pandas` will coerce all the values in that column to `string`s. Under normal -operations this should not be an issue, but could cause unexpected type returns if a user adds -an incorrect value to a column +Parsing will be on a "best effort" basis given the underlying `pandas` technology. We +have decided not to include type information per column in the spec, which means that +the type will be determined by `pandas`. Pandas specifies a single type per column, +which means that if a single `string` value is accidentally inserted into a column that +is otherwise full of `float`s, `pandas` will coerce all the values in that column to +`string`s. Under normal operations this should not be an issue, but could cause +unexpected type returns if a user adds an incorrect value to a column As part of this project we will deliver: + 1. CSV templates for each in scope app (e.g. the first 3 lines of the example file) 2. An Excel template containing a tab for each in scope app 3. A `README.md` file explaining how to use the templates. - * The `README.md` should include a link to rich import specification documentation on the KBase - website once it is developed. - * Note: cover booleans / checkboxes which are not intuitive. 0 will indicate unchecked, and 1 - checked. These values may show up as strings in the API, and the consumer will be expected to - handle conversion appropriately. + - The `README.md` should include a link to rich import specification documentation on + the KBase website once it is developed. + - Note: cover booleans / checkboxes which are not intuitive. 0 will indicate + unchecked, and 1 checked. These values may show up as strings in the API, and the + consumer will be expected to handle conversion appropriately. These files will reside in the StS repo. As part of the front end effort, some means of delivering the templates and instructions to users will be developed. -Currently, for all in scope apps, the inputs are all strings, numbers, or booleans. There are -no unusual inputs such as grouped parameters or dynamic lookups. 
Including future import apps -with these features in CSV-based import may require additional engineering. +Currently, for all in scope apps, the inputs are all strings, numbers, or booleans. +There are no unusual inputs such as grouped parameters or dynamic lookups. Including +future import apps with these features in CSV-based import may require additional +engineering. + +Note that the endpoint will return individual data for each row, while the current front +end only supports individual input files and output objects (this may be improved in a +future update). The front end will be expected to either -Note that the endpoint will return individual data for each row, while the current front end -only supports individual input files and output objects (this may be improved in a future update). -The front end will be expected to either 1. Ignore all parameters other than in the first entry in the return per type, or 2. Throw an error if parameters differ from the first entry. ## User operations -* The user uploads the import specification files to the staging area along with all the files - inluded in the specification. -* The user selects the `Import Specification` type for the specification files. - * The user may also select other files in the staging area to include in the import along +- The user uploads the import specification files to the staging area along with all the + files inluded in the specification. +- The user selects the `Import Specification` type for the specification files. + - The user may also select other files in the staging area to include in the import along with the files listed in the specification. - * The user *does not* have to select any files included in the specification. -* The user clicks `Import Selected`. + - The user *does not* have to select any files included in the specification. +- The user clicks `Import Selected`. As such, a new type must be added to the StS: `import_specification` with the title `Import Specification`. *Nota bene*: It may be preferable to have the Narrative specify the @@ -108,12 +115,13 @@ cell with those specifications. ## Staging service import specification endpoint -The StS endpoint responds to an HTTP GET with a file list in a `files` URL parameter. It is -extremely unlikely that there will be a large enough set of data types that URL length limits -will be reached so this seems adequate. The StS will respond with the contents of the files -parsed into a structure that is similar to that of the input spec, mapped by data type: +The StS endpoint responds to an HTTP GET with a file list in a `files` URL parameter. It +is extremely unlikely that there will be a large enough set of data types that URL +length limits will be reached so this seems adequate. The StS will respond with the +contents of the files parsed into a structure that is similar to that of the input spec, +mapped by data type: -``` +```text Headers: Authorization: @@ -149,33 +157,36 @@ Response: The order of the input structures MUST be the same as the order in the input files. -Notably, the service will provide the contents of the files as is and will not perform most error -checking, including for missing or unknown input parameters. Most error checking will be performed -in the bulk import cell configuration tab like other imports, allowing for a consistent user -experience. +Notably, the service will provide the contents of the files as is and will not perform +most error checking, including for missing or unknown input parameters. 
Most error +checking will be performed in the bulk import cell configuration tab like other imports, +allowing for a consistent user experience. ### Error handling The StS will return errors on a (mostly) per input file basis, with a string error code for each error type. Currently the error types are: -* `cannot_find_file` if an input file cannot be found -* `cannot_parse_file` if an input file cannot be parsed -* `incorrect_column_count` if the column count is not as expected - * For Excel files, this may mean there is a non-empty cell outside the bounds of the data area -* `multiple_specifications_for_data_type` if more than one tab or file per data type is submitted -* `no_files_provided` if no files were provided -* `unexpected_error` if some other error occurs +- `cannot_find_file` if an input file cannot be found + +- `cannot_parse_file` if an input file cannot be parsed +- `incorrect_column_count` if the column count is not as expected + - For Excel files, this may mean there is a non-empty cell outside the bounds of the + data area +- `multiple_specifications_for_data_type` if more than one tab or file per data type + is submitted +- `no_files_provided` if no files were provided +- `unexpected_error` if some other error occurs The HTTP code returned will be, in order of precedence: -* 400 if any error other than `cannot_find_file` or `unexpected_error` occurs -* 404 if at least one error is `cannot_find_file` but there are no 400-type errors -* 500 if all errors are `unexpected_error` +- 400 if any error other than `cannot_find_file` or `unexpected_error` occurs +- 404 if at least one error is `cannot_find_file` but there are no 400-type errors +- 500 if all errors are `unexpected_error` The general structure of the error response is: -``` +```json {"errors": [ {"type": , ... other fields depending on the error code ... @@ -188,7 +199,7 @@ The individual error structures per error type are as follows: #### `cannot_find_file` -``` +```json {"type": "cannot_find_file", "file": } @@ -196,7 +207,7 @@ The individual error structures per error type are as follows: #### `cannot_parse_file` -``` +```json {"type": "cannot_parse_file", "file": , "tab": , @@ -204,15 +215,15 @@ The individual error structures per error type are as follows: } ``` -The service will check that the data type is valid and that rows >=2 all have the same number of -entries, but will not do further error checking. +The service will check that the data type is valid and that rows >=2 all have the same +number of entries, but will not do further error checking. In this case the service should log the stack trace along with the filename for each invalid file if the trace would assist in debugging the error. #### `incorrect_column_count` -``` +```json {"type": "incorrect_column_count", "file": , "tab": , @@ -222,7 +233,7 @@ each invalid file if the trace would assist in debugging the error. #### `multiple_specifications_for_data_type` -``` +```json {"type": "multiple_specifications_for_data_type", "file_1": , "tab_1": , @@ -234,13 +245,13 @@ each invalid file if the trace would assist in debugging the error. 
#### `no_files_provided` -``` +```json {"type": "no_files_provided"} ``` #### `unexpected_error` -``` +```json {"type": "unexpected_error", "file": "message": @@ -251,42 +262,42 @@ Note in this case the service MUST log the stack trace along with the filename f ## Alternatives explored -* We considered parsing the files in the Narrative backend (pulling the files as is from the StS) - but the current design allows for other services and UIs reusing the parsing code. There doesn't - otherwise seem to be a strong reason to include the code one place or the other so we decided - to include it in the StS. +- We considered parsing the files in the Narrative backend (pulling the files as is from + the StS) but the current design allows for other services and UIs reusing the parsing + code. There doesn't otherwise seem to be a strong reason to include the code one place + or the other so we decided to include it in the StS. ## Questions -* Should it be an error to submit multiple files or tabs for the same type? If not, +- Should it be an error to submit multiple files or tabs for the same type? If not, how should the different files be denoted and ordered? - * This has follow on effect for how spreadsheet type views in the UI should be displayed. - * A: For the MVP disallow submitting more than one file / tab per type. Post release we'll - find out if this is a use case that users care about. -* Should we disallow filenames with commas? They may cause problems with the new endpoint. - * A: Disallow commas, the same way we disallow files staring with whitespace or periods. -* Should we strictly enforce a column count for every row in xSV files? - * Not enforcing a count makes it somewhat easier for users to fill in the data - they don't - need to add extraneous commas or tabs to the end of the line. - * Enforcing a count makes it less likely that a user will commit silent counting errors if - there are many empty entries between items in a line. - * A: Enforce a column count to prevent user errors. + - This has follow on effect for how spreadsheet type views in the UI should be displayed. + - A: For the MVP disallow submitting more than one file / tab per type. Post release + we'll find out if this is a use case that users care about. +- Should we disallow filenames with commas? They may cause problems with the new endpoint. + - A: Disallow commas, the same way we disallow files staring with whitespace or periods. +- Should we strictly enforce a column count for every row in xSV files? + - Not enforcing a count makes it somewhat easier for users to fill in the data - + they don't need to add extraneous commas or tabs to the end of the line. + - Enforcing a count makes it less likely that a user will commit silent counting + errors if there are many empty entries between items in a line. + - A: Enforce a column count to prevent user errors. ## Addendum: dynamic parameter lookup Dynamic scientific name to taxon lookup may be added to the Genbank (and the currently out of scope, but trivial to add GFF/FASTA Genome) importer in the near future. If that occurs, for the purpose of xSV import the user will be expected to provide the entire, correct, -scientific name as returned from the taxon API. +scientific name as returned from the taxon API. -* The user could get this name by starting a genome import and running the query from the +- The user could get this name by starting a genome import and running the query from the import app cell configuration screen. 
- * This will be documented in the README.md for the template files. -* As part of the UI work we could theoretically provide a landing page for looking up valid + - This will be documented in the README.md for the template files. +- As part of the UI work we could theoretically provide a landing page for looking up valid scientific names. -* Presumably the UI would need to run the dynamic query and report an error to the user if the - dynamic service returns 0 or > 1 entries. -* Providing the scientific name vs. the taxon ID seems simpler because the machinery already +- Presumably the UI would need to run the dynamic query and report an error to the user + if the dynamic service returns 0 or > 1 entries. +- Providing the scientific name vs. the taxon ID seems simpler because the machinery already exists to perform the query and is part of the spec. -* Expect these plans to change as it becomes more clear how dynamic fields will work in the - context of bulk import. \ No newline at end of file +- Expect these plans to change as it becomes more clear how dynamic fields will work + in the context of bulk import. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..37d001c3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.black] +line-length = 100 + +[tool.isort] +profile = "black" + +[tool.mypy] +strict=true + +[[tool.mypy.overrides]] +module = "aiohttp_cors.*" +ignore_missing_imports = true + + +[[tool.mypy.overrides]] +module = "globus_sdk.*" +ignore_missing_imports = true + + +[tool.autoflake] +remove_all_unused_imports = true +remove_unused_variables = true +quiet = true diff --git a/requirements.txt b/requirements.txt index 065408ad..8c8cf3c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,18 +1,13 @@ -aiofiles==23.1.0 +aiofiles==23.2.1 +aiohttp==3.8.5 aiohttp_cors==0.7.0 -aiohttp==3.8.4 -black==23.7.0 -coverage==7.2.7 defusedxml==0.7.1 frozendict==2.3.8 +# Hold globus-sdk back until further notice. 
globus-sdk==1.6.1 -hypothesis==6.81.1 +gunicorn==21.2.0 openpyxl==3.1.2 -pandas==2.0.3 -pytest-aiohttp==1.0.4 -pytest-asyncio==0.21.0 -pytest-cov==4.1.0 -pytest==7.4.0 +pandas==2.1.0 python-dotenv==1.0.0 python-magic==0.4.27 uvloop==0.17.0 diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 00000000..c13b5200 --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,12 @@ +autoflake==2.2.1 +black==23.9.1 +coverage==7.3.1 +hypothesis==6.84.3 +mypy==1.5.1 +pymarkdownlnt==0.9.13.4 +pylint==2.17.5 +pytest-aiohttp==1.0.5 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest==7.4.2 +types-openpyxl===3.1.0.19 diff --git a/run_tests.sh b/run_tests.sh deleted file mode 100755 index 52ea58d5..00000000 --- a/run_tests.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -DIR="$( cd "$( dirname "$0" )" && pwd )" -export KB_DEPLOYMENT_CONFIG="$DIR/deployment/conf/testing.cfg" -export FILE_LIFETIME="90" -export TESTS="${1:-tests}" -echo -echo "****************************" -echo "**" -echo "** Running tests in ${TESTS}" -echo "**" -echo "****************************" -echo -python3 -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html "${TESTS}" - diff --git a/scripts/run-dev-server.sh b/scripts/run-dev-server.sh new file mode 100755 index 00000000..c46b413c --- /dev/null +++ b/scripts/run-dev-server.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +FILE_LIFETIME="90" KB_DEPLOYMENT_CONFIG="${PWD}/deployment/conf/local.cfg" PYTHONPATH="${PWD}/staging_service" python -m staging_service diff --git a/run_in_docker.sh b/scripts/run_in_docker.sh similarity index 100% rename from run_in_docker.sh rename to scripts/run_in_docker.sh diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100755 index 00000000..5e72dc3e --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# DIR is the root of the project. +DIR="$( cd "$( dirname "$0" )" && pwd )/.." + +export PYTHONPATH="${PWD}" +export KB_DEPLOYMENT_CONFIG="${DIR}/deployment/conf/testing.cfg" +export FILE_LIFETIME="90" + +# Tests are located in the `tests` directory. Specific tests or groups of +# tests may be run by proving an argument to the script which is an acceptable +# test path spec for pytest. E.g. `./scripts/run_tests.sh tests/test_app.py` +# will run just the tests in `tests/test_app.py`. 
+export TESTS="${1:-tests}" + +echo +echo "****************************" +echo "**" +echo "** Running tests in ${TESTS}" +echo "**" +echo "****************************" +echo +python -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html "${TESTS}" + diff --git a/staging_service/app.py b/staging_service/app.py index 89a1d4a3..a2e57f46 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -123,9 +123,7 @@ async def bulk_specification(request: web.Request) -> web.json_response: res = parse_import_specifications( tuple(list(paths)), _file_type_resolver, - lambda e: logging.error( - "Unexpected error while parsing import specs", exc_info=e - ), + lambda e: logging.error("Unexpected error while parsing import specs", exc_info=e), ) if res.results: types = {dt: result.result for dt, result in res.results.items()} @@ -191,9 +189,7 @@ async def write_bulk_specification(request: web.Request) -> web.json_response: folder = data.get("output_directory") type_ = data.get("output_file_type") if type(folder) != str: - return _createJSONErrorResponse( - "output_directory is required and must be a string" - ) + return _createJSONErrorResponse("output_directory is required and must be a string") writer = _IMPSPEC_FILE_TO_WRITER.get(type_) if not writer: return _createJSONErrorResponse(f"Invalid output_file_type: {type_}") @@ -218,12 +214,8 @@ async def add_acl_concierge(request: web.Request): user_dir = Path.validate_path(username).full_path concierge_path = f"{Path._CONCIERGE_PATH}/{username}/" aclm = AclManager() - result = aclm.add_acl_concierge( - shared_directory=user_dir, concierge_path=concierge_path - ) - result[ - "msg" - ] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" + result = aclm.add_acl_concierge(shared_directory=user_dir, concierge_path=concierge_path) + result["msg"] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" result[ "link" ] = f"https://app.globus.org/file-manager?destination_id={aclm.endpoint_id}&destination_path={concierge_path}" @@ -273,9 +265,7 @@ async def file_exists(request: web.Request): show_hidden = False # this scans the entire directory recursively just to see if one file exists... why? 
results = await dir_info(user_dir, show_hidden, query) - filtered_results = [ - file_info for file_info in results if file_info["name"] == query - ] + filtered_results = [file_info for file_info in results if file_info["name"] == query] if filtered_results: exists = True just_is_folder = [file_info["isFolder"] for file_info in filtered_results] @@ -322,9 +312,7 @@ async def download_files(request: web.Request): elif not os.path.isfile(path.full_path): raise web.HTTPBadRequest(text=f"{path.full_path} is a directory not a file") # hard coding the mime type to force download - return web.FileResponse( - path.full_path, headers={"content-type": "application/octet-stream"} - ) + return web.FileResponse(path.full_path, headers={"content-type": "application/octet-stream"}) @routes.get("/similar/{path:.+}") @@ -413,9 +401,7 @@ async def upload_files_chunked(request: web.Request): counter = 0 user_file = None destination_path = None - while ( - counter < 100 - ): # TODO this is arbitrary to keep an attacker from creating infinite loop + while counter < 100: # TODO this is arbitrary to keep an attacker from creating infinite loop # This loop handles the null parts that come in inbetween destpath and file part = await reader.next() @@ -487,9 +473,7 @@ async def define_upa(request: web.Request): except KeyError as key_error: raise web.HTTPBadRequest(text="must provide UPA field in body") from key_error await add_upa(path, UPA) - return web.Response( - text=f"successfully updated UPA {UPA} for file {path.user_path}" - ) + return web.Response(text=f"successfully updated UPA {UPA} for file {path.user_path}") @routes.delete("/delete/{path:.+}") @@ -533,9 +517,7 @@ async def rename(request: web.Request): try: new_path = body["newPath"] except KeyError as wrong_key: - raise web.HTTPBadRequest( - text="must provide newPath field in body" - ) from wrong_key + raise web.HTTPBadRequest(text="must provide newPath field in body") from wrong_key new_path = Path.validate_path(username, new_path) if os.path.exists(path.full_path): if not os.path.exists(new_path.full_path): @@ -546,9 +528,7 @@ async def rename(request: web.Request): raise web.HTTPConflict(text="{new_path.user_path} allready exists") else: raise web.HTTPNotFound(text=f"{path.user_path} not found") - return web.Response( - text=f"successfully moved {path.user_path} to {new_path.user_path}" - ) + return web.Response(text=f"successfully moved {path.user_path} to {new_path.user_path}") @routes.patch("/decompress/{path:.+}") @@ -563,13 +543,9 @@ async def decompress(request: web.Request): # 2 could try again after doing an automatic rename scheme (add numbers to end) # 3 just overwrite and force destination = os.path.dirname(path.full_path) - if ( - upper_file_extension == ".tar" and file_extension == ".gz" - ) or file_extension == ".tgz": + if (upper_file_extension == ".tar" and file_extension == ".gz") or file_extension == ".tgz": await run_command("tar", "xzf", path.full_path, "-C", destination) - elif upper_file_extension == ".tar" and ( - file_extension == ".bz" or file_extension == ".bz2" - ): + elif upper_file_extension == ".tar" and (file_extension == ".bz" or file_extension == ".bz2"): await run_command("tar", "xjf", path.full_path, "-C", destination) elif file_extension == ".zip" or file_extension == ".ZIP": await run_command("unzip", path.full_path, "-d", destination) @@ -640,9 +616,7 @@ def inject_config_dependencies(config): if FILE_EXTENSION_MAPPINGS is None: raise Exception("Please provide FILE_EXTENSION_MAPPINGS in the config file ") - with 
open( - FILE_EXTENSION_MAPPINGS, "r", encoding="utf-8" - ) as file_extension_mappings_file: + with open(FILE_EXTENSION_MAPPINGS, "r", encoding="utf-8") as file_extension_mappings_file: AutoDetectUtils.set_mappings(json.load(file_extension_mappings_file)) datatypes = defaultdict(set) extensions = defaultdict(set) diff --git a/staging_service/app_error_formatter.py b/staging_service/app_error_formatter.py index 5766444a..1e83212b 100644 --- a/staging_service/app_error_formatter.py +++ b/staging_service/app_error_formatter.py @@ -68,8 +68,6 @@ def format_import_spec_errors( file2 = str(path_translations[error.source_2.file]) tab2 = error.source_2.tab errs.append( - _IMPORT_SPEC_ERROR_FORMATTERS[error.error_type]( - error.message, file1, tab1, file2, tab2 - ) + _IMPORT_SPEC_ERROR_FORMATTERS[error.error_type](error.message, file1, tab1, file2, tab2) ) return errs diff --git a/staging_service/auth2Client.py b/staging_service/auth2Client.py index 3b136b4f..27a43443 100644 --- a/staging_service/auth2Client.py +++ b/staging_service/auth2Client.py @@ -40,9 +40,7 @@ def add_valid_token(self, token, user, expire_time): token = hashlib.sha256(token.encode("utf8")).hexdigest() self._cache[token] = [user, _time.time(), expire_time] if len(self._cache) > self._maxsize: - for i, (t, _) in enumerate( - sorted(self._cache.items(), key=lambda v: v[1][1]) - ): + for i, (t, _) in enumerate(sorted(self._cache.items(), key=lambda v: v[1][1])): if i <= self._halfmax: del self._cache[t] else: @@ -69,9 +67,7 @@ async def get_user(self, token): return user async with aiohttp.ClientSession() as session: - async with session.get( - self._authurl, headers={"Authorization": token} - ) as resp: + async with session.get(self._authurl, headers={"Authorization": token}) as resp: ret = await resp.json() if resp.reason != "OK": http_code = ret["error"]["httpcode"] diff --git a/staging_service/autodetect/GenerateMappings.py b/staging_service/autodetect/GenerateMappings.py index ab4e44da..528d61b6 100644 --- a/staging_service/autodetect/GenerateMappings.py +++ b/staging_service/autodetect/GenerateMappings.py @@ -26,17 +26,38 @@ """ from collections import defaultdict -from staging_service.autodetect.Mappings import CSV, EXCEL, FASTA, FASTQ, GENBANK, GFF, JSON, SBML, SRA, TSV, ZIP, \ - assembly_id, \ - decompress_id, \ - escher_map_id, \ - expression_matrix_id, \ - extension_to_file_format_mapping, fastq_reads_interleaved_id, \ - fastq_reads_noninterleaved_id, \ - fba_model_id, file_format_to_extension_mapping, genbank_genome_id, gff_genome_id, gff_metagenome_id, \ - import_specification, media_id, \ - metabolic_annotations_bulk_id, \ - metabolic_annotations_id, phenotype_set_id, sample_set_id, sra_reads_id +from staging_service.autodetect.Mappings import ( + CSV, + EXCEL, + FASTA, + FASTQ, + GENBANK, + GFF, + JSON, + SBML, + SRA, + TSV, + ZIP, + assembly_id, + decompress_id, + escher_map_id, + expression_matrix_id, + extension_to_file_format_mapping, + fastq_reads_interleaved_id, + fastq_reads_noninterleaved_id, + fba_model_id, + file_format_to_extension_mapping, + genbank_genome_id, + gff_genome_id, + gff_metagenome_id, + import_specification, + media_id, + metabolic_annotations_bulk_id, + metabolic_annotations_id, + phenotype_set_id, + sample_set_id, + sra_reads_id, +) # Note that some upload apps are not included - in particular batch apps, which are now # redundant, and MSAs and attribute mappings because they're out of scope at the current time. 
@@ -104,9 +125,7 @@ app_id_to_extensions = defaultdict(list) for filecat, apps in file_format_to_app_mapping.items(): for app_id in apps: - app_id_to_extensions[app_id].extend( - file_format_to_extension_mapping[filecat] - ) + app_id_to_extensions[app_id].extend(file_format_to_extension_mapping[filecat]) # Create the mapping between file extensions and apps # For example, the .gbk and .genkbank extensions map to app with id of "genbank_genome" diff --git a/staging_service/autodetect/Mappings.py b/staging_service/autodetect/Mappings.py index fffd09ac..af08e54c 100644 --- a/staging_service/autodetect/Mappings.py +++ b/staging_service/autodetect/Mappings.py @@ -71,9 +71,7 @@ def _flatten(some_list): # longer term there's probably a better way to do this but this is quick def _add_gzip(extension_list): - return _flatten( - [[ext + comp for comp in _COMPRESSION_EXT] for ext in extension_list] - ) + return _flatten([[ext + comp for comp in _COMPRESSION_EXT] for ext in extension_list]) file_format_to_extension_mapping = { @@ -116,7 +114,5 @@ def _add_gzip(extension_list): for ext in extensions: if ext in extension_to_file_format_mapping: type2 = extension_to_file_format_mapping[ext] - raise ValueError( - f"Duplicate entry for extension {ext} in {type_} and {type2}" - ) + raise ValueError(f"Duplicate entry for extension {ext} in {type_} and {type2}") extension_to_file_format_mapping[ext] = type_ diff --git a/staging_service/globus.py b/staging_service/globus.py index 6a905adf..2b23666a 100644 --- a/staging_service/globus.py +++ b/staging_service/globus.py @@ -64,7 +64,5 @@ async def assert_globusid_exists(username, token): # such as the commented code below, for multiple linked accounts # text = '\n'.join(globus_ids) text = globus_ids[0] - async with aiofiles.open( - path.full_path, mode="w", encoding="utf-8" - ) as globus_file: + async with aiofiles.open(path.full_path, mode="w", encoding="utf-8") as globus_file: await globus_file.writelines(text) diff --git a/staging_service/import_specifications/file_parser.py b/staging_service/import_specifications/file_parser.py index 9347a27a..7c51af85 100644 --- a/staging_service/import_specifications/file_parser.py +++ b/staging_service/import_specifications/file_parser.py @@ -171,9 +171,7 @@ class FileTypeResolution: def __post_init__(self): if not (bool(self.parser) ^ bool(self.unsupported_type)): # xnor - raise ValueError( - "Exactly one of parser or unsupported_type must be supplied" - ) + raise ValueError("Exactly one of parser or unsupported_type must be supplied") def parse_import_specifications( diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 02a1e455..a56b3eeb 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -89,13 +89,9 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): _check_string(datatype, "A data type") spec = types[datatype] if not isinstance(spec, dict): - raise ImportSpecWriteException( - f"The value for data type {datatype} must be a mapping" - ) + raise ImportSpecWriteException(f"The value for data type {datatype} must be a mapping") if _ORDER_AND_DISPLAY not in spec: - raise ImportSpecWriteException( - f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key" - ) + raise ImportSpecWriteException(f"Data type {datatype} missing {_ORDER_AND_DISPLAY} key") _check_is_sequence( spec[_ORDER_AND_DISPLAY], f"Data type {datatype} {_ORDER_AND_DISPLAY} value" ) @@ 
-109,10 +105,7 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): param_ids = set() for i, id_display in enumerate(spec[_ORDER_AND_DISPLAY]): - err = ( - f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} " - + f"at index {i} " - ) + err = f"Invalid {_ORDER_AND_DISPLAY} entry for datatype {datatype} " + f"at index {i} " _check_is_sequence(id_display, err + "- the entry") if len(id_display) != 2: raise ImportSpecWriteException(err + "- expected 2 item list") @@ -129,14 +122,9 @@ def _check_import_specification(types: dict[str, dict[str, list[Any]]]): err + f" does not have the same keys as {_ORDER_AND_DISPLAY}" ) for pid, v in datarow.items(): - if ( - v is not None - and not isinstance(v, numbers.Number) - and not isinstance(v, str) - ): + if v is not None and not isinstance(v, numbers.Number) and not isinstance(v, str): raise ImportSpecWriteException( - err - + f"'s value for parameter {pid} is not a number or a string" + err + f"'s value for parameter {pid} is not a number or a string" ) @@ -148,9 +136,7 @@ def _check_string(tocheck: Any, errprefix: str): def _check_is_sequence(tocheck: Any, errprefix: str): - if not ( - isinstance(tocheck, collections.abc.Sequence) and not isinstance(tocheck, str) - ): + if not (isinstance(tocheck, collections.abc.Sequence) and not isinstance(tocheck, str)): raise ImportSpecWriteException(errprefix + " is not a list") @@ -170,9 +156,7 @@ def write_tsv(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, return _write_xsv(folder, types, _EXT_TSV, _SEP_TSV) -def _write_xsv( - folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, sep: str -): +def _write_xsv(folder: Path, types: dict[str, dict[str, list[Any]]], ext: str, sep: str): _check_write_args(folder, types) res = {} for datatype in types: @@ -206,9 +190,7 @@ def _check_write_args(folder: Path, types: dict[str, dict[str, list[Any]]]): _check_import_specification(types) -def write_excel( - folder: Path, types: dict[str, dict[str, list[Any]]] -) -> dict[str, Path]: +def write_excel(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[str, Path]: """ Writes import specifications to an Excel files. All the writers in this module have the same function signatures; see the module level documentation. 
@@ -253,9 +235,7 @@ def _expand_excel_columns_to_max_width(sheet: Worksheet): # https://stackoverflow.com/a/40935194/643675 for column_cells in sheet.columns: length = max(len(_as_text(cell.value)) for cell in column_cells) - sheet.column_dimensions[ - get_column_letter(column_cells[0].column) - ].width = length + sheet.column_dimensions[get_column_letter(column_cells[0].column)].width = length def _as_text(value): diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 42183157..00eee18b 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -177,9 +177,7 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: try: filetype = magic.from_file(str(path), mime=True) if filetype not in _MAGIC_TEXT_FILES: - return _error( - Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc) - ) + return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) with open(path, newline="", encoding="utf-8") as input_: rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") @@ -202,24 +200,15 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: ) ) results.append( - frozendict( - { - param_ids[j]: _normalize_xsv(row[j]) - for j in range(len(row)) - } - ) + frozendict({param_ids[j]: _normalize_xsv(row[j]) for j in range(len(row))}) ) if not results: - raise _ParseException( - Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc) - ) + raise _ParseException(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) return ParseResults(frozendict({datatype: ParseResult(spcsrc, tuple(results))})) except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: - return _error( - Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) - ) + return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) except _ParseException as e: return _error(e.args[0]) @@ -255,9 +244,7 @@ def _process_excel_row( def _process_excel_tab( excel: pandas.ExcelFile, spcsrc: SpecificationSource ) -> Tuple[Optional[str], Optional[ParseResult]]: - df = excel.parse( - sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False - ) + df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) # at this point we know that at least 4 lines are present - expecting the data type header, @@ -273,9 +260,7 @@ def _process_excel_tab( row = _process_excel_row(row, i, columns, spcsrc) if any(map(lambda x: not pandas.isna(x), row)): # skip empty rows results.append( - frozendict( - {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))} - ) + frozendict({param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))}) ) return datatype, ParseResult(spcsrc, tuple(results)) @@ -314,9 +299,7 @@ def parse_excel(path: Path) -> ParseResults: except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: - return _error( - Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) - ) + return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) except ValueError as e: if "Excel file format cannot be 
determined" in str(e): return _error( diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 0b7f0af8..0a2167f9 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -116,9 +116,7 @@ async def _only_source(path: Path): return data["source"] -async def dir_info( - path: Path, show_hidden: bool, query: str = "", recurse=True -) -> list: +async def dir_info(path: Path, show_hidden: bool, query: str = "", recurse=True) -> list: """ only call this on a validated full path """ @@ -133,9 +131,7 @@ async def dir_info( if query == "" or specific_path.user_path.find(query) != -1: response.append(await stat_data(specific_path)) if recurse: - response.extend( - await dir_info(specific_path, show_hidden, query, recurse) - ) + response.extend(await dir_info(specific_path, show_hidden, query, recurse)) if entry.is_file(): if query == "" or specific_path.user_path.find(query) != -1: data = await stat_data(specific_path) diff --git a/staging_service/utils.py b/staging_service/utils.py index 65282d5f..835ccedc 100644 --- a/staging_service/utils.py +++ b/staging_service/utils.py @@ -31,12 +31,10 @@ async def run_command(*args): if process.returncode == 0: return stdout.decode().strip() else: - error_msg = ( - "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( - cmd=" ".join(args), - returncode=process.returncode, - error=stderr.decode().strip(), - ) + error_msg = "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( + cmd=" ".join(args), + returncode=process.returncode, + error=stderr.decode().strip(), ) raise HTTPInternalServerError(text=error_msg) @@ -100,15 +98,9 @@ def __init__(self): client = globus_sdk.NativeAppAuthClient(cf["client_id"]) try: - transfer_authorizer = globus_sdk.RefreshTokenAuthorizer( - cf["transfer_token"], client - ) - self.globus_transfer_client = globus_sdk.TransferClient( - authorizer=transfer_authorizer - ) - auth_authorizer = globus_sdk.RefreshTokenAuthorizer( - cf["auth_token"], client - ) + transfer_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["transfer_token"], client) + self.globus_transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer) + auth_authorizer = globus_sdk.RefreshTokenAuthorizer(cf["auth_token"], client) self.globus_auth_client = globus_sdk.AuthClient(authorizer=auth_authorizer) except globus_sdk.GlobusAPIError as error: logging.error(str(error.code) + error.raw_text) @@ -124,18 +116,14 @@ def _get_globus_identities(self, shared_directory: str): globus_id_filename = "{}.globus_id".format(shared_directory) with open(globus_id_filename, "r", encoding="utf-8") as fp: ident = fp.read() - return self.globus_auth_client.get_identities( - usernames=ident.split("\n")[0] - ) + return self.globus_auth_client.get_identities(usernames=ident.split("\n")[0]) def _get_globus_identity(self, globus_id_filename: str): """ Get the first identity for the username in the .globus_id file """ try: - return self._get_globus_identities(globus_id_filename)["identities"][0][ - "id" - ] + return self._get_globus_identities(globus_id_filename)["identities"][0]["id"] except FileNotFoundError as error: response = { "success": False, @@ -188,9 +176,7 @@ def _add_acl(self, user_identity_id: str, shared_directory_basename: str): } logging.info(response) - logging.info( - "Shared %s with %s\n", shared_directory_basename, user_identity_id - ) + logging.info("Shared %s with %s\n", shared_directory_basename, user_identity_id) logging.info(response) return response @@ -205,22 +191,16 @@ def 
_add_acl(self, user_identity_id: str, shared_directory_basename: str): } logging.error(response) if error.code == "Exists": - raise HTTPOk( - text=json.dumps(response), content_type="application/json" - ) from error + raise HTTPOk(text=json.dumps(response), content_type="application/json") from error - raise HTTPInternalServerError( - text=json.dumps(response), content_type="application/json" - ) + raise HTTPInternalServerError(text=json.dumps(response), content_type="application/json") def _remove_acl(self, user_identity_id: str): """ Get all ACLS and attempt to remove the correct ACL for the given user_identity """ try: - acls = self.globus_transfer_client.endpoint_acl_list(self.endpoint_id)[ - "DATA" - ] + acls = self.globus_transfer_client.endpoint_acl_list(self.endpoint_id)["DATA"] for acl in acls: if user_identity_id == acl["principal"]: if "id" in acl and acl["id"] is not None: diff --git a/tests/test_app.py b/tests/test_app.py index d656befb..788dbbb9 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -31,6 +31,8 @@ decoder = JSONDecoder() +MTIME_TEST_PRECISION = 1 + config = configparser.ConfigParser() config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) @@ -45,22 +47,6 @@ utils.Path._META_DIR = META_DIR -def asyncgiven(**kwargs): - """alternative to hypothesis.given decorator for async""" - - def real_decorator(fn): - @given(**kwargs) - def aio_wrapper(*args, **kwargs): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - future = asyncio.wait_for(fn(*args, **kwargs), timeout=60) - loop.run_until_complete(future) - - return aio_wrapper - - return real_decorator - - def mock_auth_app(): application = app.app_factory(config) @@ -97,6 +83,10 @@ def __init__(self, base_dir=DATA_DIR): self.base_dir = base_dir def __enter__(self): + """ + Removes the base directory and contents, if it exists, then creates it so that + it provides a blank slate. 
+ """ os.makedirs(self.base_dir, exist_ok=True) shutil.rmtree(self.base_dir) os.makedirs(self.base_dir, exist_ok=False) @@ -184,7 +174,7 @@ def test_path_sanitation(username_first, username_rest, path): assert validated.metadata_path.find("../") == -1 -@asyncgiven(txt=st.text()) +@given(txt=st.text()) async def test_cmd(txt): with FileUtil(DATA_DIR) as fs: d = fs.make_dir("test") @@ -223,8 +213,8 @@ async def test_jbi_metadata(): username = "testuser" jbi_metadata = '{"file_owner": "sdm", "added_date": "2013-08-12T00:21:53.844000"}' - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) fs.make_file( @@ -253,8 +243,8 @@ async def test_jbi_metadata(): async def test_metadata(): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res1 = await cli.get( @@ -353,8 +343,8 @@ async def test_metadata(): async def test_define_upa(): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # generating metadata file @@ -434,8 +424,8 @@ async def test_define_upa(): async def test_mv(): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) @@ -508,8 +498,8 @@ async def test_mv(): async def test_delete(): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) @@ -556,8 +546,8 @@ async def test_delete(): async def test_list(): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) fs.make_dir(os.path.join(username, "test", "test_sub_dir")) @@ -581,7 +571,42 @@ async def test_list(): assert json[0]["isFolder"] is True assert json[0]["name"] == "test" assert json[0]["path"] == "testuser/test" - assert json[0]["mtime"] <= time.time() * 1000 + + # This test is meant to prove that the time of the file just created is less + # than the current time measured immediately afterward. This carries some + # truth, as the actions in a single-threaded application should be in + # chronological sequence. + # However, a naive implementation can fail due to the manner in which the + # file modification time, or "mtime" is measured. In some systems it may + # have precision of "second", even if the underlying system can measure time + # at higher precision. 
In this case, the time may also be "rounded up" to + # the nearest second. If the comparison time has a higher precision, e.g. + # ms or ns, the comparison time may actually appear to be BEFORE the file's + # mtime, which was actually measured BEFORE. + + # One solution could be to round the comparison time, but that would imply + # that we KNOW that the system will round up to the second. But we don't + # actually know that, although we could measure that through sampling, I + # suppose. + + # In any case, the method we choose to use is to change the test condition a + # bit. We just want to assert that the file's mtime, as provided by the API, + # is within the ballpark of the current time. We define this ballpark as 3 + # seconds, since in at least some cases, Windows mtime precision is 2 seconds. + + # Here is an actual example of a test that failed with the naive + # implementation: + + # FAILED tests/test_app.py::test_list - assert 1693508770000 <= + # (1693508769.9508653 * 1000) + # Thursday, August 31, 2023 7:06:10 PM + # Thursday, August 31, 2023 7:06:09.950 PM + + # The "mtime" in the structure is in ms. + + diff = json[0]["mtime"]/1000 - time.time() + assert abs(diff) < MTIME_TEST_PRECISION + assert len(file_folder_count) == 4 # 2 folders and 2 files assert sum(file_folder_count) == 2 @@ -594,7 +619,10 @@ async def test_list(): assert json[0]["isFolder"] is True assert json[0]["name"] == "test" assert json[0]["path"] == "testuser/test" - assert json[0]["mtime"] <= time.time() * 1000 + + diff = json[0]["mtime"]/1000 - time.time() + assert abs(diff) < MTIME_TEST_PRECISION + assert len(file_folder_count) == 4 # 2 folders and 2 files assert sum(file_folder_count) == 2 @@ -643,14 +671,20 @@ async def test_list(): assert len(file_names) == 4 -@asyncgiven(txt=st.text()) +# Not sure that we need hypothesis here; I think we should poke at the boundaries +# of what the download should handle. +@given(txt=st.text()) async def test_download(txt): + # async def test_download(): username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: + # Creates the test file (test_file_1) with text provided by + # hypothesis. fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + # Then we download it and ensure the contents are as expected.
res = await cli.get( os.path.join("download", "test", "test_file_1"), headers={"Authorization": ""}, @@ -662,8 +696,8 @@ async def test_download(txt): async def test_download_errors(): username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) res1 = await cli.get("dwnload", headers={"Authorization": ""}) @@ -677,8 +711,8 @@ async def test_download_errors(): async def test_similar(): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) fs.make_dir(os.path.join(username, "test", "test_sub_dir")) @@ -718,8 +752,8 @@ async def test_similar(): async def test_existence(): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test_file_1"), txt) fs.make_dir(os.path.join(username, "test", "test_sub_dir")) @@ -780,8 +814,8 @@ async def test_existence(): async def test_search(): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) fs.make_file(os.path.join(username, "test", "test1"), txt) fs.make_dir(os.path.join(username, "test", "test2")) @@ -806,12 +840,8 @@ async def test_search(): async def test_upload(): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - # testing missing body - res1 = await cli.post("upload", headers={"Authorization": ""}) - assert res1.status == 400 - - with FileUtil() as fs: + with FileUtil() as fs: + async with AppClient(config, username) as cli: fs.make_dir(os.path.join(username, "test")) f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) @@ -824,6 +854,14 @@ async def test_upload(): assert res2.status == 200 +async def test_upload_missing_body(): + username = "testuser" + async with AppClient(config, username) as cli: + # testing missing body + res1 = await cli.post("upload", headers={"Authorization": ""}) + assert res1.status == 400 + + async def _upload_file_fail_filename(filename: str, err: str): # Note two file uploads in a row causes a test error: # https://github.com/aio-libs/aiohttp/issues/3968 @@ -855,7 +893,7 @@ async def test_upload_fail_comma_in_file(): @settings(deadline=None) -@asyncgiven(contents=st.text()) +@given(contents=st.text()) async def test_directory_decompression(contents): fname = "test" dirname = "dirname" @@ -875,9 +913,10 @@ async def test_directory_decompression(contents): ("bztar", ".tar.bz"), ("tar", ".tar"), ] - async with AppClient(config, username) as cli: - for method, extension in methods: - with FileUtil() as fs: + + for method, extension in methods: + with FileUtil() as fs: + async with AppClient(config, username) as cli: d = fs.make_dir(os.path.join(username, dirname)) f1 = fs.make_file(path.user_path, contents) d2 = fs.make_dir(os.path.join(username, dirname, dirname)) @@ -915,7 +954,8 @@ async def test_directory_decompression(contents): assert os.path.exists(f3) 
-@asyncgiven(contents=st.text()) +@settings(deadline=None) +@given(contents=st.text()) async def test_file_decompression(contents): fname = "test" dirname = "dirname" @@ -927,9 +967,9 @@ async def test_file_decompression(contents): return methods = [("gzip", ".gz"), ("bzip2", ".bz2")] - async with AppClient(config, username) as cli: - for method, extension in methods: - with FileUtil() as fs: + for method, extension in methods: + with FileUtil() as fs: + async with AppClient(config, username) as cli: d = fs.make_dir(os.path.join(username, dirname)) f1 = fs.make_file(path.user_path, contents) name = fname + extension @@ -1023,8 +1063,8 @@ async def test_importer_mappings(): async def test_bulk_specification_success(): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser/somefolder") # testuser is hardcoded in the auth mock base = Path(fu.base_dir) / "testuser" tsv = "genomes.tsv" @@ -1121,8 +1161,8 @@ async def test_bulk_specification_fail_no_files(): async def test_bulk_specification_fail_not_found(): - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir( "testuser/otherfolder" ) # testuser is hardcoded in the auth mock @@ -1152,8 +1192,8 @@ async def test_bulk_specification_fail_parse_fail(): Tests a number of different parse fail cases, including incorrect file types. Also tests that all the errors are combined correctly. """ - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir( "testuser/otherfolder" ) # testuser is hardcoded in the auth mock @@ -1235,7 +1275,9 @@ async def test_bulk_specification_fail_parse_fail(): }, { "type": "cannot_parse_file", - "message": "fasta.gz is not a supported file type for import specifications", + "message": ( + "fasta.gz is not a supported file type for import specifications" + ), "file": "testuser/badfile.fasta.gz", "tab": None, }, @@ -1253,7 +1295,9 @@ async def test_bulk_specification_fail_parse_fail(): }, { "type": "cannot_parse_file", - "message": "badfile. is not a supported file type for import specifications", + "message": ( + "badfile. is not a supported file type for import specifications" + ), "file": "testuser/badfile.", "tab": None, }, @@ -1263,8 +1307,8 @@ async def test_bulk_specification_fail_parse_fail(): async def test_bulk_specification_fail_column_count(): - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock base = Path(fu.base_dir) / "testuser" tsv = "genomes.tsv" @@ -1336,8 +1380,8 @@ async def test_bulk_specification_fail_column_count(): async def test_bulk_specification_fail_multiple_specs_per_type(): - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock base = Path(fu.base_dir) / "testuser" tsv = "genomes.tsv" @@ -1431,8 +1475,8 @@ async def test_bulk_specification_fail_multiple_specs_per_type_excel(): Test an excel file with an internal data type collision. This is the only case when all 5 error fields are filled out. 
""" - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock base = Path(fu.base_dir) / "testuser" excel = "stuff.xlsx" @@ -1506,8 +1550,8 @@ async def test_bulk_specification_fail_multiple_specs_per_type_excel(): async def test_write_bulk_specification_success_csv(): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock resp = await cli.post( "write_bulk_specification/", # NOSONAR @@ -1549,8 +1593,8 @@ async def test_write_bulk_specification_success_csv(): async def test_write_bulk_specification_success_tsv(): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock types = dict(_IMPORT_SPEC_TEST_DATA) types["reads"] = dict(types["reads"]) @@ -1594,8 +1638,8 @@ async def test_write_bulk_specification_success_tsv(): async def test_write_bulk_specification_success_excel(): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: - with FileUtil() as fu: + with FileUtil() as fu: + async with AppClient(config) as cli: fu.make_dir("testuser") # testuser is hardcoded in the auth mock resp = await cli.post( "write_bulk_specification/", diff --git a/tox.ini b/tox.ini index 51b50a04..bba11d30 100644 --- a/tox.ini +++ b/tox.ini @@ -1,2 +1,5 @@ [flake8] -max-line-length = 100 \ No newline at end of file +# unfortunately, flake8 does not play with pyproject.tom. what a shame. +# one day perhaps we'll switch to ruff. +max-line-length = 100 +extend-ignore = E203 \ No newline at end of file