diff --git a/.gitignore b/.gitignore index 894eaeeb..85072a3b 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,6 @@ Session.vim #********** IntelliJ files ****** *.iml + +# built docs +built_docs/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..88ee345e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,5 @@ +[submodule "clyde"] + path = hugo/themes/clyde + url = https://github.com/DNAstack/clyde.git + branch = v1.1.1 + diff --git a/.travis.yml b/.travis.yml index d3fe9e7f..25f567d9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,31 @@ dist: bionic sudo: false jdk: - openjdk11 -env: - - GH_URL=https://raw.githubusercontent.com FILE_TO_VALIDATE=spec/search-api.yaml URL_TO_VALIDATE=$GH_URL/${TRAVIS_PULL_REQUEST_SLUG:-$TRAVIS_REPO_SLUG}/${TRAVIS_PULL_REQUEST_BRANCH:-$TRAVIS_BRANCH}/$FILE_TO_VALIDATE -before_install: - - git clone --branch=v1.1.0 https://github.com/mcupak/oas-validator.git -script: - - ./oas-validator/validate.sh "$URL_TO_VALIDATE" + +jobs: + include: + - name: validation + language: java + jdk: openjdk11 + env: + - GH_URL=https://raw.githubusercontent.com FILE_TO_VALIDATE=spec/api.yaml URL_TO_VALIDATE=$GH_URL/${TRAVIS_PULL_REQUEST_SLUG:-$TRAVIS_REPO_SLUG}/${TRAVIS_PULL_REQUEST_BRANCH:-$TRAVIS_BRANCH}/$FILE_TO_VALIDATE + before_install: + - git clone --branch=v1.1.0 https://github.com/mcupak/oas-validator.git + script: + - ./oas-validator/validate.sh "$URL_TO_VALIDATE" + - name: docs + language: node_js + node_js: 14 + before_install: + - npm install -g @redocly/openapi-cli && npm install -g redoc-cli + - wget "https://github.com/gohugoio/hugo/releases/download/v0.79.0/hugo_extended_0.79.0_Linux-64bit.deb" && sudo dpkg -i hugo*.deb + script: + - git submodule update --remote --merge + - make build_prod + deploy: + provider: pages + skip-cleanup: true + keep_history: false + github-token: $GITHUB_TOKEN + on: + branch: develop diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d53c98cc..d12dd905 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,11 +22,30 @@ Some general rules to follow: - Create a branch for each update that you're working on. These branches are often called "feature" or "topic" branches. Any changes that you push to your feature branch will automatically be shown in the pull request. - Keep your pull requests as small as possible. Large pull requests are hard to review. Try to break up your changes into self-contained and incremental pull requests. - The first line of commit messages should be a short (<80 character) summary, followed by an empty line and then any details that you want to share about the commit. +- Each pull request should be associated with an issue. - Please try to follow [common commit message conventions](https://chris.beams.io/posts/git-commit/). +### Voting +Once a pull request or issue have been submitted, maintainers can comment or vote on to express their opinion following the [Apache voting system](https://www.apache.org/foundation/voting.html). Quick summary: + +- +1 something you agree with +- -1 if you have a strong objection to an issue, which will be taken very seriously. A -1 vote should provide an alternative solution. +- +0 or -0 for neutral comments or weak opinions. +- It's okay to have input without voting. +- Silence gives assent. +- In a pull request review: + - Approval is considered a +1 vote on the pull request. + - "Request changes" is considered a -1 vote on the pull request. 
+- A pull request is ready to be merged when either of the following is true: + - A pull request has at least two +1 votes, no -1 votes, and has been open for at least 3 days. + - A pull request has no -1 votes, and has been open for at least 14 days. + - We sometimes waive the time constraint for cosmetic-only changes -- use good judgment. If an issue gets any -1 votes, the comments on the issue need to reach consensus before the issue can be resolved one way or the other. There isn't any strict time limit on a contentious issue. + +The project will strive for full consensus on everything until it runs into a problem with this model. + ### Topic branches -If you wish to collaborate on a new feature with other GA4GH members, you can create a topic branch. Once a topic branch exists, pull requests can be made against it the usual way. It may also be brought up to date with new changes merged into develop by anyone with commit access, if the changes produce merely a fast-forward merge for each constituent branch. However, if changes from the develop branch create a new merge commit in or or more of the repositories, that commit needs to be reviewed in a pull request. +If you wish to collaborate on a new feature with other GA4GH members, you can create a topic branch. Once a topic branch exists, pull requests can be made against it the usual way. It may also be brought up to date with new changes merged into `develop` by anyone with commit access, if the changes produce merely a fast-forward merge for each constituent branch. However, if changes from the `develop` branch create a new merge commit, that commit needs to be reviewed in a pull request. Changes made in a topic branch can be merged into develop by creating a pull request against the `develop` branch and then resolving the normal way. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..bdde68e3 --- /dev/null +++ b/Makefile @@ -0,0 +1,23 @@ +run: build_api_docs + echo "start run" + cd hugo && hugo serve -D +build: build_api_docs + echo "start doc build" + git submodule update --init --recursive + echo "start hugo build" + cd hugo && hugo --minify -d ../built_docs/ + echo "end build" +build_prod: build_api_docs + echo "start prod doc build" + echo -e "\033[33mUSE MAKE BUILD FOR TESTING LOCALLY, DON'T TRACK docs/!\033[0m" + git submodule update --init --recursive + echo "start hugo build" + cd hugo && hugo --minify -d ../docs/ + echo "end prod build" +build_api_docs: + echo "start api docs build" + redoc-cli bundle spec/api.yaml + mkdir -p `dirname ./hugo/content/api/index.html` + mv ./redoc-static.html ./hugo/content/api/index.html +clean: + rm -rf hugo/docs/ docs/ built_docs/ hugo/built_docs/ hugo/content/api/index.html diff --git a/README.md b/README.md index 828a2c24..b67b7b88 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,172 @@ -# Search Swagger Validator [![](https://travis-ci.org/ga4gh-discovery/ga4gh-discovery-search.svg?branch=develop)](https://travis-ci.org/ga4gh-discovery/ga4gh-discovery-search) [![](https://img.shields.io/badge/license-Apache%202-blue.svg)](https://raw.githubusercontent.com/ga4gh-discovery/ga4gh-discovery-search/develop/LICENSE) +![](https://www.ga4gh.org/wp-content/themes/ga4gh-theme/gfx/GA-logo-horizontal-tag-RGB.svg) -`Search` is a framework for searching genomics and clinical data. 
+# Data Connect API [![](https://img.shields.io/badge/specification-Full%20Text-green)](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md) Swagger Validator [![](https://img.shields.io/badge/-Documentation-blue)](https://ga4gh-discovery.github.io/data-connect/) [![](https://travis-ci.com/ga4gh-discovery/data-connect.svg?branch=develop)](https://travis-ci.com/ga4gh-discovery/data-connect) [![](https://img.shields.io/badge/license-Apache%202-blue.svg)](https://raw.githubusercontent.com/ga4gh-discovery/data-connect/develop/LICENSE) -The Search framework is comprised of a collection of complementary standards that data custodians can implement to make their biomedical data more discoverable. +Data Connect is a standard for discovery and search of biomedical data, developed by the [Discovery Work Stream](https://github.com/ga4gh-discovery/ga4gh-discovery.github.io) of the [Global Alliance for Genomics & Health](http://ga4gh.org). -The schemas for most components of the framework are developed by the [Discovery Work Stream](https://github.com/ga4gh-discovery/ga4gh-discovery.github.io) of the [Global Alliance for Genomics & Health](http://ga4gh.org). +The standard provides a mechanism for: -## Background +- Describing data and its data model. + - Data Connect's _Table API_ component provides a way to organize data into "Tables" and describe their data model, leveraging the JSON Schema standard. +- Searching the data with the given data model. + - Data Connect's _Search API_ component provides a way to query "Tables" of data, leveraging the SQL standard. + +It is **not** in the scope of the standard to: + +- Define said data models. + - Data Connect relies on other efforts in GA4GH (e.g. [GA4GH SchemaBlocks](https://schemablocks.org/)), as well as outside implementers. -The GA4GH has previously developed two standards for discovery. `Beacon` is a standard for discovery of genomic variants, while `Matchmaker` is a standard for discovery of subjects with certain genomic and phenotypic features. Implementations of these standards have been linked into federated networks ([Beacon Network](http//beacon-network.org) and [Matchmaker Exchange](http://matchmakerexchange.org), respectively). +For more information: -Each standard (and corresponding network) has been successful in its own right. It was acknowledged that it would be broadly useful to develop standards that abstracted common utilities for building searchable, federated networks for a variety of applications in genomics and health. +- Read the [full specification](SPEC.md). +- Explore the [API](spec/api.yaml) (view in [Swagger Editor](https://editor.swagger.io/?url=https://raw.githubusercontent.com/ga4gh-discovery/data-connect/develop/spec/api.yaml)). +- See [documentation for adopters](https://ga4gh-discovery.github.io/data-connect/). -The Discovery Work Stream develops `Search` as a general-purpose framework for building federatable search-based applications. -## Goals -* `federation` It is possible to federate searches across multiple implementations. Federations of the search framework reference common schemas and properties. -* `backend agnostic` It is possible to implement the framework across a large variety of backend datastores. +## Background + +GA4GH has previously developed two standards for discovery. `Beacon` is a standard for discovery of genomic variants, while `Matchmaker` is a standard for discovery of subjects with certain genomic and phenotypic features. 
Implementations of these standards have been linked into federated networks (e.g. [Beacon Network](https://beacon-network.org/) and [Matchmaker Exchange](http://matchmakerexchange.org)). -## Out of scope -* `developing data models` The Search framework **does not** define data models. It defers that effort to others in the GA4GH or outside implementers. -* `application development` The Search framework **does not** prescribe a specific application. It is intentionally general-purpose. It defers to other efforts in the Discovery Work Stream, GA4GH, and beyond to build domain-specific applications. +Both standards (and the corresponding networks) have been successful in their own right, but had a lot in common. It was acknowledged that it would be broadly useful to develop standards that abstract common infrastructure for building searchable, federated networks for a variety of applications in genomics and health. -## Complementary standards +Data Connect, formerly known as _GA4GH Search_, is this general-purpose middleware for building federated, search-based applications. The name of the API reflects its purpose of: -The following standards are complementary but not required by the Search framework: +- Giving data providers a mechanism to enable others to connect to their data via the described data models. +- Allowing data consumers to make connections within the data through a flexible query language. -* The [Service Info](https://github.com/ga4gh-discovery/service-info) standard can be used to describe the service -* The [Service Registry](https://github.com/ga4gh-discovery/service-registry) standard can be used to create networks of search services +## Benefits -## Architecture +- **Interoperable**. Simple, interoperable, uniform mechanism to publish, discover, and search biomedical data. +- **Flexible**. Works with any data that can be serialized as an array of JSON objects. Recommends the use of [GA4GH SchemaBlocks](https://schemablocks.org/) data models, but allows custodians to specify their own data models to make their data available without extensive ETL transformations. +- **Supports federation**. Serves as a general-purpose framework for building federatable search-based applications across multiple implementations. Federations reference common schemas and properties. +- **Minimal by design**. The API is purposely kept minimal so that the barriers to publishing existing data are as small as possible. +- **Backend agnostic**. It is possible to implement the API across a large variety of backend datastores. +- **General purpose**. Admits use cases that have not yet been thought of. - - +## Intended Audience -## Components +The intended audience of this standard includes: -The search API consists of [Table](TABLE.md) and Query APIs, describing search results and queries, respectively. +- Data custodians looking to make their data discoverable and searchable, especially in a federated way. +- Data consumers looking to discover and search data in an interoperable way, including outside of the genomics community. +- Developers of applications, such as data explorers. +- API developers looking to incorporate search functionality into their APIs. +- Data model developers looking to make their data models searchable and interoperable with other standards. ## Use cases -See [USECASES.md](USECASES.md) +Data Connect is an intentionally general-purpose middleware meant to enable the development of a diverse ecosystem of applications. 
+ + + +The community has built versions of the following applications on top of Data Connect: + +- Data Explorers +- Beacons +- Patient matchmaking +- Jupyter notebooks +- R data frames +- Command line query tools +- Data and metadata indexers +- Data federations +- Concept cross-references + +We're looking forward to seeing things we haven’t yet imagined! -### Examples +The community has also connected data through the following data sources: -* Find subjects with HP:0001519 and candidate gene FBN1 (use case of [Matchmaker Exchange](https://www.matchmakerexchange.org/)) -* Find male subjects with HP:0009726 consented for General Research Use (use case of [European Genome-phenome Archive](https://www.ebi.ac.uk/ega/home)) -* Find adult males diagnosed with autism having a harmful mutation in SHANK1 (use case of [Autism Sharing Initiative](http://autismsharinginitiative.org)) -* Find dataset from subject on European data center hosted on Amazon (use case of [Cloud Work Stream](https://github.com/ga4gh/wiki/wiki)) +- FHIR +- Relational databases +- CSV/TSV files with data dictionaries +- VCF+TBI files +- Phenopackets +- Google BigQuery +- Google Sheets +- and more! -## Implementations and tooling +Examples of queries on the data that can be answered via Data Connect include: -- [Tables-in-a-bucket (no-code implementation)](#dataset-in-a-bucket-no-code-implementation) +- Find subjects with HP:0001519 and candidate gene FBN1 (use case of [Matchmaker Exchange](https://www.matchmakerexchange.org/)) +- Find male subjects with HP:0009726 consented for General Research Use (use case of [European Genome-phenome Archive](https://ega-archive.org/)) +- Find adult males diagnosed with autism having a harmful mutation in SHANK1 (use case of [Autism Sharing Initiative](http://autismsharinginitiative.org)) +- Find dataset from subject on European data center hosted on Amazon (use case of [Cloud Work Stream](https://github.com/ga4gh/wiki/wiki)) + +Full summary of use cases can be found in [USECASES.md](USECASES.md). + +## Implementations + +### Server implementations + +Several open-source implementations are available: + +- [Tables-in-a-bucket (no-code implementation)](#tables-in-a-bucket-no-code-implementation) - [Google Sheets implementation](#google-sheets-implementation) +- [Implementation based on Trino](#implementation-based-on-trino) -### Tables-in-a-bucket (no-code implementation) -The specification allows for a no-code implementation as a collection of files served statically (e.g. in a cloud bucket, or a Git repository). To do this, you need the following JSON files: +#### Tables-in-a-bucket (no-code implementation) +The specification allows for a no-code implementation as a collection of files served statically (e.g. in a cloud bucket or a Git repository). To do this, you need the following JSON files: - ```tables```: served in response to ```GET /tables``` - ```table/{table_name}/info```: served in response to ```GET /table/{table_name}/info```. e.g. a table with the name ```mytable``` should have a corresponding file ```table/mytable/info``` - ```table/{table_name}/data```: served in response to ```GET /table/{table_name}/data```. e.g. a table with the name ```mytable``` should have a corresponding file ```table/mytable/data``` -- ```table/{table_name}/data_{pageNumber}```, which will be linked in the next_page_url of the first table (e.g. ```mytable```), or in the next_page_url/prev_page_url of previous or subsequent pages. 
-- ```table/{table_name}/data_models/{schemaFile}```: Though not required, data models may be linked via [$ref](https://json-schema.org/latest/json-schema-core.html#rfc.section.8.3). Data models can also be stored as static JSON documents, and be referred to by relative or absolute URLs. +- ```table/{table_name}/data_{pageNumber}```, which will be linked in the `next_page_url` of the first table (e.g. ```mytable```). + - The above is just an illustrative example. Data Connect clients are capable of following any absolute or relative URL. + - The first page has to be called `/table/{table_name}/data`, then you can use any naming scheme you like for subsequent pages. + - All the nitty-gritty details of pagination are detailed in [the specification](SPEC.md#pagination-and-long-running-queries). +- ```table/{table_name}/data_models/{schemaFile}``` + - Though not required, data models may be linked via [$ref](https://json-schema.org/latest/json-schema-core.html#rfc.section.8.3). + - Data models can also be stored anywhere as static JSON documents, and like data pages, can be referred to by relative or absolute URLs. + +A concrete, example test implementation is available [here](https://storage.googleapis.com/ga4gh-tables-example/tables). + +#### Google Sheets implementation +A Google Sheets spreadsheet can also be exposed via the Tables API using the sheets adapter, located [here](https://github.com/DNAstack/ga4gh-search-adapter-google-sheets). + +#### Implementation based on Trino + +DNAstack has provided an [implementation of Data Connect](https://github.com/dnastack/ga4gh-search-adapter-presto) on top of [Trino](https://trino.io/). This implementation includes examples of data stored in the FHIR and Phenopackets formats. + +### Client implementations + +[Several open-source implementations](https://ga4gh-discovery.github.io/data-connect/docs/getting-started/consume-data/) based on different technology stacks are available: +- Python +- R +- CLI + +## Security + +Sensitive information transmitted over public networks, such as access tokens and human genomic data, MUST be protected using Transport Level Security (TLS) version 1.2 or later, as specified in [RFC 5246](https://tools.ietf.org/html/rfc5246). + +If the data holder requires client authentication and/or authorization, then the client’s HTTPS API request MUST present an OAuth 2.0 bearer access token as specified in [RFC 6750](https://tools.ietf.org/html/rfc6750), in the `Authorization` request header field with the Bearer authentication scheme: + +``` +Authorization: Bearer [access_token] +``` + +The policies and processes used to perform user authentication and authorization, and the means through which access tokens are issued, are beyond the scope of this API specification. GA4GH recommends the use of the [OpenID Connect](https://openid.net/connect/) and [OAuth 2.0 framework (RFC 6749)](https://tools.ietf.org/html/rfc6749) for authentication and authorization. + +A stand-alone security review has been performed on the API. Nevertheless, GA4GH cannot guarantee the security of any implementation to which the API documentation links. If you integrate this code into your application it is AT YOUR OWN RISK AND RESPONSIBILITY to arrange for an audit to ensure compliance with any applicable regulatory and security requirements, especially where personal data may be at issue. + +To report security issues with the specification, please send an email to [security-notification@ga4gh.org](mailto:security-notification@ga4gh.org). 
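+
+For illustration, a complete authenticated request to the `/search` endpoint might look like the following (sent over TLS; the host name, token value, and table name are placeholders, not part of the specification):
+
+```
+POST /search HTTP/1.1
+Host: data-connect.example.org
+Authorization: Bearer eyJhbGciOiJSUzI1NiJ9.example.token
+content-type: application/json
+
+{ "query": "SELECT * FROM mytable LIMIT 10" }
+```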
+ +## CORS +Cross-origin resource sharing (CORS) is an essential technique used to overcome the same origin content policy seen in browsers. This policy restricts a webpage from making a request to another website and leaking potentially sensitive information. However the same origin policy is a barrier to using open APIs. GA4GH open API implementers should enable CORS to an acceptable level as defined by their internal policy. All public API implementations should allow requests from any server. + +GA4GH has provided a [CORS best practices document](https://docs.google.com/document/d/1Ifiik9afTO-CEpWGKEZ5TlixQ6tiKcvug4XLd9GNcqo/edit?usp=sharing), which implementers should refer to for guidance when enabling CORS on public API instances. + +## Development + +### Validating + +The API is specified in OpenAPI 3. Use [Swagger Validator Badge](https://github.com/swagger-api/validator-badge) to validate the YAML file, or its [OAS Validator](https://github.com/mcupak/oas-validator) wrapper. + +### Documentation +Documentation is sourced from the `hugo/` directory. Building the docs requires the [Hugo framework](https://gohugo.io/documentation/) with the [Clyde theme](https://github.com/DNAstack/clyde). Edit the markdown files under `hugo/content/` for content changes. -A concrete, example test implementation is [available](https://storage.googleapis.com/ga4gh-tables-example/tables) (list endpoint) with [documentation](https://storage.googleapis.com/ga4gh-tables-example/EXAMPLE.md). +Run the docs locally using `make run`, which is served at `http://localhost:1313/data-connect/`. Clean up before commiting using `make clean`. -### Google Sheets implementation -A Google Sheets spreadsheet can also be exposed via the tables API via the sheets adapter, located [here](https://github.com/DNAstack/ga4gh-search-adapter-google-sheets). +To manually inspect the build artifacts, use `make build`. Clean up before commiting using `make clean`. -## Contributing +### Contributing -The GA4GH is an open community that strives for inclusivity. Teleconferences and corresponding [meeting minutes](https://docs.google.com/document/d/1sG--PPVlVWb1-_ZN7cHta79uU9tU2y-17U11PYzvMu8/edit#heading=h.lwhinfkfmlx4) are open to the public. To learn how to contribute to this effort, please email Rishi Nag ([rishi.nag@ga4gh.org](mailto:rishi.nag@ga4gh.org)). +The GA4GH is an open community that strives for inclusivity. Guidelines for contributing to this repository are listed in [CONTRIBUTING.md](CONTRIBUTING.md). Teleconferences and corresponding [meeting minutes](https://w3id.org/ga4gh/minutes/discovery-search) are open to the public. To learn how to contribute to this effort, please [contact us](mailto:info@ga4gh.org). diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 00000000..df298a89 --- /dev/null +++ b/SPEC.md @@ -0,0 +1,1451 @@ +# Data Connect Specification + +This document describes the overall structure of the Data Connect API and specifies how an implementation should parse, execute, and respond to a search expressed as an SQL query. Independently developed implementations that conform to this specification can be used interchangeably by a client, or networked together into a tree-structured federation of Data Connect nodes. 
+ +## Table of Contents + +- [Data Connect Specification](#data-connect-specification) + - [Overview](#overview) + - [Conventions](#conventions) + - [Table Discovery and Browsing](#table-discovery-and-browsing) + - [Table Discovery and Browsing Examples](#table-discovery-and-browsing-examples) + - [Search](#search) + - [Search Example](#search-example) + - [Search Request](#search-request) + - [Positional Query Parameters](#positional-query-parameters) + - [Correspondence Between SQL and JSON Data Types in Search Request](#correspondence-between-sql-and-json-data-types-in-search-request) + - [Search Result](#search-result) + - [Correspondence Between SQL and JSON Data Types in the Search Result](#correspondence-between-sql-and-json-data-types-in-the-search-result) + - [Semantic Data Types](#semantic-data-types) + - [Example: Semantic Data Types on a Table](#example-semantic-data-types-on-a-table) + - [Attaching Semantic Data Types To Search Results](#attaching-semantic-data-types-to-search-results) + - [Example: Semantic Data Types in Search Results](#example-semantic-data-types-in-search-results) + - [SQL Functions](#sql-functions) + - [Pagination and Long Running Queries](#pagination-and-long-running-queries) +- [Supplementary Information](#supplementary-information) + - [Interop with other data storage and transmission standards](#interop-with-other-data-storage-and-transmission-standards) + - [Phenopackets](#phenopackets) + - [Concrete Example](#concrete-example) + - [Organizing Into Tables](#organizing-into-tables) + - [How to Secure Implementations Based on Trino Connectors or PostgreSQL Foreign Data Wrappers](#how-to-secure-implementations-based-on-trino-connectors-or-postgresql-foreign-data-wrappers) + - [Implementing a Federation of SQL Query Nodes](#implementing-a-federation-of-sql-query-nodes) +- [Appendix A: SQL Grammar](#appendix-a-sql-grammar) + +## Overview + +The primary container for data in the Data Connect API is the **Table**. Tables contain rows of data, where each row is a JSON object with key/value pairs. The table describes the structure of its row objects using [JSON Schema](https://json-schema.org/). Row attributes can take on any legal JSON value, e.g. numbers, strings, booleans, nulls, arrays, and nested JSON objects. + +The API supports browsing and discovery of data models and table metadata, listing table data, and optionally searching table data using arbitrarily complex expressions including joins and aggregations. The query language is SQL with domain-specific functions to facilitate informative typing of the result fields. + +All discovery, browsing and search operations are specified formally in the [OpenAPI specification](https://github.com/ga4gh-discovery/ga4gh-discovery-search/blob/develop/spec/api.yaml) document. + +## Conventions + +The keywords "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://tools.ietf.org/html/rfc2119). + +## Table Discovery and Browsing + +The Table Discovery and Browsing part of the Data Connect API consists of these HTTP resources, which provide information about the tables available. Machine actionable descriptions of their schema and semantics are provided. 
+ +| Request | Description | +| -------------------------- | --------------------------------------------------------------------------- | +| `GET /tables` | Retrieve a paginated list of tables available from this Data Connect API instance | +| `GET /table/{id}/info` | Retrieve the data model (JSON Schema) associated with the given table | +| `GET /table/{id}/data` | Retrieve the data rows (paginated) from the given table and the data model of the retrieved data. | + +More information on the table structure is provided in [TABLE.md](TABLE.md). + +### Table Discovery and Browsing Examples + +``` +GET /tables +``` +``` +{ + "tables": [ + { + "name": "search_cloud.brca_exchange.v32", + "data_model": { + "$ref": "https://example.com/table/search_cloud.brca_exchange.v32/info" + } + }, + { + "name": "pgpc.ontology.axiom", + "data_model": { + "$ref": "https://example.com/table/pgpc.ontology.axiom/info" + } + }, + ... + ... + ], + "pagination": { + "next_page_url": "https://example.com/tables/catalog/search_drs" + } +} +``` + +``` +GET `/table/pgpc.ontology.axiom/info` +``` +``` +{ + "name": "pgpc.ontology.axiom", + "data_model": { + "$id": "https://example.com/table/pgpc.ontology.axiom/info", + "description": "Automatically generated schema", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "ontology": { + "format": "varchar", + "type": "string", + }, + "ontology_version": { + "format": "varchar", + "type": "string", + }, + "from_term": { + "format": "varchar", + "type": "string", + }, + "relation": { + "format": "varchar", + "type": "string", + }, + "to_term": { + "format": "varchar", + "type": "string", + } + } + } +} +``` + + +``` +GET /table/pgpc.ontology.axiom/data +``` +``` +{ + "data_model": { }, + "data": [ + { + "ontology": "http://purl.obolibrary.org/obo/hp.owl", + "ontology_version": "http://purl.obolibrary.org/obo/hp/releases/2019-04-15", + "from_term": "HP_0100572", + "relation": "SubClassOf", + "to_term": "HP_0100571" + }, + { + "ontology": "http://purl.obolibrary.org/obo/hp.owl", + "ontology_version": "http://purl.obolibrary.org/obo/hp/releases/2019-04-15", + "from_term": "UBERON_0005047", + "relation": "SubClassOf", + "to_term": "UBERON_0001824" + }, + ... + ... + ], + "pagination": { + "next_page_url": "https://example.com/search/v1/statement/executing/20200901_013328_00039_szpff/y4134d1e51a8262d0f8fed899b2eed9fd02e200e9/1" +} +``` + +## Search + +The Search part of the Data Connect API consists of the following HTTP endpoint: + +| Request | Description | +| -------------------------- | --------------------------------------------------------------------------- | +| `POST /search` | Executes the given SQL query and returns the results as a Table | + + +### Search Example + + +#### Search Request + +Here is a concrete example of a search request against a Data Connect implementation. + +``` +POST Request: +/search + +Header: +content-type: application/json + +Request body: +{ "query": "SELECT * from pgpc.ontology.axiom WHERE to_term='UBERON_0000464'"} +``` + +##### Positional Query Parameters + +This query has the effect as the previous example, but is expressed using a positional parameter: + +``` +POST Request: +/search + +Header: +content-type: application/json + +Request body: +{ + "query": "SELECT * from pgpc.ontology.axiom WHERE to_term=?" + "parameters": [ "UBERON_0000464" ] +} +``` + +A positional parameter is marked by a `?` anywhere a literal value of any type could appear in the query. 
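+
+For example, a query with two placeholders might be submitted as follows (the parameter values are illustrative):
+
+```
+{
+  "query": "SELECT * from pgpc.ontology.axiom WHERE relation=? AND ontology_version=?",
+  "parameters": [ "SubClassOf", "http://purl.obolibrary.org/obo/hp/releases/2019-04-15" ]
+}
+```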
+ +If a query has no positional parameters, the client MAY omit the `parameters` property from the request. +If the client supplies `parameters` in a query with no positional parameters, its value MUST be an empty +array. + +If a query has one or more positional parameters, the request body MUST include a `parameters` property, +which is a JSON array whose element count matches the number of `?` placeholders in the query. Values +will be substituted from the array into the query on the server side in the order the `?` placeholders +appear in the text of the SQL query. + +##### Correspondence Between SQL and JSON Data Types in Data Connect Request + +The SQL type of `?` placeholder in the query is determined by its corresponding entry in the +`parameters` array, according to the following table. + +| JSON Parameter Type | SQL Type | Example Values | +| ---------------------------------------------- | --------- | --------------------------------------------- | +| boolean | boolean | `true`, `false` | +| number | double | `123`, `-7000`, `123.456`, `7.445e-17` | +| string | varchar | `"Hello world"`, `"12345678910"` | +| array (note all elements must have same type) | array | `[ 1, 3, 5 ]`, `[ "one", "three", "five" ]` | +| object | row | `{ "colname1": "colvalue1", "colname2": 42 }` | + +Queries that require parameters with SQL types not covered above should use the SQL CAST operation. For +example, `CAST(? AS DATE)`. + +#### Search Result + +The result is returned in the same data structure as tables are returned by the discovery and browsing part of the Data Connect API: a **TableData** object. + +``` + +{ + "data_model": { }, + "data": [ + { + "ontology": "http://purl.obolibrary.org/obo/hp.owl", + "ontology_version": "http://purl.obolibrary.org/obo/hp/releases/2019-04-15", + "from_term": "UBERON_0009572", + "relation": "SubClassOf", + "to_term": "UBERON_0000464" + }, + { + "ontology": "http://purl.obolibrary.org/obo/hp.owl", + "ontology_version": "http://purl.obolibrary.org/obo/hp/releases/2019-04-15", + "from_term": "UBERON_0009670", + "relation": "SubClassOf", + "to_term": "UBERON_0000464" + }, + ... + ... + ], + "pagination": { + "next_page_url": "https://example.com/search/v1/statement/executing/20200831_235126_36756_szpff/yf9a38c74873e654f04309fe956cb40c8fb2d022f/1" + } +} + +``` +##### Correspondence Between SQL and JSON Data Types in the Search Result + +Data is manipulated in the query using the following types. Each SQL type is expressed as a physical JSON value in the response table. Semantic types (defined by JSON Schema reference URLs) are covered in the next section. 
+ +| SQL Type | JSON Type | Example Values | +| ----------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| boolean | boolean | true false | +| tinyint, smallint, integer | number | 123 -7000 | +| real, double | number | 123.456 7.445e-17 | +| decimal, bigint | string | "12345.678910" | +| varchar, char | string | "Hello world" | +| JSON | Same JSON type (object, array, string, number, boolean, or null) | { "k1": "v1", "k2": false }[ 1, 3, 5, "seven", [ 1 ] ] "Hello JSON" 123.456 false null | +| date | string in ISO 8601 format | "2020-05-27" | +| time [without time zone] | string in ISO 8601 format | "12:22:27.000" | +| time with time zone | string in ISO 8601 format | "12:22:27.000Z" "12:22:27.000-03:00" | +| timestamp [without time zone] | string in ISO 8601 format | "2020-05-27T12:22:27.000" | +| timestamp with time zone | string in ISO 8601 format | "2020-05-27T12:22:27.000Z" "2020-05-27T12:22:27.000-05:00" | +| interval day to month | String in ISO 8601 period format | "P3Y2M" | +| interval day to second | String in ISO 8601 duration format | "P3DT4H3M2S" "PT3M2S" "PT4H3M" | +| array | array | [ 1, 3, 5, "seven", [ 1 ] ] | +| map | object | { "key": "value" } | +| row | object | { "colname": "colvalue" } | + +## Semantic Data Types + +To enable discovery of tables based on the kind of information contained within them, and to enable query tools to offer to filter and join data from different sources in a sensible way, tables need to declare not only the physical type of their rows (e.g. how data is represented as JSON) but also the semantic type (what the data means). + +Data Connect API describes the _meaning_ of data through JSON Schema references ($ref). Clients can discover that attributes in different tables refer to the same concept as each other by examining the target of each attribute’s JSON Schema reference. If the $ref URLs are the same, then the client knows that the attributes have the same meaning. + +Clients can use the attribute meanings to: + +* Recommend joining tables that contain similar types of information +* Display table attributes in a meaningful way +* Construct queries across tables in an informed way which retains the underlying meaning of the data, or create new meaning + +This system of identifying types through reference URLs is amenable to building up cross-references. With a rich set of cross-references, a Data Connect API client can help join up data from sources that use different nomenclatures, and link concepts to external ontologies. 
+ + +### Example: Semantic Data Types on a Table + +Assume the following JSON Schema is published at https://schemablocks.org/schemas/playground/current/BloodGroup.json: + +``` +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://schemablocks.org/schemas/playground/current/BloodGroup.json", + "allOf": [ + { + "$ref": "https://schemablocks.org/schemas/sb-phenopackets/v1.0.0/OntologyClass.json#/properties" + }, + { + "properties": { + "id": { + "description": "Subtype of HP:0032224 (ABO Blood Group)", + "oneOf": [ + { + "const": "HP:0032442", + "title": "O" + }, + { + "const": "HP:0032370", + "title": "A" + }, + { + "const": "HP:0032440", + "title": "B" + }, + { + "const": "HP:0032441", + "title": "AB" + } + ] + } + } + } + ] +} +``` + + + +Then data exposed through Data Connect API could refer to the concept of “ABO Blood Group” as: + +``` +"$ref": "https://schemablocks.org/schemas/playground/current/BloodGroup.json" +``` + +SchemaBlocks is the recommended repository for centrally defined types, but any URL that points to a valid JSON Schema definition is acceptable. In many cases, the quickest route to publishing data will be to translate existing data dictionaries into JSON Schema and publish those alongside the dataset. However, the dataset will provide greater utility to its consumers if concepts are mapped to SchemaBlocks definitions where possible. + +#### Identifiers + +Many columns in datasets will contain identifiers of related objects. The descriptions for such columns should provide machine actionable information about the type of identifiers used. The general purpose CURIE SchemaBlock may be used https://schemablocks.org/schemas/sb-vr-spec/current/Curie.json. Preferably the column description should indicate the specific CURIE prefix (namespace) for the identifiers. This should cover both the use case where the values in the described table are prefixed, and the use case where they are not. In either case, the identifier name space provided in the column metadata allows simple identification of what other resources the column can be linked to. + +#### DRS Identifiers + +A common use case will be to provide GA4GH [Data Repository Service](https://github.com/ga4gh/data-repository-service-schemas) (DRS) identifiers which can be used to retrieve specific digital files for analysis. Where a column of a table contains DRS ids, the column description should indicate this. DRS ids maybe host based URIs, or a CURIE as above. + + +### Attaching Semantic Data Types To Search Results + +Since search results are also Tables, there are many scenarios where users would benefit from semantic schema references being embedded in Search results as well as static views of tables. + +When submitting an SQL query to the /search endpoint, the tool generating the query can wrap each selected column in the function `ga4gh_type()`, which directs the Data Connect implementation to generate a corresponding JSON Schema `$ref` for that column in the result table. + +``` +SELECT ga4gh_type( + t.age, '$ref:https://schemablocks.org/schemas/sb-phenopackets/current/Age.json#properties/age') AS age +FROM mytable t +WHERE t.age > 18 +``` + +Any selected columns that are not wrapped in the ga4gh_type() function will only have their physical characteristics described in the result table’s schema. This is perfectly acceptable for some client applications, but greatly limits the value of result tables that are archived or forwarded to another tool for further processing. 
+ + +### Example: Semantic Data Types in Search Results + +Assume a Data Connect implementation has the table `pgpc.public.participant` contains the following data: + +| id | blood_type | +| ------------------ | ------------ | +| PGPC-44 | 0 | +| PGPC-46 | AB | + +The following query to the `/search` endpoint will lift this raw, unharmonized data into a semantically typed table +with data errors corrected, and types mapped to externally defined schema concepts: + +``` +select + ga4gh_type(id, '$ref:https://schemablocks.org/schemas/sb-phenopackets/current/Person.json#properties/individualId') as id, + ga4gh_type( + case when blood_type = '' then null + else cast(row( + case regexp_extract(blood_type, '(\w+)([+-])', 1) + when '0' then 'HP:0032442' -- source data has '0' where it should have 'O' + when 'O' then 'HP:0032442' + when 'A' then 'HP:0032370' + when 'B' then 'HP:0032440' + when 'AB' then 'HP:0032441' + else 'error' + end, + case regexp_extract(blood_type, '(\w+)([+-])', 1) + when '0' then 'O' + else regexp_extract(blood_type, '(\w+)([+-])', 1) + end + ) + as row(id varchar, label varchar)) + end, + '$ref:https://schemablocks.org/schemas/playground/current/BloodGroup.json') as blood_group +from pgpc.public.participant +``` + +The Data Connect service responds with the following table: + +``` +{ + "data_model": { + "description": "Schema specified by query", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "id": { + "$ref": "https://schemablocks.org/schemas/sb-phenopackets/current/Person.json#properties/individualId" + }, + "blood_group": { + "$ref": "https://schemablocks.org/schemas/playground/current/BloodGroup.json" + } + } + }, + "data": [ + { + "id": "PGPC-44", + "blood_group": {"id": "HP:0032442", "label": "O"} + }, + { + "id": "PGPC-46", + "blood_group": {"id": "HP:0032441", "label": "AB"} + } + ] +} +``` + +## SQL Functions + +The Data Connect specification is implementation-agnostic and does not prescribe use of a relational database or a particular database technology. As a result, the exact SQL dialect that is available will vary between implementations of the standard. + +The functions listed below SHOULD be supported by any implementation of Data Connect that supports the `search` endpoint. These functions are supported by major database platforms including Trino, PostgreSQL, MySQL and BigQuery. There are occasional name or signature differences, but a Data Connect API implementation atop any of the major database platforms should be able to pass through queries that use the functions listed below with only minor tweaks. + +* ga4gh_type (described above) +* **Logical Operators** + * `AND`, `OR`, `NOT` +* **Comparison Operators** + * `<`, `>`, `<=`, `>=`, `=`, `<>`, `!=` + * `IS NULL, IS NOT NULL` + * `LIKE` +* **Type conversion operators** + * `CAST` +* **Conditional Expressions** + * `IF` + * `CASE` + * `COALESCE` +* **String manipulation** + * `substring(string, start)` → `varchar` + * `Concatenation (||)` +* **Date manipulation** + * `extract(field FROM date)` + * `current_date` + * `current_time` + * `current_timestamp` + * `+`, `-` operators for dates +* **Aggregate functions** + * `count(*)` + * `max(x)` + * `min(x)` + * `sum(x)` +* **Structured data** + * `json_extract(json, json_path)` + * `unnest(array)` + +An implementation of Data Connect MAY support any number of additional SQL functions. Wherever possible these additional functions SHOULD conform to the ANSI SQL standard. 
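+
+For illustration, the following query uses only functions and operators from the list above, against the `pgpc.ontology.axiom` example table introduced earlier (a sketch, not a normative requirement):
+
+```
+SELECT
+  relation,
+  count(*) AS axiom_count,
+  CASE WHEN count(*) >= 1000 THEN 'common' ELSE 'rare' END AS frequency_class
+FROM pgpc.ontology.axiom
+WHERE from_term LIKE 'HP%' AND to_term IS NOT NULL
+GROUP BY relation
+```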
+ +Trino (formerly PrestoSQL) has proven to be a popular choice in existing Data Connect implementations, owing to the highly configurable nature of the engine. A simplified version of Trino's SQL grammar is presented in [Appendix A](#appendix-a-sql-grammar). + +To assist with implementations directly on other database platforms, the [Trino Functions Support Matrix](https://docs.google.com/document/d/1y51qNuoe2ELX9kCOyQbFB4jihiKt2N8Qcd6-zzadIvk) captures differences in the implementation of common functions in granular detail. + +## Pagination and Long Running Queries + +**Pagination sequence** + +A pagination sequence is the singly-linked list of URLs formed by following the next_page_url property of the pagination section of an initial TableData or ListTablesResponse. A pagination sequence begins at the first response returned from any request that yields a TableData or ListTablesResponse, and ends at the page in the sequence whose pagination property is omitted, whose pagination.next_page_url is omitted, or whose pagination.next_page_url is null. + +Servers MAY return a unique pagination sequence in response to successive requests for the same query, table data listing, or table listing. + +Except for the last page, pagination.next_page_url property MUST be either an absolute URL or a relative reference as defined by [RFC 3986 section 4.2](https://tools.ietf.org/html/rfc3986#section-4.2) whose base URL is the URL that the page containing the reference was fetched from. + +Every non-empty TableData page in a pagination sequence MUST include a data_model property. If present, the data_model property MUST be a valid JSON Schema. + +Across all TableData pages in the pagination sequence that have a data_model value, the data_models MUST be identical. Some TableData pages may lack a data_model. See the empty page rules below. + +Servers MAY respond with an HTTP 4xx error code if the same page is requested more than once. + +Due to both rules above, clients MUST NOT rely on the ability to re-fetch previously encountered pages. + +Servers MAY include a Retry-After HTTP header in each response that is part of a pagination sequence, and clients MUST respect the delay specified by such header before attempting to fetch the next page. + +**Empty TableData pages** + +While many types of queries will be completed quickly, others will take minutes or even hours to yield a result. The simplest solution would be a synchronous design: query requests block until data is ready, then return a TableData response with the initial rows of the result set. However, asking clients to block for hours on a single HTTP response is fraught with difficulty: open connections are costly and fragile. If an intermediary times out the request, the results will be lost and the client must start over. + +To allow servers to direct clients to poll for results rather than hold open HTTP connections for long-running queries, the following special pagination rules apply to empty pages. + +An empty page is defined as a TableData object whose data property is a zero element array. + +A pagination sequence MAY include any number of empty pages anywhere in the sequence. + +An empty TableData page MAY omit its data_model property entirely. This allows servers to direct clients to poll for results before the result schema has been determined. + +A server that returns an empty page SHOULD include a Retry-after header in the HTTP response. 
If a client encounters an empty page with no Retry-after header, the client SHOULD delay at least 1 second before requesting the next page. + +**Example: Server returning empty pages to make client poll** + +This example illustrates a server returning a series of empty pages to a client while it is preparing the result set. The client polls for results by following next_page_url at the rate specified by the server. The form of the pagination URLs are only an example of one possible scheme. Servers are free to employ any pagination URL scheme. + +**Initial Request** + + +``` +POST /search +content-type: application/json + +{"query":"select distinct gene_symbol from search_cloud.brca_exchange.v32"} + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/abc123?token=ZXhhbXBsZTEK"}} +``` + + +**2nd request (Polling after sleeping for 1000ms)** + + +``` +GET /search/v1/abc123?token=ZXhhbXBsZTEK + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/abc123?token=ZXhhbXBsZTIK"}} +``` + + +**3rd request (Polling again after sleeping for 1000ms)** + + +``` +GET /search/v1/abc123?token=ZXhhbXBsZTIK + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/abc123?token=ZXhhbXBsZTMK"}} +``` + + +**4th request (Polling again after sleeping for 1000ms)** + + +``` +GET /search/v1/abc123?token=ZXhhbXBsZTMK + +HTTP/1.1 200 OK +content-type: application/json + +{"data_model":{"description":"Automatically generated schema","$schema":"http://json-schema.org/draft-07/schema#","properties":{"gene_symbol":{"format":"varchar","type":"string"}}},"data":[{"gene_symbol":"BRCA2"},{"gene_symbol":"BRCA1"}],"pagination":{"next_page_url":"/search/v1/abc123?token=ZXhhbXBsZTQK"}} +``` + + +**Final request (no delay because page was nonempty and no retry-after header was present on the response)** + + +``` +GET /search/v1/abc123?token=ZXhhbXBsZTQK + +HTTP/1.1 200 OK +content-type: application/json + +{"data_model":{"description":"Automatically generated schema","$schema":"http://json-schema.org/draft-07/schema#","properties":{"gene_symbol":{"format":"varchar","type":"string"}}},"data":[],"pagination":{}} +``` + + +**Example: Client algorithm for consuming TableData pages** + +The algorithm provided here simply illustrates one way to comply with the rules above. Any algorithm that satisfies all rules acceptable. + + + +1. Start with an empty data buffer and undefined data model. +2. Loop: + 1. If the response is an error, report the error and abort + 2. If no data_model has been seen so far, check if this page contains a data_model. If so, define the data model for the whole pagination sequence as this page’s data_model. + 3. Append the row data from the current page to the data buffer (there may be 0 rows on any given page) + 4. Delay for the time specified in the “Retry-After” HTTP response header for the current page (default is no delay) + 5. If there is a pagination object and it has a non-null next_page_url, fetch that URL, make that response the current page, and start back at step 2a; otherwise end. + + +# Supplementary Information + +This section provides advice to implementers. Nothing in this section is required of a conforming implementation. 
+ + +## Interop with other data storage and transmission standards + +This section demonstrates how to expose data stored in commonly used formats using Data Connect Table structures and their embedded JSON schema specifications. + + +### Phenopackets + +Phenopacket is a GA4GH approved standard file format for sharing phenotypic information. A Phenopacket file contains a set of mandatory and optional fields to share information about a patient or participant’s phenotype, such as clinical diagnosis, age of onset, results from lab tests, and disease severity. + + +#### Concrete Example + +Here is a detailed example of a directory full of Phenopacket files exposed as a single table via the Data Connect API. Each row corresponds to one Phenopacket. The table has two columns: + +* **id**, the ID of that row’s Phenopacket +* **phenopacket**, the entire contents of the Phenopacket as a JSON object + +``` +/tables +``` + + +``` +{ + "tables": [ + { + "name": "gecco_phenopackets", + "description": "Table / directory containing Phenopacket JSON files", + "data_model": { + "$ref": "table/gecco_phenopackets/info" + } + }, + { + "name": "hpo_phenopackets", + "description": "Table / directory containing Phenopacket JSON files", + "data_model": { + "$ref": "table/hpo_phenopackets/info" + } + } + ] +} +``` + + + +``` +/table/hpo_phenopackets/info +``` +``` +{ + "name": "hpo_phenopackets", + "description": "Table / directory containing Phenopacket JSON files", + "data_model": { + "$id": "https://storage.googleapis.com/ga4gh-phenopackets-example/phenopacket-with-id", + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "Phenopacket JSON data model", + "properties": { + "id": { + "type": "string", + "description": "An identifier specific for this phenopacket" + }, + "phenopacket": { + "$ref": "https://schemablocks.org/schemas/sb-phenopackets/current/Phenopacket.json" + } + } + } +} +``` +``` +/table/hpo_phenopackets/data +``` + + +``` +{ + "data_model": { + "$id": "https://storage.googleapis.com/ga4gh-phenopackets-example/phenopacket-with-id", + "$schema": "http://json-schema.org/draft-07/schema#", + "description": "Phenopacket JSON data model", + "properties": { + "id": { + "type": "string", + "description": "An identifier specific for this phenopacket" + }, + "phenopacket": { + "$ref": "https://schemablocks.org/schemas/sb-phenopackets/current/Phenopacket.json" + } + } + }, + "data": [ + { + "id": "PMID:27435956-Naz_Villalba-2016-NLRP3-proband", + "phenopacket": {actual phenopacket json} + }, + { + "id": "PMID:27672653-Abdul_Wahab-2016-GCDH-Patient_5", + "phenopacket": {actual phenopacket json} + }, + { + "id": "PMID:20149460-Papanastasiou-2010-STAT3-12_year_old_girl", + "phenopacket": {actual phenopacket json} + }, + ... + ... + ... 
+ ] +} +``` + +``` +/table/hpo_phenopackets/search +``` + +``` + +REQUEST: +--------------------------------------------------------------------------------------- +WITH pp_genes AS ( + SELECT + pp.id AS packet_id, + json_extract_scalar(g.gene, '$.id') AS gene_id, + json_extract_scalar(g.gene, '$.symbol') AS gene_symbol + FROM + sample_phenopackets.ga4gh_tables.hpo_phenopackets pp, + UNNEST(CAST(json_extract(pp.phenopacket, '$.genes') as ARRAY(json))) + as g (gene) +) +SELECT pp_genes.* +FROM pp_genes +WHERE gene_symbol LIKE 'ANTXR%' +LIMIT 100; +RESPONSE: +------------------------------------------------------------+-----------------+-------- + PMID:30050362-Schussler-2018-ANTXR2-II-3_ | NCBIGene:118429 | ANTXR2 + PMID:27587992-Salas-Alanís-2016-ANTXR1-14_year_old_brother | NCBIGene:84168 | ANTXR1 + +``` + +#### Organizing Into Tables + +Here we demonstrate two possibilities for organizing a collection of Phenopacket JSON files into tables. Other layouts are also possible. + +FLAT hierarchy - all files in a single table + +* [https://storage.googleapis.com/ga4gh-phenopackets-example/flat/tables](https://storage.googleapis.com/ga4gh-phenopackets-example/flat/tables) +* [https://storage.googleapis.com/ga4gh-phenopackets-example/flat/table/phenopacket_table/info](https://storage.googleapis.com/ga4gh-phenopackets-example/flat/table/phenopacket_table/info) +* [https://storage.googleapis.com/ga4gh-phenopackets-example/flat/table/phenopacket_table/data](https://storage.googleapis.com/ga4gh-phenopackets-example/flat/table/phenopacket_table/data) + +BY_SUBJECT hierarchy - one table per subject ID + +* [https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/tables](https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/tables) +* [https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27435956_longitudinal/info](https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27435956_longitudinal/info) +* [https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27435956_longitudinal/data](https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27435956_longitudinal/data) (has 1 Phenopacket) +* [https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27040691_longitudinal/data](https://storage.googleapis.com/ga4gh-phenopackets-example/by_subject/table/PMID:27040691_longitudinal/data) (has multiple Phenopackets) + +The difference between the two formats is the way in which the Phenopacket JSON data is structured in one table (flat) or multiple tables (by_subject) as shown in the following diagram. + + +![Phenopacket tables in a bucket example](assets/phenopacket-tables-in-a-bucket-example.svg "Phenopacket tables in a bucket example") + +## How to Secure Implementations Based on Trino Connectors or PostgreSQL Foreign Data Wrappers + +* Filter data at the connector level +* Use simple OAuth scopes to decide what data can be returned +* If certain scopes should only see aggregated data (for privacy reasons), use separate aggregated tables (or views). The connector should only pull data from these pre-summarized views. 
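+
+For example, when a scope should only see aggregated data, the backing database can expose a pre-summarized view and the connector can be pointed at that view instead of the raw table. A sketch is shown below; the view name is hypothetical, the participant table is reused from the earlier semantic-types example, and the view is defined in the backing store rather than through Data Connect:
+
+```
+CREATE VIEW participant_blood_group_counts AS
+SELECT blood_type, count(*) AS participant_count
+FROM pgpc.public.participant
+GROUP BY blood_type;
+```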
+ + +## Implementing a Federation of SQL Query Nodes + +* Two approaches: “foreign data wrappers” and “fan-out/hub-and-spoke” +* Foreign data wrappers: + * Many SQL engines support user-provided connectors to external data: + * PostgreSQL: foreign data wrappers + * Presto DB: Connectors + * Apache Hive: Deserializers (SerDe without a serialization support) + + +# Appendix A: SQL Grammar + +ANTLR grammar for Data Connect (based on Trino (formerly Trino v. 323, ASL 2.0 license), with the DML and DDL parts removed: + + +``` +grammar DataConnect; + +tokens { + DELIMITER +} + +singleStatement + : statement EOF + ; + +standaloneExpression + : expression EOF + ; + +standaloneType + : type EOF + ; + +statement + : query #statementDefault + | USE schema=identifier #use + | USE catalog=identifier '.' schema=identifier #use + | EXPLAIN ANALYZE? VERBOSE? + ('(' explainOption (',' explainOption)* ')')? statement #explain + | SHOW TABLES ((FROM | IN) qualifiedName)? + (LIKE pattern=string (ESCAPE escape=string)?)? #showTables + | SHOW SCHEMAS ((FROM | IN) identifier)? + (LIKE pattern=string (ESCAPE escape=string)?)? #showSchemas + | SHOW CATALOGS (LIKE pattern=string)? #showCatalogs + | SHOW COLUMNS (FROM | IN) qualifiedName #showColumns + | DESCRIBE qualifiedName #showColumns + | DESC qualifiedName #showColumns + | SHOW FUNCTIONS #showFunctions + ; + +query + : with? queryNoWith + ; + +with + : WITH RECURSIVE? namedQuery (',' namedQuery)* + ; + +queryNoWith: + queryTerm + (ORDER BY sortItem (',' sortItem)*)? + (OFFSET offset=INTEGER_VALUE (ROW | ROWS)?)? + ((LIMIT limit=(INTEGER_VALUE | ALL)) | (FETCH (FIRST | NEXT) (fetchFirst=INTEGER_VALUE)? (ROW | ROWS) (ONLY | WITH TIES)))? + ; + +queryTerm + : queryPrimary #queryTermDefault + | left=queryTerm operator=INTERSECT setQuantifier? right=queryTerm #setOperation + | left=queryTerm operator=(UNION | EXCEPT) setQuantifier? right=queryTerm #setOperation + ; + +queryPrimary + : querySpecification #queryPrimaryDefault + | TABLE qualifiedName #table + | VALUES expression (',' expression)* #inlineTable + | '(' queryNoWith ')' #subquery + ; + +sortItem + : expression ordering=(ASC | DESC)? (NULLS nullOrdering=(FIRST | LAST))? + ; + +querySpecification + : SELECT setQuantifier? selectItem (',' selectItem)* + (FROM relation (',' relation)*)? + (WHERE where=booleanExpression)? + (GROUP BY groupBy)? + (HAVING having=booleanExpression)? + ; + +groupBy + : setQuantifier? groupingElement (',' groupingElement)* + ; + +groupingElement + : groupingSet #singleGroupingSet + | ROLLUP '(' (expression (',' expression)*)? ')' #rollup + | CUBE '(' (expression (',' expression)*)? ')' #cube + | GROUPING SETS '(' groupingSet (',' groupingSet)* ')' #multipleGroupingSets + ; + +groupingSet + : '(' (expression (',' expression)*)? ')' + | expression + ; + +namedQuery + : name=identifier (columnAliases)? AS '(' query ')' + ; + +setQuantifier + : DISTINCT + | ALL + ; + +selectItem + : expression (AS? identifier)? #selectSingle + | primaryExpression '.' ASTERISK (AS columnAliases)? #selectAll + | ASTERISK #selectAll + ; + +relation + : left=relation + ( CROSS JOIN right=sampledRelation + | joinType JOIN rightRelation=relation joinCriteria + | NATURAL joinType JOIN right=sampledRelation + ) #joinRelation + | sampledRelation #relationDefault + ; + +joinType + : INNER? + | LEFT OUTER? + | RIGHT OUTER? + | FULL OUTER? 
+ ; + +joinCriteria + : ON booleanExpression + | USING '(' identifier (',' identifier)* ')' + ; + +sampledRelation + : aliasedRelation ( + TABLESAMPLE sampleType '(' percentage=expression ')' + )? + ; + +sampleType + : BERNOULLI + | SYSTEM + ; + +aliasedRelation + : relationPrimary (AS? identifier columnAliases?)? + ; + +columnAliases + : '(' identifier (',' identifier)* ')' + ; + +relationPrimary + : qualifiedName #tableName + | '(' query ')' #subqueryRelation + | UNNEST '(' expression (',' expression)* ')' (WITH ORDINALITY)? #unnest + | LATERAL '(' query ')' #lateral + | '(' relation ')' #parenthesizedRelation + ; + +expression + : booleanExpression + ; + +booleanExpression + : valueExpression predicate[$valueExpression.ctx]? #predicated + | NOT booleanExpression #logicalNot + | left=booleanExpression operator=AND right=booleanExpression #logicalBinary + | left=booleanExpression operator=OR right=booleanExpression #logicalBinary + ; + +// workaround for https://github.com/antlr/antlr4/issues/780 +predicate[ParserRuleContext value] + : comparisonOperator right=valueExpression #comparison + | comparisonOperator comparisonQuantifier '(' query ')' #quantifiedComparison + | NOT? BETWEEN lower=valueExpression AND upper=valueExpression #between + | NOT? IN '(' expression (',' expression)* ')' #inList + | NOT? IN '(' query ')' #inSubquery + | NOT? LIKE pattern=valueExpression (ESCAPE escape=valueExpression)? #like + | IS NOT? NULL #nullPredicate + | IS NOT? DISTINCT FROM right=valueExpression #distinctFrom + ; + +valueExpression + : primaryExpression #valueExpressionDefault + | valueExpression AT timeZoneSpecifier #atTimeZone + | operator=(MINUS | PLUS) valueExpression #arithmeticUnary + | left=valueExpression operator=(ASTERISK | SLASH | PERCENT) right=valueExpression #arithmeticBinary + | left=valueExpression operator=(PLUS | MINUS) right=valueExpression #arithmeticBinary + | left=valueExpression CONCAT right=valueExpression #concatenation + ; + +primaryExpression + : NULL #nullLiteral + | interval #intervalLiteral + | identifier string #typeConstructor + | DOUBLE PRECISION string #typeConstructor + | number #numericLiteral + | booleanValue #booleanLiteral + | string #stringLiteral + | BINARY_LITERAL #binaryLiteral + | '?' #parameter + | POSITION '(' valueExpression IN valueExpression ')' #position + | '(' expression (',' expression)+ ')' #rowConstructor + | ROW '(' expression (',' expression)* ')' #rowConstructor + | qualifiedName '(' ASTERISK ')' filter? over? #functionCall + | qualifiedName '(' (setQuantifier? expression (',' expression)*)? + (ORDER BY sortItem (',' sortItem)*)? ')' filter? (nullTreatment? over)? #functionCall + | identifier '->' expression #lambda + | '(' (identifier (',' identifier)*)? ')' '->' expression #lambda + | '(' query ')' #subqueryExpression + // This is an extension to ANSI SQL, which considers EXISTS to be a + | EXISTS '(' query ')' #exists + | CASE operand=expression whenClause+ (ELSE elseExpression=expression)? END #simpleCase + | CASE whenClause+ (ELSE elseExpression=expression)? END #searchedCase + | CAST '(' expression AS type ')' #cast + | TRY_CAST '(' expression AS type ')' #cast + | ARRAY '[' (expression (',' expression)*)? ']' #arrayConstructor + | value=primaryExpression '[' index=valueExpression ']' #subscript + | identifier #columnReference + | base=primaryExpression '.' fieldName=identifier #dereference + | name=CURRENT_DATE #specialDateTimeFunction + | name=CURRENT_TIME ('(' precision=INTEGER_VALUE ')')? 
#specialDateTimeFunction + | name=CURRENT_TIMESTAMP ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=LOCALTIME ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=LOCALTIMESTAMP ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=CURRENT_USER #currentUser + | name=CURRENT_PATH #currentPath + | SUBSTRING '(' valueExpression FROM valueExpression (FOR valueExpression)? ')' #substring + | NORMALIZE '(' valueExpression (',' normalForm)? ')' #normalize + | EXTRACT '(' identifier FROM valueExpression ')' #extract + | '(' expression ')' #parenthesizedExpression + | GROUPING '(' (qualifiedName (',' qualifiedName)*)? ')' #groupingOperation + ; + +nullTreatment + : IGNORE NULLS + | RESPECT NULLS + ; + +string + : STRING #basicStringLiteral + | UNICODE_STRING (UESCAPE STRING)? #unicodeStringLiteral + ; + +timeZoneSpecifier + : TIME ZONE interval #timeZoneInterval + | TIME ZONE string #timeZoneString + ; + +comparisonOperator + : EQ | NEQ | LT | LTE | GT | GTE + ; + +comparisonQuantifier + : ALL | SOME | ANY + ; + +booleanValue + : TRUE | FALSE + ; + +interval + : INTERVAL sign=(PLUS | MINUS)? string from=intervalField (TO to=intervalField)? + ; + +intervalField + : YEAR | MONTH | DAY | HOUR | MINUTE | SECOND + ; + +normalForm + : NFD | NFC | NFKD | NFKC + ; + +type + : ROW '(' rowField (',' rowField)* ')' #rowType + | INTERVAL from=intervalField (TO to=intervalField)? #intervalType + | base=TIMESTAMP ('(' precision = INTEGER_VALUE ')')? (WITHOUT TIME ZONE)? #dateTimeType + | base=TIMESTAMP ('(' precision = INTEGER_VALUE ')')? WITH TIME ZONE #dateTimeType + | base=TIME ('(' precision = INTEGER_VALUE ')')? (WITHOUT TIME ZONE)? #dateTimeType + | base=TIME ('(' precision = INTEGER_VALUE ')')? WITH TIME ZONE #dateTimeType + | DOUBLE PRECISION #doublePrecisionType + | ARRAY '<' type '>' #legacyArrayType + | MAP '<' keyType=type ',' valueType=type '>' #legacyMapType + | type ARRAY ('[' INTEGER_VALUE ']')? #arrayType + | identifier ('(' typeParameter (',' typeParameter)* ')')? #genericType + ; + +rowField + : identifier? type; + +typeParameter + : INTEGER_VALUE | type + ; + +whenClause + : WHEN condition=expression THEN result=expression + ; + +filter + : FILTER '(' WHERE booleanExpression ')' + ; + +over + : OVER '(' + (PARTITION BY partition+=expression (',' partition+=expression)*)? + (ORDER BY sortItem (',' sortItem)*)? + windowFrame? + ')' + ; + +windowFrame + : frameType=RANGE start=frameBound + | frameType=ROWS start=frameBound + | frameType=RANGE BETWEEN start=frameBound AND end=frameBound + | frameType=ROWS BETWEEN start=frameBound AND end=frameBound + ; + +frameBound + : UNBOUNDED boundType=PRECEDING #unboundedFrame + | UNBOUNDED boundType=FOLLOWING #unboundedFrame + | CURRENT ROW #currentRowBound + | expression boundType=(PRECEDING | FOLLOWING) #boundedFrame + ; + +explainOption + : FORMAT value=(TEXT | GRAPHVIZ | JSON) #explainFormat + | TYPE value=(LOGICAL | DISTRIBUTED | VALIDATE | IO) #explainType + ; + +qualifiedName + : identifier ('.' identifier)* + ; + +identifier + : IDENTIFIER #unquotedIdentifier + | QUOTED_IDENTIFIER #quotedIdentifier + | nonReserved #unquotedIdentifier + | BACKQUOTED_IDENTIFIER #backQuotedIdentifier + | DIGIT_IDENTIFIER #digitIdentifier + ; + +number + : MINUS? DECIMAL_VALUE #decimalLiteral + | MINUS? DOUBLE_VALUE #doubleLiteral + | MINUS? INTEGER_VALUE #integerLiteral + ; + +nonReserved + // IMPORTANT: this rule must only contain tokens. Nested rules are not supported. 
See SqlParser.exitNonReserved + : ADD | ADMIN | ALL | ANALYZE | ANY | ARRAY | ASC | AT + | BERNOULLI + | CALL | CASCADE | CATALOGS | COLUMN | COLUMNS | COMMENT | COMMIT | COMMITTED | CURRENT + | DATA | DATE | DAY | DEFINER | DESC | DISTRIBUTED | DOUBLE + | EXCLUDING | EXPLAIN + | FETCH | FILTER | FIRST | FOLLOWING | FORMAT | FUNCTIONS + | GRANT | GRANTED | GRANTS | GRAPHVIZ + | HOUR + | IF | IGNORE | INCLUDING | INPUT | INTERVAL | INVOKER | IO | ISOLATION + | JSON + | LAST | LATERAL | LEVEL | LIMIT | LOGICAL + | MAP | MINUTE | MONTH + | NEXT | NFC | NFD | NFKC | NFKD | NO | NONE | NULLIF | NULLS + | OFFSET | ONLY | OPTION | ORDINALITY | OUTPUT | OVER + | PARTITION | PARTITIONS | PATH | POSITION | PRECEDING | PRECISION | PRIVILEGES | PROPERTIES + | RANGE | READ | RENAME | REPEATABLE | REPLACE | RESET | RESPECT | RESTRICT | REVOKE | ROLE | ROLES | ROLLBACK | ROW | ROWS + | SCHEMA | SCHEMAS | SECOND | SECURITY | SERIALIZABLE | SESSION | SET | SETS + | SHOW | SOME | START | STATS | SUBSTRING | SYSTEM + | TABLES | TABLESAMPLE | TEXT | TIES | TIME | TIMESTAMP | TO | TRANSACTION | TRY_CAST | TYPE + | UNBOUNDED | UNCOMMITTED | USE | USER + | VALIDATE | VERBOSE | VIEW + | WITHOUT | WORK | WRITE + | YEAR + | ZONE + ; + +ADD: 'ADD'; +ADMIN: 'ADMIN'; +ALL: 'ALL'; +ALTER: 'ALTER'; +ANALYZE: 'ANALYZE'; +AND: 'AND'; +ANY: 'ANY'; +ARRAY: 'ARRAY'; +AS: 'AS'; +ASC: 'ASC'; +AT: 'AT'; +BERNOULLI: 'BERNOULLI'; +BETWEEN: 'BETWEEN'; +BY: 'BY'; +CALL: 'CALL'; +CASCADE: 'CASCADE'; +CASE: 'CASE'; +CAST: 'CAST'; +CATALOGS: 'CATALOGS'; +COLUMN: 'COLUMN'; +COLUMNS: 'COLUMNS'; +COMMENT: 'COMMENT'; +COMMIT: 'COMMIT'; +COMMITTED: 'COMMITTED'; +CONSTRAINT: 'CONSTRAINT'; +CREATE: 'CREATE'; +CROSS: 'CROSS'; +CUBE: 'CUBE'; +CURRENT: 'CURRENT'; +CURRENT_DATE: 'CURRENT_DATE'; +CURRENT_PATH: 'CURRENT_PATH'; +CURRENT_ROLE: 'CURRENT_ROLE'; +CURRENT_TIME: 'CURRENT_TIME'; +CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; +CURRENT_USER: 'CURRENT_USER'; +DATA: 'DATA'; +DATE: 'DATE'; +DAY: 'DAY'; +DEALLOCATE: 'DEALLOCATE'; +DEFINER: 'DEFINER'; +DELETE: 'DELETE'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DISTINCT: 'DISTINCT'; +DISTRIBUTED: 'DISTRIBUTED'; +DOUBLE: 'DOUBLE'; +DROP: 'DROP'; +ELSE: 'ELSE'; +END: 'END'; +ESCAPE: 'ESCAPE'; +EXCEPT: 'EXCEPT'; +EXCLUDING: 'EXCLUDING'; +EXECUTE: 'EXECUTE'; +EXISTS: 'EXISTS'; +EXPLAIN: 'EXPLAIN'; +EXTRACT: 'EXTRACT'; +FALSE: 'FALSE'; +FETCH: 'FETCH'; +FILTER: 'FILTER'; +FIRST: 'FIRST'; +FOLLOWING: 'FOLLOWING'; +FOR: 'FOR'; +FORMAT: 'FORMAT'; +FROM: 'FROM'; +FULL: 'FULL'; +FUNCTIONS: 'FUNCTIONS'; +GRANT: 'GRANT'; +GRANTED: 'GRANTED'; +GRANTS: 'GRANTS'; +GRAPHVIZ: 'GRAPHVIZ'; +GROUP: 'GROUP'; +GROUPING: 'GROUPING'; +HAVING: 'HAVING'; +HOUR: 'HOUR'; +IF: 'IF'; +IGNORE: 'IGNORE'; +IN: 'IN'; +INCLUDING: 'INCLUDING'; +INNER: 'INNER'; +INPUT: 'INPUT'; +INSERT: 'INSERT'; +INTERSECT: 'INTERSECT'; +INTERVAL: 'INTERVAL'; +INTO: 'INTO'; +INVOKER: 'INVOKER'; +IO: 'IO'; +IS: 'IS'; +ISOLATION: 'ISOLATION'; +JSON: 'JSON'; +JOIN: 'JOIN'; +LAST: 'LAST'; +LATERAL: 'LATERAL'; +LEFT: 'LEFT'; +LEVEL: 'LEVEL'; +LIKE: 'LIKE'; +LIMIT: 'LIMIT'; +LOCALTIME: 'LOCALTIME'; +LOCALTIMESTAMP: 'LOCALTIMESTAMP'; +LOGICAL: 'LOGICAL'; +MAP: 'MAP'; +MINUTE: 'MINUTE'; +MONTH: 'MONTH'; +NATURAL: 'NATURAL'; +NEXT: 'NEXT'; +NFC : 'NFC'; +NFD : 'NFD'; +NFKC : 'NFKC'; +NFKD : 'NFKD'; +NO: 'NO'; +NONE: 'NONE'; +NORMALIZE: 'NORMALIZE'; +NOT: 'NOT'; +NULL: 'NULL'; +NULLIF: 'NULLIF'; +NULLS: 'NULLS'; +OFFSET: 'OFFSET'; +ON: 'ON'; +ONLY: 'ONLY'; +OPTION: 'OPTION'; +OR: 'OR'; +ORDER: 'ORDER'; +ORDINALITY: 'ORDINALITY'; +OUTER: 'OUTER'; +OUTPUT: 'OUTPUT'; 
+OVER: 'OVER'; +PARTITION: 'PARTITION'; +PARTITIONS: 'PARTITIONS'; +PATH: 'PATH'; +POSITION: 'POSITION'; +PRECEDING: 'PRECEDING'; +PREPARE: 'PREPARE'; +PRIVILEGES: 'PRIVILEGES'; +PRECISION: 'PRECISION'; +PROPERTIES: 'PROPERTIES'; +RANGE: 'RANGE'; +READ: 'READ'; +RECURSIVE: 'RECURSIVE'; +RENAME: 'RENAME'; +REPEATABLE: 'REPEATABLE'; +REPLACE: 'REPLACE'; +RESET: 'RESET'; +RESPECT: 'RESPECT'; +RESTRICT: 'RESTRICT'; +REVOKE: 'REVOKE'; +RIGHT: 'RIGHT'; +ROLE: 'ROLE'; +ROLES: 'ROLES'; +ROLLBACK: 'ROLLBACK'; +ROLLUP: 'ROLLUP'; +ROW: 'ROW'; +ROWS: 'ROWS'; +SCHEMA: 'SCHEMA'; +SCHEMAS: 'SCHEMAS'; +SECOND: 'SECOND'; +SECURITY: 'SECURITY'; +SELECT: 'SELECT'; +SERIALIZABLE: 'SERIALIZABLE'; +SESSION: 'SESSION'; +SET: 'SET'; +SETS: 'SETS'; +SHOW: 'SHOW'; +SOME: 'SOME'; +START: 'START'; +STATS: 'STATS'; +SUBSTRING: 'SUBSTRING'; +SYSTEM: 'SYSTEM'; +TABLE: 'TABLE'; +TABLES: 'TABLES'; +TABLESAMPLE: 'TABLESAMPLE'; +TEXT: 'TEXT'; +THEN: 'THEN'; +TIES: 'TIES'; +TIME: 'TIME'; +TIMESTAMP: 'TIMESTAMP'; +TO: 'TO'; +TRANSACTION: 'TRANSACTION'; +TRUE: 'TRUE'; +TRY_CAST: 'TRY_CAST'; +TYPE: 'TYPE'; +UESCAPE: 'UESCAPE'; +UNBOUNDED: 'UNBOUNDED'; +UNCOMMITTED: 'UNCOMMITTED'; +UNION: 'UNION'; +UNNEST: 'UNNEST'; +USE: 'USE'; +USER: 'USER'; +USING: 'USING'; +VALIDATE: 'VALIDATE'; +VALUES: 'VALUES'; +VERBOSE: 'VERBOSE'; +VIEW: 'VIEW'; +WHEN: 'WHEN'; +WHERE: 'WHERE'; +WITH: 'WITH'; +WITHOUT: 'WITHOUT'; +WORK: 'WORK'; +WRITE: 'WRITE'; +YEAR: 'YEAR'; +ZONE: 'ZONE'; + +EQ : '='; +NEQ : '<>' | '!='; +LT : '<'; +LTE : '<='; +GT : '>'; +GTE : '>='; + +PLUS: '+'; +MINUS: '-'; +ASTERISK: '*'; +SLASH: '/'; +PERCENT: '%'; +CONCAT: '||'; + +STRING + : '\'' ( ~'\'' | '\'\'' )* '\'' + ; + +UNICODE_STRING + : 'U&\'' ( ~'\'' | '\'\'' )* '\'' + ; + +// Note: we allow any character inside the binary literal and validate +// its a correct literal when the AST is being constructed. This +// allows us to provide more meaningful error messages to the user +BINARY_LITERAL + : 'X\'' (~'\'')* '\'' + ; + +INTEGER_VALUE + : DIGIT+ + ; + +DECIMAL_VALUE + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + +DOUBLE_VALUE + : DIGIT+ ('.' DIGIT*)? EXPONENT + | '.' DIGIT+ EXPONENT + ; + +IDENTIFIER + : (LETTER | '_') (LETTER | DIGIT | '_' | '@' | ':')* + ; + +DIGIT_IDENTIFIER + : DIGIT (LETTER | DIGIT | '_' | '@' | ':')+ + ; + +QUOTED_IDENTIFIER + : '"' ( ~'"' | '""' )* '"' + ; + +BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + +fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; + +fragment DIGIT + : [0-9] + ; + +fragment LETTER + : [A-Z] + ; + +SIMPLE_COMMENT + : '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN) + ; + +BRACKETED_COMMENT + : '/*' .*? '*/' -> channel(HIDDEN) + ; + +WS + : [ \r\n\t]+ -> channel(HIDDEN) + ; + +// Catch-all for anything we can't recognize. +// We use this to be able to ignore and recover all the text +// when splitting statements with DelimiterLexer +UNRECOGNIZED + : . + ; +``` diff --git a/USECASES.md b/USECASES.md index 7650b810..a5a9605a 100644 --- a/USECASES.md +++ b/USECASES.md @@ -1,117 +1,117 @@ -# Introduction -This document attempts to summarize search use cases collected from key driver -projects and other interested parties. 
The information was collected from -three primary sources: - -* A questionnaire that collected answers to a series of curated questions (2 - responses) - -* A feature matrix spreadsheet that indexed various high level features against - interested driver projects (6 driver projects commented on at least one - feature in the matrix) - -* A "suggested queries" document that collected high level queries that are of - interest (6 groups replied to this document providing 19 queries) - -If you contributed to one of these sources and feel that your feedback was -either missed or misrepresented here, or contributed via another channel that -was not captured, please reach out to us at ga4gh-discovery-search@ga4gh.org. - -# Key themes -## Multimodal queries -In nearly all cases, the responses indicate a desire to link variant data with -phenotypic or clinical data. Only a few responses indicate a desire to query -over simple variant data (retrieving counts and allele/genotype frequency for a -variant) and even then it is a step along a path to a patient or cohort. - -All responses to the feature matrix rated this as "must have". - -See the section on data types below for more information. - -## Controlled Access -Most responses indicate some kind of requirement for controlled access, -including tiers of access ranging from public to tightly controlled by per -sample consent terms. Although configuration of access control is outside the -scope of the search API, considerations may need to be made for expressing -these concepts in queries and responses. - -## Complex Matching -In some cases (positional data for example) exact or simple fuzzy matching is -desired but for phenotypic matching needs to be performed using custom rules -and/or based on distances within an ontology hierarchy. - -Other potentially interesting matching/filtering modes mentioned include: -* returning results for only de novo variants -* matching against mutational burden -* filtering for rare diseases using gnomAD -* “abnormal” RNA expression of a particular gene -* generating a confusion matrix between two groups of individuals - -"Custom functions" were identified as "nice to have" (4) and "don't know" (2). - -Ben Hutten notes some thoughts on the topic of matching in the [MME -repository](https://github.com/ga4gh/mme-apis/wiki/Phenotype-matching). - -## Aggregation, Grouping and Sorting -Responses were mixed on the topic of aggregation. Responses on the topic of -"Aggregate functions (eg. minimum, maximum, average) range from "not needed" -(1), "nice to have" (3) to "must have" (2). There is a division here around -whether this sort of operation should be performed at query time or after the -results are generated. One important point of consideration is that in some -cases data may be available to a particular user only in aggregate meaning that -it must be performed as part of the query. - -The closely related feature of grouping (for example, counting the number of -variants grouped by gene) was rated as "not needed" (3), "nice to have" (1) and -"must have" (2) by respondents. - -Sorting, or the related concept of ranking, was rated as either "not needed" -(2) or "must have" (2) by respondents. - -## Data types -The search API is not expected to define data types, but it will be important -that some shared vocabulary are available to facilitate federated queries. 
- -All responses in the feature matrix indicated that the availability of a -standard set of fields is "nice to have" or "must have", while the ability to -have arbitrary non-standard fields is a "must have". - -Data type mentioned in the responses include: - -* Variant -* Age -* Sex -* HPO terms -* Contact information -* Method of inheritance -* Consent types (eg, to find patients with data consented for a particular use) -* Various associated clinical data and/or assessments - -In terms of how to represent these types in the API, there are a few -suggestions: - -* Schema Blocks from GA4GH -* schema.org from W3C and the related DCAT project -* DATS (a joint collaboration funded by NIH) - -# Caveats - -## Small number of responses -This summary was produced from a relatively small amount of response data, and -the individual responses were often collected against an evolving document. In -particular, the feature matrix doubled in size between some responses, and the -questionnaire was changed completely between the two responses. - -## Lack of clarity in feature definitions -The items expressed in the feature matrix are not fully defined, leaving their -interpretation up to the reader. In some cases, respondents added comments to -try to clarify how they interpreted the feature. - -# Open Questions -If you have an answer to one of these questions, please either file an issue, -open a PR against this document or reach out to the list at -ga4gh-discovery-search@ga4gh.org. - -* How exactly are fuzzy matches for phenotypes being performed today? For - example: this term and up the tree, this term and down the tree, other well - defined functions or something completely custom? +# Introduction +This document attempts to summarize a variety of use cases collected from key driver +projects and other interested parties. The information was collected from +three primary sources: + +* A questionnaire that collected answers to a series of curated questions (2 + responses) + +* A feature matrix spreadsheet that indexed various high level features against + interested driver projects (6 driver projects commented on at least one + feature in the matrix) + +* A "suggested queries" document that collected high level queries that are of + interest (6 groups replied to this document providing 19 queries) + +If you contributed to one of these sources and feel that your feedback was +either missed or misrepresented here, or contributed via another channel that +was not captured, please reach out to us at ga4gh-discovery-search@ga4gh.org. + +# Key themes +## Multimodal queries +In nearly all cases, the responses indicate a desire to link variant data with +phenotypic or clinical data. Only a few responses indicate a desire to query +over simple variant data (retrieving counts and allele/genotype frequency for a +variant) and even then it is a step along a path to a patient or cohort. + +All responses to the feature matrix rated this as "must have". + +See the section on data types below for more information. + +## Controlled Access +Most responses indicate some kind of requirement for controlled access, +including tiers of access ranging from public to tightly controlled by per +sample consent terms. Although configuration of access control is outside the +scope of the Data Connect API, considerations may need to be made for expressing +these concepts in queries and responses. 
+ +## Complex Matching +In some cases (positional data for example) exact or simple fuzzy matching is +desired but for phenotypic matching needs to be performed using custom rules +and/or based on distances within an ontology hierarchy. + +Other potentially interesting matching/filtering modes mentioned include: +* returning results for only de novo variants +* matching against mutational burden +* filtering for rare diseases using gnomAD +* “abnormal” RNA expression of a particular gene +* generating a confusion matrix between two groups of individuals + +"Custom functions" were identified as "nice to have" (4) and "don't know" (2). + +Ben Hutten notes some thoughts on the topic of matching in the [MME +repository](https://github.com/ga4gh/mme-apis/wiki/Phenotype-matching). + +## Aggregation, Grouping and Sorting +Responses were mixed on the topic of aggregation. Responses on the topic of +"Aggregate functions (eg. minimum, maximum, average) range from "not needed" +(1), "nice to have" (3) to "must have" (2). There is a division here around +whether this sort of operation should be performed at query time or after the +results are generated. One important point of consideration is that in some +cases data may be available to a particular user only in aggregate meaning that +it must be performed as part of the query. + +The closely related feature of grouping (for example, counting the number of +variants grouped by gene) was rated as "not needed" (3), "nice to have" (1) and +"must have" (2) by respondents. + +Sorting, or the related concept of ranking, was rated as either "not needed" +(2) or "must have" (2) by respondents. + +## Data types +The Data Connect API is not expected to define data types, but it will be important +that some shared vocabulary are available to facilitate federated queries. + +All responses in the feature matrix indicated that the availability of a +standard set of fields is "nice to have" or "must have", while the ability to +have arbitrary non-standard fields is a "must have". + +Data types mentioned in the responses include: + +* Variant +* Age +* Sex +* HPO terms +* Contact information +* Method of inheritance +* Consent types (eg, to find patients with data consented for a particular use) +* Various associated clinical data and/or assessments + +In terms of how to represent these types in the API, there are a few +suggestions: + +* Schema Blocks from GA4GH +* schema.org from W3C and the related DCAT project +* DATS (a joint collaboration funded by NIH) + +# Caveats + +## Small number of responses +This summary was produced from a relatively small amount of response data, and +the individual responses were often collected against an evolving document. In +particular, the feature matrix doubled in size between some responses, and the +questionnaire was changed completely between the two responses. + +## Lack of clarity in feature definitions +The items expressed in the feature matrix are not fully defined, leaving their +interpretation up to the reader. In some cases, respondents added comments to +try to clarify how they interpreted the feature. + +# Open Questions +If you have an answer to one of these questions, please either file an issue, +open a PR against this document or reach out to the list at +ga4gh-discovery-search@ga4gh.org. + +* How exactly are fuzzy matches for phenotypes being performed today? For + example: this term and up the tree, this term and down the tree, other well + defined functions or something completely custom? 
diff --git a/assets/data-connect.png b/assets/data-connect.png new file mode 100644 index 00000000..f235b001 Binary files /dev/null and b/assets/data-connect.png differ diff --git a/assets/ga4gh-discovery-search.svg b/assets/ga4gh-discovery-search.svg deleted file mode 100644 index 9d9ec72f..00000000 --- a/assets/ga4gh-discovery-search.svg +++ /dev/null @@ -1,2 +0,0 @@ - -
[Text labels from the deleted "Search Architecture" diagram: Applications (Beacon, Matchmaker, Data Explorers, ...) · Search / Search API · Datastores (Object Store, File System, Elastic Search, GraphQL, SQL, FHIR, DICOM, ...)]
\ No newline at end of file diff --git a/assets/ga4gh-discovery-search.xml b/assets/ga4gh-discovery-search.xml deleted file mode 100644 index 4c4836d1..00000000 --- a/assets/ga4gh-discovery-search.xml +++ /dev/null @@ -1 +0,0 @@ -7VvbcuI4EP0aHidlyzbYjyH3qqSSHXZ2Z/ZN2AI7ERYliwDz9dsCGV9kiEPCLWMeEqvVskWf0622WrSsi9HshuNx+MACQlvICGYt67KFkIfa8FcK5kuB43lLwZBHwVJkZoJe9JsooaGkkyggSUFRMEZFNC4KfRbHxBcFGeacTYtqA0aLTx3jIdEEPR9TXfpvFIhQSU3DyDpuSTQM1aNdR3X0sf8y5GwSq+e1kDVYfJbdI5zeS+knIQ7YNCeyrlrWBWdMLK9GswtCpWlTsy3HXa/pXc2bk1jUGYD6NnFN5PUNFNhuZ/BN3eEV04myRY9g7ocgO4d/kQBzTzhRsxfz1GLJNBpRHEOrG4oRBaEJlzCABvd4ziZyNokA66Stbsh49JvFAktlAwTQzYXiAjIKGj05UqktrEuCdFBqQNmguE9odwXBBaOMQ1fMFvNKBGcvJBUCMsbis+pJkZYTH0SU5jQVhiCH6VzjUUQlsf8hPMAxVmI1c9NQ7aoHYRoNY5D5AA+Bzu4r4SIC3p2rjlEUBNKkXR3IFBkYQWY5kQL2hrAREXwOKmmvq0imnLCjmtOM0W7qcGGOzCgdh5UXDVe3zpgEF4pMNYmFNGKdj8cUvruIWJzU5NMGiigK6HZbS/HahkS64TpeheG8T7AbteL/bm/m1OE/5r9/PN/8JI/WN+RqxusSDMFPM1vmHNJcU+mvvTH2Ze8UInXJmh/wnb7r2E4933F94vtl3/kbh2xUdh1U5SJr0HoDZLQRUattnDkF50BuKsihXAWyZe0KZE8D+QELPxzhF7BDA/RWQKMjBNoyNKAvscAguZqNwZqEQzhsU7lG9gH49lCsjNMQ4L0EsB3v+Aigr4VnZ2cawmAFUV7+8tgowPIgKFH9NKOKN8UEqx6Wu4DOcYr5i6XDZlelL7tahU1Pg4gE8NqgmoyLkA1ZjOlVJu1m0nvGxgrHZyLEXJkPTwQrogz24vOf0vxAVNX8pdBYNC5nhdb8XUgFOAlXYYPEwbl8XcqoA5LriKbZlFR+wgJIFC9GIGOR1U/46+oWS5tIQ9THXekkbMJ9ssHiKu2BpG9IxCZkjGoicUIhvXwtzqyKF4uhYAk8zymMWRSLJHfnJynIhRZUCixm6V2rpI8268PFcgYZQ1df5QNZhbEf0v7B5Fvz8nBc5LPSLZm9ks9syLdj8q1ZQvdLPtPbTL439HdEPtQs10dKWusYSPtWxHSQfYCIaX0F0pJZJHI3h9av9Llwnd1aNuY57n45otunQPT2QYhuN0T/QkTvnALROzsmumZmFcwrimknUO1QRvI+ofihnvWd+ALHQ/jSWSZYUZRquxW3L9ekMJU+gQXpyt2qpITjp4QoveiSlUKf7jQM97Yh2/Zd0h/U2ZANMHEHe9+QXZPdpS+idqfohFXbsabp6BT4jP1YbbK2BrLci08E4+Q0ipKpI7k1/XR1huHT90f1+tVj/1mezkBGT9rzcB7jor7VbtfyGIe4gb1vj1mTJqb5mVlA2tGB3mv1wtTLV5CwyJjemyeCjBqYt4PZ1EqVB0daPwx0RSE2Rr4EuzqTacCuBbZelz442EgDe3G276/7BuPtMLa00vPBMa54IWnw3RZf/WjBwfHVs9nr27vvDcDbAew47rEB3NEAvry7eHxoEN4ytQYXtnIf+6jQtvT86888J7QZxE7pnJDTrthR2OtJIevkqzhfdaPaq2bSce1Tm4c4q2HtqSBTj3sF4tTh3kq5moi75F56cPtN8rmHIJ9VPnthvPOshrEP8jkN+bYkX93I91HyfWxBbB8Tvtsum6fEi/RHBvsJSnoU8UrlCbOUay3npUZl7HpvdCsf0nk7um3Uf290g2b2W82levZ7WOvqfw== \ No newline at end of file diff --git a/assets/phenopacket-tables-in-a-bucket-example.svg b/assets/phenopacket-tables-in-a-bucket-example.svg new file mode 100644 index 00000000..3e5a9e95 --- /dev/null +++ b/assets/phenopacket-tables-in-a-bucket-example.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/hugo/.gitignore b/hugo/.gitignore new file mode 100644 index 00000000..aa6272c8 --- /dev/null +++ b/hugo/.gitignore @@ -0,0 +1,3 @@ +public/ +resources/ +.DS_Store \ No newline at end of file diff --git a/hugo/archetypes/default.md b/hugo/archetypes/default.md new file mode 100644 index 00000000..00e77bd7 --- /dev/null +++ b/hugo/archetypes/default.md @@ -0,0 +1,6 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +date: {{ .Date }} +draft: true +--- + diff --git a/hugo/config.toml b/hugo/config.toml new file mode 100644 index 00000000..ada687fd --- /dev/null +++ b/hugo/config.toml @@ -0,0 +1,75 @@ +baseURL = "http://example.org/data-connect/" +languageCode = "en-us" +title = "Data Connect" +theme = "clyde" +pygmentsUseClasses=true +publishDir = "docs" +relativeURLs = true +canonifyURLs = true + + +[markup] + [markup.goldmark] + [markup.goldmark.extensions] + definitionList = true + footnote = true + linkify = true + strikethrough = true + table = true + taskList = true + typographer = true + [markup.goldmark.parser] + attribute = true + autoHeadingID = true + autoHeadingIDType = "github" + 
[markup.goldmark.renderer] + hardWraps = false + unsafe = false + xhtml = false + +[params] + company = "GA4GH Discovery Workstream" + #colors + secondary = "#1E1F21" + sidebar_bg = "#f6f6f8" + sidebar_primary = "#1E1F21" + primary = "#fff" + accent = "#e34a3a" + grey200 = "#F9F9FA" + grey600 = "#1E1F21" + grey_head= "#EFEFF3" + + logo = "/images/logo.png" + + home = "/docs/getting-started/introduction" + + show_title = true + + heading_font = "Poppins" + body_font = "Roboto" + code_font = "Source Code Pro" + + +[outputs] + section = ["JSON", "HTML"] + home = ["JSON", "HTML"] + +[menu] + [[menu.main]] + identifier = "api" + pre = "" + name = "API" + url = "/api" + weight = -999 + [[menu.main]] + identifier = "specs" + pre = "" + name = "Specification" + url = "https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md" + weight = -130 + [[menu.main]] + identifier = "github" + pre = "" + name = "Github" + url = "https://github.com/ga4gh-discovery/data-connect/" + weight = -120 diff --git a/hugo/content/docs/getting-started/_index.md b/hugo/content/docs/getting-started/_index.md new file mode 100644 index 00000000..dac8dc65 --- /dev/null +++ b/hugo/content/docs/getting-started/_index.md @@ -0,0 +1,6 @@ +--- +title: "Quick Start" +icon: "icon-layers" +type : "category" +weight: 1 +--- diff --git a/hugo/content/docs/getting-started/clients.md b/hugo/content/docs/getting-started/clients.md new file mode 100644 index 00000000..02e947af --- /dev/null +++ b/hugo/content/docs/getting-started/clients.md @@ -0,0 +1,51 @@ +--- +title: "Install Clients" +weight: 2 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: two-col +--- +{row-divider} +#### Installing Client Libraries +Data Connect has client libraries for R and Python, as well as a command-line interface. We’ll be using these client libraries in the following examples. +{divider} +{{}} +{{< tabs tabTotal="3" tabID="1" tabName1="Python" tabName2="R" tabName3="CLI">}} +{{% tab tabNum="1" %}} +```bash +# Installing the client library form PyPi +pip install search-python-client +# Installing from Github +pip install git+https://github.com/DNAstack/search-python-client --no-cache-dir +``` +{{% /tab %}} + +{{% tab tabNum="2" %}} +```R +# Setup devtools +dir.create(path = Sys.getenv("R_LIBS_USER"), showWarnings = FALSE, recursive = TRUE) +install.packages("devtools", lib = Sys.getenv("R_LIBS_USER"), repos = "https://cran.rstudio.com/") +``` +``` R +# installing the R client +devtools::install_github("DNAstack/ga4gh-search-client-r") +``` +{{% /tab %}} + +{{% tab tabNum="3" %}} +**This CLI requires Java 11+ on your system** +``` bash +curl https://storage.googleapis.com/ga4gh-search-cli/tables-cli-2.1-55-gc484f8b-executable.jar > search-cli +chmod +x search-cli +mv search-cli /usr/local/bin # (somewhere on your path) +search-cli --version +``` +You should see: +``` bash +tables-api-cli Version : 1.0-0.2.1-55-gc484f8b +``` +{{% /tab %}} +{{< /tabs >}} +{{}} \ No newline at end of file diff --git a/hugo/content/docs/getting-started/consume-data.md b/hugo/content/docs/getting-started/consume-data.md new file mode 100644 index 00000000..21152849 --- /dev/null +++ b/hugo/content/docs/getting-started/consume-data.md @@ -0,0 +1,427 @@ +--- +title: "Consume Data" +weight: 4 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: two-col +description: This section provides information about setting up Data Connect to expose data. 
+--- +{row-divider} +#### Browsing +At minimum, Data Connect implementations support browsing by table. This means [these operations](/api/#tag/tables) from the API specs are supported for table by table browsing. + +On the right is example code to browse [the tables-in-a-bucket](/docs/getting-started/provision-data/#tables-in-a-bucket-example) implementation of Data Connect. +{divider} +{{< tabs tabTotal="3" tabID="2" tabName1="Python" tabName2="R" tabName3="CLI">}} +{{% tab tabNum="1" %}} +[Follow along in Colab](https://colab.research.google.com/drive/1NytWLzQFWwGc3pqTaL0HD81S5B3zznLj?usp=sharing) +``` python +# init search client +from search_python_client.search import DrsClient, SearchClient +base_url_tiab = 'https://storage.googleapis.com/ga4gh-tables-example/' +search_client_tiab = SearchClient(base_url=base_url_tiab) +``` +``` python +# get tables +tables_iterator = search_client_tiab.get_table_list() +tables = [next(tables_iterator, None) for i in range(10)] +tables = list(filter(None, tables)) +print(tables) +``` +``` python +# get table info +table_name = tables[0]['name'] +table_info = search_client_tiab.get_table_info(table_name) +print(table_info) +``` +``` python +# get table data +table_name = tables[0]['name'] +table_data_iterator = search_client_tiab.get_table_data(table_name) +table_data = [next(table_data_iterator, None) for i in range(10)] +table_data = list(filter(None, table_data)) +print(table_data) +``` +{{% /tab %}} +{{% tab tabNum="2" %}} +``` +Under construction +https://colab.research.google.com/drive/1VOP2IcPjsX4U-DfuiTs7Tr0SVlAD0IMh?usp=sharing <= doesn't work right now. +``` +{{% /tab %}} +{{% tab tabNum="3" %}} +Get list of tables +``` bash +search-cli list --api-url https://storage.googleapis.com/ga4gh-tables-example +``` +``` bash +search-cli info subjects --api-url https://storage.googleapis.com/ga4gh-tables-example +``` +``` bash +search-cli data subjects --api-url https://storage.googleapis.com/ga4gh-tables-example +``` +{{% /tab %}} + +{{< /tabs >}} + +{row-divider} +#### Queries + +Data Connect supports query operation through SQL statements. + +Data Connect's SQL dialect has a familiar interface inspired by current major open source database platforms, including Trino, PostgreSQL, MySQL, and BigQuery. If you have prior experience with these database platforms, you'll feel right at home with only minor adjustments. + +[Supported SQL functions](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#sql-functions) + +[Supported SQL grammar](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#appendix-a-sql-grammar) + +{divider} +{{}} +{{< tabs tabTotal="2" tabID="float" tabName1="Example #1" tabName2="Example #2">}} +{{% tab tabNum="1" %}} +This query returns all female patients from the `patient` table. +``` SQL +/* you can scroll on this tab */ +SELECT * +FROM kidsfirst.ga4gh_tables.patient +WHERE Json_extract_scalar(patient, '$.gender') = 'female' +LIMIT 5; +``` +{{% /tab %}} + +{{% tab tabNum="2" %}} + +This query returns all conditions observed in female patients from the `patient` table. 
+``` SQL +/* you can scroll on this tab */ +SELECT Json_extract_scalar(ncpi_disease, '$.code.text') AS disease, + Json_extract_scalar(ncpi_disease, '$.identifier[0].value') AS identifier +FROM kidsfirst.ga4gh_tables.ncpi_disease disease + INNER JOIN kidsfirst.ga4gh_tables.patient patient + ON patient.id = REPLACE(Json_extract_scalar(ncpi_disease, + '$.subject.reference'), + 'Patient/') +WHERE Json_extract_scalar(patient, '$.gender') = 'female' +LIMIT 5; +``` +{{% /tab %}} +{{< /tabs >}} +{{}} + +{row-divider} + +#### Issuing Queries Using Data Connect + +Data Connect can be accessed through the straightforward HTTP calls described in its OpenAPI specification. + +While Data Connect API can be navigated using programs like cURL or Postman, it is best accessed programmatically. The results could be split into multiple pages, which is easier to navigate with programmatic access. + +Fetch each page only once. Data Connect servers are allowed to "forget" page URLs after you fetch them. This allows the server implementations to be more efficient. + +On the right, we provide examples to consume data from Data Connect using the GA4GH Commandline Interface, the R client, Python, and cURL. + +> [Need help installing client libraries?](/docs/getting-started/clients/) + +{divider} +{{}} +{{< tabs tabTotal="4" tabID="queries" tabName1="Python" tabName2="R" tabName3="CLI" tabName4="cURL">}} + +{{% tab tabNum="1" %}} +[Follow Along in Google Colab](https://colab.research.google.com/drive/1efGB5O68_dtMgyqCeIjLG8ezMzDBBQj9?usp=sharing) +```bash +# Installing the client library form PyPi +pip install search-python-client +# Installing from Github +pip install git+https://github.com/DNAstack/search-python-client --no-cache-dir +``` +```python +# Building the query +from search_python_client.search import DrsClient, SearchClient +base_url = 'https://search-presto-public.staging.dnastack.com' +search_client = SearchClient(base_url=base_url) +query = """ +SELECT Json_extract_scalar(ncpi_disease, '$.code.text') AS disease, + Json_extract_scalar(ncpi_disease, '$.identifier[0].value') AS identifier +FROM kidsfirst.ga4gh_tables.ncpi_disease disease + INNER JOIN kidsfirst.ga4gh_tables.patient patient + ON patient.id = REPLACE(Json_extract_scalar(ncpi_disease, + '$.subject.reference'), + 'Patient/') +WHERE Json_extract_scalar(patient, '$.gender') = 'female' +LIMIT 5 +""" +``` +```python +# Executing the query +table_data_iterator = search_client.search_table(query) +for item in table_data_iterator: + print(item) +``` +```python +# Results +{'disease': 'Aortic atresia', 'identifier': 'Condition|SD_PREASA7S|272|Aortic atresia|None'} +{'disease': 'Mitral atresia', 'identifier': 'Condition|SD_PREASA7S|272|Mitral atresia|None'} +{'disease': 'Hypoplasia ascending aorta', 'identifier': 'Condition|SD_PREASA7S|272|Hypoplasia ascending aorta|None'} +{'disease': 'Hypoplastic left heart syndrome', 'identifier': 'Condition|SD_PREASA7S|272|Hypoplastic left heart syndrome|None'} +{'disease': 'Hypoplastic left ventricle (subnormal cavity volume)', 'identifier': 'Condition|SD_PREASA7S|272|Hypoplastic left ventricle (subnormal cavity volume)|None'} +``` +{{% /tab %}} + +{{% tab tabNum="2" %}} +[Follow Along in Google Colab](https://colab.research.google.com/drive/1Y6r1772AW-FWZ1OrOutNoDOvca8Osz3z?usp=sharing) +```R +# installing devtools +dir.create(path = Sys.getenv("R_LIBS_USER"), showWarnings = FALSE, recursive = TRUE) +install.packages("devtools", lib = Sys.getenv("R_LIBS_USER"), repos = "https://cran.rstudio.com/") +``` +```R +# 
installing the R client +devtools::install_github("DNAstack/ga4gh-search-client-r") +``` +```R +# Making the request +library(httr) +conditionsInFemalePatients <- ga4gh.search::ga4gh_search("https://search-presto-public.staging.dnastack.com", "select json_extract_scalar(ncpi_disease, '$.code.text') as disease, json_extract_scalar(ncpi_disease, '$.identifier[0].value') as identifier from kidsfirst.ga4gh_tables.ncpi_disease disease INNER JOIN kidsfirst.ga4gh_tables.patient patient ON patient.id=replace(json_extract_scalar(ncpi_disease, '$.subject.reference'), 'Patient/') WHERE json_extract_scalar(patient, '$.gender')='female' limit 5") +``` +```R +# View the results +print(conditionsInFemalePatients) +``` + +Output: +``` bash + disease +1 Aortic atresia +2 Mitral atresia +3 Hypoplasia ascending aorta +4 Hypoplastic left heart syndrome +5 Hypoplastic left ventricle (subnormal cavity volume) + identifier +1 Condition|SD_PREASA7S|272|Aortic atresia|None +2 Condition|SD_PREASA7S|272|Mitral atresia|None +3 Condition|SD_PREASA7S|272|Hypoplasia ascending aorta|None +4 Condition|SD_PREASA7S|272|Hypoplastic left heart syndrome|None +5 Condition|SD_PREASA7S|272|Hypoplastic left ventricle (subnormal cavity volume)|None +``` +{{% /tab %}} + + +{{% tab tabNum="3" %}} + +``` bash +search-cli query -q "select json_extract_scalar(ncpi_disease, '$.code.text') as disease, json_extract_scalar(ncpi_disease, '$.identifier[0].value') as identifier from kidsfirst.ga4gh_tables.ncpi_disease disease INNER JOIN kidsfirst.ga4gh_tables.patient patient ON patient.id=replace(json_extract_scalar(ncpi_disease, '$.subject.reference'), 'Patient/') WHERE json_extract_scalar(patient, '$.gender')='female' limit 5" --api-url https://search-presto-public.staging.dnastack.com +``` +{{% /tab %}} +{{% tab tabNum="4" %}} +These requests +This query returns all female patients from the `patient` table. +``` bash +curl --request POST \ + --url https://search-presto-public.staging.dnastack.com/search \ + --header 'content-type: application/json' \ + --data '{ "query": "select * from kidsfirst.ga4gh_tables.patient WHERE json_extract_scalar(patient, '\''$.gender'\'')='\''female'\'' limit 5"}' +``` + +This query returns all conditions observed in female patients from the `patient` table. +``` bash +curl --request POST \ + --url https://search-presto-public.staging.dnastack.com/search \ + --header 'content-type: application/json' \ + --data '{ "query": "select json_extract_scalar(ncpi_disease, '\''$.code.text'\'') as disease, json_extract_scalar(ncpi_disease, '\''$.identifier[0].value'\'') as identifier from kidsfirst.ga4gh_tables.ncpi_disease disease INNER JOIN kidsfirst.ga4gh_tables.patient patient ON patient.id=replace(json_extract_scalar(ncpi_disease, '\''$.subject.reference'\''), '\''Patient/'\'') WHERE json_extract_scalar(patient, '\''$.gender'\'')='\''female'\'' limit 5"}' +``` +{{% /tab %}} +{{< /tabs >}} +{{}} + +{row-divider} +#### More Examples +##### dbGaP GECCO Example +This is a public implementation of Data Connect. Feel free to follow along with the examples and explore this endpoint with your own script. 
+{{< tabs tabTotal="3" tabID="3" tabName1="Python" tabName2="R" tabName3="CLI">}} +{{% tab tabNum="1" %}} +[Follow along in Colab](https://colab.research.google.com/drive/1f_BZibUx3nWdaJXkgcoW5WqwxnLDgzzY?usp=sharing) +``` python +# init search client +from search_python_client.search import DrsClient, SearchClient +base_url = 'https://search-presto-public.prod.dnastack.com/' +search_client = SearchClient(base_url=base_url) +``` +``` python +# Find available tables +tables_iterator = search_client.get_table_list() +tables = list(tables_iterator) +import pprint +pprint.pprint(tables) +``` +```python +#Get more information about a table returned +table_info = search_client.get_table_info("dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi") +pprint.pprint(table_info) +``` +```python +# Dig into the table a little further +table_data_iterator = search_client.get_table_data("dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi") +``` +```python +# Limit to first 10 items +tables = [next(table_data_iterator, None) for i in range(10)] +tables = list(filter(None, tables)) +pprint.pprint(tables) +``` +``` python +# Select all items from the CPS-II study +query = """ +SELECT * +FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi +WHERE study = 'CPS-II' +LIMIT 5 +""" +``` +``` python +# Executing the query +table_data_iterator = search_client.search_table(query) +for item in table_data_iterator: + print(item) +``` +{{% /tab %}} +{{% tab tabNum="2" %}} +[Follow along in Colab](https://colab.research.google.com/drive/1X7EZ71v29iFnxbHjsc-9_0Om41xEw32m?usp=sharing) +``` R +# installing devtools +dir.create(path = Sys.getenv("R_LIBS_USER"), showWarnings = FALSE, recursive = TRUE) +install.packages("devtools", lib = Sys.getenv("R_LIBS_USER"), repos = "https://cran.rstudio.com/") +``` +``` R +# installing the R client +devtools::install_github("DNAstack/ga4gh-search-client-r") +``` +``` R +# Making the request +library(httr) +ga4gh.search::ga4gh_list_tables("https://search-presto-public.prod.dnastack.com") +``` +``` R +# Select all items from the CPS-II study +query <- "SELECT * FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi WHERE study = 'CPS-II' LIMIT 5" +``` +``` R +# Executing the query +ga4gh.search::ga4gh_search("https://search-presto-public.prod.dnastack.com", query) +``` +{{% /tab %}} +{{% tab tabNum="3" %}} +List tables +``` bash +search-cli list --api-url "https://search-presto-public.prod.dnastack.com" +``` +Get table info +``` bash +search-cli info dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi --api-url "https://search-presto-public.prod.dnastack.com" +``` +Now run a query and pipe the results to a file called `results.txt` +``` bash +search-cli query -q "SELECT * FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi WHERE study = 'CPS-II' LIMIT 5" \ + --api-url "https://search-presto-public.prod.dnastack.com" > results.txt +``` +{{% /tab %}} + +{{< /tabs >}} + +{divider} + +--- + +##### COVID Cloud Example +This is a public implementation of Data Connect for COVID Cloud. Find more about COVID Cloud [here](https://international.covidcloud.ca/). 
+{{< tabs tabTotal="3" tabID="4" tabName1="Python" tabName2="R" tabName3="CLI">}} +{{% tab tabNum="1" %}} +[Follow along in Colab](https://colab.research.google.com/drive/1hnHTqquYP2HjUF0dDHn8FKiO9f7t0yGO?usp=sharing) +``` python +from search_python_client.search import DrsClient, SearchClient +base_url = 'https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com/' +search_client = SearchClient(base_url=base_url) +``` +```python +# Find available tables +tables_iterator = search_client.get_table_list() +tables = list(tables_iterator) +import pprint +pprint.pprint(tables) +``` +```python +# Get more information about a table returned +table_info = search_client.get_table_info("covid.cloud.sequences") +pprint.pprint(table_info) +``` +```python +# Dig into the table a little further +table_data_iterator = search_client.get_table_data("covid.cloud.sequences") +# Limit to first 10 items +tables = [next(table_data_iterator, None) for i in range(1)] +tables = list(filter(None, tables)) +pprint.pprint(tables) +``` +```python +# Select all sequences from GenBank +query = """ +SELECT * +FROM covid.cloud.sequences +WHERE sequence_type='GenBank' +LIMIT 25 +""" +``` +```python +table_data_iterator = search_client.search_table(query) +for item in table_data_iterator: + print(item) +``` +{{% /tab %}} +{{% tab tabNum="2" %}} +[Follow along in Colab](https://colab.research.google.com/drive/1FCpiUIHSOS-qewaw5efF8T_SipR5DSZR?usp=sharing) +``` R +# installing devtools +dir.create(path = Sys.getenv("R_LIBS_USER"), showWarnings = FALSE, recursive = TRUE) +install.packages("devtools", lib = Sys.getenv("R_LIBS_USER"), repos = "https://cran.rstudio.com/") +``` +``` R +# installing the R client +devtools::install_github("DNAstack/ga4gh-search-client-r") +``` +``` R +# Making the request +library(httr) +ga4gh.search::ga4gh_list_tables("https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com") +``` +``` R +# Select all data from Genbank. +query <- "SELECT * FROM covid.cloud.sequences WHERE sequence_type='GenBank' LIMIT 25" +``` +``` R +# Executing the query +ga4gh.search::ga4gh_search("https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com", query) + +``` +{{% /tab %}} +{{% tab tabNum="3" %}} +List tables +``` bash +search-cli list --api-url "https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com" +``` +Get table info +``` bash +search-cli info covid.cloud.sequences --api-url "https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com" +``` +Now run a query and pipe the results to a file called `results.txt` +``` bash +search-cli query -q "SELECT * FROM covid.cloud.sequences WHERE sequence_type='GenBank' LIMIT 25" \ + --api-url "https://ga4gh-search-adapter-presto-covid19-public.prod.dnastack.com" > results.txt +``` +{{% /tab %}} + +{{< /tabs >}} + +# + + diff --git a/hugo/content/docs/getting-started/introduction.md b/hugo/content/docs/getting-started/introduction.md new file mode 100644 index 00000000..c835e753 --- /dev/null +++ b/hugo/content/docs/getting-started/introduction.md @@ -0,0 +1,97 @@ +--- +title: "Introduction" +weight: 1 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: two-col +--- +{row-divider} + +Data Connect is a standard for discovery and search of biomedical data, developed by the [Discovery Work Stream](https://github.com/ga4gh-discovery/ga4gh-discovery.github.io) of the [Global Alliance for Genomics & Health](http://ga4gh.org). 
+ +The standard provides a mechanism for: + +- Describing data and its data model. + - Data Connect's _Table API_ component provides a way to organize data into "Tables" and describe their data model, leveraging the JSON Schema standard. +- Searching the data with the given data model. + - Data Connect's _Search API_ component provides a way to query "Tables" of data, leveraging the SQL standard. + +It is **not** in the scope of the standard to: + +- Define said data models. + - Data Connect relies on other efforts in GA4GH (e.g. [GA4GH SchemaBlocks](https://schemablocks.org/)), as well as outside implementers. + +### Background + +GA4GH has previously developed two standards for discovery. `Beacon` is a standard for discovery of genomic variants, while `Matchmaker` is a standard for discovery of subjects with certain genomic and phenotypic features. Implementations of these standards have been linked into federated networks (e.g. [Beacon Network](https://beacon-network.org/) and [Matchmaker Exchange](http://matchmakerexchange.org)). + +Both standards (and the corresponding networks) have been successful in their own right, but had a lot in common. It was acknowledged that it would be broadly useful to develop standards that abstract common infrastructure for building searchable, federated networks for a variety of applications in genomics and health. + +Data Connect, formerly known as _GA4GH Search_, is this general-purpose middleware for building federated, search-based applications. The name of the API reflects its purpose of: + +- Giving data providers a mechanism to enable others to connect to their data via the described data models. +- Allowing data consumers to make connections within the data through a flexible query language. + +### Benefits + +- **Interoperable**. Simple, interoperable, uniform mechanism to publish, discover, and search biomedical data. +- **Flexible**. Works with any data that can be serialized as an array of JSON objects. Recommends the use of [GA4GH SchemaBlocks](https://schemablocks.org/) data models, but allows custodians to specify their own data models to make their data available without extensive ETL transformations. +- **Supports federation**. Serves as a general-purpose framework for building federatable search-based applications across multiple implementations. Federations reference common schemas and properties. +- **Minimal by design**. The API is purposely kept minimal so that the barriers to publishing existing data are as small as possible. +- **Backend agnostic**. It is possible to implement the API across a large variety of backend datastores. +- **General purpose**. Admits use cases that have not yet been thought of. + +### Use cases + +Data Connect is an intentionally general-purpose middleware meant to enable the development of a diverse ecosystem of applications. + +The community has built versions of the following applications on top of Data Connect: + +- Data Explorers +- Beacons +- Patient matchmaking +- Jupyter notebooks +- R data frames +- Command line query tools +- Data and metadata indexers +- Data federations +- Concept cross-references + +We're looking forward to seeing things we haven’t yet imagined! + +The community has also connected data through the following data sources: + +- FHIR +- Relational databases +- CSV/TSV files with data dictionaries +- VCF+TBI files +- Phenopackets +- Google BigQuery +- Google Sheets +- and more! 
+ +Examples of queries on the data that can be answered via Data Connect include: + +- Find subjects with HP:0001519 and candidate gene FBN1 (use case of [Matchmaker Exchange](https://www.matchmakerexchange.org/)) +- Find male subjects with HP:0009726 consented for General Research Use (use case of [European Genome-phenome Archive](https://ega-archive.org/)) +- Find adult males diagnosed with autism having a harmful mutation in SHANK1 (use case of [Autism Sharing Initiative](http://autismsharinginitiative.org)) +- Find dataset from subject on European data center hosted on Amazon (use case of [Cloud Work Stream](https://github.com/ga4gh/wiki/wiki)) + +Full summary of use cases can be found in [USECASES.md](USECASES.md). + +{divider} +{{}} +{{%content-textbox%}} +##### Quick Links +--- +[Specification](/api) + +[Installing Client Libraries](/docs/getting-started/clients/) + +[Publishing Data Examples](/docs/getting-started/provision-data/) + +[Data Consumption Examples](/docs/getting-started/consume-data/) +{{%/content-textbox%}} +{{}} \ No newline at end of file diff --git a/hugo/content/docs/getting-started/provision-data.md b/hugo/content/docs/getting-started/provision-data.md new file mode 100644 index 00000000..903153a1 --- /dev/null +++ b/hugo/content/docs/getting-started/provision-data.md @@ -0,0 +1,132 @@ +--- +title: "Provision Data" +weight: 3 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: two-col +--- + +{row-divider} +#### Implementation + +Data Connect requires [table operations](/api/#tag/tables) to be implemented to specification for basic discovery and browsing. + +Optional but not required, [query operations](/api/#tag/search) may be implemented to support querying with SQL. + +The Data Connect API is backend agnostic, which means any solution that implements the [API specification](/api) is valid. You can use your favorite backend web application framework to implement Data Connect Endpoints or any HTTPS file server (a cloud blob store, for example) for a tables-in-a-bucket implementation requiring no code. + +Checkout the following examples for some inspiration. +{divider} +{{}} +{{%content-textbox%}} +##### Quick Links +--- +[Full API Specifications](/api) + +[Example Use Cases](/docs/use-exisitng-data/) +{{%/content-textbox%}} +{{}} +{row-divider} +#### Tables-in-a-bucket example +The specification allows for a no-code implementation as a collection of files served statically. This is the easiest way to start experimenting with Data Connect. As long as your storage bucket conforms to the correct file structure and it has the correct sharing permissions, it is a valid Data Connect implementation. + +A concrete example implementation is [available here](https://storage.googleapis.com/ga4gh-tables-example/tables) and [try browsing this implementation](/docs/getting-started/consume-data/#browsing) with these commands. + +{divider} +{{}} +{{%content-textbox%}} +Here's how you'll need to organize your folders +- ```tables```: served in response to ```GET /tables``` +- ```table/{table_name}/info```: served in response to ```GET /table/{table_name}/info```. e.g. a table with the name ```mytable``` should have a corresponding file ```table/mytable/info``` +- ```table/{table_name}/data```: served in response to ```GET /table/{table_name}/data```. e.g. 
a table with the name ```mytable``` should have a corresponding file ```table/mytable/data``` +- ```table/{table_name}/data_{pageNumber}```, which will be linked in the next_page_url of the first table (e.g. ```mytable```). +- ```table/{table_name}/data_models/{schemaFile}```: Though not required, data models may be linked via [$ref](https://json-schema.org/latest/json-schema-core.html#rfc.section.8.3). Data models can also be stored as static JSON documents, and be referred to by relative or absolute URLs. +{{%/content-textbox%}} +{{}} + +{row-divider} +#### Try a Reference Implementation + +This example was shown as a demo during the 2020 GA4GH Plenary. This app will run a reference Data Connect implementation on docker and use a Trino instance hosted by DNAstack as the data source. + +You’ll need docker set up on your system to run the Spring app, and you’ll need to have one of the client libraries installed from the [Installing Clients Section](/docs/getting-started/clients/). + +Further information about this example can be found [here](/docs/use-exisitng-data/using-preso/doc/). +{divider} +{{}} +{{< tabs tabTotal="1" tabID="2" tabName1="30 second quick start">}} +{{% tab tabNum="1" %}} +``` bash +docker pull postgres:latest +docker run -d --rm --network="host" --name dnastack-ga4gh-search-db -e POSTGRES_USER=ga4ghsearchadapterpresto -e POSTGRES_PASSWORD=ga4ghsearchadapterpresto postgres +docker pull dnastack/ga4gh-search-adapter-presto:latest +docker run --rm --name dnastack-ga4gh-search -p 8089:8089 -e PRESTO_DATASOURCE_URL=https://presto-public.prod.dnastack.com -e SPRING_PROFILES_ACTIVE=no-auth dnastack/ga4gh-search-adapter-presto:latest +``` +{{% /tab %}} +{{< /tabs >}} +{{}} + +{{}} +{{< tabs tabTotal="3" tabID="2" tabName1="Python" tabName2="R" tabName3="CLI">}} +{{% tab tabNum="1" %}} +``` Python +# init search client +from search_python_client.search import DrsClient, SearchClient +base_url = 'http://localhost:8089/' +search_client = SearchClient(base_url=base_url) +``` +``` python +# get tables +tables_iterator = search_client.get_table_list() +tables = [next(tables_iterator, None) for i in range(10)] +tables = list(filter(None, tables)) +print(tables) +``` +``` python +# get table info +table_name = "sample_phenopackets.ga4gh_tables.gecco_phenopackets" +table_info = search_client.get_table_info(table_name) +print(table_info) +``` +``` python +# get table data +table_name = "sample_phenopackets.ga4gh_tables.gecco_phenopackets" +table_data_iterator = search_client.get_table_data(table_name) +table_data = [next(table_data_iterator, None) for i in range(10)] +table_data = list(filter(None, table_data)) +print(table_data) +``` +{{% /tab %}} +{{% tab tabNum="2" %}} +``` R +# Fetch table list +library(httr) +tables <- ga4gh.search::ga4gh_list_tables("http://localhost:8089") +print(tables) +``` +``` R +# Try a query +search_result <- ga4gh.search::ga4gh_search("http://localhost:8089", "SELECT sample_phenopackets.ga4gh_tables.gecco_phenopackets") +print(tables) +``` +{{% /tab %}} +{{% tab tabNum="3" %}} +List tables +``` bash +search-cli list --api-url http://localhost:8089 +``` +Get table info +``` bash +search-cli info dbgap_demo.scr_gecco_susceptibility.sample_multi --api-url http://localhost:8089 +``` +Get table data +``` bash +search-cli data dbgap_demo.scr_gecco_susceptibility.sample_multi --api-url http://localhost:8089 +``` +{{% /tab %}} + +{{< /tabs >}} +{{}} + diff --git a/hugo/content/docs/reference/_index.md b/hugo/content/docs/reference/_index.md new file mode 100644 
index 00000000..e690d7d5 --- /dev/null +++ b/hugo/content/docs/reference/_index.md @@ -0,0 +1,6 @@ +--- +title: "Implementation" +icon: "icon-book" +type : "category" +weight: 9 +--- diff --git a/hugo/content/docs/reference/pagination-long-queries.md b/hugo/content/docs/reference/pagination-long-queries.md new file mode 100644 index 00000000..3fad03e3 --- /dev/null +++ b/hugo/content/docs/reference/pagination-long-queries.md @@ -0,0 +1,129 @@ +--- +title: "Pagination and Long Running Queries" +weight: 1 +draft: false +lastmod: 2020-12-3 +type: docs +layout: single-col +--- +**Pagination Sequence** + +A pagination sequence is the singly-linked list of URLs formed by following the `next_page_url` property of the pagination section of an initial `TableData` or `ListTablesResponse`. A pagination sequence begins at the first response returned from any request that yields a `TableData` or `ListTablesResponse`, and ends at the page in the sequence whose pagination property is omitted, whose `pagination.next_page_url` is omitted, or whose `pagination.next_page_url` is `null`. + +Servers **may** return a unique pagination sequence in response to successive requests for the same query, table data listing, or table listing. + +Except for the last page, `pagination.next_page_url property` **must** be either an absolute URL or a relative reference as defined by [RFC 3986 section 4.2](https://tools.ietf.org/html/rfc3986#section-4.2) whose base URL is the URL that the page containing the reference was fetched from. + +Every non-empty `TableData` page in a pagination sequence **must** include a `data_model` property. If present, the `data_model` property `must` be a valid JSON Schema. + +Across all `TableData` pages in the pagination sequence that have a `data_model` value, the `data_models` **must** be identical. Some `TableData` pages may lack a `data_model`. See the empty page rules below. + +Servers **may** respond with an `HTTP 4xx` error code if the same page is requested more than once. + +Due to both rules above, clients **must not** rely on the ability to re-fetch previously encountered pages. + +Servers **may** include a Retry-After HTTP header in each response that is part of a pagination sequence, and clients **must** respect the delay specified by such header before attempting to fetch the next page. + +**Empty TableData Pages** + +While many types of queries will be completed quickly, others will take minutes or even hours to yield a result. The simplest solution would be a synchronous design: query requests block until data is ready, then return a `TableData` response with the initial rows of the result set. However, asking clients to block for hours on a single HTTP response is fraught with difficulty: open connections are costly and fragile. If an intermediary times out the request, the results will be lost and the client must start over. + +To allow servers to direct clients to poll for results rather than hold open HTTP connections for long-running queries, the following special pagination rules apply to empty pages. + +An empty page is defined as a `TableData` object whose data property is a zero element array. + +A pagination sequence MAY include any number of empty pages anywhere in the sequence. + +An empty `TableData` page **may** omit its data_model property entirely. This allows servers to direct clients to poll for results before the result schema has been determined. + +A server that returns an empty page **should** include a `Retry-After` header in the HTTP response. 
If a client encounters an empty page with no `Retry-After` header, the client **should** delay at least 1 second before requesting the next page. + +**Example: Server returning empty pages to make client poll** + +This example illustrates a server returning a series of empty pages to a client while it is preparing the result set. The client polls for results by following `next_page_url` at the rate specified by the server. The form of the pagination URLs are only an example of one possible scheme. Servers are free to employ any pagination URL scheme. + +**Initial Request** + + +```json +POST /search +content-type: application/json + +{"query":"select distinct gene_symbol from example_project.brca_exchange.v32"} + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/statement/abc123/queued/1"}} +``` + + +**2nd request (Polling after sleeping for 1000ms)** + + +```json +GET /search/v1/statement/abc123/queued/1 + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/statement/abc123/queued/2"}} +``` + + +**3rd request (Polling again after sleeping for 1000ms)** + + +```json +GET /search/v1/statement/abc123/queued/2 + +HTTP/1.1 200 OK +content-type: application/json +retry-after: 1000 + +{"data":[],"pagination":{"next_page_url":"/search/v1/statement/abc123/executing/1"}} +``` + + +**4th request (Polling again after sleeping for 1000ms)** + + +```json +GET /search/v1/statement/abc123/executing/1 + +HTTP/1.1 200 OK +content-type: application/json + +{"data_model":{"description":"Automatically generated schema","$schema":"http://json-schema.org/draft-07/schema#","properties":{"gene_symbol":{"format":"varchar","type":"string"}}},"data":[{"gene_symbol":"BRCA2"},{"gene_symbol":"BRCA1"}],"pagination":{"next_page_url":"/search/v1/statement/abc123/executing/2"}} +``` + + +**Final request (no delay because page was nonempty and no retry-after header was present on the response)** + + +```json +GET /search/v1/statement/abc123/executing/2 + +HTTP/1.1 200 OK +content-type: application/json + +{"data_model":{"description":"Automatically generated schema","$schema":"http://json-schema.org/draft-07/schema#","properties":{"gene_symbol":{"format":"varchar","type":"string"}}},"data":[],"pagination":{}} +``` + + +**Example: Client algorithm for consuming TableData pages** + +The algorithm provided here simply illustrates one way to comply with the rules above. Any algorithm that satisfies all rules acceptable. + + + +1. Start with an empty data buffer and undefined data model. +2. Loop: + 1. If the response is an **error**, report the **error** and **abort** + 2. If no `data_model` has been seen so far, check if this page contains a `data_model`. If so, define the data model for the whole pagination sequence as this page’s `data_model`. + 3. Append the row data from the current page to the data buffer (there may be 0 rows on any given page) + 4. Delay for the time specified in the `Retry-After` HTTP response header for the current page (default is no delay) + 5. If there is a pagination object and it has a non-null `next_page_url`, fetch that URL, make that response the current page, and start back at step 2a; otherwise end. 
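+
+Below is a minimal Python sketch of the algorithm above. It is only one possible implementation, not a required one. It assumes the `requests` library and an illustrative local endpoint (`http://localhost:8089`), and it interprets `Retry-After` as a number of seconds, following standard HTTP semantics.
+
+``` python
+# Sketch of a pagination-consuming client (illustrative only).
+import time
+from urllib.parse import urljoin
+
+import requests
+
+# Hypothetical endpoint used for illustration only.
+BASE_URL = "http://localhost:8089"
+
+
+def run_query(sql):
+    """POST a query, then follow the pagination sequence to completion."""
+    response = requests.post(f"{BASE_URL}/search", json={"query": sql})
+    page_url = response.url
+    data_model, rows = None, []
+
+    while True:
+        response.raise_for_status()                 # abort on an error response
+        page = response.json()
+
+        # Adopt the first data_model seen in the sequence (empty pages may omit it).
+        if data_model is None and "data_model" in page:
+            data_model = page["data_model"]
+
+        rows.extend(page.get("data", []))           # a page may carry zero rows
+
+        # Respect Retry-After when present (read here as seconds); otherwise
+        # wait at least 1 second after an empty page before polling again.
+        retry_after = response.headers.get("Retry-After")
+        if retry_after is not None:
+            time.sleep(float(retry_after))
+        elif not page.get("data"):
+            time.sleep(1)
+
+        next_url = (page.get("pagination") or {}).get("next_page_url")
+        if not next_url:
+            return data_model, rows
+
+        # next_page_url may be a relative reference; resolve it against the
+        # URL this page was fetched from before requesting the next page.
+        page_url = urljoin(page_url, next_url)
+        response = requests.get(page_url)
+```
+
+Calling `run_query("SELECT ...")` returns the data model for the sequence (if any page supplied one) together with all buffered rows, regardless of how many empty pages the server interleaved while the query was running.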
+ diff --git a/hugo/content/docs/reference/sql-functions.md b/hugo/content/docs/reference/sql-functions.md new file mode 100644 index 00000000..19887516 --- /dev/null +++ b/hugo/content/docs/reference/sql-functions.md @@ -0,0 +1,190 @@ +--- +title: "SQL Functions" +weight: 1 +draft: false +lastmod: 2020-12-3 +type: docs +layout: single-col +--- +Data Connect's SQL dialect has been selected for compatibility with current major open source database platforms including Trino, PostgreSQL, and MySQL, and BigQuery. There are occasional name or signature differences, but a Data Connect implementation atop any of the major database platforms should be able to pass through queries that use the functions listed below with only minor tweaks. + +The functions below are a subset of those available in Trino 341. In a conformant Data Connect implementation, these functions must behave according to the Trino documentation. To assist with implementations directly on other database platforms, the [Trino Functions Support Matrix](https://docs.google.com/document/d/1y51qNuoe2ELX9kCOyQbFB4jihiKt2N8Qcd6-zzadIvk) captures the differences between platforms in granular detail. + +* **Logical Operators** + * `AND`, `OR`, `NOT` +* **Comparison Operators** + * `<`, `>`, `<=`, `>=`, `=`, `<>`, `!=` + * `BETWEEN, IS NULL, IS NOT NULL` + * `IS DISTINCT FROM`* + * `IS NOT DISTINCT FROM`* + * `GREATEST`, `LEAST` + * Quantified Comparison Predicates: `ALL`, `ANY` and `SOME`* + * Pattern Comparison: `LIKE` +* **Conditional Expressions** + * `CASE`, `IF`, `COALESCE`, `NULLIF` +* **Conversion Functions** + * `cast(value AS type)` → `type` + * `format(format, args...)` → `varchar` +* **Mathematical Functions** + * Most basic functions are supported across implementations. Notably missing are hyperbolic trig functions, infinity, floating point, and statistical/CDF functions. 
+ * `abs(x)` → [same as input] + * `ceil(x)` → [same as input] + * `ceiling(x)` → [same as input] + * `degrees(x)` → `double`* + * `exp(x)` → `double` + * `floor(x)` → [same as input] + * `ln(x)` → `double` + * `log(b, x)` → `double` + * `log10(x)` → `double` + * `mod(n, m)` → [same as input] + * `pi()` → `double` + * `pow(x, p)` → `double`* + * `power(x, p)` → `double` + * `radians(x)` → `double`* + * `round(x)` → [same as input] + * `round(x, d)` → [same as input] + * `sign(x)` → [same as input] + * `sqrt(x)` → `double` + * `truncate(x)` → `double`* + * Random Functions: + * `rand()` → `double`* + * `random()` → `double`* + * `random(n)` → [same as input]* + * `random(m, n)` → [same as input]* + * Trigonometric Functions: + * `acos(x)` → `double` + * `asin(x)` → `double` + * `atan(x)` → `double` + * `atan2(y, x)` → `double` + * `cos(x)` → `double` + * `sin(x)` → `double` + * `tan(x)` → `double` +* **Bitwise Functions** + * `bitwise_and(x, y)` → `bigint` + * `bitwise_or(x, y)` → `bigint` + * `bitwise_xor(x, y)` → `bigint` + * `bitwise_not(x)` → `bigint` + * `bitwise_left_shift(value, shift)` → [same as value] + * `bitwise_right_shift(value, shift, digits)` → [same as value] + * `bit_count(x, bits)` → `bigint`* +* **Regular Expression Functions** + * `regexp_extract_all(string, pattern)` -> `array(varchar)`* + * `regexp_extract_all(string, pattern, group)` -> `array(varchar)`* + * `regexp_extract(string, pattern)` → `varchar`* + * `regexp_extract(string, pattern, group)` → `varchar`* + * `regexp_like(string, pattern)` → `boolean`* + * `regexp_replace(string, pattern)` → `varchar`* + * `regexp_replace(string, pattern, replacement)` → `varchar`* + * `regexp_replace(string, pattern, function)` → `varchar`* +* **UUID Functions** + * `uuid()*` +* **Session Information Functions** + * `current_user`* +* **String manipulation** + * **Operators:** + * `Concatenation (||)`* + * `LIKE` + * **Functions:** + * `chr(n)` → `varchar`* + * `codepoint(string)` → `integer`* + * `format(format, args...)` → `varchar` + * `length(string)` → `bigint` + * `lower(string)` → `varchar` + * `lpad(string, size, padstring)` → `varchar` + * `ltrim(string)` → `varchar` + * `position(substring IN string)` → `bigint`* + * `replace(string, search, replace)` → `varchar` + * `reverse(string)` → `varchar` + * `rpad(string, size, padstring)` → `varchar` + * `rtrim(string)` → `varchar` + * `split(string, delimiter, limit)` -> `array(varchar)`* + * `starts_with(string, substring)` → `boolean`* + * `strpos(string, substring)` → `bigint`* + * `substr(string, start)` → `varchar`* + * `substring(string, start)` → `varchar` + * `substr(string, start, length)` → `varchar`* + * `substring(string, start, length)` → `varchar` + * `trim(string)` → `varchar` + * `upper(string)` → `varchar` +* **Date manipulation** +>Be aware of different quotation (`'`) syntax requirements between MySQL and PostgreSQL. BigQuery does not support the `+`/`-` operators for dates. Convenience methods could be replaced with `EXTRACT()`. 
+ +* **Operators:** + * `+`, `- *` + * `AT TIME ZONE`* +* **Functions:** + * `current_date` + * `current_time` + * `current_timestamp` + * `current_timestamp(p)`* + * `date(x)` → `date`* + * `date_trunc(unit, x)` → [same as input]* + * `date_add(unit, value, timestamp)` → [same as input]* + * `date_diff(unit, timestamp1, timestamp2)` → `bigint`* + * `extract(field FROM x)` → `bigint`* + * `from_unixtime(unixtime)` -> `timestamp(3)`* + * `from_unixtime(unixtime, zone)` → `timestamp(3) with time zone`* + * `from_unixtime(unixtime, hours, minutes)` → `timestamp(3) with time zone`* + * `Localtime`* + * `localtimestamp`* + * `localtimestamp(p)`* + * `now()` → `timestamp(3)` with time zone* + * `to_unixtime(timestamp)` → `double`* +* **MySQL-like date functions:** + * `date_format(timestamp, format)` → `varchar`* + * `date_parse(string, format)` → `timestamp(3)`* +* **Aggregate functions** +**Note that Trino provides a much larger superset of functions. Bitwise, map, and approximate aggregations are mostly absent. Only BigQuery has a few native approximate aggregation functions. + * `array_agg(x)` → `array<`[same as input]>* + * `avg(x)` → `double` + * `bool_and(boolean)` → `boolean`* + * `bool_or(boolean)` → `boolean`* + * `count(*)` → `bigint`* + * `count(x)` → `bigint` + * `count_if(x)` → `bigint`* + * `every(boolean)` → `boolean`* + * `max(x)` → [same as input] + * `max(x, n)` → `array<`[same as x]>* + * `min(x)` → [same as input] + * `min(x, n)` → `array<`[same as x]>* + * `sum(x)` → [same as input] + * **Statistical Aggregate Functions:** + * `corr(y, x)` → `double`* + * `covar_pop(y, x)`→ `double`* + * `covar_samp(y, x)` → `double`* + * `stddev(x)` → `double` + * `stddev_pop(x)` → `double` + * `stddev_samp(x)` → `double` + * `variance(x)` → `double` + * `var_pop(x)` → `double` + * `var_samp(x)` → `double` +* **Window functions** + * **Ranking Functions:** + * `cume_dist()` → `bigint` + * `dense_rank()` → `bigint` + * `ntile(n)` → `bigint` + * `percent_rank()` → `double` + * `rank()` → `bigint` + * `row_number()` → `bigint` + * **Value Functions:** + * `first_value(x)` → [same as input] + * `last_value(x)` → [same as input] + * `nth_value(x, offset)` → [same as input] + * `lead(x[, offset[, default_value]])` → [same as input] + * `lag(x[, offset[, default_value]])` → [same as input] +* **JSON functions** +In general, function signatures and behaviour differs across implementations for many JSON related functions. + * `json_array_length(json)` → `bigint`* + * `json_extract(json, json_path)` → `json`* + * `json_extract_scalar(json, json_path)` → varchar* + * `json_format(json)` → `varchar`* + * `json_size(json, json_path)` → `bigint`* +* Functions for working with nested and repeated data (`ROW` and `ARRAY`) +See also `UNNEST`, which is part of the SQL grammar and allows working with nested arrays as if they were rows in a joined table. 
+ +Note: Arrays are mostly absent in MySQL + * Array Subscript Operator: `[]` + * Array Concatenation Operator: `||` + * `concat(array1, array2, ..., arrayN)` → `array` + * `cardinality(x)` → `bigint`* +* `ga4gh_type` [described in the Data Connect specification](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#attaching-semantic-data-types-to-search-results) diff --git a/hugo/content/docs/reference/sql-grammar.md b/hugo/content/docs/reference/sql-grammar.md new file mode 100644 index 00000000..1f586df1 --- /dev/null +++ b/hugo/content/docs/reference/sql-grammar.md @@ -0,0 +1,672 @@ +--- +title: "SQL Grammar" +weight: 1 +draft: false +lastmod: 2020-12-3 +type: docs +layout: single-col +--- +This is the ANTLR grammar from Trino version 323 (ASL 2.0 license), with the DML and DDL parts removed. + + +```antlrv4 +grammar DataConnect; + +tokens { + DELIMITER +} + +singleStatement + : statement EOF + ; + +standaloneExpression + : expression EOF + ; + +standaloneType + : type EOF + ; + +statement + : query #statementDefault + | USE schema=identifier #use + | USE catalog=identifier '.' schema=identifier #use + | EXPLAIN ANALYZE? VERBOSE? + ('(' explainOption (',' explainOption)* ')')? statement #explain + | SHOW TABLES ((FROM | IN) qualifiedName)? + (LIKE pattern=string (ESCAPE escape=string)?)? #showTables + | SHOW SCHEMAS ((FROM | IN) identifier)? + (LIKE pattern=string (ESCAPE escape=string)?)? #showSchemas + | SHOW CATALOGS (LIKE pattern=string)? #showCatalogs + | SHOW COLUMNS (FROM | IN) qualifiedName #showColumns + | DESCRIBE qualifiedName #showColumns + | DESC qualifiedName #showColumns + | SHOW FUNCTIONS #showFunctions + ; + +query + : with? queryNoWith + ; + +with + : WITH RECURSIVE? namedQuery (',' namedQuery)* + ; + +queryNoWith: + queryTerm + (ORDER BY sortItem (',' sortItem)*)? + (OFFSET offset=INTEGER_VALUE (ROW | ROWS)?)? + ((LIMIT limit=(INTEGER_VALUE | ALL)) | (FETCH (FIRST | NEXT) (fetchFirst=INTEGER_VALUE)? (ROW | ROWS) (ONLY | WITH TIES)))? + ; + +queryTerm + : queryPrimary #queryTermDefault + | left=queryTerm operator=INTERSECT setQuantifier? right=queryTerm #setOperation + | left=queryTerm operator=(UNION | EXCEPT) setQuantifier? right=queryTerm #setOperation + ; + +queryPrimary + : querySpecification #queryPrimaryDefault + | TABLE qualifiedName #table + | VALUES expression (',' expression)* #inlineTable + | '(' queryNoWith ')' #subquery + ; + +sortItem + : expression ordering=(ASC | DESC)? (NULLS nullOrdering=(FIRST | LAST))? + ; + +querySpecification + : SELECT setQuantifier? selectItem (',' selectItem)* + (FROM relation (',' relation)*)? + (WHERE where=booleanExpression)? + (GROUP BY groupBy)? + (HAVING having=booleanExpression)? + ; + +groupBy + : setQuantifier? groupingElement (',' groupingElement)* + ; + +groupingElement + : groupingSet #singleGroupingSet + | ROLLUP '(' (expression (',' expression)*)? ')' #rollup + | CUBE '(' (expression (',' expression)*)? ')' #cube + | GROUPING SETS '(' groupingSet (',' groupingSet)* ')' #multipleGroupingSets + ; + +groupingSet + : '(' (expression (',' expression)*)? ')' + | expression + ; + +namedQuery + : name=identifier (columnAliases)? AS '(' query ')' + ; + +setQuantifier + : DISTINCT + | ALL + ; + +selectItem + : expression (AS? identifier)? #selectSingle + | primaryExpression '.' ASTERISK (AS columnAliases)? 
#selectAll + | ASTERISK #selectAll + ; + +relation + : left=relation + ( CROSS JOIN right=sampledRelation + | joinType JOIN rightRelation=relation joinCriteria + | NATURAL joinType JOIN right=sampledRelation + ) #joinRelation + | sampledRelation #relationDefault + ; + +joinType + : INNER? + | LEFT OUTER? + | RIGHT OUTER? + | FULL OUTER? + ; + +joinCriteria + : ON booleanExpression + | USING '(' identifier (',' identifier)* ')' + ; + +sampledRelation + : aliasedRelation ( + TABLESAMPLE sampleType '(' percentage=expression ')' + )? + ; + +sampleType + : BERNOULLI + | SYSTEM + ; + +aliasedRelation + : relationPrimary (AS? identifier columnAliases?)? + ; + +columnAliases + : '(' identifier (',' identifier)* ')' + ; + +relationPrimary + : qualifiedName #tableName + | '(' query ')' #subqueryRelation + | UNNEST '(' expression (',' expression)* ')' (WITH ORDINALITY)? #unnest + | LATERAL '(' query ')' #lateral + | '(' relation ')' #parenthesizedRelation + ; + +expression + : booleanExpression + ; + +booleanExpression + : valueExpression predicate[$valueExpression.ctx]? #predicated + | NOT booleanExpression #logicalNot + | left=booleanExpression operator=AND right=booleanExpression #logicalBinary + | left=booleanExpression operator=OR right=booleanExpression #logicalBinary + ; + +// workaround for https://github.com/antlr/antlr4/issues/780 +predicate[ParserRuleContext value] + : comparisonOperator right=valueExpression #comparison + | comparisonOperator comparisonQuantifier '(' query ')' #quantifiedComparison + | NOT? BETWEEN lower=valueExpression AND upper=valueExpression #between + | NOT? IN '(' expression (',' expression)* ')' #inList + | NOT? IN '(' query ')' #inSubquery + | NOT? LIKE pattern=valueExpression (ESCAPE escape=valueExpression)? #like + | IS NOT? NULL #nullPredicate + | IS NOT? DISTINCT FROM right=valueExpression #distinctFrom + ; + +valueExpression + : primaryExpression #valueExpressionDefault + | valueExpression AT timeZoneSpecifier #atTimeZone + | operator=(MINUS | PLUS) valueExpression #arithmeticUnary + | left=valueExpression operator=(ASTERISK | SLASH | PERCENT) right=valueExpression #arithmeticBinary + | left=valueExpression operator=(PLUS | MINUS) right=valueExpression #arithmeticBinary + | left=valueExpression CONCAT right=valueExpression #concatenation + ; + +primaryExpression + : NULL #nullLiteral + | interval #intervalLiteral + | identifier string #typeConstructor + | DOUBLE PRECISION string #typeConstructor + | number #numericLiteral + | booleanValue #booleanLiteral + | string #stringLiteral + | BINARY_LITERAL #binaryLiteral + | '?' #parameter + | POSITION '(' valueExpression IN valueExpression ')' #position + | '(' expression (',' expression)+ ')' #rowConstructor + | ROW '(' expression (',' expression)* ')' #rowConstructor + | qualifiedName '(' ASTERISK ')' filter? over? #functionCall + | qualifiedName '(' (setQuantifier? expression (',' expression)*)? + (ORDER BY sortItem (',' sortItem)*)? ')' filter? (nullTreatment? over)? #functionCall + | identifier '->' expression #lambda + | '(' (identifier (',' identifier)*)? ')' '->' expression #lambda + | '(' query ')' #subqueryExpression + // This is an extension to ANSI SQL, which considers EXISTS to be a + | EXISTS '(' query ')' #exists + | CASE operand=expression whenClause+ (ELSE elseExpression=expression)? END #simpleCase + | CASE whenClause+ (ELSE elseExpression=expression)? 
END #searchedCase + | CAST '(' expression AS type ')' #cast + | TRY_CAST '(' expression AS type ')' #cast + | ARRAY '[' (expression (',' expression)*)? ']' #arrayConstructor + | value=primaryExpression '[' index=valueExpression ']' #subscript + | identifier #columnReference + | base=primaryExpression '.' fieldName=identifier #dereference + | name=CURRENT_DATE #specialDateTimeFunction + | name=CURRENT_TIME ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=CURRENT_TIMESTAMP ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=LOCALTIME ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=LOCALTIMESTAMP ('(' precision=INTEGER_VALUE ')')? #specialDateTimeFunction + | name=CURRENT_USER #currentUser + | name=CURRENT_PATH #currentPath + | SUBSTRING '(' valueExpression FROM valueExpression (FOR valueExpression)? ')' #substring + | NORMALIZE '(' valueExpression (',' normalForm)? ')' #normalize + | EXTRACT '(' identifier FROM valueExpression ')' #extract + | '(' expression ')' #parenthesizedExpression + | GROUPING '(' (qualifiedName (',' qualifiedName)*)? ')' #groupingOperation + ; + +nullTreatment + : IGNORE NULLS + | RESPECT NULLS + ; + +string + : STRING #basicStringLiteral + | UNICODE_STRING (UESCAPE STRING)? #unicodeStringLiteral + ; + +timeZoneSpecifier + : TIME ZONE interval #timeZoneInterval + | TIME ZONE string #timeZoneString + ; + +comparisonOperator + : EQ | NEQ | LT | LTE | GT | GTE + ; + +comparisonQuantifier + : ALL | SOME | ANY + ; + +booleanValue + : TRUE | FALSE + ; + +interval + : INTERVAL sign=(PLUS | MINUS)? string from=intervalField (TO to=intervalField)? + ; + +intervalField + : YEAR | MONTH | DAY | HOUR | MINUTE | SECOND + ; + +normalForm + : NFD | NFC | NFKD | NFKC + ; + +type + : ROW '(' rowField (',' rowField)* ')' #rowType + | INTERVAL from=intervalField (TO to=intervalField)? #intervalType + | base=TIMESTAMP ('(' precision = INTEGER_VALUE ')')? (WITHOUT TIME ZONE)? #dateTimeType + | base=TIMESTAMP ('(' precision = INTEGER_VALUE ')')? WITH TIME ZONE #dateTimeType + | base=TIME ('(' precision = INTEGER_VALUE ')')? (WITHOUT TIME ZONE)? #dateTimeType + | base=TIME ('(' precision = INTEGER_VALUE ')')? WITH TIME ZONE #dateTimeType + | DOUBLE PRECISION #doublePrecisionType + | ARRAY '<' type '>' #legacyArrayType + | MAP '<' keyType=type ',' valueType=type '>' #legacyMapType + | type ARRAY ('[' INTEGER_VALUE ']')? #arrayType + | identifier ('(' typeParameter (',' typeParameter)* ')')? #genericType + ; + +rowField + : identifier? type; + +typeParameter + : INTEGER_VALUE | type + ; + +whenClause + : WHEN condition=expression THEN result=expression + ; + +filter + : FILTER '(' WHERE booleanExpression ')' + ; + +over + : OVER '(' + (PARTITION BY partition+=expression (',' partition+=expression)*)? + (ORDER BY sortItem (',' sortItem)*)? + windowFrame? + ')' + ; + +windowFrame + : frameType=RANGE start=frameBound + | frameType=ROWS start=frameBound + | frameType=RANGE BETWEEN start=frameBound AND end=frameBound + | frameType=ROWS BETWEEN start=frameBound AND end=frameBound + ; + +frameBound + : UNBOUNDED boundType=PRECEDING #unboundedFrame + | UNBOUNDED boundType=FOLLOWING #unboundedFrame + | CURRENT ROW #currentRowBound + | expression boundType=(PRECEDING | FOLLOWING) #boundedFrame + ; + +explainOption + : FORMAT value=(TEXT | GRAPHVIZ | JSON) #explainFormat + | TYPE value=(LOGICAL | DISTRIBUTED | VALIDATE | IO) #explainType + ; + +qualifiedName + : identifier ('.' 
identifier)* + ; + +identifier + : IDENTIFIER #unquotedIdentifier + | QUOTED_IDENTIFIER #quotedIdentifier + | nonReserved #unquotedIdentifier + | BACKQUOTED_IDENTIFIER #backQuotedIdentifier + | DIGIT_IDENTIFIER #digitIdentifier + ; + +number + : MINUS? DECIMAL_VALUE #decimalLiteral + | MINUS? DOUBLE_VALUE #doubleLiteral + | MINUS? INTEGER_VALUE #integerLiteral + ; + +nonReserved + // IMPORTANT: this rule must only contain tokens. Nested rules are not supported. See SqlParser.exitNonReserved + : ADD | ADMIN | ALL | ANALYZE | ANY | ARRAY | ASC | AT + | BERNOULLI + | CALL | CASCADE | CATALOGS | COLUMN | COLUMNS | COMMENT | COMMIT | COMMITTED | CURRENT + | DATA | DATE | DAY | DEFINER | DESC | DISTRIBUTED | DOUBLE + | EXCLUDING | EXPLAIN + | FETCH | FILTER | FIRST | FOLLOWING | FORMAT | FUNCTIONS + | GRANT | GRANTED | GRANTS | GRAPHVIZ + | HOUR + | IF | IGNORE | INCLUDING | INPUT | INTERVAL | INVOKER | IO | ISOLATION + | JSON + | LAST | LATERAL | LEVEL | LIMIT | LOGICAL + | MAP | MINUTE | MONTH + | NEXT | NFC | NFD | NFKC | NFKD | NO | NONE | NULLIF | NULLS + | OFFSET | ONLY | OPTION | ORDINALITY | OUTPUT | OVER + | PARTITION | PARTITIONS | PATH | POSITION | PRECEDING | PRECISION | PRIVILEGES | PROPERTIES + | RANGE | READ | RENAME | REPEATABLE | REPLACE | RESET | RESPECT | RESTRICT | REVOKE | ROLE | ROLES | ROLLBACK | ROW | ROWS + | SCHEMA | SCHEMAS | SECOND | SECURITY | SERIALIZABLE | SESSION | SET | SETS + | SHOW | SOME | START | STATS | SUBSTRING | SYSTEM + | TABLES | TABLESAMPLE | TEXT | TIES | TIME | TIMESTAMP | TO | TRANSACTION | TRY_CAST | TYPE + | UNBOUNDED | UNCOMMITTED | USE | USER + | VALIDATE | VERBOSE | VIEW + | WITHOUT | WORK | WRITE + | YEAR + | ZONE + ; + +ADD: 'ADD'; +ADMIN: 'ADMIN'; +ALL: 'ALL'; +ALTER: 'ALTER'; +ANALYZE: 'ANALYZE'; +AND: 'AND'; +ANY: 'ANY'; +ARRAY: 'ARRAY'; +AS: 'AS'; +ASC: 'ASC'; +AT: 'AT'; +BERNOULLI: 'BERNOULLI'; +BETWEEN: 'BETWEEN'; +BY: 'BY'; +CALL: 'CALL'; +CASCADE: 'CASCADE'; +CASE: 'CASE'; +CAST: 'CAST'; +CATALOGS: 'CATALOGS'; +COLUMN: 'COLUMN'; +COLUMNS: 'COLUMNS'; +COMMENT: 'COMMENT'; +COMMIT: 'COMMIT'; +COMMITTED: 'COMMITTED'; +CONSTRAINT: 'CONSTRAINT'; +CREATE: 'CREATE'; +CROSS: 'CROSS'; +CUBE: 'CUBE'; +CURRENT: 'CURRENT'; +CURRENT_DATE: 'CURRENT_DATE'; +CURRENT_PATH: 'CURRENT_PATH'; +CURRENT_ROLE: 'CURRENT_ROLE'; +CURRENT_TIME: 'CURRENT_TIME'; +CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'; +CURRENT_USER: 'CURRENT_USER'; +DATA: 'DATA'; +DATE: 'DATE'; +DAY: 'DAY'; +DEALLOCATE: 'DEALLOCATE'; +DEFINER: 'DEFINER'; +DELETE: 'DELETE'; +DESC: 'DESC'; +DESCRIBE: 'DESCRIBE'; +DISTINCT: 'DISTINCT'; +DISTRIBUTED: 'DISTRIBUTED'; +DOUBLE: 'DOUBLE'; +DROP: 'DROP'; +ELSE: 'ELSE'; +END: 'END'; +ESCAPE: 'ESCAPE'; +EXCEPT: 'EXCEPT'; +EXCLUDING: 'EXCLUDING'; +EXECUTE: 'EXECUTE'; +EXISTS: 'EXISTS'; +EXPLAIN: 'EXPLAIN'; +EXTRACT: 'EXTRACT'; +FALSE: 'FALSE'; +FETCH: 'FETCH'; +FILTER: 'FILTER'; +FIRST: 'FIRST'; +FOLLOWING: 'FOLLOWING'; +FOR: 'FOR'; +FORMAT: 'FORMAT'; +FROM: 'FROM'; +FULL: 'FULL'; +FUNCTIONS: 'FUNCTIONS'; +GRANT: 'GRANT'; +GRANTED: 'GRANTED'; +GRANTS: 'GRANTS'; +GRAPHVIZ: 'GRAPHVIZ'; +GROUP: 'GROUP'; +GROUPING: 'GROUPING'; +HAVING: 'HAVING'; +HOUR: 'HOUR'; +IF: 'IF'; +IGNORE: 'IGNORE'; +IN: 'IN'; +INCLUDING: 'INCLUDING'; +INNER: 'INNER'; +INPUT: 'INPUT'; +INSERT: 'INSERT'; +INTERSECT: 'INTERSECT'; +INTERVAL: 'INTERVAL'; +INTO: 'INTO'; +INVOKER: 'INVOKER'; +IO: 'IO'; +IS: 'IS'; +ISOLATION: 'ISOLATION'; +JSON: 'JSON'; +JOIN: 'JOIN'; +LAST: 'LAST'; +LATERAL: 'LATERAL'; +LEFT: 'LEFT'; +LEVEL: 'LEVEL'; +LIKE: 'LIKE'; +LIMIT: 'LIMIT'; +LOCALTIME: 'LOCALTIME'; 
+LOCALTIMESTAMP: 'LOCALTIMESTAMP'; +LOGICAL: 'LOGICAL'; +MAP: 'MAP'; +MINUTE: 'MINUTE'; +MONTH: 'MONTH'; +NATURAL: 'NATURAL'; +NEXT: 'NEXT'; +NFC : 'NFC'; +NFD : 'NFD'; +NFKC : 'NFKC'; +NFKD : 'NFKD'; +NO: 'NO'; +NONE: 'NONE'; +NORMALIZE: 'NORMALIZE'; +NOT: 'NOT'; +NULL: 'NULL'; +NULLIF: 'NULLIF'; +NULLS: 'NULLS'; +OFFSET: 'OFFSET'; +ON: 'ON'; +ONLY: 'ONLY'; +OPTION: 'OPTION'; +OR: 'OR'; +ORDER: 'ORDER'; +ORDINALITY: 'ORDINALITY'; +OUTER: 'OUTER'; +OUTPUT: 'OUTPUT'; +OVER: 'OVER'; +PARTITION: 'PARTITION'; +PARTITIONS: 'PARTITIONS'; +PATH: 'PATH'; +POSITION: 'POSITION'; +PRECEDING: 'PRECEDING'; +PREPARE: 'PREPARE'; +PRIVILEGES: 'PRIVILEGES'; +PRECISION: 'PRECISION'; +PROPERTIES: 'PROPERTIES'; +RANGE: 'RANGE'; +READ: 'READ'; +RECURSIVE: 'RECURSIVE'; +RENAME: 'RENAME'; +REPEATABLE: 'REPEATABLE'; +REPLACE: 'REPLACE'; +RESET: 'RESET'; +RESPECT: 'RESPECT'; +RESTRICT: 'RESTRICT'; +REVOKE: 'REVOKE'; +RIGHT: 'RIGHT'; +ROLE: 'ROLE'; +ROLES: 'ROLES'; +ROLLBACK: 'ROLLBACK'; +ROLLUP: 'ROLLUP'; +ROW: 'ROW'; +ROWS: 'ROWS'; +SCHEMA: 'SCHEMA'; +SCHEMAS: 'SCHEMAS'; +SECOND: 'SECOND'; +SECURITY: 'SECURITY'; +SELECT: 'SELECT'; +SERIALIZABLE: 'SERIALIZABLE'; +SESSION: 'SESSION'; +SET: 'SET'; +SETS: 'SETS'; +SHOW: 'SHOW'; +SOME: 'SOME'; +START: 'START'; +STATS: 'STATS'; +SUBSTRING: 'SUBSTRING'; +SYSTEM: 'SYSTEM'; +TABLE: 'TABLE'; +TABLES: 'TABLES'; +TABLESAMPLE: 'TABLESAMPLE'; +TEXT: 'TEXT'; +THEN: 'THEN'; +TIES: 'TIES'; +TIME: 'TIME'; +TIMESTAMP: 'TIMESTAMP'; +TO: 'TO'; +TRANSACTION: 'TRANSACTION'; +TRUE: 'TRUE'; +TRY_CAST: 'TRY_CAST'; +TYPE: 'TYPE'; +UESCAPE: 'UESCAPE'; +UNBOUNDED: 'UNBOUNDED'; +UNCOMMITTED: 'UNCOMMITTED'; +UNION: 'UNION'; +UNNEST: 'UNNEST'; +USE: 'USE'; +USER: 'USER'; +USING: 'USING'; +VALIDATE: 'VALIDATE'; +VALUES: 'VALUES'; +VERBOSE: 'VERBOSE'; +VIEW: 'VIEW'; +WHEN: 'WHEN'; +WHERE: 'WHERE'; +WITH: 'WITH'; +WITHOUT: 'WITHOUT'; +WORK: 'WORK'; +WRITE: 'WRITE'; +YEAR: 'YEAR'; +ZONE: 'ZONE'; + +EQ : '='; +NEQ : '<>' | '!='; +LT : '<'; +LTE : '<='; +GT : '>'; +GTE : '>='; + +PLUS: '+'; +MINUS: '-'; +ASTERISK: '*'; +SLASH: '/'; +PERCENT: '%'; +CONCAT: '||'; + +STRING + : '\'' ( ~'\'' | '\'\'' )* '\'' + ; + +UNICODE_STRING + : 'U&\'' ( ~'\'' | '\'\'' )* '\'' + ; + +// Note: we allow any character inside the binary literal and validate +// its a correct literal when the AST is being constructed. This +// allows us to provide more meaningful error messages to the user +BINARY_LITERAL + : 'X\'' (~'\'')* '\'' + ; + +INTEGER_VALUE + : DIGIT+ + ; + +DECIMAL_VALUE + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + +DOUBLE_VALUE + : DIGIT+ ('.' DIGIT*)? EXPONENT + | '.' DIGIT+ EXPONENT + ; + +IDENTIFIER + : (LETTER | '_') (LETTER | DIGIT | '_' | '@' | ':')* + ; + +DIGIT_IDENTIFIER + : DIGIT (LETTER | DIGIT | '_' | '@' | ':')+ + ; + +QUOTED_IDENTIFIER + : '"' ( ~'"' | '""' )* '"' + ; + +BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + +fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; + +fragment DIGIT + : [0-9] + ; + +fragment LETTER + : [A-Z] + ; + +SIMPLE_COMMENT + : '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN) + ; + +BRACKETED_COMMENT + : '/*' .*? '*/' -> channel(HIDDEN) + ; + +WS + : [ \r\n\t]+ -> channel(HIDDEN) + ; + +// Catch-all for anything we can't recognize. +// We use this to be able to ignore and recover all the text +// when splitting statements with DelimiterLexer +UNRECOGNIZED + : . 
+ ; +``` \ No newline at end of file diff --git a/hugo/content/docs/security/_index.md b/hugo/content/docs/security/_index.md new file mode 100644 index 00000000..65d4b9d1 --- /dev/null +++ b/hugo/content/docs/security/_index.md @@ -0,0 +1,6 @@ +--- +title: "Security" +icon: "icon-shield" +type : "category" +weight: 3 +--- diff --git a/hugo/content/docs/security/data-source.md b/hugo/content/docs/security/data-source.md new file mode 100644 index 00000000..1035b080 --- /dev/null +++ b/hugo/content/docs/security/data-source.md @@ -0,0 +1,77 @@ +--- +title: "At Data Sources" +weight: 1 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: single-col +--- +#### Securing Data Sources + +This page discusses different approaches to securing a Data Connect implementation depending on which implementation path you choose and how complex your access needs are. + +Data Connect can be implemented in many ways. For example: + +* as static files in a web server or cloud bucket ("tables in a bucket") +* in front of a single database server (for example, PostgreSQL, MySQL, or ElasticSearch) +* in front of many database servers (for example, using Trino) + +In addition, your dataset might require a single tier of access, where someone either has access to the whole thing or nothing at all, or you might require multiple access tiers where different users can access different subsets of the data. + +#### Tables in a Bucket + +If you implement Data Connect using static JSON files in a web server or cloud file storage system, you can set the web server or cloud bucket to require an authentication tokens with each request. + +If you are hosting your tables in a web server such as nginx, Express.js, Tomcat, or Apache HTTPD, you have the option of providing a custom authentication module that understands JWT bearer tokens. + +With data consumers accessing cloud buckets directly, the easiest approach is to use an authentication mechanism supported by the cloud vendor. This may be acceptable if the data consumers are inside your own organization, and they already have a way to obtain cloud credentials. + +To customize the authentication mechanism on tables-in-a-bucket (for example, if you are experimenting with a GA4GH Passport integration) then you may have a few options, depending on the cloud platform: + +1. Put your cloud bucket behind an HTTP proxy that checks authentication in a custom way. If you do this, ensure links such as `nextPageUrl` are relative in all your JSON files. +1. Check if your cloud storage system can delegate request authorization to a serverless function that you supply (eg. AWS Lambda, Google Cloud Function, Azure Function). This may be possible directly, or you may need to route requests through an API Gateway. + +##### Multi Tiered Access + +With tables-in-a-bucket, consider creating separate Data Connect implementations, each in their own bucket, for each access tier. This allows you to keep access policies uniform with each bucket, and gives you the flexibility to provide different data granularity to users within each tier. + +#### In Front of a Single Database + +In this case, you will be running custom server code that translates incoming requests from Data Connect API requests into the format natively understood by your backend database. + +##### Single Tiered Access + +Create a single database user for your Data Connect API server. Grant this user read-only access to only the tables that you wish to expose via Data Connect. 
Your custom server will access the database as this user. + +On each incoming Data Connect HTTP request, check for a valid OAuth2 bearer token. If the token is valid, make the corresponding request to the backend database. The Data Connect API user's scope of access will be limited to what the database user can see. + +##### Multi Tiered Access + +Create a database user for each access tier your Data Connect API server will support. Grant each user read-only access to only the tables that you wish to expose at that access tier. Your custom server will select the correct database user based on the credentials in the incoming requests. + +On each incoming Data Connect HTTP request, check for a valid JWT OAuth2 bearer token. If the token is valid, examine its claims and select the appropriate database user. The Data Connect API user's scope of access will be limited to what the database user for their access tier can see. + +If some access tiers should only see data at a coarser grain (for example, cohort-level statistics rather than subject-level data), consider one of the following approaches: + +* create views of the data that only reveal data at the coarser grain, and grant the 'tiered database users' access to these views only +* pre-aggregate the data into tables or materialized views, and grant the 'tiered database users' access to these tables or materialized views only + +Since there will typically be many more users with access to the coarser-grained view of the data, pre-aggregating the data offers a performance advantage as well. + +#### In Front of Many Databases + +If you are exposing many databases under a single Data Connect API instance, you are probably using a Trino based implementation. + +Trino provides the [SystemAccessControl interface](https://github.com/trinodb/trino/blob/master/core/trino-spi/src/main/java/io/trino/spi/security/SystemAccessControl.java) which you can implement yourself to secure your data source. + +A Trino-based Data Connect implementation will have a Data Connect API adapter service in front of Trino which accepts Data Connect API calls and relays them to Trino in its own API, just like the single database case outlined above. The adapter service should extract the user's JWT bearer token from the inbound request and include it in the Trino request under the `X-Trino-Extra-Credential` header. + +From there, your implementation of the SystemAccessControl interface will have access to the JWT and its claims, and will be able to control access: + +* Allow/Deny: + * Access per catalog, schema, table, and column + * Access to see the definition of a view +* Filter: + * Visibility of catalogs, schemas, tables, and columns + * Row-by-row table data using a filter expression diff --git a/hugo/content/docs/security/search-endpoint.md b/hugo/content/docs/security/search-endpoint.md new file mode 100644 index 00000000..ebe838b4 --- /dev/null +++ b/hugo/content/docs/security/search-endpoint.md @@ -0,0 +1,16 @@ +--- +title: "At Endpoints" +weight: 1 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: single-col +--- +#### Securing Data Connect Endpoints + +A future version of Data Connect will document how to use [GA4GH Passports and Visas](https://github.com/ga4gh-duri/ga4gh-duri.github.io/tree/master/researcher_ids) to authenticate and authorize requests to the API. + +There is already work underway to specify how DRS will work with Passports. 
Rather than jumping in now and creating confusion, the Data Connect working group is monitoring the Passport efforts in DRS and will document a specific Data Connect Passport integration that makes sense in the context of what has been decided for DRS. + +For now, prefer JSON Web Tokens (JWTs) presented as OAuth2 bearer tokens on each Data Connect API request. This will likely put you in a good position to implement the recommended Passport integration when the path forward becomes clear. diff --git a/hugo/content/docs/use-exisitng-data/_index.md b/hugo/content/docs/use-exisitng-data/_index.md new file mode 100644 index 00000000..8a81991d --- /dev/null +++ b/hugo/content/docs/use-exisitng-data/_index.md @@ -0,0 +1,6 @@ +--- +title: "Use Existing Data" +icon: "icon-database" +type : "category" +weight: 2 +--- diff --git a/hugo/content/docs/use-exisitng-data/using-trino/doc.md b/hugo/content/docs/use-exisitng-data/using-trino/doc.md new file mode 100644 index 00000000..b25155df --- /dev/null +++ b/hugo/content/docs/use-exisitng-data/using-trino/doc.md @@ -0,0 +1,79 @@ +--- +title: "Using Trino" +weight: 1 +draft: false +lastmod: 2020-11-5 +# search related keywords +type: docs +layout: single-col +--- +### The dbGaP GECCO Example + +In the [provision data section](/docs/getting-started/provision-data/), we've shown a quick start recipe with the [ga4gh-search-adapter-presto](https://github.com/DNAstack/ga4gh-search-adapter-presto) docker container connected to a Trino instance hosted at `https://presto-public.prod.dnastack.com`. This section provides more information on how this was accomplished. + +{{}} +{{%content-textbox%}} +##### Quick Links +--- +[ga4gh-search-adapter-presto](https://github.com/DNAstack/ga4gh-search-adapter-presto) + +[Open API 3 Reference](/api) + +[Full Data Connect Specification](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md) + +[Table Object Specification](https://github.com/ga4gh-discovery/data-connect/blob/develop/TABLE.md) + +[Data Connect API’s SQL dialect](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#sql-functions) + +{{%/content-textbox%}} +{{}} + +#### Prerequisites +The following is required before we start. +1. Java 11+ +1. A Trino server you can access anonymously over HTTP(S). +1. Git +> If you don't have a Trino server to work against and you wish to try the app, try using `https://presto-public.prod.dnastack.com` as the data source. + +**1. Building the Trino Adapter App** + +Clone the repository +``` bash +git clone https://github.com/DNAstack/ga4gh-search-adapter-presto.git +``` +Build the app +```bash +mvn clean package +``` + + +**2. Configuration** + +For a minimal configuration, we need to provide two parameters, `PRESTO_DATASOURCE_URL` and `SPRING_PROFILES_ACTIVE`. + +`PRESTO_DATASOURCE_URL` points to the Trino server you wish to expose with a Data Connect API. +{{%content-textbox%}} +Clone the repository: +``` bash +export PRESTO_DATASOURCE_URL=https:// +export SPRING_PROFILES_ACTIVE=no-auth +``` +The adapter app requires a local PostgreSQL database connection. To start the app locally with the default settings, you can spin up the database with this docker command: +```bash +docker run -d -p 5432:5432 --name ga4ghsearchadapterpresto -e POSTGRES_USER=ga4ghsearchadapterpresto -e POSTGRES_PASSWORD=ga4ghsearchadapterpresto postgres +``` +{{%/content-textbox%}} + +**3. 
Run the adapter App** + +{{%content-textbox%}} +``` bash +mvn clean spring-boot:run +``` +Your application should now be accessible at [http://localhost:8089/tables](http://localhost:8089/tables) + +To test the app out, follow the [consuming data](/docs/getting-started/consume-data/) section. +{{%/content-textbox%}} + +#### Further Configuration +Further configuration can be found at: [https://github.com/DNAstack/ga4gh-search-adapter-presto](https://github.com/DNAstack/ga4gh-search-adapter-presto). diff --git a/hugo/layouts/docs/list.json b/hugo/layouts/docs/list.json new file mode 100644 index 00000000..137492b2 --- /dev/null +++ b/hugo/layouts/docs/list.json @@ -0,0 +1,10 @@ +[ + {{ range $index, $value := where .Site.Pages "Type" "docs" }} + {{ if $index }}, {{ end }} + { + "url": "{{ .RelPermalink }}", + "title": "{{ .Title }}", + "content": {{ .Content | plainify | jsonify }} + } + {{ end }} +] \ No newline at end of file diff --git a/hugo/layouts/partials/search.html b/hugo/layouts/partials/search.html new file mode 100644 index 00000000..d1612baa --- /dev/null +++ b/hugo/layouts/partials/search.html @@ -0,0 +1,110 @@ + + + \ No newline at end of file diff --git a/hugo/static/images/favicon.ico b/hugo/static/images/favicon.ico new file mode 100644 index 00000000..161f8d16 Binary files /dev/null and b/hugo/static/images/favicon.ico differ diff --git a/hugo/static/images/logo.png b/hugo/static/images/logo.png new file mode 100644 index 00000000..ceecafec Binary files /dev/null and b/hugo/static/images/logo.png differ diff --git a/hugo/themes/clyde b/hugo/themes/clyde new file mode 160000 index 00000000..601e2bc4 --- /dev/null +++ b/hugo/themes/clyde @@ -0,0 +1 @@ +Subproject commit 601e2bc4be239eb629d6c8d3a64964bf939866bf diff --git a/spec/api.yaml b/spec/api.yaml new file mode 100644 index 00000000..d1958def --- /dev/null +++ b/spec/api.yaml @@ -0,0 +1,280 @@ +openapi: "3.0.2" +info: + title: Data Connect API + version: 1.0.0 + description: | + Data Connect is a standard for discovery and search of biomedical data. + + More information on [GitHub](https://github.com/ga4gh-discovery/data-connect). + license: + name: 'Apache 2.0' + url: 'https://raw.githubusercontent.com/ga4gh-discovery/data-connect/develop/LICENSE' + contact: + name: 'Data Connect Team' + email: 'ga4gh-discovery-search@ga4gh.org' +security: + - bearerAuth: [ ] +paths: + /tables: + get: + summary: List Tables + description: Returns a list of Tables. + operationId: listTables + tags: + - Table API + responses: + '200': + description: A list of Tables + content: + application/json: + schema: + $ref: "#/components/schemas/ListTablesResponse" + '500': + description: An unexpected error occurred + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /table/{table_name}/info: + get: + summary: Get a Table + description: Returns the information about the Table identified by name. 
+ operationId: getTable + tags: + - Table API + parameters: + - name: table_name + in: path + description: A table name + required: true + schema: + type: string + responses: + '200': + description: A Table + content: + application/json: + schema: + $ref: "#/components/schemas/Table" + '404': + description: The table doesn't exist + '500': + description: An unexpected error occurred + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /table/{table_name}/data: + get: + summary: Fetch data from a Table + description: Returns the data of a Table + operationId: getData + tags: + - Table API + parameters: + - name: table_name + in: path + description: A table name + required: true + schema: + type: string + responses: + '200': + description: Data from the Table + content: + application/json: + schema: + $ref: "#/components/schemas/TableData" + '404': + description: The table doesn't exist + '500': + description: An unexpected error occurred + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /search: + post: + summary: Perform a search on Tables + description: Optional operation that accepts a SearchRequest and returns a TableData + operationId: search + requestBody: + description: Query to execute + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SearchRequest" + tags: + - Search API + responses: + '200': + description: Query results returned as TableData + content: + application/json: + schema: + $ref: "#/components/schemas/TableData" + '400': + description: Error in request headers or body, for example if the query is invalid. Details are provided in the ErrorResponse body. + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + '404': + description: This server does not implement the search operation + '500': + description: An unexpected error occurred + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorResponse" + /service-info: + get: + summary: 'Show information about the Data Connect service' + operationId: getServiceInfo + tags: + - Service Info API + responses: + '200': + description: | + Show information about this Data Connect service. + + Use `"type": {"group": "org.ga4gh", "artifact": "data-connect", "version": "1.0.0"}` when implementing this specification directly. + content: + application/json: + schema: + $ref: '#/components/schemas/Service' +components: + securitySchemes: + bearerAuth: + type: http + scheme: bearer + schemas: + ListTablesResponse: + required: + - tables + type: object + properties: + tables: + type: array + items: + $ref: "#/components/schemas/Table" + pagination: + $ref: '#/components/schemas/Pagination' + errors: + $ref: "#/components/schemas/ErrorList" + Table: + required: + - name + - data_model + type: object + properties: + name: + type: string + description: | + Uniquely identifies a table within this Data Connect service. + + Table names should be human-readable and will typically (but not necessarily) + be split into two or three parts separated by a dot (`.`). + example: my_catalog.some_schema.a_table + description: + type: string + description: Optional description of the Table + data_model: + $ref: "http://json-schema.org/draft-07/schema#" + errors: + $ref: "#/components/schemas/ErrorList" + description: | + Describes a Table hosted by this Data Connect node. 
+ TableData: + required: + - data + type: object + properties: + data_model: + $ref: "http://json-schema.org/draft-07/schema#" + data: + type: array + description: Page of JSON values, each adhering to the schema given in the `data_model` property + items: + # Each item must conform to the schema provided in the "data_model" section of this Table. + # Not sure if this constraint can be expressed in OpenAPI 3.0. + type: object + pagination: + $ref: "#/components/schemas/Pagination" + errors: + $ref: "#/components/schemas/ErrorList" + description: | + A paginated collection of tabular data. + + Note that the `data_model` property is required unless `data` is also absent. + See [the pagination rules](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#pagination-and-long-running-queries) for details. + ErrorResponse: + type: object + properties: + errors: + $ref: "#/components/schemas/ErrorList" + description: The response body when no part of the request can be fulfilled + ErrorList: + type: array + description: List of errors encountered + items: + $ref: "#/components/schemas/Error" + Error: + type: object + properties: + source: + type: string + description: > + The "source" field should only be present when the error originated in an attached data source backing the Data Connect + API. The value of source can be any of the following: + + 1. A fully qualified table `name` + 2. Any prefix of a table `name` that ends before a `.` character in the name. For example + if there is a table called `foo.bar.baz`, valid prefixes would be `foo` and `foo.bar`. Partial prefixes + which do not end just before a `.`, are not valid. For example: `foo.b` and `fo` are not allowed. + + If the error originated inside the Data Connect API implementation and is not associated with any particular table + or group of tables, then the source must be absent. An example of this would be an error occurring from a bug + in the implementation. Additionally, if the error is due to a bad request from the client, the source must + also be absent. + title: + type: string + description: | + A short, human-readable description of the error. + The value should not change from occurrence to occurrence of an error, except for purposes of localization. + example: 'Internal server error' + detail: + type: string + description: 'A human-readable explanation specific to this occurrence of the error.' + example: 'Internal server error' + required: + - title + Pagination: + type: object + properties: + next_page_url: + type: string + description: | + URL pointing to the next page of data. Null or absent on last page. + + See [the pagination rules](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#pagination-and-long-running-queries) for full details. + format: uri + SearchRequest: + description: + Request body containing an SQL query with zero or more positional parameters. + type: object + required: + - query + properties: + query: + type: string + description: Query in SQL. Supported SQL grammar, data types, and functions are described in [the specification](https://github.com/ga4gh-discovery/data-connect/blob/develop/SPEC.md#query). + example: SELECT some_string, some_num FROM a_table WHERE some_string=? AND some_num=? + parameters: + type: array + items: {} + description: Positional parameters for the query in `query` property. 
+ example: [ hello, 42 ] + Service: + $ref: 'https://raw.githubusercontent.com/ga4gh-discovery/ga4gh-service-info/v1.0.0/service-info.yaml#/components/schemas/Service' \ No newline at end of file diff --git a/spec/search-api.yaml b/spec/search-api.yaml deleted file mode 100644 index 120c121e..00000000 --- a/spec/search-api.yaml +++ /dev/null @@ -1,149 +0,0 @@ -openapi: "3.0.2" -info: - title: GA4GH Discovery Search API - description: Definition of GA4GH Discovery Search API - termsOfService: https://www.ga4gh.org/ - contact: - email: rishi.nag@ga4gh.org - license: - name: Apache 2.0 - url: http://www.apache.org/licenses/LICENSE-2.0.html - version: "0.1.0" -servers: [] -security: [] -paths: - /tables: - get: - summary: List Tables - description: Returns a list of Tables. - operationId: listTables - responses: - '200': - description: A list of Tables - content: - application/json: - schema: - $ref: "#/components/schemas/ListTablesResponse" - '500': - description: An unexpected error occurred - /table/{table_name}/info: - get: - summary: Get a Table - description: Returns the information about the Table identified by name. - operationId: getTable - parameters: - - name: table_name - in: path - description: Unique Table name, unique to this implementation - required: true - schema: - type: string - responses: - '200': - description: A Table - content: - application/json: - schema: - $ref: "#/components/schemas/Table" - '404': - description: This server does not implement the data operation - '500': - description: An unexpected error occurred - /table/{table_name}/data: - get: - summary: Fetch data from a Table - description: Optional operation that returns data from a Table - operationId: getData - parameters: - - name: table_name - in: path - description: Unique Table name, unique to this implementation - required: true - schema: - type: string - responses: - '200': - description: Data from the Table - content: - application/json: - schema: - $ref: "#/components/schemas/TableData" - '404': - description: This server does not implement the data operation - '500': - description: An unexpected error occurred - /search: - post: - summary: Perform a search on Tables - description: Optional operation that accepts a Query and returns a TableData - operationId: search - responses: - '200': - description: Query results returned as TableData - content: - application/json: - schema: - $ref: "#/components/schemas/TableData" - '404': - description: This server does not implement the search operation - '500': - description: An unexpected error occurred -components: - schemas: - ListTablesResponse: - required: - - tables - type: object - properties: - tables: - type: array - items: - $ref: "#/components/schemas/Table" - pagination: - $ref: '#/components/schemas/Pagination' - additionalProperties: false - Table: - required: - - name - - data_model - type: object - properties: - name: - type: string - description: Table name - description: - type: string - description: Optional description of the Table - data_model: - $ref: "http://json-schema.org/draft-07/schema#" - description: | - Describes a Table hosted by this search node. - TableData: - required: - - data_model - - data - type: object - properties: - data_model: - $ref: "http://json-schema.org/draft-07/schema#" - data: - type: array - description: Page of JSON values, each adhering to the schema given in the "data_model" property - items: - # Each item must conform to the schema provided in the "data_model" section of this Table. 
- # Not sure if this constraint can be expressed in OpenAPI 3.0. - type: object - pagination: - $ref: "#/components/schemas/Pagination" - description: A paginated collection of tabular data - Pagination: - type: object - properties: - next_page_url: - type: string - description: URL pointing to the next page of the same Table. Null or absent on last page. - format: uri - previous_page_url: - type: string - description: URL pointing to the previous page of the same Table. Null or absent on first page. - format: uri