From ffb5c75f13dfa8bc61d7441bb18f573b0ee5fcef Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 26 Nov 2024 22:37:23 +0000 Subject: [PATCH] feat: generate `frictionless` data package metadata Closes #629 - Adds `build_datapackage.py` script - Generates initial metadata in both `.yaml` and `.json` formats --- datapackage.json | 2675 ++++++++++++++++++++++++++++++++++ datapackage.yaml | 1537 +++++++++++++++++++ scripts/build_datapackage.py | 195 +++ 3 files changed, 4407 insertions(+) create mode 100644 datapackage.json create mode 100644 datapackage.yaml create mode 100644 scripts/build_datapackage.py diff --git a/datapackage.json b/datapackage.json new file mode 100644 index 00000000..8bbcd982 --- /dev/null +++ b/datapackage.json @@ -0,0 +1,2675 @@ +{ + "name": "vega-datasets", + "description": "Common repository for example datasets used by Vega related projects.", + "homepage": "http://github.com/vega/vega-datasets.git", + "licenses": [ + { + "name": "BSD-3-Clause", + "path": "https://opensource.org/license/bsd-3-clause", + "title": "The 3-Clause BSD License" + } + ], + "sources": [ + { + "path": "https://github.com/vega/vega-datasets/blob/next/SOURCES.md" + } + ], + "contributors": [ + { + "title": "UW Interactive Data Lab", + "path": "http://idl.cs.washington.edu" + } + ], + "version": "2.11.0", + "created": "2024-11-26T22:32:24.187288+00:00", + "resources": [ + { + "name": "7zip", + "type": "file", + "path": "7zip.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "airports", + "type": "table", + "path": "airports.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "iata", + "type": "string" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "longitude", + "type": "number" + } + ] + } + }, + { + "name": "annual-precip", + "type": "json", + "path": "annual-precip.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "anscombe", + "type": "table", + "path": "anscombe.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Series", + "type": "string" + }, + { + "name": "X", + "type": "integer" + }, + { + "name": "Y", + "type": "number" + } + ] + } + }, + { + "name": "barley", + "type": "table", + "path": "barley.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "yield", + "type": "number" + }, + { + "name": "variety", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "site", + "type": "string" + } + ] + } + }, + { + "name": "birdstrikes", + "type": "table", + "path": "birdstrikes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Airport Name", + "type": "string" + }, + { + "name": "Aircraft Make Model", + "type": "string" + }, + { + "name": "Effect Amount of damage", + "type": "string" + }, + { + "name": "Flight Date", + "type": "date" + }, + { + "name": "Aircraft Airline Operator", + "type": "string" + }, + { + "name": "Origin State", + "type": "string" + }, + { + "name": "Phase of flight", + "type": "string" + }, + { + "name": "Wildlife Size", + "type": "string" + }, + { + "name": "Wildlife Species", + "type": "string" + }, + { + "name": "Time of day", + "type": "string" + }, + { + "name": "Cost Other", + "type": "integer" + }, + { + "name": "Cost Repair", + "type": "integer" + }, + { + "name": "Cost Total $", + "type": "integer" + }, + { + "name": "Speed IAS in knots", + "type": "integer" + } + ] + } + }, + { + "name": "budget", + "type": "table", + "path": "budget.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Source Category Code", + "type": "integer" + }, + { + "name": "Source category name", + "type": "string" + }, + { + "name": "Source subcategory", + "type": "integer" + }, + { + "name": "Source subcategory name", + "type": "string" + }, + { + "name": "Agency code", + "type": "integer" + }, + { + "name": "Agency name", + "type": "string" + }, + { + "name": "Bureau code", + "type": "integer" + }, + { + "name": "Bureau name", + "type": "string" + }, + { + "name": "Account code", + "type": "integer" + }, + { + "name": "Account name", + "type": "string" + }, + { + "name": "Treasury Agency code", + "type": "integer" + }, + { + "name": "On- or off-budget", + "type": "string" + }, + { + "name": "1962", + "type": "string" + }, + { + "name": "1963", + "type": "string" + }, + { + "name": "1964", + "type": "string" + }, + { + "name": "1965", + "type": "string" + }, + { + "name": "1966", + "type": "string" + }, + { + "name": "1967", + "type": "string" + }, + { + "name": "1968", + "type": "string" + }, + { + "name": "1969", + "type": "string" + }, + { + "name": "1970", + "type": "string" + }, + { + "name": "1971", + "type": "string" + }, + { + "name": "1972", + "type": "string" + }, + { + "name": "1973", + "type": "string" + }, + { + "name": "1974", + "type": "string" + }, + { + "name": "1975", + "type": "string" + }, + { + "name": "1976", + "type": "string" + }, + { + "name": "TQ", + "type": "string" + }, + { + "name": "1977", + "type": "string" + }, + { + "name": "1978", + "type": "string" + }, + { + "name": "1979", + "type": "string" + }, + { + "name": "1980", + "type": "string" + }, + { + "name": "1981", + "type": "string" + }, + { + "name": "1982", + "type": "string" + }, + { + "name": "1983", + "type": "string" + }, + { + "name": "1984", + "type": "string" + }, + { + "name": "1985", + "type": "string" + }, + { + "name": "1986", + "type": "string" + }, + { + "name": "1987", + "type": "string" + }, + { + "name": "1988", + "type": "string" + }, + { + "name": "1989", + "type": "string" + }, + { + "name": "1990", + "type": "string" + }, + { + "name": "1991", + "type": "string" + }, + { + "name": "1992", + "type": "string" + }, + { + "name": "1993", + "type": "string" + }, + { + "name": "1994", + "type": "string" + }, + { + "name": "1995", + "type": "string" + }, + { + "name": "1996", + "type": "string" + }, + { + "name": "1997", + "type": "string" + }, + { + "name": "1998", + "type": "string" + }, + { + "name": "1999", + "type": "string" + }, + { + "name": "2000", + "type": "string" + }, + { + "name": "2001", + "type": "string" + }, + { + "name": "2002", + "type": "string" + }, + { + "name": "2003", + "type": "string" + }, + { + "name": "2004", + "type": "string" + }, + { + "name": "2005", + "type": "string" + }, + { + "name": "2006", + "type": "string" + }, + { + "name": "2007", + "type": "string" + }, + { + "name": "2008", + "type": "string" + }, + { + "name": "2009", + "type": "string" + }, + { + "name": "2010", + "type": "string" + }, + { + "name": "2011", + "type": "string" + }, + { + "name": "2012", + "type": "string" + }, + { + "name": "2013", + "type": "string" + }, + { + "name": "2014", + "type": "string" + }, + { + "name": "2015", + "type": "string" + }, + { + "name": "2016", + "type": "string" + }, + { + "name": "2017", + "type": "string" + }, + { + "name": "2018", + "type": "string" + }, + { + "name": "2019", + "type": "string" + }, + { + "name": "2020", + "type": "string" + } + ] + } + }, + { + "name": "budgets", + "type": "table", + "path": "budgets.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "budgetYear", + "type": "integer" + }, + { + "name": "forecastYear", + "type": "integer" + }, + { + "name": "value", + "type": "number" + } + ] + } + }, + { + "name": "burtin", + "type": "table", + "path": "burtin.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Bacteria", + "type": "string" + }, + { + "name": "Penicillin", + "type": "number" + }, + { + "name": "Streptomycin", + "type": "number" + }, + { + "name": "Neomycin", + "type": "number" + }, + { + "name": "Gram_Staining", + "type": "string" + }, + { + "name": "Genus", + "type": "string" + } + ] + } + }, + { + "name": "cars", + "type": "table", + "path": "cars.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Name", + "type": "string" + }, + { + "name": "Miles_per_Gallon", + "type": "integer" + }, + { + "name": "Cylinders", + "type": "integer" + }, + { + "name": "Displacement", + "type": "number" + }, + { + "name": "Horsepower", + "type": "integer" + }, + { + "name": "Weight_in_lbs", + "type": "integer" + }, + { + "name": "Acceleration", + "type": "number" + }, + { + "name": "Year", + "type": "date" + }, + { + "name": "Origin", + "type": "string" + } + ] + } + }, + { + "name": "co2-concentration", + "type": "table", + "path": "co2-concentration.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Date", + "type": "date" + }, + { + "name": "CO2", + "type": "number" + }, + { + "name": "adjusted CO2", + "type": "number" + } + ] + } + }, + { + "name": "countries", + "type": "table", + "path": "countries.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "_comment", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "fertility", + "type": "number" + }, + { + "name": "life_expect", + "type": "number" + }, + { + "name": "n_fertility", + "type": "number" + }, + { + "name": "n_life_expect", + "type": "number" + }, + { + "name": "country", + "type": "string" + } + ] + } + }, + { + "name": "crimea", + "type": "table", + "path": "crimea.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "wounds", + "type": "integer" + }, + { + "name": "other", + "type": "integer" + }, + { + "name": "disease", + "type": "integer" + } + ] + } + }, + { + "name": "disasters", + "type": "table", + "path": "disasters.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Entity", + "type": "string" + }, + { + "name": "Year", + "type": "integer" + }, + { + "name": "Deaths", + "type": "integer" + } + ] + } + }, + { + "name": "driving", + "type": "table", + "path": "driving.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "side", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "miles", + "type": "integer" + }, + { + "name": "gas", + "type": "number" + } + ] + } + }, + { + "name": "earthquakes", + "type": "json", + "path": "earthquakes.json", + "scheme": "file", + "format": "geojson", + "mediatype": "text/geojson", + "encoding": "utf-8" + }, + { + "name": "ffox", + "type": "file", + "path": "ffox.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "flare-dependencies", + "type": "table", + "path": "flare-dependencies.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "source", + "type": "integer" + }, + { + "name": "target", + "type": "integer" + } + ] + } + }, + { + "name": "flare", + "type": "table", + "path": "flare.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + } + ] + } + }, + { + "name": "flights-10k", + "type": "table", + "path": "flights-10k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flights-200k", + "type": "table", + "path": "flights-200k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "time", + "type": "number" + } + ] + } + }, + { + "name": "flights-20k", + "type": "table", + "path": "flights-20k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flights-2k", + "type": "table", + "path": "flights-2k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flights-3m", + "type": "table", + "path": "flights-3m.parquet", + "scheme": "file", + "format": "parquet", + "mediatype": "application/parquet", + "schema": { + "fields": [ + { + "name": "date", + "type": "integer" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flights-5k", + "type": "table", + "path": "flights-5k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flights-airport", + "type": "table", + "path": "flights-airport.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + }, + { + "name": "count", + "type": "integer" + } + ] + } + }, + { + "name": "football", + "type": "table", + "path": "football.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "division", + "type": "string" + }, + { + "name": "home_team", + "type": "string" + }, + { + "name": "away_team", + "type": "string" + }, + { + "name": "home_score", + "type": "integer" + }, + { + "name": "away_score", + "type": "integer" + } + ] + } + }, + { + "name": "gapminder-health-income", + "type": "table", + "path": "gapminder-health-income.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "country", + "type": "string" + }, + { + "name": "income", + "type": "integer" + }, + { + "name": "health", + "type": "number" + }, + { + "name": "population", + "type": "integer" + }, + { + "name": "region", + "type": "string" + } + ] + } + }, + { + "name": "gapminder", + "type": "table", + "path": "gapminder.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "cluster", + "type": "integer" + }, + { + "name": "pop", + "type": "integer" + }, + { + "name": "life_expect", + "type": "number" + }, + { + "name": "fertility", + "type": "number" + } + ] + } + }, + { + "name": "gimp", + "type": "file", + "path": "gimp.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "github", + "type": "table", + "path": "github.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "time", + "type": "string" + }, + { + "name": "count", + "type": "integer" + } + ] + } + }, + { + "name": "global-temp", + "type": "table", + "path": "global-temp.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "temp", + "type": "number" + } + ] + } + }, + { + "name": "income", + "type": "table", + "path": "income.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "region", + "type": "string" + }, + { + "name": "id", + "type": "integer" + }, + { + "name": "pct", + "type": "number" + }, + { + "name": "total", + "type": "integer" + }, + { + "name": "group", + "type": "string" + } + ] + } + }, + { + "name": "iowa-electricity", + "type": "table", + "path": "iowa-electricity.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "year", + "type": "date" + }, + { + "name": "source", + "type": "string" + }, + { + "name": "net_generation", + "type": "integer" + } + ] + } + }, + { + "name": "jobs", + "type": "table", + "path": "jobs.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "job", + "type": "string" + }, + { + "name": "sex", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "count", + "type": "integer" + }, + { + "name": "perc", + "type": "number" + } + ] + } + }, + { + "name": "la-riots", + "type": "table", + "path": "la-riots.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "first_name", + "type": "string" + }, + { + "name": "last_name", + "type": "string" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "gender", + "type": "string" + }, + { + "name": "race", + "type": "string" + }, + { + "name": "death_date", + "type": "date" + }, + { + "name": "address", + "type": "string" + }, + { + "name": "neighborhood", + "type": "string" + }, + { + "name": "type", + "type": "string" + }, + { + "name": "longitude", + "type": "number" + }, + { + "name": "latitude", + "type": "number" + } + ] + } + }, + { + "name": "londonboroughs", + "type": "json", + "path": "londonBoroughs.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "londoncentroids", + "type": "table", + "path": "londonCentroids.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "cx", + "type": "number" + }, + { + "name": "cy", + "type": "number" + } + ] + } + }, + { + "name": "londontubelines", + "type": "json", + "path": "londonTubeLines.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "lookup_groups", + "type": "table", + "path": "lookup_groups.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "group", + "type": "integer" + }, + { + "name": "person", + "type": "string" + } + ] + } + }, + { + "name": "lookup_people", + "type": "table", + "path": "lookup_people.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "height", + "type": "integer" + } + ] + } + }, + { + "name": "miserables", + "type": "json", + "path": "miserables.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "monarchs", + "type": "table", + "path": "monarchs.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + }, + { + "name": "index", + "type": "integer" + } + ] + } + }, + { + "name": "movies", + "type": "table", + "path": "movies.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Title", + "type": "string" + }, + { + "name": "US Gross", + "type": "integer" + }, + { + "name": "Worldwide Gross", + "type": "integer" + }, + { + "name": "US DVD Sales", + "type": "integer" + }, + { + "name": "Production Budget", + "type": "integer" + }, + { + "name": "Release Date", + "type": "string" + }, + { + "name": "MPAA Rating", + "type": "string" + }, + { + "name": "Running Time min", + "type": "integer" + }, + { + "name": "Distributor", + "type": "string" + }, + { + "name": "Source", + "type": "string" + }, + { + "name": "Major Genre", + "type": "string" + }, + { + "name": "Creative Type", + "type": "string" + }, + { + "name": "Director", + "type": "string" + }, + { + "name": "Rotten Tomatoes Rating", + "type": "integer" + }, + { + "name": "IMDB Rating", + "type": "number" + }, + { + "name": "IMDB Votes", + "type": "integer" + } + ] + } + }, + { + "name": "normal-2d", + "type": "table", + "path": "normal-2d.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "u", + "type": "number" + }, + { + "name": "v", + "type": "number" + } + ] + } + }, + { + "name": "obesity", + "type": "table", + "path": "obesity.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + }, + { + "name": "state", + "type": "string" + } + ] + } + }, + { + "name": "ohlc", + "type": "table", + "path": "ohlc.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "open", + "type": "number" + }, + { + "name": "high", + "type": "number" + }, + { + "name": "low", + "type": "number" + }, + { + "name": "close", + "type": "number" + }, + { + "name": "signal", + "type": "string" + }, + { + "name": "ret", + "type": "number" + } + ] + } + }, + { + "name": "penguins", + "type": "table", + "path": "penguins.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Species", + "type": "string" + }, + { + "name": "Island", + "type": "string" + }, + { + "name": "Beak Length (mm)", + "type": "number" + }, + { + "name": "Beak Depth (mm)", + "type": "number" + }, + { + "name": "Flipper Length (mm)", + "type": "integer" + }, + { + "name": "Body Mass (g)", + "type": "integer" + }, + { + "name": "Sex", + "type": "string" + } + ] + } + }, + { + "name": "platformer-terrain", + "type": "table", + "path": "platformer-terrain.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "x", + "type": "integer" + }, + { + "name": "y", + "type": "integer" + }, + { + "name": "lumosity", + "type": "number" + }, + { + "name": "saturation", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "id", + "type": "string" + }, + { + "name": "color", + "type": "string" + }, + { + "name": "key", + "type": "string" + } + ] + } + }, + { + "name": "points", + "type": "table", + "path": "points.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "x", + "type": "number" + }, + { + "name": "y", + "type": "number" + } + ] + } + }, + { + "name": "political-contributions", + "type": "table", + "path": "political-contributions.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Candidate_Identification", + "type": "string" + }, + { + "name": "Candidate_Name", + "type": "string" + }, + { + "name": "Incumbent_Challenger_Status", + "type": "string" + }, + { + "name": "Party_Code", + "type": "integer" + }, + { + "name": "Party_Affiliation", + "type": "string" + }, + { + "name": "Total_Receipts", + "type": "number" + }, + { + "name": "Transfers_from_Authorized_Committees", + "type": "integer" + }, + { + "name": "Total_Disbursements", + "type": "number" + }, + { + "name": "Transfers_to_Authorized_Committees", + "type": "number" + }, + { + "name": "Beginning_Cash", + "type": "number" + }, + { + "name": "Ending_Cash", + "type": "number" + }, + { + "name": "Contributions_from_Candidate", + "type": "number" + }, + { + "name": "Loans_from_Candidate", + "type": "integer" + }, + { + "name": "Other_Loans", + "type": "integer" + }, + { + "name": "Candidate_Loan_Repayments", + "type": "number" + }, + { + "name": "Other_Loan_Repayments", + "type": "integer" + }, + { + "name": "Debts_Owed_By", + "type": "number" + }, + { + "name": "Total_Individual_Contributions", + "type": "integer" + }, + { + "name": "Candidate_State", + "type": "string" + }, + { + "name": "Candidate_District", + "type": "integer" + }, + { + "name": "Contributions_from_Other_Political_Committees", + "type": "integer" + }, + { + "name": "Contributions_from_Party_Committees", + "type": "integer" + }, + { + "name": "Coverage_End_Date", + "type": "string" + }, + { + "name": "Refunds_to_Individuals", + "type": "integer" + }, + { + "name": "Refunds_to_Committees", + "type": "integer" + } + ] + } + }, + { + "name": "population", + "type": "table", + "path": "population.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "sex", + "type": "integer" + }, + { + "name": "people", + "type": "integer" + } + ] + } + }, + { + "name": "population_engineers_hurricanes", + "type": "table", + "path": "population_engineers_hurricanes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "state", + "type": "string" + }, + { + "name": "id", + "type": "integer" + }, + { + "name": "population", + "type": "integer" + }, + { + "name": "engineers", + "type": "number" + }, + { + "name": "hurricanes", + "type": "integer" + } + ] + } + }, + { + "name": "seattle-weather-hourly-normals", + "type": "table", + "path": "seattle-weather-hourly-normals.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "datetime" + }, + { + "name": "pressure", + "type": "number" + }, + { + "name": "temperature", + "type": "number" + }, + { + "name": "wind", + "type": "number" + } + ] + } + }, + { + "name": "seattle-weather", + "type": "table", + "path": "seattle-weather.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "precipitation", + "type": "number" + }, + { + "name": "temp_max", + "type": "number" + }, + { + "name": "temp_min", + "type": "number" + }, + { + "name": "wind", + "type": "number" + }, + { + "name": "weather", + "type": "string" + } + ] + } + }, + { + "name": "sp500-2000", + "type": "table", + "path": "sp500-2000.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "open", + "type": "number" + }, + { + "name": "high", + "type": "number" + }, + { + "name": "low", + "type": "number" + }, + { + "name": "close", + "type": "number" + }, + { + "name": "adjclose", + "type": "number" + }, + { + "name": "volume", + "type": "integer" + } + ] + } + }, + { + "name": "sp500", + "type": "table", + "path": "sp500.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "price", + "type": "number" + } + ] + } + }, + { + "name": "stocks", + "type": "table", + "path": "stocks.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "symbol", + "type": "string" + }, + { + "name": "date", + "type": "string" + }, + { + "name": "price", + "type": "number" + } + ] + } + }, + { + "name": "udistrict", + "type": "table", + "path": "udistrict.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "key", + "type": "string" + }, + { + "name": "lat", + "type": "number" + } + ] + } + }, + { + "name": "unemployment-across-industries", + "type": "table", + "path": "unemployment-across-industries.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "series", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "month", + "type": "integer" + }, + { + "name": "count", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + }, + { + "name": "date", + "type": "datetime" + } + ] + } + }, + { + "name": "unemployment", + "type": "table", + "path": "unemployment.tsv", + "scheme": "file", + "format": "tsv", + "mediatype": "text/tsv", + "encoding": "utf-8", + "dialect": { + "csv": { + "delimiter": "\t" + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + } + ] + } + }, + { + "name": "uniform-2d", + "type": "table", + "path": "uniform-2d.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "u", + "type": "number" + }, + { + "name": "v", + "type": "number" + } + ] + } + }, + { + "name": "us-10m", + "type": "json", + "path": "us-10m.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "us-employment", + "type": "table", + "path": "us-employment.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "month", + "type": "date" + }, + { + "name": "nonfarm", + "type": "integer" + }, + { + "name": "private", + "type": "integer" + }, + { + "name": "goods_producing", + "type": "integer" + }, + { + "name": "service_providing", + "type": "integer" + }, + { + "name": "private_service_providing", + "type": "integer" + }, + { + "name": "mining_and_logging", + "type": "integer" + }, + { + "name": "construction", + "type": "integer" + }, + { + "name": "manufacturing", + "type": "integer" + }, + { + "name": "durable_goods", + "type": "integer" + }, + { + "name": "nondurable_goods", + "type": "integer" + }, + { + "name": "trade_transportation_utilties", + "type": "integer" + }, + { + "name": "wholesale_trade", + "type": "number" + }, + { + "name": "retail_trade", + "type": "number" + }, + { + "name": "transportation_and_warehousing", + "type": "number" + }, + { + "name": "utilities", + "type": "number" + }, + { + "name": "information", + "type": "integer" + }, + { + "name": "financial_activities", + "type": "integer" + }, + { + "name": "professional_and_business_services", + "type": "integer" + }, + { + "name": "education_and_health_services", + "type": "integer" + }, + { + "name": "leisure_and_hospitality", + "type": "integer" + }, + { + "name": "other_services", + "type": "integer" + }, + { + "name": "government", + "type": "integer" + }, + { + "name": "nonfarm_change", + "type": "integer" + } + ] + } + }, + { + "name": "us-state-capitals", + "type": "table", + "path": "us-state-capitals.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "lon", + "type": "number" + }, + { + "name": "lat", + "type": "number" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ] + } + }, + { + "name": "volcano", + "type": "json", + "path": "volcano.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "weather", + "type": "table", + "path": "weather.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "location", + "type": "string" + }, + { + "name": "date", + "type": "date" + }, + { + "name": "precipitation", + "type": "number" + }, + { + "name": "temp_max", + "type": "number" + }, + { + "name": "temp_min", + "type": "number" + }, + { + "name": "wind", + "type": "number" + }, + { + "name": "weather", + "type": "string" + } + ] + } + }, + { + "name": "weather", + "type": "json", + "path": "weather.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "wheat", + "type": "table", + "path": "wheat.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "wheat", + "type": "number" + }, + { + "name": "wages", + "type": "number" + } + ] + } + }, + { + "name": "windvectors", + "type": "table", + "path": "windvectors.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "longitude", + "type": "number" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "dir", + "type": "integer" + }, + { + "name": "dirCat", + "type": "integer" + }, + { + "name": "speed", + "type": "number" + } + ] + } + }, + { + "name": "world-110m", + "type": "json", + "path": "world-110m.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "zipcodes", + "type": "table", + "path": "zipcodes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "zip_code", + "type": "integer" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "longitude", + "type": "number" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "county", + "type": "string" + } + ] + } + } + ] +} \ No newline at end of file diff --git a/datapackage.yaml b/datapackage.yaml new file mode 100644 index 00000000..590732b0 --- /dev/null +++ b/datapackage.yaml @@ -0,0 +1,1537 @@ +name: vega-datasets +description: Common repository for example datasets used by Vega related projects. +homepage: http://github.com/vega/vega-datasets.git +licenses: + - name: BSD-3-Clause + path: https://opensource.org/license/bsd-3-clause + title: The 3-Clause BSD License +sources: + - path: https://github.com/vega/vega-datasets/blob/next/SOURCES.md +contributors: + - title: UW Interactive Data Lab + path: http://idl.cs.washington.edu +version: 2.11.0 +created: '2024-11-26T22:32:24.187288+00:00' +resources: + - name: 7zip + type: file + path: 7zip.png + scheme: file + format: png + mediatype: image/png + encoding: utf-8 + - name: airports + type: table + path: airports.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: iata + type: string + - name: name + type: string + - name: city + type: string + - name: state + type: string + - name: country + type: string + - name: latitude + type: number + - name: longitude + type: number + - name: annual-precip + type: json + path: annual-precip.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + - name: anscombe + type: table + path: anscombe.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Series + type: string + - name: X + type: integer + - name: Y + type: number + - name: barley + type: table + path: barley.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: yield + type: number + - name: variety + type: string + - name: year + type: integer + - name: site + type: string + - name: birdstrikes + type: table + path: birdstrikes.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: Airport Name + type: string + - name: Aircraft Make Model + type: string + - name: Effect Amount of damage + type: string + - name: Flight Date + type: date + - name: Aircraft Airline Operator + type: string + - name: Origin State + type: string + - name: Phase of flight + type: string + - name: Wildlife Size + type: string + - name: Wildlife Species + type: string + - name: Time of day + type: string + - name: Cost Other + type: integer + - name: Cost Repair + type: integer + - name: Cost Total $ + type: integer + - name: Speed IAS in knots + type: integer + - name: budget + type: table + path: budget.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Source Category Code + type: integer + - name: Source category name + type: string + - name: Source subcategory + type: integer + - name: Source subcategory name + type: string + - name: Agency code + type: integer + - name: Agency name + type: string + - name: Bureau code + type: integer + - name: Bureau name + type: string + - name: Account code + type: integer + - name: Account name + type: string + - name: Treasury Agency code + type: integer + - name: On- or off-budget + type: string + - name: '1962' + type: string + - name: '1963' + type: string + - name: '1964' + type: string + - name: '1965' + type: string + - name: '1966' + type: string + - name: '1967' + type: string + - name: '1968' + type: string + - name: '1969' + type: string + - name: '1970' + type: string + - name: '1971' + type: string + - name: '1972' + type: string + - name: '1973' + type: string + - name: '1974' + type: string + - name: '1975' + type: string + - name: '1976' + type: string + - name: TQ + type: string + - name: '1977' + type: string + - name: '1978' + type: string + - name: '1979' + type: string + - name: '1980' + type: string + - name: '1981' + type: string + - name: '1982' + type: string + - name: '1983' + type: string + - name: '1984' + type: string + - name: '1985' + type: string + - name: '1986' + type: string + - name: '1987' + type: string + - name: '1988' + type: string + - name: '1989' + type: string + - name: '1990' + type: string + - name: '1991' + type: string + - name: '1992' + type: string + - name: '1993' + type: string + - name: '1994' + type: string + - name: '1995' + type: string + - name: '1996' + type: string + - name: '1997' + type: string + - name: '1998' + type: string + - name: '1999' + type: string + - name: '2000' + type: string + - name: '2001' + type: string + - name: '2002' + type: string + - name: '2003' + type: string + - name: '2004' + type: string + - name: '2005' + type: string + - name: '2006' + type: string + - name: '2007' + type: string + - name: '2008' + type: string + - name: '2009' + type: string + - name: '2010' + type: string + - name: '2011' + type: string + - name: '2012' + type: string + - name: '2013' + type: string + - name: '2014' + type: string + - name: '2015' + type: string + - name: '2016' + type: string + - name: '2017' + type: string + - name: '2018' + type: string + - name: '2019' + type: string + - name: '2020' + type: string + - name: budgets + type: table + path: budgets.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: budgetYear + type: integer + - name: forecastYear + type: integer + - name: value + type: number + - name: burtin + type: table + path: burtin.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Bacteria + type: string + - name: Penicillin + type: number + - name: Streptomycin + type: number + - name: Neomycin + type: number + - name: Gram_Staining + type: string + - name: Genus + type: string + - name: cars + type: table + path: cars.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Name + type: string + - name: Miles_per_Gallon + type: integer + - name: Cylinders + type: integer + - name: Displacement + type: number + - name: Horsepower + type: integer + - name: Weight_in_lbs + type: integer + - name: Acceleration + type: number + - name: Year + type: date + - name: Origin + type: string + - name: co2-concentration + type: table + path: co2-concentration.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: Date + type: date + - name: CO2 + type: number + - name: adjusted CO2 + type: number + - name: countries + type: table + path: countries.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: _comment + type: string + - name: year + type: integer + - name: fertility + type: number + - name: life_expect + type: number + - name: n_fertility + type: number + - name: n_life_expect + type: number + - name: country + type: string + - name: crimea + type: table + path: crimea.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: date + - name: wounds + type: integer + - name: other + type: integer + - name: disease + type: integer + - name: disasters + type: table + path: disasters.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: Entity + type: string + - name: Year + type: integer + - name: Deaths + type: integer + - name: driving + type: table + path: driving.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: side + type: string + - name: year + type: integer + - name: miles + type: integer + - name: gas + type: number + - name: earthquakes + type: json + path: earthquakes.json + scheme: file + format: geojson + mediatype: text/geojson + encoding: utf-8 + - name: ffox + type: file + path: ffox.png + scheme: file + format: png + mediatype: image/png + encoding: utf-8 + - name: flare-dependencies + type: table + path: flare-dependencies.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: source + type: integer + - name: target + type: integer + - name: flare + type: table + path: flare.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: id + type: integer + - name: name + type: string + - name: flights-10k + type: table + path: flights-10k.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: string + - name: delay + type: integer + - name: distance + type: integer + - name: origin + type: string + - name: destination + type: string + - name: flights-200k + type: table + path: flights-200k.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: delay + type: integer + - name: distance + type: integer + - name: time + type: number + - name: flights-20k + type: table + path: flights-20k.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: string + - name: delay + type: integer + - name: distance + type: integer + - name: origin + type: string + - name: destination + type: string + - name: flights-2k + type: table + path: flights-2k.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: string + - name: delay + type: integer + - name: distance + type: integer + - name: origin + type: string + - name: destination + type: string + - name: flights-3m + type: table + path: flights-3m.parquet + scheme: file + format: parquet + mediatype: application/parquet + schema: + fields: + - name: date + type: integer + - name: delay + type: integer + - name: distance + type: integer + - name: origin + type: string + - name: destination + type: string + - name: flights-5k + type: table + path: flights-5k.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: string + - name: delay + type: integer + - name: distance + type: integer + - name: origin + type: string + - name: destination + type: string + - name: flights-airport + type: table + path: flights-airport.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: origin + type: string + - name: destination + type: string + - name: count + type: integer + - name: football + type: table + path: football.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: date + - name: division + type: string + - name: home_team + type: string + - name: away_team + type: string + - name: home_score + type: integer + - name: away_score + type: integer + - name: gapminder-health-income + type: table + path: gapminder-health-income.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: country + type: string + - name: income + type: integer + - name: health + type: number + - name: population + type: integer + - name: region + type: string + - name: gapminder + type: table + path: gapminder.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: year + type: integer + - name: country + type: string + - name: cluster + type: integer + - name: pop + type: integer + - name: life_expect + type: number + - name: fertility + type: number + - name: gimp + type: file + path: gimp.png + scheme: file + format: png + mediatype: image/png + encoding: utf-8 + - name: github + type: table + path: github.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: time + type: string + - name: count + type: integer + - name: global-temp + type: table + path: global-temp.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: year + type: integer + - name: temp + type: number + - name: income + type: table + path: income.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: name + type: string + - name: region + type: string + - name: id + type: integer + - name: pct + type: number + - name: total + type: integer + - name: group + type: string + - name: iowa-electricity + type: table + path: iowa-electricity.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: year + type: date + - name: source + type: string + - name: net_generation + type: integer + - name: jobs + type: table + path: jobs.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: job + type: string + - name: sex + type: string + - name: year + type: integer + - name: count + type: integer + - name: perc + type: number + - name: la-riots + type: table + path: la-riots.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: first_name + type: string + - name: last_name + type: string + - name: age + type: integer + - name: gender + type: string + - name: race + type: string + - name: death_date + type: date + - name: address + type: string + - name: neighborhood + type: string + - name: type + type: string + - name: longitude + type: number + - name: latitude + type: number + - name: londonboroughs + type: json + path: londonBoroughs.json + scheme: file + format: topojson + mediatype: text/topojson + encoding: utf-8 + - name: londoncentroids + type: table + path: londonCentroids.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: name + type: string + - name: cx + type: number + - name: cy + type: number + - name: londontubelines + type: json + path: londonTubeLines.json + scheme: file + format: topojson + mediatype: text/topojson + encoding: utf-8 + - name: lookup_groups + type: table + path: lookup_groups.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: group + type: integer + - name: person + type: string + - name: lookup_people + type: table + path: lookup_people.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: name + type: string + - name: age + type: integer + - name: height + type: integer + - name: miserables + type: json + path: miserables.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + - name: monarchs + type: table + path: monarchs.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: name + type: string + - name: start + type: integer + - name: end + type: integer + - name: index + type: integer + - name: movies + type: table + path: movies.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Title + type: string + - name: US Gross + type: integer + - name: Worldwide Gross + type: integer + - name: US DVD Sales + type: integer + - name: Production Budget + type: integer + - name: Release Date + type: string + - name: MPAA Rating + type: string + - name: Running Time min + type: integer + - name: Distributor + type: string + - name: Source + type: string + - name: Major Genre + type: string + - name: Creative Type + type: string + - name: Director + type: string + - name: Rotten Tomatoes Rating + type: integer + - name: IMDB Rating + type: number + - name: IMDB Votes + type: integer + - name: normal-2d + type: table + path: normal-2d.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: u + type: number + - name: v + type: number + - name: obesity + type: table + path: obesity.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: id + type: integer + - name: rate + type: number + - name: state + type: string + - name: ohlc + type: table + path: ohlc.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: date + type: date + - name: open + type: number + - name: high + type: number + - name: low + type: number + - name: close + type: number + - name: signal + type: string + - name: ret + type: number + - name: penguins + type: table + path: penguins.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Species + type: string + - name: Island + type: string + - name: Beak Length (mm) + type: number + - name: Beak Depth (mm) + type: number + - name: Flipper Length (mm) + type: integer + - name: Body Mass (g) + type: integer + - name: Sex + type: string + - name: platformer-terrain + type: table + path: platformer-terrain.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: x + type: integer + - name: y + type: integer + - name: lumosity + type: number + - name: saturation + type: integer + - name: name + type: string + - name: id + type: string + - name: color + type: string + - name: key + type: string + - name: points + type: table + path: points.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: x + type: number + - name: y + type: number + - name: political-contributions + type: table + path: political-contributions.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: Candidate_Identification + type: string + - name: Candidate_Name + type: string + - name: Incumbent_Challenger_Status + type: string + - name: Party_Code + type: integer + - name: Party_Affiliation + type: string + - name: Total_Receipts + type: number + - name: Transfers_from_Authorized_Committees + type: integer + - name: Total_Disbursements + type: number + - name: Transfers_to_Authorized_Committees + type: number + - name: Beginning_Cash + type: number + - name: Ending_Cash + type: number + - name: Contributions_from_Candidate + type: number + - name: Loans_from_Candidate + type: integer + - name: Other_Loans + type: integer + - name: Candidate_Loan_Repayments + type: number + - name: Other_Loan_Repayments + type: integer + - name: Debts_Owed_By + type: number + - name: Total_Individual_Contributions + type: integer + - name: Candidate_State + type: string + - name: Candidate_District + type: integer + - name: Contributions_from_Other_Political_Committees + type: integer + - name: Contributions_from_Party_Committees + type: integer + - name: Coverage_End_Date + type: string + - name: Refunds_to_Individuals + type: integer + - name: Refunds_to_Committees + type: integer + - name: population + type: table + path: population.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: year + type: integer + - name: age + type: integer + - name: sex + type: integer + - name: people + type: integer + - name: population_engineers_hurricanes + type: table + path: population_engineers_hurricanes.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: state + type: string + - name: id + type: integer + - name: population + type: integer + - name: engineers + type: number + - name: hurricanes + type: integer + - name: seattle-weather-hourly-normals + type: table + path: seattle-weather-hourly-normals.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: date + type: datetime + - name: pressure + type: number + - name: temperature + type: number + - name: wind + type: number + - name: seattle-weather + type: table + path: seattle-weather.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: date + type: date + - name: precipitation + type: number + - name: temp_max + type: number + - name: temp_min + type: number + - name: wind + type: number + - name: weather + type: string + - name: sp500-2000 + type: table + path: sp500-2000.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: date + type: date + - name: open + type: number + - name: high + type: number + - name: low + type: number + - name: close + type: number + - name: adjclose + type: number + - name: volume + type: integer + - name: sp500 + type: table + path: sp500.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: date + type: string + - name: price + type: number + - name: stocks + type: table + path: stocks.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: symbol + type: string + - name: date + type: string + - name: price + type: number + - name: udistrict + type: table + path: udistrict.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: key + type: string + - name: lat + type: number + - name: unemployment-across-industries + type: table + path: unemployment-across-industries.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: series + type: string + - name: year + type: integer + - name: month + type: integer + - name: count + type: integer + - name: rate + type: number + - name: date + type: datetime + - name: unemployment + type: table + path: unemployment.tsv + scheme: file + format: tsv + mediatype: text/tsv + encoding: utf-8 + dialect: + csv: + delimiter: "\t" + schema: + fields: + - name: id + type: integer + - name: rate + type: number + - name: uniform-2d + type: table + path: uniform-2d.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: u + type: number + - name: v + type: number + - name: us-10m + type: json + path: us-10m.json + scheme: file + format: topojson + mediatype: text/topojson + encoding: utf-8 + - name: us-employment + type: table + path: us-employment.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: month + type: date + - name: nonfarm + type: integer + - name: private + type: integer + - name: goods_producing + type: integer + - name: service_providing + type: integer + - name: private_service_providing + type: integer + - name: mining_and_logging + type: integer + - name: construction + type: integer + - name: manufacturing + type: integer + - name: durable_goods + type: integer + - name: nondurable_goods + type: integer + - name: trade_transportation_utilties + type: integer + - name: wholesale_trade + type: number + - name: retail_trade + type: number + - name: transportation_and_warehousing + type: number + - name: utilities + type: number + - name: information + type: integer + - name: financial_activities + type: integer + - name: professional_and_business_services + type: integer + - name: education_and_health_services + type: integer + - name: leisure_and_hospitality + type: integer + - name: other_services + type: integer + - name: government + type: integer + - name: nonfarm_change + type: integer + - name: us-state-capitals + type: table + path: us-state-capitals.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: lon + type: number + - name: lat + type: number + - name: state + type: string + - name: city + type: string + - name: volcano + type: json + path: volcano.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + - name: weather + type: table + path: weather.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: location + type: string + - name: date + type: date + - name: precipitation + type: number + - name: temp_max + type: number + - name: temp_min + type: number + - name: wind + type: number + - name: weather + type: string + - name: weather + type: json + path: weather.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + - name: wheat + type: table + path: wheat.json + scheme: file + format: json + mediatype: text/json + encoding: utf-8 + dialect: + json: + keyed: true + schema: + fields: + - name: year + type: integer + - name: wheat + type: number + - name: wages + type: number + - name: windvectors + type: table + path: windvectors.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: longitude + type: number + - name: latitude + type: number + - name: dir + type: integer + - name: dirCat + type: integer + - name: speed + type: number + - name: world-110m + type: json + path: world-110m.json + scheme: file + format: topojson + mediatype: text/topojson + encoding: utf-8 + - name: zipcodes + type: table + path: zipcodes.csv + scheme: file + format: csv + mediatype: text/csv + encoding: utf-8 + schema: + fields: + - name: zip_code + type: integer + - name: latitude + type: number + - name: longitude + type: number + - name: city + type: string + - name: state + type: string + - name: county + type: string diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py new file mode 100644 index 00000000..e80d2575 --- /dev/null +++ b/scripts/build_datapackage.py @@ -0,0 +1,195 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "frictionless[json,parquet]", +# "polars", +# ] +# /// +""" +Generates machine-readable metadata, describing the contents of `/data/`_. + +Usage +----- +1. Install `uv`_. +2. Run this script from the repo root: + + >>> uv run scripts/build_datapackage.py # doctest: +SKIP + +Related +------- +- https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies +- https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata +- https://github.com/vega/vega-datasets/issues/629#issuecomment-2498618622 +- https://datapackage.org/ +- https://docs.pola.rs/ + + +.. _/data/: + https://github.com/vega/vega-datasets/tree/main/data +.. _uv: + https://docs.astral.sh/uv/getting-started/installation/ +""" + +from __future__ import annotations + +import datetime as dt +import json +import logging +import os +import warnings +from collections.abc import Mapping +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, TypedDict + +import polars as pl +from frictionless.resources import ( + JsonResource, + MapResource, + Package, + Resource, + TableResource, +) + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator, Sequence + from typing import Literal + + +logging.basicConfig(level=logging.INFO) + +type ResourceConstructor = Callable[..., Resource] + +TopoResource: ResourceConstructor = partial( + MapResource, format="topojson", datatype="map" +) +GeoResource: ResourceConstructor = partial( + MapResource, format="geojson", datatype="map" +) + +SUFFIX_IMAGE: set[str] = {".png"} +SUFFIX_TABULAR_SAFE: set[str] = {".csv", ".tsv", ".parquet"} +SUFFIX_JSON: Literal[".json"] = ".json" +SUFFIX_UNSUPPORTED: set[str] = {".arrow"} + + +class PackageMeta(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. _Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: str + version: str + homepage: str + description: str + licenses: Sequence[Mapping[Literal["name", "path", "title"], str]] + contributors: Sequence[Mapping[Literal["title", "path", "email", "role"], str]] + sources: Sequence[Mapping[Literal["title", "path", "email"], str]] + created: str + + +def extract_package_metadata(repo_root: Path, /) -> PackageMeta: + """Repurpose `package.json`_ for the `Data Package`_ standard. + + .. _package.json: + https://github.com/vega/vega-datasets/blob/main/package.json + .. _Data Package: + https://datapackage.org/standard/data-package/#properties + """ + fp: Path = repo_root / "package.json" + with fp.open(encoding="utf-8") as f: + m = json.load(f) + if not isinstance(m, Mapping): + msg = f"Unexpected type returned from {fp!r}\n" f"{type(m).__name__!r}" + raise TypeError(msg) + return { + "name": m["name"], + "version": m["version"], + "homepage": m["repository"]["url"], + "description": m["description"], + "contributors": [{"title": m["author"]["name"], "path": m["author"]["url"]}], + "licenses": [ + { + "name": m["license"], + "path": "https://opensource.org/license/bsd-3-clause", + "title": "The 3-Clause BSD License", + } + ], + "sources": [ + {"path": "https://github.com/vega/vega-datasets/blob/next/SOURCES.md"} + ], + "created": dt.datetime.now(dt.UTC).isoformat(), + } + + +def infer_json_constructor(source: Path, /) -> ResourceConstructor: + """Identifies *non-tabular* files, adds basic tag for spatial data.""" + df: pl.DataFrame = pl.read_json(source) + if any(tp.is_nested() for tp in df.schema.dtypes()): + if df.columns[0] == "type": + return TopoResource if df.item(0, 0) == "Topology" else GeoResource + else: + return JsonResource + else: + return TableResource + + +def iter_resources(data_root: Path, /) -> Iterator[Resource]: + """Yield all parseable resources, selecting the most appropriate ``Resource`` class.""" + tp: ResourceConstructor + for fp in data_root.iterdir(): + suffix: str = fp.suffix + if not fp.is_file(): + continue + if suffix in SUFFIX_UNSUPPORTED: + continue + elif suffix in SUFFIX_IMAGE: + tp = Resource + elif suffix in SUFFIX_TABULAR_SAFE: + tp = TableResource + elif suffix == SUFFIX_JSON: + tp = infer_json_constructor(fp) + else: + msg = f"Skipping unexpected extension {suffix!r}\n\n{fp!r}" + warnings.warn(msg, stacklevel=2) + continue + yield tp(fp.name) + + +def main( + *, + stem: str = "datapackage", + output_format: Literal["json", "yaml", "both"] = "both", +) -> None: + if output_format not in {"json", "yaml", "both"}: + msg = f"Expected one of {["json", "yaml", "both"]!r} but got {output_format!r}" + raise TypeError(msg) + repo_dir: Path = Path(__file__).parent.parent + data_dir: Path = repo_dir / "data" + # NOTE: Forcing base directory here + # - Ensures ``frictionless`` doesn't insert platform-specific path separator(s) + os.chdir(data_dir) + pkg_meta = extract_package_metadata(repo_dir) + + logging.info( + f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." + ) + pkg = Package(resources=list(iter_resources(data_dir)), **pkg_meta) + logging.info(f"Collected {len(pkg.resources)} resources") + logging.info("Inferring metadata ...") + pkg.infer() + if output_format in {"json", "both"}: + p = (repo_dir / f"{stem}.json").as_posix() + logging.info(f"Writing {p!r}") + pkg.to_json(p) + if output_format in {"yaml", "both"}: + p = (repo_dir / f"{stem}.yaml").as_posix() + logging.info(f"Writing {p!r}") + pkg.to_yaml(p) + + +if __name__ == "__main__": + main()