From f2022315cfac39494c2a0c97a6bdcd0555745dff Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 27 Nov 2024 20:31:47 +0000 Subject: [PATCH] feat: generate `frictionless` data package metadata (#631) * feat: generate `frictionless` data package metadata Closes #629 - Adds `build_datapackage.py` script - Generates initial metadata in both `.yaml` and `.json` formats * feat(typing): spell mappings more explicitly Improves readability, but mainly to support per-resource `description`, `sources`, `licenses` See https://github.com/vega/vega-datasets/pull/631#issuecomment-2503760452 * refactor(ruff): misc linting * ci: change default output to `json`, fix missing `contributors` https://github.com/vega/vega-datasets/pull/631#pullrequestreview-2465311789, https://github.com/vega/vega-datasets/pull/631#issuecomment-2504151082 * feat: add support for `.arrow` https://github.com/vega/vega-datasets/pull/631#issuecomment-2503825716, https://github.com/vega/vega-datasets/pull/631#issuecomment-2504182615 * feat(DRAFT): add `.with_extras()`, for `description`, `source`, and `license` Unused currently, depends on having a more structured `SOURCES.md` https://github.com/vega/vega-datasets/pull/631#issuecomment-2503825716 * add data package to build step * ci: add uv * ci: fix uv setup * ci: fail if there are changes * just kidding (timestamps change things) * chore: update pr template --------- Co-authored-by: Dominik Moritz --- .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/workflows/test.yml | 4 + datapackage.json | 2699 ++++++++++++++++++++++++++++++ package.json | 2 +- scripts/build_datapackage.py | 339 ++++ 5 files changed, 3044 insertions(+), 1 deletion(-) create mode 100644 datapackage.json create mode 100755 scripts/build_datapackage.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 295f2f38..cf20ca92 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,4 @@ Please: - [ ] Update the changelog in `README.md`. Add a new version if needed and add a short description of the change. - [ ] Add information about new datasets to `sources.md`. +- [ ] Only add the `datapackage.json` if there are changes to it besides the timestamp. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 080b3c8b..603b4ec5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,6 +23,10 @@ jobs: cache: 'npm' node-version: 20 + - uses: astral-sh/setup-uv@v3 + with: + version: ">=0.5.0" + - name: Install Node dependencies run: npm ci diff --git a/datapackage.json b/datapackage.json new file mode 100644 index 00000000..21057199 --- /dev/null +++ b/datapackage.json @@ -0,0 +1,2699 @@ +{ + "name": "vega-datasets", + "description": "Common repository for example datasets used by Vega related projects.", + "homepage": "http://github.com/vega/vega-datasets.git", + "licenses": [ + { + "name": "BSD-3-Clause", + "path": "https://opensource.org/license/bsd-3-clause", + "title": "The 3-Clause BSD License" + } + ], + "sources": [ + { + "path": "https://github.com/vega/vega-datasets/blob/next/SOURCES.md" + } + ], + "contributors": [ + { + "title": "UW Interactive Data Lab", + "path": "http://idl.cs.washington.edu" + } + ], + "version": "2.11.0", + "created": "2024-11-27T20:10:38.655412+00:00", + "resources": [ + { + "name": "earthquakes", + "type": "json", + "path": "earthquakes.json", + "scheme": "file", + "format": "geojson", + "mediatype": "text/geojson", + "encoding": "utf-8" + }, + { + "name": "annual-precip", + "type": "json", + "path": "annual-precip.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "iowa-electricity", + "type": "table", + "path": "iowa-electricity.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "year", + "type": "date" + }, + { + "name": "source", + "type": "string" + }, + { + "name": "net_generation", + "type": "integer" + } + ] + } + }, + { + "name": "population", + "type": "table", + "path": "population.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "sex", + "type": "integer" + }, + { + "name": "people", + "type": "integer" + } + ] + } + }, + { + "name": "barley", + "type": "table", + "path": "barley.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "yield", + "type": "number" + }, + { + "name": "variety", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "site", + "type": "string" + } + ] + } + }, + { + "name": "disasters", + "type": "table", + "path": "disasters.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Entity", + "type": "string" + }, + { + "name": "Year", + "type": "integer" + }, + { + "name": "Deaths", + "type": "integer" + } + ] + } + }, + { + "name": "anscombe", + "type": "table", + "path": "anscombe.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Series", + "type": "string" + }, + { + "name": "X", + "type": "integer" + }, + { + "name": "Y", + "type": "number" + } + ] + } + }, + { + "name": "burtin", + "type": "table", + "path": "burtin.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Bacteria", + "type": "string" + }, + { + "name": "Penicillin", + "type": "number" + }, + { + "name": "Streptomycin", + "type": "number" + }, + { + "name": "Neomycin", + "type": "number" + }, + { + "name": "Gram_Staining", + "type": "string" + }, + { + "name": "Genus", + "type": "string" + } + ] + } + }, + { + "name": "la-riots", + "type": "table", + "path": "la-riots.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "first_name", + "type": "string" + }, + { + "name": "last_name", + "type": "string" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "gender", + "type": "string" + }, + { + "name": "race", + "type": "string" + }, + { + "name": "death_date", + "type": "date" + }, + { + "name": "address", + "type": "string" + }, + { + "name": "neighborhood", + "type": "string" + }, + { + "name": "type", + "type": "string" + }, + { + "name": "longitude", + "type": "number" + }, + { + "name": "latitude", + "type": "number" + } + ] + } + }, + { + "name": "movies", + "type": "table", + "path": "movies.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Title", + "type": "string" + }, + { + "name": "US Gross", + "type": "integer" + }, + { + "name": "Worldwide Gross", + "type": "integer" + }, + { + "name": "US DVD Sales", + "type": "integer" + }, + { + "name": "Production Budget", + "type": "integer" + }, + { + "name": "Release Date", + "type": "string" + }, + { + "name": "MPAA Rating", + "type": "string" + }, + { + "name": "Running Time min", + "type": "integer" + }, + { + "name": "Distributor", + "type": "string" + }, + { + "name": "Source", + "type": "string" + }, + { + "name": "Major Genre", + "type": "string" + }, + { + "name": "Creative Type", + "type": "string" + }, + { + "name": "Director", + "type": "string" + }, + { + "name": "Rotten Tomatoes Rating", + "type": "integer" + }, + { + "name": "IMDB Rating", + "type": "number" + }, + { + "name": "IMDB Votes", + "type": "integer" + } + ] + } + }, + { + "name": "unemployment", + "type": "table", + "path": "unemployment.tsv", + "scheme": "file", + "format": "tsv", + "mediatype": "text/tsv", + "encoding": "utf-8", + "dialect": { + "csv": { + "delimiter": "\t" + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + } + ] + } + }, + { + "name": "us-10m", + "type": "json", + "path": "us-10m.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "population_engineers_hurricanes", + "type": "table", + "path": "population_engineers_hurricanes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "state", + "type": "string" + }, + { + "name": "id", + "type": "integer" + }, + { + "name": "population", + "type": "integer" + }, + { + "name": "engineers", + "type": "number" + }, + { + "name": "hurricanes", + "type": "integer" + } + ] + } + }, + { + "name": "crimea", + "type": "table", + "path": "crimea.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "wounds", + "type": "integer" + }, + { + "name": "other", + "type": "integer" + }, + { + "name": "disease", + "type": "integer" + } + ] + } + }, + { + "name": "seattle-weather-hourly-normals", + "type": "table", + "path": "seattle-weather-hourly-normals.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "datetime" + }, + { + "name": "pressure", + "type": "number" + }, + { + "name": "temperature", + "type": "number" + }, + { + "name": "wind", + "type": "number" + } + ] + } + }, + { + "name": "flights-airport", + "type": "table", + "path": "flights-airport.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + }, + { + "name": "count", + "type": "integer" + } + ] + } + }, + { + "name": "flights-20k", + "type": "table", + "path": "flights-20k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "londoncentroids", + "type": "table", + "path": "londonCentroids.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "cx", + "type": "number" + }, + { + "name": "cy", + "type": "number" + } + ] + } + }, + { + "name": "flights-200k", + "type": "table", + "path": "flights-200k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "time", + "type": "number" + } + ] + } + }, + { + "name": "flights-200k", + "type": "table", + "path": "flights-200k.arrow", + "scheme": "file", + "format": "arrow", + "mediatype": "application/vnd.apache.arrow.file", + "schema": { + "fields": [ + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "time", + "type": "number" + } + ] + } + }, + { + "name": "obesity", + "type": "table", + "path": "obesity.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + }, + { + "name": "state", + "type": "string" + } + ] + } + }, + { + "name": "windvectors", + "type": "table", + "path": "windvectors.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "longitude", + "type": "number" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "dir", + "type": "integer" + }, + { + "name": "dirCat", + "type": "integer" + }, + { + "name": "speed", + "type": "number" + } + ] + } + }, + { + "name": "cars", + "type": "table", + "path": "cars.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Name", + "type": "string" + }, + { + "name": "Miles_per_Gallon", + "type": "integer" + }, + { + "name": "Cylinders", + "type": "integer" + }, + { + "name": "Displacement", + "type": "number" + }, + { + "name": "Horsepower", + "type": "integer" + }, + { + "name": "Weight_in_lbs", + "type": "integer" + }, + { + "name": "Acceleration", + "type": "number" + }, + { + "name": "Year", + "type": "date" + }, + { + "name": "Origin", + "type": "string" + } + ] + } + }, + { + "name": "seattle-weather", + "type": "table", + "path": "seattle-weather.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "precipitation", + "type": "number" + }, + { + "name": "temp_max", + "type": "number" + }, + { + "name": "temp_min", + "type": "number" + }, + { + "name": "wind", + "type": "number" + }, + { + "name": "weather", + "type": "string" + } + ] + } + }, + { + "name": "countries", + "type": "table", + "path": "countries.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "_comment", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "fertility", + "type": "number" + }, + { + "name": "life_expect", + "type": "number" + }, + { + "name": "n_fertility", + "type": "number" + }, + { + "name": "n_life_expect", + "type": "number" + }, + { + "name": "country", + "type": "string" + } + ] + } + }, + { + "name": "platformer-terrain", + "type": "table", + "path": "platformer-terrain.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "x", + "type": "integer" + }, + { + "name": "y", + "type": "integer" + }, + { + "name": "lumosity", + "type": "number" + }, + { + "name": "saturation", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "id", + "type": "string" + }, + { + "name": "color", + "type": "string" + }, + { + "name": "key", + "type": "string" + } + ] + } + }, + { + "name": "co2-concentration", + "type": "table", + "path": "co2-concentration.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Date", + "type": "date" + }, + { + "name": "CO2", + "type": "number" + }, + { + "name": "adjusted CO2", + "type": "number" + } + ] + } + }, + { + "name": "londontubelines", + "type": "json", + "path": "londonTubeLines.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "ohlc", + "type": "table", + "path": "ohlc.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "open", + "type": "number" + }, + { + "name": "high", + "type": "number" + }, + { + "name": "low", + "type": "number" + }, + { + "name": "close", + "type": "number" + }, + { + "name": "signal", + "type": "string" + }, + { + "name": "ret", + "type": "number" + } + ] + } + }, + { + "name": "7zip", + "type": "file", + "path": "7zip.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "points", + "type": "table", + "path": "points.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "x", + "type": "number" + }, + { + "name": "y", + "type": "number" + } + ] + } + }, + { + "name": "uniform-2d", + "type": "table", + "path": "uniform-2d.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "u", + "type": "number" + }, + { + "name": "v", + "type": "number" + } + ] + } + }, + { + "name": "github", + "type": "table", + "path": "github.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "time", + "type": "string" + }, + { + "name": "count", + "type": "integer" + } + ] + } + }, + { + "name": "unemployment-across-industries", + "type": "table", + "path": "unemployment-across-industries.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "series", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "month", + "type": "integer" + }, + { + "name": "count", + "type": "integer" + }, + { + "name": "rate", + "type": "number" + }, + { + "name": "date", + "type": "datetime" + } + ] + } + }, + { + "name": "lookup_people", + "type": "table", + "path": "lookup_people.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "age", + "type": "integer" + }, + { + "name": "height", + "type": "integer" + } + ] + } + }, + { + "name": "flare-dependencies", + "type": "table", + "path": "flare-dependencies.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "source", + "type": "integer" + }, + { + "name": "target", + "type": "integer" + } + ] + } + }, + { + "name": "world-110m", + "type": "json", + "path": "world-110m.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "gapminder", + "type": "table", + "path": "gapminder.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "cluster", + "type": "integer" + }, + { + "name": "pop", + "type": "integer" + }, + { + "name": "life_expect", + "type": "number" + }, + { + "name": "fertility", + "type": "number" + } + ] + } + }, + { + "name": "weather", + "type": "json", + "path": "weather.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "flights-2k", + "type": "table", + "path": "flights-2k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "budget", + "type": "table", + "path": "budget.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Source Category Code", + "type": "integer" + }, + { + "name": "Source category name", + "type": "string" + }, + { + "name": "Source subcategory", + "type": "integer" + }, + { + "name": "Source subcategory name", + "type": "string" + }, + { + "name": "Agency code", + "type": "integer" + }, + { + "name": "Agency name", + "type": "string" + }, + { + "name": "Bureau code", + "type": "integer" + }, + { + "name": "Bureau name", + "type": "string" + }, + { + "name": "Account code", + "type": "integer" + }, + { + "name": "Account name", + "type": "string" + }, + { + "name": "Treasury Agency code", + "type": "integer" + }, + { + "name": "On- or off-budget", + "type": "string" + }, + { + "name": "1962", + "type": "string" + }, + { + "name": "1963", + "type": "string" + }, + { + "name": "1964", + "type": "string" + }, + { + "name": "1965", + "type": "string" + }, + { + "name": "1966", + "type": "string" + }, + { + "name": "1967", + "type": "string" + }, + { + "name": "1968", + "type": "string" + }, + { + "name": "1969", + "type": "string" + }, + { + "name": "1970", + "type": "string" + }, + { + "name": "1971", + "type": "string" + }, + { + "name": "1972", + "type": "string" + }, + { + "name": "1973", + "type": "string" + }, + { + "name": "1974", + "type": "string" + }, + { + "name": "1975", + "type": "string" + }, + { + "name": "1976", + "type": "string" + }, + { + "name": "TQ", + "type": "string" + }, + { + "name": "1977", + "type": "string" + }, + { + "name": "1978", + "type": "string" + }, + { + "name": "1979", + "type": "string" + }, + { + "name": "1980", + "type": "string" + }, + { + "name": "1981", + "type": "string" + }, + { + "name": "1982", + "type": "string" + }, + { + "name": "1983", + "type": "string" + }, + { + "name": "1984", + "type": "string" + }, + { + "name": "1985", + "type": "string" + }, + { + "name": "1986", + "type": "string" + }, + { + "name": "1987", + "type": "string" + }, + { + "name": "1988", + "type": "string" + }, + { + "name": "1989", + "type": "string" + }, + { + "name": "1990", + "type": "string" + }, + { + "name": "1991", + "type": "string" + }, + { + "name": "1992", + "type": "string" + }, + { + "name": "1993", + "type": "string" + }, + { + "name": "1994", + "type": "string" + }, + { + "name": "1995", + "type": "string" + }, + { + "name": "1996", + "type": "string" + }, + { + "name": "1997", + "type": "string" + }, + { + "name": "1998", + "type": "string" + }, + { + "name": "1999", + "type": "string" + }, + { + "name": "2000", + "type": "string" + }, + { + "name": "2001", + "type": "string" + }, + { + "name": "2002", + "type": "string" + }, + { + "name": "2003", + "type": "string" + }, + { + "name": "2004", + "type": "string" + }, + { + "name": "2005", + "type": "string" + }, + { + "name": "2006", + "type": "string" + }, + { + "name": "2007", + "type": "string" + }, + { + "name": "2008", + "type": "string" + }, + { + "name": "2009", + "type": "string" + }, + { + "name": "2010", + "type": "string" + }, + { + "name": "2011", + "type": "string" + }, + { + "name": "2012", + "type": "string" + }, + { + "name": "2013", + "type": "string" + }, + { + "name": "2014", + "type": "string" + }, + { + "name": "2015", + "type": "string" + }, + { + "name": "2016", + "type": "string" + }, + { + "name": "2017", + "type": "string" + }, + { + "name": "2018", + "type": "string" + }, + { + "name": "2019", + "type": "string" + }, + { + "name": "2020", + "type": "string" + } + ] + } + }, + { + "name": "political-contributions", + "type": "table", + "path": "political-contributions.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Candidate_Identification", + "type": "string" + }, + { + "name": "Candidate_Name", + "type": "string" + }, + { + "name": "Incumbent_Challenger_Status", + "type": "string" + }, + { + "name": "Party_Code", + "type": "integer" + }, + { + "name": "Party_Affiliation", + "type": "string" + }, + { + "name": "Total_Receipts", + "type": "number" + }, + { + "name": "Transfers_from_Authorized_Committees", + "type": "integer" + }, + { + "name": "Total_Disbursements", + "type": "number" + }, + { + "name": "Transfers_to_Authorized_Committees", + "type": "number" + }, + { + "name": "Beginning_Cash", + "type": "number" + }, + { + "name": "Ending_Cash", + "type": "number" + }, + { + "name": "Contributions_from_Candidate", + "type": "number" + }, + { + "name": "Loans_from_Candidate", + "type": "integer" + }, + { + "name": "Other_Loans", + "type": "integer" + }, + { + "name": "Candidate_Loan_Repayments", + "type": "number" + }, + { + "name": "Other_Loan_Repayments", + "type": "integer" + }, + { + "name": "Debts_Owed_By", + "type": "number" + }, + { + "name": "Total_Individual_Contributions", + "type": "integer" + }, + { + "name": "Candidate_State", + "type": "string" + }, + { + "name": "Candidate_District", + "type": "integer" + }, + { + "name": "Contributions_from_Other_Political_Committees", + "type": "integer" + }, + { + "name": "Contributions_from_Party_Committees", + "type": "integer" + }, + { + "name": "Coverage_End_Date", + "type": "string" + }, + { + "name": "Refunds_to_Individuals", + "type": "integer" + }, + { + "name": "Refunds_to_Committees", + "type": "integer" + } + ] + } + }, + { + "name": "weather", + "type": "table", + "path": "weather.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "location", + "type": "string" + }, + { + "name": "date", + "type": "date" + }, + { + "name": "precipitation", + "type": "number" + }, + { + "name": "temp_max", + "type": "number" + }, + { + "name": "temp_min", + "type": "number" + }, + { + "name": "wind", + "type": "number" + }, + { + "name": "weather", + "type": "string" + } + ] + } + }, + { + "name": "volcano", + "type": "json", + "path": "volcano.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "lookup_groups", + "type": "table", + "path": "lookup_groups.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "group", + "type": "integer" + }, + { + "name": "person", + "type": "string" + } + ] + } + }, + { + "name": "budgets", + "type": "table", + "path": "budgets.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "budgetYear", + "type": "integer" + }, + { + "name": "forecastYear", + "type": "integer" + }, + { + "name": "value", + "type": "number" + } + ] + } + }, + { + "name": "londonboroughs", + "type": "json", + "path": "londonBoroughs.json", + "scheme": "file", + "format": "topojson", + "mediatype": "text/topojson", + "encoding": "utf-8" + }, + { + "name": "airports", + "type": "table", + "path": "airports.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "iata", + "type": "string" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "longitude", + "type": "number" + } + ] + } + }, + { + "name": "normal-2d", + "type": "table", + "path": "normal-2d.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "u", + "type": "number" + }, + { + "name": "v", + "type": "number" + } + ] + } + }, + { + "name": "us-state-capitals", + "type": "table", + "path": "us-state-capitals.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "lon", + "type": "number" + }, + { + "name": "lat", + "type": "number" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ] + } + }, + { + "name": "penguins", + "type": "table", + "path": "penguins.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "Species", + "type": "string" + }, + { + "name": "Island", + "type": "string" + }, + { + "name": "Beak Length (mm)", + "type": "number" + }, + { + "name": "Beak Depth (mm)", + "type": "number" + }, + { + "name": "Flipper Length (mm)", + "type": "integer" + }, + { + "name": "Body Mass (g)", + "type": "integer" + }, + { + "name": "Sex", + "type": "string" + } + ] + } + }, + { + "name": "miserables", + "type": "json", + "path": "miserables.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8" + }, + { + "name": "zipcodes", + "type": "table", + "path": "zipcodes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "zip_code", + "type": "integer" + }, + { + "name": "latitude", + "type": "number" + }, + { + "name": "longitude", + "type": "number" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "county", + "type": "string" + } + ] + } + }, + { + "name": "driving", + "type": "table", + "path": "driving.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "side", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "miles", + "type": "integer" + }, + { + "name": "gas", + "type": "number" + } + ] + } + }, + { + "name": "ffox", + "type": "file", + "path": "ffox.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "gapminder-health-income", + "type": "table", + "path": "gapminder-health-income.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "country", + "type": "string" + }, + { + "name": "income", + "type": "integer" + }, + { + "name": "health", + "type": "number" + }, + { + "name": "population", + "type": "integer" + }, + { + "name": "region", + "type": "string" + } + ] + } + }, + { + "name": "sp500", + "type": "table", + "path": "sp500.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "price", + "type": "number" + } + ] + } + }, + { + "name": "flights-10k", + "type": "table", + "path": "flights-10k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "birdstrikes", + "type": "table", + "path": "birdstrikes.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "Airport Name", + "type": "string" + }, + { + "name": "Aircraft Make Model", + "type": "string" + }, + { + "name": "Effect Amount of damage", + "type": "string" + }, + { + "name": "Flight Date", + "type": "date" + }, + { + "name": "Aircraft Airline Operator", + "type": "string" + }, + { + "name": "Origin State", + "type": "string" + }, + { + "name": "Phase of flight", + "type": "string" + }, + { + "name": "Wildlife Size", + "type": "string" + }, + { + "name": "Wildlife Species", + "type": "string" + }, + { + "name": "Time of day", + "type": "string" + }, + { + "name": "Cost Other", + "type": "integer" + }, + { + "name": "Cost Repair", + "type": "integer" + }, + { + "name": "Cost Total $", + "type": "integer" + }, + { + "name": "Speed IAS in knots", + "type": "integer" + } + ] + } + }, + { + "name": "monarchs", + "type": "table", + "path": "monarchs.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "start", + "type": "integer" + }, + { + "name": "end", + "type": "integer" + }, + { + "name": "index", + "type": "integer" + } + ] + } + }, + { + "name": "income", + "type": "table", + "path": "income.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "region", + "type": "string" + }, + { + "name": "id", + "type": "integer" + }, + { + "name": "pct", + "type": "number" + }, + { + "name": "total", + "type": "integer" + }, + { + "name": "group", + "type": "string" + } + ] + } + }, + { + "name": "us-employment", + "type": "table", + "path": "us-employment.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "month", + "type": "date" + }, + { + "name": "nonfarm", + "type": "integer" + }, + { + "name": "private", + "type": "integer" + }, + { + "name": "goods_producing", + "type": "integer" + }, + { + "name": "service_providing", + "type": "integer" + }, + { + "name": "private_service_providing", + "type": "integer" + }, + { + "name": "mining_and_logging", + "type": "integer" + }, + { + "name": "construction", + "type": "integer" + }, + { + "name": "manufacturing", + "type": "integer" + }, + { + "name": "durable_goods", + "type": "integer" + }, + { + "name": "nondurable_goods", + "type": "integer" + }, + { + "name": "trade_transportation_utilties", + "type": "integer" + }, + { + "name": "wholesale_trade", + "type": "number" + }, + { + "name": "retail_trade", + "type": "number" + }, + { + "name": "transportation_and_warehousing", + "type": "number" + }, + { + "name": "utilities", + "type": "number" + }, + { + "name": "information", + "type": "integer" + }, + { + "name": "financial_activities", + "type": "integer" + }, + { + "name": "professional_and_business_services", + "type": "integer" + }, + { + "name": "education_and_health_services", + "type": "integer" + }, + { + "name": "leisure_and_hospitality", + "type": "integer" + }, + { + "name": "other_services", + "type": "integer" + }, + { + "name": "government", + "type": "integer" + }, + { + "name": "nonfarm_change", + "type": "integer" + } + ] + } + }, + { + "name": "football", + "type": "table", + "path": "football.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "division", + "type": "string" + }, + { + "name": "home_team", + "type": "string" + }, + { + "name": "away_team", + "type": "string" + }, + { + "name": "home_score", + "type": "integer" + }, + { + "name": "away_score", + "type": "integer" + } + ] + } + }, + { + "name": "flights-5k", + "type": "table", + "path": "flights-5k.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "date", + "type": "string" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "gimp", + "type": "file", + "path": "gimp.png", + "scheme": "file", + "format": "png", + "mediatype": "image/png", + "encoding": "utf-8" + }, + { + "name": "jobs", + "type": "table", + "path": "jobs.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "job", + "type": "string" + }, + { + "name": "sex", + "type": "string" + }, + { + "name": "year", + "type": "integer" + }, + { + "name": "count", + "type": "integer" + }, + { + "name": "perc", + "type": "number" + } + ] + } + }, + { + "name": "udistrict", + "type": "table", + "path": "udistrict.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "key", + "type": "string" + }, + { + "name": "lat", + "type": "number" + } + ] + } + }, + { + "name": "global-temp", + "type": "table", + "path": "global-temp.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "temp", + "type": "number" + } + ] + } + }, + { + "name": "flights-3m", + "type": "table", + "path": "flights-3m.parquet", + "scheme": "file", + "format": "parquet", + "mediatype": "application/parquet", + "schema": { + "fields": [ + { + "name": "date", + "type": "integer" + }, + { + "name": "delay", + "type": "integer" + }, + { + "name": "distance", + "type": "integer" + }, + { + "name": "origin", + "type": "string" + }, + { + "name": "destination", + "type": "string" + } + ] + } + }, + { + "name": "flare", + "type": "table", + "path": "flare.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "id", + "type": "integer" + }, + { + "name": "name", + "type": "string" + } + ] + } + }, + { + "name": "sp500-2000", + "type": "table", + "path": "sp500-2000.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "date", + "type": "date" + }, + { + "name": "open", + "type": "number" + }, + { + "name": "high", + "type": "number" + }, + { + "name": "low", + "type": "number" + }, + { + "name": "close", + "type": "number" + }, + { + "name": "adjclose", + "type": "number" + }, + { + "name": "volume", + "type": "integer" + } + ] + } + }, + { + "name": "wheat", + "type": "table", + "path": "wheat.json", + "scheme": "file", + "format": "json", + "mediatype": "text/json", + "encoding": "utf-8", + "dialect": { + "json": { + "keyed": true + } + }, + "schema": { + "fields": [ + { + "name": "year", + "type": "integer" + }, + { + "name": "wheat", + "type": "number" + }, + { + "name": "wages", + "type": "number" + } + ] + } + }, + { + "name": "stocks", + "type": "table", + "path": "stocks.csv", + "scheme": "file", + "format": "csv", + "mediatype": "text/csv", + "encoding": "utf-8", + "schema": { + "fields": [ + { + "name": "symbol", + "type": "string" + }, + { + "name": "date", + "type": "string" + }, + { + "name": "price", + "type": "number" + } + ] + } + } + ] +} \ No newline at end of file diff --git a/package.json b/package.json index d3911e78..a57fea06 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,7 @@ "typescript": "^5.6.3" }, "scripts": { - "prebuild": "./scripts/make-url-index.sh > src/urls.ts", + "prebuild": "./scripts/make-url-index.sh > src/urls.ts && ./scripts/build_datapackage.py", "build": "rollup -c", "flights": "node scripts/flights.js", "github": "python scripts/github.py", diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py new file mode 100755 index 00000000..30834a63 --- /dev/null +++ b/scripts/build_datapackage.py @@ -0,0 +1,339 @@ +#!/usr/bin/env -S uv run + +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "frictionless[json,parquet]", +# "polars", +# ] +# /// +""" +Generates machine-readable metadata, describing the contents of `/data/`_. + +Usage +----- +1. Install `uv`_. +2. Run this script from the repo root: + + >>> uv run scripts/build_datapackage.py # doctest: +SKIP + +Related +------- +- https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies +- https://packaging.python.org/en/latest/specifications/inline-script-metadata/#inline-script-metadata +- https://github.com/vega/vega-datasets/issues/629#issuecomment-2498618622 +- https://datapackage.org/ +- https://docs.pola.rs/ + + +.. _/data/: + https://github.com/vega/vega-datasets/tree/main/data +.. _uv: + https://docs.astral.sh/uv/getting-started/installation/ +""" + +from __future__ import annotations + +import datetime as dt +import json +import logging +import os +import warnings +from collections.abc import Mapping +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, NotRequired, Required, TypedDict, Unpack + +import frictionless as fl +import polars as pl +from frictionless.fields import ( + AnyField, + ArrayField, + BooleanField, + DateField, + DatetimeField, + DurationField, + IntegerField, + NumberField, + ObjectField, + StringField, + TimeField, +) +from frictionless.resources import ( + JsonResource, + MapResource, + Package, + Resource, + TableResource, +) + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator, Sequence + from typing import ClassVar, Literal + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger() + +type ResourceConstructor = Callable[..., Resource] +type PathMeta = Literal["name", "path", "format", "scheme", "mediatype"] +type PythonDataType = ( + type[ + int + | float + | bool + | str + | dict + | list + | tuple + | dt.date + | dt.datetime + | dt.time + | dt.timedelta + | object + | bytes + ] + | None +) + + +POLARS_PY_TO_FL_FIELD: Mapping[PythonDataType, type[fl.Field]] = { + int: IntegerField, + float: NumberField, + bool: BooleanField, + str: StringField, + None: AnyField, # NOTE: Unclear why this isn't represented with a class + dict: ObjectField, + list: ArrayField, + tuple: ArrayField, + dt.date: DateField, + dt.datetime: DatetimeField, + dt.time: TimeField, + dt.timedelta: DurationField, + object: AnyField, + bytes: AnyField, +} +""" +Maps `polars python`_ type repr to ``datapackage`` `Field Types`_. + + +.. _polars python: + https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L167-L197 +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +""" + +TopoResource: ResourceConstructor = partial( + MapResource, format="topojson", datatype="map" +) +GeoResource: ResourceConstructor = partial( + MapResource, format="geojson", datatype="map" +) + + +class ResourceAdapter: + mediatype: ClassVar[Mapping[str, str]] = { + ".arrow": "application/vnd.apache.arrow.file" + } + """https://www.iana.org/assignments/media-types/application/vnd.apache.arrow.file""" + + @classmethod + def from_path(cls, source: Path, /) -> Resource: + suffix = source.suffix + match suffix: + case ".csv" | ".tsv" | ".parquet": + return cls.from_tabular_safe(source) + case ".json": + return cls.from_json(source) + case ".png": + return cls.from_image(source) + case ".arrow": + return cls.from_arrow(source) + case _: + return None + + @classmethod + def infer_as(cls, source: Path, tp: ResourceConstructor, /) -> Resource: + resource = tp(source.name) + resource.infer() + return resource + + @classmethod + def from_arrow(cls, source: Path, /) -> Resource: + file_meta = cls._extract_file_parts(source) + return TableResource(**file_meta, schema=frame_to_schema(pl.scan_ipc(source))) + + @classmethod + def from_tabular_safe(cls, source: Path, /) -> Resource: + return cls.infer_as(source, TableResource) + + @classmethod + def from_image(cls, source: Path, /) -> Resource: + return cls.infer_as(source, Resource) + + @classmethod + def from_json(cls, source: Path, /) -> Resource: + """Identifies *non-tabular* files, adds basic tag for spatial data.""" + df: pl.DataFrame = pl.read_json(source) + if any(tp.is_nested() for tp in df.schema.dtypes()): + if df.columns[0] == "type": + tp = TopoResource if df.item(0, 0) == "Topology" else GeoResource + else: + tp = JsonResource + else: + tp = TableResource + return cls.infer_as(source, tp) + + @classmethod + def _extract_file_parts(cls, source: Path, /) -> dict[PathMeta, str]: + """Metadata that can be inferred from the file path *alone*.""" + parts = { + "name": source.stem, + "path": source.name, + "format": source.suffix[1:], + "scheme": "file", + } + if mediatype := cls.mediatype.get(source.suffix): + parts["mediatype"] = mediatype + return parts + + @staticmethod + def with_extras(resource: Resource, /, **extras: Unpack[ResourceMeta]) -> Resource: + """TODO: Use as part of https://github.com/vega/vega-datasets/pull/631#issuecomment-2503760452""" + for name, value in extras.items(): + setattr(resource, name, value) + return resource + + +class Source(TypedDict, total=False): + title: str + path: Required[str] + email: str + version: str + + +class License(TypedDict): + name: str + path: str + title: NotRequired[str] + + +class Contributor(TypedDict, total=False): + title: str + givenName: str + familyName: str + path: str + email: str + roles: Sequence[str] + organization: str + + +class ResourceMeta(TypedDict, total=False): + description: str + sources: Sequence[Source] + licenses: Sequence[License] + + +class PackageMeta(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. _Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: str + version: str + homepage: str + description: str + licenses: Sequence[License] + contributors: Sequence[Contributor] + sources: Sequence[Source] + created: str + + +def frame_to_schema(frame: pl.LazyFrame | pl.DataFrame, /) -> fl.Schema: + py_schema = frame.lazy().collect_schema().to_python() + return fl.Schema( + fields=[POLARS_PY_TO_FL_FIELD[tp](name=name) for name, tp in py_schema.items()] + ) + + +def extract_package_metadata(repo_root: Path, /) -> PackageMeta: + """Repurpose `package.json`_ for the `Data Package`_ standard. + + .. _package.json: + https://github.com/vega/vega-datasets/blob/main/package.json + .. _Data Package: + https://datapackage.org/standard/data-package/#properties + """ + fp: Path = repo_root / "package.json" + with fp.open(encoding="utf-8") as f: + m = json.load(f) + if not isinstance(m, Mapping): + msg = f"Unexpected type returned from {fp!r}\n{type(m).__name__!r}" + raise TypeError(msg) + return PackageMeta( + name=m["name"], + version=m["version"], + homepage=m["repository"]["url"], + description=m["description"], + contributors=[Contributor(title=m["author"]["name"], path=m["author"]["url"])], + licenses=[ + License( + name=m["license"], + path="https://opensource.org/license/bsd-3-clause", + title="The 3-Clause BSD License", + ) + ], + sources=[ + Source(path="https://github.com/vega/vega-datasets/blob/next/SOURCES.md") + ], + created=dt.datetime.now(dt.UTC).isoformat(), + ) + + +def iter_resources(data_root: Path, /) -> Iterator[Resource]: + """Yield all parseable resources, selecting the most appropriate ``Resource`` class.""" + for fp in data_root.iterdir(): + if not fp.is_file(): + continue + if resource := ResourceAdapter.from_path(fp): + yield resource + else: + msg = f"Skipping unexpected extension {fp.suffix!r}\n\n{fp!r}" + warnings.warn(msg, stacklevel=2) + continue + + +def main( + *, + stem: str = "datapackage", + output_format: Literal["json", "yaml", "both"] = "json", +) -> None: + if output_format not in {"json", "yaml", "both"}: + msg = f"Expected one of {["json", "yaml", "both"]!r} but got {output_format!r}" + raise TypeError(msg) + repo_dir: Path = Path(__file__).parent.parent + data_dir: Path = repo_dir / "data" + # NOTE: Forcing base directory here + # - Ensures ``frictionless`` doesn't insert platform-specific path separator(s) + os.chdir(data_dir) + pkg_meta = extract_package_metadata(repo_dir) + logger.info( + f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." + ) + pkg = Package(resources=list(iter_resources(data_dir)), **pkg_meta) + logger.info(f"Collected {len(pkg.resources)} resources") + if output_format in {"json", "both"}: + p = (repo_dir / f"{stem}.json").as_posix() + logger.info(f"Writing {p!r}") + pkg.to_json(p) + if output_format in {"yaml", "both"}: + p = (repo_dir / f"{stem}.yaml").as_posix() + logger.info(f"Writing {p!r}") + pkg.to_yaml(p) + + +if __name__ == "__main__": + main()