feat(PR GH Action): Workflow and script to automatically validate and build the index
caviri committed Aug 30, 2024
1 parent dd86a49 commit 9787ffd
Showing 8 changed files with 235 additions and 89 deletions.
58 changes: 58 additions & 0 deletions .github/scripts/build_index.py
@@ -0,0 +1,58 @@
import argparse
import json
import os
from pathlib import Path
import yaml
import jsonschema
from datetime import datetime

def read_yaml_files(base_path, folders):
    # Collect (folder, parsed content) pairs for every *.yaml file in the scanned folders.
    yaml_files = []
    for folder in folders:
        folder_path = base_path.joinpath(folder)
        for yaml_file in folder_path.glob('*.yaml'):
            with open(yaml_file, 'r', encoding='utf-8') as f:
                yaml_files.append((folder, yaml.safe_load(f)))
    return yaml_files

def validate_yaml(yaml_content, schema_path):
    # Validate one parsed YAML document against the JSON schema stored at schema_path.
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    jsonschema.validate(instance=yaml_content, schema=schema)

def build_index(yaml_files, schemas_path):
    # Build the catalog index, keeping only entries that pass schema validation.
    index = {'index_timestamp': str(datetime.now()), 'catalog': {}}
    for folder, content in yaml_files:
        schema_path = schemas_path.joinpath(f"{folder}.json")
        try:
            validate_yaml(content, schema_path)
            if folder not in index['catalog']:
                index['catalog'][folder] = []
            index['catalog'][folder].append(content)
        except jsonschema.exceptions.ValidationError as e:
            print(f"Validation error in {folder}: {e}")
    return index

def main():
    parser = argparse.ArgumentParser(description="Build and deploy index from YAML files.")
    parser.add_argument("--build-branch", "-b", type=str, required=True, help="Path to the build branch.")
    parser.add_argument("--deploy-branch", "-d", type=str, required=True, help="Path to the deploy branch.")
    parser.add_argument("--folders-to-scan", "-f", type=str, nargs='+', default=["datasets"], help="List of folders to scan for YAML files.")
    args = parser.parse_args()

    build_path = Path(args.build_branch)
    deploy_path = Path(args.deploy_branch)
    schemas_path = build_path.joinpath('catalog/schemas')

    yaml_files = read_yaml_files(build_path.joinpath('catalog'), args.folders_to_scan)
    index = build_index(yaml_files, schemas_path)

    # Write the index to the deploy-branch checkout and keep a copy under src/ in the build branch.
    deploy_path.mkdir(parents=True, exist_ok=True)
    with open(deploy_path.joinpath('index.json'), 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=4)

    with open(build_path.joinpath('src/index.json'), 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=4)

if __name__ == "__main__":
    main()
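
Before opening a PR, a single catalog entry can be checked against the same schema the script uses; below is a minimal sketch with `jsonschema`, assuming it is run from a checkout of the main branch (paths come from this commit's layout).

```python
# Minimal sketch: validate one dataset YAML against catalog/schemas/datasets.json,
# mirroring what validate_yaml() does inside build_index.py.
# Assumes the working directory is a checkout of the main branch.
import json
from pathlib import Path

import jsonschema
import yaml

schema = json.loads(Path("catalog/schemas/datasets.json").read_text(encoding="utf-8"))
record = yaml.safe_load(Path("catalog/datasets/0001-pneuma-dataset.yaml").read_text(encoding="utf-8"))

try:
    jsonschema.validate(instance=record, schema=schema)
    print("0001-pneuma-dataset: valid")
except jsonschema.exceptions.ValidationError as err:
    print(f"0001-pneuma-dataset: invalid - {err.message}")
```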
71 changes: 71 additions & 0 deletions .github/workflows/build_index.yml
@@ -0,0 +1,71 @@
name: Build index

on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - 'catalog/**.yaml'

permissions:
  contents: write

jobs:
  build-index:
    runs-on: ubuntu-latest
    if: github.event.repository.fork == false

    steps:
      - name: Checkout main
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
          ref: 'main'
          path: main

      - name: Checkout gh-pages
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
          ref: 'gh-pages'
          path: gh-pages

      - name: Setup python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install yaml & jsonschema
        run: pip install pyyaml jsonschema

      - name: Build index
        run: python -B main/.github/scripts/build_index.py --build-branch main --deploy-branch gh-pages --folders-to-scan datasets

      - name: Get last commit message - main
        id: last-commit-message-main
        run: echo "msg=$(git -C main log -1 --pretty=%s)" >> $GITHUB_OUTPUT

      - name: Get last commit message - gh-pages
        id: last-commit-message-gh-pages
        run: echo "msg=$(git -C gh-pages log -1 --pretty=%s)" >> $GITHUB_OUTPUT

      - name: Commit - main
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: ${{ steps.last-commit-message-main.outputs.msg }}
          commit_options: '--amend --no-edit'
          file_pattern: '*.json catalog/**.yaml'
          push_options: '--force'
          skip_fetch: true
          repository: main

      - name: Commit - gh-pages
        uses: stefanzweifel/git-auto-commit-action@v5
        with:
          commit_message: ${{ steps.last-commit-message-gh-pages.outputs.msg }}
          commit_options: '--amend --no-edit'
          file_pattern: '*.json'
          push_options: '--force'
          skip_fetch: true
          repository: gh-pages
7 changes: 7 additions & 0 deletions README.md
@@ -86,6 +86,13 @@ Making use of the latest developments in WebAssembly technologies

Open issues are registered in the repository and picked up by the community.


## Development

### How to test the GitHub Action that builds the index?

The GitHub Action can be tested locally by pulling both branches:
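For example, with the two branches checked out side by side as `main/` and `gh-pages/` (directory names assumed here to match the workflow's `path:` settings, e.g. via `git worktree add`), the build step can be rehearsed with a sketch like:

```python
# Hypothetical local rehearsal of the workflow's "Build index" step.
# Assumes ./main and ./gh-pages contain checkouts of the respective branches.
import subprocess

subprocess.run(
    [
        "python", "-B", "main/.github/scripts/build_index.py",
        "--build-branch", "main",
        "--deploy-branch", "gh-pages",
        "--folders-to-scan", "datasets",
    ],
    check=True,
)
```

Locally there is nothing to push; inspecting the regenerated `index.json` in both directories is enough.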

## Developed by

Developed by SDSC in the frame of the Open Research Data for the Sciences Hackathon, in collaboration with the EPFL Open Science office. We thank the pNEUMA team for their support during the development of this prototype.
20 changes: 20 additions & 0 deletions catalog/datasets/0001-pneuma-dataset.yaml
@@ -0,0 +1,20 @@
schema_version: "1.0.0"
schema_type: "dataset"
dataset_id: "0001-pneuma-dataset"
title: "pNEUMA Dataset"
doi: "10.5281/zenodo.7426506"
version: "v1"
description: "pNEUMA is an open large-scale dataset of naturalistic trajectories of half a million vehicles that have been collected by a one-of-a-kind experiment by a swarm of drones in the congested downtown area of Athens, Greece."
created_at: "2022-12-08T16:26:11Z"
updated_at: "2022-12-08T16:26:11Z"
data_format: "zip"
tags:
  - "traffic"
  - "drone"
source: "Zenodo"
authors:
  name: "Kim, Sohyeong"
  orcid: "0000-0000-0000-0000"
license: "CC BY 4.0"
access_url: "https://zenodo.org/record/7426506/"
documentation_url: "https://zenodo.org/record/7426506"
22 changes: 22 additions & 0 deletions catalog/datasets/0002-pneuma-vision-dataset.yml
@@ -0,0 +1,22 @@
schema_version: "1.0.0"
schema_type: "dataset"
dataset_id: "0002-pneuma-vision-dataset"
title: "pNEUMA vision dataset"
doi: "10.5281/zenodo.10491409"
version: "v1"
description: "The pNEUMA dataset is the drone traffic imagery dataset that contains images of frame and vehicle annotations as positions. This dataset is the expansion of the pNEUMA, the urban trajectory dataset collected by swarms of drones in Athens. For more details about pNEUMA and pNEUMA Vision, please check our website at https://open-traffic.epfl.ch and github."
created_at: "2023-08-30T15:12:56Z"
updated_at: "2023-08-30T15:12:56Z"
data_format: "csv"
tags:
  - "traffic"
  - "GNSS"
  - "urban mobility"
  - "traffic safety"
source: "Zenodo"
authors:
  name: "Barmpounakis, Emmanouil"
  orcid: "0000-0000-0000-0000"
license: "CC BY 4.0"
access_url: "https://zenodo.org/record/10491409"
documentation_url: "https://zenodo.org/record/10491409"
13 changes: 12 additions & 1 deletion catalog/schemas/dataset.json → catalog/schemas/datasets.json
@@ -2,6 +2,14 @@
   "version": "1.0.0",
   "type": "object",
   "properties": {
+    "schema_version": {
+      "type": "string",
+      "description": "The version of the dataset schema used"
+    },
+    "schema_type": {
+      "type": "string",
+      "description": "Which of the available schemas is used"
+    },
     "dataset_id": {
       "type": "string",
       "description": "A unique identifier for the dataset"
@@ -14,6 +22,10 @@
       "type": "string",
       "description": "The DOI of the dataset"
     },
+    "version": {
+      "type": "string",
+      "description": "Version of the dataset"
+    },
     "description": {
       "type": "string",
       "description": "A brief description of the dataset"
@@ -86,7 +98,6 @@
     "description",
     "created_at",
     "data_format",
-    "size_in_mb",
     "source",
     "authors",
     "license",
File renamed without changes.
133 changes: 45 additions & 88 deletions src/index.json
@@ -1,92 +1,49 @@
 {
-  "index_timestamp": "2024-08-30T12:00:00Z",
-  "catalog": {
-    "datasets": [
-      {
-        "version": "1.0.0",
-        "dataset_id": "123e4567-e89b-12d3-a456-426614174000",
-        "title": "Global Climate Data 2023",
-        "doi": "10.1234/global-climate-2023",
-        "description": "This dataset contains global climate data for the year 2023, including temperature, precipitation, and atmospheric pressure readings from various regions around the world.",
-        "created_at": "2023-01-01T12:00:00Z",
-        "updated_at": "2023-06-15T12:00:00Z",
-        "data_format": "CSV",
-        "tags": ["climate", "temperature", "precipitation", "atmospheric pressure"],
-        "source": "National Meteorological Organization",
-        "authors": {
-          "name": "Dr. Jane Doe",
-          "orcid": "0000-0002-1825-0097"
-        },
-        "license": "CC BY 4.0",
-        "access_url": "https://example.com/datasets/global-climate-2023",
-        "documentation_url": "https://example.com/datasets/global-climate-2023/documentation",
-        "access_endpoint": "https://api.example.com/climate/2023/data",
-        "documentation_endpoint": "https://api.example.com/climate/2023/docs"
-      },
-      {
-        "version": "1.0.0",
-        "dataset_id": "223e4567-e89b-12d3-a456-426614174111",
-        "title": "Regional Climate Data 2022",
-        "doi": "10.1234/regional-climate-2022",
-        "description": "This dataset provides detailed climate data for specific regions for the year 2022.",
-        "created_at": "2022-01-01T12:00:00Z",
-        "updated_at": "2022-12-15T12:00:00Z",
-        "data_format": "JSON",
-        "tags": ["climate", "regional data", "temperature"],
-        "source": "Regional Weather Stations",
-        "authors": {
-          "name": "Dr. John Smith",
-          "orcid": "0000-0002-3456-7890"
-        },
-        "license": "MIT",
-        "access_url": "https://example.com/datasets/regional-climate-2022",
-        "documentation_url": "https://example.com/datasets/regional-climate-2022/documentation",
-        "access_endpoint": "https://api.example.com/climate/2022/data",
-        "documentation_endpoint": "https://api.example.com/climate/2022/docs"
-      },
-      {
-        "version": "1.0.0",
-        "dataset_id": "323e4567-e89b-12d3-a456-426614174222",
-        "title": "Global Economic Indicators 2023",
-        "doi": "10.1234/global-economic-2023",
-        "description": "This dataset includes global economic indicators such as GDP, inflation rates, and unemployment figures for the year 2023.",
-        "created_at": "2023-02-01T12:00:00Z",
-        "updated_at": "2023-07-01T12:00:00Z",
-        "data_format": "Parquet",
-        "tags": ["economy", "GDP", "inflation", "unemployment"],
-        "source": "World Economic Forum",
-        "authors": {
-          "name": "Dr. Emily Johnson",
-          "orcid": "0000-0003-1234-5678"
-        },
-        "license": "CC BY-SA 4.0",
-        "access_url": "https://example.com/datasets/global-economic-2023",
-        "documentation_url": "https://example.com/datasets/global-economic-2023/documentation",
-        "access_endpoint": "https://api.example.com/economy/2023/data",
-        "documentation_endpoint": "https://api.example.com/economy/2023/docs"
-      },
-      {
-        "version": "1.0.0",
-        "dataset_id": "423e4567-e89b-12d3-a456-426614174333",
-        "title": "Regional Economic Data 2022",
-        "doi": "10.1234/regional-economic-2022",
-        "description": "This dataset provides economic data for various regions in 2022, including GDP and unemployment rates.",
-        "created_at": "2022-03-01T12:00:00Z",
-        "updated_at": "2022-09-01T12:00:00Z",
-        "data_format": "CSV",
-        "tags": ["economy", "regional", "GDP", "unemployment"],
-        "source": "Regional Economic Offices",
-        "authors": {
-          "name": "Dr. Michael Lee",
-          "orcid": "0000-0004-5678-9101"
-        },
-        "license": "Apache 2.0",
-        "access_url": "https://example.com/datasets/regional-economic-2022",
-        "documentation_url": "https://example.com/datasets/regional-economic-2022/documentation",
-        "access_endpoint": "https://api.example.com/economy/2022/data",
-        "documentation_endpoint": "https://api.example.com/economy/2022/docs"
-      }
-    ]
-  }
+    "index_timestamp": "2024-08-30T12:00:00Z",
+    "catalog": {
+        "datasets": [
+            {
+                "schema_version": "1.0.0",
+                "schema_type": "dataset",
+                "dataset_id": "0001-pneuma-dataset",
+                "title": "pNEUMA Dataset",
+                "doi": "10.5281/zenodo.7426506",
+                "version": "v1",
+                "description": "pNEUMA is an open large-scale dataset of naturalistic trajectories of half a million vehicles that have been collected by a one-of-a-kind experiment by a swarm of drones in the congested downtown area of Athens, Greece.",
+                "created_at": "2022-12-08T16:26:11Z",
+                "updated_at": "2022-12-08T16:26:11Z",
+                "data_format": "zip",
+                "tags": ["traffic", "drone"],
+                "source": "Zenodo",
+                "authors": {
+                    "name": "Kim, Sohyeong",
+                    "orcid": "0000-0000-0000-0000"
+                },
+                "license": "CC BY 4.0",
+                "access_url": "https://zenodo.org/record/7426506/",
+                "documentation_url": "https://zenodo.org/record/7426506"
+            },
+            {
+                "schema_version": "1.0.0",
+                "schema_type": "dataset",
+                "dataset_id": "0002-pneuma-vision-dataset",
+                "title": "pNEUMA vision dataset",
+                "doi": "10.5281/zenodo.10491409",
+                "version": "v1",
+                "description": "The pNEUMA dataset is the drone traffic imagery dataset that contains images of frame and vehicle annotations as positions. This dataset is the expansion of the pNEUMA, the urban trajectory dataset collected by swarms of drones in Athens. For more details about pNEUMA and pNEUMA Vision, please check our website at https://open-traffic.epfl.ch and github.",
+                "created_at": "2023-08-30T15:12:56Z",
+                "updated_at": "2023-08-30T15:12:56Z",
+                "data_format": "csv",
+                "tags": ["traffic", "GNSS", "urban mobility", "traffic safety"],
+                "source": "Zenodo",
+                "authors": {
+                    "name": "Barmpounakis, Emmanouil",
+                    "orcid": "0000-0000-0000-0000"
+                },
+                "license": "CC BY 4.0",
+                "access_url": "https://zenodo.org/record/10491409",
+                "documentation_url": "https://zenodo.org/record/10491409"
+            }
+        ]
+    }
 }
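
Once generated, the index can be consumed as plain JSON; below is a minimal consumer sketch, assuming a local checkout where `src/index.json` exists (the same file is also pushed to the gh-pages branch as `index.json`).

```python
# Minimal consumer sketch: list the catalogued datasets from the generated index.
import json
from pathlib import Path

index = json.loads(Path("src/index.json").read_text(encoding="utf-8"))
for entry in index["catalog"].get("datasets", []):
    print(f'{entry["dataset_id"]}: {entry["title"]} ({entry["doi"]})')
```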
