From 742a07928bf403238e7faabb95a7c87762506305 Mon Sep 17 00:00:00 2001 From: Johannes Hentschel Date: Wed, 6 Dec 2023 00:50:26 +0100 Subject: [PATCH] updates annotation workflow to v4.3 --- .github/workflows/concat_metadata.py | 156 ------------ .github/workflows/helper.py | 82 +++++++ .../workflows/meta_corpus_stats_generator.yml | 80 ------ .github/workflows/update_pages.py | 231 ------------------ .github/workflows/version_release.yml | 72 ++++++ .pre-commit-config.yaml | 6 + 6 files changed, 160 insertions(+), 467 deletions(-) delete mode 100644 .github/workflows/concat_metadata.py create mode 100644 .github/workflows/helper.py delete mode 100644 .github/workflows/meta_corpus_stats_generator.yml delete mode 100644 .github/workflows/update_pages.py create mode 100644 .github/workflows/version_release.yml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/concat_metadata.py b/.github/workflows/concat_metadata.py deleted file mode 100644 index c4f7e27..0000000 --- a/.github/workflows/concat_metadata.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -import argparse, sys, os - -import pandas as pd -from pytablewriter import MarkdownTableWriter - -from update_pages import resolve_dir - -def check_and_create(d): - """ Turn input into an existing, absolute directory path. - """ - if not os.path.isdir(d): - d = resolve_dir(os.path.join(os.getcwd(), d)) - if not os.path.isdir(d): - os.makedirs(d) - print(f"Created directory {d}") - return resolve_dir(d) - - -def check_dir(d): - if not os.path.isdir(d): - d = resolve_dir(os.path.join(os.getcwd(), d)) - if not os.path.isdir(d): - raise argparse.ArgumentTypeError(d + " needs to be an existing directory") - return resolve_dir(d) - -def concat_metadata(path): - _, folders, _ = next(os.walk(path)) - tsv_paths, keys = [], [] - for subdir in sorted(folders): - potential = os.path.join(path, subdir, 'metadata.tsv') - if os.path.isfile(potential): - tsv_paths.append(potential) - keys.append(subdir) - if len(tsv_paths) == 0: - return pd.DataFrame() - dfs = [pd.read_csv(tsv_path, sep='\t', dtype='string') for tsv_path in tsv_paths] - try: - concatenated = pd.concat(dfs, keys=keys) - except AssertionError: - info = 'Levels: ' + ', '.join(f"{key}: {df.index.nlevels} ({df.index.names})" for key, df in zip(keys, dfs)) - print(f"Concatenation of DataFrames failed due to an alignment error. {info}") - raise - try: - rel_path_col = next(col for col in ('subdirectory', 'rel_paths') if col in concatenated.columns) - except StopIteration: - raise ValueError(f"Metadata is expected to come with a column called 'subdirectory' or (previously) 'rel_paths'.") - rel_paths = [os.path.join(corpus, rel_path) for corpus, rel_path in zip(concatenated.index.get_level_values(0), concatenated[rel_path_col].values)] - concatenated.loc[:, rel_path_col] = rel_paths - if 'rel_path' in concatenated.columns: - rel_paths = [os.path.join(corpus, rel_path) for corpus, rel_path in zip(concatenated.index.get_level_values(0), concatenated.rel_path.values)] - concatenated.loc[:, 'rel_path'] = rel_paths - concatenated = concatenated.droplevel(1) - concatenated.index.rename('corpus', inplace=True) - return concatenated - -def df2md(df, name=None): - """ Turns a DataFrame into a MarkDown table. The returned writer can be converted into a string. - """ - writer = MarkdownTableWriter() - writer.table_name = name - writer.header_list = list(df.columns.values) - writer.value_matrix = df.values.tolist() - return writer - -def metadata2markdown(concatenated): - try: - fname_col = next(col for col in ('fname', 'fnames') if col in concatenated.columns) - except StopIteration: - raise ValueError(f"Metadata is expected to come with a column called 'fname' or (previously) 'fnames'.") - rename4markdown = { - fname_col: 'file_name', - 'last_mn': 'measures', - 'label_count': 'labels', - 'harmony_version': 'standard', - } - concatenated = concatenated.rename(columns=rename4markdown) - result = '# Overview' - for corpus_name, df in concatenated[rename4markdown.values()].groupby(level=0): - heading = f"\n\n## {corpus_name}\n\n" - md = str(df2md(df.fillna(''))) - result += heading + md - return result - - - -def write_md(md_str, md_path): - if os.path.isfile(md_path): - msg = 'Updated' - with open(md_path, 'r', encoding='utf-8') as f: - lines = f.readlines() - else: - msg = 'Created' - lines = [] - with open(md_path, 'w', encoding='utf-8') as f: - for line in lines: - if '# Overview' in line: - break - f.write(line) - else: - f.write('\n\n') - f.write(md_str) - print(f"{msg} {md_path}") - -def write_tsv(df, tsv_path): - df.to_csv(tsv_path, sep='\t', index=True) - print(f"Concatenated metadata written to {tsv_path}.") - - -def main(args): - concatenated = concat_metadata(args.dir) - if len(concatenated) == 0: - print(f"No metadata found in the child directories of {args.dir}.") - return - tsv_path = os.path.join(args.out, 'concatenated_metadata.tsv') - write_tsv(concatenated, tsv_path) - md_str = metadata2markdown(concatenated) - md_path = os.path.join(args.out, 'README.md') - write_md(md_str, md_path) - - - -################################################################################ -# COMMANDLINE INTERFACE -################################################################################ -if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="""\ -------------------------------------------------------------------- -| Script for generating metadata and README for meta repositories | -------------------------------------------------------------------- - -""", - ) - parser.add_argument( - "-d", - "--dir", - metavar="DIR", - type=check_dir, - help="Pass the root of the repository clone to gather metadata.tsv files from its child directories.", - ) - parser.add_argument( - "-o", - "--out", - metavar="OUT_DIR", - type=check_and_create, - help="""Output directory for TSV and MD file.""", - ) - args = parser.parse_args() - if args.dir is None: - args.dir = os.getcwd() - if args.out is None: - args.out = os.getcwd() - main(args) diff --git a/.github/workflows/helper.py b/.github/workflows/helper.py new file mode 100644 index 0000000..3698fd6 --- /dev/null +++ b/.github/workflows/helper.py @@ -0,0 +1,82 @@ +import argparse +import re +import os +def create_new_tag(tag, update_major): + if not (re.match(r'^v\d+\.\d+$', tag)): + raise Exception(f'tag: {tag} is not giving in the correct format e.i v0.0') + + # Notice that this could make a tag version of three digits become two digits + # e.i 3.2.1 -> 3.3 + digits_tags = (re.match(r'^v\d+\.\d+', tag)).group()[1::].split('.') + if len(digits_tags) != 2: + raise Exception(f'tag: {tag} must contain two version digits') + + major_num = int(digits_tags[0]) + minor_num = int(digits_tags[1]) + if update_major: + print(f"Label detected to update major version") + major_num += 1 + minor_num = 0 + else: + minor_num += 1 + return f"v{major_num}.{minor_num}" + +def store_tag(tag): + with open(os.environ['GITHUB_OUTPUT'], 'a') as fh: + print(f'new_tag={tag}', file=fh) + +def update_file_with_tag(f_name, old_tag, new_tag): + if os.path.isfile(f_name): + try: + with open(f_name, "r",encoding="utf-8") as f: + data = f.read() + data = data.replace(old_tag, new_tag) + with open(f_name, "w",encoding="utf-8") as f: + f.write(data) + except Exception as e: + print(e) + else: + print(f"Warning: {f_name} doest exist at the current path {os.getcwd()}") + +def main(args): + tag = args.tag + new_tag = "v2.0" + if not tag: + print(f"Warning: a latest release with a tag does not exist in current repository, starting from {new_tag}") + else: + new_tag = create_new_tag(tag,args.update_major_ver) + print(f"Repository with tag: {tag}, creating a new tag with: {new_tag}") + update_file_with_tag(".zenodo.json", tag, new_tag) + update_file_with_tag("CITATION.cff", tag, new_tag) + update_file_with_tag("README.md", tag, new_tag) + store_tag(new_tag) + +def run(): + args = parser.parse_args() + main(args) + + +def str_to_bool(value): + if value.lower() == "true": + return True + elif value.lower() == "false": + return False + else: + raise Exception( + f"Error: value {value} as argument is not accepted\n" + f"retry with true or false" + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tag", type=str, + help="Require: latest tag", + required=True + ) + parser.add_argument( + "--update_major_ver", type=str_to_bool, + help="Require: boolean to update the major tag number", + required=True + ) + run() \ No newline at end of file diff --git a/.github/workflows/meta_corpus_stats_generator.yml b/.github/workflows/meta_corpus_stats_generator.yml deleted file mode 100644 index 7248383..0000000 --- a/.github/workflows/meta_corpus_stats_generator.yml +++ /dev/null @@ -1,80 +0,0 @@ -name: meta_corpus_stats_generator - -on: - workflow_dispatch: - -jobs: - meta_corpus_stats_generator: - runs-on: ubuntu-latest - - steps: - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - name: Checkout main - uses: actions/checkout@v2 - with: - path: main - submodules: recursive - token: ${{ secrets.MS3_BOT_TOKEN }} - - name: Configure git - working-directory: ./main - continue-on-error: true - run: | - git config --global user.name "ms3-bot" - git config --global user.email dcml.annotators@epfl.ch - git config --global user.token ${{ secrets.MS3_BOT_TOKEN }} - - name: Pull latest updates into submodules - working-directory: ./main - run: | - git submodule foreach "git checkout -f main | git clean -xfd | git reset --hard | git submodule foreach --recursive git clean -xfd | git submodule deinit -f . | git submodule update --init --recursive" - - name: Pull current workflow - working-directory: ./main - run: | - wget https://github.com/DCMLab/annotation_workflow_template/archive/refs/heads/meta_corpora.zip - unzip meta_corpora.zip - cp -r annotation_workflow_template-meta_corpora/. . - rm -r annotation_workflow_template-meta_corpora/ - rm meta_corpora.zip - - name: Push updated workflow - working-directory: ./main - continue-on-error: true - run: | - git add -A - git commit -m "Current version of workflows" - git push - - name: Clone corpusstats - uses: actions/checkout@v2 - with: - repository: DCMLab/corpus_statistics_generator - path: ./corpusstats - ref: main - token: ${{ secrets.MS3_BOT_MAINTENANCE }} - - name: Install corpusstats - run: python -m pip install -e ./corpusstats - - name: Gather metadata from submodules - working-directory: ./main - run: | - python -m pip install pandas pytablewriter - python .github/workflows/concat_metadata.py - - name: Push files - working-directory: ./main - continue-on-error: true - run: | - git add -A - git commit -m "Updated metadata from submodules" - git push - - name: Generate GitHub pages - working-directory: ./main - run: | - python .github/workflows/update_pages.py -g ${{ github.repository }} -t ${{ secrets.MS3_BOT_MAINTENANCE }} -o ../public - - name: Display generated files - working-directory: ./public - run: ls - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./public - enable_jekyll: true diff --git a/.github/workflows/update_pages.py b/.github/workflows/update_pages.py deleted file mode 100644 index 776f88a..0000000 --- a/.github/workflows/update_pages.py +++ /dev/null @@ -1,231 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 -import argparse -import os -import sys -import io -import base64 -from shutil import copy - -import corpusstats -import pandas as pd - -INDEX_FNAME = "index.md" -GANTT_FNAME = "gantt.md" -STATS_FNAME = "stats.md" -JEKYLL_CFG_FNAME = "_config.yml" -STYLE_FNAME = "assets/css/style.scss" - - -def make_index_file(gantt=True, stats=True): - file = "" - if gantt: - file += f"* [Modulation plans]({GANTT_FNAME})\n" - if stats: - file +=f"* [Corpus state]({STATS_FNAME})\n" - return file - -def generate_stats_text(pie_string, table_string): - STATS_FILE = f""" -# Corpus Status - -## Vital statistics - -{table_string} - -## Completion ratios - -{pie_string} -""" - return STATS_FILE - - -JEKYLL_CFG_FILE = "theme: jekyll-theme-tactile " - -STYLE_FILE = """--- ---- - -@import "{{ site.theme }}"; - -.inner { - max-width: 95%; - width: 1024px; -} -""" - - - -def resolve_dir(d): - """ Resolves '~' to HOME directory and turns ``d`` into an absolute path. - """ - if d is None: - return None - if '~' in d: - return os.path.expanduser(d) - return os.path.abspath(d) - - -def write_to_file(args, filename, content_str): - path = check_dir(".") if args.out is None else args.out - fname = os.path.join(path, filename) - _ = check_and_create( - os.path.dirname(fname) - ) # in case the file name included path components - with open(fname, "w", encoding="utf-8") as f: - f.writelines(content_str) - - -def write_gantt_file(args, gantt_path=None): - if gantt_path is None: - gantt_path = ( - check_dir("gantt") - if args.out is None - else check_dir(os.path.join(args.out, "gantt")) - ) - fnames = sorted(os.listdir(gantt_path)) - file_content = "\n".join( - f'' - for f in fnames) - write_to_file(args, GANTT_FNAME, file_content) - - -def write_stats_file(args): - try: - p = corpusstats.Provider(args.github, args.token) - except: - print(f"corpusstats failed with the following message: {sys.exc_info()[1]}") - return False - pie_string = "" - pie_array = [] - for s in p.tabular_stats: - plot = p.pie_chart(s) - img = io.BytesIO() - plot.savefig(img, format="png") - img.seek(0) - img = base64.encodebytes(img.getvalue()).decode("utf-8") - pie_array.append( - f'
' - ) - pie_string = "".join(pie_array) - - vital_stats = pd.DataFrame.from_dict(p.stats, orient="index") - vital_stats = vital_stats.iloc[0:6, 0:2] - vital_stats = vital_stats.to_markdown(index=False, headers=[]) - full_text = generate_stats_text(pie_string, vital_stats) - write_to_file(args, STATS_FNAME, full_text) - return True - - - -def check_and_create(d): - """ Turn input into an existing, absolute directory path. - """ - if not os.path.isdir(d): - d = resolve_dir(os.path.join(os.getcwd(), d)) - if not os.path.isdir(d): - os.makedirs(d) - print(f"Created directory {d}") - return resolve_dir(d) - - -def check_dir(d): - if not os.path.isdir(d): - d = resolve_dir(os.path.join(os.getcwd(), d)) - if not os.path.isdir(d): - print(d + " needs to be an existing directory") - return - return resolve_dir(d) - - -def copy_gantt_files(args): - destination = check_dir(".") if args.out is None else args.out - destination = check_and_create(os.path.join(destination, 'gantt')) - for file in sorted(os.listdir(args.dir)): - if file.endswith('.html'): - source = os.path.join(args.dir, file) - copy(source, destination) - print(f"Copied {source} to {destination}.") - return destination - -def main(args): - given = sum(arg is not None for arg in (args.github, args.token)) - stats, gantt = False, False - if given == 2: - stats = write_stats_file(args) - elif given == 1: - print(f"You need to specify both a repository and a token.") - if args.dir is not None: - destination = copy_gantt_files(args) - write_gantt_file(args, destination) - gantt=True - if sum((stats, gantt)) > 0: - index_file = make_index_file(gantt=gantt, stats=stats) - write_to_file(args, INDEX_FNAME, index_file) - write_to_file(args, JEKYLL_CFG_FNAME, JEKYLL_CFG_FILE) - write_to_file(args, STYLE_FNAME, STYLE_FILE) - else: - print("No page was generated.") - - -################################################################################ -# COMMANDLINE INTERFACE -################################################################################ -if __name__ == "__main__": - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description="""\ ---------------------------------------------------------- -| Script for updating GitHub pages for a DCML subcorpus | ---------------------------------------------------------- - -Description goes here - -""", - ) - parser.add_argument( - "-g", - "--github", - metavar="owner/repository", - help="If you want to generate corpusstats, you need to pass the repo in the form owner/repository_name and an access token.", - ) - parser.add_argument( - "-t", - "--token", - metavar="ACCESS_TOKEN", - help="Token that grants access to the repository in question.", - ) - parser.add_argument( - "-d", - "--dir", - metavar="DIR", - type=check_dir, - help="Pass a directory to scan it for gantt charts and write the file gantt.md", - ) - parser.add_argument( - "-o", - "--out", - metavar="OUT_DIR", - type=check_and_create, - help="""Output directory.""", - ) - parser.add_argument( - "-l", - "--level", - default="INFO", - help="Set logging to one of the levels {DEBUG, INFO, WARNING, ERROR, CRITICAL}.", - ) - args = parser.parse_args() - # logging_levels = { - # 'DEBUG': logging.DEBUG, - # 'INFO': logging.INFO, - # 'WARNING': logging.WARNING, - # 'ERROR': logging.ERROR, - # 'CRITICAL': logging.CRITICAL, - # 'D': logging.DEBUG, - # 'I': logging.INFO, - # 'W': logging.WARNING, - # 'E': logging.ERROR, - # 'C': logging.CRITICAL - # } - # logging.basicConfig(level=logging_levels[args.level.upper()]) - main(args) diff --git a/.github/workflows/version_release.yml b/.github/workflows/version_release.yml new file mode 100644 index 0000000..0fd8eab --- /dev/null +++ b/.github/workflows/version_release.yml @@ -0,0 +1,72 @@ +on: + pull_request: + types: + - closed + +jobs: + if_merged: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + + - name: Checkout corpus repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.MS3_BOT_TOKEN }} + ref: "${{ github.event.pull_request.base.ref }}" + submodules: recursive + + - name: "Get latest tag version" + id: tag + continue-on-error: true + run: | + res=$(git tag -l --sort=-v:refname | grep --invert-match '\^' | head -n 1) + echo "tag_version=${res}" >> $GITHUB_OUTPUT + env: + GITHUB_TOKEN: ${{ secrets.MS3_BOT_TOKEN }} + + - name: "Generate a new tag version" + id: generate_tag + run: | + major_in_PR="${{ contains(github.event.pull_request.labels.*.name, 'major_version')}}" + python .github/workflows/helper.py --tag "${{ steps.tag.outputs.tag_version }}" --update_major_ver "$major_in_PR" + + - name: Setup Github credentials & push zenodo, citation and README changes + continue-on-error: true + run: | + git config --global user.name "ms3-bot" + git config --global user.email dcml.annotators@epfl.ch + if [[ -f .zenodo.json ]]; then + git add .zenodo.json + fi + if [[ -f CITATION.cff ]]; then + git add CITATION.cff + fi + if [[ -f README.md ]]; then + git add README.md + fi + git commit -m 'chore: files updated with tag: ${{ steps.generate_tag.outputs.new_tag }}' + git push + + - name: "Create tag" + run: | + git tag -a "${{ steps.generate_tag.outputs.new_tag }}" -m "chore: files updated with tag: ${{ steps.generate_tag.outputs.new_tag }}" + git push origin "${{ steps.generate_tag.outputs.new_tag }}" + + - name: "Get ms3 package & apply transform" + continue-on-error: true + run: | + pip install --upgrade pip + pip install ms3 + ms3 transform -M -N -X -F -C -D + + - uses: ncipollo/release-action@v1 + with: + artifacts: "${{ github.event.repository.name }}.zip,\ + ${{ github.event.repository.name }}.datapackage.json,\ + ${{ github.event.repository.name }}.datapackage.errors" + body: "${{ github.event.pull_request.body }}" + name: "${{ github.event.pull_request.title }}" + tag: "${{ steps.generate_tag.outputs.new_tag }}" + makeLatest: "latest" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c2b5a75 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/johentsch/ms3 + rev: v2.4.0 + hooks: + - id: review + args: [-M, -N, -C, -X, -F, -D, -c LATEST_VERSION, --fail] \ No newline at end of file