.github/workflows/wipac-cicd.yaml

name: ci/cd

# Trigger on pushes to any branch; tag pushes are excluded by the ignore filter.
on:
  push:
    branches: ['**']
    tags-ignore: ['**']

jobs:

  ###########################################################################
  # PACKAGING
  ###########################################################################

  # Runs the WIPAC python-package setup action against a fresh checkout.
  py-setup:
    runs-on: ubuntu-latest
    steps:
      # Checks out with a personal access token instead of the default token
      # (presumably so commits pushed by the setup action can trigger
      # workflows -- TODO confirm).
      - uses: actions/checkout@v4
        with:
          token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
      - uses: WIPACrepo/wipac-dev-py-setup-action@v4.2
        with:
          # Quoted so the versions stay strings: an unquoted 3.10 would be
          # parsed as the float 3.1.
          python_min: '3.11'
          python_max: '3.11'
          pypi_name: icecube-simprod-histogram
          author: IceCube
          author_email: developers@icecube.wisc.edu
          keywords: |
            "histogram sampling" simulation statistics


  ###########################################################################
  # LINTERS
  ###########################################################################

  # Publishes the output of the py-versions action as the `matrix` job output,
  # which the lint and unit-test jobs expand with fromJSON().
  py-versions:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: WIPACrepo/wipac-dev-py-versions-action@v2.5
        id: versions
    outputs:
      matrix: ${{ steps.versions.outputs.matrix }}

  # Runs the WIPAC flake8 action once per entry of the py-versions matrix.
  flake8:
    runs-on: ubuntu-latest
    needs:
      - py-versions
    strategy:
      # let every matrix entry finish even if one of them fails
      fail-fast: false
      matrix:
        py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.py3 }}
      - uses: WIPACrepo/wipac-dev-flake8-action@v1.1

  # Runs the WIPAC mypy action once per entry of the py-versions matrix.
  mypy:
    runs-on: ubuntu-latest
    needs:
      - py-versions
    strategy:
      # let every matrix entry finish even if one of them fails
      fail-fast: false
      matrix:
        py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.py3 }}
      - uses: WIPACrepo/wipac-dev-mypy-action@v2.0

  ###########################################################################
  # FORMATTER
  ###########################################################################

  # Applies ruff auto-fixes to the checked-out branch and pushes the result
  # back as a bot commit.
  code-format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # no python-version is given, so the action picks its default interpreter
      - uses: actions/setup-python@v4
      - name: Run Ruff for code formatting
        run: |
          set -euo pipefail
          pip install ruff
          # only rule C408 is selected; its fixes (including unsafe ones) are applied in place
          ruff check --select C408 --fix . --unsafe-fixes
      - name: Commit formatted code
        run: |
          set -euo pipefail
          git config user.name github-actions
          git config user.email github-actions@github.com
          git add .
          # best-effort: '|| true' keeps the job green when commit or push
          # fails (e.g. when there is nothing to commit)
          git commit -m "<bot> auto code format file(s)" || true
          git push || true

  ###########################################################################
  # TESTS
  ###########################################################################

  # Installs the package with its `tests` extra and runs tests/unit/ with
  # pytest, once per entry of the py-versions matrix.
  unit-tests:
    runs-on: ubuntu-latest
    needs:
      - py-versions
    strategy:
      # let every matrix entry finish even if one of them fails
      fail-fast: false
      matrix:
        py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.py3 }}
      - name: install
        run: |
          set -euo pipefail
          pip install .[tests]
      - name: Run unit tests
        run: |
          set -euo pipefail
          pytest -vvv tests/unit/

  # Integration test for scripts/sample-each-dataset.sh: builds a fake
  # simulation tree under /tmp/data/sim, runs the script on one subtree
  # (matrix.base_path) with a dataset cap (matrix.max_num_datasets), then
  # checks how many '*.histo.hdf5' files exist under that subtree.
  test-sample-each-dataset-sh:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        max_num_datasets:
          - 1
          - 25
          - 100  # more than the 48 dataset dirs created below (2*2*2*2*3), so no cap applies
        # subtrees of increasing size, from a single dataset up to the whole tree
        base_path:
          - /tmp/data/sim/Upgrade/2022/generated/neutrino-generator/88888
          - /tmp/data/sim/IceCube/2023/filtered/CORSIKA
          - /tmp/data/sim/Upgrade/2022/filtered
          - /tmp/data/sim/IceCube/2023
          - /tmp/data/sim/Upgrade
          - /tmp/data/sim
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4

      # Every dataset dir (e.g. .../CORSIKA/77777) gets three job-range dirs
      # (00-11, 22-33, 44-55), each with a histos/ dir of copied sample pickles.
      - name: Create source dataset dirs/files
        run: |
          set -euo pipefail
          job_range_dpaths=(
            /tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/{00-11,22-33,44-55}
          )

          for dpath in "${job_range_dpaths[@]}"; do
            mkdir -p "$dpath"/histos/
            # create 1-5 pkl files, each a copy of a randomly chosen sample from the repo
            for i in $( seq 1 "$(( (RANDOM % 5) + 1 ))" ); do
              random_file=$(find "tests/data/simprod-histograms" -type f -name "*.pkl" | shuf -n 1)
              cp "$random_file" "$dpath/histos/histo_$i.pkl"
            done
          done

      - name: Look at filetree (before)
        run: |
          set -euo pipefail
          tree /tmp/data/sim/

      # arguments: base path, 0.5, max number of datasets
      # (presumably 0.5 is the sample fraction -- confirm against the script)
      - name: Run script
        run: |
          set -euo pipefail
          set -x
          ./scripts/sample-each-dataset.sh ${{ matrix.base_path }} 0.5 ${{ matrix.max_num_datasets }}

      - name: Validate script execution
        run: |
          set -euo pipefail
          echo "Max num of datasets: ${{ matrix.max_num_datasets }}"
          
          # Count the dataset directories under the base path, i.e. the distinct
          # parents of job-range directories (named like "00-11")
          available_datasets=$(find ${{ matrix.base_path }} -type d -regex ".*/[0-9]+-[0-9]+$" -exec dirname {} \; | sort -u | wc -l)
          echo "Available datasets: $available_datasets"

          # Use the lesser of available_datasets and max_num_datasets for validation
          expected_num_datasets=$(( available_datasets < ${{ matrix.max_num_datasets }} ? available_datasets : ${{ matrix.max_num_datasets }} ))
          echo "Expected datasets: $expected_num_datasets"

          # Check processed count: the number of "*.histo.hdf5" files found
          # under the base path must equal the expected number of datasets
          processed_count=$(find ${{ matrix.base_path }} -name '*.histo.hdf5' | wc -l)
          echo "Processed count: $processed_count"

          if [[ $processed_count -ne $expected_num_datasets ]]; then
            echo "Script did not process the expected number of datasets!"
            exit 1
          fi

          echo "All tests passed."

      - name: Look at filetree (after)
        run: |
          set -euo pipefail
          tree /tmp/data/sim/

  # Integration test for cp-dataset-histos.sh: creates one empty
  # '<dataset>.histo.hdf5' per dataset dir under /tmp/data/sim, optionally
  # pre-creates some of them in the destination, runs the script, then checks
  # the number of copied files and whether pre-existing files were
  # overwritten or kept.
  test-cp-dataset-histos-sh:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        # none:      nothing is pre-created in the destination
        # overwrite: destination is pre-populated, script runs with --force
        # keep:      destination is pre-populated, script runs without --force
        prev_histos_setting:
          - none
          - overwrite
          - keep
        base_path:
          - /tmp/data/sim/IceCube/2023/generated/neutrino-generator
          - /tmp/data/sim/Upgrade/2022/filtered/CORSIKA
        dest_dir:
          - /tmp/mycopy
    env:
      # default used by the 'none' case; the 'overwrite'/'keep' cases replace
      # it through $GITHUB_ENV in the first step
      OLD_FILE_MODTIME: "0"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4

      - name: Create source dataset dirs/files
        run: |
          set -euo pipefail
          dataset_dpaths=(
            /tmp/data/sim/{IceCube,Upgrade}/{2022,2023}/{generated,filtered}/{CORSIKA,neutrino-generator}/{77777,88888,99999}/
          )

          for dpath in "${dataset_dpaths[@]}"; do
            histo="$dpath"/"$(basename "$dpath").histo.hdf5"
            # -p: the parent directories do not exist yet
            mkdir -p "$(dirname "$histo")"
            touch "$histo"
            # pre-create some of these files in the destination
            # NOTE(review): this also pre-creates files for datasets outside of
            # matrix.base_path, while the validation step counts every
            # *.histo.hdf5 under dest_dir -- confirm against what the script copies
            if [[ "${{ matrix.prev_histos_setting }}" == "overwrite" || "${{ matrix.prev_histos_setting }}" == "keep" ]]; then
              if (( RANDOM % 100 < 25 )); then
                relative_path="${dpath#*/sim/}"
                dest_dataset_dir="${{ matrix.dest_dir }}/sim/$relative_path"
                # the destination tree has to be created before touching a file in it
                mkdir -p "$dest_dataset_dir"
                touch "$dest_dataset_dir"/"$(basename "$dest_dataset_dir").histo.hdf5"
              fi
            fi
          done

          # set the oldest file's mod time
          if [[ "${{ matrix.prev_histos_setting }}" == "overwrite" || "${{ matrix.prev_histos_setting }}" == "keep" ]]; then
            oldest_modtime=$(find "${{ matrix.dest_dir }}" -name "*.histo.hdf5" -type f -exec stat --format='%Y' {} + | sort -n | head -1)
            echo "OLD_FILE_MODTIME=$oldest_modtime" >> "$GITHUB_ENV"
          fi

      - name: Look at src filetree (before)
        run: |
          set -euo pipefail
          tree /tmp/data/sim/

      - name: Look at dest filetree (before)
        run: |
          set -euo pipefail
          # the destination only exists if files were pre-created above
          if [[ -d "${{ matrix.dest_dir }}" ]]; then
            tree ${{ matrix.dest_dir }}
          else
            echo "${{ matrix.dest_dir }} does not exist yet"
          fi

      # NOTE(review): the other script is invoked as ./scripts/<name>.sh --
      # confirm that this one really lives in the repository root
      - name: Run script
        run: |
          set -euo pipefail
          # must be initialized: with 'set -u' an unset variable aborts the step
          force_flag=""
          if [[ "${{ matrix.prev_histos_setting }}" == "overwrite" ]]; then
            force_flag="--force"
          fi
          set -x
          # $force_flag is intentionally unquoted so an empty value adds no argument
          ./cp-dataset-histos.sh ${{ matrix.base_path }} ${{ matrix.dest_dir }} $force_flag

      - name: Validate copied histograms
        run: |
          set -euo pipefail

          src_count=$(find ${{ matrix.base_path }} -name "*.histo.hdf5" | wc -l)
          dest_count=$(find ${{ matrix.dest_dir }} -name "*.histo.hdf5" | wc -l)
          echo "Source histograms: $src_count"
          echo "Copied histograms: $dest_count"
          if [[ $src_count -ne $dest_count ]]; then
            echo "Copied histograms count ($dest_count) does not match source histograms count ($src_count)!"
            exit 1
          fi

          # check the overwriting settings
          # (modtimes are epoch seconds, so "younger" means a larger value)
          oldest_modtime=$(find "${{ matrix.dest_dir }}" -name "*.histo.hdf5" -type f -exec stat --format='%Y' {} + | sort -n | head -1)
          echo "Oldest histo file modtime: $oldest_modtime"
          case "${{ matrix.prev_histos_setting }}" in
            none)
              # oldest modtime should be younger than previously-stored value
              # (which is still the job-level default of 0 in this case)
              if [[ ! $oldest_modtime -gt $OLD_FILE_MODTIME ]]; then
                echo "ERROR: there is an older file in here!" >&2
                exit 1
              fi
              ;;
            overwrite)
              # oldest modtime should be younger than previously-stored value
              # NOTE(review): modtimes have 1-second resolution, so a file
              # overwritten within the same second as its creation would be
              # reported as not overwritten
              if [[ ! $oldest_modtime -gt $OLD_FILE_MODTIME ]]; then
                echo "ERROR: there is an older file in here! aka script didn't overwrite" >&2
                exit 1
              fi
              ;;
            keep)
              # oldest modtime should be the previously-stored value
              if [[ $oldest_modtime -ne $OLD_FILE_MODTIME ]]; then
                echo "ERROR: there is no older file in here! aka the scrip did overwrite" >&2
                exit 1
              fi
              ;;
            *)
              echo "Error: Unknown value for prev_histos_setting: ${{ matrix.prev_histos_setting }}" >&2
              exit 1
              ;;
          esac

          echo "All tests passed for base_path=${{ matrix.base_path }} and dest_dir=${{ matrix.dest_dir }}."

      - name: Look at dest filetree (after)
        run: |
          set -euo pipefail
          tree ${{ matrix.dest_dir }}


  ###########################################################################
  # RELEASE
  ###########################################################################

  # Runs only for refs/heads/main, after every other job has succeeded: bumps
  # the version with python-semantic-release and, when that step reports a
  # release, publishes to PyPI and uploads to the GitHub release.
  release:
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    concurrency: release  # prevent any possible race conditions
    needs:
      - py-setup
      - flake8
      - mypy
      - code-format
      - unit-tests
      - test-sample-each-dataset-sh
      - test-cp-dataset-histos-sh
    steps:
      # fetch-depth 0 fetches the full history rather than a shallow clone
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Python-Package Version Bump
      - id: psr-psr
        uses: python-semantic-release/python-semantic-release@v9.8.1
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
      # PyPI Release
      - if: steps.psr-psr.outputs.released == 'true'
        uses: pypa/gh-action-pypi-publish@v1.8.14
        with:
          password: ${{ secrets.WIPAC_PYPI_TOKEN }}
      # GitHub Release
      - if: steps.psr-psr.outputs.released == 'true'
        uses: python-semantic-release/upload-to-gh-release@v9.8.1
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}