From 9621804fb55da08f9d08f7349724df62b8568031 Mon Sep 17 00:00:00 2001 From: Anton Zogkolli <110612763+AntonZogk@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:16:10 +0000 Subject: [PATCH] 623 ci with cml runtimes (#118) * Build pbj-workbench-python3.9-standard.Dockerfile and test * Check files * Build full path instead of cd * Test whole process with 3.8 * Update push branch for testing * Navigate to project folder * Add ls for debugging * docker run -t cml_3.9 * Run docker in detached mode * Run docker in -i mode * Split process to many steps * Add matrix with cml versions * Typo matrix * Typo matrix * Typo matrix * Use string type for versions * continue-on-error: true * Tidy up workflow Break steps Add 3.11 runtime Change name tags * Set wd in pytest * Add pre commit hooks * Use checkout v3 * Move files around with docker cp * Fix typo * Use checkouts * Copy files to container * Mount parent volume * Create container after checkout * Trigger on pull requests in main * 586 sic sut mapping (#113) * Creating mapping validation function * Docstrings * Update docstring and leave unmatched as set * Update to raise warning and created wrapper to test multiple mapping files in one go * adding test that passes when a warning is raised * update docstring to ask that mapping file be a folder and not a file * Correct design and calibration values (#108) *Add reusable function *Add test data *Add unit test *Add TODO to use this function in other parts of the pipeline * 632 module restructure (#116) * move files to correct location * change relevant module imports * Move files to appropriate folders, rename folders * Move all data to equivalent test folder structure * Update estimation test paths and imports * Update imputation test paths and imports * Update outlier detection test paths and imports * Update outputs test paths and imports * Update utilities test paths and imports * Remove tests/imputation/test_pivot_imputation_value.py * Run hooks * Add tests tree into tests 
readme * Run hooks * Remove duplicated test data * These were copied instead of moved, hence duplicated * these files aren't needed and covered by other tests * update tree --------- Co-authored-by: Wil Roberts * Use list, passing dict not supported in pandas 2.1.4 * Enforce constrain marker str type in tests * Create separate job for pre commit hooks * Use legacy job for hooks * Run hooks * Pre commit mig config, use python instead of python3 * Use python 3.9 for hooks * Use 3.10 for hooks * Use python 3.10.13 --------- Co-authored-by: Jordan-Day-ONS <57715292+Jday7879@users.noreply.github.com> Co-authored-by: Wil Roberts --- .github/workflows/main.yaml | 69 ++++++++++---------------- .pre-commit-config.yaml | 18 +++---- mbs_results/staging/data_cleaning.py | 12 +++-- mbs_results/utilities/constrains.py | 6 +-- pre-commits/check_added_large_files.py | 2 +- pre-commits/check_merge_conflict.py | 2 +- pre-commits/end_of_line_fixer.py | 2 +- pre-commits/mixed_line_endings.py | 2 +- pre-commits/remove_whitespace.py | 2 +- tests/utilities/test_constrains.py | 4 ++ 10 files changed, 53 insertions(+), 66 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index ebe70527..927ef362 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -1,53 +1,11 @@ -name: Build and run tests +name: cml_runtimes -# Controls when the action will run. 
on: - # Triggers the workflow on push events for the main branch - push: - branches: [ main ] - # Triggers the workflow on pull requests to main branch pull_request: branches: [ main ] - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - jobs: - build: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: Set up Python 3.6.8 - uses: actions/setup-python@v3 - with: - python-version: 3.6.8 - - - name: Check package build - run: | - python -m pip install --upgrade pip - - test: - runs-on: ubuntu-20.04 - steps: - # Checks-out your repository under $GITHUB_WORKSPACE - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v3 - with: - python-version: 3.6.8 - cache: 'pip' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install .[dev] - - - name: Run pytest - run: | - pytest -v - commit-hooks: runs-on: ubuntu-20.04 steps: @@ -55,7 +13,7 @@ jobs: - uses: actions/setup-python@v3 with: - python-version: 3.6.8 + python-version: 3.10.13 cache: 'pip' - name: Install Python dependencies @@ -66,3 +24,26 @@ jobs: - name: Check commit hooks run: | pre-commit run --all-files + + testing-cml: + runs-on: ubuntu-latest + strategy: + matrix: + cml_version: ["3.8", "3.9", "3.10","3.11"] + steps: + - name: checkout ml-runtimes #https://github.com/cloudera/ml-runtimes + uses: actions/checkout@master + with: + repository: cloudera/ml-runtimes + - name: build runtime cml_${{matrix.cml_version}} + run: docker build -t cml:${{matrix.cml_version}} -f 'pbj-workbench-python${{matrix.cml_version}}-standard.Dockerfile' . 
+ - name: checkout to repository + uses: actions/checkout@v3 + - name: create container + run: docker run -id --name container_${{matrix.cml_version}} -v"$(pwd)"://home/cdsw cml:${{matrix.cml_version}} + - name: build in dev mode + run: docker exec container_${{matrix.cml_version}} pip install ."[dev]" + - name: check env + run: docker exec container_${{matrix.cml_version}} pip list + - name: test + run: docker exec container_${{matrix.cml_version}} pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7fb5d668..d808b125 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: entry: pre-commits/check_added_large_files.py name: Check for files larger than 5 MB language: script - stages: [commit] + stages: [pre-commit] args: [ "--maxkb=5120" ] #works @@ -19,7 +19,7 @@ repos: entry: pre-commits/end_of_line_fixer.py name: Check for a blank line at the end of scripts (auto-fixes) language: script - stages: [commit] + stages: [pre-commit] #works - repo: local @@ -28,7 +28,7 @@ repos: entry: pre-commits/remove_whitespace.py name: Check for trailing whitespaces (auto-fixes) language: script - stages: [commit] + stages: [pre-commit] #works - repo: local @@ -37,7 +37,7 @@ repos: entry: pre-commits/mixed_line_endings.py name: Check for consistent end of line type LF to CRLF to CR (auto-fixes) language: script - stages: [commit] + stages: [pre-commit] #works #if using on different file types, it will need a seperate hook per file type @@ -48,7 +48,7 @@ repos: name: isort - Sort Python imports (auto-fixes) language: system types: [python] - stages: [commit] + stages: [pre-commit] args: [ "--profile", "black", "--filter-files" ] #works @@ -58,7 +58,7 @@ repos: entry: nbstripout name: nbstripout - Strip outputs from notebooks (auto-fixes) language: system - stages: [commit] + stages: [pre-commit] # args: # - --extra-keys # - "metadata.colab metadata.kernelspec cell.metadata.colab cell.metadata.executionInfo cell.metadata.id 
cell.metadata.outputId" @@ -71,7 +71,7 @@ repos: name: black - consistent Python code formatting (auto-fixes) language: system types: [python] - stages: [commit] + stages: [pre-commit] args: ["--verbose"] exclude: ^playground/ @@ -83,7 +83,7 @@ repos: name: flake8 - Python linting language: system types: [python] - stages: [commit] + stages: [pre-commit] # works in testing @@ -96,7 +96,7 @@ repos: #args: [scan, audit] language: system types: [python] - stages: [commit] + stages: [pre-commit] diff --git a/mbs_results/staging/data_cleaning.py b/mbs_results/staging/data_cleaning.py index f8563391..119f6fe9 100644 --- a/mbs_results/staging/data_cleaning.py +++ b/mbs_results/staging/data_cleaning.py @@ -66,8 +66,10 @@ def clean_and_merge( responses = pd.DataFrame(snapshot["responses"]) responses = filter_responses(responses, reference, period, "lastupdateddate") - responses = responses[responses_keep_cols].set_index([reference, period]) - contributors = contributors[contributors_keep_cols].set_index([reference, period]) + responses = responses[list(responses_keep_cols)].set_index([reference, period]) + contributors = contributors[list(contributors_keep_cols)].set_index( + [reference, period] + ) validate_indices(responses, contributors) return responses.merge(contributors, on=[reference, period]) @@ -439,8 +441,8 @@ def correct_values( # Update value only if columns exist if set(check_columns).issubset(df.columns): - df_temp.loc[ - df[condition_column].isin(condition_values), columns_to_correct - ] = replace_with + df_temp.loc[df[condition_column].isin(condition_values), columns_to_correct] = ( + replace_with + ) return df_temp diff --git a/mbs_results/utilities/constrains.py b/mbs_results/utilities/constrains.py index a8890a32..e63c9ba5 100644 --- a/mbs_results/utilities/constrains.py +++ b/mbs_results/utilities/constrains.py @@ -375,9 +375,9 @@ def calculate_derived_outlier_weights( ) updated_o_weight_bool = df_pre_winsorised[winsorised_target].isna() - 
df_pre_winsorised.loc[ - updated_o_weight_bool, winsorised_target - ] = post_win_derived.loc[updated_o_weight_bool, winsorised_target] + df_pre_winsorised.loc[updated_o_weight_bool, winsorised_target] = ( + post_win_derived.loc[updated_o_weight_bool, winsorised_target] + ) df_pre_winsorised["post_wins_marker"] = updated_o_weight_bool df_pre_winsorised.reset_index(inplace=True) diff --git a/pre-commits/check_added_large_files.py b/pre-commits/check_added_large_files.py index 59c0353a..973697f7 100755 --- a/pre-commits/check_added_large_files.py +++ b/pre-commits/check_added_large_files.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Pre commit hook to ensure large files aren't added to repo.""" import argparse import json diff --git a/pre-commits/check_merge_conflict.py b/pre-commits/check_merge_conflict.py index e6c67007..997996c5 100755 --- a/pre-commits/check_merge_conflict.py +++ b/pre-commits/check_merge_conflict.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Pre commit hook to check for merge conflict flags in file.""" import argparse import os.path diff --git a/pre-commits/end_of_line_fixer.py b/pre-commits/end_of_line_fixer.py index eb85f62e..6b71763a 100755 --- a/pre-commits/end_of_line_fixer.py +++ b/pre-commits/end_of_line_fixer.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Pre commit hook to ensure single blank line at end of python file.""" import argparse import os diff --git a/pre-commits/mixed_line_endings.py b/pre-commits/mixed_line_endings.py index 8ae44909..54edea53 100755 --- a/pre-commits/mixed_line_endings.py +++ b/pre-commits/mixed_line_endings.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Pre commit hook to ensure all EOL characters are the same.""" import argparse import collections diff --git a/pre-commits/remove_whitespace.py b/pre-commits/remove_whitespace.py index 61e5803f..69b0135c 100755 --- a/pre-commits/remove_whitespace.py +++ 
b/pre-commits/remove_whitespace.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """Pre commit hook to remove any trailing whitespace.""" import argparse import os diff --git a/tests/utilities/test_constrains.py b/tests/utilities/test_constrains.py index c67ea981..3d0460b9 100644 --- a/tests/utilities/test_constrains.py +++ b/tests/utilities/test_constrains.py @@ -32,6 +32,10 @@ def test_replace_values_index_base(filepath): replace_values_index_based(df_in, "target", 49, ">", 40) replace_values_index_based(df_in, "target", 90, ">=", 40) + # Enforce dtypes, otherwise null==null fails + df_in["constrain_marker"] = df_in["constrain_marker"].astype(str) + df_expected["constrain_marker"] = df_expected["constrain_marker"].astype(str) + assert_frame_equal(df_in, df_expected)