Merge pull request #1 from ai2es/schreck
Added predict_ensemble to all model classes
djgagne authored Sep 22, 2023
2 parents 7c29b29 + 2cd411e commit 3027754
Showing 41 changed files with 3,379 additions and 3,108 deletions.
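
The files shown here do not include the predict_ensemble implementation named in the commit title, so the following is a hedged sketch of the kind of interface that title describes: each model class (e.g. CategoricalDNN) gains a method that runs every ensemble member and aggregates the outputs. The function name comes from the commit message; the signature, the numpy-based aggregation, and the assumption that members expose .predict are illustrative, not code from this commit.

import numpy as np


def predict_ensemble(members, x):
    # Run one forward pass per ensemble member (any fitted objects exposing
    # .predict) and stack the outputs: shape (n_members, n_samples, n_classes).
    preds = np.stack([np.asarray(m.predict(x)) for m in members])
    # The member mean serves as the ensemble prediction; the member spread
    # gives a simple per-sample uncertainty estimate.
    return preds.mean(axis=0), preds.std(axis=0)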
41 changes: 41 additions & 0 deletions .github/workflows/python-package-conda.yml
@@ -0,0 +1,41 @@
name: Python Package using Conda

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5
    defaults:
      run:
        shell: bash -l {0}
    steps:
    - uses: actions/checkout@v2
    - uses: conda-incubator/setup-miniconda@v2
      with:
        miniconda-version: "latest"
        mamba-version: "*"
        channel-priority: true
        environment-file: environment.yml
        auto-activate-base: false
        activate-environment: test
    - shell: bash -l {0}
      run: |
        conda info
        conda list
        conda config --show-sources
        conda config --show
        printenv | sort
    - name: Lint with flake8
      shell: bash -l {0}
      run: |
        mamba install flake8
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=100 --max-line-length=127 --statistics
    - name: Test with pytest
      shell: bash -l {0}
      run: |
        pytest
39 changes: 39 additions & 0 deletions .github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python
      uses: actions/setup-python@v3
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install build
    - name: Build package
      run: python -m build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
      with:
        user: __token__
        password: ${{ secrets.PYPI_API_TOKEN }}
5 changes: 4 additions & 1 deletion applications/evaluate_ptype.py
@@ -48,7 +48,10 @@ def locate_best_model(filepath, metric="val_ave_acc", direction="max"):
 
 
 def evaluate(conf, reevaluate=False):
-    output_features = conf["ptypes"]
+    input_features = []
+    for features in conf["input_features"]:
+        input_features += conf[features]
+    output_features = conf["output_features"]
     n_splits = conf["ensemble"]["n_splits"]
     save_loc = conf["save_loc"]
     labels = ["rain", "snow", "sleet", "frz-rain"]
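
Both this hunk and the train_classifier_ptype.py change below replace hard-coded feature lists with config-driven ones. A minimal sketch of the config layout the new loop assumes follows; the group names match keys the old code read directly (TEMP_C, T_DEWPOINT_C, UGRD_m/s, VGRD_m/s), but the individual column names are illustrative assumptions, not taken from the repository's config files.

# Hypothetical config fragment: each entry in conf["input_features"] names
# another key in conf whose value is a list of column names.
conf = {
    "input_features": ["TEMP_C", "T_DEWPOINT_C", "UGRD_m/s", "VGRD_m/s"],
    "TEMP_C": ["TEMP_C_0_m", "TEMP_C_250_m", "TEMP_C_500_m"],  # assumed columns
    "T_DEWPOINT_C": ["T_DEWPOINT_C_0_m", "T_DEWPOINT_C_250_m", "T_DEWPOINT_C_500_m"],
    "UGRD_m/s": ["UGRD_m/s_0_m", "UGRD_m/s_250_m", "UGRD_m/s_500_m"],
    "VGRD_m/s": ["VGRD_m/s_0_m", "VGRD_m/s_250_m", "VGRD_m/s_500_m"],
    "output_features": ["rain", "snow", "sleet", "frz-rain"],
}

input_features = []
for features in conf["input_features"]:
    input_features += conf[features]  # flatten the grouped columns into one list
output_features = conf["output_features"]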
82 changes: 12 additions & 70 deletions applications/train_classifier_ptype.py
@@ -15,10 +15,15 @@
 from tensorflow.keras import backend as K
 from argparse import ArgumentParser
 
-from ptype.callbacks import MetricsCallback
-from ptype.data import load_ptype_data_day, preprocess_data
-from sklearn.model_selection import GroupShuffleSplit
+try:
+    from ptype.callbacks import MetricsCallback
+except ImportError:
+    import subprocess
+    subprocess.run(['pip', 'install', 'git+https://github.com/ai2es/ptype-physical.git'], check=True)
+    from ptype.callbacks import MetricsCallback
+from ptype.data import load_ptype_uq, preprocess_data
 
+from sklearn.model_selection import GroupShuffleSplit
 from evml.keras.callbacks import get_callbacks, ReportEpoch
 from evml.keras.models import CategoricalDNN
 from evml.pbs import launch_pbs_jobs
@@ -31,69 +36,6 @@
 logger = logging.getLogger(__name__)
 
 
-def load_ptype_uq(conf, data_split=0, verbose=0, drop_mixed=False):
-
-    # Load
-    df = pd.read_parquet(conf["data_path"])
-
-    # Drop mixed cases
-    if drop_mixed:
-        logger.info("Dropping data points with mixed observations")
-        c1 = df["ra_percent"] == 1.0
-        c2 = df["sn_percent"] == 1.0
-        c3 = df["pl_percent"] == 1.0
-        c4 = df["fzra_percent"] == 1.0
-        condition = c1 | c2 | c3 | c4
-        df = df[condition].copy()
-
-    # QC-Filter
-    qc_value = str(conf["qc"])
-    cond1 = df[f"wetbulb{qc_value}_filter"] == 0.0
-    cond2 = df["usa"] == 1.0
-    dg = df[cond1 & cond2].copy()
-
-    dg["day"] = dg["datetime"].apply(lambda x: str(x).split(" ")[0])
-    dg["id"] = range(dg.shape[0])
-
-    # Select test cases
-    test_days_c1 = dg["day"].isin(
-        [day for case in conf["case_studies"].values() for day in case]
-    )
-    test_days_c2 = dg["day"] >= conf["test_cutoff"]
-    test_condition = test_days_c1 | test_days_c2
-
-    # Partition the data into trainable-only and test-only splits
-    train_data = dg[~test_condition].copy()
-    test_data = dg[test_condition].copy()
-
-    # Make N train-valid splits using day as grouping variable, return "data_split" split
-    gsp = GroupShuffleSplit(
-        n_splits=conf["ensemble"]["n_splits"],
-        random_state=conf["seed"],
-        train_size=conf["train_size1"],
-    )
-    splits = list(gsp.split(train_data, groups=train_data["day"]))
-
-    train_index, valid_index = splits[data_split]
-    train_data, valid_data = (
-        train_data.iloc[train_index].copy(),
-        train_data.iloc[valid_index].copy(),
-    )
-
-    size = df.shape[0]
-    logger.info("Train, validation, and test fractions:")
-    logger.info(
-        f"{train_data.shape[0]/size}, {valid_data.shape[0]/size}, {test_data.shape[0]/size}"
-    )
-    print(
-        f"{train_data.shape[0]/size}, {valid_data.shape[0]/size}, {test_data.shape[0]/size}"
-    )
-
-    data = {"train": train_data, "val": valid_data, "test": test_data}
-
-    return data
-
-
 class Objective(BaseObjective):
     def __init__(self, config, metric="val_loss"):
 
@@ -144,10 +86,10 @@ def custom_updates(self, trial, conf):
 
 
 def trainer(conf, evaluate=True, data_split=0, mc_forward_passes=0):
-    input_features = (
-        conf["TEMP_C"] + conf["T_DEWPOINT_C"] + conf["UGRD_m/s"] + conf["VGRD_m/s"]
-    )
-    output_features = conf["ptypes"]
+    input_features = []
+    for features in conf["input_features"]:
+        input_features += conf[features]
+    output_features = conf["output_features"]
     metric = conf["metric"]
     # flag for using the evidential model
    if conf["model"]["loss"] == "dirichlet":
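
A hedged sketch of how the pieces above could be driven to build the ensemble, assuming trainer keeps the signature shown in this hunk and that conf["ensemble"]["n_splits"] counts the train/validation splits; this driver loop is not part of the commit itself.

# Train one model per GroupShuffleSplit fold; the resulting members are what
# a predict_ensemble call would later aggregate.
results = [
    trainer(conf, evaluate=True, data_split=split)
    for split in range(conf["ensemble"]["n_splits"])
]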
