Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added predict_ensemble to all model classes #1

Merged
merged 11 commits (source and target branch names were lost in page extraction)
Sep 22, 2023
Merged
41 changes: 41 additions & 0 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# CI workflow: build a Conda/Mamba environment, lint with flake8, run pytest.
# Runs on every push. Indentation reconstructed — the scraped page had
# flattened all YAML nesting, which made the document invalid.
name: Python Package using Conda

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5
    defaults:
      run:
        # Login shell so the conda environment activation from the profile applies.
        shell: bash -l {0}
    steps:
      - uses: actions/checkout@v2
      - uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
          mamba-version: "*"
          channel-priority: true
          environment-file: environment.yml
          auto-activate-base: false
          activate-environment: test
      # Diagnostic step: dump conda/environment state for debugging CI failures.
      - shell: bash -l {0}
        run: |
          conda info
          conda list
          conda config --show-sources
          conda config --show
          printenv | sort
      - name: Lint with flake8
        shell: bash -l {0}
        run: |
          mamba install flake8
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=100 --max-line-length=127 --statistics
      - name: Test with pytest
        shell: bash -l {0}
        run: |
          pytest
39 changes: 39 additions & 0 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
# NOTE: indentation reconstructed — the scraped page had flattened all YAML
# nesting, which made the document invalid.

name: Upload Python Package

on:
  release:
    types: [published]

permissions:
  contents: read

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build package
        run: python -m build
      - name: Publish package
        # Action pinned to a commit SHA (supply-chain hardening for a
        # third-party action that handles the PyPI credential).
        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
5 changes: 4 additions & 1 deletion applications/evaluate_ptype.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ def locate_best_model(filepath, metric="val_ave_acc", direction="max"):


def evaluate(conf, reevaluate=False):
output_features = conf["ptypes"]
input_features = []
for features in conf["input_features"]:
input_features += conf[features]
output_features = conf["output_features"]
n_splits = conf["ensemble"]["n_splits"]
save_loc = conf["save_loc"]
labels = ["rain", "snow", "sleet", "frz-rain"]
Expand Down
82 changes: 12 additions & 70 deletions applications/train_classifier_ptype.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@
from tensorflow.keras import backend as K
from argparse import ArgumentParser

from ptype.callbacks import MetricsCallback
from ptype.data import load_ptype_data_day, preprocess_data
from sklearn.model_selection import GroupShuffleSplit
try:
from ptype.callbacks import MetricsCallback
except ImportError:
import subprocess
subprocess.run(['pip', 'install', 'git+https://github.com/ai2es/ptype-physical.git'], check=True)
from ptype.callbacks import MetricsCallback
from ptype.data import load_ptype_uq, preprocess_data

from sklearn.model_selection import GroupShuffleSplit
from evml.keras.callbacks import get_callbacks, ReportEpoch
from evml.keras.models import CategoricalDNN
from evml.pbs import launch_pbs_jobs
Expand All @@ -31,69 +36,6 @@
logger = logging.getLogger(__name__)


def load_ptype_uq(conf, data_split=0, verbose=0, drop_mixed=False):
    """Load the precipitation-type dataset and split it into train/val/test.

    Reads a parquet file, optionally drops mixed-phase observations, applies a
    wet-bulb QC filter, holds out case-study days and late days as the test
    set, and group-splits the remaining days into train/validation folds.

    Parameters
    ----------
    conf : dict
        Configuration. Keys read here: ``data_path``, ``qc``, ``case_studies``,
        ``test_cutoff``, ``ensemble.n_splits``, ``seed``, ``train_size1``.
    data_split : int
        Index of the train/validation fold to return (0-based, must be
        < conf["ensemble"]["n_splits"]).
    verbose : int
        Unused; kept for interface compatibility with callers.
    drop_mixed : bool
        If True, keep only rows where exactly one precipitation type has a
        100% observation fraction.

    Returns
    -------
    dict
        ``{"train": DataFrame, "val": DataFrame, "test": DataFrame}``.
    """
    # Load the full dataset. NOTE(review): assumes a "datetime" column whose
    # string form is "YYYY-MM-DD HH:MM:SS" — confirm against the data source.
    df = pd.read_parquet(conf["data_path"])

    # Drop mixed cases: keep rows where one class fraction is exactly 1.0.
    if drop_mixed:
        logger.info("Dropping data points with mixed observations")
        c1 = df["ra_percent"] == 1.0
        c2 = df["sn_percent"] == 1.0
        c3 = df["pl_percent"] == 1.0
        c4 = df["fzra_percent"] == 1.0
        condition = c1 | c2 | c3 | c4
        df = df[condition].copy()

    # QC-Filter: rows passing the configured wet-bulb filter, restricted to USA.
    qc_value = str(conf["qc"])
    cond1 = df[f"wetbulb{qc_value}_filter"] == 0.0
    cond2 = df["usa"] == 1.0
    dg = df[cond1 & cond2].copy()

    # Day string ("YYYY-MM-DD") is the grouping variable for all splits, so
    # no day leaks across train/validation/test.
    dg["day"] = dg["datetime"].apply(lambda x: str(x).split(" ")[0])
    dg["id"] = range(dg.shape[0])

    # Select test cases: named case-study days plus everything on/after the cutoff.
    test_days_c1 = dg["day"].isin(
        [day for case in conf["case_studies"].values() for day in case]
    )
    test_days_c2 = dg["day"] >= conf["test_cutoff"]
    test_condition = test_days_c1 | test_days_c2

    # Partition the data into trainable-only and test-only splits
    train_data = dg[~test_condition].copy()
    test_data = dg[test_condition].copy()

    # Make N train-valid splits using day as grouping variable, return "data_split" split
    gsp = GroupShuffleSplit(
        n_splits=conf["ensemble"]["n_splits"],
        random_state=conf["seed"],
        train_size=conf["train_size1"],
    )
    splits = list(gsp.split(train_data, groups=train_data["day"]))

    train_index, valid_index = splits[data_split]
    train_data, valid_data = (
        train_data.iloc[train_index].copy(),
        train_data.iloc[valid_index].copy(),
    )

    # Report split fractions relative to the ORIGINAL (pre-QC) dataset size.
    size = df.shape[0]
    logger.info("Train, validation, and test fractions:")
    logger.info(
        f"{train_data.shape[0]/size}, {valid_data.shape[0]/size}, {test_data.shape[0]/size}"
    )
    print(
        f"{train_data.shape[0]/size}, {valid_data.shape[0]/size}, {test_data.shape[0]/size}"
    )

    data = {"train": train_data, "val": valid_data, "test": test_data}

    return data


class Objective(BaseObjective):
def __init__(self, config, metric="val_loss"):

Expand Down Expand Up @@ -144,10 +86,10 @@ def custom_updates(self, trial, conf):


def trainer(conf, evaluate=True, data_split=0, mc_forward_passes=0):
input_features = (
conf["TEMP_C"] + conf["T_DEWPOINT_C"] + conf["UGRD_m/s"] + conf["VGRD_m/s"]
)
output_features = conf["ptypes"]
input_features = []
for features in conf["input_features"]:
input_features += conf[features]
output_features = conf["output_features"]
metric = conf["metric"]
# flag for using the evidential model
if conf["model"]["loss"] == "dirichlet":
Expand Down
Loading
Loading