From b437c5a5e6089ff42c2fb556b20df0b0a32dcece Mon Sep 17 00:00:00 2001
From: "Jonathan C. McKinney"
Date: Wed, 6 Nov 2024 01:52:25 -0800
Subject: [PATCH] Add DAI tool

---
 openai_server/agent_prompting.py                   |  35 +-
 .../agent_tools/driverless_ai_data_science.py      | 660 ++++++++++++++++++
 .../requirements_optional_agents.txt               |   4 +
 src/version.py                                     |   2 +-
 4 files changed, 699 insertions(+), 2 deletions(-)
 create mode 100644 openai_server/agent_tools/driverless_ai_data_science.py

diff --git a/openai_server/agent_prompting.py b/openai_server/agent_prompting.py
index 78975386c..ff832b1d0 100644
--- a/openai_server/agent_prompting.py
+++ b/openai_server/agent_prompting.py
@@ -721,6 +721,34 @@ def get_wolfram_alpha_helper():
     return wolframalpha
 
 
+def get_dai_helper():
+    cwd = os.path.abspath(os.getcwd())
+    if os.getenv('ENABLE_DAI'):
+        # DriverlessAI (DAI) AutoML integration for the agent.
+        # Documents the pre-built agent tool script driverless_ai_data_science.py below.
+        dai = f"""\n* DriverlessAI is an advanced AutoML tool for building data science models and making predictions.
+* If the user specifically asks for a DAI model, then you should use the existing pre-built python code to query DriverlessAI, e.g.:
+```sh
+# filename: my_dai_query.sh
+# execution: true
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py
+```
+* usage: python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py [--experiment_key EXPERIMENT_KEY] [--dataset_key DATASET_KEY] [--data-url DATA_URL] [--dataset-name DATASET_NAME] [--data-source DATA_SOURCE] [--target-column TARGET_COLUMN] [--task {{classification,regression,predict,shapley,shapley_original_features,shapley_transformed_features,transform,fit_transform,fit_and_transform,artifacts}}] [--scorer SCORER] [--experiment-name EXPERIMENT_NAME] [--accuracy {{1,2,3,4,5,6,7,8,9,10}}] [--time {{1,2,3,4,5,6,7,8,9,10}}] [--interpretability {{1,2,3,4,5,6,7,8,9,10}}] [--train-size TRAIN_SIZE] [--seed SEED] [--fast] [--force]
+* A typical call to create an experiment might be:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --dataset-name "my_dataset" --data-url "https://mydata.com/mydata.csv" --target-column "target" --task "classification" --scorer "auc" --experiment-name "my_experiment"
+* A typical re-use of the experiment_key and dataset_key for prediction (or shapley, transform, fit_and_transform) would be:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --experiment_key EXPERIMENT_KEY --dataset_key DATASET_KEY --task "predict"
+* For predict, shapley, transform, and fit_and_transform, one can also pass --data-url to use a fresh dataset with the given experiment, e.g.:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --experiment_key EXPERIMENT_KEY --data-url "https://mydata.com/mydata.csv" --task "predict"
+"""
+        if os.getenv('DAI_TOKEN') is None:
+            dai += """* Additionally, you must pass --token on the command line to use the DAI tool.\n"""
+        dai += """* You may also pass these additional options if the user provides them: --engine DAI_ENGINE --client_id DAI_CLIENT_ID --token_endpoint_url DAI_TOKEN_ENDPOINT_URL --environment DAI_ENVIRONMENT --token DAI_TOKEN\n"""
+    else:
+        dai = ""
+    return dai
+
+
 def get_news_api_helper():
     cwd = os.path.abspath(os.getcwd())
     have_internet = get_have_internet()
@@ -824,7 +852,6 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
     mermaid_renderer_helper = get_mermaid_renderer_helper()
     image_generation_helper = get_image_generation_helper()
     audio_transcription_helper = get_audio_transcription_helper()
-    query_to_web_image_helper = get_query_to_web_image_helper()
     aider_coder_helper = get_aider_coder_helper(base_url, api_key, model, autogen_timeout)
     rag_helper = get_rag_helper(base_url, api_key, model, autogen_timeout, text_context_list, image_file)
     convert_helper = get_convert_to_text_helper()
@@ -836,6 +863,10 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
     wolfram_alpha_helper = get_wolfram_alpha_helper()
     news_helper = get_news_api_helper()
     bing_search_helper = get_bing_search_helper()
+    query_to_web_image_helper = get_query_to_web_image_helper()
+
+    # data science
+    dai_helper = get_dai_helper()
 
     # general API notes:
     api_helper = get_api_helper()
@@ -875,6 +906,8 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
         news_helper,
         bing_search_helper,
         query_to_web_image_helper,
+        # data science
+        dai_helper,
         # overall
         api_helper,
         agent_tools_note,
diff --git a/openai_server/agent_tools/driverless_ai_data_science.py b/openai_server/agent_tools/driverless_ai_data_science.py
new file mode 100644
index 000000000..8e6904b5b
--- /dev/null
+++ b/openai_server/agent_tools/driverless_ai_data_science.py
@@ -0,0 +1,660 @@
+import argparse
+import os
+import shutil
+from zipfile import ZipFile
+
+import pandas as pd
+from matplotlib import pyplot as plt
+
+
+def connect_to_h2o_engine(token: str, client_id, token_endpoint_url, environment):
+    """Establishes a secure connection to the H2O Engine Manager using the provided token."""
+    # https://internal.dedicated.h2o.ai/cli-and-api-access
+    import h2o_authn
+    token_provider = h2o_authn.TokenProvider(
+        refresh_token=token,
+        client_id=client_id,
+        token_endpoint_url=token_endpoint_url,
+    )
+
+    import h2o_engine_manager
+    engine_manager = h2o_engine_manager.login(
+        environment=environment,
+        token_provider=token_provider
+    )
+
+    # https://docs.h2o.ai/mlops/py-client/install
+    # os.system('pip install h2o-mlops')
+    # import h2o_mlops
+    # mlops = h2o_mlops.Client(
+    #     gateway_url="https://mlops-api.internal.dedicated.h2o.ai",
+    #     token_provider=token_provider
+    # )
+
+    print("Successfully connected to H2O engine manager.")
+    return engine_manager
+
+
+def connect_to_driverless_ai(engine_manager, dai_engine: str = None):
+    """Resumes an existing Driverless AI engine (creating it if needed) and connects to it."""
+    dai_engine_obj = None
+    for dai_inst in engine_manager.dai_engine_client.list_all_engines():
+        if dai_inst.display_name == dai_engine:
+            dai_engine_obj = engine_manager.dai_engine_client.get_engine(dai_engine)
+            if dai_engine_obj.state.value != "STATE_RUNNING":
+                print(f"Waking up instance {dai_engine}")
+                dai_engine_obj.resume()
+                dai_engine_obj.wait()
+
+    if dai_engine_obj is None:
+        # if DAI Engine does not exist
+        print(f"Creating instance {dai_engine}")
+        dai_engine_obj = engine_manager.dai_engine_client.create_engine(display_name=dai_engine)
+        dai_engine_obj.wait()
+
+    dai = dai_engine_obj.connect()
+    print(f"Successfully connected to Driverless AI engine: {dai_engine}")
+    return dai
+
+
+def create_dataset(dai, data_url: str, dataset_name: str, data_source: str = "s3", force: bool = True):
+    """Creates a dataset in the Driverless AI instance."""
+    dataset = dai.datasets.create(
+        data=data_url,
+        data_source=data_source,
+        name=dataset_name,
+        force=force
+    )
+    print(f"Dataset {dataset_name} with reusable dataset_key: {dataset.key} created successfully.")
+    return dataset
+
+
+def split_dataset(dataset, train_size: float, train_name: str, test_name: str,
+                  target_column: str, seed: int = 42):
"""Splits a dataset into train and test sets.""" + dataset_split = dataset.split_to_train_test( + train_size=train_size, + train_name=train_name, + test_name=test_name, + target_column=target_column, + seed=seed + ) + + print("Dataset successfully split into training and testing sets.") + for k, v in dataset_split.items(): + print(f"Name: {v.name} with reusable dataset_key: {v.key}") + + return dataset_split + + +def create_experiment(dai, dataset_split, target_column: str, scorer: str = 'F1', + task: str = 'classification', experiment_name: str = 'Experiment', + accuracy: int = 1, time: int = 1, interpretability: int = 6, + fast=True, + force: bool = True): + """Creates an experiment in Driverless AI.""" + experiment_settings = { + **dataset_split, + 'task': task, + 'target_column': target_column, + 'scorer': scorer + } + + dai_settings = { + 'accuracy': accuracy, + 'time': time, + 'interpretability': interpretability, + } + if fast: + print("Using fast settings, but still making autoreport") + dai_settings.update({ + 'make_python_scoring_pipeline': 'off', + 'make_mojo_scoring_pipeline': 'off', + 'benchmark_mojo_latency': 'off', + 'make_autoreport': True, + 'check_leakage': 'off', + 'check_distribution_shift': 'off' + }) + + experiment = dai.experiments.create( + **experiment_settings, + name=experiment_name, + **dai_settings, + force=force + ) + + print(f"Experiment {experiment_name} with reusable experiment_key: {experiment.key} created with settings: " + f"Accuracy={accuracy}, Time={time}, Interpretability={interpretability}") + return experiment + + +def get_experiment_from_key(experiment_key, token, client_id, token_endpoint_url, dai_engine, environment): + # FIXME: not used yet, would be used to act more on experiment, like restart etc. + # Connect to the engine manager and Driverless AI + engine_manager = connect_to_h2o_engine(token, client_id, token_endpoint_url, environment) + dai = connect_to_driverless_ai(engine_manager, dai_engine) + + # Get the experiment + experiment = dai.experiments.get(experiment_key) + return experiment + + +def visualize_importance(experiment): + """Visualizes and saves variable importance plot.""" + var_imp = experiment.variable_importance() + print("\nVariable Importance Output:") + print(var_imp) + + # Save variable importance to csv + df = pd.DataFrame(var_imp.data, columns=var_imp.headers) + csv_file = "variable_importance.csv" + df.to_csv(csv_file, index=False) + df_top10 = df.sort_values('gain', ascending=False).head(10) + + plt.figure(figsize=(12, 8)) + plt.barh(df_top10['description'], df_top10['gain']) + plt.title('Top 10 Important Variables') + plt.xlabel('Importance (Gain)') + plt.tight_layout() + + output_path = 'variable_importance.png' + plt.savefig(output_path) + print(f"\nVariable importance plot saved as {output_path} and csv file as {csv_file}") + + print("\nTop 10 Important Variables:") + print(df_top10[['description', 'gain']].to_string(index=False)) + + +def print_experiment_details(experiment): + """Prints details of a Driverless AI experiment.""" + print(f"\nExperiment Details:") + print(f"Name: {experiment.name}") + print("\nDatasets:") + for dataset in experiment.datasets: + print(f" - {dataset}") + print(f"\nTarget: {experiment.settings.get('target_column')}") + print(f"Scorer: {experiment.metrics().get('scorer')}") + print(f"Task: {experiment.settings.get('task')}") + print(f"Size: {experiment.size}") + print(f"Summary: {experiment.summary}") + print("\nStatus:") + print(experiment.status(verbose=2)) + print("\nWeb Page: ", 
+    experiment.gui()
+
+    print(f"\nMetrics: {experiment.metrics()}")
+
+
+def plot_roc_curve(roc_data, save_dir='plots'):
+    """Plot ROC (Receiver Operating Characteristic) curve and save to file"""
+    df = pd.DataFrame(roc_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['False Positive Rate'], df['True Positive Rate'], 'b-', label='ROC curve')
+    plt.plot([0, 1], [0, 1], 'r--', label='Random')
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('ROC Curve')
+    plt.legend()
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'roc_curve.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_precision_recall(pr_data, save_dir='plots'):
+    """Plot Precision-Recall curve and save to file"""
+    df = pd.DataFrame(pr_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Recall'], df['Precision'], 'g-')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'precision_recall_curve.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_gains_chart(gains_data, save_dir='plots'):
+    """Plot Cumulative Gains chart and save to file"""
+    df = pd.DataFrame(gains_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Gains'], 'b-')
+    plt.plot([0, 1], [0, 1], 'r--', label='Random')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('Cumulative Gains')
+    plt.title('Cumulative Gains Chart')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'gains_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_lift_chart(lift_data, save_dir='plots'):
+    """Plot Lift chart and save to file"""
+    df = pd.DataFrame(lift_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Lift'], 'g-')
+    plt.axhline(y=1, color='r', linestyle='--', label='Baseline')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('Lift')
+    plt.title('Lift Chart')
+    plt.legend()
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'lift_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_ks_chart(ks_data, save_dir='plots'):
+    """Plot Kolmogorov-Smirnov chart and save to file"""
+    df = pd.DataFrame(ks_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Gains'], 'b-')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('KS Statistic')
+    plt.title('Kolmogorov-Smirnov Chart')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'ks_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_all_charts(roc_curve, prec_recall_curve, gains_chart, lift_chart, ks_chart, save_dir='plots'):
+    """Plot all available classification metrics charts and save to file"""
+
+    # Create subplots for available charts
+    available_charts = sum(x is not None for x in [roc_curve, prec_recall_curve, gains_chart, lift_chart, ks_chart])
+    rows = (available_charts + 1) // 2  # Calculate rows needed
+
+    fig = plt.figure(figsize=(15, 5 * rows))
+
+    plot_idx = 1
+
+    if roc_curve is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(roc_curve['layer'][0]['data']['values'])
+        plt.plot(df['False Positive Rate'], df['True Positive Rate'], 'b-')
+        plt.plot([0, 1], [0, 1], 'r--')
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.title('ROC Curve')
+        plt.grid(True)
+        plot_idx += 1
+
+    if prec_recall_curve is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(prec_recall_curve['layer'][0]['data']['values'])
+        plt.plot(df['Recall'], df['Precision'], 'g-')
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.title('Precision-Recall Curve')
+        plt.grid(True)
+        plot_idx += 1
+
+    if gains_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(gains_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Gains'], 'b-')
+        plt.plot([0, 1], [0, 1], 'r--')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('Cumulative Gains')
+        plt.title('Cumulative Gains Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    if lift_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(lift_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Lift'], 'g-')
+        plt.axhline(y=1, color='r', linestyle='--')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('Lift')
+        plt.title('Lift Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    if ks_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(ks_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Gains'], 'b-')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('KS Statistic')
+        plt.title('Kolmogorov-Smirnov Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    plt.tight_layout()
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'all_classification_metrics.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def key_to_experiment(experiment_key, client_id, dai_engine, token_endpoint_url, token, environment):
+    if experiment_key is None:
+        raise ValueError("Either experiment or experiment_key must be provided")
+    engine_manager = connect_to_h2o_engine(token, client_id, token_endpoint_url, environment)
+    dai = connect_to_driverless_ai(engine_manager, dai_engine)
+    experiment = dai.experiments.get(experiment_key)
+    return experiment
+
+
+def get_artifacts(experiment=None, experiment_key=None, client_id=None, dai_engine=None, token_endpoint_url=None,
+                  token=None, environment=None, save_dir='./'):
+    if experiment is None:
+        experiment = key_to_experiment(experiment_key, client_id, dai_engine, token_endpoint_url, token, environment)
+
+    artifacts = experiment.artifacts.list()
+    if 'logs' in artifacts:
+        logs_zip = experiment.artifacts.download(only=['logs'], dst_dir=save_dir, overwrite=True)['logs']
+        logs_dir = './logs_dir'
+        with ZipFile(logs_zip, 'r') as zip_ref:
+            zip_ref.extractall(logs_dir)
+        os.remove(logs_zip)
+        log_files = [os.path.join(os.getcwd(), logs_dir, x) for x in os.listdir(logs_dir)]
+
+        for fil in log_files:
+            if fil.endswith('.zip'):
+                with ZipFile(fil, 'r') as zip_ref:
+                    zip_ref.extractall(logs_dir)
+        log_files = [os.path.join(os.getcwd(), logs_dir, x) for x in os.listdir(logs_dir)]
+        print(f"Extracted experiment log files: {log_files}")
+
+        moved = []
+        useful_extensions = ['.png', '.csv', '.json']
+        for fil in log_files:
+            if any(fil.endswith(ext) for ext in useful_extensions):
+                shutil.copy(fil, save_dir)
+                new_abs_path = os.path.join(save_dir, os.path.basename(fil))
+                moved.append(new_abs_path)
+        print(f"Log files copied to {save_dir}: {moved}")
+
+    if 'summary' in artifacts:
+        summary_zip = experiment.artifacts.download(only=['summary'], dst_dir=save_dir, overwrite=True)['summary']
+        summary_dir = './summary_dir'
+        with ZipFile(summary_zip, 'r') as zip_ref:
+            zip_ref.extractall(summary_dir)
+        os.remove(summary_zip)
+        summary_files = [os.path.join(os.getcwd(), summary_dir, x) for x in os.listdir(summary_dir)]
+        print(f"Extracted summary files: {summary_files}")
+        moved = []
+        useful_extensions = ['.png', '.csv', '.json']
+        for fil in summary_files:
+            if any(fil.endswith(ext) for ext in useful_extensions):
+                shutil.copy(fil, save_dir)
+                new_abs_path = os.path.join(save_dir, os.path.basename(fil))
+                moved.append(new_abs_path)
+        print(f"Summary files copied to {save_dir}: {moved}")
+    if 'train_predictions' in artifacts:
+        train_preds = experiment.artifacts.download(only=['train_predictions'], dst_dir=save_dir, overwrite=True)[
+            'train_predictions']
+        print(f"Train predictions saved to {train_preds}")
+        print(f"Head of train predictions: {pd.read_csv(train_preds).head()}")
+    if 'test_predictions' in artifacts:
+        test_preds = experiment.artifacts.download(only=['test_predictions'], dst_dir=save_dir, overwrite=True)[
+            'test_predictions']
+        print(f"Test predictions saved to {test_preds}")
+        print(f"Head of test predictions: {pd.read_csv(test_preds).head()}")
+    if 'autoreport' in artifacts:
+        autoreport = experiment.artifacts.download(only=['autoreport'], dst_dir=save_dir, overwrite=True)['autoreport']
+        print(f"Autoreport saved to {autoreport}")
+    if 'autodoc' in artifacts:
+        autodoc = experiment.artifacts.download(only=['autodoc'], dst_dir=save_dir, overwrite=True)['autodoc']
+        print(f"Autodoc saved to {autodoc}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run Driverless AI experiments from command line.")
+
+    # instance
+    parser.add_argument("--engine", "--dai_engine", default="daidemo",
+                        help="Name of the DAI engine")
+    parser.add_argument("--client_id", "--dai_client_id", default=os.getenv('DAI_CLIENT_ID', "hac-platform-public"),
+                        help="Name of client_id")
+    parser.add_argument("--token_endpoint_url", "--dai_token_endpoint_url", default=os.getenv('DAI_TOKEN_ENDPOINT_URL',
+                        "https://auth.internal.dedicated.h2o.ai/auth/realms/hac/protocol/openid-connect/token"),
+                        help="Token endpoint url")
+    parser.add_argument("--environment", "--dai_environment",
+                        default=os.getenv('DAI_ENVIRONMENT', "https://internal.dedicated.h2o.ai"),
+                        help="DAI environment")
+    parser.add_argument("--token", "--dai_token", default=os.getenv('DAI_TOKEN'),
+                        help="DAI token")
+    parser.add_argument('--demo_mode', action='store_true', help="Use demo mode")
+
+    # Existing experiment
+    parser.add_argument("--experiment_key", default="",
+                        help="Key of an existing experiment to re-use")
+    parser.add_argument("--dataset_key", default="",
+                        help="Key of an existing dataset to re-use")
+
+    # Creating new dataset
+    parser.add_argument("--data-url", required=False,
+                        default="",
+                        help="URL to the dataset (e.g., S3 URL)")
+    parser.add_argument("--dataset-name", default="Dataset",
+                        help="Name for the dataset in DAI (default: Dataset)")
+    parser.add_argument("--data-source", default="s3",
+                        help="Source type of the dataset (default: s3)")
+
+    # Creating new experiment
+    parser.add_argument("--target-column",
+                        default="Churn?",
+                        required=False,
+                        help="Name of the target column for prediction")
+    parser.add_argument("--task", default="classification",
+                        choices=["classification", "regression", "predict",
+                                 "shapley",
+                                 "shapley_original_features",
+                                 "shapley_transformed_features",
+                                 "transform",
+                                 "fit_transform",
+                                 "fit_and_transform",
+                                 "artifacts",
+                                 ],
+                        help="Type of ML task (default: classification)")
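+    # The scorer below should be valid for the chosen task (assumption: scorer names
+    # as exposed by the DAI Python client): classification commonly uses AUC, F1, or
+    # LOGLOSS, while regression commonly uses RMSE, MAE, or R2.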
parser.add_argument("--scorer", default="F1", + help="Evaluation metric to use (default: F1)") + parser.add_argument("--experiment-name", default="Experiment", + help="Name for the experiment (default: Experiment)") + parser.add_argument("--accuracy", type=int, choices=range(1, 11), default=1, + help="Accuracy setting (1-10, default: 1)") + parser.add_argument("--time", type=int, choices=range(1, 11), default=1, + help="Time setting (1-10, default: 1)") + parser.add_argument("--interpretability", type=int, choices=range(1, 11), default=6, + help="Interpretability setting (1-10, default: 6)") + parser.add_argument("--train-size", type=float, default=0.8, + help="Proportion of data for training (default: 0.8)") + parser.add_argument("--seed", type=int, default=42, + help="Random seed for reproducibility (default: 42)") + parser.add_argument("--fast", action="store_false", + help="Use fast settings for experiment or predictions") + parser.add_argument("--force", action="store_false", + help="Force overwrite existing datasets/experiments") + + args = parser.parse_args() + + # Connect to H2O + engine_manager = connect_to_h2o_engine(args.token, args.client_id, args.token_endpoint_url, args.environment) + dai = connect_to_driverless_ai(engine_manager, args.engine) + + # Create plots directory if it doesn't exist + save_dir = './' + + # Ensure all columns are displayed + pd.set_option('display.max_columns', None) + pd.set_option('display.expand_frame_repr', False) # Prevent wrapping to multiple lines + + if args.experiment_key: + # Re-use existing experiment + experiment = dai.experiments.get(args.experiment_key) + print(f"Re-using existing experiment: {experiment.name} with experiment_key: {experiment.key}") + + # Create dataset for (e.g.) transform or predict + if args.data_url: + dataset = create_dataset( + dai, + args.data_url, + args.dataset_name, + args.data_source, + args.force + ) + elif args.dataset_key: + # Re-use existing dataset + dataset = dai.datasets.get(args.dataset_key) + print(f"Re-using existing dataset: {dataset.name} with dataset_key: {dataset.key}") + else: + dataset = None + print(f"Performing task {args.task} on experiment {experiment.name}") + if args.task == 'predict': + if dataset is None: + print("Dataset key is required for prediction.") + else: + prediction = experiment.predict(dataset) + prediction_csv = prediction.download(dst_file=os.path.join(save_dir, 'prediction.csv'), overwrite=True) + print(f"Prediction saved to {prediction_csv}") + print(f"Head of prediction:\n{pd.read_csv(prediction_csv).head()}") + elif args.task in ['shapley', 'shapley_original_features']: + if dataset is None: + print("Dataset key is required for shapley prediction.") + else: + prediction = experiment.predict(dataset, include_shap_values_for_original_features=True, + use_fast_approx_for_shap_values=args.fast) + prediction_csv = prediction.download(dst_file=os.path.join(save_dir, 'shapley_original_features.csv'), + overwrite=True) + print(f"Shapley on original features saved to {prediction_csv}") + print(f"Head of shapley on original features:\n{pd.read_csv(prediction_csv).head()}") + print( + "Column names for contributions (Shapley values) are in form contrib_, which you should programatically access instead of repeating all the names in any python code.") + elif args.task == 'shapley_transformed_features': + if dataset is None: + print("Dataset key is required for shapley prediction.") + else: + prediction = experiment.predict(dataset, include_shap_values_for_transformed_features=True, 
+        elif args.task == 'shapley_transformed_features':
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for shapley prediction.")
+            else:
+                prediction = experiment.predict(dataset, include_shap_values_for_transformed_features=True,
+                                                use_fast_approx_for_shap_values=args.fast)
+                prediction_csv = prediction.download(
+                    dst_file=os.path.join(save_dir, 'shapley_transformed_features.csv'), overwrite=True)
+                print(f"Shapley on transformed features saved to {prediction_csv}")
+                print(f"Head of shapley on transformed features:\n{pd.read_csv(prediction_csv).head()}")
+                print(
+                    "Column names for contributions (Shapley values) are of the form contrib_<feature_name>, which you should access programmatically instead of repeating all the names in any python code.")
+        elif args.task == 'transform':
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for transformation.")
+            else:
+                transformation = experiment.transform(dataset)
+                transformation_csv = transformation.download(dst_file=os.path.join(save_dir, 'transformation.csv'),
+                                                             overwrite=True)
+                print(f"Transformation saved to {transformation_csv}")
+                print(f"Head of transformation:\n{pd.read_csv(transformation_csv).head()}")
+        elif args.task in ['fit_transform', 'fit_and_transform']:
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for fit_and_transform.")
+            else:
+                transformation = experiment.fit_and_transform(dataset)
+
+                if transformation.test_dataset:
+                    transformation_csv = transformation.download_transformed_test_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_test.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on test dataset saved to {transformation_csv}")
+                    print(f"Head of fit and transformation on test dataset:\n{pd.read_csv(transformation_csv).head()}")
+
+                if transformation.training_dataset:
+                    transformation_csv = transformation.download_transformed_training_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_train.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on training dataset saved to {transformation_csv}")
+                    print(
+                        f"Head of fit and transformation on training dataset:\n{pd.read_csv(transformation_csv).head()}")
+
+                if transformation.validation_dataset:
+                    print(f"validation_split_fraction: {transformation.validation_split_fraction}")
+                    transformation_csv = transformation.download_transformed_validation_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_valid.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on validation saved to {transformation_csv}")
+                    print(
+                        f"Head of fit and transformation on validation dataset:\n{pd.read_csv(transformation_csv).head()}")
+        elif args.task == 'artifacts':
+            get_artifacts(experiment=experiment, save_dir=save_dir)
+        elif args.task in ['regression', 'classification']:
+            print(f"{args.task} task does not apply when re-using an existing experiment.")
+        else:
+            print(f"Nothing to do for task {args.task} on experiment {experiment.name}")
+
+    else:
+        if args.demo_mode:
+            args.data_url = "https://h2o-internal-release.s3-us-west-2.amazonaws.com/data/Splunk/churn.csv"
+            args.target_column = "Churn?"
+            args.task = "classification"
+            args.scorer = "F1"
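+            # Note: demo mode overrides any user-supplied data URL, target column,
+            # task, and scorer with the public churn demo settings above.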
+ args.task = "classification" + args.scorer = "F1" + + # Create and split dataset + dataset = create_dataset( + dai, + args.data_url, + args.dataset_name, + args.data_source, + args.force + ) + + train_test_split = split_dataset( + dataset, + args.train_size, + f"{args.dataset_name}_train", + f"{args.dataset_name}_test", + args.target_column, + args.seed + ) + + # Create and run experiment + experiment = create_experiment( + dai, + train_test_split, + args.target_column, + args.scorer, + args.task, + args.experiment_name, + args.accuracy, + args.time, + args.interpretability, + args.force, + args.fast, + ) + + # Print details and visualize results + print_experiment_details(experiment) + visualize_importance(experiment) + + # Individual plots + metric_plots = experiment.metric_plots + if args.task == 'classification': + plot_roc_curve(metric_plots.roc_curve, save_dir) + plot_precision_recall(metric_plots.prec_recall_curve, save_dir) + plot_gains_chart(metric_plots.gains_chart, save_dir) + plot_lift_chart(metric_plots.lift_chart, save_dir) + plot_ks_chart(metric_plots.ks_chart, save_dir) + + # All plots in one figure + plot_all_charts(metric_plots.roc_curve, metric_plots.prec_recall_curve, metric_plots.gains_chart, + metric_plots.lift_chart, metric_plots.ks_chart, save_dir) + else: + # FIXME: Add regression metrics plots + print("Regression task detected. No classification metrics to plot.") + + get_artifacts(experiment=experiment, save_dir=save_dir) + + +if __name__ == "__main__": + main() diff --git a/reqs_optional/requirements_optional_agents.txt b/reqs_optional/requirements_optional_agents.txt index 92cd9b694..8b6203e05 100644 --- a/reqs_optional/requirements_optional_agents.txt +++ b/reqs_optional/requirements_optional_agents.txt @@ -94,3 +94,7 @@ microsoft-bing-newssearch #microsoft-bing-autosuggest microsoft-bing-customimagesearch microsoft-bing-customwebsearch + +# DAI: +h2o_engine_manager +h2o_authn \ No newline at end of file diff --git a/src/version.py b/src/version.py index f365e4a44..b34ee573b 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "1715e071b421e762eff99c23767095b9c6555e66" +__version__ = "263ec150c635d116ac1549842eebe7882fbf3a0b"