From b437c5a5e6089ff42c2fb556b20df0b0a32dcece Mon Sep 17 00:00:00 2001
From: "Jonathan C. McKinney"
Date: Wed, 6 Nov 2024 01:52:25 -0800
Subject: [PATCH] Add DAI tool

---
 openai_server/agent_prompting.py                   |  35 +-
 .../agent_tools/driverless_ai_data_science.py      | 660 ++++++++++++++++++
 .../requirements_optional_agents.txt               |   4 +
 src/version.py                                     |   2 +-
 4 files changed, 699 insertions(+), 2 deletions(-)
 create mode 100644 openai_server/agent_tools/driverless_ai_data_science.py

diff --git a/openai_server/agent_prompting.py b/openai_server/agent_prompting.py
index 78975386c..ff832b1d0 100644
--- a/openai_server/agent_prompting.py
+++ b/openai_server/agent_prompting.py
@@ -721,6 +721,34 @@ def get_wolfram_alpha_helper():
     return wolframalpha
 
 
+def get_dai_helper():
+    cwd = os.path.abspath(os.getcwd())
+    if os.getenv('ENABLE_DAI'):
+        # DriverlessAI (DAI) AutoML integration for the agent.
+        # Documents the pre-built agent tool script driverless_ai_data_science.py below.
+        dai = f"""\n* DriverlessAI is an advanced AutoML tool for building data science models and making predictions.
+* If the user specifically asks for a DAI model, then you should use the existing pre-built python code to query DriverlessAI, e.g.:
+```sh
+# filename: my_dai_query.sh
+# execution: true
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py
+```
+* usage: python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py [--experiment_key EXPERIMENT_KEY] [--dataset_key DATASET_KEY] [--data-url DATA_URL] [--dataset-name DATASET_NAME] [--data-source DATA_SOURCE] [--target-column TARGET_COLUMN] [--task {{classification,regression,predict,shapley,shapley_original_features,shapley_transformed_features,transform,fit_transform,fit_and_transform,artifacts}}] [--scorer SCORER] [--experiment-name EXPERIMENT_NAME] [--accuracy {{1,2,3,4,5,6,7,8,9,10}}] [--time {{1,2,3,4,5,6,7,8,9,10}}] [--interpretability {{1,2,3,4,5,6,7,8,9,10}}] [--train-size TRAIN_SIZE] [--seed SEED] [--fast] [--force]
+* A typical call to create an experiment might be:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --dataset-name "my_dataset" --data-url "https://mydata.com/mydata.csv" --target-column "target" --task "classification" --scorer "auc" --experiment-name "my_experiment"
+* A typical re-use of the experiment_key and dataset_key for prediction (or shapley, transform, fit_and_transform) would be:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --experiment_key EXPERIMENT_KEY --dataset_key DATASET_KEY --task "predict"
+* For predict, shapley, transform, and fit_and_transform, one can also pass --data-url to use a fresh dataset with the given experiment, e.g.:
+python {cwd}/openai_server/agent_tools/driverless_ai_data_science.py --experiment_key EXPERIMENT_KEY --data-url "https://mydata.com/mydata.csv" --task "predict"
+"""
+        if os.getenv('DAI_TOKEN') is None:
+            dai += """* Additionally, you must pass --token on the command line to use the DAI tool.\n"""
+        dai += """* You may also pass these additional options if the user provides them: --engine DAI_ENGINE --client_id DAI_CLIENT_ID --token_endpoint_url DAI_TOKEN_ENDPOINT_URL --environment DAI_ENVIRONMENT --token DAI_TOKEN\n"""
+    else:
+        dai = ""
+    return dai
+
+
 def get_news_api_helper():
     cwd = os.path.abspath(os.getcwd())
     have_internet = get_have_internet()
@@ -824,7 +852,6 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
     mermaid_renderer_helper = get_mermaid_renderer_helper()
     image_generation_helper = get_image_generation_helper()
     audio_transcription_helper = get_audio_transcription_helper()
-    query_to_web_image_helper = get_query_to_web_image_helper()
     aider_coder_helper = get_aider_coder_helper(base_url, api_key, model, autogen_timeout)
     rag_helper = get_rag_helper(base_url, api_key, model, autogen_timeout, text_context_list, image_file)
     convert_helper = get_convert_to_text_helper()
@@ -836,6 +863,10 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
     wolfram_alpha_helper = get_wolfram_alpha_helper()
     news_helper = get_news_api_helper()
     bing_search_helper = get_bing_search_helper()
+    query_to_web_image_helper = get_query_to_web_image_helper()
+
+    # data science
+    dai_helper = get_dai_helper()
 
     # general API notes:
     api_helper = get_api_helper()
@@ -875,6 +906,8 @@ def get_full_system_prompt(agent_code_writer_system_message, agent_system_site_p
         news_helper,
         bing_search_helper,
         query_to_web_image_helper,
+        # data science
+        dai_helper,
         # overall
         api_helper,
         agent_tools_note,
diff --git a/openai_server/agent_tools/driverless_ai_data_science.py b/openai_server/agent_tools/driverless_ai_data_science.py
new file mode 100644
index 000000000..8e6904b5b
--- /dev/null
+++ b/openai_server/agent_tools/driverless_ai_data_science.py
@@ -0,0 +1,660 @@
+import argparse
+import os
+import shutil
+from zipfile import ZipFile
+
+import pandas as pd
+from matplotlib import pyplot as plt
+
+
+def connect_to_h2o_engine(token: str, client_id, token_endpoint_url, environment):
+    """Establishes a secure connection to the H2O Engine Manager using the provided token."""
+    # https://internal.dedicated.h2o.ai/cli-and-api-access
+    import h2o_authn
+    token_provider = h2o_authn.TokenProvider(
+        refresh_token=token,
+        client_id=client_id,
+        token_endpoint_url=token_endpoint_url,
+    )
+
+    import h2o_engine_manager
+    engine_manager = h2o_engine_manager.login(
+        environment=environment,
+        token_provider=token_provider
+    )
+
+    # https://docs.h2o.ai/mlops/py-client/install
+    # os.system('pip install h2o-mlops')
+    # import h2o_mlops
+    # mlops = h2o_mlops.Client(
+    #     gateway_url="https://mlops-api.internal.dedicated.h2o.ai",
+    #     token_provider=token_provider
+    # )
+
+    print("Successfully connected to H2O engine manager.")
+    return engine_manager
+
+
+def connect_to_driverless_ai(engine_manager, dai_engine: str = None):
+    """Resumes an existing Driverless AI engine (creating it if needed) and connects to it."""
+    dai_engine_obj = None
+    for dai_inst in engine_manager.dai_engine_client.list_all_engines():
+        if dai_inst.display_name == dai_engine:
+            dai_engine_obj = engine_manager.dai_engine_client.get_engine(dai_engine)
+            if dai_engine_obj.state.value != "STATE_RUNNING":
+                print(f"Waking up instance {dai_engine}")
+                dai_engine_obj.resume()
+                dai_engine_obj.wait()
+
+    if dai_engine_obj is None:
+        # if DAI Engine does not exist
+        print(f"Creating instance {dai_engine}")
+        dai_engine_obj = engine_manager.dai_engine_client.create_engine(display_name=dai_engine)
+        dai_engine_obj.wait()
+
+    dai = dai_engine_obj.connect()
+    print(f"Successfully connected to Driverless AI engine: {dai_engine}")
+    return dai
+
+
+def create_dataset(dai, data_url: str, dataset_name: str, data_source: str = "s3", force: bool = True):
+    """Creates a dataset in the Driverless AI instance."""
+    dataset = dai.datasets.create(
+        data=data_url,
+        data_source=data_source,
+        name=dataset_name,
+        force=force
+    )
+    print(f"Dataset {dataset_name} with reusable dataset_key: {dataset.key} created successfully.")
+    return dataset
+
+
+def split_dataset(dataset, train_size: float, train_name: str, test_name: str,
+                  target_column: str, seed: int = 42):
"""Splits a dataset into train and test sets.""" + dataset_split = dataset.split_to_train_test( + train_size=train_size, + train_name=train_name, + test_name=test_name, + target_column=target_column, + seed=seed + ) + + print("Dataset successfully split into training and testing sets.") + for k, v in dataset_split.items(): + print(f"Name: {v.name} with reusable dataset_key: {v.key}") + + return dataset_split + + +def create_experiment(dai, dataset_split, target_column: str, scorer: str = 'F1', + task: str = 'classification', experiment_name: str = 'Experiment', + accuracy: int = 1, time: int = 1, interpretability: int = 6, + fast=True, + force: bool = True): + """Creates an experiment in Driverless AI.""" + experiment_settings = { + **dataset_split, + 'task': task, + 'target_column': target_column, + 'scorer': scorer + } + + dai_settings = { + 'accuracy': accuracy, + 'time': time, + 'interpretability': interpretability, + } + if fast: + print("Using fast settings, but still making autoreport") + dai_settings.update({ + 'make_python_scoring_pipeline': 'off', + 'make_mojo_scoring_pipeline': 'off', + 'benchmark_mojo_latency': 'off', + 'make_autoreport': True, + 'check_leakage': 'off', + 'check_distribution_shift': 'off' + }) + + experiment = dai.experiments.create( + **experiment_settings, + name=experiment_name, + **dai_settings, + force=force + ) + + print(f"Experiment {experiment_name} with reusable experiment_key: {experiment.key} created with settings: " + f"Accuracy={accuracy}, Time={time}, Interpretability={interpretability}") + return experiment + + +def get_experiment_from_key(experiment_key, token, client_id, token_endpoint_url, dai_engine, environment): + # FIXME: not used yet, would be used to act more on experiment, like restart etc. + # Connect to the engine manager and Driverless AI + engine_manager = connect_to_h2o_engine(token, client_id, token_endpoint_url, environment) + dai = connect_to_driverless_ai(engine_manager, dai_engine) + + # Get the experiment + experiment = dai.experiments.get(experiment_key) + return experiment + + +def visualize_importance(experiment): + """Visualizes and saves variable importance plot.""" + var_imp = experiment.variable_importance() + print("\nVariable Importance Output:") + print(var_imp) + + # Save variable importance to csv + df = pd.DataFrame(var_imp.data, columns=var_imp.headers) + csv_file = "variable_importance.csv" + df.to_csv(csv_file, index=False) + df_top10 = df.sort_values('gain', ascending=False).head(10) + + plt.figure(figsize=(12, 8)) + plt.barh(df_top10['description'], df_top10['gain']) + plt.title('Top 10 Important Variables') + plt.xlabel('Importance (Gain)') + plt.tight_layout() + + output_path = 'variable_importance.png' + plt.savefig(output_path) + print(f"\nVariable importance plot saved as {output_path} and csv file as {csv_file}") + + print("\nTop 10 Important Variables:") + print(df_top10[['description', 'gain']].to_string(index=False)) + + +def print_experiment_details(experiment): + """Prints details of a Driverless AI experiment.""" + print(f"\nExperiment Details:") + print(f"Name: {experiment.name}") + print("\nDatasets:") + for dataset in experiment.datasets: + print(f" - {dataset}") + print(f"\nTarget: {experiment.settings.get('target_column')}") + print(f"Scorer: {experiment.metrics().get('scorer')}") + print(f"Task: {experiment.settings.get('task')}") + print(f"Size: {experiment.size}") + print(f"Summary: {experiment.summary}") + print("\nStatus:") + print(experiment.status(verbose=2)) + print("\nWeb Page: ", 
+    experiment.gui()
+
+    print(f"\nMetrics: {experiment.metrics()}")
+
+
+def plot_roc_curve(roc_data, save_dir='plots'):
+    """Plot ROC (Receiver Operating Characteristic) curve and save to file"""
+    df = pd.DataFrame(roc_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['False Positive Rate'], df['True Positive Rate'], 'b-', label='ROC curve')
+    plt.plot([0, 1], [0, 1], 'r--', label='Random')
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('ROC Curve')
+    plt.legend()
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'roc_curve.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_precision_recall(pr_data, save_dir='plots'):
+    """Plot Precision-Recall curve and save to file"""
+    df = pd.DataFrame(pr_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Recall'], df['Precision'], 'g-')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision-Recall Curve')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'precision_recall_curve.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_gains_chart(gains_data, save_dir='plots'):
+    """Plot Cumulative Gains chart and save to file"""
+    df = pd.DataFrame(gains_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Gains'], 'b-')
+    plt.plot([0, 1], [0, 1], 'r--', label='Random')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('Cumulative Gains')
+    plt.title('Cumulative Gains Chart')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'gains_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_lift_chart(lift_data, save_dir='plots'):
+    """Plot Lift chart and save to file"""
+    df = pd.DataFrame(lift_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Lift'], 'g-')
+    plt.axhline(y=1, color='r', linestyle='--', label='Baseline')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('Lift')
+    plt.title('Lift Chart')
+    plt.legend()
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'lift_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_ks_chart(ks_data, save_dir='plots'):
+    """Plot Kolmogorov-Smirnov chart and save to file"""
+    df = pd.DataFrame(ks_data['layer'][0]['data']['values'])
+
+    plt.figure(figsize=(8, 6))
+    plt.plot(df['Quantile'], df['Gains'], 'b-')
+    plt.xlabel('Population Percentage')
+    plt.ylabel('KS Statistic')
+    plt.title('Kolmogorov-Smirnov Chart')
+    plt.grid(True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'ks_chart.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def plot_all_charts(roc_curve, prec_recall_curve, gains_chart, lift_chart, ks_chart, save_dir='plots'):
+    """Plot all available classification metrics charts and save to file"""
+
+    # Create subplots for available charts
+    available_charts = sum(x is not None for x in [roc_curve, prec_recall_curve, gains_chart, lift_chart, ks_chart])
+    rows = (available_charts + 1) // 2  # Calculate rows needed
+
+    fig = plt.figure(figsize=(15, 5 * rows))
+
+    plot_idx = 1
+
+    if roc_curve is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(roc_curve['layer'][0]['data']['values'])
+        plt.plot(df['False Positive Rate'], df['True Positive Rate'], 'b-')
+        plt.plot([0, 1], [0, 1], 'r--')
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.title('ROC Curve')
+        plt.grid(True)
+        plot_idx += 1
+
+    if prec_recall_curve is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(prec_recall_curve['layer'][0]['data']['values'])
+        plt.plot(df['Recall'], df['Precision'], 'g-')
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.title('Precision-Recall Curve')
+        plt.grid(True)
+        plot_idx += 1
+
+    if gains_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(gains_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Gains'], 'b-')
+        plt.plot([0, 1], [0, 1], 'r--')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('Cumulative Gains')
+        plt.title('Cumulative Gains Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    if lift_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(lift_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Lift'], 'g-')
+        plt.axhline(y=1, color='r', linestyle='--')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('Lift')
+        plt.title('Lift Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    if ks_chart is not None:
+        plt.subplot(rows, 2, plot_idx)
+        df = pd.DataFrame(ks_chart['layer'][0]['data']['values'])
+        plt.plot(df['Quantile'], df['Gains'], 'b-')
+        plt.xlabel('Population Percentage')
+        plt.ylabel('KS Statistic')
+        plt.title('Kolmogorov-Smirnov Chart')
+        plt.grid(True)
+        plot_idx += 1
+
+    plt.tight_layout()
+
+    os.makedirs(save_dir, exist_ok=True)
+    plt.savefig(os.path.join(save_dir, 'all_classification_metrics.png'), dpi=300, bbox_inches='tight')
+    plt.close()
+
+
+def key_to_experiment(experiment_key, client_id, dai_engine, token_endpoint_url, token, environment):
+    if experiment_key is None:
+        raise ValueError("Either experiment or experiment_key must be provided")
+    engine_manager = connect_to_h2o_engine(token, client_id, token_endpoint_url, environment)
+    dai = connect_to_driverless_ai(engine_manager, dai_engine)
+    experiment = dai.experiments.get(experiment_key)
+    return experiment
+
+
+def get_artifacts(experiment=None, experiment_key=None, client_id=None, dai_engine=None, token_endpoint_url=None,
+                  token=None, environment=None, save_dir='./'):
+    if experiment is None:
+        experiment = key_to_experiment(experiment_key, client_id, dai_engine, token_endpoint_url, token, environment)
+
+    artifacts = experiment.artifacts.list()
+    if 'logs' in artifacts:
+        logs_zip = experiment.artifacts.download(only=['logs'], dst_dir=save_dir, overwrite=True)['logs']
+        logs_dir = './logs_dir'
+        with ZipFile(logs_zip, 'r') as zip_ref:
+            zip_ref.extractall(logs_dir)
+        os.remove(logs_zip)
+        log_files = [os.path.join(os.getcwd(), logs_dir, x) for x in os.listdir(logs_dir)]
+
+        for fil in log_files:
+            if fil.endswith('.zip'):
+                with ZipFile(fil, 'r') as zip_ref:
+                    zip_ref.extractall(logs_dir)
+        log_files = [os.path.join(os.getcwd(), logs_dir, x) for x in os.listdir(logs_dir)]
+        print(f"Extracted experiment log files: {log_files}")
+
+        moved = []
+        useful_extensions = ['.png', '.csv', '.json']
+        for fil in log_files:
+            if any(fil.endswith(ext) for ext in useful_extensions):
+                shutil.copy(fil, save_dir)
+                new_abs_path = os.path.join(save_dir, os.path.basename(fil))
+                moved.append(new_abs_path)
+        print(f"Log files copied to {save_dir}: {moved}")
+
+    if 'summary' in artifacts:
+        summary_zip = experiment.artifacts.download(only=['summary'], dst_dir=save_dir, overwrite=True)['summary']
+        summary_dir = './summary_dir'
+        with ZipFile(summary_zip, 'r') as zip_ref:
+            zip_ref.extractall(summary_dir)
+        os.remove(summary_zip)
+        summary_files = [os.path.join(os.getcwd(), summary_dir, x) for x in os.listdir(summary_dir)]
+        print(f"Extracted summary files: {summary_files}")
+        moved = []
+        useful_extensions = ['.png', '.csv', '.json']
+        for fil in summary_files:
+            if any(fil.endswith(ext) for ext in useful_extensions):
+                shutil.copy(fil, save_dir)
+                new_abs_path = os.path.join(save_dir, os.path.basename(fil))
+                moved.append(new_abs_path)
+        print(f"Summary files copied to {save_dir}: {moved}")
+    if 'train_predictions' in artifacts:
+        train_preds = experiment.artifacts.download(only=['train_predictions'], dst_dir=save_dir, overwrite=True)[
+            'train_predictions']
+        print(f"Train predictions saved to {train_preds}")
+        print(f"Head of train predictions: {pd.read_csv(train_preds).head()}")
+    if 'test_predictions' in artifacts:
+        test_preds = experiment.artifacts.download(only=['test_predictions'], dst_dir=save_dir, overwrite=True)[
+            'test_predictions']
+        print(f"Test predictions saved to {test_preds}")
+        print(f"Head of test predictions: {pd.read_csv(test_preds).head()}")
+    if 'autoreport' in artifacts:
+        autoreport = experiment.artifacts.download(only=['autoreport'], dst_dir=save_dir, overwrite=True)['autoreport']
+        print(f"Autoreport saved to {autoreport}")
+    if 'autodoc' in artifacts:
+        autodoc = experiment.artifacts.download(only=['autodoc'], dst_dir=save_dir, overwrite=True)['autodoc']
+        print(f"Autodoc saved to {autodoc}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run Driverless AI experiments from command line.")
+
+    # instance
+    parser.add_argument("--engine", "--dai_engine", default="daidemo",
+                        help="Name of the DAI engine")
+    parser.add_argument("--client_id", "--dai_client_id", default=os.getenv('DAI_CLIENT_ID', "hac-platform-public"),
+                        help="Name of client_id")
+    parser.add_argument("--token_endpoint_url", "--dai_token_endpoint_url", default=os.getenv('DAI_TOKEN_ENDPOINT_URL',
+                        "https://auth.internal.dedicated.h2o.ai/auth/realms/hac/protocol/openid-connect/token"),
+                        help="Token endpoint url")
+    parser.add_argument("--environment", "--dai_environment",
+                        default=os.getenv('DAI_ENVIRONMENT', "https://internal.dedicated.h2o.ai"),
+                        help="DAI environment")
+    parser.add_argument("--token", "--dai_token", default=os.getenv('DAI_TOKEN'),
+                        help="DAI token")
+    parser.add_argument('--demo_mode', action='store_true', help="Use demo mode")
+
+    # Existing experiment
+    parser.add_argument("--experiment_key", default="",
+                        help="Key of an existing experiment to re-use")
+    parser.add_argument("--dataset_key", default="",
+                        help="Key of an existing dataset to re-use")
+
+    # Creating new dataset
+    parser.add_argument("--data-url", required=False,
+                        default="",
+                        help="URL to the dataset (e.g., S3 URL)")
+    parser.add_argument("--dataset-name", default="Dataset",
+                        help="Name for the dataset in DAI (default: Dataset)")
+    parser.add_argument("--data-source", default="s3",
+                        help="Source type of the dataset (default: s3)")
+
+    # Creating new experiment
+    parser.add_argument("--target-column",
+                        default="Churn?",
+                        required=False,
+                        help="Name of the target column for prediction")
+    parser.add_argument("--task", default="classification",
+                        choices=["classification", "regression", "predict",
+                                 "shapley",
+                                 "shapley_original_features",
+                                 "shapley_transformed_features",
+                                 "transform",
+                                 "fit_transform",
+                                 "fit_and_transform",
+                                 "artifacts",
+                                 ],
+                        help="Type of ML task (default: classification)")
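+    # The scorer below should be valid for the chosen task (assumption: scorer names
+    # as exposed by the DAI Python client): classification commonly uses AUC, F1, or
+    # LOGLOSS, while regression commonly uses RMSE, MAE, or R2.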
parser.add_argument("--scorer", default="F1", + help="Evaluation metric to use (default: F1)") + parser.add_argument("--experiment-name", default="Experiment", + help="Name for the experiment (default: Experiment)") + parser.add_argument("--accuracy", type=int, choices=range(1, 11), default=1, + help="Accuracy setting (1-10, default: 1)") + parser.add_argument("--time", type=int, choices=range(1, 11), default=1, + help="Time setting (1-10, default: 1)") + parser.add_argument("--interpretability", type=int, choices=range(1, 11), default=6, + help="Interpretability setting (1-10, default: 6)") + parser.add_argument("--train-size", type=float, default=0.8, + help="Proportion of data for training (default: 0.8)") + parser.add_argument("--seed", type=int, default=42, + help="Random seed for reproducibility (default: 42)") + parser.add_argument("--fast", action="store_false", + help="Use fast settings for experiment or predictions") + parser.add_argument("--force", action="store_false", + help="Force overwrite existing datasets/experiments") + + args = parser.parse_args() + + # Connect to H2O + engine_manager = connect_to_h2o_engine(args.token, args.client_id, args.token_endpoint_url, args.environment) + dai = connect_to_driverless_ai(engine_manager, args.engine) + + # Create plots directory if it doesn't exist + save_dir = './' + + # Ensure all columns are displayed + pd.set_option('display.max_columns', None) + pd.set_option('display.expand_frame_repr', False) # Prevent wrapping to multiple lines + + if args.experiment_key: + # Re-use existing experiment + experiment = dai.experiments.get(args.experiment_key) + print(f"Re-using existing experiment: {experiment.name} with experiment_key: {experiment.key}") + + # Create dataset for (e.g.) transform or predict + if args.data_url: + dataset = create_dataset( + dai, + args.data_url, + args.dataset_name, + args.data_source, + args.force + ) + elif args.dataset_key: + # Re-use existing dataset + dataset = dai.datasets.get(args.dataset_key) + print(f"Re-using existing dataset: {dataset.name} with dataset_key: {dataset.key}") + else: + dataset = None + print(f"Performing task {args.task} on experiment {experiment.name}") + if args.task == 'predict': + if dataset is None: + print("Dataset key is required for prediction.") + else: + prediction = experiment.predict(dataset) + prediction_csv = prediction.download(dst_file=os.path.join(save_dir, 'prediction.csv'), overwrite=True) + print(f"Prediction saved to {prediction_csv}") + print(f"Head of prediction:\n{pd.read_csv(prediction_csv).head()}") + elif args.task in ['shapley', 'shapley_original_features']: + if dataset is None: + print("Dataset key is required for shapley prediction.") + else: + prediction = experiment.predict(dataset, include_shap_values_for_original_features=True, + use_fast_approx_for_shap_values=args.fast) + prediction_csv = prediction.download(dst_file=os.path.join(save_dir, 'shapley_original_features.csv'), + overwrite=True) + print(f"Shapley on original features saved to {prediction_csv}") + print(f"Head of shapley on original features:\n{pd.read_csv(prediction_csv).head()}") + print( + "Column names for contributions (Shapley values) are in form contrib_, which you should programatically access instead of repeating all the names in any python code.") + elif args.task == 'shapley_transformed_features': + if dataset is None: + print("Dataset key is required for shapley prediction.") + else: + prediction = experiment.predict(dataset, include_shap_values_for_transformed_features=True, 
+        elif args.task == 'shapley_transformed_features':
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for shapley prediction.")
+            else:
+                prediction = experiment.predict(dataset, include_shap_values_for_transformed_features=True,
+                                                use_fast_approx_for_shap_values=args.fast)
+                prediction_csv = prediction.download(
+                    dst_file=os.path.join(save_dir, 'shapley_transformed_features.csv'), overwrite=True)
+                print(f"Shapley on transformed features saved to {prediction_csv}")
+                print(f"Head of shapley on transformed features:\n{pd.read_csv(prediction_csv).head()}")
+                print(
+                    "Column names for contributions (Shapley values) are of the form contrib_<feature_name>, which you should access programmatically instead of repeating all the names in any python code.")
+        elif args.task == 'transform':
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for transformation.")
+            else:
+                transformation = experiment.transform(dataset)
+                transformation_csv = transformation.download(dst_file=os.path.join(save_dir, 'transformation.csv'),
+                                                             overwrite=True)
+                print(f"Transformation saved to {transformation_csv}")
+                print(f"Head of transformation:\n{pd.read_csv(transformation_csv).head()}")
+        elif args.task in ['fit_transform', 'fit_and_transform']:
+            if dataset is None:
+                print("A dataset (--dataset_key or --data-url) is required for fit_and_transform.")
+            else:
+                transformation = experiment.fit_and_transform(dataset)
+
+                if transformation.test_dataset:
+                    transformation_csv = transformation.download_transformed_test_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_test.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on test dataset saved to {transformation_csv}")
+                    print(f"Head of fit and transformation on test dataset:\n{pd.read_csv(transformation_csv).head()}")
+
+                if transformation.training_dataset:
+                    transformation_csv = transformation.download_transformed_training_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_train.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on training dataset saved to {transformation_csv}")
+                    print(
+                        f"Head of fit and transformation on training dataset:\n{pd.read_csv(transformation_csv).head()}")
+
+                if transformation.validation_dataset:
+                    print(f"validation_split_fraction: {transformation.validation_split_fraction}")
+                    transformation_csv = transformation.download_transformed_validation_dataset(
+                        dst_file=os.path.join(save_dir, 'fit_transformation_valid.csv'),
+                        overwrite=True)
+                    print(f"Fit and Transformation on validation saved to {transformation_csv}")
+                    print(
+                        f"Head of fit and transformation on validation dataset:\n{pd.read_csv(transformation_csv).head()}")
+        elif args.task == 'artifacts':
+            get_artifacts(experiment=experiment, save_dir=save_dir)
+        elif args.task in ['regression', 'classification']:
+            print(f"{args.task} task does not apply when re-using an existing experiment.")
+        else:
+            print(f"Nothing to do for task {args.task} on experiment {experiment.name}")
+
+    else:
+        if args.demo_mode:
+            args.data_url = "https://h2o-internal-release.s3-us-west-2.amazonaws.com/data/Splunk/churn.csv"
+            args.target_column = "Churn?"
+            args.task = "classification"
+            args.scorer = "F1"
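+            # Note: demo mode overrides any user-supplied data URL, target column,
+            # task, and scorer with the public churn demo settings above.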
+ args.task = "classification" + args.scorer = "F1" + + # Create and split dataset + dataset = create_dataset( + dai, + args.data_url, + args.dataset_name, + args.data_source, + args.force + ) + + train_test_split = split_dataset( + dataset, + args.train_size, + f"{args.dataset_name}_train", + f"{args.dataset_name}_test", + args.target_column, + args.seed + ) + + # Create and run experiment + experiment = create_experiment( + dai, + train_test_split, + args.target_column, + args.scorer, + args.task, + args.experiment_name, + args.accuracy, + args.time, + args.interpretability, + args.force, + args.fast, + ) + + # Print details and visualize results + print_experiment_details(experiment) + visualize_importance(experiment) + + # Individual plots + metric_plots = experiment.metric_plots + if args.task == 'classification': + plot_roc_curve(metric_plots.roc_curve, save_dir) + plot_precision_recall(metric_plots.prec_recall_curve, save_dir) + plot_gains_chart(metric_plots.gains_chart, save_dir) + plot_lift_chart(metric_plots.lift_chart, save_dir) + plot_ks_chart(metric_plots.ks_chart, save_dir) + + # All plots in one figure + plot_all_charts(metric_plots.roc_curve, metric_plots.prec_recall_curve, metric_plots.gains_chart, + metric_plots.lift_chart, metric_plots.ks_chart, save_dir) + else: + # FIXME: Add regression metrics plots + print("Regression task detected. No classification metrics to plot.") + + get_artifacts(experiment=experiment, save_dir=save_dir) + + +if __name__ == "__main__": + main() diff --git a/reqs_optional/requirements_optional_agents.txt b/reqs_optional/requirements_optional_agents.txt index 92cd9b694..8b6203e05 100644 --- a/reqs_optional/requirements_optional_agents.txt +++ b/reqs_optional/requirements_optional_agents.txt @@ -94,3 +94,7 @@ microsoft-bing-newssearch #microsoft-bing-autosuggest microsoft-bing-customimagesearch microsoft-bing-customwebsearch + +# DAI: +h2o_engine_manager +h2o_authn \ No newline at end of file diff --git a/src/version.py b/src/version.py index f365e4a44..b34ee573b 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "1715e071b421e762eff99c23767095b9c6555e66" +__version__ = "263ec150c635d116ac1549842eebe7882fbf3a0b"