Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sklearn notebooks update #86

Merged
merged 2 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,19 @@
"outputs": [],
tkilias marked this conversation as resolved.
Show resolved Hide resolved
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"\n",
"# Split the data into train and test sets\n",
"df_train, df_test = train_test_split(df, test_size=0.2)\n",
"\n",
"train_table = 'ABALONE_TRAIN'\n",
"test_table = 'ABALONE_TEST'\n",
"column_desc = [' '.join(c) for c in column_def]\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Create an Exasol connection\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"\n",
"# Split the data into train and test sets\n",
"df_train, df_test = train_test_split(df, test_size=0.2)\n",
Expand All @@ -122,8 +122,7 @@
"stopwatch = Stopwatch()\n",
"\n",
"# Create an Exasol connection\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
Expand All @@ -143,15 +142,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3490c957-366e-425f-91ae-a645ccabbfe0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e53cbb99-8387-4165-a930-e4f24abfeaee",
"id": "6645e76c-6a6e-48f3-a668-c1fd8717d7f2",
"metadata": {},
"outputs": [],
"source": []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,16 @@
"metadata": {},
ahsimb marked this conversation as resolved.
Show resolved Hide resolved
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
"from stopwatch import Stopwatch\n",
"\n",
"target_column = 'RINGS'\n",
"bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/abalone_svm_model.pkl'\n",
"bfs_model_path = get_udf_bucket_path(sb_config) + '/abalone_svm_model.pkl'\n",
"params = {'schema': sb_config.SCHEMA, 'test_table': 'ABALONE_TEST', 'model_path': bfs_model_path}\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" # Get the list of feature columns\n",
" sql = 'SELECT * FROM {schema!q}.{test_table!q} LIMIT 1'\n",
" df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
Expand Down Expand Up @@ -98,7 +97,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Get the ground truth labels for the test set.\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
" df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,16 @@
"metadata": {},
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection, get_udf_bucket_path\n",
"from stopwatch import Stopwatch\n",
"\n",
"target_column = 'CLASS'\n",
"bfs_model_path = f'/buckets/{sb_config.BUCKETFS_SERVICE}/{sb_config.BUCKETFS_BUCKET}/telescope_tree_model.pkl'\n",
"bfs_model_path = get_udf_bucket_path(sb_config) + '/telescope_tree_model.pkl'\n",
"params = {'schema': sb_config.SCHEMA, 'test_table': 'TELESCOPE_TEST', 'model_path': bfs_model_path}\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" # Get the list of feature columns\n",
" sql = 'SELECT * FROM {schema!i}.{test_table!i} LIMIT 1'\n",
" df_tmp = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
Expand Down Expand Up @@ -98,7 +97,7 @@
"import matplotlib.pyplot as plt\n",
"\n",
"# Get the ground truth labels for the test set.\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" sql = f'SELECT ROWID AS [sample_id], [{target_column}] FROM {{schema!q}}.{{test_table!q}}'\n",
" df_true = conn.export_to_pandas(query_or_table=sql, query_params=params)\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"outputs": [],
"source": [
"import textwrap\n",
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
Expand Down Expand Up @@ -104,8 +104,7 @@
"/\n",
"\"\"\")\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" conn.execute(query=sql, query_params={'schema': sb_config.SCHEMA})\n",
"\n",
"print(f\"Creating prediction script took: {stopwatch}\")"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,13 @@
"metadata": {},
tkilias marked this conversation as resolved.
Show resolved Hide resolved
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from sklearn.model_selection import train_test_split\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'ABALONE_TRAIN'))\n",
"\n",
"X, y = df.drop(columns='RINGS'), df['RINGS']\n",
Expand Down Expand Up @@ -199,20 +198,14 @@
"outputs": [],
"source": [
"import pickle\n",
"from exasol.bucketfs import Service\n",
"from exasol.connections import open_bucketfs_connection\n",
"\n",
"MODEL_FILE = 'abalone_svm_model.pkl'\n",
"\n",
"# Setup the connection parameters.\n",
"buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
"buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
"buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Connect to the BucketFS service and navigate to the bucket of choice.\n",
"bucketfs = Service(buckfs_url, buckfs_credentials)\n",
"bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
"# Connect to the BucketFS service\n",
"bucket = open_bucketfs_connection(sb_config)\n",
"\n",
"# Serialize the model into a byte-array and upload it to the BucketFS, \n",
"# where it will be saved in the file with the specified name.\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,12 @@
"metadata": {},
ahsimb marked this conversation as resolved.
Show resolved Hide resolved
"outputs": [],
"source": [
"import pyexasol\n",
"from exasol.connections import open_pyexasol_connection\n",
"from stopwatch import Stopwatch\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"with open_pyexasol_connection(sb_config, compression=True) as conn:\n",
" df = conn.export_to_pandas(query_or_table=(sb_config.SCHEMA, 'TELESCOPE_TRAIN'))\n",
"\n",
"print(f\"Loading the data took: {stopwatch}\")"
Expand Down Expand Up @@ -145,20 +144,14 @@
"outputs": [],
"source": [
"import pickle\n",
"from exasol.bucketfs import Service\n",
"from exasol.connections import open_bucketfs_connection\n",
"\n",
"MODEL_FILE = 'telescope_tree_model.pkl'\n",
"\n",
"# Setup the connection parameters.\n",
"buckfs_url_prefix = 'https' if sb_config.BUCKETFS_USE_HTTPS == 'True' else 'http'\n",
"buckfs_url = f'{buckfs_url_prefix}://{sb_config.EXTERNAL_HOST_NAME}:{sb_config.BUCKETFS_PORT}'\n",
"buckfs_credentials = {sb_config.BUCKETFS_BUCKET: {'username': sb_config.BUCKETFS_USER, 'password': sb_config.BUCKETFS_PASSWORD}}\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Connect to the BucketFS service and navigate to the bucket of choice.\n",
"bucketfs = Service(buckfs_url, buckfs_credentials)\n",
"bucket = bucketfs[sb_config.BUCKETFS_BUCKET]\n",
"# Connect to the BucketFS service\n",
"bucket = open_bucketfs_connection(sb_config)\n",
"\n",
"# Serialize the model into a byte-array and upload it to the BucketFS, \n",
"# where it will be saved in the file with the specified name.\n",
Expand All @@ -174,24 +167,6 @@
"source": [
"Now we are ready to use this model in our SQL queries. This will be demonstrated in the [following notebook](sklearn_predict_telescope.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0301211d-8520-4f66-8727-114f3292bcd6",
"metadata": {},
"outputs": [],
"source": [
"buckfs_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec99ed24-d6ce-46bf-97f3-9d0b1c38aade",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

apt_dependencies:
- curl=7.68.0-1ubuntu2.20
- python3.8-venv=3.8.10-0ubuntu1~20.04.8
- python3.8-venv=3.8.10-0ubuntu1~20.04.9
- python3-pip=20.0.2-5ubuntu1.10
Loading