Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Demo #44

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open

Demo #44

Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions notebooks/06_load_excel_files/06_load_excel_files.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
"kernelspec": {
"display_name": "Streamlit Notebook",
"name": "streamlit"
},
"lastEditStatus": {
"notebookId": "tls57qvv4qvhdi7o53pf",
"authorId": "3239588041470",
"authorName": "ELANPANDIAN",
"authorEmail": "[email protected]",
"sessionId": "f23c2977-0593-4a49-9c6c-591f13ca435f",
"lastEditTime": 1736881049987
}
},
"nbformat_minor": 5,
Expand All @@ -13,7 +21,8 @@
"id": "c1970118-7b46-4dcf-acd2-cd8836d14408",
"metadata": {
"name": "md_overview",
"collapsed": false
"collapsed": false,
"resultHeight": 283
},
"source": "# 06 Load Excel Files\n\n* Author: Jeremiah Hansen\n* Last Updated: 10/25/2024\n\nThis notebook will load data into the `LOCATION` and `ORDER_DETAIL` tables from Excel files.\n\nThis currently does not use Snowpark File Access as it doesn't yet work in Notebooks. So for now we copy the file locally first."
},
Expand All @@ -22,7 +31,9 @@
"id": "8873bc96-287b-4f47-a929-013c1487a088",
"metadata": {
"language": "sql",
"name": "sql_get_context"
"name": "sql_get_context",
"collapsed": false,
"resultHeight": 111
},
"outputs": [],
"source": "-- This won't be needed when we can pass variables to Notebooks!\nSELECT current_database() AS DATABASE_NAME, current_schema() AS SCHEMA_NAME",
Expand All @@ -34,7 +45,8 @@
"metadata": {
"language": "python",
"name": "py_imports",
"collapsed": false
"collapsed": false,
"resultHeight": 0
},
"source": "# Import python packages\nimport logging\nimport pandas as pd\n\nlogger = logging.getLogger(\"demo_logger\")\n\n# Get the target database and schema using the results from the SQL cell above\n# This won't be needed when we can pass variables to Notebooks!\ncurrent_context_df = cells.sql_get_context.to_pandas()\ndatabase_name = current_context_df.iloc[0,0]\nschema_name = current_context_df.iloc[0,1]\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n#session.use_schema(f\"{database_name}.{schema_name}\")\n\nlogger.info(\"06_load_excel_files start\")",
"execution_count": null,
Expand All @@ -46,7 +58,8 @@
"metadata": {
"language": "sql",
"name": "sql_get_spreadsheets",
"collapsed": false
"collapsed": false,
"resultHeight": 146
},
"outputs": [],
"source": "-- Temporary solution to load in the metadata, this should be replaced with a directy query to a directory table (or a metadata table)\nSELECT '@INTEGRATIONS.FROSTBYTE_RAW_STAGE/intro/order_detail.xlsx' AS STAGE_FILE_PATH, 'order_detail' AS WORKSHEET_NAME, 'ORDER_DETAIL' AS TARGET_TABLE\nUNION\nSELECT '@INTEGRATIONS.FROSTBYTE_RAW_STAGE/intro/location.xlsx', 'location', 'LOCATION';",
Expand All @@ -57,7 +70,8 @@
"id": "07fd7441-1c12-4195-a7cd-f04fcc3e4242",
"metadata": {
"name": "md_function",
"collapsed": false
"collapsed": false,
"resultHeight": 250
},
"source": "## Create a function to load Excel worksheet to table\n\nCreate a reusable function to load an Excel worksheet to a table in Snowflake.\n\nNote: Until we can use the `SnowflakeFile` class in Notebooks, we need to temporarily copy the file to a local temp folder and then process from there."
},
Expand All @@ -67,7 +81,8 @@
"metadata": {
"language": "python",
"name": "py_load_excel_function",
"collapsed": false
"collapsed": false,
"resultHeight": 0
},
"outputs": [],
"source": "import os\nfrom openpyxl import load_workbook\n\ndef load_excel_worksheet_to_table_local(session, stage_file_path, worksheet_name, target_table):\n local_directory = \"./\"\n file_name = os.path.basename(stage_file_path)\n\n # First copy file from stage to local storage\n get_status = session.file.get(stage_file_path, local_directory)\n\n with open(f\"{local_directory}{file_name}\", 'rb') as f:\n workbook = load_workbook(f)\n sheet = workbook[worksheet_name]\n data = sheet.values\n\n # Get the first line in file as a header line\n columns = next(data)[0:]\n # Create a DataFrame based on the second and subsequent lines of data\n df = pd.DataFrame(data, columns=columns)\n \n df2 = session.create_dataframe(df)\n df2.write.mode(\"overwrite\").save_as_table(target_table)\n \n return True",
Expand All @@ -78,7 +93,8 @@
"id": "97c2fc79-50d4-4a81-af5d-5c80d37070ec",
"metadata": {
"name": "md_process_spreadsheets",
"collapsed": false
"collapsed": false,
"resultHeight": 129
},
"source": "## Process all Excel worksheets\n\nLoop through each Excel worksheet to process and call our `load_excel_worksheet_to_table_local()` function."
},
Expand All @@ -87,7 +103,9 @@
"id": "4e73f895-6b24-4ce9-b357-7a9a879be1e4",
"metadata": {
"language": "python",
"name": "py_process_spreadsheets"
"name": "py_process_spreadsheets",
"collapsed": false,
"resultHeight": 0
},
"outputs": [],
"source": "# Process each file from the sql_get_spreadsheets cell above\nfiles_to_load = cells.sql_get_spreadsheets.to_pandas()\nfor index, excel_file in files_to_load.iterrows():\n logger.info(f\"Processing Excel file {excel_file['STAGE_FILE_PATH']}\")\n load_excel_worksheet_to_table_local(session, excel_file['STAGE_FILE_PATH'], excel_file['WORKSHEET_NAME'], excel_file['TARGET_TABLE'])\n\nlogger.info(\"06_load_excel_files end\")",
Expand All @@ -98,7 +116,8 @@
"id": "16d6be04-3690-4c5d-91ee-5d0d425355b8",
"metadata": {
"name": "md_debugging",
"collapsed": false
"collapsed": false,
"resultHeight": 46
},
"source": "### Debugging"
},
Expand All @@ -110,7 +129,7 @@
"name": "sql_debugging"
},
"outputs": [],
"source": "--DESCRIBE TABLE LOCATION;\n--SELECT * FROM LOCATION;\n--SHOW TABLES;",
"source": "--DESCRIBE TABLE LOCATION;\n--SELECT * FROM LOCATION;\nSHOW TABLES;",
"execution_count": null
}
]
Expand Down