From d3453ae469056a7a3134085a15dc9209c0007c2a Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Tue, 2 Apr 2024 13:07:26 -0700
Subject: [PATCH 1/2] Create folder_stats.sql

This PR includes a sql script that can print folder structure for Synapse folders and get the total data size in GiB for each folder.
---
Review notes (this section is not part of the commit message):
* Layout query: the original selected non-aggregated columns while grouping
  only by FOLDER_ID, which is not a valid GROUP BY; COUNTS is now a windowed
  count and the rows are ordered instead of grouped.
* The blob ids on the "index" lines were not regenerated after these edits.

 analytics/folder_stats.sql | 141 +++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)
 create mode 100644 analytics/folder_stats.sql

diff --git a/analytics/folder_stats.sql b/analytics/folder_stats.sql
new file mode 100644
index 00000000..0f8ff5d7
--- /dev/null
+++ b/analytics/folder_stats.sql
@@ -0,0 +1,141 @@
+/*
+This script contains two parts: 1. lay out folder structure of the targeted FOLDER_IDs;
+2. calculate total data size in GiB for each FOLDER_ID.
+FOLDER_IDs can be a single Synapse folder ID or a list of Synapse folder IDs separated by commas
+*/
+
+USE ROLE DATA_ANALYTICS;
+USE DATABASE synapse_data_warehouse;
+USE WAREHOUSE COMPUTE_XSMALL;
+
+-- The list of folders to be checked
+SET
+    FOLDER_IDs = '';
+
+-- Lay Out Data Structure
+WITH RECURSIVE nodesnapshots
+    -- Column list of the "view"
+    (
+        ID,
+        PARENT_ID,
+        NAME,
+        NODE_TYPE,
+        FILE_HANDLE_ID,
+        FOLDER_ID
+    )
+    AS
+    -- Common Table Expression
+    (
+        -- Anchor Clause
+        SELECT
+            ID,
+            PARENT_ID,
+            NAME,
+            NODE_TYPE,
+            FILE_HANDLE_ID,
+            ID AS FOLDER_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest
+        WHERE
+            ID IN (
+                SELECT
+                    REPLACE(VALUE, 'syn', '')
+                FROM
+                    TABLE(SPLIT_TO_TABLE($FOLDER_IDs, ','))
+            )
+
+        UNION ALL
+
+        -- Recursive Clause
+        SELECT
+            node.ID,
+            node.PARENT_ID,
+            node.NAME,
+            node.NODE_TYPE,
+            node.FILE_HANDLE_ID,
+            nodesnapshots.FOLDER_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest AS node
+        JOIN
+            nodesnapshots
+        ON
+            node.PARENT_ID = nodesnapshots.ID
+    )
+-- Main select: one row per node that has a matching file handle, tagged with the requested folder it sits under.
+SELECT
+    'syn' || nodesnapshots.FOLDER_ID AS FOLDER_ID,
+    'syn' || nodesnapshots.PARENT_ID AS PARENT_ID,
+    'syn' || nodesnapshots.ID AS ID,
+    COUNT(nodesnapshots.ID) OVER (PARTITION BY nodesnapshots.FOLDER_ID) AS COUNTS, -- rows per requested folder, repeated on each row
+    nodesnapshots.NAME,
+    nodesnapshots.NODE_TYPE,
+    nodesnapshots.FILE_HANDLE_ID,
+    filesnapshots.CONTENT_SIZE
+FROM
+    nodesnapshots
+JOIN
+    synapse_data_warehouse.synapse.file_latest AS filesnapshots
+ON nodesnapshots.file_handle_id = filesnapshots.ID
+ORDER BY nodesnapshots.FOLDER_ID, nodesnapshots.ID;
+
+-- Calculate Data Size
+WITH RECURSIVE nodesnapshots
+    -- Column list of the "view"
+    (
+        ID,
+        PARENT_ID,
+        NAME,
+        NODE_TYPE,
+        FILE_HANDLE_ID,
+        FOLDER_ID
+    )
+    AS
+    -- Common Table Expression
+    (
+        -- Anchor Clause
+        SELECT
+            ID,
+            PARENT_ID,
+            NAME,
+            NODE_TYPE,
+            FILE_HANDLE_ID,
+            ID AS FOLDER_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest
+        WHERE
+            ID IN (
+                SELECT
+                    REPLACE(VALUE, 'syn', '')
+                FROM
+                    TABLE(SPLIT_TO_TABLE($FOLDER_IDs, ','))
+            )
+
+        UNION ALL
+
+        -- Recursive Clause
+        SELECT
+            node.ID,
+            node.PARENT_ID,
+            node.NAME,
+            node.NODE_TYPE,
+            node.FILE_HANDLE_ID,
+            nodesnapshots.FOLDER_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest AS node
+        JOIN
+            nodesnapshots
+        ON
+            node.PARENT_ID = nodesnapshots.ID
+    )
+-- This is the "main select".
+SELECT
+    'syn' || nodesnapshots.FOLDER_ID AS FOLDER_ID,
+    sum(filesnapshots.CONTENT_SIZE)/ power(2, 30) AS CONTENT_SIZE_in_GiB
+FROM
+    nodesnapshots
+JOIN
+    synapse_data_warehouse.synapse.file_latest AS filesnapshots
+ON
+    nodesnapshots.file_handle_id = filesnapshots.ID
+GROUP BY
+    nodesnapshots.FOLDER_ID;
From e7fa94c5e9062fd649d8e28745a3f7da31356479 Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Wed, 3 Apr 2024 13:38:25 -0700
Subject: [PATCH 2/2] Update and rename folder_stats.sql to calculate_datasize.sql

Only include datasize calculation part and update variable name to indicate the script works for both folder and project
---
Review notes (this section is not part of the commit message):
* ID parsing now trims whitespace, accepts 'SYN'/'syn', and turns empty or
  malformed tokens into NULL (TRY_TO_NUMBER) so they match nothing.
  Assumes node_latest.ID is numeric - confirm against the table definition.
* The size query uses LEFT JOIN + COALESCE so a requested entity with no file
  handles reports 0 GiB instead of being dropped from the result.
* The blob ids on the "index" lines were not regenerated after these edits.

 analytics/calculate_datasize.sql |  74 ++++++++++++++++
 analytics/folder_stats.sql       | 141 -------------------------------
 2 files changed, 74 insertions(+), 141 deletions(-)
 create mode 100644 analytics/calculate_datasize.sql
 delete mode 100644 analytics/folder_stats.sql

diff --git a/analytics/calculate_datasize.sql b/analytics/calculate_datasize.sql
new file mode 100644
index 00000000..19e3204e
--- /dev/null
+++ b/analytics/calculate_datasize.sql
@@ -0,0 +1,74 @@
+/*
+This is a script to calculate total data size in GiB for each ENTITY_ID (folder or project).
+ENTITY_IDs can be a single Synapse folder/project ID or a list of Synapse folder/project IDs separated by commas, e.g. 'syn123, syn456'.
+*/
+
+USE ROLE DATA_ANALYTICS;
+USE DATABASE synapse_data_warehouse;
+USE WAREHOUSE COMPUTE_XSMALL;
+
+-- The list of entities (folders and/or projects) to be checked
+SET
+    ENTITY_IDs = '';
+
+-- Calculate Data Size
+WITH RECURSIVE nodesnapshots
+    -- Column list of the CTE; ENTITY_ID is the requested root whose subtree a row belongs to
+    (
+        ID,
+        PARENT_ID,
+        NAME,
+        NODE_TYPE,
+        FILE_HANDLE_ID,
+        ENTITY_ID
+    )
+    AS
+    -- Walk node_latest from each requested entity down through all of its descendants
+    (
+        -- Anchor Clause: the requested entities themselves
+        SELECT
+            ID,
+            PARENT_ID,
+            NAME,
+            NODE_TYPE,
+            FILE_HANDLE_ID,
+            ID AS ENTITY_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest
+        WHERE
+            ID IN (
+                SELECT -- tolerate spaces and 'SYN'/'syn'; empty or malformed tokens become NULL and match nothing
+                    TRY_TO_NUMBER(REPLACE(LOWER(TRIM(VALUE)), 'syn', ''))
+                FROM
+                    TABLE(SPLIT_TO_TABLE($ENTITY_IDs, ','))
+            )
+
+        UNION ALL
+
+        -- Recursive Clause: children of every node found so far inherit the root's ENTITY_ID
+        SELECT
+            node.ID,
+            node.PARENT_ID,
+            node.NAME,
+            node.NODE_TYPE,
+            node.FILE_HANDLE_ID,
+            nodesnapshots.ENTITY_ID
+        FROM
+            synapse_data_warehouse.synapse.node_latest AS node
+        INNER JOIN
+            nodesnapshots
+        ON
+            node.PARENT_ID = nodesnapshots.ID
+    )
+-- Main select: one row per requested entity; dividing by 2^30 reports GiB (assumes CONTENT_SIZE is in bytes)
+SELECT
+    'syn' || nodesnapshots.ENTITY_ID AS ENTITY_ID,
+    COALESCE(SUM(filesnapshots.CONTENT_SIZE), 0) / POWER(2, 30) AS CONTENT_SIZE_in_GiB
+FROM
+    nodesnapshots
+LEFT JOIN -- keep entities with no matching file handles so they report 0 instead of disappearing
+    synapse_data_warehouse.synapse.file_latest AS filesnapshots
+ON
+    nodesnapshots.file_handle_id = filesnapshots.ID
+GROUP BY nodesnapshots.ENTITY_ID
+ORDER BY nodesnapshots.ENTITY_ID;
diff --git a/analytics/folder_stats.sql b/analytics/folder_stats.sql
deleted file mode 100644
index 0f8ff5d7..00000000
--- a/analytics/folder_stats.sql
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-This script contains two parts: 1. lay out folder structure of the targeted FOLDER_IDs;
-2. calculate total data size in GiB for each FOLDER_ID.
-FOLDER_IDs can be a single Synapse folder ID or a list of Synapse folder IDs separated by commas
-*/
-
-USE ROLE DATA_ANALYTICS;
-USE DATABASE synapse_data_warehouse;
-USE WAREHOUSE COMPUTE_XSMALL;
-
--- The list of folders to be checked
-SET
-    FOLDER_IDs = '';
-
--- Lay Out Data Structure
-WITH RECURSIVE nodesnapshots
-    -- Column list of the "view"
-    (
-        ID,
-        PARENT_ID,
-        NAME,
-        NODE_TYPE,
-        FILE_HANDLE_ID,
-        FOLDER_ID
-    )
-    AS
-    -- Common Table Expression
-    (
-        -- Anchor Clause
-        SELECT
-            ID,
-            PARENT_ID,
-            NAME,
-            NODE_TYPE,
-            FILE_HANDLE_ID,
-            ID AS FOLDER_ID
-        FROM
-            synapse_data_warehouse.synapse.node_latest
-        WHERE
-            ID IN (
-                SELECT
-                    REPLACE(VALUE, 'syn', '')
-                FROM
-                    TABLE(SPLIT_TO_TABLE($FOLDER_IDs, ','))
-            )
-
-        UNION ALL
-
-        -- Recursive Clause
-        SELECT
-            node.ID,
-            node.PARENT_ID,
-            node.NAME,
-            node.NODE_TYPE,
-            node.FILE_HANDLE_ID,
-            nodesnapshots.FOLDER_ID
-        FROM
-            synapse_data_warehouse.synapse.node_latest AS node
-        JOIN
-            nodesnapshots
-        ON
-            node.PARENT_ID = nodesnapshots.ID
-    )
--- Main select: one row per node that has a matching file handle, tagged with the requested folder it sits under.
-SELECT
-    'syn' || nodesnapshots.FOLDER_ID AS FOLDER_ID,
-    'syn' || nodesnapshots.PARENT_ID AS PARENT_ID,
-    'syn' || nodesnapshots.ID AS ID,
-    COUNT(nodesnapshots.ID) OVER (PARTITION BY nodesnapshots.FOLDER_ID) AS COUNTS, -- rows per requested folder, repeated on each row
-    nodesnapshots.NAME,
-    nodesnapshots.NODE_TYPE,
-    nodesnapshots.FILE_HANDLE_ID,
-    filesnapshots.CONTENT_SIZE
-FROM
-    nodesnapshots
-JOIN
-    synapse_data_warehouse.synapse.file_latest AS filesnapshots
-ON nodesnapshots.file_handle_id = filesnapshots.ID
-ORDER BY nodesnapshots.FOLDER_ID, nodesnapshots.ID;
-
--- Calculate Data Size
-WITH RECURSIVE nodesnapshots
-    -- Column list of the "view"
-    (
-        ID,
-        PARENT_ID,
-        NAME,
-        NODE_TYPE,
-        FILE_HANDLE_ID,
-        FOLDER_ID
-    )
-    AS
-    -- Common Table Expression
-    (
-        -- Anchor Clause
-        SELECT
-            ID,
-            PARENT_ID,
-            NAME,
-            NODE_TYPE,
-            FILE_HANDLE_ID,
-            ID AS FOLDER_ID
-        FROM
-            synapse_data_warehouse.synapse.node_latest
-        WHERE
-            ID IN (
-                SELECT
-                    REPLACE(VALUE, 'syn', '')
-                FROM
-                    TABLE(SPLIT_TO_TABLE($FOLDER_IDs, ','))
-            )
-
-        UNION ALL
-
-        -- Recursive Clause
-        SELECT
-            node.ID,
-            node.PARENT_ID,
-            node.NAME,
-            node.NODE_TYPE,
-            node.FILE_HANDLE_ID,
-            nodesnapshots.FOLDER_ID
-        FROM
-            synapse_data_warehouse.synapse.node_latest AS node
-        JOIN
-            nodesnapshots
-        ON
-            node.PARENT_ID = nodesnapshots.ID
-    )
--- This is the "main select".
-SELECT
-    'syn' || nodesnapshots.FOLDER_ID AS FOLDER_ID,
-    sum(filesnapshots.CONTENT_SIZE)/ power(2, 30) AS CONTENT_SIZE_in_GiB
-FROM
-    nodesnapshots
-JOIN
-    synapse_data_warehouse.synapse.file_latest AS filesnapshots
-ON
-    nodesnapshots.file_handle_id = filesnapshots.ID
-GROUP BY
-    nodesnapshots.FOLDER_ID;