-
Notifications
You must be signed in to change notification settings - Fork 353
/
Assessing Data Quality with Dataplex
108 lines (87 loc) · 3.51 KB
/
Assessing Data Quality with Dataplex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#---CHANGE REGION---
export REGION=
gcloud auth list
gcloud services enable dataproc.googleapis.com dataplex.googleapis.com datacatalog.googleapis.com
sleep 10
gcloud dataplex lakes create ecommerce-lake --location=$REGION --display-name="Ecommerce Lake"
gcloud projects add-iam-policy-binding $DEVSHELL_PROJECT_ID --member=user:$USER_EMAIL --role=roles/dataplex.admin
sleep 20
gcloud dataplex zones create customer-contact-raw-zone --location=$REGION --display-name="Customer Contact Raw Zone" --lake=ecommerce-lake --type=RAW --resource-location-type=SINGLE_REGION
gcloud dataplex assets create contact-info --location=$REGION --display-name="Contact Info" --lake=ecommerce-lake --zone=customer-contact-raw-zone --resource-type=BIGQUERY_DATASET --resource-name=projects/$DEVSHELL_PROJECT_ID/datasets/customers --discovery-enabled
bq query --use_legacy_sql=false "
SELECT * FROM \`$DEVSHELL_PROJECT_ID.customers.contact_info\`
ORDER BY id
LIMIT 50
"
cat > dq-customer-raw-data.yaml <<EOF_CP
metadata_registry_defaults:
dataplex:
projects: $DEVSHELL_PROJECT_ID
locations: $REGION
lakes: ecommerce-lake
zones: customer-contact-raw-zone
row_filters:
NONE:
filter_sql_expr: |-
True
INTERNATIONAL_ITEMS:
filter_sql_expr: |-
REGEXP_CONTAINS(item_id, 'INTNL')
rule_dimensions:
- consistency
- correctness
- duplication
- completeness
- conformance
- integrity
- timeliness
- accuracy
rules:
NOT_NULL:
rule_type: NOT_NULL
dimension: completeness
VALID_EMAIL:
rule_type: REGEX
dimension: conformance
params:
pattern: |-
^[^@]+[@]{1}[^@]+$
rule_bindings:
VALID_CUSTOMER:
entity_uri: bigquery://projects/$DEVSHELL_PROJECT_ID/datasets/customers/tables/contact_info
column_id: id
row_filter_id: NONE
rule_ids:
- NOT_NULL
VALID_EMAIL_ID:
entity_uri: bigquery://projects/$DEVSHELL_PROJECT_ID/datasets/customers/tables/contact_info
column_id: email
row_filter_id: NONE
rule_ids:
- VALID_EMAIL
EOF_CP
gsutil cp dq-customer-raw-data.yaml gs://$DEVSHELL_PROJECT_ID-bucket
bq query --use_legacy_sql=false "
SELECT * FROM \`$DEVSHELL_PROJECT_ID.customers.contact_info\`
ORDER BY id
LIMIT 50
"
echo "----------------------"
echo " "
echo "BIGQUERY LINK : https://console.cloud.google.com/bigquery?project=$DEVSHELL_PROJECT_ID&ws=!1m0"
echo "TASK LINK : https://console.cloud.google.com/dataplex/process/create-task/data-quality?project=$DEVSHELL_PROJECT_ID"
echo "----------------------"
echo " "
#--CLICK BIGQUERY LINK > RUN BELOW QUERY AFTER CHANGING <project_id>--
SELECT * FROM `<project_id>.customers.contact_info` ORDER BY id LIMIT 50
#--CLICK TASK LINK > DATAPLEX LAKE : ecommerce-lake > DISPLAY NAME : Customer Data Quality Job
#--SELECT GCS FILE : <PROJECT_ID>-bucket/dq-customer-raw-data.yaml
#--SELECT BIGQUERY DATASET : BROWSE > customers_dq_dataset > SELECT
#--BIGQUERY TABLE : dq_results
#--USER SERVICE ACCOUNT : Compute Engine default service account
#--CONTINUE > START IMMEDIATELY > CREATE
#--CLICK Customer Data Quality Job > [RANDOM NUMBER JOB ID] > LET STATUS BE Succeeded
#--BACK TO BIGQUERY LINK > EXPAND PROJECT ID > customers_dq_dataset > DOUBLE CLICK dq_summary > PREVIEW > SCROLL RIGHT TO THE END > IN LAST ROW FIRST COLUMN > MAKE THE ARROW TOWARD UPSIDE
#--COPY WHOLE QUERY FROM THAT CELL > PASTE IN BIGQUERY NEW EDITOR > RUN
#--AGAIN IN LAST ROW SECOND COLUMN > MAKE THE ARROW TOWARD UPSIDE
#--COPY WHOLE QUERY FROM THAT CELL > PASTE IN BIGQUERY NEW EDITOR > RUN