diff --git a/efficient_full_table_scan_example_code/README.md b/efficient_full_table_scan_example_code/README.md index 4377a342..0eba2b82 100644 --- a/efficient_full_table_scan_example_code/README.md +++ b/efficient_full_table_scan_example_code/README.md @@ -4,14 +4,12 @@ In this example we demonstrate an efficient parallel full table scan, that utili to gain servers and cores parallelism optimization. Following setup was used in this example: -- Scylla cluster v1.6.1: 3 x RHEL7.2 nodes deployed on Google Compute Engine (GCE), each has 8 vCPUs and 30GB memory +- Scylla cluster v6.1: 3 x Ubuntu 24 nodes deployed on Google Compute Engine (GCE), each has 8 vCPUs and 30GB memory - Storage: each node has 2 x 375GB local SSDs (NVMe) disks -- Client: Centos7.2 node deployed on GCE (4 vCPU and 15GB memory) will run our golang code using gocql driver +- Client: Ubuntu 24 node deployed on GCE (4 vCPU and 15GB memory) will run our golang code using gocql driver - Client prerequisites: - - Install go 1.7: https://www.digitalocean.com/community/tutorials/how-to-install-go-1-7-on-centos-7 - - Install gocql driver: "go get github.com/gocql/gocql" - - Install kingpin pkg: "go get gopkg.in/alecthomas/kingpin.v2" - + - Install go 1.23 * (see the instructions below for a sample local install, or use your OS package manager to get a recent go) + - Install prerequisites using "go mod tidy" Instructions ============ @@ -31,6 +29,7 @@ Mandatory param: `` = your Scylla nodes IP addresses. The rest of the par Note: `-d` flag (default=false) prints all the rows into a file for debugging purpose. This loads the client CPU, hence use with caution. +Note that the query template uses %s to embed each token as a string (e.g. in case of uuid); if the token is of another type (e.g. decimal), you might need to use %d instead. 
--help usage output =================== @@ -46,11 +45,11 @@ Flags: -t, --timeout=15000 Maximum duration for query execution in millisecond -b, --cluster-number-of-connections=1 Number of connections per host per session (in our case, per thread) - -l, --cql-version="3.0.0" The CQL version to use + -l, --cql-version="3.3.1" The CQL version to use -p, --cluster-page-size=5000 Page size of results - -q, --query-template="SELECT token(key) FROM keyspace1.standard1 WHERE token(key) >= %d AND token(key) <= %d;" - The template of the query to run. Make sure to have 2 '%d' parameters in it to embed - the token ranges + -q, --query-template="SELECT token(key) FROM keyspace1.standard1 WHERE token(key) >= %s AND token(key) <= %s;" + The template of the query to run. Make sure to have 2 '%s' parameters in it to embed + the token ranges (or use '%d' in case of decimal token) --select-statements-output-file="/tmp/select_statements.txt" Location of select statements output file -d, --print-rows Print the output rows to a file @@ -74,9 +73,9 @@ Scylla cluster nodes : 10.240.0.29,10.240.0.30,10.240.0.35 Consistency : one Timeout (ms) : 15000 Connections per host : 1 -CQL Version : 3.0.0 +CQL Version : 3.3.1 Page size : 5000 -Query template : SELECT token(key) FROM keyspace1.standard1 WHERE token(key) >= %d AND token(key) <= %d; +Query template : SELECT token(key) FROM keyspace1.standard1 WHERE token(key) >= %s AND token(key) <= %s; Select Statements Output file : /tmp/select_statements.txt # of parallel threads : 72 # of ranges to be executed : 7200 @@ -89,3 +88,54 @@ Done! Total Execution Time: 15m32s ``` + + + +Example of how to run the script +============================= + +Warning: For a proper installation, refer to https://go.dev/doc/install +The steps below are just a local sample that installs everything into the home directory (~) of the current user. + +E.g. 
for go 1.23.1: + +``` +GOVERSION=go1.23.1 +curl -o ~/$GOVERSION.linux-amd64.tar.gz https://dl.google.com/go/$GOVERSION.linux-amd64.tar.gz + +tar xzv -C ~ -f $GOVERSION.linux-amd64.tar.gz + +ln -s $HOME/go $HOME/$GOVERSION + +cat <<EOF > ~/go_profile.sh +#!/bin/bash +export GOVERSION=$GOVERSION + +export GOROOT=\$HOME/\$GOVERSION + +export PATH=\$PATH:\$HOME/\$GOVERSION/bin +EOF + +chmod 755 ~/go_profile.sh +. ~/go_profile.sh + +mkdir ~/gocqlcount + +cd ~/gocqlcount + +curl -o efficient_full_table_scan.go https://raw.githubusercontent.com/scylladb/scylla-code-samples/master/efficient_full_table_scan_example_code/efficient_full_table_scan.go + +go mod init gocqlcount + +go mod tidy + +``` + +Example full scan count using quorum for a table with a composite primary key (PK) (in the template always use the composite PK; the clustering key (CK) must not be included), +for a cluster with 12 nodes where we only want to keep 3 CPUs busy per node; +the original key is like "PRIMARY KEY ((postid, bucket), sortcolumn)": +``` +. ~/go_profile.sh +cd ~/gocqlcount +go run `pwd`/efficient_full_table_scan.go --consistency localquorum --username read_user --password read_user_password 10.240.0.29 -q "SELECT token(postid, bucket) FROM post.bucket_by_user_and_post WHERE token(postid, bucket) >= %s AND token(postid, bucket) <= %s;" -n 12 -c 3 +``` diff --git a/efficient_full_table_scan_example_code/efficient_full_table_scan.go b/efficient_full_table_scan_example_code/efficient_full_table_scan.go index c2885571..f3ceda0c 100644 --- a/efficient_full_table_scan_example_code/efficient_full_table_scan.go +++ b/efficient_full_table_scan_example_code/efficient_full_table_scan.go @@ -11,7 +11,7 @@ import ( "time" "github.com/gocql/gocql" - "gopkg.in/alecthomas/kingpin.v2" + "github.com/alecthomas/kingpin/v2" ) const ( @@ -34,10 +34,10 @@ var ( clusterConsistency = kingpin.Flag("consistency", "Cluster consistency level. 
Use 'localone' for multi DC").Short('o').Default("one").String() clusterTimeout = kingpin.Flag("timeout", "Maximum duration for query execution in millisecond").Short('t').Default("15000").Int() clusterNumConnections = kingpin.Flag("cluster-number-of-connections", "Number of connections per host per session (in our case, per thread)").Short('b').Default("1").Int() - clusterCQLVersion = kingpin.Flag("cql-version", "The CQL version to use").Short('l').Default("3.0.0").String() + clusterCQLVersion = kingpin.Flag("cql-version", "The CQL version to use").Short('l').Default("3.3.1").String() clusterPageSize = kingpin.Flag("cluster-page-size", "Page size of results").Short('p').Default("5000").Int() - queryTemplate = kingpin.Flag("query-template", "The template of the query to run. Make sure to have 2 '%d' parameters in it to embed the token ranges").Short('q').Default(defaultQueryTemplate).String() + queryTemplate = kingpin.Flag("query-template", "The template of the query to run. Make sure to have 2 '%s' parameters in it to embed the token ranges (or use '%d' in case of decimal token)").Short('q').Default(defaultQueryTemplate).String() selectStatementsOutputFile = kingpin.Flag("select-statements-output-file", "Location of select statements output file").Default(defaultSelectStatementsOutputFile).String() printRows = kingpin.Flag("print-rows", "Print the output rows to a file").Short('d').Default("false").Bool()