From d922ead94f9ea85046688f1cd0c121c57576a629 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 8 Apr 2024 19:31:59 -0500 Subject: [PATCH] Take the default duplication down to 25 At some point the duplication was bumped to 100. This takes forever to complete with 4x4GB executors, which isn't a useful experience for users. With duplication at 25, the Spark job takes about 5 minutes to complete, which is plenty of time for the user to poke around. The test input files are also now uploaded under INPUT_DATA_S3_PATH (s3://${S3_BUCKET}/taxi-trip/input) instead of s3://${S3_BUCKET}/input. --- analytics/scripts/taxi-trip-execute.sh | 10 +++++----- .../docs/blueprints/data-analytics/_taxi_trip_exec.md | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/analytics/scripts/taxi-trip-execute.sh b/analytics/scripts/taxi-trip-execute.sh index 37a964ba5..fd609b12d 100755 --- a/analytics/scripts/taxi-trip-execute.sh +++ b/analytics/scripts/taxi-trip-execute.sh @@ -19,7 +19,7 @@ fi S3_BUCKET="$1" REGION="$2" -INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/" +INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input" # Create a local input folder mkdir input @@ -29,15 +29,15 @@ aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${R # Copy Test Input data to S3 bucket wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet" -aws s3 cp "input/yellow_tripdata_2022-0.parquet" s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet +aws s3 cp "input/yellow_tripdata_2022-0.parquet" ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet pids=() # Making duplicate copies to increase the size of the data. -max=100 +max=25 for (( i=1; i <= $max; ++i )) do - aws s3 cp s3://${S3_BUCKET}/input/yellow_tripdata_2022-0.parquet s3://${S3_BUCKET}/input/yellow_tripdata_2022-${i}.parquet & + aws s3 cp ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-${i}.parquet & pids+=($!) 
done @@ -46,4 +46,4 @@ for pid in "${pids[@]}"; do done # Delete a local input folder -rm -rf input \ No newline at end of file +rm -rf input diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md index eef45910b..5352f9a52 100644 --- a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md +++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md @@ -6,4 +6,4 @@ require a relatively fast internet connection. ```bash ${DOEKS_HOME}/analytics/scripts/taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` \ No newline at end of file +```