#### DEFAULTS ##########################################################################################################
#### Boilerplate Makefile setup
MAKEFLAGS += --warn-undefined-variables
SHELL := bash
.SHELLFLAGS := -ec -o pipefail
.DEFAULT_GOAL := help
.DELETE_ON_ERROR:
.SUFFIXES:
#### INCLUDES ##########################################################################################################
#### Includes of other Makefiles, or files that declare environment variables (e.g. .env)
# include .env _should_ allow vars defined there to be used here
# However, be careful of vars referencing other vars, or values containing $ (e.g. passwords), which would need escaping
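# A minimal sketch (hypothetical var names) of the escaping concern: make expands $ in included files,
# so a literal $ in a .env value must be doubled:
#   DB_PASSWORD=pa$$word   # make reads the literal value "pa$word"
#   DB_PASSWORD=pa$word    # make would expand the undefined var "$w", mangling the value to "paord"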
env_file_exists := $(wildcard .env)
ifneq ($(strip ${env_file_exists}),)
include .env
endif
#### VARS ##############################################################################################################
#### Variables used in this Makefile.
#### Uppercased are environment vars, or make-specific vars. All others should be lower-snake-case
# default ENV_CODE to lcl if not set
ENV_CODE ?= lcl
# default version if not set in .env or an env var
PYTHON_VERSION ?= 3.8.16
venv_name := usaspending-api
docker_compose_file := docker-compose.yml
dockerfile_for_spark := Dockerfile.spark
# Root directories under which python (namespace) packages start, for all python code in this project
src_root_paths = "."
#### RULES #############################################################################################################
#### Rules defining file targets that need to be made, or PHONY targets, which don't actually produce a file
#### Reminder: The name of non-PHONY targets needs to be the name of a file on disk, or it will always be invoked
#### NOTE: Most rules here deal with project-setup
#### Rules orchestrating project workloads are in the included Makefile
.PHONY: help
help: ## print this help
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
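# For reference, help text is parsed from the trailing "## ..." comment on each rule; a (hypothetical) rule
# declared as "fmt: ## Format the code" would be listed by `make help` as "fmt" alongside "Format the code"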
.PHONY: printvars
printvars: ## Print the Environment variables present in calls to make, plus variables defined in the executed Makefile
@$(info ==== Makefile Variables ====)
@$(info )
@$(foreach V,$(sort $(.VARIABLES)), \
$(if $(filter-out environment% default automatic, \
$(origin $V)),$(info $V=$($V) ($(value $V)))))
@printf "\n==== Environment Variables ====\n\n"
@printenv
.python-version: ## Attempt to setup python using pyenv
@if ! command -v pyenv &> /dev/null; then \
echo "WARNING: pyenv could not be found. Install pyenv to get a virtual env running with the compatible python version: ${PYTHON_VERSION}. Will fallback to using system python3."; \
else \
set -x; \
echo "pyenv setting python version to ${PYTHON_VERSION}"; \
pyenv install -s ${PYTHON_VERSION}; \
pyenv local ${PYTHON_VERSION}; \
python3 -V; \
if [ "$$(python3 -V)" != "Python ${PYTHON_VERSION}" ]; then \
echo "ERROR: pyenv was not able to set local python version to ${PYTHON_VERSION}"; \
exit 1; \
fi; \
fi;
.venv: ## Ensure a virtual environment is established at .venv
@( \
set -x; \
test -d .venv || python3 -m venv .venv/${venv_name}; \
)
.PHONY: requirements-dev
upgrade := #unset it
requirements-dev: .venv ## Install pip packages in dev virtual env. Add upgrade=true to upgrade required packages to newest version (can be lengthy)
# Because this depends on .venv, the virtual env should exist to activate
# This will "activate" the virtual env only for the duration of the scripts in the parens-scope
# Then when this make rule recipe is complete, the virtual env will be dropped
# But it IS there and populated
# Must _manually_ reactivate the virtual env to interact with it on the command line
@( \
source .venv/${venv_name}/bin/activate; \
echo "virtual env at .venv/${venv_name} activated (temporarily)"; \
pip install $$(cat requirements/requirements-dev.txt | grep 'pip=='); \
src_roots=(${src_root_paths}); \
for src_root in "$${src_roots[@]}"; do \
pip install ${if ${upgrade},--upgrade,} --editable "$${src_root}[dev]"; \
done; \
)
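# Example usage, per the upgrade flag defined above:
#   make requirements-dev               # install dev dependencies
#   make requirements-dev upgrade=true  # also upgrade pinned packages to their newest versions (can be lengthy)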
.ivy2: ## Ensure user has a ~/.ivy2 dir, which will be bound to in a docker container volume to save on dependency downloads
@mkdir -p ~/.ivy2
.PHONY: activate
activate: ## Spit out the command to run to activate the virtual env, since you can't do it within a make shell process. Use this like: source $(make activate)
@echo ".venv/${venv_name}/bin/activate"
.PHONY: local-dev-setup
local-dev-setup: .python-version requirements-dev check-dependencies .ivy2 ## Setup python, virtual environment, and pip dependencies, then check version info
.PHONY: check-dependencies
check-dependencies: ## Prints out the versions of dependencies in use
@printf "\n==== [PYTHON VERSIONS] ====\n\n"
@echo "python -> $$(python -V) ... python3 -> $$(python3 -V)"
@printf "\n==== [PIP PACKAGE VERSIONS] ====\n\n"
@source .venv/${venv_name}/bin/activate && pip list
@printf "\n==== [SPARK VERSION] ====\n\n"
@source .venv/${venv_name}/bin/activate && pyspark --version
@printf "\n==== [HADOOP VERSION] ====\n\n"
@source .venv/${venv_name}/bin/activate && python3 -c "from pyspark.sql import SparkSession; \
spark = SparkSession.builder.getOrCreate(); \
print('Hadoop ' + spark.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion());"
.PHONY: env-code
env-code: ## Print the value of ENV_CODE environment variable
@echo ${ENV_CODE}
.PHONY: test-dbs
createdb := #unset it
test-dbs: ## Trigger the setup of multiple test DBs that can be reused with pytest --numprocesses. Add createdb=true to force (re-)creation of Test DBs rather than reuse.
pytest ${if ${createdb},--create-db,} --reuse-db --numprocesses=auto --no-cov --disable-warnings -rP -vvv --capture=no --log-cli-level=WARNING --show-capture=log 2> /dev/null 'usaspending_api/tests/integration/test_setup_of_test_dbs.py::test_trigger_test_db_setup'
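# Example usage, per the createdb flag defined above:
#   make test-dbs createdb=true  # force (re-)creation of the test DBs instead of reusing them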
.PHONY: test-spark-deps
test-spark-deps: ## Trigger a singular test in one pytest session that does nothing but cause Maven dependencies to be downloaded and cached through Ivy; reduces contention when parallel spark builds need the dependencies
pytest --no-cov --disable-warnings -rfEs --verbosity=3 'usaspending_api/tests/integration/test_setup_of_spark_dependencies.py::test_preload_spark_jars'
.PHONY: tests
tests: local-dev-setup test-dbs test-spark-deps ## Run automated unit/integration tests. Configured for useful logging. Add args="..." to append additional pytest args
pytest --failed-first --reuse-db --numprocesses=auto --dist=worksteal -rP -vv --capture=no --show-capture=log 2> /dev/null ${args}
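# Example usage (the -k expression is an illustrative pytest selector):
#   make tests args="-k test_award"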
.PHONY: tests-failed
tests-failed: local-dev-setup test-dbs test-spark-deps ## Re-run only automated unit/integration tests that failed on the previous run. Configured for verbose logging to get more detail on failures. Add args="..." to append additional pytest args
pytest --last-failed --reuse-db --numprocesses=auto --dist=worksteal -rP -vvv ${args}
.PHONY: confirm-clean-all
no-prompt := 'false'
dry-run := 'false'
confirm-clean-all: ## Guard to prompt for confirmation before aggressive clean
ifeq ($(strip ${no-prompt}),'false')
ifeq ($(strip ${dry-run}),'false')
@echo -n "This will remove any untracked/uncommitted source files or files in the working directory. Consider backing up any files in your custom setup. To see what files would be removed, re-run with dry-run=true. Continue? [y/N] " && read ans && [ $${ans:-N} = y ]
endif
endif
.PHONY: clean-all
dry-run := 'false'
clean-all: confirm-clean-all ## Remove all tmp artifacts and artifacts created as part of local dev env setup. To avoid prompt (e.g. in script) call like: make clean-all no-prompt=true. To only see what WOULD be deleted, include dry-run=true
ifeq ($(strip ${dry-run}),'false')
rm -f .python-version
rm -rf .venv
@git clean -xfd --exclude='\.env' --exclude='\.envrc' --exclude='\.idea/' --exclude='spark-warehouse/'
deactivate || true
#if command -v deactivate &> /dev/null; then deactivate; fi;
else # this is a dry-run, spit out what would be removed
@printf "Would remove .python-version\nWould remove .venv\n"
@git clean --dry-run -xfd --exclude='\.env' --exclude='\.envrc' --exclude='\.idea/' --exclude='spark-warehouse/'
endif
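# Example usage, per the flags documented above:
#   make clean-all dry-run=true    # only print what would be removed
#   make clean-all no-prompt=true  # skip the confirmation prompt (e.g. in scripts)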
.PHONY: docker-compose
docker-compose: ## Run an arbitrary docker-compose command by passing in the Docker Compose profiles in the "profiles" variable, and args in the "args" variable
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project-directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} ${args}
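# Example usage (profile and args values are illustrative):
#   make docker-compose profiles="--profile usaspending" args="ps"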
.PHONY: docker-compose-config
docker-compose-config: ## Show config and vars expanded, which will be used in docker-compose
# NOTE: The .env file is used to provide environment variable values that replace variables in the compose file
# Because the .env file does not live in the same place as the compose file, we have to tell compose explicitly
# where it is with "--project-directory". Since this is called from the root Makefile, using ./ points to the dir
# of that Makefile
docker-compose --project-directory . --file ${docker_compose_file} config ${args}
.PHONY: docker-compose-up-usaspending
docker-compose-up-usaspending: ## Deploy containerized version of this app on the local machine using docker-compose
# To 'up' a single docker-compose service, pass it in the args var, e.g.: make docker-compose-up-usaspending args=my-service
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-up-s3
docker-compose-up-s3: ## Deploy minio container on the local machine using docker-compose, which acts as a look-alike AWS S3 service
# NOTE: [See NOTE in docker-compose rule about .env file]
echo "docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}"
docker-compose --profile s3 --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-up-spark
docker-compose-up-spark: ## Deploy containerized version of spark cluster infrastructure on the local machine using docker-compose
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --profile spark --project-directory . --file ${docker_compose_file} up ${args}
.PHONY: docker-compose-run
docker-compose-run: ## Use docker-compose run <args> to run one or more Docker Compose services with options
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose ${profiles} --project-directory . --file ${docker_compose_file} run ${args}
.PHONY: docker-compose-down
docker-compose-down: ## Run docker-compose down to bring down services listed in the compose file
# NOTE: [See NOTE in docker-compose rule about .env file]
docker-compose --project-directory . --file ${docker_compose_file} down ${args}
.PHONY: docker-build-spark
docker-build-spark: ## Run docker build to build a base container image with spark, hadoop, and python installed
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})"
docker build --tag spark-base --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args} --file ${dockerfile_for_spark} $$(dirname ${dockerfile_for_spark})
.PHONY: docker-compose-build
docker-compose-build: ## Ensure ALL services in the docker-compose.yml file have an image built for them according to their build: key
# NOTE: This *may* create a compose-specific image name IF an image: YAML key does not specify the image name to be
# used as a tag when compose has to build the image.
# If no image: key is specified, be aware that even if the spark-base image has been built and tagged,
# docker-compose will _NOT USE_ that image at runtime; it looks for an image with its own generated tag.
# It may reuse cached layers of spark-base during its build, but it creates a _differently named_ image:
# the image name is always <project>_<service>, where project defaults to the name of the directory you're in.
# Therefore you MUST re-run this command (or the manual version of it) anytime services run with Docker Compose
# need to pick up recent changes to the image (e.g. python package dependency changes)
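# For example (hypothetical names): a "spark-submit" service built by compose from a directory named
# "usaspending-api" gets tagged as usaspending-api_spark-submit, regardless of any spark-base image you built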
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile usaspending --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
.PHONY: docker-compose-build-spark
docker-compose-build-spark: ## See: docker-compose-build rule. This builds just the subset of spark services.
# NOTE: [See NOTE in above docker-compose rule about .env file]
echo "docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}"
docker-compose --profile spark --project-directory . --file ${docker_compose_file} build --build-arg PROJECT_LOG_DIR=${PROJECT_LOG_DIR} ${args}
.PHONY: docker-compose-spark-submit
docker-compose-spark-submit: ## Run spark-submit from within local docker containerized infrastructure (which must be running first). Set params with django_command="..."
docker-compose --profile=spark --project-directory . --file ${docker_compose_file} run \
-e MINIO_HOST=minio \
-e COMPONENT_NAME='${django_command}${python_script}' \
-e DATABASE_URL=${DATABASE_URL} \
spark-submit \
--driver-memory "2g" \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
${if ${python_script}, \
${python_script}, \
/project/manage.py ${django_command} \
}
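# Example usage ("help" is just an illustrative Django management command):
#   make docker-compose-spark-submit django_command="help"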
.PHONY: localhost-spark-submit
localhost-spark-submit: ## Run spark-submit with localhost as the driver and worker (single node). Set params with django_command="..."
SPARK_LOCAL_IP=127.0.0.1 \
spark-submit \
--driver-memory "2g" \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
${if ${python_script}, \
${python_script}, \
manage.py ${django_command} \
}
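# Example usage (the script path is hypothetical):
#   make localhost-spark-submit python_script="my_spark_job.py"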
.PHONY: pyspark-shell
pyspark-shell: ## Launch a local pyspark REPL shell with all of the packages and spark config pre-set
SPARK_LOCAL_IP=127.0.0.1 pyspark \
--packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1 \
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
--conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
--conf spark.hadoop.fs.s3a.endpoint=localhost:${MINIO_PORT} \
--conf spark.hadoop.fs.s3a.access.key=usaspending \
--conf spark.hadoop.fs.s3a.secret.key=usaspender \
--conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
--conf spark.hadoop.fs.s3a.path.style.access=true \
--conf spark.sql.catalogImplementation=hive \
--conf spark.sql.warehouse.dir='$(PWD)/spark-warehouse' \
--conf spark.hadoop.javax.jdo.option.ConnectionURL='jdbc:derby:;databaseName=$(PWD)/spark-warehouse/metastore_db;create=true'
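# A quick smoke test once the REPL is up (table name is hypothetical), exercising the Delta + Hive config above:
#   spark.range(5).write.format("delta").saveAsTable("demo_delta_table")
#   spark.table("demo_delta_table").show()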