Skip to content

Commit

Permalink
Bazel (#4636)
Browse files Browse the repository at this point in the history
* Replace tensorflow with a bazel external repository

* Basic migration to bazel for xla_client.

* Revert to blob

* Add vscode config.

* Update newlines

* Merge with pjrt client test build changes.

* Migrate tests to new build

* Format test and plugin

* Order imports

* Conditionally apply tf patches; apply pt patches always.

* Format python

* configure formatters

* Mirror TF pin update and fixes in bazel.

* Support local and sandboxed build based on flags

* Add cloud cache URLs for llvm.

* Merge with upstream

* Update TF pin

* Fix patching regression

* Remove the circleci setup downloading llvm

* Rework the experimental dockerfile for bazel support
  • Loading branch information
stgpetrovic authored Feb 16, 2023
1 parent b98179f commit 1bbe4da
Show file tree
Hide file tree
Showing 166 changed files with 859 additions and 502 deletions.
176 changes: 176 additions & 0 deletions .bazelrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
############################################################################
# All default build options below.

# Enable exceptions in C++.
common --copt=-fexceptions

# Make Bazel print out all options from rc files.
build --announce_rc

# TODO(goranpetrovic): figure out visibility of tensorflow libraries.
build --nocheck_visibility

#build --define open_source_build=true

# We can set this to `standalone` after https://github.com/bazelbuild/bazel/issues/15359 is resolved.
build --spawn_strategy=sandboxed

build --enable_platform_specific_config

build --experimental_cc_shared_library

# Disable enabled-by-default TensorFlow features that we don't care about.
build --define=no_aws_support=true
build --define=no_hdfs_support=true

build --define=grpc_no_ares=true

build -c opt

build --config=short_logs

###########################################################################

build:posix --copt=-Wno-sign-compare
build:posix --cxxopt=-std=c++17
build:posix --host_cxxopt=-std=c++17

build:avx_posix --copt=-mavx
build:avx_posix --host_copt=-mavx

build:avx_linux --copt=-mavx
build:avx_linux --host_copt=-mavx

build:native_arch_posix --copt=-march=native
build:native_arch_posix --host_copt=-march=native

build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=1

build:cuda --repo_env TF_NEED_CUDA=1
# "sm" means we emit only cubin, which is forward compatible within a GPU generation.
# "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations.
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
build:cuda --@org_tensorflow//tensorflow/compiler/xla/python:enable_gpu=true
build:cuda --define=xla_python_enable_gpu=true
build:cuda --cxxopt=-DXLA_CUDA=1

build:acl --define=build_with_acl=true

build:nonccl --define=no_nccl_support=true

build:linux --config=posix
build:linux --copt=-Wno-unknown-warning-option

# Suppress all warning messages.
build:short_logs --output_filter=DONT_MATCH_ANYTHING

#build:tpu --@org_tensorflow//tensorflow/compiler/xla/python:enable_tpu=true
build:tpu --define=with_tpu_support=true

#########################################################################
# RBE config options below.
# Flag to enable remote config
common --experimental_repo_remote_exec
#########################################################################

# Load rc file with user-specific options.
try-import %workspace%/.bazelrc.user

# Compile database generation config.
build:compdb --features=-layering_check

# Test requires Java.
test --java_runtime_version=remotejdk_11

# Coverage requires Java and GCC.
coverage --config=coverage
coverage --build_tests_only
build:coverage --java_runtime_version=remotejdk_11
build:coverage --copt=-DNDEBUG
build:coverage --combined_report=lcov
build:coverage --strategy=TestRunner=sandboxed,local
build:coverage --strategy=CoverageReport=sandboxed,local
build:coverage --experimental_use_llvm_covmap
build:coverage --collect_code_coverage
build:coverage --test_tag_filters=-nocoverage
build:coverage --action_env=CC=gcc
build:coverage --action_env=CXX=g++

############################################################################
############## TensorFlow .bazelrc greatest hits ###########################
############################################################################

# Modular TF build options
build:dynamic_kernels --define=dynamic_loaded_kernels=true
build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1

# Default paths for TF_SYSTEM_LIBS
build:linux --define=PREFIX=/usr
build:linux --define=LIBDIR=$(PREFIX)/lib
build:linux --define=INCLUDEDIR=$(PREFIX)/include
build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include

# On linux, we dynamically link small amount of kernels
build:linux --config=dynamic_kernels

# For projects which use TensorFlow as part of a Bazel build process, putting
# nothing in a bazelrc will default to a monolithic build. The following line
# opts in to modular op registration support by default.
build --define framework_shared_object=true
build --define tsl_protobuf_header_only=true

build --define=use_fast_cpp_protos=true
build --define=allow_oversize_protos=true

# Enable XLA support by default.
build --define=with_xla_support=true

# See https://github.com/bazelbuild/bazel/issues/7362 for information on what
# --incompatible_remove_legacy_whole_archive flag does.
# This flag is set to true in Bazel 1.0 and newer versions. We tried to migrate
# Tensorflow to the default, however test coverage wasn't enough to catch the
# errors.
# There is ongoing work on Bazel team's side to provide support for transitive
# shared libraries. As part of migrating to transitive shared libraries, we
# hope to provide a better mechanism for control over symbol exporting, and
# then tackle this issue again.
#
# TODO: Remove this line once TF doesn't depend on Bazel wrapping all library
# archives in -whole_archive -no_whole_archive.
build --noincompatible_remove_legacy_whole_archive

# cc_shared_library ensures no library is linked statically more than once.
build --experimental_link_static_libraries_once=false

# On linux, don't cross compile by default
build:linux --distinct_host_configuration=false

# Do not risk cache corruption. See:
# https://github.com/bazelbuild/bazel/issues/3360
build:linux --experimental_guard_against_concurrent_changes

# Prevent regressions on those two incompatible changes
# TODO: remove those flags when they are flipped in the default Bazel version TF uses.
build --incompatible_enforce_config_setting_visibility

# Disable TFRT integration for now unless --config=tfrt is specified.
build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils

# Suppress most C++ compiler warnings to reduce log size but allow
# for specific warnings to still be present.
build:linux --copt="-Wno-all"
build:linux --copt="-Wno-extra"
build:linux --copt="-Wno-deprecated"
build:linux --copt="-Wno-deprecated-declarations"
build:linux --copt="-Wno-ignored-attributes"
build:linux --copt="-Wno-array-bounds"
# Add unused-result as an error on Linux.
build:linux --copt="-Wunused-result"
build:linux --copt="-Werror=unused-result"
# Add switch as an error on Linux.
build:linux --copt="-Wswitch"
build:linux --copt="-Werror=switch"
# Required for building with clang
build:linux --copt="-Wno-error=unused-but-set-variable"
1 change: 1 addition & 0 deletions .bazelversion
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5.3.0
1 change: 0 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ setup_base_docker: &setup_base_docker
name: Setup Base Docker Image
command: |
.circleci/setup_ci_environment.sh
.circleci/download_llvm_raw.sh
launch_docker_and_build: &launch_docker_and_build
name: Launch Docker Container and Build
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ torch_xla/csrc/generated/
# Below files are not deleted by "setup.py clean".

# Visual Studio Code files
.vscode
.vs

# Files autogenerated by docs/docs_build.sh
Expand All @@ -27,3 +26,7 @@ torch_xla/csrc/generated/

# Local terraform state
.terraform*


# Build system temporary files
/bazel-*
5 changes: 0 additions & 5 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
[submodule "third_party/tensorflow"]
path = third_party/tensorflow
url = https://github.com/tensorflow/tensorflow.git
ignore = dirty

18 changes: 18 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"bsv.bazel.buildFlags": [
"--config=compdb",
"--sandbox_base=/dev/shm",
],
"bsv.cc.compdb.targets": [
"//third_party/xla_client/...",
],
"coverage-gutters.coverageBaseDir": ".",
"coverage-gutters.showLineCoverage": false,
"coverage-gutters.showGutterCoverage": true,
"coverage-gutters.coverageReportFileName": "./genhtml/index.html",
"coverage-gutters.coverageFileNames": [ "./bazel-out/_coverage/_coverage_report.dat" ],
"lcov.path": [ "./bazel-out/_coverage/_coverage_report.dat"],

"python.formatting.provider": "yapf",
"editor.formatOnSave": true
}
61 changes: 61 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# To update TensorFlow to a new revision,
# a) update URL and strip_prefix to the new git commit hash
# b) get the sha256 hash of the commit by running:
# curl -L https://github.com/tensorflow/tensorflow/archive/<git hash>.tar.gz | sha256sum
# and update the sha256 with the result.
http_archive(
name = "org_tensorflow",
patch_args = [
"-l",
"-p1",
],
patch_tool = "patch",
patches = [
"//tf_patches:bazel.diff",
"//tf_patches:cache_urls.diff",
"//tf_patches:cudnn_int8x32.diff",
"//tf_patches:f16_abi_clang.diff",
"//tf_patches:gpu_race_condition.diff",
"//tf_patches:stream_executor.diff",
"//tf_patches:grpc_version.diff",
"//tf_patches:thread_local_random.diff",
"//tf_patches:xplane.diff",
],
sha256 = "0fdf5067cd9827be2ae14c2ac59cd482e678134b125943be278ad23ea5342181",
strip_prefix = "tensorflow-f7759359f8420d3ca7b9fd19493f2a01bd47b4ef",
urls = [
"https://github.com/tensorflow/tensorflow/archive/f7759359f8420d3ca7b9fd19493f2a01bd47b4ef.tar.gz",
],
)

# For development, one often wants to make changes to the TF repository as well
# as the PyTorch/XLA repository. You can override the pinned repository above with a
# local checkout by either:
# a) overriding the TF repository on the build.py command line by passing a flag
# like:
# bazel --override_repository=org_tensorflow=/path/to/tensorflow
# or
# b) by commenting out the http_archive above and uncommenting the following:
# local_repository(
# name = "org_tensorflow",
# path = "/path/to/tensorflow",
# )

# Initialize TensorFlow's external dependencies.
load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")

tf_workspace3()

load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2")

tf_workspace2()

load("@org_tensorflow//tensorflow:workspace1.bzl", "tf_workspace1")

tf_workspace1()

load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0")

tf_workspace0()
50 changes: 13 additions & 37 deletions build_torch_xla_libs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ if [[ "$XLA_BAZEL_VERBOSE" == "1" ]]; then
VERBOSE="-s"
fi

BUILD_STRATEGY="standalone"
SANDBOX_BASE="${XLA_SANDBOX_BASE}"
if [ -z "$XLA_SANDBOX_BASE" ]; then
SANDBOX_BASE="/tmp"
Expand All @@ -43,61 +42,38 @@ if [[ "$XLA_SANDBOX_BUILD" == "1" ]]; then
BUILD_STRATEGY="sandboxed --sandbox_base=${SANDBOX_BASE}"
else
# We can remove this after https://github.com/bazelbuild/bazel/issues/15359 is resolved
unset CC
unset CXX
BUILD_STRATEGY="local"
fi

TPUVM_FLAG=
if [[ "$TPUVM_MODE" == "1" ]]; then
TPUVM_FLAG="--define=with_tpu_support=true"
OPTS+=(--config=tpu)
fi

MAX_JOBS=
if [[ ! -z "$BAZEL_JOBS" ]]; then
MAX_JOBS="--jobs=$BAZEL_JOBS"
fi

OPTS+=(--cxxopt="-std=c++17")
if [[ $(basename -- $CC) =~ ^clang ]]; then
OPTS+=(--cxxopt="-Wno-c++11-narrowing")
OPTS+=(--cxxopt="-Wno-c++14-narrowing")
fi

if [[ "$XLA_CUDA" == "1" ]]; then
OPTS+=(--cxxopt="-DXLA_CUDA=1")
OPTS+=(--config=cuda)
fi

if [[ "$XLA_CPU_USE_ACL" == "1" ]]; then
OPTS+=("--define=build_with_acl=true")
OPTS+=(--config=acl)
fi

if [ "$CMD" == "clean" ]; then
pushd $THIRD_PARTY_DIR/tensorflow
bazel clean
popd
else
# Overlay llvm-raw secondary cache. The remote cache should be updated
# nightly with the pinned llvm archive. Note, these commands are a no-op if there is no match.
sed -i '/.*github.com\/llvm.*,/a "https://storage.googleapis.com/tpu-pytorch/llvm-raw/{commit}.tar.gz".format(commit = LLVM_COMMIT),' \
$THIRD_PARTY_DIR/tensorflow/third_party/llvm/workspace.bzl
sed -i 's/LLVM_COMMIT)]/LLVM_COMMIT),"https:\/\/storage.googleapis.com\/tpu-pytorch\/llvm-raw\/{commit}.tar.gz".format(commit = LLVM_COMMIT)]/g' \
$THIRD_PARTY_DIR/tensorflow/tensorflow/compiler/xla/mlir_hlo/WORKSPACE

cp -r -u -p $THIRD_PARTY_DIR/xla_client $THIRD_PARTY_DIR/tensorflow/tensorflow/compiler/xla/
exit 0
fi

pushd $THIRD_PARTY_DIR/tensorflow
# TensorFlow and its dependencies may introduce warning flags from newer compilers
# that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
# while '-Werror' is used. Therefore, suppress the warnings.
TF_EXTRA_FLAGS="--copt=-Wno-unknown-warning-option"
bazel build $MAX_JOBS $VERBOSE $TPUVM_FLAG $TF_EXTRA_FLAGS --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
--define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
$XLA_CUDA_CFG //tensorflow/compiler/xla/xla_client:libxla_computation_client.so
# TensorFlow and its dependencies may introduce warning flags from newer compilers
# that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
# while '-Werror' is used. Therefore, suppress the warnings in .bazelrc or here.
bazel build $MAX_JOBS $VERBOSE --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
--define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
$XLA_CUDA_CFG //third_party/xla_client:libxla_computation_client.so

popd
mkdir -p torch_xla/lib
chmod 0644 $THIRD_PARTY_DIR/tensorflow/bazel-bin/tensorflow/compiler/xla/xla_client/libxla_computation_client.so
cp $THIRD_PARTY_DIR/tensorflow/bazel-bin/tensorflow/compiler/xla/xla_client/libxla_computation_client.so torch_xla/lib
fi
mkdir -p torch_xla/lib
chmod 0644 bazel-bin/third_party/xla_client/libxla_computation_client.so
cp bazel-bin/third_party/xla_client/libxla_computation_client.so torch_xla/lib
4 changes: 0 additions & 4 deletions docker/experimental/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ WORKDIR /pytorch/xla/
# Contains actual build artifacts
FROM builder AS artifacts

COPY tf_patches/ tf_patches/
COPY third_party/ third_party/
RUN cd third_party/tensorflow && find ../../tf_patches -name '*.diff' | xargs -t -r -n 1 patch -N -p1 -l -i

COPY build_torch_xla_libs.sh .

Expand All @@ -96,9 +94,7 @@ ARG package_version
RUN TORCH_XLA_VERSION=${package_version} BUILD_CPP_TESTS=${build_cpp_tests} TPUVM_MODE=${tpuvm} BUNDLE_LIBTPU=${tpuvm} XLA_CUDA=${cuda} TF_CUDA_COMPUTE_CAPABILITIES=${tf_cuda_compute_capabilities} python setup.py bdist_wheel

# Expunge cache to keep image size under control
WORKDIR /pytorch/xla/third_party/tensorflow
RUN bazel clean --expunge
WORKDIR /pytorch/xla/

RUN pip install dist/*.whl

Expand Down
Loading

0 comments on commit 1bbe4da

Please sign in to comment.