Skip to content

Commit

Permalink
Bazel (#4636)
Browse files Browse the repository at this point in the history
* Replace tensorflow with a bazel external repository

* Basic migration to bazel for xla_client.

* Revert to blob

* Add vscode config.

* Update newlines

* Merge with pjrt client test build changes.

* Migrate tests to new build

* Format test and plugin

* Order imports

* Conditionally apply tf patches; apply pt patches always.

* Format python

* configure formatters

* Mirror TF pin update and fixes in bazel.

* Support local and sandboxed build based on flags

* Add cloud cache URLs for llvm.

* Merge with upstream

* Update TF pin

* Fix patching regression

* Remove the circleci setup downloading llvm

* Rework the experimental dockerfile for bazel support
  • Loading branch information
stgpetrovic authored Feb 16, 2023
1 parent b98179f commit 1bbe4da
Show file tree
Hide file tree
Showing 166 changed files with 859 additions and 502 deletions.
176 changes: 176 additions & 0 deletions .bazelrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
############################################################################
# All default build options below.

# Enable exceptions in C++.
common --copt=-fexceptions

# Make Bazel print out all options from rc files.
build --announce_rc

# TODO(goranpetrovic): figure out visibility of tensorflow libraries.
build --nocheck_visibility

#build --define open_source_build=true

# We can set this to `standalone` after https://github.com/bazelbuild/bazel/issues/15359 is resolved.
build --spawn_strategy=sandboxed

build --enable_platform_specific_config

build --experimental_cc_shared_library

# Disable enabled-by-default TensorFlow features that we don't care about.
build --define=no_aws_support=true
build --define=no_hdfs_support=true

build --define=grpc_no_ares=true

build -c opt

build --config=short_logs

###########################################################################

build:posix --copt=-Wno-sign-compare
build:posix --cxxopt=-std=c++17
build:posix --host_cxxopt=-std=c++17

build:avx_posix --copt=-mavx
build:avx_posix --host_copt=-mavx

build:avx_linux --copt=-mavx
build:avx_linux --host_copt=-mavx

build:native_arch_posix --copt=-march=native
build:native_arch_posix --host_copt=-march=native

build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=1

build:cuda --repo_env TF_NEED_CUDA=1
# "sm" means we emit only cubin, which is forward compatible within a GPU generation.
# "compute" means we emit both cubin and PTX, which is larger but also forward compatible to future GPU generations.
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
build:cuda --@local_config_cuda//:enable_cuda
build:cuda --@org_tensorflow//tensorflow/compiler/xla/python:enable_gpu=true
build:cuda --define=xla_python_enable_gpu=true
build:cuda --cxxopt=-DXLA_CUDA=1

build:acl --define=build_with_acl=true

build:nonccl --define=no_nccl_support=true

build:linux --config=posix
build:linux --copt=-Wno-unknown-warning-option

# Suppress all warning messages.
build:short_logs --output_filter=DONT_MATCH_ANYTHING

#build:tpu --@org_tensorflow//tensorflow/compiler/xla/python:enable_tpu=true
build:tpu --define=with_tpu_support=true

#########################################################################
# RBE config options below.
# Flag to enable remote config
common --experimental_repo_remote_exec
#########################################################################

# Load rc file with user-specific options.
try-import %workspace%/.bazelrc.user

# Compile database generation config.
build:compdb --features=-layering_check

# Test requires Java.
test --java_runtime_version=remotejdk_11

# Coverage requires Java and GCC.
coverage --config=coverage
coverage --build_tests_only
build:coverage --java_runtime_version=remotejdk_11
build:coverage --copt=-DNDEBUG
build:coverage --combined_report=lcov
build:coverage --strategy=TestRunner=sandboxed,local
build:coverage --strategy=CoverageReport=sandboxed,local
build:coverage --experimental_use_llvm_covmap
build:coverage --collect_code_coverage
build:coverage --test_tag_filters=-nocoverage
build:coverage --action_env=CC=gcc
build:coverage --action_env=CXX=g++

############################################################################
############## TensorFlow .bazelrc greatest hits ###########################
############################################################################

# Modular TF build options
build:dynamic_kernels --define=dynamic_loaded_kernels=true
build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
build --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1

# Default paths for TF_SYSTEM_LIBS
build:linux --define=PREFIX=/usr
build:linux --define=LIBDIR=$(PREFIX)/lib
build:linux --define=INCLUDEDIR=$(PREFIX)/include
build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include

# On linux, we dynamically link small amount of kernels
build:linux --config=dynamic_kernels

# For projects which use TensorFlow as part of a Bazel build process, putting
# nothing in a bazelrc will default to a monolithic build. The following line
# opts in to modular op registration support by default.
build --define framework_shared_object=true
build --define tsl_protobuf_header_only=true

build --define=use_fast_cpp_protos=true
build --define=allow_oversize_protos=true

# Enable XLA support by default.
build --define=with_xla_support=true

# See https://github.com/bazelbuild/bazel/issues/7362 for information on what
# --incompatible_remove_legacy_whole_archive flag does.
# This flag is set to true in Bazel 1.0 and newer versions. We tried to migrate
# Tensorflow to the default, however test coverage wasn't enough to catch the
# errors.
# There is ongoing work on Bazel team's side to provide support for transitive
# shared libraries. As part of migrating to transitive shared libraries, we
# hope to provide a better mechanism for control over symbol exporting, and
# then tackle this issue again.
#
# TODO: Remove this line once TF doesn't depend on Bazel wrapping all library
# archives in -whole_archive -no_whole_archive.
build --noincompatible_remove_legacy_whole_archive

# cc_shared_library ensures no library is linked statically more than once.
build --experimental_link_static_libraries_once=false

# On linux, don't cross compile by default
build:linux --distinct_host_configuration=false

# Do not risk cache corruption. See:
# https://github.com/bazelbuild/bazel/issues/3360
build:linux --experimental_guard_against_concurrent_changes

# Prevent regressions on those two incompatible changes
# TODO: remove those flags when they are flipped in the default Bazel version TF uses.
build --incompatible_enforce_config_setting_visibility

# Disable TFRT integration for now unless --config=tfrt is specified.
build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils

# Suppress most C++ compiler warnings to reduce log size but allow
# for specific warnings to still be present.
build:linux --copt="-Wno-all"
build:linux --copt="-Wno-extra"
build:linux --copt="-Wno-deprecated"
build:linux --copt="-Wno-deprecated-declarations"
build:linux --copt="-Wno-ignored-attributes"
build:linux --copt="-Wno-array-bounds"
# Add unused-result as an error on Linux.
build:linux --copt="-Wunused-result"
build:linux --copt="-Werror=unused-result"
# Add switch as an error on Linux.
build:linux --copt="-Wswitch"
build:linux --copt="-Werror=switch"
# Required for building with clang
build:linux --copt="-Wno-error=unused-but-set-variable"
1 change: 1 addition & 0 deletions .bazelversion
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5.3.0
1 change: 0 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ setup_base_docker: &setup_base_docker
name: Setup Base Docker Image
command: |
.circleci/setup_ci_environment.sh
.circleci/download_llvm_raw.sh
launch_docker_and_build: &launch_docker_and_build
name: Launch Docker Container and Build
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ torch_xla/csrc/generated/
# Below files are not deleted by "setup.py clean".

# Visual Studio Code files
.vscode
.vs

# Files autogenerated by docs/docs_build.sh
Expand All @@ -27,3 +26,7 @@ torch_xla/csrc/generated/

# Local terraform state
.terraform*


# Build system temporary files
/bazel-*
5 changes: 0 additions & 5 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
[submodule "third_party/tensorflow"]
path = third_party/tensorflow
url = https://github.com/tensorflow/tensorflow.git
ignore = dirty

18 changes: 18 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"bsv.bazel.buildFlags": [
"--config=compdb",
"--sandbox_base=/dev/shm",
],
"bsv.cc.compdb.targets": [
"//third_party/xla_client/...",
],
"coverage-gutters.coverageBaseDir": ".",
"coverage-gutters.showLineCoverage": false,
"coverage-gutters.showGutterCoverage": true,
"coverage-gutters.coverageReportFileName": "./genhtml/index.html",
"coverage-gutters.coverageFileNames": [ "./bazel-out/_coverage/_coverage_report.dat" ],
"lcov.path": [ "./bazel-out/_coverage/_coverage_report.dat"],

"python.formatting.provider": "yapf",
"editor.formatOnSave": true
}
61 changes: 61 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# To update TensorFlow to a new revision,
# a) update URL and strip_prefix to the new git commit hash
# b) get the sha256 hash of the commit by running:
# curl -L https://github.com/tensorflow/tensorflow/archive/<git hash>.tar.gz | sha256sum
# and update the sha256 with the result.
http_archive(
name = "org_tensorflow",
patch_args = [
"-l",
"-p1",
],
patch_tool = "patch",
patches = [
"//tf_patches:bazel.diff",
"//tf_patches:cache_urls.diff",
"//tf_patches:cudnn_int8x32.diff",
"//tf_patches:f16_abi_clang.diff",
"//tf_patches:gpu_race_condition.diff",
"//tf_patches:stream_executor.diff",
"//tf_patches:grpc_version.diff",
"//tf_patches:thread_local_random.diff",
"//tf_patches:xplane.diff",
],
sha256 = "0fdf5067cd9827be2ae14c2ac59cd482e678134b125943be278ad23ea5342181",
strip_prefix = "tensorflow-f7759359f8420d3ca7b9fd19493f2a01bd47b4ef",
urls = [
"https://github.com/tensorflow/tensorflow/archive/f7759359f8420d3ca7b9fd19493f2a01bd47b4ef.tar.gz",
],
)

# For development, one often wants to make changes to the TF repository as well
# as the PyTorch/XLA repository. You can override the pinned repository above with a
# local checkout by either:
# a) overriding the TF repository on the build.py command line by passing a flag
# like:
# bazel --override_repository=org_tensorflow=/path/to/tensorflow
# or
# b) by commenting out the http_archive above and uncommenting the following:
# local_repository(
# name = "org_tensorflow",
# path = "/path/to/tensorflow",
# )

# Initialize TensorFlow's external dependencies.
load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")

tf_workspace3()

load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2")

tf_workspace2()

load("@org_tensorflow//tensorflow:workspace1.bzl", "tf_workspace1")

tf_workspace1()

load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0")

tf_workspace0()
50 changes: 13 additions & 37 deletions build_torch_xla_libs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ if [[ "$XLA_BAZEL_VERBOSE" == "1" ]]; then
VERBOSE="-s"
fi

BUILD_STRATEGY="standalone"
SANDBOX_BASE="${XLA_SANDBOX_BASE}"
if [ -z "$XLA_SANDBOX_BASE" ]; then
SANDBOX_BASE="/tmp"
Expand All @@ -43,61 +42,38 @@ if [[ "$XLA_SANDBOX_BUILD" == "1" ]]; then
BUILD_STRATEGY="sandboxed --sandbox_base=${SANDBOX_BASE}"
else
# We can remove this after https://github.com/bazelbuild/bazel/issues/15359 is resolved
unset CC
unset CXX
BUILD_STRATEGY="local"
fi

TPUVM_FLAG=
if [[ "$TPUVM_MODE" == "1" ]]; then
TPUVM_FLAG="--define=with_tpu_support=true"
OPTS+=(--config=tpu)
fi

MAX_JOBS=
if [[ ! -z "$BAZEL_JOBS" ]]; then
MAX_JOBS="--jobs=$BAZEL_JOBS"
fi

OPTS+=(--cxxopt="-std=c++17")
if [[ $(basename -- $CC) =~ ^clang ]]; then
OPTS+=(--cxxopt="-Wno-c++11-narrowing")
OPTS+=(--cxxopt="-Wno-c++14-narrowing")
fi

if [[ "$XLA_CUDA" == "1" ]]; then
OPTS+=(--cxxopt="-DXLA_CUDA=1")
OPTS+=(--config=cuda)
fi

if [[ "$XLA_CPU_USE_ACL" == "1" ]]; then
OPTS+=("--define=build_with_acl=true")
OPTS+=(--config=acl)
fi

if [ "$CMD" == "clean" ]; then
pushd $THIRD_PARTY_DIR/tensorflow
bazel clean
popd
else
# Overlay llvm-raw secondary cache. The remote cache should be updated
# nightly with the pinned llvm archive. Note, these commands are a no-op if there is no match.
sed -i '/.*github.com\/llvm.*,/a "https://storage.googleapis.com/tpu-pytorch/llvm-raw/{commit}.tar.gz".format(commit = LLVM_COMMIT),' \
$THIRD_PARTY_DIR/tensorflow/third_party/llvm/workspace.bzl
sed -i 's/LLVM_COMMIT)]/LLVM_COMMIT),"https:\/\/storage.googleapis.com\/tpu-pytorch\/llvm-raw\/{commit}.tar.gz".format(commit = LLVM_COMMIT)]/g' \
$THIRD_PARTY_DIR/tensorflow/tensorflow/compiler/xla/mlir_hlo/WORKSPACE

cp -r -u -p $THIRD_PARTY_DIR/xla_client $THIRD_PARTY_DIR/tensorflow/tensorflow/compiler/xla/
exit 0
fi

pushd $THIRD_PARTY_DIR/tensorflow
# TensorFlow and its dependencies may introduce warning flags from newer compilers
# that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
# while '-Werror' is used. Therefore, suppress the warnings.
TF_EXTRA_FLAGS="--copt=-Wno-unknown-warning-option"
bazel build $MAX_JOBS $VERBOSE $TPUVM_FLAG $TF_EXTRA_FLAGS --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
--define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
$XLA_CUDA_CFG //tensorflow/compiler/xla/xla_client:libxla_computation_client.so
# TensorFlow and its dependencies may introduce warning flags from newer compilers
# that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
# while '-Werror' is used. Therefore, suppress the warnings in .bazelrc or here.
bazel build $MAX_JOBS $VERBOSE --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
--define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
$XLA_CUDA_CFG //third_party/xla_client:libxla_computation_client.so

popd
mkdir -p torch_xla/lib
chmod 0644 $THIRD_PARTY_DIR/tensorflow/bazel-bin/tensorflow/compiler/xla/xla_client/libxla_computation_client.so
cp $THIRD_PARTY_DIR/tensorflow/bazel-bin/tensorflow/compiler/xla/xla_client/libxla_computation_client.so torch_xla/lib
fi
mkdir -p torch_xla/lib
chmod 0644 bazel-bin/third_party/xla_client/libxla_computation_client.so
cp bazel-bin/third_party/xla_client/libxla_computation_client.so torch_xla/lib
4 changes: 0 additions & 4 deletions docker/experimental/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ WORKDIR /pytorch/xla/
# Contains actual build artifacts
FROM builder AS artifacts

COPY tf_patches/ tf_patches/
COPY third_party/ third_party/
RUN cd third_party/tensorflow && find ../../tf_patches -name '*.diff' | xargs -t -r -n 1 patch -N -p1 -l -i

COPY build_torch_xla_libs.sh .

Expand All @@ -96,9 +94,7 @@ ARG package_version
RUN TORCH_XLA_VERSION=${package_version} BUILD_CPP_TESTS=${build_cpp_tests} TPUVM_MODE=${tpuvm} BUNDLE_LIBTPU=${tpuvm} XLA_CUDA=${cuda} TF_CUDA_COMPUTE_CAPABILITIES=${tf_cuda_compute_capabilities} python setup.py bdist_wheel

# Expunge cache to keep image size under control
WORKDIR /pytorch/xla/third_party/tensorflow
RUN bazel clean --expunge
WORKDIR /pytorch/xla/

RUN pip install dist/*.whl

Expand Down
Loading

0 comments on commit 1bbe4da

Please sign in to comment.