Use GCC and remove sandboxing mechanisms. (#4658)

* Use GCC and remove sandboxing mechanisms.
* Split `libxla_computation_client.so` into libraries. (#4659)
* Split `libxla_computation_client.so` into libraries. To do this split, the metrics analysis was reworked. Previously, the metrics analysis fetched a global singleton of the computation client to get the metrics. This change switches to injection, so the Python bindings init can use the singleton to pass the metrics down to the analysis, removing the dependency of the analysis on the whole client. Add some tests.
* Remove running tests; they are not cached and are slow, and they are not run as-is anyway.
pytorch · Feb 21, 2023 · 023d763 · 023d763
1 parent 35c07d5
commit 023d763
Show file tree

Hide file tree

Showing 14 changed files with 617 additions and 155 deletions.
diff --git a/.bazelrc b/.bazelrc
@@ -10,23 +10,28 @@ build --announce_rc
 # TODO(goranpetrovic): figure out visibility of tensorflow libraries.
 build --nocheck_visibility
 
-# We can set this to `standalone` after https://github.com/bazelbuild/bazel/issues/15359 is resolved.
-build --spawn_strategy=sandboxed
-
 build --enable_platform_specific_config
 
 build --experimental_cc_shared_library
 
 # Disable enabled-by-default TensorFlow features that we don't care about.
 build --define=no_aws_support=true
 build --define=no_hdfs_support=true
+build --define=no_hdfs_support=true
+build --define=no_kafka_support=true
+build --define=no_ignite_support=true
 
 build --define=grpc_no_ares=true
 
 build -c opt
 
 build --config=short_logs
 
+# Force GCC because clang/bazel has issues.
+common --action_env=CC=gcc
+common --action_env=CXX=g++
+common --spawn_strategy=standalone
+
 ###########################################################################
 
 build:posix --copt=-Wno-sign-compare
@@ -78,22 +83,19 @@ try-import %workspace%/.bazelrc.user
 # Compile database generation config.
 build:compdb --features=-layering_check
 
-# Test requires Java.
-test --java_runtime_version=remotejdk_11
+# Compiling tests requires Java.
+common --java_runtime_version=remotejdk_11
 
 # Coverage requires Java and GCC.
 coverage --config=coverage
 coverage --build_tests_only
-build:coverage --java_runtime_version=remotejdk_11
 build:coverage --copt=-DNDEBUG
 build:coverage --combined_report=lcov
 build:coverage --strategy=TestRunner=sandboxed,local
 build:coverage --strategy=CoverageReport=sandboxed,local
 build:coverage --experimental_use_llvm_covmap
 build:coverage --collect_code_coverage
 build:coverage --test_tag_filters=-nocoverage
-build:coverage --action_env=CC=gcc
-build:coverage --action_env=CXX=g++
 
 ############################################################################
 ############## TensorFlow .bazelrc greatest hits ###########################
@@ -114,9 +116,10 @@ build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
 build:linux --config=dynamic_kernels
 
 # For projects which use TensorFlow as part of a Bazel build process, putting
-# nothing in a bazelrc will default to a monolithic build. The following line
-# opts in to modular op registration support by default.
-build --define framework_shared_object=true
+# nothing in a bazelrc will default to a monolithic build. Here we force
+# the monolithic build because otherwise there are missing dependencies and
+# linking fails.
+build --define framework_shared_object=false
 build --define tsl_protobuf_header_only=true
 
 build --define=use_fast_cpp_protos=true

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,7 +1,6 @@
 {
     "bsv.bazel.buildFlags": [
         "--config=compdb",
-        "--sandbox_base=/dev/shm",
     ],
     "bsv.cc.compdb.targets": [
         "//third_party/xla_client/...",
@@ -10,9 +9,12 @@
     "coverage-gutters.showLineCoverage": false,
     "coverage-gutters.showGutterCoverage": true,
     "coverage-gutters.coverageReportFileName": "./genhtml/index.html",
-    "coverage-gutters.coverageFileNames": [ "./bazel-out/_coverage/_coverage_report.dat" ],
-    "lcov.path": [ "./.bazel-out/_coverage/_coverage_report.dat"],
-
+    "coverage-gutters.coverageFileNames": [
+        "./bazel-out/_coverage/_coverage_report.dat"
+    ],
+    "lcov.path": [
+        "./.bazel-out/_coverage/_coverage_report.dat"
+    ],
     "python.formatting.provider": "yapf",
     "editor.formatOnSave": true
-}
+}
diff --git a/WORKSPACE b/WORKSPACE
@@ -18,8 +18,8 @@ http_archive(
         "//tf_patches:cudnn_int8x32.diff",
         "//tf_patches:f16_abi_clang.diff",
         "//tf_patches:gpu_race_condition.diff",
-        "//tf_patches:stream_executor.diff",
         "//tf_patches:grpc_version.diff",
+        "//tf_patches:stream_executor.diff",
         "//tf_patches:thread_local_random.diff",
         "//tf_patches:xplane.diff",
     ],

diff --git a/build_torch_xla_libs.sh b/build_torch_xla_libs.sh
@@ -34,20 +34,6 @@ if [[ "$XLA_BAZEL_VERBOSE" == "1" ]]; then
   VERBOSE="-s"
 fi
 
-SANDBOX_BASE="${XLA_SANDBOX_BASE}"
-if [ -z "$XLA_SANDBOX_BASE" ]; then
-  SANDBOX_BASE="/tmp"
-fi
-if [[ "$XLA_SANDBOX_BUILD" == "1" ]]; then
-  BUILD_STRATEGY="sandboxed --sandbox_base=${SANDBOX_BASE}"
-else
-  # We can remove this after https://github.com/bazelbuild/bazel/issues/15359 is resolved
-  # Use GCC locally since clang does not work except with sanboxing, and sandboxing causes pjrt crashes.
-  unset CXX
-  unset CC
-  BUILD_STRATEGY="local"
-fi
-
 if [[ "$TPUVM_MODE" == "1" ]]; then
   OPTS+=(--config=tpu)
 fi
@@ -73,7 +59,7 @@ fi
 # TensorFlow and its dependencies may introduce warning flags from newer compilers
 # that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
 # when '-Werror' is used. Therefore, suppress the warnings in .bazelrc or here.
-bazel build $MAX_JOBS $VERBOSE --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
+bazel build $MAX_JOBS $VERBOSE --show_progress_rate_limit=20 \
   --define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
   $XLA_CUDA_CFG //third_party/xla_client:libxla_computation_client.so
 

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -39,10 +39,6 @@ ENV BUNDLE_LIBTPU "${tpuvm}"
 # Maximum number of jobs to use for bazel build
 ENV BAZEL_JOBS "${bazel_jobs}"
 
-# This makes the bazel build behave more consistently, but runs slower.
-ENV XLA_SANDBOX_BUILD "0"
-ENV XLA_SANDBOX_BASE "/dev/shm"
-
 # To get around issue of Cloud Build with recursive submodule update
 # clone recursively from pytorch/xla if building docker image with
 # cloud build. Otherwise, just use local.

diff --git a/docker/experimental/Dockerfile b/docker/experimental/Dockerfile
@@ -75,10 +75,6 @@ COPY .bazelversion .
 COPY WORKSPACE .
 COPY build_torch_xla_libs.sh .
 
-# TODO: Remove this when it's not required anymore
-ENV XLA_SANDBOX_BUILD=0
-ENV XLA_SANDBOX_BASE "/dev/shm"
-
 COPY torch_xla/ torch_xla/
 COPY setup.py .
 COPY xla_native_functions.yaml .