Skip to content

Commit

Permalink
Use GCC and remove sandboxing mechanisms. (#4658)
Browse files Browse the repository at this point in the history
* Use GCC and remove sandboxing mechanisms.

* Split `libxla_computation_client.so` into libraries. (#4659)

* Split `libxla_computation_client.so` into libraries.

To do this split, reworked the metrics analysis.
Currently, metrics analysis would get a global singleton
of the computation client to get the metrics. This change
switches to injection, so the python bindings init can use
the singleton to pass the metrics down to the analysis,
removing the dependency from the analysis to the whole client.

Add some tests.

* Remove running tests; they are not cached and are slow. They are not run anyway as is.
  • Loading branch information
stgpetrovic authored Feb 21, 2023
1 parent 35c07d5 commit 023d763
Show file tree
Hide file tree
Showing 14 changed files with 617 additions and 155 deletions.
25 changes: 14 additions & 11 deletions .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,28 @@ build --announce_rc
# TODO(goranpetrovic): figure out visibility of tensorflow libraries.
build --nocheck_visibility

# We can set this to `standalone` after https://github.com/bazelbuild/bazel/issues/15359 is resolved.
build --spawn_strategy=sandboxed

build --enable_platform_specific_config

build --experimental_cc_shared_library

# Disable enabled-by-default TensorFlow features that we don't care about.
build --define=no_aws_support=true
build --define=no_hdfs_support=true
build --define=no_hdfs_support=true
build --define=no_kafka_support=true
build --define=no_ignite_support=true

build --define=grpc_no_ares=true

build -c opt

build --config=short_logs

# Force GCC because clang/bazel has issues.
common --action_env=CC=gcc
common --action_env=CXX=g++
common --spawn_strategy=standalone

###########################################################################

build:posix --copt=-Wno-sign-compare
Expand Down Expand Up @@ -78,22 +83,19 @@ try-import %workspace%/.bazelrc.user
# Compile database generation config.
build:compdb --features=-layering_check

# Test requires Java.
test --java_runtime_version=remotejdk_11
# Compiling tests requires Java.
common --java_runtime_version=remotejdk_11

# Coverage requires Java and GCC.
coverage --config=coverage
coverage --build_tests_only
build:coverage --java_runtime_version=remotejdk_11
build:coverage --copt=-DNDEBUG
build:coverage --combined_report=lcov
build:coverage --strategy=TestRunner=sandboxed,local
build:coverage --strategy=CoverageReport=sandboxed,local
build:coverage --experimental_use_llvm_covmap
build:coverage --collect_code_coverage
build:coverage --test_tag_filters=-nocoverage
build:coverage --action_env=CC=gcc
build:coverage --action_env=CXX=g++

############################################################################
############## TensorFlow .bazelrc greatest hits ###########################
Expand All @@ -114,9 +116,10 @@ build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
build:linux --config=dynamic_kernels

# For projects which use TensorFlow as part of a Bazel build process, putting
# nothing in a bazelrc will default to a monolithic build. The following line
# opts in to modular op registration support by default.
build --define framework_shared_object=true
# nothing in a bazelrc will default to a monolithic build. Here we force
# the monolithic build because otherwise there are missing dependencies and
# linking fails.
build --define framework_shared_object=false
build --define tsl_protobuf_header_only=true

build --define=use_fast_cpp_protos=true
Expand Down
12 changes: 7 additions & 5 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"bsv.bazel.buildFlags": [
"--config=compdb",
"--sandbox_base=/dev/shm",
],
"bsv.cc.compdb.targets": [
"//third_party/xla_client/...",
Expand All @@ -10,9 +9,12 @@
"coverage-gutters.showLineCoverage": false,
"coverage-gutters.showGutterCoverage": true,
"coverage-gutters.coverageReportFileName": "./genhtml/index.html",
"coverage-gutters.coverageFileNames": [ "./bazel-out/_coverage/_coverage_report.dat" ],
"lcov.path": [ "./.bazel-out/_coverage/_coverage_report.dat"],

"coverage-gutters.coverageFileNames": [
"./bazel-out/_coverage/_coverage_report.dat"
],
"lcov.path": [
"./.bazel-out/_coverage/_coverage_report.dat"
],
"python.formatting.provider": "yapf",
"editor.formatOnSave": true
}
}
2 changes: 1 addition & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ http_archive(
"//tf_patches:cudnn_int8x32.diff",
"//tf_patches:f16_abi_clang.diff",
"//tf_patches:gpu_race_condition.diff",
"//tf_patches:stream_executor.diff",
"//tf_patches:grpc_version.diff",
"//tf_patches:stream_executor.diff",
"//tf_patches:thread_local_random.diff",
"//tf_patches:xplane.diff",
],
Expand Down
16 changes: 1 addition & 15 deletions build_torch_xla_libs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,6 @@ if [[ "$XLA_BAZEL_VERBOSE" == "1" ]]; then
VERBOSE="-s"
fi

SANDBOX_BASE="${XLA_SANDBOX_BASE}"
if [ -z "$XLA_SANDBOX_BASE" ]; then
SANDBOX_BASE="/tmp"
fi
if [[ "$XLA_SANDBOX_BUILD" == "1" ]]; then
BUILD_STRATEGY="sandboxed --sandbox_base=${SANDBOX_BASE}"
else
# We can remove this after https://github.com/bazelbuild/bazel/issues/15359 is resolved
# Use GCC locally since clang does not work except with sandboxing, and sandboxing causes pjrt crashes.
unset CXX
unset CC
BUILD_STRATEGY="local"
fi

if [[ "$TPUVM_MODE" == "1" ]]; then
OPTS+=(--config=tpu)
fi
Expand All @@ -73,7 +59,7 @@ fi
# TensorFlow and its dependencies may introduce warning flags from newer compilers
# that PyTorch and PyTorch/XLA's default compilers don't recognize. They become errors
# when '-Werror' is used. Therefore, suppress the warnings in .bazelrc or here.
bazel build $MAX_JOBS $VERBOSE --spawn_strategy=$BUILD_STRATEGY --show_progress_rate_limit=20 \
bazel build $MAX_JOBS $VERBOSE --show_progress_rate_limit=20 \
--define framework_shared_object=false -c "$MODE" "${OPTS[@]}" \
$XLA_CUDA_CFG //third_party/xla_client:libxla_computation_client.so

Expand Down
4 changes: 0 additions & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ ENV BUNDLE_LIBTPU "${tpuvm}"
# Maximum number of jobs to use for bazel build
ENV BAZEL_JOBS "${bazel_jobs}"

# This makes the bazel build behave more consistently, but runs slower.
ENV XLA_SANDBOX_BUILD "0"
ENV XLA_SANDBOX_BASE "/dev/shm"

# To get around issue of Cloud Build with recursive submodule update
# clone recursively from pytorch/xla if building docker image with
# cloud build. Otherwise, just use local.
Expand Down
4 changes: 0 additions & 4 deletions docker/experimental/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@ COPY .bazelversion .
COPY WORKSPACE .
COPY build_torch_xla_libs.sh .

# TODO: Remove this when it's not required anymore
ENV XLA_SANDBOX_BUILD=0
ENV XLA_SANDBOX_BASE "/dev/shm"

COPY torch_xla/ torch_xla/
COPY setup.py .
COPY xla_native_functions.yaml .
Expand Down
Loading

0 comments on commit 023d763

Please sign in to comment.