From 5eb444e5b0872421db2f6e2aa66d4df62d2b2956 Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:50:32 -0300 Subject: [PATCH] fix: make exla build resilient to stale upgrades (#1548) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim Co-authored-by: Jonatan Kłosko --- exla/Makefile | 4 +- exla/README.md | 20 +++++++ exla/lib/exla/nif.ex | 15 +++++- exla/mix.exs | 52 ++++++++++++++++--- exla/test/exla/device_memory_sharing_test.exs | 9 ++-- 5 files changed, 86 insertions(+), 14 deletions(-) diff --git a/exla/Makefile b/exla/Makefile index a371447d101..695c7f9409d 100644 --- a/exla/Makefile +++ b/exla/Makefile @@ -8,8 +8,8 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include # Cache configuration -EXLA_CACHE_SO = cache/libexla.so -EXLA_CACHE_OBJ_DIR = cache/objs +EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so +EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs # Private configuration EXLA_DIR = c_src/exla diff --git a/exla/README.md b/exla/README.md index 3091555796a..2ffe144ff72 100644 --- a/exla/README.md +++ b/exla/README.md @@ -48,6 +48,26 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4). +## Troubleshooting + +EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality. +If for any reason these fail to compile or load, troubleshooting can be tricky. + +We recommend following the steps below: + + 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). 
Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files: + * `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones). + * `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds. + + Additional notes on compilation: + * Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. + * Remember to save the compilation logs from this step for further debugging. + * It is a good idea to save the `cache/<version>/libexla.so` file so that the team can inspect its contents if needed. + 2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up. + This exception should provide more information on what the issue is when loading the NIF. Share these logs in an issue on GitHub + so that the Nx team can investigate further. + + ## Contributing ### Building locally diff --git a/exla/lib/exla/nif.ex b/exla/lib/exla/nif.ex index be0567cc0aa..023a0bcbd21 100644 --- a/exla/lib/exla/nif.ex +++ b/exla/lib/exla/nif.ex @@ -4,7 +4,20 @@ defmodule EXLA.NIF do def __on_load__ do path = :filename.join(:code.priv_dir(:exla), ~c"libexla") - :erlang.load_nif(path, 0) + + case :erlang.load_nif(path, 0) do + :ok -> + :ok + + {:error, {reason, text}} -> + raise """ + Failed to load NIF library. + Follow the steps in the :exla README Troubleshooting section for more information. 
+ + #{inspect(reason)} + #{text} + """ + end end def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef) diff --git a/exla/mix.exs b/exla/mix.exs index 184a48cb94b..17ef1915d4b 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -35,7 +35,8 @@ defmodule EXLA.MixProject do %{ "MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}", - "CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv + "CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv, + "EXLA_VERSION" => "#{@version}" } end, make_args: make_args @@ -133,7 +134,38 @@ defmodule EXLA.MixProject do {:ok, []} end - defp cached_make(_) do + defp cached_make(args) do + force_rebuild_mode = + case System.get_env("EXLA_FORCE_REBUILD", "") do + "" -> + :none + + "0" -> + :none + + "partial" -> + :partial + + "true" -> + :full + + "1" -> + :full + + value -> + Mix.raise( + "invalid value for EXLA_FORCE_REBUILD: '#{value}'. Expected one of: partial, true" + ) + end + + File.mkdir_p!("cache/#{@version}") + + # remove only in full mode + if force_rebuild_mode in [:partial, :full] do + Mix.shell().info("Removing cached .o files in cache/#{@version}/objs") + File.rm_rf!("cache/#{@version}/objs") + end + contents = for path <- Path.wildcard("c_src/**/*"), {:ok, contents} <- [File.read(path)], @@ -148,19 +180,27 @@ defmodule EXLA.MixProject do "elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}" cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"]) - cached? = File.exists?(cached_so) + cached? = File.exists?(cached_so) and force_rebuild_mode == :none + + if force_rebuild_mode in [:partial, :full] do + Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so") + File.rm_rf!("cache/#{@version}/libexla.so") + + Mix.shell().info("Removing libexla.so cache at #{cached_so}") + File.rm!(cached_so) + end if cached? 
do Mix.shell().info("Using libexla.so from #{cached_so}") - File.cp!(cached_so, "cache/libexla.so") + File.cp!(cached_so, "cache/#{@version}/libexla.so") end - result = Mix.Tasks.Compile.ElixirMake.run([]) + result = Mix.Tasks.Compile.ElixirMake.run(args) if not cached? and match?({:ok, _}, result) do Mix.shell().info("Caching libexla.so at #{cached_so}") File.mkdir_p!(Path.dirname(cached_so)) - File.cp!("cache/libexla.so", cached_so) + File.cp!("cache/#{@version}/libexla.so", cached_so) end result diff --git a/exla/test/exla/device_memory_sharing_test.exs b/exla/test/exla/device_memory_sharing_test.exs index e986ea1ff83..09e54a42eb7 100644 --- a/exla/test/exla/device_memory_sharing_test.exs +++ b/exla/test/exla/device_memory_sharing_test.exs @@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do end @tag :cuda_required - test "ipc handles don't crash the runtime when :local mode is selected" do - assert {:error, ~c"Invalid pointer size for selected mode."} == + test "invalid ipc handles don't crash the runtime" do + assert {:error, ~c"Unable to get pointer for IPC handle."} == Nx.from_pointer( {EXLA.Backend, client: :cuda}, - Enum.to_list(0..63), + %Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4}, {:f, 32}, - {1}, - mode: :local + {1} ) end end