Skip to content

Commit

Permalink
fix: make exla build resilient to stale upgrades (#1548)
Browse files Browse the repository at this point in the history
Co-authored-by: José Valim <[email protected]>
Co-authored-by: Jonatan Kłosko <[email protected]>
  • Loading branch information
3 people authored Oct 28, 2024
1 parent c82702b commit 5eb444e
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 14 deletions.
4 changes: 2 additions & 2 deletions exla/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib
XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include

# Cache configuration
EXLA_CACHE_SO = cache/libexla.so
EXLA_CACHE_OBJ_DIR = cache/objs
EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so
EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs

# Private configuration
EXLA_DIR = c_src/exla
Expand Down
20 changes: 20 additions & 0 deletions exla/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th

For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4).

## Troubleshooting

EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality.
If for any reason these fail to compile or load, troubleshooting can be tricky.

We recommend following the steps below:

1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files:
* `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones).
* `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds.

Additional notes on compilation:
* Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important.
* Remember to save the compilation logs from this step for further debugging.
* It is a good idea to save the `cache/<version>/libexla.so` file so that the team can inspect its contents if needed.
2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up.
This exception should provide more information on what the issue is when loading the NIF. Share these logs in an issue on GitHub
so that the Nx team can investigate further.


## Contributing

### Building locally
Expand Down
15 changes: 14 additions & 1 deletion exla/lib/exla/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@ defmodule EXLA.NIF do

# Loads the native `libexla` library for this module.
#
# Builds the library path from the `:exla` application's priv directory and
# hands it to `:erlang.load_nif/2`. Returns `:ok` when the library loads and
# raises (a `RuntimeError`, since `raise` is given a plain message) with the
# loader's reason and text when it does not.
#
# Presumably registered through an `@on_load` attribute; that attribute is
# outside this view, so confirm against the full module.
def __on_load__ do
# Charlist path without a file extension, as passed to `:erlang.load_nif/2`.
path = :filename.join(:code.priv_dir(:exla), ~c"libexla")
# NOTE(review): this bare call duplicates the one inside the `case` below and
# its result is discarded. It looks like the pre-change line of the diff;
# confirm it is not meant to remain alongside the `case`.
:erlang.load_nif(path, 0)

case :erlang.load_nif(path, 0) do
:ok ->
:ok

# `reason` is inspected and `text` is interpolated as-is into the message so
# the failure is visible to whoever starts the application.
{:error, {reason, text}} ->
raise """
Failed to load NIF library.
Follow the steps in the :exla README Troubleshooting section for more information.
#{inspect(reason)}
#{text}
"""
end
end

def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef)
Expand Down
52 changes: 46 additions & 6 deletions exla/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ defmodule EXLA.MixProject do

%{
"MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}",
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv,
"EXLA_VERSION" => "#{@version}"
}
end,
make_args: make_args
Expand Down Expand Up @@ -133,7 +134,38 @@ defmodule EXLA.MixProject do
{:ok, []}
end

defp cached_make(_) do
defp cached_make(args) do
force_rebuild_mode =
case System.get_env("EXLA_FORCE_REBUILD", "") do
"" ->
:none

"0" ->
:none

"partial" ->
:partial

"true" ->
:full

"1" ->
:full

value ->
Mix.raise(
"invalid value for EXLA_FORCE_REBUILD: '#{value}'. Expected one of: partial, true"
)
end

File.mkdir_p!("cache/#{@version}")

# Only a full rebuild (EXLA_FORCE_REBUILD=true or 1) discards the intermediate
# .o files. A partial rebuild must keep them and drop only the libexla.so
# caches, as described in the README Troubleshooting section.
if force_rebuild_mode == :full do
  Mix.shell().info("Removing cached .o files in cache/#{@version}/objs")
  File.rm_rf!("cache/#{@version}/objs")
end

contents =
for path <- Path.wildcard("c_src/**/*"),
{:ok, contents} <- [File.read(path)],
Expand All @@ -148,19 +180,27 @@ defmodule EXLA.MixProject do
"elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}"

cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"])
cached? = File.exists?(cached_so)
cached? = File.exists?(cached_so) and force_rebuild_mode == :none

# A forced rebuild (partial or full) drops both the project-local libexla.so
# and the copy stored in the global cache directory, so neither can be reused.
if force_rebuild_mode in [:partial, :full] do
  Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so")
  File.rm_rf!("cache/#{@version}/libexla.so")

  Mix.shell().info("Removing libexla.so cache at #{cached_so}")
  # Use rm_rf!/1 rather than rm!/1: the global cache entry may not exist (for
  # example on a first build, or when the cache key has changed), and rm!/1
  # would raise File.Error in that case and abort the rebuild.
  File.rm_rf!(cached_so)
end

if cached? do
Mix.shell().info("Using libexla.so from #{cached_so}")
File.cp!(cached_so, "cache/libexla.so")
File.cp!(cached_so, "cache/#{@version}/libexla.so")
end

result = Mix.Tasks.Compile.ElixirMake.run([])
result = Mix.Tasks.Compile.ElixirMake.run(args)

if not cached? and match?({:ok, _}, result) do
Mix.shell().info("Caching libexla.so at #{cached_so}")
File.mkdir_p!(Path.dirname(cached_so))
File.cp!("cache/libexla.so", cached_so)
File.cp!("cache/#{@version}/libexla.so", cached_so)
end

result
Expand Down
9 changes: 4 additions & 5 deletions exla/test/exla/device_memory_sharing_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do
end

@tag :cuda_required
test "ipc handles don't crash the runtime when :local mode is selected" do
assert {:error, ~c"Invalid pointer size for selected mode."} ==
test "invalid ipc handles don't crash the runtime" do
assert {:error, ~c"Unable to get pointer for IPC handle."} ==
Nx.from_pointer(
{EXLA.Backend, client: :cuda},
Enum.to_list(0..63),
%Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4},
{:f, 32},
{1},
mode: :local
{1}
)
end
end

0 comments on commit 5eb444e

Please sign in to comment.