Add warning for problematic libraries #2151

Merged: 5 commits, Nov 17, 2023
Changes from 4 commits
13 changes: 12 additions & 1 deletion src/accelerate/launchers.py
@@ -19,7 +19,7 @@
import torch

from .state import AcceleratorState, PartialState
-from .utils import PrecisionType, PrepareForLaunch, is_mps_available, patch_environment
+from .utils import PrecisionType, PrepareForLaunch, are_libraries_initialized, is_mps_available, patch_environment


def test_launch():
@@ -142,6 +142,17 @@ def train(*args):
"inside your training function. Restart your notebook and make sure no cells initializes an "
"`Accelerator`."
)
+# Check for specific libraries known to initialize CUDA that users constantly use
+problematic_imports = are_libraries_initialized("bitsandbytes")
+if len(problematic_imports) > 0:
+    err = (
+        "Could not start distributed process. Libraries known to initialize CUDA upon import have been "
+        "imported already. Please keep these imports inside your training function to try and help with this:"
+    )
+    for lib_name in problematic_imports:
+        err += f"\n\t* `{lib_name}`"
+    raise RuntimeError(err)

# torch.distributed will expect a few environment variable to be here. We set the ones common to each
# process here (the other ones will be set be the launcher).
with patch_environment(
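To make the intent of this guard concrete, here is a minimal illustrative sketch of the situation it now catches, assuming `bitsandbytes` is installed and initializes CUDA on import (the training function is made up for the example):

```python
# Illustration only: importing bitsandbytes at notebook top level initializes CUDA in the
# parent process, which breaks the workers that notebook_launcher later forks/spawns.
import bitsandbytes  # noqa: F401  (assumed installed)

from accelerate import notebook_launcher


def training_function():
    from accelerate import Accelerator

    accelerator = Accelerator()
    accelerator.print(accelerator.state)


# With this change the launcher raises a RuntimeError naming `bitsandbytes` up front,
# instead of failing later with an opaque CUDA re-initialization error.
notebook_launcher(training_function, args=(), num_processes=2)
```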
33 changes: 25 additions & 8 deletions src/accelerate/test_utils/scripts/test_notebook.py
@@ -1,17 +1,34 @@
# Test file to ensure that in general certain situational setups for notebooks work.
-import argparse
+import os
+
+from pytest import raises

from accelerate import PartialState, notebook_launcher
+from accelerate.test_utils import require_bnb
+from accelerate.utils import is_bnb_available
+
+
+def basic_function():
+    # Just prints the PartialState
+    print(f"PartialState:\n{PartialState()}")

-parser = argparse.ArgumentParser()
-parser.add_argument("--num_processes", type=int, default=1)
-args = parser.parse_args()

+NUM_PROCESSES = int(os.environ.get("ACCELERATE_NUM_PROCESSES", 1))

-def function():
-    print(f"PartialState:\n{PartialState()}")
+
+def test_can_initialize():
+    notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)
+
+
+@require_bnb
+def test_problematic_imports():
+    with raises(RuntimeError, match="Please keep these imports"):
+        notebook_launcher(basic_function, (), num_processes=NUM_PROCESSES)

-if __name__ == "__main__":
-    notebook_launcher(function, num_processes=int(args.num_processes))
+def main():
+    print("Test basic notebook can be ran")
+    test_can_initialize()
+    if is_bnb_available():
+        print("Test problematic imports (bnb)")
+        test_problematic_imports()
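The recommended counterpart to the failing sketch shown earlier looks roughly like this; the lazy import inside the training function is an illustration, not part of this PR:

```python
from accelerate import notebook_launcher


def training_function():
    # Keep CUDA-initializing imports inside the training function so each spawned
    # worker performs the import itself, after the processes have been created.
    import bitsandbytes as bnb  # noqa: F401  (assumed installed)

    from accelerate import Accelerator

    accelerator = Accelerator()
    accelerator.print("workers initialized cleanly")


notebook_launcher(training_function, args=(), num_processes=2)
```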
6 changes: 4 additions & 2 deletions src/accelerate/test_utils/testing.py
@@ -423,13 +423,15 @@ class SubprocessCallException(Exception):
pass


-def run_command(command: List[str], return_stdout=False):
+def run_command(command: List[str], return_stdout=False, env=None):
"""
Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
if an error occured while running `command`
"""
+if env is None:
+    env = os.environ.copy()
try:
-output = subprocess.check_output(command, stderr=subprocess.STDOUT)
+output = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env)
if return_stdout:
if hasattr(output, "decode"):
output = output.decode("utf-8")
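A brief usage sketch of the extended helper follows; the command and the environment variable chosen here are placeholders, not part of the PR:

```python
import os

from accelerate.test_utils.testing import run_command

# Passing an explicit copy keeps per-test environment tweaks from leaking elsewhere.
env = os.environ.copy()
env["OMP_NUM_THREADS"] = "1"

stdout = run_command(["python", "-c", "print('ok')"], return_stdout=True, env=env)
print(stdout)
```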
8 changes: 7 additions & 1 deletion src/accelerate/utils/__init__.py
@@ -37,7 +37,13 @@
TensorInformation,
TorchDynamoPlugin,
)
-from .environment import get_int_from_env, parse_choice_from_env, parse_flag_from_env, str_to_bool
+from .environment import (
+    are_libraries_initialized,
+    get_int_from_env,
+    parse_choice_from_env,
+    parse_flag_from_env,
+    str_to_bool,
+)
from .imports import (
get_ccl_version,
is_4bit_bnb_available,
9 changes: 9 additions & 0 deletions src/accelerate/utils/environment.py
@@ -13,6 +13,8 @@
# limitations under the License.

import os
+import sys
+from typing import List


def str_to_bool(value) -> int:
@@ -48,3 +50,10 @@ def parse_flag_from_env(key, default=False):
def parse_choice_from_env(key, default="no"):
value = os.environ.get(key, str(default))
return value


+def are_libraries_initialized(*library_names: str) -> List[str]:
+    """
+    Checks if any of `library_names` are imported in the environment. Will return any names that are.
+    """
+    return [lib_name for lib_name in library_names if lib_name in sys.modules]
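A small usage sketch of the new helper, assuming a build of `accelerate` that includes this change; `json` stands in for any module that happens to be imported already:

```python
import json  # noqa: F401  -- any module already present in sys.modules is reported

from accelerate.utils import are_libraries_initialized

# Returns the subset of the given names that have already been imported in this process.
print(are_libraries_initialized("json", "bitsandbytes"))
# -> ["json"] if bitsandbytes has not been imported yet
```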
23 changes: 10 additions & 13 deletions tests/test_multigpu.py
@@ -21,7 +21,8 @@
import accelerate
from accelerate import Accelerator
from accelerate.big_modeling import dispatch_model
-from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu, skip
+from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu
+from accelerate.test_utils.testing import run_command
from accelerate.utils import patch_environment


@@ -33,6 +34,9 @@ def setUp(self):
mod_file.split(os.path.sep)[:-1] + ["scripts", "test_distributed_data_loop.py"]
)
self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"])
+self.notebook_launcher_path = os.path.sep.join(
+    mod_file.split(os.path.sep)[:-1] + ["scripts", "test_notebook.py"]
+)

@require_multi_gpu
def test_multi_gpu(self):
@@ -66,23 +70,16 @@ def test_distributed_data_loop(self):
with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
execute_subprocess_async(cmd, env=os.environ.copy())

-# Need to see why this test raises forking issues when ran as a suite
-@skip

Comment from the PR author on lines -69 to -70: essentially the `notebook_launcher` doesn't play well with async subprocesses, so executing the script as a plain, non-async subprocess makes this test work reliably.

@require_multi_gpu
def test_notebook_launcher(self):
    """
-    This test checks that the `notebook_launcher` will be able to intialize
-    a `PartialState` without issue
+    This test checks a variety of situations and scenarios
+    with the `notebook_launcher`
    """
-    cmd = [
-        "python",
-        "-m",
-        "accelerate.test_utils.scripts.test_notebook",
-        "--num_processes",
-        str(torch.cuda.device_count()),
-    ]
+    cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.notebook_launcher_path]
+    print(f"Running {cmd}")
    with patch_environment(omp_num_threads=1):
-        execute_subprocess_async(cmd, env=os.environ.copy())
+        run_command(cmd, env=os.environ.copy())


if __name__ == "__main__":
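As a footnote to the author's comment above, a rough sketch of the synchronous execution path now used, under the assumption that avoiding an asyncio event loop in the parent process is what keeps the launcher's own process handling reliable:

```python
import os
import subprocess


def run_sync(cmd, env=None):
    # A plain blocking subprocess call in the spirit of run_command: no event loop in the
    # parent process, so notebook_launcher's forked/spawned workers are left undisturbed.
    if env is None:
        env = os.environ.copy()
    return subprocess.check_output(cmd, stderr=subprocess.STDOUT, env=env)
```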