Skip to content

Commit

Permalink
make AccelerateRunner a subclass of Accelerator
Browse files Browse the repository at this point in the history
add TorchRunner
add DeepSpeedRunner
  • Loading branch information
ZhiyuanChen committed Dec 14, 2024
1 parent ec53dfe commit 23b7954
Show file tree
Hide file tree
Showing 25 changed files with 1,533 additions and 854 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- name: Install dependencies
run: pip install -r requirements.txt && pip install -e .
- name: Install dependencies for testing
run: pip install pytest pytest-cov torch torcheval torchmetrics torchvision accelerate
run: pip install pytest pytest-cov
- name: pytest
run: pytest --cov=materialx --cov-report=xml --cov-report=html .
- name: Upload coverage report for documentation
Expand Down
9 changes: 6 additions & 3 deletions danling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from lazy_imports import try_import

from danling import metrics, modules, optim, registry, runner, tensors, typing, utils
from danling import defaults, metrics, modules, optim, registry, runner, tensors, typing, utils

from .metrics import (
AverageMeter,
Expand All @@ -29,7 +29,7 @@
)
from .optim import LRScheduler
from .registry import GlobalRegistry, Registry
from .runner import AccelerateRunner, BaseRunner, TorchRunner
from .runner import AccelerateRunner, BaseRunner, Config, DeepSpeedRunner, TorchRunner
from .tensors import NestedTensor, PNTensor, tensor
from .utils import (
catch,
Expand All @@ -47,6 +47,7 @@
from .metrics import Metrics, MultiTaskMetrics

__all__ = [
"defaults",
"metrics",
"modules",
"optim",
Expand All @@ -55,9 +56,11 @@
"tensors",
"utils",
"typing",
"Config",
"BaseRunner",
"AccelerateRunner",
"TorchRunner",
"AccelerateRunner",
"DeepSpeedRunner",
"LRScheduler",
"Registry",
"GlobalRegistry",
Expand Down
15 changes: 8 additions & 7 deletions danling/runner/defaults.py → danling/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the LICENSE file for more details.

DEFAULT_RUN_NAME = "Run"
DEFAULT_EXPERIMENT_NAME = "DanLing"
DEFAULT_EXPERIMENT_ID = "xxxxxxxxxxxxxxxx"
DEFAULT_IGNORED_KEYS_IN_HASH = {
RUN_NAME = "Run"
EXPERIMENT_NAME = "DanLing"
EXPERIMENT_ID = "xxxxxxxxxxxxxxxx"
SEED = 1016
IGNORED_CONFIG_IN_HASH = {
"timestamp",
"iters",
"steps",
"epochs",
"iter",
"step",
"epoch",
"results",
"score_split",
"score",
Expand Down
8 changes: 6 additions & 2 deletions danling/metrics/metric_meter.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,13 @@ def set( # pylint: disable=W0237
name: str,
metric: MetricMeter | MetricMeters | Callable, # type: ignore[override]
) -> None:
if callable(metric):
from .metrics import Metrics

if isinstance(metric, Metrics):
metric = MetricMeters(**metric.metrics)
elif callable(metric):
metric = MetricMeter(metric)
if not isinstance(metric, (MetricMeter, MetricMeters)):
elif not isinstance(metric, (MetricMeter, MetricMeters)):
raise ValueError(
f"Expected {metric} to be an instance of MetricMeter or MetricMeters, but got {type(metric)}"
)
Expand Down
6 changes: 3 additions & 3 deletions danling/modules/mlp/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ def __init__(
super().__init__()
self.residual = residual
self.linear = nn.Linear(in_features, out_features, bias=bias)
self.norm = getattr(nn, norm)(out_features) if norm else nn.Identity()
self.activation = getattr(nn, activation)() if activation else nn.Identity()
self.norm = getattr(nn, norm)(out_features) if norm else None
self.activation = getattr(nn, activation)() if activation else None
self.dropout = nn.Dropout(dropout)
self.pool = getattr(nn, pool)(out_features) if pool else nn.Identity() if self.residual else None
self.pool = getattr(nn, pool)(out_features) if self.residual else None

def forward(self, x):
out = self.linear(x)
Expand Down
14 changes: 7 additions & 7 deletions danling/runner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,26 @@ The Runner of DanLing sets up the basic environment for running neural networks.

## Components

For cross-platform compatibilities, DanLing features a two-level Runner + RunnerState system.
For cross-platform compatibility, DanLing features a two-level Runner + Config system.

### PlatformRunner

PlatformRunner implements platform-specific features like `step` and `prepare`.

The Runner contains all runtime information that is irrelevant to the checkpoint (e.g. `world_size`, `rank`, etc.). All other information should be saved in `RunnerState`.
The Runner contains all runtime information that is irrelevant to the checkpoint (e.g. `world_size`, `rank`, etc.). All other information should be saved in `Config`.

Currently, only [`AccelerateRunner`][danling.runner.AccelerateRunner] is supported.

### [`BaseRunner`][danling.runner.BaseRunner]

[`BaseRunner`](danling.runner.BaseRunner) defines shared attributes and implements platform-agnostic features, including `init_logging`, `results` and `scores`.
[`BaseRunner`][danling.runner.BaseRunner] defines shared attributes and implements platform-agnostic features, including `init_logging`, `results` and `scores`.

### [`RunnerState`][danling.runner.RunnerState]
### [`Config`][danling.runner.Config]

[`RunnerState`][danling.runner.RunnerState] stores the state of a run (e.g. `epochs`, `run_id`, `network`, etc.).
[`Config`][danling.runner.Config] stores the state of a run (e.g. `epoch`, `run_id`, `network`, etc.).

With `RunnerState` and corresponding weights, you can resume a run from any point.
Therefore, all members in `RunnerState` will be saved in the checkpoint, and thus should be json serialisable.
With `Config` and corresponding weights, you can resume a run from any point.
Therefore, all members of `Config` are saved in the checkpoint, and thus should be JSON-serialisable.

## Experiments Management

Expand Down
9 changes: 5 additions & 4 deletions danling/runner/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,20 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the LICENSE file for more details.

from . import defaults
from .accelerate_runner import AccelerateRunner
from .base_runner import BaseRunner
from .state import RunnerState
from .config import Config
from .deepspeed_runner import DeepSpeedRunner
from .torch_runner import TorchRunner
from .utils import on_local_main_process, on_main_process

__all__ = [
"RunnerState",
"Config",
"BaseRunner",
"TorchRunner",
"AccelerateRunner",
"DeepSpeedRunner",
"TorchRunner",
"on_main_process",
"on_local_main_process",
"defaults",
]
Loading

0 comments on commit 23b7954

Please sign in to comment.