Simplify and relax dependencies (Take 2) (#818)
* Remove unused einops dependency

* Make Weights & Biases monitoring optional

- Move `wandb` to `./requirements/requirements-wandb.txt`
- Clean up `./deepy.py`
- Make GPT-NeoX not explode if `wandb` is not installed.
- Tell the user when `wandb` is not importable and explain how to fix it.
- Remove implicit dependence on `shortuuid`.
- Ensure that `wandb` is installed in Dockerfile.

* Relax many dependencies

* Remove usage of uuid.uuid4()

* Update Dockerfile

Add flash attention install

* Update logging.py to pass when wandb is unimportable

---------

Co-authored-by: Quentin Anthony <[email protected]>
EricHallahan and Quentin-Anthony authored Mar 9, 2023
1 parent e897c23 commit 9610391
Showing 9 changed files with 67 additions and 51 deletions.
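The common thread in the hunks below is an optional-dependency pattern for `wandb`: import it defensively and only complain when monitoring was explicitly requested. A minimal illustrative sketch of that pattern (not the project's exact code; the real hunks follow):

try:
    import wandb  # optional: only needed when W&B monitoring is enabled
except ModuleNotFoundError:
    wandb = None  # the project simply `pass`es; None keeps this sketch self-contained


def init_monitoring(use_wandb: bool) -> None:
    if not use_wandb:
        return  # monitoring disabled in the config: never touch the module
    if wandb is None:
        raise ModuleNotFoundError(
            "W&B monitoring was requested but `wandb` is not installed; "
            "install it or disable `use_wandb`."
        )
    wandb.init(project="neox")  # hypothetical project name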
3 changes: 3 additions & 0 deletions Dockerfile
@@ -88,10 +88,13 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
 #### Python packages
 RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
 COPY requirements/requirements.txt .
+COPY requirements/requirements-wandb.txt .
 COPY requirements/requirements-onebitadam.txt .
 COPY requirements/requirements-sparseattention.txt .
 RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
     pip install -r requirements-sparseattention.txt && \
+    pip install -r requirements-flashattention.txt && \
+    pip install -r requirements-wandb.txt && \
     pip install protobuf==3.20.* && \
     pip cache purge

29 changes: 16 additions & 13 deletions deepy.py
@@ -16,23 +16,26 @@
 import logging
 import os

-import deepspeed
-from deepspeed.launcher.runner import main
+import deepspeed.launcher.runner

-logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
-
-from megatron.neox_arguments import NeoXArgs
-from megatron.utils import get_wandb_api_key
-
-neox_args = NeoXArgs.consume_deepy_args()
-deepspeed_main_args = neox_args.get_deepspeed_main_args()
-
-# Extract wandb API key and inject into worker environments
-wandb_token = get_wandb_api_key(neox_args=neox_args)
-if wandb_token is not None:
-    deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
-    os.environ["WANDB_API_KEY"] = wandb_token
+
+def main():
+    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
+
+    from megatron.neox_arguments import NeoXArgs
+    from megatron.utils import get_wandb_api_key
+
+    neox_args = NeoXArgs.consume_deepy_args()
+    deepspeed_main_args = neox_args.get_deepspeed_main_args()
+
+    # Extract wandb API key and inject into worker environments
+    wandb_token = get_wandb_api_key(neox_args=neox_args)
+    if wandb_token is not None:
+        deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
+        os.environ["WANDB_API_KEY"] = wandb_token
+
+    deepspeed.launcher.runner.main(deepspeed_main_args)
+

 if __name__ == "__main__":
-    main(deepspeed_main_args)
+    main()
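A note on the launcher change above: DeepSpeed's runner forwards environment variables whose names match entries in `EXPORT_ENVS` to the worker processes it starts, so appending "WANDB_API_KEY" and setting it in `os.environ` makes the key available on every rank. A small sketch of what a worker process can then rely on (illustrative only):

import os

# On a process launched via deepy.py, the key is present only when the
# launcher found one and exported it as shown above.
api_key = os.environ.get("WANDB_API_KEY")
if api_key is None:
    # No key was propagated; W&B authentication must come from elsewhere
    # (for example a prior `wandb login` on the node).
    print("WANDB_API_KEY not set for this worker.")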
7 changes: 6 additions & 1 deletion megatron/logging.py
@@ -14,7 +14,12 @@

 import sys
 import torch
-import wandb
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass

 from megatron import mpu, print_rank_0
 from megatron.utils import report_memory
30 changes: 17 additions & 13 deletions megatron/neox_arguments/arguments.py
@@ -18,7 +18,6 @@
 import yaml
 import json
 import logging
-import shortuuid
 import copy
 import torch
 import argparse
@@ -278,13 +277,13 @@ def consume_deepy_args(cls):
             "--wandb_group",
             type=str,
             default=None,
-            help='Weights and Biases group name - used to group together "runs".',
+            help='Weights & Biases group name - used to group together "runs".',
         )
         group.add_argument(
             "--wandb_team",
             type=str,
             default=None,
-            help="Team name for Weights and Biases.",
+            help="Weights & Biases team name.",
         )

         group = parser.add_argument_group(title="Eval args")
@@ -372,11 +371,22 @@ def consume_deepy_args(cls):
             paths_to_yml_files=conf_files, overwrite_values=overwrite_values
         )

-        if neox_args.wandb_group is not None:
-            # concat the wandb group name with a uid to make sure it's unique
-            import wandb
+        if neox_args.use_wandb:
+            try:
+                import wandb

-            neox_args.wandb_group += "_" + wandb.util.generate_id()
+                # Check if the W&B group name is configured
+                if neox_args.wandb_group is None:
+                    # Set a randomized string as group name if no group name is provided
+                    neox_args.wandb_group = wandb.sdk.lib.runid.generate_id()
+                else:
+                    # Concatenate the W&B group name with a randomized string to ensure uniqueness.
+                    neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id()
+            except ModuleNotFoundError as e:
+                if e.name == "wandb":
+                    e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to a boolean false to disable Weights & Biases logging."
+                raise e

         neox_args.print()

         return neox_args
@@ -736,12 +746,6 @@ def calculate_derived(self):
         Derives additional configuration values necessary for training from the current config
         """

-        # wandb
-        # sets a unique wandb group
-        if self.wandb_group is None:
-            # if none is defined a uuid is set for the run
-            self.wandb_group = shortuuid.uuid()
-
         # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)
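The group-naming hunk above replaces `shortuuid.uuid()` and `wandb.util.generate_id()` with `wandb.sdk.lib.runid.generate_id()`, wandb's own short random run-id helper. A hedged sketch of what that produces ("my-group" is a made-up name; the exact characters vary per call):

from wandb.sdk.lib import runid

suffix = runid.generate_id()        # e.g. "k3j9x2ab" -- a short random id
group = "my-group" + "_" + suffix   # mirrors how neox_args.wandb_group is made unique
print(group)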
11 changes: 7 additions & 4 deletions megatron/utils.py
@@ -24,16 +24,19 @@
 from typing import Dict, List

 import requests
-import wandb
-from wandb import UsageError
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass

 import torch

 from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion

 from megatron import print_rank_0
 from megatron import mpu
-from deepspeed import PipelineEngine, DeepSpeedEngine
+
 from collections import deque

@@ -167,7 +170,7 @@ def init_wandb(neox_args):
            force=False,
            entity=neox_args.wandb_team,
        )
-    except UsageError as e:
+    except wandb.UsageError as e:
        neox_args.update_value("use_wandb", False)
        print(e)
        print(
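Referencing the exception as `wandb.UsageError` (instead of the removed top-level `from wandb import UsageError`) keeps `megatron/utils.py` importable when `wandb` is absent, since nothing from `wandb` has to be resolved at import time. A minimal sketch of the same idea (illustrative, not the project's `init_wandb`):

try:
    import wandb
except ModuleNotFoundError:
    wandb = None  # sketch only; the project leaves the name unbound


def try_init_wandb(team=None):
    # The exception class is looked up on the module only when needed.
    if wandb is None:
        return False
    try:
        wandb.init(entity=team)  # kwargs trimmed down for the sketch
        return True
    except wandb.UsageError as e:
        print(e)
        return False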
13 changes: 6 additions & 7 deletions requirements/requirements-dev.txt
@@ -1,8 +1,7 @@
-autopep8==1.5.6
-clang-format==13.0.1
-pre-commit~=2.17.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-forked==1.3.0
+autopep8>=1.5.6
+clang-format>=13.0.1
+pre-commit>=2.17.0
+pytest>=6.2.3
+pytest-cov>=2.11.1
+pytest-forked>=1.3.0
 pytest-xdist
-transformers~=4.16.2
2 changes: 1 addition & 1 deletion requirements/requirements-onebitadam.txt
@@ -1 +1 @@
-cupy-cuda111==8.6.0
+cupy-cuda111>=8.6.0
1 change: 1 addition & 0 deletions requirements/requirements-wandb.txt
@@ -0,0 +1 @@
+wandb>=0.10.28
22 changes: 10 additions & 12 deletions requirements/requirements.txt
@@ -1,16 +1,14 @@
-einops==0.3.0
-ftfy==6.0.1
+deepspeed
+ftfy>=6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
-huggingface_hub==0.11.0
-lm_eval==0.3.0
-mpi4py==3.0.3
-numpy==1.22.0
-pybind11==2.6.2
+huggingface_hub>=0.11.0
+lm_eval>=0.3.0
+mpi4py>=3.0.3
+numpy>=1.22.0
+pybind11>=2.6.2
 regex
 sentencepiece
 six
-tiktoken==0.1.2
-tokenizers==0.12.1
-transformers~=4.24.0
-wandb==0.10.28
+tiktoken>=0.1.2
+tokenizers>=0.12.1
+transformers>=4.24.0
