From 8d69815db9617a9e720927c39ad5bde01fd06f0f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 27 Jun 2022 01:24:33 -0700
Subject: [PATCH] [Docs] Link examples/opt_serving/README.md to docs/tutorials/opt_serving.md (#548)

---
 docker/build_doc.Dockerfile           |   2 +-
 docs/conf.py                          |   3 +-
 docs/index.rst                        |   2 +-
 docs/publish.py                       |   4 +-
 docs/tutorials/opt_serving.md         |   1 +
 docs/tutorials/opt_serving.rst        | 135 --------------------------
 examples/opt_serving/README.md        |  11 ++-
 examples/opt_serving/model/wrapper.py |   3 +
 examples/opt_serving/textgen_demo.py  |   2 +-
 9 files changed, 19 insertions(+), 144 deletions(-)
 create mode 120000 docs/tutorials/opt_serving.md
 delete mode 100644 docs/tutorials/opt_serving.rst

diff --git a/docker/build_doc.Dockerfile b/docker/build_doc.Dockerfile
index 0dd580334..f8c82515b 100644
--- a/docker/build_doc.Dockerfile
+++ b/docker/build_doc.Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get install -y coinor-cbc glpk-utils python3-virtualenv
 RUN virtualenv --python=python3.8 python3.8-env
 RUN source python3.8-env/bin/activate && pip install --upgrade pip \
     && pip install numpy==1.19.5 setuptools wheel six auditwheel \
-    sphinx sphinx-rtd-theme sphinx-gallery matplotlib
+    sphinx sphinx-rtd-theme sphinx-gallery myst-parser matplotlib
 COPY scripts/build_doc.sh /build_doc.sh
 RUN chmod +x build_doc.sh
 ENTRYPOINT ["/build_doc.sh"]
diff --git a/docs/conf.py b/docs/conf.py
index 79284d312..bc059a8ab 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,7 +33,8 @@
     'sphinx_gallery.gen_gallery',
     'sphinx.ext.napoleon',
     'sphinx.ext.intersphinx',
-    ]
+    'myst_parser',
+]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
diff --git a/docs/index.rst b/docs/index.rst
index 928a6ec0b..2fa34b4c9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,7 +19,7 @@ Alpa can automatically generate distributed execution plans that unify data, oper
    tutorials/pipeshard_parallelism.rst
    tutorials/alpa_vs_pmap.rst
    tutorials/perf_tuning_guide.rst
-   tutorials/opt_serving.rst
+   tutorials/opt_serving.md
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/publish.py b/docs/publish.py
index 534dcc20f..4fa3eee89 100755
--- a/docs/publish.py
+++ b/docs/publish.py
@@ -11,7 +11,9 @@ def run_cmd(cmd):
 
 run_cmd(f"cd $ALPA_SITE_PATH; git pull")
-run_cmd("rm -rf $ALPA_SITE_PATH/*")
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")
+
 run_cmd("cp -r _build/html/* $ALPA_SITE_PATH")
 
 cmd_message = f"Archive {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
diff --git a/docs/tutorials/opt_serving.md b/docs/tutorials/opt_serving.md
new file mode 120000
index 000000000..d2992e0c8
--- /dev/null
+++ b/docs/tutorials/opt_serving.md
@@ -0,0 +1 @@
+../../examples/opt_serving/README.md
\ No newline at end of file
diff --git a/docs/tutorials/opt_serving.rst b/docs/tutorials/opt_serving.rst
deleted file mode 100644
index 17220daae..000000000
--- a/docs/tutorials/opt_serving.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-Serving OPT-175B using Alpa
-===========================
-
-This tutorial provides a guide to set up a serving system to serve the largest available pretrained language model, OPT-175B.
-
-As a serving system, Alpa provides the following unique advantages:
-
-- **Support commodity hardware**: With Alpa, you can serve OPT-175B using your in-house GPU cluster, without needing the latest generations of A100 80GB GPUs nor fancy InfiniBand connections -- no hardware constraints!
-
-- **Flexible parallelism strategies**: Alpa will automatically figure out the appropriate model-parallel strategies based on your cluster setup.
-
-In this example, we use Alpa to serve the open-source OPT model, supporting all sizes ranging from 125M to 175B.
-Specifically, Alpa provides:
-
-- A **backend** to perform model-parallel distributed inference for the large OPT models;
-
-- A **web frontend** to collect and batch inference requests from users.
-
-.. note::
-
-   The pre-trained OPT model weights can be obtained from the `Metaseq download page <https://github.com/facebookresearch/metaseq/tree/main/projects/OPT>`_. Usages of
-   the pretrained model weights are subject to their `license `_.
-
-.. note::
-
-   You will need at least 350GB memory to serve the OPT-175B model. For example, you can use 4 x AWS p3.16xlarge instances, which provide 4 (instances) x 8 (GPU/instance) x 16 (GB/GPU) = 512 GB memory.
-   You can also follow this guide to set up a serving system to serve smaller versions of OPT, such as OPT-66B, OPT-30B, etc.
-   Pick an appropriate size from the `OPT weight release page `_ based on your available resources.
-
-Demo
-----
-Use the huggingface/transformers interface and the Alpa backend for distributed inference on a Ray cluster.
-
-.. code:: python
-
-   from transformers import AutoTokenizer
-   from examples.opt_serving.model.wrapper import get_model
-
-   # Load the tokenizer. We have to use the 30B version because
-   # other versions have some issues. The 30B version works for all OPT models.
-   tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", use_fast=False)
-   tokenizer.add_bos_token = False
-
-   # Load the model
-   model = get_model(model_name="alpa/opt-2.7b",
-                     device="cuda",
-                     path="/home/ubuntu/opt_weights/")
-
-   # Generate
-   prompt = "Paris is the capital city of "
-
-   input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
-   output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
-   generated_string = tokenizer.batch_decode(output, skip_special_tokens=True)
-
-   print(generated_string)
-
-Requirements
-------------
-1. Install Alpa following the `installation guide `_.
-
-2. Install additional requirements for serving:
-
-.. code:: bash
-
-   pip3 install transformers flask cython
-
-   # Install torch corresponding to your CUDA version, e.g., for CUDA 11.3:
-   pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
-
-3. Compile several cython files for faster data processing:
-
-.. code:: bash
-
-   cd examples/opt_serving && bash build.sh
-
-Get OPT Weights
----------------
-There are two ways you can obtain the pretrained OPT weights.
-
-1. You can download the original OPT weights released by `Metaseq <https://github.com/facebookresearch/metaseq/tree/main/projects/OPT>`_,
-then use our script `convert_to_numpy_weight.py `_ to convert them into Alpa-compatible formats.
-
-2. We provide links to download the preprocessed 125M and 2.7B models below. For other sizes of OPT, please join the `Alpa slack `_ to request a copy from the Alpa developer team.
-
-   - `OPT-125M weights `_
-   - `OPT-2.7B weights `_
-
-Run and Benchmark Generation in the Command Line
-------------------------------------------------
-
-- Run generation using the 125M model with the PyTorch/HuggingFace backend:
-
-.. code:: bash
-
-   cd benchmark
-   python3 benchmark_text_gen.py --model facebook/opt-125m --path [PATH_TO_WEIGHT]
-
-- Run generation using the 125M model with the JAX backend in debug mode to output the generated text:
-
-.. code:: bash
-
-   python3 benchmark_text_gen.py --model jax/opt-125m --path [PATH_TO_WEIGHT] --debug
-
-- Run model-parallel generation using the 2.7B model with Alpa:
-
-.. code:: bash
-
-   ray start --head
-   python3 benchmark_text_gen.py --model alpa/opt-2.7b --path [PATH_TO_WEIGHT] --debug
-
-- Run distributed generation with the 175B model using Alpa. Note you will need >350GB total GPU memory in the entire cluster to successfully run the inference.
-
-.. code:: bash
-
-   # Remember to start Ray on the entire cluster before running the generation
-   python3 benchmark_text_gen.py --model alpa/opt-175b --path [PATH_TO_WEIGHT] --debug
-
-Launch a Web Server to Serve the OPT models
--------------------------------------------
-
-Launch the web server:
-
-.. code:: bash
-
-   # Serve the OPT-175B model at port 10001
-   python3 interactive_hosted.py --model alpa/opt-175b --port 10001 --path [PATH_TO_WEIGHT]
-
-Then open ``https://[IP-ADDRESS]:10001`` in your browser to try out the model!
-
-
-License
--------
-
-The use of the OPT pretrained weights is subject to the `Model License `_ by Metaseq.
diff --git a/examples/opt_serving/README.md b/examples/opt_serving/README.md
index 2db338150..f2abb0311 100644
--- a/examples/opt_serving/README.md
+++ b/examples/opt_serving/README.md
@@ -1,4 +1,7 @@
-# Examples: Serving OPT-175B using Alpa
+# Serving OPT-175B using Alpa
+
+This tutorial shows how to set up a serving system to serve the largest available pretrained language model, OPT-175B.
+
 As a serving system, Alpa provides the following unique advantages:
 - **Support commodity hardware**: With Alpa, you can serve OPT-175B using your in-house GPU cluster, without needing the latest generations of A100 80GB GPUs nor fancy InfiniBand connections -- no hardware constraints!
 - **Flexible parallelism strategies**: Alpa will automatically figure out the appropriate model-parallel strategies based on your cluster setup.
@@ -32,7 +35,7 @@ model = get_model(model_name="alpa/opt-2.7b",
                   path="/home/ubuntu/opt_weights/")
 
 # Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"
 
 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
@@ -55,7 +58,7 @@ pip3 install torch torchvision torchaudio --extra-index-url https://download.pyt
 cd examples/opt_serving && bash build.sh
 ```
 
-## Get OPT weights
+## Get OPT Weights
 There are two ways you can obtain the pretrained OPT weights.
 
 1. You can download the original OPT weights released by [Metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT),
@@ -92,7 +95,7 @@ Run distributed generation with the 175B model using Alpa. Note you will need >3
 python3 benchmark_text_gen.py --model alpa/opt-175b --path [PATH_TO_WEIGHT] --debug
 ```
 
-# Launch a Web Server to Serve the OPT models
+## Launch a Web Server to Serve the OPT Models
 Launch the web server:
 ```shell
diff --git a/examples/opt_serving/model/wrapper.py b/examples/opt_serving/model/wrapper.py
index a6628d0f9..3334e0b94 100644
--- a/examples/opt_serving/model/wrapper.py
+++ b/examples/opt_serving/model/wrapper.py
@@ -224,6 +224,9 @@ def get_model(model_name,
         init_cache = init_cache_np(config, 1)
     else:
         assert "alpa/opt" in model_name
+        print(f"Load model {model_name} ... (This can take several minutes for very large models)")
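+        # alpa.init() connects to the running Ray cluster and sets up Alpa's distributed runtime.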
+
         alpa.init()
         num_pp_stages = max(2, alpa.get_global_cluster().num_hosts)
         config = get_opt_config(name, num_pp_stages=num_pp_stages, dtype=dtype)
diff --git a/examples/opt_serving/textgen_demo.py b/examples/opt_serving/textgen_demo.py
index b092bf31e..49be90a88 100644
--- a/examples/opt_serving/textgen_demo.py
+++ b/examples/opt_serving/textgen_demo.py
@@ -13,7 +13,7 @@
                   path="/home/ubuntu/opt_weights/")
 
 # Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"
 
 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
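
--
Postscript (not part of the commit): the tutorial above says OPT-175B needs at
least 350GB of total GPU memory, e.g., 4 x AWS p3.16xlarge = 4 x 8 GPUs x 16 GB
= 512 GB. Below is a minimal sketch for checking that requirement against a
running Ray cluster before launching the server. Note that
ray.cluster_resources() reports only the GPU count, so GB_PER_GPU is an
assumption you must set for your hardware, and the file name is hypothetical.

    # check_gpu_mem.py -- hypothetical helper, not part of this commit
    import ray

    GB_PER_GPU = 16    # assumption: 16 GB V100s, as on AWS p3.16xlarge
    REQUIRED_GB = 350  # minimum total GPU memory for OPT-175B, per the tutorial

    ray.init(address="auto")  # connect to the already-running Ray cluster
    num_gpus = int(ray.cluster_resources().get("GPU", 0))
    total_gb = num_gpus * GB_PER_GPU
    verdict = "sufficient" if total_gb >= REQUIRED_GB else "insufficient"
    print(f"{num_gpus} GPUs x {GB_PER_GPU} GB = {total_gb} GB total "
          f"({verdict} for OPT-175B)")

On the tutorial's example cluster this prints 32 GPUs x 16 GB = 512 GB total
(sufficient for OPT-175B), matching the arithmetic in the note above.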