From 8d69815db9617a9e720927c39ad5bde01fd06f0f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng
Date: Mon, 27 Jun 2022 01:24:33 -0700
Subject: [PATCH] [Docs] Link examples/opt_serving/README.md to docs/tutorials/opt_serving.md (#548)

---
 docker/build_doc.Dockerfile           |   2 +-
 docs/conf.py                          |   3 +-
 docs/index.rst                        |   2 +-
 docs/publish.py                       |   4 +-
 docs/tutorials/opt_serving.md         |   1 +
 docs/tutorials/opt_serving.rst        | 135 --------------------------
 examples/opt_serving/README.md        |  11 ++-
 examples/opt_serving/model/wrapper.py |   3 +
 examples/opt_serving/textgen_demo.py  |   2 +-
 9 files changed, 19 insertions(+), 144 deletions(-)
 create mode 120000 docs/tutorials/opt_serving.md
 delete mode 100644 docs/tutorials/opt_serving.rst

diff --git a/docker/build_doc.Dockerfile b/docker/build_doc.Dockerfile
index 0dd580334..f8c82515b 100644
--- a/docker/build_doc.Dockerfile
+++ b/docker/build_doc.Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get install -y coinor-cbc glpk-utils python3-virtualenv
 RUN virtualenv --python=python3.8 python3.8-env
 RUN source python3.8-env/bin/activate && pip install --upgrade pip \
     && pip install numpy==1.19.5 setuptools wheel six auditwheel \
-    sphinx sphinx-rtd-theme sphinx-gallery matplotlib
+    sphinx sphinx-rtd-theme sphinx-gallery myst-parser matplotlib
 COPY scripts/build_doc.sh /build_doc.sh
 RUN chmod +x build_doc.sh
 ENTRYPOINT ["/build_doc.sh"]
diff --git a/docs/conf.py b/docs/conf.py
index 79284d312..bc059a8ab 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,7 +33,8 @@
     'sphinx_gallery.gen_gallery',
     'sphinx.ext.napoleon',
     'sphinx.ext.intersphinx',
-    ]
+    'myst_parser',
+]
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
diff --git a/docs/index.rst b/docs/index.rst
index 928a6ec0b..2fa34b4c9 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,7 +19,7 @@ Alpa can automatically generate distributed execution plans that unify data, oper
    tutorials/pipeshard_parallelism.rst
    tutorials/alpa_vs_pmap.rst
    tutorials/perf_tuning_guide.rst
-   tutorials/opt_serving.rst
+   tutorials/opt_serving.md
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/publish.py b/docs/publish.py
index 534dcc20f..4fa3eee89 100755
--- a/docs/publish.py
+++ b/docs/publish.py
@@ -11,7 +11,9 @@ def run_cmd(cmd):
 
 run_cmd(f"cd $ALPA_SITE_PATH; git pull")
-run_cmd("rm -rf $ALPA_SITE_PATH/*")
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")
+
 run_cmd("cp -r _build/html/* $ALPA_SITE_PATH")
 
 cmd_message = f"Archive {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
diff --git a/docs/tutorials/opt_serving.md b/docs/tutorials/opt_serving.md
new file mode 120000
index 000000000..d2992e0c8
--- /dev/null
+++ b/docs/tutorials/opt_serving.md
@@ -0,0 +1 @@
+../../examples/opt_serving/README.md
\ No newline at end of file
diff --git a/docs/tutorials/opt_serving.rst b/docs/tutorials/opt_serving.rst
deleted file mode 100644
index 17220daae..000000000
--- a/docs/tutorials/opt_serving.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-Serving OPT-175B using Alpa
-===========================
-
-This tutorial provides a guide to set up a serving system to serve the largest available pretrained language model, OPT-175B.
-
-As a serving system, Alpa provides the following unique advantages:
-
-- **Support commodity hardware**: With Alpa, you can serve OPT-175B using your in-house GPU cluster, without needing the latest generations of A100 80GB GPUs nor fancy InfiniBand connections -- no hardware constraints!
-
-- **Flexible parallelism strategies**: Alpa will automatically figure out the appropriate model-parallel strategies based on your cluster setup.
-
-In this example, we use Alpa to serve the open-source OPT model, supporting all sizes ranging from 125M to 175B.
-Specifically, Alpa provides:
-
-- A **backend** to perform model-parallel distributed inference for the large OPT models;
-
-- A **web frontend** to collect and batch inference requests from users.
-
-.. note::
-
-   The pre-trained OPT model weights can be obtained from the `Metaseq download page <https://github.com/facebookresearch/metaseq/tree/main/projects/OPT>`_. Usages of
-   the pretrained model weights are subject to their `license `_.
-
-.. note::
-
-   You will need at least 350GB memory to serve the OPT-175B model. For example, you can use 4 x AWS p3.16xlarge instances, which provide 4 (instances) x 8 (GPU/instance) x 16 (GB/GPU) = 512 GB memory.
-   You can also follow this guide to set up a serving system to serve smaller versions of OPT, such as OPT-66B, OPT-30B, etc.
-   Pick an appropriate size from the `OPT weight release page `_ based on your available resources.
-
-Demo
-----
-Use the huggingface/transformers interface and the Alpa backend for distributed inference on a Ray cluster.
-
-.. code:: python
-
-   from transformers import AutoTokenizer
-   from examples.opt_serving.model.wrapper import get_model
-
-   # Load the tokenizer. We have to use the 30B version because
-   # other versions have some issues. The 30B version works for all OPT models.
-   tokenizer = AutoTokenizer.from_pretrained("facebook/opt-30b", use_fast=False)
-   tokenizer.add_bos_token = False
-
-   # Load the model
-   model = get_model(model_name="alpa/opt-2.7b",
-                     device="cuda",
-                     path="/home/ubuntu/opt_weights/")
-
-   # Generate
-   prompt = "Paris is the capital city of "
-
-   input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
-   output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
-   generated_string = tokenizer.batch_decode(output, skip_special_tokens=True)
-
-   print(generated_string)
-
-Requirements
-------------
-1. Install Alpa following the `installation guide `_.
-
-2. Install additional requirements for serving:
-
-.. code:: bash
-
-   pip3 install transformers flask cython
-
-   # Install torch corresponding to your CUDA version, e.g., for CUDA 11.3:
-   pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
-
-3. Compile several cython files for faster data processing:
-
-.. code:: bash
-
-   cd examples/opt_serving && bash build.sh
-
-Get OPT Weights
----------------
-There are two ways you can obtain the pretrained OPT weights.
-
-1. You can download the original OPT weights released by `Metaseq <https://github.com/facebookresearch/metaseq/tree/main/projects/OPT>`_,
-then use our script `convert_to_numpy_weight.py `_ to convert them into Alpa-compatible formats.
-
-2. We provide links to download the preprocessed 125M and 2.7B models below. For other sizes of OPT, please join the `Alpa slack `_ to request a copy from the Alpa developer team.
-
-   - `OPT-125M weights `_
-   - `OPT-2.7B weights `_
-
-Run and Benchmark Generation in the Command Line
-------------------------------------------------
-
-- Run generation using the 125M model with the PyTorch/HuggingFace backend:
-
-.. code:: bash
-
-   cd benchmark
-   python3 benchmark_text_gen.py --model facebook/opt-125m --path [PATH_TO_WEIGHT]
-
-- Run generation using the 125M model with the JAX backend in debug mode to output the generated text:
-
-.. code:: bash
-
-   python3 benchmark_text_gen.py --model jax/opt-125m --path [PATH_TO_WEIGHT] --debug
-
-- Run model-parallel generation using the 2.7B model with Alpa:
-
-.. code:: bash
-
-   ray start --head
-   python3 benchmark_text_gen.py --model alpa/opt-2.7b --path [PATH_TO_WEIGHT] --debug
-
-- Run distributed generation with the 175B model using Alpa. Note you will need >350GB total GPU memory in the entire cluster to successfully run the inference.
-
-.. code:: bash
-
-   # Remember to start Ray on the entire cluster before running the generation
-   python3 benchmark_text_gen.py --model alpa/opt-175b --path [PATH_TO_WEIGHT] --debug
-
-Launch a Web Server to Serve the OPT models
--------------------------------------------
-
-Launch the web server:
-
-.. code:: bash
-
-   # Serve the OPT-175B model at port 10001
-   python3 interactive_hosted.py --model alpa/opt-175b --port 10001 --path [PATH_TO_WEIGHT]
-
-Then open ``https://[IP-ADDRESS]:10001`` in your browser to try out the model!
-
-
-License
--------
-
-The use of the OPT pretrained weights is subject to the `Model License `_ by Metaseq.
diff --git a/examples/opt_serving/README.md b/examples/opt_serving/README.md
index 2db338150..f2abb0311 100644
--- a/examples/opt_serving/README.md
+++ b/examples/opt_serving/README.md
@@ -1,4 +1,7 @@
-# Examples: Serving OPT-175B using Alpa
+# Serving OPT-175B using Alpa
+
+This tutorial shows how to set up a serving system to serve the largest available pretrained language model, OPT-175B.
+
 As a serving system, Alpa provides the following unique advantages:
 - **Support commodity hardware**: With Alpa, you can serve OPT-175B using your in-house GPU cluster, without needing the latest generations of A100 80GB GPUs nor fancy InfiniBand connections -- no hardware constraints!
 - **Flexible parallelism strategies**: Alpa will automatically figure out the appropriate model-parallel strategies based on your cluster setup.
@@ -32,7 +35,7 @@ model = get_model(model_name="alpa/opt-2.7b",
                   path="/home/ubuntu/opt_weights/")
 
 # Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"
 
 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
@@ -55,7 +58,7 @@ pip3 install torch torchvision torchaudio --extra-index-url https://download.pyt
 cd examples/opt_serving && bash build.sh
 ```
 
-## Get OPT weights
+## Get OPT Weights
 There are two ways you can obtain the pretrained OPT weights.
 
 1. You can download the original OPT weights released by [Metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT),
@@ -92,7 +95,7 @@ Run distributed generation with the 175B model using Alpa. Note you will need >3
 python3 benchmark_text_gen.py --model alpa/opt-175b --path [PATH_TO_WEIGHT] --debug
 ```
 
-# Launch a Web Server to Serve the OPT models
+## Launch a Web Server to Serve the OPT Models
 Launch the web server:
 ```shell
diff --git a/examples/opt_serving/model/wrapper.py b/examples/opt_serving/model/wrapper.py
index a6628d0f9..3334e0b94 100644
--- a/examples/opt_serving/model/wrapper.py
+++ b/examples/opt_serving/model/wrapper.py
@@ -224,6 +224,9 @@ def get_model(model_name,
         init_cache = init_cache_np(config, 1)
     else:
         assert "alpa/opt" in model_name
+        print(f"Load model {model_name} ... (This can take several minutes for very large models)")
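+        # alpa.init() connects to the running Ray cluster and sets up Alpa's distributed runtime.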
+
         alpa.init()
         num_pp_stages = max(2, alpa.get_global_cluster().num_hosts)
         config = get_opt_config(name, num_pp_stages=num_pp_stages, dtype=dtype)
diff --git a/examples/opt_serving/textgen_demo.py b/examples/opt_serving/textgen_demo.py
index b092bf31e..49be90a88 100644
--- a/examples/opt_serving/textgen_demo.py
+++ b/examples/opt_serving/textgen_demo.py
@@ -13,7 +13,7 @@
                   path="/home/ubuntu/opt_weights/")
 
 # Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"
 
 input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
 output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
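
--
Postscript (not part of the commit): the tutorial above says OPT-175B needs at
least 350GB of total GPU memory, e.g., 4 x AWS p3.16xlarge = 4 x 8 GPUs x 16 GB
= 512 GB. Below is a minimal sketch for checking that requirement against a
running Ray cluster before launching the server. Note that
ray.cluster_resources() reports only the GPU count, so GB_PER_GPU is an
assumption you must set for your hardware, and the file name is hypothetical.

    # check_gpu_mem.py -- hypothetical helper, not part of this commit
    import ray

    GB_PER_GPU = 16    # assumption: 16 GB V100s, as on AWS p3.16xlarge
    REQUIRED_GB = 350  # minimum total GPU memory for OPT-175B, per the tutorial

    ray.init(address="auto")  # connect to the already-running Ray cluster
    num_gpus = int(ray.cluster_resources().get("GPU", 0))
    total_gb = num_gpus * GB_PER_GPU
    verdict = "sufficient" if total_gb >= REQUIRED_GB else "insufficient"
    print(f"{num_gpus} GPUs x {GB_PER_GPU} GB = {total_gb} GB total "
          f"({verdict} for OPT-175B)")

On the tutorial's example cluster this prints 32 GPUs x 16 GB = 512 GB total
(sufficient for OPT-175B), matching the arithmetic in the note above.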