This repository has been archived by the owner on Oct 19, 2024. It is now read-only.

Commit

[Docs] Link examples/opt_serving/README.md to docs/tutorial/opt_serving.md (#548)
merrymercy authored Jun 27, 2022
1 parent 88b9332 commit 8d69815
Showing 9 changed files with 18 additions and 144 deletions.
2 changes: 1 addition & 1 deletion docker/build_doc.Dockerfile
@@ -9,7 +9,7 @@ RUN apt-get install -y coinor-cbc glpk-utils python3-virtualenv
RUN virtualenv --python=python3.8 python3.8-env
RUN source python3.8-env/bin/activate && pip install --upgrade pip \
&& pip install numpy==1.19.5 setuptools wheel six auditwheel \
-sphinx sphinx-rtd-theme sphinx-gallery matplotlib
+sphinx sphinx-rtd-theme sphinx-gallery myst-parser matplotlib
COPY scripts/build_doc.sh /build_doc.sh
RUN chmod +x build_doc.sh
ENTRYPOINT ["/build_doc.sh"]
3 changes: 2 additions & 1 deletion docs/conf.py
@@ -33,7 +33,8 @@
'sphinx_gallery.gen_gallery',
'sphinx.ext.napoleon',
'sphinx.ext.intersphinx',
-]
+'myst_parser',
+]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
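The hunk above registers MyST in the Sphinx configuration so that Markdown files such as opt_serving.md can sit in the same toctree as the .rst tutorials. A minimal, illustrative conf.py fragment putting the new entry in context (only `'myst_parser'` comes from this commit; the `source_suffix` mapping is an assumption added here for clarity, not part of the diff):

```python
# Illustrative Sphinx conf.py fragment. The 'myst_parser' entry is what
# this commit adds; the other extensions mirror the context lines above.
extensions = [
    'sphinx_gallery.gen_gallery',
    'sphinx.ext.napoleon',
    'sphinx.ext.intersphinx',
    'myst_parser',  # lets Sphinx parse Markdown sources like opt_serving.md
]

# Assumed mapping: MyST normally registers .md on its own, but spelling it
# out documents why a .md file can now appear in docs/index.rst's toctree.
source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}
```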
2 changes: 1 addition & 1 deletion docs/index.rst
@@ -19,7 +19,7 @@ Alpa can automatically generate distributed execution plans that unify data, oper
tutorials/pipeshard_parallelism.rst
tutorials/alpa_vs_pmap.rst
tutorials/perf_tuning_guide.rst
-tutorials/opt_serving.rst
+tutorials/opt_serving.md

.. toctree::
:maxdepth: 1
4 changes: 3 additions & 1 deletion docs/publish.py
@@ -11,7 +11,9 @@ def run_cmd(cmd):

run_cmd(f"cd $ALPA_SITE_PATH; git pull")

-run_cmd("rm -rf $ALPA_SITE_PATH/*")
+# (Optional) Remove old files
+# run_cmd("rm -rf $ALPA_SITE_PATH/*")

run_cmd("cp -r _build/html/* $ALPA_SITE_PATH")

cmd_message = f"Archive {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
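publish.py relies on a `run_cmd` helper defined outside this hunk. A plausible minimal implementation, assuming it simply shells out and fails loudly (a hypothetical sketch, not code from the diff):

```python
import subprocess

def run_cmd(cmd):
    """Run a shell command string, raising if it exits non-zero.

    Hypothetical sketch of the helper used by publish.py above. Running
    through the shell is what makes compound forms like
    'cd $ALPA_SITE_PATH; git pull' and the $ALPA_SITE_PATH expansion work.
    """
    subprocess.run(cmd, shell=True, check=True)
```

With `check=True`, a failed `git pull` or `cp` aborts publishing instead of silently pushing a broken site.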
1 change: 1 addition & 0 deletions docs/tutorials/opt_serving.md
135 changes: 0 additions & 135 deletions docs/tutorials/opt_serving.rst

This file was deleted.

11 changes: 7 additions & 4 deletions examples/opt_serving/README.md
@@ -1,4 +1,7 @@
-# Examples: Serving OPT-175B using Alpa
+# Serving OPT-175B using Alpa

This tutorial shows how to set up a serving system to serve the largest available pretrained language model, OPT-175B.

As a serving system, Alpa provides the following unique advantages:
- **Support for commodity hardware**: With Alpa, you can serve OPT-175B on your in-house GPU cluster, without needing the latest-generation A100 80GB GPUs or fancy InfiniBand connections -- no hardware constraints!
- **Flexible parallelism strategies**: Alpa will automatically figure out the appropriate model-parallel strategies based on your cluster setup.
@@ -32,7 +35,7 @@ model = get_model(model_name="alpa/opt-2.7b",
path="/home/ubuntu/opt_weights/")

# Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)
@@ -55,7 +58,7 @@ pip3 install torch torchvision torchaudio --extra-index-url https://download.pyt
cd examples/opt_serving && bash build.sh
```

-## Get OPT weights
+## Get OPT Weights
There are two ways you can obtain the pretrained OPT weights.

1. You can download the original OPT weights released by [Metaseq](https://github.com/facebookresearch/metaseq/tree/main/projects/OPT),
@@ -92,7 +95,7 @@ Run distributed generation with the 175B model using Alpa. Note you will need >3
python3 benchmark_text_gen.py --model alpa/opt-175b --path [PATH_TO_WEIGHT] --debug
```

-# Launch a Web Server to Serve the OPT models
+## Launch a Web Server to Serve the OPT Models

Launch the web server:
```shell
2 changes: 2 additions & 0 deletions examples/opt_serving/model/wrapper.py
@@ -224,6 +224,8 @@ def get_model(model_name,
init_cache = init_cache_np(config, 1)
else:
assert "alpa/opt" in model_name
+print(f"Load model {model_name} ... (This can take several minutes for very large models)")
+
alpa.init()
num_pp_stages = max(2, alpa.get_global_cluster().num_hosts)
config = get_opt_config(name, num_pp_stages=num_pp_stages, dtype=dtype)
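The context around this hunk chooses the number of pipeline-parallel stages as `max(2, num_hosts)`. Factored out as a standalone function (the function name is illustrative, not from the source):

```python
def choose_num_pp_stages(num_hosts: int) -> int:
    """Mirror of the heuristic visible in get_model above: use at least
    two pipeline-parallel stages so pipelining is always exercised, and
    otherwise one stage per host in the cluster."""
    return max(2, num_hosts)
```

For example, a single-host cluster still gets 2 stages, while an 8-host cluster gets 8.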
2 changes: 1 addition & 1 deletion examples/opt_serving/textgen_demo.py
@@ -13,7 +13,7 @@
path="/home/ubuntu/opt_weights/")

# Generate
-prompt = "Paris is the capital city of "
+prompt = "Paris is the capital city of"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids=input_ids, max_length=256, do_sample=True)

0 comments on commit 8d69815
