From 589526a402796e71af8a49c93dec0299e45017ee Mon Sep 17 00:00:00 2001
From: Wei-Lin Chiang
Date: Fri, 20 Oct 2023 22:47:00 +0000
Subject: [PATCH] Update

---
 docker/docker-compose.yml          |  2 +-
 docs/langchain_integration.md      |  2 +-
 docs/model_support.md              |  4 ++--
 docs/openai_api.md                 | 14 ++++++------
 docs/vllm_integration.md           |  4 ++--
 fastchat/llm_judge/README.md       |  2 +-
 fastchat/model/model_adapter.py    |  4 ++--
 fastchat/serve/cli.py              |  2 +-
 fastchat/serve/huggingface_api.py  |  2 +-
 fastchat/serve/launch_all_serve.py |  2 +-
 fastchat/serve/vllm_worker.py      |  2 +-
 scripts/serving/gradio.yml         |  3 +++
 scripts/serving/launch.py          | 34 ++++++++++++++++++++++++++++++
 scripts/skyserve                   |  1 +
 scripts/train_lora.sh              |  2 +-
 tests/test_cli.py                  |  2 +-
 tests/test_openai_langchain.py     |  2 +-
 17 files changed, 61 insertions(+), 23 deletions(-)
 create mode 100644 scripts/serving/gradio.yml
 create mode 100644 scripts/serving/launch.py
 create mode 160000 scripts/skyserve

diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 64a7f60fd..113e0c7a3 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -23,7 +23,7 @@ services:
             - driver: nvidia
               count: 1
               capabilities: [gpu]
-    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names", "${FASTCHAT_WORKER_MODEL_NAMES:-vicuna-7b-v1.3}", "--model-path", "${FASTCHAT_WORKER_MODEL_PATH:-lmsys/vicuna-7b-v1.3}", "--worker-address", "http://fastchat-model-worker:21002", "--controller-address", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "21002"]
+    entrypoint: ["python3.9", "-m", "fastchat.serve.model_worker", "--model-names", "${FASTCHAT_WORKER_MODEL_NAMES:-vicuna-7b-v1.5}", "--model-path", "${FASTCHAT_WORKER_MODEL_PATH:-lmsys/vicuna-7b-v1.5}", "--worker-address", "http://fastchat-model-worker:21002", "--controller-address", "http://fastchat-controller:21001", "--host", "0.0.0.0", "--port", "21002"]
   fastchat-api-server:
     build:
       context: .
diff --git a/docs/langchain_integration.md b/docs/langchain_integration.md
index a59d739ab..50174a85e 100644
--- a/docs/langchain_integration.md
+++ b/docs/langchain_integration.md
@@ -19,7 +19,7 @@ Here, we use Vicuna as an example and use it for three endpoints: chat completio
 See a full list of supported models [here](../README.md#supported-models).
 
 ```bash
-python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.model_worker --model-names "gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002" --model-path lmsys/vicuna-7b-v1.5
 ```
 
 Finally, launch the RESTful API server
diff --git a/docs/model_support.md b/docs/model_support.md
index 745c6b646..24f3bc9cc 100644
--- a/docs/model_support.md
+++ b/docs/model_support.md
@@ -5,7 +5,7 @@
 - [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
   - example: `python3 -m fastchat.serve.cli --model-path meta-llama/Llama-2-7b-chat-hf`
 - Vicuna, Alpaca, LLaMA, Koala
-  - example: `python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.3`
+  - example: `python3 -m fastchat.serve.cli --model-path lmsys/vicuna-7b-v1.5`
 - [BAAI/AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 - [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en#using-huggingface-transformers)
 - [baichuan-inc/baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
@@ -67,7 +67,7 @@ python3 -m fastchat.serve.cli --model [YOUR_MODEL_PATH]
 You can run this example command to learn the code logic.
 
 ```
-python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.5
 ```
 
 You can add `--debug` to see the actual prompt sent to the model.
diff --git a/docs/openai_api.md b/docs/openai_api.md
index 0c555a60e..f3c0fba93 100644
--- a/docs/openai_api.md
+++ b/docs/openai_api.md
@@ -18,7 +18,7 @@ python3 -m fastchat.serve.controller
 Then, launch the model worker(s)
 
 ```bash
-python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5
 ```
 
 Finally, launch the RESTful API server
@@ -45,7 +45,7 @@ import openai
 openai.api_key = "EMPTY"
 openai.api_base = "http://localhost:8000/v1"
 
-model = "vicuna-7b-v1.3"
+model = "vicuna-7b-v1.5"
 prompt = "Once upon a time"
 
 # create a completion
@@ -77,7 +77,7 @@ Chat Completions:
 curl http://localhost:8000/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "messages": [{"role": "user", "content": "Hello! What is your name?"}]
   }'
 ```
@@ -87,7 +87,7 @@ Text Completions:
 curl http://localhost:8000/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "prompt": "Once upon a time",
     "max_tokens": 41,
     "temperature": 0.5
@@ -99,7 +99,7 @@ Embeddings:
 curl http://localhost:8000/v1/embeddings \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "vicuna-7b-v1.3",
+    "model": "vicuna-7b-v1.5",
     "input": "Hello world!"
   }'
 ```
@@ -111,8 +111,8 @@ you can replace the `model_worker` step above with a multi model variant:
 
 ```bash
 python3 -m fastchat.serve.multi_model_worker \
-    --model-path lmsys/vicuna-7b-v1.3 \
-    --model-names vicuna-7b-v1.3 \
+    --model-path lmsys/vicuna-7b-v1.5 \
+    --model-names vicuna-7b-v1.5 \
     --model-path lmsys/longchat-7b-16k \
     --model-names longchat-7b-16k
 ```
diff --git a/docs/vllm_integration.md b/docs/vllm_integration.md
index e371ef32f..7d3205bb8 100644
--- a/docs/vllm_integration.md
+++ b/docs/vllm_integration.md
@@ -11,12 +11,12 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup
 2. When you launch a model worker, replace the normal worker (`fastchat.serve.model_worker`) with the vLLM worker (`fastchat.serve.vllm_worker`). All other commands such as controller, gradio web server, and OpenAI API server are kept the same.
 
    ```
-   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3
+   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.5
    ```
 
    If you see tokenizer errors, try
    ```
-   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer
+   python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.5 --tokenizer hf-internal-testing/llama-tokenizer
    ```
 
    If you use an AWQ quantized model, try
diff --git a/fastchat/llm_judge/README.md b/fastchat/llm_judge/README.md
index f1755e3e5..1d2646b13 100644
--- a/fastchat/llm_judge/README.md
+++ b/fastchat/llm_judge/README.md
@@ -49,7 +49,7 @@ Arguments:
 
 e.g.,
 ```
-python gen_model_answer.py --model-path lmsys/vicuna-7b-v1.3 --model-id vicuna-7b-v1.3
+python gen_model_answer.py --model-path lmsys/vicuna-7b-v1.5 --model-id vicuna-7b-v1.5
 ```
 
 The answers will be saved to `data/mt_bench/model_answer/[MODEL-ID].jsonl`.
diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 753371b2a..2774041f7 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -384,7 +384,7 @@ def add_model_args(parser):
     parser.add_argument(
         "--model-path",
         type=str,
-        default="lmsys/vicuna-7b-v1.3",
+        default="lmsys/vicuna-7b-v1.5",
         help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
     )
     parser.add_argument(
@@ -572,7 +572,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 
 
 class VicunaAdapter(BaseModelAdapter):
-    "Model adapater for Vicuna models (e.g., lmsys/vicuna-7b-v1.3)" ""
+    "Model adapater for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)" ""
 
     use_fast_tokenizer = False
 
diff --git a/fastchat/serve/cli.py b/fastchat/serve/cli.py
index dbaf9bee9..eba4d0043 100644
--- a/fastchat/serve/cli.py
+++ b/fastchat/serve/cli.py
@@ -2,7 +2,7 @@
 Chat with a model with command line interface.
 
 Usage:
-python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.cli --model lmsys/vicuna-7b-v1.5
 python3 -m fastchat.serve.cli --model lmsys/fastchat-t5-3b-v1.0
 
 Other commands:
diff --git a/fastchat/serve/huggingface_api.py b/fastchat/serve/huggingface_api.py
index 5a4c30fec..2a49bf5f1 100644
--- a/fastchat/serve/huggingface_api.py
+++ b/fastchat/serve/huggingface_api.py
@@ -2,7 +2,7 @@
 Use FastChat with Hugging Face generation APIs.
 
 Usage:
-python3 -m fastchat.serve.huggingface_api --model lmsys/vicuna-7b-v1.3
+python3 -m fastchat.serve.huggingface_api --model lmsys/vicuna-7b-v1.5
 python3 -m fastchat.serve.huggingface_api --model lmsys/fastchat-t5-3b-v1.0
 """
 import argparse
diff --git a/fastchat/serve/launch_all_serve.py b/fastchat/serve/launch_all_serve.py
index 1952cfb17..2f4ad7b0b 100644
--- a/fastchat/serve/launch_all_serve.py
+++ b/fastchat/serve/launch_all_serve.py
@@ -54,7 +54,7 @@
 parser.add_argument(
     "--model-path",
     type=str,
-    default="lmsys/vicuna-7b-v1.3",
+    default="lmsys/vicuna-7b-v1.5",
     help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
 )
 parser.add_argument(
diff --git a/fastchat/serve/vllm_worker.py b/fastchat/serve/vllm_worker.py
index eb0bfe26a..be247afa1 100644
--- a/fastchat/serve/vllm_worker.py
+++ b/fastchat/serve/vllm_worker.py
@@ -205,7 +205,7 @@ async def api_model_details(request: Request):
     parser.add_argument(
         "--controller-address", type=str, default="http://localhost:21001"
     )
-    parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.3")
+    parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.5")
     parser.add_argument(
         "--model-names",
         type=lambda s: s.split(","),
diff --git a/scripts/serving/gradio.yml b/scripts/serving/gradio.yml
new file mode 100644
index 000000000..6cb98a335
--- /dev/null
+++ b/scripts/serving/gradio.yml
@@ -0,0 +1,3 @@
+run: |
+  conda activate chatbot
+  python3 -m fastchat.serve.gradio_web_server --share --model-list-mode reload
\ No newline at end of file
diff --git a/scripts/serving/launch.py b/scripts/serving/launch.py
new file mode 100644
index 000000000..73c6b735e
--- /dev/null
+++ b/scripts/serving/launch.py
@@ -0,0 +1,34 @@
+import argparse
+import sky
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        required=True,
+        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
+    )
+    parser.add_argument("--num-gpus", type=int, default=1)
+    parser.add_argument("--spot", action="store_true")
+    parser.add_argument("--controller-name", type=str, default="fastchat-controller")
+    parser.add_argument("--worker-name", type=str, default="gpu-worker")
+
+    args = parser.parse_args()
+    if len(sky.status(args.controller_name)) == 0:
+        task = sky.Task.from_yaml("controller.yaml")
+        sky.launch(task, cluster_name=args.controller_name)
+    task = sky.Task.from_yaml("gradio.yaml")
+    sky.exec(task, cluster_name=args.controller_name)
+
+    task = sky.Task.from_yaml("model_worker.yaml")
+    head_ip = sky.status(args.controller_name)[0]['handle'].head_ip
+    envs = {"CONTROLLER_IP": head_ip}
+    task.update_envs(envs)
+
+    for i in range(args.num_gpus):
+        worker_name = f"{args.worker_name}-{i}"
+        if args.spot:
+            sky.spot_launch(task, name=worker_name)
+        else:
+            sky.launch(task, cluster_name=worker_name, detach_setup=True)
\ No newline at end of file
diff --git a/scripts/skyserve b/scripts/skyserve
new file mode 160000
index 000000000..38cf57f91
--- /dev/null
+++ b/scripts/skyserve
@@ -0,0 +1 @@
+Subproject commit 38cf57f91c661b1e32cdb2ba0813cd4925c132f2
diff --git a/scripts/train_lora.sh b/scripts/train_lora.sh
index 62648f40d..d30caad41 100644
--- a/scripts/train_lora.sh
+++ b/scripts/train_lora.sh
@@ -1,5 +1,5 @@
 deepspeed fastchat/train/train_lora.py \
-    --model_name_or_path lmsys/vicuna-7b-v1.3 \
+    --model_name_or_path lmsys/vicuna-7b-v1.5 \
     --lora_r 8 \
     --lora_alpha 16 \
     --lora_dropout 0.05 \
diff --git a/tests/test_cli.py b/tests/test_cli.py
index dcefa4bbe..113e497a4 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -69,7 +69,7 @@ def test_8bit():
 
 def test_hf_api():
     models = [
-        "lmsys/vicuna-7b-v1.3",
+        "lmsys/vicuna-7b-v1.5",
         "lmsys/fastchat-t5-3b-v1.0",
     ]
 
diff --git a/tests/test_openai_langchain.py b/tests/test_openai_langchain.py
index 3efa50322..b9c07fcf6 100644
--- a/tests/test_openai_langchain.py
+++ b/tests/test_openai_langchain.py
@@ -1,5 +1,5 @@
 # Usage:
-# python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3 --model-names gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002
+# python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.5 --model-names gpt-3.5-turbo,text-davinci-003,text-embedding-ada-002
 # export OPENAI_API_BASE=http://localhost:8000/v1
 # export OPENAI_API_KEY=EMPTY
 # wget https://raw.githubusercontent.com/hwchase17/langchain/v0.0.200/docs/modules/state_of_the_union.txt
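
For reference, a minimal sketch of how the new `scripts/serving/launch.py` might be invoked, based only on the flags it defines above. It assumes SkyPilot is installed and that the `controller.yaml`, `gradio.yaml`, and `model_worker.yaml` task files the script references are available in the working directory (e.g., from the `scripts/skyserve` submodule):

```bash
# Sketch only: bring up the FastChat controller (plus the Gradio web server)
# and two GPU workers launched as SkyPilot spot jobs.
python3 scripts/serving/launch.py \
    --model-path lmsys/vicuna-7b-v1.5 \
    --num-gpus 2 \
    --spot
```

Omitting `--spot` launches each worker as a regular on-demand cluster instead, one per `--num-gpus`.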