# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # Serving MPT-7B with a cluster driver proxy app
# MAGIC
# MAGIC This notebook enables you to run MPT-7B-Instruct on a Databricks cluster and expose the model to LangChain via [driver proxy](https://python.langchain.com/en/latest/modules/models/llms/integrations/databricks.html#wrapping-a-cluster-driver-proxy-app).
# MAGIC
# MAGIC Environment for this notebook:
# MAGIC - Runtime: 13.1 GPU ML Runtime
# MAGIC - Instance: `g5.4xlarge` on AWS, `Standard_NV36ads_A10_v5` on Azure
# COMMAND ----------
# Skip this step if running on Databricks runtime 13.2 GPU and above.
!wget -O /local_disk0/tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
dpkg -i /local_disk0/tmp/libcusparse-dev-11-7_11.7.3.50-1_amd64.deb && \
wget -O /local_disk0/tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
dpkg -i /local_disk0/tmp/libcublas-dev-11-7_11.10.1.25-1_amd64.deb && \
wget -O /local_disk0/tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
dpkg -i /local_disk0/tmp/libcusolver-dev-11-7_11.4.0.1-1_amd64.deb && \
wget -O /local_disk0/tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/libcurand-dev-11-7_10.2.10.91-1_amd64.deb && \
dpkg -i /local_disk0/tmp/libcurand-dev-11-7_10.2.10.91-1_amd64.deb
# COMMAND ----------
# MAGIC %pip install xformers==0.0.20 einops==0.6.1 flash-attn==v1.0.3.post0 triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python
# COMMAND ----------
# MAGIC %md
# MAGIC ## Inference
# MAGIC The example in the model card should also work on Databricks with the same environment.
# COMMAND ----------
# Load model to text generation pipeline
import transformers
import torch

name = "mosaicml/mpt-7b-instruct"

config = transformers.AutoConfig.from_pretrained(
    name,
    trust_remote_code=True
)
config.attn_config['attn_impl'] = 'triton'
config.init_device = 'cuda'

model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    revision="bbe7a55d70215e16c00c1825805b81e4badb57d7",
    cache_dir="/local_disk0/.cache/huggingface/"
)

tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", padding_side="left")

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    config=config,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    return_full_text=False,
    device=0
)
# COMMAND ----------
# Prompt templates like the one below guide the model to follow the instruction and respond to the input; empirically this makes MPT models produce better responses
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
PROMPT_FOR_GENERATION_FORMAT = """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)
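
# For illustration, print what a fully formatted prompt looks like for a sample
# instruction (the instruction text here is just a placeholder):
print(PROMPT_FOR_GENERATION_FORMAT.format(instruction="Explain what MLflow is in one sentence."))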

# Define parameters to generate text
def gen_text_for_serving(prompt, **kwargs):
    prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)

    # The default max length is fairly small (20 tokens), which would cut the generated
    # output off in the middle, so increase it unless the caller overrides it.
    if "max_new_tokens" not in kwargs:
        kwargs["max_new_tokens"] = 512

    # Configure other text generation arguments
    kwargs.update(
        {
            "pad_token_id": tokenizer.eos_token_id,  # Hugging Face sets pad_token_id to eos_token_id by default; set it explicitly to silence the redundant warning
            "eos_token_id": tokenizer.eos_token_id,
        }
    )

    return pipeline(prompt, **kwargs)[0]['generated_text']
# COMMAND ----------
print(gen_text_for_serving("How to master Python in 3 days?"))
# COMMAND ----------
# See full list of configurable args: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
print(gen_text_for_serving("How to master Python in 3 days?", temperature=0.1, max_new_tokens=100))
# COMMAND ----------
# MAGIC %md
# MAGIC ## Serve with Flask
# COMMAND ----------
from flask import Flask, jsonify, request

app = Flask("mpt-7b-instruct")

@app.route('/', methods=['POST'])
def serve_mpt_7b_instruct():
    resp = gen_text_for_serving(**request.json)
    return jsonify(resp)
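# COMMAND ----------
# (Optional) Sanity-check the endpoint's request/response shape with Flask's built-in
# test client before exposing it through the driver proxy. This is a minimal sketch;
# it assumes the model pipeline above has finished loading, and the prompt is only a placeholder.
test_response = app.test_client().post("/", json={"prompt": "What is Databricks?", "max_new_tokens": 64})
print(test_response.json)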
# COMMAND ----------
from dbruntime.databricks_repl_context import get_context
ctx = get_context()
port = "7777"
driver_proxy_api = f"https://{ctx.browserHostName}/driver-proxy-api/o/0/{ctx.clusterId}/{port}"
print(f"""
driver_proxy_api = '{driver_proxy_api}'
cluster_id = '{ctx.clusterId}'
port = {port}
""")
# COMMAND ----------
# MAGIC %md
# MAGIC Keep `app.run` running. The app can then be used with LangChain ([documentation](https://python.langchain.com/docs/modules/model_io/models/llms/integrations/databricks.html#wrapping-a-cluster-driver-proxy-app); see the sketch after the example below), or by calling the serving endpoint directly:
# MAGIC ```python
# MAGIC import requests
# MAGIC import json
# MAGIC
# MAGIC def request_mpt_7b(prompt, temperature=1.0, max_new_tokens=1024):
# MAGIC     token = ...  # TODO: fill in with your Databricks personal access token that can access the cluster that runs this driver proxy notebook
# MAGIC     url = ...    # TODO: fill in with the driver_proxy_api output above
# MAGIC
# MAGIC     headers = {
# MAGIC         "Content-Type": "application/json",
# MAGIC         "Authorization": f"Bearer {token}"
# MAGIC     }
# MAGIC     data = {
# MAGIC         "prompt": prompt,
# MAGIC         "temperature": temperature,
# MAGIC         "max_new_tokens": max_new_tokens,
# MAGIC     }
# MAGIC
# MAGIC     response = requests.post(url, headers=headers, data=json.dumps(data))
# MAGIC     return response.text
# MAGIC
# MAGIC
# MAGIC request_mpt_7b("What is databricks?")
# MAGIC ```
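# MAGIC If you prefer the LangChain route mentioned above, here is a minimal sketch. It assumes a LangChain version whose `Databricks` LLM wrapper accepts `cluster_id` and `cluster_driver_port`, and that authentication is configured as described in the linked documentation; if the wrapper's default request format does not match this Flask app, adapt it with `transform_input_fn`/`transform_output_fn`.
# MAGIC ```python
# MAGIC from langchain.llms import Databricks
# MAGIC
# MAGIC # Wrap the cluster driver proxy app; fill in the cluster_id and port printed above.
# MAGIC llm = Databricks(cluster_id="...", cluster_driver_port="7777")
# MAGIC
# MAGIC llm("What is databricks?")
# MAGIC ```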
# MAGIC Or you could try using `ai_query` ([documentation](https://docs.databricks.com/sql/language-manual/functions/ai_query.html)) to call this driver proxy from Databricks SQL with:
# MAGIC ```sql
# MAGIC SELECT ai_query('cluster_id:port', -- TODO: fill in the cluster_id and port number from the output above.
# MAGIC   named_struct('prompt', 'What is databricks?', 'temperature', CAST(0.1 AS DOUBLE)),
# MAGIC   'returnType', 'STRING')
# MAGIC ```
# MAGIC Note: [AI Functions](https://docs.databricks.com/large-language-models/ai-functions.html) is in public preview; to enable the feature for your workspace, please submit this [form](https://docs.google.com/forms/d/e/1FAIpQLScVyh5eRioqGwuUVxj9JOiKBAo0-FWi7L3f4QWsKeyldqEw8w/viewform).
# COMMAND ----------
app.run(host="0.0.0.0", port=port, debug=True, use_reloader=False)