# Databricks notebook source
# MAGIC %md
# MAGIC # Llama-2-70b-chat-hf Inference on Databricks
# MAGIC
# MAGIC [Llama 2](https://huggingface.co/meta-llama) is a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters. It is trained on 2T tokens and supports a context window of up to 4K tokens. [Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) is the 70B fine-tuned model, optimized for dialogue use cases and converted to the Hugging Face Transformers format.
# MAGIC
# MAGIC Environment for this notebook:
# MAGIC - Runtime: 13.2 GPU ML Runtime
# MAGIC - Instance: `Standard_NC48ads_A100_v4` on Azure, `p4d.24xlarge` on AWS
# MAGIC
# MAGIC A GPU instance with at least 2 A100-80GB GPUs is sufficient for inference on a single input (batch inference requires slightly more memory).
# MAGIC
# MAGIC Requirements:
# MAGIC - To get access to the model on Hugging Face, visit the [Meta website](https://ai.meta.com/resources/models-and-libraries/llama-downloads) and accept the license terms and acceptable use policy before submitting the access request form. Requests are typically processed in 1-2 days.
# MAGIC
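# COMMAND ----------
# Optional sanity check (not part of the original example): confirm the cluster
# exposes enough GPU memory before loading the 70B model. A minimal sketch,
# assuming torch is already available in the ML Runtime.
import torch

assert torch.cuda.is_available(), "No GPU visible to PyTorch"
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1e9:.0f} GB")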
# COMMAND ----------
# MAGIC %pip install -U git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git
# COMMAND ----------
dbutils.library.restartPython()
# COMMAND ----------
from huggingface_hub import notebook_login
# Log in to Hugging Face to get access to the model
notebook_login()
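# COMMAND ----------
# Alternative for non-interactive runs (a sketch, not part of the original example):
# authenticate with a token stored in Databricks secrets instead of the interactive
# login above. The scope/key names "hf" and "token" are hypothetical placeholders.
# from huggingface_hub import login
# login(token=dbutils.secrets.get(scope="hf", key="token"))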
# COMMAND ----------
# MAGIC %md
# MAGIC ## Inference
# MAGIC The example in the model card should also work on Databricks with the same environment.
# COMMAND ----------
# Load model to text generation pipeline
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
# For reproducibility, pin the revision commit hash and do not change it, since the uploader may update the model later.
# The commit history of Llama-2-70b-chat-hf is at https://huggingface.co/meta-llama/Llama-2-70b-chat-hf/commits/main
model = "meta-llama/Llama-2-70b-chat-hf"
revision = "e6152b720bd3cd67afc66e36d06893a0e1f84b48"
# Cache model downloads on the cluster's local disk rather than the small root volume
os.environ['HF_HOME'] = '/local_disk0/hf'
os.environ['TRANSFORMERS_CACHE'] = '/local_disk0/hf'
tokenizer = AutoTokenizer.from_pretrained(model, padding_side="left")
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    revision=revision,
    return_full_text=False,
)
# Required tokenizer setting for batch inference
pipeline.tokenizer.pad_token_id = tokenizer.eos_token_id
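# COMMAND ----------
# Optional check (illustrative, not in the original example): with device_map="auto",
# Accelerate shards the model across the available GPUs; hf_device_map shows where
# each layer was placed.
print(pipeline.model.hf_device_map)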
# COMMAND ----------
# Define the prompt template; the format below is from: http://fastml.com/how-to-train-your-own-chatgpt-alpaca-style-part-one/
# A prompt template like this guides the model to follow the instruction and respond to the input; empirically it leads to better responses than a bare prompt
INSTRUCTION_KEY = "### Instruction:"
RESPONSE_KEY = "### Response:"
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
PROMPT_FOR_GENERATION_FORMAT = """{intro}
{instruction_key}
{instruction}
{response_key}
""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)
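# COMMAND ----------
# Illustrative check (not in the original example): render the template with a
# sample instruction to see exactly what the model receives.
print(PROMPT_FOR_GENERATION_FORMAT.format(instruction="What is MLflow?"))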
# COMMAND ----------
# Define a helper to generate text with configurable parameters
def gen_text(prompts, use_template=False, **kwargs):
    if use_template:
        full_prompts = [
            PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)
            for prompt in prompts
        ]
    else:
        full_prompts = prompts

    if "batch_size" not in kwargs:
        kwargs["batch_size"] = 1

    # The default max length is quite small (20 tokens) and would cut the generated
    # output off mid-response, so raise the limit to allow a complete answer
    if "max_new_tokens" not in kwargs:
        kwargs["max_new_tokens"] = 1024

    # Configure other text generation arguments; the common configurable args are listed at
    # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    kwargs.update(
        {
            "pad_token_id": tokenizer.eos_token_id,  # Hugging Face sets pad_token_id to eos_token_id by default; set it here to silence the redundant warning
            "eos_token_id": tokenizer.eos_token_id,
        }
    )

    outputs = pipeline(full_prompts, **kwargs)
    outputs = [out[0]["generated_text"] for out in outputs]
    return outputs
# COMMAND ----------
# MAGIC %md
# MAGIC ### Inference on a single input
# COMMAND ----------
results = gen_text(["What is a large language model?"])
print(results[0])
# COMMAND ----------
# Use args such as temperature and max_new_tokens to control text generation;
# note that temperature only takes effect when sampling is enabled via do_sample=True
results = gen_text(["What is a large language model?"], do_sample=True, temperature=0.5, max_new_tokens=50, use_template=True)
print(results[0])
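# COMMAND ----------
# Any GenerationConfig argument can be passed through gen_text's **kwargs; a sketch
# with nucleus sampling and a repetition penalty (the values below are illustrative)
results = gen_text(
    ["What is a large language model?"],
    use_template=True,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.1,
    max_new_tokens=100,
)
print(results[0])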
# COMMAND ----------
# Check the generation quality when the context is long
long_input = """Provide a concise summary of the below passage.
Hannah Arendt was one of the seminal political thinkers of the twentieth century. The power and originality of her thinking was evident in works such as The Origins of Totalitarianism, The Human Condition, On Revolution and The Life of the Mind. In these works and in numerous essays she grappled with the most crucial political events of her time, trying to grasp their meaning and historical import, and showing how they affected our categories of moral and political judgment. What was required, in her view, was a new framework that could enable us to come to terms with the twin horrors of the twentieth century, Nazism and Stalinism. She provided such framework in her book on totalitarianism, and went on to develop a new set of philosophical categories that could illuminate the human condition and provide a fresh perspective on the nature of political life.
Although some of her works now belong to the classics of the Western tradition of political thought, she has always remained difficult to classify. Her political philosophy cannot be characterized in terms of the traditional categories of conservatism, liberalism, and socialism. Nor can her thinking be assimilated to the recent revival of communitarian political thought, to be found, for example, in the writings of A. MacIntyre, M. Sandel, C. Taylor and M. Walzer. Her name has been invoked by a number of critics of the liberal tradition, on the grounds that she presented a vision of politics that stood in opposition some key liberal principles. There are many strands of Arendt’s thought that could justify such a claim, in particular, her critique of representative democracy, her stress on civic engagement and political deliberation, her separation of morality from politics, and her praise of the revolutionary tradition. However, it would be a mistake to view Arendt as an anti-liberal thinker. Arendt was in fact a stern defender of constitutionalism and the rule of law, an advocate of fundamental human rights (among which she included not only the right to life, liberty, and freedom of expression, but also the right to action and to opinion), and a critic of all forms of political community based on traditional ties and customs, as well as those based on religious, ethnic, or racial identity.
Arendt’s political thought cannot, in this sense, be identified either with the liberal tradition or with the claims advanced by a number of its critics. Arendt did not conceive of politics as a means for the satisfaction of individual preferences, nor as a way to integrate individuals around a shared conception of the good. Her conception of politics is based instead on the idea of active citizenship, that is, on the value and importance of civic engagement and collective deliberation about all matters affecting the political community. If there is a tradition of thought with which Arendt can be identified, it is the classical tradition of civic republicanism originating in Aristotle and embodied in the writings of Machiavelli, Montesquieu, Jefferson, and Tocqueville. According to this tradition politics finds its authentic expression whenever citizens gather together in a public space to deliberate and decide about matters of collective concern. Political activity is valued not because it may lead to agreement or to a shared conception of the good, but because it enables each citizen to exercise his or her powers of agency, to develop the capacities for judgment and to attain by concerted action some measure of political efficacy."""
def get_num_tokens(text):
    inputs = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
    return inputs.shape[1]
print('number of tokens for input:', get_num_tokens(long_input))
results = gen_text([long_input], max_new_tokens=150, use_template=False)
print(results[0])
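# COMMAND ----------
# Optional guard (a sketch, not in the original example): Llama 2 supports a 4K-token
# context window, so warn when the prompt plus the requested generation would exceed it.
MAX_CONTEXT_LENGTH = 4096
if get_num_tokens(long_input) + 150 > MAX_CONTEXT_LENGTH:
    print("Warning: prompt + max_new_tokens exceeds the model's context window")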
# COMMAND ----------
# MAGIC %md
# MAGIC ### Batch inference
# COMMAND ----------
# From databricks-dolly-15k
inputs = [
"Think of some family rules to promote a healthy family relationship",
"In the series A Song of Ice and Fire, who is the founder of House Karstark?",
"which weighs more, cold or hot water?",
"Write a short paragraph about why you should not have both a pet cat and a pet bird.",
"Is beauty objective or subjective?",
"What is SVM?",
"What is the current capital of Japan?",
"Name 10 colors",
"How should I invest my money?",
"What are some ways to improve the value of your home?",
"What does fasting mean?",
"What is cloud computing in simple terms?",
"What is the meaning of life?",
"What is Linux?",
"Why do people like gardening?",
"What makes for a good photograph?"
]
# COMMAND ----------
# Set batch size
results = gen_text(inputs, use_template=True, batch_size=8)
for output in results:
    print(output)
    print('\n')
# COMMAND ----------
# MAGIC %md
# MAGIC ## Measure inference speed
# MAGIC Text generation speed is often measured in tokens/sec, the average number of tokens the model generates per second.
# MAGIC
# COMMAND ----------
import time

def get_gen_text_throughput(prompt, use_template=True, **kwargs):
    """
    Return a tuple of (tokens generated per second, number of generated tokens, generated text).
    """
    if use_template:
        full_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)
    else:
        full_prompt = prompt

    if "max_new_tokens" not in kwargs:
        kwargs["max_new_tokens"] = 512

    kwargs.update(
        {
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "return_tensors": True,  # make the pipeline return token ids instead of decoded text, so we can count the generated tokens
        }
    )

    num_input_tokens = get_num_tokens(full_prompt)

    # Measure the time it takes for text generation
    start = time.time()
    outputs = pipeline(full_prompt, **kwargs)
    duration = time.time() - start

    # Get the number of generated tokens (the pipeline output includes the prompt tokens)
    n_tokens = len(outputs[0]["generated_token_ids"])

    # Decode only the newly generated tokens
    result = tokenizer.batch_decode(
        outputs[0]["generated_token_ids"][num_input_tokens:], skip_special_tokens=True
    )
    result = "".join(result)

    return ((n_tokens - num_input_tokens) / duration, (n_tokens - num_input_tokens), result)
# COMMAND ----------
throughput, n_tokens, result = get_gen_text_throughput("What is ML?", use_template=False)
print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)")
# COMMAND ----------
# When the context or the generated text is long, it takes longer on average to generate each token
throughput, n_tokens, result = get_gen_text_throughput(long_input, max_new_tokens=200, use_template=True)
print(f"{throughput} tokens/sec, {n_tokens} tokens (not including prompt)")