sarkar/Add support for max_length in run_generation #476

Closed
wants to merge 5 commits into from
36 changes: 28 additions & 8 deletions examples/text-generation/run_generation.py
@@ -64,7 +64,17 @@ def main():
         action="store_true",
         help="Whether to perform generation in bf16 precision.",
     )
-    parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.")
+    length_group = parser.add_mutually_exclusive_group(required=False)
+    length_group.add_argument(
+        "--max_new_tokens",
+        type=int,
+        help="Number of tokens to generate.",
+    )
+    length_group.add_argument(
+        "--max_length",
+        type=int,
+        help="Max number of tokens (prompt + generation).",
+    )
     parser.add_argument(
         "--max_input_tokens",
         type=int,
@@ -211,6 +221,8 @@ def main():
     )

     args = parser.parse_args()
+    if args.max_length is None and args.max_new_tokens is None:
+        args.max_new_tokens = 100
Collaborator (Author):

This keeps backward compatibility: currently --max_new_tokens does not have to be specified and it gets a default of 100.

Since the flag was optional, I used: length_group = parser.add_mutually_exclusive_group(required=False)

However, to keep that behaviour when neither flag is specified, the default of 100 is still applied.

Note that I cannot put the default inside length_group.add_argument(...), because when --max_length is specified and --max_new_tokens isn't, argparse would still assign --max_new_tokens its default of 100, so both max_length and max_new_tokens would end up with values.
Hence the default is set here, outside argparse.
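
For clarity, here is a minimal, standalone sketch of that pattern (the parser below is illustrative, not the actual run_generation.py parser):

```python
import argparse

parser = argparse.ArgumentParser()
length_group = parser.add_mutually_exclusive_group(required=False)
# No default here: a default on --max_new_tokens would be filled in by argparse even
# when --max_length is passed, leaving both attributes set at the same time.
length_group.add_argument("--max_new_tokens", type=int, help="Number of tokens to generate.")
length_group.add_argument("--max_length", type=int, help="Max number of tokens (prompt + generation).")

args = parser.parse_args([])  # e.g. neither flag given on the command line
# Back-compat default, applied after parsing and only when neither flag was given.
if args.max_length is None and args.max_new_tokens is None:
    args.max_new_tokens = 100

print(args.max_new_tokens, args.max_length)  # -> 100 None
```

Passing both flags (e.g. --max_new_tokens 10 --max_length 128) makes argparse exit with a "not allowed with argument" error, which is exactly the mutual exclusion the group enforces.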


     # If the DeepSpeed launcher is used, the env variable _ will be equal to /usr/local/bin/deepspeed
     # For multi node, the value of the env variable WORLD_SIZE should be larger than 8
@@ -381,7 +393,12 @@ def check_optimum_habana_min_version(*a, **b):

     # Generation configuration
     generation_config = copy.deepcopy(model.generation_config)
-    generation_config.max_new_tokens = args.max_new_tokens
+    if args.max_new_tokens is not None:
+        assert args.max_new_tokens > 0, "max_length is not set, expect a positive number for max_new_tokens"
+        generation_config.max_new_tokens = args.max_new_tokens
+    else:
+        assert args.max_length > 0, "max_new_tokens is not set, expect a positive number for max_length"
+        generation_config.max_length = args.max_length
     generation_config.use_cache = args.use_kv_cache
     generation_config.static_shapes = is_optimized
     generation_config.bucket_size = args.bucket_size if is_optimized else -1
@@ -449,7 +466,7 @@ def generate():
             profiling_steps=args.profiling_steps,
             profiling_warmup_steps=args.profiling_warmup_steps,
         ).cpu()
-        return tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        return tokenizer.batch_decode(outputs, skip_special_tokens=True), input_tokens["input_ids"].shape[-1]

     from optimum.habana.utils import HabanaProfile

@@ -471,9 +488,10 @@ def generate():
     t0 = time.perf_counter()
     # Benchmark over n_iterations iterations
     for i in range(args.n_iterations):
-        generated = generate()
+        generated, inp_shape = generate()
+        max_new_tokens = args.max_length - inp_shape if args.max_new_tokens is None else args.max_new_tokens
     duration = time.perf_counter() - t0
-    total_new_tokens_generated = args.n_iterations * args.batch_size * args.max_new_tokens
+    total_new_tokens_generated = args.n_iterations * args.batch_size * max_new_tokens
     throughput = total_new_tokens_generated / duration

     if rank in [-1, 0]:
@@ -602,7 +620,7 @@ def generate_dataset(batch):
             profiling_steps=args.profiling_steps,
             profiling_warmup_steps=args.profiling_warmup_steps,
         ).cpu()
-        return prompt, outputs
+        return prompt, outputs, batch["input_ids"].shape[-1]

     # warmup
     if prompt_length > 0:
@@ -630,9 +648,11 @@ def generate_dataset(batch):
     t_start = time.time()
     for i, batch in enumerate(dataloader):
         t0 = time.perf_counter()
-        prompt, outputs = generate_dataset(batch)
+        prompt, outputs, inp_len = generate_dataset(batch)
         duration += time.perf_counter() - t0
-        total_new_tokens_generated += args.batch_size * args.max_new_tokens
+        total_new_tokens_generated += (
+            (args.max_length - inp_len) if args.max_new_tokens is None else args.max_new_tokens
+        )
         if rank in [-1, 0]:
             print(separator)
             print(f"Batch n°{i+1}")
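
A quick, hedged sketch of the throughput accounting introduced above, with made-up numbers (inp_shape stands for the padded prompt length returned by generate()):

```python
# Illustrative numbers only.
n_iterations = 3
batch_size = 4
max_length = 128       # --max_length: budget for prompt + generated tokens
max_new_tokens = None  # --max_new_tokens not given
inp_shape = 28         # padded prompt length reported by generate()

# Same derivation as in the diff: fall back to max_length minus the prompt length.
effective_new_tokens = (max_length - inp_shape) if max_new_tokens is None else max_new_tokens
total_new_tokens_generated = n_iterations * batch_size * effective_new_tokens
duration = 6.0  # seconds (hypothetical)
throughput = total_new_tokens_generated / duration
print(effective_new_tokens, total_new_tokens_generated, throughput)  # -> 100 1200 200.0
```
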
3 changes: 3 additions & 0 deletions optimum/habana/transformers/generation/utils.py
@@ -499,6 +499,9 @@ def generate(
         if generation_config.static_shapes:
             # Pad inputs to have static shapes during generation, this gives better performance than dynamic shapes on HPUs
             # In encoder_decoder models, Inputs are already padded
+            if generation_config.max_new_tokens is None or generation_config.max_new_tokens < 0:
+                assert generation_config.max_length > 0
+                generation_config.max_new_tokens = generation_config.max_length - inputs_tensor.shape[-1]

             if not self.config.is_encoder_decoder:
                 # only pad if bucket_size < -1. If we are bucketing (bucket_size > 0), then that is taken care in greedy_search()
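
For what the static-shapes branch above is doing, a rough sketch (the config class and tensor below are stand-ins, not the library's objects): when only max_length is known, the number of new tokens is recovered from it and the prompt length before the inputs are padded.

```python
import torch


class FakeGenerationConfig:  # stand-in for a generation config carrying only max_length
    max_new_tokens = None
    max_length = 128


generation_config = FakeGenerationConfig()
inputs_tensor = torch.ones((2, 28), dtype=torch.long)  # batch of 2 prompts, 28 tokens each

# Mirrors the added logic: derive max_new_tokens so inputs can be padded to a static shape.
if generation_config.max_new_tokens is None or generation_config.max_new_tokens < 0:
    assert generation_config.max_length > 0
    generation_config.max_new_tokens = generation_config.max_length - inputs_tensor.shape[-1]

print(generation_config.max_new_tokens)  # -> 100
```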