Fixed minor typo in README #17

Open · wants to merge 9 commits into main

6 changes: 3 additions & 3 deletions README.md
@@ -22,7 +22,7 @@ The first parameter, `text` corresponds the prompt that will be forced-decoded b
import llm_client

client = llm_client.Client(address="tir-x-xx")
-ouputs = client.prompt("CMU's PhD students are")
+outputs = client.prompt("CMU's PhD students are")
print(outputs[0].text)
```

@@ -34,7 +34,7 @@ It is also possible to obtain the raw logit scores / output distribution from th
import llm_client

client = llm_client.Client(address="tir-x-xx")
outputs = client.prompt("CMU's PhD students are", output_scores=True)
outputs = client.prompt(["CMU's PhD students are"], output_scores=True)
print(outputs[0].scores.shape)
```

@@ -44,7 +44,7 @@ And equivalently, it is possible to obtain the raw hidden states from the model.
import llm_client

client = llm_client.Client(address="tir-x-xx")
outputs = client.prompt("CMU's PhD students are", output_hidden_states=True)
outputs = client.prompt(["CMU's PhD students are"], output_hidden_states=True)
for layer in outputs[0].hidden_states:
print(f"Layer {layer}: {layer.shape}")
```
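The updated snippets pass the prompt as a list when requesting scores or hidden states, which suggests `client.prompt` accepts a batch of prompts in this mode. A minimal sketch of batched use under that assumption (one output object per input prompt, in order):

```python
import llm_client

client = llm_client.Client(address="tir-x-xx")

# Assumed behaviour: each element of `outputs` corresponds to one prompt, in order.
prompts = ["CMU's PhD students are", "Pittsburgh is known for"]
outputs = client.prompt(prompts, output_scores=True)

for prompt, output in zip(prompts, outputs):
    print(prompt, output.scores.shape)
```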
2 changes: 1 addition & 1 deletion inference_server/README.md
@@ -37,7 +37,7 @@ Example: generate_kwargs =

1. using HF accelerate
```shell
-python -m inference_server.cli --model_name bigscience/bloom --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}'
+python -m inference_server.benchmark --model_name bigscience/bloom-7b1 --model_class AutoModelForCausalLM --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5
```

2. using DS inference
9 changes: 9 additions & 0 deletions inference_server/model_handler/deployment.py
@@ -8,6 +8,9 @@
import time
from typing import List

from transformers import AutoModelForCausalLM
import torch

import grpc
from mii.server_client import MIIServerClient
from transformers import AutoTokenizer
@@ -18,6 +21,8 @@
GenerateResponse,
TokenizeRequest,
TokenizeResponse,
ScoreRequest,
ScoreResponse,
create_generate_request,
get_str_dtype,
print_rank_n,
@@ -172,6 +177,10 @@ def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:

return response

def scores(self, request: ScoreRequest) -> ScoreResponse:
response = self.model.scores(request)
return response

def _request_response(self):
raise NotImplementedError("This method should not be implemented")

22 changes: 21 additions & 1 deletion inference_server/model_handler/grpc_utils/generation_server.py
@@ -7,7 +7,7 @@

from ...constants import GRPC_OPTIONS
from ...models import Model
-from ...utils import create_generate_request, print_rank_n
+from ...utils import create_generate_request, print_rank_n, ScoreRequest
from .proto import generation_pb2, generation_pb2_grpc


@@ -43,6 +43,26 @@ def Generate(self, request, context):
)

return response

# def Score(self, request, context):
# text = [r for r in request.texts]
# local_rank = int(os.getenv("LOCAL_RANK", "0"))
# torch.cuda.set_device(local_rank)
# self.model.input_device = local_rank

# request = ScoreRequest(text=text)
# response = self.model.scores(request)

# if isinstance(response, Exception):
# # if an exception occurs, we don't want this subprocess to crash
# response = generation_pb2.ScoreResponse(error=str(response))
# else:
# response = generation_pb2.ScoreResponse(
# tokens = response.tokens,
# scores = response.scores
# )

# return response


def serve(inference_pipeline, port):
@@ -19,6 +19,12 @@ def __init__(self, channel):
request_serializer=generation__pb2.GenerationRequest.SerializeToString,
response_deserializer=generation__pb2.GenerationResponse.FromString,
)

# self.Score = channel.unary_unary(
# '/generation.GenerationService/Score',
# request_serializer=generation__pb2.ScoreRequest.SerializeToString,
# response_deserializer=generation__pb2.ScoreResponse.FromString,
# )


class GenerationServiceServicer(object):
@@ -38,6 +44,11 @@ def add_GenerationServiceServicer_to_server(servicer, server):
request_deserializer=generation__pb2.GenerationRequest.FromString,
response_serializer=generation__pb2.GenerationResponse.SerializeToString,
),
# 'Score': grpc.unary_unary_rpc_method_handler(
# servicer.Score,
# request_deserializer=generation__pb2.ScoreRequest.FromString,
# response_serializer=generation__pb2.ScoreResponse.SerializeToString,
# ),
}
generic_handler = grpc.method_handlers_generic_handler(
'generation.GenerationService', rpc_method_handlers)
@@ -64,3 +75,19 @@ def Generate(request,
generation__pb2.GenerationResponse.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

# def Score(request,
# target,
# options=(),
# channel_credentials=None,
# call_credentials=None,
# insecure=False,
# compression=None,
# wait_for_ready=None,
# timeout=None,
# metadata=None):
# return grpc.experimental.unary_unary(request, target, '/generation.GenerationService/Score',
# generation__pb2.ScoreRequest.SerializeToString,
# generation__pb2.ScoreResponse.FromString,
# options, channel_credentials,
# insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
24 changes: 22 additions & 2 deletions inference_server/models/model.py
@@ -14,7 +14,7 @@
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.utils import is_offline_mode

-from ..utils import GenerateRequest, GenerateResponse, GenerationMixin, TokenizeRequest, TokenizeResponse, run_rank_n
+from ..utils import GenerateRequest, GenerateResponse, GenerationMixin, TokenizeRequest, TokenizeResponse, ScoreRequest, ScoreResponse, run_rank_n


class Model:
@@ -132,7 +132,27 @@ def generate(self, request: GenerateRequest) -> Union[GenerateResponse, Exceptio
def tokenize(self, request: TokenizeRequest) -> TokenizeResponse:
response = self.tokenizer(request.text, padding=request.padding)
return TokenizeResponse(token_ids=response.input_ids, attention_mask=response.attention_mask)


# Returns the tokens of the given prompt together with per-token log-probability scores.
def scores(self, request: ScoreRequest) -> ScoreResponse:
tokenizer = self.tokenizer
# reuse the EOS token for padding so no new embedding has to be added
tokenizer.pad_token = tokenizer.eos_token

inputs = tokenizer(request.text, padding=True, return_tensors="pt")
input_ids = inputs["input_ids"]

# a plain forward pass is enough; only the logits are needed
outputs = self.model(input_ids)

logits = outputs.logits.float()
# log-probabilities over the vocabulary at every position
scores = torch.log_softmax(logits, dim=-1).detach()
scores = scores.cuda()
input_ids = input_ids.cuda()
# pick the log-probability assigned to each input token at its own position
# (no shift is applied, i.e. position i uses the logits produced at position i)
scores = torch.gather(scores, 2, input_ids[:, :, None]).squeeze(-1)
# only the first sequence of the batch is returned
scores = scores.cpu().numpy()[0, :].tolist()
tokens = [tokenizer.decode([tok]) for tok in input_ids[0]]

return ScoreResponse(tokens=tokens, scores=scores)

def get_downloaded_model_path(model_name: str):
f = partial(
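The `scores` method added to `Model` above gathers, at each position, the log-probability assigned to the token at that same position. For comparison, a common convention for per-token log-likelihood under a causal LM shifts by one position, scoring token i with the logits produced at position i-1. A standalone sketch of that shifted variant, not part of this PR (`gpt2` is used only as a small illustrative model):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # small model purely for illustration
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

text = "CMU's PhD students are"
input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

with torch.no_grad():
    logits = model(input_ids).logits  # shape: (1, seq_len, vocab_size)

log_probs = torch.log_softmax(logits, dim=-1)
# logits at position i predict token i+1, so shift by one:
# the score of token t_i is log P(t_i | t_<i), defined for i >= 1
token_scores = torch.gather(
    log_probs[:, :-1, :], 2, input_ids[:, 1:, None]
).squeeze(-1)

tokens = [tokenizer.decode([t]) for t in input_ids[0]]
for tok, score in zip(tokens[1:], token_scores[0].tolist()):
    print(f"{tok!r}: {score:.3f}")
```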
20 changes: 20 additions & 0 deletions inference_server/server.py
@@ -10,6 +10,7 @@
from .utils import (
GenerateRequest,
TokenizeRequest,
ScoreRequest,
get_exception_response,
get_num_tokens_to_generate,
get_torch_dtype,
@@ -21,6 +22,7 @@
class QueryID(BaseModel):
generate_query_id: int = 0
tokenize_query_id: int = 0
score_query_id: int = 0


# placeholder class for getting args. gunicorn does not allow passing args to a
@@ -92,3 +94,21 @@ def generate():
response = get_exception_response(query_ids.generate_query_id, x.method, args.debug)
query_ids.generate_query_id += 1
return response, status.HTTP_500_INTERNAL_SERVER_ERROR

@app.route("/score/", methods=["POST"])
def score():
try:
x = request.get_json()
x = ScoreRequest(**x)

response, total_time_taken = run_and_log_time(partial(model.scores, request=x))
response.query_id = query_ids.score_query_id
query_ids.score_query_id += 1
response.total_time_taken = "{:.2f} secs".format(total_time_taken)

return response.dict(), status.HTTP_200_OK
except Exception:
response = get_exception_response(query_ids.score_query_id, args.debug)
print(response)
query_ids.score_query_id += 1
return response, status.HTTP_500_INTERNAL_SERVER_ERROR
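For completeness, a sketch of exercising the new `/score/` route over HTTP; the host and port here are placeholders and depend on how the server is actually deployed:

```python
import requests

# placeholder address; substitute the real host/port of the inference server
url = "http://localhost:5000/score/"
payload = {"text": ["CMU's PhD students are"]}

response = requests.post(url, json=payload).json()

# The fields mirror ScoreResponse: per-token strings and their scores.
for token, score in zip(response["tokens"], response["scores"]):
    print(f"{token!r}: {score:.3f}")
```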
2 changes: 2 additions & 0 deletions inference_server/utils/__init__.py
@@ -4,6 +4,8 @@
GenerateResponse,
TokenizeRequest,
TokenizeResponse,
ScoreRequest,
ScoreResponse,
create_generate_request,
get_filter_dict,
parse_bool,
9 changes: 9 additions & 0 deletions inference_server/utils/requests.py
@@ -54,6 +54,15 @@ class GenerateResponse(BaseResponse):
scores_b64: List[str] = None
hidden_states_b64: List[str] = None
method: str = "generate"

class ScoreRequest(BaseModel):
text: List[str] = None

class ScoreResponse(BaseResponse):
# input_ids: List = None
# logits: List = None
tokens: List[str] = None
scores: List[float] = None


class TokenizeRequest(BaseModel):
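A short sketch of how the `ScoreRequest` and `ScoreResponse` models defined above round-trip, assuming the defaults shown (the tokens and scores below are illustrative values only):

```python
from inference_server.utils import ScoreRequest, ScoreResponse

# build a request the same way the /score/ route does from the posted JSON
request = ScoreRequest(**{"text": ["CMU's PhD students are"]})
print(request.text)

# a response as the model handler constructs it (illustrative values)
response = ScoreResponse(tokens=["CMU", "'s"], scores=[-3.2, -1.1])
print(response.dict())
```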
22 changes: 22 additions & 0 deletions llm_client/__init__.py
@@ -99,3 +99,25 @@ def prompt(
output.hidden_states = hidden_state

return outputs

def score(self, text: Union[str, List[str]]):
request_body = {
"text": text,
}

response = requests.post(
url=f"{self.url}/score/", json=request_body, verify=False
).json()

if "error" in response:
raise ServerError(
f"Server-side Error -- {response['error']}: {response['message']}"
)

# outputs = {
# input_ids: torch.tensor(response["input_ids"]),
# logits: torch.tensor(response["logits"]),
# tokens: response["tokens"]
# }
# return response["input_ids"], response["logits"], response["tokens"]
return response["tokens"], response["scores"]