From c3dafc2e99112688f22d9b9a97a007d323065987 Mon Sep 17 00:00:00 2001 From: Marut Pandya Date: Wed, 9 Oct 2024 14:35:36 -0700 Subject: [PATCH] Update README.md --- README.md | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index be9f922..188a106 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@

SgLang Worker

-🚀 | SGLang is yet another fast serving framework for large language models and vision language models. +🚀 | SGLang is a fast serving framework for large language models and vision language models. ## 📖 | Getting Started @@ -32,6 +32,45 @@ print(run_request.status()) print(run_request.output()) ``` +### OpenAI-compatible API +``` +from openai import OpenAI +import os + +# Initialize the OpenAI client with your RunPod API key and endpoint URL +client = OpenAI( + api_key=os.getenv("RUNPOD_API_KEY"), + base_url=f"https://api.runpod.ai/v2/<ENDPOINT_ID>/openai/v1", +) +``` + +`Chat Completions (Non-Streaming)` +``` +response = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[{"role": "user", "content": "Give two lines on planet Earth."}], + temperature=0, + max_tokens=100, + +) +print(f"Response: {response}") +``` + +`Chat Completions (Streaming)` +``` +response_stream = client.chat.completions.create( + model="meta-llama/Meta-Llama-3-8B-Instruct", + messages=[{"role": "user", "content": "Give two lines on planet Earth."}], + temperature=0, + max_tokens=100, + stream=True + +) +for response in response_stream: + print(response.choices[0].delta.content or "", end="", flush=True) +``` + + ## SGLang Server Configuration When launching an endpoint, you can configure the SGLang server using environment variables. These variables allow you to customize various aspects of the server's behavior without modifying the code.