diff --git a/backend/python/transformers/backend.py b/backend/python/transformers/backend.py
index 10603d2e78a9..6e809f28adac 100644
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -21,10 +21,7 @@
 
 XPU=os.environ.get("XPU", "0") == "1"
 
-if XPU:
-    from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
-else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
+from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -77,11 +74,11 @@ def LoadModel(self, request, context):
         """
         model_name = request.Model
 
-        compute = "auto"
+        compute = torch.float16
         if request.F16Memory == True:
             compute=torch.bfloat16
 
-        self.CUDA = request.CUDA
+        self.CUDA = torch.cuda.is_available()
         self.OV=False
 
         device_map="cpu"
@@ -89,6 +86,7 @@ def LoadModel(self, request, context):
         quantization = None
 
         if self.CUDA:
+            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
             if request.MainGPU:
                 device_map=request.MainGPU
             else:
@@ -107,7 +105,7 @@ def LoadModel(self, request, context):
                     bnb_4bit_compute_dtype = None,
                     load_in_8bit=True,
                 )
-
+
         try:
             if request.Type == "AutoModelForCausalLM":
                 if XPU:
@@ -189,6 +187,7 @@ def LoadModel(self, request, context):
                                                        device=device_map)
                 self.OV = True
             else:
+                print("Automodel", file=sys.stderr)
                 self.model = AutoModel.from_pretrained(model_name,
                                                        trust_remote_code=request.TrustRemoteCode,
                                                        use_safetensors=True,
diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt
index 5f4f4687a08c..e399f17a35f9 100644
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -3,4 +3,5 @@ transformers
 grpcio==1.64.0
 protobuf
 torch
-certifi
\ No newline at end of file
+certifi
+intel-extension-for-transformers
\ No newline at end of file
diff --git a/backend/python/transformers/run.sh b/backend/python/transformers/run.sh
index 375c07e5f426..8ea92a279aae 100755
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -1,4 +1,10 @@
 #!/bin/bash
 source $(dirname $0)/../common/libbackend.sh
 
+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
 startBackend $@
\ No newline at end of file
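
For context on the backend.py side, the patch makes two related changes: `BitsAndBytesConfig` and `AutoModelForCausalLM` are now imported lazily inside `LoadModel`, only when the CUDA path is actually taken, and CUDA itself is autodetected via `torch.cuda.is_available()` instead of being taken from the gRPC request. Below is a minimal sketch of the resulting pattern; `pick_device_and_dtype` and its device-map defaults are illustrative, not the actual backend code:

```python
import os

import torch

# Mirrors the env-var switch that run.sh exports on Intel oneAPI images.
XPU = os.environ.get("XPU", "0") == "1"


def pick_device_and_dtype(f16_memory: bool):
    """Hypothetical helper condensing the detection logic in LoadModel."""
    # CUDA is probed at runtime rather than trusted from the request.
    cuda = torch.cuda.is_available()
    # float16 is the new default compute dtype; bfloat16 when F16Memory is set.
    compute = torch.bfloat16 if f16_memory else torch.float16
    # Illustrative defaults; the real code derives the device map from
    # request.MainGPU when one is given.
    device_map = "cuda:0" if cuda else ("xpu" if XPU else "cpu")
    if cuda:
        # Deferred import, as in the patch: these classes are only needed
        # on the CUDA path, so they stay off the XPU/CPU startup path.
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig  # noqa: F401
    return device_map, compute
```

The run.sh change follows the same spirit: rather than introducing a new config flag, the presence of `/opt/intel` (a fixed path in the oneAPI base image, per the linked issue) is used as the signal to export `XPU=1`.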