diff --git a/examples/openai_audio.py b/examples/openai_audio.py
index 469775c5..85e8e81c 100644
--- a/examples/openai_audio.py
+++ b/examples/openai_audio.py
@@ -1,15 +1,22 @@
 import ell
+from pydub import AudioSegment
+import numpy as np
+
+# Helper function to load an audio file into a numpy array of samples
+def load_audio_sample(file_path):
+    audio = AudioSegment.from_file(file_path)
+    sample_array = np.array(audio.get_array_of_samples())
+    return sample_array
+
 ell.init(verbose=True)
 
 
 @ell.complex("gpt-4o-audio-preview")
 def test():
-    return [ell.user("Hey! Could you talk to me in spanish? I'd like to hear how you say 'ell'.")]
+    return [ell.user(["Hey! What do you think about this?", load_audio_sample("toronto.mp3")])]
 
-response = test()
-print(response.audios[0])
 
 
 if __name__ == "__main__":
-    test()
-
+    response = test()
+    print(response.audios[0])
diff --git a/examples/toronto.mp3 b/examples/toronto.mp3
new file mode 100644
index 00000000..3de95361
Binary files /dev/null and b/examples/toronto.mp3 differ
diff --git a/src/ell/providers/openai.py b/src/ell/providers/openai.py
index d9c5e084..0a55c8b7 100644
--- a/src/ell/providers/openai.py
+++ b/src/ell/providers/openai.py
@@ -9,7 +9,7 @@ import json
 
 from ell.configurator import _Model, config, register_provider
 from ell.types.message import LMP
-from ell.util.serialization import serialize_image
+from ell.util.serialization import array_buffer_to_base64, serialize_image
 
 try:
     # XXX: Could genericize.
@@ -122,6 +122,7 @@ def translate_from_provider(
                         logger(delta.content, is_refusal=hasattr(delta, "refusal") and delta.refusal)
 
             for _, message_stream in sorted(message_streams.items(), key=lambda x: x[0]):
                 text = "".join((choice.delta.content or "") for choice in message_stream)
+                # XXX: API might be close to something else.
                 messages.append(
                     Message(role=role, content=_lstr(content=text,origin_trace=origin_id)))
@@ -144,6 +145,7 @@ def translate_from_provider(
                     ContentBlock(
                         text=_lstr(content=content,origin_trace=origin_id)))
                 if logger: logger(content)
+            # XXX: Streaming tool calls are coming.
            if (tool_calls := message.tool_calls):
                for tool_call in tool_calls:
                    matching_tool = ell_call.get_tool_by_name(tool_call.function.name)
@@ -178,6 +180,8 @@ def _content_block_to_openai_format(content_block: ContentBlock) -> Dict[str, An
             "type": "image_url",
             "image_url": image_url
         }
+    elif (audio := content_block.audio) is not None:
+        return dict(type="input_audio", audio=array_buffer_to_base64(audio))
     elif ((text := content_block.text) is not None): return dict(type="text", text=text)
     elif (parsed := content_block.parsed): return dict(type="text", text=parsed.model_dump_json())
     else:
diff --git a/src/ell/types/message.py b/src/ell/types/message.py
index 7bc61e6a..f45a6ac5 100644
--- a/src/ell/types/message.py
+++ b/src/ell/types/message.py
@@ -252,8 +252,10 @@ def coerce(cls, content: AnyContent) -> "ContentBlock":
             return cls(tool_call=content)
         if isinstance(content, ToolResult):
             return cls(tool_result=content)
-        if isinstance(content, (ImageContent, np.ndarray, PILImage.Image)):
+        if isinstance(content, (ImageContent, PILImage.Image)) or (isinstance(content, np.ndarray) and content.ndim >= 3):
             return cls(image=ImageContent.coerce(content))
+        if isinstance(content, np.ndarray) and content.ndim == 1:
+            return cls(audio=content)
         if isinstance(content, BaseModel):
             return cls(parsed=content)
 
@@ -352,8 +354,8 @@ def images(self) -> List[ImageContent]:
         return [c.image for c in self.content if c.image]
 
     @property
-    def audios(self) -> List[Union[np.ndarray, List[float]]]:
-        """Returns a list of all audio content.
+    def audios(self) -> List[Union[np.ndarray, List[float], List[int]]]:
+        """Returns a list of the audio content from each content block.
 
         Example:
             >>> audio1 = np.array([0.1, 0.2, 0.3])
@@ -363,6 +365,20 @@
             2
         """
         return [c.audio for c in self.content if c.audio]
+
+
+    @property
+    def audio(self) -> np.ndarray:
+        """Returns all audio content concatenated into a single array.
+
+        Example:
+            >>> audio1 = np.array([0.1, 0.2, 0.3])
+            >>> message = Message(role="user", content=["Text", audio1, "More text"])
+            >>> message.audio
+            array([0.1, 0.2, 0.3])
+        """
+        return np.concatenate(self.audios)
+
 
     @property
     def text_only(self) -> str:
diff --git a/src/ell/util/serialization.py b/src/ell/util/serialization.py
index fbb53a11..512cb134 100644
--- a/src/ell/util/serialization.py
+++ b/src/ell/util/serialization.py
@@ -95,6 +95,31 @@ def compute_state_cache_key(ipstr, fn_closure):
     _free_vars_str = f"{json.dumps(get_immutable_vars(fn_closure[3]), sort_keys=True, default=repr)}"
     state_cache_key = hashlib.sha256(f"{ipstr}{_global_free_vars_str}{_free_vars_str}".encode('utf-8')).hexdigest()
     return state_cache_key
+
+
+def float_to_16bit_pcm(float32_array):
+    int16_array = (np.clip(float32_array, -1, 1) * 32767).astype(np.int16)
+    return int16_array.tobytes()
+
+def base64_to_array_buffer(base64_string):
+    return base64.b64decode(base64_string)
+
+def array_buffer_to_base64(array_buffer):
+    if isinstance(array_buffer, np.ndarray):
+        if array_buffer.dtype == np.float32:
+            array_buffer = float_to_16bit_pcm(array_buffer)
+        elif array_buffer.dtype == np.int16:
+            array_buffer = array_buffer.tobytes()
+    return base64.b64encode(array_buffer).decode('utf-8')
+
+def merge_int16_arrays(left, right):
+    if isinstance(left, bytes):
+        left = np.frombuffer(left, dtype=np.int16)
+    if isinstance(right, bytes):
+        right = np.frombuffer(right, dtype=np.int16)
+    if not isinstance(left, np.ndarray) or not isinstance(right, np.ndarray):
+        raise ValueError("Both items must be numpy arrays or bytes objects")
+    return np.concatenate((left, right))
 
 
 def prepare_invocation_params(params):
diff --git a/src/ell/util/verbosity.py b/src/ell/util/verbosity.py
index af4ca698..5038b4a4 100644
--- a/src/ell/util/verbosity.py
+++ b/src/ell/util/verbosity.py
@@ -85,6 +85,51 @@ def get_terminal_width() -> int:
         logger.warning("Unable to determine terminal size. Defaulting to 80 columns.")
         return 80
 
+import numpy as np
+def plot_ascii_waveform(audio_data: np.ndarray, width: int = 80) -> List[str]:
+    """
+    Plot a one-row ASCII waveform of the given audio data.
+
+    Args:
+        audio_data (np.ndarray): The audio data to plot.
+        width (int): The width of the ASCII plot.
+
+    Returns:
+        List[str]: A list of strings representing the ASCII waveform.
+ """ + if audio_data.ndim != 1: + raise ValueError("Audio data must be a 1D numpy array") + + # Normalize audio data to fit within the range [0, 1] + normalized_data = (audio_data - np.min(audio_data)) / (np.max(audio_data) - np.min(audio_data)) + + # Create the ASCII waveform + step = max(1, len(audio_data) // width) + + # Characters for different amplitudes + chars = ' ▁▂▃▄▅▆▇█' + + waveform = '' + for i in range(0, len(audio_data), step): + char_index = int(normalized_data[i] * (len(chars) - 1)) + waveform += chars[char_index] + + # Add top and bottom borders + border = '─' * width + waveform = [f'╭{border}╮', f'│{waveform}│', f'╰{border}╯'] + + # Add audio label + label = "Audio ContentBlock" + label_position = (width - len(label)) // 2 + waveform[0] = ( + waveform[0][:label_position] + + label + + waveform[0][label_position + len(label):] + ) + + return waveform + + def wrap_text_with_prefix(message, width: int, prefix: str, subsequent_prefix: str, text_color: str) -> List[str]: """Wrap text while preserving the prefix and color for each line.""" result = [] @@ -102,6 +147,8 @@ def wrap_text_with_prefix(message, width: int, prefix: str, subsequent_prefix: s for c in contnets_to_wrap: if c.image and c.image.image: block_wrapped_lines = plot_ascii(c.image.image, min(80, width - len(prefix))) + elif c.audio is not None: + block_wrapped_lines = plot_ascii_waveform(c.audio) else: text = _content_to_text([c]) paragraphs = text.split('\n')