Merge pull request oobabooga#6300 from oobabooga/dev
Merge dev branch
oobabooga authored Aug 1, 2024
2 parents 498fec2 + 608545d commit d011040
Showing 23 changed files with 123 additions and 93 deletions.
css/main.css: 3 changes (2 additions, 1 deletion)
@@ -404,13 +404,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.message-body h3,
.message-body h4 {
color: var(--body-text-color);
+margin: 20px 0 10px 0;
}

.dark .message q {
color: #f5b031;
}

-.message q::before, .message q::after {
+.message-body q::before, .message-body q::after {
content: "";
}

download-model.py: 10 changes (7 additions, 3 deletions)
@@ -212,11 +212,15 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
total_size = int(r.headers.get('content-length', 0))
block_size = 1024 * 1024 # 1MB

+filename_str = str(filename) # Convert PosixPath to string if necessary
+
tqdm_kwargs = {
'total': total_size,
-'unit': 'iB',
+'unit': 'B',
'unit_scale': True,
-'bar_format': '{l_bar}{bar}| {n_fmt}/{total_fmt} {rate_fmt}'
+'unit_divisor': 1024,
+'bar_format': '{desc}{percentage:3.0f}%|{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]',
+'desc': f"{filename_str}: "
}

if 'COLAB_GPU' in os.environ:
@@ -233,7 +237,7 @@ def get_single_file(self, url, output_folder, start_from_scratch=False):
t.update(len(data))
if total_size != 0 and self.progress_bar is not None:
count += len(data)
-self.progress_bar(float(count) / float(total_size), f"{filename}")
+self.progress_bar(float(count) / float(total_size), f"{filename_str}")

break # Exit loop if successful
except (RequestException, ConnectionError, Timeout) as e:
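The tqdm change above switches the progress bar to raw bytes with binary scaling (unit='B', unit_scale=True, unit_divisor=1024) and prefixes it with the file name via desc. A minimal standalone sketch of the same pattern; the URL, chunk size, and helper name here are illustrative, not taken from the repository:

```python
import requests
from tqdm import tqdm

def download_with_progress(url: str, filename: str) -> None:
    # Stream the response so large files are never fully buffered in memory.
    with requests.get(url, stream=True, timeout=30) as r:
        r.raise_for_status()
        total_size = int(r.headers.get('content-length', 0))
        tqdm_kwargs = {
            'total': total_size,
            'unit': 'B',           # count raw bytes...
            'unit_scale': True,    # ...and display them scaled (KiB, MiB, GiB)
            'unit_divisor': 1024,  # binary scaling to match file sizes on disk
            'desc': f"{filename}: ",
        }
        with open(filename, 'wb') as f, tqdm(**tqdm_kwargs) as t:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
                t.update(len(chunk))
```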
extensions/openai/completions.py: 7 changes (3 additions, 4 deletions)
@@ -319,7 +319,6 @@ def chat_streaming_chunk(content):
yield {'prompt': prompt}
return

-token_count = len(encode(prompt)[0])
debug_msg({'prompt': prompt, 'generate_params': generate_params})

if stream:
@@ -330,7 +329,6 @@ def chat_streaming_chunk(content):

answer = ''
seen_content = ''
-completion_token_count = 0

for a in generator:
answer = a['internal'][-1][1]
@@ -345,6 +343,7 @@ def chat_streaming_chunk(content):
chunk = chat_streaming_chunk(new_content)
yield chunk

+token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
@@ -429,8 +428,6 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
prompt = decode(prompt)[0]

prefix = prompt if echo else ''
-token_count = len(encode(prompt)[0])
-total_prompt_token_count += token_count

# generate reply #######################################
debug_msg({'prompt': prompt, 'generate_params': generate_params})
@@ -440,6 +437,8 @@ def completions_common(body: dict, is_legacy: bool = False, stream=False):
for a in generator:
answer = a

+token_count = len(encode(prompt)[0])
+total_prompt_token_count += token_count
completion_token_count = len(encode(answer)[0])
total_completion_token_count += completion_token_count
stop_reason = "stop"
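The completions.py edits simply defer the prompt token count so that it is computed alongside the completion token count after generation finishes; the reported totals are unchanged. A rough sketch of that bookkeeping with a stand-in tokenizer rather than the project's own encode():

```python
def usage_stats(prompt: str, answer: str, encode) -> dict:
    # Count tokens only once the reply exists, mirroring the reordering above.
    prompt_tokens = len(encode(prompt)[0])
    completion_tokens = len(encode(answer)[0])
    return {
        'prompt_tokens': prompt_tokens,
        'completion_tokens': completion_tokens,
        'total_tokens': prompt_tokens + completion_tokens,
    }

# Toy whitespace "tokenizer" just to show the shape of the result:
toy_encode = lambda text: [text.split()]
print(usage_stats("Hello there", "General Kenobi!", toy_encode))
# {'prompt_tokens': 2, 'completion_tokens': 2, 'total_tokens': 4}
```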
js/main.js: 8 changes (7 additions, 1 deletion)
@@ -213,6 +213,7 @@ function doSyntaxHighlighting() {
renderMathInElement(element, {
delimiters: [
{ left: "$$", right: "$$", display: true },
{ left: "$", right: "$", display: false },
{ left: "\\(", right: "\\)", display: false },
{ left: "\\[", right: "\\]", display: true },
],
@@ -459,7 +460,12 @@ function updateCssProperties() {

// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
-chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
+if (!isScrolled && chatInputHeight < currentChatInputHeight) {
+chatContainer.scrollTop = chatContainer.scrollHeight;
+} else {
+chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
+}
+
currentChatInputHeight = chatInputHeight;
}
}
modules/chat.py: 20 changes (10 additions, 10 deletions)
@@ -26,8 +26,7 @@
from modules.text_generation import (
generate_reply,
get_encoded_length,
-get_max_prompt_length,
-stop_everything_event
+get_max_prompt_length
)
from modules.utils import delete_file, get_available_characters, save_file

@@ -93,8 +92,16 @@ def generate_chat_prompt(user_input, state, **kwargs):
chat_template_str = replace_character_names(chat_template_str, state['name1'], state['name2'])

instruction_template = jinja_env.from_string(state['instruction_template_str'])
-instruct_renderer = partial(instruction_template.render, add_generation_prompt=False)
chat_template = jinja_env.from_string(chat_template_str)
+
+instruct_renderer = partial(
+instruction_template.render,
+builtin_tools=None,
+tools=None,
+tools_in_user_message=False,
+add_generation_prompt=False
+)
+
chat_renderer = partial(
chat_template.render,
add_generation_prompt=False,
@@ -1036,13 +1043,6 @@ def handle_remove_last_click(state):
return [history, html, last_input]


-def handle_stop_click(state):
-stop_everything_event()
-html = redraw_html(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-
-return html
-
-
def handle_unique_id_select(state):
history = load_history(state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
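The generate_chat_prompt change pre-binds the tool-related variables that newer instruction templates (Llama 3.1-style) expect, so rendering does not fail on undefined names. A minimal sketch of the functools.partial pattern with a made-up template string; the real project renders its templates through a sandboxed Jinja2 environment:

```python
from functools import partial

from jinja2 import Environment

jinja_env = Environment()
instruction_template = jinja_env.from_string(
    "{% for m in messages %}<|{{ m['role'] }}|>{{ m['content'] }}\n{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>{% endif %}"
)

# Fix the variables that should always carry the same value, so call sites
# only have to pass `messages`.
instruct_renderer = partial(
    instruction_template.render,
    builtin_tools=None,
    tools=None,
    tools_in_user_message=False,
    add_generation_prompt=False,
)

print(instruct_renderer(messages=[{'role': 'user', 'content': 'Hi'}]))
# <|user|>Hi
```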
modules/html_generator.py: 27 changes (25 additions, 2 deletions)
@@ -72,6 +72,14 @@ def replace_blockquote(m):
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string):

+# Make \[ \] LaTeX equations inline
+pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
+replacement = r'\\[ \1 \\]'
+string = re.sub(pattern, replacement, string, flags=re.MULTILINE)
+
+# Escape backslashes
+string = string.replace('\\', '\\\\')
+
# Quote to <q></q>
string = replace_quotes(string)

@@ -95,12 +103,27 @@ def convert_to_markdown(string):

result = ''
is_code = False
+is_latex = False
for line in string.split('\n'):
-if line.lstrip(' ').startswith('```'):
+stripped_line = line.strip()
+
+if stripped_line.startswith('```'):
is_code = not is_code
+elif stripped_line.startswith('$$'):
+is_latex = not is_latex
+elif stripped_line.endswith('$$'):
+is_latex = False
+elif stripped_line.startswith('\\\\['):
+is_latex = True
+elif stripped_line.startswith('\\\\]'):
+is_latex = False
+elif stripped_line.endswith('\\\\]'):
+is_latex = False

result += line
-if is_code or line.startswith('|'): # Don't add an extra \n for tables or code
+
+# Don't add an extra \n for tables, code, or LaTeX
+if is_code or is_latex or line.startswith('|'):
result += '\n'
else:
result += '\n\n'
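The convert_to_markdown hunk adds a small line-scanning state machine: it tracks whether the current line sits inside a fenced code block or a $$ / \[ ... \] LaTeX block, and only inserts blank paragraph separators outside those regions. A condensed, standalone sketch of the idea (simplified; it omits the \[ \] handling and the rest of the real function):

```python
def respace_lines(text: str) -> str:
    result = ''
    is_code = False
    is_latex = False
    for line in text.split('\n'):
        stripped_line = line.strip()
        if stripped_line.startswith('```'):
            is_code = not is_code        # toggle on opening/closing fence
        elif stripped_line.startswith('$$'):
            is_latex = not is_latex      # toggle on display-math delimiter
        elif stripped_line.endswith('$$'):
            is_latex = False

        result += line
        # Keep single newlines inside code, LaTeX, and tables; elsewhere
        # separate lines into paragraphs.
        result += '\n' if (is_code or is_latex or line.startswith('|')) else '\n\n'

    return result

print(respace_lines("intro\n```\nx = 1\ny = 2\n```\noutro"))
```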
modules/logits.py: 4 changes (2 additions, 2 deletions)
@@ -13,8 +13,8 @@


def get_next_logits(*args, **kwargs):
-if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
-shared.model, shared.tokenizer = load_model(shared.previous_model_name)
+if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+shared.model, shared.tokenizer = load_model(shared.model_name)

needs_lock = not args[2] # use_samplers
if needs_lock:
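This is one half of a broader change in this merge: shared.previous_model_name goes away, and the idle-timeout path relies on shared.model_name surviving an idle unload (see modules/models.py and modules/shared.py below). A rough sketch of the lazy-reload guard, using placeholder shared state and a placeholder loader rather than the project's real modules:

```python
class SharedState:
    model = None
    tokenizer = None
    model_name = 'some-model'   # kept across idle unloads (placeholder name)
    idle_timeout = 10           # minutes; 0 disables idle unloading

shared = SharedState()

def load_model(name):
    # Placeholder loader; the real one dispatches to the selected backend.
    return f"<model {name}>", f"<tokenizer {name}>"

def ensure_model_loaded():
    # Reload lazily if the model was unloaded for inactivity but its name is known.
    if shared.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
        shared.model, shared.tokenizer = load_model(shared.model_name)

ensure_model_loaded()
print(shared.model)  # <model some-model>
```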
modules/models.py: 9 changes (5 additions, 4 deletions)
@@ -368,14 +368,15 @@ def clear_torch_cache():
torch.cuda.empty_cache()


-def unload_model():
+def unload_model(keep_model_name=False):
shared.model = shared.tokenizer = None
-shared.previous_model_name = shared.model_name
-shared.model_name = 'None'
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()

+if not keep_model_name:
+shared.model_name = 'None'
+

def reload_model():
unload_model()
@@ -393,7 +394,7 @@ def unload_model_if_idle():
if time.time() - last_generation_time > shared.args.idle_timeout * 60:
if shared.model is not None:
logger.info("Unloading the model for inactivity.")
-unload_model()
+unload_model(keep_model_name=True)
finally:
shared.generation_lock.release()

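This is the other half of the same change: an idle unload passes keep_model_name=True so shared.model_name survives and the guard above can restore the model, while an explicit unload still resets the name to 'None'. A minimal illustration with the same kind of placeholder state as the previous sketch:

```python
class SharedState:
    model = object()
    tokenizer = object()
    model_name = 'some-model'   # placeholder name

shared = SharedState()

def unload_model(keep_model_name=False):
    shared.model = shared.tokenizer = None
    if not keep_model_name:
        # Explicit unload (e.g. from the UI): forget the name as well.
        shared.model_name = 'None'

# The idle watchdog keeps the name so a later request can reload the same model.
unload_model(keep_model_name=True)
print(shared.model, shared.model_name)  # None some-model
```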
modules/shared.py: 3 changes (0 additions, 3 deletions)
@@ -13,7 +13,6 @@
model = None
tokenizer = None
model_name = 'None'
-previous_model_name = 'None'
is_seq2seq = False
model_dirty_from_training = False
lora_names = []
@@ -44,8 +43,6 @@
'negative_prompt': '',
'seed': -1,
'truncation_length': 2048,
-'truncation_length_min': 0,
-'truncation_length_max': 200000,
'max_tokens_second': 0,
'max_updates_second': 0,
'prompt_lookup_num_tokens': 0,
modules/text_generation.py: 4 changes (2 additions, 2 deletions)
@@ -32,8 +32,8 @@


def generate_reply(*args, **kwargs):
-if shared.args.idle_timeout > 0 and shared.model is None and shared.previous_model_name not in [None, 'None']:
-shared.model, shared.tokenizer = load_model(shared.previous_model_name)
+if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+shared.model, shared.tokenizer = load_model(shared.model_name)

shared.generation_lock.acquire()
try:
modules/training.py: 2 changes (1 addition, 1 deletion)
@@ -165,7 +165,7 @@ def create_ui():
stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')

with gr.Column():
-max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')
+max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.')

with gr.Row():
start_current_evaluation = gr.Button("Evaluate loaded model", interactive=not mu)
modules/ui_chat.py: 5 changes (3 additions, 2 deletions)
@@ -7,6 +7,7 @@

from modules import chat, shared, ui, utils
from modules.html_generator import chat_html_wrapper
+from modules.text_generation import stop_everything_event
from modules.utils import gradio

inputs = ('Chat input', 'interface_state')
@@ -221,8 +222,8 @@ def create_event_handlers():
chat.handle_remove_last_click, gradio('interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)

shared.gradio['Stop'].click(
-ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-chat.handle_stop_click, gradio('interface_state'), gradio('display'), show_progress=False)
+stop_everything_event, None, None, queue=False).then(
+chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False)

if not shared.args.multi_user:
shared.gradio['unique_id'].select(
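The rewired Stop handler calls the stop function directly and unqueued, then chains a redraw of the chat display, rather than routing through the removed handle_stop_click helper. A toy Gradio sketch of the click-then-chain pattern; the component names and callbacks here are illustrative only:

```python
import gradio as gr

def stop_generation():
    # Stands in for modules.text_generation.stop_everything_event()
    print("stop requested")

def redraw_chat():
    return "<p>(chat redrawn)</p>"

with gr.Blocks() as demo:
    display = gr.HTML()
    stop_button = gr.Button("Stop")
    # Fire the stop callback immediately (queue=False), then refresh the display.
    stop_button.click(stop_generation, None, None, queue=False).then(
        redraw_chat, None, display, show_progress=False)

# demo.launch()
```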
modules/ui_model_menu.py: 6 changes (3 additions, 3 deletions)
@@ -93,19 +93,19 @@ def create_ui():

shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be set to more than 0 for your GPU to be used.')
-shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
+shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len)
+shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. Try lowering this if you run out of memory while loading the model.')
with gr.Blocks():
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
-shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=0, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
+shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')

shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')

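The context-length widgets switch from gr.Slider, which needed the now-removed truncation_length_max setting as a hard upper bound, to gr.Number, which accepts any value. A small side-by-side sketch outside the real UI, with an arbitrary example value:

```python
import gradio as gr

with gr.Blocks() as demo:
    # Old style: a slider requires an explicit maximum, capping long-context models.
    ctx_slider = gr.Slider(label="n_ctx (old)", minimum=0, maximum=200000, step=256, value=8192)
    # New style: a plain numeric field with no ceiling; precision=0 keeps it an integer.
    ctx_number = gr.Number(label="n_ctx (new)", precision=0, step=256, value=8192)

# demo.launch()
```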
modules/ui_parameters.py: 2 changes (1 addition, 1 deletion)
@@ -89,7 +89,7 @@ def create_ui(default_preset):
shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.')

with gr.Column():
-shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
+shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')