From d948a4f2ee88922b2861b19eb7e7660921f7bf67 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 7 Feb 2024 09:37:59 -0500 Subject: [PATCH 01/17] Complete revamp of model loading to allow for more discreet control by the user of the models loading behavior. Signed-off-by: Adam Treat --- gpt4all-backend/llamamodel.cpp | 3 + gpt4all-backend/llmodel.h | 13 + gpt4all-chat/CMakeLists.txt | 3 + gpt4all-chat/chat.cpp | 57 ++-- gpt4all-chat/chat.h | 18 +- gpt4all-chat/chatlistmodel.h | 4 +- gpt4all-chat/chatllm.cpp | 93 ++++++- gpt4all-chat/chatllm.h | 11 +- gpt4all-chat/icons/eject.svg | 6 + gpt4all-chat/main.qml | 372 ++++++++++++++++--------- gpt4all-chat/qml/MyButton.qml | 5 +- gpt4all-chat/qml/MyMiniButton.qml | 47 ++++ gpt4all-chat/qml/SwitchModelDialog.qml | 44 +++ gpt4all-chat/qml/Theme.qml | 1 + 14 files changed, 504 insertions(+), 173 deletions(-) create mode 100644 gpt4all-chat/icons/eject.svg create mode 100644 gpt4all-chat/qml/MyMiniButton.qml create mode 100644 gpt4all-chat/qml/SwitchModelDialog.qml diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp index 5b9960fff1c1..0dd9de5d96ed 100644 --- a/gpt4all-backend/llamamodel.cpp +++ b/gpt4all-backend/llamamodel.cpp @@ -180,6 +180,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) d_ptr->model_params.use_mlock = params.use_mlock; #endif + d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback; + d_ptr->model_params.progress_callback_user_data = this; + #ifdef GGML_USE_METAL if (llama_verbose()) { std::cerr << "llama.cpp: using Metal" << std::endl; diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index 7fc5e71dc902..c3cc937c0f72 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -74,6 +74,8 @@ class LLModel { int32_t n_last_batch_tokens = 0; }; + using ProgressCallback = std::function; + explicit LLModel() {} virtual ~LLModel() {} @@ -125,6 +127,8 @@ class LLModel { virtual bool hasGPUDevice() { return false; } virtual bool usingGPUDevice() { return false; } + void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } + protected: // These are pure virtual because subclasses need to implement as the default implementation of // 'prompt' above calls these functions @@ -153,6 +157,15 @@ class LLModel { const Implementation *m_implementation = nullptr; + ProgressCallback m_progressCallback; + static bool staticProgressCallback(float progress, void* ctx) + { + LLModel* model = static_cast(ctx); + if (model && model->m_progressCallback) + return model->m_progressCallback(progress); + return true; + } + private: friend class LLMImplementation; }; diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt index ee72f8463e42..0f9d0ab0f2e9 100644 --- a/gpt4all-chat/CMakeLists.txt +++ b/gpt4all-chat/CMakeLists.txt @@ -109,6 +109,7 @@ qt_add_qml_module(chat qml/ModelSettings.qml qml/ApplicationSettings.qml qml/LocalDocsSettings.qml + qml/SwitchModelDialog.qml qml/MySettingsTab.qml qml/MySettingsStack.qml qml/MySettingsDestructiveButton.qml @@ -123,6 +124,7 @@ qt_add_qml_module(chat qml/MyTextField.qml qml/MyCheckBox.qml qml/MyBusyIndicator.qml + qml/MyMiniButton.qml qml/MyToolButton.qml RESOURCES icons/send_message.svg @@ -133,6 +135,7 @@ qt_add_qml_module(chat icons/db.svg icons/download.svg icons/settings.svg + icons/eject.svg icons/edit.svg icons/image.svg icons/trash.svg diff --git a/gpt4all-chat/chat.cpp b/gpt4all-chat/chat.cpp index 0e66c5c20bfd..8730adbcee05 100644 --- 
a/gpt4all-chat/chat.cpp +++ b/gpt4all-chat/chat.cpp @@ -23,14 +23,10 @@ Chat::Chat(bool isServer, QObject *parent) , m_id(Network::globalInstance()->generateUniqueId()) , m_name(tr("Server Chat")) , m_chatModel(new ChatModel(this)) - , m_responseInProgress(false) , m_responseState(Chat::ResponseStopped) , m_creationDate(QDateTime::currentSecsSinceEpoch()) , m_llmodel(new Server(this)) , m_isServer(true) - , m_shouldDeleteLater(false) - , m_isModelLoaded(false) - , m_shouldLoadModelWhenInstalled(false) , m_collectionModel(new LocalDocsCollectionsModel(this)) { connectLLM(); @@ -45,7 +41,7 @@ Chat::~Chat() void Chat::connectLLM() { // Should be in different threads - connect(m_llmodel, &ChatLLM::isModelLoadedChanged, this, &Chat::handleModelLoadedChanged, Qt::QueuedConnection); + connect(m_llmodel, &ChatLLM::modelLoadingPercentageChanged, this, &Chat::handleModelLoadingPercentageChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::responseChanged, this, &Chat::handleResponseChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::promptProcessing, this, &Chat::promptProcessing, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::responseStopped, this, &Chat::responseStopped, Qt::QueuedConnection); @@ -57,6 +53,7 @@ void Chat::connectLLM() connect(m_llmodel, &ChatLLM::reportFallbackReason, this, &Chat::handleFallbackReasonChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection); + connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::trySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection); connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection); connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection); @@ -69,8 +66,6 @@ void Chat::connectLLM() connect(this, &Chat::processSystemPromptRequested, m_llmodel, &ChatLLM::processSystemPrompt, Qt::QueuedConnection); connect(this, &Chat::collectionListChanged, m_collectionModel, &LocalDocsCollectionsModel::setCollections); - connect(ModelList::globalInstance()->installedModels(), &InstalledModels::countChanged, - this, &Chat::handleModelInstalled, Qt::QueuedConnection); } void Chat::reset() @@ -101,7 +96,12 @@ void Chat::processSystemPrompt() bool Chat::isModelLoaded() const { - return m_isModelLoaded; + return m_modelLoadingPercentage == 1.0f; +} + +float Chat::modelLoadingPercentage() const +{ + return m_modelLoadingPercentage; } void Chat::resetResponseState() @@ -158,16 +158,18 @@ void Chat::handleResponseChanged(const QString &response) emit responseChanged(); } -void Chat::handleModelLoadedChanged(bool loaded) +void Chat::handleModelLoadingPercentageChanged(float loadingPercentage) { if (m_shouldDeleteLater) deleteLater(); - if (loaded == m_isModelLoaded) + if (loadingPercentage == m_modelLoadingPercentage) return; - m_isModelLoaded = loaded; - emit isModelLoadedChanged(); + m_modelLoadingPercentage = loadingPercentage; + emit modelLoadingPercentageChanged(); + if (m_modelLoadingPercentage == 1.0f || m_modelLoadingPercentage == 0.0f) + emit isModelLoadedChanged(); } void Chat::promptProcessing() @@ -238,10 +240,10 @@ ModelInfo Chat::modelInfo() const void Chat::setModelInfo(const ModelInfo &modelInfo) { - if (m_modelInfo == modelInfo) + if (m_modelInfo == modelInfo && isModelLoaded()) return; - m_isModelLoaded = false; + 
m_modelLoadingPercentage = std::numeric_limits::min(); emit isModelLoadedChanged(); m_modelLoadingError = QString(); emit modelLoadingErrorChanged(); @@ -291,21 +293,26 @@ void Chat::unloadModel() void Chat::reloadModel() { - // If the installed model list is empty, then we mark a special flag and monitor for when a model - // is installed - if (!ModelList::globalInstance()->installedModels()->count()) { - m_shouldLoadModelWhenInstalled = true; - return; - } m_llmodel->setShouldBeLoaded(true); } -void Chat::handleModelInstalled() +void Chat::forceUnloadModel() { - if (!m_shouldLoadModelWhenInstalled) - return; - m_shouldLoadModelWhenInstalled = false; - reloadModel(); + stopGenerating(); + m_llmodel->setForceUnloadModel(true); + m_llmodel->setShouldBeLoaded(false); +} + +void Chat::forceReloadModel() +{ + m_llmodel->setForceUnloadModel(true); + m_llmodel->setShouldBeLoaded(true); +} + +void Chat::trySwitchContextOfLoadedModel() +{ + emit trySwitchContextOfLoadedModelAttempted(); + m_llmodel->setShouldTrySwitchContext(true); } void Chat::generatedNameChanged(const QString &name) diff --git a/gpt4all-chat/chat.h b/gpt4all-chat/chat.h index ae6910bf8f2a..cecbcbda9d39 100644 --- a/gpt4all-chat/chat.h +++ b/gpt4all-chat/chat.h @@ -17,6 +17,7 @@ class Chat : public QObject Q_PROPERTY(QString name READ name WRITE setName NOTIFY nameChanged) Q_PROPERTY(ChatModel *chatModel READ chatModel NOTIFY chatModelChanged) Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged) + Q_PROPERTY(float modelLoadingPercentage READ modelLoadingPercentage NOTIFY modelLoadingPercentageChanged) Q_PROPERTY(QString response READ response NOTIFY responseChanged) Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged) Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged) @@ -61,6 +62,7 @@ class Chat : public QObject Q_INVOKABLE void reset(); Q_INVOKABLE void processSystemPrompt(); Q_INVOKABLE bool isModelLoaded() const; + Q_INVOKABLE float modelLoadingPercentage() const; Q_INVOKABLE void prompt(const QString &prompt); Q_INVOKABLE void regenerateResponse(); Q_INVOKABLE void stopGenerating(); @@ -75,8 +77,11 @@ class Chat : public QObject void setModelInfo(const ModelInfo &modelInfo); bool isRecalc() const; - void unloadModel(); - void reloadModel(); + Q_INVOKABLE void unloadModel(); + Q_INVOKABLE void reloadModel(); + Q_INVOKABLE void forceUnloadModel(); + Q_INVOKABLE void forceReloadModel(); + Q_INVOKABLE void trySwitchContextOfLoadedModel(); void unloadAndDeleteLater(); qint64 creationDate() const { return m_creationDate; } @@ -106,6 +111,7 @@ public Q_SLOTS: void nameChanged(); void chatModelChanged(); void isModelLoadedChanged(); + void modelLoadingPercentageChanged(); void responseChanged(); void responseInProgressChanged(); void responseStateChanged(); @@ -127,10 +133,12 @@ public Q_SLOTS: void deviceChanged(); void fallbackReasonChanged(); void collectionModelChanged(); + void trySwitchContextOfLoadedModelAttempted(); + void trySwitchContextOfLoadedModelCompleted(bool); private Q_SLOTS: void handleResponseChanged(const QString &response); - void handleModelLoadedChanged(bool); + void handleModelLoadingPercentageChanged(float); void promptProcessing(); void responseStopped(); void generatedNameChanged(const QString &name); @@ -141,7 +149,6 @@ private Q_SLOTS: void handleFallbackReasonChanged(const QString &device); void handleDatabaseResultsChanged(const QList &results); void handleModelInfoChanged(const ModelInfo 
&modelInfo); - void handleModelInstalled(); private: QString m_id; @@ -163,8 +170,7 @@ private Q_SLOTS: QList m_databaseResults; bool m_isServer = false; bool m_shouldDeleteLater = false; - bool m_isModelLoaded = false; - bool m_shouldLoadModelWhenInstalled = false; + float m_modelLoadingPercentage = 0.0f; LocalDocsCollectionsModel *m_collectionModel; }; diff --git a/gpt4all-chat/chatlistmodel.h b/gpt4all-chat/chatlistmodel.h index 3f99c622894e..ed04cc7a4476 100644 --- a/gpt4all-chat/chatlistmodel.h +++ b/gpt4all-chat/chatlistmodel.h @@ -179,9 +179,9 @@ class ChatListModel : public QAbstractListModel if (m_currentChat && m_currentChat != m_serverChat) m_currentChat->unloadModel(); m_currentChat = chat; - if (!m_currentChat->isModelLoaded() && m_currentChat != m_serverChat) - m_currentChat->reloadModel(); emit currentChatChanged(); + if (!m_currentChat->isModelLoaded() && m_currentChat != m_serverChat) + m_currentChat->trySwitchContextOfLoadedModel(); } Q_INVOKABLE Chat* get(int index) diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index 844942e44399..4b456e3464a0 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -62,7 +62,9 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer) , m_promptResponseTokens(0) , m_promptTokens(0) , m_isRecalc(false) - , m_shouldBeLoaded(true) + , m_shouldBeLoaded(false) + , m_forceUnloadModel(false) + , m_shouldTrySwitchContext(false) , m_stopGenerating(false) , m_timer(nullptr) , m_isServer(isServer) @@ -76,6 +78,8 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer) connect(this, &ChatLLM::sendModelLoaded, Network::globalInstance(), &Network::sendModelLoaded); connect(this, &ChatLLM::shouldBeLoadedChanged, this, &ChatLLM::handleShouldBeLoadedChanged, Qt::QueuedConnection); // explicitly queued + connect(this, &ChatLLM::shouldTrySwitchContextChanged, this, &ChatLLM::handleShouldTrySwitchContextChanged, + Qt::QueuedConnection); // explicitly queued connect(parent, &Chat::idChanged, this, &ChatLLM::handleChatIdChanged); connect(&m_llmThread, &QThread::started, this, &ChatLLM::handleThreadStarted); connect(MySettings::globalInstance(), &MySettings::forceMetalChanged, this, &ChatLLM::handleForceMetalChanged); @@ -143,6 +147,54 @@ bool ChatLLM::loadDefaultModel() return loadModel(defaultModel); } +bool ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo) +{ + // We're trying to see if the store already has the model fully loaded that we wish to use + // and if so we just acquire it from the store and switch the context and return true. If the + // store doesn't have it or we're already loaded or in any other case just return false. 
+ + // If we're already loaded or a server or we're reloading to change the variant/device or the + // modelInfo is empty, then this should fail + if (isModelLoaded() || m_isServer || m_reloadingToChangeVariant || modelInfo.name().isEmpty()) { + m_shouldTrySwitchContext = false; + emit trySwitchContextOfLoadedModelCompleted(false); + return false; + } + + QString filePath = modelInfo.dirpath + modelInfo.filename(); + QFileInfo fileInfo(filePath); + + m_llModelInfo = LLModelStore::globalInstance()->acquireModel(); +#if defined(DEBUG_MODEL_LOADING) + qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model; +#endif + + // The store gave us no already loaded model, the wrong type of model, then give it back to the + // store and fail + if (!m_llModelInfo.model || m_llModelInfo.fileInfo != fileInfo) { + LLModelStore::globalInstance()->releaseModel(m_llModelInfo); + m_llModelInfo = LLModelInfo(); + m_shouldTrySwitchContext = false; + emit trySwitchContextOfLoadedModelCompleted(false); + return false; + } + +#if defined(DEBUG_MODEL_LOADING) + qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model; +#endif + + // We should be loaded and now we are + m_shouldBeLoaded = true; + m_shouldTrySwitchContext = false; + + // Restore, signal and process + restoreState(); + emit modelLoadingPercentageChanged(1.0f); + emit trySwitchContextOfLoadedModelCompleted(true); + processSystemPrompt(); + return true; +} + bool ChatLLM::loadModel(const ModelInfo &modelInfo) { // This is a complicated method because N different possible threads are interested in the outcome @@ -170,7 +222,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) #endif delete m_llModelInfo.model; m_llModelInfo.model = nullptr; - emit isModelLoadedChanged(false); + emit modelLoadingPercentageChanged(std::numeric_limits::min()); } else if (!m_isServer) { // This is a blocking call that tries to retrieve the model we need from the model store. // If it succeeds, then we just have to restore state. If the store has never had a model @@ -188,7 +240,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) #endif LLModelStore::globalInstance()->releaseModel(m_llModelInfo); m_llModelInfo = LLModelInfo(); - emit isModelLoadedChanged(false); + emit modelLoadingPercentageChanged(0.0f); return false; } @@ -198,7 +250,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) qDebug() << "store had our model" << m_llmThread.objectName() << m_llModelInfo.model; #endif restoreState(); - emit isModelLoadedChanged(true); + emit modelLoadingPercentageChanged(1.0f); setModelInfo(modelInfo); Q_ASSERT(!m_modelInfo.filename().isEmpty()); if (m_modelInfo.filename().isEmpty()) @@ -261,6 +313,12 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) m_llModelInfo.model = LLModel::Implementation::construct(filePath.toStdString(), buildVariant, n_ctx); if (m_llModelInfo.model) { + + m_llModelInfo.model->setProgressCallback([this](float progress) -> bool { + emit modelLoadingPercentageChanged(progress); + return m_shouldBeLoaded; + }); + // Update the settings that a model is being loaded and update the device list MySettings::globalInstance()->setAttemptModelLoad(filePath); @@ -354,7 +412,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) qDebug() << "modelLoadedChanged" << m_llmThread.objectName(); fflush(stdout); #endif - emit isModelLoadedChanged(isModelLoaded()); + emit modelLoadingPercentageChanged(isModelLoaded() ? 
1.0f : 0.0f); static bool isFirstLoad = true; if (isFirstLoad) { @@ -456,6 +514,7 @@ void ChatLLM::setModelInfo(const ModelInfo &modelInfo) void ChatLLM::modelChangeRequested(const ModelInfo &modelInfo) { + m_shouldBeLoaded = true; loadModel(modelInfo); } @@ -598,6 +657,12 @@ void ChatLLM::setShouldBeLoaded(bool b) emit shouldBeLoadedChanged(); } +void ChatLLM::setShouldTrySwitchContext(bool b) +{ + m_shouldTrySwitchContext = b; // atomic + emit shouldTrySwitchContextChanged(); +} + void ChatLLM::handleShouldBeLoadedChanged() { if (m_shouldBeLoaded) @@ -606,10 +671,10 @@ void ChatLLM::handleShouldBeLoadedChanged() unloadModel(); } -void ChatLLM::forceUnloadModel() +void ChatLLM::handleShouldTrySwitchContextChanged() { - m_shouldBeLoaded = false; // atomic - unloadModel(); + if (m_shouldTrySwitchContext) + trySwitchContextOfLoadedModel(modelInfo()); } void ChatLLM::unloadModel() @@ -617,17 +682,27 @@ void ChatLLM::unloadModel() if (!isModelLoaded() || m_isServer) return; + emit modelLoadingPercentageChanged(0.0f); saveState(); #if defined(DEBUG_MODEL_LOADING) qDebug() << "unloadModel" << m_llmThread.objectName() << m_llModelInfo.model; #endif + + if (m_forceUnloadModel) { + delete m_llModelInfo.model; + m_llModelInfo.model = nullptr; + m_forceUnloadModel = false; + } + LLModelStore::globalInstance()->releaseModel(m_llModelInfo); m_llModelInfo = LLModelInfo(); - emit isModelLoadedChanged(false); } void ChatLLM::reloadModel() { + if (isModelLoaded() && m_forceUnloadModel) + unloadModel(); // we unload first if we are forcing an unload + if (isModelLoaded() || m_isServer) return; diff --git a/gpt4all-chat/chatllm.h b/gpt4all-chat/chatllm.h index d6af4cb0c427..278e79cc0b82 100644 --- a/gpt4all-chat/chatllm.h +++ b/gpt4all-chat/chatllm.h @@ -81,6 +81,8 @@ class ChatLLM : public QObject bool shouldBeLoaded() const { return m_shouldBeLoaded; } void setShouldBeLoaded(bool b); + void setShouldTrySwitchContext(bool b); + void setForceUnloadModel(bool b) { m_forceUnloadModel = b; } QString response() const; @@ -98,14 +100,15 @@ class ChatLLM : public QObject public Q_SLOTS: bool prompt(const QList &collectionList, const QString &prompt); bool loadDefaultModel(); + bool trySwitchContextOfLoadedModel(const ModelInfo &modelInfo); bool loadModel(const ModelInfo &modelInfo); void modelChangeRequested(const ModelInfo &modelInfo); - void forceUnloadModel(); void unloadModel(); void reloadModel(); void generateName(); void handleChatIdChanged(const QString &id); void handleShouldBeLoadedChanged(); + void handleShouldTrySwitchContextChanged(); void handleThreadStarted(); void handleForceMetalChanged(bool forceMetal); void handleDeviceChanged(); @@ -114,7 +117,7 @@ public Q_SLOTS: Q_SIGNALS: void recalcChanged(); - void isModelLoadedChanged(bool); + void modelLoadingPercentageChanged(float); void modelLoadingError(const QString &error); void responseChanged(const QString &response); void promptProcessing(); @@ -125,6 +128,8 @@ public Q_SLOTS: void stateChanged(); void threadStarted(); void shouldBeLoadedChanged(); + void shouldTrySwitchContextChanged(); + void trySwitchContextOfLoadedModelCompleted(bool); void requestRetrieveFromDB(const QList &collections, const QString &text, int retrievalSize, QList *results); void reportSpeed(const QString &speed); void reportDevice(const QString &device); @@ -167,7 +172,9 @@ public Q_SLOTS: QThread m_llmThread; std::atomic m_stopGenerating; std::atomic m_shouldBeLoaded; + std::atomic m_shouldTrySwitchContext; std::atomic m_isRecalc; + std::atomic m_forceUnloadModel; bool 
m_isServer; bool m_forceMetal; bool m_reloadingToChangeVariant; diff --git a/gpt4all-chat/icons/eject.svg b/gpt4all-chat/icons/eject.svg new file mode 100644 index 000000000000..9649c4876d5b --- /dev/null +++ b/gpt4all-chat/icons/eject.svg @@ -0,0 +1,6 @@ + + + diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index 72fbc3b8e19a..66104e37e754 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -126,6 +126,10 @@ Window { } } + function currentModelName() { + return ModelList.modelInfo(currentChat.modelInfo.id).name; + } + PopupDialog { id: errorCompatHardware anchors.centerIn: parent @@ -282,6 +286,18 @@ Window { } } + SwitchModelDialog { + id: switchModelDialog + anchors.centerIn: parent + width: Math.min(1024, window.width - (window.width * .2)) + height: Math.min(600, window.height - (window.height * .2)) + Item { + Accessible.role: Accessible.Dialog + Accessible.name: qsTr("Switch model dialog") + Accessible.description: qsTr("Warn the user if they switch models, then context will be erased") + } + } + Rectangle { id: header anchors.left: parent.left @@ -292,7 +308,9 @@ Window { Item { anchors.centerIn: parent height: childrenRect.height - visible: currentChat.isModelLoaded || currentChat.modelLoadingError !== "" || currentChat.isServer + visible: true + || currentChat.modelLoadingError !== "" + || currentChat.isServer Label { id: modelLabel @@ -306,102 +324,168 @@ Window { horizontalAlignment: TextInput.AlignRight } - MyComboBox { - id: comboBox - implicitWidth: 375 - width: window.width >= 750 ? implicitWidth : implicitWidth - ((750 - window.width)) + RowLayout { + id: comboLayout anchors.top: modelLabel.top anchors.bottom: modelLabel.bottom anchors.horizontalCenter: parent.horizontalCenter anchors.horizontalCenterOffset: window.width >= 950 ? 0 : Math.max(-((950 - window.width) / 2), -99.5) - enabled: !currentChat.isServer - model: ModelList.installedModels - valueRole: "id" - textRole: "name" - property string currentModelName: "" - function updateCurrentModelName() { - var info = ModelList.modelInfo(currentChat.modelInfo.id); - comboBox.currentModelName = info.name; - } - Connections { - target: currentChat - function onModelInfoChanged() { - comboBox.updateCurrentModelName(); + spacing: 20 + + MyComboBox { + id: comboBox + Layout.fillWidth: true + Layout.fillHeight: true + implicitWidth: 575 + width: window.width >= 750 ? 
implicitWidth : implicitWidth - ((750 - window.width)) + enabled: !currentChat.isServer + model: ModelList.installedModels + valueRole: "id" + textRole: "name" + property bool isCurrentlyLoading: false + property real modelLoadingPercentage: 0.0 + property bool trySwitchContextInProgress: false + + function changeModel(index) { + comboBox.modelLoadingPercentage = 0.0; + comboBox.isCurrentlyLoading = true; + currentChat.stopGenerating() + currentChat.reset(); + currentChat.modelInfo = ModelList.modelInfo(comboBox.valueAt(index)) } - } - Connections { - target: window - function onCurrentChatChanged() { - comboBox.updateCurrentModelName(); + + Connections { + target: currentChat + function onModelLoadingPercentageChanged() { + comboBox.modelLoadingPercentage = currentChat.modelLoadingPercentage; + comboBox.isCurrentlyLoading = currentChat.modelLoadingPercentage !== 0.0 + && currentChat.modelLoadingPercentage !== 1.0; + } + function onTrySwitchContextOfLoadedModelAttempted() { + comboBox.trySwitchContextInProgress = true; + } + function onTrySwitchContextOfLoadedModelCompleted() { + comboBox.trySwitchContextInProgress = false; + } + } + Connections { + target: switchModelDialog + function onAccepted() { + comboBox.changeModel(switchModelDialog.index) + } + } + + background: ProgressBar { + id: modelProgress + value: comboBox.modelLoadingPercentage + background: Rectangle { + color: theme.mainComboBackground + radius: 10 + } + contentItem: Item { + Rectangle { + visible: comboBox.isCurrentlyLoading + anchors.bottom: parent.bottom + width: modelProgress.visualPosition * parent.width + height: 10 + radius: 2 + color: theme.progressForeground + } + } } - } - background: Rectangle { - color: theme.mainComboBackground - radius: 10 - } - contentItem: Text { - anchors.horizontalCenter: parent.horizontalCenter - leftPadding: 10 - rightPadding: 20 - text: currentChat.modelLoadingError !== "" - ? qsTr("Model loading error...") - : comboBox.currentModelName - font.pixelSize: theme.fontSizeLarger - color: theme.white - verticalAlignment: Text.AlignVCenter - horizontalAlignment: Text.AlignHCenter - elide: Text.ElideRight - } - delegate: ItemDelegate { - width: comboBox.width contentItem: Text { - text: name - color: theme.textColor - font: comboBox.font - elide: Text.ElideRight + anchors.horizontalCenter: parent.horizontalCenter + leftPadding: 10 + rightPadding: 20 + text: { + if (currentChat.modelLoadingError !== "") + return qsTr("Model loading error...") + if (comboBox.trySwitchContextInProgress) + return qsTr("Switching context...") + if (currentModelName() === "") + return qsTr("Choose a model...") + if (currentChat.modelLoadingPercentage === 0.0) + return qsTr("Reload \u00B7 ") + currentModelName() + if (comboBox.isCurrentlyLoading) + return qsTr("Loading \u00B7 ") + currentModelName() + return currentModelName() + } + font.pixelSize: theme.fontSizeLarger + color: theme.white verticalAlignment: Text.AlignVCenter + horizontalAlignment: Text.AlignHCenter + elide: Text.ElideRight + } + delegate: ItemDelegate { + id: comboItemDelegate + width: comboBox.width + contentItem: Text { + text: name + color: theme.textColor + font: comboBox.font + elide: Text.ElideRight + verticalAlignment: Text.AlignVCenter + } + background: Rectangle { + color: (index % 2 === 0 ? theme.darkContrast : theme.lightContrast) + border.width: highlighted + border.color: theme.accentColor + } + highlighted: comboBox.highlightedIndex === index } - background: Rectangle { - color: (index % 2 === 0 ? 
theme.darkContrast : theme.lightContrast) - border.width: highlighted - border.color: theme.accentColor + Accessible.role: Accessible.ComboBox + Accessible.name: currentModelName() + Accessible.description: qsTr("The top item is the current model") + onActivated: function (index) { + var newInfo = ModelList.modelInfo(comboBox.valueAt(index)); + if (currentModelName() !== "" + && newInfo !== currentChat.modelInfo + && chatModel.count !== 0) { + switchModelDialog.index = index; + switchModelDialog.open(); + } else { + comboBox.changeModel(index); + } } - highlighted: comboBox.highlightedIndex === index - } - Accessible.role: Accessible.ComboBox - Accessible.name: comboBox.currentModelName - Accessible.description: qsTr("The top item is the current model") - onActivated: function (index) { - currentChat.stopGenerating() - currentChat.reset(); - currentChat.modelInfo = ModelList.modelInfo(comboBox.valueAt(index)) - } - } - } - Item { - anchors.centerIn: parent - visible: ModelList.installedModels.count - && !currentChat.isModelLoaded - && currentChat.modelLoadingError === "" - && !currentChat.isServer - width: childrenRect.width - height: childrenRect.height - Row { - spacing: 5 - MyBusyIndicator { - anchors.verticalCenter: parent.verticalCenter - running: parent.visible - Accessible.role: Accessible.Animation - Accessible.name: qsTr("Busy indicator") - Accessible.description: qsTr("loading model...") - } + MyMiniButton { + id: ejectButton + visible: currentChat.isModelLoaded + z: 500 + anchors.right: parent.right + anchors.rightMargin: 50 + anchors.verticalCenter: parent.verticalCenter + source: "qrc:/gpt4all/icons/eject.svg" + backgroundColor: theme.gray300 + backgroundColorHovered: theme.iconBackgroundLight + onClicked: { + currentChat.forceUnloadModel(); + } + ToolTip.text: qsTr("Eject the currently loaded model") + ToolTip.visible: hovered + } - Label { - anchors.verticalCenter: parent.verticalCenter - text: qsTr("Loading model...") - font.pixelSize: theme.fontSizeLarge - color: theme.oppositeTextColor + MyMiniButton { + id: reloadButton + visible: currentChat.modelLoadingError === "" + && !comboBox.trySwitchContextInProgress + && (currentChat.isModelLoaded || currentModelName() !== "") + z: 500 + anchors.right: ejectButton.visible ? ejectButton.left : parent.right + anchors.rightMargin: ejectButton.visible ? 10 : 50 + anchors.verticalCenter: parent.verticalCenter + source: "qrc:/gpt4all/icons/regenerate.svg" + backgroundColor: theme.gray300 + backgroundColorHovered: theme.iconBackgroundLight + onClicked: { + if (currentChat.isModelLoaded) + currentChat.forceReloadModel(); + else + currentChat.reloadModel(); + } + ToolTip.text: qsTr("Reload the currently loaded model") + ToolTip.visible: hovered + } } } } @@ -790,9 +874,9 @@ Window { Rectangle { id: homePage - color: "transparent"//theme.green200 + color: "transparent" anchors.fill: parent - visible: (ModelList.installedModels.count === 0 || chatModel.count === 0) && !currentChat.isServer + visible: !currentChat.isModelLoaded && (ModelList.installedModels.count === 0 || currentModelName() === "") && !currentChat.isServer ColumnLayout { anchors.centerIn: parent @@ -1138,50 +1222,84 @@ Window { } } - MyButton { - id: myButton - visible: chatModel.count && !currentChat.isServer - textColor: theme.textColor - Image { - anchors.verticalCenter: parent.verticalCenter - anchors.left: parent.left - anchors.leftMargin: 15 - source: currentChat.responseInProgress ? 
"qrc:/gpt4all/icons/stop_generating.svg" : "qrc:/gpt4all/icons/regenerate.svg" - } - leftPadding: 50 - onClicked: { - var index = Math.max(0, chatModel.count - 1); - var listElement = chatModel.get(index); - - if (currentChat.responseInProgress) { - listElement.stopped = true - currentChat.stopGenerating() - } else { - currentChat.regenerateResponse() - if (chatModel.count) { - if (listElement.name === qsTr("Response: ")) { - chatModel.updateCurrentResponse(index, true); - chatModel.updateStopped(index, false); - chatModel.updateThumbsUpState(index, false); - chatModel.updateThumbsDownState(index, false); - chatModel.updateNewResponse(index, ""); - currentChat.prompt(listElement.prompt) + RowLayout { + anchors.bottom: textInputView.top + anchors.horizontalCenter: textInputView.horizontalCenter + anchors.bottomMargin: 20 + spacing: 10 + MyButton { + textColor: theme.textColor + visible: chatModel.count && !currentChat.isServer && currentChat.isModelLoaded + Image { + anchors.verticalCenter: parent.verticalCenter + anchors.left: parent.left + anchors.leftMargin: 15 + source: currentChat.responseInProgress ? "qrc:/gpt4all/icons/stop_generating.svg" : "qrc:/gpt4all/icons/regenerate.svg" + } + leftPadding: 50 + onClicked: { + var index = Math.max(0, chatModel.count - 1); + var listElement = chatModel.get(index); + + if (currentChat.responseInProgress) { + listElement.stopped = true + currentChat.stopGenerating() + } else { + currentChat.regenerateResponse() + if (chatModel.count) { + if (listElement.name === qsTr("Response: ")) { + chatModel.updateCurrentResponse(index, true); + chatModel.updateStopped(index, false); + chatModel.updateThumbsUpState(index, false); + chatModel.updateThumbsDownState(index, false); + chatModel.updateNewResponse(index, ""); + currentChat.prompt(listElement.prompt) + } } } } + + borderWidth: 1 + backgroundColor: theme.conversationButtonBackground + backgroundColorHovered: theme.conversationButtonBackgroundHovered + backgroundRadius: 5 + padding: 15 + topPadding: 4 + bottomPadding: 4 + text: currentChat.responseInProgress ? qsTr("Stop generating") : qsTr("Regenerate response") + fontPixelSize: theme.fontSizeSmaller + Accessible.description: qsTr("Controls generation of the response") } - background: Rectangle { - border.color: theme.conversationButtonBorder - border.width: 2 - radius: 10 - color: myButton.hovered ? theme.conversationButtonBackgroundHovered : theme.conversationButtonBackground + + MyButton { + textColor: theme.textColor + visible: chatModel.count + && !currentChat.isServer + && !currentChat.isModelLoaded + && currentChat.modelLoadingPercentage === 0.0 + && currentChat.modelInfo.name !== "" + Image { + anchors.verticalCenter: parent.verticalCenter + anchors.left: parent.left + anchors.leftMargin: 15 + source: "qrc:/gpt4all/icons/regenerate.svg" + } + leftPadding: 50 + onClicked: { + currentChat.reloadModel(); + } + + borderWidth: 1 + backgroundColor: theme.conversationButtonBackground + backgroundColorHovered: theme.conversationButtonBackgroundHovered + backgroundRadius: 5 + padding: 15 + topPadding: 4 + bottomPadding: 4 + text: qsTr("Reload \u00B7 ") + currentChat.modelInfo.name + fontPixelSize: theme.fontSizeSmaller + Accessible.description: qsTr("Reloads the model") } - anchors.bottom: textInputView.top - anchors.horizontalCenter: textInputView.horizontalCenter - anchors.bottomMargin: 20 - padding: 15 - text: currentChat.responseInProgress ? 
qsTr("Stop generating") : qsTr("Regenerate response") - Accessible.description: qsTr("Controls generation of the response") } Text { @@ -1224,7 +1342,7 @@ Window { rightPadding: 40 enabled: currentChat.isModelLoaded && !currentChat.isServer font.pixelSize: theme.fontSizeLarger - placeholderText: qsTr("Send a message...") + placeholderText: currentChat.isModelLoaded ? qsTr("Send a message...") : qsTr("Load a model to continue...") Accessible.role: Accessible.EditableText Accessible.name: placeholderText Accessible.description: qsTr("Send messages/prompts to the model") diff --git a/gpt4all-chat/qml/MyButton.qml b/gpt4all-chat/qml/MyButton.qml index d79c275b1e1b..6f14f9d37258 100644 --- a/gpt4all-chat/qml/MyButton.qml +++ b/gpt4all-chat/qml/MyButton.qml @@ -13,9 +13,10 @@ Button { property color mutedTextColor: theme.oppositeMutedTextColor property color backgroundColor: theme.buttonBackground property color backgroundColorHovered: theme.buttonBackgroundHovered + property real backgroundRadius: 10 property real borderWidth: MySettings.chatTheme === "LegacyDark" ? 1 : 0 property color borderColor: theme.buttonBorder - property real fontPixelSize: theme.fontSizeLarge + property real fontPixelSize: theme.fontSizeLarge contentItem: Text { text: myButton.text horizontalAlignment: Text.AlignHCenter @@ -25,7 +26,7 @@ Button { Accessible.name: text } background: Rectangle { - radius: 10 + radius: myButton.backgroundRadius border.width: myButton.borderWidth border.color: myButton.borderColor color: myButton.hovered ? backgroundColorHovered : backgroundColor diff --git a/gpt4all-chat/qml/MyMiniButton.qml b/gpt4all-chat/qml/MyMiniButton.qml new file mode 100644 index 000000000000..d5e5571aa420 --- /dev/null +++ b/gpt4all-chat/qml/MyMiniButton.qml @@ -0,0 +1,47 @@ +import QtCore +import QtQuick +import QtQuick.Controls +import QtQuick.Controls.Basic +import Qt5Compat.GraphicalEffects + +Button { + id: myButton + padding: 0 + property color backgroundColor: theme.iconBackgroundDark + property color backgroundColorHovered: theme.iconBackgroundHovered + property alias source: image.source + property alias fillMode: image.fillMode + width: 30 + height: 30 + contentItem: Text { + text: myButton.text + horizontalAlignment: Text.AlignHCenter + color: myButton.enabled ? theme.textColor : theme.mutedTextColor + font.pixelSize: theme.fontSizeLarge + Accessible.role: Accessible.Button + Accessible.name: text + } + + background: Item { + anchors.fill: parent + Rectangle { + anchors.fill: parent + color: "transparent" + } + Image { + id: image + anchors.centerIn: parent + mipmap: true + width: 20 + height: 20 + } + ColorOverlay { + anchors.fill: image + source: image + color: myButton.hovered ? 
backgroundColorHovered : backgroundColor + } + } + Accessible.role: Accessible.Button + Accessible.name: text + ToolTip.delay: Qt.styleHints.mousePressAndHoldInterval +} diff --git a/gpt4all-chat/qml/SwitchModelDialog.qml b/gpt4all-chat/qml/SwitchModelDialog.qml new file mode 100644 index 000000000000..54dfbe60ac02 --- /dev/null +++ b/gpt4all-chat/qml/SwitchModelDialog.qml @@ -0,0 +1,44 @@ +import QtCore +import QtQuick +import QtQuick.Controls +import QtQuick.Controls.Basic +import QtQuick.Layouts +import llm +import mysettings + +MyDialog { + id: switchModelDialog + anchors.centerIn: parent + modal: true + padding: 20 + property int index: -1 + + Theme { + id: theme + } + + Column { + id: column + spacing: 20 + } + + footer: DialogButtonBox { + id: dialogBox + padding: 20 + alignment: Qt.AlignRight + spacing: 10 + MySettingsButton { + text: qsTr("Continue") + Accessible.description: qsTr("Continue with model loading") + DialogButtonBox.buttonRole: DialogButtonBox.AcceptRole + } + MySettingsButton { + text: qsTr("Cancel") + Accessible.description: qsTr("Cancel") + DialogButtonBox.buttonRole: DialogButtonBox.RejectRole + } + background: Rectangle { + color: "transparent" + } + } +} diff --git a/gpt4all-chat/qml/Theme.qml b/gpt4all-chat/qml/Theme.qml index 49f8343cbc82..2b8c9733ebfe 100644 --- a/gpt4all-chat/qml/Theme.qml +++ b/gpt4all-chat/qml/Theme.qml @@ -555,6 +555,7 @@ QtObject { property real fontSizeFixedSmall: 16 property real fontSize: Qt.application.font.pixelSize + property real fontSizeSmaller: fontSizeLarge - 4 property real fontSizeSmall: fontSizeLarge - 2 property real fontSizeLarge: MySettings.fontSize === "Small" ? fontSize : MySettings.fontSize === "Medium" ? From ed0f93977da3d5e3b92dc5516d913443b0702acb Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 19 Feb 2024 10:37:03 -0500 Subject: [PATCH 02/17] Fixes for issues identified in review. Signed-off-by: Adam Treat --- gpt4all-chat/main.qml | 55 +++++++++++++------------- gpt4all-chat/qml/ModelSettings.qml | 6 +-- gpt4all-chat/qml/SwitchModelDialog.qml | 8 ++-- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index 66104e37e754..a12f2666521a 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -130,6 +130,10 @@ Window { return ModelList.modelInfo(currentChat.modelInfo.id).name; } + property bool isCurrentlyLoading: false + property real modelLoadingPercentage: 0.0 + property bool trySwitchContextInProgress: false + PopupDialog { id: errorCompatHardware anchors.centerIn: parent @@ -289,8 +293,6 @@ Window { SwitchModelDialog { id: switchModelDialog anchors.centerIn: parent - width: Math.min(1024, window.width - (window.width * .2)) - height: Math.min(600, window.height - (window.height * .2)) Item { Accessible.role: Accessible.Dialog Accessible.name: qsTr("Switch model dialog") @@ -309,8 +311,6 @@ Window { anchors.centerIn: parent height: childrenRect.height visible: true - || currentChat.modelLoadingError !== "" - || currentChat.isServer Label { id: modelLabel @@ -337,18 +337,17 @@ Window { Layout.fillWidth: true Layout.fillHeight: true implicitWidth: 575 - width: window.width >= 750 ? implicitWidth : implicitWidth - ((750 - window.width)) + width: window.width >= 750 ? 
implicitWidth : implicitWidth - (750 - window.width) enabled: !currentChat.isServer + && !window.trySwitchContextInProgress + && !window.isCurrentlyLoading model: ModelList.installedModels valueRole: "id" textRole: "name" - property bool isCurrentlyLoading: false - property real modelLoadingPercentage: 0.0 - property bool trySwitchContextInProgress: false function changeModel(index) { - comboBox.modelLoadingPercentage = 0.0; - comboBox.isCurrentlyLoading = true; + window.modelLoadingPercentage = 0.0; + window.isCurrentlyLoading = true; currentChat.stopGenerating() currentChat.reset(); currentChat.modelInfo = ModelList.modelInfo(comboBox.valueAt(index)) @@ -357,15 +356,15 @@ Window { Connections { target: currentChat function onModelLoadingPercentageChanged() { - comboBox.modelLoadingPercentage = currentChat.modelLoadingPercentage; - comboBox.isCurrentlyLoading = currentChat.modelLoadingPercentage !== 0.0 + window.modelLoadingPercentage = currentChat.modelLoadingPercentage; + window.isCurrentlyLoading = currentChat.modelLoadingPercentage !== 0.0 && currentChat.modelLoadingPercentage !== 1.0; } function onTrySwitchContextOfLoadedModelAttempted() { - comboBox.trySwitchContextInProgress = true; + window.trySwitchContextInProgress = true; } function onTrySwitchContextOfLoadedModelCompleted() { - comboBox.trySwitchContextInProgress = false; + window.trySwitchContextInProgress = false; } } Connections { @@ -377,14 +376,14 @@ Window { background: ProgressBar { id: modelProgress - value: comboBox.modelLoadingPercentage + value: window.modelLoadingPercentage background: Rectangle { color: theme.mainComboBackground radius: 10 } contentItem: Item { Rectangle { - visible: comboBox.isCurrentlyLoading + visible: window.isCurrentlyLoading anchors.bottom: parent.bottom width: modelProgress.visualPosition * parent.width height: 10 @@ -400,13 +399,13 @@ Window { text: { if (currentChat.modelLoadingError !== "") return qsTr("Model loading error...") - if (comboBox.trySwitchContextInProgress) + if (window.trySwitchContextInProgress) return qsTr("Switching context...") if (currentModelName() === "") return qsTr("Choose a model...") if (currentChat.modelLoadingPercentage === 0.0) return qsTr("Reload \u00B7 ") + currentModelName() - if (comboBox.isCurrentlyLoading) + if (window.isCurrentlyLoading) return qsTr("Loading \u00B7 ") + currentModelName() return currentModelName() } @@ -468,7 +467,8 @@ Window { MyMiniButton { id: reloadButton visible: currentChat.modelLoadingError === "" - && !comboBox.trySwitchContextInProgress + && !window.trySwitchContextInProgress + && !window.isCurrentlyLoading && (currentChat.isModelLoaded || currentModelName() !== "") z: 500 anchors.right: ejectButton.visible ? ejectButton.left : parent.right @@ -1264,8 +1264,8 @@ Window { backgroundColorHovered: theme.conversationButtonBackgroundHovered backgroundRadius: 5 padding: 15 - topPadding: 4 - bottomPadding: 4 + topPadding: 8 + bottomPadding: 8 text: currentChat.responseInProgress ? 
qsTr("Stop generating") : qsTr("Regenerate response") fontPixelSize: theme.fontSizeSmaller Accessible.description: qsTr("Controls generation of the response") @@ -1273,11 +1273,12 @@ Window { MyButton { textColor: theme.textColor - visible: chatModel.count - && !currentChat.isServer + visible: !currentChat.isServer && !currentChat.isModelLoaded - && currentChat.modelLoadingPercentage === 0.0 - && currentChat.modelInfo.name !== "" + && !window.trySwitchContextInProgress + && !window.isCurrentlyLoading + && currentModelName() !== "" + Image { anchors.verticalCenter: parent.verticalCenter anchors.left: parent.left @@ -1294,8 +1295,8 @@ Window { backgroundColorHovered: theme.conversationButtonBackgroundHovered backgroundRadius: 5 padding: 15 - topPadding: 4 - bottomPadding: 4 + topPadding: 8 + bottomPadding: 8 text: qsTr("Reload \u00B7 ") + currentChat.modelInfo.name fontPixelSize: theme.fontSizeSmaller Accessible.description: qsTr("Reloads the model") diff --git a/gpt4all-chat/qml/ModelSettings.qml b/gpt4all-chat/qml/ModelSettings.qml index ce2f51570643..d338dc15bd92 100644 --- a/gpt4all-chat/qml/ModelSettings.qml +++ b/gpt4all-chat/qml/ModelSettings.qml @@ -328,7 +328,7 @@ MySettingsTab { text: root.currentModelInfo.contextLength font.pixelSize: theme.fontSizeLarge color: theme.textColor - ToolTip.text: qsTr("Maximum combined prompt/response tokens before information is lost.\nUsing more context than the model was trained on will yield poor results.\nNOTE: Does not take effect until you RESTART GPT4All or SWITCH MODELS.") + ToolTip.text: qsTr("Maximum combined prompt/response tokens before information is lost.\nUsing more context than the model was trained on will yield poor results.\nNOTE: Does not take effect until you reload the model.") ToolTip.visible: hovered Layout.row: 0 Layout.column: 1 @@ -692,7 +692,7 @@ MySettingsTab { text: root.currentModelInfo.gpuLayers font.pixelSize: theme.fontSizeLarge color: theme.textColor - ToolTip.text: qsTr("How many GPU layers to load into VRAM. Decrease this if GPT4All runs out of VRAM while loading this model.\nLower values increase CPU load and RAM usage, and make inference slower.\nNOTE: Does not take effect until you RESTART GPT4All or SWITCH MODELS.") + ToolTip.text: qsTr("How many GPU layers to load into VRAM. Decrease this if GPT4All runs out of VRAM while loading this model.\nLower values increase CPU load and RAM usage, and make inference slower.\nNOTE: Does not take effect until you reload the model.") ToolTip.visible: hovered Layout.row: 4 Layout.column: 1 @@ -705,7 +705,7 @@ MySettingsTab { Connections { target: root function onCurrentModelInfoChanged() { - if (root.currentModelInfo.gpuLayers == 100) { + if (root.currentModelInfo.gpuLayers === 100) { gpuLayersField.text = root.currentModelInfo.maxGpuLayers } else { gpuLayersField.text = root.currentModelInfo.gpuLayers diff --git a/gpt4all-chat/qml/SwitchModelDialog.qml b/gpt4all-chat/qml/SwitchModelDialog.qml index 54dfbe60ac02..f0ca43abbc24 100644 --- a/gpt4all-chat/qml/SwitchModelDialog.qml +++ b/gpt4all-chat/qml/SwitchModelDialog.qml @@ -17,9 +17,11 @@ MyDialog { id: theme } - Column { - id: column - spacing: 20 + contentItem: Text { + textFormat: Text.StyledText + text: qsTr("Warning: changing the model will erase the current conversation. 
Do you wish to continue?") + color: theme.textColor + font.pixelSize: theme.fontSizeLarge } footer: DialogButtonBox { From fbf5e5e7326c792355d51121fae9767388a17671 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 20 Feb 2024 09:27:28 -0500 Subject: [PATCH 03/17] Increase padding for elided text in combo. Signed-off-by: Adam Treat --- gpt4all-chat/main.qml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index a12f2666521a..337a14b9f5ed 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -395,7 +395,13 @@ Window { contentItem: Text { anchors.horizontalCenter: parent.horizontalCenter leftPadding: 10 - rightPadding: 20 + rightPadding: { + if (ejectButton.visible && reloadButton) + return 105; + if (reloadButton.visible) + return 65 + return 25 + } text: { if (currentChat.modelLoadingError !== "") return qsTr("Model loading error...") From ad34c2bdd40a8226f82b9637f17119d528cde838 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 20 Feb 2024 12:05:13 -0500 Subject: [PATCH 04/17] Don't erase context when reloading model by selection. Signed-off-by: Adam Treat --- gpt4all-chat/main.qml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index 337a14b9f5ed..be1ec94e6b85 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -443,9 +443,9 @@ Window { Accessible.description: qsTr("The top item is the current model") onActivated: function (index) { var newInfo = ModelList.modelInfo(comboBox.valueAt(index)); - if (currentModelName() !== "" - && newInfo !== currentChat.modelInfo - && chatModel.count !== 0) { + if (newInfo === currentChat.modelInfo) { + currentChat.reloadModel(); + } else if (currentModelName() !== "" && chatModel.count !== 0) { switchModelDialog.index = index; switchModelDialog.open(); } else { From 67099f80ba3fcd68c32a215b76c5d3866d142bc0 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 09:54:27 -0500 Subject: [PATCH 05/17] Add comment to make this clear. Signed-off-by: Adam Treat --- gpt4all-chat/chat.cpp | 2 +- gpt4all-chat/chatllm.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gpt4all-chat/chat.cpp b/gpt4all-chat/chat.cpp index 8730adbcee05..62c33a1a6c29 100644 --- a/gpt4all-chat/chat.cpp +++ b/gpt4all-chat/chat.cpp @@ -243,7 +243,7 @@ void Chat::setModelInfo(const ModelInfo &modelInfo) if (m_modelInfo == modelInfo && isModelLoaded()) return; - m_modelLoadingPercentage = std::numeric_limits::min(); + m_modelLoadingPercentage = std::numeric_limits::min(); // small non-zero positive value emit isModelLoadedChanged(); m_modelLoadingError = QString(); emit modelLoadingErrorChanged(); diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index 4b456e3464a0..bf3f6253aab0 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -222,7 +222,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) #endif delete m_llModelInfo.model; m_llModelInfo.model = nullptr; - emit modelLoadingPercentageChanged(std::numeric_limits::min()); + emit modelLoadingPercentageChanged(std::numeric_limits::min()); // small non-zero positive value } else if (!m_isServer) { // This is a blocking call that tries to retrieve the model we need from the model store. // If it succeeds, then we just have to restore state. 
If the store has never had a model From b0c471aed8acd628e5b987cdf2540871301939ae Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 10:49:13 -0500 Subject: [PATCH 06/17] Make the reload/regenerate buttons a little bit larger font. Signed-off-by: Adam Treat --- gpt4all-chat/main.qml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index be1ec94e6b85..b3392fbd757c 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -1273,7 +1273,7 @@ Window { topPadding: 8 bottomPadding: 8 text: currentChat.responseInProgress ? qsTr("Stop generating") : qsTr("Regenerate response") - fontPixelSize: theme.fontSizeSmaller + fontPixelSize: theme.fontSizeSmall Accessible.description: qsTr("Controls generation of the response") } @@ -1304,7 +1304,7 @@ Window { topPadding: 8 bottomPadding: 8 text: qsTr("Reload \u00B7 ") + currentChat.modelInfo.name - fontPixelSize: theme.fontSizeSmaller + fontPixelSize: theme.fontSizeSmall Accessible.description: qsTr("Reloads the model") } } From fa0a2129dcca6fe5b61f761c5a41fa38cfd4a871 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 11:06:57 -0500 Subject: [PATCH 07/17] Don't try and detect model load error on startup. Signed-off-by: Adam Treat --- gpt4all-chat/chatllm.cpp | 14 -------------- gpt4all-chat/mysettings.cpp | 21 --------------------- gpt4all-chat/mysettings.h | 2 -- 3 files changed, 37 deletions(-) diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index bf3f6253aab0..750e85485b41 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -274,16 +274,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) // Store the file info in the modelInfo in case we have an error loading m_llModelInfo.fileInfo = fileInfo; - // Check if we've previously tried to load this file and failed/crashed - if (MySettings::globalInstance()->attemptModelLoad() == filePath) { - MySettings::globalInstance()->setAttemptModelLoad(QString()); // clear the flag - if (!m_isServer) - LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store - m_llModelInfo = LLModelInfo(); - emit modelLoadingError(QString("Previous attempt to load model resulted in crash for `%1` most likely due to insufficient memory. You should either remove this model or decrease your system RAM usage by closing other applications.").arg(modelInfo.filename())); - return false; - } - if (fileInfo.exists()) { if (isChatGPT) { QString apiKey; @@ -319,9 +309,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) return m_shouldBeLoaded; }); - // Update the settings that a model is being loaded and update the device list - MySettings::globalInstance()->setAttemptModelLoad(filePath); - // Pick the best match for the device QString actualDevice = m_llModelInfo.model->implementation().buildVariant() == "metal" ? "Metal" : "CPU"; const QString requestedDevice = MySettings::globalInstance()->device(); @@ -373,7 +360,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) emit reportFallbackReason("
model or quant has no GPU support"); } - MySettings::globalInstance()->setAttemptModelLoad(QString()); if (!success) { delete m_llModelInfo.model; m_llModelInfo.model = nullptr; diff --git a/gpt4all-chat/mysettings.cpp b/gpt4all-chat/mysettings.cpp index f9774bde2f57..9e5cdad0ce06 100644 --- a/gpt4all-chat/mysettings.cpp +++ b/gpt4all-chat/mysettings.cpp @@ -717,24 +717,3 @@ void MySettings::setNetworkUsageStatsActive(bool b) setting.sync(); emit networkUsageStatsActiveChanged(); } - -QString MySettings::attemptModelLoad() const -{ - QSettings setting; - setting.sync(); - return setting.value("attemptModelLoad", QString()).toString(); -} - -void MySettings::setAttemptModelLoad(const QString &modelFile) -{ - if (attemptModelLoad() == modelFile) - return; - - QSettings setting; - if (modelFile.isEmpty()) - setting.remove("attemptModelLoad"); - else - setting.setValue("attemptModelLoad", modelFile); - setting.sync(); - emit attemptModelLoadChanged(); -} diff --git a/gpt4all-chat/mysettings.h b/gpt4all-chat/mysettings.h index 4bfbef6b6390..c5019b91c8a4 100644 --- a/gpt4all-chat/mysettings.h +++ b/gpt4all-chat/mysettings.h @@ -110,8 +110,6 @@ class MySettings : public QObject bool networkUsageStatsActive() const; void setNetworkUsageStatsActive(bool b); - QString attemptModelLoad() const; - void setAttemptModelLoad(const QString &modelFile); QVector deviceList() const; void setDeviceList(const QVector &deviceList); From 896fc6fbb72c6a30bcf87a6b42cfb83d5a219d3a Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 11:40:05 -0500 Subject: [PATCH 08/17] Save the window size for the user and reuse next load. Signed-off-by: Adam Treat --- gpt4all-chat/main.qml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index b3392fbd757c..7bacb6cb2b79 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -21,6 +21,14 @@ Window { visible: true title: qsTr("GPT4All v") + Qt.application.version + + Settings { + property alias x: window.x + property alias y: window.y + property alias width: window.width + property alias height: window.height + } + Theme { id: theme } From 7810b757c9120a93533fbcf56d169272b881d6bb Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 11:41:37 -0500 Subject: [PATCH 09/17] llamamodel: add gemma model support Signed-off-by: Jared Van Bortel --- gpt4all-backend/llama.cpp-mainline | 2 +- gpt4all-backend/llamamodel.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline index 822a9c894eb3..7d4ced850548 160000 --- a/gpt4all-backend/llama.cpp-mainline +++ b/gpt4all-backend/llama.cpp-mainline @@ -1 +1 @@ -Subproject commit 822a9c894eb3770c65f0b4a724aae34605c90029 +Subproject commit 7d4ced850548642b9a1740fa25ecdef249fbf47f diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp index 0dd9de5d96ed..167d10ee5661 100644 --- a/gpt4all-backend/llamamodel.cpp +++ b/gpt4all-backend/llamamodel.cpp @@ -519,8 +519,8 @@ DLL_EXPORT bool magic_match(const char *fname) { bool valid = true; static const std::vector known_arches { - "baichuan", "bloom", "codeshell", "falcon", "gpt2", "llama", "mpt", "orion", "persimmon", "phi2", "plamo", - "qwen", "qwen2", "refact", "stablelm", "starcoder" + "baichuan", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "orion", "persimmon", "phi2", + "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder" }; if (std::find(known_arches.begin(), 
known_arches.end(), arch) == known_arches.end()) { From 32837fb3a0cc6074544661115dad09665b9704e7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 14:05:26 -0500 Subject: [PATCH 10/17] models2.json: add gemma model Signed-off-by: Jared Van Bortel --- gpt4all-chat/metadata/models2.json | 48 ++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/gpt4all-chat/metadata/models2.json b/gpt4all-chat/metadata/models2.json index 95ef5ad84377..124f17315fc1 100644 --- a/gpt4all-chat/metadata/models2.json +++ b/gpt4all-chat/metadata/models2.json @@ -1,6 +1,22 @@ [ { "order": "a", + "md5sum": "6d1ca6e9533d177361fe2612a2c87474", + "name": "Gemma Instruct", + "filename": "gemma-7b-it.Q4_0.gguf", + "filesize": "4809316512", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Gemma", + "description": "A state-of-the-art open model from Google
  • Fast responses
  • Chat based model
  • Trained by Google
  • Licensed for commercial use
  • Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
", + "url": "https://gpt4all.io/models/gguf/gemma-7b-it.Q4_0.gguf", + "promptTemplate": "user\n%1\nmodel\n\n", + "systemPrompt": "" + }, + { + "order": "b", "md5sum": "48de9538c774188eb25a7e9ee024bbd3", "name": "Mistral OpenOrca", "filename": "mistral-7b-openorca.Q4_0.gguf", @@ -15,22 +31,6 @@ "promptTemplate": "<|im_start|>user\n%1<|im_end|><|im_start|>assistant\n", "systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI. For multi-step problems, write out your reasoning for each step.\n<|im_end|>" }, - { - "order": "b", - "md5sum": "97463be739b50525df56d33b26b00852", - "name": "Mistral Instruct", - "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", - "filesize": "4108916384", - "requires": "2.5.0", - "ramrequired": "8", - "parameters": "7 billion", - "quant": "q4_0", - "type": "Mistral", - "systemPrompt": " ", - "description": "Best overall fast instruction following model
  • Fast responses
  • Trained by Mistral AI
  • Uncensored
  • Licensed for commercial use
", - "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", - "promptTemplate": "[INST] %1 [/INST]" - }, { "order": "c", "md5sum": "c4c78adf744d6a20f05c8751e3961b84", @@ -47,6 +47,22 @@ "url": "https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf", "promptTemplate": "### Instruction:\n%1\n### Response:\n" }, + { + "order": "d", + "md5sum": "97463be739b50525df56d33b26b00852", + "name": "Mistral Instruct", + "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", + "filesize": "4108916384", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "systemPrompt": " ", + "description": "Best overall fast instruction following model
  • Fast responses
  • Trained by Mistral AI
  • Uncensored
  • Licensed for commercial use
", + "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", + "promptTemplate": "[INST] %1 [/INST]" + }, { "order": "e", "md5sum": "00c8593ba57f5240f59662367b3ed4a5", From 4a8c6d7f9cc1aea0b75309cd4c542598836d2a9d Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 14:16:12 -0500 Subject: [PATCH 11/17] gemma: fix default prompt template Signed-off-by: Jared Van Bortel --- gpt4all-chat/metadata/models2.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt4all-chat/metadata/models2.json b/gpt4all-chat/metadata/models2.json index 124f17315fc1..91cffee819dd 100644 --- a/gpt4all-chat/metadata/models2.json +++ b/gpt4all-chat/metadata/models2.json @@ -12,7 +12,7 @@ "type": "Gemma", "description": "A state-of-the-art open model from Google
  • Fast responses
  • Chat based model
  • Trained by Google
  • Licensed for commercial use
  • Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
", "url": "https://gpt4all.io/models/gguf/gemma-7b-it.Q4_0.gguf", - "promptTemplate": "user\n%1\nmodel\n\n", + "promptTemplate": "user\n%1\nmodel\n", "systemPrompt": "" }, { From c13202a6f5f90094629cc6e214a2a4ccd91ccb74 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 14:43:55 -0500 Subject: [PATCH 12/17] models2.json: gemma requires a future GPT4All version Signed-off-by: Jared Van Bortel --- gpt4all-chat/metadata/models2.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpt4all-chat/metadata/models2.json b/gpt4all-chat/metadata/models2.json index 91cffee819dd..5096cd032228 100644 --- a/gpt4all-chat/metadata/models2.json +++ b/gpt4all-chat/metadata/models2.json @@ -5,7 +5,7 @@ "name": "Gemma Instruct", "filename": "gemma-7b-it.Q4_0.gguf", "filesize": "4809316512", - "requires": "2.5.0", + "requires": "2.7.1", "ramrequired": "8", "parameters": "7 billion", "quant": "q4_0", From b8f5c74f40def7622a7e4b5aa86fadb473f39046 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 15:41:20 -0500 Subject: [PATCH 13/17] add models3.json for new templates (#1993) Signed-off-by: Jared Van Bortel --- gpt4all-chat/metadata/models3.json | 257 +++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 gpt4all-chat/metadata/models3.json diff --git a/gpt4all-chat/metadata/models3.json b/gpt4all-chat/metadata/models3.json new file mode 100644 index 000000000000..5e33ca0f88b0 --- /dev/null +++ b/gpt4all-chat/metadata/models3.json @@ -0,0 +1,257 @@ +[ + { + "order": "a", + "md5sum": "6d1ca6e9533d177361fe2612a2c87474", + "name": "Gemma Instruct", + "filename": "gemma-7b-it.Q4_0.gguf", + "filesize": "4809316512", + "requires": "2.7.1", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Gemma", + "description": "A state-of-the-art open model from Google
  • Fast responses
  • Chat based model
  • Trained by Google
  • Licensed for commercial use
  • Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
", + "url": "https://gpt4all.io/models/gguf/gemma-7b-it.Q4_0.gguf", + "promptTemplate": "user\n%1\nmodel\n%2\n", + "systemPrompt": "" + }, + { + "order": "b", + "md5sum": "f692417a22405d80573ac10cb0cd6c6a", + "name": "Mistral OpenOrca", + "filename": "mistral-7b-openorca.Q4_0.gguf2.gguf", + "filesize": "4108928128", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "description": "Best overall fast chat model
  • Fast responses
  • Chat based model
  • Trained by Mistral AI
  • Finetuned on OpenOrca dataset curated via Nomic Atlas
  • Licensed for commercial use
", + "url": "https://gpt4all.io/models/gguf/mistral-7b-openorca.Q4_0.gguf", + "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n", + "systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI. For multi-step problems, write out your reasoning for each step.\n<|im_end|>" + }, + { + "order": "c", + "md5sum": "c4c78adf744d6a20f05c8751e3961b84", + "name": "GPT4All Falcon", + "filename": "gpt4all-falcon-newbpe-q4_0.gguf", + "filesize": "4210994112", + "requires": "2.6.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Falcon", + "systemPrompt": " ", + "description": "Very fast model with good quality
  • Fastest responses
  • Instruction based
  • Trained by TII
  • Finetuned by Nomic AI
  • Licensed for commercial use
", + "url": "https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf", + "promptTemplate": "### Instruction:\n%1\n### Response:\n" + }, + { + "order": "d", + "md5sum": "97463be739b50525df56d33b26b00852", + "name": "Mistral Instruct", + "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", + "filesize": "4108916384", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "systemPrompt": " ", + "description": "Best overall fast instruction following model
  • Fast responses
  • Trained by Mistral AI
  • Uncensored
  • Licensed for commercial use
", + "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", + "promptTemplate": "[INST] %1 [/INST]" + }, + { + "order": "e", + "md5sum": "00c8593ba57f5240f59662367b3ed4a5", + "name": "Orca 2 (Medium)", + "filename": "orca-2-7b.Q4_0.gguf", + "filesize": "3825824192", + "requires": "2.5.2", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "LLaMA2", + "systemPrompt": " ", + "description": "
  • Instruction based
  • Trained by Microsoft
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/orca-2-7b.Q4_0.gguf" + }, + { + "order": "f", + "md5sum": "3c0d63c4689b9af7baa82469a6f51a19", + "name": "Orca 2 (Full)", + "filename": "orca-2-13b.Q4_0.gguf", + "filesize": "7365856064", + "requires": "2.5.2", + "ramrequired": "16", + "parameters": "13 billion", + "quant": "q4_0", + "type": "LLaMA2", + "systemPrompt": " ", + "description": "
  • Instruction based
  • Trained by Microsoft
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/orca-2-13b.Q4_0.gguf" + }, + { + "order": "g", + "md5sum": "5aff90007499bce5c64b1c0760c0b186", + "name": "Wizard v1.2", + "filename": "wizardlm-13b-v1.2.Q4_0.gguf", + "filesize": "7365834624", + "requires": "2.5.0", + "ramrequired": "16", + "parameters": "13 billion", + "quant": "q4_0", + "type": "LLaMA2", + "systemPrompt": " ", + "description": "Best overall larger model
  • Instruction based
  • Gives very long responses
  • Finetuned with only 1k of high-quality data
  • Trained by Microsoft and Peking University
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/wizardlm-13b-v1.2.Q4_0.gguf" + }, + { + "order": "h", + "md5sum": "3d12810391d04d1153b692626c0c6e16", + "name": "Hermes", + "filename": "nous-hermes-llama2-13b.Q4_0.gguf", + "filesize": "7366062080", + "requires": "2.5.0", + "ramrequired": "16", + "parameters": "13 billion", + "quant": "q4_0", + "type": "LLaMA2", + "systemPrompt": " ", + "description": "Extremely good model
  • Instruction based
  • Gives long responses
  • Curated with 300,000 uncensored instructions
  • Trained by Nous Research
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/nous-hermes-llama2-13b.Q4_0.gguf", + "promptTemplate": "### Instruction:\n%1\n### Response:\n" + }, + { + "order": "i", + "md5sum": "40388eb2f8d16bb5d08c96fdfaac6b2c", + "name": "Snoozy", + "filename": "gpt4all-13b-snoozy-q4_0.gguf", + "filesize": "7365834624", + "requires": "2.5.0", + "ramrequired": "16", + "parameters": "13 billion", + "quant": "q4_0", + "type": "LLaMA", + "systemPrompt": " ", + "description": "Very good overall model
  • Instruction based
  • Based on the same dataset as Groovy
  • Slower than Groovy, with higher quality responses
  • Trained by Nomic AI
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/gpt4all-13b-snoozy-q4_0.gguf" + }, + { + "order": "j", + "md5sum": "15dcb4d7ea6de322756449c11a0b7545", + "name": "MPT Chat", + "filename": "mpt-7b-chat-newbpe-q4_0.gguf", + "filesize": "3912373472", + "requires": "2.6.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "MPT", + "description": "Good model with novel architecture
  • Fast responses
  • Chat based
  • Trained by Mosaic ML
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/mpt-7b-chat-newbpe-q4_0.gguf", + "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n", + "systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>" + }, + { + "order": "k", + "md5sum": "0e769317b90ac30d6e09486d61fefa26", + "name": "Mini Orca (Small)", + "filename": "orca-mini-3b-gguf2-q4_0.gguf", + "filesize": "1979946720", + "requires": "2.5.0", + "ramrequired": "4", + "parameters": "3 billion", + "quant": "q4_0", + "type": "OpenLLaMa", + "description": "Small version of new model with novel dataset
  • Instruction based
  • Explain tuned datasets
  • Orca Research Paper dataset construction approaches
  • Cannot be used commercially
", + "url": "https://gpt4all.io/models/gguf/orca-mini-3b-gguf2-q4_0.gguf", + "promptTemplate": "### User:\n%1\n### Response:\n", + "systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n" + }, + { + "order": "l", + "md5sum": "c232f17e09bca4b7ee0b5b1f4107c01e", + "disableGUI": "true", + "name": "Replit", + "filename": "replit-code-v1_5-3b-newbpe-q4_0.gguf", + "filesize": "1953055104", + "requires": "2.6.0", + "ramrequired": "4", + "parameters": "3 billion", + "quant": "q4_0", + "type": "Replit", + "systemPrompt": " ", + "promptTemplate": "%1", + "description": "Trained on subset of the Stack
  • Code completion based
  • Licensed for commercial use
  • WARNING: Not available for chat GUI
", + "url": "https://gpt4all.io/models/gguf/replit-code-v1_5-3b-newbpe-q4_0.gguf" + }, + { + "order": "m", + "md5sum": "70841751ccd95526d3dcfa829e11cd4c", + "disableGUI": "true", + "name": "Starcoder", + "filename": "starcoder-newbpe-q4_0.gguf", + "filesize": "8987411904", + "requires": "2.6.0", + "ramrequired": "4", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Starcoder", + "systemPrompt": " ", + "promptTemplate": "%1", + "description": "Trained on subset of the Stack
  • Code completion based
  • WARNING: Not available for chat GUI
", + "url": "https://gpt4all.io/models/gguf/starcoder-newbpe-q4_0.gguf" + }, + { + "order": "n", + "md5sum": "e973dd26f0ffa6e46783feaea8f08c83", + "disableGUI": "true", + "name": "Rift coder", + "filename": "rift-coder-v0-7b-q4_0.gguf", + "filesize": "3825903776", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "LLaMA", + "systemPrompt": " ", + "promptTemplate": "%1", + "description": "Trained on collection of Python and TypeScript
  • Code completion based
  • WARNING: Not available for chat GUI
  • ", + "url": "https://gpt4all.io/models/gguf/rift-coder-v0-7b-q4_0.gguf" + }, + { + "order": "o", + "md5sum": "e479e6f38b59afc51a470d1953a6bfc7", + "disableGUI": "true", + "name": "SBert", + "filename": "all-MiniLM-L6-v2-f16.gguf", + "filesize": "45887744", + "requires": "2.5.0", + "ramrequired": "1", + "parameters": "40 million", + "quant": "f16", + "type": "Bert", + "systemPrompt": " ", + "description": "LocalDocs text embeddings model
    • For use with LocalDocs feature
    • Used for retrieval augmented generation (RAG)", + "url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2-f16.gguf" + }, + { + "order": "p", + "md5sum": "919de4dd6f25351bcb0223790db1932d", + "name": "EM German Mistral", + "filename": "em_german_mistral_v01.Q4_0.gguf", + "filesize": "4108916352", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "description": "Mistral-based model for German-language applications
      • Fast responses
      • Chat based model
      • Trained by ellamind
      • Finetuned on German instruction and chat data
      • Licensed for commercial use
      ", + "url": "https://huggingface.co/TheBloke/em_german_mistral_v01-GGUF/resolve/main/em_german_mistral_v01.Q4_0.gguf", + "promptTemplate": "USER: %1 ASSISTANT: ", + "systemPrompt": "Du bist ein hilfreicher Assistent. " + } +] From 4fc4d94be440c7991a5bafb87eef6e0fd54a2e13 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 15:45:32 -0500 Subject: [PATCH 14/17] fix chat-style prompt templates (#1970) Also use a new version of Mistral OpenOrca. Signed-off-by: Jared Van Bortel --- gpt4all-backend/bert.cpp | 4 +- gpt4all-backend/bert_impl.h | 7 +- gpt4all-backend/gptj.cpp | 4 +- gpt4all-backend/gptj/placeholder | 0 gpt4all-backend/gptj_impl.h | 7 +- gpt4all-backend/llama.cpp-mainline | 2 +- gpt4all-backend/llama/placeholder | 0 gpt4all-backend/llamamodel.cpp | 183 +++++++++++------- gpt4all-backend/llamamodel_impl.h | 14 +- gpt4all-backend/llmodel.h | 50 +++-- gpt4all-backend/llmodel_c.cpp | 18 +- gpt4all-backend/llmodel_c.h | 6 +- gpt4all-backend/llmodel_shared.cpp | 136 +++++++++++-- .../python/docs/gpt4all_python.md | 84 -------- gpt4all-bindings/python/gpt4all/_pyllmodel.py | 19 +- gpt4all-bindings/python/gpt4all/gpt4all.py | 61 +++--- gpt4all-bindings/python/setup.py | 2 +- gpt4all-chat/chatgpt.cpp | 9 +- gpt4all-chat/chatgpt.h | 50 ++++- gpt4all-chat/chatllm.cpp | 68 +++---- gpt4all-chat/metadata/models2.json | 10 +- gpt4all-chat/modellist.cpp | 2 +- 22 files changed, 429 insertions(+), 307 deletions(-) delete mode 100644 gpt4all-backend/gptj/placeholder delete mode 100644 gpt4all-backend/llama/placeholder diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp index 342827e2da65..bad5a422a436 100644 --- a/gpt4all-backend/bert.cpp +++ b/gpt4all-backend/bert.cpp @@ -814,8 +814,10 @@ std::vector Bert::embedding(const std::string &text) return finalEmbeddings; } -std::vector Bert::tokenize(PromptContext &, const std::string &str) const +std::vector Bert::tokenize(PromptContext &ctx, const std::string &str, bool special) const { + (void)ctx; + (void)special; return ::bert_tokenize(d_ptr->ctx, str.c_str()); } diff --git a/gpt4all-backend/bert_impl.h b/gpt4all-backend/bert_impl.h index 072e9783217f..610cc2c9f2cf 100644 --- a/gpt4all-backend/bert_impl.h +++ b/gpt4all-backend/bert_impl.h @@ -33,12 +33,13 @@ class Bert : public LLModel { std::unique_ptr d_ptr; protected: - std::vector tokenize(PromptContext &, const std::string&) const override; + std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override; Token sampleToken(PromptContext &ctx) const override; - std::string tokenToString(Token) const override; + std::string tokenToString(Token id) const override; bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override; int32_t contextLength() const override; - const std::vector& endTokens() const override; + const std::vector &endTokens() const override; + bool shouldAddBOS() const override { return true; } }; #endif // BERT_H diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp index 51a032f803f2..fcc4ae2a26e5 100644 --- a/gpt4all-backend/gptj.cpp +++ b/gpt4all-backend/gptj.cpp @@ -737,8 +737,10 @@ size_t GPTJ::restoreState(const uint8_t *src) return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src); } -std::vector GPTJ::tokenize(PromptContext &, const std::string &str) const +std::vector GPTJ::tokenize(PromptContext &ctx, const std::string &str, bool special) const { + (void)ctx; + (void)special; return ::gpt_tokenize(d_ptr->vocab, str); } diff --git a/gpt4all-backend/gptj/placeholder 
b/gpt4all-backend/gptj/placeholder deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/gpt4all-backend/gptj_impl.h b/gpt4all-backend/gptj_impl.h index 01d5698f79f0..5d940af3dfc8 100644 --- a/gpt4all-backend/gptj_impl.h +++ b/gpt4all-backend/gptj_impl.h @@ -30,12 +30,13 @@ class GPTJ : public LLModel { GPTJPrivate *d_ptr; protected: - std::vector tokenize(PromptContext &, const std::string&) const override; + std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override; Token sampleToken(PromptContext &ctx) const override; - std::string tokenToString(Token) const override; + std::string tokenToString(Token id) const override; bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override; int32_t contextLength() const override; - const std::vector& endTokens() const override; + const std::vector &endTokens() const override; + bool shouldAddBOS() const override { return false; } }; #endif // GPTJ_H diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline index 7d4ced850548..b61ee89fca20 160000 --- a/gpt4all-backend/llama.cpp-mainline +++ b/gpt4all-backend/llama.cpp-mainline @@ -1 +1 @@ -Subproject commit 7d4ced850548642b9a1740fa25ecdef249fbf47f +Subproject commit b61ee89fca2038e9937317a794e28e08391b7888 diff --git a/gpt4all-backend/llama/placeholder b/gpt4all-backend/llama/placeholder deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp index 167d10ee5661..e8d2ccbf2f62 100644 --- a/gpt4all-backend/llamamodel.cpp +++ b/gpt4all-backend/llamamodel.cpp @@ -6,38 +6,29 @@ #include #include #include -#include -#include -#include +#include #include -#if defined(_WIN32) && defined(_MSC_VER) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include - #include -#else - #include -#endif +#include #include +#include +#include +#include #include #include +#include #include #include - #ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" +#include #endif +using namespace std::string_literals; + // Maximum supported GGUF version static constexpr int GGUF_VER_MAX = 3; -namespace { -const char *modelType_ = "LLaMA"; -} +static const char * const modelType_ = "LLaMA"; static bool llama_verbose() { const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP"); @@ -96,6 +87,56 @@ static int llama_sample_top_p_top_k( return llama_sample_token(ctx, &candidates_p); } +std::string get_arch_name(gguf_context *ctx_gguf) { + std::string arch_name; + const int kid = gguf_find_key(ctx_gguf, "general.architecture"); + enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid); + if (ktype != (GGUF_TYPE_STRING)) { + throw std::runtime_error("ERROR: Can't get general architecture from gguf file."); + } + return gguf_get_val_str(ctx_gguf, kid); +} + +static gguf_context *load_gguf(const char *fname) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ nullptr, + }; + gguf_context *ctx = gguf_init_from_file(fname, params); + if (!ctx) { + std::cerr << __func__ << ": gguf_init_from_file failed\n"; + return nullptr; + } + + int gguf_ver = gguf_get_version(ctx); + if (gguf_ver > GGUF_VER_MAX) { + std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n"; + gguf_free(ctx); + return nullptr; + } + + return ctx; +} + +static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) { + auto * ctx = load_gguf(modelPath.c_str()); + auto arch = get_arch_name(ctx); + + 
int32_t value = -1; + if (ctx) { + auto key = arch + "." + archKey; + int keyidx = gguf_find_key(ctx, key.c_str()); + if (keyidx != -1) { + value = gguf_get_val_u32(ctx, keyidx); + } else { + std::cerr << __func__ << ": " << key << "not found in " << modelPath << "\n"; + } + } + + gguf_free(ctx); + return value; +} + struct LLamaPrivate { const std::string modelPath; bool modelLoaded; @@ -148,6 +189,42 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) return filesize + est_kvcache_size; } +bool LLamaModel::isModelBlacklisted(const std::string &modelPath) { + auto * ctx = load_gguf(modelPath.c_str()); + if (!ctx) { + std::cerr << __func__ << ": failed to load " << modelPath << "\n"; + return false; + } + + auto get_key = [ctx, &modelPath](const char *name) { + int keyidx = gguf_find_key(ctx, name); + if (keyidx == -1) { + throw std::logic_error(name + " not found in "s + modelPath); + } + return keyidx; + }; + + bool res = false; + try { + std::string name(gguf_get_val_str(ctx, get_key("general.name"))); + int token_idx = get_key("tokenizer.ggml.tokens"); + int n_vocab = gguf_get_arr_n(ctx, token_idx); + + // check for known bad models + if (name == "open-orca_mistral-7b-openorca" + && n_vocab == 32002 + && gguf_get_arr_str(ctx, token_idx, 32000) == ""s // should be <|im_end|> + ) { + res = true; + } + } catch (const std::logic_error &e) { + std::cerr << __func__ << ": " << e.what() << "\n"; + } + + gguf_free(ctx); + return res; +} + bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) { d_ptr->modelLoaded = false; @@ -290,12 +367,13 @@ size_t LLamaModel::restoreState(const uint8_t *src) return llama_set_state_data(d_ptr->ctx, const_cast(src)); } -std::vector LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const +std::vector LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const { - const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->model)); - std::vector fres(str.size()+4); - // TODO(cebtenzzre): we may want to use special=true here to process special tokens - auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, false); + const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty(); + const bool useBOS = wantBOS && shouldAddBOS(); + auto strCat = wantBOS && !special ? 
" " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore + std::vector fres(strCat.size()+4); + auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special); fres.resize(fres_len); return fres; } @@ -349,55 +427,10 @@ const std::vector &LLamaModel::endTokens() const return d_ptr->end_tokens; } -std::string get_arch_name(gguf_context *ctx_gguf) { - std::string arch_name; - const int kid = gguf_find_key(ctx_gguf, "general.architecture"); - enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid); - if (ktype != (GGUF_TYPE_STRING)) { - throw std::runtime_error("ERROR: Can't get general architecture from gguf file."); - } - return gguf_get_val_str(ctx_gguf, kid); -} - -static gguf_context *load_gguf(const char *fname, std::string &arch) { - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ nullptr, - }; - gguf_context *ctx = gguf_init_from_file(fname, params); - if (!ctx) { - std::cerr << __func__ << ": gguf_init_from_file failed\n"; - return nullptr; - } - - int gguf_ver = gguf_get_version(ctx); - if (gguf_ver > GGUF_VER_MAX) { - std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n"; - gguf_free(ctx); - return nullptr; - } - - arch = get_arch_name(ctx); - return ctx; -} - -static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) { - std::string arch; - auto * ctx = load_gguf(modelPath.c_str(), arch); - - int32_t value = -1; - if (ctx) { - auto key = arch + "." + archKey; - int keyidx = gguf_find_key(ctx, key.c_str()); - if (keyidx != -1) { - value = gguf_get_val_u32(ctx, keyidx); - } else { - std::cerr << __func__ << ": " << key << "not found in " << modelPath << "\n"; - } - } - - gguf_free(ctx); - return value; +bool LLamaModel::shouldAddBOS() const +{ + int add_bos = llama_add_bos_token(d_ptr->model); + return add_bos != -1 ? 
bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM; } int32_t LLamaModel::maxContextLength(std::string const &modelPath) const @@ -513,8 +546,8 @@ DLL_EXPORT const char *get_build_variant() { } DLL_EXPORT bool magic_match(const char *fname) { - std::string arch; - auto * ctx = load_gguf(fname, arch); + auto * ctx = load_gguf(fname); + auto arch = get_arch_name(ctx); bool valid = true; diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h index 27eb580b07cf..15cbe1cdb928 100644 --- a/gpt4all-backend/llamamodel_impl.h +++ b/gpt4all-backend/llamamodel_impl.h @@ -19,6 +19,7 @@ class LLamaModel : public LLModel { bool supportsEmbedding() const override { return false; } bool supportsCompletion() const override { return true; } bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; + bool isModelBlacklisted(const std::string &modelPath) override; bool isModelLoaded() const override; size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; size_t stateSize() const override; @@ -27,7 +28,7 @@ class LLamaModel : public LLModel { void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; std::vector availableGPUDevices(size_t memoryRequired) const override; - bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const override; + bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override; bool initializeGPUDevice(int device, std::string *unavail_reason) const override; bool hasGPUDevice() override; bool usingGPUDevice() override; @@ -36,12 +37,13 @@ class LLamaModel : public LLModel { std::unique_ptr d_ptr; protected: - std::vector tokenize(PromptContext &, const std::string&) const override; - std::string tokenToString(Token) const override; - Token sampleToken(PromptContext& ctx) const override; - bool evalTokens(PromptContext& ctx, const std::vector &tokens) const override; + std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override; + std::string tokenToString(Token id) const override; + Token sampleToken(PromptContext &ctx) const override; + bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override; int32_t contextLength() const override; - const std::vector& endTokens() const override; + const std::vector &endTokens() const override; + bool shouldAddBOS() const override; int32_t maxContextLength(std::string const &modelPath) const override; int32_t layerCount(std::string const &modelPath) const override; diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index c3cc937c0f72..5ccbea08a119 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -29,23 +29,23 @@ class LLModel { class Implementation { public: - Implementation(Dlhandle&&); - Implementation(const Implementation&) = delete; - Implementation(Implementation&&); + Implementation(Dlhandle &&); + Implementation(const Implementation &) = delete; + Implementation(Implementation &&); ~Implementation(); std::string_view modelType() const { return m_modelType; } std::string_view buildVariant() const { return m_buildVariant; } - static bool isImplementation(const Dlhandle&); - static const std::vector& implementationList(); - static const Implementation *implementation(const char *fname, const std::string& buildVariant); + static bool isImplementation(const Dlhandle &dl); + static const std::vector &implementationList(); + static const Implementation *implementation(const char *fname, const 
std::string &buildVariant); static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048); static std::vector availableGPUDevices(); static int32_t maxContextLength(const std::string &modelPath); static int32_t layerCount(const std::string &modelPath); - static void setImplementationsSearchPath(const std::string& path); - static const std::string& implementationsSearchPath(); + static void setImplementationsSearchPath(const std::string &path); + static const std::string &implementationsSearchPath(); private: static LLModel *constructDefaultLlama(); @@ -82,26 +82,30 @@ class LLModel { virtual bool supportsEmbedding() const = 0; virtual bool supportsCompletion() const = 0; virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; + virtual bool isModelBlacklisted(const std::string &modelPath) { (void)modelPath; return false; }; virtual bool isModelLoaded() const = 0; virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual size_t stateSize() const { return 0; } - virtual size_t saveState(uint8_t */*dest*/) const { return 0; } - virtual size_t restoreState(const uint8_t */*src*/) { return 0; } + virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; } + virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; } // This method requires the model to return true from supportsCompletion otherwise it will throw // an error virtual void prompt(const std::string &prompt, + const std::string &promptTemplate, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, - PromptContext &ctx); + PromptContext &ctx, + bool special = false, + std::string *fakeReply = nullptr); virtual std::vector embedding(const std::string &text); - virtual void setThreadCount(int32_t /*n_threads*/) {} + virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } virtual int32_t threadCount() const { return 1; } - const Implementation& implementation() const { + const Implementation &implementation() const { return *m_implementation; } @@ -110,7 +114,7 @@ class LLModel { return {}; } - virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const { + virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const { (void)memoryRequired; (void)name; return false; @@ -132,12 +136,13 @@ class LLModel { protected: // These are pure virtual because subclasses need to implement as the default implementation of // 'prompt' above calls these functions - virtual std::vector tokenize(PromptContext &, const std::string&) const = 0; - virtual std::string tokenToString(Token) const = 0; + virtual std::vector tokenize(PromptContext &ctx, const std::string &str, bool special = false) const = 0; + virtual std::string tokenToString(Token id) const = 0; virtual Token sampleToken(PromptContext &ctx) const = 0; - virtual bool evalTokens(PromptContext &/*ctx*/, const std::vector& /*tokens*/) const = 0; + virtual bool evalTokens(PromptContext &ctx, const std::vector &tokens) const = 0; virtual int32_t contextLength() const = 0; - virtual const std::vector& endTokens() const = 0; + virtual const std::vector &endTokens() const = 0; + virtual bool shouldAddBOS() const = 0; virtual int32_t maxContextLength(std::string const &modelPath) const { @@ -166,6 +171,15 @@ class LLModel { return true; } + void decodePrompt(std::function promptCallback, + std::function responseCallback, + std::function recalculateCallback, + PromptContext 
&promptCtx, + std::vector embd_inp); + void generateResponse(std::function responseCallback, + std::function recalculateCallback, + PromptContext &promptCtx); + private: friend class LLMImplementation; }; diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp index 8ba59b2b7a39..b6306a77d894 100644 --- a/gpt4all-backend/llmodel_c.cpp +++ b/gpt4all-backend/llmodel_c.cpp @@ -1,8 +1,9 @@ #include "llmodel_c.h" #include "llmodel.h" -#include #include +#include +#include #include struct LLModelWrapper { @@ -56,7 +57,14 @@ size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_c bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl) { LLModelWrapper *wrapper = reinterpret_cast(model); - return wrapper->llModel->loadModel(model_path, n_ctx, ngl); + + std::string modelPath(model_path); + if (wrapper->llModel->isModelBlacklisted(modelPath)) { + size_t slash = modelPath.find_last_of("/\\"); + auto basename = slash == std::string::npos ? modelPath : modelPath.substr(slash + 1); + std::cerr << "warning: model '" << basename << "' is out-of-date, please check for an updated version\n"; + } + return wrapper->llModel->loadModel(modelPath, n_ctx, ngl); } bool llmodel_isModelLoaded(llmodel_model model) @@ -100,10 +108,12 @@ bool recalculate_wrapper(bool is_recalculating, void *user_data) { } void llmodel_prompt(llmodel_model model, const char *prompt, + const char *prompt_template, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, - llmodel_prompt_context *ctx) + llmodel_prompt_context *ctx, + bool special) { LLModelWrapper *wrapper = reinterpret_cast(model); @@ -131,7 +141,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt, wrapper->promptContext.contextErase = ctx->context_erase; // Call the C++ prompt method - wrapper->llModel->prompt(prompt, prompt_func, response_func, recalc_func, wrapper->promptContext); + wrapper->llModel->prompt(prompt, prompt_template, prompt_func, response_func, recalc_func, wrapper->promptContext, special); // Update the C context by giving access to the wrappers raw pointers to std::vector data // which involves no copies diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h index 50d35edac6dd..eac4ae9b9666 100644 --- a/gpt4all-backend/llmodel_c.h +++ b/gpt4all-backend/llmodel_c.h @@ -163,16 +163,20 @@ uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src); * Generate a response using the model. * @param model A pointer to the llmodel_model instance. * @param prompt A string representing the input prompt. + * @param prompt_template A string representing the input prompt template. * @param prompt_callback A callback function for handling the processing of prompt. * @param response_callback A callback function for handling the generated response. * @param recalculate_callback A callback function for handling recalculation requests. + * @param special True if special tokens in the prompt should be processed, false otherwise. * @param ctx A pointer to the llmodel_prompt_context structure. */ void llmodel_prompt(llmodel_model model, const char *prompt, + const char *prompt_template, llmodel_prompt_callback prompt_callback, llmodel_response_callback response_callback, llmodel_recalculate_callback recalculate_callback, - llmodel_prompt_context *ctx); + llmodel_prompt_context *ctx, + bool special); /** * Generate an embedding using the model. 
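
For context, here is a minimal sketch (illustrative only, not part of the diff) of how a caller might drive the reworked C++ prompt() entry point once this patch is applied, passing the new promptTemplate argument separately from the raw user text. The model filename, the GPU layer count, n_predict, and the question string are assumptions; the ChatML-style template is the Mistral OpenOrca one from models3.json above.

// Illustrative sketch against gpt4all-backend/llmodel.h as modified by this patch.
// Model file, GPU layer count, and sampling values below are assumptions.
#include "llmodel.h"

#include <cstdint>
#include <iostream>
#include <string>

int main()
{
    const std::string modelPath = "mistral-7b-openorca.Q4_0.gguf2.gguf"; // assumed local path
    LLModel *model = LLModel::Implementation::construct(modelPath);      // buildVariant defaults to "auto"
    if (!model || !model->loadModel(modelPath, /*n_ctx*/ 2048, /*ngl*/ 100))
        return 1;

    LLModel::PromptContext ctx;
    ctx.n_predict = 128; // other sampling fields keep their in-class defaults

    auto onPrompt   = [](int32_t /*tokenId*/) { return true; };           // keep ingesting the prompt
    auto onResponse = [](int32_t /*tokenId*/, const std::string &piece) { // stream the reply to stdout
        std::cout << piece << std::flush;
        return true;
    };
    auto onRecalc   = [](bool /*isRecalculating*/) { return true; };      // allow context recalculation

    // ChatML-style template from models3.json: %1 = user prompt, %2 = assistant reply.
    const std::string tmpl = "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n";
    model->prompt("Why is the sky blue?", tmpl, onPrompt, onResponse, onRecalc, ctx);

    std::cout << std::endl;
    delete model;
    return 0;
}

The same placeholder convention flows through llmodel_c.h and the Python binding: %1 is replaced by the user prompt and an optional %2 marks where the assistant reply is placed, which is how the decodePrompt()/generateResponse() split and the fakeReply path below reconstruct chat history.
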
diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llmodel_shared.cpp index 13c3706c0871..665da9c9fbd1 100644 --- a/gpt4all-backend/llmodel_shared.cpp +++ b/gpt4all-backend/llmodel_shared.cpp @@ -2,11 +2,20 @@ #include #include +#include #include +// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is) void LLModel::recalculateContext(PromptContext &promptCtx, std::function recalculate) { - size_t i = 0; - promptCtx.n_past = 0; + int n_keep = shouldAddBOS(); + const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase; + + // Erase the first percentage of context from the tokens + std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n"; + promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard); + + size_t i = n_keep; + promptCtx.n_past = n_keep; while (i < promptCtx.tokens.size()) { size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size()); std::vector batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end); @@ -26,11 +35,36 @@ void LLModel::recalculateContext(PromptContext &promptCtx, std::function &placeholders, std::string &err) { + static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))"); + + auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex); + placeholders.clear(); + placeholders.insert(placeholders.end(), it, std::sregex_iterator()); + + if (placeholders.size() > 2) { + err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size()); + return false; + } + if (placeholders.size() >= 1 && placeholders[0].str() != "%1") { + err = "ERROR: first placeholder must be %1, got " + placeholders[0].str(); + return false; + } + if (placeholders.size() >= 2 && placeholders[1].str() != "%2") { + err = "ERROR: second placeholder must be %2, got " + placeholders[1].str(); + return false; + } + return true; +} + void LLModel::prompt(const std::string &prompt, + const std::string &promptTemplate, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, - PromptContext &promptCtx) + PromptContext &promptCtx, + bool special, + std::string *fakeReply) { if (!isModelLoaded()) { std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n"; @@ -38,15 +72,86 @@ void LLModel::prompt(const std::string &prompt, } if (!supportsCompletion()) { - std::string errorMessage = "ERROR: this model does not support text completion or chat!\n"; + std::string errorMessage = "ERROR: this model does not support text completion or chat!"; responseCallback(-1, errorMessage); - std::cerr << implementation().modelType() << errorMessage; + std::cerr << implementation().modelType() << " " << errorMessage << "\n"; return; } - // tokenize the prompt - std::vector embd_inp = tokenize(promptCtx, prompt); + // parse the prompt template + std::vector placeholders; + { + std::string err; + if (!parsePromptTemplate(promptTemplate, placeholders, err)) { + responseCallback(-1, err); + std::cerr << err << "\n"; + return; + } + } + + auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize + // tokenize the user prompt + std::vector embd_inp; + if (placeholders.empty()) { + // this is unusual, but well-defined + std::cerr << __func__ << ": prompt template has no placeholder\n"; + embd_inp = tokenize(promptCtx, promptTemplate, true); + } else { + // template: beginning of user 
prompt + const auto &phUser = placeholders[0]; + std::string userPrefix(phUser.prefix()); + if (!userPrefix.empty()) { + embd_inp = tokenize(promptCtx, userPrefix, true); + promptCtx.n_past += embd_inp.size(); + } + + // user input (shouldn't have special token processing) + auto tokens = tokenize(promptCtx, prompt, special); + embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end()); + promptCtx.n_past += tokens.size(); + + // template: end of user prompt + start of assistant prompt + size_t start = phUser.position() + phUser.length(); + size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length(); + auto userToAsst = promptTemplate.substr(start, end - start); + if (!userToAsst.empty()) { + tokens = tokenize(promptCtx, userToAsst, true); + embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end()); + promptCtx.n_past += tokens.size(); + } + } + + promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it + + // decode the user prompt + decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp); + + // decode the assistant's reply, either generated or spoofed + if (fakeReply == nullptr) { + generateResponse(responseCallback, recalculateCallback, promptCtx); + } else { + embd_inp = tokenize(promptCtx, *fakeReply, false); + decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp); + } + + // decode the rest of the prompt template + if (placeholders.size() >= 2) { + // template: end of assistant prompt + size_t start = placeholders[1].position() + placeholders[1].length(); + auto asstSuffix = promptTemplate.substr(start); + if (!asstSuffix.empty()) { + embd_inp = tokenize(promptCtx, asstSuffix, true); + decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp); + } + } +} + +void LLModel::decodePrompt(std::function promptCallback, + std::function responseCallback, + std::function recalculateCallback, + PromptContext &promptCtx, + std::vector embd_inp) { // save the context size promptCtx.n_ctx = contextLength(); @@ -69,11 +174,6 @@ void LLModel::prompt(const std::string &prompt, // Check if the context has run out... if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) { - const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; - // Erase the first percentage of context from the tokens... - std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n"; - promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); - promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx); } @@ -94,7 +194,11 @@ void LLModel::prompt(const std::string &prompt, } i = batch_end; } +} +void LLModel::generateResponse(std::function responseCallback, + std::function recalculateCallback, + PromptContext &promptCtx) { std::string cachedResponse; std::vector cachedTokens; std::unordered_set reversePrompts @@ -108,11 +212,6 @@ void LLModel::prompt(const std::string &prompt, // Check if the context has run out... if (promptCtx.n_past + 1 > promptCtx.n_ctx) { - const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase; - // Erase the first percentage of context from the tokens... 
- std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n"; - promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint); - promptCtx.n_past = promptCtx.tokens.size(); recalculateContext(promptCtx, recalculateCallback); assert(promptCtx.n_past + 1 <= promptCtx.n_ctx); } @@ -165,8 +264,9 @@ void LLModel::prompt(const std::string &prompt, } } -std::vector LLModel::embedding(const std::string &/*text*/) +std::vector LLModel::embedding(const std::string &text) { + (void)text; if (!supportsCompletion()) { std::string errorMessage = "ERROR: this model does not support generating embeddings!\n"; std::cerr << implementation().modelType() << errorMessage; diff --git a/gpt4all-bindings/python/docs/gpt4all_python.md b/gpt4all-bindings/python/docs/gpt4all_python.md index dd4f6d7f35d1..7e56fabeec5a 100644 --- a/gpt4all-bindings/python/docs/gpt4all_python.md +++ b/gpt4all-bindings/python/docs/gpt4all_python.md @@ -246,90 +246,6 @@ To do the same outside a session, the input has to be formatted manually. For ex The colors in my previous response are blue, green and red. ``` -Ultimately, the method `GPT4All._format_chat_prompt_template()` is responsible for formatting templates. It can be -customized in a subclass. As an example: - -=== "Custom Subclass" - ``` py - from itertools import cycle - from gpt4all import GPT4All - - class RotatingTemplateGPT4All(GPT4All): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._templates = [ - "Respond like a pirate.", - "Respond like a politician.", - "Respond like a philosopher.", - "Respond like a Klingon.", - ] - self._cycling_templates = cycle(self._templates) - - def _format_chat_prompt_template( - self, - messages: list, - default_prompt_header: str = "", - default_prompt_footer: str = "", - ) -> str: - full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else "" - for message in messages: - if message["role"] == "user": - user_message = f"USER: {message['content']} {next(self._cycling_templates)}\n" - full_prompt += user_message - if message["role"] == "assistant": - assistant_message = f"ASSISTANT: {message['content']}\n" - full_prompt += assistant_message - full_prompt += "\n\n" + default_prompt_footer if default_prompt_footer != "" else "" - print(full_prompt) - return full_prompt - ``` -=== "GPT4All Custom Subclass Example" - ``` py - model = RotatingTemplateGPT4All('wizardlm-13b-v1.2.Q4_0.gguf') - with model.chat_session(): # starting a session is optional in this example - response1 = model.generate("hi, who are you?") - print(response1) - print() - response2 = model.generate("what can you tell me about snakes?") - print(response2) - print() - response3 = model.generate("what's your opinion on Chess?") - print(response3) - print() - response4 = model.generate("tell me about ancient Rome.") - print(response4) - ``` -=== "Possible Output" - ``` - USER: hi, who are you? Respond like a pirate. - - Pirate: Ahoy there mateys! I be Cap'n Jack Sparrow of the Black Pearl. - - USER: what can you tell me about snakes? Respond like a politician. - - Politician: Snakes have been making headlines lately due to their ability to - slither into tight spaces and evade capture, much like myself during my last - election campaign. However, I believe that with proper education and - understanding of these creatures, we can work together towards creating a - safer environment for both humans and snakes alike. - - USER: what's your opinion on Chess? 
Respond like a philosopher. - - Philosopher: The game of chess is often used as an analogy to illustrate the - complexities of life and decision-making processes. However, I believe that it - can also be seen as a reflection of our own consciousness and subconscious mind. - Just as each piece on the board has its unique role to play in shaping the - outcome of the game, we too have different roles to fulfill in creating our own - personal narrative. - - USER: tell me about ancient Rome. Respond like a Klingon. - - Klingon: Ancient Rome was once a great empire that ruled over much of Europe and - the Mediterranean region. However, just as the Empire fell due to internal strife - and external threats, so too did my own house come crashing down when I failed to - protect our homeworld from invading forces. - ``` - ### Introspection A less apparent feature is the capacity to log the final prompt that gets sent to the model. It relies on diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py index eb03a91443fe..9aaa94c10208 100644 --- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py +++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py @@ -89,10 +89,12 @@ class LLModelGPUDevice(ctypes.Structure): llmodel.llmodel_prompt.argtypes = [ ctypes.c_void_p, ctypes.c_char_p, + ctypes.c_char_p, PromptCallback, ResponseCallback, RecalculateCallback, ctypes.POINTER(LLModelPromptContext), + ctypes.c_bool, ] llmodel.llmodel_prompt.restype = None @@ -290,6 +292,7 @@ def generate_embedding(self, text: str) -> List[float]: def prompt_model( self, prompt: str, + prompt_template: str, callback: ResponseCallbackType, n_predict: int = 4096, top_k: int = 40, @@ -300,6 +303,7 @@ def prompt_model( repeat_last_n: int = 10, context_erase: float = 0.75, reset_context: bool = False, + special: bool = False, ): """ Generate response from model from a prompt. 
@@ -326,9 +330,6 @@ def prompt_model( prompt, ) - prompt_bytes = prompt.encode() - prompt_ptr = ctypes.c_char_p(prompt_bytes) - self._set_context( n_predict=n_predict, top_k=top_k, @@ -343,16 +344,18 @@ def prompt_model( llmodel.llmodel_prompt( self.model, - prompt_ptr, + ctypes.c_char_p(prompt.encode()), + ctypes.c_char_p(prompt_template.encode()), PromptCallback(self._prompt_callback), ResponseCallback(self._callback_decoder(callback)), RecalculateCallback(self._recalculate_callback), self.context, + special, ) def prompt_model_streaming( - self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs + self, prompt: str, prompt_template: str, callback: ResponseCallbackType = empty_response_callback, **kwargs ) -> Iterable[str]: output_queue: Queue[str | Sentinel] = Queue() @@ -369,15 +372,15 @@ def _generator_callback(token_id: int, response: str): return _generator_callback - def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs): - self.prompt_model(prompt, callback, **kwargs) + def run_llmodel_prompt(prompt: str, prompt_template: str, callback: ResponseCallbackType, **kwargs): + self.prompt_model(prompt, prompt_template, callback, **kwargs) output_queue.put(Sentinel.TERMINATING_SYMBOL) # Kick off llmodel_prompt in separate thread so we can return generator # immediately thread = threading.Thread( target=run_llmodel_prompt, - args=(prompt, _generator_callback_wrapper(callback)), + args=(prompt, prompt_template, _generator_callback_wrapper(callback)), kwargs=kwargs, ) thread.start() diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py index 02fa1c806bb9..82342b28babf 100644 --- a/gpt4all-bindings/python/gpt4all/gpt4all.py +++ b/gpt4all-bindings/python/gpt4all/gpt4all.py @@ -4,8 +4,10 @@ from __future__ import annotations import os +import re import sys import time +import warnings from contextlib import contextmanager from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Union @@ -314,6 +316,10 @@ def generate( Either the entire completion or a generator that yields the completion token by token. """ + if re.search(r"%1(?![0-9])", self._current_prompt_template): + raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt " + "placeholder, please use '{0}' instead.") + # Preparing the model request generate_kwargs: Dict[str, Any] = dict( temp=temp, @@ -327,16 +333,29 @@ def generate( if self._is_chat_session_activated: # check if there is only one message, i.e. system prompt: - generate_kwargs["reset_context"] = len(self.current_chat_session) == 1 + reset = len(self.current_chat_session) == 1 + generate_kwargs["reset_context"] = reset self.current_chat_session.append({"role": "user", "content": prompt}) - prompt = self._format_chat_prompt_template( - messages=self.current_chat_session[-1:], - default_prompt_header=self.current_chat_session[0]["content"] - if generate_kwargs["reset_context"] - else "", - ) + if self._format_chat_prompt_template.__func__ is GPT4All._format_chat_prompt_template: + if reset: + # ingest system prompt + self.model.prompt_model(self.current_chat_session[0]["content"], "%1", + n_batch=n_batch, n_predict=0, special=True) + prompt_template = self._current_prompt_template.format("%1") + else: + warnings.warn( + "_format_chat_prompt_template is deprecated. 
Please use a chat session with a prompt template.", + DeprecationWarning, + ) + # special tokens won't be processed + prompt = self._format_chat_prompt_template( + self.current_chat_session[-1:], + self.current_chat_session[0]["content"] if reset else "", + ) + prompt_template = "%1" else: + prompt_template = "%1" generate_kwargs["reset_context"] = True # Prepare the callback, process the model response @@ -365,14 +384,16 @@ def _callback(token_id: int, response: str) -> bool: # Send the request to the model if streaming: return self.model.prompt_model_streaming( - prompt=prompt, - callback=_callback_wrapper(callback, output_collector), + prompt, + prompt_template, + _callback_wrapper(callback, output_collector), **generate_kwargs, ) self.model.prompt_model( - prompt=prompt, - callback=_callback_wrapper(callback, output_collector), + prompt, + prompt_template, + _callback_wrapper(callback, output_collector), **generate_kwargs, ) @@ -423,24 +444,6 @@ def _format_chat_prompt_template( Formatted prompt. """ - if isinstance(default_prompt_header, bool): - import warnings - - warnings.warn( - "Using True/False for the 'default_prompt_header' is deprecated. Use a string instead.", - DeprecationWarning, - ) - default_prompt_header = "" - - if isinstance(default_prompt_footer, bool): - import warnings - - warnings.warn( - "Using True/False for the 'default_prompt_footer' is deprecated. Use a string instead.", - DeprecationWarning, - ) - default_prompt_footer = "" - full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else "" for message in messages: diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py index c76f1b49254b..e13bca6c07a2 100644 --- a/gpt4all-bindings/python/setup.py +++ b/gpt4all-bindings/python/setup.py @@ -68,7 +68,7 @@ def get_long_description(): setup( name=package_name, - version="2.2.1.post1", + version="2.3.0", description="Python bindings for GPT4All", long_description=get_long_description(), long_description_content_type="text/markdown", diff --git a/gpt4all-chat/chatgpt.cpp b/gpt4all-chat/chatgpt.cpp index 5f3da91d180a..0575ee8ee22b 100644 --- a/gpt4all-chat/chatgpt.cpp +++ b/gpt4all-chat/chatgpt.cpp @@ -75,13 +75,18 @@ size_t ChatGPT::restoreState(const uint8_t *src) } void ChatGPT::prompt(const std::string &prompt, + const std::string &promptTemplate, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, - PromptContext &promptCtx) { + PromptContext &promptCtx, + bool special, + std::string *fakeReply) { Q_UNUSED(promptCallback); Q_UNUSED(recalculateCallback); + Q_UNUSED(special); + Q_UNUSED(fakeReply); // FIXME(cebtenzzre): I broke ChatGPT if (!isModelLoaded()) { std::cerr << "ChatGPT ERROR: prompt won't work with an unloaded model!\n"; @@ -109,7 +114,7 @@ void ChatGPT::prompt(const std::string &prompt, QJsonObject promptObject; promptObject.insert("role", "user"); - promptObject.insert("content", QString::fromStdString(prompt)); + promptObject.insert("content", QString::fromStdString(promptTemplate).arg(QString::fromStdString(prompt))); messages.append(promptObject); root.insert("messages", messages); diff --git a/gpt4all-chat/chatgpt.h b/gpt4all-chat/chatgpt.h index 11d84606bccf..2656c6f763c4 100644 --- a/gpt4all-chat/chatgpt.h +++ b/gpt4all-chat/chatgpt.h @@ -1,6 +1,8 @@ #ifndef CHATGPT_H #define CHATGPT_H +#include + #include #include #include @@ -55,10 +57,13 @@ class ChatGPT : public QObject, public LLModel { size_t saveState(uint8_t *dest) const override; size_t 
restoreState(const uint8_t *src) override; void prompt(const std::string &prompt, + const std::string &promptTemplate, std::function promptCallback, std::function responseCallback, std::function recalculateCallback, - PromptContext &ctx) override; + PromptContext &ctx, + bool special, + std::string *fakeReply) override; void setThreadCount(int32_t n_threads) override; int32_t threadCount() const override; @@ -69,7 +74,7 @@ class ChatGPT : public QObject, public LLModel { QList context() const { return m_context; } void setContext(const QList &context) { m_context = context; } - bool callResponse(int32_t token, const std::string& string); + bool callResponse(int32_t token, const std::string &string); Q_SIGNALS: void request(const QString &apiKey, @@ -80,12 +85,41 @@ class ChatGPT : public QObject, public LLModel { // We have to implement these as they are pure virtual in base class, but we don't actually use // them as they are only called from the default implementation of 'prompt' which we override and // completely replace - std::vector tokenize(PromptContext &, const std::string&) const override { return std::vector(); } - std::string tokenToString(Token) const override { return std::string(); } - Token sampleToken(PromptContext &ctx) const override { return -1; } - bool evalTokens(PromptContext &/*ctx*/, const std::vector& /*tokens*/) const override { return false; } - int32_t contextLength() const override { return -1; } - const std::vector& endTokens() const override { static const std::vector fres; return fres; } + + std::vector tokenize(PromptContext &ctx, const std::string &str, bool special) const override { + (void)ctx; + (void)str; + (void)special; + throw std::logic_error("not implemented"); + } + + std::string tokenToString(Token id) const override { + (void)id; + throw std::logic_error("not implemented"); + } + + Token sampleToken(PromptContext &ctx) const override { + (void)ctx; + throw std::logic_error("not implemented"); + } + + bool evalTokens(PromptContext &ctx, const std::vector &tokens) const override { + (void)ctx; + (void)tokens; + throw std::logic_error("not implemented"); + } + + int32_t contextLength() const override { + throw std::logic_error("not implemented"); + } + + const std::vector &endTokens() const override { + throw std::logic_error("not implemented"); + } + + bool shouldAddBOS() const override { + throw std::logic_error("not implemented"); + } private: std::function m_responseCallback; diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index 750e85485b41..aa19b69601b1 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -303,6 +303,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo) m_llModelInfo.model = LLModel::Implementation::construct(filePath.toStdString(), buildVariant, n_ctx); if (m_llModelInfo.model) { + if (m_llModelInfo.model->isModelBlacklisted(filePath.toStdString())) { + // TODO(cebtenzzre): warn that this model is out-of-date + } m_llModelInfo.model->setProgressCallback([this](float progress) -> bool { emit modelLoadingPercentageChanged(progress); @@ -588,14 +591,11 @@ bool ChatLLM::promptInternal(const QList &collectionList, const QString } // Augment the prompt template with the results if any - QList augmentedTemplate; + QList docsContext; if (!databaseResults.isEmpty()) - augmentedTemplate.append("### Context:"); + docsContext.append("### Context:"); for (const ResultInfo &info : databaseResults) - augmentedTemplate.append(info.text); - augmentedTemplate.append(promptTemplate); - - QString 
instructPrompt = augmentedTemplate.join("\n").arg(prompt); + docsContext.append(info.text); int n_threads = MySettings::globalInstance()->threadCount(); @@ -605,7 +605,6 @@ bool ChatLLM::promptInternal(const QList &collectionList, const QString std::placeholders::_2); auto recalcFunc = std::bind(&ChatLLM::handleRecalculate, this, std::placeholders::_1); emit promptProcessing(); - qint32 logitsBefore = m_ctx.logits.size(); m_ctx.n_predict = n_predict; m_ctx.top_k = top_k; m_ctx.top_p = top_p; @@ -615,11 +614,16 @@ bool ChatLLM::promptInternal(const QList &collectionList, const QString m_ctx.repeat_last_n = repeat_penalty_tokens; m_llModelInfo.model->setThreadCount(n_threads); #if defined(DEBUG) - printf("%s", qPrintable(instructPrompt)); + printf("%s", qPrintable(prompt)); fflush(stdout); #endif m_timer->start(); - m_llModelInfo.model->prompt(instructPrompt.toStdString(), promptFunc, responseFunc, recalcFunc, m_ctx); + if (!docsContext.isEmpty()) { + auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode localdocs context without a response + m_llModelInfo.model->prompt(docsContext.join("\n").toStdString(), "%1", promptFunc, responseFunc, recalcFunc, m_ctx); + m_ctx.n_predict = old_n_predict; // now we are ready for a response + } + m_llModelInfo.model->prompt(prompt.toStdString(), promptTemplate.toStdString(), promptFunc, responseFunc, recalcFunc, m_ctx); #if defined(DEBUG) printf("\n"); fflush(stdout); @@ -720,7 +724,7 @@ void ChatLLM::generateName() printf("%s", qPrintable(instructPrompt)); fflush(stdout); #endif - m_llModelInfo.model->prompt(instructPrompt.toStdString(), promptFunc, responseFunc, recalcFunc, ctx); + m_llModelInfo.model->prompt(instructPrompt.toStdString(), "%1", promptFunc, responseFunc, recalcFunc, ctx); #if defined(DEBUG) printf("\n"); fflush(stdout); @@ -780,16 +784,6 @@ bool ChatLLM::handleSystemPrompt(int32_t token) return !m_stopGenerating; } -bool ChatLLM::handleSystemResponse(int32_t token, const std::string &response) -{ -#if defined(DEBUG) - qDebug() << "system response" << m_llmThread.objectName() << token << response << m_stopGenerating; -#endif - Q_UNUSED(token); - Q_UNUSED(response); - return false; -} - bool ChatLLM::handleSystemRecalculate(bool isRecalc) { #if defined(DEBUG) @@ -808,16 +802,6 @@ bool ChatLLM::handleRestoreStateFromTextPrompt(int32_t token) return !m_stopGenerating; } -bool ChatLLM::handleRestoreStateFromTextResponse(int32_t token, const std::string &response) -{ -#if defined(DEBUG) - qDebug() << "restore state from text response" << m_llmThread.objectName() << token << response << m_stopGenerating; -#endif - Q_UNUSED(token); - Q_UNUSED(response); - return false; -} - bool ChatLLM::handleRestoreStateFromTextRecalculate(bool isRecalc) { #if defined(DEBUG) @@ -1027,8 +1011,6 @@ void ChatLLM::processSystemPrompt() m_ctx = LLModel::PromptContext(); auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1); - auto responseFunc = std::bind(&ChatLLM::handleSystemResponse, this, std::placeholders::_1, - std::placeholders::_2); auto recalcFunc = std::bind(&ChatLLM::handleSystemRecalculate, this, std::placeholders::_1); const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo); @@ -1051,7 +1033,9 @@ void ChatLLM::processSystemPrompt() printf("%s", qPrintable(QString::fromStdString(systemPrompt))); fflush(stdout); #endif - m_llModelInfo.model->prompt(systemPrompt, promptFunc, responseFunc, recalcFunc, m_ctx); + auto old_n_predict = std::exchange(m_ctx.n_predict, 0); // decode system 
prompt without a response + m_llModelInfo.model->prompt(systemPrompt, "%1", promptFunc, nullptr, recalcFunc, m_ctx, true); + m_ctx.n_predict = old_n_predict; #if defined(DEBUG) printf("\n"); fflush(stdout); @@ -1073,8 +1057,6 @@ void ChatLLM::processRestoreStateFromText() m_ctx = LLModel::PromptContext(); auto promptFunc = std::bind(&ChatLLM::handleRestoreStateFromTextPrompt, this, std::placeholders::_1); - auto responseFunc = std::bind(&ChatLLM::handleRestoreStateFromTextResponse, this, std::placeholders::_1, - std::placeholders::_2); auto recalcFunc = std::bind(&ChatLLM::handleRestoreStateFromTextRecalculate, this, std::placeholders::_1); const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo); @@ -1094,9 +1076,19 @@ void ChatLLM::processRestoreStateFromText() m_ctx.repeat_penalty = repeat_penalty; m_ctx.repeat_last_n = repeat_penalty_tokens; m_llModelInfo.model->setThreadCount(n_threads); - for (auto pair : m_stateFromText) { - const QString str = pair.first == "Prompt: " ? promptTemplate.arg(pair.second) : pair.second; - m_llModelInfo.model->prompt(str.toStdString(), promptFunc, responseFunc, recalcFunc, m_ctx); + + auto it = m_stateFromText.begin(); + while (it < m_stateFromText.end()) { + auto &prompt = *it++; + Q_ASSERT(prompt.first == "Prompt: "); + Q_ASSERT(it < m_stateFromText.end()); + + auto &response = *it++; + Q_ASSERT(response.first != "Prompt: "); + auto responseText = response.second.toStdString(); + + m_llModelInfo.model->prompt(prompt.second.toStdString(), promptTemplate.toStdString(), promptFunc, nullptr, + recalcFunc, m_ctx, false, &responseText); } if (!m_stopGenerating) { diff --git a/gpt4all-chat/metadata/models2.json b/gpt4all-chat/metadata/models2.json index 5096cd032228..903e7ad6bd2a 100644 --- a/gpt4all-chat/metadata/models2.json +++ b/gpt4all-chat/metadata/models2.json @@ -17,10 +17,10 @@ }, { "order": "b", - "md5sum": "48de9538c774188eb25a7e9ee024bbd3", + "md5sum": "f692417a22405d80573ac10cb0cd6c6a", "name": "Mistral OpenOrca", - "filename": "mistral-7b-openorca.Q4_0.gguf", - "filesize": "4108927744", + "filename": "mistral-7b-openorca.Q4_0.gguf2.gguf", + "filesize": "4108928128", "requires": "2.5.0", "ramrequired": "8", "parameters": "7 billion", @@ -28,7 +28,7 @@ "type": "Mistral", "description": "Best overall fast chat model
      • Fast responses
      • Chat based model
      • Trained by Mistral AI
      • Finetuned on OpenOrca dataset curated via Nomic Atlas
      • Licensed for commercial use
      ", "url": "https://gpt4all.io/models/gguf/mistral-7b-openorca.Q4_0.gguf", - "promptTemplate": "<|im_start|>user\n%1<|im_end|><|im_start|>assistant\n", + "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n", "systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI. For multi-step problems, write out your reasoning for each step.\n<|im_end|>" }, { @@ -152,7 +152,7 @@ "type": "MPT", "description": "Good model with novel architecture
      • Fast responses
      • Chat based
      • Trained by Mosaic ML
      • Cannot be used commercially
      ", "url": "https://gpt4all.io/models/gguf/mpt-7b-chat-newbpe-q4_0.gguf", - "promptTemplate": "<|im_start|>user\n%1<|im_end|><|im_start|>assistant\n", + "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n", "systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>" }, { diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp index 7d07e4c5fc88..10881e880589 100644 --- a/gpt4all-chat/modellist.cpp +++ b/gpt4all-chat/modellist.cpp @@ -951,7 +951,7 @@ void ModelList::updateModelsFromDirectory() processDirectory(localPath); } -#define MODELS_VERSION 2 +#define MODELS_VERSION 3 void ModelList::updateModelsFromJson() { From 67bbce43abf55c25c937a2339fa2de72acc23058 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 16:05:49 -0500 Subject: [PATCH 15/17] Fix state issues with reloading model. Signed-off-by: Adam Treat --- gpt4all-chat/chatllm.cpp | 6 +++++- gpt4all-chat/main.qml | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index aa19b69601b1..d0c9d33b1f6a 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -672,7 +672,11 @@ void ChatLLM::unloadModel() if (!isModelLoaded() || m_isServer) return; - emit modelLoadingPercentageChanged(0.0f); + if (!m_forceUnloadModel || !m_shouldBeLoaded) + emit modelLoadingPercentageChanged(0.0f); + else + emit modelLoadingPercentageChanged(std::numeric_limits::min()); // small non-zero positive value + saveState(); #if defined(DEBUG_MODEL_LOADING) qDebug() << "unloadModel" << m_llmThread.objectName() << m_llModelInfo.model; diff --git a/gpt4all-chat/main.qml b/gpt4all-chat/main.qml index 7bacb6cb2b79..70fe6dae9170 100644 --- a/gpt4all-chat/main.qml +++ b/gpt4all-chat/main.qml @@ -463,7 +463,7 @@ Window { MyMiniButton { id: ejectButton - visible: currentChat.isModelLoaded + visible: currentChat.isModelLoaded && !window.isCurrentlyLoading z: 500 anchors.right: parent.right anchors.rightMargin: 50 From ef0a67eb940a57f88172817b186a439809360e46 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 21 Feb 2024 16:18:26 -0500 Subject: [PATCH 16/17] models: remove gemma from models2.json and models3.json (#1995) Signed-off-by: Jared Van Bortel --- gpt4all-chat/metadata/models2.json | 48 ++++++++++-------------------- gpt4all-chat/metadata/models3.json | 48 ++++++++++-------------------- 2 files changed, 32 insertions(+), 64 deletions(-) diff --git a/gpt4all-chat/metadata/models2.json b/gpt4all-chat/metadata/models2.json index 903e7ad6bd2a..4d6c6a372819 100644 --- a/gpt4all-chat/metadata/models2.json +++ b/gpt4all-chat/metadata/models2.json @@ -1,22 +1,6 @@ [ { "order": "a", - "md5sum": "6d1ca6e9533d177361fe2612a2c87474", - "name": "Gemma Instruct", - "filename": "gemma-7b-it.Q4_0.gguf", - "filesize": "4809316512", - "requires": "2.7.1", - "ramrequired": "8", - "parameters": "7 billion", - "quant": "q4_0", - "type": "Gemma", - "description": "A state-of-the-art open model from Google
      • Fast responses
      • Chat based model
      • Trained by Google
      • Licensed for commercial use
      • Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
      ", - "url": "https://gpt4all.io/models/gguf/gemma-7b-it.Q4_0.gguf", - "promptTemplate": "user\n%1\nmodel\n", - "systemPrompt": "" - }, - { - "order": "b", "md5sum": "f692417a22405d80573ac10cb0cd6c6a", "name": "Mistral OpenOrca", "filename": "mistral-7b-openorca.Q4_0.gguf2.gguf", @@ -31,6 +15,22 @@ "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n", "systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI. For multi-step problems, write out your reasoning for each step.\n<|im_end|>" }, + { + "order": "b", + "md5sum": "97463be739b50525df56d33b26b00852", + "name": "Mistral Instruct", + "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", + "filesize": "4108916384", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "systemPrompt": " ", + "description": "Best overall fast instruction following model
      • Fast responses
      • Trained by Mistral AI
      • Uncensored
      • Licensed for commercial use
      ", + "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", + "promptTemplate": "[INST] %1 [/INST]" + }, { "order": "c", "md5sum": "c4c78adf744d6a20f05c8751e3961b84", @@ -47,22 +47,6 @@ "url": "https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf", "promptTemplate": "### Instruction:\n%1\n### Response:\n" }, - { - "order": "d", - "md5sum": "97463be739b50525df56d33b26b00852", - "name": "Mistral Instruct", - "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", - "filesize": "4108916384", - "requires": "2.5.0", - "ramrequired": "8", - "parameters": "7 billion", - "quant": "q4_0", - "type": "Mistral", - "systemPrompt": " ", - "description": "Best overall fast instruction following model
      • Fast responses
      • Trained by Mistral AI
      • Uncensored
      • Licensed for commercial use
      ", - "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", - "promptTemplate": "[INST] %1 [/INST]" - }, { "order": "e", "md5sum": "00c8593ba57f5240f59662367b3ed4a5", diff --git a/gpt4all-chat/metadata/models3.json b/gpt4all-chat/metadata/models3.json index 5e33ca0f88b0..df6c12eb0468 100644 --- a/gpt4all-chat/metadata/models3.json +++ b/gpt4all-chat/metadata/models3.json @@ -1,22 +1,6 @@ [ { "order": "a", - "md5sum": "6d1ca6e9533d177361fe2612a2c87474", - "name": "Gemma Instruct", - "filename": "gemma-7b-it.Q4_0.gguf", - "filesize": "4809316512", - "requires": "2.7.1", - "ramrequired": "8", - "parameters": "7 billion", - "quant": "q4_0", - "type": "Gemma", - "description": "A state-of-the-art open model from Google
      • Fast responses
      • Chat based model
      • Trained by Google
      • Licensed for commercial use
      • Gemma is provided under and subject to the Gemma Terms of Use found at ai.google.dev/gemma/terms
      ", - "url": "https://gpt4all.io/models/gguf/gemma-7b-it.Q4_0.gguf", - "promptTemplate": "user\n%1\nmodel\n%2\n", - "systemPrompt": "" - }, - { - "order": "b", "md5sum": "f692417a22405d80573ac10cb0cd6c6a", "name": "Mistral OpenOrca", "filename": "mistral-7b-openorca.Q4_0.gguf2.gguf", @@ -31,6 +15,22 @@ "promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n", "systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI. For multi-step problems, write out your reasoning for each step.\n<|im_end|>" }, + { + "order": "b", + "md5sum": "97463be739b50525df56d33b26b00852", + "name": "Mistral Instruct", + "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", + "filesize": "4108916384", + "requires": "2.5.0", + "ramrequired": "8", + "parameters": "7 billion", + "quant": "q4_0", + "type": "Mistral", + "systemPrompt": " ", + "description": "Best overall fast instruction following model
      • Fast responses
      • Trained by Mistral AI
      • Uncensored
      • Licensed for commercial use
      ", + "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", + "promptTemplate": "[INST] %1 [/INST]" + }, { "order": "c", "md5sum": "c4c78adf744d6a20f05c8751e3961b84", @@ -47,22 +47,6 @@ "url": "https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf", "promptTemplate": "### Instruction:\n%1\n### Response:\n" }, - { - "order": "d", - "md5sum": "97463be739b50525df56d33b26b00852", - "name": "Mistral Instruct", - "filename": "mistral-7b-instruct-v0.1.Q4_0.gguf", - "filesize": "4108916384", - "requires": "2.5.0", - "ramrequired": "8", - "parameters": "7 billion", - "quant": "q4_0", - "type": "Mistral", - "systemPrompt": " ", - "description": "Best overall fast instruction following model
      • Fast responses
      • Trained by Mistral AI
      • Uncensored
      • Licensed for commercial use
      ", - "url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf", - "promptTemplate": "[INST] %1 [/INST]" - }, { "order": "e", "md5sum": "00c8593ba57f5240f59662367b3ed4a5", From a010a8a7ca3020e14a5e6c08a3426e0d987eef75 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 21 Feb 2024 16:53:47 -0500 Subject: [PATCH 17/17] Bump version and release notes for v2.7.1 Signed-off-by: Adam Treat --- gpt4all-chat/CMakeLists.txt | 2 +- gpt4all-chat/metadata/release.json | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/gpt4all-chat/CMakeLists.txt b/gpt4all-chat/CMakeLists.txt index 0f9d0ab0f2e9..076e3c0b6fd2 100644 --- a/gpt4all-chat/CMakeLists.txt +++ b/gpt4all-chat/CMakeLists.txt @@ -18,7 +18,7 @@ endif() set(APP_VERSION_MAJOR 2) set(APP_VERSION_MINOR 7) -set(APP_VERSION_PATCH 1) +set(APP_VERSION_PATCH 2) set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}") # Include the binary directory for the generated header file diff --git a/gpt4all-chat/metadata/release.json b/gpt4all-chat/metadata/release.json index bd5b9b6836db..1ca17c3afbf9 100644 --- a/gpt4all-chat/metadata/release.json +++ b/gpt4all-chat/metadata/release.json @@ -683,6 +683,28 @@ * Jared Van Bortel (Nomic AI) * Adam Treat (Nomic AI) * Community (beta testers, bug reporters, bindings authors) +" + }, + { + "version": "2.7.1", + "notes": +" +* Update to latest llama.cpp with support for Google Gemma +* Gemma, Phi and Phi-2, Qwen2, and StableLM are now all GPU accelerated +* Large revamp of the model loading to support explicit unload/reload +* Bugfixes for ChatML and improved version of Mistral OpenOrca +* We no longer load a model by default on application start +* We no longer load a model by default on chat context switch +* Fixes for visual artifacts in update reminder dialog +* Blacklist Intel GPU's for now as we don't support yet +* Fixes for binary save/restore of chat +* Save and restore of window geometry across application starts +", + "contributors": +" +* Jared Van Bortel (Nomic AI) +* Adam Treat (Nomic AI) +* Community (beta testers, bug reporters, bindings authors) " } ]