Merge branch 'nomic-ai:main' into main
abdulrahman305 authored Feb 22, 2024
2 parents b81d008 + a010a8a commit 1d07ae2
Showing 38 changed files with 1,222 additions and 522 deletions.
4 changes: 3 additions & 1 deletion gpt4all-backend/bert.cpp
@@ -814,8 +814,10 @@ std::vector<float> Bert::embedding(const std::string &text)
return finalEmbeddings;
}

std::vector<LLModel::Token> Bert::tokenize(PromptContext &, const std::string &str) const
std::vector<LLModel::Token> Bert::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
(void)ctx;
(void)special;
return ::bert_tokenize(d_ptr->ctx, str.c_str());
}

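Both here and in gptj.cpp below, the tokenize override takes the same shape: the base-class signature has gained named parameters and a bool special flag, and the parameters this backend does not use are cast to void to keep the override warning-free. A minimal standalone sketch of that pattern, using illustrative class names rather than the gpt4all types:

// Sketch of the widened-virtual pattern used above: the base interface grows a
// `special` flag, and an override that has no use for it silences the
// unused-parameter warning with a void cast.
#include <cstdint>
#include <string>
#include <vector>

struct ExampleModel {                       // illustrative stand-in, not LLModel
    using Token = int32_t;
    virtual std::vector<Token> tokenize(const std::string &str, bool special) const = 0;
    virtual ~ExampleModel() = default;
};

struct ByteModel : ExampleModel {           // toy backend with no special tokens
    std::vector<Token> tokenize(const std::string &str, bool special) const override {
        (void)special;                      // ignored, as in the Bert/GPTJ overrides
        std::vector<Token> out;
        out.reserve(str.size());
        for (unsigned char c : str)
            out.push_back(static_cast<Token>(c)); // one token per byte
        return out;
    }
};
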
7 changes: 4 additions & 3 deletions gpt4all-backend/bert_impl.h
@@ -33,12 +33,13 @@ class Bert : public LLModel {
std::unique_ptr<BertPrivate> d_ptr;

protected:
std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
Token sampleToken(PromptContext &ctx) const override;
std::string tokenToString(Token) const override;
std::string tokenToString(Token id) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token>& endTokens() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override { return true; }
};

#endif // BERT_H
4 changes: 3 additions & 1 deletion gpt4all-backend/gptj.cpp
@@ -737,8 +737,10 @@ size_t GPTJ::restoreState(const uint8_t *src)
return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src);
}

std::vector<LLModel::Token> GPTJ::tokenize(PromptContext &, const std::string &str) const
std::vector<LLModel::Token> GPTJ::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
(void)ctx;
(void)special;
return ::gpt_tokenize(d_ptr->vocab, str);
}

Empty file removed gpt4all-backend/gptj/placeholder
7 changes: 4 additions & 3 deletions gpt4all-backend/gptj_impl.h
@@ -30,12 +30,13 @@ class GPTJ : public LLModel {
GPTJPrivate *d_ptr;

protected:
std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
Token sampleToken(PromptContext &ctx) const override;
std::string tokenToString(Token) const override;
std::string tokenToString(Token id) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token>& endTokens() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override { return false; }
};

#endif // GPTJ_H
2 changes: 1 addition & 1 deletion gpt4all-backend/llama.cpp-mainline
Empty file removed gpt4all-backend/llama/placeholder
190 changes: 113 additions & 77 deletions gpt4all-backend/llamamodel.cpp
@@ -6,38 +6,29 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iomanip>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h>
#else
#include <unistd.h>
#endif
#include <map>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>

#include <llama.h>
#include <ggml.h>

#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#include <ggml-kompute.h>
#endif

using namespace std::string_literals;

// Maximum supported GGUF version
static constexpr int GGUF_VER_MAX = 3;

namespace {
const char *modelType_ = "LLaMA";
}
static const char * const modelType_ = "LLaMA";

static bool llama_verbose() {
const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
@@ -96,6 +87,56 @@ static int llama_sample_top_p_top_k(
return llama_sample_token(ctx, &candidates_p);
}

std::string get_arch_name(gguf_context *ctx_gguf) {
std::string arch_name;
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
if (ktype != (GGUF_TYPE_STRING)) {
throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
}
return gguf_get_val_str(ctx_gguf, kid);
}

static gguf_context *load_gguf(const char *fname) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ nullptr,
};
gguf_context *ctx = gguf_init_from_file(fname, params);
if (!ctx) {
std::cerr << __func__ << ": gguf_init_from_file failed\n";
return nullptr;
}

int gguf_ver = gguf_get_version(ctx);
if (gguf_ver > GGUF_VER_MAX) {
std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
gguf_free(ctx);
return nullptr;
}

return ctx;
}

static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
auto * ctx = load_gguf(modelPath.c_str());
auto arch = get_arch_name(ctx);

int32_t value = -1;
if (ctx) {
auto key = arch + "." + archKey;
int keyidx = gguf_find_key(ctx, key.c_str());
if (keyidx != -1) {
value = gguf_get_val_u32(ctx, keyidx);
} else {
std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
}
}

gguf_free(ctx);
return value;
}

struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded;
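
The three GGUF helpers added above (get_arch_name, load_gguf, get_arch_key_u32) let the backend read a single architecture-scoped metadata value without loading the whole model. A hedged usage sketch follows; the key name "context_length" is the conventional GGUF key (e.g. llama.context_length) and is an assumption here, not something this diff introduces:

// Hypothetical caller of get_arch_key_u32: report the trained context length
// of a model file before deciding whether to load it.
#include <cstdint>
#include <iostream>
#include <string>

// Forward declaration of the helper defined above in this file.
static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey);

void reportTrainedContext(const std::string &modelPath) {
    // Resolves to "<arch>.context_length", e.g. "llama.context_length".
    int32_t n_ctx_train = get_arch_key_u32(modelPath, "context_length");
    if (n_ctx_train < 0) {
        std::cerr << "could not read context length from " << modelPath << "\n";
        return;
    }
    std::cout << modelPath << " was trained with a context of "
              << n_ctx_train << " tokens\n";
}
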
@@ -148,6 +189,42 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
return filesize + est_kvcache_size;
}

bool LLamaModel::isModelBlacklisted(const std::string &modelPath) {
auto * ctx = load_gguf(modelPath.c_str());
if (!ctx) {
std::cerr << __func__ << ": failed to load " << modelPath << "\n";
return false;
}

auto get_key = [ctx, &modelPath](const char *name) {
int keyidx = gguf_find_key(ctx, name);
if (keyidx == -1) {
throw std::logic_error(name + " not found in "s + modelPath);
}
return keyidx;
};

bool res = false;
try {
std::string name(gguf_get_val_str(ctx, get_key("general.name")));
int token_idx = get_key("tokenizer.ggml.tokens");
int n_vocab = gguf_get_arr_n(ctx, token_idx);

// check for known bad models
if (name == "open-orca_mistral-7b-openorca"
&& n_vocab == 32002
&& gguf_get_arr_str(ctx, token_idx, 32000) == "<dummy32000>"s // should be <|im_end|>
) {
res = true;
}
} catch (const std::logic_error &e) {
std::cerr << __func__ << ": " << e.what() << "\n";
}

gguf_free(ctx);
return res;
}

bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
{
d_ptr->modelLoaded = false;
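
The new isModelBlacklisted hook only reports that a model's metadata is known to be bad; what to do about it is left to the caller. A hedged sketch of how a front-end might combine it with loadModel (the wrapper function is hypothetical, not part of this commit):

// Hypothetical wrapper: warn about blacklisted models, then load as usual.
#include <iostream>
#include <string>

#include "llamamodel_impl.h"

bool loadWithWarning(LLamaModel &model, const std::string &path, int n_ctx, int ngl) {
    if (model.isModelBlacklisted(path)) {
        // The check is advisory; the commit does not refuse to load such models.
        std::cerr << "warning: " << path
                  << " has known-bad metadata; consider re-downloading it\n";
    }
    return model.loadModel(path, n_ctx, ngl);
}
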
@@ -180,6 +257,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->model_params.use_mlock = params.use_mlock;
#endif

d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback;
d_ptr->model_params.progress_callback_user_data = this;

#ifdef GGML_USE_METAL
if (llama_verbose()) {
std::cerr << "llama.cpp: using Metal" << std::endl;
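
The two added progress_callback lines route llama.cpp's load-progress reporting back to the LLModel instance: a static member function is handed to the C API as the callback, and `this` travels through the user-data pointer. A minimal standalone sketch of that trampoline pattern with generic names (the pattern only, not the gpt4all implementation):

// Generic C-callback trampoline: a C API accepts a plain function pointer plus
// a void* user_data, and the static function forwards to the real object.
#include <cstdio>

struct Loader {
    static bool staticProgress(float progress, void *user_data) {
        return static_cast<Loader *>(user_data)->onProgress(progress);
    }

    bool onProgress(float progress) {
        std::printf("loaded %.0f%%\n", progress * 100.0f);
        return true;   // a false return conventionally asks the library to cancel
    }
};

// Registration mirrors the diff above:
//   params.progress_callback           = &Loader::staticProgress;
//   params.progress_callback_user_data = &someLoader;
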
@@ -287,12 +367,13 @@ size_t LLamaModel::restoreState(const uint8_t *src)
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
}

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos(d_ptr->model));
std::vector<LLModel::Token> fres(str.size()+4);
// TODO(cebtenzzre): we may want to use special=true here to process special tokens
auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, false);
const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
const bool useBOS = wantBOS && shouldAddBOS();
auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
std::vector<LLModel::Token> fres(strCat.size()+4);
auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
fres.resize(fres_len);
return fres;
}
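
The reworked tokenize makes two decisions before calling into llama.cpp: whether this is the very start of the context (and so a BOS token may be wanted), and whether a leading space has to be inserted manually now that the bundled llama.cpp fork no longer adds one. A tiny standalone sketch of just that preamble logic, isolated from llama.cpp for illustration:

// Isolated sketch of the pre-tokenization decisions made above.
#include <string>

struct TokenizePrep {
    bool useBOS;        // pass a BOS token to the tokenizer?
    std::string text;   // possibly with a manually inserted leading space
};

TokenizePrep prepareInput(const std::string &str, bool startOfContext,
                          bool special, bool modelWantsBOS) {
    const bool wantBOS = startOfContext;        // n_past == 0 && tokens.empty()
    TokenizePrep prep;
    prep.useBOS = wantBOS && modelWantsBOS;     // only if shouldAddBOS() agrees
    // The fork no longer inserts the leading space SentencePiece-style vocabs
    // expect, so add it here unless special-token processing is requested.
    prep.text = (wantBOS && !special) ? " " + str : str;
    return prep;
}
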
@@ -346,55 +427,10 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
return d_ptr->end_tokens;
}

std::string get_arch_name(gguf_context *ctx_gguf) {
std::string arch_name;
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
if (ktype != (GGUF_TYPE_STRING)) {
throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
}
return gguf_get_val_str(ctx_gguf, kid);
}

static gguf_context *load_gguf(const char *fname, std::string &arch) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ nullptr,
};
gguf_context *ctx = gguf_init_from_file(fname, params);
if (!ctx) {
std::cerr << __func__ << ": gguf_init_from_file failed\n";
return nullptr;
}

int gguf_ver = gguf_get_version(ctx);
if (gguf_ver > GGUF_VER_MAX) {
std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
gguf_free(ctx);
return nullptr;
}

arch = get_arch_name(ctx);
return ctx;
}

static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
std::string arch;
auto * ctx = load_gguf(modelPath.c_str(), arch);

int32_t value = -1;
if (ctx) {
auto key = arch + "." + archKey;
int keyidx = gguf_find_key(ctx, key.c_str());
if (keyidx != -1) {
value = gguf_get_val_u32(ctx, keyidx);
} else {
std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
}
}

gguf_free(ctx);
return value;
bool LLamaModel::shouldAddBOS() const
{
int add_bos = llama_add_bos_token(d_ptr->model);
return add_bos != -1 ? bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM;
}

int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@@ -510,14 +546,14 @@ DLL_EXPORT const char *get_build_variant() {
}

DLL_EXPORT bool magic_match(const char *fname) {
std::string arch;
auto * ctx = load_gguf(fname, arch);
auto * ctx = load_gguf(fname);
auto arch = get_arch_name(ctx);

bool valid = true;

static const std::vector<const char *> known_arches {
"baichuan", "bloom", "codeshell", "falcon", "gpt2", "llama", "mpt", "orion", "persimmon", "phi2", "plamo",
"qwen", "qwen2", "refact", "stablelm", "starcoder"
"baichuan", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "orion", "persimmon", "phi2",
"plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
};

if (std::find(known_arches.begin(), known_arches.end(), arch) == known_arches.end()) {
14 changes: 8 additions & 6 deletions gpt4all-backend/llamamodel_impl.h
@@ -19,6 +19,7 @@ class LLamaModel : public LLModel {
bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
bool isModelBlacklisted(const std::string &modelPath) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
@@ -27,7 +28,7 @@ class LLamaModel : public LLModel {
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason) const override;
bool hasGPUDevice() override;
bool usingGPUDevice() override;
Expand All @@ -36,12 +37,13 @@ class LLamaModel : public LLModel {
std::unique_ptr<LLamaPrivate> d_ptr;

protected:
std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
std::string tokenToString(Token) const override;
Token sampleToken(PromptContext& ctx) const override;
bool evalTokens(PromptContext& ctx, const std::vector<int32_t> &tokens) const override;
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token>& endTokens() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override;

int32_t maxContextLength(std::string const &modelPath) const override;
int32_t layerCount(std::string const &modelPath) const override;
(The remaining changed files in this commit are not shown here.)
