Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/concedo'
Browse files Browse the repository at this point in the history
  • Loading branch information
YellowRoseCx committed Sep 27, 2023
2 parents 6823315 + bfc696f commit 16271e0
Show file tree
Hide file tree
Showing 10 changed files with 173 additions and 118 deletions.
6 changes: 3 additions & 3 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --cfg-negative-prompt-file FNAME\n");
printf(" negative prompt file to use for guidance. (default: empty)\n");
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
Expand Down
10 changes: 5 additions & 5 deletions examples/benchmark/benchmark-matmult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

if (plan.work_size > 0) {
Expand All @@ -32,7 +32,7 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
ggml_graph_compute(graph, &plan);
}

float tensor_sum_elements(const ggml_tensor * tensor) {
static float tensor_sum_elements(const ggml_tensor * tensor) {
double sum = 0;
if (tensor->type == GGML_TYPE_F32) {
for (int j = 0; j < tensor->ne[1]; j++) {
Expand All @@ -44,7 +44,7 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
return sum;
}

void tensor_dump(const ggml_tensor * tensor, const char * name) {
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type),
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
Expand All @@ -59,7 +59,7 @@ struct benchmark_params_struct {
int32_t n_iterations = 10;
};

void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
Expand Down Expand Up @@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
// Check that the matrix multiplication result is in the right ballpark
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

if (delta > allowed_delta) {
Expand Down
22 changes: 20 additions & 2 deletions examples/embedding/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
# embedding
# llama.cpp/example/embedding

TODO
This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp.

## Quick Start

To get started right away, run the following command, making sure to use the correct path for the model you have:

### Unix-based systems (Linux, macOS, etc.):

```bash
./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
```

### Windows:

```powershell
embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
```

The above command will output space-separated float values.
4 changes: 2 additions & 2 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
Expand Down
6 changes: 3 additions & 3 deletions ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@ std::array<std::string, 2> mul_str_values = {
"mul_f32", "float"
};

std::string& replace(std::string& s, const std::string& from, const std::string& to) {
static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) {
s.replace(pos, from.length(), to);
Expand All @@ -859,7 +859,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
return s;
}

std::string generate_kernels() {
static std::string generate_kernels() {
std::stringstream src;
src << program_source << '\n';
src << k_quants_source << '\n';
Expand Down Expand Up @@ -1808,7 +1808,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
return false;
}

bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
// If device doesn't support FP16
if (!fp16_support) {
return false;
Expand Down
16 changes: 6 additions & 10 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -17290,18 +17290,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} else {
// wait for other threads to finish
const int last = node_n;
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
do {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_METAL)
//apple does nothing
#else
sched_yield();
#endif

#endif
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
};
} while (node_n == last);
}

// check if we should stop
Expand Down
1 change: 1 addition & 0 deletions gpttype_adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1233,6 +1233,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
{
concat_output_mtx.lock();
concat_output = "";
concat_output_reader_copy = "";
concat_output_mtx.unlock();
last_stop_reason = stop_reason::OUT_OF_TOKENS;
stop_sequence.clear();
Expand Down
Loading

0 comments on commit 16271e0

Please sign in to comment.