refactor: lock context size under the train context
Signed-off-by: thxCode <[email protected]>
thxCode committed Jul 11, 2024
1 parent 62be7e1 commit 1b3c92e
Showing 5 changed files with 37 additions and 21 deletions.
22 changes: 12 additions & 10 deletions cmd/gguf-parser/README.md
@@ -23,6 +23,8 @@ Usage of gguf-parser ...:
Model file below the --repo, e.g. Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
-hf-repo string
Repository of HuggingFace which the GGUF file store, e.g. NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.
+ -in-max-ctx-size
+ Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size.
-in-mib
Display the estimated result in table with MiB.
-json
@@ -93,13 +95,13 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 32032 |
+ | ARCHITECTURE | 32768 | 4096 | 4 | 32 | 32 | 14336 | 0 | 32032 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A |
+ | TOKENIZER | llama | 450.50 KiB | 32032 | N/A | 1 | 32000 | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -123,13 +125,13 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 8 | 32002 |
+ | ARCHITECTURE | 32768 | 4096 | 4 | 32 | 32 | 14336 | 8 | 32002 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 |
+ | TOKENIZER | llama | 449.91 KiB | 32002 | N/A | 1 | 32000 | 0 | N/A | 2 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+
@@ -153,13 +155,13 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 128256 |
+ | ARCHITECTURE | 8192 | 4096 | 4 | 32 | 32 | 14336 | 0 | 128256 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 |
+ | TOKENIZER | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128001 | 128002 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -183,13 +185,13 @@ $ gguf-parser --ol-model="gemma2"
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+ | ARCHITECTURE | 8192 | 3584 | 2 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 3.80 MiB | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+ | TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -208,13 +210,13 @@ $ gguf-parser --ol-model="gemma2" --ol-crawl
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+ | ARCHITECTURE | 8192 | 3584 | 2 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 0 B | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+ | TOKENIZER | llama | N/A | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
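The new flag pairs with --ctx-size. For illustration (output omitted; repository and file names taken from the examples above), an over-sized request is clamped back to the model's trained maximum:

$ gguf-parser --hf-repo="NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" --hf-file="Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" --ctx-size=65536 --in-max-ctx-size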
12 changes: 9 additions & 3 deletions cmd/gguf-parser/main.go
@@ -41,6 +41,7 @@ func main() {
skipDNSCache bool
// estimate options
ctxSize = -1
+ inMaxCtxSize bool
physicalBatchSize = 512
parallelSize = 1
kvType = "f16"
@@ -102,6 +103,8 @@ func main() {
fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+
"which is used to estimate the usage, "+
"default is equal to the model's maximum context size.")
+ fs.BoolVar(&inMaxCtxSize, "in-max-ctx-size", inMaxCtxSize, "Limit the context size to the maximum context size of the model, "+
+ "if the context size is larger than the maximum context size.")
fs.IntVar(&physicalBatchSize, "ubatch-size", physicalBatchSize, "Specify the physical maximum batch size, "+
"which is used to estimate the usage, "+
"default is 512.")
@@ -184,6 +187,9 @@ func main() {
if ctxSize > 0 {
eopts = append(eopts, WithContextSize(int32(ctxSize)))
}
+ if inMaxCtxSize {
+ eopts = append(eopts, WithinMaxContextSize())
+ }
if physicalBatchSize > 0 {
eopts = append(eopts, WithPhysicalBatchSize(int32(physicalBatchSize)))
}
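A minimal, self-contained sketch of the flag wiring above, using the standard library flag package as main.go does; the flag names and defaults come from this commit, while the surrounding scaffolding is illustrative:

```go
package main

import (
	"flag"
	"fmt"
	"os"
)

func main() {
	fs := flag.NewFlagSet("gguf-parser", flag.ExitOnError)
	ctxSize := fs.Int("ctx-size", -1, "size of prompt context used for the estimate; "+
		"defaults to the model's maximum context size")
	inMaxCtxSize := fs.Bool("in-max-ctx-size", false, "limit the context size to the model's maximum "+
		"if the requested size is larger")
	_ = fs.Parse(os.Args[1:])
	fmt.Printf("ctx-size=%d in-max-ctx-size=%v\n", *ctxSize, *inMaxCtxSize)
}
```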
@@ -449,9 +455,9 @@ func main() {
nil,
[]string{
t.Model,
- sprintf(GGUFBytesScalar(t.TokensSize)),
- sprintf(t.TokensLength),
- sprintf(t.AddedTokensLength),
+ sprintf(tenary(t.TokensSize <= 0, "N/A", GGUFBytesScalar(t.TokensSize))),
+ sprintf(tenary(t.TokensLength <= 0, "N/A", t.TokensLength)),
+ sprintf(tenary(t.AddedTokensLength <= 0, "N/A", t.AddedTokensLength)),
sprintf(tenary(t.BOSTokenID < 0, "N/A", t.BOSTokenID)),
sprintf(tenary(t.EOSTokenID < 0, "N/A", t.EOSTokenID)),
sprintf(tenary(t.UnknownTokenID < 0, "N/A", t.UnknownTokenID)),
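The tenary helper used in the new rows is not defined in this diff; a plausible sketch, assuming it is a plain conditional over untyped values (the real signature may differ):

```go
package main

import "fmt"

// tenary returns whenTrue if cond holds, otherwise whenFalse.
// Hypothetical reconstruction; only its call sites appear in this commit.
func tenary(cond bool, whenTrue, whenFalse any) any {
	if cond {
		return whenTrue
	}
	return whenFalse
}

func main() {
	addedTokensLength := 0
	// Mirrors the change above: zero counts now render as "N/A" rather than 0.
	fmt.Println(tenary(addedTokensLength <= 0, "N/A", addedTokensLength)) // N/A
}
```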
12 changes: 4 additions & 8 deletions file_architecture.go
@@ -83,14 +83,12 @@ type GGUFArchitectureMetadata struct {

/* Appendix */

- // EmbeddingGroup is the number of groups in the embedding layer.
- EmbeddingGroup uint64 `json:"embeddingGroup,omitempty"`
+ // EmbeddingGQA is the GQA of the embedding layer.
+ EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"`
// EmbeddingKeyGQA is the number of key GQA in the embedding layer.
EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"`
// EmbeddingValueGQA is the number of value GQA in the embedding layer.
EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"`
- // EmbeddingGQA is the GQA of the embedding layer.
- EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"`

// ClipHasTextEncoder indicates whether the clip model has text encoder or not.
//
@@ -213,7 +211,7 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) {

{
if ga.AttentionHeadCountKV > 0 {
- ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV
+ ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV
}
if ga.AttentionHeadCount > 0 {
ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
@@ -223,7 +221,6 @@
ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize)
ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize)
}
- ga.EmbeddingGQA = ga.EmbeddingValueGQA
}

return ga
@@ -408,7 +405,7 @@ func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetad

{
if ga.AttentionHeadCountKV > 0 {
- ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV
+ ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV
}
if ga.AttentionHeadCount > 0 {
ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
@@ -418,7 +415,6 @@ func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetad
ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize)
ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize)
}
- ga.EmbeddingGQA = ga.EmbeddingValueGQA
}

return ga
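The net effect of this refactor: EmbeddingGQA now carries the grouped-query-attention ratio (attention heads per KV head) instead of echoing EmbeddingValueGQA, which is why the README tables above change from 1024 to 4 (or 2 for gemma2). A worked sketch with field names from the diff; the KV head count of 8 and the head dimension of 128 are assumptions chosen to reproduce the Mistral-family rows, and the value-side formula is assumed symmetric to the key side shown in the diff:

```go
package main

import "fmt"

func main() {
	var (
		attentionHeadCount   uint64 = 32  // from the README tables above
		attentionHeadCountKV uint64 = 8   // assumed: 32 heads / GQA 4
		attentionKeyLength   uint64 = 128 // assumed head dimension
		attentionValueLength uint64 = 128 // assumed head dimension
	)
	if attentionHeadCountKV > 0 {
		// New meaning of EmbeddingGQA: query heads per KV head.
		fmt.Println("EmbeddingGQA:", attentionHeadCount/attentionHeadCountKV) // 4
	}
	// The old tables printed EmbeddingValueGQA (128 * 8 = 1024) in this column.
	fmt.Println("EmbeddingKeyGQA:", attentionKeyLength*attentionHeadCountKV)     // 1024
	fmt.Println("EmbeddingValueGQA:", attentionValueLength*attentionHeadCountKV) // 1024
}
```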
3 changes: 3 additions & 0 deletions file_estimate.go
@@ -144,6 +144,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
if o.ContextSize != nil {
nContext = uint64(*o.ContextSize)
}
+ if o.InMaxContextSize {
+ nContext = min(nContext, a.MaximumContextLength)
+ }
// Padding context size,
// see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002.
if o.FlashAttention {
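This clamp is the commit's namesake: once WithinMaxContextSize is set, the estimated context never exceeds the model's trained maximum. A minimal runnable sketch with assumed values (the built-in min over integers needs Go 1.21+):

```go
package main

import "fmt"

func main() {
	nContext := uint64(65536)             // requested via --ctx-size
	maximumContextLength := uint64(32768) // trained maximum from the model metadata
	inMaxContextSize := true              // requested via --in-max-ctx-size
	if inMaxContextSize {
		nContext = min(nContext, maximumContextLength)
	}
	fmt.Println(nContext) // 32768
}
```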
9 changes: 9 additions & 0 deletions file_estimate_option.go
@@ -11,6 +11,7 @@ type (
Architecture *GGUFArchitectureMetadata
Tokenizer *GGUFTokenizerMetadata
ContextSize *int32
+ InMaxContextSize bool
PhysicalBatchSize *int32
ParallelSize *int32
CacheKeyType *GGMLType
@@ -51,6 +52,14 @@ func WithContextSize(size int32) LLaMACppUsageEstimateOption {
}
}

+ // WithinMaxContextSize limits the context size to the model's maximum,
+ // if the given context size exceeds that maximum.
+ func WithinMaxContextSize() LLaMACppUsageEstimateOption {
+ return func(o *_LLaMACppUsageEstimateOptions) {
+ o.InMaxContextSize = true
+ }
+ }

// WithPhysicalBatchSize sets the physical batch size for the estimate.
func WithPhysicalBatchSize(size int32) LLaMACppUsageEstimateOption {
return func(o *_LLaMACppUsageEstimateOptions) {
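To close the loop, a self-contained sketch of how the two options compose end to end; the types here are simplified stand-ins for the commit's _LLaMACppUsageEstimateOptions plumbing, and the numbers are illustrative:

```go
package main

import "fmt"

// opts is a simplified stand-in for _LLaMACppUsageEstimateOptions.
type opts struct {
	ContextSize      *int32
	InMaxContextSize bool
}

type option func(*opts)

func withContextSize(size int32) option {
	return func(o *opts) { o.ContextSize = &size }
}

func withinMaxContextSize() option {
	return func(o *opts) { o.InMaxContextSize = true }
}

func main() {
	maximumContextLength := uint64(8192) // trained maximum from the model metadata

	var o opts
	for _, apply := range []option{withContextSize(65536), withinMaxContextSize()} {
		apply(&o)
	}

	nContext := maximumContextLength // default: the model's maximum
	if o.ContextSize != nil {
		nContext = uint64(*o.ContextSize)
	}
	if o.InMaxContextSize {
		nContext = min(nContext, maximumContextLength)
	}
	fmt.Println(nContext) // 8192, not 65536
}
```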
