refactor: lock context size under the train context
Signed-off-by: thxCode <[email protected]>
thxCode committed Jul 11, 2024
1 parent 62be7e1 commit 1b3c92e
Showing 5 changed files with 37 additions and 21 deletions.
22 changes: 12 additions & 10 deletions cmd/gguf-parser/README.md
@@ -23,6 +23,8 @@ Usage of gguf-parser ...:
Model file below the --repo, e.g. Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.
-hf-repo string
Repository of HuggingFace which the GGUF file store, e.g. NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.
+ -in-max-ctx-size
+ Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size.
-in-mib
Display the estimated result in table with MiB.
-json
@@ -93,13 +95,13 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 32032 |
+ | ARCHITECTURE | 32768 | 4096 | 4 | 32 | 32 | 14336 | 0 | 32032 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A |
+ | TOKENIZER | llama | 450.50 KiB | 32032 | N/A | 1 | 32000 | N/A | N/A | N/A |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -123,13 +125,13 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 8 | 32002 |
+ | ARCHITECTURE | 32768 | 4096 | 4 | 32 | 32 | 14336 | 8 | 32002 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 |
+ | TOKENIZER | llama | 449.91 KiB | 32002 | N/A | 1 | 32000 | 0 | N/A | 2 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+
@@ -153,13 +155,13 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 128256 |
+ | ARCHITECTURE | 8192 | 4096 | 4 | 32 | 32 | 14336 | 0 | 128256 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 |
+ | TOKENIZER | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128001 | 128002 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -183,13 +185,13 @@ $ gguf-parser --ol-model="gemma2"
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+ | ARCHITECTURE | 8192 | 3584 | 2 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 3.80 MiB | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+ | TOKENIZER | llama | 3.80 MiB | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
@@ -208,13 +210,13 @@ $ gguf-parser --ol-model="gemma2" --ol-crawl
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
- | ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 |
+ | ARCHITECTURE | 8192 | 3584 | 2 | 16 | 42 | 14336 | 0 | 256000 |
+--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
- | TOKENIZER | llama | 0 B | 256000 | 0 | 2 | 1 | 3 | N/A | 0 |
+ | TOKENIZER | llama | N/A | 256000 | N/A | 2 | 1 | 3 | N/A | 0 |
+--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+
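The new flag pairs with --ctx-size. For illustration (output omitted; repository and file names taken from the examples above), an over-sized request is clamped back to the model's trained maximum:

$ gguf-parser --hf-repo="NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" --hf-file="Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" --ctx-size=65536 --in-max-ctx-size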
12 changes: 9 additions & 3 deletions cmd/gguf-parser/main.go
@@ -41,6 +41,7 @@ func main() {
skipDNSCache bool
// estimate options
ctxSize = -1
+ inMaxCtxSize bool
physicalBatchSize = 512
parallelSize = 1
kvType = "f16"
@@ -102,6 +103,8 @@ func main() {
fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+
"which is used to estimate the usage, "+
"default is equal to the model's maximum context size.")
+ fs.BoolVar(&inMaxCtxSize, "in-max-ctx-size", inMaxCtxSize, "Limit the context size to the maximum context size of the model, "+
+ "if the context size is larger than the maximum context size.")
fs.IntVar(&physicalBatchSize, "ubatch-size", physicalBatchSize, "Specify the physical maximum batch size, "+
"which is used to estimate the usage, "+
"default is 512.")
@@ -184,6 +187,9 @@ func main() {
if ctxSize > 0 {
eopts = append(eopts, WithContextSize(int32(ctxSize)))
}
+ if inMaxCtxSize {
+ eopts = append(eopts, WithinMaxContextSize())
+ }
if physicalBatchSize > 0 {
eopts = append(eopts, WithPhysicalBatchSize(int32(physicalBatchSize)))
}
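A minimal, self-contained sketch of the flag wiring above, using the standard library flag package as main.go does; the flag names and defaults come from this commit, while the surrounding scaffolding is illustrative:

```go
package main

import (
	"flag"
	"fmt"
	"os"
)

func main() {
	fs := flag.NewFlagSet("gguf-parser", flag.ExitOnError)
	ctxSize := fs.Int("ctx-size", -1, "size of prompt context used for the estimate; "+
		"defaults to the model's maximum context size")
	inMaxCtxSize := fs.Bool("in-max-ctx-size", false, "limit the context size to the model's maximum "+
		"if the requested size is larger")
	_ = fs.Parse(os.Args[1:])
	fmt.Printf("ctx-size=%d in-max-ctx-size=%v\n", *ctxSize, *inMaxCtxSize)
}
```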
@@ -449,9 +455,9 @@ func main() {
nil,
[]string{
t.Model,
- sprintf(GGUFBytesScalar(t.TokensSize)),
- sprintf(t.TokensLength),
- sprintf(t.AddedTokensLength),
+ sprintf(tenary(t.TokensSize <= 0, "N/A", GGUFBytesScalar(t.TokensSize))),
+ sprintf(tenary(t.TokensLength <= 0, "N/A", t.TokensLength)),
+ sprintf(tenary(t.AddedTokensLength <= 0, "N/A", t.AddedTokensLength)),
sprintf(tenary(t.BOSTokenID < 0, "N/A", t.BOSTokenID)),
sprintf(tenary(t.EOSTokenID < 0, "N/A", t.EOSTokenID)),
sprintf(tenary(t.UnknownTokenID < 0, "N/A", t.UnknownTokenID)),
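The tenary helper used in the new rows is not defined in this diff; a plausible sketch, assuming it is a plain conditional over untyped values (the real signature may differ):

```go
package main

import "fmt"

// tenary returns whenTrue if cond holds, otherwise whenFalse.
// Hypothetical reconstruction; only its call sites appear in this commit.
func tenary(cond bool, whenTrue, whenFalse any) any {
	if cond {
		return whenTrue
	}
	return whenFalse
}

func main() {
	addedTokensLength := 0
	// Mirrors the change above: zero counts now render as "N/A" rather than 0.
	fmt.Println(tenary(addedTokensLength <= 0, "N/A", addedTokensLength)) // N/A
}
```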
12 changes: 4 additions & 8 deletions file_architecture.go
@@ -83,14 +83,12 @@ type GGUFArchitectureMetadata struct {

/* Appendix */

- // EmbeddingGroup is the number of groups in the embedding layer.
- EmbeddingGroup uint64 `json:"embeddingGroup,omitempty"`
+ // EmbeddingGQA is the GQA of the embedding layer.
+ EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"`
// EmbeddingKeyGQA is the number of key GQA in the embedding layer.
EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"`
// EmbeddingValueGQA is the number of value GQA in the embedding layer.
EmbeddingValueGQA uint64 `json:"embeddingValueGQA,omitempty"`
- // EmbeddingGQA is the GQA of the embedding layer.
- EmbeddingGQA uint64 `json:"embeddingGQA,omitempty"`

// ClipHasTextEncoder indicates whether the clip model has text encoder or not.
//
@@ -213,7 +211,7 @@ func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitectureMetadata) {

{
if ga.AttentionHeadCountKV > 0 {
- ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV
+ ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV
}
if ga.AttentionHeadCount > 0 {
ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
@@ -223,7 +221,6 @@
ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize)
ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize)
}
- ga.EmbeddingGQA = ga.EmbeddingValueGQA
}

return ga
@@ -408,7 +405,7 @@ func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetad

{
if ga.AttentionHeadCountKV > 0 {
- ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV
+ ga.EmbeddingGQA = ga.AttentionHeadCount / ga.AttentionHeadCountKV
}
if ga.AttentionHeadCount > 0 {
ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV
@@ -418,7 +415,6 @@ func (gf *GGUFFile) transformArchitecture(arch string) (ga GGUFArchitectureMetad
ga.EmbeddingKeyGQA = uint64((ga.SSMConvolutionKernel - 1) * ga.SSMInnerSize)
ga.EmbeddingValueGQA = uint64(ga.SSMStateSize * ga.SSMInnerSize)
}
- ga.EmbeddingGQA = ga.EmbeddingValueGQA
}

return ga
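The net effect of this refactor: EmbeddingGQA now carries the grouped-query-attention ratio (attention heads per KV head) instead of echoing EmbeddingValueGQA, which is why the README tables above change from 1024 to 4 (or 2 for gemma2). A worked sketch with field names from the diff; the KV head count of 8 and the head dimension of 128 are assumptions chosen to reproduce the Mistral-family rows, and the value-side formula is assumed symmetric to the key side shown in the diff:

```go
package main

import "fmt"

func main() {
	var (
		attentionHeadCount   uint64 = 32  // from the README tables above
		attentionHeadCountKV uint64 = 8   // assumed: 32 heads / GQA 4
		attentionKeyLength   uint64 = 128 // assumed head dimension
		attentionValueLength uint64 = 128 // assumed head dimension
	)
	if attentionHeadCountKV > 0 {
		// New meaning of EmbeddingGQA: query heads per KV head.
		fmt.Println("EmbeddingGQA:", attentionHeadCount/attentionHeadCountKV) // 4
	}
	// The old tables printed EmbeddingValueGQA (128 * 8 = 1024) in this column.
	fmt.Println("EmbeddingKeyGQA:", attentionKeyLength*attentionHeadCountKV)     // 1024
	fmt.Println("EmbeddingValueGQA:", attentionValueLength*attentionHeadCountKV) // 1024
}
```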
3 changes: 3 additions & 0 deletions file_estimate.go
@@ -144,6 +144,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
if o.ContextSize != nil {
nContext = uint64(*o.ContextSize)
}
+ if o.InMaxContextSize {
+ nContext = min(nContext, a.MaximumContextLength)
+ }
// Padding context size,
// see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002.
if o.FlashAttention {
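This clamp is the commit's namesake: once WithinMaxContextSize is set, the estimated context never exceeds the model's trained maximum. A minimal runnable sketch with assumed values (the built-in min over integers needs Go 1.21+):

```go
package main

import "fmt"

func main() {
	nContext := uint64(65536)             // requested via --ctx-size
	maximumContextLength := uint64(32768) // trained maximum from the model metadata
	inMaxContextSize := true              // requested via --in-max-ctx-size
	if inMaxContextSize {
		nContext = min(nContext, maximumContextLength)
	}
	fmt.Println(nContext) // 32768
}
```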
9 changes: 9 additions & 0 deletions file_estimate_option.go
@@ -11,6 +11,7 @@ type (
Architecture *GGUFArchitectureMetadata
Tokenizer *GGUFTokenizerMetadata
ContextSize *int32
+ InMaxContextSize bool
PhysicalBatchSize *int32
ParallelSize *int32
CacheKeyType *GGMLType
@@ -51,6 +52,14 @@ func WithContextSize(size int32) LLaMACppUsageEstimateOption {
}
}

+ // WithinMaxContextSize limits the context size to the model's maximum,
+ // if the given context size exceeds that maximum.
+ func WithinMaxContextSize() LLaMACppUsageEstimateOption {
+ return func(o *_LLaMACppUsageEstimateOptions) {
+ o.InMaxContextSize = true
+ }
+ }

// WithPhysicalBatchSize sets the physical batch size for the estimate.
func WithPhysicalBatchSize(size int32) LLaMACppUsageEstimateOption {
return func(o *_LLaMACppUsageEstimateOptions) {
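To close the loop, a self-contained sketch of how the two options compose end to end; the types here are simplified stand-ins for the commit's _LLaMACppUsageEstimateOptions plumbing, and the numbers are illustrative:

```go
package main

import "fmt"

// opts is a simplified stand-in for _LLaMACppUsageEstimateOptions.
type opts struct {
	ContextSize      *int32
	InMaxContextSize bool
}

type option func(*opts)

func withContextSize(size int32) option {
	return func(o *opts) { o.ContextSize = &size }
}

func withinMaxContextSize() option {
	return func(o *opts) { o.InMaxContextSize = true }
}

func main() {
	maximumContextLength := uint64(8192) // trained maximum from the model metadata

	var o opts
	for _, apply := range []option{withContextSize(65536), withinMaxContextSize()} {
		apply(&o)
	}

	nContext := maximumContextLength // default: the model's maximum
	if o.ContextSize != nil {
		nContext = uint64(*o.ContextSize)
	}
	if o.InMaxContextSize {
		nContext = min(nContext, maximumContextLength)
	}
	fmt.Println(nContext) // 8192, not 65536
}
```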
