From d9d0593fe088111c63f68bb79a689425499f396b Mon Sep 17 00:00:00 2001 From: fangjun Date: Wed, 16 Oct 2024 11:24:56 +0800 Subject: [PATCH 1/9] feat(sysadvisor): support reclaimed-core numa binding --- go.mod | 1 + go.sum | 4 +- .../cpu/dynamicpolicy/cpuadvisor/const.go | 23 + .../cpu/dynamicpolicy/cpuadvisor/cpu.pb.go | 263 +-- .../cpu/dynamicpolicy/cpuadvisor/cpu.proto | 1 + .../dynamicpolicy/memoryadvisor/const.go | 1 + .../sysadvisor/plugin/qosaware/qos_aware.go | 19 +- .../qosaware/reporter/headroom_reporter.go | 185 +- .../reporter/headroom_reporter_test.go | 4 +- .../qosaware/reporter/manager/manager.go | 24 +- .../reporter/manager/resource/generic.go | 27 +- .../plugin/qosaware/resource/cpu/advisor.go | 33 +- .../qosaware/resource/cpu/advisor_test.go | 2 +- .../assembler/headroomassembler/assembler.go | 2 +- .../headroomassembler/assembler_common.go | 128 +- .../assembler_common_test.go | 13 +- .../assembler_common_util.go | 60 +- .../qosaware/resource/memory/advisor.go | 48 +- .../qosaware/resource/memory/advisor_test.go | 1491 +++++++++++++++-- .../resource/memory/headroompolicy/policy.go | 2 +- .../memory/headroompolicy/policy_canonical.go | 65 +- .../headroompolicy/policy_canonical_test.go | 27 +- .../headroompolicy/policy_numa_aware.go | 55 +- .../headroompolicy/policy_numa_aware_test.go | 2 +- .../plugin/qosaware/resource/resource.go | 12 +- .../plugin/qosaware/resource/resource_stub.go | 10 +- .../plugin/qosaware/server/cpu_server.go | 105 +- .../plugin/qosaware/server/cpu_server_test.go | 61 +- .../plugin/qosaware/server/memory_server.go | 47 +- .../qosaware/server/memory_server_test.go | 15 +- .../plugin/qosaware/server/server.go | 29 +- pkg/agent/sysadvisor/types/cpu.go | 6 + 32 files changed, 2416 insertions(+), 349 deletions(-) create mode 100644 pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/const.go diff --git a/go.mod b/go.mod index 03a1c60bf..d3196474d 100644 --- a/go.mod +++ b/go.mod @@ -161,6 +161,7 @@ require ( ) replace ( + github.com/kubewharf/katalyst-api => github.com/luomingmeng/katalyst-api v0.0.0-20241008091724-de1c08556aab // FIXME k8s.io/api => k8s.io/api v0.24.6 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.24.6 k8s.io/apimachinery => k8s.io/apimachinery v0.24.6 diff --git a/go.sum b/go.sum index 39907aa39..159279e8d 100644 --- a/go.sum +++ b/go.sum @@ -568,8 +568,6 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubewharf/katalyst-api v0.5.1-0.20240929080014-ae613a8935d8 h1:rIwZVD4iia7pTiB6h1xR8muc4jx4GgHWzhVCPKlEhXw= -github.com/kubewharf/katalyst-api v0.5.1-0.20240929080014-ae613a8935d8/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/kubewharf/kubelet v1.24.6-kubewharf.9 h1:jOTYZt7h/J7I8xQMKMUcJjKf5UFBv37jHWvNp5VRFGc= github.com/kubewharf/kubelet v1.24.6-kubewharf.9/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c= github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8= @@ -581,6 +579,8 @@ github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0U github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/logrusorgru/aurora v0.0.0-20181002194514-a7b3b318ed4e/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= 
github.com/lpabon/godbc v0.1.1/go.mod h1:Jo9QV0cf3U6jZABgiJ2skINAXb9j8m51r07g4KI92ZA= +github.com/luomingmeng/katalyst-api v0.0.0-20241008091724-de1c08556aab h1:2m1zZnUXbwpWpHA9fGnnPonEc5rJRyT0O41NUpXztk4= +github.com/luomingmeng/katalyst-api v0.0.0-20241008091724-de1c08556aab/go.mod h1:Y2IeIorxQamF2a3oa0+URztl5QCSty6Jj3zD83R8J9k= github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/const.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/const.go new file mode 100644 index 000000000..37cbe9ec6 --- /dev/null +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/const.go @@ -0,0 +1,23 @@ +/* +Copyright 2022 The Katalyst Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cpuadvisor + +type CPUControlKnobName string + +const ( + ControlKnobKeyCPUNUMAHeadroom CPUControlKnobName = "cpu_numa_headroom" +) diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.pb.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.pb.go index 50286db9e..93c0c3cf6 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.pb.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.pb.go @@ -73,6 +73,7 @@ func (OverlapType) EnumDescriptor() ([]byte, []int) { type ListAndWatchResponse struct { Entries map[string]*CalculationEntries `protobuf:"bytes,1,rep,name=entries,proto3" json:"entries,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` AllowSharedCoresOverlapReclaimedCores bool `protobuf:"varint,2,opt,name=allow_shared_cores_overlap_reclaimed_cores,json=allowSharedCoresOverlapReclaimedCores,proto3" json:"allow_shared_cores_overlap_reclaimed_cores,omitempty"` + ExtraEntries []*advisorsvc.CalculationInfo `protobuf:"bytes,3,rep,name=extra_entries,json=extraEntries,proto3" json:"extra_entries,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_sizecache int32 `json:"-"` } @@ -123,6 +124,13 @@ func (m *ListAndWatchResponse) GetAllowSharedCoresOverlapReclaimedCores() bool { return false } +func (m *ListAndWatchResponse) GetExtraEntries() []*advisorsvc.CalculationInfo { + if m != nil { + return m.ExtraEntries + } + return nil +} + type CalculationEntries struct { Entries map[string]*CalculationInfo `protobuf:"bytes,1,rep,name=entries,proto3" json:"entries,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` XXX_NoUnkeyedLiteral struct{} `json:"-"` @@ -629,72 +637,73 @@ func init() { func init() { proto.RegisterFile("cpu.proto", fileDescriptor_08fc9a87e8768c24) } var fileDescriptor_08fc9a87e8768c24 = []byte{ - // 1031 bytes of a gzipped FileDescriptorProto + // 1052 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x9c, 0x56, 0xcb, 0x6e, 
0xdb, 0x46, - 0x17, 0x16, 0x2d, 0xc7, 0xb6, 0x8e, 0xef, 0x13, 0xdb, 0x91, 0xf9, 0x47, 0x82, 0xa2, 0x1f, 0x29, - 0x5c, 0x17, 0x96, 0x52, 0xb9, 0x68, 0x02, 0xaf, 0x2a, 0xa9, 0x86, 0x9b, 0x5e, 0x12, 0x95, 0x89, - 0x62, 0x24, 0x1b, 0x62, 0x44, 0x8e, 0x29, 0x42, 0x24, 0x87, 0x21, 0x87, 0x32, 0x88, 0x02, 0x45, - 0xdf, 0xa0, 0x5d, 0xf5, 0x15, 0xba, 0x2e, 0xd0, 0x65, 0x1f, 0x20, 0xcb, 0x2e, 0xbb, 0x6c, 0xdc, - 0x57, 0xe8, 0xb2, 0x05, 0x0a, 0x0e, 0x29, 0x69, 0xa8, 0x6b, 0xdb, 0x95, 0x74, 0x2e, 0xdf, 0x77, - 0x3e, 0x9e, 0x33, 0x73, 0x30, 0x90, 0xd3, 0xdc, 0xa0, 0xe2, 0x7a, 0x94, 0x51, 0x04, 0x9a, 0x1b, - 0x60, 0xbd, 0x6f, 0xfa, 0xd4, 0x93, 0x4f, 0x0c, 0x93, 0x75, 0x83, 0x4e, 0x45, 0xa3, 0x76, 0xd5, - 0xa0, 0x06, 0xad, 0xf2, 0x94, 0x4e, 0x70, 0xc5, 0x2d, 0x6e, 0xf0, 0x7f, 0x31, 0x54, 0x6e, 0x0b, - 0xe9, 0xbd, 0xa0, 0x43, 0xae, 0xbb, 0xd8, 0xbb, 0xaa, 0xf6, 0x30, 0xc3, 0x56, 0xe8, 0xb3, 0x13, - 0x8d, 0x7a, 0xa4, 0xea, 0xf6, 0x8c, 0x2a, 0x36, 0x88, 0xc3, 0xaa, 0xaf, 0x3d, 0xfb, 0xc4, 0xb5, - 0x02, 0xc3, 0x74, 0xfc, 0x6a, 0x52, 0xd0, 0xef, 0x6b, 0x83, 0xbf, 0xaa, 0xdf, 0xd7, 0x62, 0xda, - 0xf2, 0xf7, 0x4b, 0xb0, 0xf7, 0xb9, 0xe9, 0xb3, 0xba, 0xa3, 0x5f, 0x62, 0xa6, 0x75, 0x15, 0xe2, - 0xbb, 0xd4, 0xf1, 0x09, 0xba, 0x80, 0x55, 0xe2, 0x30, 0xcf, 0x24, 0x7e, 0x5e, 0x2a, 0x65, 0x8f, - 0xd6, 0x6b, 0x27, 0x95, 0x91, 0xf8, 0xca, 0x34, 0x48, 0xe5, 0x3c, 0xce, 0x8f, 0x7e, 0x42, 0x65, - 0x80, 0x46, 0x2f, 0xe1, 0x18, 0x5b, 0x16, 0xbd, 0x56, 0xfd, 0x2e, 0xf6, 0x88, 0xae, 0x46, 0x4a, - 0x7d, 0x95, 0xf6, 0x89, 0x67, 0x61, 0x57, 0xf5, 0x88, 0x66, 0x61, 0xd3, 0x1e, 0xf8, 0xf3, 0x4b, - 0x25, 0xe9, 0x68, 0x4d, 0xb9, 0xcf, 0x11, 0xcf, 0x38, 0xa0, 0x19, 0xf9, 0x9f, 0xc6, 0xe9, 0xca, - 0x20, 0x9b, 0x3b, 0xe5, 0x57, 0xb0, 0x21, 0xd6, 0x44, 0x3b, 0x90, 0xed, 0x91, 0x30, 0x2f, 0x95, - 0xa4, 0xa3, 0x9c, 0x12, 0xfd, 0x45, 0x1f, 0xc0, 0xad, 0x3e, 0xb6, 0x02, 0xc2, 0x79, 0xd7, 0x6b, - 0x45, 0xf1, 0x1b, 0x9a, 0xd8, 0xd2, 0x02, 0x0b, 0x33, 0x93, 0x3a, 0x09, 0x8b, 0x12, 0x27, 0x9f, - 0x2d, 0x3d, 0x92, 0xca, 0x3f, 0x49, 0x80, 0x26, 0x33, 0xd0, 0xf9, 0x78, 0x5b, 0xde, 0x9b, 0x4f, - 0x39, 0xbd, 0x29, 0xf2, 0xe5, 0x42, 0xe5, 0xef, 0xa7, 0x95, 0xff, 0x6f, 0x46, 0x99, 0xc7, 0xce, - 0x15, 0x15, 0x65, 0xff, 0xb0, 0x04, 0xdb, 0x63, 0x61, 0xf4, 0x0e, 0x6c, 0xd3, 0x6b, 0x87, 0x78, - 0xaa, 0x4b, 0xa9, 0xa5, 0x3a, 0xd8, 0x26, 0x49, 0xa1, 0x4d, 0xee, 0x6e, 0x51, 0x6a, 0x3d, 0xc1, - 0x36, 0x41, 0x5f, 0xc1, 0x5d, 0x6d, 0x04, 0x55, 0x3d, 0xe2, 0x07, 0x16, 0xf3, 0xd5, 0x4e, 0xa8, - 0x3a, 0x81, 0x8d, 0xa3, 0xd9, 0x44, 0x1f, 0x7c, 0x36, 0x47, 0x89, 0x68, 0x2b, 0x31, 0xbc, 0x11, - 0x3e, 0x89, 0xc0, 0xf1, 0xf7, 0x1f, 0x6a, 0xb3, 0xe2, 0x32, 0x85, 0xe2, 0x7c, 0xb0, 0xd8, 0xa3, - 0x6c, 0xdc, 0xa3, 0x87, 0xe9, 0x1e, 0xdd, 0x13, 0x95, 0x45, 0xc0, 0x09, 0x42, 0xb1, 0x53, 0x0d, - 0xd8, 0x9f, 0x9a, 0x83, 0xde, 0x85, 0x95, 0x8e, 0x45, 0xb5, 0xde, 0xe0, 0x83, 0x77, 0x45, 0xda, - 0x46, 0x14, 0x51, 0x92, 0x84, 0xf2, 0xd7, 0x70, 0x8b, 0x3b, 0xd0, 0x01, 0xac, 0xc4, 0xed, 0xe2, - 0xf2, 0x96, 0x95, 0xc4, 0x42, 0x0d, 0xd8, 0x1e, 0x9c, 0x74, 0x86, 0x3d, 0x83, 0xb0, 0x01, 0xe9, - 0xa1, 0x48, 0x9a, 0x9c, 0xee, 0xe7, 0x3c, 0x43, 0xd9, 0xa2, 0xa2, 0xe9, 0xa3, 0x43, 0x58, 0xe3, - 0xe5, 0x54, 0x53, 0xcf, 0x67, 0xf9, 0xdc, 0x56, 0xb9, 0xfd, 0x58, 0x2f, 0xff, 0x29, 0xc1, 0x66, - 0x0a, 0x8c, 0x1e, 0x42, 0x3e, 0x5d, 0x70, 0x62, 0xe8, 0xfb, 0x29, 0xfa, 0xe1, 0xf0, 0x4f, 0xe1, - 0x60, 0x02, 0xa8, 0xab, 0x81, 0xa9, 0xf3, 0xe6, 0xe6, 0x94, 0xdb, 0x63, 0x30, 0xbd, 0x6d, 0xea, - 0xa8, 0x0e, 0x85, 0x31, 0x90, 0x46, 0x1d, 0x86, 0xcd, 0xe8, 0xb0, 0xf1, 0x92, 0xb1, 0x5e, 0x39, - 0x85, 0x6d, 
0x0e, 0x52, 0x78, 0xdd, 0x33, 0xd8, 0x18, 0x52, 0x84, 0x2e, 0xc9, 0x2f, 0x97, 0xa4, - 0xa3, 0xad, 0xda, 0x9d, 0x69, 0xed, 0x09, 0x5d, 0xa2, 0xac, 0xd3, 0x91, 0x51, 0x3e, 0x80, 0xbd, - 0x0b, 0xc2, 0x9a, 0x5d, 0xa2, 0xf5, 0x5c, 0x6a, 0x3a, 0x4c, 0x21, 0xaf, 0x03, 0xe2, 0xb3, 0xf2, - 0xcf, 0x12, 0xec, 0x8f, 0x05, 0x92, 0xad, 0xf6, 0xc9, 0xf8, 0xf5, 0xad, 0x88, 0x85, 0xa6, 0x62, - 0x66, 0xdc, 0xe0, 0x97, 0x0b, 0x6f, 0xf0, 0x69, 0xfa, 0x74, 0x16, 0xc4, 0x4a, 0x75, 0xcb, 0xa2, - 0xda, 0xac, 0xd5, 0xf3, 0xa3, 0x04, 0xbb, 0x13, 0x09, 0xe8, 0xe3, 0x71, 0xe9, 0xc7, 0x73, 0x09, - 0x67, 0xc8, 0x7e, 0xb1, 0x50, 0xf6, 0x83, 0xb4, 0x6c, 0x79, 0x7a, 0x95, 0xf1, 0xbd, 0xf3, 0x57, - 0x16, 0xb6, 0xd2, 0x51, 0x74, 0x07, 0x56, 0x3d, 0x6c, 0xbb, 0x6a, 0xe0, 0x72, 0xfa, 0x35, 0x65, - 0x25, 0x32, 0xdb, 0xee, 0xb4, 0x7d, 0xb4, 0x34, 0x6d, 0x1f, 0xf5, 0x41, 0x66, 0xd4, 0xa5, 0x16, - 0x35, 0x42, 0x15, 0x5f, 0x63, 0x8f, 0xa8, 0xd8, 0xf7, 0x4d, 0xc3, 0xb1, 0x89, 0xc3, 0xfc, 0x7c, - 0x96, 0x37, 0xe1, 0xd1, 0x6c, 0x79, 0x95, 0xe7, 0x09, 0xb8, 0x1e, 0x61, 0xeb, 0x23, 0x68, 0xdc, - 0x92, 0x3c, 0x9b, 0x11, 0x46, 0xdf, 0x4a, 0xf0, 0x7f, 0xea, 0x99, 0x86, 0xe9, 0x60, 0x4b, 0x9d, - 0xa3, 0x60, 0x99, 0x2b, 0xf8, 0x68, 0x8e, 0x82, 0xa7, 0x09, 0xcb, 0x7c, 0x25, 0x25, 0xba, 0x20, - 0x4d, 0xfe, 0x0c, 0x0a, 0x73, 0x29, 0xc4, 0x31, 0x2e, 0xc7, 0x63, 0xdc, 0x13, 0xc7, 0x98, 0x13, - 0x46, 0x25, 0x3f, 0x83, 0xfb, 0xff, 0x48, 0xd7, 0xbf, 0x21, 0x3d, 0xfe, 0x10, 0xd6, 0x85, 0x6b, - 0x8a, 0x10, 0x6c, 0x25, 0xe6, 0xa5, 0xc9, 0xba, 0x2d, 0xaa, 0xef, 0x64, 0xd0, 0x6d, 0xd8, 0x4e, - 0xf9, 0xa8, 0xb5, 0x23, 0xd5, 0xfe, 0x90, 0x00, 0x9a, 0xad, 0x76, 0x3d, 0xee, 0x1f, 0xfa, 0x12, - 0x36, 0xea, 0xba, 0x3e, 0xdc, 0x10, 0xa8, 0x50, 0x19, 0xbd, 0x5e, 0x2a, 0x43, 0xf7, 0x17, 0x84, - 0x61, 0x1d, 0x33, 0x2c, 0x97, 0xc4, 0xb0, 0x08, 0x1c, 0x5c, 0xde, 0x72, 0x06, 0x7d, 0x0a, 0x39, - 0x85, 0xd8, 0xb4, 0x4f, 0x5a, 0x54, 0x47, 0x77, 0x45, 0xc0, 0xd0, 0x9d, 0xec, 0x0d, 0xb9, 0x30, - 0x23, 0x3a, 0xe4, 0xba, 0x80, 0x0d, 0xf1, 0xe5, 0x83, 0x76, 0x45, 0xc0, 0xb9, 0xed, 0xb2, 0x50, - 0x2e, 0x2d, 0x7a, 0x26, 0x95, 0x33, 0x0f, 0xa4, 0x9a, 0x06, 0xb9, 0x66, 0xab, 0xdd, 0xe2, 0x2f, - 0x34, 0xf4, 0x02, 0x36, 0x53, 0x9b, 0x07, 0x95, 0xe6, 0x2c, 0xa5, 0x58, 0xe9, 0xbd, 0x85, 0x6b, - 0xab, 0x9c, 0x69, 0xf8, 0x6f, 0xde, 0x16, 0xa5, 0x5f, 0xdf, 0x16, 0x33, 0xdf, 0xdc, 0x14, 0xa5, - 0x37, 0x37, 0x45, 0xe9, 0x97, 0x9b, 0xa2, 0xf4, 0xdb, 0x4d, 0x51, 0xfa, 0xee, 0xf7, 0x62, 0xe6, - 0xd5, 0x7f, 0x7f, 0x50, 0x6a, 0x6e, 0x50, 0xd5, 0x43, 0x07, 0xdb, 0xa6, 0xe6, 0x52, 0xcb, 0xd4, - 0xc2, 0xea, 0x48, 0x4c, 0x67, 0x85, 0xbf, 0x2b, 0x4f, 0xff, 0x0e, 0x00, 0x00, 0xff, 0xff, 0x9e, - 0xab, 0x59, 0x4c, 0xf6, 0x0a, 0x00, 0x00, + 0x14, 0x15, 0x2d, 0xc7, 0xb6, 0xae, 0xdf, 0x13, 0xdb, 0x91, 0x99, 0x58, 0x50, 0x54, 0xa4, 0x70, + 0x5d, 0x58, 0x4a, 0xed, 0xa2, 0x09, 0xbc, 0x8a, 0xac, 0x1a, 0x6e, 0xfa, 0x48, 0x54, 0x26, 0x8a, + 0x91, 0x6c, 0x88, 0x11, 0x39, 0xa6, 0x08, 0x91, 0x1c, 0x86, 0x1c, 0xca, 0x25, 0x0a, 0x14, 0xfd, + 0x83, 0xf6, 0x2f, 0xba, 0x2e, 0xd0, 0x65, 0x3f, 0x20, 0xcb, 0x2e, 0xb3, 0x6c, 0xdc, 0x5f, 0xe8, + 0xb2, 0x05, 0x0a, 0x0e, 0x49, 0x69, 0xa8, 0x67, 0xdb, 0x95, 0x79, 0x1f, 0xe7, 0xdc, 0x33, 0xf7, + 0x6a, 0xae, 0x07, 0x0a, 0x9a, 0x1b, 0x54, 0x5d, 0x8f, 0x32, 0x8a, 0x40, 0x73, 0x03, 0xac, 0xf7, + 0x4c, 0x9f, 0x7a, 0xf2, 0xa1, 0x61, 0xb2, 0x4e, 0xd0, 0xae, 0x6a, 0xd4, 0xae, 0x19, 0xd4, 0xa0, + 0x35, 0x9e, 0xd2, 0x0e, 0x2e, 0xb9, 0xc5, 0x0d, 0xfe, 0x15, 0x43, 0xe5, 0x96, 0x90, 0xde, 0x0d, + 0xda, 0xe4, 0xaa, 0x83, 0xbd, 0xcb, 0x5a, 0x17, 0x33, 0x6c, 0x85, 0x3e, 0x3b, 0xd4, 0xa8, 
0x47, + 0x6a, 0x6e, 0xd7, 0xa8, 0x61, 0x83, 0x38, 0xac, 0xf6, 0xda, 0xb3, 0x0f, 0x5d, 0x2b, 0x30, 0x4c, + 0xc7, 0xaf, 0x25, 0x05, 0xfd, 0x9e, 0x96, 0x7e, 0xaa, 0x7e, 0x4f, 0x8b, 0x69, 0x2b, 0x6f, 0xe7, + 0x60, 0xeb, 0x4b, 0xd3, 0x67, 0x75, 0x47, 0xbf, 0xc0, 0x4c, 0xeb, 0x28, 0xc4, 0x77, 0xa9, 0xe3, + 0x13, 0x74, 0x0e, 0x8b, 0xc4, 0x61, 0x9e, 0x49, 0xfc, 0xa2, 0x54, 0xce, 0xef, 0x2f, 0x1f, 0x1d, + 0x56, 0x07, 0xe2, 0xab, 0xe3, 0x20, 0xd5, 0xb3, 0x38, 0x3f, 0xfa, 0x13, 0x2a, 0x29, 0x1a, 0xbd, + 0x84, 0x03, 0x6c, 0x59, 0xf4, 0x4a, 0xf5, 0x3b, 0xd8, 0x23, 0xba, 0x1a, 0x29, 0xf5, 0x55, 0xda, + 0x23, 0x9e, 0x85, 0x5d, 0xd5, 0x23, 0x9a, 0x85, 0x4d, 0x3b, 0xf5, 0x17, 0xe7, 0xca, 0xd2, 0xfe, + 0x92, 0x72, 0x8f, 0x23, 0x9e, 0x71, 0x40, 0x23, 0xf2, 0x3f, 0x8d, 0xd3, 0x95, 0x34, 0x9b, 0x3b, + 0xd1, 0x23, 0x58, 0x25, 0xdf, 0x30, 0x0f, 0xab, 0xa9, 0xd2, 0x3c, 0x57, 0x7a, 0xbb, 0x3a, 0x38, + 0x72, 0xb5, 0x81, 0x2d, 0x2d, 0xb0, 0x30, 0x33, 0xa9, 0xf3, 0xd8, 0xb9, 0xa4, 0xca, 0x0a, 0x47, + 0x24, 0x52, 0xe5, 0x57, 0xb0, 0x22, 0xaa, 0x46, 0x1b, 0x90, 0xef, 0x92, 0xb0, 0x28, 0x95, 0xa5, + 0xfd, 0x82, 0x12, 0x7d, 0xa2, 0x8f, 0xe1, 0x46, 0x0f, 0x5b, 0x01, 0xe1, 0xca, 0x96, 0x8f, 0x4a, + 0x62, 0x17, 0x04, 0xee, 0x84, 0x45, 0x89, 0x93, 0x4f, 0xe6, 0x1e, 0x4a, 0x95, 0x5f, 0x24, 0x40, + 0xa3, 0x19, 0xe8, 0x6c, 0xb8, 0xb1, 0x1f, 0x4e, 0xa7, 0x1c, 0xdf, 0x56, 0xf9, 0x62, 0xa6, 0xf2, + 0x8f, 0xb2, 0xca, 0x6f, 0x4f, 0x28, 0xc3, 0xbb, 0x22, 0xc8, 0xfe, 0x69, 0x0e, 0xd6, 0x87, 0xc2, + 0xe8, 0x7d, 0x58, 0xa7, 0x57, 0x0e, 0xf1, 0x54, 0x97, 0x52, 0x4b, 0x75, 0xb0, 0x4d, 0x92, 0x42, + 0xab, 0xdc, 0xdd, 0xa4, 0xd4, 0x7a, 0x82, 0x6d, 0x82, 0xbe, 0x85, 0x3b, 0xda, 0x00, 0xaa, 0x7a, + 0xc4, 0x0f, 0x2c, 0xe6, 0xab, 0xed, 0x50, 0x75, 0x02, 0x1b, 0x47, 0xd3, 0x8d, 0x0e, 0x7c, 0x32, + 0x45, 0x89, 0x68, 0x2b, 0x31, 0xfc, 0x34, 0x7c, 0x12, 0x81, 0xe3, 0xf3, 0xef, 0x6a, 0x93, 0xe2, + 0x32, 0x85, 0xd2, 0x74, 0xb0, 0xd8, 0xa3, 0x7c, 0xdc, 0xa3, 0x07, 0xd9, 0x1e, 0xdd, 0x15, 0x95, + 0x45, 0xc0, 0x11, 0x42, 0xb1, 0x53, 0xa7, 0xb0, 0x3d, 0x36, 0x07, 0x7d, 0x00, 0x0b, 0x6d, 0x8b, + 0x6a, 0xdd, 0xf4, 0xc0, 0x9b, 0x22, 0xed, 0x69, 0x14, 0x51, 0x92, 0x84, 0xca, 0x77, 0x70, 0x83, + 0x3b, 0xd0, 0x0e, 0x2c, 0xc4, 0xed, 0xe2, 0xf2, 0xe6, 0x95, 0xc4, 0x42, 0xa7, 0xb0, 0x9e, 0xde, + 0x15, 0x86, 0x3d, 0x83, 0xb0, 0x94, 0x74, 0x57, 0x24, 0x4d, 0xee, 0xc7, 0x73, 0x9e, 0xa1, 0xac, + 0x51, 0xd1, 0xf4, 0xd1, 0x2e, 0x2c, 0xf1, 0x72, 0xaa, 0xa9, 0x17, 0xf3, 0x7c, 0x6e, 0x8b, 0xdc, + 0x7e, 0xac, 0x57, 0xfe, 0x92, 0x60, 0x35, 0x03, 0x46, 0x0f, 0xa0, 0x98, 0x2d, 0x38, 0x32, 0xf4, + 0xed, 0x0c, 0x7d, 0x7f, 0xf8, 0xc7, 0xb0, 0x33, 0x02, 0xd4, 0xd5, 0xc0, 0xd4, 0x79, 0x73, 0x0b, + 0xca, 0xcd, 0x21, 0x98, 0xde, 0x32, 0x75, 0x54, 0x87, 0xbd, 0x21, 0x90, 0x46, 0x1d, 0x86, 0xcd, + 0xe8, 0xc7, 0xc6, 0x4b, 0xc6, 0x7a, 0xe5, 0x0c, 0xb6, 0x91, 0xa6, 0xf0, 0xba, 0x27, 0xb0, 0xd2, + 0xa7, 0x08, 0x5d, 0x52, 0x9c, 0x2f, 0x4b, 0xfb, 0x6b, 0x47, 0xb7, 0xc6, 0xb5, 0x27, 0x74, 0x89, + 0xb2, 0x4c, 0x07, 0x46, 0x65, 0x07, 0xb6, 0xce, 0x09, 0x6b, 0x74, 0x88, 0xd6, 0x75, 0xa9, 0xe9, + 0x30, 0x85, 0xbc, 0x0e, 0x88, 0xcf, 0x2a, 0xbf, 0x4a, 0xb0, 0x3d, 0x14, 0x48, 0xf6, 0xe2, 0x67, + 0xc3, 0xd7, 0xb7, 0x2a, 0x16, 0x1a, 0x8b, 0x99, 0x70, 0x83, 0x5f, 0xce, 0xbc, 0xc1, 0xc7, 0xd9, + 0x5f, 0xe7, 0x9e, 0x58, 0xa9, 0x6e, 0x59, 0x54, 0x9b, 0xb4, 0x7a, 0x7e, 0x96, 0x60, 0x73, 0x24, + 0x01, 0x7d, 0x3a, 0x2c, 0xfd, 0x60, 0x2a, 0xe1, 0x04, 0xd9, 0x2f, 0x66, 0xca, 0xbe, 0x9f, 0x95, + 0x2d, 0x8f, 0xaf, 0x32, 0xbc, 0x77, 0xfe, 0xce, 0xc3, 0x5a, 0x36, 0x8a, 0x6e, 0xc1, 0xa2, 0x87, + 0x6d, 0x57, 0x0d, 
0x5c, 0x4e, 0xbf, 0xa4, 0x2c, 0x44, 0x66, 0xcb, 0x1d, 0xb7, 0x8f, 0xe6, 0xc6, + 0xed, 0xa3, 0x1e, 0xc8, 0x8c, 0xba, 0xd4, 0xa2, 0x46, 0xa8, 0xe2, 0x2b, 0xec, 0x11, 0x15, 0xfb, + 0xbe, 0x69, 0x38, 0x36, 0x71, 0x58, 0xfa, 0xdf, 0xe2, 0xe1, 0x64, 0x79, 0xd5, 0xe7, 0x09, 0xb8, + 0x1e, 0x61, 0xeb, 0x03, 0x68, 0xdc, 0x92, 0x22, 0x9b, 0x10, 0x46, 0x3f, 0x48, 0xf0, 0x1e, 0xf5, + 0x4c, 0xc3, 0x74, 0xb0, 0xa5, 0x4e, 0x51, 0x30, 0xcf, 0x15, 0x3c, 0x9a, 0xa2, 0xe0, 0x69, 0xc2, + 0x32, 0x5d, 0x49, 0x99, 0xce, 0x48, 0x93, 0xbf, 0x80, 0xbd, 0xa9, 0x14, 0xe2, 0x18, 0xe7, 0xe3, + 0x31, 0x6e, 0x89, 0x63, 0x2c, 0x08, 0xa3, 0x92, 0x9f, 0xc1, 0xbd, 0x7f, 0xa5, 0xeb, 0xbf, 0x90, + 0x1e, 0x7c, 0x02, 0xcb, 0xc2, 0x35, 0x45, 0x08, 0xd6, 0x12, 0xf3, 0xc2, 0x64, 0x9d, 0x26, 0xd5, + 0x37, 0x72, 0xe8, 0x26, 0xac, 0x67, 0x7c, 0xd4, 0xda, 0x90, 0x8e, 0xfe, 0x94, 0x00, 0x1a, 0xcd, + 0x56, 0x3d, 0xee, 0x1f, 0xfa, 0x1a, 0x56, 0xea, 0xba, 0xde, 0xdf, 0x10, 0x68, 0x2f, 0xf3, 0x18, + 0x48, 0xdd, 0x5f, 0x11, 0x86, 0x75, 0xcc, 0xb0, 0x5c, 0x16, 0xc3, 0x22, 0x30, 0xbd, 0xbc, 0x95, + 0x1c, 0xfa, 0x1c, 0x0a, 0x0a, 0xb1, 0x69, 0x8f, 0x34, 0xa9, 0x8e, 0xee, 0x88, 0x80, 0xbe, 0x3b, + 0xd9, 0x1b, 0xf2, 0xde, 0x84, 0x68, 0x9f, 0xeb, 0x1c, 0x56, 0xc4, 0xb7, 0x13, 0xda, 0x14, 0x01, + 0x67, 0xb6, 0xcb, 0x42, 0xb9, 0x3c, 0xeb, 0xa1, 0x55, 0xc9, 0xdd, 0x97, 0x8e, 0x34, 0x28, 0x34, + 0x9a, 0xad, 0x26, 0x7f, 0xe3, 0xa1, 0x17, 0xb0, 0x9a, 0xd9, 0x3c, 0xa8, 0x3c, 0x65, 0x29, 0xc5, + 0x4a, 0xef, 0xce, 0x5c, 0x5b, 0x95, 0xdc, 0xa9, 0xff, 0xe6, 0x5d, 0x49, 0x7a, 0xfb, 0xae, 0x94, + 0xfb, 0xfe, 0xba, 0x24, 0xbd, 0xb9, 0x2e, 0x49, 0xbf, 0x5d, 0x97, 0xa4, 0xdf, 0xaf, 0x4b, 0xd2, + 0x8f, 0x7f, 0x94, 0x72, 0xaf, 0xfe, 0xff, 0x93, 0x54, 0x73, 0x83, 0x9a, 0x1e, 0x3a, 0xd8, 0x36, + 0x35, 0x97, 0x5a, 0xa6, 0x16, 0xd6, 0x06, 0x62, 0xda, 0x0b, 0xfc, 0x65, 0x7a, 0xfc, 0x4f, 0x00, + 0x00, 0x00, 0xff, 0xff, 0x37, 0x02, 0x0d, 0x3a, 0x38, 0x0b, 0x00, 0x00, } // Reference imports to suppress errors if they are not otherwise used. 
@@ -969,6 +978,20 @@ func (m *ListAndWatchResponse) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if len(m.ExtraEntries) > 0 { + for iNdEx := len(m.ExtraEntries) - 1; iNdEx >= 0; iNdEx-- { + { + size, err := m.ExtraEntries[iNdEx].MarshalToSizedBuffer(dAtA[:i]) + if err != nil { + return 0, err + } + i -= size + i = encodeVarintCpu(dAtA, i, uint64(size)) + } + i-- + dAtA[i] = 0x1a + } + } if m.AllowSharedCoresOverlapReclaimedCores { i-- if m.AllowSharedCoresOverlapReclaimedCores { @@ -1474,6 +1497,12 @@ func (m *ListAndWatchResponse) Size() (n int) { if m.AllowSharedCoresOverlapReclaimedCores { n += 2 } + if len(m.ExtraEntries) > 0 { + for _, e := range m.ExtraEntries { + l = e.Size() + n += 1 + l + sovCpu(uint64(l)) + } + } return n } @@ -1681,6 +1710,11 @@ func (this *ListAndWatchResponse) String() string { if this == nil { return "nil" } + repeatedStringForExtraEntries := "[]*CalculationInfo{" + for _, f := range this.ExtraEntries { + repeatedStringForExtraEntries += strings.Replace(fmt.Sprintf("%v", f), "CalculationInfo", "advisorsvc.CalculationInfo", 1) + "," + } + repeatedStringForExtraEntries += "}" keysForEntries := make([]string, 0, len(this.Entries)) for k, _ := range this.Entries { keysForEntries = append(keysForEntries, k) @@ -1694,6 +1728,7 @@ func (this *ListAndWatchResponse) String() string { s := strings.Join([]string{`&ListAndWatchResponse{`, `Entries:` + mapStringForEntries + `,`, `AllowSharedCoresOverlapReclaimedCores:` + fmt.Sprintf("%v", this.AllowSharedCoresOverlapReclaimedCores) + `,`, + `ExtraEntries:` + repeatedStringForExtraEntries + `,`, `}`, }, "") return s @@ -2021,7 +2056,7 @@ func (m *ListAndWatchResponse) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -2052,16 +2087,47 @@ func (m *ListAndWatchResponse) Unmarshal(dAtA []byte) error { } } m.AllowSharedCoresOverlapReclaimedCores = bool(v != 0) + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field ExtraEntries", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowCpu + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthCpu + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthCpu + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.ExtraEntries = append(m.ExtraEntries, &advisorsvc.CalculationInfo{}) + if err := m.ExtraEntries[len(m.ExtraEntries)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex default: iNdEx = preIndex skippy, err := skipCpu(dAtA[iNdEx:]) if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -2223,7 +2289,7 @@ func (m *CalculationEntries) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -2240,10 +2306,7 @@ func (m *CalculationEntries) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if 
(iNdEx + skippy) > l { @@ -2423,7 +2486,7 @@ func (m *CalculationInfo) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -2440,10 +2503,7 @@ func (m *CalculationInfo) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -2527,10 +2587,7 @@ func (m *NumaCalculationResult) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -2665,10 +2722,7 @@ func (m *Block) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -2833,10 +2887,7 @@ func (m *OverlapTarget) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -2886,10 +2937,7 @@ func (m *GetCheckpointRequest) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -3051,7 +3099,7 @@ func (m *GetCheckpointResponse) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -3068,10 +3116,7 @@ func (m *GetCheckpointResponse) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -3233,7 +3278,7 @@ func (m *AllocationEntries) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -3250,10 +3295,7 @@ func (m *AllocationEntries) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { @@ -3451,7 +3493,7 @@ func (m *AllocationInfo) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -3564,7 +3606,7 @@ func (m *AllocationInfo) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > postIndex { @@ -3581,10 +3623,7 @@ func (m *AllocationInfo) Unmarshal(dAtA []byte) error { if err != nil { return err } - if skippy < 0 { - return ErrInvalidLengthCpu - } - if (iNdEx + skippy) < 0 { + if (skippy < 0) || (iNdEx+skippy) < 0 { return ErrInvalidLengthCpu } if (iNdEx + skippy) > l { diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.proto 
b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.proto index 021292d6d..f4feeb68e 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.proto +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor/cpu.proto @@ -34,6 +34,7 @@ option go_package = "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cp message ListAndWatchResponse { map entries = 1; // keyed by pool name or podUID bool allow_shared_cores_overlap_reclaimed_cores= 2; // if set to true, cpuset of shared_cores may overlap with reclaimed_cores + repeated advisorsvc.CalculationInfo extra_entries = 3; // for non-container level adjustment (eg. /kubepods/besteffort) } message CalculationEntries { diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go index 6f7b4aa84..03fd9d001 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go @@ -26,4 +26,5 @@ const ( ControlKnobKeyBalanceNumaMemory MemoryControlKnobName = "balance_numa_memory" ControlKnobKeySwapMax MemoryControlKnobName = "swap_max" ControlKnowKeyMemoryOffloading MemoryControlKnobName = "memory_offloading" + ControlKnobKeyMemoryNUMAHeadroom MemoryControlKnobName = "memory_numa_headroom" ) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/qos_aware.go b/pkg/agent/sysadvisor/plugin/qosaware/qos_aware.go index 33fb3e01c..ece6aee60 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/qos_aware.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/qos_aware.go @@ -18,6 +18,7 @@ package qosaware import ( "context" + "fmt" "sync" "time" @@ -69,11 +70,8 @@ func NewQoSAwarePlugin(pluginName string, conf *config.Configuration, extraConf return nil, err } - qrmServer, err := server.NewQRMServer(resourceAdvisor, conf, metaCache, metaServer, emitter) - if err != nil { - return nil, err - } - + var resourceGetter reporter.HeadroomResourceGetter + headroomReporterExists := false reporters := make([]reporter.Reporter, 0) for _, reporterName := range conf.Reporters { switch reporterName { @@ -83,6 +81,8 @@ func NewQoSAwarePlugin(pluginName string, conf *config.Configuration, extraConf return nil, err } reporters = append(reporters, headroomReporter) + resourceGetter = headroomReporter + headroomReporterExists = true case types.NodeMetricReporter: nodeMetricsReporter, err := reporter.NewNodeMetricsReporter(emitter, metaServer, metaCache, conf) if err != nil { @@ -92,6 +92,15 @@ func NewQoSAwarePlugin(pluginName string, conf *config.Configuration, extraConf } } + if !headroomReporterExists { + return nil, fmt.Errorf("headroom reporter must be specified") + } + + qrmServer, err := server.NewQRMServer(resourceAdvisor, resourceGetter, conf, metaCache, metaServer, emitter) + if err != nil { + return nil, err + } + // add AdminQos dynamic config watcher err = metaServer.ConfigurationManager.AddConfigWatcher(crd.AdminQoSConfigurationGVR) if err != nil { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter.go index d3ae9a7a9..465bb10ae 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter.go @@ -23,6 +23,7 @@ import ( "sync" v1 "k8s.io/api/core/v1" + apiresource "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/errors" "k8s.io/klog/v2" @@ -49,23 +50,51 @@ const ( headroomReporterPluginName = 
"headroom-reporter-plugin" ) -type headroomReporterImpl struct { +type HeadroomResourceManager interface { + manager.ResourceManager + manager.NumaResourceManager +} + +type HeadroomResourceGetter interface { + GetHeadroomResource(name v1.ResourceName) (HeadroomResourceManager, error) +} + +type HeadroomReporter struct { skeleton.GenericPlugin + HeadroomResourceGetter +} + +type DummyHeadroomResourceManager struct{} + +func (mgr *DummyHeadroomResourceManager) GetAllocatable() (apiresource.Quantity, error) { + return apiresource.Quantity{}, nil +} + +func (mgr *DummyHeadroomResourceManager) GetCapacity() (apiresource.Quantity, error) { + return apiresource.Quantity{}, nil +} + +func (mgr *DummyHeadroomResourceManager) GetNumaAllocatable() (map[int]apiresource.Quantity, error) { + return nil, nil +} + +func (mgr *DummyHeadroomResourceManager) GetNumaCapacity() (map[int]apiresource.Quantity, error) { + return nil, nil } // NewHeadroomReporter returns a wrapper of headroom reporter plugins as headroom reporter func NewHeadroomReporter(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, conf *config.Configuration, headroomAdvisor hmadvisor.ResourceAdvisor, -) (Reporter, error) { - plugin, err := newHeadroomReporterPlugin(emitter, metaServer, conf, headroomAdvisor) +) (*HeadroomReporter, error) { + plugin, getter, err := newHeadroomReporterPlugin(emitter, metaServer, conf, headroomAdvisor) if err != nil { return nil, fmt.Errorf("[headroom-reporter] create headroom reporter failed: %s", err) } - return &headroomReporterImpl{plugin}, nil + return &HeadroomReporter{GenericPlugin: plugin, HeadroomResourceGetter: getter}, nil } -func (r *headroomReporterImpl) Run(ctx context.Context) { +func (r *HeadroomReporter) Run(ctx context.Context) { if err := r.Start(); err != nil { klog.Fatalf("[headroom-reporter] start %v failed: %v", r.Name(), err) } @@ -78,13 +107,16 @@ func (r *headroomReporterImpl) Run(ctx context.Context) { } type reclaimedResource struct { - allocatable v1.ResourceList - capacity v1.ResourceList + allocatable v1.ResourceList + capacity v1.ResourceList + numaAllocatable map[int]v1.ResourceList + numaCapacity map[int]v1.ResourceList } type headroomReporterPlugin struct { sync.Mutex - headroomManagers map[v1.ResourceName]manager.HeadroomManager + headroomManagers map[v1.ResourceName]manager.HeadroomManager + numaSocketZoneNodeMap map[util.ZoneNode]util.ZoneNode ctx context.Context cancel context.CancelFunc @@ -93,7 +125,7 @@ type headroomReporterPlugin struct { func newHeadroomReporterPlugin(emitter metrics.MetricEmitter, metaServer *metaserver.MetaServer, conf *config.Configuration, headroomAdvisor hmadvisor.ResourceAdvisor, -) (skeleton.GenericPlugin, error) { +) (skeleton.GenericPlugin, HeadroomResourceGetter, error) { var ( err error errList []error @@ -107,20 +139,40 @@ func newHeadroomReporterPlugin(emitter metrics.MetricEmitter, metaServer *metase errList = append(errList, err) } } + + // init numa topo info by metaServer + if metaServer == nil || metaServer.MachineInfo == nil { + errList = append(errList, fmt.Errorf("get metaserver machine info is nil")) + } + if len(errList) > 0 { - return nil, errors.NewAggregate(errList) + return nil, nil, errors.NewAggregate(errList) } reporter := &headroomReporterPlugin{ - headroomManagers: headroomManagers, + headroomManagers: headroomManagers, + numaSocketZoneNodeMap: util.GenerateNumaSocketZone(metaServer.MachineInfo.Topology), } - return skeleton.NewRegistrationPluginWrapper(reporter, []string{conf.PluginRegistrationDir}, + 
pluginWrapper, err := skeleton.NewRegistrationPluginWrapper(reporter, []string{conf.PluginRegistrationDir}, func(key string, value int64) { _ = emitter.StoreInt64(key, value, metrics.MetricTypeNameCount, metrics.ConvertMapToTags(map[string]string{ "pluginName": headroomReporterPluginName, "pluginType": registration.ReporterPlugin, })...) }) + if err != nil { + return nil, nil, err + } + + return pluginWrapper, reporter, nil +} + +func (r *headroomReporterPlugin) GetHeadroomResource(name v1.ResourceName) (HeadroomResourceManager, error) { + if mgr, ok := r.headroomManagers[name]; ok { + return mgr, nil + } + + return nil, fmt.Errorf("not found headroom manager for resource %s", name) } func (r *headroomReporterPlugin) Name() string { @@ -171,7 +223,7 @@ func (r *headroomReporterPlugin) GetReportContent(_ context.Context, _ *v1alpha1 return nil, err } - reportToCNR, err := getReportReclaimedResourceForCNR(res) + reportToCNR, err := r.getReportReclaimedResourceForCNR(res) if err != nil { return nil, err } @@ -202,6 +254,8 @@ func (r *headroomReporterPlugin) getReclaimedResource() (*reclaimedResource, err allocatable := make(v1.ResourceList) capacity := make(v1.ResourceList) + numaAllocatable := make(map[int]v1.ResourceList) + numaCapacity := make(map[int]v1.ResourceList) for resourceName, rm := range r.headroomManagers { allocatable[resourceName], err = rm.GetAllocatable() if err != nil { @@ -212,6 +266,37 @@ func (r *headroomReporterPlugin) getReclaimedResource() (*reclaimedResource, err if err != nil { errList = append(errList, err, fmt.Errorf("get reclaimed %s capacity failed: %s", resourceName, err)) } + + // get allocatable per numa + allocatableMap, err := rm.GetNumaAllocatable() + if err != nil { + errList = append(errList, fmt.Errorf("get reclaimed %s numa allocatable failed: %s", resourceName, err)) + } else { + for numaID, quantity := range allocatableMap { + perNumaAllocatable, ok := numaAllocatable[numaID] + if !ok { + perNumaAllocatable = make(v1.ResourceList) + numaAllocatable[numaID] = perNumaAllocatable + } + perNumaAllocatable[resourceName] = quantity + } + } + + // get capacity per numa + capacityMap, err := rm.GetNumaCapacity() + if err != nil { + errList = append(errList, fmt.Errorf("get reclaimed %s numa capacity failed: %s", resourceName, err)) + } else { + for numaID, quantity := range capacityMap { + perNumaCapacity, ok := numaCapacity[numaID] + if !ok { + perNumaCapacity = make(v1.ResourceList) + numaCapacity[numaID] = perNumaCapacity + } + perNumaCapacity[resourceName] = quantity + } + } + } if len(errList) > 0 { @@ -219,22 +304,24 @@ func (r *headroomReporterPlugin) getReclaimedResource() (*reclaimedResource, err } return &reclaimedResource{ - allocatable: allocatable, - capacity: capacity, + allocatable: allocatable, + capacity: capacity, + numaAllocatable: numaAllocatable, + numaCapacity: numaCapacity, }, err } -func getReportReclaimedResourceForCNR(reclaimedResource *reclaimedResource) (*v1alpha1.ReportContent, error) { +func (r *headroomReporterPlugin) getReportReclaimedResourceForCNR(reclaimedResource *reclaimedResource) (*v1alpha1.ReportContent, error) { if reclaimedResource == nil { return nil, nil } - resources := nodeapis.Resources{ - Allocatable: &reclaimedResource.allocatable, - Capacity: &reclaimedResource.capacity, + resourceField, err := r.getReportReclaimedResource(reclaimedResource) + if err != nil { + return nil, err } - resourcesValue, err := json.Marshal(&resources) + topologyZoneField, err := r.getReportNUMAReclaimedResource(reclaimedResource) if 
err != nil { return nil, err } @@ -242,11 +329,59 @@ func getReportReclaimedResourceForCNR(reclaimedResource *reclaimedResource) (*v1 return &v1alpha1.ReportContent{ GroupVersionKind: &util.CNRGroupVersionKind, Field: []*v1alpha1.ReportField{ - { - FieldType: v1alpha1.FieldType_Status, - FieldName: util.CNRFieldNameResources, - Value: resourcesValue, - }, + resourceField, topologyZoneField, }, }, nil } + +func (r *headroomReporterPlugin) getReportReclaimedResource(reclaimedResource *reclaimedResource) (*v1alpha1.ReportField, error) { + resources := nodeapis.Resources{ + Allocatable: &reclaimedResource.allocatable, + Capacity: &reclaimedResource.capacity, + } + + resourcesValue, err := json.Marshal(&resources) + if err != nil { + return nil, fmt.Errorf("marshal resource failed: %s", err) + } + + return &v1alpha1.ReportField{ + FieldType: v1alpha1.FieldType_Status, + FieldName: util.CNRFieldNameResources, + Value: resourcesValue, + }, nil +} + +func (r *headroomReporterPlugin) getReportNUMAReclaimedResource(reclaimedResource *reclaimedResource) (*v1alpha1.ReportField, error) { + topologyZoneGenerator, err := util.NewNumaSocketTopologyZoneGenerator(r.numaSocketZoneNodeMap) + if err != nil { + return nil, fmt.Errorf("create topology zone generator failed: %s", err) + } + + zoneResources := make(map[util.ZoneNode]nodeapis.Resources) + for numaID := range reclaimedResource.numaAllocatable { + allocatable := reclaimedResource.numaAllocatable[numaID] + capacity, ok := reclaimedResource.numaCapacity[numaID] + if !ok { + return nil, fmt.Errorf("miss capacity with numaID: %d", numaID) + } + + numaZoneNode := util.GenerateNumaZoneNode(numaID) + zoneResources[numaZoneNode] = nodeapis.Resources{ + Allocatable: &allocatable, + Capacity: &capacity, + } + } + + topologyZone := topologyZoneGenerator.GenerateTopologyZoneStatus(nil, zoneResources, nil, nil) + value, err := json.Marshal(&topologyZone) + if err != nil { + return nil, fmt.Errorf("marshal topology zone failed: %s", err) + } + + return &v1alpha1.ReportField{ + FieldType: v1alpha1.FieldType_Status, + FieldName: util.CNRFieldNameTopologyZone, + Value: value, + }, nil +} diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter_test.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter_test.go index 9d8f986ad..5032f9095 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/headroom_reporter_test.go @@ -24,6 +24,7 @@ import ( "testing" "time" + info "github.com/google/cadvisor/info/v1" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -114,6 +115,7 @@ func generateTestMetaServer(clientSet *client.GenericClientSet, conf *config.Con MetricsFetcher: metric.NewFakeMetricsFetcher(metrics.DummyMetrics{}), KatalystMachineInfo: &machine.KatalystMachineInfo{ CPUTopology: cpuTopology, + MachineInfo: &info.MachineInfo{}, }, }, ConfigurationManager: &dynamicconfig.DummyConfigurationManager{}, @@ -187,7 +189,7 @@ func TestReclaimedResourcedReporterWithManager(t *testing.T) { metaServer := generateTestMetaServer(clientSet, conf) advisorStub := hmadvisor.NewResourceAdvisorStub() - genericPlugin, err := newHeadroomReporterPlugin(metrics.DummyMetrics{}, metaServer, conf, advisorStub) + genericPlugin, _, err := newHeadroomReporterPlugin(metrics.DummyMetrics{}, metaServer, conf, advisorStub) require.NoError(t, err) require.NotNil(t, genericPlugin) _ = genericPlugin.Start() diff --git 
a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/manager.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/manager.go index 9e9bfbbf0..3b01fdca1 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/manager.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/manager.go @@ -33,12 +33,28 @@ var headroomManagerInitializers sync.Map // HeadroomManager is used to manage resource headroom reporting and overcommit. type HeadroomManager interface { - // GetAllocatable return the allocatable resource of this resource + // global resource management + ResourceManager + // NUMA-specific resource management + NumaResourceManager + // Run starts the resource manager + Run(ctx context.Context) +} + +// ResourceManager provides a general interface for managing resources +type ResourceManager interface { + // GetAllocatable returns the total allocatable resource of this manager GetAllocatable() (resource.Quantity, error) - // GetCapacity return the capacity of this resource + // GetCapacity returns the total capacity resource of this manager GetCapacity() (resource.Quantity, error) - // Run this resource manager - Run(ctx context.Context) +} + +// NumaResourceManager provides an interface for managing NUMA-specific resources +type NumaResourceManager interface { + // GetNumaAllocatable returns the allocatable resource for each NUMA node + GetNumaAllocatable() (map[int]resource.Quantity, error) + // GetNumaCapacity returns the capacity resource for each NUMA node + GetNumaCapacity() (map[int]resource.Quantity, error) } // InitFunc is used to init headroom manager diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go index 9d689b91e..a0f39702f 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go @@ -61,6 +61,8 @@ type GenericSlidingWindowOptions struct { type GenericHeadroomManager struct { sync.RWMutex lastReportResult *resource.Quantity + // the latest transformed reporter result per numa + lastNumaReportResult map[int]resource.Quantity headroomAdvisor hmadvisor.ResourceAdvisor emitter metrics.MetricEmitter @@ -91,6 +93,7 @@ func NewGenericHeadroomManager(name v1.ResourceName, useMilliValue, reportMilliV return &GenericHeadroomManager{ resourceName: name, + lastNumaReportResult: make(map[int]resource.Quantity), reportResultTransformer: reportResultTransformer, syncPeriod: syncPeriod, headroomAdvisor: headroomAdvisor, @@ -120,6 +123,18 @@ func (m *GenericHeadroomManager) GetCapacity() (resource.Quantity, error) { return m.getLastReportResult() } +func (m *GenericHeadroomManager) GetNumaAllocatable() (map[int]resource.Quantity, error) { + m.RLock() + defer m.RUnlock() + return m.lastNumaReportResult, nil +} + +func (m *GenericHeadroomManager) GetNumaCapacity() (map[int]resource.Quantity, error) { + m.RLock() + defer m.RUnlock() + return m.lastNumaReportResult, nil +} + func (m *GenericHeadroomManager) Run(ctx context.Context) { go wait.UntilWithContext(ctx, m.sync, m.syncPeriod) <-ctx.Done() @@ -150,12 +165,14 @@ func (m *GenericHeadroomManager) sync(_ context.Context) { return } - originResultFromAdvisor, err := m.headroomAdvisor.GetHeadroom(m.resourceName) + originResultFromAdvisor, numaResult, err := m.headroomAdvisor.GetHeadroom(m.resourceName) if err != nil { klog.Errorf("get origin result %s from headroomAdvisor failed: %v", m.resourceName, 
err) return } + originValue := originResultFromAdvisor.Value() + reportResult := m.reportSlidingWindow.GetWindowedResources(originResultFromAdvisor) if reportResult == nil { klog.Infof("skip update reclaimed resource %s without enough valid sample", m.resourceName) @@ -172,6 +189,14 @@ func (m *GenericHeadroomManager) sync(_ context.Context) { reportResult.String(), reclaimOptions.ReservedResourceForReport.String()) m.setLastReportResult(*reportResult) + // set latest numa report result + diffRatio := float64(reportResult.Value()) / float64(originValue) + for numaID, res := range numaResult { + res.Set(int64(float64(res.Value()) * diffRatio)) + result := m.reportResultTransformer(res) + m.lastNumaReportResult[numaID] = result + klog.Infof("%s headroom manager for NUMA: %d, headroom: %d", m.resourceName, numaID, result.Value()) + } } func (m *GenericHeadroomManager) emitResourceToMetric(metricsName string, value resource.Quantity) { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go index 44b0ea501..b88fd0de2 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go @@ -194,7 +194,7 @@ func (cra *cpuResourceAdvisor) GetChannels() (interface{}, interface{}) { return cra.recvCh, cra.sendCh } -func (cra *cpuResourceAdvisor) GetHeadroom() (resource.Quantity, error) { +func (cra *cpuResourceAdvisor) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { klog.Infof("[qosaware-cpu] receive get headroom request") cra.mutex.RLock() @@ -202,22 +202,22 @@ func (cra *cpuResourceAdvisor) GetHeadroom() (resource.Quantity, error) { if !cra.advisorUpdated { klog.Infof("[qosaware-cpu] skip getting headroom: advisor not updated") - return resource.Quantity{}, fmt.Errorf("advisor not updated") + return resource.Quantity{}, nil, fmt.Errorf("advisor not updated") } if cra.headroomAssembler == nil { klog.Errorf("[qosaware-cpu] get headroom failed: no legal assembler") - return resource.Quantity{}, fmt.Errorf("no legal assembler") + return resource.Quantity{}, nil, fmt.Errorf("no legal assembler") } - headroom, err := cra.headroomAssembler.GetHeadroom() + headroom, numaHeadroom, err := cra.headroomAssembler.GetHeadroom() if err != nil { klog.Errorf("[qosaware-cpu] get headroom failed: %v", err) } else { klog.Infof("[qosaware-cpu] get headroom: %v", headroom) } - return headroom, err + return headroom, numaHeadroom, err } // update works in a monolithic way to maintain lifecycle and triggers update actions for all regions; @@ -601,6 +601,29 @@ func (cra *cpuResourceAdvisor) assembleProvision() (types.InternalCPUCalculation calculationResult, err := cra.provisionAssembler.AssembleProvision() + /* + _, headroom, err := cra.headroomAssembler.GetHeadroom() + if err != nil { + return types.InternalCPUCalculationResult{}, fmt.Errorf("get numa headroom failed: %v", err) + } + + numaHeadroom := make(map[int]float64) + for numaID, res := range headroom { + numaHeadroom[numaID] = float64(res.Value()) + } + data, err := json.Marshal(numaHeadroom) + if err != nil { + return types.InternalCPUCalculationResult{}, fmt.Errorf("marshal numa headroom failed: %v", err) + } + extra := types.ExtraCPUAdvices{ + CgroupPath: cra.conf.ReclaimRelativeRootCgroupPath, + Values: map[string]string{ + string(cpuadvisor.ControlKnobKeyCPUNUMAHeadroom): string(data), + }, + } + calculationResult.ExtraEntries = append(calculationResult.ExtraEntries, extra) + */ + 
return calculationResult, err } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_test.go index 2c489f047..2187030c4 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor_test.go @@ -1242,7 +1242,7 @@ func TestAdvisorUpdate(t *testing.T) { // check headroom if !reflect.DeepEqual(tt.wantHeadroom, resource.Quantity{}) { - headroom, err := advisor.GetHeadroom() + headroom, _, err := advisor.GetHeadroom() if tt.wantHeadroomErr { assert.Error(t, err) } else { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler.go index 90d5822e3..1daefe4d6 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler.go @@ -34,7 +34,7 @@ import ( // Advisor data elements are shared ONLY by assemblers as pointer to avoid rebuild in advisor, // and NOT supposed to be used by other components. type HeadroomAssembler interface { - GetHeadroom() (resource.Quantity, error) + GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) } type InitFunc func(conf *config.Configuration, extraConf interface{}, regionMap *map[string]region.QoSRegion, diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go index be0a50abc..7cf3ac8eb 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go @@ -27,10 +27,11 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" - "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper" + metricHelper "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" @@ -69,16 +70,17 @@ func NewHeadroomAssemblerCommon(conf *config.Configuration, _ interface{}, regio } } -func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) { +func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { dynamicConfig := ha.conf.GetDynamicConfiguration() // return zero when reclaim is disabled if !dynamicConfig.EnableReclaim { - return *resource.NewQuantity(0, resource.DecimalSI), nil + return *resource.NewQuantity(0, resource.DecimalSI), nil, nil } reserved := ha.conf.GetDynamicConfiguration().ReservedResourceForAllocate[v1.ResourceCPU] headroomTotal := 0.0 + headroomNuma := make(map[int]float64) emptyNUMAs := ha.metaServer.CPUDetails.NUMANodes() exclusiveNUMAs := 
machine.NewCPUSet() @@ -89,7 +91,7 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) { if r.Type() == configapi.QoSRegionTypeDedicatedNumaExclusive { regionInfo, ok := ha.metaReader.GetRegionInfo(r.Name()) if !ok || regionInfo == nil || regionInfo.Headroom < 0 { - return resource.Quantity{}, fmt.Errorf("failed to get headroom for %v", r.Name()) + return resource.Quantity{}, nil, fmt.Errorf("failed to get headroom for %v", r.Name()) } if regionInfo.RegionStatus.BoundType == types.BoundUpper && r.EnableReclaim() { general.Infof("region %v is in status of upper bound", regionInfo.RegionName) @@ -98,6 +100,15 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) { headroomTotal += regionInfo.Headroom exclusiveNUMAs = exclusiveNUMAs.Union(r.GetBindingNumas()) + // divide headroom evenly to each numa + bindingNUMAs := r.GetBindingNumas() + if regionInfo.Headroom > 0 && bindingNUMAs.Size() > 0 { + perNumaHeadroom := regionInfo.Headroom / float64(bindingNUMAs.Size()) + for _, numaID := range bindingNUMAs.ToSliceInt() { + headroomNuma[numaID] += perNumaHeadroom + } + } + klog.InfoS("dedicated_cores NUMA headroom", "headroom", regionInfo.Headroom, "NUMAs", r.GetBindingNumas().String()) } emptyNUMAs = emptyNUMAs.Difference(r.GetBindingNumas()) @@ -112,6 +123,7 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) { for _, numaID := range reclaimPoolNUMAs.Difference(exclusiveNUMAs).Difference(emptyNUMAs).ToSliceInt() { headroom := float64(reclaimPoolInfo.TopologyAwareAssignments[numaID].Size()) sharedCoresHeadroom += headroom + headroomNuma[numaID] += headroom klog.InfoS("shared_cores headroom", "headroom", headroom, "numaID", numaID) } @@ -126,29 +138,129 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, error) { reservedForReclaim, _ := (*ha.reservedForReclaim)[numaID] headroom := float64(available) - reservedForAllocate + float64(reservedForReclaim) headroomTotal += headroom + headroomNuma[numaID] += headroom klog.InfoS("empty NUMA headroom", "headroom", headroom) } if hasUpperBound { ha.backoffRetries++ + headroomTotalOrig := headroomTotal headroomTotal = general.MaxFloat64(0, headroomTotal-float64(ha.backoffRetries*ha.backoffStep)) + if headroomTotal == 0 { + headroomNuma = nil + } else { + // reduce numa headroom by percent + reduceRatio := headroomTotal / headroomTotalOrig + for numaID := range headroomNuma { + headroomNuma[numaID] *= reduceRatio + } + } } else { ha.backoffRetries = 0 } general.InfoS("[qosaware-cpu] headroom assembled", "headroomTotal", headroomTotal, "backoffRetries", ha.backoffRetries, "util based enabled", dynamicConfig.CPUUtilBasedConfiguration.Enable) + for numaID, headroom := range headroomNuma { + general.InfoS("[qosaware-cpu] NUMA headroom assembled", "NUMA-ID", numaID, "headroom", headroom) + } // if util based cpu headroom disable or reclaim pool not existed, just return total reclaim pool size as headroom if !dynamicConfig.CPUUtilBasedConfiguration.Enable || !reclaimPoolExist || reclaimPoolInfo == nil { - return *resource.NewQuantity(int64(headroomTotal), resource.DecimalSI), nil + headroomNumaRet := make(map[int]resource.Quantity) + for numaID, headroom := range headroomNuma { + headroomNumaRet[numaID] = *resource.NewMilliQuantity(int64(headroom*1000), resource.DecimalSI) + } + return *resource.NewQuantity(int64(headroomTotal), resource.DecimalSI), headroomNumaRet, nil } - reclaimMetrics, err := 
helper.GetReclaimMetrics(reclaimPoolInfo.TopologyAwareAssignments.MergeCPUSet(), ha.conf.ReclaimRelativeRootCgroupPath, ha.metaServer.MetricsFetcher) + return ha.getHeadroomByUtil() +} + +func (ha *HeadroomAssemblerCommon) getReclaimCgroupPathByNUMA(numaID int) string { + return fmt.Sprintf("%s-%d", ha.conf.ReclaimRelativeRootCgroupPath, numaID) +} + +func (ha *HeadroomAssemblerCommon) getReclaimCgroupPath() string { + return ha.conf.ReclaimRelativeRootCgroupPath +} + +func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[int]resource.Quantity, error) { + reclaimPoolInfo, reclaimPoolExist := ha.metaReader.GetPoolInfo(commonstate.PoolNameReclaim) + if !reclaimPoolExist || reclaimPoolInfo == nil { + return resource.Quantity{}, nil, fmt.Errorf("get headroom by util failed: reclaim pool not found") + } + + bindingNUMAs, nonBindingNumas, err := ha.getReclaimNUMABindingTopo(reclaimPoolInfo) if err != nil { - return resource.Quantity{}, err + return resource.Quantity{}, nil, err + } + + numaHeadroom := make(map[int]resource.Quantity, ha.metaServer.NumNUMANodes) + totalHeadroom := resource.Quantity{} + dynamicConfig := ha.conf.GetDynamicConfiguration() + options := helper.UtilBasedCapacityOptions{ + TargetUtilization: dynamicConfig.TargetReclaimedCoreUtilization, + MaxUtilization: dynamicConfig.MaxReclaimedCoreUtilization, + MaxOversoldRate: dynamicConfig.MaxOversoldRate, + MaxCapacity: dynamicConfig.MaxHeadroomCapacityRate * float64(ha.metaServer.MachineInfo.NumCores/ha.metaServer.NumNUMANodes), + } + + // get headroom per NUMA + for _, numaID := range bindingNUMAs { + cpuSet, ok := reclaimPoolInfo.TopologyAwareAssignments[numaID] + if !ok { + return resource.Quantity{}, nil, fmt.Errorf("reclaim pool NOT found TopologyAwareAssignments with numaID: %v", numaID) + } + + reclaimMetrics, err := metricHelper.GetReclaimMetrics(cpuSet, ha.getReclaimCgroupPathByNUMA(numaID), ha.metaServer.MetricsFetcher) + if err != nil { + return resource.Quantity{}, nil, fmt.Errorf("get reclaim Metrics failed with numa %d: %v", numaID, err) + } + + headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics) + if err != nil { + return resource.Quantity{}, nil, fmt.Errorf("get util-based headroom failed with numa %d: %v", numaID, err) + } + + numaHeadroom[numaID] = headroom + totalHeadroom.Add(headroom) } - return ha.getUtilBasedHeadroom(dynamicConfig, reclaimMetrics) + // get global reclaim headroom + if len(nonBindingNumas) > 0 { + cpusets := machine.NewCPUSet() + for _, numaID := range nonBindingNumas { + cpuSet, ok := reclaimPoolInfo.TopologyAwareAssignments[numaID] + if !ok { + return resource.Quantity{}, nil, fmt.Errorf("reclaim pool NOT found TopologyAwareAssignments with numaID: %v", numaID) + } + + cpusets = cpusets.Union(cpuSet) + } + + reclaimMetrics, err := metricHelper.GetReclaimMetrics(cpusets, ha.getReclaimCgroupPath(), ha.metaServer.MetricsFetcher) + if err != nil { + return resource.Quantity{}, nil, fmt.Errorf("get reclaim Metrics failed: %v", err) + } + + options.MaxCapacity *= float64(len(nonBindingNumas)) + headroom, err := ha.getUtilBasedHeadroom(options, reclaimMetrics) + if err != nil { + return resource.Quantity{}, nil, fmt.Errorf("get util-based headroom failed: %v", err) + } + + totalHeadroom.Add(headroom) + headroomPerNUMA := float64(headroom.Value()) / float64(len(nonBindingNumas)) + for _, numaID := range nonBindingNumas { + numaHeadroom[numaID] = *resource.NewQuantity(int64(headroomPerNUMA), resource.DecimalSI) + } + } + + general.InfoS("[qosaware-cpu] headroom 
by utilization", "total", totalHeadroom.Value()) + for numaID, headroom := range numaHeadroom { + general.InfoS("[qosaware-cpu] NUMA headroom by utilization", "NUMA-ID", numaID, "headroom", headroom.Value()) + } + return totalHeadroom, numaHeadroom, nil } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_test.go index 1360a9017..8d8e15d19 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_test.go @@ -79,6 +79,14 @@ func generateTestMetaServer(t *testing.T, cnr *v1alpha1.CustomNodeResource, podL KatalystMachineInfo: &machine.KatalystMachineInfo{ MachineInfo: &info.MachineInfo{ NumCores: 96, + Topology: []info.Node{ + { + Id: 0, + }, + { + Id: 1, + }, + }, }, CPUTopology: cpuTopology, }, @@ -409,7 +417,8 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) { err := cache.SetPoolInfo(commonstate.PoolNameReclaim, &types.PoolInfo{ PoolName: commonstate.PoolNameReclaim, TopologyAwareAssignments: map[int]machine.CPUSet{ - 0: machine.MustParse("0-85"), + 0: machine.MustParse("0-43"), + 1: machine.MustParse("49-85"), }, }) require.NoError(t, err) @@ -513,7 +522,7 @@ func TestHeadroomAssemblerCommon_GetHeadroom(t *testing.T) { store := metricsFetcher.(*metric.FakeMetricsFetcher) tt.fields.setFakeMetric(store) - got, err := ha.GetHeadroom() + got, _, err := ha.GetHeadroom() if (err != nil) != tt.wantErr { t.Errorf("GetHeadroom() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go index e54e5f943..10c2b0ceb 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go @@ -20,18 +20,19 @@ import ( "context" "fmt" "math" + "strconv" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/klog/v2" "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" - "github.com/kubewharf/katalyst-core/pkg/config/agent/dynamic" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" metaserverHelper "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper" "github.com/kubewharf/katalyst-core/pkg/util/general" ) -func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(dynamicConfig *dynamic.Configuration, +func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(options helper.UtilBasedCapacityOptions, reclaimMetrics *metaserverHelper.ReclaimMetrics, ) (resource.Quantity, error) { lastReclaimedCPU, err := ha.getLastReclaimedCPU() @@ -53,12 +54,7 @@ func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(dynamicConfig *dynamic.C "lastReclaimedCPU", lastReclaimedCPU) headroom, err := helper.EstimateUtilBasedCapacity( - helper.UtilBasedCapacityOptions{ - TargetUtilization: dynamicConfig.TargetReclaimedCoreUtilization, - MaxUtilization: dynamicConfig.MaxReclaimedCoreUtilization, - MaxOversoldRate: dynamicConfig.MaxOversoldRate, - MaxCapacity: dynamicConfig.MaxHeadroomCapacityRate * 
float64(ha.metaServer.MachineInfo.NumCores), - }, + options, reclaimMetrics.ReclaimedCoresSupply, util, lastReclaimedCPU, @@ -85,3 +81,51 @@ func (ha *HeadroomAssemblerCommon) getLastReclaimedCPU() (float64, error) { klog.Errorf("cnr status resource allocatable reclaimed milli cpu not found") return 0, nil } + +func (ha *HeadroomAssemblerCommon) getReclaimNUMABindingTopo(reclaimPool *types.PoolInfo) (bindingNUMAs, nonBindingNumas []int, err error) { + if ha.metaServer == nil || ha.metaServer.MachineInfo == nil || len(ha.metaServer.MachineInfo.Topology) == 0 { + err = fmt.Errorf("invalid machaine topo") + return + } + + numaMap := make(map[int]bool) + + for numaID := range reclaimPool.TopologyAwareAssignments { + numaMap[numaID] = false + } + + f := func(podUID string, containerName string, ci *types.ContainerInfo) bool { + if ci == nil { + return true + } + + if ci.QoSLevel != consts.PodAnnotationQoSLevelReclaimedCores { + return true + } + + numaRet, ok := ci.Annotations[consts.PodAnnotationNUMABindResultKey] + if !ok || numaRet == "-1" { + return true + } + + numaID, err := strconv.Atoi(numaRet) + if err != nil { + klog.Errorf("invalid numa binding result: %s, %v\n", numaRet, err) + return true + } + + numaMap[numaID] = true + return true + } + ha.metaReader.RangeContainer(f) + + for numaID, bound := range numaMap { + if bound { + bindingNUMAs = append(bindingNUMAs, numaID) + } else { + nonBindingNumas = append(nonBindingNumas, numaID) + } + } + + return +} diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go index 1f03f7c7b..b8388332c 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go @@ -148,22 +148,22 @@ func (ra *memoryResourceAdvisor) GetChannels() (interface{}, interface{}) { return ra.recvCh, ra.sendChan } -func (ra *memoryResourceAdvisor) GetHeadroom() (resource.Quantity, error) { +func (ra *memoryResourceAdvisor) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { ra.mutex.RLock() defer ra.mutex.RUnlock() for _, headroomPolicy := range ra.headroomPolices { - headroom, err := headroomPolicy.GetHeadroom() + headroom, numaHeadroom, err := headroomPolicy.GetHeadroom() if err != nil { klog.ErrorS(err, "get headroom failed", "headroomPolicy", headroomPolicy.Name()) _ = ra.emitter.StoreInt64(metricNameMemoryGetHeadroomFailed, 1, metrics.MetricTypeNameRaw, metrics.MetricTag{Key: metricTagKeyPolicyName, Val: string(headroomPolicy.Name())}) continue } - return headroom, nil + return headroom, numaHeadroom, nil } - return resource.Quantity{}, fmt.Errorf("failed to get valid headroom") + return resource.Quantity{}, nil, fmt.Errorf("failed to get valid headroom") } func (ra *memoryResourceAdvisor) sendAdvices() error { @@ -175,6 +175,46 @@ func (ra *memoryResourceAdvisor) sendAdvices() error { result.ExtraEntries = append(result.ExtraEntries, advices.ExtraEntries...) 
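// Editor's note (illustrative sketch, not part of this patch): in getHeadroomByUtil
// earlier in this change, each NUMA node bound by reclaimed-cores pods is measured
// against its own per-NUMA reclaim cgroup (the reclaim root path suffixed with "-<numaID>"),
// while all non-binding nodes share a single util-based estimate whose MaxCapacity is
// scaled by the number of those nodes and whose result is then divided evenly among them.
// The helper below mirrors that even split; it assumes only
// "k8s.io/apimachinery/pkg/api/resource", and the name is hypothetical.

// splitEvenly spreads a total headroom quantity evenly across the given NUMA IDs,
// matching the headroomPerNUMA calculation in getHeadroomByUtil.
func splitEvenly(total resource.Quantity, numaIDs []int) map[int]resource.Quantity {
	out := make(map[int]resource.Quantity, len(numaIDs))
	if len(numaIDs) == 0 {
		return out
	}
	perNUMA := float64(total.Value()) / float64(len(numaIDs))
	for _, id := range numaIDs {
		// Each non-binding NUMA node receives an equal share of the shared headroom.
		out[id] = *resource.NewQuantity(int64(perNUMA), resource.DecimalSI)
	}
	return out
}

// Example: splitting a 6-CPU shared headroom across non-binding NUMA nodes 2 and 3
// yields 3 CPUs of headroom on each.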
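// Editor's note: the commented-out block just below sketches how the per-NUMA memory
// headroom could additionally be reported to the QRM plugin as an extra advice entry,
// keyed by the memory NUMA headroom control knob under the reclaim root cgroup path.
// A minimal illustration of the payload it would produce (hypothetical values, in bytes;
// assumes only the standard library "encoding/json"):
//
//	headroom := map[int]float64{0: 2 << 30, 1: 1 << 30}
//	data, _ := json.Marshal(headroom)
//	// data is roughly {"0":2147483648,"1":1073741824}; integer map keys marshal as strings.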
} + /* + var numaHeadroom map[int]resource.Quantity + var err error + for _, headroomPolicy := range ra.headroomPolices { + _, numaHeadroom, err = headroomPolicy.GetHeadroom() + if err != nil { + klog.ErrorS(err, "get headroom failed", "headroomPolicy", headroomPolicy.Name()) + _ = ra.emitter.StoreInt64(metricNameMemoryGetHeadroomFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: metricTagKeyPolicyName, Val: string(headroomPolicy.Name())}) + continue + } + + break + } + + if numaHeadroom == nil { + klog.Errorf("can NOT get numa headroom") + return fmt.Errorf("can NOT get numa headroom") + } + + headroom := make(map[int]float64) + for numaID, res := range numaHeadroom { + headroom[numaID] = float64(res.Value()) + } + + data, err := json.Marshal(headroom) + if err != nil { + klog.Errorf("marshal numa headroom failed: %s", err) + return fmt.Errorf("marshal numa headroom failed: %s", err) + + } + extra := types.ExtraMemoryAdvices{ + CgroupPath: ra.conf.ReclaimRelativeRootCgroupPath, + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryNUMAHeadroom): string(data), + }, + } + result.ExtraEntries = append(result.ExtraEntries, extra) + */ + select { case ra.sendChan <- result: general.Infof("notify memory server: %+v", result) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go index 46952a0f6..064826e57 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor_test.go @@ -321,6 +321,26 @@ var defaultNumaMetrics = []numaMetric{ metricName: coreconsts.MetricMemTotalNuma, metricValue: metricutil.MetricData{Value: 120 << 30}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, } var dropCacheNodeMetrics = []nodeMetric{ @@ -379,6 +399,26 @@ var dropCacheNUMAMetrics = []numaMetric{ metricName: coreconsts.MetricMemTotalNuma, metricValue: metricutil.MetricData{Value: 120 << 30}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, } var cgroupMetrics = []cgroupMetric{ @@ -624,6 +664,122 @@ func TestUpdate(t *testing.T) { containerName: "c3", }, }, + containerNUMAMetrics: []containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: 
coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, + }, + wantAdviceResult: types.InternalMemoryCalculationResult{ ContainerEntries: []types.ContainerMemoryAdvices{ { @@ -719,6 +875,62 @@ func TestUpdate(t *testing.T) { containerName: "c3", numaID: 1, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: 
coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, }, wantAdviceResult: types.InternalMemoryCalculationResult{ ContainerEntries: []types.ContainerMemoryAdvices{ @@ -1180,6 +1392,66 @@ func TestUpdate(t *testing.T) { containerName: "c4", }, }, + containerNUMAMetrics: []containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 3, + }, + + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, + }, + cgroupMetrics: []cgroupMetric{ { metricName: coreconsts.MetricMemPsiAvg60Cgroup, @@ -1311,78 +1583,178 @@ func TestUpdate(t *testing.T) { 0: machine.MustParse("1"), }, 200<<30), }, - plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemsetBinder}, - nodeMetrics: defaultNodeMetrics, - numaMetrics: defaultNumaMetrics, - wantHeadroom: *resource.NewQuantity(871<<30, resource.DecimalSI), - wantAdviceResult: types.InternalMemoryCalculationResult{ - ContainerEntries: []types.ContainerMemoryAdvices{ - { - PodUID: "uid1", - ContainerName: "c1", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, - }, - { - PodUID: "uid2", - ContainerName: "c2", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, - }, - { - PodUID: "uid3", - ContainerName: "c3", - Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, - }, - }, - }, - }, - { - name: "bind memset(numa1 pressure)", - pools: map[string]*types.PoolInfo{ - commonstate.PoolNameReserve: { - PoolName: commonstate.PoolNameReserve, - TopologyAwareAssignments: map[int]machine.CPUSet{ - 0: machine.MustParse("0"), - 1: machine.MustParse("24"), - }, - OriginalTopologyAwareAssignments: map[int]machine.CPUSet{ - 0: machine.MustParse("0"), - 1: machine.MustParse("24"), - }, - }, - }, - reclaimedEnable: false, - needRecvAdvices: true, - 
containers: []*types.ContainerInfo{ - makeContainerInfo("uid1", "default", "pod1", "c1", consts.PodAnnotationQoSLevelReclaimedCores, nil, - map[int]machine.CPUSet{ - 0: machine.MustParse("1"), - 1: machine.MustParse("25"), - }, 200<<30), - makeContainerInfo("uid2", "default", "pod2", "c2", consts.PodAnnotationQoSLevelReclaimedCores, nil, - map[int]machine.CPUSet{ - 0: machine.MustParse("1"), - 1: machine.MustParse("25"), - }, 200<<30), - makeContainerInfo("uid3", "default", "pod3", "c3", consts.PodAnnotationQoSLevelReclaimedCores, nil, - map[int]machine.CPUSet{ - 0: machine.MustParse("1"), - }, 200<<30), - makeContainerInfo("uid4", "default", "pod4", "c4", consts.PodAnnotationQoSLevelDedicatedCores, map[string]string{ - consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, - consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, - }, - map[int]machine.CPUSet{ - 0: machine.MustParse("1"), - }, 200<<30), - }, plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemsetBinder}, nodeMetrics: defaultNodeMetrics, - numaMetrics: []numaMetric{ - { - numaID: 0, - metricName: coreconsts.MetricMemFreeNuma, - metricValue: metricutil.MetricData{Value: 60 << 30}, - }, + numaMetrics: defaultNumaMetrics, + containerNUMAMetrics: []containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 10 << 20}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 9 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 2 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 10 << 20}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 9 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemFilePerNumaContainer, + metricValue: metricutil.MetricData{Value: 2 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: 
metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + }, + wantHeadroom: *resource.NewQuantity(871<<30, resource.DecimalSI), + wantAdviceResult: types.InternalMemoryCalculationResult{ + ContainerEntries: []types.ContainerMemoryAdvices{ + { + PodUID: "uid1", + ContainerName: "c1", + Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, + }, + { + PodUID: "uid2", + ContainerName: "c2", + Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, + }, + { + PodUID: "uid3", + ContainerName: "c3", + Values: map[string]string{string(memoryadvisor.ControlKnobKeyCPUSetMems): "1-3"}, + }, + }, + }, + }, + { + name: "bind memset(numa1 pressure)", + pools: map[string]*types.PoolInfo{ + commonstate.PoolNameReserve: { + PoolName: commonstate.PoolNameReserve, + TopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + OriginalTopologyAwareAssignments: map[int]machine.CPUSet{ + 0: machine.MustParse("0"), + 1: machine.MustParse("24"), + }, + }, + }, + reclaimedEnable: false, + needRecvAdvices: true, + containers: []*types.ContainerInfo{ + makeContainerInfo("uid1", "default", "pod1", "c1", consts.PodAnnotationQoSLevelReclaimedCores, nil, + map[int]machine.CPUSet{ + 0: machine.MustParse("1"), + 1: machine.MustParse("25"), + }, 200<<30), + makeContainerInfo("uid2", "default", "pod2", "c2", consts.PodAnnotationQoSLevelReclaimedCores, nil, + map[int]machine.CPUSet{ + 0: machine.MustParse("1"), + 1: machine.MustParse("25"), + }, 200<<30), + makeContainerInfo("uid3", "default", "pod3", "c3", consts.PodAnnotationQoSLevelReclaimedCores, nil, + map[int]machine.CPUSet{ + 0: machine.MustParse("1"), + }, 200<<30), + makeContainerInfo("uid4", "default", "pod4", "c4", consts.PodAnnotationQoSLevelDedicatedCores, map[string]string{ + consts.PodAnnotationMemoryEnhancementNumaBinding: consts.PodAnnotationMemoryEnhancementNumaBindingEnable, + consts.PodAnnotationMemoryEnhancementNumaExclusive: consts.PodAnnotationMemoryEnhancementNumaExclusiveEnable, + }, + map[int]machine.CPUSet{ + 0: machine.MustParse("1"), + }, 200<<30), + }, + plugins: []types.MemoryAdvisorPluginName{memadvisorplugin.MemsetBinder}, + nodeMetrics: defaultNodeMetrics, + numaMetrics: []numaMetric{ + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 60 << 30}, + }, { numaID: 1, metricName: coreconsts.MetricMemFreeNuma, @@ -1418,7 +1790,143 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemTotalNuma, metricValue: metricutil.MetricData{Value: 120 << 30}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, }, + containerNUMAMetrics: []containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: 
"c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, + }, + wantHeadroom: *resource.NewQuantity(871<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ ContainerEntries: []types.ContainerMemoryAdvices{ @@ -1523,6 +2031,141 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemTotalNuma, metricValue: metricutil.MetricData{Value: 120 << 30}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + }, + containerNUMAMetrics: 
[]containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, }, wantHeadroom: *resource.NewQuantity(871<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -1624,43 +2267,103 @@ func TestUpdate(t *testing.T) { numaMetrics: []numaMetric{ { numaID: 0, - metricName: coreconsts.MetricMemLatencyReadNuma, - metricValue: metricutil.MetricData{Value: 110}, + metricName: coreconsts.MetricMemLatencyReadNuma, + metricValue: metricutil.MetricData{Value: 110}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemLatencyReadNuma, + metricValue: metricutil.MetricData{Value: 50}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemLatencyReadNuma, + metricValue: metricutil.MetricData{Value: 20}, + }, + { + numaID: 3, + metricName: 
coreconsts.MetricMemLatencyReadNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemBandwidthNuma, + metricValue: metricutil.MetricData{Value: 1000}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemBandwidthNuma, + metricValue: metricutil.MetricData{Value: 300}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemBandwidthNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemBandwidthNuma, + metricValue: metricutil.MetricData{Value: 100}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10 << 30}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200 << 30}, }, { numaID: 1, - metricName: coreconsts.MetricMemLatencyReadNuma, - metricValue: metricutil.MetricData{Value: 50}, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200 << 30}, }, { numaID: 2, - metricName: coreconsts.MetricMemLatencyReadNuma, - metricValue: metricutil.MetricData{Value: 20}, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200 << 30}, }, { numaID: 3, - metricName: coreconsts.MetricMemLatencyReadNuma, - metricValue: metricutil.MetricData{Value: 10}, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200 << 30}, }, { numaID: 0, - metricName: coreconsts.MetricMemBandwidthNuma, - metricValue: metricutil.MetricData{Value: 1000}, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5 << 30}, }, { numaID: 1, - metricName: coreconsts.MetricMemBandwidthNuma, - metricValue: metricutil.MetricData{Value: 300}, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5 << 30}, }, { numaID: 2, - metricName: coreconsts.MetricMemBandwidthNuma, - metricValue: metricutil.MetricData{Value: 200}, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5 << 30}, }, { numaID: 3, - metricName: coreconsts.MetricMemBandwidthNuma, - metricValue: metricutil.MetricData{Value: 100}, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5 << 30}, }, }, containerNUMAMetrics: []containerNUMAMetric{ @@ -1692,6 +2395,76 @@ func TestUpdate(t *testing.T) { containerName: "c4", numaID: 0, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 1, 
+ }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -1821,6 +2594,66 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemBandwidthNuma, metricValue: metricutil.MetricData{Value: 100}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, }, containerNUMAMetrics: []containerNUMAMetric{ { @@ -1851,6 +2684,76 @@ func TestUpdate(t *testing.T) { containerName: "c4", numaID: 0, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: 
"uid3", + containerName: "c3", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -1947,32 +2850,120 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemBandwidthNuma, metricValue: metricutil.MetricData{Value: 100}, }, - }, - containerNUMAMetrics: []containerNUMAMetric{ { - metricName: coreconsts.MetricsMemAnonPerNumaContainer, - metricValue: metricutil.MetricData{Value: 2 << 30}, + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + }, + containerNUMAMetrics: []containerNUMAMetric{ + { + metricName: coreconsts.MetricsMemAnonPerNumaContainer, + metricValue: metricutil.MetricData{Value: 2 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemAnonPerNumaContainer, + metricValue: metricutil.MetricData{Value: 1 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemAnonPerNumaContainer, + metricValue: metricutil.MetricData{Value: 2 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: 
coreconsts.MetricsMemAnonPerNumaContainer, + metricValue: metricutil.MetricData{Value: 512 << 20}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, podUID: "uid1", containerName: "c1", numaID: 0, }, { - metricName: coreconsts.MetricsMemAnonPerNumaContainer, - metricValue: metricutil.MetricData{Value: 1 << 30}, + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, podUID: "uid2", containerName: "c2", numaID: 0, }, { - metricName: coreconsts.MetricsMemAnonPerNumaContainer, - metricValue: metricutil.MetricData{Value: 2 << 30}, + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, podUID: "uid3", containerName: "c3", numaID: 0, }, { - metricName: coreconsts.MetricsMemAnonPerNumaContainer, - metricValue: metricutil.MetricData{Value: 512 << 20}, + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, podUID: "uid4", containerName: "c4", numaID: 0, @@ -2106,6 +3097,66 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemBandwidthNuma, metricValue: metricutil.MetricData{Value: 100}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, }, containerNUMAMetrics: []containerNUMAMetric{ { @@ -2136,6 +3187,76 @@ func TestUpdate(t *testing.T) { containerName: "c4", numaID: 0, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + 
numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 3, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 1, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 2, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 3, + }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -2240,6 +3361,66 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemBandwidthNuma, metricValue: metricutil.MetricData{Value: 300}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, }, containerNUMAMetrics: []containerNUMAMetric{ { @@ -2270,6 +3451,34 @@ func TestUpdate(t *testing.T) { containerName: "c4", numaID: 2, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + 
podUID: "uid4", + containerName: "c4", + numaID: 0, + }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -2372,6 +3581,66 @@ func TestUpdate(t *testing.T) { metricName: coreconsts.MetricMemBandwidthNuma, metricValue: metricutil.MetricData{Value: 100}, }, + { + numaID: 0, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemFreeNuma, + metricValue: metricutil.MetricData{Value: 10}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemTotalNuma, + metricValue: metricutil.MetricData{Value: 200}, + }, + { + numaID: 0, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 1, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 2, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, + { + numaID: 3, + metricName: coreconsts.MetricMemInactiveFileNuma, + metricValue: metricutil.MetricData{Value: 5}, + }, }, containerNUMAMetrics: []containerNUMAMetric{ { @@ -2402,6 +3671,34 @@ func TestUpdate(t *testing.T) { containerName: "c4", numaID: 2, }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid1", + containerName: "c1", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid2", + containerName: "c2", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid3", + containerName: "c3", + numaID: 0, + }, + { + metricName: coreconsts.MetricsMemTotalPerNumaContainer, + metricValue: metricutil.MetricData{Value: 50 << 30}, + podUID: "uid4", + containerName: "c4", + numaID: 0, + }, }, wantHeadroom: *resource.NewQuantity(980<<30, resource.DecimalSI), wantAdviceResult: types.InternalMemoryCalculationResult{ @@ -2498,7 +3795,7 @@ func TestUpdate(t *testing.T) { assert.ElementsMatch(t, tt.wantAdviceResult.ExtraEntries, result.ExtraEntries) assert.ElementsMatch(t, tt.wantAdviceResult.ContainerEntries, result.ContainerEntries) } - headroom, err := advisor.GetHeadroom() + headroom, _, err := advisor.GetHeadroom() if reflect.DeepEqual(tt.wantHeadroom, resource.Quantity{}) { assert.Error(t, err) diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy.go index 1ac8c8bf7..6b47b8f60 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy.go @@ -43,7 +43,7 @@ type HeadroomPolicy interface { Update() error // 
GetHeadroom returns the latest headroom estimation - GetHeadroom() (resource.Quantity, error) + GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) } type InitFunc func(conf *config.Configuration, extraConfig interface{}, metaReader metacache.MetaReader, diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go index 81f627e43..5abd5a8a1 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical.go @@ -30,6 +30,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource/helper" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" + "github.com/kubewharf/katalyst-core/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/metaserver" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" @@ -40,8 +41,9 @@ type PolicyCanonical struct { *PolicyBase // memoryHeadroom is valid to be used iff updateStatus successes - memoryHeadroom float64 - updateStatus types.PolicyUpdateStatus + memoryHeadroom float64 + numaMemoryHeadroom map[int]resource.Quantity + updateStatus types.PolicyUpdateStatus conf *config.Configuration } @@ -50,9 +52,10 @@ func NewPolicyCanonical(conf *config.Configuration, _ interface{}, metaReader me metaServer *metaserver.MetaServer, _ metrics.MetricEmitter, ) HeadroomPolicy { p := PolicyCanonical{ - PolicyBase: NewPolicyBase(metaReader, metaServer), - updateStatus: types.PolicyUpdateFailed, - conf: conf, + PolicyBase: NewPolicyBase(metaReader, metaServer), + numaMemoryHeadroom: make(map[int]resource.Quantity), + updateStatus: types.PolicyUpdateFailed, + conf: conf, } return &p @@ -159,6 +162,52 @@ func (p *PolicyCanonical) Update() (err error) { p.memoryHeadroom = math.Max(memoryHeadroomWithoutBuffer+utilBasedBuffer, 0) p.memoryHeadroom = math.Min(p.memoryHeadroom, maxAllocatableMemory) + availNUMAs, reclaimedCoresContainers, err := helper.GetAvailableNUMAsAndReclaimedCores(p.conf, p.metaReader, p.metaServer) + if err != nil { + return err + } + + numaReclaimableMemory := make(map[int]float64) + numaReclaimableMemorySum := 0.0 + for _, numaID := range availNUMAs.ToSliceInt() { + data, err := p.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma) + if err != nil { + general.Errorf("Can not get numa memory free, numaID: %v", numaID) + return err + } + free := data.Value + + data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemInactiveFileNuma) + if err != nil { + return err + } + inactiveFile := data.Value + + numaReclaimable := free + inactiveFile*dynamicConfig.CacheBasedRatio + numaReclaimableMemory[numaID] = numaReclaimable + numaReclaimableMemorySum += numaReclaimable + } + + for _, container := range reclaimedCoresContainers { + for numaID := range container.TopologyAwareAssignments { + data, err := p.metaServer.GetContainerNumaMetric(container.PodUID, container.ContainerName, numaID, consts.MetricsMemTotalPerNumaContainer) + if err != nil { + general.ErrorS(err, "Can not get container numa memory total", "numaID", numaID, "uid", container.PodUID, "name", container.ContainerName) + return err + } + numaReclaimableMemory[numaID] += data.Value + numaReclaimableMemorySum += data.Value + } + } + + ratio := p.memoryHeadroom / numaReclaimableMemorySum 
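// Editor's note (illustrative, not part of this patch): the total canonical headroom
// computed above is redistributed across NUMA nodes in proportion to each node's
// reclaimable memory (free memory, cache-ratio-weighted inactive file pages, and the
// per-NUMA memory usage of reclaimed-cores containers). For example, with a 30Gi
// headroom and reclaimable memory of 20Gi on NUMA 0 and 40Gi on NUMA 1,
// ratio = 30/60 = 0.5, so NUMA 0 is assigned 10Gi and NUMA 1 is assigned 20Gi.
// The sketch below mirrors the loop that follows, with an extra guard for a zero sum;
// the helper name is hypothetical.

// scaleToNUMAHeadroom proportionally scales per-NUMA reclaimable bytes down to the
// node-level headroom budget.
func scaleToNUMAHeadroom(totalHeadroom float64, reclaimable map[int]float64) map[int]int64 {
	sum := 0.0
	for _, v := range reclaimable {
		sum += v
	}
	out := make(map[int]int64, len(reclaimable))
	if sum <= 0 {
		// Avoid dividing by zero when nothing is reclaimable.
		return out
	}
	for id, v := range reclaimable {
		out[id] = int64(v * totalHeadroom / sum)
	}
	return out
}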
+ numaHeadroom := make(map[int]resource.Quantity) + for numaID := range numaReclaimableMemory { + numaHeadroom[numaID] = *resource.NewQuantity(int64(numaReclaimableMemory[numaID]*ratio), resource.BinarySI) + general.InfoS("memory headroom per NUMA", "NUMA-ID", numaID, "headroom", int64(numaReclaimableMemory[numaID]*ratio)) + } + p.numaMemoryHeadroom = numaHeadroom + general.InfoS("memory details", "without buffer memory headroom", general.FormatMemoryQuantity(memoryHeadroomWithoutBuffer), "final memory headroom", general.FormatMemoryQuantity(p.memoryHeadroom), @@ -169,10 +218,10 @@ func (p *PolicyCanonical) Update() (err error) { return nil } -func (p *PolicyCanonical) GetHeadroom() (resource.Quantity, error) { +func (p *PolicyCanonical) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { if p.updateStatus != types.PolicyUpdateSucceeded { - return resource.Quantity{}, fmt.Errorf("last update failed") + return resource.Quantity{}, nil, fmt.Errorf("last update failed") } - return *resource.NewQuantity(int64(p.memoryHeadroom), resource.BinarySI), nil + return *resource.NewQuantity(int64(p.memoryHeadroom), resource.BinarySI), p.numaMemoryHeadroom, nil } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical_test.go index 878558edc..ab055d77e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_canonical_test.go @@ -187,6 +187,11 @@ func TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { ReservedForAllocate: 4 << 30, }, setFakeMetric: func(store *metric.FakeMetricsFetcher) { + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 5 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemRssContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemCacheContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) @@ -266,6 +271,11 @@ func TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { store.SetNodeMetric(pkgconsts.MetricMemScaleFactorSystem, utilmetric.MetricData{Value: 500, Time: &now}) store.SetNodeMetric(pkgconsts.MetricMemUsedSystem, utilmetric.MetricData{Value: 40 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemRssContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemCacheContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) @@ -345,6 +355,11 @@ func 
TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { store.SetNodeMetric(pkgconsts.MetricMemScaleFactorSystem, utilmetric.MetricData{Value: 500, Time: &now}) store.SetNodeMetric(pkgconsts.MetricMemUsedSystem, utilmetric.MetricData{Value: 40 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemRssContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemCacheContainer, utilmetric.MetricData{Value: 10 << 30, Time: &now}) @@ -428,6 +443,11 @@ func TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { store.SetNodeMetric(pkgconsts.MetricMemScaleFactorSystem, utilmetric.MetricData{Value: 500, Time: &now}) store.SetNodeMetric(pkgconsts.MetricMemUsedSystem, utilmetric.MetricData{Value: 60 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemRssContainer, utilmetric.MetricData{Value: 15 << 30, Time: &now}) store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemCacheContainer, utilmetric.MetricData{Value: 15 << 30, Time: &now}) @@ -491,6 +511,11 @@ func TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { store.SetNodeMetric(pkgconsts.MetricMemScaleFactorSystem, utilmetric.MetricData{Value: 500, Time: &now}) store.SetNodeMetric(pkgconsts.MetricMemUsedSystem, utilmetric.MetricData{Value: 60 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemFreeNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(0, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 2 << 30, Time: &now}) + store.SetNumaMetric(1, pkgconsts.MetricMemInactiveFileNuma, utilmetric.MetricData{Value: 8 << 30, Time: &now}) + store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemRssContainer, utilmetric.MetricData{Value: 15 << 30, Time: &now}) store.SetContainerMetric("pod1", "container1", pkgconsts.MetricMemCacheContainer, utilmetric.MetricData{Value: 15 << 30, Time: &now}) }, @@ -535,7 +560,7 @@ func TestPolicyCanonical_calculateMemoryBuffer(t *testing.T) { err = p.Update() require.NoError(t, err) - got, err := p.GetHeadroom() + got, _, err := p.GetHeadroom() if (err != nil) != tt.wantErr { t.Errorf("calculateUtilBasedBuffer() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go index 0d4a07cab..c1738039f 100644 --- 
a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go @@ -38,8 +38,9 @@ type PolicyNUMAAware struct { *PolicyBase // memoryHeadroom is valid to be used iff updateStatus successes - memoryHeadroom float64 - updateStatus types.PolicyUpdateStatus + memoryHeadroom float64 + numaMemoryHeadroom map[int]resource.Quantity + updateStatus types.PolicyUpdateStatus conf *config.Configuration } @@ -48,9 +49,10 @@ func NewPolicyNUMAAware(conf *config.Configuration, _ interface{}, metaReader me metaServer *metaserver.MetaServer, _ metrics.MetricEmitter, ) HeadroomPolicy { p := PolicyNUMAAware{ - PolicyBase: NewPolicyBase(metaReader, metaServer), - updateStatus: types.PolicyUpdateFailed, - conf: conf, + PolicyBase: NewPolicyBase(metaReader, metaServer), + numaMemoryHeadroom: make(map[int]resource.Quantity), + updateStatus: types.PolicyUpdateFailed, + conf: conf, } return &p @@ -74,10 +76,11 @@ func (p *PolicyNUMAAware) Update() (err error) { }() var ( - reclaimableMemory float64 = 0 - availNUMATotal float64 = 0 - reservedForAllocate float64 = 0 - data metric.MetricData + reclaimableMemory float64 = 0 + numaReclaimableMemory map[int]float64 + availNUMATotal float64 = 0 + reservedForAllocate float64 = 0 + data metric.MetricData ) dynamicConfig := p.conf.GetDynamicConfiguration() @@ -86,6 +89,7 @@ func (p *PolicyNUMAAware) Update() (err error) { return err } + numaReclaimableMemory = make(map[int]float64) for _, numaID := range availNUMAs.ToSliceInt() { data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma) if err != nil { @@ -118,10 +122,17 @@ func (p *PolicyNUMAAware) Update() (err error) { ) reclaimableMemory += numaReclaimable + numaReclaimableMemory[numaID] = numaReclaimable } for _, container := range reclaimedCoresContainers { reclaimableMemory += container.MemoryRequest + if container.MemoryRequest > 0 && len(container.TopologyAwareAssignments) > 0 { + reclaimableMemoryPerNuma := container.MemoryRequest / float64(len(container.TopologyAwareAssignments)) + for numaID := range container.TopologyAwareAssignments { + numaReclaimableMemory[numaID] += reclaimableMemoryPerNuma + } + } } watermarkScaleFactor, err := p.metaServer.GetNodeMetric(consts.MetricMemScaleFactorSystem) @@ -133,20 +144,34 @@ func (p *PolicyNUMAAware) Update() (err error) { // reserve memory for watermark_scale_factor to make kswapd less happened systemWatermarkReserved := availNUMATotal * watermarkScaleFactor.Value / 10000 + p.memoryHeadroom = math.Max(reclaimableMemory-systemWatermarkReserved-reservedForAllocate, 0) + reduceRatio := 0.0 + if reclaimableMemory > 0 { + reduceRatio = p.memoryHeadroom / reclaimableMemory + } + numaHeadroom := 0.0 + for numaID := range numaReclaimableMemory { + numaReclaimableMemory[numaID] *= reduceRatio + numaHeadroom += numaReclaimableMemory[numaID] + p.numaMemoryHeadroom[numaID] = *resource.NewQuantity(int64(numaReclaimableMemory[numaID]), resource.BinarySI) + general.InfoS("memory reclaimable per NUMA", "NUMA-ID", numaID, "headroom", numaReclaimableMemory[numaID]) + } + general.InfoS("total memory reclaimable", "reclaimableMemory", general.FormatMemoryQuantity(reclaimableMemory), "ResourceUpperBound", general.FormatMemoryQuantity(p.essentials.ResourceUpperBound), "systemWatermarkReserved", general.FormatMemoryQuantity(systemWatermarkReserved), - "reservedForAllocate", general.FormatMemoryQuantity(reservedForAllocate)) - p.memoryHeadroom = 
math.Max(reclaimableMemory-systemWatermarkReserved-reservedForAllocate, 0) - + "reservedForAllocate", general.FormatMemoryQuantity(reservedForAllocate), + "headroom", p.memoryHeadroom, + "numaHeadroom", numaHeadroom, + ) return nil } -func (p *PolicyNUMAAware) GetHeadroom() (resource.Quantity, error) { +func (p *PolicyNUMAAware) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { if p.updateStatus != types.PolicyUpdateSucceeded { - return resource.Quantity{}, fmt.Errorf("last update failed") + return resource.Quantity{}, nil, fmt.Errorf("last update failed") } - return *resource.NewQuantity(int64(p.memoryHeadroom), resource.BinarySI), nil + return *resource.NewQuantity(int64(p.memoryHeadroom), resource.BinarySI), p.numaMemoryHeadroom, nil } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go index 6111ef863..b0e14e63e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware_test.go @@ -236,7 +236,7 @@ func TestPolicyNUMAAware(t *testing.T) { t.Errorf("update() error = %v, wantErr %v", err, tt.wantErr) return } - got, err := p.GetHeadroom() + got, _, err := p.GetHeadroom() if (err != nil) != tt.wantErr { t.Errorf("GetHeadroom() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go index ffc72f5fa..1a19ec32e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go @@ -42,7 +42,7 @@ type ResourceAdvisor interface { GetSubAdvisor(resourceName types.QoSResourceName) (SubResourceAdvisor, error) // GetHeadroom returns the corresponding headroom quantity according to resource name - GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, error) + GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, map[int]resource.Quantity, error) } // SubResourceAdvisor updates resource provision of a certain dimension based on the latest @@ -57,7 +57,7 @@ type SubResourceAdvisor interface { GetChannels() (interface{}, interface{}) // GetHeadroom returns the latest resource headroom quantity for resource reporter - GetHeadroom() (resource.Quantity, error) + GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) } type resourceAdvisorWrapper struct { @@ -112,21 +112,21 @@ func (ra *resourceAdvisorWrapper) GetSubAdvisor(resourceName types.QoSResourceNa return nil, fmt.Errorf("no sub resource advisor for %v", resourceName) } -func (ra *resourceAdvisorWrapper) GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, error) { +func (ra *resourceAdvisorWrapper) GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, map[int]resource.Quantity, error) { switch resourceName { case v1.ResourceCPU: return ra.getSubAdvisorHeadroom(types.QoSResourceCPU) case v1.ResourceMemory: return ra.getSubAdvisorHeadroom(types.QoSResourceMemory) default: - return resource.Quantity{}, fmt.Errorf("illegal resource %v", resourceName) + return resource.Quantity{}, nil, fmt.Errorf("illegal resource %v", resourceName) } } -func (ra *resourceAdvisorWrapper) getSubAdvisorHeadroom(resourceName types.QoSResourceName) (resource.Quantity, error) { +func (ra *resourceAdvisorWrapper) 
getSubAdvisorHeadroom(resourceName types.QoSResourceName) (resource.Quantity, map[int]resource.Quantity, error) { subAdvisor, ok := ra.subAdvisorsToRun[resourceName] if !ok { - return resource.Quantity{}, fmt.Errorf("no sub resource advisor for %v", resourceName) + return resource.Quantity{}, nil, fmt.Errorf("no sub resource advisor for %v", resourceName) } return subAdvisor.GetHeadroom() } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource_stub.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource_stub.go index 4950d379f..4f02a0dae 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource_stub.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource_stub.go @@ -47,14 +47,14 @@ func (r *ResourceAdvisorStub) GetSubAdvisor(resourceName types.QoSResourceName) return nil, nil } -func (r *ResourceAdvisorStub) GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, error) { +func (r *ResourceAdvisorStub) GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, map[int]resource.Quantity, error) { r.Lock() defer r.Unlock() if quantity, ok := r.resources[resourceName]; ok { - return quantity, nil + return quantity, nil, nil } - return resource.Quantity{}, fmt.Errorf("not exist") + return resource.Quantity{}, nil, fmt.Errorf("not exist") } func (r *ResourceAdvisorStub) SetHeadroom(resourceName v1.ResourceName, quantity resource.Quantity) { @@ -87,8 +87,8 @@ func (s *SubResourceAdvisorStub) GetChannels() (interface{}, interface{}) { return nil, nil } -func (s *SubResourceAdvisorStub) GetHeadroom() (resource.Quantity, error) { - return s.quantity, nil +func (s *SubResourceAdvisorStub) GetHeadroom() (resource.Quantity, map[int]resource.Quantity, error) { + return s.quantity, nil, nil } func (s *SubResourceAdvisorStub) SetHeadroom(quantity resource.Quantity) { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go index 80747f163..7a37df4b8 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go @@ -18,6 +18,7 @@ package server import ( "context" + "encoding/json" "fmt" "time" @@ -32,6 +33,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -48,17 +50,19 @@ const ( type cpuServer struct { *baseServer - getCheckpointCalled bool - cpuPluginClient cpuadvisor.CPUPluginClient + getCheckpointCalled bool + cpuPluginClient cpuadvisor.CPUPluginClient + headroomResourceManager reporter.HeadroomResourceManager } func NewCPUServer(recvCh chan types.InternalCPUCalculationResult, sendCh chan types.TriggerInfo, conf *config.Configuration, - metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, + headroomResourceManager reporter.HeadroomResourceManager, metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, ) (*cpuServer, error) { cs := &cpuServer{} cs.baseServer = newBaseServer(cpuServerName, conf, recvCh, sendCh, metaCache, metaServer, emitter, cs) cs.advisorSocketPath = conf.CPUAdvisorSocketAbsPath 
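NewCPUServer now receives the headroom resource manager, and the assembleResponse code below uses it to publish per-NUMA CPU headroom as an extra entry whose value is a JSON map keyed by NUMA ID under the cpu_numa_headroom knob. A rough sketch of how a consumer of the ListAndWatch response could read that entry back (only the knob key and the map shape come from this patch; the decoding helper itself is illustrative):

// decodeCPUNUMAHeadroom looks up the per-NUMA CPU headroom advertised in the
// extra entries of a ListAndWatchResponse. It assumes the payload is a JSON
// map from NUMA ID to headroom value, as produced by assembleResponse.
func decodeCPUNUMAHeadroom(resp *cpuadvisor.ListAndWatchResponse) (map[int]float64, error) {
	for _, entry := range resp.ExtraEntries {
		if entry == nil || entry.CalculationResult == nil {
			continue
		}
		raw, ok := entry.CalculationResult.Values[string(cpuadvisor.ControlKnobKeyCPUNUMAHeadroom)]
		if !ok {
			continue
		}
		numaHeadroom := make(map[int]float64)
		if err := json.Unmarshal([]byte(raw), &numaHeadroom); err != nil {
			return nil, fmt.Errorf("unmarshal cpu numa headroom failed: %v", err)
		}
		return numaHeadroom, nil
	}
	return nil, fmt.Errorf("cpu numa headroom entry not found")
}
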
cs.pluginSocketPath = conf.CPUPluginSocketAbsPath + cs.headroomResourceManager = headroomResourceManager cs.resourceRequestName = "CPURequest" return cs, nil } @@ -122,26 +126,12 @@ func (cs *cpuServer) ListAndWatch(_ *advisorsvc.Empty, server cpuadvisor.CPUAdvi } klog.Infof("[qosaware-server-cpu] get advisor update: %+v", advisorResp) - - calculationEntriesMap := make(map[string]*cpuadvisor.CalculationEntries) - blockID2Blocks := NewBlockSet() - - cs.assemblePoolEntries(&advisorResp, calculationEntriesMap, blockID2Blocks) - - // Assemble pod entries - f := func(podUID string, containerName string, ci *types.ContainerInfo) bool { - if err := cs.assemblePodEntries(calculationEntriesMap, blockID2Blocks, podUID, ci); err != nil { - klog.Errorf("[qosaware-server-cpu] assemblePodEntries for pod %s/%s uid %s err: %v", ci.PodNamespace, ci.PodName, ci.PodUID, err) - } - return true + resp, err := cs.assembleResponse(&advisorResp) + if err != nil { + general.Errorf("assembleResponse failed: %s", err) + continue } - cs.metaCache.RangeContainer(f) - // Send result - resp := &cpuadvisor.ListAndWatchResponse{ - Entries: calculationEntriesMap, - AllowSharedCoresOverlapReclaimedCores: advisorResp.AllowSharedCoresOverlapReclaimedCores, - } if err := server.Send(resp); err != nil { klog.Errorf("[qosaware-server-cpu] send response failed: %v", err) _ = cs.emitter.StoreInt64(cs.genMetricsName(metricServerLWSendResponseFailed), int64(cs.period.Seconds()), metrics.MetricTypeNameCount) @@ -155,6 +145,79 @@ func (cs *cpuServer) ListAndWatch(_ *advisorsvc.Empty, server cpuadvisor.CPUAdvi } } +func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult) (*cpuadvisor.ListAndWatchResponse, error) { + calculationEntriesMap := make(map[string]*cpuadvisor.CalculationEntries) + blockID2Blocks := NewBlockSet() + cs.assemblePoolEntries(result, calculationEntriesMap, blockID2Blocks) + + // assmble per-numa headroom + numaAllocatable, err := cs.headroomResourceManager.GetNumaAllocatable() + if err != nil { + return nil, fmt.Errorf("get numa allocatable failed: %v", err) + } + + numaHeadroom := make(map[int]float64) + for numaID, res := range numaAllocatable { + numaHeadroom[numaID] = float64(res.Value()) + } + data, err := json.Marshal(numaHeadroom) + if err != nil { + return nil, fmt.Errorf("marshal numa headroom failed: %v", err) + } + + calculationResult := &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(cpuadvisor.ControlKnobKeyCPUNUMAHeadroom): string(data), + }, + } + extraNumaHeadRoom := &advisorsvc.CalculationInfo{ + CgroupPath: "", + CalculationResult: calculationResult, + } + + // Assemble pod entries + f := func(podUID string, containerName string, ci *types.ContainerInfo) bool { + if err := cs.assemblePodEntries(calculationEntriesMap, blockID2Blocks, podUID, ci); err != nil { + klog.Errorf("[qosaware-server-cpu] assemblePodEntries for pod %s/%s uid %s err: %v", ci.PodNamespace, ci.PodName, ci.PodUID, err) + } + return true + } + cs.metaCache.RangeContainer(f) + + // Send result + resp := &cpuadvisor.ListAndWatchResponse{ + Entries: calculationEntriesMap, + ExtraEntries: make([]*advisorsvc.CalculationInfo, 0), + AllowSharedCoresOverlapReclaimedCores: result.AllowSharedCoresOverlapReclaimedCores, + } + + for _, retEntry := range result.ExtraEntries { + found := false + for _, respEntry := range resp.ExtraEntries { + if retEntry.CgroupPath == respEntry.CgroupPath { + found = true + for k, v := range retEntry.Values { + respEntry.CalculationResult.Values[k] = v + } + break + 
} + } + if !found { + calculationInfo := &advisorsvc.CalculationInfo{ + CgroupPath: retEntry.CgroupPath, + CalculationResult: &advisorsvc.CalculationResult{ + Values: general.DeepCopyMap(retEntry.Values), + }, + } + resp.ExtraEntries = append(resp.ExtraEntries, calculationInfo) + } + } + + resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom) + + return resp, nil +} + func (cs *cpuServer) getCheckpoint() { safeTime := time.Now().UnixNano() diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server_test.go b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server_test.go index 6363a1c09..f9422f19c 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server_test.go @@ -41,6 +41,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/cpu/dynamicpolicy/cpuadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -89,7 +90,7 @@ func newTestCPUServer(t *testing.T, podList []*v1.Pod) *cpuServer { }, } - cpuServer, err := NewCPUServer(recvCh, sendCh, conf, metaCache, metaServer, metrics.DummyMetrics{}) + cpuServer, err := NewCPUServer(recvCh, sendCh, conf, &reporter.DummyHeadroomResourceManager{}, metaCache, metaServer, metrics.DummyMetrics{}) require.NoError(t, err) require.NotNil(t, cpuServer) @@ -116,7 +117,7 @@ func newTestCPUServerWithChanBuffer(t *testing.T, podList []*v1.Pod) *cpuServer }, } - cpuServer, err := NewCPUServer(recvCh, sendCh, conf, metaCache, metaServer, metrics.DummyMetrics{}) + cpuServer, err := NewCPUServer(recvCh, sendCh, conf, &reporter.DummyHeadroomResourceManager{}, metaCache, metaServer, metrics.DummyMetrics{}) require.NoError(t, err) require.NotNil(t, cpuServer) @@ -347,6 +348,13 @@ func TestCPUServerListAndWatch(t *testing.T) { }, wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameShare: { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -457,6 +465,13 @@ func TestCPUServerListAndWatch(t *testing.T) { }, wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameReclaim: { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -622,6 +637,13 @@ func TestCPUServerListAndWatch(t *testing.T) { }, wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameReclaim: { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -899,6 +921,13 @@ func TestCPUServerListAndWatch(t *testing.T) { }, wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + 
CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameReclaim: { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -1185,6 +1214,13 @@ func TestCPUServerListAndWatch(t *testing.T) { wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ AllowSharedCoresOverlapReclaimedCores: true, + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ "share-1": { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -1303,6 +1339,13 @@ func TestCPUServerListAndWatch(t *testing.T) { wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ AllowSharedCoresOverlapReclaimedCores: true, + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ "share-1": { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -1417,6 +1460,13 @@ func TestCPUServerListAndWatch(t *testing.T) { wantErr: false, wantRes: &cpuadvisor.ListAndWatchResponse{ AllowSharedCoresOverlapReclaimedCores: true, + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameReclaim: { Entries: map[string]*cpuadvisor.CalculationInfo{ @@ -1814,6 +1864,13 @@ func TestCPUServerDropOldAdvice(t *testing.T) { copyres, err := DeepCopyResponse(res) assert.NoError(t, err) wantRes := &cpuadvisor.ListAndWatchResponse{ + ExtraEntries: []*advisorsvc.CalculationInfo{ + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"cpu_numa_headroom": "{}"}, + }, + }, + }, Entries: map[string]*cpuadvisor.CalculationEntries{ commonstate.PoolNameReclaim: { Entries: map[string]*cpuadvisor.CalculationInfo{ diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go index 1ccec9b5d..bfcd4b7aa 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go @@ -18,6 +18,7 @@ package server import ( "context" + "encoding/json" "fmt" "time" @@ -27,7 +28,9 @@ import ( "k8s.io/klog/v2" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc" + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -45,17 +48,19 @@ const ( type memoryServer struct { *baseServer - memoryPluginClient advisorsvc.QRMServiceClient - listAndWatchCalled bool + memoryPluginClient advisorsvc.QRMServiceClient + listAndWatchCalled bool + headroomResourceManager reporter.HeadroomResourceManager } func NewMemoryServer(recvCh chan types.InternalMemoryCalculationResult, sendCh chan types.TriggerInfo, conf *config.Configuration, - metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter 
metrics.MetricEmitter, + headroomResourceManager reporter.HeadroomResourceManager, metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, ) (*memoryServer, error) { ms := &memoryServer{} ms.baseServer = newBaseServer(memoryServerName, conf, recvCh, sendCh, metaCache, metaServer, emitter, ms) ms.advisorSocketPath = conf.MemoryAdvisorSocketAbsPath ms.pluginSocketPath = conf.MemoryPluginSocketAbsPath + ms.headroomResourceManager = headroomResourceManager ms.resourceRequestName = "MemoryRequest" return ms, nil } @@ -196,13 +201,42 @@ func (ms *memoryServer) ListAndWatch(_ *advisorsvc.Empty, server advisorsvc.Advi } func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculationResult) *advisorsvc.ListAndWatchResponse { + if result == nil { + return nil + } + + // assmble per-numa headroom + numaAllocatable, err := ms.headroomResourceManager.GetNumaAllocatable() + if err != nil { + general.ErrorS(err, "get numa allocatable failed") + return nil + } + + numaHeadroom := make(map[int]float64) + for numaID, res := range numaAllocatable { + numaHeadroom[numaID] = float64(res.Value()) + } + data, err := json.Marshal(numaHeadroom) + if err != nil { + general.ErrorS(err, "marshal numa headroom failed") + return nil + } + + calculationResult := &advisorsvc.CalculationResult{ + Values: map[string]string{ + string(memoryadvisor.ControlKnobKeyMemoryNUMAHeadroom): string(data), + }, + } + extraNumaHeadRoom := &advisorsvc.CalculationInfo{ + CgroupPath: "", + CalculationResult: calculationResult, + } + resp := advisorsvc.ListAndWatchResponse{ PodEntries: make(map[string]*advisorsvc.CalculationEntries), ExtraEntries: make([]*advisorsvc.CalculationInfo, 0), } - if result == nil { - return nil - } + for _, advice := range result.ContainerEntries { podEntry, ok := resp.PodEntries[advice.PodUID] if !ok { @@ -247,5 +281,6 @@ func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculation } } + resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom) return &resp } diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server_test.go b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server_test.go index 6cf3caa9e..d71916639 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server_test.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server_test.go @@ -35,6 +35,7 @@ import ( "github.com/kubewharf/katalyst-core/cmd/katalyst-agent/app/options" "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/advisorsvc" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" "github.com/kubewharf/katalyst-core/pkg/config" "github.com/kubewharf/katalyst-core/pkg/metaserver" @@ -94,7 +95,7 @@ func newTestMemoryServer(t *testing.T, podList []*v1.Pod) *memoryServer { }, } - memoryServer, err := NewMemoryServer(recvCh, sendCh, conf, metaCache, metaServer, metrics.DummyMetrics{}) + memoryServer, err := NewMemoryServer(recvCh, sendCh, conf, &reporter.DummyHeadroomResourceManager{}, metaCache, metaServer, metrics.DummyMetrics{}) require.NoError(t, err) require.NotNil(t, memoryServer) @@ -121,7 +122,7 @@ func newTestMemoryServerWithChannelBuffer(t *testing.T, podList []*v1.Pod) *memo }, } - memoryServer, err := NewMemoryServer(recvCh, sendCh, conf, metaCache, metaServer, metrics.DummyMetrics{}) + memoryServer, err := NewMemoryServer(recvCh, sendCh, conf, 
&reporter.DummyHeadroomResourceManager{}, metaCache, metaServer, metrics.DummyMetrics{}) require.NoError(t, err) require.NotNil(t, memoryServer) @@ -320,6 +321,11 @@ func TestMemoryServerListAndWatch(t *testing.T) { Values: map[string]string{"k1": "v1"}, }, }, + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"memory_numa_headroom": "{}"}, + }, + }, }, }, }, @@ -448,6 +454,11 @@ func TestMemoryServerDropOldAdvice(t *testing.T) { Values: map[string]string{"k1": "v1"}, }, }, + { + CalculationResult: &advisorsvc.CalculationResult{ + Values: map[string]string{"memory_numa_headroom": "{}"}, + }, + }, }, } if !reflect.DeepEqual(res, wantRes) { diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/server.go index 33e28eaff..36cd10913 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/server.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/server.go @@ -22,10 +22,13 @@ import ( "sync" "time" + "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/reporter" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" + "github.com/kubewharf/katalyst-api/pkg/consts" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/metacache" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource" "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types" @@ -56,7 +59,7 @@ type qrmServerWrapper struct { // NewQRMServer returns a qrm server wrapper, which instantiates // all required qrm plugin servers according to config -func NewQRMServer(advisorWrapper resource.ResourceAdvisor, conf *config.Configuration, +func NewQRMServer(advisorWrapper resource.ResourceAdvisor, headroomResourceGetter reporter.HeadroomResourceGetter, conf *config.Configuration, metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter metrics.MetricEmitter, ) (QRMServer, error) { qrmServer := qrmServerWrapper{ @@ -65,7 +68,23 @@ func NewQRMServer(advisorWrapper resource.ResourceAdvisor, conf *config.Configur for _, resourceNameStr := range conf.QRMServers { resourceName := v1.ResourceName(resourceNameStr) - server, err := newSubQRMServer(resourceName, advisorWrapper, conf, metaCache, metaServer, emitter) + var headroomResourceManager reporter.HeadroomResourceManager + var err error + switch resourceName { + case v1.ResourceCPU: + headroomResourceManager, err = headroomResourceGetter.GetHeadroomResource(consts.ReclaimedResourceMilliCPU) + if err != nil { + return nil, err + } + case v1.ResourceMemory: + headroomResourceManager, err = headroomResourceGetter.GetHeadroomResource(consts.ReclaimedResourceMemory) + if err != nil { + return nil, err + } + default: + klog.Warningf("[qosaware-server] resource %s do NOT has headroomResourceManager, be care not to use the invalid manager", resourceName) + } + server, err := newSubQRMServer(resourceName, advisorWrapper, headroomResourceManager, conf, metaCache, metaServer, emitter) if err != nil { return nil, fmt.Errorf("new qrm plugin server for %v failed: %v", resourceName, err) } else { @@ -104,7 +123,7 @@ func (qs *qrmServerWrapper) Run(ctx context.Context) { } } -func newSubQRMServer(resourceName v1.ResourceName, advisorWrapper resource.ResourceAdvisor, +func newSubQRMServer(resourceName v1.ResourceName, advisorWrapper resource.ResourceAdvisor, headroomResourceManager reporter.HeadroomResourceManager, conf *config.Configuration, metaCache metacache.MetaCache, metaServer *metaserver.MetaServer, emitter 
metrics.MetricEmitter, ) (subQRMServer, error) { switch resourceName { @@ -116,7 +135,7 @@ func newSubQRMServer(resourceName v1.ResourceName, advisorWrapper resource.Resou advisorRecvChInterface, advisorSendChInterface := subAdvisor.GetChannels() advisorRecvCh := advisorRecvChInterface.(chan types.TriggerInfo) advisorSendCh := advisorSendChInterface.(chan types.InternalCPUCalculationResult) - return NewCPUServer(advisorSendCh, advisorRecvCh, conf, metaCache, metaServer, emitter) + return NewCPUServer(advisorSendCh, advisorRecvCh, conf, headroomResourceManager, metaCache, metaServer, emitter) case v1.ResourceMemory: subAdvisor, err := advisorWrapper.GetSubAdvisor(types.QoSResourceMemory) if err != nil { @@ -125,7 +144,7 @@ func newSubQRMServer(resourceName v1.ResourceName, advisorWrapper resource.Resou advisorRecvChInterface, advisorSendChInterface := subAdvisor.GetChannels() advisorRecvCh := advisorRecvChInterface.(chan types.TriggerInfo) advisorSendCh := advisorSendChInterface.(chan types.InternalMemoryCalculationResult) - return NewMemoryServer(advisorSendCh, advisorRecvCh, conf, metaCache, metaServer, emitter) + return NewMemoryServer(advisorSendCh, advisorRecvCh, conf, headroomResourceManager, metaCache, metaServer, emitter) default: return nil, fmt.Errorf("illegal resource %v", resourceName) } diff --git a/pkg/agent/sysadvisor/types/cpu.go b/pkg/agent/sysadvisor/types/cpu.go index dfe064a90..f3512f654 100644 --- a/pkg/agent/sysadvisor/types/cpu.go +++ b/pkg/agent/sysadvisor/types/cpu.go @@ -161,11 +161,17 @@ type RegionInfo struct { HeadroomPolicyInUse CPUHeadroomPolicyName `json:"headroom_policy_in_use"` } +type ExtraCPUAdvices struct { + CgroupPath string + Values map[string]string +} + // InternalCPUCalculationResult conveys minimal information to cpu server for composing // calculation result type InternalCPUCalculationResult struct { PoolEntries map[string]map[int]int // map[poolName][numaId]cpuSize PoolOverlapInfo map[string]map[int]map[string]int // map[poolName][numaId][targetOverlapPoolName]int + ExtraEntries []ExtraCPUAdvices TimeStamp time.Time AllowSharedCoresOverlapReclaimedCores bool } From 441975b65bd3aa62d6788fe0b96d4d75c0566899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?= Date: Wed, 23 Oct 2024 21:38:12 +0800 Subject: [PATCH 2/9] feat(sysadvisor): add offline per numa cgroup Change-Id: I63c0e7b7becb450bda956cd3f46310c32279a8fa --- .../memoryadvisor/{const.go => types.go} | 2 ++ .../plugin/qosaware/server/cpu_server.go | 22 ++++++++++--------- .../plugin/qosaware/server/memory_server.go | 22 +++++++++---------- pkg/metaserver/agent/agent.go | 6 ++++- pkg/metaserver/agent/metric/metric_impl.go | 9 ++++++-- .../agent/metric/metric_provisoner.go | 4 +++- pkg/metaserver/agent/metric/metric_test.go | 6 +++-- .../metric/provisioner/cgroup/provisioner.go | 3 ++- .../metric/provisioner/kubelet/provisioner.go | 3 ++- .../malachite/client/client_cgroup.go | 2 +- .../provisioner/malachite/provisioner.go | 17 +++++++++++++- .../provisioner/malachite/provisioner_test.go | 15 ++++++++++--- .../metric/provisioner/rodan/provisioner.go | 2 ++ pkg/metaserver/metaserver_test.go | 3 ++- pkg/util/cgroup/common/path.go | 14 ++++++++++++ 15 files changed, 95 insertions(+), 35 deletions(-) rename pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/{const.go => types.go} (96%) diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go similarity index 96% rename from 
pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go rename to pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go index 03fd9d001..3fb7d3bb4 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/const.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/memoryadvisor/types.go @@ -28,3 +28,5 @@ const ( ControlKnowKeyMemoryOffloading MemoryControlKnobName = "memory_offloading" ControlKnobKeyMemoryNUMAHeadroom MemoryControlKnobName = "memory_numa_headroom" ) + +type MemoryNUMAHeadroom map[int]int64 diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go index 7a37df4b8..b4d72089c 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go @@ -151,18 +151,20 @@ func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult cs.assemblePoolEntries(result, calculationEntriesMap, blockID2Blocks) // assmble per-numa headroom + var data []byte numaAllocatable, err := cs.headroomResourceManager.GetNumaAllocatable() if err != nil { - return nil, fmt.Errorf("get numa allocatable failed: %v", err) - } - - numaHeadroom := make(map[int]float64) - for numaID, res := range numaAllocatable { - numaHeadroom[numaID] = float64(res.Value()) - } - data, err := json.Marshal(numaHeadroom) - if err != nil { - return nil, fmt.Errorf("marshal numa headroom failed: %v", err) + // ignore get allocatable failed + klog.Errorf("get numa allocatable failed: %v", err) + } else { + numaHeadroom := make(map[int]float64) + for numaID, res := range numaAllocatable { + numaHeadroom[numaID] = float64(res.Value()) / 1000.0 + } + data, err = json.Marshal(numaHeadroom) + if err != nil { + klog.Errorf("get numa allocatable failed: %v", err) + } } calculationResult := &advisorsvc.CalculationResult{ diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go index bfcd4b7aa..94954651e 100644 --- a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go +++ b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go @@ -206,20 +206,20 @@ func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculation } // assmble per-numa headroom + var data []byte numaAllocatable, err := ms.headroomResourceManager.GetNumaAllocatable() if err != nil { + // ignore get allocatable failed general.ErrorS(err, "get numa allocatable failed") - return nil - } - - numaHeadroom := make(map[int]float64) - for numaID, res := range numaAllocatable { - numaHeadroom[numaID] = float64(res.Value()) - } - data, err := json.Marshal(numaHeadroom) - if err != nil { - general.ErrorS(err, "marshal numa headroom failed") - return nil + } else { + numaHeadroom := make(memoryadvisor.MemoryNUMAHeadroom) + for numaID, res := range numaAllocatable { + numaHeadroom[numaID] = res.Value() + } + data, err = json.Marshal(numaHeadroom) + if err != nil { + general.ErrorS(err, "marshal numa headroom failed") + } } calculationResult := &advisorsvc.CalculationResult{ diff --git a/pkg/metaserver/agent/agent.go b/pkg/metaserver/agent/agent.go index 6bfbdcbab..c8050d084 100644 --- a/pkg/metaserver/agent/agent.go +++ b/pkg/metaserver/agent/agent.go @@ -96,7 +96,11 @@ func NewMetaAgent(conf *config.Configuration, clientSet *client.GenericClientSet } if conf.EnableMetricsFetcher { - metaAgent.MetricsFetcher = metric.NewMetricsFetcher(conf.BaseConfiguration, 
conf.MetaServerConfiguration.MetricConfiguration, emitter, metaAgent) + metaAgent.MetricsFetcher = metric.NewMetricsFetcher(conf.BaseConfiguration, + conf.MetaServerConfiguration.MetricConfiguration, + emitter, + metaAgent, + machineInfo) } else { metaAgent.MetricsFetcher = metric.NewFakeMetricsFetcher(emitter) } diff --git a/pkg/metaserver/agent/metric/metric_impl.go b/pkg/metaserver/agent/metric/metric_impl.go index b25bfd6b7..037ddfadd 100644 --- a/pkg/metaserver/agent/metric/metric_impl.go +++ b/pkg/metaserver/agent/metric/metric_impl.go @@ -275,7 +275,12 @@ type MetricsFetcherImpl struct { intervals map[string]time.Duration } -func NewMetricsFetcher(baseConf *global.BaseConfiguration, metricConf *metaserver.MetricConfiguration, emitter metrics.MetricEmitter, podFetcher pod.PodFetcher) types.MetricsFetcher { +func NewMetricsFetcher(baseConf *global.BaseConfiguration, + metricConf *metaserver.MetricConfiguration, + emitter metrics.MetricEmitter, + podFetcher pod.PodFetcher, + machineInfo *machine.KatalystMachineInfo, +) types.MetricsFetcher { metricStore := utilmetric.NewMetricStore() metricsNotifierManager := NewMetricsNotifierManager(metricStore, emitter) externalMetricManager := NewExternalMetricManager(metricStore, emitter) @@ -289,7 +294,7 @@ func NewMetricsFetcher(baseConf *global.BaseConfiguration, metricConf *metaserve if interval, exist := metricConf.ProvisionerIntervals[name]; exist { intervals[name] = interval } - provisioners[name] = f(baseConf, metricConf, emitter, podFetcher, metricStore) + provisioners[name] = f(baseConf, metricConf, emitter, podFetcher, metricStore, machineInfo) } } diff --git a/pkg/metaserver/agent/metric/metric_provisoner.go b/pkg/metaserver/agent/metric/metric_provisoner.go index 929f0b896..aa3efd7dc 100644 --- a/pkg/metaserver/agent/metric/metric_provisoner.go +++ b/pkg/metaserver/agent/metric/metric_provisoner.go @@ -28,6 +28,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -39,7 +40,8 @@ func init() { } type ProvisionerInitFunc func(baseConf *global.BaseConfiguration, metricConf *metaserver.MetricConfiguration, - emitter metrics.MetricEmitter, fetcher pod.PodFetcher, metricStore *utilmetric.MetricStore) types.MetricsProvisioner + emitter metrics.MetricEmitter, fetcher pod.PodFetcher, metricStore *utilmetric.MetricStore, + machineInfo *machine.KatalystMachineInfo) types.MetricsProvisioner // provisioners stores the initializing function for each-provisioner var provisioners sync.Map diff --git a/pkg/metaserver/agent/metric/metric_test.go b/pkg/metaserver/agent/metric/metric_test.go index 63008a8fe..1c7d9438a 100644 --- a/pkg/metaserver/agent/metric/metric_test.go +++ b/pkg/metaserver/agent/metric/metric_test.go @@ -47,7 +47,8 @@ func Test_notifySystem(t *testing.T) { totalNotification := 0 conf := generateTestConfiguration(t) conf.DefaultInterval = time.Millisecond * 300 - f := NewMetricsFetcher(conf.BaseConfiguration, conf.MetricConfiguration, metrics.DummyMetrics{}, &pod.PodFetcherStub{}) + f := NewMetricsFetcher(conf.BaseConfiguration, conf.MetricConfiguration, metrics.DummyMetrics{}, + &pod.PodFetcherStub{}, &machine.KatalystMachineInfo{}) rChan := make(chan metrictypes.NotifiedResponse, 20) f.RegisterNotifier(metrictypes.MetricsScopeNode, metrictypes.NotifiedRequest{ @@ -157,7 
+158,8 @@ func TestStore_Aggregate(t *testing.T) { now := time.Now() conf := generateTestConfiguration(t) - f := NewMetricsFetcher(conf.BaseConfiguration, conf.MetricConfiguration, metrics.DummyMetrics{}, &pod.PodFetcherStub{}).(*MetricsFetcherImpl) + f := NewMetricsFetcher(conf.BaseConfiguration, conf.MetricConfiguration, + metrics.DummyMetrics{}, &pod.PodFetcherStub{}, &machine.KatalystMachineInfo{}).(*MetricsFetcherImpl) pod1 := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/metaserver/agent/metric/provisioner/cgroup/provisioner.go b/pkg/metaserver/agent/metric/provisioner/cgroup/provisioner.go index 7e99987e7..c2016c843 100644 --- a/pkg/metaserver/agent/metric/provisioner/cgroup/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/cgroup/provisioner.go @@ -24,12 +24,13 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) // NewCGroupMetricsProvisioner returns the default implementation of CGroup. func NewCGroupMetricsProvisioner(baseConf *global.BaseConfiguration, _ *metaserver.MetricConfiguration, - emitter metrics.MetricEmitter, _ pod.PodFetcher, metricStore *utilmetric.MetricStore, + emitter metrics.MetricEmitter, _ pod.PodFetcher, metricStore *utilmetric.MetricStore, _ *machine.KatalystMachineInfo, ) types.MetricsProvisioner { return &CGroupMetricsProvisioner{ metricStore: metricStore, diff --git a/pkg/metaserver/agent/metric/provisioner/kubelet/provisioner.go b/pkg/metaserver/agent/metric/provisioner/kubelet/provisioner.go index 526293ab1..01574bf00 100644 --- a/pkg/metaserver/agent/metric/provisioner/kubelet/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/kubelet/provisioner.go @@ -30,6 +30,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/types" "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -38,7 +39,7 @@ const ( ) func NewKubeletSummaryProvisioner(baseConf *global.BaseConfiguration, _ *metaserver.MetricConfiguration, - emitter metrics.MetricEmitter, _ pod.PodFetcher, metricStore *utilmetric.MetricStore, + emitter metrics.MetricEmitter, _ pod.PodFetcher, metricStore *utilmetric.MetricStore, _ *machine.KatalystMachineInfo, ) types.MetricsProvisioner { return &KubeletSummaryProvisioner{ metricStore: metricStore, diff --git a/pkg/metaserver/agent/metric/provisioner/malachite/client/client_cgroup.go b/pkg/metaserver/agent/metric/provisioner/malachite/client/client_cgroup.go index 694305eed..ec73524bf 100644 --- a/pkg/metaserver/agent/metric/provisioner/malachite/client/client_cgroup.go +++ b/pkg/metaserver/agent/metric/provisioner/malachite/client/client_cgroup.go @@ -104,7 +104,7 @@ func (c *MalachiteClient) getCgroupStats(cgroupPath string) ([]byte, error) { defer func() { _ = rsp.Body.Close() }() if rsp.StatusCode != 200 { - return nil, fmt.Errorf("invalid http response status code %d, url: %s", rsp.StatusCode, req.URL) + return nil, fmt.Errorf("invalid http response status code %d, url: %v", rsp.StatusCode, req.URL) } return ioutil.ReadAll(rsp.Body) diff --git a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go 
b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go index 1356cc131..8c67a41f9 100644 --- a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner.go @@ -36,6 +36,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -55,13 +56,14 @@ const ( // NewMalachiteMetricsProvisioner returns the default implementation of MetricsFetcher. func NewMalachiteMetricsProvisioner(baseConf *global.BaseConfiguration, _ *metaserver.MetricConfiguration, - emitter metrics.MetricEmitter, fetcher pod.PodFetcher, metricStore *utilmetric.MetricStore, + emitter metrics.MetricEmitter, fetcher pod.PodFetcher, metricStore *utilmetric.MetricStore, machineInfo *machine.KatalystMachineInfo, ) types.MetricsProvisioner { return &MalachiteMetricsProvisioner{ malachiteClient: client.NewMalachiteClient(fetcher), metricStore: metricStore, emitter: emitter, baseConf: baseConf, + machineInfo: machineInfo, } } @@ -70,6 +72,7 @@ type MalachiteMetricsProvisioner struct { malachiteClient *client.MalachiteClient baseConf *global.BaseConfiguration emitter metrics.MetricEmitter + machineInfo *machine.KatalystMachineInfo startOnce sync.Once cpuToNumaMap map[int]int } @@ -169,6 +172,12 @@ func (m *MalachiteMetricsProvisioner) updateSystemStats() error { func (m *MalachiteMetricsProvisioner) getCgroupPaths() []string { cgroupPaths := []string{m.baseConf.ReclaimRelativeRootCgroupPath, common.CgroupFsRootPathBurstable, common.CgroupFsRootPathBestEffort} + // add numa binding cgroup paths + for _, path := range common.GetNUMABindingReclaimRelativeRootCgroupPaths(m.baseConf.ReclaimRelativeRootCgroupPath, + m.machineInfo.CPUDetails.NUMANodes().ToSliceNoSortInt()) { + cgroupPaths = append(cgroupPaths, path) + } + for _, path := range m.baseConf.OptionalRelativeCgroupPaths { absPath := common.GetAbsCgroupPath(common.DefaultSelectedSubsys, path) if !general.IsPathExists(absPath) { @@ -177,6 +186,7 @@ func (m *MalachiteMetricsProvisioner) getCgroupPaths() []string { } cgroupPaths = append(cgroupPaths, path) } + for _, path := range m.baseConf.GeneralRelativeCgroupPaths { cgroupPaths = append(cgroupPaths, path) } @@ -189,6 +199,11 @@ func (m *MalachiteMetricsProvisioner) updateCgroupData() error { cgroupPaths := m.getCgroupPaths() errList := make([]error, 0) for _, path := range cgroupPaths { + if !general.IsPathExists(path) { + general.Warningf("cgroup path %v not existed, ignore it", path) + continue + } + stats, err := m.malachiteClient.GetCgroupStats(path) if err != nil { errList = append(errList, err) diff --git a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner_test.go b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner_test.go index 199e4464a..031dfa1be 100644 --- a/pkg/metaserver/agent/metric/provisioner/malachite/provisioner_test.go +++ b/pkg/metaserver/agent/metric/provisioner/malachite/provisioner_test.go @@ -29,6 +29,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -37,14 +38,19 @@ func Test_noneExistMetricsProvisioner(t 
*testing.T) { store := utilmetric.NewMetricStore() - var err error + cpuTopology, err := machine.GenerateDummyCPUTopology(16, 2, 4) + assert.Nil(t, err) + implement := NewMalachiteMetricsProvisioner(&global.BaseConfiguration{ ReclaimRelativeRootCgroupPath: "test", MalachiteConfiguration: &global.MalachiteConfiguration{ GeneralRelativeCgroupPaths: []string{"d1", "d2"}, OptionalRelativeCgroupPaths: []string{"d3", "d4"}, }, - }, &metaserver.MetricConfiguration{}, metrics.DummyMetrics{}, &pod.PodFetcherStub{}, store) + }, &metaserver.MetricConfiguration{}, metrics.DummyMetrics{}, &pod.PodFetcherStub{}, store, + &machine.KatalystMachineInfo{ + CPUTopology: cpuTopology, + }) fakeSystemCompute := &malachitetypes.SystemComputeData{ CPU: []malachitetypes.CPU{ @@ -195,5 +201,8 @@ func Test_noneExistMetricsProvisioner(t *testing.T) { defer monkey.UnpatchAll() paths := implement.(*MalachiteMetricsProvisioner).getCgroupPaths() - assert.ElementsMatch(t, paths, []string{"d1", "d2", "d3", "/kubepods/burstable", "/kubepods/besteffort", "test"}) + assert.ElementsMatch(t, paths, []string{ + "d1", "d2", "d3", "/kubepods/burstable", "/kubepods/besteffort", + "test", "test-0", "test-1", "test-2", "test-3", + }) } diff --git a/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go b/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go index 4af99d764..ac816b7e9 100644 --- a/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go +++ b/pkg/metaserver/agent/metric/provisioner/rodan/provisioner.go @@ -32,6 +32,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/pod" "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" + "github.com/kubewharf/katalyst-core/pkg/util/machine" utilmetric "github.com/kubewharf/katalyst-core/pkg/util/metric" ) @@ -55,6 +56,7 @@ func NewRodanMetricsProvisioner( emitter metrics.MetricEmitter, fetcher pod.PodFetcher, metricStore *utilmetric.MetricStore, + _ *machine.KatalystMachineInfo, ) metrictypes.MetricsProvisioner { return &RodanMetricsProvisioner{ metricStore: metricStore, diff --git a/pkg/metaserver/metaserver_test.go b/pkg/metaserver/metaserver_test.go index 92a0c4d15..c77f30bf6 100644 --- a/pkg/metaserver/metaserver_test.go +++ b/pkg/metaserver/metaserver_test.go @@ -40,6 +40,7 @@ import ( dynamicconfig "github.com/kubewharf/katalyst-core/pkg/metaserver/kcc" "github.com/kubewharf/katalyst-core/pkg/metaserver/spd" "github.com/kubewharf/katalyst-core/pkg/metrics" + "github.com/kubewharf/katalyst-core/pkg/util/machine" ) func generateTestConfiguration(t *testing.T) *config.Configuration { @@ -58,7 +59,7 @@ func generateTestMetaServer(clientSet *client.GenericClientSet, conf *config.Con CNRFetcher: cnr.NewCachedCNRFetcher(conf.BaseConfiguration, conf.CNRConfiguration, clientSet.InternalClient.NodeV1alpha1().CustomNodeResources()), MetricsFetcher: metric.NewMetricsFetcher(conf.BaseConfiguration, conf.MetricConfiguration, - metrics.DummyMetrics{}, &pod.PodFetcherStub{}), + metrics.DummyMetrics{}, &pod.PodFetcherStub{}, &machine.KatalystMachineInfo{}), AgentConf: conf.MetaServerConfiguration.AgentConfiguration, }, ConfigurationManager: &dynamicconfig.DummyConfigurationManager{}, diff --git a/pkg/util/cgroup/common/path.go b/pkg/util/cgroup/common/path.go index 66096399d..4d977ea25 100644 --- a/pkg/util/cgroup/common/path.go +++ b/pkg/util/cgroup/common/path.go @@ -20,6 +20,7 @@ import ( "fmt" "path" "path/filepath" + "strconv" "sync" utilerrors "k8s.io/apimachinery/pkg/util/errors" @@ -28,6 
+29,10 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 )
 
+const (
+	numaBindingReclaimRelativeRootCgroupPathSeparator = "-"
+)
+
 // k8sCgroupPathList is used to record cgroup-path related configurations,
 // and it will be set as SystemdRootPath (along with kubernetes levels) as default.
 var (
@@ -158,3 +163,12 @@ func IsContainerCgroupExist(podUID, containerID string) (bool, error) {
 
 	return general.IsPathExists(containerAbsCGPath), nil
 }
+
+// GetNUMABindingReclaimRelativeRootCgroupPaths returns relative cgroup paths for numa-binding reclaim
+func GetNUMABindingReclaimRelativeRootCgroupPaths(reclaimRelativeRootCgroupPath string, NUMANode []int) map[int]string {
+	paths := make(map[int]string, len(NUMANode))
+	for _, numaID := range NUMANode {
+		paths[numaID] = reclaimRelativeRootCgroupPath + numaBindingReclaimRelativeRootCgroupPathSeparator + strconv.Itoa(numaID)
+	}
+	return paths
+}

From 80c7417ebaa782622c0fe37b77cd245df32265a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Fri, 25 Oct 2024 17:30:56 +0800
Subject: [PATCH 3/9] fix: slide headroom result per-numa
---
 .../reporter/manager/resource/generic.go      | 91 ++++++++++++++++---
 .../reporter/manager/resource/generic_test.go |  3 +-
 .../plugin/qosaware/resource/cpu/advisor.go   | 27 +-----
 .../qosaware/resource/memory/advisor.go       | 40 --------
 4 files changed, 79 insertions(+), 82 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
index a0f39702f..9c4dd59b5 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
@@ -62,11 +62,14 @@ type GenericHeadroomManager struct {
 	sync.RWMutex
 	lastReportResult *resource.Quantity
 	// the latest transformed reporter result per numa
-	lastNumaReportResult map[int]resource.Quantity
+	lastNUMAReportResult map[int]resource.Quantity
 
-	headroomAdvisor     hmadvisor.ResourceAdvisor
-	emitter             metrics.MetricEmitter
-	reportSlidingWindow general.SmoothWindow
+	headroomAdvisor         hmadvisor.ResourceAdvisor
+	emitter                 metrics.MetricEmitter
+	useMilliValue           bool
+	slidingWindowOptions    GenericSlidingWindowOptions
+	reportSlidingWindow     general.SmoothWindow
+	reportNUMASlidingWindow map[int]general.SmoothWindow
 
 	reportResultTransformer func(quantity resource.Quantity) resource.Quantity
 	resourceName            v1.ResourceName
@@ -93,10 +96,12 @@ func NewGenericHeadroomManager(name v1.ResourceName, useMilliValue, reportMilliV
 
 	return &GenericHeadroomManager{
 		resourceName:             name,
-		lastNumaReportResult:     make(map[int]resource.Quantity),
+		lastNUMAReportResult:     make(map[int]resource.Quantity),
 		reportResultTransformer:  reportResultTransformer,
 		syncPeriod:               syncPeriod,
 		headroomAdvisor:          headroomAdvisor,
+		useMilliValue:            useMilliValue,
+		slidingWindowOptions:     slidingWindowOptions,
 		reportSlidingWindow: general.NewCappedSmoothWindow(
 			slidingWindowOptions.MinStep,
 			slidingWindowOptions.MaxStep,
@@ -106,8 +111,9 @@ func NewGenericHeadroomManager(name v1.ResourceName, useMilliValue, reportMilliV
 				AggregateArgs: slidingWindowOptions.AggregateArgs,
 			}),
 		),
-		emitter:           emitter,
-		getReclaimOptions: getReclaimOptions,
+		reportNUMASlidingWindow: make(map[int]general.SmoothWindow),
+		emitter:                 emitter,
+		getReclaimOptions:       getReclaimOptions,
 	}
 }
 
@@ -126,13 +132,13 @@ func (m *GenericHeadroomManager) GetCapacity() (resource.Quantity, error) {
 func (m *GenericHeadroomManager) GetNumaAllocatable() (map[int]resource.Quantity, error) {
 	m.RLock()
 	defer m.RUnlock()
-	return m.lastNumaReportResult, nil
+	return m.getLastNUMAReportResult()
 }
 
 func (m *GenericHeadroomManager) GetNumaCapacity() (map[int]resource.Quantity, error) {
 	m.RLock()
 	defer m.RUnlock()
-	return m.lastNumaReportResult, nil
+	return m.getLastNUMAReportResult()
 }
 
 func (m *GenericHeadroomManager) Run(ctx context.Context) {
@@ -140,6 +146,13 @@ func (m *GenericHeadroomManager) Run(ctx context.Context) {
 	<-ctx.Done()
 }
 
+func (m *GenericHeadroomManager) getLastNUMAReportResult() (map[int]resource.Quantity, error) {
+	if m.lastNUMAReportResult == nil {
+		return nil, fmt.Errorf("resource %s last numa report value not found", m.resourceName)
+	}
+	return m.lastNUMAReportResult, nil
+}
+
 func (m *GenericHeadroomManager) getLastReportResult() (resource.Quantity, error) {
 	if m.lastReportResult == nil {
 		return resource.Quantity{}, fmt.Errorf("resource %s last report value not found", m.resourceName)
@@ -155,6 +168,20 @@ func (m *GenericHeadroomManager) setLastReportResult(q resource.Quantity) {
 	m.emitResourceToMetric(metricsNameHeadroomReportResult, m.reportResultTransformer(*m.lastReportResult))
 }
 
+func (m *GenericHeadroomManager) newSlidingWindow() general.SmoothWindow {
+	slidingWindowSize := int(m.slidingWindowOptions.SlidingWindowTime / m.syncPeriod)
+	slidingWindowTTL := m.slidingWindowOptions.SlidingWindowTime * 2
+	return general.NewCappedSmoothWindow(
+		m.slidingWindowOptions.MinStep,
+		m.slidingWindowOptions.MaxStep,
+		general.NewAggregatorSmoothWindow(general.SmoothWindowOpts{
+			WindowSize: slidingWindowSize,
+			TTL:        slidingWindowTTL, UsedMillValue: m.useMilliValue, AggregateFunc: m.slidingWindowOptions.AggregateFunc,
+			AggregateArgs: m.slidingWindowOptions.AggregateArgs,
+		}),
+	)
+}
+
 func (m *GenericHeadroomManager) sync(_ context.Context) {
 	m.Lock()
 	defer m.Unlock()
@@ -171,14 +198,47 @@ func (m *GenericHeadroomManager) sync(_ context.Context) {
 		return
 	}
 
-	originValue := originResultFromAdvisor.Value()
-
 	reportResult := m.reportSlidingWindow.GetWindowedResources(originResultFromAdvisor)
 	if reportResult == nil {
 		klog.Infof("skip update reclaimed resource %s without enough valid sample", m.resourceName)
 		return
 	}
 
+	reportNUMAResult := make(map[int]*resource.Quantity)
+	numaResultReady := true
+	numaSum := 0.0
+	reservedResourceForReportPerNUMA := *resource.NewQuantity(int64(float64(reclaimOptions.ReservedResourceForReport.Value())/float64(len(numaResult))), resource.DecimalSI)
+	min := float64(reclaimOptions.MinReclaimedResourceForReport.Value()) / float64(len(numaResult))
+	if reclaimOptions.MinReclaimedResourceForReport.Value() != 0 && min == 0 {
+		min = 1
+	}
+	minReclaimedResourceForReportPerNUMA := *resource.NewQuantity(int64(min), resource.DecimalSI)
+	for numaID, ret := range numaResult {
+		numaWindow, ok := m.reportNUMASlidingWindow[numaID]
+		if !ok {
+			numaWindow = m.newSlidingWindow()
+			m.reportNUMASlidingWindow[numaID] = numaWindow
+		}
+
+		reportResult := numaWindow.GetWindowedResources(ret)
+		if reportResult == nil {
+			klog.Infof("numa %d result is not ready", numaID)
+			numaResultReady = false
+			continue
+		}
+
+		reportResult.Sub(reservedResourceForReportPerNUMA)
+		if reportResult.Cmp(minReclaimedResourceForReportPerNUMA) < 0 {
+			reportResult = &minReclaimedResourceForReportPerNUMA
+		}
+		reportNUMAResult[numaID] = reportResult
+		numaSum += float64(reportResult.Value())
+	}
+
+	if !numaResultReady {
+		return
+	}
+
 	reportResult.Sub(reclaimOptions.ReservedResourceForReport)
 	if reportResult.Cmp(reclaimOptions.MinReclaimedResourceForReport) < 0 {
 		reportResult = &reclaimOptions.MinReclaimedResourceForReport
@@ -189,12 +249,13 @@ func (m *GenericHeadroomManager) sync(_ context.Context) {
 		reportResult.String(), reclaimOptions.ReservedResourceForReport.String())
 
 	m.setLastReportResult(*reportResult)
+
 	// set latest numa report result
-	diffRatio := float64(reportResult.Value()) / float64(originValue)
-	for numaID, res := range numaResult {
+	diffRatio := float64(reportResult.Value()) / numaSum
+	for numaID, res := range reportNUMAResult {
 		res.Set(int64(float64(res.Value()) * diffRatio))
-		result := m.reportResultTransformer(res)
-		m.lastNumaReportResult[numaID] = result
+		result := m.reportResultTransformer(*res)
+		m.lastNUMAReportResult[numaID] = result
 		klog.Infof("%s headroom manager for NUMA: %d, headroom: %d", m.resourceName, numaID, result.Value())
 	}
 }
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic_test.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic_test.go
index 68247271c..8467cb25a 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic_test.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic_test.go
@@ -73,9 +73,10 @@ func TestNewGenericHeadroomManager(t *testing.T) {
 		tt := tt
 		t.Run(tt.name, func(t *testing.T) {
 			t.Parallel()
-			NewGenericHeadroomManager(tt.args.name, tt.args.useMilliValue, tt.args.reportMillValue,
+			mgr := NewGenericHeadroomManager(tt.args.name, tt.args.useMilliValue, tt.args.reportMillValue,
 				tt.args.syncPeriod, tt.args.headroomAdvisor, tt.args.emitter, tt.args.slidingWindowOptions,
 				tt.args.getReclaimOptionsFunc)
+			mgr.newSlidingWindow()
 		})
 	}
 }
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go
index b88fd0de2..290413a00 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/advisor.go
@@ -599,32 +599,7 @@ func (cra *cpuResourceAdvisor) assembleProvision() (types.InternalCPUCalculation
 		return types.InternalCPUCalculationResult{}, fmt.Errorf("no legal provision assembler")
 	}
 
-	calculationResult, err := cra.provisionAssembler.AssembleProvision()
-
-	/*
-		_, headroom, err := cra.headroomAssembler.GetHeadroom()
-		if err != nil {
-			return types.InternalCPUCalculationResult{}, fmt.Errorf("get numa headroom failed: %v", err)
-		}
-
-		numaHeadroom := make(map[int]float64)
-		for numaID, res := range headroom {
-			numaHeadroom[numaID] = float64(res.Value())
-		}
-		data, err := json.Marshal(numaHeadroom)
-		if err != nil {
-			return types.InternalCPUCalculationResult{}, fmt.Errorf("marshal numa headroom failed: %v", err)
-		}
-		extra := types.ExtraCPUAdvices{
-			CgroupPath: cra.conf.ReclaimRelativeRootCgroupPath,
-			Values: map[string]string{
-				string(cpuadvisor.ControlKnobKeyCPUNUMAHeadroom): string(data),
-			},
-		}
-		calculationResult.ExtraEntries = append(calculationResult.ExtraEntries, extra)
-	*/
-
-	return calculationResult, err
+	return cra.provisionAssembler.AssembleProvision()
 }
 
 func (cra *cpuResourceAdvisor) emitMetrics(calculationResult types.InternalCPUCalculationResult) {
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go
index b8388332c..0735bebb4 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/advisor.go
@@ -175,46 +175,6 @@ func (ra *memoryResourceAdvisor) sendAdvices() error {
 		result.ExtraEntries = append(result.ExtraEntries, advices.ExtraEntries...)
 	}
 
-	/*
-		var numaHeadroom map[int]resource.Quantity
-		var err error
-		for _, headroomPolicy := range ra.headroomPolices {
-			_, numaHeadroom, err = headroomPolicy.GetHeadroom()
-			if err != nil {
-				klog.ErrorS(err, "get headroom failed", "headroomPolicy", headroomPolicy.Name())
-				_ = ra.emitter.StoreInt64(metricNameMemoryGetHeadroomFailed, 1, metrics.MetricTypeNameRaw,
-					metrics.MetricTag{Key: metricTagKeyPolicyName, Val: string(headroomPolicy.Name())})
-				continue
-			}
-
-			break
-		}
-
-		if numaHeadroom == nil {
-			klog.Errorf("can NOT get numa headroom")
-			return fmt.Errorf("can NOT get numa headroom")
-		}
-
-		headroom := make(map[int]float64)
-		for numaID, res := range numaHeadroom {
-			headroom[numaID] = float64(res.Value())
-		}
-
-		data, err := json.Marshal(headroom)
-		if err != nil {
-			klog.Errorf("marshal numa headroom failed: %s", err)
-			return fmt.Errorf("marshal numa headroom failed: %s", err)
-
-		}
-		extra := types.ExtraMemoryAdvices{
-			CgroupPath: ra.conf.ReclaimRelativeRootCgroupPath,
-			Values: map[string]string{
-				string(memoryadvisor.ControlKnobKeyMemoryNUMAHeadroom): string(data),
-			},
-		}
-		result.ExtraEntries = append(result.ExtraEntries, extra)
-	*/
-
 	select {
 	case ra.sendChan <- result:
 		general.Infof("notify memory server: %+v", result)

From c788feec5f5c26c5d4164b7934d03f81ae2d055b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Mon, 28 Oct 2024 11:27:38 +0800
Subject: [PATCH 4/9] feat(sysadvisor): assemble headroom

Change-Id: Icd250c6a60d998bb9454e236baaa13c9f53c68ab
---
 .../plugin/qosaware/server/cpu_server.go    | 44 +++++++++++--------
 .../plugin/qosaware/server/memory_server.go | 44 +++++++++++--------
 2 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go
index b4d72089c..acec77266 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/server/cpu_server.go
@@ -145,26 +145,22 @@ func (cs *cpuServer) ListAndWatch(_ *advisorsvc.Empty, server cpuadvisor.CPUAdvi
 	}
 }
 
-func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult) (*cpuadvisor.ListAndWatchResponse, error) {
-	calculationEntriesMap := make(map[string]*cpuadvisor.CalculationEntries)
-	blockID2Blocks := NewBlockSet()
-	cs.assemblePoolEntries(result, calculationEntriesMap, blockID2Blocks)
-
-	// assmble per-numa headroom
-	var data []byte
+// assemble per-numa headroom
+func (cs *cpuServer) assembleHeadroom() *advisorsvc.CalculationInfo {
 	numaAllocatable, err := cs.headroomResourceManager.GetNumaAllocatable()
 	if err != nil {
-		// ignore get allocatable failed
 		klog.Errorf("get numa allocatable failed: %v", err)
-	} else {
-		numaHeadroom := make(map[int]float64)
-		for numaID, res := range numaAllocatable {
-			numaHeadroom[numaID] = float64(res.Value()) / 1000.0
-		}
-		data, err = json.Marshal(numaHeadroom)
-		if err != nil {
-			klog.Errorf("get numa allocatable failed: %v", err)
-		}
+		return nil
+	}
+
+	numaHeadroom := make(map[int]float64)
+	for numaID, res := range numaAllocatable {
+		numaHeadroom[numaID] = float64(res.Value()) / 1000.0
+	}
+	data, err := json.Marshal(numaHeadroom)
+	if err != nil {
+		klog.Errorf("marshal headroom failed: %v", err)
+		return nil
 	}
 
 	calculationResult := &advisorsvc.CalculationResult{
@@ -172,10 +168,17 @@ func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult
 			string(cpuadvisor.ControlKnobKeyCPUNUMAHeadroom): string(data),
 		},
 	}
-	extraNumaHeadRoom := &advisorsvc.CalculationInfo{
+
+	return &advisorsvc.CalculationInfo{
 		CgroupPath:        "",
 		CalculationResult: calculationResult,
 	}
+}
+
+func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult) (*cpuadvisor.ListAndWatchResponse, error) {
+	calculationEntriesMap := make(map[string]*cpuadvisor.CalculationEntries)
+	blockID2Blocks := NewBlockSet()
+	cs.assemblePoolEntries(result, calculationEntriesMap, blockID2Blocks)
 
 	// Assemble pod entries
 	f := func(podUID string, containerName string, ci *types.ContainerInfo) bool {
@@ -215,7 +218,10 @@ func (cs *cpuServer) assembleResponse(result *types.InternalCPUCalculationResult
 		}
 	}
 
-	resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom)
+	extraNumaHeadRoom := cs.assembleHeadroom()
+	if extraNumaHeadRoom != nil {
+		resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom)
+	}
 
 	return resp, nil
 }
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go
index 94954651e..770fa5fbc 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/server/memory_server.go
@@ -200,26 +200,23 @@ func (ms *memoryServer) ListAndWatch(_ *advisorsvc.Empty, server advisorsvc.Advi
 	}
 }
 
-func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculationResult) *advisorsvc.ListAndWatchResponse {
-	if result == nil {
-		return nil
-	}
-
-	// assmble per-numa headroom
+// assemble per-numa headroom
+func (ms *memoryServer) assembleHeadroom() *advisorsvc.CalculationInfo {
 	var data []byte
 	numaAllocatable, err := ms.headroomResourceManager.GetNumaAllocatable()
 	if err != nil {
-		// ignore get allocatable failed
 		general.ErrorS(err, "get numa allocatable failed")
-	} else {
-		numaHeadroom := make(memoryadvisor.MemoryNUMAHeadroom)
-		for numaID, res := range numaAllocatable {
-			numaHeadroom[numaID] = res.Value()
-		}
-		data, err = json.Marshal(numaHeadroom)
-		if err != nil {
-			general.ErrorS(err, "marshal numa headroom failed")
-		}
+		return nil
+	}
+
+	numaHeadroom := make(memoryadvisor.MemoryNUMAHeadroom)
+	for numaID, res := range numaAllocatable {
+		numaHeadroom[numaID] = res.Value()
+	}
+	data, err = json.Marshal(numaHeadroom)
+	if err != nil {
+		general.ErrorS(err, "marshal numa headroom failed")
+		return nil
 	}
 
 	calculationResult := &advisorsvc.CalculationResult{
@@ -227,10 +224,17 @@ func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculation
 			string(memoryadvisor.ControlKnobKeyMemoryNUMAHeadroom): string(data),
 		},
 	}
-	extraNumaHeadRoom := &advisorsvc.CalculationInfo{
+
+	return &advisorsvc.CalculationInfo{
 		CgroupPath:        "",
 		CalculationResult: calculationResult,
 	}
+}
+
+func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculationResult) *advisorsvc.ListAndWatchResponse {
+	if result == nil {
+		return nil
+	}
 
 	resp := advisorsvc.ListAndWatchResponse{
 		PodEntries: make(map[string]*advisorsvc.CalculationEntries),
@@ -281,6 +285,10 @@ func (ms *memoryServer) assembleResponse(result *types.InternalMemoryCalculation
 		}
 	}
 
-	resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom)
+	extraNumaHeadRoom := ms.assembleHeadroom()
+	if extraNumaHeadRoom != nil {
+		resp.ExtraEntries = append(resp.ExtraEntries, extraNumaHeadRoom)
+	}
+
 	return &resp
 }
From 1c55bb76131a6b5bde6649f1f9589fc86f381b5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Mon, 28 Oct 2024 17:38:31 +0800
Subject: [PATCH 5/9] feat(sysadvisor): eliminate GetHeadroom API of resource advisor

Change-Id: I561b42cba90995a64895b75268076ca03611ca5d
---
 .../reporter/manager/resource/generic.go | 20 ++++++++++++++++++-
 .../plugin/qosaware/resource/resource.go |  3 ---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
index 9c4dd59b5..a3f1f40b2 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
@@ -28,6 +28,7 @@ import (
 	"k8s.io/klog/v2"
 
 	hmadvisor "github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/plugin/qosaware/resource"
+	"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types"
 	"github.com/kubewharf/katalyst-core/pkg/metrics"
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
 )
@@ -192,7 +193,24 @@ func (m *GenericHeadroomManager) sync(_ context.Context) {
 		return
 	}
 
-	originResultFromAdvisor, numaResult, err := m.headroomAdvisor.GetHeadroom(m.resourceName)
+	var resourceName types.QoSResourceName
+	switch m.resourceName {
+	case v1.ResourceCPU:
+		resourceName = types.QoSResourceCPU
+	case v1.ResourceMemory:
+		resourceName = types.QoSResourceMemory
+	default:
+		klog.Errorf("resource %v NOT support to get headroom", m.resourceName)
+		return
+	}
+
+	subAdvisor, err := m.headroomAdvisor.GetSubAdvisor(resourceName)
+	if err != nil {
+		klog.Errorf("get SubAdvisor with resource %v failed: %v", resourceName, err)
+		return
+	}
+
+	originResultFromAdvisor, numaResult, err := subAdvisor.GetHeadroom()
 	if err != nil {
 		klog.Errorf("get origin result %s from headroomAdvisor failed: %v", m.resourceName, err)
 		return
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go
index 1a19ec32e..8cab9698c 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/resource.go
@@ -40,9 +40,6 @@ type ResourceAdvisor interface {
 
 	// GetSubAdvisor returns the corresponding sub advisor according to resource name
 	GetSubAdvisor(resourceName types.QoSResourceName) (SubResourceAdvisor, error)
-
-	// GetHeadroom returns the corresponding headroom quantity according to resource name
-	GetHeadroom(resourceName v1.ResourceName) (resource.Quantity, map[int]resource.Quantity, error)
 }
 
 // SubResourceAdvisor updates resource provision of a certain dimension based on the latest

From 0d614650aa8df9f7f66ef0e1a75e1c8cf2aa32b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Mon, 28 Oct 2024 19:44:37 +0800
Subject: [PATCH 6/9] feat(sysadvisor): get RNB NUMA topo

Change-Id: I287dc277f63289d657b1f2dafeb24094aab1ba16
---
 .../headroomassembler/assembler_common.go |  1 +
 .../assembler_common_util.go              | 37 +++++++++++++------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
index 7cf3ac8eb..a1a56d2e8 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
@@ -196,6 +196,7 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
 	if err != nil {
 		return resource.Quantity{}, nil, err
 	}
+	general.Infof("RNB NUMA topo: %v, %v", bindingNUMAs, nonBindingNumas)
 
 	numaHeadroom := make(map[int]resource.Quantity, ha.metaServer.NumNUMANodes)
 	totalHeadroom := resource.Quantity{}
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go
index 10c2b0ceb..3f1fa6a05 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common_util.go
@@ -22,6 +22,7 @@ import (
 	"math"
 	"strconv"
 
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/klog/v2"
 
@@ -30,6 +31,11 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/agent/sysadvisor/types"
 	metaserverHelper "github.com/kubewharf/katalyst-core/pkg/metaserver/agent/metric/helper"
 	"github.com/kubewharf/katalyst-core/pkg/util/general"
+	"github.com/kubewharf/katalyst-core/pkg/util/native"
+)
+
+const (
+	FakedNUMAID = "-1"
 )
 
 func (ha *HeadroomAssemblerCommon) getUtilBasedHeadroom(options helper.UtilBasedCapacityOptions,
@@ -94,30 +100,37 @@ func (ha *HeadroomAssemblerCommon) getReclaimNUMABindingTopo(reclaimPool *types.
 		numaMap[numaID] = false
 	}
 
-	f := func(podUID string, containerName string, ci *types.ContainerInfo) bool {
-		if ci == nil {
-			return true
+	pods, e := ha.metaServer.GetPodList(context.TODO(), func(pod *v1.Pod) bool {
+		if !native.PodIsActive(pod) {
+			return false
 		}
-		if ci.QoSLevel != consts.PodAnnotationQoSLevelReclaimedCores {
-			return true
+		if ok, err := ha.conf.CheckReclaimedQoSForPod(pod); err != nil {
+			klog.Errorf("filter pod %v err: %v", pod.Name, err)
+			return false
+		} else {
+			return ok
 		}
+	})
+	if e != nil {
+		err = fmt.Errorf("get pod list failed: %v", e)
+		return
+	}
 
-		numaRet, ok := ci.Annotations[consts.PodAnnotationNUMABindResultKey]
-		if !ok || numaRet == "-1" {
-			return true
+	for _, pod := range pods {
+		numaRet, ok := pod.Annotations[consts.PodAnnotationNUMABindResultKey]
+		if !ok || numaRet == FakedNUMAID {
+			continue
 		}
 
 		numaID, err := strconv.Atoi(numaRet)
 		if err != nil {
-			klog.Errorf("invalid numa binding result: %s, %v\n", numaRet, err)
-			return true
+			klog.Errorf("invalid numa binding result: %s, %s, %v\n", pod.Name, numaRet, err)
+			continue
 		}
 
 		numaMap[numaID] = true
-		return true
 	}
-	ha.metaReader.RangeContainer(f)
 
 	for numaID, bound := range numaMap {
 		if bound {

From cad32a465c840c18ec93cb78afb921a93b321c7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Tue, 29 Oct 2024 16:27:57 +0800
Subject: [PATCH 7/9] feat(sysadvisor): enrich debug info

Change-Id: I09e022cc70d24cda28420b61b5f2ee077ffb67c5
---
 .../cpu/assembler/headroomassembler/assembler_common.go | 4 +++-
 .../cpu/region/headroompolicy/policy_numa_exclusive.go  | 4 ++++
 .../plugin/qosaware/resource/cpu/region/region_base.go  | 2 +-
 .../resource/memory/headroompolicy/policy_numa_aware.go | 8 +++++---
 4 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
index a1a56d2e8..718256647 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
@@ -75,6 +75,7 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, map[int]res
 
 	// return zero when reclaim is disabled
 	if !dynamicConfig.EnableReclaim {
+		general.Infof("reclaim is NOT enabled")
 		return *resource.NewQuantity(0, resource.DecimalSI), nil, nil
 	}
 
@@ -91,7 +92,7 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, map[int]res
 		if r.Type() == configapi.QoSRegionTypeDedicatedNumaExclusive {
 			regionInfo, ok := ha.metaReader.GetRegionInfo(r.Name())
 			if !ok || regionInfo == nil || regionInfo.Headroom < 0 {
-				return resource.Quantity{}, nil, fmt.Errorf("failed to get headroom for %v", r.Name())
+				return resource.Quantity{}, nil, fmt.Errorf("failed to get headroom for %v, %#v", r.Name(), regionInfo)
 			}
 			if regionInfo.RegionStatus.BoundType == types.BoundUpper && r.EnableReclaim() {
 				general.Infof("region %v is in status of upper bound", regionInfo.RegionName)
@@ -194,6 +195,7 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
 
 	bindingNUMAs, nonBindingNumas, err := ha.getReclaimNUMABindingTopo(reclaimPoolInfo)
 	if err != nil {
+		general.Errorf("getReclaimNUMABindingTopo failed: %v", err)
 		return resource.Quantity{}, nil, err
 	}
 	general.Infof("RNB NUMA topo: %v, %v", bindingNUMAs, nonBindingNumas)
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/headroompolicy/policy_numa_exclusive.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/headroompolicy/policy_numa_exclusive.go
index 8922141d8..13152cecf 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/headroompolicy/policy_numa_exclusive.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/headroompolicy/policy_numa_exclusive.go
@@ -31,6 +31,7 @@ import (
 	"github.com/kubewharf/katalyst-core/pkg/metaserver"
 	"github.com/kubewharf/katalyst-core/pkg/metaserver/spd"
 	"github.com/kubewharf/katalyst-core/pkg/metrics"
+	"github.com/kubewharf/katalyst-core/pkg/util/general"
 	"github.com/kubewharf/katalyst-core/pkg/util/machine"
 )
 
@@ -79,6 +80,7 @@ func (p *PolicyNUMAExclusive) Update() error {
 	}
 	enableReclaim, err := helper.PodEnableReclaim(context.Background(), p.metaServer, podUID, p.EnableReclaim)
 	if err != nil {
+		general.Errorf("check pod reclaim status failed: %v, %v", podUID, err)
 		return err
 	}
 	if !enableReclaim {
@@ -89,6 +91,7 @@ func (p *PolicyNUMAExclusive) Update() error {
 	for _, ci := range containers {
 		containerEstimation, err := helper.EstimateContainerCPUUsage(ci, p.metaReader, enableReclaim)
 		if err != nil {
+			general.Errorf("EstimateContainerCPUUsage failed: %v, %v", ci.PodName, err)
 			return err
 		}
 
@@ -117,6 +120,7 @@ func (p *PolicyNUMAExclusive) Update() error {
 	originHeadroom := math.Max(p.ResourceUpperBound-cpuEstimation+p.ReservedForReclaim, 0)
 	score, err := helper.PodPerformanceScore(context.Background(), p.metaServer, podUID)
 	if err != nil {
+		general.Errorf("get pps failed: %v, %v", podUID, err)
 		return err
 	}
 	p.headroom = originHeadroom * (score - spd.MinPerformanceScore) / (spd.MaxPerformanceScore - spd.MinPerformanceScore)
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/region_base.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/region_base.go
index 110862c3c..91ecc0465 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/region_base.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/region/region_base.go
@@ -358,7 +358,7 @@ func (r *QoSRegionBase) TryUpdateHeadroom() {
 
 		// run an episode of policy and calculator update
 		if err := internal.policy.Update(); err != nil {
-			klog.Errorf("[qosaware-cpu] update policy %v failed: %v", internal.name, err)
+			klog.Errorf("[qosaware-cpu] region %s update policy %v failed: %v", r.name, internal.name, err)
 			continue
 		}
 		internal.updateStatus = types.PolicyUpdateSucceeded
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
index c1738039f..46cf30c7d 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
@@ -86,6 +86,7 @@ func (p *PolicyNUMAAware) Update() (err error) {
 
 	availNUMAs, reclaimedCoresContainers, err := helper.GetAvailableNUMAsAndReclaimedCores(p.conf, p.metaReader, p.metaServer)
 	if err != nil {
+		general.Errorf("GetAvailableNUMAsAndReclaimedCores failed: %v", err)
 		return err
 	}
 
@@ -93,20 +94,21 @@ func (p *PolicyNUMAAware) Update() (err error) {
 	for _, numaID := range availNUMAs.ToSliceInt() {
 		data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemFreeNuma)
 		if err != nil {
-			general.Errorf("Can not get numa memory free, numaID: %v", numaID)
+			general.Errorf("Can not get numa memory free, numaID: %v, %v", numaID, err)
 			return err
 		}
 		free := data.Value
 
 		data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemInactiveFileNuma)
 		if err != nil {
+			general.Errorf("Can not get numa memory inactiveFile, numaID: %v, %v", numaID, err)
 			return err
 		}
 		inactiveFile := data.Value
 
 		data, err = p.metaServer.GetNumaMetric(numaID, consts.MetricMemTotalNuma)
 		if err != nil {
-			general.ErrorS(err, "Can not get numa memory total", "numaID", numaID)
+			general.Errorf("Can not get numa memory total, numaID: %v, %v", numaID, err)
 			return err
 		}
 		total := data.Value
@@ -137,7 +139,7 @@ func (p *PolicyNUMAAware) Update() (err error) {
 
 	watermarkScaleFactor, err := p.metaServer.GetNodeMetric(consts.MetricMemScaleFactorSystem)
 	if err != nil {
-		general.InfoS("Can not get system watermark scale factor")
+		general.Infof("Can not get system watermark scale factor: %v", err)
 		return err
 	}

From 6019e02908cf3683e30b81eeb507f53a71a047e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Tue, 29 Oct 2024 20:22:35 +0800
Subject: [PATCH 8/9] fix(sysadvisor): fix sliding windows
---
 .../reporter/manager/resource/generic.go | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
index a3f1f40b2..3f7cab08f 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/reporter/manager/resource/generic.go
@@ -217,10 +217,6 @@ func (m *GenericHeadroomManager) sync(_ context.Context) {
 	}
 
 	reportResult := m.reportSlidingWindow.GetWindowedResources(originResultFromAdvisor)
-	if reportResult == nil {
-		klog.Infof("skip update reclaimed resource %s without enough valid sample", m.resourceName)
-		return
-	}
 
 	reportNUMAResult := make(map[int]*resource.Quantity)
 	numaResultReady := true
@@ -238,22 +234,23 @@ func (m *GenericHeadroomManager) sync(_ context.Context) {
 			m.reportNUMASlidingWindow[numaID] = numaWindow
 		}
 
-		reportResult := numaWindow.GetWindowedResources(ret)
-		if reportResult == nil {
+		result := numaWindow.GetWindowedResources(ret)
+		if result == nil {
 			klog.Infof("numa %d result is not ready", numaID)
 			numaResultReady = false
 			continue
 		}
 
-		reportResult.Sub(reservedResourceForReportPerNUMA)
-		if reportResult.Cmp(minReclaimedResourceForReportPerNUMA) < 0 {
-			reportResult = &minReclaimedResourceForReportPerNUMA
+		result.Sub(reservedResourceForReportPerNUMA)
+		if result.Cmp(minReclaimedResourceForReportPerNUMA) < 0 {
+			result = &minReclaimedResourceForReportPerNUMA
 		}
-		reportNUMAResult[numaID] = reportResult
-		numaSum += float64(reportResult.Value())
+		reportNUMAResult[numaID] = result
+		numaSum += float64(result.Value())
 	}
 
-	if !numaResultReady {
+	if reportResult == nil || !numaResultReady {
+		klog.Infof("skip update reclaimed resource %s without enough valid sample: %v", m.resourceName, numaResultReady)
 		return
 	}
 

From 044320179255564be10b04aa7b8dbd91af3c136a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=B9=E4=BF=8A?=
Date: Tue, 29 Oct 2024 20:48:18 +0800
Subject: [PATCH 9/9] fix(sysadvisor): ensure headroom exists in each numa
---
 .../headroomassembler/assembler_common.go | 26 ++++++++++++++++---
 .../headroompolicy/policy_numa_aware.go   |  8 ++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
index 718256647..38a0abe4a 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/cpu/assembler/headroomassembler/assembler_common.go
@@ -103,11 +103,13 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, map[int]res
 
 			// divide headroom evenly to each numa
 			bindingNUMAs := r.GetBindingNumas()
+			perNumaHeadroom := 0.0
 			if regionInfo.Headroom > 0 && bindingNUMAs.Size() > 0 {
-				perNumaHeadroom := regionInfo.Headroom / float64(bindingNUMAs.Size())
-				for _, numaID := range bindingNUMAs.ToSliceInt() {
-					headroomNuma[numaID] += perNumaHeadroom
-				}
+				perNumaHeadroom = regionInfo.Headroom / float64(bindingNUMAs.Size())
+			}
+			// set headroom even if it is zero
+			for _, numaID := range bindingNUMAs.ToSliceInt() {
+				headroomNuma[numaID] += perNumaHeadroom
 			}
 			klog.InfoS("dedicated_cores NUMA headroom", "headroom", regionInfo.Headroom, "NUMAs", r.GetBindingNumas().String())
 
@@ -173,6 +175,14 @@ func (ha *HeadroomAssemblerCommon) GetHeadroom() (resource.Quantity, map[int]res
 	for numaID, headroom := range headroomNuma {
 		headroomNumaRet[numaID] = *resource.NewMilliQuantity(int64(headroom*1000), resource.DecimalSI)
 	}
+
+	allNUMAs := ha.metaServer.CPUDetails.NUMANodes()
+	for _, numaID := range allNUMAs.ToSliceInt() {
+		if _, ok := headroomNumaRet[numaID]; !ok {
+			general.InfoS("set non-reclaim NUMA cpu headroom as empty", "NUMA-ID", numaID)
+			headroomNumaRet[numaID] = *resource.NewQuantity(0, resource.BinarySI)
+		}
+	}
 
 	return *resource.NewQuantity(int64(headroomTotal), resource.DecimalSI), headroomNumaRet, nil
 }
@@ -265,5 +275,13 @@ func (ha *HeadroomAssemblerCommon) getHeadroomByUtil() (resource.Quantity, map[i
 	for numaID, headroom := range numaHeadroom {
 		general.InfoS("[qosaware-cpu] NUMA headroom by utilization", "NUMA-ID", numaID, "headroom", headroom.Value())
 	}
+
+	allNUMAs := ha.metaServer.CPUDetails.NUMANodes()
+	for _, numaID := range allNUMAs.ToSliceInt() {
+		if _, ok := numaHeadroom[numaID]; !ok {
+			general.InfoS("set non-reclaim NUMA cpu headroom as empty", "NUMA-ID", numaID)
+			numaHeadroom[numaID] = *resource.NewQuantity(0, resource.BinarySI)
+		}
+	}
 
 	return totalHeadroom, numaHeadroom, nil
 }
diff --git a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
index 46cf30c7d..a2bcb043b 100644
--- a/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
+++ b/pkg/agent/sysadvisor/plugin/qosaware/resource/memory/headroompolicy/policy_numa_aware.go
@@ -159,6 +159,14 @@ func (p *PolicyNUMAAware) Update() (err error) {
 		general.InfoS("memory reclaimable per NUMA", "NUMA-ID", numaID, "headroom", numaReclaimableMemory[numaID])
 	}
 
+	allNUMAs := p.metaServer.CPUDetails.NUMANodes()
+	for _, numaID := range allNUMAs.ToSliceInt() {
+		if _, ok := p.numaMemoryHeadroom[numaID]; !ok {
+			general.InfoS("set non-reclaim NUMA memory reclaimable as empty", "NUMA-ID", numaID)
+			p.numaMemoryHeadroom[numaID] = *resource.NewQuantity(0, resource.BinarySI)
+		}
+	}
+
 	general.InfoS("total memory reclaimable",
 		"reclaimableMemory", general.FormatMemoryQuantity(reclaimableMemory),
 		"ResourceUpperBound", general.FormatMemoryQuantity(p.essentials.ResourceUpperBound),