Skip to content

Commit

Permalink
Release cce-network-v2/2.12.11
Browse files Browse the repository at this point in the history
  • Loading branch information
gola committed Dec 27, 2024
1 parent ec72bf4 commit 2690dc0
Show file tree
Hide file tree
Showing 14 changed files with 262 additions and 64 deletions.
2 changes: 1 addition & 1 deletion cce-network-v2/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.12.10
2.12.11
Binary file modified cce-network-v2/deploy/cce-network-v2-2.12.tar.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions cce-network-v2/deploy/cce-network-v2/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 2.12.10
version: 2.12.11

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "2.12.10"
appVersion: "2.12.11"
9 changes: 9 additions & 0 deletions cce-network-v2/docs/release.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ v2 版本新架构,支持VPC-ENI 辅助IP和vpc路由。版本发布历史如
2. 增加 eni 安全组同步功能, 保持CCE ENI 和节点安全组同步。
3. 增加节点网络配置集功能 NetResourceConfigSet,支持指定节点独立配置网络资源。

#### 2.12.11 [20241227]
1. [Bug] 修复 VPC-ENI 模式下,弹性网卡预挂载 eni-pre-allocate-num 配置不生效的问题
2. [Bug] 修改开启 RDMA 场景下 RDMA ENI 对象本地缓存过期状态相关逻辑,解决因 resync nrs timeout 而导致的新增 RDMA 节点初始化慢,大规模集群扩容速度慢的问题
3. [Optimize] 修改开启 RDMA 场景下 RDMA NetResourceSet 对象拼装规则,以及 ENI 对象的 LabelSelectorValue 的拼装规则,防止 RDMA NetResourceSet 名字超过限定值 253,防止 ENI 对象的 LabelSelectorValue 超过限定值 63,解决因 Node Name 超长而导致的 cce-network-agent panic 问题
4. [Optimize] 修改 RDMA ENI 对象更新逻辑,解决 NodeName 变更时 ENI 对象未正常销毁,导致 RDMA ENI 对象无法被更新、节点无法就绪的问题
5. [Optimize] 优化开启 RDMA 模式时的 RDMA ENI 状态机处理逻辑,支持非终态 RDMA ENI 的处理流程,避免非终态 RDMA ENI 导致节点卡在 NotReady 状态无法恢复的问题
6. [Optimize] 优化开启 RDMA 模式时,对 HPC OpenAPI 的请求逻辑,大幅降低请求频率,降低大规模集群下的 OpenAPI 请求压力
7. [Bug] 修复创建 Node Interface 对象时因未判断初始值而出现 Instance is out of interfaces,导致节点就绪慢的问题

#### 2.12.10 [20241213]
1. [Optimize] 优化 VPC-ENI 模式下的 veth 单机路由规则目的地址存在冲突的判断条件,解决残留本地路由规则时创建的 Pod 容器网络不通的问题
2. [Optimize] 禁用 VPC-ENI 模式下的 IPv6 DHCP 时使用的网卡名修改为udev生成的原始名字,避免生成的网卡配置文件导致虚拟机重启时 network.service 服务启动失败
Expand Down
15 changes: 13 additions & 2 deletions cce-network-v2/pkg/bce/bcesync/eni.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ func (es *VPCENISyncerRouter) StartENISyncer(ctx context.Context, updater syncer
// Create implements syncer.ENIEventHandler
func (es *VPCENISyncerRouter) Create(resource *ccev2.ENI) error {
types := resource.Spec.Type
if types == ccev2.ENIForBBC || types == ccev2.ENIForHPC || types == ccev2.ENIForERI {
// The former early return for ccev2.ENIForHPC and ccev2.ENIForERI was removed here,
// because RDMA ENIs that already have RDMA IPs must also be handled.
if types == ccev2.ENIForBBC {
return nil
}

Expand All @@ -110,7 +112,9 @@ func (es *VPCENISyncerRouter) ResyncENI(ctx context.Context) time.Duration {
// Update implements syncer.ENIEventHandler
func (es *VPCENISyncerRouter) Update(resource *ccev2.ENI) error {
types := resource.Spec.Type
if types == ccev2.ENIForBBC || types == ccev2.ENIForHPC || types == ccev2.ENIForERI {
// The former early return for ccev2.ENIForHPC and ccev2.ENIForERI was removed here,
// because RDMA ENIs that already have RDMA IPs must also be handled.
if types == ccev2.ENIForBBC {
return nil
}

Expand Down Expand Up @@ -326,6 +330,13 @@ type eniStateMachine struct {

// Start state machine flow
func (esm *eniStateMachine) start() error {
// ENIs for RDMA (ccev2.ENIForHPC or ccev2.ENIForERI) need no processing here, so return nil directly.
// esm.es.remoteSyncer.statENI(esm.ctx, esm.resource.Name) cannot stat an RDMA ENI; it returns an
// error like "[Code: EniNotFoundException; Message: eni:eni-tzjatpp7gbh6 resource not exist;
// RequestId: 148ca1d1-174f-494a-8192-5bae2a3bf0c7]". So we need to check the ENI type first.
if esm.resource.Spec.Type == ccev2.ENIForHPC || esm.resource.Spec.Type == ccev2.ENIForERI {
return nil
}
var err error
if esm.resource.Status.VPCStatus == ccev2.VPCENIStatusInuse {
if len(esm.resource.Spec.PrivateIPSet) == 0 {
Expand Down
11 changes: 8 additions & 3 deletions cce-network-v2/pkg/bce/rdma/rdma_eni.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ func (m *rdmaInstancesManager) ForeachInstance(instanceID, nodeName string, fn i
return fmt.Errorf("list ENIs failed: %w", err)
}
for i := 0; i < len(enis); i++ {
if enis[i].DeletionTimestamp != nil || enis[i].Status.VPCStatus == ccev2.VPCENIStatusDeleted {
eni := enis[i]
vpcStatus := eni.Status.VPCStatus
// Do not process the RDMA ENI which is being deleted or in attaching/detaching status.
// It is not useful to process it, because the IPs are not assigned to the RDMA ENI.
if enis[i].DeletionTimestamp != nil || vpcStatus == ccev2.VPCENIStatusAttaching ||
vpcStatus == ccev2.VPCENIStatusDetaching || vpcStatus == ccev2.VPCENIStatusDeleted {
continue
}
fn(instanceID, enis[i].Spec.ENI.ID, ipamTypes.InterfaceRevision{
Expand All @@ -87,8 +92,9 @@ func (m *rdmaInstancesManager) ForeachInstance(instanceID, nodeName string, fn i
// waitForENISynced wait for eni synced
// this method should not lock the mutex of bceNode before calling
func (n *bceRDMANetResourceSet) waitForENISynced(ctx context.Context) {
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()

wait.PollImmediateUntilWithContext(ctx, 200*time.Millisecond, func(ctx context.Context) (done bool, err error) {
haveSynced := true
n.manager.ForeachInstance(n.instanceID, n.k8sObj.Name,
Expand All @@ -111,5 +117,4 @@ func (n *bceRDMANetResourceSet) waitForENISynced(ctx context.Context) {
})
return haveSynced, nil
})

}
2 changes: 2 additions & 0 deletions cce-network-v2/pkg/bce/rdma/rdma_super.go
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,8 @@ func (n *bceRDMANetResourceSet) updateENIWithPoll(ctx context.Context, eni *ccev
return false, fmt.Errorf("get eni %s failed: %v", eni.Name, ierr)
}
eni = eni.DeepCopy()
oldversion = eni.Spec.VPCVersion
eni.Spec.VPCVersion = eni.Spec.VPCVersion + 1
eni = refresh(eni)

// update eni
Expand Down
116 changes: 94 additions & 22 deletions cce-network-v2/pkg/bce/rdma/rdma_wrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ type rdmaNetResourceSetWrapper struct {
*bceRDMANetResourceSet

// rdmaeni is the eni of the node
rdmaeniName string
rdmaENIName string
}

func newRdmaNetResourceSetWrapper(super *bceRDMANetResourceSet) *rdmaNetResourceSetWrapper {
Expand Down Expand Up @@ -86,42 +86,96 @@ func (n *rdmaNetResourceSetWrapper) findMatchedEniByMac(ctx context.Context, iaa
// ensureRdmaENI means create a eni object for rdma interface
// rdma interface has only one eni, so we use rdma interface id as eni name
func (n *rdmaNetResourceSetWrapper) ensureRdmaENI() (*ccev2.ENI, error) {
if n.rdmaeniName != "" {
eni, err := n.manager.eniLister.Get(n.rdmaeniName)
if n.rdmaENIName != "" {
eni, err := n.manager.eniLister.Get(n.rdmaENIName)
if errors.IsNotFound(err) {
goto forceGetFromIaaS
}
if err != nil {
return nil, fmt.Errorf("failed to get rdma eni %s from lister", n.rdmaeniName)
return nil, fmt.Errorf("failed to get rdma eni %s from lister", n.rdmaENIName)
}

isNeedUpdate := false
if eni.Status.VPCStatus != ccev2.VPCENIStatusInuse {
_, err = k8s.CCEClient().CceV2().ENIs().UpdateStatus(context.TODO(), eni, metav1.UpdateOptions{})
isNeedUpdate = true
}
if bceutils.IsCCERdmaNetRourceSetName(eni.Labels[k8s.LabelNodeName]) &&
eni.Labels[k8s.LabelNodeName] != n.k8sObj.Name {
var ownerReferenceNodeName string
or := n.k8sObj.GetOwnerReferences()
for _, ref := range or {
if ref.Kind == "Node" {
ownerReferenceNodeName = ref.Name
break
}
}
labelSelectorValue := bceutils.GetLabelSelectorValueFromNetResourceSetName(n.k8sObj.Name,
ownerReferenceNodeName, n.k8sObj.Spec.InstanceID, eni.Spec.MacAddress, string(eni.Spec.Type))
eni.Labels[k8s.LabelNodeName] = labelSelectorValue
isNeedUpdate = true
}

if isNeedUpdate {
err = n.updateENIWithPoll(context.TODO(), eni, func(eni *ccev2.ENI) *ccev2.ENI {
// do nothing
return eni
})
if err != nil {
return nil, fmt.Errorf("failed to update %s ENI status: %w", n.rdmaeniName, err)
return nil, fmt.Errorf("failed to update %s ENI %s status: %w", eni.Spec.Type, eni.Name, err)
}
n.log.Infof("update %s ENI status successed", n.rdmaeniName)
n.log.Infof("update %s ENI %s status successed", eni.Spec.Type, eni.Name)
}
return eni, nil
}

forceGetFromIaaS:
// the hpc or eri api do not use vpcID, subnetID and zoneName
vpcID := n.k8sObj.Spec.ENI.VpcID
// the macAddress and vifFeatures is decided by the NetResourceSet's annotation
macAddress := n.bceRDMANetResourceSet.k8sObj.Annotations[k8s.AnnotationRDMAInfoMacAddress]
vifFeatures := n.bceRDMANetResourceSet.k8sObj.Annotations[k8s.AnnotationRDMAInfoVifFeatures]

iaasClient := n.manager.getIaaSClient(vifFeatures)
rdmaEni, err := n.findMatchedEniByMac(context.Background(), iaasClient, vpcID, n.instanceID, vifFeatures, macAddress)
var rdmaEniId string
requirement := metav1.LabelSelectorRequirement{
Key: k8s.LabelNodeName,
Operator: metav1.LabelSelectorOpIn,
Values: []string{n.k8sObj.Name},
}
labelSelector := &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{requirement},
}
// Select only the ENI of the local node
selector, err := metav1.LabelSelectorAsSelector(labelSelector)
if err != nil {
n.log.WithError(err).Errorf("failed to get instance %s eni", vifFeatures)
return nil, err
panic(fmt.Errorf("failed to create label selector: %v", err))
}
k8sRdmaEnis, err := n.manager.eniLister.List(selector)
if err != nil {
return nil, fmt.Errorf("failed to list enis: %w", err)
}
for _, rdmaEni := range k8sRdmaEnis {
// only one RDMA ENI per RDMA NetworkResourceSet
rdmaEniId = rdmaEni.Spec.ID
}
var rdmaEni *client.EniResult
iaasClient := n.manager.getIaaSClient(vifFeatures)
if rdmaEniId == "" {
rdmaEni, err = n.findMatchedEniByMac(context.Background(), iaasClient, vpcID, n.instanceID, vifFeatures, macAddress)
if err != nil {
n.log.WithError(err).Errorf("failed to get instance %s eni", vifFeatures)
return nil, err
}
n.log.WithField("rdmaeni", logfields.Repr(rdmaEni)).Debugf("get instance %s eni success", vifFeatures)
rdmaEniId = rdmaEni.Id
}
n.log.WithField("rdmaeni", logfields.Repr(rdmaEni)).Debugf("get instance %s eni success", vifFeatures)

eni, err := n.manager.eniLister.Get(rdmaEni.Id)
var ctx = context.Background()
eni, err := n.manager.eniLister.Get(rdmaEniId)
if errors.IsNotFound(err) {
// the hpc or eri do not use ensure subnet object

var (
ipv4IPSet, ipv6IPSet []*models.PrivateIP
ctx = context.Background()
)

for _, v := range rdmaEni.PrivateIpSet {
Expand Down Expand Up @@ -164,7 +218,7 @@ func (n *rdmaNetResourceSetWrapper) ensureRdmaENI() (*ccev2.ENI, error) {
InstanceID: n.instanceID,
PrivateIPSet: ipv4IPSet,
IPV6PrivateIPSet: ipv6IPSet,
MacAddress: rdmaEni.MacAddress,
MacAddress: macAddress,
},
},
Status: ccev2.ENIStatus{},
Expand All @@ -173,19 +227,28 @@ func (n *rdmaNetResourceSetWrapper) ensureRdmaENI() (*ccev2.ENI, error) {
if err != nil {
return nil, fmt.Errorf("failed to create %s ENI: %w", vifFeatures, err)
}
n.log.Infof("create %s ENI resource successed", vifFeatures)
n.log.Infof("create %s ENI %s resource successed", vifFeatures, eni.Name)
(&eni.Status).AppendVPCStatus(ccev2.VPCENIStatusInuse)
_, err = k8s.CCEClient().CceV2().ENIs().UpdateStatus(ctx, eni, metav1.UpdateOptions{})
if err != nil {
return nil, fmt.Errorf("failed to update %s ENI status: %w", vifFeatures, err)
return nil, fmt.Errorf("failed to update %s ENI %s status: %w", vifFeatures, eni.Name, err)
}
n.log.Infof("update %s ENI status successed", vifFeatures)
n.log.Infof("update %s ENI %s status successed", vifFeatures, eni.Name)
} else if err != nil {
n.log.Errorf("failed to get %s ENI resource: %v", vifFeatures, err)
n.log.Errorf("failed to get %s ENI %s resource: %v", vifFeatures, eni.Name, err)
return nil, err
} else {
err = n.updateENIWithPoll(ctx, eni, func(eni *ccev2.ENI) *ccev2.ENI {
// do nothing
return eni
})
if err != nil {
return nil, fmt.Errorf("failed to update %s ENI %s status: %w", vifFeatures, eni.Name, err)
}
n.log.Infof("update %s ENI %s status successed", vifFeatures, eni.Name)
}
n.log.Debugf("got %s ENI resource successed", vifFeatures)
n.rdmaeniName = eni.Name
n.log.Debugf("got %s ENI %s resource successed", vifFeatures, eni.Name)
n.rdmaENIName = eni.Name
return eni, err
}

Expand Down Expand Up @@ -217,7 +280,16 @@ func (n *rdmaNetResourceSetWrapper) refreshENIQuota(scopeLog *logrus.Entry) (Rdm
if client == nil {
scopeLog.Fatal("K8s client is nil")
}
nodeName := bceutils.GetNodeNameFromNetResourceSetName(n.k8sObj.Name)

var ownerReferenceNodeName string
or := n.k8sObj.GetOwnerReferences()
for _, ref := range or {
if ref.Kind == "Node" {
ownerReferenceNodeName = ref.Name
break
}
}
nodeName := bceutils.GetNodeNameFromNetResourceSetName(n.k8sObj.Name, ownerReferenceNodeName, n.k8sObj.InstanceID())
k8sNode, err := client.Informers.Core().V1().Nodes().Lister().Get(nodeName)
if err != nil {
return nil, fmt.Errorf("failed to get k8s node %s: %v", n.k8sObj.Name, err)
Expand Down
Loading

0 comments on commit 2690dc0

Please sign in to comment.