Skip to content

Commit

Permalink
Release cce-network-v2/2.9.1
Browse files Browse the repository at this point in the history
  • Loading branch information
gola committed Feb 9, 2024
1 parent 278ac7d commit aa26f07
Show file tree
Hide file tree
Showing 18 changed files with 181 additions and 54 deletions.
2 changes: 1 addition & 1 deletion cce-network-v2/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.9.0
2.9.1
4 changes: 2 additions & 2 deletions cce-network-v2/deploy/cce-network-v2/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 2.8.1
version: 2.9.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "2.8.1"
appVersion: "2.9.1"
10 changes: 7 additions & 3 deletions cce-network-v2/deploy/cce-network-v2/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,11 @@ extplugins: {}
# cce守护进程配置
ccedConfig:
# 开启debug模式(会多打日志)
debug: true
debug: false
# 在非k8s环境运行使用
# k8s-kubeconfig-path: /run/kubeconfig
k8s-client-burst: 10
k8s-client-qps: 5
k8s-client-burst: 15
k8s-client-qps: 10
k8s-api-discovery: false
leader-election-lease-duration: 60s
leader-election-renew-deadline: 30s
Expand Down Expand Up @@ -162,6 +162,10 @@ ccedConfig:
cce-cluster-id: ""
# vpc 资源同步周期
resource-resync-interval: 20s
# ipam 并发工作协程数
parallel-alloc-workers: 500
# 资源重新同步的并发协程数
resource-resync-workers: 64

# operator BCE VPC 配置
# vpc id
Expand Down
9 changes: 7 additions & 2 deletions cce-network-v2/docs/release.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ v2 版本新架构,支持VPC-ENI 辅助IP和vpc路由。版本发布历史如
2. CRD 字段变更: NetworkResourceSet 资源池增加了节点上 ENI 的异常状态,报错单机 IP 容量状态,整机 ENI 网卡状态。
3. 新特性: 支持ubuntu 22.04 操作系统,在容器网络环境下,定义 systemd-networkd 的 MacAddressPolicy 为 none。
4. 新特性:支持 pod 级 Qos

### 2.9.0
### 2.9.1
1. [optimize] 优化NetResourceManager在接收事件时处理的锁,消除事件处理过程中 6 分钟延迟
2. [optimize] 优化ENI状态机同步错误时,增加 3 次重试机会,消除因 ENI 状态延迟导致的 10 分钟就绪延迟
3. [bug] 修复 cce-network-agent 识别操作系统信息错误的问题
4. [bug] 修复 cce-network-agent pod 被删除后,小概率导致 operator 空指针退出问题
5. [bug] 修复创建 eni 无法向 nrs 对象上打印 event 的问题
### 2.9.0 [20240102]
1. [optimize] 申请 IP 失败时,支持给出失败的原因.包括:
a. 没有可用子网
b. IP 地址池已满
Expand Down
2 changes: 1 addition & 1 deletion cce-network-v2/operator/watchers/net_resource_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func StartSynchronizingNetResourceSets(ctx context.Context, nodeManager NodeEven
return nodeManager.Update(node)
})

controller := cm.NewWorkqueueController("network-resource-set-controller", 1, nodeManagerSyncHandler)
controller := cm.NewWorkqueueController("network-resource-set-controller", 10, nodeManagerSyncHandler)
controller.Run()
k8s.CCEClient().Informers.Cce().V2().NetResourceSets().Informer().AddEventHandler(controller)

Expand Down
2 changes: 1 addition & 1 deletion cce-network-v2/operator/watchers/node_taint.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ func processNextCCEPodItem(workQueue workqueue.RateLimitingInterface) bool {
}

pod, err := PodClient.Lister().Pods(namespace).Get(name)
if err != nil && !k8sErrors.IsNotFound(err) {
if err != nil {
return true
}

Expand Down
77 changes: 51 additions & 26 deletions cce-network-v2/pkg/bce/bcesync/eni.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,26 +251,50 @@ func (ss *eniSyncher) Create(resource *ccev2.ENI) error {
}

func (ss *eniSyncher) Update(resource *ccev2.ENI) error {
var err error
if resource.Spec.Type != ccev2.ENIForBCC && resource.Spec.Type != ccev2.ENIDefaultBCC {
return nil
}
var (
newObj = resource.DeepCopy()
eniStatus *ccev2.ENIStatus
err error
updateError error
ctx = logfields.NewContext()
)

scopeLog := eniLog.WithFields(logrus.Fields{
"eniID": newObj.Name,
"vpcID": newObj.Spec.ENI.VpcID,
"eniName": newObj.Spec.ENI.Name,
"instanceID": newObj.Spec.ENI.InstanceID,
"status": newObj.Status.VPCStatus,
"eniID": resource.Name,
"vpcID": resource.Spec.ENI.VpcID,
"eniName": resource.Spec.ENI.Name,
"instanceID": resource.Spec.ENI.InstanceID,
"oldStatus": resource.Status.VPCStatus,
"method": "eniSyncher.Update",
})

// refresh eni from vpc and retry if k8s resource is expired
for retry := 0; retry < 3; retry++ {
if retry > 0 {
// refresh new k8s resource when resource is expired
resource, err = k8s.CCEClient().CceV2().ENIs().Get(context.TODO(), resource.Name, metav1.GetOptions{})
if err != nil {
scopeLog.WithError(err).Error("get eni failed")
return err
}

}
scopeLog = scopeLog.WithField("retry", retry)
err := ss.handleENIUpdate(resource, scopeLog)
if kerrors.IsConflict(err) || kerrors.IsResourceExpired(err) {
continue
}
return err
}

return nil
}

func (ss *eniSyncher) handleENIUpdate(resource *ccev2.ENI, scopeLog *logrus.Entry) error {
var (
newObj = resource.DeepCopy()
err error
ctx = logfields.NewContext()
eniStatus *ccev2.ENIStatus
updateError error
)
skipRefresh := ss.mangeFinalizer(newObj)
if !skipRefresh {
scopeLog.Debug("start eni machine")
Expand All @@ -284,15 +308,10 @@ func (ss *eniSyncher) Update(resource *ccev2.ENI) error {
err = machine.start()
_, isDelayError := err.(*cm.DelayEvent)
if err != nil {
if isDelayError {
if isDelayError && newObj.Status.VPCStatus == resource.Status.VPCStatus {
// if vpc status is not changed, will retry after 5s
if newObj.Status.VPCStatus == resource.Status.VPCStatus {
scopeLog.Infof("eni vpc status not changed, will retry later")
return err
} else {
// if vpc status is changed, will update status on apiserver
goto updateAPIServer
}
scopeLog.Infof("eni vpc status not changed, will retry later")
return err
} else {
scopeLog.WithError(err).Error("eni machine failed")
return err
Expand All @@ -306,15 +325,13 @@ func (ss *eniSyncher) Update(resource *ccev2.ENI) error {
scopeLog.WithError(err).Error("refresh eni failed")
return err
}
eniStatus = &newObj.Status
}

updateAPIServer:
eniStatus = &newObj.Status
// update spec and status
if !reflect.DeepEqual(&newObj.Spec, &resource.Spec) ||
!reflect.DeepEqual(newObj.Labels, resource.Labels) ||
!reflect.DeepEqual(newObj.Finalizers, resource.Finalizers) {
scopeLog.Debug("start update eni spec")
newObj, updateError = ss.updater.Update(newObj)
if updateError != nil {
scopeLog.WithError(updateError).Error("update eni spec failed")
Expand All @@ -323,9 +340,12 @@ updateAPIServer:
scopeLog.Info("update eni spec success")
}

if !reflect.DeepEqual(eniStatus, &resource.Status) {
if !reflect.DeepEqual(eniStatus, &resource.Status) && eniStatus != nil {
newObj.Status = *eniStatus
scopeLog.Debug("start update eni status")
scopeLog = scopeLog.WithFields(logrus.Fields{
"vpcStatus": newObj.Status.VPCStatus,
"cceStatus": newObj.Status.CCEStatus,
})
_, updateError = ss.updater.UpdateStatus(newObj)
if updateError != nil {
scopeLog.WithError(updateError).Error("update eni status failed")
Expand Down Expand Up @@ -501,7 +521,12 @@ func (esm *eniStateMachine) start() error {
return err
}

(&esm.resource.Status).AppendVPCStatus(ccev2.VPCENIStatus(esm.vpceni.Status))
// refresh the status of ENI
if esm.resource.Status.VPCStatus != ccev2.VPCENIStatus(esm.vpceni.Status) {
(&esm.resource.Status).AppendVPCStatus(ccev2.VPCENIStatus(esm.vpceni.Status))
return nil
}

// not the final status, will retry later
return cm.NewDelayEvent(esm.resource.Name, ENIReadyTimeToAttach, fmt.Sprintf("eni %s status is not final: %s", esm.resource.Spec.ENI.ID, esm.resource.Status.VPCStatus))
}
Expand Down
3 changes: 2 additions & 1 deletion cce-network-v2/pkg/bce/option/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/bce/api/cloud/ccegateway"
"github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/k8s"
"github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/logging"
"github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/option"
)

var (
Expand All @@ -47,7 +48,7 @@ func BCEClient() cloud.Interface {
operatorOption.Config.CCEClusterID,
operatorOption.Config.BCECloudAccessKey,
operatorOption.Config.BCECloudSecureKey,
k8s.Client(), false)
k8s.Client(), option.Config.Debug)
if err != nil {
log.Fatalf("[InitBCEClient] failed to init bce client %v", err)
}
Expand Down
18 changes: 15 additions & 3 deletions cce-network-v2/pkg/bce/vpceni/node_bcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,29 @@ func (n *bccNode) __prepareIPAllocation(scopedLog *logrus.Entry, checkSubnet boo
return nil
}
eniCount++

scopedLog = scopedLog.WithFields(logrus.Fields{
"eniID": interfaceID,
"index": e.Status.InterfaceIndex,
"numAddresses": len(e.Spec.ENI.PrivateIPSet) - 1,
})
if e.Spec.UseMode == ccev2.ENIUseModePrimaryIP {
return nil
}

// An ENI that is not in the in-use state must be skipped: even if the VPC API is called to
// allocate an IP on it, the request fails with the following error:
// [Code: EniStatusException; Message: The eni status is not allowed
// to operate; RequestId: 0f4d190a-76af-4671-9f29-b954dbb47195]
if e.Status.VPCStatus != ccev2.VPCENIStatusInuse {
scopedLog.WithField("vpcStatus", e.Status.VPCStatus).Warnf("skip ENI which is not in use")
return nil
}
// The limits include the primary IP, so we need to take it into account
// when computing the effective number of available addresses on the ENI.
effectiveLimits := n.k8sObj.Spec.ENI.MaxIPsPerENI
scopedLog.WithFields(logrus.Fields{
"eniID": interfaceID,
"index": e.Status.InterfaceIndex,
"addressLimit": effectiveLimits,
"numAddresses": len(e.Spec.ENI.PrivateIPSet) - 1,
}).Debug("Considering ENI for allocation")

amap := ipamTypes.AllocationMap{}
Expand Down
7 changes: 6 additions & 1 deletion cce-network-v2/pkg/bce/vpceni/node_super.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ import (
"github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/math"
)

// init calls ccev2.AddToScheme on the shared scheme.Scheme when this package is loaded.
// NOTE(review): presumably this registers the ccev2 types so that objects such as
// NetResourceSet can be resolved through scheme.Scheme (e.g. when recording events) — confirm.
// NOTE(review): any value returned by AddToScheme is discarded here — confirm this is intentional.
func init() {
ccev2.AddToScheme(scheme.Scheme)
}

// The following error constants represent the error conditions for
// CreateInterface without additional context embedded in order to make them
// usable for metrics accounting purposes.
Expand Down Expand Up @@ -366,7 +370,8 @@ func (n *bceNode) CreateInterface(ctx context.Context, allocation *ipam.Allocati
})

if n.creatingEni.hasCreatingENI() {
return 0, "", fmt.Errorf("concurrent eni creating")
scopedLog.Debugf("skip to creating new eni, concurrent eni creating")
return 0, "", nil
}

n.creatingEni.add(1)
Expand Down
2 changes: 1 addition & 1 deletion cce-network-v2/pkg/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ const (
UseENIPrimaryAddress = false

// ParallelAllocWorkers is the default max number of parallel workers doing allocation in the operator
ParallelAllocWorkers = 50
ParallelAllocWorkers = 500

// CloudAPIBurst is the default burst value when rate limiting access to external APIs
CloudAPIBurst = 30
Expand Down
1 change: 1 addition & 0 deletions cce-network-v2/pkg/ipam/net_resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,7 @@ func (n *NetResource) MaintainIPPool(ctx context.Context) error {
n.poolMaintenanceComplete()
n.recalculate()
if instanceMutated || err != nil {
n.logger().Debug("MaintainIPPool triggering resync")
n.manager.resyncTrigger.Trigger()
}
return err
Expand Down
17 changes: 10 additions & 7 deletions cce-network-v2/pkg/ipam/net_resource_set_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"sort"
"time"

operatorOption "github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/operator/option"
listerv2 "github.com/baidubce/baiducloud-cce-cni-driver/cce-network-v2/pkg/k8s/client/listers/cce.baidubce.com/v2"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -222,7 +223,7 @@ func (n *NetResourceSetManager) Start(ctx context.Context) error {
mngr := controller.NewManager()
mngr.UpdateController("ipam-node-interval-refresh",
controller.ControllerParams{
RunInterval: 30 * time.Second,
RunInterval: operatorOption.Config.ResourceResyncInterval,
DoFunc: func(ctx context.Context) error {
if syncTime, ok := n.instancesAPIResync(ctx); ok {
n.Resync(ctx, syncTime)
Expand Down Expand Up @@ -270,14 +271,12 @@ func (n *NetResourceSetManager) Create(resource *v2.NetResourceSet) error {
// Update is called whenever a NetResourceSet resource has been updated in the
// Kubernetes apiserver
func (n *NetResourceSetManager) Update(resource *v2.NetResourceSet) error {
var nodeSynced = true
n.mutex.Lock()
node, ok := n.netResources[resource.Name]
n.mutex.Unlock()

defer func() {
n.mutex.Unlock()
if nodeSynced {
nodeSynced = node.UpdatedResource(resource)
}
node.UpdatedResource(resource)
}()
if !ok {
node = &NetResource{
Expand Down Expand Up @@ -307,7 +306,7 @@ func (n *NetResourceSetManager) Update(resource *v2.NetResourceSet) error {

retry, err := trigger.NewTrigger(trigger.Parameters{
Name: fmt.Sprintf("ipam-pool-maintainer-%s-retry", resource.Name),
MinInterval: 30 * time.Second, // large minimal interval to not retry too often
MinInterval: 5 * time.Second, // minimal interval between retries, to avoid retrying too often
TriggerFunc: func(reasons []string) { poolMaintainer.Trigger() },
})
if err != nil {
Expand All @@ -332,7 +331,11 @@ func (n *NetResourceSetManager) Update(resource *v2.NetResourceSet) error {

node.poolMaintainer = poolMaintainer
node.k8sSync = k8sSync

n.mutex.Lock()
n.netResources[node.name] = node
n.mutex.Unlock()

log.WithField(fieldName, resource.Name).Info("Discovered new NetResourceSet custom resource")
}

Expand Down
8 changes: 6 additions & 2 deletions cce-network-v2/pkg/os/os_detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ import (
)

const (
osReleasePath = "/etc-host/os-release"
osReleasePath = "/usr-host/lib/os-release"
etcReleasePath = "/etc-host/os-release"
)

var log = logging.DefaultLogger.WithField(logfields.LogSubsys, "os")
Expand Down Expand Up @@ -45,7 +46,10 @@ func NewOSDistribution() (*OSRelease, error) {
}
file, err := os.ReadFile(osReleasePath)
if err != nil {
return nil, fmt.Errorf("failed to read %s: %v", osReleasePath, err)
file, err = os.ReadFile(etcReleasePath)
if err != nil {
return nil, fmt.Errorf("failed to read %s and %s: %v", osReleasePath, etcReleasePath, err)
}
}
var osRelease OSRelease
reader := bufio.NewReader(bytes.NewReader(file))
Expand Down
2 changes: 1 addition & 1 deletion cce-network-v2/pkg/os/systemd_networkd.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"io"
"os"
"os/exec"

"github.com/coreos/go-systemd/v22/unit"
)

Expand Down
8 changes: 8 additions & 0 deletions cce-network-v2/pkg/os/ubuntu.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ func (o *ubuntuOS) DisableMacPersistant() error {
log.Info("not ubuntu 22.04, skip disable mac persistent")
return nil
}
err := o.overrideSystemdDefaultLinkConfig()
if err != nil {
log.Errorf("failed to disable mac persistent, ignored os policy: %v", err)
}
return nil
}

func (o *ubuntuOS) overrideSystemdDefaultLinkConfig() error {
_, err := os.Open(defaultLinkPath)
if os.IsNotExist(err) {
err = os.WriteFile(defaultLinkPath, []byte(defaultLinkTemplate), 0644)
Expand Down
Loading

0 comments on commit aa26f07

Please sign in to comment.