0%

【Kubernetes】Controller Manager

Controller Manager作为集群的管理控制中心,维护集群中的所有控制器,对维持集群的稳定和自我修复,实现高可用,副本控制等起关键作用。

内部结构图

12039474-1e134c69dc68c410.png

关键性调用链

12039474-6d0b6a2aabc46a28.png

源码分析过程

组件启动的入口

cmd/kube-controller-manager/controller-manager.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
func main() {
rand.Seed(time.Now().UnixNano())

command := app.NewControllerManagerCommand()

// TODO: once we switch everything over to Cobra commands, we can go back to calling
// utilflag.InitFlags() (by removing its pflag.Parse() call). For now, we have to set the
// normalize func and add the go flag set by hand.
// utilflag.InitFlags()
logs.InitLogs()
defer logs.FlushLogs()

if err := command.Execute(); err != nil {
os.Exit(1)
}
}

读取配置文件,进行配置读取和初始化默认配置

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go ->NewControllerManagerCommand

  • 初始化Controller-manager的配置选项结构:NewKubeControllerManagerOptions()
  • 创建执行命令结构包括Use,Long,和Run:cmd := &cobra.Command{
  • 解析配置文件: s.AddFlags
    1.KnownControllers()获取所有controller
    2.将配置文件中的配置选项注入到配置对象中
    3.同时将controller需要的参数写入.
cmd/kube-controller-manager/app/controllermanager.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
// NewControllerManagerCommand creates a *cobra.Command object with default parameters
func NewControllerManagerCommand() *cobra.Command {
s, err := options.NewKubeControllerManagerOptions()
if err != nil {
klog.Fatalf("unable to initialize command options: %v", err)
}

cmd := &cobra.Command{
Use: "kube-controller-manager",
Long: `The Kubernetes controller manager is a daemon that embeds
the core control loops shipped with Kubernetes. In applications of robotics and
automation, a control loop is a non-terminating loop that regulates the state of
the system. In Kubernetes, a controller is a control loop that watches the shared
state of the cluster through the apiserver and makes changes attempting to move the
current state towards the desired state. Examples of controllers that ship with
Kubernetes today are the replication controller, endpoints controller, namespace
controller, and serviceaccounts controller.`,
Run: func(cmd *cobra.Command, args []string) {
verflag.PrintAndExitIfRequested()
utilflag.PrintFlags(cmd.Flags())

c, err := s.Config(KnownControllers(), ControllersDisabledByDefault.List())
if err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}

if err := Run(c.Complete(), wait.NeverStop); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}
},
}

fs := cmd.Flags()
namedFlagSets := s.Flags(KnownControllers(), ControllersDisabledByDefault.List())
verflag.AddFlags(namedFlagSets.FlagSet("global"))
globalflag.AddGlobalFlags(namedFlagSets.FlagSet("global"), cmd.Name())
registerLegacyGlobalFlags(namedFlagSets)
for _, f := range namedFlagSets.FlagSets {
fs.AddFlagSet(f)
}
usageFmt := "Usage:\n %s\n"
cols, _, _ := term.TerminalSize(cmd.OutOrStdout())
cmd.SetUsageFunc(func(cmd *cobra.Command) error {
fmt.Fprintf(cmd.OutOrStderr(), usageFmt, cmd.UseLine())
cliflag.PrintSections(cmd.OutOrStderr(), namedFlagSets, cols)
return nil
})
cmd.SetHelpFunc(func(cmd *cobra.Command, args []string) {
fmt.Fprintf(cmd.OutOrStdout(), "%s\n\n"+usageFmt, cmd.Long, cmd.UseLine())
cliflag.PrintSections(cmd.OutOrStdout(), namedFlagSets, cols)
})

return cmd
}

组件启动执行

从main中的command.Execute()到4.2中构造的Run
位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
//加载所有控制器,并将对应参数注入到控制器中

1
c, err := s.Config(KnownControllers(), ControllersDisabledByDefault.List())

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
KnownControllers()中的NewControllerInitializers初始化所有的控制器

cmd/kube-controller-manager/app/controllermanager.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// NewControllerInitializers is a public map of named controller groups (you can start more than one in an init func)
// paired to their InitFunc. This allows for structured downstream composition and subdivision.
func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
controllers := map[string]InitFunc{}
controllers["endpoint"] = startEndpointController
controllers["endpointslice"] = startEndpointSliceController
controllers["replicationcontroller"] = startReplicationController
controllers["podgc"] = startPodGCController
controllers["resourcequota"] = startResourceQuotaController
controllers["namespace"] = startNamespaceController
controllers["serviceaccount"] = startServiceAccountController
controllers["garbagecollector"] = startGarbageCollectorController
controllers["daemonset"] = startDaemonSetController
controllers["job"] = startJobController
controllers["deployment"] = startDeploymentController
controllers["replicaset"] = startReplicaSetController
controllers["horizontalpodautoscaling"] = startHPAController
controllers["disruption"] = startDisruptionController
controllers["statefulset"] = startStatefulSetController
controllers["cronjob"] = startCronJobController
controllers["csrsigning"] = startCSRSigningController
controllers["csrapproving"] = startCSRApprovingController
controllers["csrcleaner"] = startCSRCleanerController
controllers["ttl"] = startTTLController
controllers["bootstrapsigner"] = startBootstrapSignerController
controllers["tokencleaner"] = startTokenCleanerController
controllers["nodeipam"] = startNodeIpamController
controllers["nodelifecycle"] = startNodeLifecycleController
if loopMode == IncludeCloudLoops {
controllers["service"] = startServiceController
controllers["route"] = startRouteController
controllers["cloud-node-lifecycle"] = startCloudNodeLifecycleController
// TODO: volume controller into the IncludeCloudLoops only set.
}
controllers["persistentvolume-binder"] = startPersistentVolumeBinderController
controllers["attachdetach"] = startAttachDetachController
controllers["persistentvolume-expander"] = startVolumeExpandController
controllers["clusterrole-aggregation"] = startClusterRoleAggregrationController
controllers["pvc-protection"] = startPVCProtectionController
controllers["pv-protection"] = startPVProtectionController
controllers["ttl-after-finished"] = startTTLAfterFinishedController
controllers["root-ca-cert-publisher"] = startRootCACertPublisher

return controllers
}

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
真正进入执行

  • 启动controller-manager的http服务和对应处理器,包括安全和非安全:BuildHandlerChain
  • 构造run的执行体
  • 需要选主的情况,选主完执行run;不需要选主的直接执行run,然后panic
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
// Run runs the KubeControllerManagerOptions.  This should never exit.

func Run(c *config.CompletedConfig) error {



// To help debugging, immediately log version



glog.Infof("Version: %+v", version.Get())







if cfgz, err := configz.New("componentconfig"); err == nil {



cfgz.Set(c.ComponentConfig)



} else {

glog.Errorf("unable to register configz: %c", err)



}







// Start the controller manager HTTP server



stopCh := make(chan struct{})



if c.SecureServing != nil {



handler := genericcontrollermanager.NewBaseHandler(&c.ComponentConfig.Debugging)



handler = genericcontrollermanager.BuildHandlerChain(handler, &c.Authorization, &c.Authentication)



if err := c.SecureServing.Serve(handler, 0, stopCh); err != nil {



return err



}



}



if c.InsecureServing != nil {



handler := genericcontrollermanager.NewBaseHandler(&c.ComponentConfig.Debugging)



handler = genericcontrollermanager.BuildHandlerChain(handler, &c.Authorization, &c.Authentication)



if err := c.InsecureServing.Serve(handler, 0, stopCh); err != nil {



return err



}



}







run := func(stop <-chan struct{}) {



rootClientBuilder := controller.SimpleControllerClientBuilder{



ClientConfig: c.Kubeconfig,



}



var clientBuilder controller.ControllerClientBuilder



if c.ComponentConfig.KubeCloudShared.UseServiceAccountCredentials {



if len(c.ComponentConfig.SAController.ServiceAccountKeyFile) == 0 {



// It'c possible another controller process is creating the tokens for us.



// If one isn't, we'll timeout and exit when our client builder is unable to create the tokens.



glog.Warningf("--use-service-account-credentials was specified without providing a --service-account-private-key-file")



}



clientBuilder = controller.SAControllerClientBuilder{



ClientConfig: restclient.AnonymousClientConfig(c.Kubeconfig),



CoreClient: c.Client.CoreV1(),



AuthenticationClient: c.Client.AuthenticationV1(),



Namespace: "kube-system",



}



} else {



clientBuilder = rootClientBuilder



}



ctx, err := CreateControllerContext(c, rootClientBuilder, clientBuilder, stop)



if err != nil {



glog.Fatalf("error building controller context: %v", err)



}



saTokenControllerInitFunc := serviceAccountTokenControllerStarter{rootClientBuilder: rootClientBuilder}.startServiceAccountTokenController







//启动控制器



if err := StartControllers(ctx, saTokenControllerInitFunc, NewControllerInitializers(ctx.LoopMode)); err != nil {



glog.Fatalf("error starting controllers: %v", err)



}







ctx.InformerFactory.Start(ctx.Stop)



close(ctx.InformersStarted)







select {}



}







//note 如果未启用选主(只是单节点),直接启动,并且panic,不在往下走,因为run内部有select挂起



if !c.ComponentConfig.GenericComponent.LeaderElection.LeaderElect {



run(wait.NeverStop)



panic("unreachable")



}







id, err := os.Hostname()



if err != nil {



return err



}







// add a uniquifier so that two processes on the same host don't accidentally both become active



//生成唯一ID,相当于进程锁



id = id + "_" + string(uuid.NewUUID())



rl, err := resourcelock.New(c.ComponentConfig.GenericComponent.LeaderElection.ResourceLock,



"kube-system",



"kube-controller-manager",



c.LeaderElectionClient.CoreV1(),



resourcelock.ResourceLockConfig{



Identity: id,



EventRecorder: c.EventRecorder,



})



if err != nil {



glog.Fatalf("error creating lock: %v", err)



}







//进行选主,并在选为主节点后执行run



leaderelection.RunOrDie(leaderelection.LeaderElectionConfig{



Lock: rl,



LeaseDuration: c.ComponentConfig.GenericComponent.LeaderElection.LeaseDuration.Duration,



RenewDeadline: c.ComponentConfig.GenericComponent.LeaderElection.RenewDeadline.Duration,



RetryPeriod: c.ComponentConfig.GenericComponent.LeaderElection.RetryPeriod.Duration,



Callbacks: leaderelection.LeaderCallbacks{



//选主完成后执行



OnStartedLeading: run,



OnStoppedLeading: func() {



glog.Fatalf("leaderelection lost")



},



},



})



panic("unreachable")



}

转到run内部核心的三个动作 :CreateControllerContext 、 StartControllers和ctx.InformerFactory.Start

CreateControllerContext

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go

  • 拿到对kube-APIserver中资源的操作句柄
  • 确认Kube-APIServer的健康(最多等待10s),然后拿获取连接
  • 创建控制器上下文
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// CreateControllerContext creates a context struct containing references to resources needed by the
// controllers such as the cloud provider and clientBuilder. rootClientBuilder is only used for
// the shared-informers client and token controller.
func CreateControllerContext(s *config.CompletedConfig, rootClientBuilder, clientBuilder controller.ControllerClientBuilder, stop <-chan struct{}) (ControllerContext, error) {
versionedClient := rootClientBuilder.ClientOrDie("shared-informers")
sharedInformers := informers.NewSharedInformerFactory(versionedClient, ResyncPeriod(s)())

metadataClient := metadata.NewForConfigOrDie(rootClientBuilder.ConfigOrDie("metadata-informers"))
metadataInformers := metadatainformer.NewSharedInformerFactory(metadataClient, ResyncPeriod(s)())

// If apiserver is not running we should wait for some time and fail only then. This is particularly
// important when we start apiserver and controller manager at the same time.
if err := genericcontrollermanager.WaitForAPIServer(versionedClient, 10*time.Second); err != nil {
return ControllerContext{}, fmt.Errorf("failed to wait for apiserver being healthy: %v", err)
}

// Use a discovery client capable of being refreshed.
discoveryClient := rootClientBuilder.ClientOrDie("controller-discovery")
cachedClient := cacheddiscovery.NewMemCacheClient(discoveryClient.Discovery())
restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cachedClient)
go wait.Until(func() {
restMapper.Reset()
}, 30*time.Second, stop)

availableResources, err := GetAvailableResources(rootClientBuilder)
if err != nil {
return ControllerContext{}, err
}

cloud, loopMode, err := createCloudProvider(s.ComponentConfig.KubeCloudShared.CloudProvider.Name, s.ComponentConfig.KubeCloudShared.ExternalCloudVolumePlugin,
s.ComponentConfig.KubeCloudShared.CloudProvider.CloudConfigFile, s.ComponentConfig.KubeCloudShared.AllowUntaggedCloud, sharedInformers)
if err != nil {
return ControllerContext{}, err
}

ctx := ControllerContext{
ClientBuilder: clientBuilder,
InformerFactory: sharedInformers,
ObjectOrMetadataInformerFactory: controller.NewInformerFactory(sharedInformers, metadataInformers),
ComponentConfig: s.ComponentConfig,
RESTMapper: restMapper,
AvailableResources: availableResources,
Cloud: cloud,
LoopMode: loopMode,
Stop: stop,
InformersStarted: make(chan struct{}),
ResyncPeriod: ResyncPeriod(s),
}
return ctx, nil
}

StartControllers

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
启动初始化的所有控制器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// StartControllers starts a set of controllers with a specified ControllerContext
func StartControllers(ctx ControllerContext, startSATokenController InitFunc, controllers map[string]InitFunc, unsecuredMux *mux.PathRecorderMux) error {
// Always start the SA token controller first using a full-power client, since it needs to mint tokens for the rest
// If this fails, just return here and fail since other controllers won't be able to get credentials.
if _, _, err := startSATokenController(ctx); err != nil {
return err
}

// Initialize the cloud provider with a reference to the clientBuilder only after token controller
// has started in case the cloud provider uses the client builder.
if ctx.Cloud != nil {
ctx.Cloud.Initialize(ctx.ClientBuilder, ctx.Stop)
}

for controllerName, initFn := range controllers {
if !ctx.IsControllerEnabled(controllerName) {
klog.Warningf("%q is disabled", controllerName)
continue
}

time.Sleep(wait.Jitter(ctx.ComponentConfig.Generic.ControllerStartInterval.Duration, ControllerStartJitter))

klog.V(1).Infof("Starting %q", controllerName)
debugHandler, started, err := initFn(ctx)
if err != nil {
klog.Errorf("Error starting %q", controllerName)
return err
}
if !started {
klog.Warningf("Skipping %q", controllerName)
continue
}
if debugHandler != nil && unsecuredMux != nil {
basePath := "/debug/controllers/" + controllerName
unsecuredMux.UnlistedHandle(basePath, http.StripPrefix(basePath, debugHandler))
unsecuredMux.UnlistedHandlePrefix(basePath+"/", http.StripPrefix(basePath, debugHandler))
}
klog.Infof("Started %q", controllerName)
}

return nil
}

ctx.InformerFactory.Start

controller-manager中的informer开始启动监听资源的事件,将事件放到自己的队列中(具有限流特性)。处理进程从队列总获取事件开始进行任务处理。

将新建的ReplicaSet,放入队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
// obj could be an *apps.ReplicaSet, or a DeletionFinalStateUnknown marker item.



func (rsc *ReplicaSetController) enqueueReplicaSet(obj interface{}) {



key, err := controller.KeyFunc(obj)



if err != nil {



utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))



return



}



rsc.queue.Add(key)



}

从队列中获取对象进行处理(具体过程见下方)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
func (rsc *ReplicaSetController) processNextWorkItem() bool {



key, quit := rsc.queue.Get()



if quit {



return false



}



defer rsc.queue.Done(key)







err := rsc.syncHandler(key.(string))



if err == nil {



rsc.queue.Forget(key)



return true



}







utilruntime.HandleError(fmt.Errorf("Sync %q failed with %v", key, err))



rsc.queue.AddRateLimited(key)







return true



}

以startReplicaSetController为例

在StartControllers中initFn方法是NewControllerInitializers中初始化Controller是定义,以下主要看下startReplicaSetController。
位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/apps.go
其中NewReplicaSetController主要是初始化ReplicaSetController的结构,包括apiserver的客户端,informer的回调函数等等。NewReplicaSetController->NewBaseController

1
2
3
4
5
6
7
8
9
10
11
12
func startReplicaSetController(ctx ControllerContext) (http.Handler, bool, error) {
if !ctx.AvailableResources[schema.GroupVersionResource{Group: "apps", Version: "v1", Resource: "replicasets"}] {
return nil, false, nil
}
go replicaset.NewReplicaSetController(
ctx.InformerFactory.Apps().V1().ReplicaSets(),
ctx.InformerFactory.Core().V1().Pods(),
ctx.ClientBuilder.ClientOrDie("replicaset-controller"),
replicaset.BurstReplicas,
).Run(int(ctx.ComponentConfig.ReplicaSetController.ConcurrentRSSyncs), ctx.Stop)
return nil, true, nil
}

关键函数run:k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go
run中执行rsc.worker。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// Run begins watching and syncing.
func (rsc *ReplicaSetController) Run(workers int, stopCh <-chan struct{}) {
defer utilruntime.HandleCrash()
defer rsc.queue.ShutDown()

controllerName := strings.ToLower(rsc.Kind)
klog.Infof("Starting %v controller", controllerName)
defer klog.Infof("Shutting down %v controller", controllerName)

if !cache.WaitForNamedCacheSync(rsc.Kind, stopCh, rsc.podListerSynced, rsc.rsListerSynced) {
return
}

for i := 0; i < workers; i++ {
go wait.Until(rsc.worker, time.Second, stopCh)
}

<-stopCh
}

rsc.worker即为rsc.syncHandler,而syncHandler在创建时来源于rsc.syncReplicaSet(见NewBaseController方法)
那么我们转到syncReplicaSet
位置:k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go
updateReplicaSetStatus:在pod死亡或者新建时更新

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
func (rsc *ReplicaSetController) syncReplicaSet(key string) error {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing %v %q (%v)", rsc.Kind, key, time.Since(startTime))
}()

namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
return err
}
rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
if errors.IsNotFound(err) {
klog.V(4).Infof("%v %v has been deleted", rsc.Kind, key)
rsc.expectations.DeleteExpectations(key)
return nil
}
if err != nil {
return err
}

rsNeedsSync := rsc.expectations.SatisfiedExpectations(key)
selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
if err != nil {
utilruntime.HandleError(fmt.Errorf("error converting pod selector to selector: %v", err))
return nil
}

// list all pods to include the pods that don't match the rs`s selector
// anymore but has the stale controller ref.
// TODO: Do the List and Filter in a single pass, or use an index.
allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
if err != nil {
return err
}
// Ignore inactive pods.
filteredPods := controller.FilterActivePods(allPods)

// NOTE: filteredPods are pointing to objects from cache - if you need to
// modify them, you need to copy it first.
filteredPods, err = rsc.claimPods(rs, selector, filteredPods)
if err != nil {
return err
}

var manageReplicasErr error
if rsNeedsSync && rs.DeletionTimestamp == nil {
manageReplicasErr = rsc.manageReplicas(filteredPods, rs)
}
rs = rs.DeepCopy()
newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

// Always updates status as pods come up or die.
updatedRS, err := updateReplicaSetStatus(rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
if err != nil {
// Multiple things could lead to this update failing. Requeuing the replica set ensures
// Returning an error causes a requeue without forcing a hotloop
return err
}
// Resync the ReplicaSet after MinReadySeconds as a last line of defense to guard against clock-skew.
if manageReplicasErr == nil && updatedRS.Spec.MinReadySeconds > 0 &&
updatedRS.Status.ReadyReplicas == *(updatedRS.Spec.Replicas) &&
updatedRS.Status.AvailableReplicas != *(updatedRS.Spec.Replicas) {
rsc.queue.AddAfter(key, time.Duration(updatedRS.Spec.MinReadySeconds)*time.Second)
}
return manageReplicasErr
}

转到updateReplicaSetStatus:k8s.io/kubernetes/pkg/controller/replicaset/replica_set_utils.go
调用UpdateStatus,通过apiserver更新

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
// updateReplicaSetStatus attempts to update the Status.Replicas of the given ReplicaSet, with a single GET/PUT retry.
func updateReplicaSetStatus(c appsclient.ReplicaSetInterface, rs *apps.ReplicaSet, newStatus apps.ReplicaSetStatus) (*apps.ReplicaSet, error) {
// This is the steady state. It happens when the ReplicaSet doesn't have any expectations, since
// we do a periodic relist every 30s. If the generations differ but the replicas are
// the same, a caller might've resized to the same replica count.
if rs.Status.Replicas == newStatus.Replicas &&
rs.Status.FullyLabeledReplicas == newStatus.FullyLabeledReplicas &&
rs.Status.ReadyReplicas == newStatus.ReadyReplicas &&
rs.Status.AvailableReplicas == newStatus.AvailableReplicas &&
rs.Generation == rs.Status.ObservedGeneration &&
reflect.DeepEqual(rs.Status.Conditions, newStatus.Conditions) {
return rs, nil
}

// Save the generation number we acted on, otherwise we might wrongfully indicate
// that we've seen a spec update when we retry.
// TODO: This can clobber an update if we allow multiple agents to write to the
// same status.
newStatus.ObservedGeneration = rs.Generation

var getErr, updateErr error
var updatedRS *apps.ReplicaSet
for i, rs := 0, rs; ; i++ {
klog.V(4).Infof(fmt.Sprintf("Updating status for %v: %s/%s, ", rs.Kind, rs.Namespace, rs.Name) +
fmt.Sprintf("replicas %d->%d (need %d), ", rs.Status.Replicas, newStatus.Replicas, *(rs.Spec.Replicas)) +
fmt.Sprintf("fullyLabeledReplicas %d->%d, ", rs.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas) +
fmt.Sprintf("readyReplicas %d->%d, ", rs.Status.ReadyReplicas, newStatus.ReadyReplicas) +
fmt.Sprintf("availableReplicas %d->%d, ", rs.Status.AvailableReplicas, newStatus.AvailableReplicas) +
fmt.Sprintf("sequence No: %v->%v", rs.Status.ObservedGeneration, newStatus.ObservedGeneration))

rs.Status = newStatus
updatedRS, updateErr = c.UpdateStatus(context.TODO(), rs, metav1.UpdateOptions{})
if updateErr == nil {
return updatedRS, nil
}
// Stop retrying if we exceed statusUpdateRetries - the replicaSet will be requeued with a rate limit.
if i >= statusUpdateRetries {
break
}
// Update the ReplicaSet with the latest resource version for the next poll
if rs, getErr = c.Get(context.TODO(), rs.Name, metav1.GetOptions{}); getErr != nil {
// If the GET fails we can't trust status.Replicas anymore. This error
// is bound to be more interesting than the update failure.
return nil, getErr
}
}

return nil, updateErr
}

PodGCController

1.gc掉超过阈值限制的pod,按时间排序gc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
func (gcc *PodGCController) gcTerminated(pods []*v1.Pod) {
terminatedPods := []*v1.Pod{}
for _, pod := range pods {
if isPodTerminated(pod) {
terminatedPods = append(terminatedPods, pod)
}
}

terminatedPodCount := len(terminatedPods)
deleteCount := terminatedPodCount - gcc.terminatedPodThreshold

if deleteCount > terminatedPodCount {
deleteCount = terminatedPodCount
}
if deleteCount <= 0 {
return
}

klog.Infof("garbage collecting %v pods", deleteCount)
// sort only when necessary
sort.Sort(byCreationTimestamp(terminatedPods))
var wait sync.WaitGroup
for i := 0; i < deleteCount; i++ {
wait.Add(1)
go func(namespace string, name string) {
defer wait.Done()
if err := gcc.deletePod(namespace, name); err != nil {
// ignore not founds
defer utilruntime.HandleError(err)
}
}(terminatedPods[i].Namespace, terminatedPods[i].Name)
}
wait.Wait()
}

2.gc掉孤儿pod:pod上的node信息不在当前可调度的节点上,即没有和有效node绑定

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// gcOrphaned deletes pods that are bound to nodes that don't exist.
func (gcc *PodGCController) gcOrphaned(pods []*v1.Pod, nodes []*v1.Node) {
klog.V(4).Infof("GC'ing orphaned")
existingNodeNames := sets.NewString()
for _, node := range nodes {
existingNodeNames.Insert(node.Name)
}
// Add newly found unknown nodes to quarantine
for _, pod := range pods {
if pod.Spec.NodeName != "" && !existingNodeNames.Has(pod.Spec.NodeName) {
gcc.nodeQueue.AddAfter(pod.Spec.NodeName, quarantineTime)
}
}
// Check if nodes are still missing after quarantine period
deletedNodesNames, quit := gcc.discoverDeletedNodes(existingNodeNames)
if quit {
return
}
// Delete orphaned pods
for _, pod := range pods {
if !deletedNodesNames.Has(pod.Spec.NodeName) {
continue
}
klog.V(2).Infof("Found orphaned Pod %v/%v assigned to the Node %v. Deleting.", pod.Namespace, pod.Name, pod.Spec.NodeName)
if err := gcc.deletePod(pod.Namespace, pod.Name); err != nil {
utilruntime.HandleError(err)
} else {
klog.V(0).Infof("Forced deletion of orphaned Pod %v/%v succeeded", pod.Namespace, pod.Name)
}
}
}

3.gc掉没有调度成功的pod:表现在pod的NodeName为空,主要由于资源等条件不满足

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// gcUnscheduledTerminating deletes pods that are terminating and haven't been scheduled to a particular node.
func (gcc *PodGCController) gcUnscheduledTerminating(pods []*v1.Pod) {
klog.V(4).Infof("GC'ing unscheduled pods which are terminating.")

for _, pod := range pods {
if pod.DeletionTimestamp == nil || len(pod.Spec.NodeName) > 0 {
continue
}

klog.V(2).Infof("Found unscheduled terminating Pod %v/%v not assigned to any Node. Deleting.", pod.Namespace, pod.Name)
if err := gcc.deletePod(pod.Namespace, pod.Name); err != nil {
utilruntime.HandleError(err)
} else {
klog.V(0).Infof("Forced deletion of unscheduled terminating Pod %v/%v succeeded", pod.Namespace, pod.Name)
}
}
}

// byCreationTimestamp sorts a list by creation timestamp, using their names as a tie breaker.
type byCreationTimestamp []*v1.Pod

func (o byCreationTimestamp) Len() int { return len(o) }
func (o byCreationTimestamp) Swap(i, j int) { o[i], o[j] = o[j], o[i] }

func (o byCreationTimestamp) Less(i, j int) bool {
if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
return o[i].Name < o[j].Name
}
return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
}