for volumeLimitKey, count := range newVolumeCount {
    maxVolumeLimit, ok := nodeVolumeLimits[v1.ResourceName(volumeLimitKey)]
    if ok {
        currentVolumeCount := attachedVolumeCount[volumeLimitKey]
        if currentVolumeCount+count > int(maxVolumeLimit) {
            return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
        }
    }
}
    errReason := fmt.Sprintf("node(s) had taint {%s: %s}, that the pod didn't tolerate", taint.Key, taint.Value)
    return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReason)
}
func PodMatchesNodeSelectorAndAffinityTerms(pod *v1.Pod, node *v1.Node) bool {
    // Check if node.Labels match pod.Spec.NodeSelector.
    if len(pod.Spec.NodeSelector) > 0 {
        selector := labels.SelectorFromSet(pod.Spec.NodeSelector)
        if !selector.Matches(labels.Set(node.Labels)) {
            return false
        }
    }

    // 1. nil NodeSelector matches all nodes (i.e. does not filter out any nodes)
    // 2. nil []NodeSelectorTerm (equivalent to non-nil empty NodeSelector) matches no nodes
    // 3. zero-length non-nil []NodeSelectorTerm matches no nodes also, just for simplicity
    // 4. nil []NodeSelectorRequirement (equivalent to non-nil empty NodeSelectorTerm) matches no nodes
    // 5. zero-length non-nil []NodeSelectorRequirement matches no nodes also, just for simplicity
    // 6. non-nil empty NodeSelectorRequirement is not allowed
    nodeAffinityMatches := true
    affinity := pod.Spec.Affinity
    if affinity != nil && affinity.NodeAffinity != nil {
        nodeAffinity := affinity.NodeAffinity
        // if no required NodeAffinity requirements, will do no-op, means select all nodes.
        // TODO: Replace next line with subsequent commented-out line when implement RequiredDuringSchedulingRequiredDuringExecution.
        if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
            // if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution == nil && nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
            return true
        }

        // Match node selector for requiredDuringSchedulingRequiredDuringExecution.
        // TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
        // if nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution != nil {
        //     nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingRequiredDuringExecution.NodeSelectorTerms
        //     klog.V(10).Infof("Match for RequiredDuringSchedulingRequiredDuringExecution node selector terms %+v", nodeSelectorTerms)
        //     nodeAffinityMatches = nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
        // }

        // Match node selector for requiredDuringSchedulingIgnoredDuringExecution.
        if nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
            nodeSelectorTerms := nodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
            nodeAffinityMatches = nodeAffinityMatches && nodeMatchesNodeSelectorTerms(node, nodeSelectorTerms)
        }
{
    pod: &v1.Pod{
        Spec: v1.PodSpec{
            Affinity: &v1.Affinity{
                NodeAffinity: &v1.NodeAffinity{
                    RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
                        NodeSelectorTerms: []v1.NodeSelectorTerm{
                            {
                                MatchExpressions: []v1.NodeSelectorRequirement{
                                    {
                                        Key:      "kernel-version",
                                        Operator: v1.NodeSelectorOpGt,
                                        Values:   []string{"0204"},
                                    },
                                },
                            },
                        },
                    },
                },
            },
        },
    },
    labels: map[string]string{
        // We use two digit to denote major version and two digit for minor version.
        "kernel-version": "0206",
    },
    name: "Pod with matchExpressions using Gt operator that matches the existing node",
},
Bin packing is a classic problem in operations research. The setting is: given a number of items and a limited set of bins of fixed size, pack the items so that as many as possible fit, as quickly as possible, filling each bin as full as possible and thereby minimizing the number of bins used. The bin-packing problem has many variants; when the number of bins is limited to one and each item is given a value and a weight, it becomes the knapsack problem.
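In scheduler terms, a bin-packing style score simply favors nodes whose resources are already heavily requested, so Pods get packed onto fewer nodes. The following is a minimal sketch of that idea, not the actual plugin code; the function name binPackScore, the map-based inputs, and the 0-100 score range are assumptions for illustration.

// binPackScore is a hypothetical sketch of a bin-packing style node score:
// the higher the fraction of a node's allocatable resources already requested
// (including the incoming pod), the higher the score, so Pods are packed onto
// as few nodes as possible.
func binPackScore(requested, allocatable map[string]int64) int64 {
    const maxNodeScore = 100
    var total float64
    var n int
    for name, alloc := range allocatable {
        if alloc <= 0 {
            continue
        }
        ratio := float64(requested[name]) / float64(alloc)
        if ratio > 1 {
            ratio = 1
        }
        total += ratio
        n++
    }
    if n == 0 {
        return 0
    }
    // Average the per-resource utilization ratios and scale to the score range.
    return int64(total / float64(n) * maxNodeScore)
}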
registry.registerPriorityConfigProducer(serviceaffinity.Name,
    func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) {
        // If there are n ServiceAffinity priorities in the policy, the weight for the corresponding
        // score plugin is n*weight (note that the validation logic verifies that all ServiceAffinity
        // priorities specified in Policy have the same weight).
        weight := args.Weight * int32(len(args.ServiceAffinityArgs.AntiAffinityLabelsPreference))
        plugins.Score = appendToPluginSet(plugins.Score, serviceaffinity.Name, &weight)
        if args.ServiceAffinityArgs != nil {
            pluginConfig = append(pluginConfig, config.PluginConfig{Name: serviceaffinity.Name, Args: args.ServiceAffinityArgs})
        }
        return
    })
SelectorSpreadPriority
SelectorSpreadPriority: spreads the Pods that belong to one controller across Nodes. It works like this: based on the controller that owns the Pod being scheduled, it counts all Pods under that controller, say T in total, and groups those Pods by the Node they run on; if N is the count for a given Node, that Node's score is (T-N)/T. The larger the value, the fewer Pods of this controller the node already runs and the higher the score, which achieves the goal of spreading the workload's Pods.
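A minimal sketch of the (T-N)/T formula, scaled to the scheduler's 0-100 score range; the function name and signature are illustrative, not the plugin's real API.

// selectorSpreadScore returns a higher score for nodes that run fewer Pods of
// the same controller: (T-N)/T scaled to [0, maxNodeScore].
// totalPods (T) is how many Pods the controller owns across the cluster;
// podsOnNode (N) is how many of them run on the node being scored.
func selectorSpreadScore(totalPods, podsOnNode int) int64 {
    const maxNodeScore = 100
    if totalPods == 0 {
        return maxNodeScore
    }
    return int64(float64(totalPods-podsOnNode) / float64(totalPods) * maxNodeScore)
}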
InterPodAffinityPriority: first, two usage scenarios. Example one: application A produces data and application B serves it; deploying A and B together lets them communicate over the local network and optimizes network transfer. Example two: if applications A and B are both CPU-intensive and are known to interfere with each other, this rule can be used to keep them off the same node as much as possible. This is the pod affinity strategy; like NodeAffinityPriority, it supports two selectors: requiredDuringSchedulingIgnoredDuringExecution (the selected node must satisfy all of the Pod's rules) and preferredDuringSchedulingIgnoredDuringExecution (the scheduler tries to, but does not guarantee to, satisfy all of the requirements), and two sub-policies: podAffinity and podAntiAffinity.
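Mirroring the NodeAffinity test case shown earlier, a Pod carrying such rules might look like the sketch below; the label values (app: A, app: B) and the topology key are made-up examples, and metav1 refers to k8s.io/apimachinery/pkg/apis/meta/v1.

// A sketch of a Pod that wants to run next to app A (pod affinity) and away
// from replicas of app B (pod anti-affinity). Labels and topology key are
// illustrative only.
pod := &v1.Pod{
    Spec: v1.PodSpec{
        Affinity: &v1.Affinity{
            PodAffinity: &v1.PodAffinity{
                RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
                    {
                        LabelSelector: &metav1.LabelSelector{
                            MatchLabels: map[string]string{"app": "A"},
                        },
                        TopologyKey: "kubernetes.io/hostname",
                    },
                },
            },
            PodAntiAffinity: &v1.PodAntiAffinity{
                PreferredDuringSchedulingIgnoredDuringExecution: []v1.WeightedPodAffinityTerm{
                    {
                        Weight: 100,
                        PodAffinityTerm: v1.PodAffinityTerm{
                            LabelSelector: &metav1.LabelSelector{
                                MatchLabels: map[string]string{"app": "B"},
                            },
                            TopologyKey: "kubernetes.io/hostname",
                        },
                    },
                },
            },
        },
    },
}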
func (pl *NodePreferAvoidPods) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
    nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
    if err != nil {
        return 0, framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v", nodeName, err))
    }

    node := nodeInfo.Node()
    if node == nil {
        return 0, framework.NewStatus(framework.Error, "node not found")
    }

    controllerRef := metav1.GetControllerOf(pod)
    if controllerRef != nil {
        // Ignore pods that are owned by other controller than ReplicationController
        // or ReplicaSet.
        if controllerRef.Kind != "ReplicationController" && controllerRef.Kind != "ReplicaSet" {
            controllerRef = nil
        }
    }
    if controllerRef == nil {
        return framework.MaxNodeScore, nil
    }

    avoids, err := v1helper.GetAvoidPodsFromNodeAnnotations(node.Annotations)
    if err != nil {
        // If we cannot get annotation, assume it's schedulable there.
        return framework.MaxNodeScore, nil
    }
    for i := range avoids.PreferAvoidPods {
        avoid := &avoids.PreferAvoidPods[i]
        if avoid.PodSignature.PodController.Kind == controllerRef.Kind && avoid.PodSignature.PodController.UID == controllerRef.UID {
            return 0, nil
        }
    }
    return framework.MaxNodeScore, nil
}
    var count int64
    // A nil element of PreferredDuringSchedulingIgnoredDuringExecution matches no objects.
    // An element of PreferredDuringSchedulingIgnoredDuringExecution that refers to an
    // empty PreferredSchedulingTerm matches all objects.
    if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
        // Match PreferredDuringSchedulingIgnoredDuringExecution term by term.
        for i := range affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution {
            preferredSchedulingTerm := &affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution[i]
            if preferredSchedulingTerm.Weight == 0 {
                continue
            }

            // TODO: Avoid computing it for all nodes if this becomes a performance problem.
            nodeSelector, err := v1helper.NodeSelectorRequirementsAsSelector(preferredSchedulingTerm.Preference.MatchExpressions)
            if err != nil {
                return 0, framework.NewStatus(framework.Error, err.Error())
            }

            if nodeSelector.Matches(labels.Set(node.Labels)) {
                count += int64(preferredSchedulingTerm.Weight)
            }
        }
    }
The ImageLocalityPriority strategy mainly considers image download speed. If a node already holds a Pod's images, the Pod is preferentially scheduled to that node. Image size is also taken into account: a Pod may use several images, and the larger an image, the slower it is to download, so nodes are scored according to the total size of the required images they already have.
func sumImageScores(nodeInfo *framework.NodeInfo, containers []v1.Container, totalNumNodes int) int64 {
    var sum int64
    for _, container := range containers {
        if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
            sum += scaledImageScore(state, totalNumNodes)
        }
    }
    return sum
}
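The scaledImageScore helper used above weights an image's size by how widely the image is already spread across nodes, so one large image present on many nodes does not give a single node an outsized advantage. Roughly, it looks like the sketch below (based on the upstream imagelocality plugin; field names may differ between versions):

// scaledImageScore scales an image's size by the fraction of nodes that
// already have the image cached.
func scaledImageScore(imageState *framework.ImageStateSummary, totalNumNodes int) int64 {
    spread := float64(imageState.NumNodes) / float64(totalNumNodes)
    return int64(float64(imageState.Size) * spread)
}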
func (pl *NodeLabel) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
    nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
    if err != nil || nodeInfo.Node() == nil {
        return 0, framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v, node is nil: %v", nodeName, err, nodeInfo.Node() == nil))
    }

    node := nodeInfo.Node()
    score := int64(0)
    for _, label := range pl.args.PresentLabelsPreference {
        if labels.Set(node.Labels).Has(label) {
            score += framework.MaxNodeScore
        }
    }
    for _, label := range pl.args.AbsentLabelsPreference {
        if !labels.Set(node.Labels).Has(label) {
            score += framework.MaxNodeScore
        }
    }
    // Take average score for each label to ensure the score doesn't exceed MaxNodeScore.
    score /= int64(len(pl.args.PresentLabelsPreference) + len(pl.args.AbsentLabelsPreference))
registry.registerPriorityConfigProducer(nodelabel.Name,
    func(args ConfigProducerArgs) (plugins config.Plugins, pluginConfig []config.PluginConfig) {
        // If there are n LabelPreference priorities in the policy, the weight for the corresponding
        // score plugin is n*weight (note that the validation logic verifies that all LabelPreference
        // priorities specified in Policy have the same weight).
        weight := args.Weight * int32(len(args.NodeLabelArgs.PresentLabelsPreference)+len(args.NodeLabelArgs.AbsentLabelsPreference))
        plugins.Score = appendToPluginSet(plugins.Score, nodelabel.Name, &weight)
        if args.NodeLabelArgs != nil {
            pluginConfig = append(pluginConfig, config.PluginConfig{Name: nodelabel.Name, Args: args.NodeLabelArgs})
        }
        return
    })