// IsInterested returns true if at least one extended resource requested by
// this pod is managed by this extender.
func (h *HTTPExtender) IsInterested(pod *v1.Pod) bool {
	if h.managedResources.Len() == 0 {
		return true
	}
	if h.hasManagedResources(pod.Spec.Containers) {
		return true
	}
	if h.hasManagedResources(pod.Spec.InitContainers) {
		return true
	}
	return false
}

func (h *HTTPExtender) hasManagedResources(containers []v1.Container) bool {
	for i := range containers {
		container := &containers[i]
		for resourceName := range container.Resources.Requests {
			if h.managedResources.Has(string(resourceName)) {
				return true
			}
		}
		for resourceName := range container.Resources.Limits {
			if h.managedResources.Has(string(resourceName)) {
				return true
			}
		}
	}
	return false
}
// ExtenderArgs represents the arguments needed by the extender to filter/prioritize
// nodes for a pod.
type ExtenderArgs struct {
	// Pod being scheduled
	Pod *v1.Pod
	// List of candidate nodes where the pod can be scheduled; to be populated
	// only if Extender.NodeCacheCapable == false
	Nodes *v1.NodeList
	// List of candidate node names where the pod can be scheduled; to be
	// populated only if Extender.NodeCacheCapable == true
	NodeNames *[]string
}
Response
// ExtenderFilterResult represents the results of a filter call to an extender
type ExtenderFilterResult struct {
	// Filtered set of nodes where the pod can be scheduled; to be populated
	// only if Extender.NodeCacheCapable == false
	Nodes *v1.NodeList
	// Filtered set of nodes where the pod can be scheduled; to be populated
	// only if Extender.NodeCacheCapable == true
	NodeNames *[]string
	// Filtered out nodes where the pod can't be scheduled and the failure messages
	FailedNodes FailedNodesMap
	// Filtered out nodes where the pod can't be scheduled and preemption would
	// not change anything. The value is the failure message same as FailedNodes.
	// Nodes specified here takes precedence over FailedNodes.
	FailedAndUnresolvableNodes FailedNodesMap
	// Error message indicating failure
	Error string
}
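For illustration, here is a minimal sketch of a Filter handler that fills in this structure, keeping nodes that carry the priority label and reporting the rest in FailedNodes. It reuses the Extender type and Label constant from the Prioritize example later in this post (imports omitted, as in the other snippets); treat it as an assumption rather than the project's actual Filter code.

// Sketch only: keep nodes that have the priority label, fail the others.
// Extender and Label are assumed to match the Prioritize example below.
func (ex *Extender) Filter(args extenderv1.ExtenderArgs) *extenderv1.ExtenderFilterResult {
	kept := make([]corev1.Node, 0, len(args.Nodes.Items))
	failed := make(extenderv1.FailedNodesMap)
	for _, node := range args.Nodes.Items {
		if _, ok := node.Labels[Label]; ok {
			kept = append(kept, node)
		} else {
			failed[node.Name] = fmt.Sprintf("node does not have label %s", Label)
		}
	}
	return &extenderv1.ExtenderFilterResult{
		Nodes:       &corev1.NodeList{Items: kept},
		FailedNodes: failed,
	}
}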
Prioritize
Request parameters
// ExtenderArgs represents the arguments needed by the extender to filter/prioritize
// nodes for a pod.
type ExtenderArgs struct {
	// Pod being scheduled
	Pod *v1.Pod
	// List of candidate nodes where the pod can be scheduled; to be populated
	// only if Extender.NodeCacheCapable == false
	Nodes *v1.NodeList
	// List of candidate node names where the pod can be scheduled; to be
	// populated only if Extender.NodeCacheCapable == true
	NodeNames *[]string
}
Response
// HostPriority represents the priority of scheduling to a particular host, higher priority is better.
type HostPriority struct {
	// Name of the host
	Host string
	// Score associated with the host
	Score int64
}

// HostPriorityList declares a []HostPriority type.
type HostPriorityList []HostPriority
Bind
Request parameters
// ExtenderBindingArgs represents the arguments to an extender for binding a pod to a node.
type ExtenderBindingArgs struct {
	// PodName is the name of the pod being bound
	PodName string
	// PodNamespace is the namespace of the pod being bound
	PodNamespace string
	// PodUID is the UID of the pod being bound
	PodUID types.UID
	// Node selected by the scheduler
	Node string
}
Response
// ExtenderBindingResult represents the result of binding of a pod to a node from an extender.
type ExtenderBindingResult struct {
	// Error message indicating failure
	Error string
}
// Prioritize scores each candidate node for the Pod.
func (ex *Extender) Prioritize(args extenderv1.ExtenderArgs) *extenderv1.HostPriorityList {
	var result extenderv1.HostPriorityList
	for _, node := range args.Nodes.Items {
		// Read the label on the Node and use its value as the score.
		priorityStr, ok := node.Labels[Label]
		if !ok {
			klog.Errorf("node %q does not have label %s", node.Name, Label)
			continue
		}
		priority, err := strconv.Atoi(priorityStr)
		if err != nil {
			klog.Errorf("node %q has invalid priority %q", node.Name, priorityStr)
			continue
		}
		result = append(result, extenderv1.HostPriority{
			Host:  node.Name,
			Score: int64(priority),
		})
	}
	return &result
}
Bind implementation
Bind simply uses the clientset to create a Binding object that specifies the Pod and the target Node.
// Bind binds the Pod to the selected node.
func (ex *Extender) Bind(args extenderv1.ExtenderBindingArgs) *extenderv1.ExtenderBindingResult {
	log.Printf("bind pod: %s/%s to node: %s", args.PodNamespace, args.PodName, args.Node)

	// Create the Binding object linking the Pod to the target node.
	binding := &corev1.Binding{
		ObjectMeta: metav1.ObjectMeta{Name: args.PodName, Namespace: args.PodNamespace, UID: args.PodUID},
		Target:     corev1.ObjectReference{Kind: "Node", APIVersion: "v1", Name: args.Node},
	}

	result := new(extenderv1.ExtenderBindingResult)
	err := ex.ClientSet.CoreV1().Pods(args.PodNamespace).Bind(context.Background(), binding, metav1.CreateOptions{})
	if err != nil {
		klog.ErrorS(err, "Failed to bind pod", "pod", args.PodName, "namespace", args.PodNamespace, "podUID", args.PodUID, "node", args.Node)
		result.Error = err.Error()
	}
	return result
}
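The scheduler talks to the extender over HTTP, so these methods still need to be hooked up to a web server. Below is a minimal sketch of that wiring with plain net/http and JSON; the /prioritize and /bind paths are assumptions and only need to match the verbs configured for the extender in the scheduler configuration (the real project may use a router and different paths).

// Sketch only: expose Prioritize and Bind as JSON-over-HTTP endpoints.
// The paths here are assumptions and must match the scheduler's extender config.
func (ex *Extender) Run(addr string) error {
	mux := http.NewServeMux()

	mux.HandleFunc("/prioritize", func(w http.ResponseWriter, r *http.Request) {
		var args extenderv1.ExtenderArgs
		if err := json.NewDecoder(r.Body).Decode(&args); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(ex.Prioritize(args))
	})

	mux.HandleFunc("/bind", func(w http.ResponseWriter, r *http.Request) {
		var args extenderv1.ExtenderBindingArgs
		if err := json.NewDecoder(r.Body).Decode(&args); err != nil {
			http.Error(w, err.Error(), http.StatusBadRequest)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		_ = json.NewEncoder(w).Encode(ex.Bind(args))
	})

	return http.ListenAndServe(addr, mux)
}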
# syntax=docker/dockerfile:1
# Build the manager binary
FROM golang:1.22.5 as builder
ARG TARGETOS
ARG TARGETARCH

ENV GOPROXY=https://goproxy.cn

WORKDIR /workspace
# Copy the go source
COPY . /workspace

# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
RUN go mod download

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
# the docker BUILDPLATFORM arg will be linux/arm64 when for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and binary shipped on it will have the same platform.
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o extender main.go

# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
# FROM gcr.io/distroless/static:nonroot
FROM busybox:1.36
WORKDIR /
COPY --from=builder /workspace/extender .
USER 65532:65532

ENTRYPOINT ["/extender"]
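The Dockerfile builds a single extender binary from main.go. For completeness, a minimal sketch of what that entry point could look like, assuming in-cluster configuration, port 8080, and the hypothetical Run method sketched above:

// Sketch only: build an in-cluster clientset and start the extender's HTTP server.
// The port and the Run helper are assumptions, not the project's actual entry point.
func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		log.Fatalf("failed to load in-cluster config: %v", err)
	}
	ex := &Extender{ClientSet: kubernetes.NewForConfigOrDie(cfg)}
	log.Fatal(ex.Run(":8080"))
}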
[root@scheduler-1 lixd]# k get po
NAME READY STATUS RESTARTS AGE
test-58794bff9f-ljxbs 0/1 Pending 0 17s
Check the details:
[root@scheduler-1]# k describe po test-58794bff9f-ljxbs
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 99s i-scheduler-extender all node do not have label priority.lixueduan.com
Warning FailedScheduling 95s (x2 over 97s) i-scheduler-extender all node do not have label priority.lixueduan.com
As we can see, none of the Nodes carry the label we defined, so no node passes the filter and the Pod stays Pending.
[root@scheduler-1 install]# k get node
NAME STATUS ROLES AGE VERSION
scheduler-1 Ready control-plane 4h34m v1.27.4
scheduler-2 Ready <none> 4h33m v1.27.4
[root@scheduler-1 install]# k label node scheduler-1 priority.lixueduan.com=10
node/scheduler-1 labeled
Check the Pod status again:
[root@scheduler-1 lixd]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
test-58794bff9f-ljxbs 1/1 Running 0 104s 172.25.123.201 scheduler-1 <none> <none>
The Pod has been scheduled to node1. Check the detailed events:
[root@scheduler-1 install]# k describe po test-7f7bb8f449-w6wvv
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Warning FailedScheduling 116s i-scheduler-extender 0/2 nodes are available: preemption: 0/2 nodes are available: 2 No preemption victims found for incoming pod.
Warning FailedScheduling 112s (x2 over 115s) i-scheduler-extender 0/2 nodes are available: preemption: 0/2 nodes are available: 2 No preemption victims found for incoming pod.
Normal Scheduled 26s i-scheduler-extender Successfully assigned default/test-58794bff9f-ljxbs to scheduler-1
[root@scheduler-1 install]# k get node
NAME STATUS ROLES AGE VERSION
scheduler-1 Ready control-plane 4h34m v1.27.4
scheduler-2 Ready <none> 4h33m v1.27.4
[root@scheduler-1 install]# k label node scheduler-2 priority.lixueduan.com=20
node/scheduler-2 labeled
[root@scheduler-1 lixd]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
test-84fdbbd8c7-47mtr 1/1 Running 0 38s 172.25.0.162 scheduler-1 <none> <none>
[root@scheduler-1 install]# k rollout restart deploy test
deployment.apps/test restarted
This time it should be scheduled to node2; let's confirm:
[root@scheduler-1 lixd]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
test-849f549d5b-pbrml 1/1 Running 0 12s 172.25.0.166 scheduler-2 <none> <none>
Now update node1's label, changing it to 30:
k label node scheduler-1 priority.lixueduan.com=30 --overwrite
Restart the Deployment again to trigger rescheduling:
[root@scheduler-1 install]# k rollout restart deploy test
deployment.apps/test restarted
This time it should be scheduled to node1; let's confirm:
[root@scheduler-1 lixd]# k get po -owide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
test-69d9ccb877-9fb6t 1/1 Running 0 5s 172.25.123.203 scheduler-1 <none> <none>