```go
// IsInterested returns true if at least one extended resource requested by
// this pod is managed by this extender.
func (h *HTTPExtender) IsInterested(pod *v1.Pod) bool {
	if h.managedResources.Len() == 0 {
		return true
	}
	if h.hasManagedResources(pod.Spec.Containers) {
		return true
	}
	if h.hasManagedResources(pod.Spec.InitContainers) {
		return true
	}
	return false
}

func (h *HTTPExtender) hasManagedResources(containers []v1.Container) bool {
	for i := range containers {
		container := &containers[i]
		for resourceName := range container.Resources.Requests {
			if h.managedResources.Has(string(resourceName)) {
				return true
			}
		}
		for resourceName := range container.Resources.Limits {
			if h.managedResources.Has(string(resourceName)) {
				return true
			}
		}
	}
	return false
}
```
Wait (with a timeout): If a Permit plugin returns "wait", the Pod is kept in an internal list of "waiting" Pods, and the binding cycle for this Pod starts but blocks until the Pod is approved. If a timeout occurs, the wait becomes a deny and the Pod is returned to the scheduling queue, triggering the Unreserve phase of the Reserve plugins.
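As a concrete illustration, here is a minimal sketch of a Permit plugin that returns "wait". Everything in it (the `gate` package, `GatePlugin`, and the annotation key) is hypothetical, not from the original article:

```go
package gate

import (
	"context"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// GatePlugin is a hypothetical Permit plugin, used only for illustration.
type GatePlugin struct{}

var _ framework.PermitPlugin = &GatePlugin{}

func (pl *GatePlugin) Name() string { return "gate" }

// Permit parks pods that need external approval on the waiting list; the
// pod's binding cycle blocks until Allow/Reject is called, and the timeout
// turns the wait into a reject that sends the pod back to the queue.
func (pl *GatePlugin) Permit(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (*framework.Status, time.Duration) {
	if pod.Annotations["example.com/needs-approval"] == "true" { // assumed annotation
		return framework.NewStatus(framework.Wait, "awaiting external approval"), 30 * time.Second
	}
	return framework.NewStatus(framework.Success, ""), 0
}
```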
PreBind: These plugins perform any work required before a Pod is bound. For example, a PreBind plugin may provision a network volume and mount it on the target node before allowing the Pod to run there. If any PreBind plugin returns an error, the Pod is rejected and returned to the scheduling queue.
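A minimal PreBind sketch along the same lines; `VolumePlugin` and `mountVolume` are hypothetical stand-ins, not code from the article:

```go
package prebind

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

// VolumePlugin is a hypothetical PreBind plugin, used only for illustration.
type VolumePlugin struct{}

var _ framework.PreBindPlugin = &VolumePlugin{}

func (pl *VolumePlugin) Name() string { return "volume-prebind" }

// PreBind prepares and mounts a volume on the chosen node before binding;
// any error here rejects the pod back to the scheduling queue.
func (pl *VolumePlugin) PreBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
	if err := mountVolume(ctx, pod, nodeName); err != nil {
		return framework.AsStatus(fmt.Errorf("mounting volume for %s/%s on %s: %w", pod.Namespace, pod.Name, nodeName, err))
	}
	return framework.NewStatus(framework.Success, "")
}

// mountVolume is a hypothetical helper; a real implementation would call a
// storage/CSI API here.
func mountVolume(ctx context.Context, pod *v1.Pod, nodeName string) error { return nil }
```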
```bash
mkdir i-scheduler
cd i-scheduler
# go mod init github.com/lixd96/i-scheduler
git clone -b release-1.29 https://github.com/kubernetes-sigs/scheduler-plugins.git
cd scheduler-plugins
```
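The plugin source below can live under `pkg/` in the cloned repo, mirroring how the bundled plugins are laid out; the exact path is an assumption, not from the original:

```bash
# Assumed layout, following the repo's convention of one package per plugin.
mkdir -p pkg/priority
touch pkg/priority/priority.go
```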
```go
package priority

import (
	"context"
	"fmt"
	"log"
	"strconv"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)

// Name is the name of the plugin used in the plugin registry and configurations.
const Name = "priority"
const Label = "priority.lixueduan.com"

type Priority struct {
	handle framework.Handle
}

var _ framework.FilterPlugin = &Priority{}
var _ framework.ScorePlugin = &Priority{}

// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle) (framework.Plugin, error) {
	return &Priority{handle: h}, nil
}

// Name returns name of the plugin.
func (pl *Priority) Name() string {
	return Name
}

func (pl *Priority) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	log.Printf("filter pod: %v, node: %v", pod.Name, nodeInfo)
	log.Println(state)
	// Only schedule onto nodes that carry the expected label.
	if _, ok := nodeInfo.Node().Labels[Label]; !ok {
		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Node:%s does not have label %s", "Node: "+nodeInfo.Node().Name, Label))
	}
	return framework.NewStatus(framework.Success, "Node: "+nodeInfo.Node().Name)
}

// Score invoked at the score extension point.
func (pl *Priority) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v", nodeName, err))
	}
	// Use the value of the label on the Node as the score.
	priorityStr, ok := nodeInfo.Node().Labels[Label]
	if !ok {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("node %q does not have label %s", nodeName, Label))
	}
	priority, err := strconv.Atoi(priorityStr)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("node %q has invalid priority %s", nodeName, priorityStr))
	}
	return int64(priority), framework.NewStatus(framework.Success, "")
}

// ScoreExtensions of the Score plugin.
func (pl *Priority) ScoreExtensions() framework.ScoreExtensions {
	return pl
}

// NormalizeScore invoked after scoring all nodes.
func (pl *Priority) NormalizeScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
	return helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
}
```
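The plugin also has to be registered in the scheduler binary via `app.WithPlugin`. A minimal sketch of that wiring, assuming the package above lives at `pkg/priority` inside the cloned scheduler-plugins repo (adjust the import path to wherever you placed it):

```go
package main

import (
	"os"

	"k8s.io/component-base/cli"
	"k8s.io/kubernetes/cmd/kube-scheduler/app"

	"sigs.k8s.io/scheduler-plugins/pkg/priority" // assumed location of the package above
)

func main() {
	// Register the Priority plugin alongside the default plugins.
	command := app.NewSchedulerCommand(
		app.WithPlugin(priority.Name, priority.New),
	)
	os.Exit(cli.Run(command))
}
```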
The Filter extension point only passes nodes that carry the expected label:

```go
func (pl *Priority) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	log.Printf("filter pod: %v, node: %v", pod.Name, nodeInfo)
	log.Println(state)
	// Only schedule onto nodes that carry the expected label.
	if _, ok := nodeInfo.Node().Labels[Label]; !ok {
		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("Node:%s does not have label %s", "Node: "+nodeInfo.Node().Name, Label))
	}
	return framework.NewStatus(framework.Success, "Node: "+nodeInfo.Node().Name)
}
```
The Score extension point reads the label value on the node and uses it directly as the score:

```go
func (pl *Priority) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("getting node %q from Snapshot: %v", nodeName, err))
	}
	// Use the value of the label on the Node as the score.
	priorityStr, ok := nodeInfo.Node().Labels[Label]
	if !ok {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("node %q does not have label %s", nodeName, Label))
	}
	priority, err := strconv.Atoi(priorityStr)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("node %q has invalid priority %s", nodeName, priorityStr))
	}
	return int64(priority), framework.NewStatus(framework.Success, "")
}
```
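For a quick sanity check of the normalization arithmetic, here is a standalone sketch (not from the original article) that feeds the raw label values used later in this post through `DefaultNormalizeScore`:

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)

func main() {
	// Raw scores taken straight from the label values used below.
	scores := framework.NodeScoreList{
		{Name: "scheduler-1", Score: 10},
		{Name: "scheduler-2", Score: 20},
	}
	// DefaultNormalizeScore rescales so the highest raw score becomes
	// framework.MaxNodeScore (100): 10 -> 10*100/20 = 50, 20 -> 100.
	helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
	fmt.Println(scores) // [{scheduler-1 50} {scheduler-2 100}]
}
```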
```yaml
# Default values for scheduler-plugins-as-a-second-scheduler.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

scheduler:
  name: i-scheduler
  image: lixd96/kube-scheduler:pripority

controller:
  replicaCount: 0

# LoadVariationRiskBalancing and TargetLoadPacking are not enabled by default
# as they need extra RBAC privileges on metrics.k8s.io.
plugins:
  enabled: ["Priority", "Coscheduling", "CapacityScheduling", "NodeResourceTopologyMatch", "NodeResourcesAllocatable"]
```
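Under the hood, the chart renders a KubeSchedulerConfiguration that enables the listed plugins for the `i-scheduler` profile. A hedged sketch of roughly what that produces for our plugin (the chart's actual template may differ):

```yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
leaderElection:
  leaderElect: false
profiles:
  - schedulerName: i-scheduler
    plugins:
      filter:
        enabled:
          - name: Priority
      score:
        enabled:
          - name: Priority
```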
```bash
[root@scheduler-1 install]# k get po
NAME                    READY   STATUS    RESTARTS   AGE
test-7f7bb8f449-w6wvv   0/1     Pending   0          4s
```
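The Pending Pod above comes from a test Deployment that hands its Pods to our scheduler via `spec.schedulerName`. A minimal sketch (the container image is an assumption, not from the original):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: test
spec:
  replicas: 1
  selector:
    matchLabels:
      app: test
  template:
    metadata:
      labels:
        app: test
    spec:
      schedulerName: i-scheduler  # hand the pod to our second scheduler
      containers:
        - name: test
          image: nginx:alpine  # assumed image
```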
Check the details:
```bash
[root@scheduler-1 install]# kubectl describe po test-7f7bb8f449-w6wvv
Events:
  Type     Reason            Age   From         Message
  ----     ------            ----  ----         -------
  Warning  FailedScheduling  8s    i-scheduler  0/2 nodes are available: 1 Node:Node: scheduler-1 does not have label priority.lixueduan.com, 1 Node:Node: scheduler-2 does not have label priority.lixueduan.com. preemption: 0/2 nodes are available: 2 No preemption victims found for incoming pod.
```
As we can see, neither node carries the label we defined, so no node passes Filter and the Pod stays Pending.
```bash
[root@scheduler-1 install]# k get node
NAME          STATUS   ROLES           AGE     VERSION
scheduler-1   Ready    control-plane   4h34m   v1.27.4
scheduler-2   Ready    <none>          4h33m   v1.27.4
[root@scheduler-1 install]# k label node scheduler-1 priority.lixueduan.com=10
node/scheduler-1 labeled
```
Check the Pod status again:
```bash
[root@scheduler-1 install]# k get po -owide
NAME                    READY   STATUS    RESTARTS   AGE     IP               NODE          NOMINATED NODE   READINESS GATES
test-7f7bb8f449-w6wvv   1/1     Running   0          4m20s   172.25.123.199   scheduler-1   <none>           <none>
```
The Pod has been scheduled onto node1. Check the detailed events:
```bash
[root@scheduler-1 install]# k describe po test-7f7bb8f449-w6wvv
Events:
  Type     Reason            Age   From         Message
  ----     ------            ----  ----         -------
  Warning  FailedScheduling  4m8s  i-scheduler  0/2 nodes are available: 1 Node:Node: scheduler-1 does not have label priority.lixueduan.com, 1 Node:Node: scheduler-2 does not have label priority.lixueduan.com. preemption: 0/2 nodes are available: 2 No preemption victims found for incoming pod.
  Normal   Scheduled         33s   i-scheduler  Successfully assigned default/test-7f7bb8f449-w6wvv to scheduler-1
```
As we can see, it is again i-scheduler handling the Pod, and it scheduled the Pod to node1.
Ranking multiple nodes
Our Score implementation uses the value of the node's priority.lixueduan.com label as the score, so the Pod will always land on the node with the larger value.
Label node2 as well, setting the value to 20:
```bash
[root@scheduler-1 install]# k get node
NAME          STATUS   ROLES           AGE     VERSION
scheduler-1   Ready    control-plane   4h34m   v1.27.4
scheduler-2   Ready    <none>          4h33m   v1.27.4
[root@scheduler-1 install]# k label node scheduler-2 priority.lixueduan.com=20
node/scheduler-2 labeled
[root@scheduler-1 install]# k get po -owide
NAME                    READY   STATUS    RESTARTS   AGE   IP             NODE          NOMINATED NODE   READINESS GATES
test-7f7bb8f449-krvqj   1/1     Running   0          58s   172.25.0.150   scheduler-2   <none>           <none>
```
As expected, it was scheduled onto node2.
Now update node1's label, changing the value to 30:
```bash
k label node scheduler-1 priority.lixueduan.com=30 --overwrite
```
Restart the Deployment to trigger scheduling again:
```bash
[root@scheduler-1 install]# k rollout restart deploy test
deployment.apps/test restarted
```
The Pod should now be scheduled onto node1; let's confirm:
```bash
[root@scheduler-1 install]# k get po -owide
NAME                   READY   STATUS    RESTARTS   AGE   IP               NODE          NOMINATED NODE   READINESS GATES
test-f7b597544-bbcb8   1/1     Running   0          65s   172.25.123.200   scheduler-1   <none>           <none>
```