// pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go#L199func(plugin*NvidiaDevicePlugin)WatchAndRegister(){klog.Info("Starting WatchAndRegister")errorSleepInterval:=time.Second*5successSleepInterval:=time.Second*30for{err:=plugin.RegistrInAnnotation()iferr!=nil{klog.Errorf("Failed to register annotation: %v",err)klog.Infof("Retrying in %v seconds...",errorSleepInterval)time.Sleep(errorSleepInterval)}else{klog.Infof("Successfully registered annotation. Next check in %v seconds...",successSleepInterval)time.Sleep(successSleepInterval)}}}
// pkg/scheduler/score.go#L65
// fitInCertainDevice walks a node's device list (from the end backwards) and
// tries to satisfy one container's device request. It returns true plus the
// chosen devices when the full requested count fits, false otherwise.
// NOTE(review): `k` is a value copy of `request`, so mutating k.Nums/k.Coresreq
// does not affect the caller's request.
func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *corev1.Pod, allocated *util.PodDevices) (bool, map[string]util.ContainerDevices) {
	k := request
	originReq := k.Nums
	prevnuma := -1
	klog.InfoS("Allocating device for container request", "pod", klog.KObj(pod), "card request", k)
	var tmpDevs map[string]util.ContainerDevices
	tmpDevs = make(map[string]util.ContainerDevices)
	for i := len(node.Devices.DeviceLists) - 1; i >= 0; i-- {
		// Type must match (e.g. NVIDIA request -> NVIDIA card).
		found, numa := checkType(annos, *node.Devices.DeviceLists[i].Device, k)
		if !found {
			continue
		}
		// When NUMA affinity is asserted and we cross a NUMA boundary,
		// discard devices accumulated so far and restart the count, so all
		// allocated devices come from a single NUMA node.
		if numa && prevnuma != node.Devices.DeviceLists[i].Device.Numa {
			k.Nums = originReq
			prevnuma = node.Devices.DeviceLists[i].Device.Numa
			tmpDevs = make(map[string]util.ContainerDevices)
		}
		// UUID allow/deny annotations can exclude specific cards.
		if !checkUUID(annos, *node.Devices.DeviceLists[i].Device, k) {
			continue
		}
		memreq := int32(0)
		// Card already at its sharing limit (vGPU slot count exhausted).
		if node.Devices.DeviceLists[i].Device.Count <= node.Devices.DeviceLists[i].Device.Used {
			continue
		}
		// Core request is a percentage; clamp rather than reject >100.
		if k.Coresreq > 100 {
			klog.ErrorS(nil, "core limit can't exceed 100", "pod", klog.KObj(pod))
			k.Coresreq = 100
			//return false, tmpDevs
		}
		if k.Memreq > 0 {
			memreq = k.Memreq
		}
		// Memory requested as a percentage of the card's total memory.
		if k.MemPercentagereq != 101 && k.Memreq == 0 {
			//This incurs an issue
			memreq = node.Devices.DeviceLists[i].Device.Totalmem * k.MemPercentagereq / 100
		}
		// Remaining memory / cores on this card must cover the request.
		if node.Devices.DeviceLists[i].Device.Totalmem-node.Devices.DeviceLists[i].Device.Usedmem < memreq {
			continue
		}
		if node.Devices.DeviceLists[i].Device.Totalcore-node.Devices.DeviceLists[i].Device.Usedcores < k.Coresreq {
			continue
		}
		// Coresreq=100 indicates it want this card exclusively
		if node.Devices.DeviceLists[i].Device.Totalcore == 100 && k.Coresreq == 100 && node.Devices.DeviceLists[i].Device.Used > 0 {
			continue
		}
		// You can't allocate core=0 job to an already full GPU
		if node.Devices.DeviceLists[i].Device.Totalcore != 0 && node.Devices.DeviceLists[i].Device.Usedcores == node.Devices.DeviceLists[i].Device.Totalcore && k.Coresreq == 0 {
			continue
		}
		// Vendor-specific filter hook (per device type).
		if !device.GetDevices()[k.Type].CustomFilterRule(allocated, request, tmpDevs[k.Type], node.Devices.DeviceLists[i].Device) {
			continue
		}
		if k.Nums > 0 {
			klog.InfoS("first fitted", "pod", klog.KObj(pod), "device", node.Devices.DeviceLists[i].Device.ID)
			k.Nums--
			tmpDevs[k.Type] = append(tmpDevs[k.Type], util.ContainerDevice{
				Idx:       int(node.Devices.DeviceLists[i].Device.Index),
				UUID:      node.Devices.DeviceLists[i].Device.ID,
				Type:      k.Type,
				Usedmem:   memreq,
				Usedcores: k.Coresreq,
			})
		}
		// All requested devices found: success.
		if k.Nums == 0 {
			klog.InfoS("device allocate success", "pod", klog.KObj(pod), "allocate device", tmpDevs)
			return true, tmpDevs
		}
		// NOTE(review): i++ here makes the backwards loop revisit the same
		// index for "mig" mode devices — presumably intentional to consume
		// multiple MIG instances on one card; confirm against upstream.
		if node.Devices.DeviceLists[i].Device.Mode == "mig" {
			i++
		}
	}
	return false, tmpDevs
}
// pkg/scheduler/score.go#L38funccheckType(annosmap[string]string,dutil.DeviceUsage,nutil.ContainerDeviceRequest)(bool,bool){//General type check, NVIDIA->NVIDIA MLU->MLUklog.V(3).InfoS("Type check","device",d.Type,"req",n.Type)if!strings.Contains(d.Type,n.Type){returnfalse,false}for_,val:=rangedevice.GetDevices(){found,pass,numaAssert:=val.CheckType(annos,d,n)iffound{returnpass,numaAssert}}klog.Infof("Unrecognized device %s",n.Type)returnfalse,false}funccheckGPUtype(annosmap[string]string,cardtypestring)bool{cardtype=strings.ToUpper(cardtype)ifinuse,ok:=annos[GPUInUse];ok{useTypes:=strings.Split(inuse,",")// if false return false...if!ContainsSliceFunc(useTypes,func(useTypestring)bool{returnstrings.Contains(cardtype,strings.ToUpper(useType))}){returnfalse}}ifunuse,ok:=annos[GPUNoUse];ok{unuseTypes:=strings.Split(unuse,",")// if true return falseifContainsSliceFunc(unuseTypes,func(unuseTypestring)bool{returnstrings.Contains(cardtype,strings.ToUpper(unuseType))}){returnfalse}}returntrue}
同样是包括 TypeUse 和 TypeNoUse 两个。
1
2
3
4
5
6
7
8
9
ifinuse,ok:=annos[GPUInUse];ok{useTypes:=strings.Split(inuse,",")// if false return false...if!ContainsSliceFunc(useTypes,func(useTypestring)bool{returnstrings.Contains(cardtype,strings.ToUpper(useType))}){returnfalse}}
ifunuse,ok:=annos[GPUNoUse];ok{unuseTypes:=strings.Split(unuse,",")// if true return falseifContainsSliceFunc(unuseTypes,func(unuseTypestring)bool{returnstrings.Contains(cardtype,strings.ToUpper(unuseType))}){returnfalse}}
// pkg/scheduler/score.go#L54funccheckUUID(annosmap[string]string,dutil.DeviceUsage,nutil.ContainerDeviceRequest)bool{devices,ok:=device.GetDevices()[n.Type]if!ok{klog.Errorf("can not get device for %s type",n.Type)returnfalse}result:=devices.CheckUUID(annos,d)klog.V(2).Infof("checkUUID result is %v for %s type",result,n.Type)returnresult}func(dev*NvidiaGPUDevices)CheckUUID(annosmap[string]string,dutil.DeviceUsage)bool{userUUID,ok:=annos[GPUUseUUID]ifok{klog.V(5).Infof("check uuid for nvidia user uuid [%s], device id is %s",userUUID,d.ID)// use , symbol to connect multiple uuiduserUUIDs:=strings.Split(userUUID,",")for_,uuid:=rangeuserUUIDs{ifd.ID==uuid{returntrue}}returnfalse}noUserUUID,ok:=annos[GPUNoUseUUID]ifok{klog.V(5).Infof("check uuid for nvidia not user uuid [%s], device id is %s",noUserUUID,d.ID)// use , symbol to connect multiple uuidnoUserUUIDs:=strings.Split(noUserUUID,",")for_,uuid:=rangenoUserUUIDs{ifd.ID==uuid{returnfalse}}returntrue}returntrue}
// Pick the winning node after scoring. NOTE(review): taking the LAST
// element after sort.Sort assumes nodeScores sorts ascending by score —
// confirm against the NodeScoreList Less implementation.
sort.Sort(nodeScores)
m := (*nodeScores).NodeList[len((*nodeScores).NodeList)-1]
klog.Infof("schedule %v/%v to %v %v", args.Pod.Namespace, args.Pod.Name, m.NodeID, m.Devices)
// Record the decision on the pod: target node, bind time, and one
// device-allocation annotation per registered device implementation.
annotations := make(map[string]string)
annotations[util.AssignedNodeAnnotations] = m.NodeID
annotations[util.AssignedTimeAnnotations] = strconv.FormatInt(time.Now().Unix(), 10)
for _, val := range device.GetDevices() {
	val.PatchAnnotations(&annotations, m.Devices)
}
Annotations 大概是这样的:
1
2
3
4
5
6
7
8
9
10
11
root@test:~/lixd/hami# k get po hami-30 -oyaml
apiVersion: v1
kind: Pod
metadata:
annotations:
hami.io/bind-phase: allocating
hami.io/bind-time: "1732072495"
hami.io/vgpu-devices-allocated: GPU-1afede84-4e70-2174-49af-f07ebb94d1ae,NVIDIA,20000,30:;
hami.io/vgpu-devices-to-allocate: GPU-1afede84-4e70-2174-49af-f07ebb94d1ae,NVIDIA,20000,30:;
hami.io/vgpu-node: test
hami.io/vgpu-time: "1732072495"
其中 hami.io/vgpu-devices-to-allocate 则是 Scheduler 为 Pod 选择的目标 GPU
// pkg/util/util.go#L281funcGetNextDeviceRequest(dtypestring,pcorev1.Pod)(corev1.Container,ContainerDevices,error){pdevices,err:=DecodePodDevices(InRequestDevices,p.Annotations)iferr!=nil{returncorev1.Container{},ContainerDevices{},err}klog.Infof("pod annotation decode vaule is %+v",pdevices)res:=ContainerDevices{}pd,ok:=pdevices[dtype]if!ok{returncorev1.Container{},res,errors.New("device request not found")}forctridx,ctrDevice:=rangepd{iflen(ctrDevice)>0{returnp.Spec.Containers[ctridx],ctrDevice,nil}}returncorev1.Container{},res,errors.New("device request not found")}// pkg/util/util.go#L254funcDecodePodDevices(checklistmap[string]string,annosmap[string]string)(PodDevices,error){klog.V(5).Infof("checklist is [%+v], annos is [%+v]",checklist,annos)iflen(annos)==0{returnPodDevices{},nil}pd:=make(PodDevices)fordevID,devs:=rangechecklist{str,ok:=annos[devs]if!ok{continue}pd[devID]=make(PodSingleDevice,0)for_,s:=rangestrings.Split(str,OnePodMultiContainerSplitSymbol){cd,err:=DecodeContainerDevices(s)iferr!=nil{returnPodDevices{},nil}iflen(cd)==0{continue}pd[devID]=append(pd[devID],cd)}}klog.InfoS("Decoded pod annos","poddevices",pd)returnpd,nil}
// pkg/util/util.go#L223funcDecodeContainerDevices(strstring)(ContainerDevices,error){iflen(str)==0{returnContainerDevices{},nil}cd:=strings.Split(str,OneContainerMultiDeviceSplitSymbol)contdev:=ContainerDevices{}tmpdev:=ContainerDevice{}klog.V(5).Infof("Start to decode container device %s",str)iflen(str)==0{returnContainerDevices{},nil}for_,val:=rangecd{ifstrings.Contains(val,","){//fmt.Println("cd is ", val)tmpstr:=strings.Split(val,",")iflen(tmpstr)<4{returnContainerDevices{},fmt.Errorf("pod annotation format error; information missing, please do not use nodeName field in task")}tmpdev.UUID=tmpstr[0]tmpdev.Type=tmpstr[1]devmem,_:=strconv.ParseInt(tmpstr[2],10,32)tmpdev.Usedmem=int32(devmem)devcores,_:=strconv.ParseInt(tmpstr[3],10,32)tmpdev.Usedcores=int32(devcores)contdev=append(contdev,tmpdev)}}klog.V(5).Infof("Finished decoding container devices. Total devices: %d",len(contdev))returncontdev,nil}
至此,整个流程就完成了。
6.小结
1)HAMi 提供了一个指定调度到(或者不调度到)某种(个) GPU 的功能:
By Type:指定 GPU Type,仅调度(或者不调度)到某些指定 Type 的卡上,例如:A100、A40