func main() {
	var configFile string

	c := cli.NewApp()
	c.Name = "NVIDIA Device Plugin"
	c.Usage = "NVIDIA device plugin for Kubernetes"
	c.Version = info.GetVersionString()
	c.Action = func(ctx *cli.Context) error {
		return start(ctx, c.Flags)
	}
	c.Flags = []cli.Flag{
		&cli.StringFlag{
			Name:    "mig-strategy",
			Value:   spec.MigStrategyNone,
			Usage:   "the desired strategy for exposing MIG devices on GPUs that support it:\n\t\t[none | single | mixed]",
			EnvVars: []string{"MIG_STRATEGY"},
		},
		&cli.BoolFlag{
			Name:    "fail-on-init-error",
			Value:   true,
			Usage:   "fail the plugin if an error is encountered during initialization, otherwise block indefinitely",
			EnvVars: []string{"FAIL_ON_INIT_ERROR"},
		},
		&cli.StringFlag{
			Name:    "nvidia-driver-root",
			Value:   "/",
			Usage:   "the root path for the NVIDIA driver installation (typical values are '/' or '/run/nvidia/driver')",
			EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
		},
		&cli.BoolFlag{
			Name:    "pass-device-specs",
			Value:   false,
			Usage:   "pass the list of DeviceSpecs to the kubelet on Allocate()",
			EnvVars: []string{"PASS_DEVICE_SPECS"},
		},
		&cli.StringSliceFlag{
			Name:    "device-list-strategy",
			Value:   cli.NewStringSlice(string(spec.DeviceListStrategyEnvvar)),
			Usage:   "the desired strategy for passing the device list to the underlying runtime:\n\t\t[envvar | volume-mounts | cdi-annotations]",
			EnvVars: []string{"DEVICE_LIST_STRATEGY"},
		},
		&cli.StringFlag{
			Name:    "device-id-strategy",
			Value:   spec.DeviceIDStrategyUUID,
			Usage:   "the desired strategy for passing device IDs to the underlying runtime:\n\t\t[uuid | index]",
			EnvVars: []string{"DEVICE_ID_STRATEGY"},
		},
		&cli.BoolFlag{
			Name:    "gds-enabled",
			Usage:   "ensure that containers are started with NVIDIA_GDS=enabled",
			EnvVars: []string{"GDS_ENABLED"},
		},
		&cli.BoolFlag{
			Name:    "mofed-enabled",
			Usage:   "ensure that containers are started with NVIDIA_MOFED=enabled",
			EnvVars: []string{"MOFED_ENABLED"},
		},
		&cli.StringFlag{
			Name:        "config-file",
			Usage:       "the path to a config file as an alternative to command line options or environment variables",
			Destination: &configFile,
			EnvVars:     []string{"CONFIG_FILE"},
		},
		&cli.StringFlag{
			Name:    "cdi-annotation-prefix",
			Value:   spec.DefaultCDIAnnotationPrefix,
			Usage:   "the prefix to use for CDI container annotation keys",
			EnvVars: []string{"CDI_ANNOTATION_PREFIX"},
		},
		&cli.StringFlag{
			Name:    "nvidia-ctk-path",
			Value:   spec.DefaultNvidiaCTKPath,
			Usage:   "the path to use for the nvidia-ctk in the generated CDI specification",
			EnvVars: []string{"NVIDIA_CTK_PATH"},
		},
		&cli.StringFlag{
			Name:    "container-driver-root",
			Value:   spec.DefaultContainerDriverRoot,
			Usage:   "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications",
			EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
		},
	}
	c.Flags = append(c.Flags, addFlags()...)

	err := c.Run(os.Args)
	if err != nil {
		klog.Error(err)
		os.Exit(1)
	}
}

func addFlags() []cli.Flag {
	addition := []cli.Flag{
		&cli.StringFlag{
			Name:    "node-name",
			Value:   os.Getenv(util.NodeNameEnvName),
			Usage:   "node name",
			EnvVars: []string{"NodeName"},
		},
		&cli.UintFlag{
			Name:    "device-split-count",
			Value:   2,
			Usage:   "the number for NVIDIA device split",
			EnvVars: []string{"DEVICE_SPLIT_COUNT"},
		},
		&cli.Float64Flag{
			Name:    "device-memory-scaling",
			Value:   1.0,
			Usage:   "the ratio for NVIDIA device memory scaling",
			EnvVars: []string{"DEVICE_MEMORY_SCALING"},
		},
		&cli.Float64Flag{
			Name:    "device-cores-scaling",
			Value:   1.0,
			Usage:   "the ratio for NVIDIA device cores scaling",
			EnvVars: []string{"DEVICE_CORES_SCALING"},
		},
		&cli.BoolFlag{
			Name:    "disable-core-limit",
			Value:   false,
			Usage:   "If set, the core utilization limit will be ignored",
			EnvVars: []string{"DISABLE_CORE_LIMIT"},
		},
		&cli.StringFlag{
			Name:  "resource-name",
			Value: "nvidia.com/gpu",
			Usage: "the name of field for number GPU visible in container",
		},
	}
	return addition
}
&cli.UintFlag{
	Name:    "device-split-count",
	Value:   2,
	Usage:   "the number for NVIDIA device split",
	EnvVars: []string{"DEVICE_SPLIT_COUNT"},
},
&cli.Float64Flag{
	Name:    "device-memory-scaling",
	Value:   1.0,
	Usage:   "the ratio for NVIDIA device memory scaling",
	EnvVars: []string{"DEVICE_MEMORY_SCALING"},
},
&cli.Float64Flag{
	Name:    "device-cores-scaling",
	Value:   1.0,
	Usage:   "the ratio for NVIDIA device cores scaling",
	EnvVars: []string{"DEVICE_CORES_SCALING"},
},
&cli.BoolFlag{
	Name:    "disable-core-limit",
	Value:   false,
	Usage:   "If set, the core utilization limit will be ignored",
	EnvVars: []string{"DISABLE_CORE_LIMIT"},
},
&cli.StringFlag{
	Name:  "resource-name",
	Value: "nvidia.com/gpu",
	Usage: "the name of field for number GPU visible in container",
},
device-split-count: the number of splits per GPU. No GPU can be assigned more tasks than this configured value; if it is set to N, at most N tasks can run on the same GPU at the same time.
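As a rough illustration (a standalone sketch, not HAMi code; the helper name is made up), the effect of device-split-count is that one physical GPU UUID is announced to the kubelet multiple times, once per split, which is what later allows several Pods to land on the same card. The real replication happens in GetPluginDevices, shown later in this section.

```go
package main

import "fmt"

// splitDeviceIDs is a hypothetical helper mirroring the replication idea:
// with device-split-count = N, one physical GPU UUID is exposed to the
// kubelet as N virtual device IDs ("<UUID>-0" ... "<UUID>-<N-1>").
func splitDeviceIDs(uuid string, splitCount uint) []string {
	ids := make([]string, 0, splitCount)
	for i := uint(0); i < splitCount; i++ {
		ids = append(ids, fmt.Sprintf("%v-%v", uuid, i))
	}
	return ids
}

func main() {
	// With device-split-count=2, at most 2 tasks can share this GPU.
	fmt.Println(splitDeviceIDs("GPU-e290caca-2f0c-9582-acab-67a142b61ffa", 2))
}
```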
// pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go#L222

// Register registers the device plugin for the given resourceName with Kubelet.
func (plugin *NvidiaDevicePlugin) Register() error {
	conn, err := plugin.dial(kubeletdevicepluginv1beta1.KubeletSocket, 5*time.Second)
	if err != nil {
		return err
	}
	defer conn.Close()

	client := kubeletdevicepluginv1beta1.NewRegistrationClient(conn)
	reqt := &kubeletdevicepluginv1beta1.RegisterRequest{
		Version:      kubeletdevicepluginv1beta1.Version,
		Endpoint:     path.Base(plugin.socket),
		ResourceName: string(plugin.rm.Resource()),
		Options: &kubeletdevicepluginv1beta1.DevicePluginOptions{
			GetPreferredAllocationAvailable: false,
		},
	}

	_, err = client.Register(context.Background(), reqt)
	if err != nil {
		return err
	}
	return nil
}
The key pieces of information the device plugin sends when registering:
ResourceName: the resource name. When a Pod requests a vGPU using a resource name that matches this, the kubelet uses this device plugin to serve the request.
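For example (a minimal sketch assuming the default resource-name nvidia.com/gpu; the Pod and image names are made up), a workload asks for a vGPU through exactly this resource name in its limits, and that is the key the kubelet matches against the registered ResourceName:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical Pod spec: the limit key must match the ResourceName the
	// plugin registered with the kubelet ("nvidia.com/gpu" by default).
	pod := corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "vgpu-demo"},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "cuda",
				Image: "nvidia/cuda:12.2.0-base-ubuntu22.04",
				Resources: corev1.ResourceRequirements{
					Limits: corev1.ResourceList{
						"nvidia.com/gpu": resource.MustParse("1"),
					},
				},
			}},
		},
	}

	q := pod.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"]
	fmt.Printf("%s requests %s of nvidia.com/gpu\n", pod.Name, q.String())
}
```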
// pkg/device-plugin/nvidiadevice/nvinternal/plugin/register.go#L199
func (plugin *NvidiaDevicePlugin) WatchAndRegister() {
	klog.Info("Starting WatchAndRegister")
	errorSleepInterval := time.Second * 5
	successSleepInterval := time.Second * 30
	for {
		err := plugin.RegistrInAnnotation()
		if err != nil {
			klog.Errorf("Failed to register annotation: %v", err)
			klog.Infof("Retrying in %v seconds...", errorSleepInterval)
			time.Sleep(errorSleepInterval)
		} else {
			klog.Infof("Successfully registered annotation. Next check in %v seconds...", successSleepInterval)
			time.Sleep(successSleepInterval)
		}
	}
}
func (plugin *NvidiaDevicePlugin) getAPIDevices() *[]*api.DeviceInfo {
	devs := plugin.Devices()
	nvml.Init()
	res := make([]*api.DeviceInfo, 0, len(devs))
	idx := 0
	for idx < len(devs) {
		ndev, ret := nvml.DeviceGetHandleByIndex(idx)
		//ndev, err := nvml.NewDevice(uint(idx))
		//klog.V(3).Infoln("ndev type=", ndev.Model)
		if ret != nvml.SUCCESS {
			klog.Errorln("nvml new device by index error idx=", idx, "err=", ret)
			panic(0)
		}
		memoryTotal := 0
		memory, ret := ndev.GetMemoryInfo()
		if ret == nvml.SUCCESS {
			memoryTotal = int(memory.Total)
		} else {
			klog.Error("nvml get memory error ret=", ret)
			panic(0)
		}
		UUID, ret := ndev.GetUUID()
		if ret != nvml.SUCCESS {
			klog.Error("nvml get uuid error ret=", ret)
			panic(0)
		}
		Model, ret := ndev.GetName()
		if ret != nvml.SUCCESS {
			klog.Error("nvml get name error ret=", ret)
			panic(0)
		}

		registeredmem := int32(memoryTotal / 1024 / 1024)
		if *util.DeviceMemoryScaling != 1 {
			registeredmem = int32(float64(registeredmem) * *util.DeviceMemoryScaling)
		}
		klog.Infoln("MemoryScaling=", *util.DeviceMemoryScaling, "registeredmem=", registeredmem)
		health := true
		for _, val := range devs {
			if strings.Compare(val.ID, UUID) == 0 {
				// when NVIDIA-Tesla P4, the device info is : ID:GPU-e290caca-2f0c-9582-acab-67a142b61ffa,Health:Healthy,Topology:nil,
				// it is more reasonable to think of healthy as case-insensitive
				if strings.EqualFold(val.Health, "healthy") {
					health = true
				} else {
					health = false
				}
				break
			}
		}
		numa, err := plugin.getNumaInformation(idx)
		if err != nil {
			klog.ErrorS(err, "failed to get numa information", "idx", idx)
		}
		res = append(res, &api.DeviceInfo{
			ID:      UUID,
			Count:   int32(*util.DeviceSplitCount),
			Devmem:  registeredmem,
			Devcore: int32(*util.DeviceCoresScaling * 100),
			Type:    fmt.Sprintf("%v-%v", "NVIDIA", Model),
			Numa:    numa,
			Health:  health,
		})
		idx++
		klog.Infof("nvml registered device id=%v, memory=%v, type=%v, numa=%v", idx, registeredmem, Model, numa)
	}
	return &res
}
encodeddevices := util.EncodeNodeDevices(*devices)
annos[nvidia.HandshakeAnnos] = "Reported " + time.Now().String()
annos[nvidia.RegisterAnnos] = encodeddevices
klog.Infof("patch node with the following annos %v", fmt.Sprintf("%v", annos))
err = util.PatchNodeAnnotations(node, annos)
// ListAndWatch lists devices and update that list according to the health status
func (plugin *NvidiaDevicePlugin) ListAndWatch(e *kubeletdevicepluginv1beta1.Empty, s kubeletdevicepluginv1beta1.DevicePlugin_ListAndWatchServer) error {
	s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()})

	for {
		select {
		case <-plugin.stop:
			return nil
		case d := <-plugin.health:
			// FIXME: there is no way to recover from the Unhealthy state.
			d.Health = kubeletdevicepluginv1beta1.Unhealthy
			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
			s.Send(&kubeletdevicepluginv1beta1.ListAndWatchResponse{Devices: plugin.apiDevices()})
		}
	}
}
// VisitDevices visits each top-level device and invokes a callback function for it
func (d *devicelib) VisitDevices(visit func(int, Device) error) error {
	count, ret := d.nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("error getting device count: %v", ret)
	}
	for i := 0; i < count; i++ {
		device, ret := d.nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return fmt.Errorf("error getting device handle for index '%v': %v", i, ret)
		}
		dev, err := d.newDevice(device)
		if err != nil {
			return fmt.Errorf("error creating new device wrapper: %v", err)
		}
		isSkipped, err := dev.isSkipped()
		if err != nil {
			return fmt.Errorf("error checking whether device is skipped: %v", err)
		}
		if isSkipped {
			continue
		}
		err = visit(i, dev)
		if err != nil {
			return fmt.Errorf("error visiting device: %v", err)
		}
	}
	return nil
}

// buildGPUDeviceMap builds a map of resource names to GPU devices
func (b *deviceMapBuilder) buildGPUDeviceMap() (DeviceMap, error) {
	devices := make(DeviceMap)

	b.VisitDevices(func(i int, gpu device.Device) error {
		name, ret := gpu.GetName()
		if ret != nvml.SUCCESS {
			return fmt.Errorf("error getting product name for GPU: %v", ret)
		}
		migEnabled, err := gpu.IsMigEnabled()
		if err != nil {
			return fmt.Errorf("error checking if MIG is enabled on GPU: %v", err)
		}
		if migEnabled && *b.config.Flags.MigStrategy != spec.MigStrategyNone {
			return nil
		}
		for _, resource := range b.config.Resources.GPUs {
			if resource.Pattern.Matches(name) {
				index, info := newGPUDevice(i, gpu)
				return devices.setEntry(resource.Name, index, info)
			}
		}
		return fmt.Errorf("GPU name '%v' does not match any resource patterns", name)
	})
	return devices, nil
}
// GetPluginDevices returns the plugin Devices from all devices in the Devices
func (ds Devices) GetPluginDevices() []*kubeletdevicepluginv1beta1.Device {
	var res []*kubeletdevicepluginv1beta1.Device
	if !strings.Contains(ds.GetIDs()[0], "MIG") {
		for _, dev := range ds {
			for i := uint(0); i < *util.DeviceSplitCount; i++ {
				id := fmt.Sprintf("%v-%v", dev.ID, i)
				res = append(res, &kubeletdevicepluginv1beta1.Device{
					ID:       id,
					Health:   dev.Health,
					Topology: nil,
				})
			}
		}
	} else {
		for _, d := range ds {
			res = append(res, &d.Device)
		}
	}
	return res
}
// pkg/device-plugin/nvidiadevice/nvinternal/plugin/server.go#L290
func (plugin *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *kubeletdevicepluginv1beta1.AllocateRequest) (*kubeletdevicepluginv1beta1.AllocateResponse, error) {
	klog.InfoS("Allocate", "request", reqs)
	responses := kubeletdevicepluginv1beta1.AllocateResponse{}
	nodename := os.Getenv(util.NodeNameEnvName)
	current, err := util.GetPendingPod(ctx, nodename)
	if err != nil {
		nodelock.ReleaseNodeLock(nodename, NodeLockNvidia)
		return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
	}
	klog.V(5).Infof("allocate pod name is %s/%s, annotation is %+v", current.Namespace, current.Name, current.Annotations)

	for idx, req := range reqs.ContainerRequests {
		// If the devices being allocated are replicas, then (conditionally)
		// error out if more than one resource is being allocated.
		if strings.Contains(req.DevicesIDs[0], "MIG") {
			if plugin.config.Sharing.TimeSlicing.FailRequestsGreaterThanOne && rm.AnnotatedIDs(req.DevicesIDs).AnyHasAnnotations() {
				if len(req.DevicesIDs) > 1 {
					return nil, fmt.Errorf("request for '%v: %v' too large: maximum request size for shared resources is 1", plugin.rm.Resource(), len(req.DevicesIDs))
				}
			}

			for _, id := range req.DevicesIDs {
				if !plugin.rm.Devices().Contains(id) {
					return nil, fmt.Errorf("invalid allocation request for '%s': unknown device: %s", plugin.rm.Resource(), id)
				}
			}

			response, err := plugin.getAllocateResponse(req.DevicesIDs)
			if err != nil {
				return nil, fmt.Errorf("failed to get allocate response: %v", err)
			}
			responses.ContainerResponses = append(responses.ContainerResponses, response)
		} else {
			currentCtr, devreq, err := util.GetNextDeviceRequest(nvidia.NvidiaGPUDevice, *current)
			klog.Infoln("deviceAllocateFromAnnotation=", devreq)
			if err != nil {
				device.PodAllocationFailed(nodename, current, NodeLockNvidia)
				return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
			}
			if len(devreq) != len(reqs.ContainerRequests[idx].DevicesIDs) {
				device.PodAllocationFailed(nodename, current, NodeLockNvidia)
				return &kubeletdevicepluginv1beta1.AllocateResponse{}, errors.New("device number not matched")
			}

			response, err := plugin.getAllocateResponse(util.GetContainerDeviceStrArray(devreq))
			if err != nil {
				return nil, fmt.Errorf("failed to get allocate response: %v", err)
			}

			err = util.EraseNextDeviceTypeFromAnnotation(nvidia.NvidiaGPUDevice, *current)
			if err != nil {
				device.PodAllocationFailed(nodename, current, NodeLockNvidia)
				return &kubeletdevicepluginv1beta1.AllocateResponse{}, err
			}

			for i, dev := range devreq {
				limitKey := fmt.Sprintf("CUDA_DEVICE_MEMORY_LIMIT_%v", i)
				response.Envs[limitKey] = fmt.Sprintf("%vm", dev.Usedmem)
				/*tmp := response.Envs["NVIDIA_VISIBLE_DEVICES"]
				if i > 0 {
					response.Envs["NVIDIA_VISIBLE_DEVICES"] = fmt.Sprintf("%v,%v", tmp, dev.UUID)
				} else {
					response.Envs["NVIDIA_VISIBLE_DEVICES"] = dev.UUID
				}*/
			}
			response.Envs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprint(devreq[0].Usedcores)
			response.Envs["CUDA_DEVICE_MEMORY_SHARED_CACHE"] = fmt.Sprintf("%s/vgpu/%v.cache", hostHookPath, uuid.New().String())
			if *util.DeviceMemoryScaling > 1 {
				response.Envs["CUDA_OVERSUBSCRIBE"] = "true"
			}
			if *util.DisableCoreLimit {
				response.Envs[api.CoreLimitSwitch] = "disable"
			}

			cacheFileHostDirectory := fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name)
			os.RemoveAll(cacheFileHostDirectory)
			os.MkdirAll(cacheFileHostDirectory, 0777)
			os.Chmod(cacheFileHostDirectory, 0777)
			os.MkdirAll("/tmp/vgpulock", 0777)
			os.Chmod("/tmp/vgpulock", 0777)
			response.Mounts = append(response.Mounts,
				&kubeletdevicepluginv1beta1.Mount{
					ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath),
					HostPath:      hostHookPath + "/vgpu/libvgpu.so",
					ReadOnly:      true},
				&kubeletdevicepluginv1beta1.Mount{
					ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath),
					HostPath:      cacheFileHostDirectory,
					ReadOnly:      false},
				&kubeletdevicepluginv1beta1.Mount{
					ContainerPath: "/tmp/vgpulock",
					HostPath:      "/tmp/vgpulock",
					ReadOnly:      false},
			)
			found := false
			for _, val := range currentCtr.Env {
				if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 {
					// if env existed but is set to false or can not be parsed, ignore
					t, _ := strconv.ParseBool(val.Value)
					if !t {
						continue
					}
					// only env existed and set to true, we mark it "found"
					found = true
					break
				}
			}
			if !found {
				response.Mounts = append(response.Mounts,
					&kubeletdevicepluginv1beta1.Mount{
						ContainerPath: "/etc/ld.so.preload",
						HostPath:      hostHookPath + "/vgpu/ld.so.preload",
						ReadOnly:      true},
				)
			}
			_, err = os.Stat(fmt.Sprintf("%s/vgpu/license", hostHookPath))
			if err == nil {
				response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{
					ContainerPath: "/tmp/license",
					HostPath:      fmt.Sprintf("%s/vgpu/license", hostHookPath),
					ReadOnly:      true,
				})
				response.Mounts = append(response.Mounts, &kubeletdevicepluginv1beta1.Mount{
					ContainerPath: "/usr/bin/vgpuvalidator",
					HostPath:      fmt.Sprintf("%s/vgpu/vgpuvalidator", hostHookPath),
					ReadOnly:      true,
				})
			}
			responses.ContainerResponses = append(responses.ContainerResponses, response)
		}
	}
	klog.Infoln("Allocate Response", responses.ContainerResponses)
	device.PodAllocationTrySuccess(nodename, nvidia.NvidiaGPUDevice, NodeLockNvidia, current)
	return &responses, nil
}
// Cache files live under /usr/local/vgpu/containers/xxx/xxx
cacheFileHostDirectory := fmt.Sprintf("%s/vgpu/containers/%s_%s", hostHookPath, current.UID, currentCtr.Name)
os.RemoveAll(cacheFileHostDirectory)
os.MkdirAll(cacheFileHostDirectory, 0777)
os.Chmod(cacheFileHostDirectory, 0777)
os.MkdirAll("/tmp/vgpulock", 0777)
os.Chmod("/tmp/vgpulock", 0777)
response.Mounts = append(response.Mounts,
	// Mount libvgpu.so from the host into the Pod to replace the default NVIDIA driver library
	&kubeletdevicepluginv1beta1.Mount{
		ContainerPath: fmt.Sprintf("%s/vgpu/libvgpu.so", hostHookPath),
		HostPath:      hostHookPath + "/vgpu/libvgpu.so",
		ReadOnly:      true},
	// Mount the randomly generated per-container cache directory into the Pod for vGPU use
	&kubeletdevicepluginv1beta1.Mount{
		ContainerPath: fmt.Sprintf("%s/vgpu", hostHookPath),
		HostPath:      cacheFileHostDirectory,
		ReadOnly:      false},
	// A lock file
	&kubeletdevicepluginv1beta1.Mount{
		ContainerPath: "/tmp/vgpulock",
		HostPath:      "/tmp/vgpulock",
		ReadOnly:      false},
)
Replacing the dynamic library: this is done whenever CUDA_DISABLE_CONTROL=true is not set, as shown in the snippet below.
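The check in the snippet below relies on strconv.ParseBool, so it is worth noting which spellings of the CUDA_DISABLE_CONTROL value actually count as "true" and skip the /etc/ld.so.preload mount. A standalone sketch (not HAMi code) of that parsing behavior:

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// Values a container might set for CUDA_DISABLE_CONTROL. ParseBool accepts
	// 1/t/T/TRUE/true/True (and the corresponding false forms); anything else
	// fails to parse and is therefore treated as "control not disabled".
	for _, v := range []string{"true", "TRUE", "1", "false", "yes", ""} {
		t, err := strconv.ParseBool(v)
		fmt.Printf("CUDA_DISABLE_CONTROL=%q -> disable control: %v (parse error: %v)\n", v, t && err == nil, err)
	}
}
```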
found := false
for _, val := range currentCtr.Env {
	if strings.Compare(val.Name, "CUDA_DISABLE_CONTROL") == 0 {
		// if env existed but is set to false or can not be parsed, ignore
		t, _ := strconv.ParseBool(val.Value)
		if !t {
			continue
		}
		// only env existed and set to true, we mark it "found"
		found = true
		break
	}
}
if !found {
	response.Mounts = append(response.Mounts,
		&kubeletdevicepluginv1beta1.Mount{
			ContainerPath: "/etc/ld.so.preload",
			HostPath:      hostHookPath + "/vgpu/ld.so.preload",
			ReadOnly:      true},
	)
}
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
func NewNvidiaDevicePlugin(config *util.DeviceConfig, resourceManager rm.ResourceManager, cdiHandler cdi.Interface, cdiEnabled bool) *NvidiaDevicePlugin {
	_, name := resourceManager.Resource().Split()

	deviceListStrategies, _ := spec.NewDeviceListStrategies(*config.Flags.Plugin.DeviceListStrategy)

	return &NvidiaDevicePlugin{
		rm:                   resourceManager,
		config:               config,
		deviceListEnvvar:     "NVIDIA_VISIBLE_DEVICES",
		deviceListStrategies: deviceListStrategies,
		socket:               kubeletdevicepluginv1beta1.DevicePluginPath + "nvidia-" + name + ".sock",
		cdiHandler:           cdiHandler,
		cdiEnabled:           cdiEnabled,
		cdiAnnotationPrefix:  *config.Flags.Plugin.CDIAnnotationPrefix,

		// These will be reinitialized every
		// time the plugin server is restarted.
		server: nil,
		health: nil,
		stop:   nil,
	}
}