Documentation
¶
Overview ¶
Package common provides a series of common functions
Package common provides a series of common functions ¶
Package common provides a series of common functions ¶
Package common provides a series of common functions ¶
Package common provides a series of common functions ¶
Package common provides a series of common functions ¶
Package common provides a series of common functions ¶
Package common provides a series of common functions
Index ¶
- Constants
- Variables
- func CheckCardUsageMode(use310PMixedInsert bool, productTypes []string) error
- func CheckDeviceName(deviceName, deviceRunMode string) bool
- func CheckErrorMessage(err error, target string) bool
- func CheckFileUserSameWithProcess(loggerPath string) bool
- func CheckPodNameAndSpace(podPara string, maxLength int) error
- func ConvertDevListToSets(devices, sepType string) sets.String
- func CountFaultDuration(device *NpuDevice, devFaultInfoMap map[int32][]common.DevFaultInfo)
- func DelOnceFrequencyFault()
- func DelOnceRecoverFault(groupDevice map[string][]*NpuDevice)
- func DeleteManuallyFaultInfo(logicID int32)
- func FakeAiCoreDevice(dev DavinCiDev, aiCoreDevices *[]*NpuDevice)
- func FilterPods(pods []v1.Pod, deviceType string, conditionFunc func(pod *v1.Pod) bool) []v1.Pod
- func GenResetDirName(namespace, name string) string
- func GenResetFileName(namespace, name string) string
- func GenResetTypeFileName(namespace, name string) string
- func Get310PProductType() map[string]string
- func GetAICore(templateName string) (int, error)
- func GetAllDeviceInfoTypeList() map[string]struct{}
- func GetAndCleanFaultInfo() map[int32][]common.DevFaultInfo
- func GetAndCleanLogicID() []int32
- func GetChangedDevFaultInfo(device *NpuDevice, oldErrCodes []int64, newErrCodes []int64) []common.DevFaultInfo
- func GetDefaultDevices(getFdFlag bool) ([]string, error)
- func GetDeviceFromPodAnnotation(pod *v1.Pod, deviceType string) ([]string, error)
- func GetDeviceID(deviceName string, ascendRuntimeOptions string) (int, int, error)
- func GetDeviceListID(devices []string, ascendRuntimeOptions string) (map[int]int, []int, error)
- func GetDeviceRunMode() (string, error)
- func GetFaultAssertionName(assertion int8) string
- func GetFaultType(faultCodes []int64, logicId int32) string
- func GetFaultTypeByCode(faultCodes []int64) string
- func GetFaultTypeFromFaultDuration(logicId int32, mode string) string
- func GetFaultTypeFromFaultFrequency(logicId int32) string
- func GetJobNameOfPod(pod *v1.Pod) string
- func GetNetworkFaultType(faultCodes []int64, logicId int32) string
- func GetNetworkFaultTypeByCode(faultCodes []int64) string
- func GetPattern() map[string]*regexp.Regexp
- func GetPodAnnotationByDeviceType(pod *v1.Pod, deviceType string) (string, error)
- func GetPodConfiguration(phyDevMapVirtualDev map[int]int, devices map[int]string, podName string, ...) string
- func GetPodNameFromEnv() (string, error)
- func GetSwitchFaultCode() []int64
- func GetTemplateName2DeviceTypeMap() map[string]string
- func GetTimeoutFaultCodes(mode string) []int64
- func GetVNPUSegmentInfo(deviceInfos []string) (int32, string, error)
- func Int32Join(data []int32, sep string) string
- func IntInList(num int32, list []int32) bool
- func IsContainAll300IDuo() bool
- func IsContainAtlas300IDuo() bool
- func IsValidNumber(checkVal string) (int64, bool)
- func IsVirtualDev(devType string) bool
- func LoadFaultCode(faultCodeBytes []byte) error
- func LoadFaultCodeFromFile() error
- func LoadFaultCustomization(faultCustomizationByte []byte) error
- func LoadFaultCustomizationFromFile() error
- func LoadSwitchFaultCode(switchFaultCodeByte []byte) error
- func LoadSwitchFaultCodeFromFile() error
- func LockAllDeviceInfo()
- func MakeDataHash(data interface{}) string
- func MapDeepCopy(source map[string]string) map[string]string
- func MarshalData(data interface{}) []byte
- func NewSignWatcher(osSigns ...os.Signal) chan os.Signal
- func QueryManuallyFaultInfoByLogicID(logicID int32) bool
- func QueryManuallyFaultNPULogicIDsByHandleStatus(handleStatus string) []int32
- func RandomInt64(min, max int64) int64
- func RecordFaultInfoList(devFaultInfoList []*TaskDevInfo)
- func RemoveFileAndDir(namespace, name string) error
- func ResetFaultCustomizationCache()
- func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo)
- func SaveManuallyFaultInfo(logicID int32)
- func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, ...)
- func SetDeviceInit(logicID int32)
- func SetManuallyFaultNPUHandled()
- func SetNetworkNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)
- func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)
- func SetSwitchFaultCode(newFaults []int64)
- func ShareDev() bool
- func ToString(devices sets.String, sepType string) string
- func UnlockAllDeviceInfo()
- func VerifyPathAndPermission(verifyPath string, waitSecond int) (string, bool)
- func WriteToFile(info, path string) error
- type AtomicBool
- type DavinCiDev
- type DevFaultInfo
- type DevFaultInfoBasedTimeAscend
- type DevStatusSet
- type Device
- type DeviceFault
- type DeviceHealth
- type FaultCustomization
- type FaultDuration
- type FaultDurationCache
- type FaultDurationCustomization
- type FaultDurationData
- type FaultFrequency
- type FaultFrequencyCache
- type FaultFrequencyCustomization
- type FaultTypeCode
- type FileWatch
- type GraceToleranceCustomization
- type HbmFaultManager
- type Instance
- type ManuallyFaultInfo
- type NodeDeviceInfo
- type NodeDeviceInfoCache
- type NpuAllInfo
- type NpuBaseInfo
- type NpuDevice
- type Option
- type PodDeviceInfo
- type ServerInfo
- type SuperPodInfo
- type SwitchFaultFileInfo
- type SwitchFaultInfo
- type TaskDevInfo
- type TaskFaultInfo
- type TaskFaultInfoCache
- type TaskResetInfo
- type TaskResetInfoCache
Constants ¶
const ( // Component component name Component = "device-plugin" // MaxBackups log file max backup MaxBackups = 30 // MaxAge the log file last time MaxAge = 7 // KubeEnvMaxLength k8s env name max length KubeEnvMaxLength = 230 // PodNameMaxLength pod name max length PodNameMaxLength = 253 // PodNameSpaceMaxLength pod name space max length PodNameSpaceMaxLength = 63 // MaxPodLimit max pod num MaxPodLimit = 10000 // MaxContainerLimit max container num MaxContainerLimit = 300000 // RetryUpdateCount is max number of retry resource update RetryUpdateCount = 3 // GetPodFromInformerTime is max number of get pod from informer GetPodFromInformerTime = 3 // MaxDeviceNameLen max length of device name, like "Ascend310P-4c.3cpu-100-0" MaxDeviceNameLen = 50 // MaxGRPCRecvMsgSize 4MB MaxGRPCRecvMsgSize = 4 * 1024 * 1024 // MaxGRPCConcurrentStreams limit on the number of concurrent streams to each ServerTransport. MaxGRPCConcurrentStreams = 64 // MaxConcurrentLimit limit over listener MaxConcurrentLimit = 64 // MaxIPConnectionLimit limit over ip MaxIPConnectionLimit = 64 // CacheSize cache for ip CacheSize = 128 // MaxVirtualDeviceNum max num of virtual device MaxVirtualDeviceNum = 1024 // CMDataMaxLength configMap max data size 1MB CMDataMaxLength = 1024 * 1024 // PodAnnotationMaxLength pod annotation max data length 1MB PodAnnotationMaxLength = 1024 * 1024 // UpdatePodWaitTime default try update pod wait time 200 millisecond UpdatePodWaitTime = 200 // DeviceInfoCMNameSpace namespace of device info configmap DeviceInfoCMNameSpace = "kube-system" // DeviceInfoCMNamePrefix device info configmap name prefix DeviceInfoCMNamePrefix = "mindx-dl-deviceinfo-" // DeviceInfoCMDataKey device info configmap data key DeviceInfoCMDataKey = "DeviceInfoCfg" // SwitchInfoCMDataKey the key of switch info in device-info configmap SwitchInfoCMDataKey = "SwitchInfoCfg" // DeviceInfoCMManuallySeparateNPUKey for deviceinfo configmap ManuallySeparateNPU key DeviceInfoCMManuallySeparateNPUKey = 
"ManuallySeparateNPU" // SlowNodeNoticeCMName the name for slow node notice configmap SlowNodeNoticeCMName = "steptime-dtpgroup" // CmConsumer who uses these configmap CmConsumer = "mx-consumer-cim" // CmConsumerValue the value only for true CmConsumerValue = "true" // AscendVisibleDevicesEnv visible devices env AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES" // PodPredicateTime pod predicate time PodPredicateTime = "predicate-time" // Pod2kl pod annotation key, means kubelet allocate device Pod2kl = "kltDev" // PodRealAlloc pod annotation key, means pod real mount device PodRealAlloc = "AscendReal" // Pod910DeviceKey pod annotation key, for generate 910 hccl rank table Pod910DeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration" // BaseDeviceInfoKey base device info key BaseDeviceInfoKey = "baseDeviceInfos" // ChipNameLabel update chip name to node label ChipNameLabel = "node.kubernetes.io/npu.chip.name" // MetaDataAnnotation downward api which map annotation from volcano to container's env MetaDataAnnotation = "metadata.annotations" // MetaData is meta data of pod MetaData = "metadata" // SlowNodeStepTimeEnvNum is the number of environment value for step time cm SlowNodeStepTimeEnvNum = 2 // PerfDumpPathEnv is an environment variable for slow node step time configmap PerfDumpPathEnv = "PERF_DUMP_PATH" // PerfDumpConfigEnv is an environment variable for slow node step time configmap PerfDumpConfigEnv = "PERF_DUMP_CONFIG" // PodResourceSeverKey for pod resource key PodResourceSeverKey = "podResource" // VirtualDev Virtual device tag VirtualDev = "VIRTUAL" // PhyDeviceLen like Ascend910-0 split length is 2 PhyDeviceLen = 2 // VirDeviceLen like Ascend910-2c-100-1 split length is 4 VirDeviceLen = 4 // MaxDevicesNum max device num MaxDevicesNum = 100 // MaxCardNum max card num MaxCardNum = 64 // MaxDevNumInCard max device num in card MaxDevNumInCard = 4 // MaxRequestVirtualDeviceNum max request device num MaxRequestVirtualDeviceNum = 1 // LabelDeviceLen 
like Ascend910-0 split length is 2 LabelDeviceLen = 2 // DefaultDeviceIP device ip address DefaultDeviceIP = "127.0.0.1" // NormalState health state NormalState = uint32(0) // GeneralAlarm health state GeneralAlarm = uint32(1) // SocketChmod socket file mode SocketChmod = 0600 // RunMode310 for 310 chip RunMode310 = "ascend310" // RunMode910 for 910 chip RunMode910 = "ascend910" // RunMode310P for 310P chip RunMode310P = "ascend310P" // AMPMode for AMP chip work mode AMPMode = "AMP" // SMPMode for SMP chip work mode SMPMode = "SMP" // Interval interval time Interval = 1 // Timeout time Timeout = 10 // BaseDec base BaseDec = 10 // BitSize base size BitSize = 64 // BitSize32 base size 32 BitSize32 = 32 // SleepTime The unit is seconds SleepTime = 5 // GeneralMapSize general map size GeneralMapSize = 8 // MapSizeTwo map size two MapSizeTwo = 2 // GeneralSubscribeTime general subscribe try time GeneralSubscribeTime = 3 // Hex hexadecimal Hex = 16 // SecondMagnification is second-level unit magnification SecondMagnification = 1000 // SecondMagnificationFloat is second-level unit magnification float SecondMagnificationFloat = 1000.0 )
const ( // ResourceNamePrefix prefix ResourceNamePrefix = "huawei.com/" // DistributedJob annotation indicates that the job is distributed DistributedJob = "distributed-job" // Ascend310P 310p Ascend310P = "Ascend310P" // Ascend310PV 310P-V Ascend310PV = Ascend310P + "-V" // Ascend310PVPro 310P-VPro Ascend310PVPro = Ascend310P + "-VPro" // Ascend310PIPro 310P-IPro Ascend310PIPro = Ascend310P + "-IPro" // Ascend310Pc1 Ascend310P 1 core Ascend310Pc1 = Ascend310P + "-" + Core1 // Ascend310Pc2 Ascend310P 2 core Ascend310Pc2 = Ascend310P + "-" + Core2 // Ascend310Pc4 Ascend310P 4 core Ascend310Pc4 = Ascend310P + "-" + Core4 // Ascend310Pc4Cpu3 Ascend310P 4core 3cpu Ascend310Pc4Cpu3 = Ascend310P + "-" + Core4Cpu3 // Ascend310Pc2Cpu1 Ascend310P 2core 1cpu Ascend310Pc2Cpu1 = Ascend310P + "-" + Core2Cpu1 // Ascend310Pc4Cpu4Dvpp Ascend310P 4core 4cpu dvpp Ascend310Pc4Cpu4Dvpp = Ascend310P + "-" + Core4Cpu4Dvpp // Ascend310Pc4Cpu3Ndvpp Ascend310P 4core 3cpu ndvpp Ascend310Pc4Cpu3Ndvpp = Ascend310P + "-" + Core4Cpu3Ndvpp // HuaweiAscend310P with prefix HuaweiAscend310P = ResourceNamePrefix + Ascend310P // Ascend910 910 Ascend910 = "Ascend910" // Ascend910c2 Ascend910 2core Ascend910c2 = Ascend910 + "-" + Core2 // Ascend910c4 Ascend910 4core Ascend910c4 = Ascend910 + "-" + Core4 // Ascend910c8 Ascend910 8core Ascend910c8 = Ascend910 + "-" + Core8 // Ascend910c16 Ascend910 16core Ascend910c16 = Ascend910 + "-" + Core16 // Ascend910c5Cpu1Gb8 Ascend910 5core 1cpu 8 Gb memory Ascend910c5Cpu1Gb8 = Ascend910 + "-" + Core5Cpu1Gb8 // Ascend910c5Cpu1Gb16 Ascend910 5core 1cpu 16Gb memory Ascend910c5Cpu1Gb16 = Ascend910 + "-" + Core5Cpu1Gb16 // Ascend910c6Cpu1Gb16 Ascend910 6core 1cpu 16Gb memory Ascend910c6Cpu1Gb16 = Ascend910 + "-" + Core6Cpu1Gb16 // Ascend910c10Cpu3Gb16 Ascend910 10core 3cpu 16Gb memory Ascend910c10Cpu3Gb16 = Ascend910 + "-" + Core10Cpu3Gb16 // Ascend910c10Cpu3Gb16Ndvpp Ascend910 10core 3cpu 16Gb memory ndvpp Ascend910c10Cpu3Gb16Ndvpp = Ascend910 + "-" + 
Core10Cpu3Gb16Ndvpp // Ascend910c10Cpu3Gb32 Ascend910 10core 3cpu 32Gb memory Ascend910c10Cpu3Gb32 = Ascend910 + "-" + Core10Cpu3Gb32 // Ascend910c10Cpu4Gb16Dvpp Ascend910 10core 4cpu 16Gb memory dvpp Ascend910c10Cpu4Gb16Dvpp = Ascend910 + "-" + Core10Cpu4Gb16Dvpp // Ascend910c12Cpu3Gb32 Ascend910 12core 3cpu 32Gb memory Ascend910c12Cpu3Gb32 = Ascend910 + "-" + Core12Cpu3Gb32 // Ascend910c3Cpu1Gb8 Ascend910 3core 1cpu 8Gb memory Ascend910c3Cpu1Gb8 = Ascend910 + "-" + Core3Cpu1Gb8 // HuaweiAscend910 with prefix HuaweiAscend910 = ResourceNamePrefix + Ascend910 // Ascend310 310 Ascend310 = "Ascend310" // Ascend310B 310B chip Ascend310B = "Ascend310B" // HuaweiAscend310 with prefix HuaweiAscend310 = ResourceNamePrefix + Ascend310 // AscendfdPrefix use in fd AscendfdPrefix = "davinci-mini" // Ascend910B ascend 910B chip Ascend910B = "Ascend910B" // Ascend910A3 ascend 910A3 chip Ascend910A3 = "Ascend910A3" // HuaweiNetworkUnHealthAscend910 910 network unhealthy HuaweiNetworkUnHealthAscend910 = ResourceNamePrefix + "Ascend910-NetworkUnhealthy" // HuaweiUnHealthAscend910 unhealthy HuaweiUnHealthAscend910 = ResourceNamePrefix + Ascend910 + "-Unhealthy" // HuaweiRecoveringAscend910 recovering HuaweiRecoveringAscend910 = ResourceNamePrefix + Ascend910 + "-Recovering" // HuaweiUnHealthAscend310P 310p unhealthy HuaweiUnHealthAscend310P = ResourceNamePrefix + Ascend310P + "-Unhealthy" // HuaweiUnHealthAscend310 310 unhealthy HuaweiUnHealthAscend310 = ResourceNamePrefix + Ascend310 + "-Unhealthy" // HuaweiNetworkRecoverAscend910 910 network recover HuaweiNetworkRecoverAscend910 = ResourceNamePrefix + Ascend910 + "-NetworkRecover" // HuaweiRecoverAscend910 910 recover HuaweiRecoverAscend910 = ResourceNamePrefix + Ascend910 + "-Recover" // HuaweiFaultCodeAscend910 910 fault code HuaweiFaultCodeAscend910 = ResourceNamePrefix + Ascend910 + "-Fault" // HuaweiFaultCodeAscend310P 310p fault code HuaweiFaultCodeAscend310P = ResourceNamePrefix + Ascend310P + "-Fault" // 
HuaweiFaultCodeAscend310 310 fault code HuaweiFaultCodeAscend310 = ResourceNamePrefix + Ascend310 + "-Fault" // AiCoreResourceName resource name for virtual device AiCoreResourceName = "npu-core" // Core1 1 core Core1 = "1c" // Core2 2 core Core2 = "2c" // Core2Cpu1 2core 1cpu Core2Cpu1 = "2c.1cpu" // Core3Cpu1Gb8 3 core, 1 cpu and 8GB memory Core3Cpu1Gb8 = "3c.1cpu.8g" // Core4 4 core Core4 = "4c" // Core4Cpu3 4core 3cpu Core4Cpu3 = "4c.3cpu" // Core4Cpu3Ndvpp 4core 3cpu ndvpp Core4Cpu3Ndvpp = "4c.3cpu.ndvpp" // Core4Cpu4Dvpp 4core 4cpu dvpp Core4Cpu4Dvpp = "4c.4cpu.dvpp" // Core5Cpu1Gb8 5 core, 1 cpu and 8GB memory Core5Cpu1Gb8 = "5c.1cpu.8g" // Core5Cpu1Gb16 5 core, 1 cpu and 16GB memory Core5Cpu1Gb16 = "5c.1cpu.16g" // Core6Cpu1Gb16 6 core, 1 cpu and 16GB memory Core6Cpu1Gb16 = "6c.1cpu.16g" // Core8 8 core Core8 = "8c" // Core10Cpu3Gb16 10 core, 3 cpu and 16Gb memory Core10Cpu3Gb16 = "10c.3cpu.16g" // Core10Cpu3Gb16Ndvpp 10 core, 3 cpu, 16Gb memory and ndvpp Core10Cpu3Gb16Ndvpp = "10c.3cpu.16g.ndvpp" // Core10Cpu3Gb32 10 core, 3 cpu and 32GB memory Core10Cpu3Gb32 = "10c.3cpu.32g" // Core10Cpu4Gb16Dvpp 10 core, 4 cpu, 16Gb memory and dvpp Core10Cpu4Gb16Dvpp = "10c.4cpu.16g.dvpp" // Core12Cpu3Gb32 12 core, 3 cpu and 32GB memory Core12Cpu3Gb32 = "12c.3cpu.32g" // Core16 16 core Core16 = "16c" // Vir01 template name vir01 Vir01 = "vir01" // Vir02 template name vir02 Vir02 = "vir02" // Vir02C1 template name vir02_1c Vir02C1 = "vir02_1c" // Vir03C1G8 template name vir03_1c_8g Vir03C1G8 = "vir03_1c_8g" // Vir04 template name vir04 Vir04 = "vir04" // Vir04C3 template name vir04_3c Vir04C3 = "vir04_3c" // Vir04C4Dvpp template name vir04_4c_dvpp Vir04C4Dvpp = "vir04_4c_dvpp" // Vir04C3Ndvpp template name vir04_3c_ndvpp Vir04C3Ndvpp = "vir04_3c_ndvpp" // Vir05C1G8 template name vir05_1c_8g Vir05C1G8 = "vir05_1c_8g" // Vir05C1G16 template name vir05_1c_16g Vir05C1G16 = "vir05_1c_16g" // Vir06C1G16 template name vir06_1c_16g Vir06C1G16 = "vir06_1c_16g" // Vir08 template 
name vir08 Vir08 = "vir08" // Vir10C3G16 template name vir10_3c_16g Vir10C3G16 = "vir10_3c_16g" // Vir10C3G16NM template name vir10_3c_16g_nm Vir10C3G16NM = "vir10_3c_16g_nm" // Vir10C3G32 template name vir10_3c_32g Vir10C3G32 = "vir10_3c_32g" // Vir10C4G16M template name vir10_4c_16g_m Vir10C4G16M = "vir10_4c_16g_m" // Vir12C3G32 template name vir12_3c_32g Vir12C3G32 = "vir12_3c_32g" // Vir16 template name vir16 Vir16 = "vir16" // VirMark the mark of virtual device VirMark = "vir" // AnnotationVNPUInfoSplitLen length of pod annotation for allocate vnpu info AnnotationVNPUInfoSplitLen = 2 // MaxAICoreNum max ai core num MaxAICoreNum = 32 // MinAICoreNum min ai core num MinAICoreNum = 8 // DefaultIDForCreateVNPU default id for creating vnpu DefaultIDForCreateVNPU = 0xFFFFFFFF // ServerTypeInfoMinLen the min len of server type split data ServerTypeInfoMinLen = 2 // VGroupAndDevLen a list only contain virtual group and device VGroupAndDevLen = 2 MaxShareDevCount = 100 )
const ( // ServerTypeLabelKey the node label key of server type ServerTypeLabelKey = "servertype" // AcceleratorTypeKey the node label key of accelerator type AcceleratorTypeKey = "accelerator-type" // A300IA2Label the value of the A300I A2 node label A300IA2Label = "card-910b-infer" // ServerUsageLabelKey is to indicate the usage of server // is infer or training, currently only related to A800IA2 infer server ServerUsageLabelKey = "server-usage" // InferCardKey the node label key of infer card InferCardKey = "infer-card-type" // A300IDuoLabel the value of the A300I Duo node label A300IDuoLabel = "card-300i-duo" )
const ( // HiAIHDCDevice hisi_hdc HiAIHDCDevice = "/dev/hisi_hdc" // HiAIManagerDevice davinci_manager HiAIManagerDevice = "/dev/davinci_manager" // HiAIManagerDeviceDocker davinci_manager for docker HiAIManagerDeviceDocker = "/dev/davinci_manager_docker" // HiAISVMDevice devmm_svm HiAISVMDevice = "/dev/devmm_svm" // HiAi200RCSVM0 svm0 HiAi200RCSVM0 = "/dev/svm0" // HiAi200RCLog log_drv HiAi200RCLog = "/dev/log_drv" // HiAi200RCEventSched event_sched HiAi200RCEventSched = "/dev/event_sched" // HiAi200RCUpgrade upgrade HiAi200RCUpgrade = "/dev/upgrade" // HiAi200RCHiDvpp hi_dvpp HiAi200RCHiDvpp = "/dev/hi_dvpp" // HiAi200RCMemoryBandwidth memory_bandwidth HiAi200RCMemoryBandwidth = "/dev/memory_bandwidth" // HiAi200RCTsAisle ts_aisle HiAi200RCTsAisle = "/dev/ts_aisle" )
const ( // Atlas200ISoc 200 soc env Atlas200ISoc = "Atlas 200I SoC A1" // Atlas200ISocXSMEM is xsmem_dev Atlas200ISocXSMEM = "/dev/xsmem_dev" // Atlas200ISocSYS is sys Atlas200ISocSYS = "/dev/sys" // Atlas200ISocVDEC is vdec Atlas200ISocVDEC = "/dev/vdec" // Atlas200ISocVPC is vpc Atlas200ISocVPC = "/dev/vpc" // Atlas200ISocSpiSmbus is spi_smbus Atlas200ISocSpiSmbus = "/dev/spi_smbus" // Atlas200ISocUserConfig is user_config Atlas200ISocUserConfig = "/dev/user_config" )
const ( // Atlas310BDvppCmdlist is dvpp_cmdlist Atlas310BDvppCmdlist = "/dev/dvpp_cmdlist" // Atlas310BPngd is pngd Atlas310BPngd = "/dev/pngd" // Atlas310BVenc is venc Atlas310BVenc = "/dev/venc" )
const ( Atlas310BAcodec = "/dev/acodec" Atlas310BAi = "/dev/ai" Atlas310BAo = "/dev/ao" Atlas310BVo = "/dev/vo" Atlas310BHdmi = "/dev/hdmi" )
Audio and video dependent devices for Atlas310B
const ( // RootUID is root user id RootUID = 0 // RootGID is root group id RootGID = 0 // KeySliceLength is the length of key slice check KeySliceLength = 2 // DotSepDev is the separator between devices on labels DotSepDev = "." // CommaSepDev is the separator between devices on annotation CommaSepDev = "," // MiddelLine is the separator between devices for split id MiddelLine = "-" // UnderLine is the separator between ids UnderLine = "_" // NoNPUResource means allocated some devices that don't exist NoNPUResource = "NoNPUResource" // NPUSegmentFailed means create vnpu device failed NPUSegmentFailed = "NPUSegmentFailed" // CenterScene deploy the device-plugin component on the central side CenterScene = "center" // EdgeScene deploy the device-plugin component on the edge side EdgeScene = "edge" // A300IA2BoardId board id of A300I A2 A300IA2BoardId = 0x28 // A800IA2NoneHccsBoardIdOld is the boardid of a800i a2 device, 0x33 is server without hccs A800IA2NoneHccsBoardIdOld = 0x33 // A800IA2NoneHccsBoardId 0x33 changed to 0x3c, and compatible with the old boardId, since 2024.9.4 A800IA2NoneHccsBoardId = 0x3c // EmptyBoardId is the boardid of device before initialized EmptyBoardId = 0x00 // FirstDevice the first device id FirstDevice = 0 // Infer means device for inference Infer = "infer" // Train means device for training Train = "train" )
const ( DeviceNotSupport = 8255 // DefaultAiCoreNum set a default value of aicore number DefaultAiCoreNum = 1 )
Special scene for invoking the dcmi interface
const ( // Atlas300IDuo for hot reset function, sync chip healthy state Atlas300IDuo = "Atlas 300I Duo" // HotResetClose not using chip hot reset function HotResetClose = -1 // HotResetInfer using infer chip hot reset HotResetInfer = 0 // HotResetTrainOnLine using train chip hot reset online HotResetTrainOnLine = 1 // HotResetTrainOffLine using train chip hot reset offline HotResetTrainOffLine = 2 // BootStartFinish chip hot reset finish BootStartFinish = 16 )
const ( // Ascend910RingsNum indicates the number of devices in a ring Ascend910RingsNum = 4 // Ascend910BRingsNumTrain indicates the number of devices in a ring Ascend910BRingsNumTrain = 8 // Ascend910BRingsNumInfer indicates the number of devices in a ring Ascend910BRingsNumInfer = 1 // Ascend910A3RingsNum indicates the number of devices in a ring Ascend910A3RingsNum = 2 // RingSum indicates the max number of rings RingSum = 2 // RankIndexKey for obtaining the rank index in the pod RankIndexKey = "hccl/rankIndex" // InferRankIndex indicates the rank index of infer situation (rank index is meaningless in infer situation) InferRankIndex = "-1" // WaitResetEndTime for wait device reset to complete WaitResetEndTime = 120 // WaitRetryTime for wait five seconds to reset device again WaitRetryTime = 5 // ResetRetryTimes for max retry times when reset failed ResetRetryTimes = 4 )
const ( // ResetInfoDir dir for reset info ResetInfoDir = "/user/restore/reset/" // ResetInfoCMNamePrefix for reset configmap name prefix ResetInfoCMNamePrefix = "reset-config-" // ResetInfoCMDataKey for reset configmap data key ResetInfoCMDataKey = "reset.json" // ResetInfoCMCheckCodeKey for reset configmap checkcode key ResetInfoCMCheckCodeKey = "checkCode" // ResetInfoTypeKey for reset configmap type key ResetInfoTypeKey = "restartType" // HotResetRestartType for hot reset restart type HotResetRestartType = "hotReset" // ResetTaskNameKey for obtain the reset task name ResetTaskNameKey = "volcano.sh/job-name" // ResetTaskNameKeyInLabel for obtain the reset task name when using operator ResetTaskNameKeyInLabel = "training.kubeflow.org/job-name" )
const ( // FaultInfoCMNamePrefix for fault configmap name prefix FaultInfoCMNamePrefix = "fault-config-" // FaultInfoCMDataKey for fault configmap data key FaultInfoCMDataKey = "fault-npus" // FaultInfoCMCheckCodeKey for fault configmap checkcode key FaultInfoCMCheckCodeKey = "checkCode" )
const ( // EmptyError indicates that there is no fault EmptyError = "empty" // IgnoreError indicates that the current fault can be ignored IgnoreError = "ignore" // RestartRequestError indicates that the task only needs to re-execute this request RestartRequestError = "restart_request" // RestartError indicates that the training needs to be re-executed for the current fault RestartError = "restart" // FreeResetError indicates the fault level of the device to be reset whenever there is no task on NPU FreeResetError = "free_reset" // ResetError indicates that the current fault requires resetting the chip and re-executing the training ResetError = "reset" // IsolateError indicates that the device needs to be isolated due to the current fault IsolateError = "isolate" )
const ( // EmptyErrorLevel indicates the level of no fault state EmptyErrorLevel = iota // IgnoreErrorLevel indicates the level of a fault that can be ignored IgnoreErrorLevel // RestartRequestErrorLevel indicates that the task only needs to re-execute this request RestartRequestErrorLevel // RestartErrorLevel indicates the level of the fault that needs to be re-executed RestartErrorLevel // FreeResetErrorLevel indicates the fault level of the device to be reset whenever there is no task on NPU FreeResetErrorLevel // ResetErrorLevel indicates the fault level of the device to be reset ResetErrorLevel // IsolateErrorLevel indicates the fault level of the device to be isolated IsolateErrorLevel )
const ( // UnrecoveredStatus indicates the status before recovery UnrecoveredStatus = "unrecovered" // RecoveredStatus indicates that the recovery is successful RecoveredStatus = "recovered" // RecoverFailedStatus indicates that the recovery fails RecoverFailedStatus = "failed" )
const ( // AssertionRecovery the name of assertion 0 AssertionRecovery = "Recovery" // AssertionOccur the name of assertion 1 AssertionOccur = "Occur" // AssertionNotice the name of assertion 2 AssertionNotice = "Notice" // TimeFormat the format for time TimeFormat = "2006-01-02 15:04:05" // ResourceKindPod the kind pod of resource ResourceKindPod = "pod" )
const ( // PollFaultCodeCMInterval is the default interval(second) of polling fault code CM PollFaultCodeCMInterval = 300 // PollFaultCodeCMMaxInterval is the max interval(second) of polling fault code CM PollFaultCodeCMMaxInterval = 3600 // PollFaultCodeCMMinInterval is the min interval(second) of polling fault code CM PollFaultCodeCMMinInterval = 30 // GetSwitchFaultCodeInterval is the interval(second) of get all fault code by get interface GetSwitchFaultCodeInterval = 300 // FaultCodeCMName is the name of the configmap that is used to save fault code FaultCodeCMName = "mindx-dl-fault-config" // FaultCodeCMNameSpace is the namespace of the fault code configmap FaultCodeCMNameSpace = "kube-system" // FaultCodeKey is the key to find fault code in cm FaultCodeKey = "faultCode.json" // SwitchFaultCodeKey is the key of the switch fault code SwitchFaultCodeKey = "SwitchFaultCode.json" // FaultCustomizationKey is the key to find fault customization in cm FaultCustomizationKey = "faultCustomization.json" // PollIntervalKey is the key to find poll interval in cm PollIntervalKey = "PollInterval" // DefaultProcessReadCMTime is the default time for process read configmap DefaultProcessReadCMTime = 30 // DefaultWaitFaultSelfHealingTime for waiting for fault self-healing DefaultWaitFaultSelfHealingTime = 15 // MinWaitFaultSelfHealingTime for min time of waiting for fault self-healing MinWaitFaultSelfHealingTime = 1 // MaxWaitFaultSelfHealingTime for max time of waiting for fault self-healing MaxWaitFaultSelfHealingTime = 30 // DefaultPollingInterval represents the time between polls of the dcmi interface DefaultPollingInterval = 1 // MaxWaitProcessReadCMTime for max time waiting for process to read cm MaxWaitProcessReadCMTime = 90 // MinWaitProcessReadCMTime for min time waiting for process to read cm MinWaitProcessReadCMTime = 5 // DefaultWaitDeviceResetTime is the default time used in waiting device reset DefaultWaitDeviceResetTime = 150 // MaxWaitDeviceResetTime is the max 
time used in waiting device reset MaxWaitDeviceResetTime = 180 // MinWaitDeviceResetTime is the min time used in waiting device reset MinWaitDeviceResetTime = 60 // MaxFaultFrequencyTimeWindow is the max time for the time window of fault frequency MaxFaultFrequencyTimeWindow = 864000 // MinFaultFrequencyTimeWindow is the min time for the time window of fault frequency MinFaultFrequencyTimeWindow = 60 // MaxFaultFrequencyTimes is the max count for the fault occurrence time of fault frequency MaxFaultFrequencyTimes = 100 // MinFaultFrequencyTimes is the min count for the fault occurrence time of fault frequency MinFaultFrequencyTimes = 1 // DefaultLinkUpTimeout is the default time for the linkup event DefaultLinkUpTimeout = 60 // MinLinkUpTimeout is the min time for the linkup event MinLinkUpTimeout = 1 // MaxLinkUpTimeout is the max time for the linkup event MaxLinkUpTimeout = 60 // MinLinkDownTimeout is the min time for the linkdown event MinLinkDownTimeout = 1 // MaxLinkDownTimeout is the max time for the linkdown event MaxLinkDownTimeout = 30 // MaxFaultTimeout is the max time(s) for the fault duration time of fault duration MaxFaultTimeout = 600 // MinFaultTimeout is the min time(s) for the fault duration time of fault duration MinFaultTimeout = 0 // MaxRecoverTimeout is the max time(s) for the fault recover duration time of fault duration MaxRecoverTimeout = 600 // MinRecoverTimeout is the min time(s) for the fault recover duration time of fault duration MinRecoverTimeout = 0 // DefaultSubscribeToPollingTime is the default time from subscribe to polling DefaultSubscribeToPollingTime = 5 // MaxLogicID is the maximum logic ID MaxLogicID = 15 // MinLogicID is the minimum logic ID MinLogicID = 0 // MaxResetTimes the max reset times of a device while error happened, // setting to 30 to avoid manually reset on host machine MaxResetTimes = 30 )
Fault customization constants
const ( FaultSeveritySuggestion = iota FaultSeverityMinor FaultSeverityMajor FaultSeverityCritical )
The severity levels of a fault
const ( ManuallySeparateNpuFirstHandle = "FirstHandle" ManuallySeparateNpuHandled = "Handled" ManuallySeparateNpuAll = "All" )
LogicID list for reset, get id list of ring
const ( // SdIdAbnormal represents super pod sdid abnormal value SdIdAbnormal = -2 // ScaleTypeAbnormal represents super pod scaleType abnormal value ScaleTypeAbnormal = -2 // SuperPodIdAbnormal represents super pod superPodId abnormal value SuperPodIdAbnormal = -2 // ServerIdAbnormal represents super pod serverId abnormal value ServerIdAbnormal = -2 )
const ( // TimeoutProcess represents fault timeout process TimeoutProcess = "fault timeout" // TimeoutRecoverProcess represents fault timeout recover process TimeoutRecoverProcess = "fault timeout recover" )
const ( // ChipFaultMode represents chip fault mode ChipFaultMode = "chip fault mode" // NetworkFaultMode represents network fault mode NetworkFaultMode = "network fault mode" )
const ( // Polling represents subscribe mode invalid and polling is used scenario Polling = "polling" // Subscribe represents subscribe mode Subscribe = "subscribe" )
const ( // NPUNormalStatus represents normal status NPUNormalStatus = "normal" // NPUResettingStatus represents resetting status NPUResettingStatus = "resetting" // UpdateAnnotationRetryTimes update annotation retry times UpdateAnnotationRetryTimes = 3 // SubHealthyAnnotationKey sub-healthy annotation key on node SubHealthyAnnotationKey = "subHealthy" // FirstUpdateMaxSleepMilliSecond max sleep time before first update node annotation FirstUpdateMaxSleepMilliSecond = 3000 )
const ( // HbmDoubleBitFaultCode indicate 0x80E01801 HbmDoubleBitFaultCode = 2162169857 // HbmDoubleBitFaultCodeStr indicate 80e01801 HbmDoubleBitFaultCodeStr = "80e01801" // AivBusFaultCode indicate 0x80CB8009 AivBusFaultCode = 2160820233 // AicBusFaultCode indicate 0x80C98009 AicBusFaultCode = 2160689161 // AssociatedFaultDiagnosisTime associated fault diagnosis AssociatedFaultDiagnosisTime = 5 // TimeMilliseconds indicate how many milliseconds are there in a second TimeMilliseconds = 1000 )
const ( // NotHandleFaultLevel NotHandle Fault Level NotHandleFaultLevel = 0 // PreSeparateFaultLevel PreSeparate Fault Level PreSeparateFaultLevel = 1 // SeparateFaultLevel Separate Fault Level SeparateFaultLevel = 2 // NotHandleFaultLevelStr NotHandle Fault Level Str NotHandleFaultLevelStr = "NotHandle" // PreSeparateFaultLevelStr PreSeparate Fault Level Str PreSeparateFaultLevelStr = "PreSeparate" // SeparateFaultLevelStr Separate Fault Level Str SeparateFaultLevelStr = "Separate" )
const ( // NotHandleFault not handle fault NotHandleFault = "NotHandleFault" // RestartRequest restart request RestartRequest = "RestartRequest" // RestartBusiness restart business RestartBusiness = "RestartBusiness" // RestartNPU restart NPU RestartNPU = "RestartNPU" // FreeRestartNPU wait free and restart NPU FreeRestartNPU = "FreeRestartNPU" // SeparateNPU separate NPU SeparateNPU = "SeparateNPU" // NormalNPU normal NPU NormalNPU = "NormalNPU" // NormalNetwork normal network NormalNetwork = "NormalNetwork" // PreSeparateNPU pre separate NPU PreSeparateNPU = "PreSeparateNPU" // ManuallySeparateNPU Manually Separate NPU ManuallySeparateNPU = "ManuallySeparateNPU" // CardUnhealthy fault is caused by card unhealthy CardUnhealthy = "CardUnhealthy" // CardNetworkUnhealthy fault is caused by card network unhealthy CardNetworkUnhealthy = "CardNetworkUnhealthy" // LinkDownFaultCode linkdown fault code LinkDownFaultCode = 0x81078603 // ResetFinishFaultCode reset finish fault code ResetFinishFaultCode = 0x8C2FA009 // CardDropFaultCode card drop fault code CardDropFaultCode = 0x40F84E00 // WaitNpuReadyTime is the time used in waiting for npu ready WaitNpuReadyTime time.Duration = 30 // WaitErrorCodeCleanTime is the time used in waiting for clean error code WaitErrorCodeCleanTime time.Duration = 30 // WaitProcessesToZeroTime is the time used in waiting for process to zero WaitProcessesToZeroTime time.Duration = 60 // ResetInterVal is the interval time used in waiting for reset ResetInterVal time.Duration = 5 // PollingInterval is used to poll the dcmi interface interval time PollingInterval time.Duration = DefaultPollingInterval // SubHealthFault subHealth code SubHealthFault = "SubHealthFault" )
const ApiServerPort = "443"
ApiServerPort is port of API server
const (
// InitialProcNum represents the initial value of the number of remaining processes
InitialProcNum = 1
)
const (
// MaxResetWaitRecoverTime max reset wait chip recover time is 150s
MaxResetWaitRecoverTime = 150
)
Variables ¶
var ( // SwitchFaultLevelMapLock Lock SwitchFaultLevelMap to avoid concurrent write and read SwitchFaultLevelMapLock sync.Mutex // SwitchFaultLevelMap record every fault code and its level SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize) // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrently SwitchFaultLock sync.Mutex )
var ( // NotHandleFaultCodes contains all fault code that believed to be not handled, in this case is L1 NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) // PreSeparateFaultCodes contains all fault code that believed to be PreSeparate, in this case is L2-L3 PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) // SeparateFaultCodes contains all fault code that believed to be Separate, in this case is L4-L5 SeparateFaultCodes = make([]int64, 0, GeneralMapSize) // SubscribeFailed subscribe failed flag SubscribeFailed bool // SwitchSubscribeFailed indicate switch fault subscribe failed result, true is subscribe failed SwitchSubscribeFailed bool // Synchronize used for synchronizing the fault cache between the main process and the grace tolerance coroutines Synchronize bool // FaultTypeSet is a set that contains all the fault level FaultTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU, RestartNPU, PreSeparateNPU, SeparateNPU, ManuallySeparateNPU, SubHealthFault) // FaultDurationTypeSet is a set that contains all the fault Duration level FaultDurationTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU, RestartNPU, PreSeparateNPU, SeparateNPU, SubHealthFault) // NetworkFaultCodes is a set that contains all the network fault codes NetworkFaultCodes = sets.NewInt64(LinkDownFaultCode) )
var ( // WaitProcessReadCMTime is the time used in waiting for process read cm WaitProcessReadCMTime time.Duration = DefaultProcessReadCMTime // WaitFaultSelfHealingTime for waiting for fault self-healing WaitFaultSelfHealingTime time.Duration = DefaultWaitFaultSelfHealingTime // WaitDeviceResetTime is the time used in waiting device reset WaitDeviceResetTime time.Duration = DefaultWaitDeviceResetTime )
fault customization
var Int32Tool int32Tool
Int32Tool slice for int32 tool
var Int64Tool int64Tool
Int64Tool slice for int64 tool
var StringTool stringTool
StringTool slice for string tool
Functions ¶
func CheckCardUsageMode ¶
CheckCardUsageMode check card usage mode
func CheckDeviceName ¶
CheckDeviceName used to check device name
func CheckErrorMessage ¶
CheckErrorMessage check whether the error message contains a specific string
func CheckFileUserSameWithProcess ¶
CheckFileUserSameWithProcess to check whether the owner of the log file is the same as the uid
func CheckPodNameAndSpace ¶
CheckPodNameAndSpace used to check pod name or pod namespace
func ConvertDevListToSets ¶
ConvertDevListToSets convert devices to Sets
func CountFaultDuration ¶
func CountFaultDuration(device *NpuDevice, devFaultInfoMap map[int32][]common.DevFaultInfo)
CountFaultDuration used to calculate each fault duration
func DelOnceFrequencyFault ¶
func DelOnceFrequencyFault()
DelOnceFrequencyFault clear all the fault occurrence time in cache when frequency fault detected at the end of each cycle
func DelOnceRecoverFault ¶
DelOnceRecoverFault deletes the fault codes and network fault codes recorded by func 'cacheAfterDelFaultCode' at the end of each cycle
func DeleteManuallyFaultInfo ¶
func DeleteManuallyFaultInfo(logicID int32)
DeleteManuallyFaultInfo delete manually fault info from manuallySeparateNpuMap
func FakeAiCoreDevice ¶
func FakeAiCoreDevice(dev DavinCiDev, aiCoreDevices *[]*NpuDevice)
FakeAiCoreDevice fake ai core devices
func FilterPods ¶
FilterPods get pods which meet the conditions
func GenResetDirName ¶
GenResetDirName generate reset cm dir name
func GenResetFileName ¶
GenResetFileName generate reset cm file name
func GenResetTypeFileName ¶
GenResetTypeFileName generate reset cm file name
func Get310PProductType ¶
Get310PProductType get 310P product type
func GetAllDeviceInfoTypeList ¶
func GetAllDeviceInfoTypeList() map[string]struct{}
GetAllDeviceInfoTypeList Get All Device Info Type List
func GetAndCleanFaultInfo ¶
func GetAndCleanFaultInfo() map[int32][]common.DevFaultInfo
GetAndCleanFaultInfo get device fault info and clean cache
func GetAndCleanLogicID ¶
func GetAndCleanLogicID() []int32
GetAndCleanLogicID gets the logicIDs of the devices that should be initialized and cleans the cache
func GetChangedDevFaultInfo ¶
func GetChangedDevFaultInfo(device *NpuDevice, oldErrCodes []int64, newErrCodes []int64) []common.DevFaultInfo
GetChangedDevFaultInfo get device changed fault info
func GetDefaultDevices ¶
GetDefaultDevices get default device, for allocate mount
func GetDeviceFromPodAnnotation ¶
GetDeviceFromPodAnnotation get devices from pod annotation
func GetDeviceID ¶
GetDeviceID get device physical id and virtual id by device name
func GetDeviceListID ¶
GetDeviceListID get device id by input device name
func GetDeviceRunMode ¶
GetDeviceRunMode get current env device run mode
func GetFaultAssertionName ¶
GetFaultAssertionName get assertion name of fault code
func GetFaultType ¶
GetFaultType will return the fault type from fault codes, fault frequency, fault duration and ManuallySeparateNPU cache
func GetFaultTypeByCode ¶
GetFaultTypeByCode get fault type by fault code. If the code is not recorded, the default is SeparateNPU
func GetFaultTypeFromFaultDuration ¶
GetFaultTypeFromFaultDuration get fault type from fault duration cache
func GetFaultTypeFromFaultFrequency ¶
GetFaultTypeFromFaultFrequency refreshes the cache of FaultFrequency, deletes the faults that are not in the time window, and returns the fault level if the occurrence times of fault >= the set value
func GetJobNameOfPod ¶
GetJobNameOfPod get job name of pod from annotations or labels
func GetNetworkFaultType ¶
GetNetworkFaultType will return the fault type from network fault codes, fault duration
func GetNetworkFaultTypeByCode ¶
GetNetworkFaultTypeByCode get network fault type by fault code. if code not record, default PreSeparateNPU
func GetPodAnnotationByDeviceType ¶
GetPodAnnotationByDeviceType get pod annotation by device type
func GetPodConfiguration ¶
func GetPodConfiguration(phyDevMapVirtualDev map[int]int, devices map[int]string, podName string, info ServerInfo, allDevices []NpuDevice) string
GetPodConfiguration get annotation configuration of pod
func GetPodNameFromEnv ¶
GetPodNameFromEnv get current pod name from env
func GetSwitchFaultCode ¶
func GetSwitchFaultCode() []int64
GetSwitchFaultCode get switch fault code
func GetTemplateName2DeviceTypeMap ¶
GetTemplateName2DeviceTypeMap get virtual device type by template
func GetTimeoutFaultCodes ¶
GetTimeoutFaultCodes get timeout fault codes
func GetVNPUSegmentInfo ¶
GetVNPUSegmentInfo get vNPU segment info
func IsContainAll300IDuo ¶
func IsContainAll300IDuo() bool
IsContainAll300IDuo reports whether every card in the ProductTypes list is an Atlas 300I Duo card
func IsContainAtlas300IDuo ¶
func IsContainAtlas300IDuo() bool
IsContainAtlas300IDuo reports whether the ProductTypes list contains an Atlas 300I Duo card
func IsValidNumber ¶
IsValidNumber input checkVal is a valid number
func IsVirtualDev ¶
IsVirtualDev used to judge whether a device is a physical device or a virtual device
func LoadFaultCode ¶
LoadFaultCode loads the fault codes
func LoadFaultCodeFromFile ¶
func LoadFaultCodeFromFile() error
LoadFaultCodeFromFile load fault code and fault type from faultCode.json
func LoadFaultCustomization ¶
LoadFaultCustomization loads fault customization
func LoadFaultCustomizationFromFile ¶
func LoadFaultCustomizationFromFile() error
LoadFaultCustomizationFromFile load fault customization from faultCustomization.json
func LoadSwitchFaultCode ¶
LoadSwitchFaultCode Load SwitchFault Code from bytes of config file or configmap
func LoadSwitchFaultCodeFromFile ¶
func LoadSwitchFaultCodeFromFile() error
LoadSwitchFaultCodeFromFile load fault code from SwitchFaultCode.json
func MapDeepCopy ¶
MapDeepCopy map deep copy
func NewSignWatcher ¶
NewSignWatcher new sign watcher
func QueryManuallyFaultInfoByLogicID ¶
QueryManuallyFaultInfoByLogicID query manually fault info based on logic id from manuallySeparateNpuMap
func QueryManuallyFaultNPULogicIDsByHandleStatus ¶
QueryManuallyFaultNPULogicIDsByHandleStatus query manually fault npu logic ids based on handle status from manuallySeparateNpuMap
func RecordFaultInfoList ¶
func RecordFaultInfoList(devFaultInfoList []*TaskDevInfo)
RecordFaultInfoList record the fault info
func RemoveFileAndDir ¶
RemoveFileAndDir remove file and dir
func ResetFaultCustomizationCache ¶
func ResetFaultCustomizationCache()
ResetFaultCustomizationCache reset fault customization cache
func SaveDevFaultInfo ¶
func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo)
SaveDevFaultInfo save device fault info; it is the callback function of the subscribe interface
func SaveManuallyFaultInfo ¶
func SaveManuallyFaultInfo(logicID int32)
SaveManuallyFaultInfo save manually fault info into manuallySeparateNpuMap
func SetAscendRuntimeEnv ¶
func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp *v1beta1.ContainerAllocateResponse)
SetAscendRuntimeEnv is to set ascend runtime environment
func SetDeviceInit ¶
func SetDeviceInit(logicID int32)
SetDeviceInit sets the logicID of a device that should be initialized
func SetManuallyFaultNPUHandled ¶
func SetManuallyFaultNPUHandled()
SetManuallyFaultNPUHandled set manually fault NPU handled
func SetNetworkNewFaultAndCacheOnceRecoverFault ¶
func SetNetworkNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)
SetNetworkNewFaultAndCacheOnceRecoverFault set new network fault code and cache once recover network fault
func SetNewFaultAndCacheOnceRecoverFault ¶
func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)
SetNewFaultAndCacheOnceRecoverFault set new fault code and cache once recover fault
func SetSwitchFaultCode ¶
func SetSwitchFaultCode(newFaults []int64)
SetSwitchFaultCode set switch fault code
func UnlockAllDeviceInfo ¶
func UnlockAllDeviceInfo()
UnlockAllDeviceInfo unlock for device info status
func VerifyPathAndPermission ¶
VerifyPathAndPermission used to verify the validity of the path and permission and return resolved absolute path
Types ¶
type AtomicBool ¶
type AtomicBool struct {
// contains filtered or unexported fields
}
AtomicBool is an atomic Boolean.
func NewAtomicBool ¶
func NewAtomicBool(initial bool) *AtomicBool
NewAtomicBool creates an AtomicBool.
func (*AtomicBool) Store ¶
func (b *AtomicBool) Store(new bool)
Store atomically stores the passed value.
type DavinCiDev ¶
DavinCiDev davinci device
type DevFaultInfo ¶
type DevFaultInfo struct { LogicId int32 Status string Policy string InitialPolicy string ErrorCode []int64 ErrorCodeHex string }
DevFaultInfo is the fault info of device
type DevFaultInfoBasedTimeAscend ¶
type DevFaultInfoBasedTimeAscend []common.DevFaultInfo
DevFaultInfoBasedTimeAscend sort fault queue based on alarmRaisedTime in ascending order
func (DevFaultInfoBasedTimeAscend) Len ¶
func (devFault DevFaultInfoBasedTimeAscend) Len() int
Len is a fixed usage to find the length of type
func (DevFaultInfoBasedTimeAscend) Less ¶
func (devFault DevFaultInfoBasedTimeAscend) Less(i, j int) bool
Less is fixed usage to check if one is less than the other one of type
func (DevFaultInfoBasedTimeAscend) Swap ¶
func (devFault DevFaultInfoBasedTimeAscend) Swap(i, j int)
Swap is a fixed usage to switch the index of type
type DevStatusSet ¶
type DevStatusSet struct { UnHealthyDevice sets.String NetUnHealthyDevice sets.String HealthDevices sets.String RecoveringDevices sets.String FreeHealthyDevice map[string]sets.String DeviceFault []DeviceFault }
DevStatusSet contain different states devices
type Device ¶
type Device struct { DeviceID string `json:"device_id"` // device id DeviceIP string `json:"device_ip"` // device ip SuperDeviceID string `json:"super_device_id,omitempty"` }
Device id for Instance
type DeviceFault ¶
type DeviceFault struct { FaultType string `json:"fault_type"` NPUName string `json:"npu_name"` LargeModelFaultLevel string `json:"large_model_fault_level"` FaultLevel string `json:"fault_level"` FaultHandling string `json:"fault_handling"` FaultCode string `json:"fault_code"` }
DeviceFault npu or network fault info
type DeviceHealth ¶
DeviceHealth health status of device
type FaultCustomization ¶
type FaultCustomization struct { GraceTolerance GraceToleranceCustomization FaultFrequency []FaultFrequencyCustomization FaultDuration []FaultDurationCustomization }
FaultCustomization is the customization info of fault
type FaultDuration ¶
FaultDuration is the base info of fault duration
type FaultDurationCache ¶
type FaultDurationCache struct { // key: logicID, value: fault duration data Duration map[int32]FaultDurationData FaultDuration }
FaultDurationCache is the cache saving the FaultDuration
type FaultDurationCustomization ¶
type FaultDurationCustomization struct { EventId []string FaultDuration }
FaultDurationCustomization is the customization info of fault duration
type FaultDurationData ¶
type FaultDurationData struct { TimeoutStatus bool FaultEventQueue []common.DevFaultInfo FaultDurationTime int64 FaultRecoverDurationTime int64 }
FaultDurationData saved data during fault duration statistics
type FaultFrequency ¶
FaultFrequency is the base info of fault frequency
type FaultFrequencyCache ¶
type FaultFrequencyCache struct { // key: logicID, value: fault occurrence time (unix time) Frequency map[int32][]int64 FaultFrequency }
FaultFrequencyCache is the cache saving the FaultFrequency
type FaultFrequencyCustomization ¶
type FaultFrequencyCustomization struct { EventId []string FaultFrequency }
FaultFrequencyCustomization is the customization info of fault frequency
type FaultTypeCode ¶
type FaultTypeCode struct { NotHandleFaultCodes []int64 RestartRequestCodes []int64 RestartBusinessCodes []int64 RestartNPUCodes []int64 FreeRestartNPUCodes []int64 PreSeparateNPUCodes []int64 SeparateNPUCodes []int64 NotHandleFaultNetworkCodes []int64 PreSeparateNPUNetworkCodes []int64 SeparateNPUNetworkCodes []int64 SubHealthFaultCodes []int64 }
FaultTypeCode group code by type
type FileWatch ¶
FileWatch is used to watch sock file
func NewFileWatch ¶
NewFileWatch is used to watch socket file
type GraceToleranceCustomization ¶
type GraceToleranceCustomization struct { WaitProcessReadCMTime int64 WaitDeviceResetTime int64 WaitFaultSelfHealingTime int64 }
GraceToleranceCustomization is the customization info of grace tolerance
type HbmFaultManager ¶
type HbmFaultManager struct { HbmOccurTimeCache map[int32]int64 AicFaultEventQue map[int32][]common.DevFaultInfo }
HbmFaultManager manage the accompanying faults of aic error and hbm error
func NewHbmFaultManager ¶
func NewHbmFaultManager() *HbmFaultManager
NewHbmFaultManager return a hbm fault manager
type Instance ¶
type Instance struct { PodName string `json:"pod_name"` // pod Name ServerID string `json:"server_id"` // serverdId SuperPodId int32 `json:"super_pod_id"` Devices []Device `json:"devices"` // dev }
Instance is for annotation
type ManuallyFaultInfo ¶
ManuallyFaultInfo save the info of ManuallySeparateNPU
type NodeDeviceInfo ¶
NodeDeviceInfo record node NPU device information. Will be solidified into cm.
type NodeDeviceInfoCache ¶
type NodeDeviceInfoCache struct { DeviceInfo NodeDeviceInfo SuperPodID int32 ServerIndex int32 CheckCode string }
NodeDeviceInfoCache record node NPU device information. Will be solidified into cm.
type NpuAllInfo ¶
NpuAllInfo all npu infos
type NpuBaseInfo ¶
NpuBaseInfo is the base info of npu
type NpuDevice ¶
type NpuDevice struct { FaultCodes []int64 AlarmRaisedTime int64 NetworkFaultCodes []int64 NetworkAlarmRaisedTime int64 DevType string DeviceName string Health string NetworkHealth string CardDrop bool IP string LogicID int32 PhyID int32 CardID int32 SuperDeviceID uint32 Status string }
NpuDevice npu device description
type Option ¶
type Option struct { GetFdFlag bool // to describe FdFlag UseAscendDocker bool // UseAscendDocker to choose docker type UseVolcanoType bool // use volcano mode AutoStowingDevs bool // auto stowing fixes devices or not PresetVDevice bool // preset virtual device Use310PMixedInsert bool // choose 310P mixed insert mode GraceToleranceOn bool // check if grace tolerance is on ListAndWatchPeriod int // set listening device state period HotReset int // unhealthy chip hot reset AiCoreCount int32 // found by dcmi interface BuildScene string // build scene judge device-plugin start scene ProductTypes []string // all product types RealCardType string // real card type LinkdownTimeout int64 // linkdown timeout duration DealWatchHandler bool // update pod cache when receiving pod informer watch errors EnableSwitchFault bool // if enable switch fault CheckCachedPods bool // check cached pods periodically }
Option option
type PodDeviceInfo ¶
PodDeviceInfo define device info of pod, include kubelet allocate and real allocate device
type ServerInfo ¶
ServerInfo used for pass parameters
type SuperPodInfo ¶
SuperPodInfo is super pod info
type SwitchFaultFileInfo ¶
type SwitchFaultFileInfo struct { NotHandleFaultCodes []string ReportFaultCodes []string SubHealthFaultCodes []string ResetFaultCodes []string SeparateFaultCodes []string }
SwitchFaultFileInfo contains all fault code loading from faultconfig configmap or switchfaultconfig.json
type SwitchFaultInfo ¶
type SwitchFaultInfo struct { FaultCode []string FaultLevel string UpdateTime int64 NodeStatus string }
SwitchFaultInfo Switch Fault Info
func GetSwitchFaultInfo ¶
func GetSwitchFaultInfo() SwitchFaultInfo
GetSwitchFaultInfo gets the switch fault info from CurrentSwitchFault and the fault config of the switch
type TaskDevInfo ¶
type TaskDevInfo struct { RankId int DevFaultInfo }
TaskDevInfo is the device info of a task
type TaskFaultInfo ¶
TaskFaultInfo record task fault rank information
type TaskFaultInfoCache ¶
type TaskFaultInfoCache struct { FaultInfo *TaskFaultInfo CheckCode string }
TaskFaultInfoCache record task fault rank information cache
type TaskResetInfo ¶
type TaskResetInfo struct { RankList []*TaskDevInfo UpdateTime int64 RetryTime int }
TaskResetInfo record task reset device information
type TaskResetInfoCache ¶
type TaskResetInfoCache struct { ResetInfo *TaskResetInfo CheckCode string }
TaskResetInfoCache record task reset device information cache