common

package
v0.0.0-...-ff574f0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 5, 2025 License: Apache-2.0 Imports: 24 Imported by: 0

Documentation

Overview

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Package common provides a series of common functions.

Index

Constants

View Source
const (
	// Component is the name of this component.
	Component = "device-plugin"
	// MaxBackups is the maximum number of log file backups.
	MaxBackups = 30
	// MaxAge is how long a log file is kept (unit not stated here).
	MaxAge = 7

	// KubeEnvMaxLength is the maximum length of a k8s env name.
	KubeEnvMaxLength = 230
	// PodNameMaxLength is the maximum length of a pod name.
	PodNameMaxLength = 253
	// PodNameSpaceMaxLength is the maximum length of a pod namespace.
	PodNameSpaceMaxLength = 63
	// MaxPodLimit is the maximum number of pods.
	MaxPodLimit = 10000
	// MaxContainerLimit is the maximum number of containers.
	MaxContainerLimit = 300000
	// RetryUpdateCount is the maximum number of retries for a resource update.
	RetryUpdateCount = 3
	// GetPodFromInformerTime is the maximum number of attempts to get a pod from the informer.
	GetPodFromInformerTime = 3
	// MaxDeviceNameLen is the maximum length of a device name such as "Ascend310P-4c.3cpu-100-0".
	MaxDeviceNameLen = 50
	// MaxGRPCRecvMsgSize is the gRPC receive message size limit: 4MB.
	MaxGRPCRecvMsgSize = 4 << 20
	// MaxGRPCConcurrentStreams limits the number of concurrent streams to each ServerTransport.
	MaxGRPCConcurrentStreams = 64
	// MaxConcurrentLimit is the limit applied over the listener.
	MaxConcurrentLimit = 64
	// MaxIPConnectionLimit is the limit applied per ip.
	MaxIPConnectionLimit = 64
	// CacheSize is the size of the cache for ip.
	CacheSize = 128
	// MaxVirtualDeviceNum is the maximum number of virtual devices.
	MaxVirtualDeviceNum = 1024
	// CMDataMaxLength is the maximum configMap data size: 1MB.
	CMDataMaxLength = 1 << 20
	// PodAnnotationMaxLength is the maximum pod annotation data length: 1MB.
	PodAnnotationMaxLength = 1 << 20
	// UpdatePodWaitTime is the default wait between pod update attempts: 200 milliseconds.
	UpdatePodWaitTime = 200

	// DeviceInfoCMNameSpace is the namespace of the device info configmap.
	DeviceInfoCMNameSpace = "kube-system"
	// DeviceInfoCMNamePrefix is the name prefix of the device info configmap.
	DeviceInfoCMNamePrefix = "mindx-dl-deviceinfo-"
	// DeviceInfoCMDataKey is the data key of the device info configmap.
	DeviceInfoCMDataKey = "DeviceInfoCfg"
	// SwitchInfoCMDataKey is the key of the switch info in the device-info configmap.
	SwitchInfoCMDataKey = "SwitchInfoCfg"
	// DeviceInfoCMManuallySeparateNPUKey is the ManuallySeparateNPU key of the deviceinfo configmap.
	DeviceInfoCMManuallySeparateNPUKey = "ManuallySeparateNPU"
	// SlowNodeNoticeCMName is the name of the slow node notice configmap.
	SlowNodeNoticeCMName = "steptime-dtpgroup"

	// CmConsumer identifies who uses these configmaps.
	CmConsumer = "mx-consumer-cim"
	// CmConsumerValue is the only value used with CmConsumer: "true".
	CmConsumerValue = "true"

	// AscendVisibleDevicesEnv is the visible devices env name.
	AscendVisibleDevicesEnv = "ASCEND_VISIBLE_DEVICES"

	// PodPredicateTime is the pod predicate time key.
	PodPredicateTime = "predicate-time"
	// Pod2kl is the pod annotation key for the device allocated by kubelet.
	Pod2kl = "kltDev"
	// PodRealAlloc is the pod annotation key for the device the pod really mounts.
	PodRealAlloc = "AscendReal"
	// Pod910DeviceKey is the pod annotation key used to generate the 910 hccl rank table.
	Pod910DeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration"
	// BaseDeviceInfoKey is the base device info key.
	BaseDeviceInfoKey = "baseDeviceInfos"
	// ChipNameLabel is the node label that the chip name is written to.
	ChipNameLabel = "node.kubernetes.io/npu.chip.name"
	// MetaDataAnnotation is the downward api field that maps annotations from volcano to the container's env.
	MetaDataAnnotation = "metadata.annotations"
	// MetaData is the meta data of a pod.
	MetaData = "metadata"

	// SlowNodeStepTimeEnvNum is the number of environment values for the step time cm.
	SlowNodeStepTimeEnvNum = 2
	// PerfDumpPathEnv is an environment variable for the slow node step time configmap.
	PerfDumpPathEnv = "PERF_DUMP_PATH"
	// PerfDumpConfigEnv is an environment variable for the slow node step time configmap.
	PerfDumpConfigEnv = "PERF_DUMP_CONFIG"

	// PodResourceSeverKey is the pod resource key.
	PodResourceSeverKey = "podResource"
	// VirtualDev is the virtual device tag.
	VirtualDev = "VIRTUAL"
	// PhyDeviceLen is the split length of a physical device name: Ascend910-0 splits into 2 parts.
	PhyDeviceLen = 2
	// VirDeviceLen is the split length of a virtual device name: Ascend910-2c-100-1 splits into 4 parts.
	VirDeviceLen = 4
	// MaxDevicesNum is the maximum number of devices.
	MaxDevicesNum = 100
	// MaxCardNum is the maximum number of cards.
	MaxCardNum = 64
	// MaxDevNumInCard is the maximum number of devices in a card.
	MaxDevNumInCard = 4
	// MaxRequestVirtualDeviceNum is the maximum number of devices in a request.
	MaxRequestVirtualDeviceNum = 1
	// LabelDeviceLen is the split length of a device name in a label: Ascend910-0 splits into 2 parts.
	LabelDeviceLen = 2
	// DefaultDeviceIP is the device ip address.
	DefaultDeviceIP = "127.0.0.1"
	// NormalState is a health state.
	NormalState = uint32(0)
	// GeneralAlarm is a health state.
	GeneralAlarm = uint32(1)

	// SocketChmod is the socket file mode.
	SocketChmod = 0600
	// RunMode310 is the run mode for the 310 chip.
	RunMode310 = "ascend310"
	// RunMode910 is the run mode for the 910 chip.
	RunMode910 = "ascend910"
	// RunMode310P is the run mode for the 310P chip.
	RunMode310P = "ascend310P"

	// AMPMode is the AMP chip work mode.
	AMPMode = "AMP"
	// SMPMode is the SMP chip work mode.
	SMPMode = "SMP"

	// Interval is an interval time (unit not stated here).
	Interval = 1
	// Timeout is a timeout value (unit not stated here).
	Timeout = 10
	// BaseDec is the decimal base.
	BaseDec = 10
	// BitSize is the bit size 64.
	BitSize = 64
	// BitSize32 is the bit size 32.
	BitSize32 = 32
	// SleepTime is a sleep time in seconds.
	SleepTime = 5

	// GeneralMapSize is the general map size.
	GeneralMapSize = 8
	// MapSizeTwo is a map size of two.
	MapSizeTwo = 2
	// GeneralSubscribeTime is the general number of subscribe attempts.
	GeneralSubscribeTime = 3
	// Hex is the hexadecimal base.
	Hex = 16
	// SecondMagnification is the second-level unit magnification.
	SecondMagnification = 1000
	// SecondMagnificationFloat is the second-level unit magnification as a float.
	SecondMagnificationFloat = 1000.0
)
View Source
const (
	// ResourceNamePrefix is the prefix of resource names.
	ResourceNamePrefix = "huawei.com/"
	// DistributedJob is the annotation indicating that the job is distributed.
	DistributedJob = "distributed-job"
	// Ascend310P is the 310P name.
	Ascend310P = "Ascend310P"
	// Ascend310PV is the 310P-V name.
	Ascend310PV = Ascend310P + "-V"
	// Ascend310PVPro is the 310P-VPro name.
	Ascend310PVPro = Ascend310P + "-VPro"
	// Ascend310PIPro is the 310P-IPro name.
	Ascend310PIPro = Ascend310P + "-IPro"
	// Ascend310Pc1 is Ascend310P with 1 core.
	Ascend310Pc1 = Ascend310P + "-" + Core1
	// Ascend310Pc2 is Ascend310P with 2 cores.
	Ascend310Pc2 = Ascend310P + "-" + Core2
	// Ascend310Pc4 is Ascend310P with 4 cores.
	Ascend310Pc4 = Ascend310P + "-" + Core4
	// Ascend310Pc4Cpu3 is Ascend310P with 4 cores and 3 cpu.
	Ascend310Pc4Cpu3 = Ascend310P + "-" + Core4Cpu3
	// Ascend310Pc2Cpu1 is Ascend310P with 2 cores and 1 cpu.
	Ascend310Pc2Cpu1 = Ascend310P + "-" + Core2Cpu1
	// Ascend310Pc4Cpu4Dvpp is Ascend310P with 4 cores, 4 cpu and dvpp.
	Ascend310Pc4Cpu4Dvpp = Ascend310P + "-" + Core4Cpu4Dvpp
	// Ascend310Pc4Cpu3Ndvpp is Ascend310P with 4 cores, 3 cpu and ndvpp.
	Ascend310Pc4Cpu3Ndvpp = Ascend310P + "-" + Core4Cpu3Ndvpp
	// HuaweiAscend310P is Ascend310P with the resource name prefix.
	HuaweiAscend310P = ResourceNamePrefix + Ascend310P

	// Ascend910 is the 910 name.
	Ascend910 = "Ascend910"
	// Ascend910c2 is Ascend910 with 2 cores.
	Ascend910c2 = Ascend910 + "-" + Core2
	// Ascend910c4 is Ascend910 with 4 cores.
	Ascend910c4 = Ascend910 + "-" + Core4
	// Ascend910c8 is Ascend910 with 8 cores.
	Ascend910c8 = Ascend910 + "-" + Core8
	// Ascend910c16 is Ascend910 with 16 cores.
	Ascend910c16 = Ascend910 + "-" + Core16
	// Ascend910c5Cpu1Gb8 is Ascend910 with 5 cores, 1 cpu and 8Gb memory.
	Ascend910c5Cpu1Gb8 = Ascend910 + "-" + Core5Cpu1Gb8
	// Ascend910c5Cpu1Gb16 is Ascend910 with 5 cores, 1 cpu and 16Gb memory.
	Ascend910c5Cpu1Gb16 = Ascend910 + "-" + Core5Cpu1Gb16
	// Ascend910c6Cpu1Gb16 is Ascend910 with 6 cores, 1 cpu and 16Gb memory.
	Ascend910c6Cpu1Gb16 = Ascend910 + "-" + Core6Cpu1Gb16
	// Ascend910c10Cpu3Gb16 is Ascend910 with 10 cores, 3 cpu and 16Gb memory.
	Ascend910c10Cpu3Gb16 = Ascend910 + "-" + Core10Cpu3Gb16

	// Ascend910c10Cpu3Gb16Ndvpp is Ascend910 with 10 cores, 3 cpu, 16Gb memory and ndvpp.
	Ascend910c10Cpu3Gb16Ndvpp = Ascend910 + "-" + Core10Cpu3Gb16Ndvpp
	// Ascend910c10Cpu3Gb32 is Ascend910 with 10 cores, 3 cpu and 32Gb memory.
	Ascend910c10Cpu3Gb32 = Ascend910 + "-" + Core10Cpu3Gb32
	// Ascend910c10Cpu4Gb16Dvpp is Ascend910 with 10 cores, 4 cpu, 16Gb memory and dvpp.
	Ascend910c10Cpu4Gb16Dvpp = Ascend910 + "-" + Core10Cpu4Gb16Dvpp

	// Ascend910c12Cpu3Gb32 is Ascend910 with 12 cores, 3 cpu and 32Gb memory.
	Ascend910c12Cpu3Gb32 = Ascend910 + "-" + Core12Cpu3Gb32

	// Ascend910c3Cpu1Gb8 is Ascend910 with 3 cores, 1 cpu and 8Gb memory.
	Ascend910c3Cpu1Gb8 = Ascend910 + "-" + Core3Cpu1Gb8

	// HuaweiAscend910 is Ascend910 with the resource name prefix.
	HuaweiAscend910 = ResourceNamePrefix + Ascend910

	// Ascend310 is the 310 name.
	Ascend310 = "Ascend310"
	// Ascend310B is the 310B chip name.
	Ascend310B = "Ascend310B"
	// HuaweiAscend310 is Ascend310 with the resource name prefix.
	HuaweiAscend310 = ResourceNamePrefix + Ascend310
	// AscendfdPrefix is the prefix used in fd.
	AscendfdPrefix = "davinci-mini"

	// Ascend910B is the ascend 910B chip name.
	Ascend910B = "Ascend910B"

	// Ascend910A3 is the ascend 910A3 chip name.
	Ascend910A3 = "Ascend910A3"

	// HuaweiNetworkUnHealthAscend910 is the 910 network unhealthy resource name.
	HuaweiNetworkUnHealthAscend910 = ResourceNamePrefix + Ascend910 + "-NetworkUnhealthy"
	// HuaweiUnHealthAscend910 is the 910 unhealthy resource name.
	HuaweiUnHealthAscend910 = ResourceNamePrefix + Ascend910 + "-Unhealthy"
	// HuaweiRecoveringAscend910 is the 910 recovering resource name.
	HuaweiRecoveringAscend910 = ResourceNamePrefix + Ascend910 + "-Recovering"
	// HuaweiUnHealthAscend310P is the 310p unhealthy resource name.
	HuaweiUnHealthAscend310P = ResourceNamePrefix + Ascend310P + "-Unhealthy"
	// HuaweiUnHealthAscend310 is the 310 unhealthy resource name.
	HuaweiUnHealthAscend310 = ResourceNamePrefix + Ascend310 + "-Unhealthy"
	// HuaweiNetworkRecoverAscend910 is the 910 network recover resource name.
	HuaweiNetworkRecoverAscend910 = ResourceNamePrefix + Ascend910 + "-NetworkRecover"
	// HuaweiRecoverAscend910 is the 910 recover resource name.
	HuaweiRecoverAscend910 = ResourceNamePrefix + Ascend910 + "-Recover"

	// HuaweiFaultCodeAscend910 is the 910 fault code resource name.
	HuaweiFaultCodeAscend910 = ResourceNamePrefix + Ascend910 + "-Fault"
	// HuaweiFaultCodeAscend310P is the 310p fault code resource name.
	HuaweiFaultCodeAscend310P = ResourceNamePrefix + Ascend310P + "-Fault"
	// HuaweiFaultCodeAscend310 is the 310 fault code resource name.
	HuaweiFaultCodeAscend310 = ResourceNamePrefix + Ascend310 + "-Fault"

	// AiCoreResourceName is the resource name for a virtual device.
	AiCoreResourceName = "npu-core"

	// Core1 is the 1 core suffix.
	Core1 = "1c"
	// Core2 is the 2 core suffix.
	Core2 = "2c"
	// Core2Cpu1 is the 2 core, 1 cpu suffix.
	Core2Cpu1 = "2c.1cpu"

	// Core3Cpu1Gb8 is the 3 core, 1 cpu, 8GB memory suffix.
	Core3Cpu1Gb8 = "3c.1cpu.8g"
	// Core4 is the 4 core suffix.
	Core4 = "4c"
	// Core4Cpu3 is the 4 core, 3 cpu suffix.
	Core4Cpu3 = "4c.3cpu"
	// Core4Cpu3Ndvpp is the 4 core, 3 cpu, ndvpp suffix.
	Core4Cpu3Ndvpp = "4c.3cpu.ndvpp"
	// Core4Cpu4Dvpp is the 4 core, 4 cpu, dvpp suffix.
	Core4Cpu4Dvpp = "4c.4cpu.dvpp"
	// Core5Cpu1Gb8 is the 5 core, 1 cpu, 8GB memory suffix.
	Core5Cpu1Gb8 = "5c.1cpu.8g"
	// Core5Cpu1Gb16 is the 5 core, 1 cpu, 16GB memory suffix.
	Core5Cpu1Gb16 = "5c.1cpu.16g"

	// Core6Cpu1Gb16 is the 6 core, 1 cpu, 16GB memory suffix.
	Core6Cpu1Gb16 = "6c.1cpu.16g"

	// Core8 is the 8 core suffix.
	Core8 = "8c"
	// Core10Cpu3Gb16 is the 10 core, 3 cpu, 16Gb memory suffix.
	Core10Cpu3Gb16 = "10c.3cpu.16g"

	// Core10Cpu3Gb16Ndvpp is the 10 core, 3 cpu, 16Gb memory, ndvpp suffix.
	Core10Cpu3Gb16Ndvpp = "10c.3cpu.16g.ndvpp"
	// Core10Cpu3Gb32 is the 10 core, 3 cpu, 32GB memory suffix.
	Core10Cpu3Gb32 = "10c.3cpu.32g"
	// Core10Cpu4Gb16Dvpp is the 10 core, 4 cpu, 16Gb memory, dvpp suffix.
	Core10Cpu4Gb16Dvpp = "10c.4cpu.16g.dvpp"

	// Core12Cpu3Gb32 is the 12 core, 3 cpu, 32GB memory suffix.
	Core12Cpu3Gb32 = "12c.3cpu.32g"

	// Core16 is the 16 core suffix.
	Core16 = "16c"

	// Vir01 is the template name vir01.
	Vir01 = "vir01"
	// Vir02 is the template name vir02.
	Vir02 = "vir02"
	// Vir02C1 is the template name vir02_1c.
	Vir02C1 = "vir02_1c"
	// Vir03C1G8 is the template name vir03_1c_8g.
	Vir03C1G8 = "vir03_1c_8g"
	// Vir04 is the template name vir04.
	Vir04 = "vir04"
	// Vir04C3 is the template name vir04_3c.
	Vir04C3 = "vir04_3c"
	// Vir04C4Dvpp is the template name vir04_4c_dvpp.
	Vir04C4Dvpp = "vir04_4c_dvpp"
	// Vir04C3Ndvpp is the template name vir04_3c_ndvpp.
	Vir04C3Ndvpp = "vir04_3c_ndvpp"
	// Vir05C1G8 is the template name vir05_1c_8g.
	Vir05C1G8 = "vir05_1c_8g"
	// Vir05C1G16 is the template name vir05_1c_16g.
	Vir05C1G16 = "vir05_1c_16g"
	// Vir06C1G16 is the template name vir06_1c_16g.
	Vir06C1G16 = "vir06_1c_16g"
	// Vir08 is the template name vir08.
	Vir08 = "vir08"
	// Vir10C3G16 is the template name vir10_3c_16g.
	Vir10C3G16 = "vir10_3c_16g"
	// Vir10C3G16NM is the template name vir10_3c_16g_nm.
	Vir10C3G16NM = "vir10_3c_16g_nm"
	// Vir10C3G32 is the template name vir10_3c_32g.
	Vir10C3G32 = "vir10_3c_32g"
	// Vir10C4G16M is the template name vir10_4c_16g_m.
	Vir10C4G16M = "vir10_4c_16g_m"
	// Vir12C3G32 is the template name vir12_3c_32g.
	Vir12C3G32 = "vir12_3c_32g"
	// Vir16 is the template name vir16.
	Vir16 = "vir16"

	// VirMark is the mark of a virtual device.
	VirMark = "vir"

	// AnnotationVNPUInfoSplitLen is the split length of the pod annotation carrying allocated vnpu info.
	AnnotationVNPUInfoSplitLen = 2

	// MaxAICoreNum is the maximum ai core number.
	MaxAICoreNum = 32
	// MinAICoreNum is the minimum ai core number.
	MinAICoreNum = 8
	// DefaultIDForCreateVNPU is the default id used when creating a vnpu.
	DefaultIDForCreateVNPU = 0xFFFFFFFF

	// ServerTypeInfoMinLen is the minimum length of the split server type data.
	ServerTypeInfoMinLen = 2
	// VGroupAndDevLen is the length of a list that only contains virtual group and device.
	VGroupAndDevLen = 2
	// MaxShareDevCount is the maximum share count (100) when the share device function is open.
	MaxShareDevCount = 100
)
View Source
const (
	// ServerTypeLabelKey is the node label key of the server type.
	ServerTypeLabelKey = "servertype"
	// AcceleratorTypeKey is the node label key of the accelerator type.
	AcceleratorTypeKey = "accelerator-type"
	// A300IA2Label is the value of the A300I A2 node label.
	A300IA2Label = "card-910b-infer"
	// ServerUsageLabelKey indicates whether the server is used for infer or training;
	// currently it only relates to the A800IA2 infer server.
	ServerUsageLabelKey = "server-usage"
	// InferCardKey is the node label key of the infer card.
	InferCardKey = "infer-card-type"
	// A300IDuoLabel is the value of the A300I Duo node label.
	A300IDuoLabel = "card-300i-duo"
)
View Source
const (
	// HiAIHDCDevice is the hisi_hdc device path.
	HiAIHDCDevice = "/dev/hisi_hdc"
	// HiAIManagerDevice is the davinci_manager device path.
	HiAIManagerDevice = "/dev/davinci_manager"
	// HiAIManagerDeviceDocker is the davinci_manager device path for docker.
	HiAIManagerDeviceDocker = "/dev/davinci_manager_docker"
	// HiAISVMDevice is the devmm_svm device path.
	HiAISVMDevice = "/dev/devmm_svm"
	// HiAi200RCSVM0 is the svm0 device path.
	HiAi200RCSVM0 = "/dev/svm0"
	// HiAi200RCLog is the log_drv device path.
	HiAi200RCLog = "/dev/log_drv"
	// HiAi200RCEventSched is the event_sched device path.
	HiAi200RCEventSched = "/dev/event_sched"
	// HiAi200RCUpgrade is the upgrade device path.
	HiAi200RCUpgrade = "/dev/upgrade"
	// HiAi200RCHiDvpp is the hi_dvpp device path.
	HiAi200RCHiDvpp = "/dev/hi_dvpp"
	// HiAi200RCMemoryBandwidth is the memory_bandwidth device path.
	HiAi200RCMemoryBandwidth = "/dev/memory_bandwidth"
	// HiAi200RCTsAisle is the ts_aisle device path.
	HiAi200RCTsAisle = "/dev/ts_aisle"
)
View Source
const (
	// Atlas200ISoc is the 200 soc env name.
	Atlas200ISoc = "Atlas 200I SoC A1"
	// Atlas200ISocXSMEM is the xsmem_dev device path.
	Atlas200ISocXSMEM = "/dev/xsmem_dev"
	// Atlas200ISocSYS is the sys device path.
	Atlas200ISocSYS = "/dev/sys"
	// Atlas200ISocVDEC is the vdec device path.
	Atlas200ISocVDEC = "/dev/vdec"
	// Atlas200ISocVPC is the vpc device path.
	Atlas200ISocVPC = "/dev/vpc"
	// Atlas200ISocSpiSmbus is the spi_smbus device path.
	Atlas200ISocSpiSmbus = "/dev/spi_smbus"
	// Atlas200ISocUserConfig is the user_config device path.
	Atlas200ISocUserConfig = "/dev/user_config"
)
View Source
const (
	// Atlas310BDvppCmdlist is the dvpp_cmdlist device path.
	Atlas310BDvppCmdlist = "/dev/dvpp_cmdlist"
	// Atlas310BPngd is the pngd device path.
	Atlas310BPngd = "/dev/pngd"
	// Atlas310BVenc is the venc device path.
	Atlas310BVenc = "/dev/venc"
)
View Source
const (
	// Atlas310BAcodec is the acodec device path.
	Atlas310BAcodec = "/dev/acodec"
	// Atlas310BAi is the ai device path.
	Atlas310BAi = "/dev/ai"
	// Atlas310BAo is the ao device path.
	Atlas310BAo = "/dev/ao"
	// Atlas310BVo is the vo device path.
	Atlas310BVo = "/dev/vo"
	// Atlas310BHdmi is the hdmi device path.
	Atlas310BHdmi = "/dev/hdmi"
)

Audio- and video-dependent devices for Atlas310B

View Source
const (
	// RootUID is the root user id.
	RootUID = 0
	// RootGID is the root group id.
	RootGID = 0

	// KeySliceLength is the length checked for a key slice.
	KeySliceLength = 2

	// DotSepDev is the separator between devices on labels.
	DotSepDev = "."

	// CommaSepDev is the separator between devices on annotations.
	CommaSepDev = ","
	// MiddelLine is the separator between devices used when splitting an id.
	MiddelLine = "-"
	// UnderLine is the separator between ids.
	UnderLine = "_"

	// NoNPUResource means that some allocated devices don't exist.
	NoNPUResource = "NoNPUResource"
	// NPUSegmentFailed means that creating a vnpu device failed.
	NPUSegmentFailed = "NPUSegmentFailed"
	// CenterScene means the device-plugin component is deployed on the central side.
	CenterScene = "center"
	// EdgeScene means the device-plugin component is deployed on the edge side.
	EdgeScene = "edge"
	// A300IA2BoardId is the board id of A300I A2.
	A300IA2BoardId = 0x28
	// A800IA2NoneHccsBoardIdOld is the old board id of the a800i a2 device; 0x33 is a server without hccs.
	A800IA2NoneHccsBoardIdOld = 0x33
	// A800IA2NoneHccsBoardId is the board id that replaced 0x33 with 0x3c since 2024.9.4;
	// the old board id stays compatible.
	A800IA2NoneHccsBoardId = 0x3c
	// EmptyBoardId is the board id of a device before it is initialized.
	EmptyBoardId = 0x00
	// FirstDevice is the first device id.
	FirstDevice = 0
	// Infer means a device for inference.
	Infer = "infer"
	// Train means a device for training.
	Train = "train"
)
View Source
const (
	// DeviceNotSupport is the value 8255 used for the special scene of invoking the dcmi interface.
	// NOTE(review): presumably the "device not supported" code returned by dcmi — confirm.
	DeviceNotSupport = 8255
	// DefaultAiCoreNum is the default value of the aicore number.
	DefaultAiCoreNum = 1
)

Special scene for invoking the dcmi interface

View Source
const (
	// Atlas300IDuo is used by the hot reset function to sync the chip healthy state.
	Atlas300IDuo = "Atlas 300I Duo"
	// HotResetClose means the chip hot reset function is not used.
	HotResetClose = -1
	// HotResetInfer means the infer chip hot reset is used.
	HotResetInfer = 0
	// HotResetTrainOnLine means the train chip hot reset is used online.
	HotResetTrainOnLine = 1
	// HotResetTrainOffLine means the train chip hot reset is used offline.
	HotResetTrainOffLine = 2
	// BootStartFinish means the chip hot reset has finished.
	BootStartFinish = 16
)
View Source
const (
	// Ascend910RingsNum is the number of devices in a ring for Ascend910.
	Ascend910RingsNum = 4
	// Ascend910BRingsNumTrain is the number of devices in a ring for Ascend910B training.
	Ascend910BRingsNumTrain = 8
	// Ascend910BRingsNumInfer is the number of devices in a ring for Ascend910B inference.
	Ascend910BRingsNumInfer = 1
	// Ascend910A3RingsNum is the number of devices in a ring for Ascend910A3.
	Ascend910A3RingsNum = 2
	// RingSum is the maximum number of rings.
	RingSum = 2
	// RankIndexKey is the key used to obtain the rank index in the pod.
	RankIndexKey = "hccl/rankIndex"
	// InferRankIndex is the rank index in the infer situation, where the rank index is meaningless.
	InferRankIndex = "-1"
	// WaitResetEndTime is the wait for a device reset to complete.
	WaitResetEndTime = 120
	// WaitRetryTime is the five second wait before resetting the device again.
	WaitRetryTime = 5
	// ResetRetryTimes is the maximum number of retries when a reset failed.
	ResetRetryTimes = 4
)
View Source
const (
	// ResetInfoDir is the directory for reset info.
	ResetInfoDir = "/user/restore/reset/"
	// ResetInfoCMNamePrefix is the name prefix of the reset configmap.
	ResetInfoCMNamePrefix = "reset-config-"
	// ResetInfoCMDataKey is the data key of the reset configmap.
	ResetInfoCMDataKey = "reset.json"
	// ResetInfoCMCheckCodeKey is the checkcode key of the reset configmap.
	ResetInfoCMCheckCodeKey = "checkCode"
	// ResetInfoTypeKey is the type key of the reset configmap.
	ResetInfoTypeKey = "restartType"
	// HotResetRestartType is the restart type for hot reset.
	HotResetRestartType = "hotReset"
	// ResetTaskNameKey is the key used to obtain the reset task name.
	ResetTaskNameKey = "volcano.sh/job-name"
	// ResetTaskNameKeyInLabel is the key used to obtain the reset task name when using operator.
	ResetTaskNameKeyInLabel = "training.kubeflow.org/job-name"
)
View Source
const (
	// FaultInfoCMNamePrefix is the name prefix of the fault configmap.
	FaultInfoCMNamePrefix = "fault-config-"
	// FaultInfoCMDataKey is the data key of the fault configmap.
	FaultInfoCMDataKey = "fault-npus"
	// FaultInfoCMCheckCodeKey is the checkcode key of the fault configmap.
	FaultInfoCMCheckCodeKey = "checkCode"
)
View Source
const (
	// EmptyError means there is no fault.
	EmptyError = "empty"
	// IgnoreError means the current fault can be ignored.
	IgnoreError = "ignore"
	// RestartRequestError means the task only needs to re-execute this request.
	RestartRequestError = "restart_request"
	// RestartError means the training needs to be re-executed for the current fault.
	RestartError = "restart"
	// FreeResetError is the fault level at which the device is reset whenever there is no task on the NPU.
	FreeResetError = "free_reset"
	// ResetError means the current fault requires resetting the chip and re-executing the training.
	ResetError = "reset"
	// IsolateError means the device needs to be isolated due to the current fault.
	IsolateError = "isolate"
)
View Source
// Fault levels are numbered by iota in declaration order, so the order below fixes their values.
const (
	// EmptyErrorLevel (0) is the level of the no-fault state.
	EmptyErrorLevel = iota
	// IgnoreErrorLevel (1) is the level of a fault that can be ignored.
	IgnoreErrorLevel
	// RestartRequestErrorLevel (2) means the task only needs to re-execute this request.
	RestartRequestErrorLevel
	// RestartErrorLevel (3) is the level of a fault that needs re-execution.
	RestartErrorLevel
	// FreeResetErrorLevel (4) is the fault level at which the device is reset whenever there is no task on the NPU.
	FreeResetErrorLevel
	// ResetErrorLevel (5) is the fault level at which the device is reset.
	ResetErrorLevel
	// IsolateErrorLevel (6) is the fault level at which the device is isolated.
	IsolateErrorLevel
)
View Source
const (
	// UnrecoveredStatus is the status before recovery.
	UnrecoveredStatus = "unrecovered"
	// RecoveredStatus means the recovery succeeded.
	RecoveredStatus = "recovered"
	// RecoverFailedStatus means the recovery failed.
	RecoverFailedStatus = "failed"
)
View Source
const (
	// AssertionRecovery is the name of assertion 0.
	AssertionRecovery = "Recovery"
	// AssertionOccur is the name of assertion 1.
	AssertionOccur = "Occur"
	// AssertionNotice is the name of assertion 2.
	AssertionNotice = "Notice"

	// TimeFormat is the layout used to format time.
	TimeFormat = "2006-01-02 15:04:05"

	// ResourceKindPod is the pod kind of resource.
	ResourceKindPod = "pod"
)
View Source
const (
	// PollFaultCodeCMInterval is the default interval (seconds) for polling the fault code CM.
	PollFaultCodeCMInterval = 300
	// PollFaultCodeCMMaxInterval is the maximum interval (seconds) for polling the fault code CM.
	PollFaultCodeCMMaxInterval = 3600
	// PollFaultCodeCMMinInterval is the minimum interval (seconds) for polling the fault code CM.
	PollFaultCodeCMMinInterval = 30
	// GetSwitchFaultCodeInterval is the interval (seconds) for getting all fault codes through the get interface.
	GetSwitchFaultCodeInterval = 300
	// FaultCodeCMName is the name of the configmap used to save fault codes.
	FaultCodeCMName = "mindx-dl-fault-config"
	// FaultCodeCMNameSpace is the namespace of the fault code configmap.
	FaultCodeCMNameSpace = "kube-system"
	// FaultCodeKey is the key under which the fault code is found in the cm.
	FaultCodeKey = "faultCode.json"
	// SwitchFaultCodeKey is the key of the switch fault code.
	SwitchFaultCodeKey = "SwitchFaultCode.json"
	// FaultCustomizationKey is the key under which the fault customization is found in the cm.
	FaultCustomizationKey = "faultCustomization.json"
	// PollIntervalKey is the key under which the poll interval is found in the cm.
	PollIntervalKey = "PollInterval"
	// DefaultProcessReadCMTime is the default time for a process to read the configmap.
	DefaultProcessReadCMTime = 30
	// DefaultWaitFaultSelfHealingTime is the default wait for fault self-healing.
	DefaultWaitFaultSelfHealingTime = 15
	// MinWaitFaultSelfHealingTime is the minimum wait for fault self-healing.
	MinWaitFaultSelfHealingTime = 1
	// MaxWaitFaultSelfHealingTime is the maximum wait for fault self-healing.
	MaxWaitFaultSelfHealingTime = 30
	// DefaultPollingInterval is the time between polls of the dcmi interface.
	DefaultPollingInterval = 1
	// MaxWaitProcessReadCMTime is the maximum wait for a process to read the cm.
	MaxWaitProcessReadCMTime = 90
	// MinWaitProcessReadCMTime is the minimum wait for a process to read the cm.
	MinWaitProcessReadCMTime = 5
	// DefaultWaitDeviceResetTime is the default wait for a device reset.
	DefaultWaitDeviceResetTime = 150
	// MaxWaitDeviceResetTime is the maximum wait for a device reset.
	MaxWaitDeviceResetTime = 180
	// MinWaitDeviceResetTime is the minimum wait for a device reset.
	MinWaitDeviceResetTime = 60
	// MaxFaultFrequencyTimeWindow is the maximum time window of fault frequency.
	MaxFaultFrequencyTimeWindow = 864000
	// MinFaultFrequencyTimeWindow is the minimum time window of fault frequency.
	MinFaultFrequencyTimeWindow = 60
	// MaxFaultFrequencyTimes is the maximum fault occurrence count of fault frequency.
	MaxFaultFrequencyTimes = 100
	// MinFaultFrequencyTimes is the minimum fault occurrence count of fault frequency.
	MinFaultFrequencyTimes = 1
	// DefaultLinkUpTimeout is the default time for the linkup event.
	DefaultLinkUpTimeout = 60
	// MinLinkUpTimeout is the minimum time for the linkup event.
	MinLinkUpTimeout = 1
	// MaxLinkUpTimeout is the maximum time for the linkup event.
	MaxLinkUpTimeout = 60
	// MinLinkDownTimeout is the minimum time for the linkdown event.
	MinLinkDownTimeout = 1
	// MaxLinkDownTimeout is the maximum time for the linkdown event.
	MaxLinkDownTimeout = 30
	// MaxFaultTimeout is the maximum fault duration time (s) of fault duration.
	MaxFaultTimeout = 600
	// MinFaultTimeout is the minimum fault duration time (s) of fault duration.
	MinFaultTimeout = 0
	// MaxRecoverTimeout is the maximum fault recover duration time (s) of fault duration.
	MaxRecoverTimeout = 600
	// MinRecoverTimeout is the minimum fault recover duration time (s) of fault duration.
	MinRecoverTimeout = 0
	// DefaultSubscribeToPollingTime is the default time from subscribe to polling.
	DefaultSubscribeToPollingTime = 5
	// MaxLogicID is the maximum logic ID.
	MaxLogicID = 15
	// MinLogicID is the minimum logic ID.
	MinLogicID = 0
	// MaxResetTimes is the maximum number of resets of a device while an error happened;
	// it is set to 30 to avoid a manual reset on the host machine.
	MaxResetTimes = 30
)

Fault customization const

View Source
// Fault severity levels, numbered by iota in declaration order.
const (
	// FaultSeveritySuggestion (0) is the suggestion severity.
	FaultSeveritySuggestion = iota
	// FaultSeverityMinor (1) is the minor severity.
	FaultSeverityMinor
	// FaultSeverityMajor (2) is the major severity.
	FaultSeverityMajor
	// FaultSeverityCritical (3) is the critical severity.
	FaultSeverityCritical
)

the severity level of fault

View Source
// String values associated with manually separated NPUs.
// NOTE(review): names suggest handling states (first handle / handled / all) — confirm against callers.
const (
	// ManuallySeparateNpuFirstHandle is the "FirstHandle" value.
	ManuallySeparateNpuFirstHandle = "FirstHandle"
	// ManuallySeparateNpuHandled is the "Handled" value.
	ManuallySeparateNpuHandled = "Handled"
	// ManuallySeparateNpuAll is the "All" value.
	ManuallySeparateNpuAll = "All"
)

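String values for manually separated NPUs (FirstHandle, Handled, All). NOTE(review): the original caption, "LogicID list for reset, get id list of ring", does not describe these constants and appears to belong to a different declaration — confirm against the source file.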

View Source
const (
	// SdIdAbnormal is the abnormal value of the super pod sdid.
	SdIdAbnormal = -2
	// ScaleTypeAbnormal is the abnormal value of the super pod scaleType.
	ScaleTypeAbnormal = -2
	// SuperPodIdAbnormal is the abnormal value of the super pod superPodId.
	SuperPodIdAbnormal = -2
	// ServerIdAbnormal is the abnormal value of the super pod serverId.
	ServerIdAbnormal = -2
)
View Source
const (
	// TimeoutProcess names the fault timeout process.
	TimeoutProcess = "fault timeout"
	// TimeoutRecoverProcess names the fault timeout recover process.
	TimeoutRecoverProcess = "fault timeout recover"
)
View Source
const (
	// ChipFaultMode names the chip fault mode.
	ChipFaultMode = "chip fault mode"
	// NetworkFaultMode names the network fault mode.
	NetworkFaultMode = "network fault mode"
)
View Source
const (
	// Polling is the scenario where subscribe mode is invalid and polling is used.
	Polling = "polling"
	// Subscribe is the subscribe mode.
	Subscribe = "subscribe"
)
View Source
const (
	// NPUNormalStatus is the normal status.
	NPUNormalStatus = "normal"
	// NPUResettingStatus is the resetting status.
	NPUResettingStatus = "resetting"
	// UpdateAnnotationRetryTimes is the number of retries for updating an annotation.
	UpdateAnnotationRetryTimes = 3
	// SubHealthyAnnotationKey is the sub-healthy annotation key on the node.
	SubHealthyAnnotationKey = "subHealthy"
	// FirstUpdateMaxSleepMilliSecond is the maximum sleep time (milliseconds) before the first node annotation update.
	FirstUpdateMaxSleepMilliSecond = 3000
)
View Source
const (
	// HbmDoubleBitFaultCode is the fault code 0x80E01801 (2162169857 in decimal).
	HbmDoubleBitFaultCode = 0x80E01801
	// HbmDoubleBitFaultCodeStr is the same fault code as the string 80e01801.
	HbmDoubleBitFaultCodeStr = "80e01801"
	// AivBusFaultCode is the fault code 0x80CB8009 (2160820233 in decimal).
	AivBusFaultCode = 0x80CB8009
	// AicBusFaultCode is the fault code 0x80C98009 (2160689161 in decimal).
	AicBusFaultCode = 0x80C98009
	// AssociatedFaultDiagnosisTime is the associated fault diagnosis time (unit not stated here).
	AssociatedFaultDiagnosisTime = 5
	// TimeMilliseconds is the number of milliseconds in a second.
	TimeMilliseconds = 1000
)
View Source
const (
	// NotHandleFaultLevel is the NotHandle fault level.
	NotHandleFaultLevel = 0
	// PreSeparateFaultLevel is the PreSeparate fault level.
	PreSeparateFaultLevel = 1
	// SeparateFaultLevel is the Separate fault level.
	SeparateFaultLevel = 2
	// NotHandleFaultLevelStr is the NotHandle fault level as a string.
	NotHandleFaultLevelStr = "NotHandle"
	// PreSeparateFaultLevelStr is the PreSeparate fault level as a string.
	PreSeparateFaultLevelStr = "PreSeparate"
	// SeparateFaultLevelStr is the Separate fault level as a string.
	SeparateFaultLevelStr = "Separate"
)
View Source
const (
	// NotHandleFault means the fault is not handled.
	NotHandleFault = "NotHandleFault"
	// RestartRequest means restart the request.
	RestartRequest = "RestartRequest"
	// RestartBusiness means restart the business.
	RestartBusiness = "RestartBusiness"
	// RestartNPU means restart the NPU.
	RestartNPU = "RestartNPU"
	// FreeRestartNPU means wait until the NPU is free and restart it.
	FreeRestartNPU = "FreeRestartNPU"
	// SeparateNPU means separate the NPU.
	SeparateNPU = "SeparateNPU"
	// NormalNPU means the NPU is normal.
	NormalNPU = "NormalNPU"
	// NormalNetwork means the network is normal.
	NormalNetwork = "NormalNetwork"
	// PreSeparateNPU means pre-separate the NPU.
	PreSeparateNPU = "PreSeparateNPU"
	// ManuallySeparateNPU means the NPU is manually separated.
	ManuallySeparateNPU = "ManuallySeparateNPU"
	// CardUnhealthy means the fault is caused by an unhealthy card.
	CardUnhealthy = "CardUnhealthy"
	// CardNetworkUnhealthy means the fault is caused by an unhealthy card network.
	CardNetworkUnhealthy = "CardNetworkUnhealthy"
	// LinkDownFaultCode is the linkdown fault code.
	LinkDownFaultCode = 0x81078603
	// ResetFinishFaultCode is the reset finish fault code.
	ResetFinishFaultCode = 0x8C2FA009
	// CardDropFaultCode is the card drop fault code.
	CardDropFaultCode = 0x40F84E00

	// NOTE(review): the time.Duration constants below hold bare counts (30, 60, 5, ...);
	// presumably callers multiply them by a unit such as time.Second — confirm at the call sites.

	// WaitNpuReadyTime is the time used in waiting for the npu to be ready.
	WaitNpuReadyTime time.Duration = 30
	// WaitErrorCodeCleanTime is the time used in waiting for the error code to be cleaned.
	WaitErrorCodeCleanTime time.Duration = 30
	// WaitProcessesToZeroTime is the time used in waiting for the processes to drop to zero.
	WaitProcessesToZeroTime time.Duration = 60
	// ResetInterVal is the interval time used in waiting for a reset.
	ResetInterVal time.Duration = 5
	// PollingInterval is the interval time used to poll the dcmi interface.
	PollingInterval time.Duration = DefaultPollingInterval
	// SubHealthFault is the subHealth code.
	SubHealthFault = "SubHealthFault"
)
View Source
// ApiServerPort is the port of the API server.
const ApiServerPort = "443"

ApiServerPort is the port of the API server.

View Source
const (
	// InitialProcNum is the initial value of the number of remaining processes.
	InitialProcNum = 1
)
View Source
const (
	// MaxResetWaitRecoverTime is the maximum wait for a chip to recover after a reset: 150s.
	MaxResetWaitRecoverTime = 150
)

Variables

View Source
var (
	// SwitchFaultLevelMapLock guards SwitchFaultLevelMap against concurrent writes and reads.
	SwitchFaultLevelMapLock sync.Mutex
	// SwitchFaultLevelMap records every fault code and its level.
	SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize)
	// SwitchFaultLock guards CurrentSwitchFault, which may be used concurrently.
	SwitchFaultLock sync.Mutex
)
View Source
var (

	// NotHandleFaultCodes holds every fault code treated as not handled; in this case that is L1.
	NotHandleFaultCodes = make([]int64, 0, GeneralMapSize)
	// PreSeparateFaultCodes holds every fault code treated as PreSeparate; in this case that is L2-L3.
	PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize)
	// SeparateFaultCodes holds every fault code treated as Separate; in this case that is L4-L5.
	SeparateFaultCodes = make([]int64, 0, GeneralMapSize)

	// SubscribeFailed is the subscribe failed flag.
	SubscribeFailed bool
	// SwitchSubscribeFailed is the result of the switch fault subscribe; true means the subscribe failed.
	SwitchSubscribeFailed bool
	// Synchronize is used to synchronize the fault cache between the main process
	// and the grace tolerance coroutines.
	Synchronize bool

	// FaultTypeSet is the set of all fault levels.
	FaultTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU,
		RestartNPU, PreSeparateNPU, SeparateNPU, ManuallySeparateNPU, SubHealthFault)
	// FaultDurationTypeSet is the set of all fault duration levels.
	FaultDurationTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU,
		RestartNPU, PreSeparateNPU, SeparateNPU, SubHealthFault)
	// NetworkFaultCodes is the set of all network fault codes.
	NetworkFaultCodes = sets.NewInt64(LinkDownFaultCode)
)
View Source
var (
	// WaitProcessReadCMTime is the time used in waiting for process read cm
	WaitProcessReadCMTime time.Duration = DefaultProcessReadCMTime
	// WaitFaultSelfHealingTime for waiting for fault self-healing
	WaitFaultSelfHealingTime time.Duration = DefaultWaitFaultSelfHealingTime
	// WaitDeviceResetTime is the time used in waiting device reset
	WaitDeviceResetTime time.Duration = DefaultWaitDeviceResetTime
)

fault customization

View Source
var Int32Tool int32Tool

Int32Tool slice for int32 tool

View Source
var Int64Tool int64Tool

Int64Tool slice for int64 tool

View Source
var StringTool stringTool

StringTool slice for string tool

Functions

func CheckCardUsageMode

func CheckCardUsageMode(use310PMixedInsert bool, productTypes []string) error

CheckCardUsageMode check card usage mode

func CheckDeviceName

func CheckDeviceName(deviceName, deviceRunMode string) bool

CheckDeviceName used to check device name

func CheckErrorMessage

func CheckErrorMessage(err error, target string) bool

CheckErrorMessage check whether the error message contains a specific string

func CheckFileUserSameWithProcess

func CheckFileUserSameWithProcess(loggerPath string) bool

CheckFileUserSameWithProcess to check whether the owner of the log file is the same as the uid

func CheckPodNameAndSpace

func CheckPodNameAndSpace(podPara string, maxLength int) error

CheckPodNameAndSpace used to check pod name or pod namespace

func ConvertDevListToSets

func ConvertDevListToSets(devices, sepType string) sets.String

ConvertDevListToSets convert devices to Sets

func CountFaultDuration

func CountFaultDuration(device *NpuDevice, devFaultInfoMap map[int32][]common.DevFaultInfo)

CountFaultDuration used to calculate each fault duration

func DelOnceFrequencyFault

func DelOnceFrequencyFault()

DelOnceFrequencyFault clears all fault occurrence times in the cache when a frequency fault is detected at the end of each cycle

func DelOnceRecoverFault

func DelOnceRecoverFault(groupDevice map[string][]*NpuDevice)

DelOnceRecoverFault deletes the fault codes and network fault codes recorded by func 'cacheAfterDelFaultCode' at the end of each cycle

func DeleteManuallyFaultInfo

func DeleteManuallyFaultInfo(logicID int32)

DeleteManuallyFaultInfo delete manually fault info from manuallySeparateNpuMap

func FakeAiCoreDevice

func FakeAiCoreDevice(dev DavinCiDev, aiCoreDevices *[]*NpuDevice)

FakeAiCoreDevice fake ai core devices

func FilterPods

func FilterPods(pods []v1.Pod, deviceType string, conditionFunc func(pod *v1.Pod) bool) []v1.Pod

FilterPods get pods which meet the conditions

func GenResetDirName

func GenResetDirName(namespace, name string) string

GenResetDirName generate reset cm dir name

func GenResetFileName

func GenResetFileName(namespace, name string) string

GenResetFileName generate reset cm file name

func GenResetTypeFileName

func GenResetTypeFileName(namespace, name string) string

GenResetTypeFileName generate reset cm file name

func Get310PProductType

func Get310PProductType() map[string]string

Get310PProductType get 310P product type

func GetAICore

func GetAICore(templateName string) (int, error)

GetAICore get ai core num by template name

func GetAllDeviceInfoTypeList

func GetAllDeviceInfoTypeList() map[string]struct{}

GetAllDeviceInfoTypeList Get All Device Info Type List

func GetAndCleanFaultInfo

func GetAndCleanFaultInfo() map[int32][]common.DevFaultInfo

GetAndCleanFaultInfo get device fault info and clean cache

func GetAndCleanLogicID

func GetAndCleanLogicID() []int32

GetAndCleanLogicID gets the logicIDs of the devices that should be initialized and cleans the cache

func GetChangedDevFaultInfo

func GetChangedDevFaultInfo(device *NpuDevice, oldErrCodes []int64, newErrCodes []int64) []common.DevFaultInfo

GetChangedDevFaultInfo get device changed fault info

func GetDefaultDevices

func GetDefaultDevices(getFdFlag bool) ([]string, error)

GetDefaultDevices get default device, for allocate mount

func GetDeviceFromPodAnnotation

func GetDeviceFromPodAnnotation(pod *v1.Pod, deviceType string) ([]string, error)

GetDeviceFromPodAnnotation get devices from pod annotation

func GetDeviceID

func GetDeviceID(deviceName string, ascendRuntimeOptions string) (int, int, error)

GetDeviceID gets the device physical id and virtual id by device name

func GetDeviceListID

func GetDeviceListID(devices []string, ascendRuntimeOptions string) (map[int]int, []int, error)

GetDeviceListID get device id by input device name

func GetDeviceRunMode

func GetDeviceRunMode() (string, error)

GetDeviceRunMode get current env device run mode

func GetFaultAssertionName

func GetFaultAssertionName(assertion int8) string

GetFaultAssertionName get assertion name of fault code

func GetFaultType

func GetFaultType(faultCodes []int64, logicId int32) string

GetFaultType will return the fault type from fault codes, fault frequency, fault duration and ManuallySeparateNPU cache

func GetFaultTypeByCode

func GetFaultTypeByCode(faultCodes []int64) string

GetFaultTypeByCode get fault type by fault code. if code not record, default SeparateNPU0

func GetFaultTypeFromFaultDuration

func GetFaultTypeFromFaultDuration(logicId int32, mode string) string

GetFaultTypeFromFaultDuration get fault type from fault duration cache

func GetFaultTypeFromFaultFrequency

func GetFaultTypeFromFaultFrequency(logicId int32) string

GetFaultTypeFromFaultFrequency refreshes the FaultFrequency cache, deletes the faults that are not in the time window, and returns the fault level if the number of fault occurrences >= the set value

func GetJobNameOfPod

func GetJobNameOfPod(pod *v1.Pod) string

GetJobNameOfPod get job name of pod from annotations or labels

func GetNetworkFaultType

func GetNetworkFaultType(faultCodes []int64, logicId int32) string

GetNetworkFaultType will return the fault type from network fault codes, fault duration

func GetNetworkFaultTypeByCode

func GetNetworkFaultTypeByCode(faultCodes []int64) string

GetNetworkFaultTypeByCode get network fault type by fault code. if code not record, default PreSeparateNPU

func GetPattern

func GetPattern() map[string]*regexp.Regexp

GetPattern return pattern map

func GetPodAnnotationByDeviceType

func GetPodAnnotationByDeviceType(pod *v1.Pod, deviceType string) (string, error)

GetPodAnnotationByDeviceType get pod annotation by device type

func GetPodConfiguration

func GetPodConfiguration(phyDevMapVirtualDev map[int]int, devices map[int]string, podName string,
	info ServerInfo, allDevices []NpuDevice) string

GetPodConfiguration get annotation configuration of pod

func GetPodNameFromEnv

func GetPodNameFromEnv() (string, error)

GetPodNameFromEnv get current pod name from env

func GetSwitchFaultCode

func GetSwitchFaultCode() []int64

GetSwitchFaultCode get switch fault code

func GetTemplateName2DeviceTypeMap

func GetTemplateName2DeviceTypeMap() map[string]string

GetTemplateName2DeviceTypeMap get virtual device type by template

func GetTimeoutFaultCodes

func GetTimeoutFaultCodes(mode string) []int64

GetTimeoutFaultCodes get timeout fault codes

func GetVNPUSegmentInfo

func GetVNPUSegmentInfo(deviceInfos []string) (int32, string, error)

GetVNPUSegmentInfo get vNPU segment info

func Int32Join

func Int32Join(data []int32, sep string) string

Int32Join int32 join to string

func IntInList

func IntInList(num int32, list []int32) bool

IntInList check if int in list

func IsContainAll300IDuo

func IsContainAll300IDuo() bool

IsContainAll300IDuo reports whether the ProductTypes list consists entirely of Atlas 300I Duo cards

func IsContainAtlas300IDuo

func IsContainAtlas300IDuo() bool

IsContainAtlas300IDuo reports whether the ProductTypes list contains an Atlas 300I Duo card

func IsValidNumber

func IsValidNumber(checkVal string) (int64, bool)

IsValidNumber input checkVal is a valid number

func IsVirtualDev

func IsVirtualDev(devType string) bool

IsVirtualDev is used to judge whether a device is a physical device or a virtual device

func LoadFaultCode

func LoadFaultCode(faultCodeBytes []byte) error

LoadFaultCode loads the fault codes

func LoadFaultCodeFromFile

func LoadFaultCodeFromFile() error

LoadFaultCodeFromFile load fault code and fault type from faultCode.json

func LoadFaultCustomization

func LoadFaultCustomization(faultCustomizationByte []byte) error

LoadFaultCustomization loads fault customization

func LoadFaultCustomizationFromFile

func LoadFaultCustomizationFromFile() error

LoadFaultCustomizationFromFile load fault customization from faultCustomization.json

func LoadSwitchFaultCode

func LoadSwitchFaultCode(switchFaultCodeByte []byte) error

LoadSwitchFaultCode Load SwitchFault Code from bytes of config file or configmap

func LoadSwitchFaultCodeFromFile

func LoadSwitchFaultCodeFromFile() error

LoadSwitchFaultCodeFromFile load fault code from SwitchFaultCode.json

func LockAllDeviceInfo

func LockAllDeviceInfo()

LockAllDeviceInfo lock for device info status

func MakeDataHash

func MakeDataHash(data interface{}) string

MakeDataHash Make Data Hash

func MapDeepCopy

func MapDeepCopy(source map[string]string) map[string]string

MapDeepCopy map deep copy

func MarshalData

func MarshalData(data interface{}) []byte

MarshalData marshal data to bytes

func NewSignWatcher

func NewSignWatcher(osSigns ...os.Signal) chan os.Signal

NewSignWatcher new sign watcher

func QueryManuallyFaultInfoByLogicID

func QueryManuallyFaultInfoByLogicID(logicID int32) bool

QueryManuallyFaultInfoByLogicID query manually fault info based on logic id from manuallySeparateNpuMap

func QueryManuallyFaultNPULogicIDsByHandleStatus

func QueryManuallyFaultNPULogicIDsByHandleStatus(handleStatus string) []int32

QueryManuallyFaultNPULogicIDsByHandleStatus query manually fault npu logic ids based on handle status from manuallySeparateNpuMap

func RandomInt64

func RandomInt64(min, max int64) int64

RandomInt64 return a random int64 number

func RecordFaultInfoList

func RecordFaultInfoList(devFaultInfoList []*TaskDevInfo)

RecordFaultInfoList record the fault info

func RemoveFileAndDir

func RemoveFileAndDir(namespace, name string) error

RemoveFileAndDir remove file and dir

func ResetFaultCustomizationCache

func ResetFaultCustomizationCache()

ResetFaultCustomizationCache reset fault customization cache

func SaveDevFaultInfo

func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo)

SaveDevFaultInfo saves device fault info; it is the callback function of the subscribe interface

func SaveManuallyFaultInfo

func SaveManuallyFaultInfo(logicID int32)

SaveManuallyFaultInfo save manually fault info into manuallySeparateNpuMap

func SetAscendRuntimeEnv

func SetAscendRuntimeEnv(devices []int, ascendRuntimeOptions string, resp *v1beta1.ContainerAllocateResponse)

SetAscendRuntimeEnv is to set ascend runtime environment

func SetDeviceInit

func SetDeviceInit(logicID int32)

SetDeviceInit sets the logicID of a device that should be initialized

func SetManuallyFaultNPUHandled

func SetManuallyFaultNPUHandled()

SetManuallyFaultNPUHandled set manually fault NPU handled

func SetNetworkNewFaultAndCacheOnceRecoverFault

func SetNetworkNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)

SetNetworkNewFaultAndCacheOnceRecoverFault set new network fault code and cache once recover network fault

func SetNewFaultAndCacheOnceRecoverFault

func SetNewFaultAndCacheOnceRecoverFault(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice)

SetNewFaultAndCacheOnceRecoverFault set new fault code and cache once recover fault

func SetSwitchFaultCode

func SetSwitchFaultCode(newFaults []int64)

SetSwitchFaultCode set switch fault code

func ShareDev

func ShareDev() bool

ShareDev open the share dev function

func ToString

func ToString(devices sets.String, sepType string) string

ToString convert input data to string

func UnlockAllDeviceInfo

func UnlockAllDeviceInfo()

UnlockAllDeviceInfo unlock for device info status

func VerifyPathAndPermission

func VerifyPathAndPermission(verifyPath string, waitSecond int) (string, bool)

VerifyPathAndPermission used to verify the validity of the path and permission and return resolved absolute path

func WriteToFile

func WriteToFile(info, path string) error

WriteToFile write data to file

Types

type AtomicBool

type AtomicBool struct {
	// contains filtered or unexported fields
}

AtomicBool is an atomic Boolean.

func NewAtomicBool

func NewAtomicBool(initial bool) *AtomicBool

NewAtomicBool creates an AtomicBool.

func (*AtomicBool) Load

func (b *AtomicBool) Load() bool

Load atomically loads the Boolean.

func (*AtomicBool) Store

func (b *AtomicBool) Store(new bool)

Store atomically stores the passed value.

type DavinCiDev

type DavinCiDev struct {
	IP      string
	LogicID int32
	PhyID   int32
	CardID  int32
}

DavinCiDev davinci device

type DevFaultInfo

type DevFaultInfo struct {
	LogicId       int32
	Status        string
	Policy        string
	InitialPolicy string
	ErrorCode     []int64
	ErrorCodeHex  string
}

DevFaultInfo is the fault info of device

type DevFaultInfoBasedTimeAscend

type DevFaultInfoBasedTimeAscend []common.DevFaultInfo

DevFaultInfoBasedTimeAscend sort fault queue based on alarmRaisedTime in ascending order

func (DevFaultInfoBasedTimeAscend) Len

func (devFault DevFaultInfoBasedTimeAscend) Len() int

Len is a fixed usage to find the length of type

func (DevFaultInfoBasedTimeAscend) Less

func (devFault DevFaultInfoBasedTimeAscend) Less(i, j int) bool

Less is fixed usage to check if one is less than the other one of type

func (DevFaultInfoBasedTimeAscend) Swap

func (devFault DevFaultInfoBasedTimeAscend) Swap(i, j int)

Swap is a fixed usage to switch the index of type

type DevStatusSet

type DevStatusSet struct {
	UnHealthyDevice    sets.String
	NetUnHealthyDevice sets.String
	HealthDevices      sets.String
	RecoveringDevices  sets.String
	FreeHealthyDevice  map[string]sets.String
	DeviceFault        []DeviceFault
}

DevStatusSet contain different states devices

type Device

type Device struct {
	DeviceID      string `json:"device_id"` // device id
	DeviceIP      string `json:"device_ip"` // device ip
	SuperDeviceID string `json:"super_device_id,omitempty"`
}

Device id for Instance

type DeviceFault

type DeviceFault struct {
	FaultType            string `json:"fault_type"`
	NPUName              string `json:"npu_name"`
	LargeModelFaultLevel string `json:"large_model_fault_level"`
	FaultLevel           string `json:"fault_level"`
	FaultHandling        string `json:"fault_handling"`
	FaultCode            string `json:"fault_code"`
}

DeviceFault npu or network fault info

type DeviceHealth

type DeviceHealth struct {
	FaultCodes    []int64
	Health        string
	NetworkHealth string
}

DeviceHealth health status of device

type FaultCustomization

type FaultCustomization struct {
	GraceTolerance GraceToleranceCustomization
	FaultFrequency []FaultFrequencyCustomization
	FaultDuration  []FaultDurationCustomization
}

FaultCustomization is the customization info of fault

type FaultDuration

type FaultDuration struct {
	FaultTimeout   int64
	RecoverTimeout int64
	FaultHandling  string
}

FaultDuration is the base info of fault duration

type FaultDurationCache

type FaultDurationCache struct {
	// key: logicID, value: fault duration data
	Duration map[int32]FaultDurationData
	FaultDuration
}

FaultDurationCache is the cache saving the FaultDuration

type FaultDurationCustomization

type FaultDurationCustomization struct {
	EventId []string
	FaultDuration
}

FaultDurationCustomization is the customization info of fault duration

type FaultDurationData

type FaultDurationData struct {
	TimeoutStatus            bool
	FaultEventQueue          []common.DevFaultInfo
	FaultDurationTime        int64
	FaultRecoverDurationTime int64
}

FaultDurationData saved data during fault duration statistics

type FaultFrequency

type FaultFrequency struct {
	TimeWindow    int64
	Times         int64
	FaultHandling string
}

FaultFrequency is the base info of fault frequency

type FaultFrequencyCache

type FaultFrequencyCache struct {
	// key: logicID, value: fault occurrence time (unix time)
	Frequency map[int32][]int64
	FaultFrequency
}

FaultFrequencyCache is the cache saving the FaultFrequency

type FaultFrequencyCustomization

type FaultFrequencyCustomization struct {
	EventId []string
	FaultFrequency
}

FaultFrequencyCustomization is the customization info of fault frequency

type FaultTypeCode

type FaultTypeCode struct {
	NotHandleFaultCodes        []int64
	RestartRequestCodes        []int64
	RestartBusinessCodes       []int64
	RestartNPUCodes            []int64
	FreeRestartNPUCodes        []int64
	PreSeparateNPUCodes        []int64
	SeparateNPUCodes           []int64
	NotHandleFaultNetworkCodes []int64
	PreSeparateNPUNetworkCodes []int64
	SeparateNPUNetworkCodes    []int64
	SubHealthFaultCodes        []int64
}

FaultTypeCode group code by type

type FileWatch

type FileWatch struct {
	FileWatcher *fsnotify.Watcher
}

FileWatch is used to watch sock file

func NewFileWatch

func NewFileWatch() (*FileWatch, error)

NewFileWatch is used to watch socket file

func (*FileWatch) WatchFile

func (fw *FileWatch) WatchFile(fileName string) error

WatchFile add file to watch

type GraceToleranceCustomization

type GraceToleranceCustomization struct {
	WaitProcessReadCMTime    int64
	WaitDeviceResetTime      int64
	WaitFaultSelfHealingTime int64
}

GraceToleranceCustomization is the customization info of grace tolerance

type HbmFaultManager

type HbmFaultManager struct {
	HbmOccurTimeCache map[int32]int64
	AicFaultEventQue  map[int32][]common.DevFaultInfo
}

HbmFaultManager manage the accompanying faults of aic error and hbm error

func NewHbmFaultManager

func NewHbmFaultManager() *HbmFaultManager

NewHbmFaultManager return a hbm fault manager

type Instance

type Instance struct {
	PodName    string   `json:"pod_name"`  // pod Name
	ServerID   string   `json:"server_id"` // serverId
	SuperPodId int32    `json:"super_pod_id"`
	Devices    []Device `json:"devices"` // dev
}

Instance is for annotation

type ManuallyFaultInfo

type ManuallyFaultInfo struct {
	LogicID     int32
	FirstHandle bool
	RecordTime  int64
}

ManuallyFaultInfo save the info of ManuallySeparateNPU

type NodeDeviceInfo

type NodeDeviceInfo struct {
	DeviceList map[string]string
	UpdateTime int64
}

NodeDeviceInfo record node NPU device information. Will be solidified into cm.

type NodeDeviceInfoCache

type NodeDeviceInfoCache struct {
	DeviceInfo  NodeDeviceInfo
	SuperPodID  int32
	ServerIndex int32
	CheckCode   string
}

NodeDeviceInfoCache record node NPU device information. Will be solidified into cm.

type NpuAllInfo

type NpuAllInfo struct {
	AllDevTypes []string
	AllDevs     []NpuDevice
	AICoreDevs  []*NpuDevice
}

NpuAllInfo all npu infos

type NpuBaseInfo

type NpuBaseInfo struct {
	IP            string
	SuperDeviceID uint32
}

NpuBaseInfo is the base info of npu

type NpuDevice

type NpuDevice struct {
	FaultCodes             []int64
	AlarmRaisedTime        int64
	NetworkFaultCodes      []int64
	NetworkAlarmRaisedTime int64
	DevType                string
	DeviceName             string
	Health                 string
	NetworkHealth          string
	CardDrop               bool
	IP                     string
	LogicID                int32
	PhyID                  int32
	CardID                 int32
	SuperDeviceID          uint32
	Status                 string
}

NpuDevice npu device description

type Option

type Option struct {
	GetFdFlag          bool     // to describe FdFlag
	UseAscendDocker    bool     // UseAscendDocker to chose docker type
	UseVolcanoType     bool     // use volcano mode
	AutoStowingDevs    bool     // auto stowing fixes devices or not
	PresetVDevice      bool     // preset virtual device
	Use310PMixedInsert bool     // chose 310P mixed insert mode
	GraceToleranceOn   bool     // check if grace tolerance is on
	ListAndWatchPeriod int      // set listening device state period
	HotReset           int      // unhealthy chip hot reset
	ShareCount         uint     // share device count
	AiCoreCount        int32    // found by dcmi interface
	BuildScene         string   // build scene judge device-plugin start scene
	ProductTypes       []string // all product types
	RealCardType       string   // real card type
	LinkdownTimeout    int64    // linkdown timeout duration
	DealWatchHandler   bool     // update pod cache when receiving pod informer watch errors
	EnableSwitchFault  bool     // if enable switch fault
	CheckCachedPods    bool     // check cached pods periodically
}

Option option

var (
	// ParamOption for option
	ParamOption Option
	// DpStartReset for reset configmap
	DpStartReset sync.Once
)

type PodDeviceInfo

type PodDeviceInfo struct {
	Pod        v1.Pod
	KltDevice  []string
	RealDevice []string
}

PodDeviceInfo define device info of pod, include kubelet allocate and real allocate device

type ServerInfo

type ServerInfo struct {
	ServerID   string
	DeviceType string
	SuperPodID int32
}

ServerInfo used for pass parameters

type SuperPodInfo

type SuperPodInfo struct {
	ScaleType  int32
	SuperPodId int32
	ServerId   int32
	Reserve    []int32
}

SuperPodInfo is super pod info

type SwitchFaultFileInfo

type SwitchFaultFileInfo struct {
	NotHandleFaultCodes []string
	ReportFaultCodes    []string
	SubHealthFaultCodes []string
	ResetFaultCodes     []string
	SeparateFaultCodes  []string
}

SwitchFaultFileInfo contains all fault code loading from faultconfig configmap or switchfaultconfig.json

type SwitchFaultInfo

type SwitchFaultInfo struct {
	FaultCode  []string
	FaultLevel string
	UpdateTime int64
	NodeStatus string
}

SwitchFaultInfo Switch Fault Info

func GetSwitchFaultInfo

func GetSwitchFaultInfo() SwitchFaultInfo

GetSwitchFaultInfo gets the switch fault info from CurrentSwitchFault and the fault config of the switch

type TaskDevInfo

type TaskDevInfo struct {
	RankId int
	DevFaultInfo
}

TaskDevInfo is the device info of a task

type TaskFaultInfo

type TaskFaultInfo struct {
	FaultRank  []int
	UpdateTime int64
}

TaskFaultInfo record task fault rank information

type TaskFaultInfoCache

type TaskFaultInfoCache struct {
	FaultInfo *TaskFaultInfo
	CheckCode string
}

TaskFaultInfoCache record task fault rank information cache

type TaskResetInfo

type TaskResetInfo struct {
	RankList   []*TaskDevInfo
	UpdateTime int64
	RetryTime  int
}

TaskResetInfo record task reset device information

type TaskResetInfoCache

type TaskResetInfoCache struct {
	ResetInfo *TaskResetInfo
	CheckCode string
}

TaskResetInfoCache record task reset device information cache

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL