Documentation
¶
Overview ¶
Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
Index ¶
- Constants
- Variables
- func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
- func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, ...) (err error)
- func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
- func AttachDriver() error
- func ClearPolicyForGroup(group GroupHandle) error
- func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
- func DestroyGroup(groupID GroupHandle) (err error)
- func DetachDriver() error
- func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
- func FieldsInit() int
- func FieldsTerm() int
- func FindFirstNonAsciiIndex(value [4096]byte) int
- func Fv2_Blob(fv FieldValue_v2) [4096]byte
- func Fv2_String(fv FieldValue_v2) string
- func GetAllDeviceCount() (uint, error)
- func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
- func GetSupportedDevices() ([]uint, error)
- func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
- func Init(m mode, args ...string) (cleanup func(), err error)
- func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
- func IsCurrentField(fieldName string) bool
- func IsInt32Blank(value int) bool
- func IsInt64Blank(value int64) bool
- func IsLegacyField(fieldName string) bool
- func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
- func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
- func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error
- func Shutdown() (err error)
- func UnwatchFields(fieldsGroup FieldHandle, group GroupHandle) error
- func UpdateAllFields() error
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
- func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, ...) error
- func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error)
- type CPUHierarchyCPU_v1
- type CPUHierarchy_v1
- type ClockInfo
- type DbePolicyCondition
- type DcgmBindUnbindEventState
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type DiagErrorDetail
- type DiagResult
- type DiagResults
- type DiagType
- type ECCErrorsInfo
- type EntityStatus
- type Error
- type FieldHandle
- type FieldMeta
- type FieldValue_v1
- func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
- func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
- func LinkGetLatestValues(index uint, parentType Field_Entity_Group, parentId uint, fields []Short) ([]FieldValue_v1, error)
- type FieldValue_v2
- type Field_Entity_Group
- type GroupEntityPair
- type GroupHandle
- func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
- func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
- func GroupAllGPUs() GroupHandle
- func NewDefaultGroup(groupName string) (GroupHandle, error)
- func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
- func WatchPidFields() (GroupHandle, error)
- func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
- type GroupInfo
- type HealthCheckErrorCode
- type HealthResponse
- type HealthResult
- type HealthSystem
- type Incident
- type Link_State
- type MemoryInfo
- type MetricGroup
- type MigEntityInfo
- type MigHierarchyInfo
- type MigHierarchyInfo_v2
- type MigHierarchy_v2
- type MigProfile
- type NvLinkP2PStatus
- type NvLinkStatus
- type NvlinkPolicyCondition
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PciPolicyCondition
- type PerfState
- type PolicyAction
- type PolicyCondition
- type PolicyConfig
- type PolicyStatus
- type PolicyValidation
- type PolicyViolation
- type PowerPolicyCondition
- type ProcessInfo
- type ProcessUtilInfo
- type RetiredPagesPolicyCondition
- type Short
- type Status
- type SystemWatch
- type ThermalPolicyCondition
- type Time
- type UtilizationInfo
- type ViolationTime
- type XIDErrorInfo
- type XidPolicyCondition
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.
const ( // DCGM_FT_BINARY is the type for binary data DCGM_FT_BINARY = uint('b') // DCGM_FT_DOUBLE is the type for floating-point numbers DCGM_FT_DOUBLE = uint('d') // DCGM_FT_INT64 is the type for 64-bit integers DCGM_FT_INT64 = uint('i') // DCGM_FT_STRING is the type for strings DCGM_FT_STRING = uint('s') // DCGM_FT_TIMESTAMP is the type for timestamps DCGM_FT_TIMESTAMP = uint('t') // DCGM_FT_INT32_BLANK is the blank value for 32-bit integers DCGM_FT_INT32_BLANK = int64(2147483632) // DCGM_FT_INT32_NOT_FOUND is the value for not found in 32-bit integers DCGM_FT_INT32_NOT_FOUND = DCGM_FT_INT32_BLANK + 1 // DCGM_FT_INT32_NOT_SUPPORTED is the value for not supported in 32-bit integers DCGM_FT_INT32_NOT_SUPPORTED = DCGM_FT_INT32_BLANK + 2 // DCGM_FT_INT32_NOT_PERMISSIONED is the value for not permissioned in 32-bit integers DCGM_FT_INT32_NOT_PERMISSIONED = DCGM_FT_INT32_BLANK + 3 // DCGM_FT_INT64_BLANK is the blank value for 64-bit integers DCGM_FT_INT64_BLANK = int64(9223372036854775792) // DCGM_FT_INT64_NOT_FOUND is the value for not found in 64-bit integers DCGM_FT_INT64_NOT_FOUND = DCGM_FT_INT64_BLANK + 1 // DCGM_FT_INT64_NOT_SUPPORTED is the value for not supported in 64-bit integers DCGM_FT_INT64_NOT_SUPPORTED = DCGM_FT_INT64_BLANK + 2 // DCGM_FT_INT64_NOT_PERMISSIONED is the value for not permissioned in 64-bit integers DCGM_FT_INT64_NOT_PERMISSIONED = DCGM_FT_INT64_BLANK + 3 // DCGM_FT_FP64_BLANK is the blank value for floating-point numbers DCGM_FT_FP64_BLANK = 140737488355328.0 // DCGM_FT_FP64_NOT_FOUND is the value for not found in floating-point numbers DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) // DCGM_FT_FP64_NOT_SUPPORTED is the value for not supported in floating-point numbers DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) // DCGM_FT_FP64_NOT_PERMISSIONED is the value for not permissioned in floating-point numbers DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) // DCGM_FT_STR_BLANK is the blank value 
for strings DCGM_FT_STR_BLANK = "<<<NULL>>>" // DCGM_FT_STR_NOT_FOUND is the value for not found in strings DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" // DCGM_FT_STR_NOT_SUPPORTED is the value for not supported in strings DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" // DCGM_FT_STR_NOT_PERMISSIONED is the value for not permissioned in strings DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERMISSIONED>>>" // DCGM_ST_OK is the value for ECC OK DCGM_ST_OK = 0 // DCGM_ST_BADPARAM is the value for ECC BAD PARAM DCGM_ST_BADPARAM = -1 // DCGM_ST_GENERIC_ERROR is the value for ECC GENERIC ERROR DCGM_ST_GENERIC_ERROR = -3 // DCGM_ST_MEMORY is the value for ECC MEMORY DCGM_ST_MEMORY = -4 // DCGM_ST_NOT_CONFIGURED is the value for ECC NOT CONFIGURED DCGM_ST_NOT_CONFIGURED = -5 // DCGM_ST_NOT_SUPPORTED is the value for ECC NOT SUPPORTED DCGM_ST_NOT_SUPPORTED = -6 // DCGM_ST_INIT_ERROR is the value for ECC INIT ERROR DCGM_ST_INIT_ERROR = -7 // DCGM_ST_NVML_ERROR is the value for ECC NVML ERROR DCGM_ST_NVML_ERROR = -8 // DCGM_ST_PENDING is the value for ECC PENDING DCGM_ST_PENDING = -9 // DCGM_ST_TIMEOUT is the value for ECC TIMEOUT DCGM_ST_TIMEOUT = -11 // DCGM_ST_VER_MISMATCH is the value for ECC VER MISMATCH DCGM_ST_VER_MISMATCH = -12 // DCGM_ST_UNKNOWN_FIELD is the value for ECC UNKNOWN FIELD DCGM_ST_UNKNOWN_FIELD = -13 // DCGM_ST_NO_DATA is the value for ECC NO DATA DCGM_ST_NO_DATA = -14 // DCGM_ST_STALE_DATA is the value for ECC STALE DATA DCGM_ST_STALE_DATA = -15 // DCGM_ST_NOT_WATCHED is the value for ECC NOT WATCHED DCGM_ST_NOT_WATCHED = -16 // DCGM_ST_NO_PERMISSION is the value for ECC NO PERMISSION DCGM_ST_NO_PERMISSION = -17 // DCGM_ST_GPU_IS_LOST is the value for ECC GPU IS LOST DCGM_ST_GPU_IS_LOST = -18 // DCGM_ST_RESET_REQUIRED is the value for ECC RESET REQUIRED DCGM_ST_RESET_REQUIRED = -19 // DCGM_ST_FUNCTION_NOT_FOUND is the value for ECC FUNCTION NOT FOUND DCGM_ST_FUNCTION_NOT_FOUND = -20 // DCGM_ST_CONNECTION_NOT_VALID is the value for ECC CONNECTION NOT VALID 
DCGM_ST_CONNECTION_NOT_VALID = -21 // DCGM_ST_GPU_NOT_SUPPORTED is the value for ECC GPU NOT SUPPORTED DCGM_ST_GPU_NOT_SUPPORTED = -22 // DCGM_ST_GROUP_INCOMPATIBLE is the value for ECC GROUP INCOMPATIBLE DCGM_ST_GROUP_INCOMPATIBLE = -23 // DCGM_ST_MAX_LIMIT is the value for ECC MAX LIMIT DCGM_ST_MAX_LIMIT = -24 // DCGM_ST_LIBRARY_NOT_FOUND is the value for ECC LIBRARY NOT FOUND DCGM_ST_LIBRARY_NOT_FOUND = -25 // DCGM_ST_DUPLICATE_KEY is the value for ECC DUPLICATE KEY DCGM_ST_DUPLICATE_KEY = -26 // DCGM_ST_GPU_IN_SYNC_BOOST_GROUP is the value for ECC GPU IN SYNC BOOST GROUP DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 // DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP is the value for ECC GPU NOT IN SYNC BOOST GROUP DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 // DCGM_ST_REQUIRES_ROOT is the value for ECC REQUIRES ROOT DCGM_ST_REQUIRES_ROOT = -29 // DCGM_ST_NVVS_ERROR is the value for ECC NVVS ERROR DCGM_ST_NVVS_ERROR = -30 // DCGM_ST_INSUFFICIENT_SIZE is the value for ECC INSUFFICIENT SIZE DCGM_ST_INSUFFICIENT_SIZE = -31 // DCGM_ST_FIELD_UNSUPPORTED_BY_API is the value for ECC FIELD UNSUPPORTED BY API DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 // DCGM_ST_MODULE_NOT_LOADED is the value for ECC MODULE NOT LOADED DCGM_ST_MODULE_NOT_LOADED = -33 // DCGM_ST_IN_USE is the value for ECC IN USE DCGM_ST_IN_USE = -34 // DCGM_ST_GROUP_IS_EMPTY is the value for ECC GROUP IS EMPTY DCGM_ST_GROUP_IS_EMPTY = -35 // DCGM_ST_PROFILING_NOT_SUPPORTED is the value for ECC PROFILING NOT SUPPORTED DCGM_ST_PROFILING_NOT_SUPPORTED = -36 // DCGM_ST_PROFILING_LIBRARY_ERROR is the value for ECC PROFILING LIBRARY ERROR DCGM_ST_PROFILING_LIBRARY_ERROR = -37 // DCGM_ST_PROFILING_MULTI_PASS is the value for ECC PROFILING MULTI PASS DCGM_ST_PROFILING_MULTI_PASS = -38 // DCGM_ST_DIAG_ALREADY_RUNNING is the value for ECC DIAG ALREADY RUNNING DCGM_ST_DIAG_ALREADY_RUNNING = -39 // DCGM_ST_DIAG_BAD_JSON is the value for ECC DIAG BAD JSON DCGM_ST_DIAG_BAD_JSON = -40 // DCGM_ST_DIAG_BAD_LAUNCH is the value for ECC DIAG BAD 
LAUNCH DCGM_ST_DIAG_BAD_LAUNCH = -41 // DCGM_ST_DIAG_UNUSED is the value for ECC DIAG UNUSED DCGM_ST_DIAG_UNUSED = -42 // DCGM_ST_DIAG_THRESHOLD_EXCEEDED is the value for ECC DIAG THRESHOLD EXCEEDED DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 // DCGM_ST_INSUFFICIENT_DRIVER_VERSION is the value for ECC INSUFFICIENT DRIVER VERSION DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 // DCGM_ST_INSTANCE_NOT_FOUND is the value for ECC INSTANCE NOT FOUND DCGM_ST_INSTANCE_NOT_FOUND = -45 // DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND is the value for ECC COMPUTE INSTANCE NOT FOUND DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 // DCGM_ST_CHILD_NOT_KILLED is the value for ECC CHILD NOT KILLED DCGM_ST_CHILD_NOT_KILLED = -47 // DCGM_ST_3RD_PARTY_LIBRARY_ERROR is the value for ECC 3RD PARTY LIBRARY ERROR DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 // DCGM_ST_INSUFFICIENT_RESOURCES is the value for ECC INSUFFICIENT RESOURCES DCGM_ST_INSUFFICIENT_RESOURCES = -49 // DCGM_ST_PLUGIN_EXCEPTION is the value for ECC PLUGIN EXCEPTION DCGM_ST_PLUGIN_EXCEPTION = -50 // DCGM_ST_NVVS_ISOLATE_ERROR is the value for ECC NVVS ISOLATE ERROR DCGM_ST_NVVS_ISOLATE_ERROR = -51 // DCGM_ST_NVVS_BINARY_NOT_FOUND is the value for ECC NVVS BINARY NOT FOUND DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 // DCGM_ST_NVVS_KILLED is the value for ECC NVVS KILLED DCGM_ST_NVVS_KILLED = -53 // DCGM_ST_PAUSED is the value for ECC PAUSED DCGM_ST_PAUSED = -54 // DCGM_ST_ALREADY_INITIALIZED is the value for ECC ALREADY INITIALIZED DCGM_ST_ALREADY_INITIALIZED = -55 // DCGM_ST_NVML_NOT_LOADED is the value for ECC NVML NOT LOADED DCGM_ST_NVML_NOT_LOADED = -56 // DCGM_ST_NVML_DRIVER_TIMEOUT is the value for ECC NVML DRIVER TIMEOUT DCGM_ST_NVML_DRIVER_TIMEOUT = -57 // DCGM_ST_NVVS_NO_AVAILABLE_TEST is the value for ECC NVVS NO AVAILABLE TEST DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58 // DCGM_ST_UNINITIALIZED is the value for DCGM not initialized DCGM_ST_UNINITIALIZED = -59 // DCGM_ST_NO_NVVS is the value for NVVS not available DCGM_ST_NO_NVVS = -60 // 
DCGM_ST_NVVS_NOT_RUNNING is the value for NVVS not running DCGM_ST_NVVS_NOT_RUNNING = -61 // DCGM_ST_CHILD_SPAWN_FAILED is the value for child spawn failed DCGM_ST_CHILD_SPAWN_FAILED = -62 // DCGM_ST_FILE_IO_ERROR is the value for file I/O error DCGM_ST_FILE_IO_ERROR = -63 // DCGM_ST_CHILD_SIGNAL_RECEIVED is the value for child signal received DCGM_ST_CHILD_SIGNAL_RECEIVED = -64 // DCGM_ST_CALLER_ALREADY_STOPPED is the value for caller already stopped DCGM_ST_CALLER_ALREADY_STOPPED = -65 // DCGM_ST_DIAG_STOPPED is the value for diagnostic stopped DCGM_ST_DIAG_STOPPED = -66 )
FieldType constants
const ( // MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES) // MAX_NUM_CPUS represents the maximum number of CPUs supported MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS) // CHAR_BIT represents the number of bits in a byte CHAR_BIT = uint(C.CHAR_BIT) // MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8) )
const ( // PerfStateMax represents the highest performance state (P0) PerfStateMax = 0 // PerfStateMin represents the lowest performance state (P15) PerfStateMin = 15 // PerfStateUnknown represents an unknown performance state PerfStateUnknown = 32 )
const ( // MAX_NUM_DEVICES represents the maximum number of GPU devices supported MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES) // MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO) )
const ( // DbePolicy represents a Double-bit ECC error policy condition DbePolicy = PolicyCondition("Double-bit ECC error") // PCIePolicy represents a PCI error policy condition PCIePolicy = PolicyCondition("PCI error") // MaxRtPgPolicy represents a Maximum Retired Pages Limit policy condition MaxRtPgPolicy = PolicyCondition("Max Retired Pages Limit") // ThermalPolicy represents a Thermal Limit policy condition ThermalPolicy = PolicyCondition("Thermal Limit") // PowerPolicy represents a Power Limit policy condition PowerPolicy = PolicyCondition("Power Limit") // NvlinkPolicy represents an NVLink error policy condition NvlinkPolicy = PolicyCondition("Nvlink Error") // XidPolicy represents an XID error policy condition XidPolicy = PolicyCondition("XID Error") )
Policy condition types
const ( // DefaultMaxRetiredPages is the default threshold for retired pages (matches dcgmi default) DefaultMaxRetiredPages = 10 // DefaultMaxTemperature is the default threshold for temperature in Celsius (matches dcgmi default) DefaultMaxTemperature = 100 // DefaultMaxPower is the default threshold for power in Watts (matches dcgmi default) DefaultMaxPower = 250 )
Default policy thresholds matching dcgmi defaults
const ( // DCGM_NVSDM_MOCK_YAML is the environment variable for enabling NVSDM mock configuration DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML" // DCGM_DBG_FILE is the environment variable that enables DCGM to write debug logs to a specific file DCGM_DBG_FILE = "__DCGM_DBG_FILE" // DCGM_DBG_LVL is the environment variable that sets the DCGM logging level DCGM_DBG_LVL = "__DCGM_DBG_LVL" )
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)
DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields.
const (
DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)
DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group
const DIAG_RESULT_STRING_SIZE = 1024
DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings
Variables ¶
var ErrInvalidMode = errors.New("invalid mode")
ErrInvalidMode represents an error indicating that an invalid mode was used
Functions ¶
func AddEntityToGroup ¶
func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)
AddEntityToGroup adds an entity to an existing group
func AddLinkEntityToGroup ¶
func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, parentID uint) (err error)
AddLinkEntityToGroup adds a link entity to the group
func AddToGroup ¶
func AddToGroup(groupID GroupHandle, gpuID uint) (err error)
AddToGroup adds a GPU to an existing group
func AttachDriver ¶
func AttachDriver() error
AttachDriver attaches the driver to DCGM. This is used to reattach the driver after a DetachDriver call, typically when updating the driver without restarting DCGM. Requires DCGM 4.5.0 or later.
func ClearPolicyForGroup ¶
func ClearPolicyForGroup(group GroupHandle) error
ClearPolicyForGroup clears all policy conditions for a GPU group
func CreateFakeEntities ¶
func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)
CreateFakeEntities creates test entities with the specified MIG hierarchy information. This function is intended for testing purposes only. Returns a slice of Entity IDs for the created entities and any error encountered.
func DestroyGroup ¶
func DestroyGroup(groupID GroupHandle) (err error)
DestroyGroup destroys an existing GPU group
func DetachDriver ¶
func DetachDriver() error
DetachDriver detaches the driver from DCGM. This is used when you want to update the driver without restarting DCGM. After detaching, GPUs will not be accessible until AttachDriver is called. Requires DCGM 4.5.0 or later.
func FieldGroupDestroy ¶
func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
FieldGroupDestroy destroys a previously created field group. Returns an error if the group cannot be destroyed.
func FieldsInit ¶
func FieldsInit() int
FieldsInit initializes the DCGM fields module. Returns an integer status code.
func FieldsTerm ¶
func FieldsTerm() int
FieldsTerm terminates the DCGM fields module. Returns an integer status code.
func FindFirstNonAsciiIndex ¶
FindFirstNonAsciiIndex returns the index of the first non-ASCII character in the byte array. Returns 4096 if no non-ASCII character is found.
func Fv2_Blob ¶
func Fv2_Blob(fv FieldValue_v2) [4096]byte
Fv2_Blob returns the raw field value of a FieldValue_v2 as a byte array.
func Fv2_String ¶
func Fv2_String(fv FieldValue_v2) string
Fv2_String returns the string value of a FieldValue_v2.
func GetAllDeviceCount ¶
GetAllDeviceCount returns the count of all GPUs in the system
func GetEntityGroupEntities ¶
func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)
GetEntityGroupEntities returns all entities of the specified group type
func GetSupportedDevices ¶
GetSupportedDevices returns a list of DCGM-supported GPU IDs
func HealthSet ¶
func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)
HealthSet enables the DCGM health check system for the given systems. It configures which health watch systems should be monitored for the specified group.
func Init ¶
Init starts DCGM in the specified mode. Mode can be:
- Embedded: Start hostengine within this process
- Standalone: Connect to an already running nv-hostengine
- StartHostengine: Start and connect to nv-hostengine, terminate before exiting
Returns a cleanup function and any error encountered.
func InjectFieldValue ¶
func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error
InjectFieldValue injects a test value for a specific field into DCGM's field manager. This function is intended for testing purposes only.
Parameters:
- gpu: The GPU ID to inject the field value for
- fieldID: The DCGM field identifier
- fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
- status: The status code for the field
- ts: The timestamp for the field value
- value: The value to inject (must match fieldType)
Returns an error if the injection fails
func IsCurrentField ¶
IsCurrentField returns true if the given field name is a current field
func IsInt32Blank ¶
IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). These values indicate that no valid data is available for the field.
func IsInt64Blank ¶
IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). These values indicate that no valid data is available for the field.
func IsLegacyField ¶
IsLegacyField returns true if the given field name is a legacy field
func ListenForPolicyViolations ¶
func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs. Returns a channel that receives policy violations and any error encountered.
Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. When the context is cancelled, the returned channel will be closed and all resources will be automatically cleaned up.
Example:
ctx, cancel := context.WithCancel(context.Background())
defer cancel() // Ensures cleanup happens
violations, err := dcgm.ListenForPolicyViolations(ctx, dcgm.XidPolicy)
if err != nil {
return err
}
for violation := range violations {
// Handle violation...
}
func ListenForPolicyViolationsForGroup ¶
func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)
ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group. Returns a channel that receives policy violations and any error encountered.
Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. See ListenForPolicyViolations for usage example.
func SetPolicyForGroup ¶
func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error
SetPolicyForGroup configures policies with optional custom thresholds and actions for a GPU group
func Shutdown ¶
func Shutdown() (err error)
Shutdown stops DCGM and destroys all connections. Returns an error if DCGM is not initialized.
func UnwatchFields ¶
func UnwatchFields(fieldsGroup FieldHandle, group GroupHandle) error
UnwatchFields stops monitoring the specified fields for a GPU group. fieldsGroup is the handle to the field group to stop watching. group is the handle to the GPU group to stop watching.
func UpdateAllFields ¶
func UpdateAllFields() error
UpdateAllFields forces an update of all field values. Returns an error if the update fails.
func ViolationRegistration ¶
ViolationRegistration is a Go callback function for dcgmPolicyRegister() wrapped in C.violationNotify()
func WatchFieldsWithGroup ¶
func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
WatchFieldsWithGroup starts monitoring fields using default parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. Returns an error if the watch operation fails.
func WatchFieldsWithGroupEx ¶
func WatchFieldsWithGroupEx( fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, ) error
WatchFieldsWithGroupEx starts monitoring fields with custom parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. updateFreq is the update frequency in microseconds. maxKeepAge is the maximum age of samples to keep in seconds. maxKeepSamples is the maximum number of samples to keep. Returns an error if the watch operation fails.
func WatchPolicyViolationsForGroup ¶
func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error)
WatchPolicyViolationsForGroup registers to receive violation notifications for a specific GPU group
Types ¶
type CPUHierarchyCPU_v1 ¶
type CPUHierarchyCPU_v1 struct {
// CPUID is the unique identifier for this CPU
CPUID uint
// OwnedCores is a bitmask array representing the cores owned by this CPU
OwnedCores []uint64
}
CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores
type CPUHierarchy_v1 ¶
type CPUHierarchy_v1 struct {
// Version is the version number of the hierarchy structure
Version uint
// NumCPUs is the number of CPUs in the system
NumCPUs uint
// CPUs contains information about each CPU in the system
CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1
}
CPUHierarchy_v1 represents version 1 of the CPU hierarchy information
func GetCPUHierarchy ¶
func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error)
GetCPUHierarchy retrieves the CPU hierarchy information from DCGM
type DbePolicyCondition ¶
type DbePolicyCondition struct {
// Location specifies where the ECC error occurred
Location string
// NumErrors indicates the number of errors detected
NumErrors uint
}
DbePolicyCondition contains details about a Double-bit ECC error
type DcgmBindUnbindEventState ¶
type DcgmBindUnbindEventState int
DcgmBindUnbindEventState represents the state of GPU bind/unbind events
const ( // DcgmBUEventStateSystemReinitializing indicates the system is reinitializing (GPU unbind) DcgmBUEventStateSystemReinitializing DcgmBindUnbindEventState = 1 // DcgmBUEventStateSystemReinitializationCompleted indicates system reinitialization is complete (GPU bind) DcgmBUEventStateSystemReinitializationCompleted DcgmBindUnbindEventState = 2 )
type Device ¶
type Device struct {
GPU uint
DCGMSupported string
UUID string
Power uint // W
PCI PCIInfo
Identifiers DeviceIdentifiers
Topology []P2PLink
CPUAffinity string
}
Device represents a GPU device and its properties
func GetDeviceInfo ¶
GetDeviceInfo returns detailed information about the specified GPU
type DeviceHealth ¶
type DeviceHealth struct {
// GPU is the ID of the GPU device
GPU uint
// Status indicates the overall health status of the GPU
Status string
// Watches contains the status of individual health watch systems
Watches []SystemWatch
}
DeviceHealth represents the health status of a GPU device
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error)
HealthCheckByGpuId performs a health check on the specified GPU
type DeviceIdentifiers ¶
type DeviceIdentifiers struct {
Brand string
Model string
Serial string
Vbios string
InforomImageVersion string
DriverVersion string
}
DeviceIdentifiers contains various identification information for a GPU device
type DeviceStatus ¶
type DeviceStatus struct {
Power float64 // W
Temperature int64 // °C
Utilization UtilizationInfo
Memory MemoryInfo
Clocks ClockInfo
PCI PCIStatusInfo
Performance PerfState
FanSpeed int64 // %
}
DeviceStatus contains comprehensive GPU device status information
func GetDeviceStatus ¶
func GetDeviceStatus(gpuID uint) (DeviceStatus, error)
GetDeviceStatus returns current status information about the specified GPU
type DiagErrorDetail ¶
type DiagErrorDetail struct {
// Message contains a human-readable description of the error
Message string
// Code identifies the specific type of error
Code HealthCheckErrorCode
}
DiagErrorDetail contains detailed information about a health check error
type DiagResult ¶
type DiagResult struct {
// Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun"
Status string
// TestName is the name of the diagnostic test that was run
TestName string
// TestOutput contains any additional output or messages from the test
TestOutput string
// ErrorCode is the numeric error code if the test failed
ErrorCode uint
// ErrorMessage contains a detailed error message if the test failed
ErrorMessage string
// SerialNumber is the serial number of the tested entity
SerialNumber string
// EntityID is the ID of the tested entity
EntityID uint
}
DiagResult represents the result of a single diagnostic test
type DiagResults ¶
type DiagResults struct {
// Software contains the results of software-related diagnostic tests
Software []DiagResult
}
DiagResults contains the results of all diagnostic tests
func RunDiag ¶
func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error)
RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level.
Parameters:
- diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
- groupID: The group of GPUs to run diagnostics on
Returns:
- DiagResults containing the results of all diagnostic tests
- error if the diagnostics failed to run
type DiagType ¶
type DiagType int
DiagType represents the type of diagnostic test to run
const ( // DiagQuick represents a quick diagnostic test that performs basic health checks DiagQuick DiagType = 1 // DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks DiagMedium DiagType = 2 // DiagLong represents a long diagnostic test that performs extensive health checks DiagLong DiagType = 3 // DiagExtended represents an extended diagnostic test that performs the most thorough system checks DiagExtended DiagType = 4 )
type ECCErrorsInfo ¶
ECCErrorsInfo contains ECC memory error counts
type EntityStatus ¶
type EntityStatus uint
EntityStatus represents the status of a GPU entity
const ( // EntityStatusUnknown - Entity has not been referenced yet EntityStatusUnknown EntityStatus = 0 // EntityStatusOk - Entity is known and OK EntityStatusOk EntityStatus = 1 // EntityStatusUnsupported - Entity is unsupported by DCGM EntityStatusUnsupported EntityStatus = 2 // EntityStatusInaccessible - Entity is inaccessible, usually due to cgroups EntityStatusInaccessible EntityStatus = 3 // EntityStatusLost - Entity has been lost. Usually set from NVML returning NVML_ERROR_GPU_IS_LOST EntityStatusLost EntityStatus = 4 // EntityStatusFake - Entity is a fake, injection-only entity for testing EntityStatusFake EntityStatus = 5 // EntityStatusDisabled - Don't collect values from this GPU EntityStatusDisabled EntityStatus = 6 // EntityStatusDetached - Entity is detached, not good for any uses EntityStatusDetached EntityStatus = 7 )
func GetGPUStatus ¶
func GetGPUStatus(gpuID uint) EntityStatus
GetGPUStatus returns the entity status of the specified GPU
func (EntityStatus) String ¶
func (e EntityStatus) String() string
String returns a string representation of the entity status
type Error ¶
type Error struct {
Code C.dcgmReturn_t // dcgmReturn_t value of error
// contains filtered or unexported fields
}
Error represents an error returned by the DCGM library
type FieldHandle ¶
type FieldHandle struct {
// contains filtered or unexported fields
}
FieldHandle represents a handle to a DCGM field group
func FieldGroupCreate ¶
func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)
FieldGroupCreate creates a new field group with the specified fields. fieldsGroupName is the name for the new group. fields is a slice of field IDs to include in the group. Returns the field group handle and any error encountered.
Important: Field groups must be destroyed using FieldGroupDestroy when no longer needed to prevent resource leaks in the DCGM library.
Example:
fieldGroup, err := dcgm.FieldGroupCreate("myFields", []dcgm.Short{dcgm.DCGM_FI_DEV_POWER_USAGE})
if err != nil {
return err
}
defer dcgm.FieldGroupDestroy(fieldGroup)
// Use the field group...
func (*FieldHandle) GetHandle ¶
func (f *FieldHandle) GetHandle() uintptr
GetHandle returns the internal DCGM field group handle as a uintptr
func (*FieldHandle) SetHandle ¶
func (f *FieldHandle) SetHandle(val uintptr)
SetHandle sets the internal DCGM field group handle to the provided value
type FieldMeta ¶
type FieldMeta struct {
FieldID Short // Unique identifier for the field
FieldType byte // Type of the field (e.g., integer, float, string)
Size byte // Size of the field in bytes
Tag string // Human-readable tag/name for the field
Scope int // Scope of the field
NvmlFieldID int // Corresponding NVML field identifier
EntityLevel Field_Entity_Group // Entity level/group this field belongs to
}
FieldMeta represents metadata about a DCGM field, including its identifier, type, size, and other attributes. This struct is used to describe the characteristics and properties of fields that can be monitored or queried through DCGM.
func FieldGetByID ¶
FieldGetByID retrieves field metadata for the specified field ID.
func ToFieldMeta ¶
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta
ToFieldMeta converts a C DCGM field metadata structure to a Go FieldMeta struct.
type FieldValue_v1 ¶
type FieldValue_v1 struct {
Version uint
FieldID Short
FieldType uint
Status int
TS int64
Value [4096]byte
}
FieldValue_v1 represents a field value in version 1
func EntityGetLatestValues ¶
func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
EntityGetLatestValues retrieves the latest values for specified fields of any entity. entityGroup specifies the type of entity to query. entityId is the ID of the entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func GetLatestValuesForFields ¶
func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
GetLatestValuesForFields retrieves the most recent values for the specified fields. gpu is the ID of the GPU to query. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func LinkGetLatestValues ¶
func LinkGetLatestValues(index uint, parentType Field_Entity_Group, parentId uint, fields []Short) ([]FieldValue_v1, error)
LinkGetLatestValues retrieves the latest values for specified fields of a link entity. index is the link index. parentType is the entity type of the parent entity. parentId is the ID of the parent entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.
func (FieldValue_v1) Blob ¶
func (fv FieldValue_v1) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v1) Float64 ¶
func (fv FieldValue_v1) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v1) Int64 ¶
func (fv FieldValue_v1) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v1) String ¶
func (fv FieldValue_v1) String() string
String returns the field value as a string.
type FieldValue_v2 ¶
type FieldValue_v2 struct {
Version uint
EntityGroupId Field_Entity_Group
EntityID uint
FieldID Short
FieldType uint
Status int
TS int64
Value [4096]byte
StringValue *string
}
FieldValue_v2 represents a field value in version 2
func EntitiesGetLatestValues ¶
func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)
EntitiesGetLatestValues retrieves the latest values for specified fields across multiple entities. entities is a slice of entity pairs to query. fields is a slice of field IDs to retrieve. flags specify additional options for the query. Returns a slice of field values and any error encountered.
func GetValuesSince ¶
func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error)
GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.
gpuGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup for a specific group of GPUs, or from GroupAllGPUs() to target all GPUs.
fieldGroup is a FieldHandle representing the group of fields for which data is requested.
sinceTime is a time.Time value representing the timestamp from which to request updated values. A zero value (time.Time{}) requests all available data.
Returns a []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time of the latest data retrieval, and an error if there is any issue during the operation.
If the number of field values exceeds maxCallbackValues (131,072), an error is returned to prevent unbounded memory growth. To avoid this, reduce the time range, field group size, or entity count.
func (FieldValue_v2) Blob ¶
func (fv FieldValue_v2) Blob() [4096]byte
Blob returns the raw field value as a byte array.
func (FieldValue_v2) Float64 ¶
func (fv FieldValue_v2) Float64() float64
Float64 returns the field value as a float64.
func (FieldValue_v2) Int64 ¶
func (fv FieldValue_v2) Int64() int64
Int64 returns the field value as an int64.
func (FieldValue_v2) String ¶
func (fv FieldValue_v2) String() string
String returns the field value as a string.
type Field_Entity_Group ¶
type Field_Entity_Group uint
Field_Entity_Group represents the type of DCGM entity
const ( // FE_NONE represents no entity type FE_NONE Field_Entity_Group = iota // FE_GPU represents a GPU device entity FE_GPU // FE_VGPU represents a virtual GPU entity FE_VGPU // FE_SWITCH represents an NVSwitch entity FE_SWITCH // FE_GPU_I represents a GPU instance entity FE_GPU_I // FE_GPU_CI represents a GPU compute instance entity FE_GPU_CI // FE_LINK represents an NVLink entity FE_LINK // FE_CPU represents a CPU entity FE_CPU // FE_CPU_CORE represents a CPU core entity FE_CPU_CORE // FE_COUNT represents the total number of entity types FE_COUNT )
func (Field_Entity_Group) String ¶
func (e Field_Entity_Group) String() string
String returns a string representation of the Field_Entity_Group
type GroupEntityPair ¶
type GroupEntityPair struct {
// EntityGroupId specifies the type of the entity
EntityGroupId Field_Entity_Group
// EntityId is the unique identifier for this entity
EntityId uint
}
GroupEntityPair represents a DCGM entity and its group identifier
type GroupHandle ¶
type GroupHandle struct {
// contains filtered or unexported fields
}
GroupHandle represents a handle to a DCGM GPU group
func CreateGroup ¶
func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
CreateGroup creates a new empty GPU group with the specified name.
Important: Groups must be destroyed using DestroyGroup when no longer needed to prevent resource leaks in the DCGM library.
Example:
group, err := dcgm.CreateGroup("myGroup")
if err != nil {
return err
}
defer dcgm.DestroyGroup(group)
// Use the group...
func CreateGroupWithContext ¶
func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)
CreateGroupWithContext creates a new group with a context
func GroupAllGPUs ¶
func GroupAllGPUs() GroupHandle
GroupAllGPUs returns a GroupHandle representing all GPUs in the system
func NewDefaultGroup ¶
func NewDefaultGroup(groupName string) (GroupHandle, error)
NewDefaultGroup creates a new group with default GPUs and the specified name
func WatchFields ¶
func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
WatchFields starts monitoring the specified fields for a GPU. gpuID is the ID of the GPU to monitor. fieldsGroup is the handle of the field group to watch. groupName is a name for the watch group. Returns a group handle and any error encountered.
func WatchPidFields ¶
func WatchPidFields() (GroupHandle, error)
WatchPidFields configures DCGM to start recording stats for GPU processes. It must be called before GetProcessInfo.
Important: The returned GroupHandle should be cleaned up by calling DestroyGroup when monitoring is no longer needed to prevent resource leaks.
Example:
group, err := dcgm.WatchPidFields()
if err != nil {
return err
}
defer dcgm.DestroyGroup(group)
// Use GetProcessInfo with the group...
func WatchPidFieldsEx ¶
func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.
func (*GroupHandle) GetHandle ¶
func (g *GroupHandle) GetHandle() uintptr
GetHandle returns the internal group handle value
func (*GroupHandle) SetHandle ¶
func (g *GroupHandle) SetHandle(val uintptr)
SetHandle sets the internal group handle value
type GroupInfo ¶
type GroupInfo struct {
Version uint32
GroupName string
EntityList []GroupEntityPair
}
GroupInfo contains information about a DCGM group
func GetGroupInfo ¶
func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error)
GetGroupInfo retrieves information about a DCGM group
type HealthCheckErrorCode ¶
type HealthCheckErrorCode uint
HealthCheckErrorCode represents the error codes reported by passive and active health checks.
const ( // DCGM_FR_OK No error DCGM_FR_OK HealthCheckErrorCode = 0 // DCGM_FR_UNKNOWN Unknown error code DCGM_FR_UNKNOWN HealthCheckErrorCode = 1 // DCGM_FR_UNRECOGNIZED Unrecognized error code DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2 // DCGM_FR_PCI_REPLAY_RATE Unacceptable rate of PCI errors DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3 // DCGM_FR_VOLATILE_DBE_DETECTED Unacceptable rate of volatile double bit errors DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4 // DCGM_FR_VOLATILE_SBE_DETECTED Unacceptable rate of volatile single bit errors DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5 // DCGM_FR_VOLATILE_SBE_DETECTED_TS Unacceptable rate of volatile single bit errors with a timestamp DCGM_FR_VOLATILE_SBE_DETECTED_TS HealthCheckErrorCode = 6 // DCGM_FR_PENDING_PAGE_RETIREMENTS Pending page retirements detected DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6 // DCGM_FR_RETIRED_PAGES_LIMIT Unacceptable total page retirements detected DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7 // DCGM_FR_RETIRED_PAGES_DBE_LIMIT Unacceptable total page retirements due to uncorrectable errors DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8 // DCGM_FR_CORRUPT_INFOROM Corrupt inforom found DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9 // DCGM_FR_CLOCK_THROTTLE_THERMAL Clocks being throttled due to overheating DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10 // DCGM_FR_POWER_UNREADABLE Cannot get a reading for power from NVML DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11 // DCGM_FR_CLOCK_THROTTLE_POWER Clock being throttled due to power restrictions DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12 // DCGM_FR_NVLINK_ERROR_THRESHOLD Unacceptable rate of NVLink errors DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13 // DCGM_FR_NVLINK_DOWN NVLink is down DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14 // DCGM_FR_NVSWITCH_FATAL_ERROR Fatal errors on the NVSwitch DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode 
= 15 // DCGM_FR_NVSWITCH_NON_FATAL_ERROR Non-fatal errors on the NVSwitch DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16 // DCGM_FR_NVSWITCH_DOWN NVSwitch is down DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17 // DCGM_FR_NO_ACCESS_TO_FILE Cannot access a file DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18 // DCGM_FR_NVML_API Error occurred on an NVML API - NOT USED: DEPRECATED DCGM_FR_NVML_API HealthCheckErrorCode = 19 // DCGM_FR_DEVICE_COUNT_MISMATCH Device count mismatch DCGM_FR_DEVICE_COUNT_MISMATCH HealthCheckErrorCode = 20 // DCGM_FR_BAD_PARAMETER Bad parameter passed to API DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21 // DCGM_FR_CANNOT_OPEN_LIB Cannot open a library that must be accessed DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22 // DCGM_FR_DENYLISTED_DRIVER A driver on the denylist (nouveau) is active DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23 // DCGM_FR_NVML_LIB_BAD NVML library is missing expected functions - NOT USED: DEPRECATED DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24 // DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25 // DCGM_FR_HOSTENGINE_CONN Bad connection to nv-hostengine - NOT USED: DEPRECATED DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26 // DCGM_FR_FIELD_QUERY Field query failed DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27 // DCGM_FR_BAD_CUDA_ENV The environment has variables that hurt CUDA DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28 // DCGM_FR_PERSISTENCE_MODE Persistence mode is disabled DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29 // DCGM_FR_BAD_NVLINK_ENV The environment has variables that hurt NVLink DCGM_FR_BAD_NVLINK_ENV HealthCheckErrorCode = 29 // DCGM_FR_LOW_BANDWIDTH The bandwidth is unacceptably low DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30 // DCGM_FR_HIGH_LATENCY Latency is too high DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31 // DCGM_FR_CANNOT_GET_FIELD_TAG Cannot find a tag for a field 
DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32 // DCGM_FR_FIELD_VIOLATION The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33 // DCGM_FR_FIELD_THRESHOLD The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34 // DCGM_FR_FIELD_VIOLATION_DBL The value for the specified error field is above 0 DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35 // DCGM_FR_FIELD_THRESHOLD_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36 // DCGM_FR_UNSUPPORTED_FIELD_TYPE Field type cannot be supported DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37 // DCGM_FR_FIELD_THRESHOLD_TS The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38 // DCGM_FR_FIELD_THRESHOLD_TS_DBL The value for the specified field is above the threshold DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39 // DCGM_FR_THERMAL_VIOLATIONS Thermal violations detected DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40 // DCGM_FR_THERMAL_VIOLATIONS_TS Thermal violations detected with a timestamp DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41 // DCGM_FR_TEMP_VIOLATION Temperature is too high DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42 // DCGM_FR_THROTTLING_VIOLATION Non-benign clock throttling is occurring DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43 // DCGM_FR_INTERNAL An internal error was detected DCGM_FR_INTERNAL HealthCheckErrorCode = 44 // DCGM_FR_PCIE_GENERATION PCIe generation is too low DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45 // DCGM_FR_PCIE_WIDTH PCIe width is too low DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46 // DCGM_FR_ABORTED Test was aborted by a user signal DCGM_FR_ABORTED HealthCheckErrorCode = 47 // DCGM_FR_TEST_DISABLED Test was disabled by a user signal DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48 // 
DCGM_FR_CANNOT_GET_STAT Cannot get telemetry for a needed value DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49 // DCGM_FR_STRESS_LEVEL Stress level is too low (bad performance) DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50 // DCGM_FR_CUDA_API HealthCheckErrorCode = 51 DCGM_FR_CUDA_API HealthCheckErrorCode = 51 // DCGM_FR_FAULTY_MEMORY Faulty memory detected on this GPU DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52 // DCGM_FR_CANNOT_SET_WATCHES Unable to set field watches in DCGM - NOT USED: DEPRECATED DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53 // DCGM_FR_CUDA_UNBOUND CUDA context is no longer bound DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54 // DCGM_FR_ECC_DISABLED ECC memory is disabled right now DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55 // DCGM_FR_MEMORY_ALLOC Cannot allocate memory on the GPU DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56 // DCGM_FR_CUDA_DBE CUDA detected unrecoverable double-bit error DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57 // DCGM_FR_MEMORY_MISMATCH Memory error detected DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58 // DCGM_FR_CUDA_DEVICE No CUDA device discoverable for existing GPU DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59 // DCGM_FR_ECC_UNSUPPORTED ECC memory is unsupported by this SKU DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60 // DCGM_FR_ECC_PENDING ECC memory is in a pending state - NOT USED: DEPRECATED DCGM_FR_ECC_PENDING HealthCheckErrorCode = 61 // DCGM_FR_MEMORY_BANDWIDTH Memory bandwidth is too low DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62 // DCGM_FR_TARGET_POWER The target power is too low DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63 // DCGM_FR_API_FAIL The specified API call failed DCGM_FR_API_FAIL HealthCheckErrorCode = 64 // DCGM_FR_API_FAIL_GPU The specified API call failed for the specified GPU DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65 // DCGM_FR_CUDA_CONTEXT Cannot create a CUDA context on this GPU DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66 // DCGM_FR_DCGM_API DCGM API 
failure DCGM_FR_DCGM_API HealthCheckErrorCode = 67 // DCGM_FR_CONCURRENT_GPUS Need multiple GPUs to run this test DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68 // DCGM_FR_TOO_MANY_ERRORS More errors than fit in the return struct - NOT USED: DEPRECATED DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69 // DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD NVLink CRC error threshold violation DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70 // DCGM_FR_NVLINK_ERROR_CRITICAL NVLink error for a field that should always be 0 DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71 // DCGM_FR_ENFORCED_POWER_LIMIT The enforced power limit is too low to hit the target DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72 // DCGM_FR_MEMORY_ALLOC_HOST Cannot allocate memory on the host DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73 // DCGM_FR_GPU_OP_MODE Bad GPU operating mode for running plugin - NOT USED: DEPRECATED DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74 // DCGM_FR_NO_MEMORY_CLOCKS No memory clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75 // DCGM_FR_NO_GRAPHICS_CLOCKS No graphics clocks with the needed MHz found - NOT USED: DEPRECATED DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76 // DCGM_FR_HAD_TO_RESTORE_STATE Note that we had to restore a GPU's state DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77 // DCGM_FR_L1TAG_UNSUPPORTED L1TAG test is unsupported by this SKU DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78 // DCGM_FR_L1TAG_MISCOMPARE L1TAG test failed on a miscompare DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79 // DCGM_FR_ROW_REMAP_FAILURE Row remapping failed (Ampere or newer GPUs) DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80 // DCGM_FR_UNCONTAINED_ERROR Uncontained error - XID 95 DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81 // DCGM_FR_EMPTY_GPU_LIST No GPU information given to plugin DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82 // 
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS Pending page retirements due to a DBE DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83 // DCGM_FR_UNCORRECTABLE_ROW_REMAP Uncorrectable row remapping DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84 // DCGM_FR_PENDING_ROW_REMAP Row remapping is pending DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85 // DCGM_FR_BROKEN_P2P_MEMORY_DEVICE P2P copy test detected an error writing to this GPU DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86 // DCGM_FR_BROKEN_P2P_WRITER_DEVICE P2P copy test detected an error writing from this GPU DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87 // DCGM_FR_NVSWITCH_NVLINK_DOWN An NvLink is down for the specified NVSwitch DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88 // DCGM_FR_EUD_BINARY_PERMISSIONS EUD binary permissions are incorrect DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89 // DCGM_FR_EUD_NON_ROOT_USER EUD plugin is not running as root DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90 // DCGM_FR_EUD_SPAWN_FAILURE EUD plugin failed to spawn the EUD binary DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91 // DCGM_FR_EUD_TIMEOUT EUD plugin timed out DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92 // DCGM_FR_EUD_ZOMBIE EUD process remains running after the plugin considers it finished DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93 // DCGM_FR_EUD_NON_ZERO_EXIT_CODE EUD process exited with a non-zero exit code DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94 // DCGM_FR_EUD_TEST_FAILED EUD test failed DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95 // DCGM_FR_FILE_CREATE_PERMISSIONS We cannot create a file in this directory. 
DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96 // DCGM_FR_PAUSE_RESUME_FAILED Pause/Resume failed DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97 // DCGM_FR_PCIE_H_REPLAY_VIOLATION PCIe H replay violation DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98 // DCGM_FR_GPU_EXPECTED_NVLINKS_UP Expected nvlinks up per gpu DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99 // DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP Expected nvlinks up per nvswitch DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100 // DCGM_FR_XID_ERROR XID error detected DCGM_FR_XID_ERROR HealthCheckErrorCode = 101 // DCGM_FR_SBE_VIOLATION Single bit error detected DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102 // DCGM_FR_DBE_VIOLATION Double bit error detected DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103 // DCGM_FR_PCIE_REPLAY_VIOLATION PCIe replay errors detected DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104 // DCGM_FR_SBE_THRESHOLD_VIOLATION SBE threshold violated DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105 // DCGM_FR_DBE_THRESHOLD_VIOLATION DBE threshold violated DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106 // DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION PCIe replay count violated DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // DCGM_FR_CUDA_FM_NOT_INITIALIZED The fabricmanager is not initialized DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108 // DCGM_FR_SXID_ERROR NvSwitch fatal error detected DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109 // DCGM_FR_GFLOPS_THRESHOLD_VIOLATION GPU GFLOPs threshold violated DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110 // DCGM_FR_NAN_VALUE NaN value detected on this GPU DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111 // DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR Fabric Manager did not finish training DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112 // DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE P2P copy test detected an error writing to 
this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113 // DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE P2P copy test detected an error writing from this GPU over PCIE DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114 // DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115 // DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE P2P copy test detected an error writing from this GPU over NVLink DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116 // DCGM_FR_ERROR_SENTINEL MUST BE THE LAST ERROR CODE DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 117 )
type HealthResponse ¶
type HealthResponse struct {
// OverallHealth indicates the aggregate health status across all watches
OverallHealth HealthResult
// Incidents contains details about any health issues detected
Incidents []Incident
}
HealthResponse contains the results of a health check operation
func HealthCheck ¶
func HealthCheck(groupID GroupHandle) (HealthResponse, error)
HealthCheck checks the configured watches for any errors/failures/warnings that have occurred since the last time this check was invoked. On the first call, stateful information about all of the enabled watches within a group is created but no error results are provided. On subsequent calls, any error information will be returned.
type HealthResult ¶
type HealthResult uint
HealthResult is the result of a health check.
const ( // DCGM_HEALTH_RESULT_PASS All results within this system are reporting normal DCGM_HEALTH_RESULT_PASS HealthResult = 0 // DCGM_HEALTH_RESULT_WARN A warning has been issued, refer to the response for more information DCGM_HEALTH_RESULT_WARN HealthResult = 10 // DCGM_HEALTH_RESULT_FAIL A failure has been issued, refer to the response for more information DCGM_HEALTH_RESULT_FAIL HealthResult = 20 )
type HealthSystem ¶
type HealthSystem uint
HealthSystem is the system to watch for health checks.
const ( // DCGM_HEALTH_WATCH_PCIE PCIe health check DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 // DCGM_HEALTH_WATCH_NVLINK NVLink health check DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2 // DCGM_HEALTH_WATCH_PMU PMU health check DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4 // DCGM_HEALTH_WATCH_MCU MCU health check DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8 // DCGM_HEALTH_WATCH_MEM Memory health check DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10 // DCGM_HEALTH_WATCH_SM SM health check DCGM_HEALTH_WATCH_SM HealthSystem = 0x20 // DCGM_HEALTH_WATCH_INFOROM Inforom health check DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40 // DCGM_HEALTH_WATCH_THERMAL Thermal health check DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 // DCGM_HEALTH_WATCH_POWER Power health check DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 // DCGM_HEALTH_WATCH_DRIVER Driver health check DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200 // DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL NVSwitch non-fatal health check DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400 // DCGM_HEALTH_WATCH_NVSWITCH_FATAL NVSwitch fatal health check DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800 // DCGM_HEALTH_WATCH_ALL All health checks DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF )
func HealthGet ¶
func HealthGet(groupID GroupHandle) (HealthSystem, error)
HealthGet retrieves the current state of the DCGM health check system. It returns which health watch systems are currently enabled for the specified group.
type Incident ¶
type Incident struct {
// System identifies which health watch system detected the incident
System HealthSystem
// Health indicates the severity of the incident
Health HealthResult
// Error contains detailed information about the incident
Error DiagErrorDetail
// EntityInfo identifies the GPU or component where the incident occurred
EntityInfo GroupEntityPair
}
Incident represents a health check incident that occurred
type Link_State ¶
type Link_State uint
Link_State represents the state of an NVLINK connection
const ( // LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs) LS_NOT_SUPPORTED Link_State = iota // LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches) LS_DISABLED // LS_DOWN indicates the link is down (inactive) LS_DOWN // LS_UP indicates the link is up (active) LS_UP )
type MemoryInfo ¶
type MemoryInfo struct {
GlobalUsed int64
ECCErrors ECCErrorsInfo
}
MemoryInfo contains GPU memory usage and error information
type MetricGroup ¶
MetricGroup represents a group of metrics for a specific GPU
func GetSupportedMetricGroups ¶
func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error)
GetSupportedMetricGroups returns all supported metric groups for the specified GPU
type MigEntityInfo ¶
type MigEntityInfo struct {
// GpuUuid is the UUID of the parent GPU
GpuUuid string
// NvmlGpuIndex is the NVML index of the parent GPU
NvmlGpuIndex uint
// NvmlInstanceId is the NVML GPU instance ID
NvmlInstanceId uint
// NvmlComputeInstanceId is the NVML compute instance ID
NvmlComputeInstanceId uint
// NvmlMigProfileId is the NVML MIG profile ID
NvmlMigProfileId uint
// NvmlProfileSlices is the number of slices in the MIG profile
NvmlProfileSlices uint
}
MigEntityInfo contains information about a MIG entity
type MigHierarchyInfo ¶
type MigHierarchyInfo struct {
// Entity represents the current GPU entity in the hierarchy
Entity GroupEntityPair
// Parent represents the parent GPU entity in the hierarchy
Parent GroupEntityPair
// SliceProfile defines the MIG profile configuration for this entity
SliceProfile MigProfile
}
MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information for a GPU entity and its relationship to other entities
type MigHierarchyInfo_v2 ¶
type MigHierarchyInfo_v2 struct {
// Entity contains the entity information
Entity GroupEntityPair
// Parent contains the parent entity information
Parent GroupEntityPair
// Info contains detailed MIG entity information
Info MigEntityInfo
}
MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information
type MigHierarchy_v2 ¶
type MigHierarchy_v2 struct {
// Version is the version number of the hierarchy structure
Version uint
// Count is the number of valid entries in EntityList
Count uint
// EntityList contains the MIG hierarchy information for each entity
EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}
MigHierarchy_v2 represents version 2 of the complete MIG hierarchy
func GetGPUInstanceHierarchy ¶
func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)
GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information
type MigProfile ¶
type MigProfile int
MigProfile represents the Multi-Instance GPU (MIG) profile type
const ( // MigProfileNone indicates no MIG profile is set (for GPUs) MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */ // MigProfileGPUInstanceSlice1 represents GPU instance slice 1 MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */ // MigProfileGPUInstanceSlice2 represents GPU instance slice 2 MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */ // MigProfileGPUInstanceSlice3 represents GPU instance slice 3 MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */ // MigProfileGPUInstanceSlice4 represents GPU instance slice 4 MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */ // MigProfileGPUInstanceSlice7 represents GPU instance slice 7 MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */ // MigProfileGPUInstanceSlice8 represents GPU instance slice 8 MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */ // MigProfileGPUInstanceSlice6 represents GPU instance slice 6 MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */ // MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1 MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */ // MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1 MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */ // MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2 MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */ // MigProfileComputeInstanceSlice1 represents compute instance slice 1 MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */ // MigProfileComputeInstanceSlice2 represents compute instance slice 2 MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */ // MigProfileComputeInstanceSlice3 represents compute instance slice 3 MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< 
compute instance slice 3 */ // MigProfileComputeInstanceSlice4 represents compute instance slice 4 MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/ // MigProfileComputeInstanceSlice7 represents compute instance slice 7 MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */ // MigProfileComputeInstanceSlice8 represents compute instance slice 8 MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */ // MigProfileComputeInstanceSlice6 represents compute instance slice 6 MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */ // MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1 MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */ )
type NvLinkP2PStatus ¶
// NvLinkP2PStatus is the result type of GetNvLinkP2PStatus and carries the
// NvLink state between GPU pairs.
type NvLinkP2PStatus struct {
// Gpus is a two-level slice of Link_State values.
// NOTE(review): presumably indexed [gpu][peer gpu] — confirm against GetNvLinkP2PStatus.
Gpus [][]Link_State
// contains filtered or unexported fields
}
NvLinkP2PStatus represents the state of the NVLink connections between pairs of GPUs
func GetNvLinkP2PStatus ¶
func GetNvLinkP2PStatus() (NvLinkP2PStatus, error)
GetNvLinkP2PStatus returns the status of NvLinks between GPU pairs
type NvLinkStatus ¶
// NvLinkStatus describes one NVLink: the entity that owns it, the link's index,
// and the link's current state. GetNvLinkLinkStatus returns a slice of these.
type NvLinkStatus struct {
// ParentId is the ID of the parent entity (GPU or NVSwitch)
ParentId uint
// ParentType is the type of the parent entity
ParentType Field_Entity_Group
// State is the current state of the NVLINK
State Link_State
// Index is the link index number
Index uint
}
NvLinkStatus describes the status of a single NVLink connection
func GetNvLinkLinkStatus ¶
func GetNvLinkLinkStatus() ([]NvLinkStatus, error)
GetNvLinkLinkStatus returns the status of all NVLink connections
type NvlinkPolicyCondition ¶
// NvlinkPolicyCondition holds the details of an NVLink error: which field
// reported it and how many errors were counted.
// NOTE(review): presumably delivered through PolicyViolation.Data — confirm.
type NvlinkPolicyCondition struct {
// FieldId identifies the specific NVLink field that had an error
FieldId uint16
// Counter indicates the number of errors detected
Counter uint
}
NvlinkPolicyCondition contains details about an NVLink error
type P2PLink ¶
// P2PLink describes one peer-to-peer connection: a GPU's ID, its PCIe bus ID,
// and the kind of link (see P2PLinkType).
type P2PLink struct {
// GPU is the ID of the GPU
GPU uint
// BusID is the PCIe bus ID of the GPU
BusID string
// Link is the type of P2P connection
Link P2PLinkType
}
P2PLink contains information about a peer-to-peer connection
func GetDeviceTopology ¶
GetDeviceTopology returns the topology (connectivity) information for the specified GPU
type P2PLinkType ¶
// P2PLinkType is a uint-backed enumeration of GPU peer-to-peer link kinds.
// Its PCIPaths method returns a string representation of the value.
type P2PLinkType uint
P2PLinkType represents the type of peer-to-peer connection between GPUs
const ( // P2PLinkUnknown represents an unknown link type P2PLinkUnknown P2PLinkType = iota // P2PLinkCrossCPU represents a connection across different CPUs P2PLinkCrossCPU // P2PLinkSameCPU represents a connection within the same CPU P2PLinkSameCPU // P2PLinkHostBridge represents a connection through the host bridge P2PLinkHostBridge // P2PLinkMultiSwitch represents a connection through multiple PCIe switches P2PLinkMultiSwitch // P2PLinkSingleSwitch represents a connection through a single PCIe switch P2PLinkSingleSwitch // P2PLinkSameBoard represents a connection on the same board P2PLinkSameBoard // SingleNVLINKLink represents a single NVLINK connection SingleNVLINKLink // TwoNVLINKLinks represents two NVLINK connections TwoNVLINKLinks // ThreeNVLINKLinks represents three NVLINK connections ThreeNVLINKLinks // FourNVLINKLinks represents four NVLINK connections FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
PCIPaths returns a string representation of the P2P link type
type PCIStatusInfo ¶
// PCIStatusInfo groups PCI bus status values; it is embedded in ProcessInfo
// as the PCI field.
type PCIStatusInfo struct {
// BAR1Used is the BAR1 usage, in MB per the trailing comment.
BAR1Used int64 // MB
// Throughput holds the PCI bus transfer metrics.
Throughput PCIThroughputInfo
// FBUsed is not documented here.
// NOTE(review): presumably framebuffer memory in use; unit unstated — confirm.
FBUsed int64
}
PCIStatusInfo contains PCI bus status information
type PCIThroughputInfo ¶
PCIThroughputInfo contains PCI bus transfer metrics
type PciPolicyCondition ¶
// PciPolicyCondition holds the details of a PCI error, namely the replay count.
// NOTE(review): presumably delivered through PolicyViolation.Data — confirm.
type PciPolicyCondition struct {
// ReplayCounter indicates the number of PCI replays
ReplayCounter uint
}
PciPolicyCondition contains details about a PCI error
type PolicyAction ¶
// PolicyAction is a uint32 enumeration of actions taken on a policy violation:
// PolicyActionNone (0) or PolicyActionGPUReset (1).
type PolicyAction uint32
PolicyAction specifies the action to take when a policy violation occurs
const ( // PolicyActionNone indicates no action should be taken on violation (default) PolicyActionNone PolicyAction = 0 // PolicyActionGPUReset indicates the GPU should be reset on violation PolicyActionGPUReset PolicyAction = 1 )
type PolicyCondition ¶
// PolicyCondition is a string-typed identifier for a kind of policy violation.
// It is used as PolicyConfig.Condition, PolicyViolation.Condition, and as the
// key type of PolicyStatus.Conditions.
type PolicyCondition string
PolicyCondition represents a type of policy violation that can be monitored
type PolicyConfig ¶
// PolicyConfig describes one policy condition to monitor, passed to
// SetPolicyForGroup. Only Condition is a plain value; every other field is a
// pointer and is documented as optional with a default.
// NOTE(review): presumably a nil pointer selects the documented default — confirm.
type PolicyConfig struct {
// Condition specifies the type of policy to monitor
Condition PolicyCondition
// Action specifies what action to take when this policy violation occurs (optional, defaults to PolicyActionNone)
Action *PolicyAction
// Validation specifies what validation to perform after the action (optional, defaults to PolicyValidationNone)
Validation *PolicyValidation
// MaxRetiredPages specifies the threshold for MaxRtPgPolicy (optional, defaults to DefaultMaxRetiredPages)
MaxRetiredPages *uint32
// MaxTemperature specifies the threshold for ThermalPolicy in Celsius (optional, defaults to DefaultMaxTemperature)
MaxTemperature *uint32
// MaxPower specifies the threshold for PowerPolicy in Watts (optional, defaults to DefaultMaxPower)
MaxPower *uint32
}
PolicyConfig configures a policy condition with optional custom thresholds and actions
type PolicyStatus ¶
// PolicyStatus is the policy configuration currently in effect for a group,
// as returned by GetPolicyForGroup.
type PolicyStatus struct {
	// Mode indicates the operation mode (automatic or manual).
	Mode uint32
	// Action specifies what action is taken on violation.
	Action PolicyAction
	// Validation specifies what validation is performed after the action.
	Validation PolicyValidation
	// Conditions maps each enabled policy condition to its threshold, if the
	// condition has one. The value type is any (identical to interface{}),
	// matching PolicyViolation.Data elsewhere in this package.
	Conditions map[PolicyCondition]any
}
PolicyStatus represents the current policy configuration for a group
func GetPolicyForGroup ¶
func GetPolicyForGroup(group GroupHandle) (*PolicyStatus, error)
GetPolicyForGroup retrieves the current policy configuration for a GPU group
type PolicyValidation ¶
// PolicyValidation is a uint32 enumeration of the validation run after a policy
// action, ranging from PolicyValidationNone (0) to PolicyValidationLong (3).
type PolicyValidation uint32
PolicyValidation specifies the validation to perform after a policy action
const ( // PolicyValidationNone indicates no validation after action (default) PolicyValidationNone PolicyValidation = 0 // PolicyValidationShort indicates a short system validation should be performed PolicyValidationShort PolicyValidation = 1 // PolicyValidationMedium indicates a medium system validation should be performed PolicyValidationMedium PolicyValidation = 2 // PolicyValidationLong indicates a long system validation should be performed PolicyValidationLong PolicyValidation = 3 )
type PolicyViolation ¶
// PolicyViolation is the element type of the channels returned by
// ListenForPolicyViolations and ListenForPolicyViolationsForGroup. It records
// which condition was violated, when, and condition-specific details.
type PolicyViolation struct {
// Condition specifies the type of policy that was violated
Condition PolicyCondition
// Timestamp indicates when the violation occurred
Timestamp time.Time
// Data contains violation-specific details
// NOTE(review): presumably one of the *PolicyCondition structs (e.g.
// PowerPolicyCondition), selected by Condition — confirm before type-asserting.
Data any
}
PolicyViolation represents a detected violation of a policy condition
type PowerPolicyCondition ¶
// PowerPolicyCondition holds the details of a power violation.
// NOTE(review): presumably delivered through PolicyViolation.Data — confirm.
type PowerPolicyCondition struct {
// PowerViolation indicates the severity of the power violation
PowerViolation uint
}
PowerPolicyCondition contains details about a power violation
type ProcessInfo ¶
// ProcessInfo bundles the statistics reported for one process on one GPU.
// GetProcessInfo returns a slice of these for the requested process.
type ProcessInfo struct {
// GPU is the ID of the GPU being used
GPU uint
// PID is the process ID
PID uint
// Name is the name of the process
Name string
// ProcessUtilization contains process-specific utilization metrics
ProcessUtilization ProcessUtilInfo
// PCI contains PCI bus statistics
PCI PCIStatusInfo
// Memory contains memory usage statistics
Memory MemoryInfo
// GpuUtilization contains GPU utilization metrics
GpuUtilization UtilizationInfo
// Clocks contains GPU clock frequencies
Clocks ClockInfo
// Violations contains throttling statistics
Violations ViolationTime
// XIDErrors contains XID error information
XIDErrors XIDErrorInfo
}
ProcessInfo contains comprehensive information about a GPU process
func GetProcessInfo ¶
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo returns detailed per-GPU statistics for the specified process
type ProcessUtilInfo ¶
// ProcessUtilInfo holds the utilization metrics of one GPU process: when it
// ran, the energy it consumed, and its SM and memory utilization. It is
// embedded in ProcessInfo as the ProcessUtilization field.
// NOTE(review): EnergyConsumed, SmUtil and MemUtil are pointers — presumably
// nil when no value is available; confirm before dereferencing.
type ProcessUtilInfo struct {
// StartTime is when the process started using the GPU
StartTime Time
// EndTime is when the process stopped using the GPU (0 if still running)
EndTime Time
// EnergyConsumed is the energy consumed by the process in Joules
EnergyConsumed *uint64
// SmUtil is the GPU SM (Streaming Multiprocessor) utilization percentage
SmUtil *float64
// MemUtil is the GPU memory utilization percentage
MemUtil *float64
}
ProcessUtilInfo contains utilization metrics for a GPU process
type RetiredPagesPolicyCondition ¶
// RetiredPagesPolicyCondition holds retired memory page counts, split by
// whether single-bit or double-bit errors caused the retirement.
// NOTE(review): presumably delivered through PolicyViolation.Data — confirm.
type RetiredPagesPolicyCondition struct {
// SbePages indicates the number of pages retired due to single-bit errors
SbePages uint
// DbePages indicates the number of pages retired due to double-bit errors
DbePages uint
}
RetiredPagesPolicyCondition contains details about retired memory pages
type Short ¶
Short is an alias for the C.ushort type. It is primarily used for DCGM field identifiers and field collections in the DCGM API bindings. This type provides a direct mapping to the C unsigned short type used in the underlying DCGM C API.
const ( // DCGM_FI_UNKNOWN represents / DCGM_FI_UNKNOWN Short = 0 // DCGM_FI_DRIVER_VERSION represents / DCGM_FI_DRIVER_VERSION Short = 1 // DCGM_FI_NVML_VERSION DCGM_FI_NVML_VERSION Short = 2 // DCGM_FI_PROCESS_NAME represents / DCGM_FI_PROCESS_NAME Short = 3 // DCGM_FI_DEV_COUNT represents / DCGM_FI_DEV_COUNT Short = 4 // DCGM_FI_CUDA_DRIVER_VERSION represents / DCGM_FI_CUDA_DRIVER_VERSION Short = 5 // DCGM_FI_BIND_UNBIND_EVENT represents / DCGM_FI_BIND_UNBIND_EVENT Short = 6 // DCGM_FI_DEV_NAME represents / DCGM_FI_DEV_NAME Short = 50 // DCGM_FI_DEV_BRAND represents / DCGM_FI_DEV_BRAND Short = 51 // DCGM_FI_DEV_NVML_INDEX represents / DCGM_FI_DEV_NVML_INDEX Short = 52 // DCGM_FI_DEV_SERIAL represents / DCGM_FI_DEV_SERIAL Short = 53 // DCGM_FI_DEV_UUID represents / DCGM_FI_DEV_UUID Short = 54 // DCGM_FI_DEV_MINOR_NUMBER represents / DCGM_FI_DEV_MINOR_NUMBER Short = 55 // DCGM_FI_DEV_OEM_INFOROM_VER represents / DCGM_FI_DEV_OEM_INFOROM_VER Short = 56 // DCGM_FI_DEV_PCI_BUSID represents / DCGM_FI_DEV_PCI_BUSID Short = 57 // DCGM_FI_DEV_PCI_COMBINED_ID represents / DCGM_FI_DEV_PCI_COMBINED_ID Short = 58 // DCGM_FI_DEV_PCI_SUBSYS_ID represents / DCGM_FI_DEV_PCI_SUBSYS_ID Short = 59 // DCGM_FI_GPU_TOPOLOGY_PCI represents / DCGM_FI_GPU_TOPOLOGY_PCI Short = 60 // DCGM_FI_GPU_TOPOLOGY_NVLINK represents / DCGM_FI_GPU_TOPOLOGY_NVLINK Short = 61 // DCGM_FI_GPU_TOPOLOGY_AFFINITY represents / DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = 62 // DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY represents / DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = 63 // DCGM_FI_DEV_P2P_NVLINK_STATUS represents / DCGM_FI_DEV_P2P_NVLINK_STATUS Short = 64 // DCGM_FI_DEV_COMPUTE_MODE represents / DCGM_FI_DEV_COMPUTE_MODE Short = 65 // DCGM_FI_DEV_PERSISTENCE_MODE represents / DCGM_FI_DEV_PERSISTENCE_MODE Short = 66 // DCGM_FI_DEV_MIG_MODE represents / DCGM_FI_DEV_MIG_MODE Short = 67 // DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR represents / DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = 68 // DCGM_FI_DEV_MIG_MAX_SLICES 
represents / DCGM_FI_DEV_MIG_MAX_SLICES Short = 69 // DCGM_FI_DEV_CPU_AFFINITY_0 represents / DCGM_FI_DEV_CPU_AFFINITY_0 Short = 70 // DCGM_FI_DEV_CPU_AFFINITY_1 represents / DCGM_FI_DEV_CPU_AFFINITY_1 Short = 71 // DCGM_FI_DEV_CPU_AFFINITY_2 represents / DCGM_FI_DEV_CPU_AFFINITY_2 Short = 72 // DCGM_FI_DEV_CPU_AFFINITY_3 represents / DCGM_FI_DEV_CPU_AFFINITY_3 Short = 73 // DCGM_FI_DEV_CC_MODE represents / DCGM_FI_DEV_CC_MODE Short = 74 // DCGM_FI_DEV_MIG_ATTRIBUTES represents / DCGM_FI_DEV_MIG_ATTRIBUTES Short = 75 // DCGM_FI_DEV_MIG_GI_INFO represents / DCGM_FI_DEV_MIG_GI_INFO Short = 76 // DCGM_FI_DEV_MIG_CI_INFO represents / DCGM_FI_DEV_MIG_CI_INFO Short = 77 // DCGM_FI_DEV_ECC_INFOROM_VER represents / DCGM_FI_DEV_ECC_INFOROM_VER Short = 80 // DCGM_FI_DEV_POWER_INFOROM_VER represents / DCGM_FI_DEV_POWER_INFOROM_VER Short = 81 // DCGM_FI_DEV_INFOROM_IMAGE_VER represents / DCGM_FI_DEV_INFOROM_IMAGE_VER Short = 82 // DCGM_FI_DEV_INFOROM_CONFIG_CHECK represents / DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = 83 // DCGM_FI_DEV_INFOROM_CONFIG_VALID represents / DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = 84 // DCGM_FI_DEV_VBIOS_VERSION represents / DCGM_FI_DEV_VBIOS_VERSION Short = 85 // DCGM_FI_DEV_MEM_AFFINITY_0 represents / DCGM_FI_DEV_MEM_AFFINITY_0 Short = 86 // DCGM_FI_DEV_MEM_AFFINITY_1 represents / DCGM_FI_DEV_MEM_AFFINITY_1 Short = 87 // DCGM_FI_DEV_MEM_AFFINITY_2 represents / DCGM_FI_DEV_MEM_AFFINITY_2 Short = 88 // DCGM_FI_DEV_MEM_AFFINITY_3 represents / DCGM_FI_DEV_MEM_AFFINITY_3 Short = 89 // DCGM_FI_DEV_BAR1_TOTAL represents / DCGM_FI_DEV_BAR1_TOTAL Short = 90 // DCGM_FI_SYNC_BOOST represents / DCGM_FI_SYNC_BOOST Short = 91 // DCGM_FI_DEV_BAR1_USED represents / DCGM_FI_DEV_BAR1_USED Short = 92 // DCGM_FI_DEV_BAR1_FREE represents / DCGM_FI_DEV_BAR1_FREE Short = 93 // DCGM_FI_DEV_GPM_SUPPORT represents */ DCGM_FI_DEV_GPM_SUPPORT Short = 94 // DCGM_FI_DEV_SM_CLOCK represents / DCGM_FI_DEV_SM_CLOCK Short = 100 // DCGM_FI_DEV_MEM_CLOCK represents / 
DCGM_FI_DEV_MEM_CLOCK Short = 101 // DCGM_FI_DEV_VIDEO_CLOCK represents / DCGM_FI_DEV_VIDEO_CLOCK Short = 102 // DCGM_FI_DEV_APP_SM_CLOCK represents / DCGM_FI_DEV_APP_SM_CLOCK Short = 110 // DCGM_FI_DEV_APP_MEM_CLOCK represents / DCGM_FI_DEV_APP_MEM_CLOCK Short = 111 // DCGM_FI_DEV_CLOCKS_EVENT_REASONS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASONS Short = 112 // DCGM_FI_DEV_MAX_SM_CLOCK represents / DCGM_FI_DEV_MAX_SM_CLOCK Short = 113 // DCGM_FI_DEV_MAX_MEM_CLOCK represents / DCGM_FI_DEV_MAX_MEM_CLOCK Short = 114 // DCGM_FI_DEV_MAX_VIDEO_CLOCK represents / DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = 115 // DCGM_FI_DEV_AUTOBOOST represents / DCGM_FI_DEV_AUTOBOOST Short = 120 // DCGM_FI_DEV_SUPPORTED_CLOCKS represents / DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 130 // DCGM_FI_DEV_MEMORY_TEMP represents / DCGM_FI_DEV_MEMORY_TEMP Short = 140 // DCGM_FI_DEV_GPU_TEMP represents / DCGM_FI_DEV_GPU_TEMP Short = 150 // DCGM_FI_DEV_MEM_MAX_OP_TEMP represents / DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = 151 // DCGM_FI_DEV_GPU_MAX_OP_TEMP represents / DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 152 // DCGM_FI_DEV_GPU_TEMP_LIMIT represents / DCGM_FI_DEV_GPU_TEMP_LIMIT Short = 153 // DCGM_FI_DEV_POWER_USAGE represents / DCGM_FI_DEV_POWER_USAGE Short = 155 // DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION represents / DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = 156 // DCGM_FI_DEV_POWER_USAGE_INSTANT represents / DCGM_FI_DEV_POWER_USAGE_INSTANT Short = 157 // DCGM_FI_DEV_SLOWDOWN_TEMP represents / DCGM_FI_DEV_SLOWDOWN_TEMP Short = 158 // DCGM_FI_DEV_SHUTDOWN_TEMP represents / DCGM_FI_DEV_SHUTDOWN_TEMP Short = 159 // DCGM_FI_DEV_POWER_MGMT_LIMIT represents / DCGM_FI_DEV_POWER_MGMT_LIMIT Short = 160 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN represents / DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = 161 // DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX represents / DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = 162 // DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF represents / DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = 163 // DCGM_FI_DEV_ENFORCED_POWER_LIMIT 
represents / DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = 164 // DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK represents / DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK Short = 165 // DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK represents / DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK Short = 166 // DCGM_FI_DEV_VALID_POWER_PROFILE_MASK represents / DCGM_FI_DEV_VALID_POWER_PROFILE_MASK Short = 167 // DCGM_FI_DEV_FABRIC_MANAGER_STATUS represents / DCGM_FI_DEV_FABRIC_MANAGER_STATUS Short = 170 // DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE represents / DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE Short = 171 // DCGM_FI_DEV_FABRIC_CLUSTER_UUID represents / DCGM_FI_DEV_FABRIC_CLUSTER_UUID Short = 172 // DCGM_FI_DEV_FABRIC_CLIQUE_ID represents / DCGM_FI_DEV_FABRIC_CLIQUE_ID Short = 173 // DCGM_FI_DEV_FABRIC_HEALTH_MASK represents / DCGM_FI_DEV_FABRIC_HEALTH_MASK Short = 174 // DCGM_FI_DEV_PSTATE represents / DCGM_FI_DEV_PSTATE Short = 190 // DCGM_FI_DEV_FAN_SPEED represents / DCGM_FI_DEV_FAN_SPEED Short = 191 // DCGM_FI_DEV_PCIE_TX_THROUGHPUT represents / DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = 200 // DCGM_FI_DEV_PCIE_RX_THROUGHPUT represents / DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = 201 // DCGM_FI_DEV_PCIE_REPLAY_COUNTER represents / DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = 202 // DCGM_FI_DEV_GPU_UTIL represents / DCGM_FI_DEV_GPU_UTIL Short = 203 // DCGM_FI_DEV_MEM_COPY_UTIL represents / DCGM_FI_DEV_MEM_COPY_UTIL Short = 204 // DCGM_FI_DEV_ACCOUNTING_DATA represents / DCGM_FI_DEV_ACCOUNTING_DATA Short = 205 // DCGM_FI_DEV_ENC_UTIL represents / DCGM_FI_DEV_ENC_UTIL Short = 206 // DCGM_FI_DEV_DEC_UTIL represents / DCGM_FI_DEV_DEC_UTIL Short = 207 // DCGM_FI_DEV_XID_ERRORS represents / DCGM_FI_DEV_XID_ERRORS Short = 230 // DCGM_FI_DEV_PCIE_MAX_LINK_GEN represents / DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = 235 // DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH represents / DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = 236 // DCGM_FI_DEV_PCIE_LINK_GEN represents / DCGM_FI_DEV_PCIE_LINK_GEN Short = 237 // DCGM_FI_DEV_PCIE_LINK_WIDTH 
represents / DCGM_FI_DEV_PCIE_LINK_WIDTH Short = 238 // DCGM_FI_DEV_POWER_VIOLATION represents / DCGM_FI_DEV_POWER_VIOLATION Short = 240 // DCGM_FI_DEV_THERMAL_VIOLATION represents / DCGM_FI_DEV_THERMAL_VIOLATION Short = 241 // DCGM_FI_DEV_SYNC_BOOST_VIOLATION represents / DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = 242 // DCGM_FI_DEV_BOARD_LIMIT_VIOLATION represents / DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 243 // DCGM_FI_DEV_LOW_UTIL_VIOLATION represents / DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = 244 // DCGM_FI_DEV_RELIABILITY_VIOLATION represents / DCGM_FI_DEV_RELIABILITY_VIOLATION Short = 245 // DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION represents / DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = 246 // DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION represents / DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = 247 // DCGM_FI_DEV_FB_TOTAL represents / DCGM_FI_DEV_FB_TOTAL Short = 250 // DCGM_FI_DEV_FB_FREE represents / DCGM_FI_DEV_FB_FREE Short = 251 // DCGM_FI_DEV_FB_USED represents / DCGM_FI_DEV_FB_USED Short = 252 // DCGM_FI_DEV_FB_RESERVED represents / DCGM_FI_DEV_FB_RESERVED Short = 253 // DCGM_FI_DEV_FB_USED_PERCENT represents / DCGM_FI_DEV_FB_USED_PERCENT Short = 254 // DCGM_FI_DEV_C2C_LINK_COUNT represents / DCGM_FI_DEV_C2C_LINK_COUNT Short = 285 // DCGM_FI_DEV_C2C_LINK_STATUS represents / DCGM_FI_DEV_C2C_LINK_STATUS Short = 286 // DCGM_FI_DEV_C2C_MAX_BANDWIDTH represents / DCGM_FI_DEV_C2C_MAX_BANDWIDTH Short = 287 // DCGM_FI_DEV_ECC_CURRENT represents / DCGM_FI_DEV_ECC_CURRENT Short = 300 // DCGM_FI_DEV_ECC_PENDING represents / DCGM_FI_DEV_ECC_PENDING Short = 301 // DCGM_FI_DEV_ECC_SBE_VOL_TOTAL represents / DCGM_FI_DEV_ECC_SBE_VOL_TOTAL Short = 310 // DCGM_FI_DEV_ECC_DBE_VOL_TOTAL represents / DCGM_FI_DEV_ECC_DBE_VOL_TOTAL Short = 311 // DCGM_FI_DEV_ECC_SBE_AGG_TOTAL represents / DCGM_FI_DEV_ECC_SBE_AGG_TOTAL Short = 312 // DCGM_FI_DEV_ECC_DBE_AGG_TOTAL represents / DCGM_FI_DEV_ECC_DBE_AGG_TOTAL Short = 313 // DCGM_FI_DEV_ECC_SBE_VOL_L1 represents / 
DCGM_FI_DEV_ECC_SBE_VOL_L1 Short = 314 // DCGM_FI_DEV_ECC_DBE_VOL_L1 represents / DCGM_FI_DEV_ECC_DBE_VOL_L1 Short = 315 // DCGM_FI_DEV_ECC_SBE_VOL_L2 represents / DCGM_FI_DEV_ECC_SBE_VOL_L2 Short = 316 // DCGM_FI_DEV_ECC_DBE_VOL_L2 represents / DCGM_FI_DEV_ECC_DBE_VOL_L2 Short = 317 // DCGM_FI_DEV_ECC_SBE_VOL_DEV represents / DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = 318 // DCGM_FI_DEV_ECC_DBE_VOL_DEV represents / DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = 319 // DCGM_FI_DEV_ECC_SBE_VOL_REG represents / DCGM_FI_DEV_ECC_SBE_VOL_REG Short = 320 // DCGM_FI_DEV_ECC_DBE_VOL_REG represents / DCGM_FI_DEV_ECC_DBE_VOL_REG Short = 321 // DCGM_FI_DEV_ECC_SBE_VOL_TEX represents / DCGM_FI_DEV_ECC_SBE_VOL_TEX Short = 322 // DCGM_FI_DEV_ECC_DBE_VOL_TEX represents / DCGM_FI_DEV_ECC_DBE_VOL_TEX Short = 323 // DCGM_FI_DEV_ECC_SBE_AGG_L1 represents / DCGM_FI_DEV_ECC_SBE_AGG_L1 Short = 324 // DCGM_FI_DEV_ECC_DBE_AGG_L1 represents / DCGM_FI_DEV_ECC_DBE_AGG_L1 Short = 325 // DCGM_FI_DEV_ECC_SBE_AGG_L2 represents / DCGM_FI_DEV_ECC_SBE_AGG_L2 Short = 326 // DCGM_FI_DEV_ECC_DBE_AGG_L2 represents / DCGM_FI_DEV_ECC_DBE_AGG_L2 Short = 327 // DCGM_FI_DEV_ECC_SBE_AGG_DEV represents / DCGM_FI_DEV_ECC_SBE_AGG_DEV Short = 328 // DCGM_FI_DEV_ECC_DBE_AGG_DEV represents / DCGM_FI_DEV_ECC_DBE_AGG_DEV Short = 329 // DCGM_FI_DEV_ECC_SBE_AGG_REG represents / DCGM_FI_DEV_ECC_SBE_AGG_REG Short = 330 // DCGM_FI_DEV_ECC_DBE_AGG_REG represents / DCGM_FI_DEV_ECC_DBE_AGG_REG Short = 331 // DCGM_FI_DEV_ECC_SBE_AGG_TEX represents / DCGM_FI_DEV_ECC_SBE_AGG_TEX Short = 332 // DCGM_FI_DEV_ECC_DBE_AGG_TEX represents / DCGM_FI_DEV_ECC_DBE_AGG_TEX Short = 333 // DCGM_FI_DEV_ECC_SBE_VOL_SHM represents / DCGM_FI_DEV_ECC_SBE_VOL_SHM Short = 334 // DCGM_FI_DEV_ECC_DBE_VOL_SHM represents / DCGM_FI_DEV_ECC_DBE_VOL_SHM Short = 335 // DCGM_FI_DEV_ECC_SBE_VOL_CBU represents / DCGM_FI_DEV_ECC_SBE_VOL_CBU Short = 336 // DCGM_FI_DEV_ECC_DBE_VOL_CBU represents / DCGM_FI_DEV_ECC_DBE_VOL_CBU Short = 337 // DCGM_FI_DEV_ECC_SBE_AGG_SHM represents 
/ DCGM_FI_DEV_ECC_SBE_AGG_SHM Short = 338 // DCGM_FI_DEV_ECC_DBE_AGG_SHM represents / DCGM_FI_DEV_ECC_DBE_AGG_SHM Short = 339 // DCGM_FI_DEV_ECC_SBE_AGG_CBU represents / DCGM_FI_DEV_ECC_SBE_AGG_CBU Short = 340 // DCGM_FI_DEV_ECC_DBE_AGG_CBU represents / DCGM_FI_DEV_ECC_DBE_AGG_CBU Short = 341 // DCGM_FI_DEV_ECC_SBE_VOL_SRM represents / DCGM_FI_DEV_ECC_SBE_VOL_SRM Short = 342 // DCGM_FI_DEV_ECC_DBE_VOL_SRM represents / DCGM_FI_DEV_ECC_DBE_VOL_SRM Short = 343 // DCGM_FI_DEV_ECC_SBE_AGG_SRM represents / DCGM_FI_DEV_ECC_SBE_AGG_SRM Short = 344 // DCGM_FI_DEV_ECC_DBE_AGG_SRM represents / DCGM_FI_DEV_ECC_DBE_AGG_SRM Short = 345 // DCGM_FI_DEV_THRESHOLD_SRM represents / DCGM_FI_DEV_THRESHOLD_SRM Short = 346 // DCGM_FI_DEV_DIAG_MEMORY_RESULT represents / DCGM_FI_DEV_DIAG_MEMORY_RESULT Short = 350 // DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT represents / DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT Short = 351 // DCGM_FI_DEV_DIAG_PCIE_RESULT represents / DCGM_FI_DEV_DIAG_PCIE_RESULT Short = 352 // DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT represents / DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT Short = 353 // DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT represents / DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT Short = 354 // DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT represents / DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT Short = 355 // DCGM_FI_DEV_DIAG_MEMTEST_RESULT represents / DCGM_FI_DEV_DIAG_MEMTEST_RESULT Short = 356 // DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT represents / DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT Short = 357 // DCGM_FI_DEV_DIAG_EUD_RESULT represents / DCGM_FI_DEV_DIAG_EUD_RESULT Short = 358 // DCGM_FI_DEV_DIAG_CPU_EUD_RESULT represents / DCGM_FI_DEV_DIAG_CPU_EUD_RESULT Short = 359 // DCGM_FI_DEV_DIAG_SOFTWARE_RESULT represents / DCGM_FI_DEV_DIAG_SOFTWARE_RESULT Short = 360 // DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT represents / DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT Short = 361 // DCGM_FI_DEV_DIAG_STATUS represents / DCGM_FI_DEV_DIAG_STATUS Short = 362 // DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT represents / 
DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT Short = 363 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX represents / DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX Short = 385 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH represents / DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH Short = 386 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL represents / DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL Short = 387 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW represents / DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW Short = 388 // DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE represents / DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE Short = 389 // DCGM_FI_DEV_RETIRED_SBE represents / DCGM_FI_DEV_RETIRED_SBE Short = 390 // DCGM_FI_DEV_RETIRED_DBE represents / DCGM_FI_DEV_RETIRED_DBE Short = 391 // DCGM_FI_DEV_RETIRED_PENDING represents / DCGM_FI_DEV_RETIRED_PENDING Short = 392 // DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS represents / DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Short = 393 // DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS represents / DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Short = 394 // DCGM_FI_DEV_ROW_REMAP_FAILURE represents / DCGM_FI_DEV_ROW_REMAP_FAILURE Short = 395 // DCGM_FI_DEV_ROW_REMAP_PENDING represents / DCGM_FI_DEV_ROW_REMAP_PENDING Short = 396 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 Short = 400 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 Short = 401 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 Short = 402 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 Short = 403 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 Short = 404 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 Short = 405 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 Short = 406 
// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 Short = 407 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 Short = 408 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL Short = 409 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 Short = 410 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 Short = 411 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 Short = 412 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 Short = 413 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 Short = 414 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 Short = 415 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 Short = 416 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 Short = 417 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 Short = 418 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL Short = 419 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 Short = 420 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 Short = 421 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 Short = 422 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 Short = 423 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 Short = 424 // 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 Short = 425 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 Short = 426 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 Short = 427 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 Short = 428 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL Short = 429 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 Short = 430 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 Short = 431 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 Short = 432 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 Short = 433 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 Short = 434 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 Short = 435 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 Short = 436 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 Short = 437 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 Short = 438 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL Short = 439 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 Short = 440 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 Short = 441 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 Short = 442 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 Short = 
443 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 Short = 444 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 Short = 445 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 Short = 446 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 Short = 447 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 Short = 448 // DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Short = 449 // DCGM_FI_DEV_GPU_NVLINK_ERRORS represents / DCGM_FI_DEV_GPU_NVLINK_ERRORS Short = 450 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 Short = 451 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 Short = 452 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 Short = 453 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 Short = 454 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 Short = 455 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 represents / DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 Short = 456 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 Short = 457 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 Short = 458 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 Short = 459 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 Short = 460 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 Short = 461 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 represents / DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 Short = 462 // 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 Short = 463 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 Short = 464 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 Short = 465 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 Short = 466 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 Short = 467 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 represents / DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 Short = 468 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 Short = 469 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 Short = 470 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 Short = 471 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 Short = 472 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 Short = 473 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 represents / DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 Short = 474 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 Short = 475 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 Short = 476 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 Short = 477 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 Short = 478 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 Short = 479 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 represents / DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 Short = 480 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 
Short = 481 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 Short = 482 // DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 Short = 483 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 Short = 484 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 Short = 485 // DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 Short = 486 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 Short = 487 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 Short = 488 // DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 Short = 489 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 Short = 491 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 Short = 492 // DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 Short = 493 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 Short = 494 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 Short = 495 // DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 Short = 496 // DCGM_FI_DEV_NVLINK_ERROR_DL_CRC represents / DCGM_FI_DEV_NVLINK_ERROR_DL_CRC Short = 497 // DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY represents / DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY Short = 498 // DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY represents / DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY Short = 499 // DCGM_FI_DEV_VIRTUAL_MODE represents / DCGM_FI_DEV_VIRTUAL_MODE Short = 500 // DCGM_FI_DEV_SUPPORTED_TYPE_INFO represents / DCGM_FI_DEV_SUPPORTED_TYPE_INFO Short = 501 // DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS represents / DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS Short = 502 // DCGM_FI_DEV_VGPU_INSTANCE_IDS represents / 
DCGM_FI_DEV_VGPU_INSTANCE_IDS Short = 503 // DCGM_FI_DEV_VGPU_UTILIZATIONS represents / DCGM_FI_DEV_VGPU_UTILIZATIONS Short = 504 // DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION represents / DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION Short = 505 // DCGM_FI_DEV_ENC_STATS represents / DCGM_FI_DEV_ENC_STATS Short = 506 // DCGM_FI_DEV_FBC_STATS represents / DCGM_FI_DEV_FBC_STATS Short = 507 // DCGM_FI_DEV_FBC_SESSIONS_INFO represents / DCGM_FI_DEV_FBC_SESSIONS_INFO Short = 508 // DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS represents / DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS Short = 509 // DCGM_FI_DEV_VGPU_TYPE_INFO represents / DCGM_FI_DEV_VGPU_TYPE_INFO Short = 510 // DCGM_FI_DEV_VGPU_TYPE_NAME represents / DCGM_FI_DEV_VGPU_TYPE_NAME Short = 511 // DCGM_FI_DEV_VGPU_TYPE_CLASS represents / DCGM_FI_DEV_VGPU_TYPE_CLASS Short = 512 // DCGM_FI_DEV_VGPU_TYPE_LICENSE represents / DCGM_FI_DEV_VGPU_TYPE_LICENSE Short = 513 // DCGM_FI_DEV_VGPU_VM_ID represents / DCGM_FI_DEV_VGPU_VM_ID Short = 520 // DCGM_FI_FIRST_VGPU_FIELD_ID represents / DCGM_FI_FIRST_VGPU_FIELD_ID Short = 520 // DCGM_FI_DEV_VGPU_VM_NAME represents / DCGM_FI_DEV_VGPU_VM_NAME Short = 521 // DCGM_FI_DEV_VGPU_TYPE represents / DCGM_FI_DEV_VGPU_TYPE Short = 522 // DCGM_FI_DEV_VGPU_UUID represents / DCGM_FI_DEV_VGPU_UUID Short = 523 // DCGM_FI_DEV_VGPU_DRIVER_VERSION represents / DCGM_FI_DEV_VGPU_DRIVER_VERSION Short = 524 // DCGM_FI_DEV_VGPU_MEMORY_USAGE represents / DCGM_FI_DEV_VGPU_MEMORY_USAGE Short = 525 // DCGM_FI_DEV_VGPU_LICENSE_STATUS represents / DCGM_FI_DEV_VGPU_LICENSE_STATUS Short = 526 // DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT represents / DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT Short = 527 // DCGM_FI_DEV_VGPU_ENC_STATS represents / DCGM_FI_DEV_VGPU_ENC_STATS Short = 528 // DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO represents / DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO Short = 529 // DCGM_FI_DEV_VGPU_FBC_STATS represents / DCGM_FI_DEV_VGPU_FBC_STATS Short = 530 // DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO represents / 
DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO Short = 531 // DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE represents / DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE Short = 532 // DCGM_FI_DEV_VGPU_PCI_ID represents / DCGM_FI_DEV_VGPU_PCI_ID Short = 533 // DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID represents / DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID Short = 534 // DCGM_FI_LAST_VGPU_FIELD_ID represents / DCGM_FI_LAST_VGPU_FIELD_ID Short = 570 // DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID represents / DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID Short = 571 // DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER represents / DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER Short = 572 // DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER represents / DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER Short = 573 // DCGM_FI_DEV_PLATFORM_TRAY_INDEX represents / DCGM_FI_DEV_PLATFORM_TRAY_INDEX Short = 574 // DCGM_FI_DEV_PLATFORM_HOST_ID represents / DCGM_FI_DEV_PLATFORM_HOST_ID Short = 575 // DCGM_FI_DEV_PLATFORM_PEER_TYPE represents / DCGM_FI_DEV_PLATFORM_PEER_TYPE Short = 576 // DCGM_FI_DEV_PLATFORM_MODULE_ID represents / DCGM_FI_DEV_PLATFORM_MODULE_ID Short = 577 // DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY represents / DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY Short = 580 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST represents / DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST Short = 581 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO represents / DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO Short = 582 // DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS represents / DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS Short = 583 // DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS represents / DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS Short = 584 // DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER represents / DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER Short = 585 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES Short = 
586 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR Short = 587 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE Short = 588 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES Short = 589 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES Short = 590 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS Short = 591 // DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS represents / DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS Short = 592 // DCGM_FI_INTERNAL_FIELDS_0_START represents / DCGM_FI_INTERNAL_FIELDS_0_START Short = 600 // DCGM_FI_INTERNAL_FIELDS_0_END represents / DCGM_FI_INTERNAL_FIELDS_0_END Short = 699 // DCGM_FI_FIRST_NVSWITCH_FIELD_ID represents / DCGM_FI_FIRST_NVSWITCH_FIELD_ID Short = 700 // DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT represents / DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT Short = 701 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ represents / DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ Short = 702 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV represents / DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV Short = 703 // DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD represents / DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD Short = 704 // DCGM_FI_DEV_NVSWITCH_POWER_VDD represents / DCGM_FI_DEV_NVSWITCH_POWER_VDD Short = 705 // DCGM_FI_DEV_NVSWITCH_POWER_DVDD represents / DCGM_FI_DEV_NVSWITCH_POWER_DVDD Short = 706 // DCGM_FI_DEV_NVSWITCH_POWER_HVDD represents / DCGM_FI_DEV_NVSWITCH_POWER_HVDD Short = 707 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX represents / DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX Short = 780 // DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX represents / DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX Short = 781 // DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS Short = 782 // 
DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS Short = 783 // DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS Short = 784 // DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS Short = 785 // DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS Short = 786 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS Short = 787 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS Short = 788 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 Short = 789 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 Short = 790 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 Short = 791 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 Short = 792 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 Short = 793 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 Short = 794 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 Short = 795 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 Short = 796 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 Short = 797 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 Short = 798 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 Short = 799 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 represents / 
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 Short = 800 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 Short = 801 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 Short = 802 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 Short = 803 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 Short = 804 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 Short = 805 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 Short = 806 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 Short = 807 // DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 represents / DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 Short = 808 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 Short = 809 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 Short = 810 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 Short = 811 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 Short = 812 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 Short = 813 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 Short = 814 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 Short = 815 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 Short = 816 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 
Short = 817 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 Short = 818 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 Short = 819 // DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 represents / DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 Short = 820 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 Short = 821 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 Short = 822 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 Short = 823 // DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 represents / DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 Short = 824 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 Short = 825 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 Short = 826 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 Short = 827 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 Short = 828 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 Short = 829 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 Short = 830 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 Short = 831 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 Short = 832 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 Short = 833 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 Short = 834 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 Short = 835 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 Short = 836 // 
DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 Short = 837 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 Short = 838 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 Short = 839 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 Short = 840 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 Short = 841 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 Short = 842 // DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL represents / DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL Short = 843 // DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS represents / DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS Short = 856 // DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS represents / DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS Short = 857 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT represents / DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT Short = 858 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN represents / DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN Short = 859 // DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN represents / DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN Short = 860 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX represents / DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX Short = 861 // DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX represents / DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX Short = 862 // DCGM_FI_DEV_NVSWITCH_PHYS_ID represents / DCGM_FI_DEV_NVSWITCH_PHYS_ID Short = 863 // DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED represents / DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED Short = 864 // DCGM_FI_DEV_NVSWITCH_LINK_ID represents / DCGM_FI_DEV_NVSWITCH_LINK_ID Short = 865 // DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN represents / DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN Short = 866 // DCGM_FI_DEV_NVSWITCH_PCIE_BUS represents / DCGM_FI_DEV_NVSWITCH_PCIE_BUS Short = 867 // DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE represents / DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE Short = 
868 // DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION represents / DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION Short = 869 // DCGM_FI_DEV_NVSWITCH_LINK_STATUS represents / DCGM_FI_DEV_NVSWITCH_LINK_STATUS Short = 870 // DCGM_FI_DEV_NVSWITCH_LINK_TYPE represents / DCGM_FI_DEV_NVSWITCH_LINK_TYPE Short = 871 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN represents / DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN Short = 872 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS represents / DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS Short = 873 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE represents / DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE Short = 874 // DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION represents / DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION Short = 875 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID represents / DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID Short = 876 // DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID represents / DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID Short = 877 // DCGM_FI_DEV_NVSWITCH_DEVICE_UUID represents / DCGM_FI_DEV_NVSWITCH_DEVICE_UUID Short = 878 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 Short = 879 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 Short = 880 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 Short = 881 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 Short = 882 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 Short = 883 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 Short = 884 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 Short = 885 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 Short = 886 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 Short = 887 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 
represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 Short = 888 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 Short = 889 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 Short = 890 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 Short = 891 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 Short = 892 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 Short = 893 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 Short = 894 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 Short = 895 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 Short = 896 // DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL represents / DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL Short = 897 // DCGM_FI_LAST_NVSWITCH_FIELD_ID represents / DCGM_FI_LAST_NVSWITCH_FIELD_ID Short = 899 // DCGM_FI_PROF_GR_ENGINE_ACTIVE represents / DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = 1001 // DCGM_FI_PROF_SM_ACTIVE represents / DCGM_FI_PROF_SM_ACTIVE Short = 1002 // DCGM_FI_PROF_SM_OCCUPANCY represents / DCGM_FI_PROF_SM_OCCUPANCY Short = 1003 // DCGM_FI_PROF_PIPE_TENSOR_ACTIVE represents / DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = 1004 // DCGM_FI_PROF_DRAM_ACTIVE represents / DCGM_FI_PROF_DRAM_ACTIVE Short = 1005 // DCGM_FI_PROF_PIPE_FP64_ACTIVE represents / DCGM_FI_PROF_PIPE_FP64_ACTIVE Short = 1006 // DCGM_FI_PROF_PIPE_FP32_ACTIVE represents / DCGM_FI_PROF_PIPE_FP32_ACTIVE Short = 1007 // DCGM_FI_PROF_PIPE_FP16_ACTIVE represents / DCGM_FI_PROF_PIPE_FP16_ACTIVE Short = 1008 // DCGM_FI_PROF_PCIE_TX_BYTES represents / DCGM_FI_PROF_PCIE_TX_BYTES Short = 1009 // DCGM_FI_PROF_PCIE_RX_BYTES represents / DCGM_FI_PROF_PCIE_RX_BYTES Short = 1010 // DCGM_FI_PROF_NVLINK_TX_BYTES represents / DCGM_FI_PROF_NVLINK_TX_BYTES Short = 1011 // 
DCGM_FI_PROF_NVLINK_RX_BYTES represents / DCGM_FI_PROF_NVLINK_RX_BYTES Short = 1012 // DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE represents / DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE Short = 1013 // DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE represents / DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE Short = 1014 // DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE represents / DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE Short = 1015 // DCGM_FI_PROF_PIPE_INT_ACTIVE represents / DCGM_FI_PROF_PIPE_INT_ACTIVE Short = 1016 // DCGM_FI_PROF_NVDEC0_ACTIVE represents / DCGM_FI_PROF_NVDEC0_ACTIVE Short = 1017 // DCGM_FI_PROF_NVDEC1_ACTIVE DCGM_FI_PROF_NVDEC1_ACTIVE Short = 1018 // DCGM_FI_PROF_NVDEC2_ACTIVE DCGM_FI_PROF_NVDEC2_ACTIVE Short = 1019 // DCGM_FI_PROF_NVDEC3_ACTIVE DCGM_FI_PROF_NVDEC3_ACTIVE Short = 1020 // DCGM_FI_PROF_NVDEC4_ACTIVE DCGM_FI_PROF_NVDEC4_ACTIVE Short = 1021 // DCGM_FI_PROF_NVDEC5_ACTIVE DCGM_FI_PROF_NVDEC5_ACTIVE Short = 1022 // DCGM_FI_PROF_NVDEC6_ACTIVE DCGM_FI_PROF_NVDEC6_ACTIVE Short = 1023 // DCGM_FI_PROF_NVDEC7_ACTIVE DCGM_FI_PROF_NVDEC7_ACTIVE Short = 1024 // DCGM_FI_PROF_NVJPG0_ACTIVE represents / DCGM_FI_PROF_NVJPG0_ACTIVE Short = 1025 // DCGM_FI_PROF_NVJPG1_ACTIVE DCGM_FI_PROF_NVJPG1_ACTIVE Short = 1026 // DCGM_FI_PROF_NVJPG2_ACTIVE DCGM_FI_PROF_NVJPG2_ACTIVE Short = 1027 // DCGM_FI_PROF_NVJPG3_ACTIVE DCGM_FI_PROF_NVJPG3_ACTIVE Short = 1028 // DCGM_FI_PROF_NVJPG4_ACTIVE DCGM_FI_PROF_NVJPG4_ACTIVE Short = 1029 // DCGM_FI_PROF_NVJPG5_ACTIVE DCGM_FI_PROF_NVJPG5_ACTIVE Short = 1030 // DCGM_FI_PROF_NVJPG6_ACTIVE DCGM_FI_PROF_NVJPG6_ACTIVE Short = 1031 // DCGM_FI_PROF_NVJPG7_ACTIVE DCGM_FI_PROF_NVJPG7_ACTIVE Short = 1032 // DCGM_FI_PROF_NVOFA0_ACTIVE represents / DCGM_FI_PROF_NVOFA0_ACTIVE Short = 1033 // DCGM_FI_PROF_NVOFA1_ACTIVE DCGM_FI_PROF_NVOFA1_ACTIVE Short = 1034 // DCGM_FI_PROF_NVLINK_L0_TX_BYTES represents / DCGM_FI_PROF_NVLINK_L0_TX_BYTES Short = 1040 // DCGM_FI_PROF_NVLINK_L0_RX_BYTES DCGM_FI_PROF_NVLINK_L0_RX_BYTES Short = 1041 // DCGM_FI_PROF_NVLINK_L1_TX_BYTES 
DCGM_FI_PROF_NVLINK_L1_TX_BYTES Short = 1042 // DCGM_FI_PROF_NVLINK_L1_RX_BYTES DCGM_FI_PROF_NVLINK_L1_RX_BYTES Short = 1043 // DCGM_FI_PROF_NVLINK_L2_TX_BYTES DCGM_FI_PROF_NVLINK_L2_TX_BYTES Short = 1044 // DCGM_FI_PROF_NVLINK_L2_RX_BYTES DCGM_FI_PROF_NVLINK_L2_RX_BYTES Short = 1045 // DCGM_FI_PROF_NVLINK_L3_TX_BYTES DCGM_FI_PROF_NVLINK_L3_TX_BYTES Short = 1046 // DCGM_FI_PROF_NVLINK_L3_RX_BYTES DCGM_FI_PROF_NVLINK_L3_RX_BYTES Short = 1047 // DCGM_FI_PROF_NVLINK_L4_TX_BYTES DCGM_FI_PROF_NVLINK_L4_TX_BYTES Short = 1048 // DCGM_FI_PROF_NVLINK_L4_RX_BYTES DCGM_FI_PROF_NVLINK_L4_RX_BYTES Short = 1049 // DCGM_FI_PROF_NVLINK_L5_TX_BYTES DCGM_FI_PROF_NVLINK_L5_TX_BYTES Short = 1050 // DCGM_FI_PROF_NVLINK_L5_RX_BYTES DCGM_FI_PROF_NVLINK_L5_RX_BYTES Short = 1051 // DCGM_FI_PROF_NVLINK_L6_TX_BYTES DCGM_FI_PROF_NVLINK_L6_TX_BYTES Short = 1052 // DCGM_FI_PROF_NVLINK_L6_RX_BYTES DCGM_FI_PROF_NVLINK_L6_RX_BYTES Short = 1053 // DCGM_FI_PROF_NVLINK_L7_TX_BYTES DCGM_FI_PROF_NVLINK_L7_TX_BYTES Short = 1054 // DCGM_FI_PROF_NVLINK_L7_RX_BYTES DCGM_FI_PROF_NVLINK_L7_RX_BYTES Short = 1055 // DCGM_FI_PROF_NVLINK_L8_TX_BYTES DCGM_FI_PROF_NVLINK_L8_TX_BYTES Short = 1056 // DCGM_FI_PROF_NVLINK_L8_RX_BYTES DCGM_FI_PROF_NVLINK_L8_RX_BYTES Short = 1057 // DCGM_FI_PROF_NVLINK_L9_TX_BYTES DCGM_FI_PROF_NVLINK_L9_TX_BYTES Short = 1058 // DCGM_FI_PROF_NVLINK_L9_RX_BYTES DCGM_FI_PROF_NVLINK_L9_RX_BYTES Short = 1059 // DCGM_FI_PROF_NVLINK_L10_TX_BYTES DCGM_FI_PROF_NVLINK_L10_TX_BYTES Short = 1060 // DCGM_FI_PROF_NVLINK_L10_RX_BYTES DCGM_FI_PROF_NVLINK_L10_RX_BYTES Short = 1061 // DCGM_FI_PROF_NVLINK_L11_TX_BYTES DCGM_FI_PROF_NVLINK_L11_TX_BYTES Short = 1062 // DCGM_FI_PROF_NVLINK_L11_RX_BYTES DCGM_FI_PROF_NVLINK_L11_RX_BYTES Short = 1063 // DCGM_FI_PROF_NVLINK_L12_TX_BYTES DCGM_FI_PROF_NVLINK_L12_TX_BYTES Short = 1064 // DCGM_FI_PROF_NVLINK_L12_RX_BYTES DCGM_FI_PROF_NVLINK_L12_RX_BYTES Short = 1065 // DCGM_FI_PROF_NVLINK_L13_TX_BYTES DCGM_FI_PROF_NVLINK_L13_TX_BYTES Short = 1066 // 
DCGM_FI_PROF_NVLINK_L13_RX_BYTES DCGM_FI_PROF_NVLINK_L13_RX_BYTES Short = 1067 // DCGM_FI_PROF_NVLINK_L14_TX_BYTES DCGM_FI_PROF_NVLINK_L14_TX_BYTES Short = 1068 // DCGM_FI_PROF_NVLINK_L14_RX_BYTES DCGM_FI_PROF_NVLINK_L14_RX_BYTES Short = 1069 // DCGM_FI_PROF_NVLINK_L15_TX_BYTES DCGM_FI_PROF_NVLINK_L15_TX_BYTES Short = 1070 // DCGM_FI_PROF_NVLINK_L15_RX_BYTES DCGM_FI_PROF_NVLINK_L15_RX_BYTES Short = 1071 // DCGM_FI_PROF_NVLINK_L16_TX_BYTES DCGM_FI_PROF_NVLINK_L16_TX_BYTES Short = 1072 // DCGM_FI_PROF_NVLINK_L16_RX_BYTES DCGM_FI_PROF_NVLINK_L16_RX_BYTES Short = 1073 // DCGM_FI_PROF_NVLINK_L17_TX_BYTES DCGM_FI_PROF_NVLINK_L17_TX_BYTES Short = 1074 // DCGM_FI_PROF_NVLINK_L17_RX_BYTES DCGM_FI_PROF_NVLINK_L17_RX_BYTES Short = 1075 // DCGM_FI_PROF_C2C_TX_ALL_BYTES represents / DCGM_FI_PROF_C2C_TX_ALL_BYTES Short = 1076 // DCGM_FI_PROF_C2C_TX_DATA_BYTES represents / DCGM_FI_PROF_C2C_TX_DATA_BYTES Short = 1077 // DCGM_FI_PROF_C2C_RX_ALL_BYTES represents / DCGM_FI_PROF_C2C_RX_ALL_BYTES Short = 1078 // DCGM_FI_PROF_C2C_RX_DATA_BYTES represents / DCGM_FI_PROF_C2C_RX_DATA_BYTES Short = 1079 // DCGM_FI_PROF_HOSTMEM_CACHE_HIT represents / DCGM_FI_PROF_HOSTMEM_CACHE_HIT Short = 1080 // DCGM_FI_PROF_HOSTMEM_CACHE_MISS represents / DCGM_FI_PROF_HOSTMEM_CACHE_MISS Short = 1081 // DCGM_FI_PROF_PEERMEM_CACHE_HIT represents / DCGM_FI_PROF_PEERMEM_CACHE_HIT Short = 1082 // DCGM_FI_PROF_PEERMEM_CACHE_MISS represents / DCGM_FI_PROF_PEERMEM_CACHE_MISS Short = 1083 // DCGM_FI_DEV_CPU_UTIL_TOTAL represents / DCGM_FI_DEV_CPU_UTIL_TOTAL Short = 1100 // DCGM_FI_DEV_CPU_UTIL_USER represents / DCGM_FI_DEV_CPU_UTIL_USER Short = 1101 // DCGM_FI_DEV_CPU_UTIL_NICE represents / DCGM_FI_DEV_CPU_UTIL_NICE Short = 1102 // DCGM_FI_DEV_CPU_UTIL_SYS represents / DCGM_FI_DEV_CPU_UTIL_SYS Short = 1103 // DCGM_FI_DEV_CPU_UTIL_IRQ represents / DCGM_FI_DEV_CPU_UTIL_IRQ Short = 1104 // DCGM_FI_DEV_CPU_TEMP_CURRENT represents / DCGM_FI_DEV_CPU_TEMP_CURRENT Short = 1110 // DCGM_FI_DEV_CPU_TEMP_WARNING represents / 
DCGM_FI_DEV_CPU_TEMP_WARNING Short = 1111 // DCGM_FI_DEV_CPU_TEMP_CRITICAL represents / DCGM_FI_DEV_CPU_TEMP_CRITICAL Short = 1112 // DCGM_FI_DEV_CPU_CLOCK_CURRENT represents / DCGM_FI_DEV_CPU_CLOCK_CURRENT Short = 1120 // DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT represents / DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT Short = 1130 // DCGM_FI_DEV_CPU_POWER_LIMIT represents / DCGM_FI_DEV_CPU_POWER_LIMIT Short = 1131 // DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT represents / DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT Short = 1132 // DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT represents / DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT Short = 1133 // DCGM_FI_DEV_CPU_VENDOR represents / DCGM_FI_DEV_CPU_VENDOR Short = 1140 // DCGM_FI_DEV_CPU_MODEL represents / DCGM_FI_DEV_CPU_MODEL Short = 1141 // DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS represents / DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS Short = 1200 // DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES represents / DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES Short = 1201 // DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS Short = 1202 // DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES represents / DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES Short = 1203 // DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS Short = 1204 // DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS Short = 1205 // DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS Short = 1206 // DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS Short = 1207 // DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS Short = 1208 // DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS Short = 1209 // DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS represents / DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS Short = 1210 // 
DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS represents / DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS Short = 1211 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS represents / DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS Short = 1212 // DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS represents / DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS Short = 1213 // DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS Short = 1214 // DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER represents / DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER Short = 1215 // DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT represents / DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT Short = 1216 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER represents / DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER Short = 1217 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT represents / DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT Short = 1218 // DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS represents / DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS Short = 1219 // DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL represents / DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL Short = 1220 // DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID represents / DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID Short = 1300 // DCGM_FI_DEV_CONNECTX_HEALTH represents / DCGM_FI_DEV_CONNECTX_HEALTH Short = 1300 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH represents / DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH Short = 1301 // DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED represents / DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED Short = 1302 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH represents / DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH Short = 1303 // DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED represents / DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED Short = 1304 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS represents / DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS Short = 1305 // DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK 
represents / DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK Short = 1306 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS represents / DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS Short = 1307 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK represents / DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK Short = 1308 // DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY represents / DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY Short = 1309 // DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE represents / DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE Short = 1310 // DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID represents / DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID Short = 1399 // DCGM_FI_DEV_C2C_LINK_ERROR_INTR represents / DCGM_FI_DEV_C2C_LINK_ERROR_INTR Short = 1400 // DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY represents / DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY Short = 1401 // DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B represents / DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B Short = 1402 // DCGM_FI_DEV_C2C_LINK_POWER_STATE represents / DCGM_FI_DEV_C2C_LINK_POWER_STATE Short = 1403 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 Short = 1404 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 Short = 1405 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 Short = 1406 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 Short = 1407 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 Short = 1408 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 Short = 1409 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 Short = 1410 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 Short = 1411 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 Short = 1412 // 
DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 Short = 1413 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 Short = 1414 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 Short = 1415 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 Short = 1416 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 Short = 1417 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 Short = 1418 // DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 represents / DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 Short = 1419 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS Short = 1420 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS Short = 1421 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS Short = 1422 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS Short = 1423 // DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS represents / DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS Short = 1424 // DCGM_FI_DEV_PWR_SMOOTHING_ENABLED represents / DCGM_FI_DEV_PWR_SMOOTHING_ENABLED Short = 1425 // DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL represents / DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL Short = 1426 // DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED represents / DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED Short = 1427 // DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL represents / DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL Short = 1428 // DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR represents / DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR Short = 1429 // 
DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING represents / DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING Short = 1430 // DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING represents / DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING Short = 1431 // DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING represents / DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING Short = 1432 // DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES represents / DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES Short = 1433 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR represents / DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR Short = 1434 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE represents / DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE Short = 1435 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE represents / DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE Short = 1436 // DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL represents / DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL Short = 1437 // DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE represents / DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE Short = 1438 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR represents / DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR Short = 1439 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE represents / DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE Short = 1440 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE represents / DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE Short = 1441 // DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL represents / DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL Short = 1442 // DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS represents / DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS Short = 1501 // DCGM_FI_IMEX_DOMAIN_STATUS represents / DCGM_FI_IMEX_DOMAIN_STATUS Short = 1502 // 
DCGM_FI_IMEX_DAEMON_STATUS represents / DCGM_FI_IMEX_DAEMON_STATUS Short = 1503 // DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG represents / DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG Short = 1507 // DCGM_FI_DEV_NVLINK_GET_STATE represents / DCGM_FI_DEV_NVLINK_GET_STATE Short = 1508 // DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT represents / DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT Short = 1509 // DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION represents / DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION Short = 1523 )
func GetFieldID ¶
GetFieldID returns the DCGM field ID for a given field name and whether it was found. It first checks the current field IDs, then falls back to the legacy field IDs if the name is not found.
func GetFieldIDOrPanic ¶
GetFieldIDOrPanic returns the DCGM field ID for a given field name. It panics if the field name is not found in either the current or the legacy map.
type Status ¶
// Status represents the current resource utilization of the DCGM hostengine
// process: its memory usage and its CPU utilization.
type Status struct {
// Memory is the current memory usage of the DCGM hostengine, in kilobytes.
Memory int64
// CPU is the current CPU utilization of the DCGM hostengine, expressed as a
// percentage in the range 0-100.
CPU float64
}
Status represents the current resource utilization of the DCGM hostengine process
func Introspect ¶
Introspect returns memory and CPU usage statistics for the DCGM hostengine
type SystemWatch ¶
// SystemWatch represents a single health watch system together with its
// status. All three fields are plain strings.
type SystemWatch struct {
// Type identifies the type of health watch system.
Type string
// Status indicates the current health status.
Status string
// Error contains any error message when the status is not healthy.
// NOTE(review): presumably empty when the system is healthy — confirm
// against the code that populates this struct.
Error string
}
SystemWatch represents a health watch system and its status
type ThermalPolicyCondition ¶
// ThermalPolicyCondition contains details about a thermal violation.
type ThermalPolicyCondition struct {
// ThermalViolation indicates the severity of the thermal violation.
// NOTE(review): the unit/scale of this value is not stated in this
// declaration — confirm against the code that populates it.
ThermalViolation uint
}
ThermalPolicyCondition contains details about a thermal violation
type UtilizationInfo ¶
// UtilizationInfo contains GPU utilization metrics. Every field is a
// percentage, as marked by the per-field annotations.
// NOTE(review): whether Memory means capacity in use or memory-bandwidth
// utilization is not stated here — confirm against the code that fills it in.
type UtilizationInfo struct {
GPU int64 // GPU utilization, in percent (%)
Memory int64 // Memory utilization, in percent (%)
Encoder int64 // Encoder utilization, in percent (%)
Decoder int64 // Decoder utilization, in percent (%)
}
UtilizationInfo contains GPU utilization metrics
type ViolationTime ¶
// ViolationTime measures the amount of time (in ms) the GPU was at reduced
// clocks, broken down by the cause of the throttling.
//
// Every field is a *uint64 rather than a plain uint64, so a nil field can be
// told apart from a zero duration.
// NOTE(review): presumably nil means the value was unavailable for the
// device — confirm against the code that populates this struct.
type ViolationTime struct {
// Power is the time spent throttling due to power constraints.
Power *uint64
// Thermal is the time spent throttling due to thermal constraints.
Thermal *uint64
// Reliability is the time spent throttling due to reliability constraints.
Reliability *uint64
// BoardLimit is the time spent throttling due to board limit constraints.
BoardLimit *uint64
// LowUtilization is the time spent throttling due to low utilization.
LowUtilization *uint64
// SyncBoost is the time spent throttling due to sync boost.
SyncBoost *uint64
}
ViolationTime measures the amount of time (in ms) that the GPU was at reduced clocks
type XIDErrorInfo ¶
// XIDErrorInfo contains information about XID errors: how many occurred and
// when.
type XIDErrorInfo struct {
// NumErrors is the number of XID errors that occurred.
NumErrors int
// Timestamp contains the timestamps of when the XID errors occurred.
// NOTE(review): presumably one entry per error (len(Timestamp) == NumErrors);
// the timestamp unit/epoch is not stated here — confirm against the
// code that populates this struct.
Timestamp []uint64
}
XIDErrorInfo contains information about XID errors
type XidPolicyCondition ¶
// XidPolicyCondition contains details about an XID error.
type XidPolicyCondition struct {
// ErrNum is the XID error number.
ErrNum uint
}
XidPolicyCondition contains details about an XID error