dcgm

package
v0.0.0-...-6cbb046 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 15, 2026 License: Apache-2.0 Imports: 22 Imported by: 27

Documentation

Overview

Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)

Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)

Index

Constants

View Source
const (
	// Embedded starts the hostengine within this process
	Embedded mode = iota
	// Standalone connects to an already running nv-hostengine
	Standalone
	// StartHostengine starts and connects to nv-hostengine, and terminates it before exiting
	StartHostengine
)

Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.

View Source
const (
	// DCGM_FT_BINARY is the field type for binary data (the character code of 'b')
	DCGM_FT_BINARY = uint('b')
	// DCGM_FT_DOUBLE is the field type for floating-point numbers (the character code of 'd')
	DCGM_FT_DOUBLE = uint('d')
	// DCGM_FT_INT64 is the field type for 64-bit integers (the character code of 'i')
	DCGM_FT_INT64 = uint('i')
	// DCGM_FT_STRING is the field type for strings (the character code of 's')
	DCGM_FT_STRING = uint('s')
	// DCGM_FT_TIMESTAMP is the field type for timestamps (the character code of 't')
	DCGM_FT_TIMESTAMP = uint('t')
	// DCGM_FT_INT32_BLANK is the blank value for 32-bit integers (0x7ffffff0)
	DCGM_FT_INT32_BLANK = int64(2147483632)
	// DCGM_FT_INT32_NOT_FOUND is the value for not found in 32-bit integers
	DCGM_FT_INT32_NOT_FOUND = DCGM_FT_INT32_BLANK + 1
	// DCGM_FT_INT32_NOT_SUPPORTED is the value for not supported in 32-bit integers
	DCGM_FT_INT32_NOT_SUPPORTED = DCGM_FT_INT32_BLANK + 2
	// DCGM_FT_INT32_NOT_PERMISSIONED is the value for not permissioned in 32-bit integers
	DCGM_FT_INT32_NOT_PERMISSIONED = DCGM_FT_INT32_BLANK + 3
	// DCGM_FT_INT64_BLANK is the blank value for 64-bit integers (0x7ffffffffffffff0)
	DCGM_FT_INT64_BLANK = int64(9223372036854775792)
	// DCGM_FT_INT64_NOT_FOUND is the value for not found in 64-bit integers
	DCGM_FT_INT64_NOT_FOUND = DCGM_FT_INT64_BLANK + 1
	// DCGM_FT_INT64_NOT_SUPPORTED is the value for not supported in 64-bit integers
	DCGM_FT_INT64_NOT_SUPPORTED = DCGM_FT_INT64_BLANK + 2
	// DCGM_FT_INT64_NOT_PERMISSIONED is the value for not permissioned in 64-bit integers
	DCGM_FT_INT64_NOT_PERMISSIONED = DCGM_FT_INT64_BLANK + 3
	// DCGM_FT_FP64_BLANK is the blank value for floating-point numbers (2^47)
	DCGM_FT_FP64_BLANK = 140737488355328.0
	// DCGM_FT_FP64_NOT_FOUND is the value for not found in floating-point numbers
	DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0)
	// DCGM_FT_FP64_NOT_SUPPORTED is the value for not supported in floating-point numbers
	DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0)
	// DCGM_FT_FP64_NOT_PERMISSIONED is the value for not permissioned in floating-point numbers
	DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0)
	// DCGM_FT_STR_BLANK is the blank value for strings
	DCGM_FT_STR_BLANK = "<<<NULL>>>"
	// DCGM_FT_STR_NOT_FOUND is the value for not found in strings
	DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>"
	// DCGM_FT_STR_NOT_SUPPORTED is the value for not supported in strings
	DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>"
	// DCGM_FT_STR_NOT_PERMISSIONED is the value for not permissioned in strings
	DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERMISSIONED>>>"

	// The DCGM_ST_* constants below are DCGM status codes; they are not specific to ECC.
	// NOTE(review): they appear to mirror the C dcgmReturn_t values — confirm against dcgm_structs.h.
	// DCGM_ST_OK is the status code for OK
	DCGM_ST_OK = 0
	// DCGM_ST_BADPARAM is the status code for a bad parameter
	DCGM_ST_BADPARAM = -1
	// DCGM_ST_GENERIC_ERROR is the status code for a generic error
	DCGM_ST_GENERIC_ERROR = -3
	// DCGM_ST_MEMORY is the status code for a memory error
	DCGM_ST_MEMORY = -4
	// DCGM_ST_NOT_CONFIGURED is the status code for not configured
	DCGM_ST_NOT_CONFIGURED = -5
	// DCGM_ST_NOT_SUPPORTED is the status code for not supported
	DCGM_ST_NOT_SUPPORTED = -6
	// DCGM_ST_INIT_ERROR is the status code for an initialization error
	DCGM_ST_INIT_ERROR = -7
	// DCGM_ST_NVML_ERROR is the status code for an NVML error
	DCGM_ST_NVML_ERROR = -8
	// DCGM_ST_PENDING is the status code for pending
	DCGM_ST_PENDING = -9
	// DCGM_ST_TIMEOUT is the status code for a timeout
	DCGM_ST_TIMEOUT = -11
	// DCGM_ST_VER_MISMATCH is the status code for a version mismatch
	DCGM_ST_VER_MISMATCH = -12
	// DCGM_ST_UNKNOWN_FIELD is the status code for an unknown field
	DCGM_ST_UNKNOWN_FIELD = -13
	// DCGM_ST_NO_DATA is the status code for no data
	DCGM_ST_NO_DATA = -14
	// DCGM_ST_STALE_DATA is the status code for stale data
	DCGM_ST_STALE_DATA = -15
	// DCGM_ST_NOT_WATCHED is the status code for not watched
	DCGM_ST_NOT_WATCHED = -16
	// DCGM_ST_NO_PERMISSION is the status code for no permission
	DCGM_ST_NO_PERMISSION = -17
	// DCGM_ST_GPU_IS_LOST is the status code for GPU is lost
	DCGM_ST_GPU_IS_LOST = -18
	// DCGM_ST_RESET_REQUIRED is the status code for reset required
	DCGM_ST_RESET_REQUIRED = -19
	// DCGM_ST_FUNCTION_NOT_FOUND is the status code for function not found
	DCGM_ST_FUNCTION_NOT_FOUND = -20
	// DCGM_ST_CONNECTION_NOT_VALID is the status code for connection not valid
	DCGM_ST_CONNECTION_NOT_VALID = -21
	// DCGM_ST_GPU_NOT_SUPPORTED is the status code for GPU not supported
	DCGM_ST_GPU_NOT_SUPPORTED = -22
	// DCGM_ST_GROUP_INCOMPATIBLE is the status code for group incompatible
	DCGM_ST_GROUP_INCOMPATIBLE = -23
	// DCGM_ST_MAX_LIMIT is the status code for max limit
	DCGM_ST_MAX_LIMIT = -24
	// DCGM_ST_LIBRARY_NOT_FOUND is the status code for library not found
	DCGM_ST_LIBRARY_NOT_FOUND = -25
	// DCGM_ST_DUPLICATE_KEY is the status code for duplicate key
	DCGM_ST_DUPLICATE_KEY = -26
	// DCGM_ST_GPU_IN_SYNC_BOOST_GROUP is the status code for GPU in sync boost group
	DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
	// DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP is the status code for GPU not in sync boost group
	DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
	// DCGM_ST_REQUIRES_ROOT is the status code for requires root
	DCGM_ST_REQUIRES_ROOT = -29
	// DCGM_ST_NVVS_ERROR is the status code for an NVVS error
	DCGM_ST_NVVS_ERROR = -30
	// DCGM_ST_INSUFFICIENT_SIZE is the status code for insufficient size
	DCGM_ST_INSUFFICIENT_SIZE = -31
	// DCGM_ST_FIELD_UNSUPPORTED_BY_API is the status code for field unsupported by API
	DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
	// DCGM_ST_MODULE_NOT_LOADED is the status code for module not loaded
	DCGM_ST_MODULE_NOT_LOADED = -33
	// DCGM_ST_IN_USE is the status code for in use
	DCGM_ST_IN_USE = -34
	// DCGM_ST_GROUP_IS_EMPTY is the status code for group is empty
	DCGM_ST_GROUP_IS_EMPTY = -35
	// DCGM_ST_PROFILING_NOT_SUPPORTED is the status code for profiling not supported
	DCGM_ST_PROFILING_NOT_SUPPORTED = -36
	// DCGM_ST_PROFILING_LIBRARY_ERROR is the status code for a profiling library error
	DCGM_ST_PROFILING_LIBRARY_ERROR = -37
	// DCGM_ST_PROFILING_MULTI_PASS is the status code for profiling multi pass
	DCGM_ST_PROFILING_MULTI_PASS = -38
	// DCGM_ST_DIAG_ALREADY_RUNNING is the status code for diagnostic already running
	DCGM_ST_DIAG_ALREADY_RUNNING = -39
	// DCGM_ST_DIAG_BAD_JSON is the status code for diagnostic bad JSON
	DCGM_ST_DIAG_BAD_JSON = -40
	// DCGM_ST_DIAG_BAD_LAUNCH is the status code for diagnostic bad launch
	DCGM_ST_DIAG_BAD_LAUNCH = -41
	// DCGM_ST_DIAG_UNUSED is the status code for diagnostic unused
	DCGM_ST_DIAG_UNUSED = -42
	// DCGM_ST_DIAG_THRESHOLD_EXCEEDED is the status code for diagnostic threshold exceeded
	DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
	// DCGM_ST_INSUFFICIENT_DRIVER_VERSION is the status code for insufficient driver version
	DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
	// DCGM_ST_INSTANCE_NOT_FOUND is the status code for instance not found
	DCGM_ST_INSTANCE_NOT_FOUND = -45
	// DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND is the status code for compute instance not found
	DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
	// DCGM_ST_CHILD_NOT_KILLED is the status code for child not killed
	DCGM_ST_CHILD_NOT_KILLED = -47
	// DCGM_ST_3RD_PARTY_LIBRARY_ERROR is the status code for a 3rd party library error
	DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
	// DCGM_ST_INSUFFICIENT_RESOURCES is the status code for insufficient resources
	DCGM_ST_INSUFFICIENT_RESOURCES = -49
	// DCGM_ST_PLUGIN_EXCEPTION is the status code for a plugin exception
	DCGM_ST_PLUGIN_EXCEPTION = -50
	// DCGM_ST_NVVS_ISOLATE_ERROR is the status code for an NVVS isolate error
	DCGM_ST_NVVS_ISOLATE_ERROR = -51
	// DCGM_ST_NVVS_BINARY_NOT_FOUND is the status code for NVVS binary not found
	DCGM_ST_NVVS_BINARY_NOT_FOUND = -52
	// DCGM_ST_NVVS_KILLED is the status code for NVVS killed
	DCGM_ST_NVVS_KILLED = -53
	// DCGM_ST_PAUSED is the status code for paused
	DCGM_ST_PAUSED = -54
	// DCGM_ST_ALREADY_INITIALIZED is the status code for already initialized
	DCGM_ST_ALREADY_INITIALIZED = -55
	// DCGM_ST_NVML_NOT_LOADED is the status code for NVML not loaded
	DCGM_ST_NVML_NOT_LOADED = -56
	// DCGM_ST_NVML_DRIVER_TIMEOUT is the status code for NVML driver timeout
	DCGM_ST_NVML_DRIVER_TIMEOUT = -57
	// DCGM_ST_NVVS_NO_AVAILABLE_TEST is the status code for NVVS no available test
	DCGM_ST_NVVS_NO_AVAILABLE_TEST = -58
	// DCGM_ST_UNINITIALIZED is the status code for DCGM not initialized
	DCGM_ST_UNINITIALIZED = -59
	// DCGM_ST_NO_NVVS is the status code for NVVS not available
	DCGM_ST_NO_NVVS = -60
	// DCGM_ST_NVVS_NOT_RUNNING is the status code for NVVS not running
	DCGM_ST_NVVS_NOT_RUNNING = -61
	// DCGM_ST_CHILD_SPAWN_FAILED is the status code for child spawn failed
	DCGM_ST_CHILD_SPAWN_FAILED = -62
	// DCGM_ST_FILE_IO_ERROR is the status code for a file I/O error
	DCGM_ST_FILE_IO_ERROR = -63
	// DCGM_ST_CHILD_SIGNAL_RECEIVED is the status code for child signal received
	DCGM_ST_CHILD_SIGNAL_RECEIVED = -64
	// DCGM_ST_CALLER_ALREADY_STOPPED is the status code for caller already stopped
	DCGM_ST_CALLER_ALREADY_STOPPED = -65
	// DCGM_ST_DIAG_STOPPED is the status code for diagnostic stopped
	DCGM_ST_DIAG_STOPPED = -66
)

Field type constants (DCGM_FT_*), their blank/sentinel values, and DCGM status codes (DCGM_ST_*)

View Source
// CPU-related limits; all but the last are taken from the DCGM C headers.
const (
	// MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported
	MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES)

	// MAX_NUM_CPUS represents the maximum number of CPUs supported
	MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS)

	// CHAR_BIT represents the number of bits in a byte
	CHAR_BIT = uint(C.CHAR_BIT)

	// MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks.
	// The expression evaluates to 16. NOTE(review): presumably 1024 cores packed into
	// 8-byte words of 8 bits each — confirm against the C header.
	MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8)
)
View Source
// Performance state bounds. A lower number is a higher performance state (P0 is the maximum).
const (
	// PerfStateMax represents the highest performance state (P0)
	PerfStateMax = 0

	// PerfStateMin represents the lowest performance state (P15)
	PerfStateMin = 15

	// PerfStateUnknown represents an unknown performance state
	PerfStateUnknown = 32
)
View Source
// Device and MIG hierarchy limits, taken from the DCGM C headers.
const (
	// MAX_NUM_DEVICES represents the maximum number of GPU devices supported
	MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES)

	// MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information
	MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO)
)
View Source
// Policy condition types. Each value is a PolicyCondition wrapping a human-readable label.
const (
	// DbePolicy represents a Double-bit ECC error policy condition
	DbePolicy = PolicyCondition("Double-bit ECC error")

	// PCIePolicy represents a PCI error policy condition
	PCIePolicy = PolicyCondition("PCI error")

	// MaxRtPgPolicy represents a Maximum Retired Pages Limit policy condition
	MaxRtPgPolicy = PolicyCondition("Max Retired Pages Limit")

	// ThermalPolicy represents a Thermal Limit policy condition
	ThermalPolicy = PolicyCondition("Thermal Limit")

	// PowerPolicy represents a Power Limit policy condition
	PowerPolicy = PolicyCondition("Power Limit")

	// NvlinkPolicy represents an NVLink error policy condition
	NvlinkPolicy = PolicyCondition("Nvlink Error")

	// XidPolicy represents an XID error policy condition
	XidPolicy = PolicyCondition("XID Error")
)

Policy condition types

View Source
// Default policy thresholds matching dcgmi defaults.
const (
	// DefaultMaxRetiredPages is the default threshold for retired pages (matches dcgmi default)
	DefaultMaxRetiredPages = 10

	// DefaultMaxTemperature is the default threshold for temperature in Celsius (matches dcgmi default)
	DefaultMaxTemperature = 100

	// DefaultMaxPower is the default threshold for power in Watts (matches dcgmi default)
	DefaultMaxPower = 250
)

Default policy thresholds matching dcgmi defaults

View Source
// Names of environment variables recognized by DCGM.
const (
	// DCGM_NVSDM_MOCK_YAML is the environment variable for enabling NVSDM mock configuration
	DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML"
	// DCGM_DBG_FILE is the environment variable that makes DCGM write debug logs to a specific file
	DCGM_DBG_FILE = "__DCGM_DBG_FILE"
	// DCGM_DBG_LVL is the environment variable that sets the DCGM logging level
	DCGM_DBG_LVL = "__DCGM_DBG_LVL"
)
View Source
const (
	// DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields (bit 0, value 0x1).
	// NOTE(review): presumably passed as the flags argument of EntitiesGetLatestValues — confirm.
	DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)

DCGM_FV_FLAG_LIVE_DATA is a flag for the DCGM fields.

View Source
const (
	// DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group.
	// It takes its value from the C constant DCGM_GROUP_MAX_ENTITIES_V2.
	DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)

DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group

View Source
// DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings
const DIAG_RESULT_STRING_SIZE = 1024

DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings

Variables

View Source
// ErrInvalidMode represents an error indicating that an invalid mode was used
var ErrInvalidMode = errors.New("invalid mode")

ErrInvalidMode represents an error indicating that an invalid mode was used

Functions

func AddEntityToGroup

func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error)

AddEntityToGroup adds an entity to an existing group

func AddLinkEntityToGroup

func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, parentID uint) (err error)

AddLinkEntityToGroup adds a link entity to the group

func AddToGroup

func AddToGroup(groupID GroupHandle, gpuID uint) (err error)

AddToGroup adds a GPU to an existing group

func AttachDriver

func AttachDriver() error

AttachDriver attaches the driver to DCGM. This is used to reattach the driver after a DetachDriver call, typically when updating the driver without restarting DCGM. Requires DCGM 4.5.0 or later.

func ClearPolicyForGroup

func ClearPolicyForGroup(group GroupHandle) error

ClearPolicyForGroup clears all policy conditions for a GPU group

func CreateFakeEntities

func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error)

CreateFakeEntities creates test entities with the specified MIG hierarchy information. This function is intended for testing purposes only. Returns a slice of Entity IDs for the created entities and any error encountered.

func DestroyGroup

func DestroyGroup(groupID GroupHandle) (err error)

DestroyGroup destroys an existing GPU group

func DetachDriver

func DetachDriver() error

DetachDriver detaches the driver from DCGM. This is used when you want to update the driver without restarting DCGM. After detaching, GPUs will not be accessible until AttachDriver is called. Requires DCGM 4.5.0 or later.

func FieldGroupDestroy

func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)

FieldGroupDestroy destroys a previously created field group. Returns an error if the group cannot be destroyed.

func FieldsInit

func FieldsInit() int

FieldsInit initializes the DCGM fields module. Returns an integer status code.

func FieldsTerm

func FieldsTerm() int

FieldsTerm terminates the DCGM fields module. Returns an integer status code.

func FindFirstNonAsciiIndex

func FindFirstNonAsciiIndex(value [4096]byte) int

FindFirstNonAsciiIndex returns the index of the first non-ASCII character in the byte array. Returns 4096 if no non-ASCII character is found.

func Fv2_Blob

func Fv2_Blob(fv FieldValue_v2) [4096]byte

Fv2_Blob returns the raw field value of a FieldValue_v2 as a byte array.

func Fv2_String

func Fv2_String(fv FieldValue_v2) string

Fv2_String returns the string value of a FieldValue_v2.

func GetAllDeviceCount

func GetAllDeviceCount() (uint, error)

GetAllDeviceCount returns the count of all GPUs in the system

func GetEntityGroupEntities

func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error)

GetEntityGroupEntities returns all entities of the specified group type

func GetSupportedDevices

func GetSupportedDevices() ([]uint, error)

GetSupportedDevices returns a list of DCGM-supported GPU IDs

func HealthSet

func HealthSet(groupID GroupHandle, systems HealthSystem) (err error)

HealthSet enables the DCGM health check system for the given systems. It configures which health watch systems should be monitored for the specified group.

func Init

func Init(m mode, args ...string) (cleanup func(), err error)

Init starts DCGM in the specified mode. Mode can be:

  • Embedded: Start hostengine within this process
  • Standalone: Connect to an already running nv-hostengine
  • StartHostengine: Start and connect to nv-hostengine, terminate before exiting

Returns a cleanup function and any error encountered.

func InjectFieldValue

func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error

InjectFieldValue injects a test value for a specific field into DCGM's field manager. This function is intended for testing purposes only.

Parameters:

  • gpu: The GPU ID to inject the field value for
  • fieldID: The DCGM field identifier
  • fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
  • status: The status code for the field
  • ts: The timestamp for the field value
  • value: The value to inject (must match fieldType)

Returns an error if the injection fails

func IsCurrentField

func IsCurrentField(fieldName string) bool

IsCurrentField returns true if the given field name is a current field

func IsInt32Blank

func IsInt32Blank(value int) bool

IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). These values indicate that no valid data is available for the field.

func IsInt64Blank

func IsInt64Blank(value int64) bool

IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). These values indicate that no valid data is available for the field.

func IsLegacyField

func IsLegacyField(fieldName string) bool

IsLegacyField returns true if the given field name is a legacy field

func ListenForPolicyViolations

func ListenForPolicyViolations(ctx context.Context, typ ...policyCondition) (<-chan PolicyViolation, error)

ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs. Returns a channel that receives policy violations and any error encountered.

Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. When the context is cancelled, the returned channel will be closed and all resources will be automatically cleaned up.

Example:

ctx, cancel := context.WithCancel(context.Background())
defer cancel() // Ensures cleanup happens

violations, err := dcgm.ListenForPolicyViolations(ctx, dcgm.XidPolicy)
if err != nil {
    return err
}

for violation := range violations {
    // Handle violation...
}

func ListenForPolicyViolationsForGroup

func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...policyCondition) (<-chan PolicyViolation, error)

ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group. Returns a channel that receives policy violations and any error encountered.

Important: The context MUST be cancelled when monitoring is no longer needed to properly clean up resources and prevent goroutine leaks. See ListenForPolicyViolations for usage example.

func SetPolicyForGroup

func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error

SetPolicyForGroup configures policies with optional custom thresholds and actions for a GPU group

func Shutdown

func Shutdown() (err error)

Shutdown stops DCGM and destroys all connections. Returns an error if DCGM is not initialized.

func UnwatchFields

func UnwatchFields(fieldsGroup FieldHandle, group GroupHandle) error

UnwatchFields stops monitoring the specified fields for a GPU group. fieldsGroup is the handle to the field group to stop watching. group is the handle to the GPU group to stop watching.

func UpdateAllFields

func UpdateAllFields() error

UpdateAllFields forces an update of all field values. Returns an error if the update fails.

func ViolationRegistration

func ViolationRegistration(data unsafe.Pointer) int

ViolationRegistration is a Go callback function for dcgmPolicyRegister(), wrapped in C.violationNotify()

func WatchFieldsWithGroup

func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error

WatchFieldsWithGroup starts monitoring fields using default parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. Returns an error if the watch operation fails.

func WatchFieldsWithGroupEx

func WatchFieldsWithGroupEx(
	fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32,
) error

WatchFieldsWithGroupEx starts monitoring fields with custom parameters. fieldsGroup is the handle of the field group to watch. group is the group handle to associate with the watch. updateFreq is the update frequency in microseconds. maxKeepAge is the maximum age of samples to keep in seconds. maxKeepSamples is the maximum number of samples to keep. Returns an error if the watch operation fails.

func WatchPolicyViolationsForGroup

func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error)

WatchPolicyViolationsForGroup registers to receive violation notifications for a specific GPU group

Types

type CPUHierarchyCPU_v1

// CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores
type CPUHierarchyCPU_v1 struct {
	// CPUID is the unique identifier for this CPU
	CPUID uint
	// OwnedCores is a bitmask array representing the cores owned by this CPU
	// (each element is one 64-bit word of the mask)
	OwnedCores []uint64
}

CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores

type CPUHierarchy_v1

// CPUHierarchy_v1 represents version 1 of the CPU hierarchy information
type CPUHierarchy_v1 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// NumCPUs is the number of CPUs in the system
	NumCPUs uint
	// CPUs contains information about each CPU in the system.
	// The array always has MAX_NUM_CPUS slots.
	// NOTE(review): presumably only the first NumCPUs entries are populated — confirm.
	CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1
}

CPUHierarchy_v1 represents version 1 of the CPU hierarchy information

func GetCPUHierarchy

func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error)

GetCPUHierarchy retrieves the CPU hierarchy information from DCGM

type ClockInfo

// ClockInfo contains GPU clock frequencies
type ClockInfo struct {
	Cores  int64 // cores clock frequency in MHz
	Memory int64 // memory clock frequency in MHz
}

ClockInfo contains GPU clock frequencies

type DbePolicyCondition

// DbePolicyCondition contains details about a Double-bit ECC error
type DbePolicyCondition struct {
	// Location specifies where the ECC error occurred
	Location string
	// NumErrors indicates the number of errors detected
	NumErrors uint
}

DbePolicyCondition contains details about a Double-bit ECC error

type DcgmBindUnbindEventState

// DcgmBindUnbindEventState represents the state of GPU bind/unbind events
type DcgmBindUnbindEventState int

DcgmBindUnbindEventState represents the state of GPU bind/unbind events

// Values of DcgmBindUnbindEventState.
const (
	// DcgmBUEventStateSystemReinitializing indicates the system is reinitializing (GPU unbind)
	DcgmBUEventStateSystemReinitializing DcgmBindUnbindEventState = 1
	// DcgmBUEventStateSystemReinitializationCompleted indicates system reinitialization is complete (GPU bind)
	DcgmBUEventStateSystemReinitializationCompleted DcgmBindUnbindEventState = 2
)

type Device

// Device represents a GPU device and its properties
type Device struct {
	// GPU identifies the device. NOTE(review): presumably the gpuID passed to GetDeviceInfo — confirm.
	GPU           uint
	// DCGMSupported reports DCGM support for this GPU as a string; the exact values are not shown here.
	DCGMSupported string
	// UUID is the device UUID
	UUID          string
	Power         uint // W
	// PCI holds the device's PCI information
	PCI           PCIInfo
	// Identifiers holds identification strings for the device
	Identifiers   DeviceIdentifiers
	// Topology lists the device's P2P links
	Topology      []P2PLink
	// CPUAffinity describes the device's CPU affinity as a string; the format is not shown here.
	CPUAffinity   string
}

Device represents a GPU device and its properties

func GetDeviceInfo

func GetDeviceInfo(gpuID uint) (Device, error)

GetDeviceInfo returns detailed information about the specified GPU

type DeviceHealth

// DeviceHealth represents the health status of a GPU device
type DeviceHealth struct {
	// GPU is the ID of the GPU device
	GPU uint
	// Status indicates the overall health status of the GPU
	Status string
	// Watches contains the status of individual health watch systems
	Watches []SystemWatch
}

DeviceHealth represents the health status of a GPU device

func HealthCheckByGpuId

func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error)

HealthCheckByGpuId performs a health check on the specified GPU

type DeviceIdentifiers

// DeviceIdentifiers contains various identification information for a GPU device.
// All fields are plain strings; their exact formats are not shown here.
type DeviceIdentifiers struct {
	Brand               string // brand name
	Model               string // model name
	Serial              string // serial number
	Vbios               string // VBIOS identifier — presumably the VBIOS version; confirm
	InforomImageVersion string // InfoROM image version
	DriverVersion       string // driver version
}

DeviceIdentifiers contains various identification information for a GPU device

type DeviceStatus

// DeviceStatus contains comprehensive GPU device status information
type DeviceStatus struct {
	Power       float64 // power in W
	Temperature int64   // temperature in °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    int64 // fan speed in %
}

DeviceStatus contains comprehensive GPU device status information

func GetDeviceStatus

func GetDeviceStatus(gpuID uint) (DeviceStatus, error)

GetDeviceStatus returns current status information about the specified GPU

type DiagErrorDetail

// DiagErrorDetail contains detailed information about a health check error
type DiagErrorDetail struct {
	// Message contains a human-readable description of the error
	Message string
	// Code identifies the specific type of error
	Code HealthCheckErrorCode
}

DiagErrorDetail contains detailed information about a health check error

type DiagResult

// DiagResult represents the result of a single diagnostic test
type DiagResult struct {
	// Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun"
	Status string
	// TestName is the name of the diagnostic test that was run
	TestName string
	// TestOutput contains any additional output or messages from the test
	TestOutput string
	// ErrorCode is the numeric error code if the test failed
	ErrorCode uint
	// ErrorMessage contains a detailed error message if the test failed
	ErrorMessage string
	// SerialNumber is the serial number of the tested entity
	SerialNumber string
	// EntityID is the ID of the entity (presumably the tested entity — confirm)
	EntityID uint
}

DiagResult represents the result of a single diagnostic test

type DiagResults

// DiagResults contains the results of all diagnostic tests
type DiagResults struct {
	// Software contains the results of software-related diagnostic tests
	Software []DiagResult
}

DiagResults contains the results of all diagnostic tests

func RunDiag

func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error)

RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level. Parameters:

  • diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
  • groupID: The group of GPUs to run diagnostics on

Returns:

  • DiagResults containing the results of all diagnostic tests
  • error if the diagnostics failed to run

type DiagType

// DiagType represents the type of diagnostic test to run
type DiagType int

DiagType represents the type of diagnostic test to run

// Diagnostic levels accepted by RunDiag, numbered 1 (quick) through 4 (extended).
const (
	// DiagQuick represents a quick diagnostic test that performs basic health checks
	DiagQuick DiagType = 1

	// DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks
	DiagMedium DiagType = 2

	// DiagLong represents a long diagnostic test that performs extensive health checks
	DiagLong DiagType = 3

	// DiagExtended represents an extended diagnostic test that performs the most thorough system checks
	DiagExtended DiagType = 4
)

type ECCErrorsInfo

// ECCErrorsInfo contains ECC memory error counts
type ECCErrorsInfo struct {
	SingleBit int64 // count of single-bit ECC errors
	DoubleBit int64 // count of double-bit ECC errors
}

ECCErrorsInfo contains ECC memory error counts

type EntityStatus

// EntityStatus represents the status of a GPU entity
type EntityStatus uint

EntityStatus represents the status of a GPU entity

// Values of EntityStatus, as returned by GetGPUStatus.
const (
	// EntityStatusUnknown - Entity has not been referenced yet
	EntityStatusUnknown EntityStatus = 0
	// EntityStatusOk - Entity is known and OK
	EntityStatusOk EntityStatus = 1
	// EntityStatusUnsupported - Entity is unsupported by DCGM
	EntityStatusUnsupported EntityStatus = 2
	// EntityStatusInaccessible - Entity is inaccessible, usually due to cgroups
	EntityStatusInaccessible EntityStatus = 3
	// EntityStatusLost - Entity has been lost. Usually set from NVML returning NVML_ERROR_GPU_IS_LOST
	EntityStatusLost EntityStatus = 4
	// EntityStatusFake - Entity is a fake, injection-only entity for testing
	EntityStatusFake EntityStatus = 5
	// EntityStatusDisabled - Don't collect values from this GPU
	EntityStatusDisabled EntityStatus = 6
	// EntityStatusDetached - Entity is detached, not good for any uses
	EntityStatusDetached EntityStatus = 7
)

func GetGPUStatus

func GetGPUStatus(gpuID uint) EntityStatus

GetGPUStatus returns the entity status of the specified GPU

func (EntityStatus) String

func (e EntityStatus) String() string

String returns a string representation of the entity status

type Error

// Error represents an error returned by the DCGM library.
// It is used through its pointer: the Error method has a *Error receiver.
type Error struct {
	Code C.dcgmReturn_t // dcgmReturn_t value of error
	// contains filtered or unexported fields
}

Error represents an error returned by the DCGM library

func (*Error) Error

func (e *Error) Error() string

type FieldHandle

// FieldHandle represents a handle to a DCGM field group.
// The underlying handle is read and written through GetHandle and SetHandle.
type FieldHandle struct {
	// contains filtered or unexported fields
}

FieldHandle represents a handle to a DCGM field group

func FieldGroupCreate

func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)

FieldGroupCreate creates a new field group with the specified fields. fieldsGroupName is the name for the new group. fields is a slice of field IDs to include in the group. Returns the field group handle and any error encountered.

Important: Field groups must be destroyed using FieldGroupDestroy when no longer needed to prevent resource leaks in the DCGM library.

Example:

fieldGroup, err := dcgm.FieldGroupCreate("myFields", []dcgm.Short{dcgm.DCGM_FI_DEV_POWER_USAGE})
if err != nil {
    return err
}
defer dcgm.FieldGroupDestroy(fieldGroup)

// Use the field group...

func (*FieldHandle) GetHandle

func (f *FieldHandle) GetHandle() uintptr

GetHandle returns the internal DCGM field group handle as a uintptr

func (*FieldHandle) SetHandle

func (f *FieldHandle) SetHandle(val uintptr)

SetHandle sets the internal DCGM field group handle to the provided value

type FieldMeta

// FieldMeta represents metadata about a DCGM field, including its identifier,
// type, size, and other attributes.
type FieldMeta struct {
	FieldID     Short              // Unique identifier for the field
	FieldType   byte               // Type of the field (e.g., integer, float, string)
	Size        byte               // Size of the field in bytes
	Tag         string             // Human-readable tag/name for the field
	Scope       int                // Scope of the field
	NvmlFieldID int                // Corresponding NVML field identifier
	EntityLevel Field_Entity_Group // Entity level/group this field belongs to
}

FieldMeta represents metadata about a DCGM field, including its identifier, type, size, and other attributes. This struct is used to describe the characteristics and properties of fields that can be monitored or queried through DCGM.

func FieldGetByID

func FieldGetByID(fieldId Short) FieldMeta

FieldGetByID retrieves field metadata for the specified field ID.

func ToFieldMeta

func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta

ToFieldMeta converts a C DCGM field metadata structure to a Go FieldMeta struct.

type FieldValue_v1

// FieldValue_v1 represents a field value in version 1.
// The Int64, Float64, String and Blob methods return the value in the corresponding form.
type FieldValue_v1 struct {
	Version   uint       // version of this structure
	FieldID   Short      // DCGM field identifier
	FieldType uint       // type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
	Status    int        // status code for the field
	TS        int64      // timestamp for the field value
	Value     [4096]byte // raw field value
}

FieldValue_v1 represents a field value in version 1

func EntityGetLatestValues

func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)

EntityGetLatestValues retrieves the latest values for specified fields of any entity. entityGroup specifies the type of entity to query. entityId is the ID of the entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func GetLatestValuesForFields

func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)

GetLatestValuesForFields retrieves the most recent values for the specified fields. gpu is the ID of the GPU to query. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func LinkGetLatestValues

func LinkGetLatestValues(index uint, parentType Field_Entity_Group, parentId uint, fields []Short) ([]FieldValue_v1, error)

LinkGetLatestValues retrieves the latest values for specified fields of a link entity. index is the link index. parentType is the entity group of the parent entity. parentId is the ID of the parent entity. fields is a slice of field IDs to retrieve. Returns a slice of field values and any error encountered.

func (FieldValue_v1) Blob

func (fv FieldValue_v1) Blob() [4096]byte

Blob returns the raw field value as a byte array.

func (FieldValue_v1) Float64

func (fv FieldValue_v1) Float64() float64

Float64 returns the field value as a float64.

func (FieldValue_v1) Int64

func (fv FieldValue_v1) Int64() int64

Int64 returns the field value as an int64.

func (FieldValue_v1) String

func (fv FieldValue_v1) String() string

String returns the field value as a string.

type FieldValue_v2

type FieldValue_v2 struct {
	Version       uint
	EntityGroupId Field_Entity_Group
	EntityID      uint
	FieldID       Short
	FieldType     uint
	Status        int
	TS            int64
	Value         [4096]byte
	StringValue   *string
}

FieldValue_v2 represents a field value in version 2

func EntitiesGetLatestValues

func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)

EntitiesGetLatestValues retrieves the latest values for specified fields across multiple entities. entities is a slice of entity pairs to query. fields is a slice of field IDs to retrieve. flags specify additional options for the query. Returns a slice of field values and any error encountered.

func GetValuesSince

func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error)

GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.

gpuGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup for a specific group of GPUs, or GroupAllGPUs() can be used to target all GPUs.

fieldGroup is a FieldHandle representing the group of fields for which data is requested.

sinceTime is a time.Time value representing the timestamp from which to request updated values. A zero value (time.Time{}) requests all available data.

Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time of the latest data retrieval, and an error if there is any issue during the operation.

If the number of field values exceeds maxCallbackValues (131,072), an error is returned to prevent unbounded memory growth. To avoid this, reduce the time range, field group size, or entity count.

func (FieldValue_v2) Blob

func (fv FieldValue_v2) Blob() [4096]byte

Blob returns the raw field value as a byte array.

func (FieldValue_v2) Float64

func (fv FieldValue_v2) Float64() float64

Float64 returns the field value as a float64.

func (FieldValue_v2) Int64

func (fv FieldValue_v2) Int64() int64

Int64 returns the field value as an int64.

func (FieldValue_v2) String

func (fv FieldValue_v2) String() string

String returns the field value as a string.

type Field_Entity_Group

type Field_Entity_Group uint

Field_Entity_Group represents the type of DCGM entity

const (
	// FE_NONE represents no entity type
	FE_NONE Field_Entity_Group = iota
	// FE_GPU represents a GPU device entity
	FE_GPU
	// FE_VGPU represents a virtual GPU entity
	FE_VGPU
	// FE_SWITCH represents an NVSwitch entity
	FE_SWITCH
	// FE_GPU_I represents a GPU instance entity
	FE_GPU_I
	// FE_GPU_CI represents a GPU compute instance entity
	FE_GPU_CI
	// FE_LINK represents an NVLink entity
	FE_LINK
	// FE_CPU represents a CPU entity
	FE_CPU
	// FE_CPU_CORE represents a CPU core entity
	FE_CPU_CORE
	// FE_COUNT represents the total number of entity types
	FE_COUNT
)

func (Field_Entity_Group) String

func (e Field_Entity_Group) String() string

String returns a string representation of the Field_Entity_Group

type GroupEntityPair

type GroupEntityPair struct {
	// EntityGroupId specifies the type of the entity
	EntityGroupId Field_Entity_Group
	// EntityId is the unique identifier for this entity
	EntityId uint
}

GroupEntityPair represents a DCGM entity and its group identifier

type GroupHandle

type GroupHandle struct {
	// contains filtered or unexported fields
}

GroupHandle represents a handle to a DCGM GPU group

func CreateGroup

func CreateGroup(groupName string) (goGroupId GroupHandle, err error)

CreateGroup creates a new empty GPU group with the specified name.

Important: Groups must be destroyed using DestroyGroup when no longer needed to prevent resource leaks in the DCGM library.

Example:

group, err := dcgm.CreateGroup("myGroup")
if err != nil {
    return err
}
defer dcgm.DestroyGroup(group)

// Use the group...

func CreateGroupWithContext

func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error)

CreateGroupWithContext creates a new group with a context

func GroupAllGPUs

func GroupAllGPUs() GroupHandle

GroupAllGPUs returns a GroupHandle representing all GPUs in the system

func NewDefaultGroup

func NewDefaultGroup(groupName string) (GroupHandle, error)

NewDefaultGroup creates a new group with default GPUs and the specified name

func WatchFields

func WatchFields(gpuID uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)

WatchFields starts monitoring the specified fields for a GPU. gpuID is the ID of the GPU to monitor. fieldsGroup is the handle of the field group to watch. groupName is a name for the watch group. Returns a group handle and any error encountered.

func WatchPidFields

func WatchPidFields() (GroupHandle, error)

WatchPidFields configures DCGM to start recording stats for GPU processes. It must be called before GetProcessInfo.

Important: The returned GroupHandle should be cleaned up by calling DestroyGroup when monitoring is no longer needed to prevent resource leaks.

Example:

group, err := dcgm.WatchPidFields()
if err != nil {
    return err
}
defer dcgm.DestroyGroup(group)

// Use GetProcessInfo with the group...

func WatchPidFieldsEx

func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)

WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.

func (*GroupHandle) GetHandle

func (g *GroupHandle) GetHandle() uintptr

GetHandle returns the internal group handle value

func (*GroupHandle) SetHandle

func (g *GroupHandle) SetHandle(val uintptr)

SetHandle sets the internal group handle value

type GroupInfo

type GroupInfo struct {
	Version    uint32
	GroupName  string
	EntityList []GroupEntityPair
}

GroupInfo contains information about a DCGM group

func GetGroupInfo

func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error)

GetGroupInfo retrieves information about a DCGM group

type HealthCheckErrorCode

type HealthCheckErrorCode uint

HealthCheckErrorCode enumerates the error codes for passive and active health checks.

const (
	// DCGM_FR_OK No error
	DCGM_FR_OK HealthCheckErrorCode = 0
	// DCGM_FR_UNKNOWN Unknown error code
	DCGM_FR_UNKNOWN HealthCheckErrorCode = 1
	// DCGM_FR_UNRECOGNIZED Unrecognized error code
	DCGM_FR_UNRECOGNIZED HealthCheckErrorCode = 2
	// DCGM_FR_PCI_REPLAY_RATE Unacceptable rate of PCI errors
	DCGM_FR_PCI_REPLAY_RATE HealthCheckErrorCode = 3
	// DCGM_FR_VOLATILE_DBE_DETECTED Unacceptable rate of volatile double bit errors
	DCGM_FR_VOLATILE_DBE_DETECTED HealthCheckErrorCode = 4
	// DCGM_FR_VOLATILE_SBE_DETECTED Unacceptable rate of volatile single bit errors
	DCGM_FR_VOLATILE_SBE_DETECTED HealthCheckErrorCode = 5
	// DCGM_FR_VOLATILE_SBE_DETECTED_TS Unacceptable rate of volatile single bit errors with a timestamp
	DCGM_FR_VOLATILE_SBE_DETECTED_TS HealthCheckErrorCode = 6
	// DCGM_FR_PENDING_PAGE_RETIREMENTS Pending page retirements detected
	DCGM_FR_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 6
	// DCGM_FR_RETIRED_PAGES_LIMIT Unacceptable total page retirements detected
	DCGM_FR_RETIRED_PAGES_LIMIT HealthCheckErrorCode = 7
	// DCGM_FR_RETIRED_PAGES_DBE_LIMIT Unacceptable total page retirements due to uncorrectable errors
	DCGM_FR_RETIRED_PAGES_DBE_LIMIT HealthCheckErrorCode = 8
	// DCGM_FR_CORRUPT_INFOROM Corrupt inforom found
	DCGM_FR_CORRUPT_INFOROM HealthCheckErrorCode = 9
	// DCGM_FR_CLOCK_THROTTLE_THERMAL Clocks being throttled due to overheating
	DCGM_FR_CLOCK_THROTTLE_THERMAL HealthCheckErrorCode = 10
	// DCGM_FR_POWER_UNREADABLE Cannot get a reading for power from NVML
	DCGM_FR_POWER_UNREADABLE HealthCheckErrorCode = 11
	// DCGM_FR_CLOCK_THROTTLE_POWER Clock being throttled due to power restrictions
	DCGM_FR_CLOCK_THROTTLE_POWER HealthCheckErrorCode = 12
	// DCGM_FR_NVLINK_ERROR_THRESHOLD Unacceptable rate of NVLink errors
	DCGM_FR_NVLINK_ERROR_THRESHOLD HealthCheckErrorCode = 13
	// DCGM_FR_NVLINK_DOWN NVLink is down
	DCGM_FR_NVLINK_DOWN HealthCheckErrorCode = 14
	// DCGM_FR_NVSWITCH_FATAL_ERROR Fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_FATAL_ERROR HealthCheckErrorCode = 15
	// DCGM_FR_NVSWITCH_NON_FATAL_ERROR Non-fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_NON_FATAL_ERROR HealthCheckErrorCode = 16
	// DCGM_FR_NVSWITCH_DOWN NVSwitch is down
	DCGM_FR_NVSWITCH_DOWN HealthCheckErrorCode = 17
	// DCGM_FR_NO_ACCESS_TO_FILE Cannot access a file
	DCGM_FR_NO_ACCESS_TO_FILE HealthCheckErrorCode = 18
	// DCGM_FR_NVML_API Error occurred on an NVML API - NOT USED: DEPRECATED
	DCGM_FR_NVML_API HealthCheckErrorCode = 19
	// DCGM_FR_DEVICE_COUNT_MISMATCH Device count mismatch
	DCGM_FR_DEVICE_COUNT_MISMATCH HealthCheckErrorCode = 20
	// DCGM_FR_BAD_PARAMETER Bad parameter passed to API
	DCGM_FR_BAD_PARAMETER HealthCheckErrorCode = 21
	// DCGM_FR_CANNOT_OPEN_LIB Cannot open a library that must be accessed
	DCGM_FR_CANNOT_OPEN_LIB HealthCheckErrorCode = 22
	// DCGM_FR_DENYLISTED_DRIVER A driver on the denylist (nouveau) is active
	DCGM_FR_DENYLISTED_DRIVER HealthCheckErrorCode = 23
	// DCGM_FR_NVML_LIB_BAD NVML library is missing expected functions - NOT USED: DEPRECATED
	DCGM_FR_NVML_LIB_BAD HealthCheckErrorCode = 24
	// DCGM_FR_GRAPHICS_PROCESSES Graphics processes are active on this GPU
	DCGM_FR_GRAPHICS_PROCESSES HealthCheckErrorCode = 25
	// DCGM_FR_HOSTENGINE_CONN Bad connection to nv-hostengine - NOT USED: DEPRECATED
	DCGM_FR_HOSTENGINE_CONN HealthCheckErrorCode = 26
	// DCGM_FR_FIELD_QUERY Field query failed
	DCGM_FR_FIELD_QUERY HealthCheckErrorCode = 27
	// DCGM_FR_BAD_CUDA_ENV The environment has variables that hurt CUDA
	DCGM_FR_BAD_CUDA_ENV HealthCheckErrorCode = 28
	// DCGM_FR_PERSISTENCE_MODE Persistence mode is disabled
	DCGM_FR_PERSISTENCE_MODE HealthCheckErrorCode = 29
	// DCGM_FR_BAD_NVLINK_ENV The environment has variables that hurt NVLink
	DCGM_FR_BAD_NVLINK_ENV HealthCheckErrorCode = 29
	// DCGM_FR_LOW_BANDWIDTH The bandwidth is unacceptably low
	DCGM_FR_LOW_BANDWIDTH HealthCheckErrorCode = 30
	// DCGM_FR_HIGH_LATENCY Latency is too high
	DCGM_FR_HIGH_LATENCY HealthCheckErrorCode = 31
	// DCGM_FR_CANNOT_GET_FIELD_TAG Cannot find a tag for a field
	DCGM_FR_CANNOT_GET_FIELD_TAG HealthCheckErrorCode = 32
	// DCGM_FR_FIELD_VIOLATION The value for the specified error field is above 0
	DCGM_FR_FIELD_VIOLATION HealthCheckErrorCode = 33
	// DCGM_FR_FIELD_THRESHOLD The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD HealthCheckErrorCode = 34
	// DCGM_FR_FIELD_VIOLATION_DBL The value for the specified error field is above 0
	DCGM_FR_FIELD_VIOLATION_DBL HealthCheckErrorCode = 35
	// DCGM_FR_FIELD_THRESHOLD_DBL The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_DBL HealthCheckErrorCode = 36
	// DCGM_FR_UNSUPPORTED_FIELD_TYPE Field type cannot be supported
	DCGM_FR_UNSUPPORTED_FIELD_TYPE HealthCheckErrorCode = 37
	// DCGM_FR_FIELD_THRESHOLD_TS The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_TS HealthCheckErrorCode = 38
	// DCGM_FR_FIELD_THRESHOLD_TS_DBL The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_TS_DBL HealthCheckErrorCode = 39
	// DCGM_FR_THERMAL_VIOLATIONS Thermal violations detected
	DCGM_FR_THERMAL_VIOLATIONS HealthCheckErrorCode = 40
	// DCGM_FR_THERMAL_VIOLATIONS_TS Thermal violations detected with a timestamp
	DCGM_FR_THERMAL_VIOLATIONS_TS HealthCheckErrorCode = 41
	// DCGM_FR_TEMP_VIOLATION Temperature is too high
	DCGM_FR_TEMP_VIOLATION HealthCheckErrorCode = 42
	// DCGM_FR_THROTTLING_VIOLATION Non-benign clock throttling is occurring
	DCGM_FR_THROTTLING_VIOLATION HealthCheckErrorCode = 43
	// DCGM_FR_INTERNAL An internal error was detected
	DCGM_FR_INTERNAL HealthCheckErrorCode = 44
	// DCGM_FR_PCIE_GENERATION PCIe generation is too low
	DCGM_FR_PCIE_GENERATION HealthCheckErrorCode = 45
	// DCGM_FR_PCIE_WIDTH PCIe width is too low
	DCGM_FR_PCIE_WIDTH HealthCheckErrorCode = 46
	// DCGM_FR_ABORTED Test was aborted by a user signal
	DCGM_FR_ABORTED HealthCheckErrorCode = 47
	// DCGM_FR_TEST_DISABLED Test was disabled by a user signal
	DCGM_FR_TEST_DISABLED HealthCheckErrorCode = 48
	// DCGM_FR_CANNOT_GET_STAT Cannot get telemetry for a needed value
	DCGM_FR_CANNOT_GET_STAT HealthCheckErrorCode = 49
	// DCGM_FR_STRESS_LEVEL Stress level is too low (bad performance)
	DCGM_FR_STRESS_LEVEL HealthCheckErrorCode = 50
	// DCGM_FR_CUDA_API Error occurred on a CUDA API
	DCGM_FR_CUDA_API HealthCheckErrorCode = 51
	// DCGM_FR_FAULTY_MEMORY Faulty memory detected on this GPU
	DCGM_FR_FAULTY_MEMORY HealthCheckErrorCode = 52
	// DCGM_FR_CANNOT_SET_WATCHES Unable to set field watches in DCGM - NOT USED: DEPRECATED
	DCGM_FR_CANNOT_SET_WATCHES HealthCheckErrorCode = 53
	// DCGM_FR_CUDA_UNBOUND CUDA context is no longer bound
	DCGM_FR_CUDA_UNBOUND HealthCheckErrorCode = 54
	// DCGM_FR_ECC_DISABLED ECC memory is disabled right now
	DCGM_FR_ECC_DISABLED HealthCheckErrorCode = 55
	// DCGM_FR_MEMORY_ALLOC Cannot allocate memory on the GPU
	DCGM_FR_MEMORY_ALLOC HealthCheckErrorCode = 56
	// DCGM_FR_CUDA_DBE CUDA detected unrecoverable double-bit error
	DCGM_FR_CUDA_DBE HealthCheckErrorCode = 57
	// DCGM_FR_MEMORY_MISMATCH Memory error detected
	DCGM_FR_MEMORY_MISMATCH HealthCheckErrorCode = 58
	// DCGM_FR_CUDA_DEVICE No CUDA device discoverable for existing GPU
	DCGM_FR_CUDA_DEVICE HealthCheckErrorCode = 59
	// DCGM_FR_ECC_UNSUPPORTED ECC memory is unsupported by this SKU
	DCGM_FR_ECC_UNSUPPORTED HealthCheckErrorCode = 60
	// DCGM_FR_ECC_PENDING ECC memory is in a pending state - NOT USED: DEPRECATED
	DCGM_FR_ECC_PENDING HealthCheckErrorCode = 61
	// DCGM_FR_MEMORY_BANDWIDTH Memory bandwidth is too low
	DCGM_FR_MEMORY_BANDWIDTH HealthCheckErrorCode = 62
	// DCGM_FR_TARGET_POWER The target power is too low
	DCGM_FR_TARGET_POWER HealthCheckErrorCode = 63
	// DCGM_FR_API_FAIL The specified API call failed
	DCGM_FR_API_FAIL HealthCheckErrorCode = 64
	// DCGM_FR_API_FAIL_GPU The specified API call failed for the specified GPU
	DCGM_FR_API_FAIL_GPU HealthCheckErrorCode = 65
	// DCGM_FR_CUDA_CONTEXT Cannot create a CUDA context on this GPU
	DCGM_FR_CUDA_CONTEXT HealthCheckErrorCode = 66
	// DCGM_FR_DCGM_API DCGM API failure
	DCGM_FR_DCGM_API HealthCheckErrorCode = 67
	// DCGM_FR_CONCURRENT_GPUS Need multiple GPUs to run this test
	DCGM_FR_CONCURRENT_GPUS HealthCheckErrorCode = 68
	// DCGM_FR_TOO_MANY_ERRORS More errors than fit in the return struct - NOT USED: DEPRECATED
	DCGM_FR_TOO_MANY_ERRORS HealthCheckErrorCode = 69
	// DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD NVLink CRC error threshold violation
	DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD HealthCheckErrorCode = 70
	// DCGM_FR_NVLINK_ERROR_CRITICAL NVLink error for a field that should always be 0
	DCGM_FR_NVLINK_ERROR_CRITICAL HealthCheckErrorCode = 71
	// DCGM_FR_ENFORCED_POWER_LIMIT The enforced power limit is too low to hit the target
	DCGM_FR_ENFORCED_POWER_LIMIT HealthCheckErrorCode = 72
	// DCGM_FR_MEMORY_ALLOC_HOST Cannot allocate memory on the host
	DCGM_FR_MEMORY_ALLOC_HOST HealthCheckErrorCode = 73
	// DCGM_FR_GPU_OP_MODE Bad GPU operating mode for running plugin - NOT USED: DEPRECATED
	DCGM_FR_GPU_OP_MODE HealthCheckErrorCode = 74
	// DCGM_FR_NO_MEMORY_CLOCKS No memory clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_NO_MEMORY_CLOCKS HealthCheckErrorCode = 75
	// DCGM_FR_NO_GRAPHICS_CLOCKS No graphics clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_NO_GRAPHICS_CLOCKS HealthCheckErrorCode = 76
	// DCGM_FR_HAD_TO_RESTORE_STATE Note that we had to restore a GPU's state
	DCGM_FR_HAD_TO_RESTORE_STATE HealthCheckErrorCode = 77
	// DCGM_FR_L1TAG_UNSUPPORTED L1TAG test is unsupported by this SKU
	DCGM_FR_L1TAG_UNSUPPORTED HealthCheckErrorCode = 78
	// DCGM_FR_L1TAG_MISCOMPARE L1TAG test failed on a miscompare
	DCGM_FR_L1TAG_MISCOMPARE HealthCheckErrorCode = 79
	// DCGM_FR_ROW_REMAP_FAILURE Row remapping failed (Ampere or newer GPUs)
	DCGM_FR_ROW_REMAP_FAILURE HealthCheckErrorCode = 80
	// DCGM_FR_UNCONTAINED_ERROR Uncontained error - XID 95
	DCGM_FR_UNCONTAINED_ERROR HealthCheckErrorCode = 81
	// DCGM_FR_EMPTY_GPU_LIST No GPU information given to plugin
	DCGM_FR_EMPTY_GPU_LIST HealthCheckErrorCode = 82
	// DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS Pending page retirements due to a DBE
	DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS HealthCheckErrorCode = 83
	// DCGM_FR_UNCORRECTABLE_ROW_REMAP Uncorrectable row remapping
	DCGM_FR_UNCORRECTABLE_ROW_REMAP HealthCheckErrorCode = 84
	// DCGM_FR_PENDING_ROW_REMAP Row remapping is pending
	DCGM_FR_PENDING_ROW_REMAP HealthCheckErrorCode = 85
	// DCGM_FR_BROKEN_P2P_MEMORY_DEVICE P2P copy test detected an error writing to this GPU
	DCGM_FR_BROKEN_P2P_MEMORY_DEVICE HealthCheckErrorCode = 86
	// DCGM_FR_BROKEN_P2P_WRITER_DEVICE P2P copy test detected an error writing from this GPU
	DCGM_FR_BROKEN_P2P_WRITER_DEVICE HealthCheckErrorCode = 87
	// DCGM_FR_NVSWITCH_NVLINK_DOWN An NvLink is down for the specified NVSwitch
	DCGM_FR_NVSWITCH_NVLINK_DOWN HealthCheckErrorCode = 88
	// DCGM_FR_EUD_BINARY_PERMISSIONS EUD binary permissions are incorrect
	DCGM_FR_EUD_BINARY_PERMISSIONS HealthCheckErrorCode = 89
	// DCGM_FR_EUD_NON_ROOT_USER EUD plugin is not running as root
	DCGM_FR_EUD_NON_ROOT_USER HealthCheckErrorCode = 90
	// DCGM_FR_EUD_SPAWN_FAILURE EUD plugin failed to spawn the EUD binary
	DCGM_FR_EUD_SPAWN_FAILURE HealthCheckErrorCode = 91
	// DCGM_FR_EUD_TIMEOUT EUD plugin timed out
	DCGM_FR_EUD_TIMEOUT HealthCheckErrorCode = 92
	// DCGM_FR_EUD_ZOMBIE EUD process remains running after the plugin considers it finished
	DCGM_FR_EUD_ZOMBIE HealthCheckErrorCode = 93
	// DCGM_FR_EUD_NON_ZERO_EXIT_CODE EUD process exited with a non-zero exit code
	DCGM_FR_EUD_NON_ZERO_EXIT_CODE HealthCheckErrorCode = 94
	// DCGM_FR_EUD_TEST_FAILED EUD test failed
	DCGM_FR_EUD_TEST_FAILED HealthCheckErrorCode = 95
	// DCGM_FR_FILE_CREATE_PERMISSIONS We cannot create a file in this directory.
	DCGM_FR_FILE_CREATE_PERMISSIONS HealthCheckErrorCode = 96
	// DCGM_FR_PAUSE_RESUME_FAILED Pause/Resume failed
	DCGM_FR_PAUSE_RESUME_FAILED HealthCheckErrorCode = 97
	// DCGM_FR_PCIE_H_REPLAY_VIOLATION PCIe H replay violation
	DCGM_FR_PCIE_H_REPLAY_VIOLATION HealthCheckErrorCode = 98
	// DCGM_FR_GPU_EXPECTED_NVLINKS_UP Expected nvlinks up per gpu
	DCGM_FR_GPU_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 99
	// DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP Expected nvlinks up per nvswitch
	DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP HealthCheckErrorCode = 100
	// DCGM_FR_XID_ERROR XID error detected
	DCGM_FR_XID_ERROR HealthCheckErrorCode = 101
	// DCGM_FR_SBE_VIOLATION Single bit error detected
	DCGM_FR_SBE_VIOLATION HealthCheckErrorCode = 102
	// DCGM_FR_DBE_VIOLATION Double bit error detected
	DCGM_FR_DBE_VIOLATION HealthCheckErrorCode = 103
	// DCGM_FR_PCIE_REPLAY_VIOLATION PCIe replay errors detected
	DCGM_FR_PCIE_REPLAY_VIOLATION HealthCheckErrorCode = 104
	// DCGM_FR_SBE_THRESHOLD_VIOLATION SBE threshold violated
	DCGM_FR_SBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 105
	// DCGM_FR_DBE_THRESHOLD_VIOLATION DBE threshold violated
	DCGM_FR_DBE_THRESHOLD_VIOLATION HealthCheckErrorCode = 106
	// DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION PCIe replay count violated
	DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107
	// DCGM_FR_CUDA_FM_NOT_INITIALIZED The fabricmanager is not initialized
	DCGM_FR_CUDA_FM_NOT_INITIALIZED HealthCheckErrorCode = 108
	// DCGM_FR_SXID_ERROR NvSwitch fatal error detected
	DCGM_FR_SXID_ERROR HealthCheckErrorCode = 109
	// DCGM_FR_GFLOPS_THRESHOLD_VIOLATION GPU GFLOPs threshold violated
	DCGM_FR_GFLOPS_THRESHOLD_VIOLATION HealthCheckErrorCode = 110
	// DCGM_FR_NAN_VALUE NaN value detected on this GPU
	DCGM_FR_NAN_VALUE HealthCheckErrorCode = 111
	// DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR Fabric Manager did not finish training
	DCGM_FR_FABRIC_MANAGER_TRAINING_ERROR HealthCheckErrorCode = 112
	// DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over PCIE
	DCGM_FR_BROKEN_P2P_PCIE_MEMORY_DEVICE HealthCheckErrorCode = 113
	// DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE P2P copy test detected an error writing from this GPU over PCIE
	DCGM_FR_BROKEN_P2P_PCIE_WRITER_DEVICE HealthCheckErrorCode = 114
	// DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE P2P copy test detected an error writing to this GPU over NVLink
	DCGM_FR_BROKEN_P2P_NVLINK_MEMORY_DEVICE HealthCheckErrorCode = 115
	// DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE P2P copy test detected an error writing from this GPU over NVLink
	DCGM_FR_BROKEN_P2P_NVLINK_WRITER_DEVICE HealthCheckErrorCode = 116
	// DCGM_FR_ERROR_SENTINEL MUST BE THE LAST ERROR CODE
	DCGM_FR_ERROR_SENTINEL HealthCheckErrorCode = 117
)

type HealthResponse

type HealthResponse struct {
	// OverallHealth indicates the aggregate health status across all watches
	OverallHealth HealthResult
	// Incidents contains details about any health issues detected
	Incidents []Incident
}

HealthResponse contains the results of a health check operation

func HealthCheck

func HealthCheck(groupID GroupHandle) (HealthResponse, error)

HealthCheck checks the configured watches for any errors/failures/warnings that have occurred since the last time this check was invoked. On the first call, stateful information about all of the enabled watches within a group is created but no error results are provided. On subsequent calls, any error information will be returned.

type HealthResult

type HealthResult uint

HealthResult is the result of a health check.

const (
	// DCGM_HEALTH_RESULT_PASS All results within this system are reporting normal
	DCGM_HEALTH_RESULT_PASS HealthResult = 0
	// DCGM_HEALTH_RESULT_WARN A warning has been issued, refer to the response for more information
	DCGM_HEALTH_RESULT_WARN HealthResult = 10
	// DCGM_HEALTH_RESULT_FAIL A failure has been issued, refer to the response for more information
	DCGM_HEALTH_RESULT_FAIL HealthResult = 20
)

type HealthSystem

type HealthSystem uint

HealthSystem is the system to watch for health checks.

const (
	// DCGM_HEALTH_WATCH_PCIE PCIe health check
	DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1
	// DCGM_HEALTH_WATCH_NVLINK NVLink health check
	DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2
	// DCGM_HEALTH_WATCH_PMU PMU health check
	DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4
	// DCGM_HEALTH_WATCH_MCU MCU health check
	DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8
	// DCGM_HEALTH_WATCH_MEM Memory health check
	DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10
	// DCGM_HEALTH_WATCH_SM SM health check
	DCGM_HEALTH_WATCH_SM HealthSystem = 0x20
	// DCGM_HEALTH_WATCH_INFOROM Inforom health check
	DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40
	// DCGM_HEALTH_WATCH_THERMAL Thermal health check
	DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80
	// DCGM_HEALTH_WATCH_POWER Power health check
	DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100
	// DCGM_HEALTH_WATCH_DRIVER Driver health check
	DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200
	// DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL NVSwitch non-fatal health check
	DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400
	// DCGM_HEALTH_WATCH_NVSWITCH_FATAL NVSwitch fatal health check
	DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800
	// DCGM_HEALTH_WATCH_ALL All health checks
	DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF
)

func HealthGet

func HealthGet(groupID GroupHandle) (HealthSystem, error)

HealthGet retrieves the current state of the DCGM health check system. It returns which health watch systems are currently enabled for the specified group.

type Incident

type Incident struct {
	// System identifies which health watch system detected the incident
	System HealthSystem
	// Health indicates the severity of the incident
	Health HealthResult
	// Error contains detailed information about the incident
	Error DiagErrorDetail
	// EntityInfo identifies the GPU or component where the incident occurred
	EntityInfo GroupEntityPair
}

Incident represents a health check incident that occurred

type Link_State uint

Link_State represents the state of an NVLINK connection

const (
	// LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs)
	LS_NOT_SUPPORTED Link_State = iota
	// LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches)
	LS_DISABLED
	// LS_DOWN indicates the link is down (inactive)
	LS_DOWN
	// LS_UP indicates the link is up (active)
	LS_UP
)

type MemoryInfo

type MemoryInfo struct {
	GlobalUsed int64
	ECCErrors  ECCErrorsInfo
}

MemoryInfo contains GPU memory usage and error information

type MetricGroup

type MetricGroup struct {
	Major    uint
	Minor    uint
	FieldIds []uint
}

MetricGroup represents a group of metrics for a specific GPU

func GetSupportedMetricGroups

func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error)

GetSupportedMetricGroups returns all supported metric groups for the specified GPU

type MigEntityInfo

type MigEntityInfo struct {
	// GpuUuid is the UUID of the parent GPU
	GpuUuid string
	// NvmlGpuIndex is the NVML index of the parent GPU
	NvmlGpuIndex uint
	// NvmlInstanceId is the NVML GPU instance ID
	NvmlInstanceId uint
	// NvmlComputeInstanceId is the NVML compute instance ID
	NvmlComputeInstanceId uint
	// NvmlMigProfileId is the NVML MIG profile ID
	NvmlMigProfileId uint
	// NvmlProfileSlices is the number of slices in the MIG profile
	NvmlProfileSlices uint
}

MigEntityInfo contains information about a MIG entity

type MigHierarchyInfo

type MigHierarchyInfo struct {
	// Entity represents the current GPU entity in the hierarchy
	Entity GroupEntityPair
	// Parent represents the parent GPU entity in the hierarchy
	Parent GroupEntityPair
	// SliceProfile defines the MIG profile configuration for this entity
	SliceProfile MigProfile
}

MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information for a GPU entity and its relationship to other entities

type MigHierarchyInfo_v2

type MigHierarchyInfo_v2 struct {
	// Entity contains the entity information
	Entity GroupEntityPair
	// Parent contains the parent entity information
	Parent GroupEntityPair
	// Info contains detailed MIG entity information
	Info MigEntityInfo
}

MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information

type MigHierarchy_v2

type MigHierarchy_v2 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// Count is the number of valid entries in EntityList
	Count uint
	// EntityList contains the MIG hierarchy information for each entity
	EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}

MigHierarchy_v2 represents version 2 of the complete MIG hierarchy

func GetGPUInstanceHierarchy

func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)

GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information

type MigProfile

type MigProfile int

MigProfile represents the Multi-Instance GPU (MIG) profile type

const (
	// MigProfileNone indicates no MIG profile is set (for GPUs)
	MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */
	// MigProfileGPUInstanceSlice1 represents GPU instance slice 1
	MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */
	// MigProfileGPUInstanceSlice2 represents GPU instance slice 2
	MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */
	// MigProfileGPUInstanceSlice3 represents GPU instance slice 3
	MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */
	// MigProfileGPUInstanceSlice4 represents GPU instance slice 4
	MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */
	// MigProfileGPUInstanceSlice7 represents GPU instance slice 7
	MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */
	// MigProfileGPUInstanceSlice8 represents GPU instance slice 8
	MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */
	// MigProfileGPUInstanceSlice6 represents GPU instance slice 6
	MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */
	// MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1
	MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */
	// MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1
	MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */
	// MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2
	MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */
	// MigProfileComputeInstanceSlice1 represents compute instance slice 1
	MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */
	// MigProfileComputeInstanceSlice2 represents compute instance slice 2
	MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */
	// MigProfileComputeInstanceSlice3 represents compute instance slice 3
	MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< compute instance slice 3 */
	// MigProfileComputeInstanceSlice4 represents compute instance slice 4
	MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/
	// MigProfileComputeInstanceSlice7 represents compute instance slice 7
	MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */
	// MigProfileComputeInstanceSlice8 represents compute instance slice 8
	MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */
	// MigProfileComputeInstanceSlice6 represents compute instance slice 6
	MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */
	// MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1
	MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */
)

type NvLinkP2PStatus

type NvLinkP2PStatus struct {
	Gpus [][]Link_State
	// contains filtered or unexported fields
}

NvLinkP2PStatus represents the state of NvLinks between the GPU pairs

func GetNvLinkP2PStatus

func GetNvLinkP2PStatus() (NvLinkP2PStatus, error)

GetNvLinkP2PStatus returns the status of NvLinks between GPU pairs

type NvLinkStatus

type NvLinkStatus struct {
	// ParentId is the ID of the parent entity (GPU or NVSwitch)
	ParentId uint
	// ParentType is the type of the parent entity
	ParentType Field_Entity_Group
	// State is the current state of the NVLINK
	State Link_State
	// Index is the link index number
	Index uint
}

NvLinkStatus contains information about an NVLINK connection status

func GetNvLinkLinkStatus

func GetNvLinkLinkStatus() ([]NvLinkStatus, error)

GetNvLinkLinkStatus returns the status of all NVLink connections

type NvlinkPolicyCondition

type NvlinkPolicyCondition struct {
	// FieldId identifies the specific NVLink field that had an error
	FieldId uint16
	// Counter indicates the number of errors detected
	Counter uint
}

NvlinkPolicyCondition contains details about an NVLink error

type P2PLink struct {
	// GPU is the ID of the GPU
	GPU uint
	// BusID is the PCIe bus ID of the GPU
	BusID string
	// Link is the type of P2P connection
	Link P2PLinkType
}

P2PLink contains information about a peer-to-peer connection

func GetDeviceTopology

func GetDeviceTopology(gpuID uint) ([]P2PLink, error)

GetDeviceTopology returns the topology (connectivity) information for the specified GPU

type P2PLinkType

type P2PLinkType uint

P2PLinkType represents the type of peer-to-peer connection between GPUs

// P2PLinkType values. They are numbered by iota in declaration order,
// starting at P2PLinkUnknown = 0 and ending at FourNVLINKLinks = 10.
// Note that the four NVLink entries do not carry the P2PLink prefix used by
// the other constants in this group.
const (
	// P2PLinkUnknown represents an unknown link type
	P2PLinkUnknown P2PLinkType = iota
	// P2PLinkCrossCPU represents a connection across different CPUs
	P2PLinkCrossCPU
	// P2PLinkSameCPU represents a connection within the same CPU
	P2PLinkSameCPU
	// P2PLinkHostBridge represents a connection through the host bridge
	P2PLinkHostBridge
	// P2PLinkMultiSwitch represents a connection through multiple PCIe switches
	P2PLinkMultiSwitch
	// P2PLinkSingleSwitch represents a connection through a single PCIe switch
	P2PLinkSingleSwitch
	// P2PLinkSameBoard represents a connection on the same board
	P2PLinkSameBoard
	// SingleNVLINKLink represents a single NVLink connection
	SingleNVLINKLink
	// TwoNVLINKLinks represents two NVLink connections
	TwoNVLINKLinks
	// ThreeNVLINKLinks represents three NVLink connections
	ThreeNVLINKLinks
	// FourNVLINKLinks represents four NVLink connections
	FourNVLINKLinks
)

func (P2PLinkType) PCIPaths

func (l P2PLinkType) PCIPaths() string

PCIPaths returns a string representation of the P2P link type

type PCIInfo

// PCIInfo contains PCI bus related information for a GPU device.
//
// Units follow the trailing comments on each field: BAR1 and FBTotal are in
// MB, Bandwidth is in MB/s.
// NOTE(review): BusID is presumably the device's PCI bus ID string (compare
// P2PLink.BusID) and FBTotal the total framebuffer size — confirm.
type PCIInfo struct {
	BusID     string
	BAR1      uint  // MB
	FBTotal   uint  // MB
	Bandwidth int64 // MB/s
}

PCIInfo contains PCI bus related information for a GPU device

type PCIStatusInfo

// PCIStatusInfo contains PCI bus status information.
//
// BAR1Used is expressed in MB; Throughput carries the transfer metrics
// described by PCIThroughputInfo.
// NOTE(review): FBUsed has no unit annotation here; presumably MB, matching
// PCIInfo.FBTotal — confirm.
type PCIStatusInfo struct {
	BAR1Used   int64 // MB
	Throughput PCIThroughputInfo
	FBUsed     int64
}

PCIStatusInfo contains PCI bus status information

type PCIThroughputInfo

// PCIThroughputInfo contains PCI bus transfer metrics.
//
// Rx and Tx are expressed in MB per their trailing comments.
// NOTE(review): Rx/Tx are presumably received/transmitted volumes and Replays
// a PCI replay count (compare PciPolicyCondition.ReplayCounter) — confirm.
type PCIThroughputInfo struct {
	Rx      int64 // MB
	Tx      int64 // MB
	Replays int64
}

PCIThroughputInfo contains PCI bus transfer metrics

type PciPolicyCondition

// PciPolicyCondition contains details about a PCI error.
type PciPolicyCondition struct {
	// ReplayCounter indicates the number of PCI replays
	ReplayCounter uint
}

PciPolicyCondition contains details about a PCI error

type PerfState

type PerfState uint

PerfState represents the performance state (P-state) of a GPU

func (PerfState) String

func (p PerfState) String() string

String returns a string representation of the performance state

type PolicyAction

type PolicyAction uint32

PolicyAction specifies the action to take when a policy violation occurs

// PolicyAction values: the actions that can be taken when a policy
// violation occurs. Values are assigned explicitly (0 and 1).
const (
	// PolicyActionNone indicates no action should be taken on violation (default)
	PolicyActionNone PolicyAction = 0

	// PolicyActionGPUReset indicates the GPU should be reset on violation
	PolicyActionGPUReset PolicyAction = 1
)

type PolicyCondition

type PolicyCondition string

PolicyCondition represents a type of policy violation that can be monitored

type PolicyConfig

// PolicyConfig configures a policy condition with optional custom thresholds
// and actions.
//
// Only Condition is a plain value; every other field is a pointer and is
// documented as optional with a named default.
// NOTE(review): presumably a nil pointer selects that default — confirm
// against the code that consumes PolicyConfig.
type PolicyConfig struct {
	// Condition specifies the type of policy to monitor
	Condition PolicyCondition

	// Action specifies what action to take when this policy violation occurs (optional, defaults to PolicyActionNone)
	Action *PolicyAction

	// Validation specifies what validation to perform after the action (optional, defaults to PolicyValidationNone)
	Validation *PolicyValidation

	// MaxRetiredPages specifies the threshold for MaxRtPgPolicy (optional, defaults to DefaultMaxRetiredPages)
	MaxRetiredPages *uint32

	// MaxTemperature specifies the threshold for ThermalPolicy in Celsius (optional, defaults to DefaultMaxTemperature)
	MaxTemperature *uint32

	// MaxPower specifies the threshold for PowerPolicy in Watts (optional, defaults to DefaultMaxPower)
	MaxPower *uint32
}

PolicyConfig configures a policy condition with optional custom thresholds and actions

type PolicyStatus

// PolicyStatus represents the current policy configuration for a group, as
// returned by GetPolicyForGroup.
type PolicyStatus struct {
	// Mode indicates the operation mode (automatic or manual)
	Mode uint32

	// Action specifies what action is taken on violation
	Action PolicyAction

	// Validation specifies what validation is performed after action
	Validation PolicyValidation

	// Conditions is a map of enabled policy conditions with their thresholds.
	// Key is the PolicyCondition, value is the threshold (if applicable).
	// The value type is spelled `any` (identical to interface{}) to match
	// PolicyViolation.Data.
	Conditions map[PolicyCondition]any
}

PolicyStatus represents the current policy configuration for a group

func GetPolicyForGroup

func GetPolicyForGroup(group GroupHandle) (*PolicyStatus, error)

GetPolicyForGroup retrieves the current policy configuration for a GPU group

type PolicyValidation

type PolicyValidation uint32

PolicyValidation specifies the validation to perform after a policy action

// PolicyValidation values: the validation that can be performed after a
// policy action. Values are assigned explicitly, from 0 (none) to 3 (long).
const (
	// PolicyValidationNone indicates no validation after action (default)
	PolicyValidationNone PolicyValidation = 0

	// PolicyValidationShort indicates a short system validation should be performed
	PolicyValidationShort PolicyValidation = 1

	// PolicyValidationMedium indicates a medium system validation should be performed
	PolicyValidationMedium PolicyValidation = 2

	// PolicyValidationLong indicates a long system validation should be performed
	PolicyValidationLong PolicyValidation = 3
)

type PolicyViolation

// PolicyViolation represents a detected violation of a policy condition.
type PolicyViolation struct {
	// Condition specifies the type of policy that was violated
	Condition PolicyCondition
	// Timestamp indicates when the violation occurred
	Timestamp time.Time
	// Data contains violation-specific details.
	// NOTE(review): presumably one of the *PolicyCondition structs (e.g.
	// PciPolicyCondition, PowerPolicyCondition) selected by Condition —
	// confirm before type-asserting.
	Data any
}

PolicyViolation represents a detected violation of a policy condition

type PowerPolicyCondition

// PowerPolicyCondition contains details about a power violation.
type PowerPolicyCondition struct {
	// PowerViolation indicates the severity of the power violation
	PowerViolation uint
}

PowerPolicyCondition contains details about a power violation

type ProcessInfo

// ProcessInfo contains comprehensive information about a GPU process.
// GetProcessInfo returns a slice of these as per-GPU statistics for a given
// process ID; the GPU field identifies which GPU an entry refers to.
type ProcessInfo struct {
	// GPU is the ID of the GPU being used
	GPU uint
	// PID is the process ID
	PID uint
	// Name is the name of the process
	Name string
	// ProcessUtilization contains process-specific utilization metrics
	ProcessUtilization ProcessUtilInfo
	// PCI contains PCI bus statistics
	PCI PCIStatusInfo
	// Memory contains memory usage statistics
	Memory MemoryInfo
	// GpuUtilization contains GPU utilization metrics
	GpuUtilization UtilizationInfo
	// Clocks contains GPU clock frequencies
	Clocks ClockInfo
	// Violations contains throttling statistics
	Violations ViolationTime
	// XIDErrors contains XID error information
	XIDErrors XIDErrorInfo
}

ProcessInfo contains comprehensive information about a GPU process

func GetProcessInfo

func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)

GetProcessInfo returns detailed per-GPU statistics for the specified process

type ProcessUtilInfo

// ProcessUtilInfo contains utilization metrics for a GPU process.
//
// NOTE(review): EnergyConsumed, SmUtil and MemUtil are pointers; presumably
// nil means the value was not available — confirm before dereferencing.
type ProcessUtilInfo struct {
	// StartTime is when the process started using the GPU
	StartTime Time
	// EndTime is when the process stopped using the GPU (0 if still running)
	EndTime Time
	// EnergyConsumed is the energy consumed by the process in Joules
	EnergyConsumed *uint64
	// SmUtil is the GPU SM (Streaming Multiprocessor) utilization percentage
	SmUtil *float64
	// MemUtil is the GPU memory utilization percentage
	MemUtil *float64
}

ProcessUtilInfo contains utilization metrics for a GPU process

type RetiredPagesPolicyCondition

// RetiredPagesPolicyCondition contains details about retired memory pages,
// split by the kind of error that caused the retirement.
type RetiredPagesPolicyCondition struct {
	// SbePages indicates the number of pages retired due to single-bit errors
	SbePages uint
	// DbePages indicates the number of pages retired due to double-bit errors
	DbePages uint
}

RetiredPagesPolicyCondition contains details about retired memory pages

type Short

type Short C.ushort

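Short is a defined type whose underlying type is C.ushort (it is declared with `type Short C.ushort`, not as a type alias, so values must be converted explicitly). It is primarily used for DCGM field identifiers and field collections in the DCGM API bindings. This type provides a direct mapping to the C unsigned short type used in the underlying DCGM C API.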

const (
	// DCGM_FI_UNKNOWN represents /
	DCGM_FI_UNKNOWN Short = 0
	// DCGM_FI_DRIVER_VERSION represents /
	DCGM_FI_DRIVER_VERSION Short = 1
	// DCGM_FI_NVML_VERSION
	DCGM_FI_NVML_VERSION Short = 2
	// DCGM_FI_PROCESS_NAME represents /
	DCGM_FI_PROCESS_NAME Short = 3
	// DCGM_FI_DEV_COUNT represents /
	DCGM_FI_DEV_COUNT Short = 4
	// DCGM_FI_CUDA_DRIVER_VERSION represents /
	DCGM_FI_CUDA_DRIVER_VERSION Short = 5
	// DCGM_FI_BIND_UNBIND_EVENT represents /
	DCGM_FI_BIND_UNBIND_EVENT Short = 6
	// DCGM_FI_DEV_NAME represents /
	DCGM_FI_DEV_NAME Short = 50
	// DCGM_FI_DEV_BRAND represents /
	DCGM_FI_DEV_BRAND Short = 51
	// DCGM_FI_DEV_NVML_INDEX represents /
	DCGM_FI_DEV_NVML_INDEX Short = 52
	// DCGM_FI_DEV_SERIAL represents /
	DCGM_FI_DEV_SERIAL Short = 53
	// DCGM_FI_DEV_UUID represents /
	DCGM_FI_DEV_UUID Short = 54
	// DCGM_FI_DEV_MINOR_NUMBER represents /
	DCGM_FI_DEV_MINOR_NUMBER Short = 55
	// DCGM_FI_DEV_OEM_INFOROM_VER represents /
	DCGM_FI_DEV_OEM_INFOROM_VER Short = 56
	// DCGM_FI_DEV_PCI_BUSID represents /
	DCGM_FI_DEV_PCI_BUSID Short = 57
	// DCGM_FI_DEV_PCI_COMBINED_ID represents /
	DCGM_FI_DEV_PCI_COMBINED_ID Short = 58
	// DCGM_FI_DEV_PCI_SUBSYS_ID represents /
	DCGM_FI_DEV_PCI_SUBSYS_ID Short = 59
	// DCGM_FI_GPU_TOPOLOGY_PCI represents /
	DCGM_FI_GPU_TOPOLOGY_PCI Short = 60
	// DCGM_FI_GPU_TOPOLOGY_NVLINK represents /
	DCGM_FI_GPU_TOPOLOGY_NVLINK Short = 61
	// DCGM_FI_GPU_TOPOLOGY_AFFINITY represents /
	DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = 62
	// DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY represents /
	DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = 63
	// DCGM_FI_DEV_P2P_NVLINK_STATUS represents /
	DCGM_FI_DEV_P2P_NVLINK_STATUS Short = 64
	// DCGM_FI_DEV_COMPUTE_MODE represents /
	DCGM_FI_DEV_COMPUTE_MODE Short = 65
	// DCGM_FI_DEV_PERSISTENCE_MODE represents /
	DCGM_FI_DEV_PERSISTENCE_MODE Short = 66
	// DCGM_FI_DEV_MIG_MODE represents /
	DCGM_FI_DEV_MIG_MODE Short = 67
	// DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR represents /
	DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = 68
	// DCGM_FI_DEV_MIG_MAX_SLICES represents /
	DCGM_FI_DEV_MIG_MAX_SLICES Short = 69
	// DCGM_FI_DEV_CPU_AFFINITY_0 represents /
	DCGM_FI_DEV_CPU_AFFINITY_0 Short = 70
	// DCGM_FI_DEV_CPU_AFFINITY_1 represents /
	DCGM_FI_DEV_CPU_AFFINITY_1 Short = 71
	// DCGM_FI_DEV_CPU_AFFINITY_2 represents /
	DCGM_FI_DEV_CPU_AFFINITY_2 Short = 72
	// DCGM_FI_DEV_CPU_AFFINITY_3 represents /
	DCGM_FI_DEV_CPU_AFFINITY_3 Short = 73
	// DCGM_FI_DEV_CC_MODE represents /
	DCGM_FI_DEV_CC_MODE Short = 74
	// DCGM_FI_DEV_MIG_ATTRIBUTES represents /
	DCGM_FI_DEV_MIG_ATTRIBUTES Short = 75
	// DCGM_FI_DEV_MIG_GI_INFO represents /
	DCGM_FI_DEV_MIG_GI_INFO Short = 76
	// DCGM_FI_DEV_MIG_CI_INFO represents /
	DCGM_FI_DEV_MIG_CI_INFO Short = 77
	// DCGM_FI_DEV_ECC_INFOROM_VER represents /
	DCGM_FI_DEV_ECC_INFOROM_VER Short = 80
	// DCGM_FI_DEV_POWER_INFOROM_VER represents /
	DCGM_FI_DEV_POWER_INFOROM_VER Short = 81
	// DCGM_FI_DEV_INFOROM_IMAGE_VER represents /
	DCGM_FI_DEV_INFOROM_IMAGE_VER Short = 82
	// DCGM_FI_DEV_INFOROM_CONFIG_CHECK represents /
	DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = 83
	// DCGM_FI_DEV_INFOROM_CONFIG_VALID represents /
	DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = 84
	// DCGM_FI_DEV_VBIOS_VERSION represents /
	DCGM_FI_DEV_VBIOS_VERSION Short = 85
	// DCGM_FI_DEV_MEM_AFFINITY_0 represents /
	DCGM_FI_DEV_MEM_AFFINITY_0 Short = 86
	// DCGM_FI_DEV_MEM_AFFINITY_1 represents /
	DCGM_FI_DEV_MEM_AFFINITY_1 Short = 87
	// DCGM_FI_DEV_MEM_AFFINITY_2 represents /
	DCGM_FI_DEV_MEM_AFFINITY_2 Short = 88
	// DCGM_FI_DEV_MEM_AFFINITY_3 represents /
	DCGM_FI_DEV_MEM_AFFINITY_3 Short = 89
	// DCGM_FI_DEV_BAR1_TOTAL represents /
	DCGM_FI_DEV_BAR1_TOTAL Short = 90
	// DCGM_FI_SYNC_BOOST represents /
	DCGM_FI_SYNC_BOOST Short = 91
	// DCGM_FI_DEV_BAR1_USED represents /
	DCGM_FI_DEV_BAR1_USED Short = 92
	// DCGM_FI_DEV_BAR1_FREE represents /
	DCGM_FI_DEV_BAR1_FREE Short = 93
	// DCGM_FI_DEV_GPM_SUPPORT represents */
	DCGM_FI_DEV_GPM_SUPPORT Short = 94
	// DCGM_FI_DEV_SM_CLOCK represents /
	DCGM_FI_DEV_SM_CLOCK Short = 100
	// DCGM_FI_DEV_MEM_CLOCK represents /
	DCGM_FI_DEV_MEM_CLOCK Short = 101
	// DCGM_FI_DEV_VIDEO_CLOCK represents /
	DCGM_FI_DEV_VIDEO_CLOCK Short = 102
	// DCGM_FI_DEV_APP_SM_CLOCK represents /
	DCGM_FI_DEV_APP_SM_CLOCK Short = 110
	// DCGM_FI_DEV_APP_MEM_CLOCK represents /
	DCGM_FI_DEV_APP_MEM_CLOCK Short = 111
	// DCGM_FI_DEV_CLOCKS_EVENT_REASONS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASONS Short = 112
	// DCGM_FI_DEV_MAX_SM_CLOCK represents /
	DCGM_FI_DEV_MAX_SM_CLOCK Short = 113
	// DCGM_FI_DEV_MAX_MEM_CLOCK represents /
	DCGM_FI_DEV_MAX_MEM_CLOCK Short = 114
	// DCGM_FI_DEV_MAX_VIDEO_CLOCK represents /
	DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = 115
	// DCGM_FI_DEV_AUTOBOOST represents /
	DCGM_FI_DEV_AUTOBOOST Short = 120
	// DCGM_FI_DEV_SUPPORTED_CLOCKS represents /
	DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 130
	// DCGM_FI_DEV_MEMORY_TEMP represents /
	DCGM_FI_DEV_MEMORY_TEMP Short = 140
	// DCGM_FI_DEV_GPU_TEMP represents /
	DCGM_FI_DEV_GPU_TEMP Short = 150
	// DCGM_FI_DEV_MEM_MAX_OP_TEMP represents /
	DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = 151
	// DCGM_FI_DEV_GPU_MAX_OP_TEMP represents /
	DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = 152
	// DCGM_FI_DEV_GPU_TEMP_LIMIT represents /
	DCGM_FI_DEV_GPU_TEMP_LIMIT Short = 153
	// DCGM_FI_DEV_POWER_USAGE represents /
	DCGM_FI_DEV_POWER_USAGE Short = 155
	// DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION represents /
	DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = 156
	// DCGM_FI_DEV_POWER_USAGE_INSTANT represents /
	DCGM_FI_DEV_POWER_USAGE_INSTANT Short = 157
	// DCGM_FI_DEV_SLOWDOWN_TEMP represents /
	DCGM_FI_DEV_SLOWDOWN_TEMP Short = 158
	// DCGM_FI_DEV_SHUTDOWN_TEMP represents /
	DCGM_FI_DEV_SHUTDOWN_TEMP Short = 159
	// DCGM_FI_DEV_POWER_MGMT_LIMIT represents /
	DCGM_FI_DEV_POWER_MGMT_LIMIT Short = 160
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN represents /
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = 161
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX represents /
	DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = 162
	// DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF represents /
	DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = 163
	// DCGM_FI_DEV_ENFORCED_POWER_LIMIT represents /
	DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = 164
	// DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK represents /
	DCGM_FI_DEV_REQUESTED_POWER_PROFILE_MASK Short = 165
	// DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK represents /
	DCGM_FI_DEV_ENFORCED_POWER_PROFILE_MASK Short = 166
	// DCGM_FI_DEV_VALID_POWER_PROFILE_MASK represents /
	DCGM_FI_DEV_VALID_POWER_PROFILE_MASK Short = 167
	// DCGM_FI_DEV_FABRIC_MANAGER_STATUS represents /
	DCGM_FI_DEV_FABRIC_MANAGER_STATUS Short = 170
	// DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE represents /
	DCGM_FI_DEV_FABRIC_MANAGER_ERROR_CODE Short = 171
	// DCGM_FI_DEV_FABRIC_CLUSTER_UUID represents /
	DCGM_FI_DEV_FABRIC_CLUSTER_UUID Short = 172
	// DCGM_FI_DEV_FABRIC_CLIQUE_ID represents /
	DCGM_FI_DEV_FABRIC_CLIQUE_ID Short = 173
	// DCGM_FI_DEV_FABRIC_HEALTH_MASK represents /
	DCGM_FI_DEV_FABRIC_HEALTH_MASK Short = 174
	// DCGM_FI_DEV_PSTATE represents /
	DCGM_FI_DEV_PSTATE Short = 190
	// DCGM_FI_DEV_FAN_SPEED represents /
	DCGM_FI_DEV_FAN_SPEED Short = 191
	// DCGM_FI_DEV_PCIE_TX_THROUGHPUT represents /
	DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = 200
	// DCGM_FI_DEV_PCIE_RX_THROUGHPUT represents /
	DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = 201
	// DCGM_FI_DEV_PCIE_REPLAY_COUNTER represents /
	DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = 202
	// DCGM_FI_DEV_GPU_UTIL represents /
	DCGM_FI_DEV_GPU_UTIL Short = 203
	// DCGM_FI_DEV_MEM_COPY_UTIL represents /
	DCGM_FI_DEV_MEM_COPY_UTIL Short = 204
	// DCGM_FI_DEV_ACCOUNTING_DATA represents /
	DCGM_FI_DEV_ACCOUNTING_DATA Short = 205
	// DCGM_FI_DEV_ENC_UTIL represents /
	DCGM_FI_DEV_ENC_UTIL Short = 206
	// DCGM_FI_DEV_DEC_UTIL represents /
	DCGM_FI_DEV_DEC_UTIL Short = 207
	// DCGM_FI_DEV_XID_ERRORS represents /
	DCGM_FI_DEV_XID_ERRORS Short = 230
	// DCGM_FI_DEV_PCIE_MAX_LINK_GEN represents /
	DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = 235
	// DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH represents /
	DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = 236
	// DCGM_FI_DEV_PCIE_LINK_GEN represents /
	DCGM_FI_DEV_PCIE_LINK_GEN Short = 237
	// DCGM_FI_DEV_PCIE_LINK_WIDTH represents /
	DCGM_FI_DEV_PCIE_LINK_WIDTH Short = 238
	// DCGM_FI_DEV_POWER_VIOLATION represents /
	DCGM_FI_DEV_POWER_VIOLATION Short = 240
	// DCGM_FI_DEV_THERMAL_VIOLATION represents /
	DCGM_FI_DEV_THERMAL_VIOLATION Short = 241
	// DCGM_FI_DEV_SYNC_BOOST_VIOLATION represents /
	DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = 242
	// DCGM_FI_DEV_BOARD_LIMIT_VIOLATION represents /
	DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = 243
	// DCGM_FI_DEV_LOW_UTIL_VIOLATION represents /
	DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = 244
	// DCGM_FI_DEV_RELIABILITY_VIOLATION represents /
	DCGM_FI_DEV_RELIABILITY_VIOLATION Short = 245
	// DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION represents /
	DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = 246
	// DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION represents /
	DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = 247
	// DCGM_FI_DEV_FB_TOTAL represents /
	DCGM_FI_DEV_FB_TOTAL Short = 250
	// DCGM_FI_DEV_FB_FREE represents /
	DCGM_FI_DEV_FB_FREE Short = 251
	// DCGM_FI_DEV_FB_USED represents /
	DCGM_FI_DEV_FB_USED Short = 252
	// DCGM_FI_DEV_FB_RESERVED represents /
	DCGM_FI_DEV_FB_RESERVED Short = 253
	// DCGM_FI_DEV_FB_USED_PERCENT represents /
	DCGM_FI_DEV_FB_USED_PERCENT Short = 254
	// DCGM_FI_DEV_C2C_LINK_COUNT represents /
	DCGM_FI_DEV_C2C_LINK_COUNT Short = 285
	// DCGM_FI_DEV_C2C_LINK_STATUS represents /
	DCGM_FI_DEV_C2C_LINK_STATUS Short = 286
	// DCGM_FI_DEV_C2C_MAX_BANDWIDTH represents /
	DCGM_FI_DEV_C2C_MAX_BANDWIDTH Short = 287
	// DCGM_FI_DEV_ECC_CURRENT represents /
	DCGM_FI_DEV_ECC_CURRENT Short = 300
	// DCGM_FI_DEV_ECC_PENDING represents /
	DCGM_FI_DEV_ECC_PENDING Short = 301
	// DCGM_FI_DEV_ECC_SBE_VOL_TOTAL represents /
	DCGM_FI_DEV_ECC_SBE_VOL_TOTAL Short = 310
	// DCGM_FI_DEV_ECC_DBE_VOL_TOTAL represents /
	DCGM_FI_DEV_ECC_DBE_VOL_TOTAL Short = 311
	// DCGM_FI_DEV_ECC_SBE_AGG_TOTAL represents /
	DCGM_FI_DEV_ECC_SBE_AGG_TOTAL Short = 312
	// DCGM_FI_DEV_ECC_DBE_AGG_TOTAL represents /
	DCGM_FI_DEV_ECC_DBE_AGG_TOTAL Short = 313
	// DCGM_FI_DEV_ECC_SBE_VOL_L1 represents /
	DCGM_FI_DEV_ECC_SBE_VOL_L1 Short = 314
	// DCGM_FI_DEV_ECC_DBE_VOL_L1 represents /
	DCGM_FI_DEV_ECC_DBE_VOL_L1 Short = 315
	// DCGM_FI_DEV_ECC_SBE_VOL_L2 represents /
	DCGM_FI_DEV_ECC_SBE_VOL_L2 Short = 316
	// DCGM_FI_DEV_ECC_DBE_VOL_L2 represents /
	DCGM_FI_DEV_ECC_DBE_VOL_L2 Short = 317
	// DCGM_FI_DEV_ECC_SBE_VOL_DEV represents /
	DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = 318
	// DCGM_FI_DEV_ECC_DBE_VOL_DEV represents /
	DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = 319
	// DCGM_FI_DEV_ECC_SBE_VOL_REG represents /
	DCGM_FI_DEV_ECC_SBE_VOL_REG Short = 320
	// DCGM_FI_DEV_ECC_DBE_VOL_REG represents /
	DCGM_FI_DEV_ECC_DBE_VOL_REG Short = 321
	// DCGM_FI_DEV_ECC_SBE_VOL_TEX represents /
	DCGM_FI_DEV_ECC_SBE_VOL_TEX Short = 322
	// DCGM_FI_DEV_ECC_DBE_VOL_TEX represents /
	DCGM_FI_DEV_ECC_DBE_VOL_TEX Short = 323
	// DCGM_FI_DEV_ECC_SBE_AGG_L1 represents /
	DCGM_FI_DEV_ECC_SBE_AGG_L1 Short = 324
	// DCGM_FI_DEV_ECC_DBE_AGG_L1 represents /
	DCGM_FI_DEV_ECC_DBE_AGG_L1 Short = 325
	// DCGM_FI_DEV_ECC_SBE_AGG_L2 represents /
	DCGM_FI_DEV_ECC_SBE_AGG_L2 Short = 326
	// DCGM_FI_DEV_ECC_DBE_AGG_L2 represents /
	DCGM_FI_DEV_ECC_DBE_AGG_L2 Short = 327
	// DCGM_FI_DEV_ECC_SBE_AGG_DEV represents /
	DCGM_FI_DEV_ECC_SBE_AGG_DEV Short = 328
	// DCGM_FI_DEV_ECC_DBE_AGG_DEV represents /
	DCGM_FI_DEV_ECC_DBE_AGG_DEV Short = 329
	// DCGM_FI_DEV_ECC_SBE_AGG_REG represents /
	DCGM_FI_DEV_ECC_SBE_AGG_REG Short = 330
	// DCGM_FI_DEV_ECC_DBE_AGG_REG represents /
	DCGM_FI_DEV_ECC_DBE_AGG_REG Short = 331
	// DCGM_FI_DEV_ECC_SBE_AGG_TEX represents /
	DCGM_FI_DEV_ECC_SBE_AGG_TEX Short = 332
	// DCGM_FI_DEV_ECC_DBE_AGG_TEX represents /
	DCGM_FI_DEV_ECC_DBE_AGG_TEX Short = 333
	// DCGM_FI_DEV_ECC_SBE_VOL_SHM represents /
	DCGM_FI_DEV_ECC_SBE_VOL_SHM Short = 334
	// DCGM_FI_DEV_ECC_DBE_VOL_SHM represents /
	DCGM_FI_DEV_ECC_DBE_VOL_SHM Short = 335
	// DCGM_FI_DEV_ECC_SBE_VOL_CBU represents /
	DCGM_FI_DEV_ECC_SBE_VOL_CBU Short = 336
	// DCGM_FI_DEV_ECC_DBE_VOL_CBU represents /
	DCGM_FI_DEV_ECC_DBE_VOL_CBU Short = 337
	// DCGM_FI_DEV_ECC_SBE_AGG_SHM represents /
	DCGM_FI_DEV_ECC_SBE_AGG_SHM Short = 338
	// DCGM_FI_DEV_ECC_DBE_AGG_SHM represents /
	DCGM_FI_DEV_ECC_DBE_AGG_SHM Short = 339
	// DCGM_FI_DEV_ECC_SBE_AGG_CBU represents /
	DCGM_FI_DEV_ECC_SBE_AGG_CBU Short = 340
	// DCGM_FI_DEV_ECC_DBE_AGG_CBU represents /
	DCGM_FI_DEV_ECC_DBE_AGG_CBU Short = 341
	// DCGM_FI_DEV_ECC_SBE_VOL_SRM represents /
	DCGM_FI_DEV_ECC_SBE_VOL_SRM Short = 342
	// DCGM_FI_DEV_ECC_DBE_VOL_SRM represents /
	DCGM_FI_DEV_ECC_DBE_VOL_SRM Short = 343
	// DCGM_FI_DEV_ECC_SBE_AGG_SRM represents /
	DCGM_FI_DEV_ECC_SBE_AGG_SRM Short = 344
	// DCGM_FI_DEV_ECC_DBE_AGG_SRM represents /
	DCGM_FI_DEV_ECC_DBE_AGG_SRM Short = 345
	// DCGM_FI_DEV_THRESHOLD_SRM represents /
	DCGM_FI_DEV_THRESHOLD_SRM Short = 346
	// DCGM_FI_DEV_DIAG_MEMORY_RESULT represents /
	DCGM_FI_DEV_DIAG_MEMORY_RESULT Short = 350
	// DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT represents /
	DCGM_FI_DEV_DIAG_DIAGNOSTIC_RESULT Short = 351
	// DCGM_FI_DEV_DIAG_PCIE_RESULT represents /
	DCGM_FI_DEV_DIAG_PCIE_RESULT Short = 352
	// DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT represents /
	DCGM_FI_DEV_DIAG_TARGETED_STRESS_RESULT Short = 353
	// DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT represents /
	DCGM_FI_DEV_DIAG_TARGETED_POWER_RESULT Short = 354
	// DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT represents /
	DCGM_FI_DEV_DIAG_MEMORY_BANDWIDTH_RESULT Short = 355
	// DCGM_FI_DEV_DIAG_MEMTEST_RESULT represents /
	DCGM_FI_DEV_DIAG_MEMTEST_RESULT Short = 356
	// DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT represents /
	DCGM_FI_DEV_DIAG_PULSE_TEST_RESULT Short = 357
	// DCGM_FI_DEV_DIAG_EUD_RESULT represents /
	DCGM_FI_DEV_DIAG_EUD_RESULT Short = 358
	// DCGM_FI_DEV_DIAG_CPU_EUD_RESULT represents /
	DCGM_FI_DEV_DIAG_CPU_EUD_RESULT Short = 359
	// DCGM_FI_DEV_DIAG_SOFTWARE_RESULT represents /
	DCGM_FI_DEV_DIAG_SOFTWARE_RESULT Short = 360
	// DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT represents /
	DCGM_FI_DEV_DIAG_NVBANDWIDTH_RESULT Short = 361
	// DCGM_FI_DEV_DIAG_STATUS represents /
	DCGM_FI_DEV_DIAG_STATUS Short = 362
	// DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT represents /
	DCGM_FI_DEV_DIAG_NCCL_TESTS_RESULT Short = 363
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX represents /
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX Short = 385
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH represents /
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH Short = 386
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL represents /
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL Short = 387
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW represents /
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW Short = 388
	// DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE represents /
	DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE Short = 389
	// DCGM_FI_DEV_RETIRED_SBE represents /
	DCGM_FI_DEV_RETIRED_SBE Short = 390
	// DCGM_FI_DEV_RETIRED_DBE represents /
	DCGM_FI_DEV_RETIRED_DBE Short = 391
	// DCGM_FI_DEV_RETIRED_PENDING represents /
	DCGM_FI_DEV_RETIRED_PENDING Short = 392
	// DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS represents /
	DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS Short = 393
	// DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS represents /
	DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS Short = 394
	// DCGM_FI_DEV_ROW_REMAP_FAILURE represents /
	DCGM_FI_DEV_ROW_REMAP_FAILURE Short = 395
	// DCGM_FI_DEV_ROW_REMAP_PENDING represents /
	DCGM_FI_DEV_ROW_REMAP_PENDING Short = 396
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 Short = 400
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 Short = 401
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 Short = 402
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 Short = 403
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 Short = 404
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 Short = 405
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 Short = 406
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 Short = 407
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 Short = 408
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL Short = 409
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 Short = 410
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 Short = 411
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 Short = 412
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 Short = 413
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 Short = 414
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 Short = 415
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 Short = 416
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 Short = 417
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 Short = 418
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL Short = 419
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 Short = 420
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 Short = 421
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 Short = 422
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 Short = 423
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 Short = 424
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 Short = 425
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 Short = 426
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 Short = 427
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 Short = 428
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL Short = 429
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 Short = 430
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 Short = 431
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 Short = 432
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 Short = 433
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 Short = 434
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 Short = 435
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 Short = 436
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 Short = 437
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 Short = 438
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL Short = 439
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 Short = 440
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 Short = 441
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 Short = 442
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 Short = 443
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 Short = 444
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 Short = 445
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L12
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 Short = 446
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L13
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 Short = 447
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L14
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 Short = 448
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL Short = 449
	// DCGM_FI_DEV_GPU_NVLINK_ERRORS represents /
	DCGM_FI_DEV_GPU_NVLINK_ERRORS Short = 450
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 Short = 451
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 Short = 452
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 Short = 453
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 Short = 454
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 Short = 455
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 represents /
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 Short = 456
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 Short = 457
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 Short = 458
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 Short = 459
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 Short = 460
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 Short = 461
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 represents /
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 Short = 462
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 Short = 463
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 Short = 464
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 Short = 465
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 Short = 466
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 Short = 467
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 represents /
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 Short = 468
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 Short = 469
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 Short = 470
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 Short = 471
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 Short = 472
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 Short = 473
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 represents /
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 Short = 474
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 Short = 475
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 Short = 476
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 Short = 477
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 Short = 478
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 Short = 479
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 represents /
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 Short = 480
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 is DCGM field ID 481.
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 Short = 481
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 is DCGM field ID 482.
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 Short = 482
	// DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 is DCGM field ID 483.
	DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 Short = 483
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 is DCGM field ID 484.
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 Short = 484
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 is DCGM field ID 485.
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 Short = 485
	// DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 is DCGM field ID 486.
	DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 Short = 486
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 is DCGM field ID 487.
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 Short = 487
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 is DCGM field ID 488.
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 Short = 488
	// DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 is DCGM field ID 489.
	DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 Short = 489
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 is DCGM field ID 491.
	// NOTE(review): the preceding constant is 489, so 490 is not assigned
	// here; presumably this mirrors the upstream numbering — confirm
	// against dcgm_fields.h before "fixing" the gap.
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 Short = 491
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 is DCGM field ID 492.
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 Short = 492
	// DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 is DCGM field ID 493.
	DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 Short = 493
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 is DCGM field ID 494.
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 Short = 494
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 is DCGM field ID 495.
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 Short = 495
	// DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 is DCGM field ID 496.
	DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 Short = 496
	// DCGM_FI_DEV_NVLINK_ERROR_DL_CRC is DCGM field ID 497.
	DCGM_FI_DEV_NVLINK_ERROR_DL_CRC Short = 497
	// DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY is DCGM field ID 498.
	DCGM_FI_DEV_NVLINK_ERROR_DL_RECOVERY Short = 498
	// DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY is DCGM field ID 499.
	DCGM_FI_DEV_NVLINK_ERROR_DL_REPLAY Short = 499
	// DCGM_FI_DEV_VIRTUAL_MODE is DCGM field ID 500.
	DCGM_FI_DEV_VIRTUAL_MODE Short = 500
	// DCGM_FI_DEV_SUPPORTED_TYPE_INFO is DCGM field ID 501.
	DCGM_FI_DEV_SUPPORTED_TYPE_INFO Short = 501
	// DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS is DCGM field ID 502.
	DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS Short = 502
	// DCGM_FI_DEV_VGPU_INSTANCE_IDS is DCGM field ID 503.
	DCGM_FI_DEV_VGPU_INSTANCE_IDS Short = 503
	// DCGM_FI_DEV_VGPU_UTILIZATIONS is DCGM field ID 504.
	DCGM_FI_DEV_VGPU_UTILIZATIONS Short = 504
	// DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION is DCGM field ID 505.
	DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION Short = 505
	// DCGM_FI_DEV_ENC_STATS is DCGM field ID 506.
	DCGM_FI_DEV_ENC_STATS Short = 506
	// DCGM_FI_DEV_FBC_STATS is DCGM field ID 507.
	DCGM_FI_DEV_FBC_STATS Short = 507
	// DCGM_FI_DEV_FBC_SESSIONS_INFO is DCGM field ID 508.
	DCGM_FI_DEV_FBC_SESSIONS_INFO Short = 508
	// DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS is DCGM field ID 509.
	DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS Short = 509
	// DCGM_FI_DEV_VGPU_TYPE_INFO is DCGM field ID 510.
	DCGM_FI_DEV_VGPU_TYPE_INFO Short = 510
	// DCGM_FI_DEV_VGPU_TYPE_NAME is DCGM field ID 511.
	DCGM_FI_DEV_VGPU_TYPE_NAME Short = 511
	// DCGM_FI_DEV_VGPU_TYPE_CLASS is DCGM field ID 512.
	DCGM_FI_DEV_VGPU_TYPE_CLASS Short = 512
	// DCGM_FI_DEV_VGPU_TYPE_LICENSE is DCGM field ID 513.
	DCGM_FI_DEV_VGPU_TYPE_LICENSE Short = 513
	// DCGM_FI_DEV_VGPU_VM_ID is DCGM field ID 520.
	DCGM_FI_DEV_VGPU_VM_ID Short = 520
	// DCGM_FI_FIRST_VGPU_FIELD_ID is 520, the same value as
	// DCGM_FI_DEV_VGPU_VM_ID. Presumably it marks the lower bound of the
	// vGPU field ID range — confirm against the upstream header.
	DCGM_FI_FIRST_VGPU_FIELD_ID Short = 520
	// DCGM_FI_DEV_VGPU_VM_NAME is DCGM field ID 521.
	DCGM_FI_DEV_VGPU_VM_NAME Short = 521
	// DCGM_FI_DEV_VGPU_TYPE is DCGM field ID 522.
	DCGM_FI_DEV_VGPU_TYPE Short = 522
	// DCGM_FI_DEV_VGPU_UUID is DCGM field ID 523.
	DCGM_FI_DEV_VGPU_UUID Short = 523
	// DCGM_FI_DEV_VGPU_DRIVER_VERSION is DCGM field ID 524.
	DCGM_FI_DEV_VGPU_DRIVER_VERSION Short = 524
	// DCGM_FI_DEV_VGPU_MEMORY_USAGE is DCGM field ID 525.
	DCGM_FI_DEV_VGPU_MEMORY_USAGE Short = 525
	// DCGM_FI_DEV_VGPU_LICENSE_STATUS is DCGM field ID 526.
	DCGM_FI_DEV_VGPU_LICENSE_STATUS Short = 526
	// DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT is DCGM field ID 527.
	DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT Short = 527
	// DCGM_FI_DEV_VGPU_ENC_STATS is DCGM field ID 528.
	DCGM_FI_DEV_VGPU_ENC_STATS Short = 528
	// DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO is DCGM field ID 529.
	DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO Short = 529
	// DCGM_FI_DEV_VGPU_FBC_STATS is DCGM field ID 530.
	DCGM_FI_DEV_VGPU_FBC_STATS Short = 530
	// DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO is DCGM field ID 531.
	DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO Short = 531
	// DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE is DCGM field ID 532.
	DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE Short = 532
	// DCGM_FI_DEV_VGPU_PCI_ID is DCGM field ID 533.
	DCGM_FI_DEV_VGPU_PCI_ID Short = 533
	// DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID is DCGM field ID 534.
	DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID Short = 534
	// DCGM_FI_LAST_VGPU_FIELD_ID is 570. Presumably it marks the upper bound
	// of the range opened by DCGM_FI_FIRST_VGPU_FIELD_ID; no constant in
	// this block is assigned a value between 535 and 569 — confirm.
	DCGM_FI_LAST_VGPU_FIELD_ID Short = 570
	// DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID is DCGM field ID 571.
	DCGM_FI_DEV_PLATFORM_INFINIBAND_GUID Short = 571
	// DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER is DCGM field ID 572.
	DCGM_FI_DEV_PLATFORM_CHASSIS_SERIAL_NUMBER Short = 572
	// DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER is DCGM field ID 573.
	DCGM_FI_DEV_PLATFORM_CHASSIS_SLOT_NUMBER Short = 573
	// DCGM_FI_DEV_PLATFORM_TRAY_INDEX is DCGM field ID 574.
	DCGM_FI_DEV_PLATFORM_TRAY_INDEX Short = 574
	// DCGM_FI_DEV_PLATFORM_HOST_ID is DCGM field ID 575.
	DCGM_FI_DEV_PLATFORM_HOST_ID Short = 575
	// DCGM_FI_DEV_PLATFORM_PEER_TYPE is DCGM field ID 576.
	DCGM_FI_DEV_PLATFORM_PEER_TYPE Short = 576
	// DCGM_FI_DEV_PLATFORM_MODULE_ID is DCGM field ID 577.
	DCGM_FI_DEV_PLATFORM_MODULE_ID Short = 577
	// DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY is DCGM field ID 580.
	DCGM_FI_DEV_NVLINK_PPRM_OPER_RECOVERY Short = 580
	// DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST is DCGM field ID 581.
	DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_SINCE_LAST Short = 581
	// DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO is DCGM field ID 582.
	DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TIME_BETWEEN_LAST_TWO Short = 582
	// DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS is DCGM field ID 583.
	DCGM_FI_DEV_NVLINK_PPCNT_RECOVERY_TOTAL_SUCCESSFUL_EVENTS Short = 583
	// DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS is DCGM field ID 584.
	DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_SUCCESSFUL_RECOVERY_EVENTS Short = 584
	// DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER is DCGM field ID 585.
	DCGM_FI_DEV_NVLINK_PPCNT_PHYSICAL_LINK_DOWN_COUNTER Short = 585
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES is DCGM field ID 586.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODES Short = 586
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR is DCGM field ID 587.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_CODE_ERR Short = 587
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE is DCGM field ID 588.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_RCV_UNCORRECTABLE_CODE Short = 588
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES is DCGM field ID 589.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_CODES Short = 589
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES is DCGM field ID 590.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_CODES Short = 590
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS is DCGM field ID 591.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_XMIT_RETRY_EVENTS Short = 591
	// DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS is DCGM field ID 592.
	DCGM_FI_DEV_NVLINK_PPCNT_PLR_SYNC_EVENTS Short = 592
	// DCGM_FI_INTERNAL_FIELDS_0_START is 600. Presumably the first ID of a
	// range reserved for internal fields that ends at
	// DCGM_FI_INTERNAL_FIELDS_0_END — confirm against the upstream header.
	DCGM_FI_INTERNAL_FIELDS_0_START Short = 600
	// DCGM_FI_INTERNAL_FIELDS_0_END is 699. Presumably the last ID of the
	// range that begins at DCGM_FI_INTERNAL_FIELDS_0_START — confirm.
	DCGM_FI_INTERNAL_FIELDS_0_END Short = 699
	// DCGM_FI_FIRST_NVSWITCH_FIELD_ID is 700. Presumably it marks the lower
	// bound of the NVSwitch field ID range — confirm against the upstream
	// header.
	DCGM_FI_FIRST_NVSWITCH_FIELD_ID Short = 700
	// DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT is DCGM field ID 701.
	DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT Short = 701
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ is DCGM field ID 702.
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ Short = 702
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV is DCGM field ID 703.
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV Short = 703
	// DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD is DCGM field ID 704.
	DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD Short = 704
	// DCGM_FI_DEV_NVSWITCH_POWER_VDD is DCGM field ID 705.
	DCGM_FI_DEV_NVSWITCH_POWER_VDD Short = 705
	// DCGM_FI_DEV_NVSWITCH_POWER_DVDD is DCGM field ID 706.
	DCGM_FI_DEV_NVSWITCH_POWER_DVDD Short = 706
	// DCGM_FI_DEV_NVSWITCH_POWER_HVDD is DCGM field ID 707.
	DCGM_FI_DEV_NVSWITCH_POWER_HVDD Short = 707
	// DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX is DCGM field ID 780.
	DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX Short = 780
	// DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX is DCGM field ID 781.
	DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX Short = 781
	// DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS is DCGM field ID 782.
	DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS Short = 782
	// DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS is DCGM field ID 783.
	DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS Short = 783
	// DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS is DCGM field ID 784.
	DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS Short = 784
	// DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS is DCGM field ID 785.
	DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS Short = 785
	// DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS is DCGM field ID 786.
	DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS Short = 786
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS is DCGM field ID 787.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS Short = 787
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS is DCGM field ID 788.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS Short = 788
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 is DCGM field ID 789.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 Short = 789
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 is DCGM field ID 790.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 Short = 790
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 is DCGM field ID 791.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 Short = 791
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 is DCGM field ID 792.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 Short = 792
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 is DCGM field ID 793.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 Short = 793
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 is DCGM field ID 794.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 Short = 794
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 is DCGM field ID 795.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 Short = 795
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 is DCGM field ID 796.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 Short = 796
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 is DCGM field ID 797.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 Short = 797
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 is DCGM field ID 798.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 Short = 798
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 is DCGM field ID 799.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 Short = 799
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 is DCGM field ID 800.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 Short = 800
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 is DCGM field ID 801.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 Short = 801
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 is DCGM field ID 802.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 Short = 802
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 is DCGM field ID 803.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 Short = 803
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 is DCGM field ID 804.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 Short = 804
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 is DCGM field ID 805.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 Short = 805
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 is DCGM field ID 806.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 Short = 806
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 is DCGM field ID 807.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 Short = 807
	// DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 is DCGM field ID 808.
	DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 Short = 808
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 is DCGM field ID 809.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 Short = 809
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 is DCGM field ID 810.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 Short = 810
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 is DCGM field ID 811.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 Short = 811
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 is DCGM field ID 812.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 Short = 812
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 is DCGM field ID 813.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 Short = 813
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 is DCGM field ID 814.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 Short = 814
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 is DCGM field ID 815.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 Short = 815
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 is DCGM field ID 816.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 Short = 816
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 is DCGM field ID 817.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE4 Short = 817
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 is DCGM field ID 818.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE5 Short = 818
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 is DCGM field ID 819.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE6 Short = 819
	// DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 is DCGM field ID 820.
	DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE7 Short = 820
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 is DCGM field ID 821.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE4 Short = 821
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 is DCGM field ID 822.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE5 Short = 822
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 is DCGM field ID 823.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE6 Short = 823
	// DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 is DCGM field ID 824.
	DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE7 Short = 824
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 is DCGM field ID 825.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L0 Short = 825
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 is DCGM field ID 826.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L1 Short = 826
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 is DCGM field ID 827.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L2 Short = 827
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 is DCGM field ID 828.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L3 Short = 828
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 is DCGM field ID 829.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L4 Short = 829
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 is DCGM field ID 830.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L5 Short = 830
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 is DCGM field ID 831.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L6 Short = 831
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 is DCGM field ID 832.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L7 Short = 832
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 is DCGM field ID 833.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L8 Short = 833
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 is DCGM field ID 834.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L9 Short = 834
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 is DCGM field ID 835.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L10 Short = 835
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 is DCGM field ID 836.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L11 Short = 836
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 is DCGM field ID 837.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L12 Short = 837
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 is DCGM field ID 838.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L13 Short = 838
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 is DCGM field ID 839.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L14 Short = 839
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 is DCGM field ID 840.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L15 Short = 840
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 is DCGM field ID 841.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L16 Short = 841
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 is DCGM field ID 842.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_L17 Short = 842
	// DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL is DCGM field ID 843.
	DCGM_FI_DEV_NVLINK_TX_BANDWIDTH_TOTAL Short = 843
	// DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS is DCGM field ID 856.
	DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS Short = 856
	// DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS is DCGM field ID 857.
	DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS Short = 857
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT is DCGM field ID 858.
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT Short = 858
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN is DCGM field ID 859.
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN Short = 859
	// DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN is DCGM field ID 860.
	DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN Short = 860
	// DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX is DCGM field ID 861.
	DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX Short = 861
	// DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX is DCGM field ID 862.
	DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX Short = 862
	// DCGM_FI_DEV_NVSWITCH_PHYS_ID is DCGM field ID 863.
	DCGM_FI_DEV_NVSWITCH_PHYS_ID Short = 863
	// DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED is DCGM field ID 864.
	DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED Short = 864
	// DCGM_FI_DEV_NVSWITCH_LINK_ID is DCGM field ID 865.
	DCGM_FI_DEV_NVSWITCH_LINK_ID Short = 865
	// DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN is DCGM field ID 866.
	DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN Short = 866
	// DCGM_FI_DEV_NVSWITCH_PCIE_BUS is DCGM field ID 867.
	DCGM_FI_DEV_NVSWITCH_PCIE_BUS Short = 867
	// DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE is DCGM field ID 868.
	DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE Short = 868
	// DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION is DCGM field ID 869.
	DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION Short = 869
	// DCGM_FI_DEV_NVSWITCH_LINK_STATUS is DCGM field ID 870.
	DCGM_FI_DEV_NVSWITCH_LINK_STATUS Short = 870
	// DCGM_FI_DEV_NVSWITCH_LINK_TYPE is DCGM field ID 871.
	DCGM_FI_DEV_NVSWITCH_LINK_TYPE Short = 871
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN is DCGM field ID 872.
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN Short = 872
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS is DCGM field ID 873.
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS Short = 873
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE is DCGM field ID 874.
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE Short = 874
	// DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION is DCGM field ID 875.
	DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION Short = 875
	// DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID is DCGM field ID 876.
	DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID Short = 876
	// DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID is DCGM field ID 877.
	DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID Short = 877
	// DCGM_FI_DEV_NVSWITCH_DEVICE_UUID is DCGM field ID 878.
	DCGM_FI_DEV_NVSWITCH_DEVICE_UUID Short = 878
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 is DCGM field ID 879.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L0 Short = 879
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 is DCGM field ID 880.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L1 Short = 880
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 is DCGM field ID 881.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L2 Short = 881
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 is DCGM field ID 882.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L3 Short = 882
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 is DCGM field ID 883.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L4 Short = 883
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 is DCGM field ID 884.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L5 Short = 884
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 is DCGM field ID 885.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L6 Short = 885
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 is DCGM field ID 886.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L7 Short = 886
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 is DCGM field ID 887.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L8 Short = 887
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 is DCGM field ID 888.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L9 Short = 888
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 is DCGM field ID 889.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L10 Short = 889
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 is DCGM field ID 890.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L11 Short = 890
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 is DCGM field ID 891.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L12 Short = 891
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 is DCGM field ID 892.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L13 Short = 892
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 is DCGM field ID 893.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L14 Short = 893
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 is DCGM field ID 894.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L15 Short = 894
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 is DCGM field ID 895.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L16 Short = 895
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 is DCGM field ID 896.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_L17 Short = 896
	// DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL is DCGM field ID 897.
	DCGM_FI_DEV_NVLINK_RX_BANDWIDTH_TOTAL Short = 897
	// DCGM_FI_LAST_NVSWITCH_FIELD_ID is 899. Presumably it marks the upper
	// bound of the NVSwitch field ID range; 898 is not assigned in this
	// block — confirm against the upstream header.
	DCGM_FI_LAST_NVSWITCH_FIELD_ID Short = 899
	// DCGM_FI_PROF_GR_ENGINE_ACTIVE is DCGM field ID 1001.
	DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = 1001
	// DCGM_FI_PROF_SM_ACTIVE is DCGM field ID 1002.
	DCGM_FI_PROF_SM_ACTIVE Short = 1002
	// DCGM_FI_PROF_SM_OCCUPANCY is DCGM field ID 1003.
	DCGM_FI_PROF_SM_OCCUPANCY Short = 1003
	// DCGM_FI_PROF_PIPE_TENSOR_ACTIVE is DCGM field ID 1004.
	DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = 1004
	// DCGM_FI_PROF_DRAM_ACTIVE is DCGM field ID 1005.
	DCGM_FI_PROF_DRAM_ACTIVE Short = 1005
	// DCGM_FI_PROF_PIPE_FP64_ACTIVE is DCGM field ID 1006.
	DCGM_FI_PROF_PIPE_FP64_ACTIVE Short = 1006
	// DCGM_FI_PROF_PIPE_FP32_ACTIVE is DCGM field ID 1007.
	DCGM_FI_PROF_PIPE_FP32_ACTIVE Short = 1007
	// DCGM_FI_PROF_PIPE_FP16_ACTIVE is DCGM field ID 1008.
	DCGM_FI_PROF_PIPE_FP16_ACTIVE Short = 1008
	// DCGM_FI_PROF_PCIE_TX_BYTES is DCGM field ID 1009.
	DCGM_FI_PROF_PCIE_TX_BYTES Short = 1009
	// DCGM_FI_PROF_PCIE_RX_BYTES is DCGM field ID 1010.
	DCGM_FI_PROF_PCIE_RX_BYTES Short = 1010
	// DCGM_FI_PROF_NVLINK_TX_BYTES is DCGM field ID 1011.
	DCGM_FI_PROF_NVLINK_TX_BYTES Short = 1011
	// DCGM_FI_PROF_NVLINK_RX_BYTES is DCGM field ID 1012.
	DCGM_FI_PROF_NVLINK_RX_BYTES Short = 1012
	// DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE is DCGM field ID 1013.
	DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE Short = 1013
	// DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE is DCGM field ID 1014.
	DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE Short = 1014
	// DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE is DCGM field ID 1015.
	DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE Short = 1015
	// DCGM_FI_PROF_PIPE_INT_ACTIVE is DCGM field ID 1016.
	DCGM_FI_PROF_PIPE_INT_ACTIVE Short = 1016
	// DCGM_FI_PROF_NVDEC0_ACTIVE is DCGM field ID 1017.
	DCGM_FI_PROF_NVDEC0_ACTIVE Short = 1017
	// DCGM_FI_PROF_NVDEC1_ACTIVE is DCGM field ID 1018.
	DCGM_FI_PROF_NVDEC1_ACTIVE Short = 1018
	// DCGM_FI_PROF_NVDEC2_ACTIVE is DCGM field ID 1019.
	DCGM_FI_PROF_NVDEC2_ACTIVE Short = 1019
	// DCGM_FI_PROF_NVDEC3_ACTIVE is DCGM field ID 1020.
	DCGM_FI_PROF_NVDEC3_ACTIVE Short = 1020
	// DCGM_FI_PROF_NVDEC4_ACTIVE is DCGM field ID 1021.
	DCGM_FI_PROF_NVDEC4_ACTIVE Short = 1021
	// DCGM_FI_PROF_NVDEC5_ACTIVE is DCGM field ID 1022.
	DCGM_FI_PROF_NVDEC5_ACTIVE Short = 1022
	// DCGM_FI_PROF_NVDEC6_ACTIVE is DCGM field ID 1023.
	DCGM_FI_PROF_NVDEC6_ACTIVE Short = 1023
	// DCGM_FI_PROF_NVDEC7_ACTIVE is DCGM field ID 1024.
	DCGM_FI_PROF_NVDEC7_ACTIVE Short = 1024
	// DCGM_FI_PROF_NVJPG0_ACTIVE is DCGM field ID 1025.
	DCGM_FI_PROF_NVJPG0_ACTIVE Short = 1025
	// DCGM_FI_PROF_NVJPG1_ACTIVE is DCGM field ID 1026.
	DCGM_FI_PROF_NVJPG1_ACTIVE Short = 1026
	// DCGM_FI_PROF_NVJPG2_ACTIVE is DCGM field ID 1027.
	DCGM_FI_PROF_NVJPG2_ACTIVE Short = 1027
	// DCGM_FI_PROF_NVJPG3_ACTIVE is DCGM field ID 1028.
	DCGM_FI_PROF_NVJPG3_ACTIVE Short = 1028
	// DCGM_FI_PROF_NVJPG4_ACTIVE is DCGM field ID 1029.
	DCGM_FI_PROF_NVJPG4_ACTIVE Short = 1029
	// DCGM_FI_PROF_NVJPG5_ACTIVE is DCGM field ID 1030.
	DCGM_FI_PROF_NVJPG5_ACTIVE Short = 1030
	// DCGM_FI_PROF_NVJPG6_ACTIVE is DCGM field ID 1031.
	DCGM_FI_PROF_NVJPG6_ACTIVE Short = 1031
	// DCGM_FI_PROF_NVJPG7_ACTIVE is DCGM field ID 1032.
	DCGM_FI_PROF_NVJPG7_ACTIVE Short = 1032
	// DCGM_FI_PROF_NVOFA0_ACTIVE is DCGM field ID 1033.
	DCGM_FI_PROF_NVOFA0_ACTIVE Short = 1033
	// DCGM_FI_PROF_NVOFA1_ACTIVE is DCGM field ID 1034.
	DCGM_FI_PROF_NVOFA1_ACTIVE Short = 1034
	// DCGM_FI_PROF_NVLINK_L0_TX_BYTES is DCGM field ID 1040.
	DCGM_FI_PROF_NVLINK_L0_TX_BYTES Short = 1040
	// DCGM_FI_PROF_NVLINK_L0_RX_BYTES is DCGM field ID 1041.
	DCGM_FI_PROF_NVLINK_L0_RX_BYTES Short = 1041
	// DCGM_FI_PROF_NVLINK_L1_TX_BYTES is DCGM field ID 1042.
	DCGM_FI_PROF_NVLINK_L1_TX_BYTES Short = 1042
	// DCGM_FI_PROF_NVLINK_L1_RX_BYTES is DCGM field ID 1043.
	DCGM_FI_PROF_NVLINK_L1_RX_BYTES Short = 1043
	// DCGM_FI_PROF_NVLINK_L2_TX_BYTES is DCGM field ID 1044.
	DCGM_FI_PROF_NVLINK_L2_TX_BYTES Short = 1044
	// DCGM_FI_PROF_NVLINK_L2_RX_BYTES is DCGM field ID 1045.
	DCGM_FI_PROF_NVLINK_L2_RX_BYTES Short = 1045
	// DCGM_FI_PROF_NVLINK_L3_TX_BYTES is DCGM field ID 1046.
	DCGM_FI_PROF_NVLINK_L3_TX_BYTES Short = 1046
	// DCGM_FI_PROF_NVLINK_L3_RX_BYTES is DCGM field ID 1047.
	DCGM_FI_PROF_NVLINK_L3_RX_BYTES Short = 1047
	// DCGM_FI_PROF_NVLINK_L4_TX_BYTES is DCGM field ID 1048.
	DCGM_FI_PROF_NVLINK_L4_TX_BYTES Short = 1048
	// DCGM_FI_PROF_NVLINK_L4_RX_BYTES is DCGM field ID 1049.
	DCGM_FI_PROF_NVLINK_L4_RX_BYTES Short = 1049
	// DCGM_FI_PROF_NVLINK_L5_TX_BYTES is DCGM field ID 1050.
	DCGM_FI_PROF_NVLINK_L5_TX_BYTES Short = 1050
	// DCGM_FI_PROF_NVLINK_L5_RX_BYTES is DCGM field ID 1051.
	DCGM_FI_PROF_NVLINK_L5_RX_BYTES Short = 1051
	// DCGM_FI_PROF_NVLINK_L6_TX_BYTES is DCGM field ID 1052.
	DCGM_FI_PROF_NVLINK_L6_TX_BYTES Short = 1052
	// DCGM_FI_PROF_NVLINK_L6_RX_BYTES is DCGM field ID 1053.
	DCGM_FI_PROF_NVLINK_L6_RX_BYTES Short = 1053
	// DCGM_FI_PROF_NVLINK_L7_TX_BYTES is DCGM field ID 1054.
	DCGM_FI_PROF_NVLINK_L7_TX_BYTES Short = 1054
	// DCGM_FI_PROF_NVLINK_L7_RX_BYTES is DCGM field ID 1055.
	DCGM_FI_PROF_NVLINK_L7_RX_BYTES Short = 1055
	// DCGM_FI_PROF_NVLINK_L8_TX_BYTES is DCGM field ID 1056.
	DCGM_FI_PROF_NVLINK_L8_TX_BYTES Short = 1056
	// DCGM_FI_PROF_NVLINK_L8_RX_BYTES is DCGM field ID 1057.
	DCGM_FI_PROF_NVLINK_L8_RX_BYTES Short = 1057
	// DCGM_FI_PROF_NVLINK_L9_TX_BYTES is DCGM field ID 1058.
	DCGM_FI_PROF_NVLINK_L9_TX_BYTES Short = 1058
	// DCGM_FI_PROF_NVLINK_L9_RX_BYTES is DCGM field ID 1059.
	DCGM_FI_PROF_NVLINK_L9_RX_BYTES Short = 1059
	// DCGM_FI_PROF_NVLINK_L10_TX_BYTES is DCGM field ID 1060.
	DCGM_FI_PROF_NVLINK_L10_TX_BYTES Short = 1060
	// DCGM_FI_PROF_NVLINK_L10_RX_BYTES is DCGM field ID 1061.
	DCGM_FI_PROF_NVLINK_L10_RX_BYTES Short = 1061
	// DCGM_FI_PROF_NVLINK_L11_TX_BYTES is DCGM field ID 1062.
	DCGM_FI_PROF_NVLINK_L11_TX_BYTES Short = 1062
	// DCGM_FI_PROF_NVLINK_L11_RX_BYTES is DCGM field ID 1063.
	DCGM_FI_PROF_NVLINK_L11_RX_BYTES Short = 1063
	// DCGM_FI_PROF_NVLINK_L12_TX_BYTES is DCGM field ID 1064.
	DCGM_FI_PROF_NVLINK_L12_TX_BYTES Short = 1064
	// DCGM_FI_PROF_NVLINK_L12_RX_BYTES is DCGM field ID 1065.
	DCGM_FI_PROF_NVLINK_L12_RX_BYTES Short = 1065
	// DCGM_FI_PROF_NVLINK_L13_TX_BYTES is DCGM field ID 1066.
	DCGM_FI_PROF_NVLINK_L13_TX_BYTES Short = 1066
	// DCGM_FI_PROF_NVLINK_L13_RX_BYTES is DCGM field ID 1067.
	DCGM_FI_PROF_NVLINK_L13_RX_BYTES Short = 1067
	// DCGM_FI_PROF_NVLINK_L14_TX_BYTES is DCGM field ID 1068.
	DCGM_FI_PROF_NVLINK_L14_TX_BYTES Short = 1068
	// DCGM_FI_PROF_NVLINK_L14_RX_BYTES is DCGM field ID 1069.
	DCGM_FI_PROF_NVLINK_L14_RX_BYTES Short = 1069
	// DCGM_FI_PROF_NVLINK_L15_TX_BYTES is DCGM field ID 1070.
	DCGM_FI_PROF_NVLINK_L15_TX_BYTES Short = 1070
	// DCGM_FI_PROF_NVLINK_L15_RX_BYTES is DCGM field ID 1071.
	DCGM_FI_PROF_NVLINK_L15_RX_BYTES Short = 1071
	// DCGM_FI_PROF_NVLINK_L16_TX_BYTES is DCGM field ID 1072.
	DCGM_FI_PROF_NVLINK_L16_TX_BYTES Short = 1072
	// DCGM_FI_PROF_NVLINK_L16_RX_BYTES is DCGM field ID 1073.
	DCGM_FI_PROF_NVLINK_L16_RX_BYTES Short = 1073
	// DCGM_FI_PROF_NVLINK_L17_TX_BYTES is DCGM field ID 1074.
	DCGM_FI_PROF_NVLINK_L17_TX_BYTES Short = 1074
	// DCGM_FI_PROF_NVLINK_L17_RX_BYTES is DCGM field ID 1075.
	DCGM_FI_PROF_NVLINK_L17_RX_BYTES Short = 1075
	// DCGM_FI_PROF_C2C_TX_ALL_BYTES is DCGM field ID 1076.
	DCGM_FI_PROF_C2C_TX_ALL_BYTES Short = 1076
	// DCGM_FI_PROF_C2C_TX_DATA_BYTES is DCGM field ID 1077.
	DCGM_FI_PROF_C2C_TX_DATA_BYTES Short = 1077
	// DCGM_FI_PROF_C2C_RX_ALL_BYTES is DCGM field ID 1078.
	DCGM_FI_PROF_C2C_RX_ALL_BYTES Short = 1078
	// DCGM_FI_PROF_C2C_RX_DATA_BYTES is DCGM field ID 1079.
	DCGM_FI_PROF_C2C_RX_DATA_BYTES Short = 1079
	// DCGM_FI_PROF_HOSTMEM_CACHE_HIT is DCGM field ID 1080.
	DCGM_FI_PROF_HOSTMEM_CACHE_HIT Short = 1080
	// DCGM_FI_PROF_HOSTMEM_CACHE_MISS is DCGM field ID 1081.
	DCGM_FI_PROF_HOSTMEM_CACHE_MISS Short = 1081
	// DCGM_FI_PROF_PEERMEM_CACHE_HIT is DCGM field ID 1082.
	DCGM_FI_PROF_PEERMEM_CACHE_HIT Short = 1082
	// DCGM_FI_PROF_PEERMEM_CACHE_MISS is DCGM field ID 1083.
	DCGM_FI_PROF_PEERMEM_CACHE_MISS Short = 1083
	// DCGM_FI_DEV_CPU_UTIL_TOTAL is DCGM field ID 1100.
	DCGM_FI_DEV_CPU_UTIL_TOTAL Short = 1100
	// DCGM_FI_DEV_CPU_UTIL_USER is DCGM field ID 1101.
	DCGM_FI_DEV_CPU_UTIL_USER Short = 1101
	// DCGM_FI_DEV_CPU_UTIL_NICE is DCGM field ID 1102.
	DCGM_FI_DEV_CPU_UTIL_NICE Short = 1102
	// DCGM_FI_DEV_CPU_UTIL_SYS is DCGM field ID 1103.
	DCGM_FI_DEV_CPU_UTIL_SYS Short = 1103
	// DCGM_FI_DEV_CPU_UTIL_IRQ is DCGM field ID 1104.
	DCGM_FI_DEV_CPU_UTIL_IRQ Short = 1104
	// DCGM_FI_DEV_CPU_TEMP_CURRENT is DCGM field ID 1110.
	DCGM_FI_DEV_CPU_TEMP_CURRENT Short = 1110
	// DCGM_FI_DEV_CPU_TEMP_WARNING is DCGM field ID 1111.
	DCGM_FI_DEV_CPU_TEMP_WARNING Short = 1111
	// DCGM_FI_DEV_CPU_TEMP_CRITICAL is DCGM field ID 1112.
	DCGM_FI_DEV_CPU_TEMP_CRITICAL Short = 1112
	// DCGM_FI_DEV_CPU_CLOCK_CURRENT is DCGM field ID 1120.
	DCGM_FI_DEV_CPU_CLOCK_CURRENT Short = 1120
	// DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT is DCGM field ID 1130.
	DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT Short = 1130
	// DCGM_FI_DEV_CPU_POWER_LIMIT is DCGM field ID 1131.
	DCGM_FI_DEV_CPU_POWER_LIMIT Short = 1131
	// DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT is DCGM field ID 1132.
	DCGM_FI_DEV_SYSIO_POWER_UTIL_CURRENT Short = 1132
	// DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT is DCGM field ID 1133.
	DCGM_FI_DEV_MODULE_POWER_UTIL_CURRENT Short = 1133
	// DCGM_FI_DEV_CPU_VENDOR is DCGM field ID 1140.
	DCGM_FI_DEV_CPU_VENDOR Short = 1140
	// DCGM_FI_DEV_CPU_MODEL is DCGM field ID 1141.
	DCGM_FI_DEV_CPU_MODEL Short = 1141
	// DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS is DCGM field ID 1200.
	DCGM_FI_DEV_NVLINK_COUNT_TX_PACKETS Short = 1200
	// DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES is DCGM field ID 1201.
	DCGM_FI_DEV_NVLINK_COUNT_TX_BYTES Short = 1201
	// DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS is DCGM field ID 1202.
	DCGM_FI_DEV_NVLINK_COUNT_RX_PACKETS Short = 1202
	// DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES is DCGM field ID 1203.
	DCGM_FI_DEV_NVLINK_COUNT_RX_BYTES Short = 1203
	// DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS is DCGM field ID 1204.
	DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS Short = 1204
	// DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS is DCGM field ID 1205.
	DCGM_FI_DEV_NVLINK_COUNT_RX_BUFFER_OVERRUN_ERRORS Short = 1205
	// DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS is DCGM field ID 1206.
	DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS Short = 1206
	// DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS is DCGM field ID 1207.
	DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS Short = 1207
	// DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS is DCGM field ID 1208.
	DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS Short = 1208
	// DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS is DCGM field ID 1209.
	DCGM_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS Short = 1209
	// DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS is DCGM field ID 1210.
	DCGM_FI_DEV_NVLINK_COUNT_TX_DISCARDS Short = 1210
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS is DCGM field ID 1211.
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS Short = 1211
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS is DCGM field ID 1212.
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS Short = 1212
	// DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS is DCGM field ID 1213.
	DCGM_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS Short = 1213
	// DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS is DCGM field ID 1214.
	DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS Short = 1214
	// DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER is DCGM field ID 1215.
	DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER Short = 1215
	// DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT is DCGM field ID 1216.
	DCGM_FI_DEV_NVLINK_COUNT_SYMBOL_BER_FLOAT Short = 1216
	// DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER is DCGM field ID 1217.
	DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER Short = 1217
	// DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT is DCGM field ID 1218.
	DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER_FLOAT Short = 1218
	// DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS is DCGM field ID 1219.
	DCGM_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS Short = 1219
	// DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL is DCGM field ID 1220.
	DCGM_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL Short = 1220
	// DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID is 1300, the same value as
	// DCGM_FI_DEV_CONNECTX_HEALTH. Presumably it marks the lower bound of
	// the ConnectX field ID range — confirm against the upstream header.
	DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID Short = 1300
	// DCGM_FI_DEV_CONNECTX_HEALTH is DCGM field ID 1300.
	DCGM_FI_DEV_CONNECTX_HEALTH Short = 1300
	// DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH is DCGM field ID 1301.
	DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_WIDTH Short = 1301
	// DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED is DCGM field ID 1302.
	DCGM_FI_DEV_CONNECTX_ACTIVE_PCIE_LINK_SPEED Short = 1302
	// DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH is DCGM field ID 1303.
	DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_WIDTH Short = 1303
	// DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED is DCGM field ID 1304.
	DCGM_FI_DEV_CONNECTX_EXPECT_PCIE_LINK_SPEED Short = 1304
	// DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS is DCGM field ID 1305.
	DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_STATUS Short = 1305
	// DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK is DCGM field ID 1306.
	DCGM_FI_DEV_CONNECTX_CORRECTABLE_ERR_MASK Short = 1306
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS is DCGM field ID 1307.
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_STATUS Short = 1307
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK is DCGM field ID 1308.
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_MASK Short = 1308
	// DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY is DCGM field ID 1309.
	DCGM_FI_DEV_CONNECTX_UNCORRECTABLE_ERR_SEVERITY Short = 1309
	// DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE is DCGM field ID 1310.
	DCGM_FI_DEV_CONNECTX_DEVICE_TEMPERATURE Short = 1310
	// DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID is 1399. Presumably it marks the
	// upper bound of the range opened by
	// DCGM_FI_DEV_FIRST_CONNECTX_FIELD_ID; no constant in this block is
	// assigned a value between 1311 and 1398 — confirm.
	DCGM_FI_DEV_LAST_CONNECTX_FIELD_ID Short = 1399
	// DCGM_FI_DEV_C2C_LINK_ERROR_INTR is DCGM field ID 1400.
	DCGM_FI_DEV_C2C_LINK_ERROR_INTR Short = 1400
	// DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY is DCGM field ID 1401.
	DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY Short = 1401
	// DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B is DCGM field ID 1402.
	DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY_B2B Short = 1402
	// DCGM_FI_DEV_C2C_LINK_POWER_STATE is DCGM field ID 1403.
	DCGM_FI_DEV_C2C_LINK_POWER_STATE Short = 1403
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 is DCGM field ID 1404.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 Short = 1404
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 is DCGM field ID 1405.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 Short = 1405
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 is DCGM field ID 1406.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 Short = 1406
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 is DCGM field ID 1407.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 Short = 1407
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 is DCGM field ID 1408.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 Short = 1408
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 is DCGM field ID 1409.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 Short = 1409
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 is DCGM field ID 1410.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 Short = 1410
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 is DCGM field ID 1411.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 Short = 1411
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 is DCGM field ID 1412.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 Short = 1412
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 is DCGM field ID 1413.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 Short = 1413
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 is DCGM field ID 1414.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 Short = 1414
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 is DCGM field ID 1415.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 Short = 1415
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 is DCGM field ID 1416.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 Short = 1416
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 is DCGM field ID 1417.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 Short = 1417
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 is DCGM field ID 1418.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 Short = 1418
	// DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 is DCGM field ID 1419.
	DCGM_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 Short = 1419
	// DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_POWER_CAP_NS Short = 1420
	// DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASON_SYNC_BOOST_NS Short = 1421
	// DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASON_SW_THERM_SLOWDOWN_NS Short = 1422
	// DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_THERM_SLOWDOWN_NS Short = 1423
	// DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS represents /
	DCGM_FI_DEV_CLOCKS_EVENT_REASON_HW_POWER_BRAKE_SLOWDOWN_NS Short = 1424
	// DCGM_FI_DEV_PWR_SMOOTHING_ENABLED represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ENABLED Short = 1425
	// DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL represents /
	DCGM_FI_DEV_PWR_SMOOTHING_PRIV_LVL Short = 1426
	// DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED represents /
	DCGM_FI_DEV_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED Short = 1427
	// DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL represents /
	DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_CEIL Short = 1428
	// DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR represents /
	DCGM_FI_DEV_PWR_SMOOTHING_APPLIED_TMP_FLOOR Short = 1429
	// DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING represents /
	DCGM_FI_DEV_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING Short = 1430
	// DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING represents /
	DCGM_FI_DEV_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING Short = 1431
	// DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING represents /
	DCGM_FI_DEV_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING Short = 1432
	// DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES represents /
	DCGM_FI_DEV_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES Short = 1433
	// DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR represents /
	DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR Short = 1434
	// DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE represents /
	DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE Short = 1435
	// DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE represents /
	DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE Short = 1436
	// DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL represents /
	DCGM_FI_DEV_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL Short = 1437
	// DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE Short = 1438
	// DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR Short = 1439
	// DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE Short = 1440
	// DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE Short = 1441
	// DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL represents /
	DCGM_FI_DEV_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL Short = 1442
	// DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS represents /
	DCGM_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS Short = 1501
	// DCGM_FI_IMEX_DOMAIN_STATUS represents /
	DCGM_FI_IMEX_DOMAIN_STATUS Short = 1502
	// DCGM_FI_IMEX_DAEMON_STATUS represents /
	DCGM_FI_IMEX_DAEMON_STATUS Short = 1503
	// DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG represents /
	DCGM_FI_DEV_MEMORY_UNREPAIRABLE_FLAG Short = 1507
	// DCGM_FI_DEV_NVLINK_GET_STATE represents /
	DCGM_FI_DEV_NVLINK_GET_STATE Short = 1508
	// DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT represents /
	DCGM_FI_DEV_NVLINK_PPCNT_IBPC_PORT_XMIT_WAIT Short = 1509
	// DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION represents /
	DCGM_FI_DEV_GET_GPU_RECOVERY_ACTION Short = 1523
)

func GetFieldID

func GetFieldID(fieldName string) (Short, bool)

GetFieldID returns the DCGM field ID for a given field name and whether it was found. It first checks the current field IDs, then falls back to legacy field IDs if not found.

func GetFieldIDOrPanic

func GetFieldIDOrPanic(fieldName string) Short

GetFieldIDOrPanic returns the DCGM field ID for a given field name. It panics if the field name is not found in either the current or the legacy maps.

type Status

// Status represents the current resource utilization of the DCGM hostengine
// process. It is the value returned by Introspect.
type Status struct {
	// Memory represents the current memory usage of the DCGM hostengine in kilobytes.
	Memory int64
	// CPU represents the current CPU utilization of the DCGM hostengine as a percentage (0-100).
	CPU float64
}

Status represents the current resource utilization of the DCGM hostengine process

func Introspect

func Introspect() (Status, error)

Introspect returns memory and CPU usage statistics for the DCGM hostengine

type SystemWatch

// SystemWatch represents a health watch system and its status.
type SystemWatch struct {
	// Type identifies the type of health watch system.
	Type string
	// Status indicates the current health status.
	Status string
	// Error contains any error message if status is not healthy.
	Error string
}

SystemWatch represents a health watch system and its status

type ThermalPolicyCondition

// ThermalPolicyCondition contains details about a thermal violation.
type ThermalPolicyCondition struct {
	// ThermalViolation indicates the severity of the thermal violation.
	// NOTE(review): the unit/scale of this value is not visible here — confirm
	// against the DCGM policy documentation.
	ThermalViolation uint
}

ThermalPolicyCondition contains details about a thermal violation

type Time

// Time represents a Unix timestamp in seconds.
// Its String method renders a zero value as "Running" and any other value as
// a formatted time.
type Time uint64

Time represents a Unix timestamp in seconds

func (Time) String

func (t Time) String() string

String returns a human-readable string representation of the timestamp. Returns "Running" if the timestamp is 0, otherwise returns the formatted time.

type UtilizationInfo

// UtilizationInfo contains GPU utilization metrics. Every field is a percentage.
type UtilizationInfo struct {
	GPU     int64 // GPU utilization, in percent
	Memory  int64 // memory utilization, in percent
	Encoder int64 // encoder utilization, in percent
	Decoder int64 // decoder utilization, in percent
}

UtilizationInfo contains GPU utilization metrics

type ViolationTime

// ViolationTime measures the amount of time (in ms) the GPU was at reduced
// clocks, broken down by the cause of the throttling.
//
// NOTE(review): every field is a pointer; presumably nil means the value was
// not available for the device — confirm against the code that populates it.
type ViolationTime struct {
	// Power is time spent throttling due to power constraints.
	Power *uint64
	// Thermal is time spent throttling due to thermal constraints.
	Thermal *uint64
	// Reliability is time spent throttling due to reliability constraints.
	Reliability *uint64
	// BoardLimit is time spent throttling due to board limit constraints.
	BoardLimit *uint64
	// LowUtilization is time spent throttling due to low utilization.
	LowUtilization *uint64
	// SyncBoost is time spent throttling due to sync boost.
	SyncBoost *uint64
}

ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks

type XIDErrorInfo

// XIDErrorInfo contains information about XID errors.
type XIDErrorInfo struct {
	// NumErrors is the number of XID errors that occurred.
	NumErrors int
	// Timestamp contains the timestamps of when XID errors occurred.
	// NOTE(review): the timestamp unit and whether len(Timestamp) always
	// equals NumErrors are not visible here — confirm with the producer.
	Timestamp []uint64
}

XIDErrorInfo contains information about XID errors

type XidPolicyCondition

// XidPolicyCondition contains details about an XID error.
type XidPolicyCondition struct {
	// ErrNum is the XID error number.
	ErrNum uint
}

XidPolicyCondition contains details about an XID error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL