Documentation
¶
Overview ¶
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.
Index ¶
- Constants
- func GetTaskRestartReason(reasonList []FaultReasonList) string
- func NewHandler() plugin.FaultHandler
- type DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
- func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
- type FaultCard
- type FaultDeviceList
- type FaultJob
- func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- func (fJob *FaultJob) IsNormalJobNeedRestart() bool
- func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
- type FaultNode
- type FaultNodeInfoToCm
- type FaultReasonList
- type FaultTask
- type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode) error
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo
- func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error
- func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
- func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
- type RemainRetryTimes
- type RescheduleReason
- type RescheduleRecord
- type RescheduleTaskReason
- type SimpleFNodeInfo
Constants ¶
const ( // RePropertyName name specifying re-scheduler cm RePropertyName = "re-scheduling" // ReschedulingReasonKey is used to record the reason of rescheduling ReschedulingReasonKey = "rescheduling-reason" // CmName Name of ReSchedulerConfigmap CmName = "vcjob-fault-npu-cm" // CmNameSpace Namespace of ReSchedulerConfigmap CmNameSpace = "volcano-system" // RescheduleReasonCmName Name of RescheduleReasonConfigmap RescheduleReasonCmName = "job-reschedule-reason" // RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap RescheduleReasonCmNamespace = "mindx-dl" // JobRescheduleLabelKey key word of re-scheduling configuration JobRescheduleLabelKey = "fault-scheduling" // JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration JobGraceRescheduleLabelValue = "grace" // JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration JobForceRescheduleLabelValue = "force" // JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration JobOffRescheduleLabelValue = "off" // GraceOverTimeKey for GraceOverTime config by user GraceOverTimeKey = "grace-over-time" // ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling ElasticSchedulingKey = "elastic-scheduling" // JobOnElasticScheduling job enabled with elastic scheduling JobOnElasticScheduling = "on" // JobOffElasticScheduling job not enabled with elastic scheduling JobOffElasticScheduling = "off" // CmFaultNodeKind key in configmap which saves the FaultNode cache CmFaultNodeKind = "fault-node" // CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache CmFaultJob910bx2Kind = "fault-job-910bx2" // CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache CmFaultJob910x8Kind = "fault-job-910x8" // CmJobRemainRetryTimes key in configmap which saves remain retry times of job CmJobRemainRetryTimes = "remain-retry-times" // 
MaxRescheduleRecordsNum the upper limit of the cm kept reschedule records, oldest record will be deleted // if record more than MaxRescheduleRecordsNum records MaxRescheduleRecordsNum = 10 // MaxKbOfRescheduleRecords the upper limit words of the cm kept reschedule records MaxKbOfRescheduleRecords = 950 * 1024 // CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling CmJobRescheduleReasonsKey = "recent-reschedule-records" // CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence CmNodeRankTimeMapKind = "node-rankIndex-Occurrence" // CmCheckCode Check code key CmCheckCode = "checkCode" // CmFaultJob key in configmap which saves the FaultJob cache CmFaultJob = "fault-job" // DefaultGraceOverTime time interval for grace delete DefaultGraceOverTime = 900 // PublicFaultType represents a PublicFault fault type PublicFaultType = "PublicFault" // CardHealthy represents a healthy card CardHealthy = "Healthy" // CardUnhealthy represents an unhealthy card CardUnhealthy = "Unhealthy" // CardNetworkUnhealthy represents a network unhealthy card CardNetworkUnhealthy = "NetworkUnhealthy" // NodeHealthy represents node is available for scheduling NodeHealthy = "Healthy" // NodeUnhealthy represents node is unhealthy NodeUnhealthy = "NodeUnhealthy" // NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy NodeCardUnhealthy = "CardUnhealthy" // NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy NodeCardNetworkUnhealthy = "CardNetworkUnhealthy" // NoFaultJobsErr none fault jobs NoFaultJobsErr = "none fault jobs to be restarted in cache" // JobRecovery Name of cm for recovery JobRecovery = "job-recovery" // DeviceFaultCmKey the key of DeviceFault info DeviceFaultCmKey = "huawei.com/Ascend910-Fault" // PodFailed the state of failed pod PodFailed = "pod-failed" // PodHealthy the state of healthy pod PodHealthy = "pod-healthy" // FaultRetryTimesKey key of 
fault-retry-times label FaultRetryTimesKey = "fault-retry-times" )
const ( // PreSeparateNPU fault type waiting user check PreSeparateNPU = "PreSeparateNPU" // NotHandleFault fault type not handle NotHandleFault = "NotHandleFault" // NodeFaultCode fault type nodeUnhealthy NodeFaultCode = "heartbeatTimeOut" // SubHealthFault subHealth code SubHealthFault = "SubHealthFault" )
const (
// SuperPodAnnoKey annotation key of super pod
SuperPodAnnoKey = "sp-block"
)
Variables ¶
This section is empty.
Functions ¶
func GetTaskRestartReason ¶
func GetTaskRestartReason(reasonList []FaultReasonList) string
GetTaskRestartReason converts the given fault reason list to a JSON string.
Types ¶
type DealReSchedulerCache ¶
type DealReSchedulerCache struct {
FaultNodes map[string]*FaultNode
FaultJobs map[api.JobID]*FaultJob
JobRemainRetryTimes map[api.JobID]*RemainRetryTimes
JobRecentRescheduleRecords map[api.JobID]*RescheduleReason
}
DealReSchedulerCache object with method for re-scheduler cache
func GetReSchedulerCache ¶
func GetReSchedulerCache() *DealReSchedulerCache
GetReSchedulerCache return reschedule cache
func (DealReSchedulerCache) GetRealFaultNodes ¶
func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
GetRealFaultNodes returns the nodes whose IsFaultNode property is true.
func (*DealReSchedulerCache) SetJobRecentRescheduleRecords ¶
func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
SetJobRecentRescheduleRecords gets the already recorded rescheduling records from the configmap and caches them.
func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶
func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap
type FaultDeviceList ¶
type FaultDeviceList struct {
FaultType string `json:"fault_type"`
NPUName string `json:"npu_name"`
FaultLevel string `json:"fault_level"`
FaultHandling string `json:"fault_handling"`
LargeModelFaultLevel string `json:"large_model_fault_level"`
FaultCode string `json:"fault_code"`
}
FaultDeviceList is the fault reason of card
func GetNodeDeviceFaultFromDeviceInfo ¶
func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)
GetNodeDeviceFaultFromDeviceInfo get device fault from device info
type FaultJob ¶
type FaultJob struct {
ReScheduleKey string // values taken off/grace/force
RescheduleTime int64
SubHealthyStrategy string
IsSubHealthFault bool
PendingSessionNum int
IsFaultJob bool
JobName string
JobUID api.JobID
JobNamespace string
SuperPods map[string][]plugin.SuperNode
FaultTasks []FaultTask
UpdateTime int64
FaultTypes []string
DeleteExecutedFlag bool
ElasticScheduling string
ReferenceName string
FaultRetryTimes int
UUID types.UID
// contains filtered or unexported fields
}
FaultJob job object for re-scheduling
func (*FaultJob) ForceDeleteJob ¶
func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
ForceDeleteJob force-deletes jobs, including those labelled for force deletion and those whose grace deletion failed.
func (*FaultJob) GetJobElasticSchedulingLabel ¶
func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
GetJobElasticSchedulingLabel get job's elastic scheduling label
func (*FaultJob) GetJobFaultRescheduleLabel ¶
func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
GetJobFaultRescheduleLabel Get job's fault reschedule label.
func (*FaultJob) GraceDeleteJob ¶
func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
GraceDeleteJob grace delete jobs labelled to be deleted gracefully
func (*FaultJob) IsJobSingleRescheduling ¶
func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
IsJobSingleRescheduling valid job.
func (*FaultJob) IsNormalJobNeedRestart ¶
IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or has a software fault.
func (*FaultJob) IsProcessReschedulingJob ¶
func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
IsProcessReschedulingJob valid job.
type FaultNode ¶
type FaultNode struct {
SuperPodID int32
NodeName string
NPUName string
FaultDeviceList []FaultDeviceList
UpdateTime int64
UnhealthyNPU []string
NetworkUnhealthyNPU []string
IsFaultNode bool
NodeDEnable bool
NodeHealthState string
FaultCards []FaultCard
HasSwitchSubHealthFault bool
HasCardSubHealthFault bool
}
FaultNode node object for re-scheduling
type FaultNodeInfoToCm ¶
type FaultNodeInfoToCm struct {
FaultDeviceList []FaultDeviceList
NodeName string
UnhealthyNPU []string
NetworkUnhealthyNPU []string
NodeDEnable bool
NodeHealthState string
UpdateTime int64
}
FaultNodeInfoToCm fault node info to cm
type FaultReasonList ¶
type FaultReasonList struct {
NodeName string `json:"node_name"`
TaskName string `json:"task_name"`
FaultRankList []string
FaultDeviceList
}
FaultReasonList node Fault Device List
type FaultTask ¶
type FaultTask struct {
Reason []FaultReasonList
RelationFault string
IsFaultTask bool
IsFaultRetryEnable bool
HasSubHealthFault bool
IsSoftwareFault bool
TaskUID api.TaskID
TaskName string
TaskNamespace string
NodeName string
NodeRankIndex string
UseCardName []string
PodCreateTime int64
// contains filtered or unexported fields
}
FaultTask object dealing with node for rescheduling
func (*FaultTask) DeleteRealPodByTask ¶
func (fTask *FaultTask) DeleteRealPodByTask(kubeClient kubernetes.Interface, waitTime int64) error
DeleteRealPodByTask delete pod from kubernetes of tasks
type ReScheduler ¶
type ReScheduler struct {
*DealReSchedulerCache
GraceDeleteTime int64
Jobs map[api.JobID]plugin.SchedulerJob
Nodes map[string]plugin.NPUNode
// contains filtered or unexported fields
}
ReScheduler object for re-scheduling
func (*ReScheduler) AddFaultJobWithSession ¶
func (reScheduler *ReScheduler) AddFaultJobWithSession( jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects
func (*ReScheduler) AddFaultNodeWithSession ¶
func (reScheduler *ReScheduler) AddFaultNodeWithSession()
AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache
func (*ReScheduler) CheckNodeNPUByTask ¶
CheckNodeNPUByTask is used in the predicate process of task and node.
func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶
func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs( schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
GetNeedForceDeleteDelayingNPUJobs gets the fault jobs that carry the grace label but have not been evicted successfully.
func (*ReScheduler) GetRunningJobs ¶
GetRunningJobs gets all the running jobs of <UseCardName> type.
func (*ReScheduler) PreStartAction ¶
func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error
PreStartAction pre-processing actions for rescheduler handler
func (*ReScheduler) PreStopAction ¶
func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
PreStopAction post-processing actions for re-scheduling
func (*ReScheduler) RestartFaultJobs ¶
func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartFaultJobs restarts fault jobs according to their corresponding strategy: grace, force, or off.
func (*ReScheduler) RestartNeedForceDeleteJobs ¶
func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartNeedForceDeleteJobs Restart jobs that need to be force deleted
func (*ReScheduler) ScoreBestNPUNodes ¶
func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks
func (*ReScheduler) SyncJobRecentRescheduleReason ¶
func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
SyncJobRecentRescheduleReason sync recent reschedule records with ssn, to ensure cache is new and sync
func (*ReScheduler) SyncJobRemainRetryTimes ¶
func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session
type RemainRetryTimes ¶
RemainRetryTimes remained retry times
type RescheduleReason ¶
type RescheduleReason struct {
// JobID the job id of this record
JobID api.JobID
// TotalRescheduleTimes to show how many times reschedule has happened since job created
TotalRescheduleTimes int
// RescheduleRecords keep recent MaxRescheduleRecordsNum records of rescheduling
RescheduleRecords []RescheduleRecord
// AdditionalInfo is used to provide additional information, e.g. that some records were reduced because of length concerns
AdditionalInfo string `json:",omitempty"`
}
RescheduleReason shows the reason of this job rescheduling
type RescheduleRecord ¶
type RescheduleRecord struct {
// LogFileFormatTime is the formatted time, to make it convenient to read and locate logs
LogFileFormatTime string
// RescheduleTimeStamp time.now.unix() indicates when the rescheduling happened
RescheduleTimeStamp int64
// ReasonOfTask record the reason of this rescheduling of task
ReasonOfTask []RescheduleTaskReason
}
RescheduleRecord records one rescheduling of a job.
type RescheduleTaskReason ¶
type RescheduleTaskReason struct {
// RescheduleReason the fault type of this rescheduling
RescheduleReason string
// PodName the fault task caused this rescheduling
PodName string
// NodeName the fault node caused this rescheduling
NodeName string
// NodeRankIndex the rank index of the fault task
NodeRankIndex string
}
RescheduleTaskReason record the reason of this rescheduling of task