 Documentation
      ¶
      Documentation
      ¶
    
    
  
    
  
    Overview ¶
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin fault rescheduling.
Package rescheduling is used for Huawei Ascend pin affinity schedule utilities.
Index ¶
- Constants
- func GetTaskRestartReason(reasonList []FaultReasonList) string
- func NewHandler() plugin.FaultHandler
- type DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
- func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
 
- type FaultCard
- type FaultDeviceList
- type FaultJob
- func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- func (fJob *FaultJob) IsNormalJobNeedRestart() bool
- func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
 
- type FaultNode
- type FaultNodeInfoToCm
- type FaultReasonList
- type FaultTask
- type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode) error
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo
- func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error
- func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
- func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
 
- type RemainRetryTimes
- type RescheduleReason
- type RescheduleRecord
- type RescheduleTaskReason
- type SimpleFNodeInfo
Constants ¶
// Names, keys, label values and state strings used by the re-scheduling
// plugin and its configmaps.
const (
	// RePropertyName name specifying re-scheduler cm
	RePropertyName = "re-scheduling"
	// ReschedulingReasonKey is used to record the reason of rescheduling
	ReschedulingReasonKey = "rescheduling-reason"
	// CmName Name of ReSchedulerConfigmap
	CmName = "vcjob-fault-npu-cm"
	// CmNameSpace Namespace of ReSchedulerConfigmap
	CmNameSpace = "volcano-system"
	// RescheduleReasonCmName Name of RescheduleReasonConfigmap
	RescheduleReasonCmName = "job-reschedule-reason"
	// RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap
	RescheduleReasonCmNamespace = "mindx-dl"
	// JobRescheduleLabelKey key word of re-scheduling configuration
	JobRescheduleLabelKey = "fault-scheduling"
	// JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration
	JobGraceRescheduleLabelValue = "grace"
	// JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration
	JobForceRescheduleLabelValue = "force"
	// JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration
	JobOffRescheduleLabelValue = "off"
	// GraceOverTimeKey for GraceOverTime config by user
	GraceOverTimeKey = "grace-over-time"
	// ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling
	ElasticSchedulingKey = "elastic-scheduling"
	// JobOnElasticScheduling job enabled with elastic scheduling
	JobOnElasticScheduling = "on"
	// JobOffElasticScheduling job not enabled with elastic scheduling
	JobOffElasticScheduling = "off"
	// CmFaultNodeKind key in configmap which saves the FaultNode cache
	CmFaultNodeKind = "fault-node"
	// CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache
	CmFaultJob910bx2Kind = "fault-job-910bx2"
	// CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache
	CmFaultJob910x8Kind = "fault-job-910x8"
	// CmJobRemainRetryTimes key in configmap which saves remain retry times of job
	CmJobRemainRetryTimes = "remain-retry-times"
	// MaxRescheduleRecordsNum the upper limit of the cm kept reschedule records, oldest record will be deleted
	// if record more than MaxRescheduleRecordsNum records
	MaxRescheduleRecordsNum = 10
	// MaxKbOfRescheduleRecords the upper limit words of the cm kept reschedule records
	MaxKbOfRescheduleRecords = 950 * 1024
	// CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling
	CmJobRescheduleReasonsKey = "recent-reschedule-records"
	// CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence
	CmNodeRankTimeMapKind = "node-rankIndex-Occurrence"
	// CmCheckCode Check code key
	CmCheckCode = "checkCode"
	// CmFaultJob key in configmap which saves the FaultJob cache
	CmFaultJob = "fault-job"
	// DefaultGraceOverTime time interval for grace delete
	DefaultGraceOverTime = 900
	// PublicFaultType represents a PublicFault fault type
	PublicFaultType = "PublicFault"
	// CardHealthy represents a healthy card
	CardHealthy = "Healthy"
	// CardUnhealthy represents an unhealthy card
	CardUnhealthy = "Unhealthy"
	// CardNetworkUnhealthy represents a network unhealthy card
	CardNetworkUnhealthy = "NetworkUnhealthy"
	// NodeHealthy represents node is available for scheduling
	NodeHealthy = "Healthy"
	// NodeUnhealthy represents node is unhealthy
	NodeUnhealthy = "NodeUnhealthy"
	// NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy
	NodeCardUnhealthy = "CardUnhealthy"
	// NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy
	NodeCardNetworkUnhealthy = "CardNetworkUnhealthy"
	// NoFaultJobsErr none fault jobs
	NoFaultJobsErr = "none fault jobs to be restarted in cache"
	// JobRecovery Name of cm for recovery
	JobRecovery = "job-recovery"
	// DeviceFaultCmKey the key of DeviceFault info
	DeviceFaultCmKey = "huawei.com/Ascend910-Fault"
	// PodFailed the state of failed pod
	PodFailed = "pod-failed"
	// PodHealthy the state of healthy pod
	PodHealthy = "pod-healthy"
	// FaultRetryTimesKey key of fault-retry-times label
	FaultRetryTimesKey = "fault-retry-times"
)
// Fault type and fault code identifiers.
const (
	// PreSeparateNPU fault type waiting user check
	PreSeparateNPU = "PreSeparateNPU"
	// NotHandleFault fault type not handle
	NotHandleFault = "NotHandleFault"
	// NodeFaultCode fault type nodeUnhealthy
	NodeFaultCode = "heartbeatTimeOut"
	// SubHealthFault subHealth code
	SubHealthFault = "SubHealthFault"
)
const (
	// SuperPodAnnoKey is the annotation key of a super pod; its value is "sp-block".
	SuperPodAnnoKey = "sp-block"
)
    Variables ¶
This section is empty.
Functions ¶
func GetTaskRestartReason ¶
func GetTaskRestartReason(reasonList []FaultReasonList) string
GetTaskRestartReason converts the given fault reason list to a JSON string.
Types ¶
type DealReSchedulerCache ¶
// DealReSchedulerCache is the re-scheduler cache object.
//
// FaultNodes is keyed by string (presumably the node name - confirm against
// the code that fills it). FaultJobs, JobRemainRetryTimes and
// JobRecentRescheduleRecords are all keyed by api.JobID.
type DealReSchedulerCache struct {
	FaultNodes                 map[string]*FaultNode
	FaultJobs                  map[api.JobID]*FaultJob
	JobRemainRetryTimes        map[api.JobID]*RemainRetryTimes
	JobRecentRescheduleRecords map[api.JobID]*RescheduleReason
}
    DealReSchedulerCache object with method for re-scheduler cache
func GetReSchedulerCache ¶
func GetReSchedulerCache() *DealReSchedulerCache
GetReSchedulerCache return reschedule cache
func (DealReSchedulerCache) GetRealFaultNodes ¶
func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
GetRealFaultNodes returns the nodes whose IsFaultNode field is true.
func (*DealReSchedulerCache) SetJobRecentRescheduleRecords ¶
func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
SetJobRecentRescheduleRecords gets the already recorded rescheduling records from the cm and caches them
func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶
func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap
type FaultDeviceList ¶
// FaultDeviceList is the fault reason of a card.
//
// Every field is a string and carries a snake_case JSON tag
// (fault_type, npu_name, fault_level, fault_handling,
// large_model_fault_level, fault_code).
type FaultDeviceList struct {
	FaultType            string `json:"fault_type"`
	NPUName              string `json:"npu_name"`
	FaultLevel           string `json:"fault_level"`
	FaultHandling        string `json:"fault_handling"`
	LargeModelFaultLevel string `json:"large_model_fault_level"`
	FaultCode            string `json:"fault_code"`
}
    FaultDeviceList is the fault reason of card
func GetNodeDeviceFaultFromDeviceInfo ¶
func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)
GetNodeDeviceFaultFromDeviceInfo get device fault from device info
type FaultJob ¶
// FaultJob is the job object for re-scheduling.
//
// ReScheduleKey holds one of off/grace/force. SuperPods maps a string key to
// a list of plugin.SuperNode, and FaultTasks holds one FaultTask per entry.
// The struct also has filtered or unexported fields that are not shown here.
type FaultJob struct {
	ReScheduleKey      string // one of: off, grace, force
	RescheduleTime     int64
	SubHealthyStrategy string
	IsSubHealthFault   bool
	PendingSessionNum  int
	IsFaultJob         bool
	JobName            string
	JobUID             api.JobID
	JobNamespace       string
	SuperPods          map[string][]plugin.SuperNode
	FaultTasks         []FaultTask
	UpdateTime         int64
	FaultTypes         []string
	DeleteExecutedFlag bool
	ElasticScheduling  string
	ReferenceName      string
	FaultRetryTimes    int
	UUID types.UID
	// contains filtered or unexported fields
}
    FaultJob job object for re-scheduling
func (*FaultJob) ForceDeleteJob ¶
func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
ForceDeleteJob force-deletes jobs, including those labelled for force deletion and those whose grace deletion failed
func (*FaultJob) GetJobElasticSchedulingLabel ¶
func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
GetJobElasticSchedulingLabel get job's elastic scheduling label
func (*FaultJob) GetJobFaultRescheduleLabel ¶
func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
GetJobFaultRescheduleLabel Get job's fault reschedule label.
func (*FaultJob) GraceDeleteJob ¶
func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
GraceDeleteJob grace delete jobs labelled to be deleted gracefully
func (*FaultJob) IsJobSingleRescheduling ¶
func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
IsJobSingleRescheduling valid job.
func (*FaultJob) IsNormalJobNeedRestart ¶
IsNormalJobNeedRestart reports whether the job has the key of PreSeparateNPU or has a software fault
func (*FaultJob) IsProcessReschedulingJob ¶
func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
IsProcessReschedulingJob valid job.
type FaultNode ¶
// FaultNode is the node object for re-scheduling.
//
// IsFaultNode is the flag GetRealFaultNodes filters on. FaultDeviceList,
// UnhealthyNPU, NetworkUnhealthyNPU and FaultCards are slices, so a node can
// carry several entries of each.
type FaultNode struct {
	SuperPodID              int32
	NodeName                string
	NPUName                 string
	FaultDeviceList         []FaultDeviceList
	UpdateTime              int64
	UnhealthyNPU            []string
	NetworkUnhealthyNPU     []string
	IsFaultNode             bool
	NodeDEnable             bool
	NodeHealthState         string
	FaultCards              []FaultCard
	HasSwitchSubHealthFault bool
	HasCardSubHealthFault   bool
}
    FaultNode node object for re-scheduling
type FaultNodeInfoToCm ¶
// FaultNodeInfoToCm is the fault node info to cm.
//
// Its fields are a subset of FaultNode's: SuperPodID, NPUName, IsFaultNode,
// FaultCards and the two sub-health flags are not included.
// NOTE(review): presumably this is the form written into the configmap -
// confirm against the code that builds it.
type FaultNodeInfoToCm struct {
	FaultDeviceList     []FaultDeviceList
	NodeName            string
	UnhealthyNPU        []string
	NetworkUnhealthyNPU []string
	NodeDEnable         bool
	NodeHealthState     string
	UpdateTime          int64
}
    FaultNodeInfoToCm fault node info to cm
type FaultReasonList ¶
// FaultReasonList is the node fault device list.
//
// It embeds FaultDeviceList, so that struct's fields are promoted onto
// FaultReasonList. NodeName and TaskName carry JSON tags (node_name,
// task_name); FaultRankList has no JSON tag.
type FaultReasonList struct {
	NodeName      string `json:"node_name"`
	TaskName      string `json:"task_name"`
	FaultRankList []string
	FaultDeviceList
}
    FaultReasonList node Fault Device List
type FaultTask ¶
// FaultTask is the task object dealing with node for rescheduling.
//
// Reason holds one FaultReasonList per entry. TaskUID is an api.TaskID and
// PodCreateTime is an int64 (presumably a Unix timestamp - confirm).
// The struct also has filtered or unexported fields that are not shown here.
type FaultTask struct {
	Reason             []FaultReasonList
	RelationFault      string
	IsFaultTask        bool
	IsFaultRetryEnable bool
	HasSubHealthFault  bool
	IsSoftwareFault    bool
	TaskUID            api.TaskID
	TaskName           string
	TaskNamespace      string
	NodeName           string
	NodeRankIndex      string
	UseCardName        []string
	PodCreateTime      int64
	// contains filtered or unexported fields
}
    FaultTask object dealing with node for rescheduling
func (*FaultTask) DeleteRealPodByTask ¶
func (fTask *FaultTask) DeleteRealPodByTask(kubeClient kubernetes.Interface, waitTime int64) error
DeleteRealPodByTask delete pod from kubernetes of tasks
type ReScheduler ¶
// ReScheduler is the object for re-scheduling.
//
// It embeds *DealReSchedulerCache, so the cache's fields and methods are
// promoted onto ReScheduler. Jobs is keyed by api.JobID and Nodes by string.
// The struct also has filtered or unexported fields that are not shown here.
type ReScheduler struct {
	*DealReSchedulerCache
	GraceDeleteTime int64
	Jobs            map[api.JobID]plugin.SchedulerJob
	Nodes           map[string]plugin.NPUNode
	// contains filtered or unexported fields
}
    ReScheduler object for re-scheduling
func (*ReScheduler) AddFaultJobWithSession ¶
func (reScheduler *ReScheduler) AddFaultJobWithSession( jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
AddFaultJobWithSession read all running jobs of given card types and create the corresponding FaultJob objects
func (*ReScheduler) AddFaultNodeWithSession ¶
func (reScheduler *ReScheduler) AddFaultNodeWithSession()
AddFaultNodeWithSession Add FaultNode objects for new nodes in session not in cache
func (*ReScheduler) CheckNodeNPUByTask ¶
CheckNodeNPUByTask used in the predicate process of task and node
func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶
func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs( schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
GetNeedForceDeleteDelayingNPUJobs get fault jobs with grace label but haven't been evicted successfully
func (*ReScheduler) GetRunningJobs ¶
GetRunningJobs get all the running jobs of <UseCardName> type
func (*ReScheduler) PreStartAction ¶
func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error
PreStartAction pre-processing actions for rescheduler handler
func (*ReScheduler) PreStopAction ¶
func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
PreStopAction post-processing actions for re-scheduling
func (*ReScheduler) RestartFaultJobs ¶
func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartFaultJobs Restart fault jobs by its corresponding strategy grace,force,off
func (*ReScheduler) RestartNeedForceDeleteJobs ¶
func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
RestartNeedForceDeleteJobs Restart jobs that need to be force deleted
func (*ReScheduler) ScoreBestNPUNodes ¶
func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks
func (*ReScheduler) SyncJobRecentRescheduleReason ¶
func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
SyncJobRecentRescheduleReason syncs recent reschedule records with ssn, to ensure the cache is up to date and in sync
func (*ReScheduler) SyncJobRemainRetryTimes ¶
func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
SyncJobRemainRetryTimes Synchronise job remain retry times in cache by updating the information using current session
type RemainRetryTimes ¶
RemainRetryTimes remained retry times
type RescheduleReason ¶
// RescheduleReason shows the reason of this job's rescheduling.
type RescheduleReason struct {
	// JobID is the job id of this record.
	JobID api.JobID
	// TotalRescheduleTimes shows how many times rescheduling has happened since the job was created.
	TotalRescheduleTimes int
	// RescheduleRecords keeps the recent MaxRescheduleRecordsNum records of rescheduling.
	RescheduleRecords []RescheduleRecord
	// AdditionalInfo provides additional information, e.g. that some records were dropped
	// because of length limits. The omitempty tag leaves it out of the JSON when empty.
	AdditionalInfo string `json:",omitempty"`
}
    RescheduleReason shows the reason of this job rescheduling
type RescheduleRecord ¶
// RescheduleRecord records one rescheduling of a job.
type RescheduleRecord struct {
	// LogFileFormatTime is the formatted time, making it convenient to read and to locate in logs.
	LogFileFormatTime string
	// RescheduleTimeStamp is the Unix time (time.Now().Unix()) at which the rescheduling happened.
	RescheduleTimeStamp int64
	// ReasonOfTask records the per-task reasons of this rescheduling.
	ReasonOfTask []RescheduleTaskReason
}
    RescheduleRecord records job rescheduling records
type RescheduleTaskReason ¶
// RescheduleTaskReason records the reason of this rescheduling for one task.
type RescheduleTaskReason struct {
	// RescheduleReason is the fault type of this rescheduling.
	RescheduleReason string
	// PodName is the fault task that caused this rescheduling.
	PodName string
	// NodeName is the fault node that caused this rescheduling.
	NodeName string
	// NodeRankIndex is the rank index of the fault task.
	NodeRankIndex string
}
    RescheduleTaskReason record the reason of this rescheduling of task