rescheduling

package

v0.0.0-...-c70b2fc Latest Latest Go to latest Published: Apr 23, 2025 License: Apache-2.0, Apache-2.0 Imports: 19 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/professorshandian/mind-cluster

Links

Open Source Insights

Documentation ¶

Overview ¶

Package rescheduling is used for Huawei Ascend pin fault rescheduling.

Package rescheduling is used for Huawei Ascend pin affinity scheduling utilities.

Index ¶

Constants
func GetTaskRestartReason(reasonList []FaultReasonList) string
func NewHandler() plugin.FaultHandler
type DealReSchedulerCache
- func GetReSchedulerCache() *DealReSchedulerCache
- func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode
- func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool, client kubernetes.Interface) error
- func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error
type FaultCard
type FaultDeviceList
- func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)
type FaultJob
- func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string
- func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob, env plugin.ScheduleEnv) error
- func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool
- func (fJob *FaultJob) IsNormalJobNeedRestart() bool
- func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool
type FaultNode
type FaultNodeInfoToCm
type FaultReasonList
type FaultTask
- func (fTask *FaultTask) DeleteRealPodByTask(kubeClient kubernetes.Interface, waitTime int64) error
type ReScheduler
- func (reScheduler *ReScheduler) AddFaultJobWithSession(jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) AddFaultNodeWithSession()
- func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode) error
- func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)
- func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo
- func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error
- func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error
- func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)
- func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)
- func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)
type RemainRetryTimes
type RescheduleReason
type RescheduleRecord
type RescheduleTaskReason
type SimpleFNodeInfo

Constants ¶

View Source

const (
	// RePropertyName name specifying re-scheduler cm
	RePropertyName = "re-scheduling"
	// ReschedulingReasonKey is used to record the reason of rescheduling
	ReschedulingReasonKey = "rescheduling-reason"
	// CmName Name of ReSchedulerConfigmap
	CmName = "vcjob-fault-npu-cm"
	// CmNameSpace Namespace of ReSchedulerConfigmap
	CmNameSpace = "volcano-system"
	// RescheduleReasonCmName Name of RescheduleReasonConfigmap
	RescheduleReasonCmName = "job-reschedule-reason"
	// RescheduleReasonCmNamespace Namespace of RescheduleReasonConfigmap
	RescheduleReasonCmNamespace = "mindx-dl"

	// JobRescheduleLabelKey key word of re-scheduling configuration
	JobRescheduleLabelKey = "fault-scheduling"
	// JobGraceRescheduleLabelValue Grace delete reschedule job, possible value of re-scheduling configuration
	JobGraceRescheduleLabelValue = "grace"
	// JobForceRescheduleLabelValue Force delete reschedule job, possible value of re-scheduling configuration
	JobForceRescheduleLabelValue = "force"
	// JobOffRescheduleLabelValue not delete reschedule job, possible value of re-scheduling configuration
	JobOffRescheduleLabelValue = "off"
	// GraceOverTimeKey for GraceOverTime config by user
	GraceOverTimeKey = "grace-over-time"
	// ElasticSchedulingKey for distinguishing whether a job is enabled with elastic scheduling
	ElasticSchedulingKey = "elastic-scheduling"
	// JobOnElasticScheduling job enabled with elastic scheduling
	JobOnElasticScheduling = "on"
	// JobOffElasticScheduling job not enabled with elastic scheduling
	JobOffElasticScheduling = "off"

	// CmFaultNodeKind key in configmap which saves the FaultNode cache
	CmFaultNodeKind = "fault-node"
	// CmFaultJob910bx2Kind key in configmap which saves the 910bx2 FaultJob cache
	CmFaultJob910bx2Kind = "fault-job-910bx2"
	// CmFaultJob910x8Kind key in configmap which saves the 910x8 FaultJob cache
	CmFaultJob910x8Kind = "fault-job-910x8"
	// CmJobRemainRetryTimes key in configmap which saves remain retry times of job
	CmJobRemainRetryTimes = "remain-retry-times"
	// MaxRescheduleRecordsNum the upper limit of the cm kept reschedule records, oldest record will be deleted
	// if record more than MaxRescheduleRecordsNum records
	MaxRescheduleRecordsNum = 10
	// MaxKbOfRescheduleRecords the upper limit words of the cm kept reschedule records
	MaxKbOfRescheduleRecords = 950 * 1024
	// CmJobRescheduleReasonsKey keeping recent MaxRescheduleRecordsNum records of rescheduling
	CmJobRescheduleReasonsKey = "recent-reschedule-records"
	// CmNodeRankTimeMapKind record map jobUID rankIndex node and times of occurrence
	CmNodeRankTimeMapKind = "node-rankIndex-Occurrence"
	// CmCheckCode Check code key
	CmCheckCode = "checkCode"

	// CmFaultJob key in configmap which saves the FaultJob cache
	CmFaultJob = "fault-job"

	// DefaultGraceOverTime time interval for grace delete
	DefaultGraceOverTime = 900

	// PublicFaultType represents a PublicFault fault type
	PublicFaultType = "PublicFault"
	// CardHealthy represents a healthy card
	CardHealthy = "Healthy"
	// CardUnhealthy represents an unhealthy card
	CardUnhealthy = "Unhealthy"
	// CardNetworkUnhealthy represents a network unhealthy card
	CardNetworkUnhealthy = "NetworkUnhealthy"
	// NodeHealthy represents node is available for scheduling
	NodeHealthy = "Healthy"
	// NodeUnhealthy represents node is unhealthy
	NodeUnhealthy = "NodeUnhealthy"
	// NodeCardUnhealthy represents node is unhealthy because of the card is unhealthy
	NodeCardUnhealthy = "CardUnhealthy"
	// NodeCardNetworkUnhealthy represents node is unhealthy because of card is network unhealthy
	NodeCardNetworkUnhealthy = "CardNetworkUnhealthy"
	// NoFaultJobsErr none fault jobs
	NoFaultJobsErr = "none fault jobs to be restarted in cache"
	// JobRecovery Name of cm for recovery
	JobRecovery = "job-recovery"
	// DeviceFaultCmKey the key of DeviceFault info
	DeviceFaultCmKey = "huawei.com/Ascend910-Fault"
	// PodFailed the state of failed pod
	PodFailed = "pod-failed"
	// PodHealthy the state of healthy pod
	PodHealthy = "pod-healthy"

	// FaultRetryTimesKey key of fault-retry-times label
	FaultRetryTimesKey = "fault-retry-times"
)

View Source

const (
	// PreSeparateNPU fault type waiting user check
	PreSeparateNPU = "PreSeparateNPU"
	// NotHandleFault fault type not handle
	NotHandleFault = "NotHandleFault"
	// NodeFaultCode fault type nodeUnhealthy
	NodeFaultCode = "heartbeatTimeOut"
	// SubHealthFault subHealth code
	SubHealthFault = "SubHealthFault"
)

View Source

const (

	// SuperPodAnnoKey annotation key of super pod
	SuperPodAnnoKey = "sp-block"
)

Variables ¶

This section is empty.

Functions ¶

func GetTaskRestartReason ¶

func GetTaskRestartReason(reasonList []FaultReasonList) string

GetTaskRestartReason convert to json str

func NewHandler ¶

func NewHandler() plugin.FaultHandler

NewHandler new fault policy handler

Types ¶

type DealReSchedulerCache ¶

type DealReSchedulerCache struct {
	FaultNodes                 map[string]*FaultNode
	FaultJobs                  map[api.JobID]*FaultJob
	JobRemainRetryTimes        map[api.JobID]*RemainRetryTimes
	JobRecentRescheduleRecords map[api.JobID]*RescheduleReason
}

DealReSchedulerCache object with method for re-scheduler cache

func GetReSchedulerCache ¶

func GetReSchedulerCache() *DealReSchedulerCache

GetReSchedulerCache return reschedule cache

func (DealReSchedulerCache) GetRealFaultNodes ¶

func (reCache DealReSchedulerCache) GetRealFaultNodes() map[string]*FaultNode

GetRealFaultNodes gets the nodes whose IsFaultNode property is true

func (*DealReSchedulerCache) SetJobRecentRescheduleRecords ¶

func (reCache *DealReSchedulerCache) SetJobRecentRescheduleRecords(firstStartup *bool,
	client kubernetes.Interface) error

SetJobRecentRescheduleRecords gets the already recorded rescheduling records from the cm and caches them

func (*DealReSchedulerCache) WriteReSchedulerCacheToEnvCache ¶

func (reCache *DealReSchedulerCache) WriteReSchedulerCacheToEnvCache(env *plugin.ScheduleEnv, jobType string) error

WriteReSchedulerCacheToEnvCache write the modifications on cache data to env to update re-scheduling configmap

type FaultCard ¶

type FaultCard struct {
	IsFaultCard bool
	NPUName     string
	FaultType   string
}

FaultCard card object for re-scheduling

type FaultDeviceList ¶

type FaultDeviceList struct {
	FaultType            string `json:"fault_type"`
	NPUName              string `json:"npu_name"`
	FaultLevel           string `json:"fault_level"`
	FaultHandling        string `json:"fault_handling"`
	LargeModelFaultLevel string `json:"large_model_fault_level"`
	FaultCode            string `json:"fault_code"`
}

FaultDeviceList is the fault reason of card

func GetNodeDeviceFaultFromDeviceInfo ¶

func GetNodeDeviceFaultFromDeviceInfo(node *plugin.NPUNode) ([]FaultDeviceList, error)

GetNodeDeviceFaultFromDeviceInfo get device fault from device info

type FaultJob ¶

type FaultJob struct {
	ReScheduleKey      string // values taken off/grace/force
	RescheduleTime     int64
	SubHealthyStrategy string
	IsSubHealthFault   bool
	PendingSessionNum  int
	IsFaultJob         bool
	JobName            string
	JobUID             api.JobID
	JobNamespace       string
	SuperPods          map[string][]plugin.SuperNode
	FaultTasks         []FaultTask
	UpdateTime         int64
	FaultTypes         []string
	DeleteExecutedFlag bool
	ElasticScheduling  string
	ReferenceName      string
	FaultRetryTimes    int

	UUID types.UID
	// contains filtered or unexported fields
}

FaultJob job object for re-scheduling

func (*FaultJob) ForceDeleteJob ¶

func (fJob *FaultJob) ForceDeleteJob(schedulerJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

ForceDeleteJob force-deletes jobs, including those labelled for force deletion and those whose grace deletion failed

func (*FaultJob) GetJobElasticSchedulingLabel ¶

func (fJob *FaultJob) GetJobElasticSchedulingLabel(job *plugin.SchedulerJob) string

GetJobElasticSchedulingLabel get job's elastic scheduling label

func (*FaultJob) GetJobFaultRescheduleLabel ¶

func (fJob *FaultJob) GetJobFaultRescheduleLabel(job *plugin.SchedulerJob) string

GetJobFaultRescheduleLabel Get job's fault reschedule label.

func (*FaultJob) GraceDeleteJob ¶

func (fJob *FaultJob) GraceDeleteJob(ssn *framework.Session, npuJob *plugin.SchedulerJob,
	env plugin.ScheduleEnv) error

GraceDeleteJob grace delete jobs labelled to be deleted gracefully

func (*FaultJob) IsJobSingleRescheduling ¶

func (fJob *FaultJob) IsJobSingleRescheduling(sJob *plugin.SchedulerJob) bool

IsJobSingleRescheduling valid job.

func (*FaultJob) IsNormalJobNeedRestart ¶

func (fJob *FaultJob) IsNormalJobNeedRestart() bool

IsNormalJobNeedRestart reports whether the job has the PreSeparateNPU key or has a software fault

func (*FaultJob) IsProcessReschedulingJob ¶

func (fJob *FaultJob) IsProcessReschedulingJob(sJob *plugin.SchedulerJob) bool

IsProcessReschedulingJob valid job.

type FaultNode ¶

type FaultNode struct {
	SuperPodID              int32
	NodeName                string
	NPUName                 string
	FaultDeviceList         []FaultDeviceList
	UpdateTime              int64
	UnhealthyNPU            []string
	NetworkUnhealthyNPU     []string
	IsFaultNode             bool
	NodeDEnable             bool
	NodeHealthState         string
	FaultCards              []FaultCard
	HasSwitchSubHealthFault bool
	HasCardSubHealthFault   bool
}

FaultNode node object for re-scheduling

type FaultNodeInfoToCm ¶

type FaultNodeInfoToCm struct {
	FaultDeviceList     []FaultDeviceList
	NodeName            string
	UnhealthyNPU        []string
	NetworkUnhealthyNPU []string
	NodeDEnable         bool
	NodeHealthState     string
	UpdateTime          int64
}

FaultNodeInfoToCm fault node info to cm

type FaultReasonList ¶

type FaultReasonList struct {
	NodeName      string `json:"node_name"`
	TaskName      string `json:"task_name"`
	FaultRankList []string
	FaultDeviceList
}

FaultReasonList node Fault Device List

type FaultTask ¶

type FaultTask struct {
	Reason             []FaultReasonList
	RelationFault      string
	IsFaultTask        bool
	IsFaultRetryEnable bool
	HasSubHealthFault  bool
	IsSoftwareFault    bool
	TaskUID            api.TaskID
	TaskName           string
	TaskNamespace      string
	NodeName           string
	NodeRankIndex      string
	UseCardName        []string
	PodCreateTime      int64
	// contains filtered or unexported fields
}

FaultTask object dealing with node for rescheduling

func (*FaultTask) DeleteRealPodByTask ¶

func (fTask *FaultTask) DeleteRealPodByTask(kubeClient kubernetes.Interface, waitTime int64) error

DeleteRealPodByTask delete pod from kubernetes of tasks

type ReScheduler ¶

type ReScheduler struct {
	*DealReSchedulerCache
	GraceDeleteTime int64
	Jobs            map[api.JobID]plugin.SchedulerJob
	Nodes           map[string]plugin.NPUNode
	// contains filtered or unexported fields
}

ReScheduler object for re-scheduling

func (*ReScheduler) AddFaultJobWithSession ¶

func (reScheduler *ReScheduler) AddFaultJobWithSession(
	jobs map[api.JobID]*api.JobInfo, env plugin.ScheduleEnv) error

AddFaultJobWithSession reads all running jobs of the given card types and creates the corresponding FaultJob objects

func (*ReScheduler) AddFaultNodeWithSession ¶

func (reScheduler *ReScheduler) AddFaultNodeWithSession()

AddFaultNodeWithSession adds FaultNode objects for nodes that are in the session but not yet in the cache

func (*ReScheduler) CheckNodeNPUByTask ¶

func (reScheduler *ReScheduler) CheckNodeNPUByTask(task *api.TaskInfo, vcNode plugin.NPUNode) error

CheckNodeNPUByTask used in the predicate process of task and node

func (*ReScheduler) GetNeedForceDeleteDelayingNPUJobs ¶

func (reScheduler *ReScheduler) GetNeedForceDeleteDelayingNPUJobs(
	schedulerJobs map[api.JobID]plugin.SchedulerJob, ssn *framework.Session) ([]plugin.SchedulerJob, error)

GetNeedForceDeleteDelayingNPUJobs gets fault jobs that carry the grace label but have not been evicted successfully

func (*ReScheduler) GetRunningJobs ¶

func (reScheduler *ReScheduler) GetRunningJobs(ssn *framework.Session) map[api.JobID]*api.JobInfo

GetRunningJobs gets all the running jobs of <UseCardName> type

func (*ReScheduler) PreStartAction ¶

func (reScheduler *ReScheduler) PreStartAction(env *plugin.ScheduleEnv, ssn *framework.Session) error

PreStartAction pre-processing actions for rescheduler handler

func (*ReScheduler) PreStopAction ¶

func (reScheduler *ReScheduler) PreStopAction(env *plugin.ScheduleEnv) error

PreStopAction post-processing actions for re-scheduling

func (*ReScheduler) RestartFaultJobs ¶

func (reScheduler *ReScheduler) RestartFaultJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartFaultJobs restarts fault jobs according to their corresponding strategy: grace, force, or off

func (*ReScheduler) RestartNeedForceDeleteJobs ¶

func (reScheduler *ReScheduler) RestartNeedForceDeleteJobs(ssn *framework.Session, env plugin.ScheduleEnv) error

RestartNeedForceDeleteJobs Restart jobs that need to be force deleted

func (*ReScheduler) ScoreBestNPUNodes ¶

func (reScheduler *ReScheduler) ScoreBestNPUNodes(task *api.TaskInfo, scoreMap map[string]float64)

ScoreBestNPUNodes add scores on scoreMap for normal nodes used by re-scheduling tasks

func (*ReScheduler) SyncJobRecentRescheduleReason ¶

func (reScheduler *ReScheduler) SyncJobRecentRescheduleReason(ssn *framework.Session)

SyncJobRecentRescheduleReason syncs the recent reschedule records with ssn to ensure the cache is up to date and in sync

func (*ReScheduler) SyncJobRemainRetryTimes ¶

func (reScheduler *ReScheduler) SyncJobRemainRetryTimes(ssn *framework.Session)

SyncJobRemainRetryTimes synchronises the jobs' remaining retry times in the cache by updating the information using the current session

type RemainRetryTimes ¶

type RemainRetryTimes struct {
	UUID  types.UID
	Times int
}

RemainRetryTimes remained retry times

type RescheduleReason ¶

type RescheduleReason struct {
	// JobID the job id of this record
	JobID api.JobID
	// TotalRescheduleTimes to show how many times reschedule has happened since job created
	TotalRescheduleTimes int
	// RescheduleRecords keep recent MaxRescheduleRecordsNum records of rescheduling
	RescheduleRecords []RescheduleRecord
	// AdditionalInfo is used to provide additional information, such as for length concern reduce some records
	AdditionalInfo string `json:",omitempty"`
}

RescheduleReason shows the reason of this job rescheduling

type RescheduleRecord ¶

type RescheduleRecord struct {
	// LogFileFormatTime is the formated time, to make it convenient to read and locate log
	LogFileFormatTime string
	// RescheduleTimeStamp time.now.unix() indicates when the rescheduling happened
	RescheduleTimeStamp int64
	// ReasonOfTask record the reason of this rescheduling of task
	ReasonOfTask []RescheduleTaskReason
}

RescheduleRecord holds the record of a single job rescheduling

type RescheduleTaskReason ¶

type RescheduleTaskReason struct {
	// RescheduleReason the fault type of this rescheduling
	RescheduleReason string
	// PodName the fault task caused this rescheduling
	PodName string
	// NodeName the fault node caused this rescheduling
	NodeName string
	// NodeRankIndex the rank index of the fault task
	NodeRankIndex string
}

RescheduleTaskReason record the reason of this rescheduling of task

type SimpleFNodeInfo ¶

type SimpleFNodeInfo struct {
	NodeName                string
	IsFaultNode             bool
	HasCardSubHealthFault   bool
	HasSwitchSubHealthFault bool
	NodeHealthState         string
}

SimpleFNodeInfo simple fault node info

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL