Documentation
¶
Index ¶
- func ExtractResourcePerNodeFromRuntime(info *Info) *corev1.ResourceRequirements
- func GetNumGPUPerNode(res *corev1.ResourceRequirements) int
- func RuntimeRefToRuntimeRegistryKey(runtimeRef trainer.RuntimeRef) string
- func TemplateSpecApply[A any](info *Info) (*A, bool)
- type Container
- type Info
- type InfoOption
- func WithAnnotations(annotations map[string]string) InfoOption
- func WithLabels(labels map[string]string) InfoOption
- func WithMLPolicySource(mlPolicy *trainer.MLPolicy) InfoOption
- func WithPodGroupPolicy(pgPolicy *trainer.PodGroupPolicy) InfoOption
- func WithPodSet(psName string, ancestor *string, count int32, typedPodSpec corev1.PodSpec, ...) InfoOption
- func WithTemplateSpecObjApply(objApply any) InfoOption
- type InfoOptions
- type PodSet
- type ReconcilerBuilder
- type Runtime
- type RuntimePolicy
- type Scheduler
- type TemplateSpec
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ExtractResourcePerNodeFromRuntime ¶ added in v2.1.0
func ExtractResourcePerNodeFromRuntime(info *Info) *corev1.ResourceRequirements
ExtractResourcePerNodeFromRuntime extracts the Trainer resource per node from the Info object.
func GetNumGPUPerNode ¶ added in v2.1.0
func GetNumGPUPerNode(res *corev1.ResourceRequirements) int
GetNumGPUPerNode returns the GPU count if found in container resources.
func RuntimeRefToRuntimeRegistryKey ¶
func RuntimeRefToRuntimeRegistryKey(runtimeRef trainer.RuntimeRef) string
func TemplateSpecApply ¶
Types ¶
type Container ¶
type Container struct {
Name string
Env []corev1ac.EnvVarApplyConfiguration
Ports []corev1ac.ContainerPortApplyConfiguration
VolumeMounts []corev1ac.VolumeMountApplyConfiguration
}
type Info ¶
type Info struct {
// Labels and Annotations to add to the RuntimeJobTemplate.
Labels map[string]string
Annotations map[string]string
// Original policy values from the runtime.
RuntimePolicy RuntimePolicy
// Scheduler parameters to add to the RuntimeJobTemplate.
Scheduler *Scheduler
// TemplateSpec is the TrainingRuntime Template object.
// The ObjApply podSpecs and these PodSets should be kept in sync by info.SyncPodSetsToTemplateSpec().
TemplateSpec TemplateSpec
}
func NewInfo ¶
func NewInfo(opts ...InfoOption) *Info
func (*Info) FindContainerByPodSetAncestorContainerName ¶
func (i *Info) FindContainerByPodSetAncestorContainerName(psAncestor, containerName string) *Container
FindContainerByPodSetAncestorContainerName finds runtime.Container from Info.TemplateSpec.PodSet by PodSet Ancestor and Container name.
func (*Info) FindPodSetByAncestor ¶
func (*Info) FindPodSetByName ¶
type InfoOption ¶
type InfoOption func(options *InfoOptions)
func WithAnnotations ¶
func WithAnnotations(annotations map[string]string) InfoOption
func WithLabels ¶
func WithLabels(labels map[string]string) InfoOption
func WithMLPolicySource ¶
func WithMLPolicySource(mlPolicy *trainer.MLPolicy) InfoOption
func WithPodGroupPolicy ¶
func WithPodGroupPolicy(pgPolicy *trainer.PodGroupPolicy) InfoOption
func WithPodSet ¶
func WithPodSet( psName string, ancestor *string, count int32, typedPodSpec corev1.PodSpec, podSpecApply *corev1ac.PodSpecApplyConfiguration, ) InfoOption
WithPodSet constructs Info.TemplateSpec.PodSet from PodSpec. The fourth argument, 'typedPodSpec', is used only to calculate requested resources.
func WithTemplateSpecObjApply ¶
func WithTemplateSpecObjApply(objApply any) InfoOption
type InfoOptions ¶
type InfoOptions struct {
// contains filtered or unexported fields
}
type PodSet ¶
type PodSet struct {
// Name is the name that identifies the PodSpec.
// This is typically the name stored in each PodSpec.
Name string
// Ancestor is built from the `trainer.kubeflow.org/trainjob-ancestor-step` label value
// in Runtime CRDs.
Ancestor *string
Count *int32
InitContainers []Container
Containers []Container
Volumes []corev1ac.VolumeApplyConfiguration
Endpoints iter.Seq[string]
// The total PodSet requests can be calculated with
// SinglePodRequests x Count.
SinglePodRequests corev1.ResourceList
}
type ReconcilerBuilder ¶
type Runtime ¶
type Runtime interface {
NewObjects(ctx context.Context, trainJob *trainer.TrainJob) ([]runtime.ApplyConfiguration, error)
RuntimeInfo(trainJob *trainer.TrainJob, runtimeTemplateSpec any, mlPolicy *trainer.MLPolicy, podGroupPolicy *trainer.PodGroupPolicy) (*Info, error)
TrainJobStatus(ctx context.Context, trainJob *trainer.TrainJob) (*trainer.TrainJobStatus, error)
EventHandlerRegistrars() []ReconcilerBuilder
ValidateObjects(ctx context.Context, old, new *trainer.TrainJob) (admission.Warnings, field.ErrorList)
}
type RuntimePolicy ¶
type RuntimePolicy struct {
MLPolicySource *trainer.MLPolicySource
PodGroupPolicy *trainer.PodGroupPolicy
}
type Scheduler ¶
TODO (andreyvelich): Potentially, we can add ScheduleTimeoutSeconds to the Scheduler for consistency.
type TemplateSpec ¶
Click to show internal directories.
Click to hide internal directories.