runtime

package
v2.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 7, 2025 License: Apache-2.0 Imports: 19 Imported by: 1

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ExtractResourcePerNodeFromRuntime added in v2.1.0

func ExtractResourcePerNodeFromRuntime(info *Info) *corev1.ResourceRequirements

ExtractResourcePerNodeFromRuntime extracts the Trainer resource per node from the Info object.

func GetNumGPUPerNode added in v2.1.0

func GetNumGPUPerNode(res *corev1.ResourceRequirements) int

GetNumGPUPerNode returns the GPU count if found in container resources.

func RuntimeRefToRuntimeRegistryKey

func RuntimeRefToRuntimeRegistryKey(runtimeRef trainer.RuntimeRef) string

func TemplateSpecApply

func TemplateSpecApply[A any](info *Info) (*A, bool)

Types

type Info

// Info carries the labels, annotations, runtime policy, scheduler parameters,
// and template spec declared in the fields below. Instances are created with
// NewInfo.
type Info struct {
	// Labels and Annotations to add to the RuntimeJobTemplate.
	Labels      map[string]string
	Annotations map[string]string
	// RuntimePolicy holds the original policy values from the runtime.
	RuntimePolicy RuntimePolicy
	// Scheduler holds the scheduler parameters to add to the RuntimeJobTemplate.
	Scheduler *Scheduler
	// TemplateSpec is the TrainingRuntime Template object.
	// The podSpecs in ObjApply and the PodSets here should be kept in sync
	// by info.SyncPodSetsToTemplateSpec().
	TemplateSpec TemplateSpec
}

func NewInfo

func NewInfo(opts ...InfoOption) *Info

func (*Info) FindContainerByPodSetAncestorContainerName

func (i *Info) FindContainerByPodSetAncestorContainerName(psAncestor, containerName string) *Container

FindContainerByPodSetAncestorContainerName finds the runtime.Container in Info.TemplateSpec.PodSets by PodSet Ancestor and Container name.

func (*Info) FindPodSetByAncestor

func (i *Info) FindPodSetByAncestor(ancestor string) *PodSet

func (*Info) FindPodSetByName

func (i *Info) FindPodSetByName(psName string) *PodSet

type InfoOption

// InfoOption is a function that receives a pointer to InfoOptions.
// NewInfo accepts any number of InfoOption values.
type InfoOption func(options *InfoOptions)

func WithAnnotations

func WithAnnotations(annotations map[string]string) InfoOption

func WithLabels

func WithLabels(labels map[string]string) InfoOption

func WithMLPolicySource

func WithMLPolicySource(mlPolicy *trainer.MLPolicy) InfoOption

func WithPodGroupPolicy

func WithPodGroupPolicy(pgPolicy *trainer.PodGroupPolicy) InfoOption

func WithPodSet

func WithPodSet(
	psName string, ancestor *string, count int32, typedPodSpec corev1.PodSpec, podSpecApply *corev1ac.PodSpecApplyConfiguration,
) InfoOption

WithPodSet constructs Info.TemplateSpec.PodSet from PodSpec. The fourth argument, 'typedPodSpec', is used only to calculate requested resources.

func WithTemplateSpecObjApply

func WithTemplateSpecObjApply(objApply any) InfoOption

type InfoOptions

// InfoOptions is the options value that each InfoOption receives by pointer.
// It exposes no exported fields.
type InfoOptions struct {
	// contains filtered or unexported fields
}

type PodSet

// PodSet is the element type of TemplateSpec.PodSets. Info can look one up
// by Name (FindPodSetByName) or by Ancestor (FindPodSetByAncestor).
type PodSet struct {
	// Name is the name that identifies the PodSpec.
	// It is typically the name stored in each PodSpec.
	Name string
	// Ancestor is built from the `trainer.kubeflow.org/trainjob-ancestor-step` label value
	// in Runtime CRDs.
	// NOTE(review): Ancestor and Count are pointers, so they can presumably be
	// unset — confirm how nil values are handled by callers.
	Ancestor       *string
	Count          *int32
	InitContainers []Container
	Containers     []Container
	Volumes        []corev1ac.VolumeApplyConfiguration
	Endpoints      iter.Seq[string]
	// SinglePodRequests holds the resource requests of a single Pod.
	// The total PodSet requests can be calculated with
	// SinglePodRequests x Count.
	SinglePodRequests corev1.ResourceList
}

type ReconcilerBuilder

// ReconcilerBuilder is a function that takes a *builder.Builder together with
// a client.Client and a cache.Cache, and returns a *builder.Builder.
// Runtime.EventHandlerRegistrars returns a slice of these.
type ReconcilerBuilder func(*builder.Builder, client.Client, cache.Cache) *builder.Builder

type Runtime

// Runtime is the interface implemented by a runtime.
// NOTE(review): the per-method notes below are derived from the signatures
// only — confirm the precise contracts against the implementations.
type Runtime interface {
	// NewObjects returns the apply configurations for the given TrainJob, or an error.
	NewObjects(ctx context.Context, trainJob *trainer.TrainJob) ([]runtime.ApplyConfiguration, error)
	// RuntimeInfo returns an *Info for the given TrainJob, runtime template spec,
	// MLPolicy, and PodGroupPolicy, or an error.
	RuntimeInfo(trainJob *trainer.TrainJob, runtimeTemplateSpec any, mlPolicy *trainer.MLPolicy, podGroupPolicy *trainer.PodGroupPolicy) (*Info, error)
	// TrainJobStatus returns a TrainJobStatus for the given TrainJob, or an error.
	TrainJobStatus(ctx context.Context, trainJob *trainer.TrainJob) (*trainer.TrainJobStatus, error)
	// EventHandlerRegistrars returns a slice of ReconcilerBuilder functions.
	EventHandlerRegistrars() []ReconcilerBuilder
	// ValidateObjects takes the old and new TrainJob and returns admission
	// warnings together with a field error list.
	ValidateObjects(ctx context.Context, old, new *trainer.TrainJob) (admission.Warnings, field.ErrorList)
}

type RuntimePolicy

// RuntimePolicy groups an ML policy source and a PodGroup policy.
// Info.RuntimePolicy stores it as the original policy values from the runtime.
type RuntimePolicy struct {
	MLPolicySource *trainer.MLPolicySource
	PodGroupPolicy *trainer.PodGroupPolicy
}

type Scheduler

// Scheduler holds the scheduler parameters referenced by Info.Scheduler,
// expressed as Pod labels and Pod annotations.
type Scheduler struct {
	// PodLabels and PodAnnotations are string-to-string maps.
	// NOTE(review): presumably these are applied to the Pods of the
	// RuntimeJobTemplate — confirm against the code that consumes Scheduler.
	PodLabels      map[string]string
	PodAnnotations map[string]string
}

TODO (andreyvelich): Potentially, we can add ScheduleTimeoutSeconds to the Scheduler for consistency.

type TemplateSpec

// TemplateSpec pairs the Template apply configuration (ObjApply) with the
// PodSets extracted from it.
type TemplateSpec struct {
	// ObjApply is the ApplyConfiguration for the TrainingRuntimes Template field.
	ObjApply any
	// PodSets is the set of Pods extracted from ObjApply.
	// This is an abstract concept that represents multiple PodSpecs as a unit.
	PodSets []PodSet
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL