From f77bbc9dd8c8735b01ee3931ab0a1624f600630f Mon Sep 17 00:00:00 2001 From: Adrian Riobo Date: Mon, 19 Jan 2026 13:18:21 +0100 Subject: [PATCH] feat: Added rhelai support on Azure Signed-off-by: Adrian Riobo --- Makefile | 1 + cmd/mapt/cmd/aws/hosts/rhelai.go | 11 +- cmd/mapt/cmd/azure/azure.go | 1 + cmd/mapt/cmd/azure/hosts/rhelai.go | 106 +++++++ cmd/mapt/cmd/params/params.go | 31 +- pkg/provider/aws/action/rhel-ai/constants.go | 6 +- pkg/provider/aws/action/rhel-ai/rhelai.go | 34 +-- pkg/provider/azure/action/kind/kind.go | 5 +- pkg/provider/azure/action/linux/linux.go | 61 ++-- pkg/provider/azure/action/rhel-ai/rhelai.go | 52 ++++ pkg/provider/azure/action/windows/windows.go | 11 +- pkg/provider/azure/data/imageref.go | 15 +- pkg/provider/azure/data/images.go | 82 +++-- pkg/provider/azure/data/spot.go | 60 +++- .../virtual-machine/virtual-machine.go | 64 ++-- .../network/security-group/security-group.go | 4 - pkg/targets/host/rhelai/api.go | 18 ++ pkg/util/maps/maps.go | 10 + tkn/infra-azure-rhel-ai.yaml | 289 ++++++++++++++++++ tkn/template/infra-azure-rhel-ai.yaml | 289 ++++++++++++++++++ 20 files changed, 994 insertions(+), 156 deletions(-) create mode 100644 cmd/mapt/cmd/azure/hosts/rhelai.go create mode 100644 pkg/provider/azure/action/rhel-ai/rhelai.go create mode 100644 pkg/targets/host/rhelai/api.go create mode 100644 tkn/infra-azure-rhel-ai.yaml create mode 100644 tkn/template/infra-azure-rhel-ai.yaml diff --git a/Makefile b/Makefile index c895cdb85..e26861f67 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,7 @@ define tkn_update sed -e 's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-aws-windows-server.yaml > tkn/infra-aws-windows-server.yaml sed -e 's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-azure-aks.yaml > tkn/infra-azure-aks.yaml sed -e 's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-azure-rhel.yaml > tkn/infra-azure-rhel.yaml + sed -e 's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-azure-rhel-ai.yaml > tkn/infra-azure-rhel-ai.yaml sed -e 
's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-azure-fedora.yaml > tkn/infra-azure-fedora.yaml sed -e 's%%$(1)%g' -e 's%%$(2)%g' tkn/template/infra-azure-windows-desktop.yaml > tkn/infra-azure-windows-desktop.yaml endef diff --git a/cmd/mapt/cmd/aws/hosts/rhelai.go b/cmd/mapt/cmd/aws/hosts/rhelai.go index d9bbc4316..26df35c88 100644 --- a/cmd/mapt/cmd/aws/hosts/rhelai.go +++ b/cmd/mapt/cmd/aws/hosts/rhelai.go @@ -4,6 +4,7 @@ import ( "github.com/redhat-developer/mapt/cmd/mapt/cmd/params" maptContext "github.com/redhat-developer/mapt/pkg/manager/context" rhelai "github.com/redhat-developer/mapt/pkg/provider/aws/action/rhel-ai" + apiRHELAI "github.com/redhat-developer/mapt/pkg/targets/host/rhelai" "github.com/spf13/cobra" "github.com/spf13/pflag" "github.com/spf13/viper" @@ -52,12 +53,11 @@ func getRHELAICreate() *cobra.Command { DebugLevel: viper.GetUint(params.DebugLevel), Tags: viper.GetStringMapString(params.Tags), }, - &rhelai.RHELAIArgs{ + &apiRHELAI.RHELAIArgs{ Prefix: "main", Version: viper.GetString(params.RhelAIVersion), + Accelerator: viper.GetString(params.RhelAIAccelerator), CustomAMI: viper.GetString(params.RhelAIAMICustom), - SubsUsername: viper.GetString(params.SubsUsername), - SubsUserpass: viper.GetString(params.SubsUserpass), ComputeRequest: params.ComputeRequestArgs(), Spot: params.SpotArgs(), Timeout: viper.GetString(params.Timeout), @@ -68,9 +68,8 @@ func getRHELAICreate() *cobra.Command { flagSet.StringP(params.ConnectionDetailsOutput, "", "", params.ConnectionDetailsOutputDesc) flagSet.StringToStringP(params.Tags, "", nil, params.TagsDesc) flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc) + flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc) flagSet.StringP(params.RhelAIAMICustom, "", "", params.RhelAIAMICustomDesc) - flagSet.StringP(params.SubsUsername, "", "", params.SubsUsernameDesc) - flagSet.StringP(params.SubsUserpass, "", "", 
params.SubsUserpassDesc) flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc) params.AddComputeRequestFlags(flagSet) params.AddSpotFlags(flagSet) @@ -87,7 +86,7 @@ func getRHELAIDestroy() *cobra.Command { return err } return rhelai.Destroy(&maptContext.ContextArgs{ - Context: cmd.Context(), + Context: cmd.Context(), ProjectName: viper.GetString(params.ProjectName), BackedURL: viper.GetString(params.BackedURL), Debug: viper.IsSet(params.Debug), diff --git a/cmd/mapt/cmd/azure/azure.go b/cmd/mapt/cmd/azure/azure.go index 2ba8e1cc5..cb25a7604 100644 --- a/cmd/mapt/cmd/azure/azure.go +++ b/cmd/mapt/cmd/azure/azure.go @@ -34,6 +34,7 @@ func GetCmd() *cobra.Command { hosts.GetWindowsDesktopCmd(), hosts.GetUbuntuCmd(), hosts.GetRHELCmd(), + hosts.GetRHELAICmd(), hosts.GetFedoraCmd(), services.GetAKSCmd(), services.GetKindCmd()) diff --git a/cmd/mapt/cmd/azure/hosts/rhelai.go b/cmd/mapt/cmd/azure/hosts/rhelai.go new file mode 100644 index 000000000..1edf6d755 --- /dev/null +++ b/cmd/mapt/cmd/azure/hosts/rhelai.go @@ -0,0 +1,106 @@ +package hosts + +import ( + "github.com/redhat-developer/mapt/cmd/mapt/cmd/params" + maptContext "github.com/redhat-developer/mapt/pkg/manager/context" + rhelai "github.com/redhat-developer/mapt/pkg/provider/azure/action/rhel-ai" + apiRHELAI "github.com/redhat-developer/mapt/pkg/targets/host/rhelai" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + "github.com/spf13/viper" +) + +const ( + cmdRHELAI = "rhel-ai" + cmdRHELAIDesc = "manage rhel ai host" +) + +func GetRHELAICmd() *cobra.Command { + c := &cobra.Command{ + Use: cmdRHELAI, + Short: cmdRHELAIDesc, + RunE: func(cmd *cobra.Command, args []string) error { + if err := viper.BindPFlags(cmd.Flags()); err != nil { + return err + } + return nil + }, + } + + flagSet := pflag.NewFlagSet(cmdRHELAI, pflag.ExitOnError) + params.AddCommonFlags(flagSet) + c.PersistentFlags().AddFlagSet(flagSet) + + c.AddCommand(getRHELAICreate(), getRHELAIDestroy()) + return c +} + +func getRHELAICreate() 
*cobra.Command { + c := &cobra.Command{ + Use: params.CreateCmdName, + Short: params.CreateCmdName, + RunE: func(cmd *cobra.Command, args []string) error { + if err := viper.BindPFlags(cmd.Flags()); err != nil { + return err + } + return rhelai.Create( + &maptContext.ContextArgs{ + Context: cmd.Context(), + ProjectName: viper.GetString(params.ProjectName), + BackedURL: viper.GetString(params.BackedURL), + ResultsOutput: viper.GetString(params.ConnectionDetailsOutput), + Debug: viper.IsSet(params.Debug), + DebugLevel: viper.GetUint(params.DebugLevel), + Tags: viper.GetStringMapString(params.Tags), + }, + &apiRHELAI.RHELAIArgs{ + Prefix: "main", + Version: viper.GetString(params.RhelAIVersion), + Accelerator: viper.GetString(params.RhelAIAccelerator), + CustomAMI: viper.GetString(params.RhelAIAMICustom), + ComputeRequest: params.ComputeRequestArgs(), + Spot: params.SpotArgs(), + Timeout: viper.GetString(params.Timeout), + }) + }, + } + flagSet := pflag.NewFlagSet(params.CreateCmdName, pflag.ExitOnError) + flagSet.StringP(params.ConnectionDetailsOutput, "", "", params.ConnectionDetailsOutputDesc) + flagSet.StringToStringP(params.Tags, "", nil, params.TagsDesc) + flagSet.StringP(params.RhelAIVersion, "", params.RhelAIVersionDefault, params.RhelAIVersionDesc) + flagSet.StringP(params.RhelAIAccelerator, "", params.RhelAIAccelearatorDefault, params.RhelAIAccelearatorDesc) + flagSet.StringP(params.RhelAIAMICustom, "", "", params.RhelAIAMICustomDesc) + flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc) + params.AddComputeRequestFlags(flagSet) + params.AddSpotFlags(flagSet) + c.PersistentFlags().AddFlagSet(flagSet) + return c +} + +func getRHELAIDestroy() *cobra.Command { + c := &cobra.Command{ + Use: params.DestroyCmdName, + Short: params.DestroyCmdName, + RunE: func(cmd *cobra.Command, args []string) error { + if err := viper.BindPFlags(cmd.Flags()); err != nil { + return err + } + return rhelai.Destroy(&maptContext.ContextArgs{ + Context: cmd.Context(), + 
ProjectName: viper.GetString(params.ProjectName), + BackedURL: viper.GetString(params.BackedURL), + Debug: viper.IsSet(params.Debug), + DebugLevel: viper.GetUint(params.DebugLevel), + Serverless: viper.IsSet(params.Serverless), + ForceDestroy: viper.IsSet(params.ForceDestroy), + KeepState: viper.IsSet(params.KeepState), + }) + }, + } + flagSet := pflag.NewFlagSet(params.DestroyCmdName, pflag.ExitOnError) + flagSet.Bool(params.Serverless, false, params.ServerlessDesc) + flagSet.Bool(params.ForceDestroy, false, params.ForceDestroyDesc) + flagSet.Bool(params.KeepState, false, params.KeepStateDesc) + c.PersistentFlags().AddFlagSet(flagSet) + return c +} diff --git a/cmd/mapt/cmd/params/params.go b/cmd/mapt/cmd/params/params.go index 1dab0a39e..b288327c0 100644 --- a/cmd/mapt/cmd/params/params.go +++ b/cmd/mapt/cmd/params/params.go @@ -78,20 +78,23 @@ const ( cirrusPWLabelsDesc string = "additional labels to use on the persistent worker (--it-cirrus-pw-labels key1=value1,key2=value2)" //RHEL - SubsUsername string = "rh-subscription-username" - SubsUsernameDesc string = "username to register the subscription" - SubsUserpass string = "rh-subscription-password" - SubsUserpassDesc string = "password to register the subscription" - ProfileSNC string = "snc" - ProfileSNCDesc string = "if this flag is set the RHEL will be setup with SNC profile. 
Setting up all requirements to run https://github.com/crc-org/snc" - RhelVersion string = "version" - RhelVersionDesc string = "version for the RHEL OS" - RhelVersionDefault string = "9.4" - RhelAIVersion string = "version" - RhelAIVersionDesc string = "version for the RHELAI OS" - RhelAIVersionDefault string = "3.0.0" - RhelAIAMICustom string = "custom-ami" - RhelAIAMICustomDesc string = "custom AMI to spin RHEL AI OS" + SubsUsername string = "rh-subscription-username" + SubsUsernameDesc string = "username to register the subscription" + SubsUserpass string = "rh-subscription-password" + SubsUserpassDesc string = "password to register the subscription" + ProfileSNC string = "snc" + ProfileSNCDesc string = "if this flag is set the RHEL will be setup with SNC profile. Setting up all requirements to run https://github.com/crc-org/snc" + RhelVersion string = "version" + RhelVersionDesc string = "version for the RHEL OS" + RhelVersionDefault string = "9.4" + RhelAIVersion string = "version" + RhelAIVersionDesc string = "version for the RHELAI OS" + RhelAIVersionDefault string = "3.0.0" + RhelAIAccelerator string = "accelerator" + RhelAIAccelearatorDesc string = "accelerator type. 
Valid types: cuda and rocm" + RhelAIAccelearatorDefault string = "cuda" + RhelAIAMICustom string = "custom-ami" + RhelAIAMICustomDesc string = "custom AMI to spin RHEL AI OS" // Serverless Timeout string = "timeout" diff --git a/pkg/provider/aws/action/rhel-ai/constants.go b/pkg/provider/aws/action/rhel-ai/constants.go index fca64c4b2..ddc6bd5ea 100644 --- a/pkg/provider/aws/action/rhel-ai/constants.go +++ b/pkg/provider/aws/action/rhel-ai/constants.go @@ -16,7 +16,7 @@ var ( // amiProduct = "Red Hat Enterprise Linux" amiProduct = "Linux/UNIX" amiV1Regex = "rhel-ai-nvidia-aws-%s-*" - amiRegex = "rhel-ai-cuda-aws-%s-*" + amiRegex = "rhel-ai-%s-aws-%s-*" amiOwner = "610952687893" // amiOwnerSelf = "self" amiArch = "x86_64" @@ -48,10 +48,10 @@ var ( outputUserPrivateKey = "ardPrivatekey" ) -func amiName(version *string) string { +func amiName(accelerator, version *string) string { return util.If(strings.HasPrefix(*version, "1"), fmt.Sprintf(amiV1Regex, *version), - fmt.Sprintf(amiRegex, *version)) + fmt.Sprintf(amiRegex, *accelerator, *version)) } // NVIDIA A100 X2 diff --git a/pkg/provider/aws/action/rhel-ai/rhelai.go b/pkg/provider/aws/action/rhel-ai/rhelai.go index f5a6fa3e7..b345d3faa 100644 --- a/pkg/provider/aws/action/rhel-ai/rhelai.go +++ b/pkg/provider/aws/action/rhel-ai/rhelai.go @@ -11,8 +11,6 @@ import ( "github.com/redhat-developer/mapt/pkg/manager" mc "github.com/redhat-developer/mapt/pkg/manager/context" infra "github.com/redhat-developer/mapt/pkg/provider" - cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request" - spotTypes "github.com/redhat-developer/mapt/pkg/provider/api/spot" "github.com/redhat-developer/mapt/pkg/provider/aws" awsConstants "github.com/redhat-developer/mapt/pkg/provider/aws/constants" "github.com/redhat-developer/mapt/pkg/provider/aws/data" @@ -26,32 +24,18 @@ import ( securityGroup "github.com/redhat-developer/mapt/pkg/provider/aws/services/ec2/security-group" 
"github.com/redhat-developer/mapt/pkg/provider/util/command" "github.com/redhat-developer/mapt/pkg/provider/util/output" + apiRHELAI "github.com/redhat-developer/mapt/pkg/targets/host/rhelai" "github.com/redhat-developer/mapt/pkg/util" "github.com/redhat-developer/mapt/pkg/util/logging" resourcesUtil "github.com/redhat-developer/mapt/pkg/util/resources" ) -type RHELAIArgs struct { - Prefix string - Version string - CustomAMI string - Arch string - ComputeRequest *cr.ComputeRequestArgs - SubsUsername string - SubsUserpass string - Spot *spotTypes.SpotArgs - // If timeout is set a severless scheduled task will be created to self destroy the resources - Timeout string -} - type rhelAIRequest struct { mCtx *mc.Context prefix *string amiName *string arch *string spot bool - subsUsername *string - subsUserpass *string timeout *string allocationData *allocation.AllocationResult } @@ -68,26 +52,24 @@ func (r *rhelAIRequest) validate() error { // Create orchestrate 2 stacks: // If spot is enable it will run best spot option to get the best option to spin the machine // Then it will run the stack for windows dedicated host -func Create(mCtxArgs *mc.ContextArgs, args *RHELAIArgs) (err error) { +func Create(mCtxArgs *mc.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) { // Create mapt Context mCtx, err := mc.Init(mCtxArgs, aws.Provider()) if err != nil { return err } // Compose request - amiName := amiName(&args.Version) + amiName := amiName(&args.Accelerator, &args.Version) if len(args.CustomAMI) != 0 { amiName = fmt.Sprintf("%s*", args.CustomAMI) } prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main") r := rhelAIRequest{ - mCtx: mCtx, - prefix: &prefix, - amiName: &amiName, - arch: &args.Arch, - timeout: &args.Timeout, - subsUsername: &args.SubsUsername, - subsUserpass: &args.SubsUserpass} + mCtx: mCtx, + prefix: &prefix, + amiName: &amiName, + arch: &args.Arch, + timeout: &args.Timeout} if args.Spot != nil { r.spot = args.Spot.Spot } diff --git 
a/pkg/provider/azure/action/kind/kind.go b/pkg/provider/azure/action/kind/kind.go index d4b0e966d..dac0ebf55 100644 --- a/pkg/provider/azure/action/kind/kind.go +++ b/pkg/provider/azure/action/kind/kind.go @@ -171,10 +171,7 @@ func (r *kindRequest) deployer(ctx *pulumi.Context) error { NetworkInteface: n.NetworkInterface, // Check this VMSize: r.allocationData.ComputeSizes[0], - Publisher: r.allocationData.ImageRef.Publisher, - Offer: r.allocationData.ImageRef.Offer, - Sku: r.allocationData.ImageRef.Sku, - ImageID: r.allocationData.ImageRef.ID, + Image: r.allocationData.ImageRef, PrivateKey: privateKey, SpotPrice: r.allocationData.Price, UserDataAsBase64: udB64, diff --git a/pkg/provider/azure/action/linux/linux.go b/pkg/provider/azure/action/linux/linux.go index 0590039f6..78a2c3fa9 100644 --- a/pkg/provider/azure/action/linux/linux.go +++ b/pkg/provider/azure/action/linux/linux.go @@ -45,6 +45,7 @@ type LinuxArgs struct { ComputeRequest *cr.ComputeRequestArgs OSType data.OSType Version string + ImageRef *data.ImageReference Username string Spot *spotTypes.SpotArgs CloudConfigAsUserData userDataApi.CloudConfig @@ -52,11 +53,13 @@ type LinuxArgs struct { } type linuxRequest struct { - mCtx *mc.Context `validate:"required"` - prefix *string - arch *string - osType *data.OSType - version *string + mCtx *mc.Context `validate:"required"` + prefix *string + // Image info, either args (arch, osTYpe, version) to get it or the actual value (ir) + arch *string + osType *data.OSType + version *string + // host management allocationData *allocation.AllocationResult username *string cloudConfigAsUserData userDataApi.CloudConfig @@ -89,9 +92,12 @@ func Create(mCtxArgs *mc.ContextArgs, args *LinuxArgs) (err error) { cloudConfigAsUserData: args.CloudConfigAsUserData, readinessCommand: &args.ReadinessCommand, } - ir, err := data.GetImageRef(*r.osType, *r.arch, *r.version) - if err != nil { - return err + ir := args.ImageRef + if ir == nil { + ir, err = data.GetImageRef(*r.osType, 
*r.arch, *r.version) + if err != nil { + return err + } } r.allocationData, err = allocation.Allocation(mCtx, &allocation.AllocationArgs{ @@ -173,34 +179,27 @@ func (r *linuxRequest) deployer(ctx *pulumi.Context) error { return err } ctx.Export(fmt.Sprintf("%s-%s", *r.prefix, outputUserPrivateKey), privateKey.PrivateKeyPem) - // Image refence info - var userDataB64 *string + vmArgs := &virtualmachine.VirtualMachineArgs{ + Prefix: *r.prefix, + ComponentID: azureLinuxID, + ResourceGroup: rg, + NetworkInteface: n.NetworkInterface, + // Check this + VMSize: r.allocationData.ComputeSizes[0], + Image: r.allocationData.ImageRef, + AdminUsername: *r.username, + PrivateKey: privateKey, + SpotPrice: r.allocationData.Price, + Location: *r.allocationData.Location, + } if r.cloudConfigAsUserData != nil { - var err error - userDataB64, err = r.cloudConfigAsUserData.CloudConfig() + userDataB64, err := r.cloudConfigAsUserData.CloudConfig() if err != nil { return fmt.Errorf("error creating RHEL Server on Azure: %v", err) } + vmArgs.UserDataAsBase64 = pulumi.String(*userDataB64) } - - vm, err := virtualmachine.Create(ctx, r.mCtx, - &virtualmachine.VirtualMachineArgs{ - Prefix: *r.prefix, - ComponentID: azureLinuxID, - ResourceGroup: rg, - NetworkInteface: n.NetworkInterface, - // Check this - VMSize: r.allocationData.ComputeSizes[0], - Publisher: r.allocationData.ImageRef.Publisher, - Offer: r.allocationData.ImageRef.Offer, - Sku: r.allocationData.ImageRef.Sku, - ImageID: r.allocationData.ImageRef.ID, - AdminUsername: *r.username, - PrivateKey: privateKey, - SpotPrice: r.allocationData.Price, - UserDataAsBase64: pulumi.String(*userDataB64), - Location: *r.allocationData.Location, - }) + vm, err := virtualmachine.Create(ctx, r.mCtx, vmArgs) if err != nil { return err } diff --git a/pkg/provider/azure/action/rhel-ai/rhelai.go b/pkg/provider/azure/action/rhel-ai/rhelai.go new file mode 100644 index 000000000..6b346bd37 --- /dev/null +++ b/pkg/provider/azure/action/rhel-ai/rhelai.go @@ 
-0,0 +1,52 @@ +package rhelai + +import ( + "fmt" + "strings" + + maptContext "github.com/redhat-developer/mapt/pkg/manager/context" + azureLinux "github.com/redhat-developer/mapt/pkg/provider/azure/action/linux" + "github.com/redhat-developer/mapt/pkg/provider/azure/data" + "github.com/redhat-developer/mapt/pkg/provider/util/command" + apiRHELAI "github.com/redhat-developer/mapt/pkg/targets/host/rhelai" + "github.com/redhat-developer/mapt/pkg/util/logging" +) + +const ( + imageOwnerSubscriptionId = "02db6bd4-035c-4074-b699-468f3d914744" + // $1 accelerator $2 version + imageNameRegex = "rhel-ai-%s-azure-%s" + // $1 subscriptionId $2 rgName + imageIdRegex = "/subscriptions/%s/resourceGroups/aipcc-productization/providers/Microsoft.Compute/galleries/%s/images/%s/versions/1.0.0" + + username = "azureuser" +) + +func imageId(accelerator, version string) string { + iName := fmt.Sprintf(imageNameRegex, accelerator, version) + gName := strings.ReplaceAll(iName, "-", "_") + return fmt.Sprintf(imageIdRegex, + imageOwnerSubscriptionId, + gName, + iName) +} + +func Create(mCtxArgs *maptContext.ContextArgs, args *apiRHELAI.RHELAIArgs) (err error) { + logging.Debug("Creating RHEL Server") + azureLinuxRequest := + &azureLinux.LinuxArgs{ + Prefix: args.Prefix, + // Location: args.Location, + ComputeRequest: args.ComputeRequest, + Spot: args.Spot, + ImageRef: &data.ImageReference{ + SharedImageID: imageId(args.Accelerator, args.Version), + }, + Username: username, + ReadinessCommand: command.CommandPing} + return azureLinux.Create(mCtxArgs, azureLinuxRequest) +} + +func Destroy(mCtxArgs *maptContext.ContextArgs) error { + return azureLinux.Destroy(mCtxArgs) +} diff --git a/pkg/provider/azure/action/windows/windows.go b/pkg/provider/azure/action/windows/windows.go index 3bfb09949..d4fbfaf32 100644 --- a/pkg/provider/azure/action/windows/windows.go +++ b/pkg/provider/azure/action/windows/windows.go @@ -22,6 +22,7 @@ import ( cr 
"github.com/redhat-developer/mapt/pkg/provider/api/compute-request" spotTypes "github.com/redhat-developer/mapt/pkg/provider/api/spot" "github.com/redhat-developer/mapt/pkg/provider/azure" + "github.com/redhat-developer/mapt/pkg/provider/azure/data" "github.com/redhat-developer/mapt/pkg/provider/azure/modules/allocation" "github.com/redhat-developer/mapt/pkg/provider/azure/modules/network" virtualmachine "github.com/redhat-developer/mapt/pkg/provider/azure/modules/virtual-machine" @@ -182,10 +183,12 @@ func (r *windowsRequest) deployer(ctx *pulumi.Context) error { ResourceGroup: rg, NetworkInteface: n.NetworkInterface, // Check this - VMSize: r.allocationData.ComputeSizes[0], - Publisher: "MicrosoftWindowsDesktop", - Offer: fmt.Sprintf("windows-%s", *r.version), - Sku: fmt.Sprintf("win%s-%s", *r.version, *r.feature), + VMSize: r.allocationData.ComputeSizes[0], + Image: &data.ImageReference{ + Publisher: "MicrosoftWindowsDesktop", + Offer: fmt.Sprintf("windows-%s", *r.version), + Sku: fmt.Sprintf("win%s-%s", *r.version, *r.feature), + }, AdminUsername: *r.adminUsername, AdminPasswd: adminPasswd, SpotPrice: r.allocationData.Price, diff --git a/pkg/provider/azure/data/imageref.go b/pkg/provider/azure/data/imageref.go index 90516cdd4..a2d4dc5e9 100644 --- a/pkg/provider/azure/data/imageref.go +++ b/pkg/provider/azure/data/imageref.go @@ -19,12 +19,17 @@ var ( const fedoraImageGalleryBase = "/CommunityGalleries/Fedora-5e266ba4-2250-406d-adad-5d73860d958f/Images/" +// /subscriptions/02db6bd4-035c-4074-b699-468f3d914744/resourceGroups/RHEL-AI-CUDA-AZURE-3.0.0/providers/Microsoft.Compute/galleries/rhel_ai_cuda_azure_3.0.0/images/rhel-ai-cuda-azure-3.0.0/versions/1.0.0 + type ImageReference struct { + // Market Place Publisher string Offer string Sku string - // community gallery image ID - ID string + // Community + CommunityImageID string + // // Private Shared + SharedImageID string } var ( @@ -50,10 +55,10 @@ var ( }, Fedora: { "x86_64": { - ID: fedoraImageGalleryBase + 
"Fedora-Cloud-%s-x64/Versions/latest", + CommunityImageID: fedoraImageGalleryBase + "Fedora-Cloud-%s-x64/Versions/latest", }, "arm64": { - ID: fedoraImageGalleryBase + "Fedora-Cloud-%s-Arm64/Versions/latest", + CommunityImageID: fedoraImageGalleryBase + "Fedora-Cloud-%s-Arm64/Versions/latest", }, }, } @@ -78,7 +83,7 @@ func GetImageRef(osTarget OSType, arch string, version string) (*ImageReference, }, nil case Fedora: return &ImageReference{ - ID: fmt.Sprintf(ir.ID, versions[0]), + CommunityImageID: fmt.Sprintf(ir.CommunityImageID, versions[0]), }, nil } return nil, fmt.Errorf("os type not supported") diff --git a/pkg/provider/azure/data/images.go b/pkg/provider/azure/data/images.go index 050544415..fc2f29b26 100644 --- a/pkg/provider/azure/data/images.go +++ b/pkg/provider/azure/data/images.go @@ -8,8 +8,6 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azidentity" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" - mc "github.com/redhat-developer/mapt/pkg/manager/context" - "github.com/redhat-developer/mapt/pkg/util/logging" ) type ImageRequest struct { @@ -17,44 +15,74 @@ type ImageRequest struct { ImageReference } -func GetImage(ctx context.Context, req ImageRequest) (*armcompute.CommunityGalleryImagesClientGetResponse, error) { +func IsImageOffered(ctx context.Context, req ImageRequest) error { cred, err := azidentity.NewDefaultAzureCredential(nil) if err != nil { - return nil, err + return err } subscriptionId := os.Getenv("AZURE_SUBSCRIPTION_ID") - clientFactory, err := armcompute.NewClientFactory(subscriptionId, cred, nil) if err != nil { - return nil, err + return err } - // for community gallary images - if len(req.ID) > 0 { - // extract gallary ID and image name from ID url which looks like: - // /CommunityGalleries/Fedora-5e266ba4-2250-406d-adad-5d73860d958f/Images/Fedora-Cloud-40-Arm64/Versions/latest - parts := strings.Split(req.ID, "/") - if len(parts) != 7 { - return nil, fmt.Errorf("invalid community gallary image ID: 
%s", req.ID) - } - res, err := clientFactory.NewCommunityGalleryImagesClient().Get(ctx, req.Region, parts[2], parts[4], nil) - if err != nil { - return nil, err - } - return &res, nil + if len(req.CommunityImageID) > 0 { + _, err := getCommunityImage(ctx, clientFactory, &req.CommunityImageID, &req.Region) + return err + } + if len(req.SharedImageID) > 0 { + _, err := getSharedImage(ctx, clientFactory, &req.SharedImageID) + return err } // for azure offered VM images: https://learn.microsoft.com/en-us/rest/api/compute/virtual-machine-images/get // there's a different API to check but currently we only check availability of community images - return nil, nil + return fmt.Errorf("no valid image to check") } -func IsImageOffered(mCtx *mc.Context, req ImageRequest) bool { - if _, err := GetImage(mCtx.Context(), req); err != nil { - if mCtx.Debug() { - logging.Debugf("error while checking if image available at location: %v", err) - } - return false +func getCommunityImage(ctx context.Context, c *armcompute.ClientFactory, id, region *string) (*armcompute.CommunityGalleryImagesClientGetResponse, error) { + // extract gallary ID and image name from ID url which looks like: + // /CommunityGalleries/Fedora-5e266ba4-2250-406d-adad-5d73860d958f/Images/Fedora-Cloud-40-Arm64/Versions/latest + parts := strings.Split(*id, "/") + if len(parts) != 7 { + return nil, fmt.Errorf("invalid community gallary image ID: %s", *id) + } + res, err := c.NewCommunityGalleryImagesClient().Get(ctx, *region, parts[2], parts[4], nil) + if err != nil { + return nil, err + } + return &res, nil +} + +func GetSharedImage(ctx context.Context, id *string) (*armcompute.GalleryImageVersionsClientGetResponse, error) { + cred, err := azidentity.NewDefaultAzureCredential(nil) + if err != nil { + return nil, err + } + subscriptionId := os.Getenv("AZURE_SUBSCRIPTION_ID") + c, err := armcompute.NewClientFactory(subscriptionId, cred, nil) + if err != nil { + return nil, err + } + parts := strings.Split(*id, "/") + 
if len(parts) != 13 { + return nil, fmt.Errorf("invalid shared image ID: %s", *id) + } + res, err := c.NewGalleryImageVersionsClient().Get(ctx, parts[4], parts[8], parts[10], parts[12], nil) + if err != nil { + return nil, err + } + return &res, nil +} + +func getSharedImage(ctx context.Context, c *armcompute.ClientFactory, id *string) (*armcompute.GalleryImageVersionsClientGetResponse, error) { + parts := strings.Split(*id, "/") + if len(parts) != 13 { + return nil, fmt.Errorf("invalid shared image ID: %s", *id) + } + res, err := c.NewGalleryImageVersionsClient().Get(ctx, parts[4], parts[8], parts[10], parts[12], nil) + if err != nil { + return nil, err } - return true + return &res, nil } func SkuG2Support(ctx context.Context, location string, publisher string, offer string, sku string) (string, error) { diff --git a/pkg/provider/azure/data/spot.go b/pkg/provider/azure/data/spot.go index 7d4df8d17..0c39adfc2 100644 --- a/pkg/provider/azure/data/spot.go +++ b/pkg/provider/azure/data/spot.go @@ -137,6 +137,7 @@ func SpotInfo(mCtx *mc.Context, args *SpotInfoArgs) (*spot.SpotResults, error) { return nil, err } c, err := selectSpotChoice( + mCtx.Context(), &spotChoiceArgs{ evictionRates: evictionRates, spotPricings: spotPricings, @@ -197,16 +198,41 @@ func filterLocations(mCtx *mc.Context, args *SpotInfoArgs) ([]string, error) { }) } if args.ImageRef != nil { - locations = util.ArrayFilter(locations, - func(location string) bool { - return IsImageOffered(mCtx, - ImageRequest{ - Region: location, - ImageReference: *args.ImageRef, - }) - }) + locationsWithImage, err := hostingPlaces.RunOnHostingPlaces(locations, + imageOfferedArgs{ + ir: *args.ImageRef, + ctx: mCtx.Context(), + }, + isImageOfferedAsync) + if err != nil { + return nil, err + } + locations = utilMaps.KeysFiltered(locationsWithImage, func(v bool) bool { return v }) + } + if len(locations) == 0 { + return nil, fmt.Errorf("no locations to look for machines using current parameters") } - return locations, err 
+ return locations, nil +} + +type imageOfferedArgs struct { + ir ImageReference + ctx context.Context +} + +// This will check if image is offered in an async way +func isImageOfferedAsync(location string, args imageOfferedArgs, c chan hostingPlaces.HostingPlaceData[bool]) { + err := IsImageOffered(args.ctx, + ImageRequest{ + Region: location, + ImageReference: args.ir, + }) + if err != nil { + logging.Error(err) + } + c <- hostingPlaces.HostingPlaceData[bool]{ + Region: location, + Value: err == nil} } func allowedER(spotTolerance spot.Tolerance) []string { @@ -401,7 +427,7 @@ type spotChoiceArgs struct { // # Also function take cares to transfrom from AzID to AZName // // first option matching the requirements will be returned -func selectSpotChoice(args *spotChoiceArgs) (*SpotInfoResult, error) { +func selectSpotChoice(ctx context.Context, args *spotChoiceArgs) (*SpotInfoResult, error) { result := make(map[string]*SpotInfoResult) // Fix random error with graphql query not giving information for eviction rates if len(args.evictionRates) == 0 { @@ -432,7 +458,19 @@ func selectSpotChoice(args *spotChoiceArgs) (*SpotInfoResult, error) { if len(spis) == 0 { return nil, fmt.Errorf("no good choice was found") } - return spis[0], nil + sirIndex := slices.IndexFunc(spis, + func(sir *SpotInfoResult) bool { + isOffered, err := IsVMSizeOfferedByLocation(ctx, sir.ComputeSize, sir.Location) + if err != nil { + logging.Error(err) + return false + } + return isOffered + }) + if sirIndex == -1 { + return nil, fmt.Errorf("no good choice was found") + } + return spis[sirIndex], nil } // // This is a fallback function in case we need to get an option only based in price diff --git a/pkg/provider/azure/modules/virtual-machine/virtual-machine.go b/pkg/provider/azure/modules/virtual-machine/virtual-machine.go index 9a0fd3eff..2d1fe3f3d 100644 --- a/pkg/provider/azure/modules/virtual-machine/virtual-machine.go +++ b/pkg/provider/azure/modules/virtual-machine/virtual-machine.go @@ -2,6 
+2,8 @@ package virtualmachine import ( "fmt" + "os" + "strings" "github.com/pulumi/pulumi-azure-native-sdk/compute/v3" "github.com/pulumi/pulumi-azure-native-sdk/network/v3" @@ -26,12 +28,10 @@ type VirtualMachineArgs struct { ResourceGroup *resources.ResourceGroup NetworkInteface *network.NetworkInterface VMSize string - Publisher string - Offer string - Sku string - SpotPrice *float64 + + SpotPrice *float64 // community galary image ID - ImageID string + Image *data.ImageReference // Windows required AdminUsername string // Linux required @@ -47,21 +47,9 @@ type VirtualMachine = *compute.VirtualMachine // Create virtual machine based on request + export to context // adminusername and adminuserpassword func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (VirtualMachine, error) { - var imageReferenceArgs compute.ImageReferenceArgs - if len(args.ImageID) > 0 { - imageReferenceArgs = compute.ImageReferenceArgs{ - CommunityGalleryImageId: pulumi.String(args.ImageID)} - } else { - finalSku, err := data.SkuG2Support(mCtx.Context(), args.Location, args.Publisher, args.Offer, args.Sku) - if err != nil { - return nil, err - } - imageReferenceArgs = compute.ImageReferenceArgs{ - Publisher: pulumi.String(args.Publisher), - Offer: pulumi.String(args.Offer), - Sku: pulumi.String(finalSku), - Version: pulumi.String("latest"), - } + ira, err := convertImageRef(mCtx, *args.Image, args.Location) + if err != nil { + return nil, err } vmArgs := &compute.VirtualMachineArgs{ VmName: pulumi.String(mCtx.RunID()), @@ -79,7 +67,7 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *VirtualMachineArgs) (Vi VmSize: pulumi.String(args.VMSize), }, StorageProfile: compute.StorageProfileArgs{ - ImageReference: imageReferenceArgs, + ImageReference: ira, OsDisk: compute.OSDiskArgs{ Name: pulumi.String(mCtx.RunID()), DiskSizeGB: pulumi.Int(diskSize), @@ -136,3 +124,37 @@ func osProfile(computerName string, args *VirtualMachineArgs) compute.OSProfileA } return 
osProfile } + +func convertImageRef(mCtx *mc.Context, i data.ImageReference, location string) (*compute.ImageReferenceArgs, error) { + if len(i.CommunityImageID) > 0 { + return &compute.ImageReferenceArgs{ + CommunityGalleryImageId: pulumi.String(i.CommunityImageID), + }, nil + } + if len(i.SharedImageID) > 0 { + if isSelfOwned(&i.SharedImageID) { + return &compute.ImageReferenceArgs{ + Id: pulumi.String(i.SharedImageID), + }, nil + } + return &compute.ImageReferenceArgs{ + SharedGalleryImageId: pulumi.String(i.SharedImageID), + }, nil + + } + finalSku, err := data.SkuG2Support(mCtx.Context(), location, i.Publisher, i.Offer, i.Sku) + if err != nil { + return nil, err + } + return &compute.ImageReferenceArgs{ + Publisher: pulumi.String(i.Publisher), + Offer: pulumi.String(i.Offer), + Sku: pulumi.String(finalSku), + Version: pulumi.String("latest"), + }, nil +} + +func isSelfOwned(sharedImageId *string) bool { + sharedImageParams := strings.Split(*sharedImageId, "/") + return os.Getenv("AZURE_SUBSCRIPTION_ID") == sharedImageParams[2] +} diff --git a/pkg/provider/azure/services/network/security-group/security-group.go b/pkg/provider/azure/services/network/security-group/security-group.go index 99c4f5a66..f1af2a427 100644 --- a/pkg/provider/azure/services/network/security-group/security-group.go +++ b/pkg/provider/azure/services/network/security-group/security-group.go @@ -29,7 +29,6 @@ type SecurityGroup = *network.NetworkSecurityGroup func Create(ctx *pulumi.Context, mCtx *mc.Context, args *SecurityGroupArgs) (SecurityGroup, error) { nsg, err := network.NewNetworkSecurityGroup(ctx, args.Name, - &network.NetworkSecurityGroupArgs{ NetworkSecurityGroupName: pulumi.String(args.Name), ResourceGroupName: args.RG.Name, @@ -40,9 +39,6 @@ func Create(ctx *pulumi.Context, mCtx *mc.Context, args *SecurityGroupArgs) (Sec if err != nil { return nil, err } - if err != nil { - return nil, err - } return nsg, nil } diff --git a/pkg/targets/host/rhelai/api.go 
b/pkg/targets/host/rhelai/api.go new file mode 100644 index 000000000..90e04bc96 --- /dev/null +++ b/pkg/targets/host/rhelai/api.go @@ -0,0 +1,18 @@ +package rhelai + +import ( + cr "github.com/redhat-developer/mapt/pkg/provider/api/compute-request" + spotTypes "github.com/redhat-developer/mapt/pkg/provider/api/spot" +) + +type RHELAIArgs struct { + Prefix string + Accelerator string + Version string + CustomAMI string + Arch string + ComputeRequest *cr.ComputeRequestArgs + Spot *spotTypes.SpotArgs + // If timeout is set a severless scheduled task will be created to self destroy the resources + Timeout string +} diff --git a/pkg/util/maps/maps.go b/pkg/util/maps/maps.go index e430bdac7..c38b1ecdc 100644 --- a/pkg/util/maps/maps.go +++ b/pkg/util/maps/maps.go @@ -17,6 +17,16 @@ func Keys[X comparable, Y any](m map[X]Y) []X { return keys } +func KeysFiltered[X comparable, Y any](m map[X]Y, matchFilter func(y Y) bool) []X { + keys := make([]X, 0, len(m)) + for k, v := range m { + if matchFilter(v) { + keys = append(keys, k) + } + } + return keys +} + func Values[X comparable, Y any](m map[X]Y) []Y { values := make([]Y, 0, len(m)) for _, v := range m { diff --git a/tkn/infra-azure-rhel-ai.yaml b/tkn/infra-azure-rhel-ai.yaml new file mode 100644 index 000000000..4f57baa92 --- /dev/null +++ b/tkn/infra-azure-rhel-ai.yaml @@ -0,0 +1,289 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: infra-azure-rhel-ai + labels: + app.kubernetes.io/version: "1.0.0-dev" + annotations: + tekton.dev/pipelines.minVersion: "0.44.x" + tekton.dev/categories: infrastructure + tekton.dev/tags: infrastructure, azure, rhelai + tekton.dev/displayName: "azure manager" + tekton.dev/platforms: "linux/amd64, linux/arm64" +spec: + description: | + Task provision a RHEL AI dedicated on host on Azure + + volumes: + - name: az-credentials + secret: + secretName: $(params.secret-az-credentials) + - name: host-info + emptyDir: {} + + params: + - name: secret-az-credentials + 
description: | + ocp secret holding the azure credentials. Secret should be accessible to this task. + + --- + apiVersion: v1 + kind: Secret + metadata: + name: ${name} + type: Opaque + data: + tenant_id: ${tenant_id} + subscription_id: ${subscription_id} + client_id: ${client_id} + client_secret: ${client_secret} + storage_account: ${storage_account} + storage_key: ${storage_key} + blob: ${blob} + - name: id + description: identifier for the provisioned environment + - name: operation + description: operation to execute within the infrastructure. Current values (create, destroy) + + # Secret result + # naming + - name: host-access-secret-name + type: string + default: "" + description: | + Once the target is provisioned the config to connect is added to a secret + check results. If this param is set the secret will be created with the name set + otherwise it will be created with a random name. + # ownership + - name: ownerKind + type: string + default: "PipelineRun" + description: | + The type of resource that should own the generated SpaceRequest. + Deletion of this resource will trigger deletion of the SpaceRequest. + Supported values: `PipelineRun`, `TaskRun`. + - name: ownerName + type: string + default: "" + description: | + The name of the resource that should own the generated SpaceRequest. + This should either be passed the value of `$(context.pipelineRun.name)` + or `$(context.taskRun.name)` depending on the value of `ownerKind`. + - name: ownerUid + type: string + default: "" + description: | + The uid of the resource that should own the generated SpaceRequest. + This should either be passed the value of `$(context.pipelineRun.uid)` + or `$(context.taskRun.uid)` depending on the value of `ownerKind`. + + # VM type params + - name: compute-sizes + description: Comma separated list of sizes for the machines to be requested.
If set this takes precedence over compute by args + default: "Standard_ND96is_MI300X_v5,Standard_ND96isr_MI300X_v5" + - name: spot + description: Check best spot option to spin the machine and will create resources on that region. + default: "true" + - name: spot-eviction-tolerance + description: | + If spot is enabled we can define the minimum tolerance level of eviction. + Allowed values are: lowest, low, medium, high or highest + default: "lowest" + - name: spot-excluded-regions + description: Comma-separated list of zone IDs to exclude from spot selection + default: "" + - name: spot-increase-rate + description: Percentage to be added on top of the current calculated spot price to increase chances to get the machine. + default: "20" + + # RHEL AI params + - name: version + description: Version of RHEL AI OS (default 3.2.0) + default: "3.2.0" + - name: accelerator + description: accelerator for RHEL AI OS rocm or cuda (default rocm) + default: "rocm" + + # Metadata params + - name: tags + description: tags for the resources created on the providers + default: "" + + # Control params + - name: debug + description: | + Warning: setting this param to true exposes partially masked credentials + + The parameter is intended to add verbosity on the task execution and also print masked credentials + (showing first and last character with *** in the middle) on stdout to help with debugging + default: "false" + - name: keep-state + description: | + Keep Pulumi state files in the Azure Blob Storage backend after successful destroy (by default, state files are removed). Only used when operation is destroy. + Allowed values: true, false + default: "false" + + results: + - name: host-access-secret + description: | + ocp secret holding the information to connect with the target machine.
+ + --- + apiVersion: v1 + kind: Secret + metadata: + name: ${name} + labels: + type: Opaque + data: + host: ${host} + username: ${username} + id_rsa: ${id_rsa} + # If airgap data for bastion host + bastion-host: ${bastion-host} + bastion-username: ${bastion-username} + bastion-id_rsa: ${bastion-id_rsa} + + steps: + - name: provisioner + image: quay.io/redhat-developer/mapt:v1.0.0-dev + imagePullPolicy: Always + volumeMounts: + - name: az-credentials + mountPath: /opt/az-credentials + - name: host-info + mountPath: /opt/host-info + script: | + #!/bin/sh + + set -euo pipefail + + # Function to mask credentials (show first and last char, hide middle) + mask_credential() { + local cred="$1" + local len=${#cred} + if [ $len -le 2 ]; then + echo "***" + else + echo "${cred:0:1}***${cred: -1}" + fi + } + + # Credentials - set these BEFORE enabling debug mode + export ARM_TENANT_ID=$(cat /opt/az-credentials/tenant_id) + export ARM_SUBSCRIPTION_ID=$(cat /opt/az-credentials/subscription_id) + export ARM_CLIENT_ID=$(cat /opt/az-credentials/client_id) + export ARM_CLIENT_SECRET=$(cat /opt/az-credentials/client_secret) + export AZURE_STORAGE_ACCOUNT=$(cat /opt/az-credentials/storage_account) + export AZURE_STORAGE_KEY=$(cat /opt/az-credentials/storage_key) + BLOB=$(cat /opt/az-credentials/blob) + + # If debug add verbosity and print masked credentials + if [[ "$(params.debug)" == "true" ]]; then + echo "ARM_TENANT_ID=$(mask_credential "$ARM_TENANT_ID")" + echo "ARM_SUBSCRIPTION_ID=$(mask_credential "$ARM_SUBSCRIPTION_ID")" + echo "ARM_CLIENT_ID=$(mask_credential "$ARM_CLIENT_ID")" + echo "ARM_CLIENT_SECRET=$(mask_credential "$ARM_CLIENT_SECRET")" + echo "AZURE_STORAGE_ACCOUNT=$(mask_credential "$AZURE_STORAGE_ACCOUNT")" + echo "AZURE_STORAGE_KEY=$(mask_credential "$AZURE_STORAGE_KEY")" + echo "BLOB=$BLOB" + set -xeuo pipefail + fi + + if [[ "$(params.operation)" == "create" ]]; then + if [[ "$(params.ownerName)" == "" || "$(params.ownerUid)" == "" ]]; then + echo "Parameter 
ownerName and ownerUid is recommended when creating instance" + fi + fi + + # Run mapt + cmd="mapt azure rhel-ai $(params.operation) " + cmd+="--project-name mapt-rhel-ai-$(params.id) " + cmd+="--backed-url azblob://${BLOB}/rhel-$(params.id) " + + if [[ "$(params.debug)" == "true" ]]; then + cmd+="--debug " + fi + + if [[ "$(params.operation)" == "create" ]]; then + cmd+="--conn-details-output /opt/host-info " + cmd+="--compute-sizes '$(params.compute-sizes)' " + cmd+="--version '$(params.version)' " + cmd+="--accelerator '$(params.accelerator)' " + if [[ "$(params.spot)" == "true" ]]; then + cmd+="--spot " + cmd+="--spot-increase-rate '$(params.spot-increase-rate)' " + cmd+="--spot-eviction-tolerance '$(params.spot-eviction-tolerance)' " + cmd+="--spot-excluded-regions '$(params.spot-excluded-regions)' " + fi + cmd+="--tags '$(params.tags)' " + fi + + if [[ "$(params.operation)" == "destroy" && "$(params.keep-state)" == "true" ]]; then + cmd+="--keep-state " + fi + + eval "${cmd}" + + resources: + requests: + memory: "200Mi" + cpu: "100m" + limits: + memory: "600Mi" + cpu: "300m" + - name: host-info-secret + image: registry.redhat.io/openshift4/ose-cli:4.13@sha256:e70eb2be867f1236b19f5cbfeb8e0625737ce0ec1369e32a4f9f146aaaf68d49 + env: + - name: NAMESPACE + value: $(context.taskRun.namespace) + - name: OWNER_KIND + value: $(params.ownerKind) + - name: OWNER_NAME + value: $(params.ownerName) + - name: OWNER_UID + value: $(params.ownerUid) + volumeMounts: + - name: host-info + mountPath: /opt/host-info + workingDir: /opt/host-info + script: | + #!/bin/bash + set -eo pipefail + if [[ "$(params.operation)" == "create" ]]; then + export SECRETNAME="generateName: mapt-aws-rhel-ai-" + if [[ "$(params.host-access-secret-name)" != "" ]]; then + export SECRETNAME="name: $(params.host-access-secret-name)" + fi + cat < host-info.yaml + apiVersion: v1 + kind: Secret + metadata: + $SECRETNAME + namespace: $NAMESPACE + EOF + if [[ "$OWNER_NAME" != "" && "$OWNER_UID" != "" ]]; 
then + cat <> host-info.yaml + ownerReferences: + - apiVersion: tekton.dev/v1 + kind: $OWNER_KIND + name: $OWNER_NAME + uid: $OWNER_UID + EOF + fi + cat <> host-info.yaml + type: Opaque + data: + host: $(cat /opt/host-info/host | base64 -w0) + username: $(cat /opt/host-info/username | base64 -w0) + id_rsa: $(cat /opt/host-info/id_rsa | base64 -w0) + EOF + + if [[ "$(params.debug)" == "true" ]]; then + cat /opt/host-info/* + fi + + NAME=$(oc create -f host-info.yaml -o=jsonpath='{.metadata.name}') + echo -n "${NAME}" | tee $(results.host-access-secret.path) + fi \ No newline at end of file diff --git a/tkn/template/infra-azure-rhel-ai.yaml b/tkn/template/infra-azure-rhel-ai.yaml new file mode 100644 index 000000000..0402c0862 --- /dev/null +++ b/tkn/template/infra-azure-rhel-ai.yaml @@ -0,0 +1,289 @@ +--- +apiVersion: tekton.dev/v1beta1 +kind: Task +metadata: + name: infra-azure-rhel-ai + labels: + app.kubernetes.io/version: "" + annotations: + tekton.dev/pipelines.minVersion: "0.44.x" + tekton.dev/categories: infrastructure + tekton.dev/tags: infrastructure, azure, rhelai + tekton.dev/displayName: "azure manager" + tekton.dev/platforms: "linux/amd64, linux/arm64" +spec: + description: | + Task provision a RHEL AI dedicated on host on Azure + + volumes: + - name: az-credentials + secret: + secretName: $(params.secret-az-credentials) + - name: host-info + emptyDir: {} + + params: + - name: secret-az-credentials + description: | + ocp secret holding the azure credentials. Secret should be accessible to this task. + + --- + apiVersion: v1 + kind: Secret + metadata: + name: ${name} + type: Opaque + data: + tenant_id: ${tenant_id} + subscription_id: ${subscription_id} + client_id: ${client_id} + client_secret: ${client_secret} + storage_account: ${storage_account} + storage_key: ${storage_key} + blob: ${blob} + - name: id + description: identifier for the provisioned environment + - name: operation + description: operation to execute within the infrastructure. 
Current values (create, destroy) + + # Secret result + # naming + - name: host-access-secret-name + type: string + default: "" + description: | + Once the target is provisioned the config to connect is added to a secret + check results. If this param is set the secret will be created with the name set + otherwise it will be created with a random name. + # ownership + - name: ownerKind + type: string + default: "PipelineRun" + description: | + The type of resource that should own the generated SpaceRequest. + Deletion of this resource will trigger deletion of the SpaceRequest. + Supported values: `PipelineRun`, `TaskRun`. + - name: ownerName + type: string + default: "" + description: | + The name of the resource that should own the generated SpaceRequest. + This should either be passed the value of `$(context.pipelineRun.name)` + or `$(context.taskRun.name)` depending on the value of `ownerKind`. + - name: ownerUid + type: string + default: "" + description: | + The uid of the resource that should own the generated SpaceRequest. + This should either be passed the value of `$(context.pipelineRun.uid)` + or `$(context.taskRun.uid)` depending on the value of `ownerKind`. + + # VM type params + - name: compute-sizes + description: Comma separated list of sizes for the machines to be requested. If set this takes precedence over compute by args + default: "Standard_ND96is_MI300X_v5,Standard_ND96isr_MI300X_v5" + - name: spot + description: Check best spot option to spin the machine and will create resources on that region. + default: "true" + - name: spot-eviction-tolerance + description: | + If spot is enabled we can define the minimum tolerance level of eviction.
+ Allowed values are: lowest, low, medium, high or highest + default: "lowest" + - name: spot-excluded-regions + description: Comma-separated list of zone IDs to exclude from spot selection + default: "" + - name: spot-increase-rate + description: Percentage to be added on top of the current calculated spot price to increase chances to get the machine. + default: "20" + + # RHEL AI params + - name: version + description: Version of RHEL AI OS (default 3.2.0) + default: "3.2.0" + - name: accelerator + description: accelerator for RHEL AI OS rocm or cuda (default rocm) + default: "rocm" + + # Metadata params + - name: tags + description: tags for the resources created on the providers + default: "" + + # Control params + - name: debug + description: | + Warning: setting this param to true exposes partially masked credentials + + The parameter is intended to add verbosity on the task execution and also print masked credentials + (showing first and last character with *** in the middle) on stdout to help with debugging + default: "false" + - name: keep-state + description: | + Keep Pulumi state files in the Azure Blob Storage backend after successful destroy (by default, state files are removed). Only used when operation is destroy. + Allowed values: true, false + default: "false" + + results: + - name: host-access-secret + description: | + ocp secret holding the information to connect with the target machine.
+ + --- + apiVersion: v1 + kind: Secret + metadata: + name: ${name} + labels: + type: Opaque + data: + host: ${host} + username: ${username} + id_rsa: ${id_rsa} + # If airgap data for bastion host + bastion-host: ${bastion-host} + bastion-username: ${bastion-username} + bastion-id_rsa: ${bastion-id_rsa} + + steps: + - name: provisioner + image: + imagePullPolicy: Always + volumeMounts: + - name: az-credentials + mountPath: /opt/az-credentials + - name: host-info + mountPath: /opt/host-info + script: | + #!/bin/sh + + set -euo pipefail + + # Function to mask credentials (show first and last char, hide middle) + mask_credential() { + local cred="$1" + local len=${#cred} + if [ $len -le 2 ]; then + echo "***" + else + echo "${cred:0:1}***${cred: -1}" + fi + } + + # Credentials - set these BEFORE enabling debug mode + export ARM_TENANT_ID=$(cat /opt/az-credentials/tenant_id) + export ARM_SUBSCRIPTION_ID=$(cat /opt/az-credentials/subscription_id) + export ARM_CLIENT_ID=$(cat /opt/az-credentials/client_id) + export ARM_CLIENT_SECRET=$(cat /opt/az-credentials/client_secret) + export AZURE_STORAGE_ACCOUNT=$(cat /opt/az-credentials/storage_account) + export AZURE_STORAGE_KEY=$(cat /opt/az-credentials/storage_key) + BLOB=$(cat /opt/az-credentials/blob) + + # If debug add verbosity and print masked credentials + if [[ "$(params.debug)" == "true" ]]; then + echo "ARM_TENANT_ID=$(mask_credential "$ARM_TENANT_ID")" + echo "ARM_SUBSCRIPTION_ID=$(mask_credential "$ARM_SUBSCRIPTION_ID")" + echo "ARM_CLIENT_ID=$(mask_credential "$ARM_CLIENT_ID")" + echo "ARM_CLIENT_SECRET=$(mask_credential "$ARM_CLIENT_SECRET")" + echo "AZURE_STORAGE_ACCOUNT=$(mask_credential "$AZURE_STORAGE_ACCOUNT")" + echo "AZURE_STORAGE_KEY=$(mask_credential "$AZURE_STORAGE_KEY")" + echo "BLOB=$BLOB" + set -xeuo pipefail + fi + + if [[ "$(params.operation)" == "create" ]]; then + if [[ "$(params.ownerName)" == "" || "$(params.ownerUid)" == "" ]]; then + echo "Parameter ownerName and ownerUid is recommended when 
creating instance" + fi + fi + + # Run mapt + cmd="mapt azure rhel-ai $(params.operation) " + cmd+="--project-name mapt-rhel-ai-$(params.id) " + cmd+="--backed-url azblob://${BLOB}/rhel-$(params.id) " + + if [[ "$(params.debug)" == "true" ]]; then + cmd+="--debug " + fi + + if [[ "$(params.operation)" == "create" ]]; then + cmd+="--conn-details-output /opt/host-info " + cmd+="--compute-sizes '$(params.compute-sizes)' " + cmd+="--version '$(params.version)' " + cmd+="--accelerator '$(params.accelerator)' " + if [[ "$(params.spot)" == "true" ]]; then + cmd+="--spot " + cmd+="--spot-increase-rate '$(params.spot-increase-rate)' " + cmd+="--spot-eviction-tolerance '$(params.spot-eviction-tolerance)' " + cmd+="--spot-excluded-regions '$(params.spot-excluded-regions)' " + fi + cmd+="--tags '$(params.tags)' " + fi + + if [[ "$(params.operation)" == "destroy" && "$(params.keep-state)" == "true" ]]; then + cmd+="--keep-state " + fi + + eval "${cmd}" + + resources: + requests: + memory: "200Mi" + cpu: "100m" + limits: + memory: "600Mi" + cpu: "300m" + - name: host-info-secret + image: registry.redhat.io/openshift4/ose-cli:4.13@sha256:e70eb2be867f1236b19f5cbfeb8e0625737ce0ec1369e32a4f9f146aaaf68d49 + env: + - name: NAMESPACE + value: $(context.taskRun.namespace) + - name: OWNER_KIND + value: $(params.ownerKind) + - name: OWNER_NAME + value: $(params.ownerName) + - name: OWNER_UID + value: $(params.ownerUid) + volumeMounts: + - name: host-info + mountPath: /opt/host-info + workingDir: /opt/host-info + script: | + #!/bin/bash + set -eo pipefail + if [[ "$(params.operation)" == "create" ]]; then + export SECRETNAME="generateName: mapt-aws-rhel-ai-" + if [[ "$(params.host-access-secret-name)" != "" ]]; then + export SECRETNAME="name: $(params.host-access-secret-name)" + fi + cat < host-info.yaml + apiVersion: v1 + kind: Secret + metadata: + $SECRETNAME + namespace: $NAMESPACE + EOF + if [[ "$OWNER_NAME" != "" && "$OWNER_UID" != "" ]]; then + cat <> host-info.yaml + 
ownerReferences: + - apiVersion: tekton.dev/v1 + kind: $OWNER_KIND + name: $OWNER_NAME + uid: $OWNER_UID + EOF + fi + cat <> host-info.yaml + type: Opaque + data: + host: $(cat /opt/host-info/host | base64 -w0) + username: $(cat /opt/host-info/username | base64 -w0) + id_rsa: $(cat /opt/host-info/id_rsa | base64 -w0) + EOF + + if [[ "$(params.debug)" == "true" ]]; then + cat /opt/host-info/* + fi + + NAME=$(oc create -f host-info.yaml -o=jsonpath='{.metadata.name}') + echo -n "${NAME}" | tee $(results.host-access-secret.path) + fi \ No newline at end of file