Skip to content

Commit b4a389a

Browse files
authored
improvement(helm): update GPU device plugin and add cert-manager issuers (#3036)
* improvement(helm): update GPU device plugin and add cert-manager issuers
* fix(helm): address code review feedback for GPU plugin and cert-manager
* fix(helm): remove duplicate nodeSelector, add hook for CA issuer ordering
* fix(helm): remove incorrect hook, CA issuer auto-reconciles
1 parent 65bc216 commit b4a389a

File tree

3 files changed

+191
-37
lines changed

3 files changed

+191
-37
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
{{- if .Values.certManager.enabled }}
{{- /*
cert-manager Issuer Bootstrap Pattern

PREREQUISITE: cert-manager must be installed in your cluster before enabling this.
The root CA Certificate is created in the namespace specified by certManager.rootCA.namespace
(defaults to "cert-manager"). Ensure this namespace exists and cert-manager is running there.

Install cert-manager: https://cert-manager.io/docs/installation/

This implements the recommended pattern from cert-manager documentation:
1. A self-signed ClusterIssuer (for bootstrapping the root CA only)
2. A root CA Certificate (self-signed, used to sign other certificates)
3. A CA ClusterIssuer (uses the root CA to sign certificates)

Reference: https://cert-manager.io/docs/configuration/selfsigned/

Rendering note: every user-supplied string below is piped through `quote` so that
values YAML would otherwise retype or reject (an all-digit release name, a value
containing ": " or " #", a leading "*") are always emitted as strings.
privateKey.size is intentionally left unquoted because it must render as an integer.
*/ -}}

---
# 1. Self-Signed ClusterIssuer (Bootstrap Only)
# This issuer is used ONLY to create the root CA certificate.
# It should NOT be used directly for application certificates.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  selfSigned: {}

---
# 2. Root CA Certificate
# This certificate is signed by the self-signed issuer and becomes the root of trust.
# The secret created here will be used by the CA issuer to sign certificates.
# NOTE: This must be created in the cert-manager namespace (or the namespace specified
# in certManager.rootCA.namespace). Ensure cert-manager is installed there first.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ .Values.certManager.rootCA.certificateName | quote }}
  # Must match cert-manager's cluster-resource-namespace
  namespace: {{ .Values.certManager.rootCA.namespace | default "cert-manager" | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  isCA: true
  commonName: {{ .Values.certManager.rootCA.commonName | quote }}
  secretName: {{ .Values.certManager.rootCA.secretName | quote }}
  duration: {{ .Values.certManager.rootCA.duration | default "87600h" | quote }}
  renewBefore: {{ .Values.certManager.rootCA.renewBefore | default "2160h" | quote }}
  privateKey:
    algorithm: {{ .Values.certManager.rootCA.privateKey.algorithm | default "RSA" | quote }}
    size: {{ .Values.certManager.rootCA.privateKey.size | default 4096 }}
  subject:
    organizations:
      {{- if .Values.certManager.rootCA.subject.organizations }}
      {{- toYaml .Values.certManager.rootCA.subject.organizations | nindent 6 }}
      {{- else }}
      # No organizations configured: fall back to the release name.
      - {{ .Release.Name | quote }}
      {{- end }}
  issuerRef:
    name: {{ .Values.certManager.selfSignedIssuer.name | quote }}
    kind: ClusterIssuer
    group: cert-manager.io

---
# 3. CA ClusterIssuer
# This is the issuer that should be used by applications to obtain certificates.
# It signs certificates using the root CA created above.
# NOTE: This issuer may briefly show "not ready" on first install while cert-manager
# processes the Certificate above and creates the secret. It will auto-reconcile.
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: {{ .Values.certManager.caIssuer.name | quote }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: cert-manager
spec:
  ca:
    # Same secret the root CA Certificate above writes to.
    secretName: {{ .Values.certManager.rootCA.secretName | quote }}
{{- end }}

helm/sim/templates/gpu-device-plugin.yaml

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
{{- if and .Values.ollama.enabled .Values.ollama.gpu.enabled }}
22
---
3-
# NVIDIA Device Plugin DaemonSet for GPU support
{{- $gpuStrategy := .Values.ollama.gpu.strategy }}
# 1. ConfigMap for NVIDIA Device Plugin Configuration
# Holds the config.yaml consumed by the device plugin; its contents depend on
# .Values.ollama.gpu.strategy ("mig" or "time-slicing").
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ printf "%s-nvidia-device-plugin-config" (include "sim.fullname" .) }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "sim.labels" . | nindent 4 }}
    app.kubernetes.io/component: nvidia-device-plugin
data:
  config.yaml: |
    version: v1
    flags:
      migStrategy: {{ ternary "single" "none" (eq $gpuStrategy "mig") | quote }}
      failOnInitError: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy: envvar
    {{- if eq $gpuStrategy "time-slicing" }}
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: {{ .Values.ollama.gpu.timeSlicingReplicas | default 5 }}
    {{- end }}
32+
---
33+
# 2. NVIDIA Device Plugin DaemonSet for GPU support
434
apiVersion: apps/v1
535
kind: DaemonSet
636
metadata:
@@ -35,9 +65,6 @@ spec:
3565
# Only schedule on nodes with NVIDIA GPUs
3666
accelerator: nvidia
3767
priorityClassName: system-node-critical
38-
runtimeClassName: nvidia
39-
hostNetwork: true
40-
hostPID: true
4168
volumes:
4269
- name: device-plugin
4370
hostPath:
@@ -48,22 +75,21 @@ spec:
4875
- name: sys
4976
hostPath:
5077
path: /sys
51-
- name: proc-driver-nvidia
52-
hostPath:
53-
path: /proc/driver/nvidia
78+
# Volume to mount the ConfigMap
79+
- name: nvidia-device-plugin-config
80+
configMap:
81+
name: {{ include "sim.fullname" . }}-nvidia-device-plugin-config
5482
containers:
5583
- name: nvidia-device-plugin
56-
image: nvcr.io/nvidia/k8s-device-plugin:v0.14.5
84+
image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2
5785
imagePullPolicy: Always
5886
args:
59-
- --mig-strategy=single
60-
- --pass-device-specs=true
61-
- --fail-on-init-error=false
62-
- --device-list-strategy=envvar
63-
- --nvidia-driver-root=/host-sys/fs/cgroup
87+
- "--config-file=/etc/device-plugin/config.yaml"
88+
{{- if eq .Values.ollama.gpu.strategy "mig" }}
6489
env:
6590
- name: NVIDIA_MIG_MONITOR_DEVICES
6691
value: all
92+
{{- end }}
6793
securityContext:
6894
allowPrivilegeEscalation: false
6995
capabilities:
@@ -74,29 +100,16 @@ spec:
74100
- name: dev
75101
mountPath: /dev
76102
- name: sys
77-
mountPath: /host-sys
103+
mountPath: /sys
78104
readOnly: true
79-
- name: proc-driver-nvidia
80-
mountPath: /proc/driver/nvidia
105+
- name: nvidia-device-plugin-config
106+
mountPath: /etc/device-plugin/
81107
readOnly: true
82108
resources:
83109
requests:
84110
cpu: 50m
85-
memory: 10Mi
111+
memory: 20Mi
86112
limits:
87113
cpu: 50m
88-
memory: 20Mi
89-
{{- if .Values.nodeSelector }}
90-
nodeSelector:
91-
{{- toYaml .Values.nodeSelector | nindent 8 }}
92-
{{- end }}
93-
---
94-
# RuntimeClass for NVIDIA Container Runtime
95-
apiVersion: node.k8s.io/v1
96-
kind: RuntimeClass
97-
metadata:
98-
name: {{ include "sim.fullname" . }}-nvidia
99-
labels:
100-
{{- include "sim.labels" . | nindent 4 }}
101-
handler: nvidia
102-
{{- end }}
114+
memory: 50Mi
115+
{{- end }}

helm/sim/values.yaml

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,10 @@ postgresql:
400400
algorithm: RSA # RSA or ECDSA
401401
size: 4096 # Key size in bits
402402
# Issuer reference (REQUIRED if tls.enabled is true)
403+
# By default, references the CA issuer created by certManager.caIssuer
404+
# Make sure certManager.enabled is true, or provide your own issuer
403405
issuerRef:
404-
name: selfsigned-cluster-issuer # Name of your cert-manager Issuer/ClusterIssuer
406+
name: sim-ca-issuer # Name of your cert-manager Issuer/ClusterIssuer
405407
kind: ClusterIssuer # ClusterIssuer or Issuer
406408
group: "" # Optional: cert-manager.io (leave empty for default)
407409
# Additional DNS names (optional)
@@ -463,20 +465,26 @@ externalDatabase:
463465
ollama:
464466
# Enable/disable Ollama deployment
465467
enabled: false
466-
468+
467469
# Image configuration
468470
image:
469471
repository: ollama/ollama
470472
tag: latest
471473
pullPolicy: Always
472-
474+
473475
# Number of replicas
474476
replicaCount: 1
475-
477+
476478
# GPU configuration
477479
gpu:
478480
enabled: false
479481
count: 1
482+
# GPU sharing strategy: "mig" (Multi-Instance GPU) or "time-slicing"
483+
# - mig: Hardware-level GPU partitioning (requires supported GPUs like A100)
484+
# - time-slicing: Software-level GPU sharing (works with most NVIDIA GPUs)
485+
strategy: "time-slicing"
486+
# Number of time-slicing replicas (only used when strategy is "time-slicing")
487+
timeSlicingReplicas: 5
480488

481489
# Node selector for GPU workloads (adjust labels based on your cluster configuration)
482490
nodeSelector:
@@ -1185,4 +1193,53 @@ externalSecrets:
11851193
# External database password (when using managed database services)
11861194
externalDatabase:
11871195
# Path to external database password in external store
1188-
password: ""
1196+
password: ""
1197+
# cert-manager configuration
# Prerequisite: cert-manager must already be installed in the cluster.
# See: https://cert-manager.io/docs/installation/
#
# Enabling this renders the CA bootstrap pattern recommended by cert-manager:
#   1. Self-signed ClusterIssuer (bootstrap only - creates root CA)
#   2. Root CA Certificate (self-signed, becomes the trust anchor)
#   3. CA ClusterIssuer (signs application certificates using root CA)
#
# Reference: https://cert-manager.io/docs/configuration/selfsigned/
certManager:
  # Toggle for rendering the cert-manager issuer resources
  enabled: false

  # Self-signed ClusterIssuer: exists ONLY to bootstrap the root CA.
  # Application certificates should not reference this issuer directly.
  selfSignedIssuer:
    name: "sim-selfsigned-bootstrap-issuer"

  # Root CA Certificate: signed by the self-signed issuer, then used as the trust anchor
  rootCA:
    # Name of the Certificate resource
    certificateName: "sim-root-ca"
    # Namespace that receives the root CA certificate and its secret.
    # Must match cert-manager's cluster-resource-namespace (default: cert-manager)
    namespace: "cert-manager"
    # Common name placed on the root CA certificate
    commonName: "sim-root-ca"
    # Secret that stores the root CA certificate and key
    secretName: "sim-root-ca-secret"
    # Certificate validity duration (default: 10 years)
    duration: "87600h"
    # How long before expiry to renew (default: 90 days)
    renewBefore: "2160h"
    # Private key configuration
    privateKey:
      algorithm: RSA
      size: 4096
    # Subject configuration
    subject:
      # If empty, defaults to the release name
      organizations: []

  # CA ClusterIssuer: the issuer applications should reference to obtain certificates
  caIssuer:
    name: "sim-ca-issuer"

0 commit comments

Comments
 (0)