diff --git a/components/frontend/src/types/project-settings.ts b/components/frontend/src/types/project-settings.ts
index ccb9ebd0f..d0aff5cd9 100644
--- a/components/frontend/src/types/project-settings.ts
+++ b/components/frontend/src/types/project-settings.ts
@@ -4,11 +4,19 @@ export type LLMSettings = {
maxTokens: number;
};
+export type S3StorageConfig = {
+ enabled: boolean;
+ endpoint: string;
+ bucket: string;
+ region?: string;
+};
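+// Example value (illustrative only; endpoint/bucket shown are the in-cluster MinIO defaults):
+//   const exampleS3: S3StorageConfig = {
+//     enabled: true,
+//     endpoint: "http://minio.ambient-code.svc:9000",
+//     bucket: "ambient-sessions",
+//   };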
+
export type ProjectDefaultSettings = {
llmSettings: LLMSettings;
defaultTimeout: number;
allowedWebsiteDomains?: string[];
maxConcurrentSessions: number;
+ s3Storage?: S3StorageConfig;
};
export type ProjectResourceLimits = {
diff --git a/components/manifests/base/kustomization.yaml b/components/manifests/base/kustomization.yaml
index 58c3c658b..e35dc92cb 100644
--- a/components/manifests/base/kustomization.yaml
+++ b/components/manifests/base/kustomization.yaml
@@ -13,6 +13,7 @@ resources:
- frontend-deployment.yaml
- operator-deployment.yaml
- workspace-pvc.yaml
+- minio-deployment.yaml
# Default images (can be overridden by overlays)
images:
@@ -24,4 +25,6 @@ images:
newTag: latest
- name: quay.io/ambient_code/vteam_claude_runner
newTag: latest
+- name: quay.io/ambient_code/vteam_state_sync
+ newTag: latest
diff --git a/components/manifests/base/minio-credentials-secret.yaml.example b/components/manifests/base/minio-credentials-secret.yaml.example
new file mode 100644
index 000000000..58472d078
--- /dev/null
+++ b/components/manifests/base/minio-credentials-secret.yaml.example
@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: Secret
+metadata:
+ name: minio-credentials
+type: Opaque
+stringData:
+ # MinIO root credentials
+ # Change these values in production!
+ root-user: "admin"
+ root-password: "changeme123"
+
+ # For use in project settings (same credentials for convenience)
+ access-key: "admin"
+ secret-key: "changeme123"
+---
+# Instructions:
+# 1. Copy this file to minio-credentials-secret.yaml
+# 2. Change root-user and root-password to secure values
+# 3. Apply: kubectl apply -f minio-credentials-secret.yaml -n ambient-code
+#
+# After MinIO is running:
+# 1. Access MinIO console: kubectl port-forward svc/minio 9001:9001 -n ambient-code
+# 2. Open http://localhost:9001 in browser
+# 3. Login with root-user/root-password
+# 4. Create bucket: "ambient-sessions"
+# 5. Configure bucket in project settings:
+# - S3_ENDPOINT: http://minio.ambient-code.svc:9000
+# - S3_BUCKET: ambient-sessions
+# - S3_ACCESS_KEY: {your-root-user}
+# - S3_SECRET_KEY: {your-root-password}
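+#
+# Example of the resulting per-project setting (matches the frontend S3StorageConfig type;
+# values shown are the in-cluster defaults above, adjust for your environment):
+#   s3Storage:
+#     enabled: true
+#     endpoint: http://minio.ambient-code.svc:9000
+#     bucket: ambient-sessions
+#     region: us-east-1   # optional, example value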
+
diff --git a/components/manifests/base/minio-deployment.yaml b/components/manifests/base/minio-deployment.yaml
new file mode 100644
index 000000000..f537d4d74
--- /dev/null
+++ b/components/manifests/base/minio-deployment.yaml
@@ -0,0 +1,102 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: minio-data
+ labels:
+ app: minio
+spec:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 50Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: minio
+ labels:
+ app: minio
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: minio
+ template:
+ metadata:
+ labels:
+ app: minio
+ spec:
+ containers:
+ - name: minio
+ image: quay.io/minio/minio:latest
+ args:
+ - server
+ - /data
+ - --console-address
+ - ":9001"
+ env:
+ - name: MINIO_ROOT_USER
+ valueFrom:
+ secretKeyRef:
+ name: minio-credentials
+ key: root-user
+ - name: MINIO_ROOT_PASSWORD
+ valueFrom:
+ secretKeyRef:
+ name: minio-credentials
+ key: root-password
+ ports:
+ - containerPort: 9000
+ name: api
+ protocol: TCP
+ - containerPort: 9001
+ name: console
+ protocol: TCP
+ volumeMounts:
+ - name: data
+ mountPath: /data
+ livenessProbe:
+ httpGet:
+ path: /minio/health/live
+ port: 9000
+ initialDelaySeconds: 30
+ periodSeconds: 10
+ readinessProbe:
+ httpGet:
+ path: /minio/health/ready
+ port: 9000
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ resources:
+ requests:
+ cpu: 250m
+ memory: 512Mi
+ limits:
+ cpu: 1000m
+ memory: 2Gi
+ volumes:
+ - name: data
+ persistentVolumeClaim:
+ claimName: minio-data
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: minio
+ labels:
+ app: minio
+spec:
+ type: ClusterIP
+ ports:
+ - port: 9000
+ targetPort: 9000
+ protocol: TCP
+ name: api
+ - port: 9001
+ targetPort: 9001
+ protocol: TCP
+ name: console
+ selector:
+ app: minio
+
diff --git a/components/manifests/base/operator-deployment.yaml b/components/manifests/base/operator-deployment.yaml
index fe8d38056..fe6a7b08e 100644
--- a/components/manifests/base/operator-deployment.yaml
+++ b/components/manifests/base/operator-deployment.yaml
@@ -19,7 +19,21 @@ spec:
- name: agentic-operator
image: quay.io/ambient_code/vteam_operator:latest
imagePullPolicy: Always
+ args:
+ # Controller-runtime configuration
+ - --max-concurrent-reconciles=10 # Process up to 10 sessions in parallel
+ - --health-probe-bind-address=:8081
+ - --leader-elect=false # Enable for HA deployments with replicas > 1
+ # Uncomment for debugging with legacy watch-based implementation:
+ # - --legacy-watch
+ ports:
+ - containerPort: 8081
+ name: health
+ protocol: TCP
env:
+ # Controller concurrency (can be overridden via args)
+ - name: MAX_CONCURRENT_RECONCILES
+ value: "10"
- name: NAMESPACE
valueFrom:
fieldRef:
@@ -35,7 +49,7 @@ spec:
- name: CONTENT_SERVICE_IMAGE
value: "quay.io/ambient_code/vteam_backend:latest"
- name: IMAGE_PULL_POLICY
- value: "Always"
+ value: "IfNotPresent"
# Vertex AI configuration from ConfigMap
- name: CLAUDE_CODE_USE_VERTEX
valueFrom:
@@ -96,6 +110,20 @@ spec:
name: google-workflow-app-secret
key: GOOGLE_OAUTH_CLIENT_SECRET
optional: true
+ # S3 state sync configuration (defaults - can be overridden per-project in settings)
+ - name: STATE_SYNC_IMAGE
+ value: "quay.io/ambient_code/vteam_state_sync:latest"
+ - name: S3_ENDPOINT
+ value: "http://minio.ambient-code.svc:9000" # In-cluster MinIO (change for external S3)
+ - name: S3_BUCKET
+ value: "ambient-sessions" # Create this bucket in MinIO console
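+ # Example: point to external S3 instead of in-cluster MinIO (illustrative values only):
+ # - name: S3_ENDPOINT
+ #   value: "https://s3.us-east-1.amazonaws.com"
+ # - name: S3_BUCKET
+ #   value: "my-ambient-sessions"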
+ # OpenTelemetry configuration
+ - name: OTEL_EXPORTER_OTLP_ENDPOINT
+ value: "otel-collector.ambient-code.svc:4317" # Deploy OTel collector separately
+ - name: DEPLOYMENT_ENV
+ value: "production"
+ - name: VERSION
+ value: "latest" # Override with actual version in production
resources:
requests:
cpu: 50m
@@ -104,11 +132,15 @@ spec:
cpu: 200m
memory: 256Mi
livenessProbe:
- exec:
- command:
- - /bin/sh
- - -c
- - "ps aux | grep '[o]perator' || exit 1"
- initialDelaySeconds: 30
+ httpGet:
+ path: /healthz
+ port: health
+ initialDelaySeconds: 15
+ periodSeconds: 20
+ readinessProbe:
+ httpGet:
+ path: /readyz
+ port: health
+ initialDelaySeconds: 5
periodSeconds: 10
restartPolicy: Always
diff --git a/components/manifests/base/rbac/operator-clusterrole.yaml b/components/manifests/base/rbac/operator-clusterrole.yaml
index e5a6b97ae..6d19ba779 100644
--- a/components/manifests/base/rbac/operator-clusterrole.yaml
+++ b/components/manifests/base/rbac/operator-clusterrole.yaml
@@ -25,10 +25,10 @@ rules:
- apiGroups: ["batch"]
resources: ["jobs"]
verbs: ["get", "list", "watch", "create", "delete"]
-# Pods (for getting logs from failed jobs and cleanup on stop)
+# Pods (create runner pods directly, get logs, and cleanup on stop)
- apiGroups: [""]
resources: ["pods"]
- verbs: ["get", "list", "watch", "delete", "deletecollection"]
+ verbs: ["get", "list", "watch", "create", "delete", "deletecollection"]
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
diff --git a/components/manifests/deploy.sh b/components/manifests/deploy.sh
index ba0a3ba90..c3f33eb3a 100755
--- a/components/manifests/deploy.sh
+++ b/components/manifests/deploy.sh
@@ -133,6 +133,7 @@ DEFAULT_BACKEND_IMAGE="${DEFAULT_BACKEND_IMAGE:-${CONTAINER_REGISTRY}/vteam_back
DEFAULT_FRONTEND_IMAGE="${DEFAULT_FRONTEND_IMAGE:-${CONTAINER_REGISTRY}/vteam_frontend:${IMAGE_TAG}}"
DEFAULT_OPERATOR_IMAGE="${DEFAULT_OPERATOR_IMAGE:-${CONTAINER_REGISTRY}/vteam_operator:${IMAGE_TAG}}"
DEFAULT_RUNNER_IMAGE="${DEFAULT_RUNNER_IMAGE:-${CONTAINER_REGISTRY}/vteam_claude_runner:${IMAGE_TAG}}"
+DEFAULT_STATE_SYNC_IMAGE="${DEFAULT_STATE_SYNC_IMAGE:-${CONTAINER_REGISTRY}/vteam_state_sync:${IMAGE_TAG}}"
# Content service image (defaults to same as backend, but can be overridden)
CONTENT_SERVICE_IMAGE="${CONTENT_SERVICE_IMAGE:-${DEFAULT_BACKEND_IMAGE}}"
@@ -233,6 +234,7 @@ echo -e "Backend Image: ${GREEN}${DEFAULT_BACKEND_IMAGE}${NC}"
echo -e "Frontend Image: ${GREEN}${DEFAULT_FRONTEND_IMAGE}${NC}"
echo -e "Operator Image: ${GREEN}${DEFAULT_OPERATOR_IMAGE}${NC}"
echo -e "Runner Image: ${GREEN}${DEFAULT_RUNNER_IMAGE}${NC}"
+echo -e "State Sync Image: ${GREEN}${DEFAULT_STATE_SYNC_IMAGE}${NC}"
echo -e "Content Service Image: ${GREEN}${CONTENT_SERVICE_IMAGE}${NC}"
echo ""
@@ -305,6 +307,7 @@ kustomize edit set image quay.io/ambient_code/vteam_backend:latest=${DEFAULT_BAC
kustomize edit set image quay.io/ambient_code/vteam_frontend:latest=${DEFAULT_FRONTEND_IMAGE}
kustomize edit set image quay.io/ambient_code/vteam_operator:latest=${DEFAULT_OPERATOR_IMAGE}
kustomize edit set image quay.io/ambient_code/vteam_claude_runner:latest=${DEFAULT_RUNNER_IMAGE}
+kustomize edit set image quay.io/ambient_code/vteam_state_sync:latest=${DEFAULT_STATE_SYNC_IMAGE}
# Build and apply manifests
echo -e "${BLUE}Building and applying manifests...${NC}"
@@ -428,6 +431,7 @@ kustomize edit set image quay.io/ambient_code/vteam_backend:latest=quay.io/ambie
kustomize edit set image quay.io/ambient_code/vteam_frontend:latest=quay.io/ambient_code/vteam_frontend:latest
kustomize edit set image quay.io/ambient_code/vteam_operator:latest=quay.io/ambient_code/vteam_operator:latest
kustomize edit set image quay.io/ambient_code/vteam_claude_runner:latest=quay.io/ambient_code/vteam_claude_runner:latest
+kustomize edit set image quay.io/ambient_code/vteam_state_sync:latest=quay.io/ambient_code/vteam_state_sync:latest
cd ../..
echo -e "${GREEN}🎯 Ready to create RFE workflows with multi-agent collaboration!${NC}"
diff --git a/components/manifests/observability/README.md b/components/manifests/observability/README.md
new file mode 100644
index 000000000..1513a8eb2
--- /dev/null
+++ b/components/manifests/observability/README.md
@@ -0,0 +1,191 @@
+# Observability Stack for Ambient Code Platform
+
+Observability for OpenShift using **User Workload Monitoring** (no dedicated Prometheus needed).
+
+## Architecture
+
+```
+Operator (OTel SDK) → OTel Collector → OpenShift Prometheus
+                                               ↓
+                                      OpenShift Console
+                                               ↓
+                                      Grafana (optional)
+```
+
+## Quick Start
+
+### Deploy Base Stack
+
+```bash
+# From repository root
+make deploy-observability
+
+# Or manually
+kubectl apply -k components/manifests/observability/
+```
+
+**What you get**: OTel Collector + ServiceMonitor (128MB)
+
+### View Metrics
+
+Open **OpenShift Console → Observe → Metrics** and query:
+- `ambient_sessions_total`
+- `ambient_session_startup_duration_bucket`
+- `ambient_session_errors`
+
+---
+
+## Optional: Add Grafana
+
+If you want custom dashboards:
+
+```bash
+# Add Grafana overlay
+kubectl apply -k components/manifests/observability/overlays/with-grafana/
+```
+
+**Adds**: Grafana (additional 128MB) - still uses OpenShift Prometheus
+
+**Access Grafana**:
+```bash
+# Create route
+oc create route edge grafana --service=grafana -n ambient-code
+
+# Get URL
+oc get route grafana -n ambient-code -o jsonpath='{.spec.host}'
+# Login: admin/admin
+```
+
+**Import dashboard**: Upload `dashboards/ambient-operator-dashboard.json` in Grafana UI
+
+---
+
+## Components
+
+| Component | What It Does | Resource Usage |
+|-----------|--------------|----------------|
+| **OTel Collector** | Receives metrics from operator, exports to Prometheus format | 128MB RAM |
+| **ServiceMonitor** | Tells OpenShift Prometheus to scrape OTel Collector | None |
+| **Grafana** (optional) | Custom dashboards | 128MB RAM, 5GB storage |
+
+## Metrics Available
+
+All metrics are prefixed with `ambient_`:
+
+| Metric | Type | Description | Alert Threshold |
+|--------|------|-------------|-----------------|
+| `ambient_session_startup_duration` | Histogram | Time from creation to Running phase | p95 > 60s |
+| `ambient_session_phase_transitions` | Counter | Phase transition events | - |
+| `ambient_sessions_total` | Counter | Total sessions created | Sudden spikes |
+| `ambient_sessions_completed` | Counter | Sessions that reached terminal states | - |
+| `ambient_reconcile_duration` | Histogram | Reconciliation loop performance | p95 > 10s |
+| `ambient_pod_creation_duration` | Histogram | Time to create runner pods | p95 > 30s |
+| `ambient_token_provision_duration` | Histogram | Token provisioning time | p95 > 5s |
+| `ambient_session_errors` | Counter | Errors during reconciliation | Rate > 0.1/s |
+
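+### Example Alert Rules
+
+A minimal PrometheusRule sketch for the thresholds above, assuming the prometheus-operator
+CRDs that ship with OpenShift user workload monitoring (alert names and severities are
+illustrative; tune the expressions before applying):
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ambient-operator-alerts
+  namespace: ambient-code
+spec:
+  groups:
+    - name: ambient-operator
+      rules:
+        - alert: AmbientSessionStartupSlow
+          # p95 startup time above 60s for 10 minutes
+          expr: histogram_quantile(0.95, rate(ambient_session_startup_duration_bucket[5m])) > 60
+          for: 10m
+          labels:
+            severity: warning
+        - alert: AmbientSessionErrorRateHigh
+          # reconciliation error rate above 0.1/s for 10 minutes
+          expr: sum(rate(ambient_session_errors[5m])) > 0.1
+          for: 10m
+          labels:
+            severity: critical
+```
+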
+## Accessing Components
+
+### OpenShift Console (base and Grafana setups)
+
+Navigate to **Observe → Metrics** and query:
+
+```promql
+# Total sessions created
+ambient_sessions_total
+
+# Session creation rate
+rate(ambient_sessions_total[5m])
+
+# p95 startup time
+histogram_quantile(0.95, rate(ambient_session_startup_duration_bucket[5m]))
+
+# Error rate by namespace
+sum by (namespace) (rate(ambient_session_errors[5m]))
+```
+
+### OTel Collector Logs
+
+```bash
+kubectl logs -n ambient-code -l app=otel-collector -f
+```
+
+## Production Setup
+
+### Enable OpenShift User Workload Monitoring
+
+Check if enabled:
+```bash
+oc -n openshift-user-workload-monitoring get pod
+```
+
+If not:
+```bash
+oc apply -f - <<EOF
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: cluster-monitoring-config
+  namespace: openshift-monitoring
+data:
+  config.yaml: |
+    enableUserWorkload: true
+EOF
+```
- if job.Status.Active > 0 || (job.Status.Succeeded == 0 && job.Status.Failed == 0) {
- log.Printf("Job %s is still active, cleaning up job and pods", jobName)
-
- // First, delete the job itself with foreground propagation
- deletePolicy := v1.DeletePropagationForeground
- err = config.K8sClient.BatchV1().Jobs(sessionNamespace).Delete(context.TODO(), jobName, v1.DeleteOptions{
- PropagationPolicy: &deletePolicy,
- })
- if err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete job %s: %v", jobName, err)
- } else {
- log.Printf("Successfully deleted job %s for stopped session", jobName)
- }
+ // Pod exists, delete it
+ log.Printf("Pod %s is still active, cleaning up pod", podName)
- // Then, explicitly delete all pods for this job (by job-name label)
- podSelector := fmt.Sprintf("job-name=%s", jobName)
- log.Printf("Deleting pods with job-name selector: %s", podSelector)
- err = config.K8sClient.CoreV1().Pods(sessionNamespace).DeleteCollection(context.TODO(), v1.DeleteOptions{}, v1.ListOptions{
- LabelSelector: podSelector,
- })
- if err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete pods for job %s: %v (continuing anyway)", jobName, err)
- } else {
- log.Printf("Successfully deleted pods for job %s", jobName)
- }
+ // Delete the pod
+ deletePolicy := v1.DeletePropagationForeground
+ err = config.K8sClient.CoreV1().Pods(sessionNamespace).Delete(context.TODO(), podName, v1.DeleteOptions{
+ PropagationPolicy: &deletePolicy,
+ })
+ if err != nil && !errors.IsNotFound(err) {
+ log.Printf("Failed to delete pod %s: %v", podName, err)
+ } else {
+ log.Printf("Successfully deleted pod %s for stopped session", podName)
+ }
- // Also delete any pods labeled with this session (in case owner refs are lost)
- sessionPodSelector := fmt.Sprintf("agentic-session=%s", name)
- log.Printf("Deleting pods with agentic-session selector: %s", sessionPodSelector)
- err = config.K8sClient.CoreV1().Pods(sessionNamespace).DeleteCollection(context.TODO(), v1.DeleteOptions{}, v1.ListOptions{
- LabelSelector: sessionPodSelector,
- })
- if err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete session-labeled pods: %v (continuing anyway)", err)
- } else {
- log.Printf("Successfully deleted session-labeled pods")
- }
+ // Also delete any other pods labeled with this session (in case owner refs are lost)
+ sessionPodSelector := fmt.Sprintf("agentic-session=%s", name)
+ log.Printf("Deleting pods with agentic-session selector: %s", sessionPodSelector)
+ err = config.K8sClient.CoreV1().Pods(sessionNamespace).DeleteCollection(context.TODO(), v1.DeleteOptions{}, v1.ListOptions{
+ LabelSelector: sessionPodSelector,
+ })
+ if err != nil && !errors.IsNotFound(err) {
+ log.Printf("Failed to delete session-labeled pods: %v (continuing anyway)", err)
} else {
- log.Printf("Job %s already completed (Succeeded: %d, Failed: %d), no cleanup needed", jobName, job.Status.Succeeded, job.Status.Failed)
+ log.Printf("Successfully deleted session-labeled pods")
}
} else if !errors.IsNotFound(err) {
- log.Printf("Error checking job %s: %v", jobName, err)
+ log.Printf("Error checking pod %s: %v", podName, err)
} else {
- log.Printf("Job %s not found, already cleaned up", jobName)
+ log.Printf("Pod %s not found, already cleaned up", podName)
}
// Also cleanup ambient-vertex secret when session is stopped
@@ -508,25 +369,25 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
// If in Creating phase, check if job exists
if phase == "Creating" {
- jobName := fmt.Sprintf("%s-job", name)
- _, err := config.K8sClient.BatchV1().Jobs(sessionNamespace).Get(context.TODO(), jobName, v1.GetOptions{})
+ podName := fmt.Sprintf("%s-runner", name)
+ _, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), podName, v1.GetOptions{})
if err == nil {
- // Job exists, start monitoring if not already running
- monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, jobName)
- monitoredJobsMu.Lock()
- alreadyMonitoring := monitoredJobs[monitorKey]
+ // Pod exists, start monitoring if not already running
+ monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, podName)
+ monitoredPodsMu.Lock()
+ alreadyMonitoring := monitoredPods[monitorKey]
if !alreadyMonitoring {
- monitoredJobs[monitorKey] = true
- monitoredJobsMu.Unlock()
- log.Printf("Resuming monitoring for existing job %s (session in Creating phase)", jobName)
- go monitorJob(jobName, name, sessionNamespace)
+ monitoredPods[monitorKey] = true
+ monitoredPodsMu.Unlock()
+ log.Printf("Resuming monitoring for existing pod %s (session in Creating phase)", podName)
+ go monitorPod(podName, name, sessionNamespace)
} else {
- monitoredJobsMu.Unlock()
- log.Printf("Job %s already being monitored, skipping duplicate", jobName)
+ monitoredPodsMu.Unlock()
+ log.Printf("Pod %s already being monitored, skipping duplicate", podName)
}
return nil
} else if errors.IsNotFound(err) {
- // Job doesn't exist but phase is Creating - check if this is due to a stop request
+ // Pod doesn't exist but phase is Creating - check if this is due to a stop request
if desiredPhase == "Stopped" {
// Job already gone, can transition directly to Stopped (skip Stopping phase)
log.Printf("Session %s in Creating phase but job not found and stop requested, transitioning to Stopped", name)
@@ -537,14 +398,14 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
Type: conditionReady,
Status: "False",
Reason: "UserStopped",
- Message: "User requested stop during job creation",
+ Message: "User requested stop during pod creation",
})
// Update progress-tracking conditions
statusPatch.AddCondition(conditionUpdate{
- Type: conditionJobCreated,
+ Type: conditionPodCreated,
Status: "False",
Reason: "UserStopped",
- Message: "Job deleted by user stop request",
+ Message: "Pod deleted by user stop request",
})
statusPatch.AddCondition(conditionUpdate{
Type: conditionRunnerStarted,
@@ -558,11 +419,11 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
return nil
}
- // Job doesn't exist but phase is Creating - this is inconsistent state
+ // Pod doesn't exist but phase is Creating - this is inconsistent state
// Could happen if:
- // 1. Job was manually deleted
- // 2. Operator crashed between job creation and status update
- // 3. Session is being stopped and job was deleted (stale event)
+ // 1. Pod was manually deleted
+ // 2. Operator crashed between pod creation and status update
+ // 3. Session is being stopped and pod was deleted (stale event)
// Before recreating, verify the session hasn't been stopped
// Fetch fresh status to check for recent state changes
@@ -579,26 +440,26 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
freshStatus, _, _ := unstructured.NestedMap(freshObj.Object, "status")
freshPhase, _, _ := unstructured.NestedString(freshStatus, "phase")
if freshPhase == "Stopped" || freshPhase == "Stopping" || freshPhase == "Failed" || freshPhase == "Completed" {
- log.Printf("Session %s is now in %s phase (stale Creating event), skipping job recreation", name, freshPhase)
+ log.Printf("Session %s is now in %s phase (stale Creating event), skipping pod recreation", name, freshPhase)
return nil
}
}
- log.Printf("Session %s in Creating phase but job not found, resetting to Pending and recreating", name)
+ log.Printf("Session %s in Creating phase but pod not found, resetting to Pending and recreating", name)
statusPatch.SetField("phase", "Pending")
statusPatch.AddCondition(conditionUpdate{
- Type: conditionJobCreated,
+ Type: conditionPodCreated,
Status: "False",
- Reason: "JobMissing",
- Message: "Job not found, will recreate",
+ Reason: "PodMissing",
+ Message: "Pod not found, will recreate",
})
// Apply immediately and continue to Pending logic
_ = statusPatch.ApplyAndReset()
- // Don't return - fall through to Pending logic to create job
+ // Don't return - fall through to Pending logic to create pod
_ = "Pending" // phase reset handled by status update
} else {
- // Error checking job - log and continue
- log.Printf("Error checking job for Creating session %s: %v, will attempt recovery", name, err)
+ // Error checking pod - log and continue
+ log.Printf("Error checking pod for Creating session %s: %v, will attempt recovery", name, err)
// Fall through to Pending logic
_ = "Pending" // phase reset handled by status update
}
@@ -620,90 +481,8 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
}
}
- // Determine PVC name and owner references
- var pvcName string
- var ownerRefs []v1.OwnerReference
- reusingPVC := false
-
- if parentSessionID != "" {
- // Continuation: reuse parent's PVC
- pvcName = fmt.Sprintf("ambient-workspace-%s", parentSessionID)
- reusingPVC = true
- log.Printf("Session continuation: reusing PVC %s from parent session %s", pvcName, parentSessionID)
- // No owner refs - we don't own the parent's PVC
- } else {
- // New session: create fresh PVC with owner refs
- pvcName = fmt.Sprintf("ambient-workspace-%s", name)
- ownerRefs = []v1.OwnerReference{
- {
- APIVersion: "vteam.ambient-code/v1",
- Kind: "AgenticSession",
- Name: currentObj.GetName(),
- UID: currentObj.GetUID(),
- Controller: boolPtr(true),
- // BlockOwnerDeletion intentionally omitted to avoid permission issues
- },
- }
- }
-
- // Ensure PVC exists (skip for continuation if parent's PVC should exist)
- if !reusingPVC {
- if err := services.EnsureSessionWorkspacePVC(sessionNamespace, pvcName, ownerRefs); err != nil {
- log.Printf("Failed to ensure session PVC %s in %s: %v", pvcName, sessionNamespace, err)
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionPVCReady,
- Status: "False",
- Reason: "ProvisioningFailed",
- Message: err.Error(),
- })
- } else {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionPVCReady,
- Status: "True",
- Reason: "Bound",
- Message: fmt.Sprintf("PVC %s ready", pvcName),
- })
- }
- } else {
- // Verify parent's PVC exists
- if _, err := config.K8sClient.CoreV1().PersistentVolumeClaims(sessionNamespace).Get(context.TODO(), pvcName, v1.GetOptions{}); err != nil {
- log.Printf("Warning: Parent PVC %s not found for continuation session %s: %v", pvcName, name, err)
- // Fall back to creating new PVC with current session's owner refs
- pvcName = fmt.Sprintf("ambient-workspace-%s", name)
- ownerRefs = []v1.OwnerReference{
- {
- APIVersion: "vteam.ambient-code/v1",
- Kind: "AgenticSession",
- Name: currentObj.GetName(),
- UID: currentObj.GetUID(),
- Controller: boolPtr(true),
- },
- }
- if err := services.EnsureSessionWorkspacePVC(sessionNamespace, pvcName, ownerRefs); err != nil {
- log.Printf("Failed to create fallback PVC %s: %v", pvcName, err)
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionPVCReady,
- Status: "False",
- Reason: "ProvisioningFailed",
- Message: err.Error(),
- })
- } else {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionPVCReady,
- Status: "True",
- Reason: "Bound",
- Message: fmt.Sprintf("PVC %s ready", pvcName),
- })
- }
- } else {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionPVCReady,
- Status: "True",
- Reason: "Reused",
- Message: fmt.Sprintf("Reused PVC %s from parent session", pvcName),
- })
- }
- }
+ // EmptyDir replaces PVC - session state persists in S3
+ log.Printf("Session will use EmptyDir with S3 state persistence")
// Load config for this session
appConfig := config.LoadConfig()
@@ -795,61 +574,49 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
log.Printf("Langfuse disabled, skipping secret copy")
}
- // CRITICAL: Delete temp content pod before creating Job to avoid PVC mount conflict
- // The PVC is ReadWriteOnce, so only one pod can mount it at a time
- tempPodName = fmt.Sprintf("temp-content-%s", name)
- if _, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), tempPodName, v1.GetOptions{}); err == nil {
- log.Printf("[PVCConflict] Deleting temp pod %s before creating Job (ReadWriteOnce PVC)", tempPodName)
+ // Create a Kubernetes Pod for this AgenticSession
+ podName := fmt.Sprintf("%s-runner", name)
- // Force immediate termination with zero grace period
- gracePeriod := int64(0)
- deleteOptions := v1.DeleteOptions{
- GracePeriodSeconds: &gracePeriod,
- }
- if err := config.K8sClient.CoreV1().Pods(sessionNamespace).Delete(context.TODO(), tempPodName, deleteOptions); err != nil && !errors.IsNotFound(err) {
- log.Printf("[PVCConflict] Warning: failed to delete temp pod: %v", err)
- }
-
- // Wait for temp pod to fully terminate to prevent PVC mount conflicts
- // This is critical because ReadWriteOnce PVCs cannot be mounted by multiple pods
- // With gracePeriod=0, this should complete in 1-3 seconds
- log.Printf("[PVCConflict] Waiting for temp pod %s to fully terminate...", tempPodName)
- maxWaitSeconds := 10 // Reduced from 30 since we're force-deleting
- for i := 0; i < maxWaitSeconds*4; i++ { // Poll 4x per second for faster detection
- _, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), tempPodName, v1.GetOptions{})
- if errors.IsNotFound(err) {
- elapsed := float64(i) * 0.25
- log.Printf("[PVCConflict] Temp pod fully terminated after %.2f seconds", elapsed)
- break
- }
- if i == (maxWaitSeconds*4)-1 {
- log.Printf("[PVCConflict] Warning: temp pod still exists after %d seconds, proceeding anyway", maxWaitSeconds)
+ // Ensure runner token exists before creating pod
+ // This handles cases where sessions are created directly via kubectl (bypassing the backend)
+ // or when the backend failed to provision the token
+ runnerTokenSecretName := fmt.Sprintf("ambient-runner-token-%s", name)
+ if _, err := config.K8sClient.CoreV1().Secrets(sessionNamespace).Get(context.TODO(), runnerTokenSecretName, v1.GetOptions{}); err != nil {
+ if errors.IsNotFound(err) {
+ log.Printf("Runner token secret %s not found, creating it now", runnerTokenSecretName)
+ if err := regenerateRunnerToken(sessionNamespace, name, currentObj); err != nil {
+ errMsg := fmt.Sprintf("Failed to provision runner token: %v", err)
+ log.Print(errMsg)
+ statusPatch.SetField("phase", "Failed")
+ statusPatch.AddCondition(conditionUpdate{
+ Type: conditionReady,
+ Status: "False",
+ Reason: "TokenProvisionFailed",
+ Message: errMsg,
+ })
+ _ = statusPatch.Apply()
+ return fmt.Errorf("failed to provision runner token for session %s: %v", name, err)
}
- time.Sleep(250 * time.Millisecond) // Poll every 250ms instead of 1s
+ log.Printf("Successfully provisioned runner token for session %s", name)
+ } else {
+ log.Printf("Warning: error checking runner token secret: %v", err)
}
-
- // Clear temp pod annotations since we're starting the session
- _ = clearAnnotation(sessionNamespace, name, tempContentRequestedAnnotation)
- _ = clearAnnotation(sessionNamespace, name, tempContentLastAccessedAnnotation)
}
- // Create a Kubernetes Job for this AgenticSession
- jobName := fmt.Sprintf("%s-job", name)
-
- // Check if job already exists in the session's namespace
- _, err = config.K8sClient.BatchV1().Jobs(sessionNamespace).Get(context.TODO(), jobName, v1.GetOptions{})
+ // Check if pod already exists in the session's namespace
+ _, err = config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), podName, v1.GetOptions{})
if err == nil {
- log.Printf("Job %s already exists for AgenticSession %s", jobName, name)
+ log.Printf("Pod %s already exists for AgenticSession %s", podName, name)
statusPatch.SetField("phase", "Creating")
statusPatch.SetField("observedGeneration", currentObj.GetGeneration())
statusPatch.AddCondition(conditionUpdate{
- Type: conditionJobCreated,
+ Type: conditionPodCreated,
Status: "True",
- Reason: "JobExists",
- Message: "Runner job already exists",
+ Reason: "PodExists",
+ Message: "Runner pod already exists",
})
_ = statusPatch.Apply()
- // Clear desired-phase annotation if it exists (job already created)
+ // Clear desired-phase annotation if it exists (pod already created)
_ = clearAnnotation(sessionNamespace, name, "ambient-code.io/desired-phase")
return nil
}
@@ -919,63 +686,6 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
})
}
- // Extract repos configuration (simplified format: url and branch)
- type RepoConfig struct {
- URL string
- Branch string
- }
-
- var repos []RepoConfig
-
- // Read simplified repos[] array format
- if reposArr, found, _ := unstructured.NestedSlice(spec, "repos"); found && len(reposArr) > 0 {
- repos = make([]RepoConfig, 0, len(reposArr))
- for _, repoItem := range reposArr {
- if repoMap, ok := repoItem.(map[string]interface{}); ok {
- repo := RepoConfig{}
- if url, ok := repoMap["url"].(string); ok {
- repo.URL = url
- }
- if branch, ok := repoMap["branch"].(string); ok {
- repo.Branch = branch
- } else {
- repo.Branch = "main"
- }
- if repo.URL != "" {
- repos = append(repos, repo)
- }
- }
- }
- } else {
- // Fallback to old format for backward compatibility (input/output structure)
- inputRepo, _, _ := unstructured.NestedString(spec, "inputRepo")
- inputBranch, _, _ := unstructured.NestedString(spec, "inputBranch")
- if v, found, _ := unstructured.NestedString(spec, "input", "repo"); found && strings.TrimSpace(v) != "" {
- inputRepo = v
- }
- if v, found, _ := unstructured.NestedString(spec, "input", "branch"); found && strings.TrimSpace(v) != "" {
- inputBranch = v
- }
- if inputRepo != "" {
- if inputBranch == "" {
- inputBranch = "main"
- }
- repos = []RepoConfig{{
- URL: inputRepo,
- Branch: inputBranch,
- }}
- }
- }
-
- // Get first repo for backward compatibility env vars (first repo is always main repo)
- var inputRepo, inputBranch, outputRepo, outputBranch string
- if len(repos) > 0 {
- inputRepo = repos[0].URL
- inputBranch = repos[0].Branch
- outputRepo = repos[0].URL // Output same as input in simplified format
- outputBranch = repos[0].Branch
- }
-
// Read autoPushOnComplete flag
autoPushOnComplete, _, _ := unstructured.NestedBool(spec, "autoPushOnComplete")
@@ -992,18 +702,45 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
}
log.Printf("Session %s initiated by user: %s (userId: %s)", name, userName, userID)
- // Create the Job
- job := &batchv1.Job{
+ // Get S3 configuration for this project (from project secret or operator defaults)
+ s3Endpoint, s3Bucket, s3AccessKey, s3SecretKey, err := getS3ConfigForProject(sessionNamespace, appConfig)
+ if err != nil {
+ log.Printf("Warning: S3 not available for project %s: %v (sessions will use ephemeral storage only)", sessionNamespace, err)
+ statusPatch.AddCondition(conditionUpdate{
+ Type: "S3Available",
+ Status: "False",
+ Reason: "NotConfigured",
+ Message: fmt.Sprintf("S3 storage not configured: %v. Session state will not persist across pod restarts. Configure S3 in project settings.", err),
+ })
+ // Set empty values - init-hydrate and state-sync will skip S3 operations
+ s3Endpoint = ""
+ s3Bucket = ""
+ s3AccessKey = ""
+ s3SecretKey = ""
+ } else {
+ log.Printf("S3 configured for project %s: endpoint=%s, bucket=%s", sessionNamespace, s3Endpoint, s3Bucket)
+ statusPatch.AddCondition(conditionUpdate{
+ Type: "S3Available",
+ Status: "True",
+ Reason: "Configured",
+ Message: fmt.Sprintf("S3 storage configured: %s/%s", s3Endpoint, s3Bucket),
+ })
+ }
+
+ // Create the Pod directly (no Job wrapper for faster startup)
+ pod := &corev1.Pod{
ObjectMeta: v1.ObjectMeta{
- Name: jobName,
+ Name: podName,
Namespace: sessionNamespace,
Labels: map[string]string{
"agentic-session": name,
"app": "ambient-code-runner",
},
+ // If you run a service mesh that injects sidecars and causes egress issues:
+ // Annotations: map[string]string{"sidecar.istio.io/inject": "false"},
OwnerReferences: []v1.OwnerReference{
{
- APIVersion: "vteam.ambient-code/v1",
+ APIVersion: "vteam.ambient-code/v1alpha1",
Kind: "AgenticSession",
Name: currentObj.GetName(),
UID: currentObj.GetUID(),
@@ -1013,339 +750,422 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
},
},
},
- Spec: batchv1.JobSpec{
- BackoffLimit: int32Ptr(3),
- ActiveDeadlineSeconds: int64Ptr(14400), // 4 hour timeout for safety
- // Auto-cleanup finished Jobs if TTL controller is enabled in the cluster
- TTLSecondsAfterFinished: int32Ptr(600),
- Template: corev1.PodTemplateSpec{
- ObjectMeta: v1.ObjectMeta{
- Labels: map[string]string{
- "agentic-session": name,
- "app": "ambient-code-runner",
+ Spec: corev1.PodSpec{
+ RestartPolicy: corev1.RestartPolicyNever,
+ TerminationGracePeriodSeconds: int64Ptr(30), // Allow time for state-sync final sync
+ // Do not automount the default service account token; the runner authenticates with BOT_TOKEN from its runner token secret
+ AutomountServiceAccountToken: boolPtr(false),
+ Volumes: []corev1.Volume{
+ {
+ Name: "workspace",
+ VolumeSource: corev1.VolumeSource{
+ EmptyDir: &corev1.EmptyDirVolumeSource{
+ SizeLimit: resource.NewQuantity(10*1024*1024*1024, resource.BinarySI), // 10Gi
+ },
},
- // If you run a service mesh that injects sidecars and causes egress issues for Jobs:
- // Annotations: map[string]string{"sidecar.istio.io/inject": "false"},
},
- Spec: corev1.PodSpec{
- RestartPolicy: corev1.RestartPolicyNever,
- // Explicitly set service account for pod creation permissions
- AutomountServiceAccountToken: boolPtr(false),
- Volumes: []corev1.Volume{
- {
- Name: "workspace",
- VolumeSource: corev1.VolumeSource{
- PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
- ClaimName: pvcName,
- },
- },
+ },
+
+ // InitContainer to hydrate session state from S3
+ InitContainers: []corev1.Container{
+ {
+ Name: "init-hydrate",
+ Image: appConfig.StateSyncImage,
+ ImagePullPolicy: appConfig.ImagePullPolicy,
+ Command: []string{"/usr/local/bin/hydrate.sh"},
+ SecurityContext: &corev1.SecurityContext{
+ AllowPrivilegeEscalation: boolPtr(false),
+ ReadOnlyRootFilesystem: boolPtr(false),
+ Capabilities: &corev1.Capabilities{
+ Drop: []corev1.Capability{"ALL"},
},
},
+ Env: func() []corev1.EnvVar {
+ base := []corev1.EnvVar{
+ {Name: "SESSION_NAME", Value: name},
+ {Name: "NAMESPACE", Value: sessionNamespace},
+ {Name: "S3_ENDPOINT", Value: s3Endpoint},
+ {Name: "S3_BUCKET", Value: s3Bucket},
+ {Name: "AWS_ACCESS_KEY_ID", Value: s3AccessKey},
+ {Name: "AWS_SECRET_ACCESS_KEY", Value: s3SecretKey},
+ {Name: "GIT_USER_NAME", Value: os.Getenv("GIT_USER_NAME")},
+ {Name: "GIT_USER_EMAIL", Value: os.Getenv("GIT_USER_EMAIL")},
+ }
+
+ // Add repos JSON if present
+ if repos, ok := spec["repos"].([]interface{}); ok && len(repos) > 0 {
+ b, _ := json.Marshal(repos)
+ base = append(base, corev1.EnvVar{Name: "REPOS_JSON", Value: string(b)})
+ }
+
+ // Add workflow info if present
+ if workflow, ok := spec["activeWorkflow"].(map[string]interface{}); ok {
+ if gitURL, ok := workflow["gitUrl"].(string); ok && strings.TrimSpace(gitURL) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_GIT_URL", Value: gitURL})
+ }
+ if branch, ok := workflow["branch"].(string); ok && strings.TrimSpace(branch) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_BRANCH", Value: branch})
+ }
+ if path, ok := workflow["path"].(string); ok && strings.TrimSpace(path) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_PATH", Value: path})
+ }
+ }
+
+ // Add GitHub token for private repos
+ secretName := ""
+ if meta, ok := currentObj.Object["metadata"].(map[string]interface{}); ok {
+ if anns, ok := meta["annotations"].(map[string]interface{}); ok {
+ if v, ok := anns["ambient-code.io/runner-token-secret"].(string); ok && strings.TrimSpace(v) != "" {
+ secretName = strings.TrimSpace(v)
+ }
+ }
+ }
+ if secretName == "" {
+ secretName = fmt.Sprintf("ambient-runner-token-%s", name)
+ }
+ base = append(base, corev1.EnvVar{
+ Name: "BOT_TOKEN",
+ ValueFrom: &corev1.EnvVarSource{SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
+ Key: "k8s-token",
+ }},
+ })
+
+ return base
+ }(),
+ VolumeMounts: []corev1.VolumeMount{
+ {Name: "workspace", MountPath: "/workspace"},
+ // SubPath mount for .claude so init container writes to same location as runner
+ {Name: "workspace", MountPath: "/app/.claude", SubPath: ".claude"},
+ },
+ },
+ },
- // InitContainer to ensure workspace directory structure exists
- InitContainers: []corev1.Container{
- {
- Name: "init-workspace",
- Image: "registry.access.redhat.com/ubi8/ubi-minimal:latest",
- Command: []string{
- "sh", "-c",
- fmt.Sprintf("mkdir -p /workspace/sessions/%s/workspace && chmod 777 /workspace/sessions/%s/workspace && echo 'Workspace initialized'", name, name),
- },
- VolumeMounts: []corev1.VolumeMount{
- {Name: "workspace", MountPath: "/workspace"},
+ // Flip roles so the content writer is the main container that keeps the pod alive
+ Containers: []corev1.Container{
+ {
+ Name: "ambient-content",
+ Image: appConfig.ContentServiceImage,
+ ImagePullPolicy: appConfig.ImagePullPolicy,
+ Env: []corev1.EnvVar{
+ {Name: "CONTENT_SERVICE_MODE", Value: "true"},
+ {Name: "STATE_BASE_DIR", Value: "/workspace"},
+ },
+ Ports: []corev1.ContainerPort{{ContainerPort: 8080, Name: "http"}},
+ ReadinessProbe: &corev1.Probe{
+ ProbeHandler: corev1.ProbeHandler{
+ HTTPGet: &corev1.HTTPGetAction{
+ Path: "/health",
+ Port: intstr.FromString("http"),
},
},
+ InitialDelaySeconds: 5,
+ PeriodSeconds: 5,
},
-
- // Flip roles so the content writer is the main container that keeps the pod alive
- Containers: []corev1.Container{
- {
- Name: "ambient-content",
- Image: appConfig.ContentServiceImage,
- ImagePullPolicy: appConfig.ImagePullPolicy,
- Env: []corev1.EnvVar{
- {Name: "CONTENT_SERVICE_MODE", Value: "true"},
- {Name: "STATE_BASE_DIR", Value: "/workspace"},
- },
- Ports: []corev1.ContainerPort{{ContainerPort: 8080, Name: "http"}},
- ReadinessProbe: &corev1.Probe{
- ProbeHandler: corev1.ProbeHandler{
- HTTPGet: &corev1.HTTPGetAction{
- Path: "/health",
- Port: intstr.FromString("http"),
- },
- },
- InitialDelaySeconds: 5,
- PeriodSeconds: 5,
- },
- VolumeMounts: []corev1.VolumeMount{{Name: "workspace", MountPath: "/workspace"}},
+ VolumeMounts: []corev1.VolumeMount{{Name: "workspace", MountPath: "/workspace"}},
+ },
+ {
+ Name: "ambient-code-runner",
+ Image: appConfig.AmbientCodeRunnerImage,
+ ImagePullPolicy: appConfig.ImagePullPolicy,
+ // 🔒 Container-level security (SCC-compatible, no privileged capabilities)
+ SecurityContext: &corev1.SecurityContext{
+ AllowPrivilegeEscalation: boolPtr(false),
+ ReadOnlyRootFilesystem: boolPtr(false), // Playwright needs to write temp files
+ Capabilities: &corev1.Capabilities{
+ Drop: []corev1.Capability{"ALL"}, // Drop all capabilities for security
},
- {
- Name: "ambient-code-runner",
- Image: appConfig.AmbientCodeRunnerImage,
- ImagePullPolicy: appConfig.ImagePullPolicy,
- // 🔒 Container-level security (SCC-compatible, no privileged capabilities)
- SecurityContext: &corev1.SecurityContext{
- AllowPrivilegeEscalation: boolPtr(false),
- ReadOnlyRootFilesystem: boolPtr(false), // Playwright needs to write temp files
- Capabilities: &corev1.Capabilities{
- Drop: []corev1.Capability{"ALL"}, // Drop all capabilities for security
- },
- },
-
- // Expose AG-UI server port for backend proxy
- Ports: []corev1.ContainerPort{{
- Name: "agui",
- ContainerPort: 8001,
- Protocol: corev1.ProtocolTCP,
- }},
-
- VolumeMounts: []corev1.VolumeMount{
- {Name: "workspace", MountPath: "/workspace", ReadOnly: false},
- // Mount .claude directory for session state persistence
- // This enables SDK's built-in resume functionality
- {Name: "workspace", MountPath: "/app/.claude", SubPath: fmt.Sprintf("sessions/%s/.claude", name), ReadOnly: false},
- },
-
- Env: func() []corev1.EnvVar {
- base := []corev1.EnvVar{
- {Name: "DEBUG", Value: "true"},
- {Name: "INTERACTIVE", Value: fmt.Sprintf("%t", interactive)},
- {Name: "AGENTIC_SESSION_NAME", Value: name},
- {Name: "AGENTIC_SESSION_NAMESPACE", Value: sessionNamespace},
- // Provide session id and workspace path for the runner wrapper
- {Name: "SESSION_ID", Value: name},
- {Name: "WORKSPACE_PATH", Value: fmt.Sprintf("/workspace/sessions/%s/workspace", name)},
- {Name: "ARTIFACTS_DIR", Value: "_artifacts"},
- // Google MCP credentials directory for workspace-mcp server (writable workspace location)
- {Name: "GOOGLE_MCP_CREDENTIALS_DIR", Value: "/workspace/.google_workspace_mcp/credentials"},
- // Google OAuth client credentials for workspace-mcp
- {Name: "GOOGLE_OAUTH_CLIENT_ID", Value: os.Getenv("GOOGLE_OAUTH_CLIENT_ID")},
- {Name: "GOOGLE_OAUTH_CLIENT_SECRET", Value: os.Getenv("GOOGLE_OAUTH_CLIENT_SECRET")},
- }
+ },
- // Add user context for observability and auditing (Langfuse userId, logs, etc.)
- if userID != "" {
- base = append(base, corev1.EnvVar{Name: "USER_ID", Value: userID})
- }
- if userName != "" {
- base = append(base, corev1.EnvVar{Name: "USER_NAME", Value: userName})
- }
+ // Expose AG-UI server port for backend proxy
+ Ports: []corev1.ContainerPort{{
+ Name: "agui",
+ ContainerPort: 8001,
+ Protocol: corev1.ProtocolTCP,
+ }},
- // Add per-repo environment variables (simplified format)
- for i, repo := range repos {
- base = append(base,
- corev1.EnvVar{Name: fmt.Sprintf("REPO_%d_URL", i), Value: repo.URL},
- corev1.EnvVar{Name: fmt.Sprintf("REPO_%d_BRANCH", i), Value: repo.Branch},
- )
- }
+ VolumeMounts: []corev1.VolumeMount{
+ {Name: "workspace", MountPath: "/workspace", ReadOnly: false},
+ // Mount .claude directory for session state persistence (synced to S3)
+ // This enables SDK's built-in resume functionality
+ {Name: "workspace", MountPath: "/app/.claude", SubPath: ".claude", ReadOnly: false},
+ },
- // Backward compatibility: set INPUT_REPO_URL/OUTPUT_REPO_URL from main repo
- base = append(base,
- corev1.EnvVar{Name: "INPUT_REPO_URL", Value: inputRepo},
- corev1.EnvVar{Name: "INPUT_BRANCH", Value: inputBranch},
- corev1.EnvVar{Name: "OUTPUT_REPO_URL", Value: outputRepo},
- corev1.EnvVar{Name: "OUTPUT_BRANCH", Value: outputBranch},
- corev1.EnvVar{Name: "INITIAL_PROMPT", Value: prompt},
- corev1.EnvVar{Name: "LLM_MODEL", Value: model},
- corev1.EnvVar{Name: "LLM_TEMPERATURE", Value: fmt.Sprintf("%.2f", temperature)},
- corev1.EnvVar{Name: "LLM_MAX_TOKENS", Value: fmt.Sprintf("%d", maxTokens)},
- corev1.EnvVar{Name: "USE_AGUI", Value: "true"},
- corev1.EnvVar{Name: "TIMEOUT", Value: fmt.Sprintf("%d", timeout)},
- corev1.EnvVar{Name: "AUTO_PUSH_ON_COMPLETE", Value: fmt.Sprintf("%t", autoPushOnComplete)},
- corev1.EnvVar{Name: "BACKEND_API_URL", Value: fmt.Sprintf("http://backend-service.%s.svc.cluster.local:8080/api", appConfig.BackendNamespace)},
- // LEGACY: WEBSOCKET_URL removed - runner now uses AG-UI server pattern (FastAPI)
- // Backend proxies to runner's HTTP endpoint instead of WebSocket
- )
-
- // Platform-wide Langfuse observability configuration
- // Uses secretKeyRef to prevent credential exposure in pod specs
- // Secret is copied to session namespace from operator namespace
- // All keys are optional to prevent pod startup failures if keys are missing
- if ambientLangfuseSecretCopied {
- base = append(base,
- corev1.EnvVar{
- Name: "LANGFUSE_ENABLED",
- ValueFrom: &corev1.EnvVarSource{
- SecretKeyRef: &corev1.SecretKeySelector{
- LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
- Key: "LANGFUSE_ENABLED",
- Optional: boolPtr(true),
- },
- },
+ Env: func() []corev1.EnvVar {
+ base := []corev1.EnvVar{
+ {Name: "DEBUG", Value: "true"},
+ {Name: "INTERACTIVE", Value: fmt.Sprintf("%t", interactive)},
+ {Name: "AGENTIC_SESSION_NAME", Value: name},
+ {Name: "AGENTIC_SESSION_NAMESPACE", Value: sessionNamespace},
+ // Provide session id and workspace path for the runner wrapper
+ {Name: "SESSION_ID", Value: name},
+ {Name: "WORKSPACE_PATH", Value: "/workspace"},
+ {Name: "ARTIFACTS_DIR", Value: "artifacts"},
+ // Google MCP credentials directory for workspace-mcp server (writable workspace location)
+ {Name: "GOOGLE_MCP_CREDENTIALS_DIR", Value: "/workspace/.google_workspace_mcp/credentials"},
+ // Google OAuth client credentials for workspace-mcp
+ {Name: "GOOGLE_OAUTH_CLIENT_ID", Value: os.Getenv("GOOGLE_OAUTH_CLIENT_ID")},
+ {Name: "GOOGLE_OAUTH_CLIENT_SECRET", Value: os.Getenv("GOOGLE_OAUTH_CLIENT_SECRET")},
+ }
+
+ // Add user context for observability and auditing (Langfuse userId, logs, etc.)
+ if userID != "" {
+ base = append(base, corev1.EnvVar{Name: "USER_ID", Value: userID})
+ }
+ if userName != "" {
+ base = append(base, corev1.EnvVar{Name: "USER_NAME", Value: userName})
+ }
+
+ // Core session env vars
+ base = append(base,
+ corev1.EnvVar{Name: "INITIAL_PROMPT", Value: prompt},
+ corev1.EnvVar{Name: "LLM_MODEL", Value: model},
+ corev1.EnvVar{Name: "LLM_TEMPERATURE", Value: fmt.Sprintf("%.2f", temperature)},
+ corev1.EnvVar{Name: "LLM_MAX_TOKENS", Value: fmt.Sprintf("%d", maxTokens)},
+ corev1.EnvVar{Name: "USE_AGUI", Value: "true"},
+ corev1.EnvVar{Name: "TIMEOUT", Value: fmt.Sprintf("%d", timeout)},
+ corev1.EnvVar{Name: "AUTO_PUSH_ON_COMPLETE", Value: fmt.Sprintf("%t", autoPushOnComplete)},
+ corev1.EnvVar{Name: "BACKEND_API_URL", Value: fmt.Sprintf("http://backend-service.%s.svc.cluster.local:8080/api", appConfig.BackendNamespace)},
+ // LEGACY: WEBSOCKET_URL removed - runner now uses AG-UI server pattern (FastAPI)
+ // Backend proxies to runner's HTTP endpoint instead of WebSocket
+ )
+
+ // Platform-wide Langfuse observability configuration
+ // Uses secretKeyRef to prevent credential exposure in pod specs
+ // Secret is copied to session namespace from operator namespace
+ // All keys are optional to prevent pod startup failures if keys are missing
+ if ambientLangfuseSecretCopied {
+ base = append(base,
+ corev1.EnvVar{
+ Name: "LANGFUSE_ENABLED",
+ ValueFrom: &corev1.EnvVarSource{
+ SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
+ Key: "LANGFUSE_ENABLED",
+ Optional: boolPtr(true),
},
- corev1.EnvVar{
- Name: "LANGFUSE_HOST",
- ValueFrom: &corev1.EnvVarSource{
- SecretKeyRef: &corev1.SecretKeySelector{
- LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
- Key: "LANGFUSE_HOST",
- Optional: boolPtr(true),
- },
- },
+ },
+ },
+ corev1.EnvVar{
+ Name: "LANGFUSE_HOST",
+ ValueFrom: &corev1.EnvVarSource{
+ SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
+ Key: "LANGFUSE_HOST",
+ Optional: boolPtr(true),
},
- corev1.EnvVar{
- Name: "LANGFUSE_PUBLIC_KEY",
- ValueFrom: &corev1.EnvVarSource{
- SecretKeyRef: &corev1.SecretKeySelector{
- LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
- Key: "LANGFUSE_PUBLIC_KEY",
- Optional: boolPtr(true),
- },
- },
+ },
+ },
+ corev1.EnvVar{
+ Name: "LANGFUSE_PUBLIC_KEY",
+ ValueFrom: &corev1.EnvVarSource{
+ SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
+ Key: "LANGFUSE_PUBLIC_KEY",
+ Optional: boolPtr(true),
},
- corev1.EnvVar{
- Name: "LANGFUSE_SECRET_KEY",
- ValueFrom: &corev1.EnvVarSource{
- SecretKeyRef: &corev1.SecretKeySelector{
- LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
- Key: "LANGFUSE_SECRET_KEY",
- Optional: boolPtr(true),
- },
- },
+ },
+ },
+ corev1.EnvVar{
+ Name: "LANGFUSE_SECRET_KEY",
+ ValueFrom: &corev1.EnvVarSource{
+ SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: "ambient-admin-langfuse-secret"},
+ Key: "LANGFUSE_SECRET_KEY",
+ Optional: boolPtr(true),
},
- )
- log.Printf("Langfuse env vars configured via secretKeyRef for session %s", name)
+ },
+ },
+ )
+ log.Printf("Langfuse env vars configured via secretKeyRef for session %s", name)
+ }
+
+ // Add Vertex AI configuration only if enabled
+ if vertexEnabled {
+ base = append(base,
+ corev1.EnvVar{Name: "CLAUDE_CODE_USE_VERTEX", Value: "1"},
+ corev1.EnvVar{Name: "CLOUD_ML_REGION", Value: os.Getenv("CLOUD_ML_REGION")},
+ corev1.EnvVar{Name: "ANTHROPIC_VERTEX_PROJECT_ID", Value: os.Getenv("ANTHROPIC_VERTEX_PROJECT_ID")},
+ corev1.EnvVar{Name: "GOOGLE_APPLICATION_CREDENTIALS", Value: os.Getenv("GOOGLE_APPLICATION_CREDENTIALS")},
+ )
+ } else {
+ // Explicitly set to 0 when Vertex is disabled
+ base = append(base, corev1.EnvVar{Name: "CLAUDE_CODE_USE_VERTEX", Value: "0"})
+ }
+
+ // Add PARENT_SESSION_ID if this is a continuation
+ if parentSessionID != "" {
+ base = append(base, corev1.EnvVar{Name: "PARENT_SESSION_ID", Value: parentSessionID})
+ log.Printf("Session %s: passing PARENT_SESSION_ID=%s to runner", name, parentSessionID)
+ }
+
+ // Add IS_RESUME if this session has been started before
+ // Check status.startTime - if present, this is a resume (pod recreate/restart)
+ // This tells the runner to skip INITIAL_PROMPT and use continue_conversation
+ if status, found, _ := unstructured.NestedMap(currentObj.Object, "status"); found {
+ if startTime, ok := status["startTime"].(string); ok && startTime != "" {
+ base = append(base, corev1.EnvVar{Name: "IS_RESUME", Value: "true"})
+ log.Printf("Session %s: marking as resume (IS_RESUME=true, startTime=%s)", name, startTime)
+ }
+ }
+
+ // If backend annotated the session with a runner token secret, inject only BOT_TOKEN
+ // Secret contains: 'k8s-token' (for CR updates)
+ // Prefer annotated secret name; fallback to deterministic name
+ secretName := ""
+ if meta, ok := currentObj.Object["metadata"].(map[string]interface{}); ok {
+ if anns, ok := meta["annotations"].(map[string]interface{}); ok {
+ if v, ok := anns["ambient-code.io/runner-token-secret"].(string); ok && strings.TrimSpace(v) != "" {
+ secretName = strings.TrimSpace(v)
}
-
- // Add Vertex AI configuration only if enabled
- if vertexEnabled {
- base = append(base,
- corev1.EnvVar{Name: "CLAUDE_CODE_USE_VERTEX", Value: "1"},
- corev1.EnvVar{Name: "CLOUD_ML_REGION", Value: os.Getenv("CLOUD_ML_REGION")},
- corev1.EnvVar{Name: "ANTHROPIC_VERTEX_PROJECT_ID", Value: os.Getenv("ANTHROPIC_VERTEX_PROJECT_ID")},
- corev1.EnvVar{Name: "GOOGLE_APPLICATION_CREDENTIALS", Value: os.Getenv("GOOGLE_APPLICATION_CREDENTIALS")},
- )
- } else {
- // Explicitly set to 0 when Vertex is disabled
- base = append(base, corev1.EnvVar{Name: "CLAUDE_CODE_USE_VERTEX", Value: "0"})
+ }
+ }
+ if secretName == "" {
+ secretName = fmt.Sprintf("ambient-runner-token-%s", name)
+ }
+ base = append(base, corev1.EnvVar{
+ Name: "BOT_TOKEN",
+ ValueFrom: &corev1.EnvVarSource{SecretKeyRef: &corev1.SecretKeySelector{
+ LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
+ Key: "k8s-token",
+ }},
+ })
+ // Add CR-provided envs last (override base when same key)
+ if spec, ok := currentObj.Object["spec"].(map[string]interface{}); ok {
+ // Inject REPOS_JSON and MAIN_REPO_NAME from spec.repos and spec.mainRepoName if present
+ if repos, ok := spec["repos"].([]interface{}); ok && len(repos) > 0 {
+ // Use a minimal JSON serialization via fmt (we'll rely on client to pass REPOS_JSON too)
+ // This ensures runner gets repos even if env vars weren't passed from frontend
+ b, _ := json.Marshal(repos)
+ base = append(base, corev1.EnvVar{Name: "REPOS_JSON", Value: string(b)})
+ }
+ if mrn, ok := spec["mainRepoName"].(string); ok && strings.TrimSpace(mrn) != "" {
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_NAME", Value: mrn})
+ }
+ // Inject MAIN_REPO_INDEX if provided
+ if mriRaw, ok := spec["mainRepoIndex"]; ok {
+ switch v := mriRaw.(type) {
+ case int64:
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
+ case int32:
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
+ case int:
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
+ case float64:
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", int64(v))})
+ case string:
+ if strings.TrimSpace(v) != "" {
+ base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: v})
+ }
}
-
- // Add PARENT_SESSION_ID if this is a continuation
- if parentSessionID != "" {
- base = append(base, corev1.EnvVar{Name: "PARENT_SESSION_ID", Value: parentSessionID})
- log.Printf("Session %s: passing PARENT_SESSION_ID=%s to runner", name, parentSessionID)
+ }
+ // Inject activeWorkflow environment variables if present
+ if workflow, ok := spec["activeWorkflow"].(map[string]interface{}); ok {
+ if gitURL, ok := workflow["gitUrl"].(string); ok && strings.TrimSpace(gitURL) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_GIT_URL", Value: gitURL})
}
- // If backend annotated the session with a runner token secret, inject only BOT_TOKEN
- // Secret contains: 'k8s-token' (for CR updates)
- // Prefer annotated secret name; fallback to deterministic name
- secretName := ""
- if meta, ok := currentObj.Object["metadata"].(map[string]interface{}); ok {
- if anns, ok := meta["annotations"].(map[string]interface{}); ok {
- if v, ok := anns["ambient-code.io/runner-token-secret"].(string); ok && strings.TrimSpace(v) != "" {
- secretName = strings.TrimSpace(v)
- }
- }
+ if branch, ok := workflow["branch"].(string); ok && strings.TrimSpace(branch) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_BRANCH", Value: branch})
}
- if secretName == "" {
- secretName = fmt.Sprintf("ambient-runner-token-%s", name)
+ if path, ok := workflow["path"].(string); ok && strings.TrimSpace(path) != "" {
+ base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_PATH", Value: path})
}
- base = append(base, corev1.EnvVar{
- Name: "BOT_TOKEN",
- ValueFrom: &corev1.EnvVarSource{SecretKeyRef: &corev1.SecretKeySelector{
- LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
- Key: "k8s-token",
- }},
- })
- // Add CR-provided envs last (override base when same key)
- if spec, ok := currentObj.Object["spec"].(map[string]interface{}); ok {
- // Inject REPOS_JSON and MAIN_REPO_NAME from spec.repos and spec.mainRepoName if present
- if repos, ok := spec["repos"].([]interface{}); ok && len(repos) > 0 {
- // Use a minimal JSON serialization via fmt (we'll rely on client to pass REPOS_JSON too)
- // This ensures runner gets repos even if env vars weren't passed from frontend
- b, _ := json.Marshal(repos)
- base = append(base, corev1.EnvVar{Name: "REPOS_JSON", Value: string(b)})
- }
- if mrn, ok := spec["mainRepoName"].(string); ok && strings.TrimSpace(mrn) != "" {
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_NAME", Value: mrn})
- }
- // Inject MAIN_REPO_INDEX if provided
- if mriRaw, ok := spec["mainRepoIndex"]; ok {
- switch v := mriRaw.(type) {
- case int64:
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
- case int32:
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
- case int:
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", v)})
- case float64:
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: fmt.Sprintf("%d", int64(v))})
- case string:
- if strings.TrimSpace(v) != "" {
- base = append(base, corev1.EnvVar{Name: "MAIN_REPO_INDEX", Value: v})
+ }
+ if envMap, ok := spec["environmentVariables"].(map[string]interface{}); ok {
+ for k, v := range envMap {
+ if vs, ok := v.(string); ok {
+ // replace if exists
+ replaced := false
+ for i := range base {
+ if base[i].Name == k {
+ base[i].Value = vs
+ replaced = true
+ break
}
}
- }
- // Inject activeWorkflow environment variables if present
- if workflow, ok := spec["activeWorkflow"].(map[string]interface{}); ok {
- if gitURL, ok := workflow["gitUrl"].(string); ok && strings.TrimSpace(gitURL) != "" {
- base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_GIT_URL", Value: gitURL})
- }
- if branch, ok := workflow["branch"].(string); ok && strings.TrimSpace(branch) != "" {
- base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_BRANCH", Value: branch})
- }
- if path, ok := workflow["path"].(string); ok && strings.TrimSpace(path) != "" {
- base = append(base, corev1.EnvVar{Name: "ACTIVE_WORKFLOW_PATH", Value: path})
- }
- }
- if envMap, ok := spec["environmentVariables"].(map[string]interface{}); ok {
- for k, v := range envMap {
- if vs, ok := v.(string); ok {
- // replace if exists
- replaced := false
- for i := range base {
- if base[i].Name == k {
- base[i].Value = vs
- replaced = true
- break
- }
- }
- if !replaced {
- base = append(base, corev1.EnvVar{Name: k, Value: vs})
- }
- }
+ if !replaced {
+ base = append(base, corev1.EnvVar{Name: k, Value: vs})
}
}
}
+ }
+ }
+
+ return base
+ }(),
+
+ // Import secrets as environment variables
+ // - integrationSecretsName: Only if exists (GIT_TOKEN, JIRA_*, custom keys)
+ // - runnerSecretsName: Only when Vertex disabled (ANTHROPIC_API_KEY)
+ // - ambient-langfuse-keys: Platform-wide Langfuse observability (LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST, LANGFUSE_ENABLED)
+ EnvFrom: func() []corev1.EnvFromSource {
+ sources := []corev1.EnvFromSource{}
+
+ // Only inject integration secrets if they exist (optional)
+ if integrationSecretsExist {
+ sources = append(sources, corev1.EnvFromSource{
+ SecretRef: &corev1.SecretEnvSource{
+ LocalObjectReference: corev1.LocalObjectReference{Name: integrationSecretsName},
+ },
+ })
+ log.Printf("Injecting integration secrets from '%s' for session %s", integrationSecretsName, name)
+ } else {
+ log.Printf("Skipping integration secrets '%s' for session %s (not found or not configured)", integrationSecretsName, name)
+ }
+
+ // Only inject runner secrets (ANTHROPIC_API_KEY) when Vertex is disabled
+ if !vertexEnabled && runnerSecretsName != "" {
+ sources = append(sources, corev1.EnvFromSource{
+ SecretRef: &corev1.SecretEnvSource{
+ LocalObjectReference: corev1.LocalObjectReference{Name: runnerSecretsName},
+ },
+ })
+ log.Printf("Injecting runner secrets from '%s' for session %s (Vertex disabled)", runnerSecretsName, name)
+ } else if vertexEnabled && runnerSecretsName != "" {
+ log.Printf("Skipping runner secrets '%s' for session %s (Vertex enabled)", runnerSecretsName, name)
+ }
- return base
- }(),
-
- // Import secrets as environment variables
- // - integrationSecretsName: Only if exists (GIT_TOKEN, JIRA_*, custom keys)
- // - runnerSecretsName: Only when Vertex disabled (ANTHROPIC_API_KEY)
- // - ambient-langfuse-keys: Platform-wide Langfuse observability (LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST, LANGFUSE_ENABLED)
- EnvFrom: func() []corev1.EnvFromSource {
- sources := []corev1.EnvFromSource{}
-
- // Only inject integration secrets if they exist (optional)
- if integrationSecretsExist {
- sources = append(sources, corev1.EnvFromSource{
- SecretRef: &corev1.SecretEnvSource{
- LocalObjectReference: corev1.LocalObjectReference{Name: integrationSecretsName},
- },
- })
- log.Printf("Injecting integration secrets from '%s' for session %s", integrationSecretsName, name)
- } else {
- log.Printf("Skipping integration secrets '%s' for session %s (not found or not configured)", integrationSecretsName, name)
- }
-
- // Only inject runner secrets (ANTHROPIC_API_KEY) when Vertex is disabled
- if !vertexEnabled && runnerSecretsName != "" {
- sources = append(sources, corev1.EnvFromSource{
- SecretRef: &corev1.SecretEnvSource{
- LocalObjectReference: corev1.LocalObjectReference{Name: runnerSecretsName},
- },
- })
- log.Printf("Injecting runner secrets from '%s' for session %s (Vertex disabled)", runnerSecretsName, name)
- } else if vertexEnabled && runnerSecretsName != "" {
- log.Printf("Skipping runner secrets '%s' for session %s (Vertex enabled)", runnerSecretsName, name)
- }
-
- return sources
- }(),
+ return sources
+ }(),
- Resources: corev1.ResourceRequirements{},
+ Resources: corev1.ResourceRequirements{},
+ },
+ // S3 state-sync sidecar - syncs .claude/, artifacts/, file-uploads/ to S3
+ {
+ Name: "state-sync",
+ Image: appConfig.StateSyncImage,
+ ImagePullPolicy: appConfig.ImagePullPolicy,
+ Command: []string{"/usr/local/bin/sync.sh"},
+ SecurityContext: &corev1.SecurityContext{
+ AllowPrivilegeEscalation: boolPtr(false),
+ ReadOnlyRootFilesystem: boolPtr(false),
+ Capabilities: &corev1.Capabilities{
+ Drop: []corev1.Capability{"ALL"},
+ },
+ },
+ Env: []corev1.EnvVar{
+ {Name: "SESSION_NAME", Value: name},
+ {Name: "NAMESPACE", Value: sessionNamespace},
+ {Name: "S3_ENDPOINT", Value: s3Endpoint},
+ {Name: "S3_BUCKET", Value: s3Bucket},
+ {Name: "SYNC_INTERVAL", Value: "60"},
+ {Name: "MAX_SYNC_SIZE", Value: "1073741824"}, // 1GB
+ {Name: "AWS_ACCESS_KEY_ID", Value: s3AccessKey},
+ {Name: "AWS_SECRET_ACCESS_KEY", Value: s3SecretKey},
+ },
+ VolumeMounts: []corev1.VolumeMount{
+ {Name: "workspace", MountPath: "/workspace", ReadOnly: false},
+ // SubPath mount for .claude so sync sidecar reads from same location as runner
+ {Name: "workspace", MountPath: "/app/.claude", SubPath: ".claude", ReadOnly: false},
+ },
+ Resources: corev1.ResourceRequirements{
+ Requests: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("50m"),
+ corev1.ResourceMemory: resource.MustParse("64Mi"),
+ },
+ Limits: corev1.ResourceList{
+ corev1.ResourceCPU: resource.MustParse("200m"),
+ corev1.ResourceMemory: resource.MustParse("256Mi"),
},
},
},
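The spec.environmentVariables merge in the hunk above follows a replace-or-append rule: a CR-supplied variable overrides a base variable of the same name and is appended otherwise. A standalone sketch of that rule, extracted into a hypothetical mergeEnv helper (the operator inlines this logic; the helper and main function below are illustrative only):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// mergeEnv applies CR-provided environment variables on top of the base set:
// entries with the same name are overridden in place, new names are appended.
// Illustrative helper only; the operator inlines this when building the pod spec.
func mergeEnv(base []corev1.EnvVar, overrides map[string]string) []corev1.EnvVar {
	for k, v := range overrides {
		replaced := false
		for i := range base {
			if base[i].Name == k {
				base[i].Value = v
				replaced = true
				break
			}
		}
		if !replaced {
			base = append(base, corev1.EnvVar{Name: k, Value: v})
		}
	}
	return base
}

func main() {
	base := []corev1.EnvVar{{Name: "S3_BUCKET", Value: "ambient-sessions"}}
	merged := mergeEnv(base, map[string]string{"S3_BUCKET": "custom-bucket", "DEBUG": "1"})
	fmt.Println(merged)
}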
@@ -1358,14 +1178,14 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
// If ambient-vertex secret was successfully copied, mount it as a volume
if ambientVertexSecretCopied {
- job.Spec.Template.Spec.Volumes = append(job.Spec.Template.Spec.Volumes, corev1.Volume{
+ pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: "vertex",
VolumeSource: corev1.VolumeSource{Secret: &corev1.SecretVolumeSource{SecretName: types.AmbientVertexSecretName}},
})
// Mount to the ambient-code-runner container by name
- for i := range job.Spec.Template.Spec.Containers {
- if job.Spec.Template.Spec.Containers[i].Name == "ambient-code-runner" {
- job.Spec.Template.Spec.Containers[i].VolumeMounts = append(job.Spec.Template.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{
+ for i := range pod.Spec.Containers {
+ if pod.Spec.Containers[i].Name == "ambient-code-runner" {
+ pod.Spec.Containers[i].VolumeMounts = append(pod.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{
Name: "vertex",
MountPath: "/app/vertex",
ReadOnly: true,
@@ -1393,7 +1213,7 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
},
OwnerReferences: []v1.OwnerReference{
{
- APIVersion: "vteam.ambient-code/v1",
+ APIVersion: "vteam.ambient-code/v1alpha1",
Kind: "AgenticSession",
Name: currentObj.GetName(),
UID: currentObj.GetUID(),
@@ -1419,7 +1239,7 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
// Always mount Google OAuth secret (with Optional: true so pod starts even if empty)
// K8s will sync updates when backend populates credentials after OAuth completion (~60s)
- job.Spec.Template.Spec.Volumes = append(job.Spec.Template.Spec.Volumes, corev1.Volume{
+ pod.Spec.Volumes = append(pod.Spec.Volumes, corev1.Volume{
Name: "google-oauth",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
@@ -1429,9 +1249,9 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
},
})
// Mount to the ambient-code-runner container
- for i := range job.Spec.Template.Spec.Containers {
- if job.Spec.Template.Spec.Containers[i].Name == "ambient-code-runner" {
- job.Spec.Template.Spec.Containers[i].VolumeMounts = append(job.Spec.Template.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{
+ for i := range pod.Spec.Containers {
+ if pod.Spec.Containers[i].Name == "ambient-code-runner" {
+ pod.Spec.Containers[i].VolumeMounts = append(pod.Spec.Containers[i].VolumeMounts, corev1.VolumeMount{
Name: "google-oauth",
MountPath: "/app/.google_workspace_mcp/credentials",
ReadOnly: true,
@@ -1443,19 +1263,19 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
// Do not mount runner Secret volume; runner fetches tokens on demand
- // Create the job
- createdJob, err := config.K8sClient.BatchV1().Jobs(sessionNamespace).Create(context.TODO(), job, v1.CreateOptions{})
+ // Create the pod
+ createdPod, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Create(context.TODO(), pod, v1.CreateOptions{})
if err != nil {
- // If job already exists, this is likely a race condition from duplicate watch events - not an error
+ // If pod already exists, this is likely a race condition from duplicate watch events - not an error
if errors.IsAlreadyExists(err) {
- log.Printf("Job %s already exists (race condition), continuing", jobName)
- // Clear desired-phase annotation since job exists
+ log.Printf("Pod %s already exists (race condition), continuing", podName)
+ // Clear desired-phase annotation since pod exists
_ = clearAnnotation(sessionNamespace, name, "ambient-code.io/desired-phase")
return nil
}
- log.Printf("Failed to create job %s: %v", jobName, err)
+ log.Printf("Failed to create pod %s: %v", podName, err)
statusPatch.AddCondition(conditionUpdate{
- Type: conditionJobCreated,
+ Type: conditionPodCreated,
Status: "False",
Reason: "CreateFailed",
Message: err.Error(),
@@ -1463,54 +1283,54 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
statusPatch.AddCondition(conditionUpdate{
Type: conditionReady,
Status: "False",
- Reason: "JobCreationFailed",
- Message: "Runner job creation failed",
+ Reason: "PodCreationFailed",
+ Message: "Runner pod creation failed",
})
_ = statusPatch.Apply()
- return fmt.Errorf("failed to create job: %v", err)
+ return fmt.Errorf("failed to create pod: %v", err)
}
- log.Printf("Created job %s for AgenticSession %s", jobName, name)
+ log.Printf("Created pod %s for AgenticSession %s", podName, name)
statusPatch.SetField("phase", "Creating")
statusPatch.SetField("observedGeneration", currentObj.GetGeneration())
statusPatch.AddCondition(conditionUpdate{
- Type: conditionJobCreated,
+ Type: conditionPodCreated,
Status: "True",
- Reason: "JobCreated",
- Message: "Runner job created",
+ Reason: "PodCreated",
+ Message: "Runner pod created",
})
// Apply all accumulated status changes in a single API call
if err := statusPatch.Apply(); err != nil {
log.Printf("Warning: failed to apply status patch: %v", err)
}
- // Clear desired-phase annotation now that job is created
+ // Clear desired-phase annotation now that pod is created
// (This was deferred from the restart handler to avoid race conditions with stale events)
_ = clearAnnotation(sessionNamespace, name, "ambient-code.io/desired-phase")
- log.Printf("[DesiredPhase] Cleared desired-phase annotation after successful job creation")
+ log.Printf("[DesiredPhase] Cleared desired-phase annotation after successful pod creation")
- // Create a per-job Service pointing to the content container
+ // Create a per-pod Service pointing to the content container
svc := &corev1.Service{
ObjectMeta: v1.ObjectMeta{
Name: fmt.Sprintf("ambient-content-%s", name),
Namespace: sessionNamespace,
Labels: map[string]string{"app": "ambient-code-runner", "agentic-session": name},
OwnerReferences: []v1.OwnerReference{{
- APIVersion: "batch/v1",
- Kind: "Job",
- Name: jobName,
- UID: createdJob.UID,
+ APIVersion: "v1",
+ Kind: "Pod",
+ Name: podName,
+ UID: createdPod.UID,
Controller: boolPtr(true),
}},
},
Spec: corev1.ServiceSpec{
- Selector: map[string]string{"job-name": jobName},
+ Selector: map[string]string{"agentic-session": name, "app": "ambient-code-runner"},
Ports: []corev1.ServicePort{{Port: 8080, TargetPort: intstr.FromString("http"), Protocol: corev1.ProtocolTCP, Name: "http"}},
Type: corev1.ServiceTypeClusterIP,
},
}
if _, serr := config.K8sClient.CoreV1().Services(sessionNamespace).Create(context.TODO(), svc, v1.CreateOptions{}); serr != nil && !errors.IsAlreadyExists(serr) {
- log.Printf("Failed to create per-job content service for %s: %v", name, serr)
+ log.Printf("Failed to create per-pod content service for %s: %v", name, serr)
}
// Create AG-UI Service pointing to the runner's FastAPI server
@@ -1524,16 +1344,16 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
"agentic-session": name,
},
OwnerReferences: []v1.OwnerReference{{
- APIVersion: "batch/v1",
- Kind: "Job",
- Name: jobName,
- UID: createdJob.UID,
+ APIVersion: "v1",
+ Kind: "Pod",
+ Name: podName,
+ UID: createdPod.UID,
Controller: boolPtr(true),
}},
},
Spec: corev1.ServiceSpec{
Type: corev1.ServiceTypeClusterIP,
- Selector: map[string]string{"job-name": jobName},
+ Selector: map[string]string{"agentic-session": name, "app": "ambient-code-runner"},
Ports: []corev1.ServicePort{{
Name: "agui",
Protocol: corev1.ProtocolTCP,
@@ -1548,17 +1368,17 @@ func handleAgenticSessionEvent(obj *unstructured.Unstructured) error {
log.Printf("Created AG-UI service session-%s for AgenticSession %s", name, name)
}
- // Start monitoring the job (only if not already being monitored)
- monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, jobName)
- monitoredJobsMu.Lock()
- alreadyMonitoring := monitoredJobs[monitorKey]
+ // Start monitoring the pod (only if not already being monitored)
+ monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, podName)
+ monitoredPodsMu.Lock()
+ alreadyMonitoring := monitoredPods[monitorKey]
if !alreadyMonitoring {
- monitoredJobs[monitorKey] = true
- monitoredJobsMu.Unlock()
- go monitorJob(jobName, name, sessionNamespace)
+ monitoredPods[monitorKey] = true
+ monitoredPodsMu.Unlock()
+ go monitorPod(podName, name, sessionNamespace)
} else {
- monitoredJobsMu.Unlock()
- log.Printf("Job %s already being monitored, skipping duplicate goroutine", jobName)
+ monitoredPodsMu.Unlock()
+ log.Printf("Pod %s already being monitored, skipping duplicate goroutine", podName)
}
return nil
@@ -1834,18 +1654,18 @@ func reconcileActiveWorkflowWithPatch(sessionNamespace, sessionName string, spec
return nil
}
-func monitorJob(jobName, sessionName, sessionNamespace string) {
- monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, jobName)
+func monitorPod(podName, sessionName, sessionNamespace string) {
+ monitorKey := fmt.Sprintf("%s/%s", sessionNamespace, podName)
// Remove from monitoring map when this goroutine exits
defer func() {
- monitoredJobsMu.Lock()
- delete(monitoredJobs, monitorKey)
- monitoredJobsMu.Unlock()
- log.Printf("Stopped monitoring job %s (goroutine exiting)", jobName)
+ monitoredPodsMu.Lock()
+ delete(monitoredPods, monitorKey)
+ monitoredPodsMu.Unlock()
+ log.Printf("Stopped monitoring pod %s (goroutine exiting)", podName)
}()
- log.Printf("Starting job monitoring for %s (session: %s/%s)", jobName, sessionNamespace, sessionName)
+ log.Printf("Starting pod monitoring for %s (session: %s/%s)", podName, sessionNamespace, sessionName)
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
@@ -1868,7 +1688,7 @@ func monitorJob(jobName, sessionName, sessionNamespace string) {
sessionStatus, _, _ := unstructured.NestedMap(sessionObj.Object, "status")
if sessionStatus != nil {
if currentPhase, ok := sessionStatus["phase"].(string); ok && currentPhase == "Stopped" {
- log.Printf("AgenticSession %s was stopped; stopping job monitoring", sessionName)
+ log.Printf("AgenticSession %s was stopped; stopping pod monitoring", sessionName)
return
}
}
@@ -1877,79 +1697,97 @@ func monitorJob(jobName, sessionName, sessionNamespace string) {
log.Printf("Failed to refresh runner token for %s/%s: %v", sessionNamespace, sessionName, err)
}
- job, err := config.K8sClient.BatchV1().Jobs(sessionNamespace).Get(context.TODO(), jobName, v1.GetOptions{})
+ pod, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), podName, v1.GetOptions{})
if err != nil {
if errors.IsNotFound(err) {
- log.Printf("Job %s deleted; stopping monitor", jobName)
+ log.Printf("Pod %s deleted; stopping monitor", podName)
return
}
- log.Printf("Error fetching job %s: %v", jobName, err)
+ log.Printf("Error fetching pod %s: %v", podName, err)
continue
}
+ // Note: We don't store pod name in status (pods are ephemeral, can be recreated)
+ // Use k8s-resources endpoint or kubectl for live pod info
- pods, err := config.K8sClient.CoreV1().Pods(sessionNamespace).List(context.TODO(), v1.ListOptions{LabelSelector: fmt.Sprintf("job-name=%s", jobName)})
- if err != nil {
- log.Printf("Failed to list pods for job %s: %v", jobName, err)
- continue
+ if pod.Spec.NodeName != "" {
+ statusPatch.AddCondition(conditionUpdate{Type: conditionPodScheduled, Status: "True", Reason: "Scheduled", Message: fmt.Sprintf("Scheduled on %s", pod.Spec.NodeName)})
}
- if job.Status.Succeeded > 0 {
+ if pod.Status.Phase == corev1.PodSucceeded {
statusPatch.SetField("phase", "Completed")
statusPatch.SetField("completionTime", time.Now().UTC().Format(time.RFC3339))
statusPatch.AddCondition(conditionUpdate{Type: conditionReady, Status: "False", Reason: "Completed", Message: "Session finished"})
_ = statusPatch.Apply()
_ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
+ _ = deletePodAndPerPodService(sessionNamespace, podName, sessionName)
return
}
- if job.Spec.BackoffLimit != nil && job.Status.Failed >= *job.Spec.BackoffLimit {
- statusPatch.SetField("phase", "Failed")
- statusPatch.SetField("completionTime", time.Now().UTC().Format(time.RFC3339))
- statusPatch.AddCondition(conditionUpdate{Type: conditionReady, Status: "False", Reason: "BackoffLimitExceeded", Message: "Runner failed repeatedly"})
- _ = statusPatch.Apply()
- _ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
- return
- }
+ if pod.Status.Phase == corev1.PodFailed {
+ // Collect detailed error message from pod and containers
+ errorMsg := pod.Status.Message
+ if errorMsg == "" {
+ errorMsg = pod.Status.Reason
+ }
- if len(pods.Items) == 0 {
- if job.Status.Active == 0 && job.Status.Succeeded == 0 && job.Status.Failed == 0 {
- statusPatch.SetField("phase", "Failed")
- statusPatch.SetField("completionTime", time.Now().UTC().Format(time.RFC3339))
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionReady,
- Status: "False",
- Reason: "PodMissing",
- Message: "Runner pod missing",
- })
- _ = statusPatch.Apply()
- _ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
- return
+ // Check init containers for errors
+ for _, initStatus := range pod.Status.InitContainerStatuses {
+ if initStatus.State.Terminated != nil && initStatus.State.Terminated.ExitCode != 0 {
+ msg := fmt.Sprintf("Init container %s failed (exit %d): %s",
+ initStatus.Name,
+ initStatus.State.Terminated.ExitCode,
+ initStatus.State.Terminated.Message)
+ if initStatus.State.Terminated.Reason != "" {
+ msg = fmt.Sprintf("%s - %s", msg, initStatus.State.Terminated.Reason)
+ }
+ errorMsg = msg
+ break
+ }
+ if initStatus.State.Waiting != nil && initStatus.State.Waiting.Reason != "" {
+ errorMsg = fmt.Sprintf("Init container %s: %s - %s",
+ initStatus.Name,
+ initStatus.State.Waiting.Reason,
+ initStatus.State.Waiting.Message)
+ break
+ }
}
- continue
- }
- pod := pods.Items[0]
- // Note: We don't store pod name in status (pods are ephemeral, can be recreated)
- // Use k8s-resources endpoint or kubectl for live pod info
+ // Check main containers for errors if init passed
+ if errorMsg == "" || errorMsg == "PodFailed" {
+ for _, containerStatus := range pod.Status.ContainerStatuses {
+ if containerStatus.State.Terminated != nil && containerStatus.State.Terminated.ExitCode != 0 {
+ errorMsg = fmt.Sprintf("Container %s failed (exit %d): %s - %s",
+ containerStatus.Name,
+ containerStatus.State.Terminated.ExitCode,
+ containerStatus.State.Terminated.Reason,
+ containerStatus.State.Terminated.Message)
+ break
+ }
+ if containerStatus.State.Waiting != nil {
+ errorMsg = fmt.Sprintf("Container %s: %s - %s",
+ containerStatus.Name,
+ containerStatus.State.Waiting.Reason,
+ containerStatus.State.Waiting.Message)
+ break
+ }
+ }
+ }
- if pod.Spec.NodeName != "" {
- statusPatch.AddCondition(conditionUpdate{Type: conditionPodScheduled, Status: "True", Reason: "Scheduled", Message: fmt.Sprintf("Scheduled on %s", pod.Spec.NodeName)})
- }
+ if errorMsg == "" {
+ errorMsg = "Pod failed with unknown error"
+ }
- if pod.Status.Phase == corev1.PodFailed {
+ log.Printf("Pod %s failed: %s", podName, errorMsg)
statusPatch.SetField("phase", "Failed")
statusPatch.SetField("completionTime", time.Now().UTC().Format(time.RFC3339))
- statusPatch.AddCondition(conditionUpdate{Type: conditionReady, Status: "False", Reason: "PodFailed", Message: pod.Status.Message})
+ statusPatch.AddCondition(conditionUpdate{Type: conditionReady, Status: "False", Reason: "PodFailed", Message: errorMsg})
_ = statusPatch.Apply()
_ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
+ _ = deletePodAndPerPodService(sessionNamespace, podName, sessionName)
return
}
- runner := getContainerStatusByName(&pod, "ambient-code-runner")
+ runner := getContainerStatusByName(pod, "ambient-code-runner")
if runner == nil {
// Apply any accumulated changes (e.g., PodScheduled) before continuing
_ = statusPatch.Apply()
@@ -1974,7 +1812,7 @@ func monitorJob(jobName, sessionName, sessionNamespace string) {
statusPatch.AddCondition(conditionUpdate{Type: conditionReady, Status: "False", Reason: waiting.Reason, Message: msg})
_ = statusPatch.Apply()
_ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
+ _ = deletePodAndPerPodService(sessionNamespace, podName, sessionName)
return
}
}
@@ -2008,7 +1846,7 @@ func monitorJob(jobName, sessionName, sessionNamespace string) {
_ = statusPatch.Apply()
_ = ensureSessionIsInteractive(sessionNamespace, sessionName)
- _ = deleteJobAndPerJobService(sessionNamespace, jobName, sessionName)
+ _ = deletePodAndPerPodService(sessionNamespace, podName, sessionName)
return
}
@@ -2027,31 +1865,101 @@ func getContainerStatusByName(pod *corev1.Pod, name string) *corev1.ContainerSta
return nil
}
+// getS3ConfigForProject reads S3 configuration from project's integration secret
+// Falls back to operator defaults if not configured
+func getS3ConfigForProject(namespace string, appConfig *config.Config) (endpoint, bucket, accessKey, secretKey string, err error) {
+ // Try to read from project's ambient-non-vertex-integrations secret
+ secret, err := config.K8sClient.CoreV1().Secrets(namespace).Get(context.TODO(), "ambient-non-vertex-integrations", v1.GetOptions{})
+ if err != nil && !errors.IsNotFound(err) {
+ return "", "", "", "", fmt.Errorf("failed to read project secret: %w", err)
+ }
+
+ // Read from project secret if available
+ storageMode := "shared" // Default to shared cluster storage
+ if secret != nil && secret.Data != nil {
+ // Check storage mode (shared vs custom)
+ if mode := string(secret.Data["STORAGE_MODE"]); mode != "" {
+ storageMode = mode
+ }
+
+ // Only read custom S3 settings if in custom mode
+ if storageMode == "custom" {
+ if val := string(secret.Data["S3_ENDPOINT"]); val != "" {
+ endpoint = val
+ }
+ if val := string(secret.Data["S3_BUCKET"]); val != "" {
+ bucket = val
+ }
+ if val := string(secret.Data["S3_ACCESS_KEY"]); val != "" {
+ accessKey = val
+ }
+ if val := string(secret.Data["S3_SECRET_KEY"]); val != "" {
+ secretKey = val
+ }
+ log.Printf("Using custom S3 configuration for project %s", namespace)
+ } else {
+ log.Printf("Using shared cluster storage (MinIO) for project %s", namespace)
+ }
+ }
+
+ // Use operator defaults (for shared mode or as fallback)
+ if endpoint == "" {
+ endpoint = appConfig.S3Endpoint
+ }
+ if bucket == "" {
+ bucket = appConfig.S3Bucket
+ }
+
+ // If credentials still empty AND using default endpoint/bucket, use shared MinIO credentials
+ // This implements "shared cluster storage" mode where users don't need to configure anything
+ usingDefaults := endpoint == appConfig.S3Endpoint && bucket == appConfig.S3Bucket
+ if (accessKey == "" || secretKey == "") && usingDefaults {
+ // Look for the minio-credentials secret in the backend namespace
+ minioSecret, err := config.K8sClient.CoreV1().Secrets(appConfig.BackendNamespace).Get(context.TODO(), "minio-credentials", v1.GetOptions{})
+ if err == nil && minioSecret.Data != nil {
+ if accessKey == "" {
+ accessKey = string(minioSecret.Data["access-key"])
+ }
+ if secretKey == "" {
+ secretKey = string(minioSecret.Data["secret-key"])
+ }
+ log.Printf("Using shared MinIO credentials for project %s (shared cluster storage mode)", namespace)
+ } else {
+ log.Printf("Warning: minio-credentials secret not found in namespace %s", appConfig.BackendNamespace)
+ }
+ }
+
+ // Validate we have required config
+ if endpoint == "" || bucket == "" {
+ return "", "", "", "", fmt.Errorf("incomplete S3 configuration - endpoint and bucket required")
+ }
+ if accessKey == "" || secretKey == "" {
+ return "", "", "", "", fmt.Errorf("incomplete S3 configuration - access key and secret key required")
+ }
+
+ log.Printf("S3 config for project %s: endpoint=%s, bucket=%s", namespace, endpoint, bucket)
+ return endpoint, bucket, accessKey, secretKey, nil
+}
+
-// deleteJobAndPerJobService deletes the Job and its associated per-job Service
+// deletePodAndPerPodService deletes the Pod and its associated per-pod Services
-func deleteJobAndPerJobService(namespace, jobName, sessionName string) error {
- // Delete Service first (it has ownerRef to Job, but delete explicitly just in case)
+func deletePodAndPerPodService(namespace, podName, sessionName string) error {
+ // Delete Service first (it has ownerRef to Pod, but delete explicitly just in case)
svcName := fmt.Sprintf("ambient-content-%s", sessionName)
if err := config.K8sClient.CoreV1().Services(namespace).Delete(context.TODO(), svcName, v1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete per-job service %s/%s: %v", namespace, svcName, err)
+ log.Printf("Failed to delete per-pod service %s/%s: %v", namespace, svcName, err)
}
- // Delete the Job with background propagation
- policy := v1.DeletePropagationBackground
- if err := config.K8sClient.BatchV1().Jobs(namespace).Delete(context.TODO(), jobName, v1.DeleteOptions{PropagationPolicy: &policy}); err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete job %s/%s: %v", namespace, jobName, err)
- return err
+ // Delete AG-UI service
+ aguiSvcName := fmt.Sprintf("session-%s", sessionName)
+ if err := config.K8sClient.CoreV1().Services(namespace).Delete(context.TODO(), aguiSvcName, v1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
+ log.Printf("Failed to delete AG-UI service %s/%s: %v", namespace, aguiSvcName, err)
}
- // Proactively delete Pods for this Job
- if pods, err := config.K8sClient.CoreV1().Pods(namespace).List(context.TODO(), v1.ListOptions{LabelSelector: fmt.Sprintf("job-name=%s", jobName)}); err == nil {
- for i := range pods.Items {
- p := pods.Items[i]
- if err := config.K8sClient.CoreV1().Pods(namespace).Delete(context.TODO(), p.Name, v1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
- log.Printf("Failed to delete pod %s/%s for job %s: %v", namespace, p.Name, jobName, err)
- }
- }
- } else if !errors.IsNotFound(err) {
- log.Printf("Failed to list pods for job %s/%s: %v", namespace, jobName, err)
+ // Delete the Pod with background propagation
+ policy := v1.DeletePropagationBackground
+ if err := config.K8sClient.CoreV1().Pods(namespace).Delete(context.TODO(), podName, v1.DeleteOptions{PropagationPolicy: &policy}); err != nil && !errors.IsNotFound(err) {
+ log.Printf("Failed to delete pod %s/%s: %v", namespace, podName, err)
+ return err
}
// Delete the ambient-vertex secret if it was copied by the operator
@@ -2076,90 +1984,6 @@ func deleteJobAndPerJobService(namespace, jobName, sessionName string) error {
return nil
}
-// CleanupExpiredTempContentPods removes temporary content pods that have exceeded their TTL
-func CleanupExpiredTempContentPods() {
- log.Println("Starting temp content pod cleanup goroutine")
- for {
- time.Sleep(1 * time.Minute)
-
- // List all temp content pods across all namespaces
- pods, err := config.K8sClient.CoreV1().Pods("").List(context.TODO(), v1.ListOptions{
- LabelSelector: "app=temp-content-service",
- })
- if err != nil {
- log.Printf("[TempPodCleanup] Failed to list temp content pods: %v", err)
- continue
- }
-
- gvr := types.GetAgenticSessionResource()
- for _, pod := range pods.Items {
- sessionName := pod.Labels["agentic-session"]
- if sessionName == "" {
- log.Printf("[TempPodCleanup] Temp pod %s has no agentic-session label, skipping", pod.Name)
- continue
- }
-
- // Check if session still exists
- session, err := config.DynamicClient.Resource(gvr).Namespace(pod.Namespace).Get(context.TODO(), sessionName, v1.GetOptions{})
- if err != nil {
- if errors.IsNotFound(err) {
- // Session deleted, delete temp pod
- log.Printf("[TempPodCleanup] Session %s/%s gone, deleting orphaned temp pod %s", pod.Namespace, sessionName, pod.Name)
- if err := config.K8sClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, v1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
- log.Printf("[TempPodCleanup] Failed to delete orphaned temp pod: %v", err)
- }
- }
- continue
- }
-
- // Get last-accessed timestamp from session annotation
- annotations := session.GetAnnotations()
- lastAccessedStr := annotations[tempContentLastAccessedAnnotation]
- if lastAccessedStr == "" {
- // Fall back to pod created-at if no last-accessed
- lastAccessedStr = pod.Annotations["ambient-code.io/created-at"]
- }
-
- if lastAccessedStr == "" {
- log.Printf("[TempPodCleanup] No timestamp for temp pod %s, skipping", pod.Name)
- continue
- }
-
- lastAccessed, err := time.Parse(time.RFC3339, lastAccessedStr)
- if err != nil {
- log.Printf("[TempPodCleanup] Failed to parse timestamp for pod %s: %v", pod.Name, err)
- continue
- }
-
- // Delete if inactive for > 10 minutes
- if time.Since(lastAccessed) > tempContentInactivityTTL {
- log.Printf("[TempPodCleanup] Deleting inactive temp pod %s/%s (last accessed: %v ago)",
- pod.Namespace, pod.Name, time.Since(lastAccessed))
-
- if err := config.K8sClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, v1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
- log.Printf("[TempPodCleanup] Failed to delete temp pod: %v", err)
- continue
- }
-
- // Update condition
- _ = mutateAgenticSessionStatus(pod.Namespace, sessionName, func(status map[string]interface{}) {
- setCondition(status, conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "False",
- Reason: "Expired",
- Message: fmt.Sprintf("Temp pod deleted due to inactivity (%v)", time.Since(lastAccessed)),
- })
- })
-
- // Clear temp-content-requested annotation
- delete(annotations, tempContentRequestedAnnotation)
- delete(annotations, tempContentLastAccessedAnnotation)
- _ = updateAnnotations(pod.Namespace, sessionName, annotations)
- }
- }
- }
-}
-
// copySecretToNamespace copies a secret to a target namespace with owner references
func copySecretToNamespace(ctx context.Context, sourceSecret *corev1.Secret, targetNamespace string, ownerObj *unstructured.Unstructured) error {
// Check if secret already exists in target namespace
@@ -2326,137 +2150,6 @@ func deleteAmbientLangfuseSecret(ctx context.Context, namespace string) error {
return nil
}
-// reconcileTempContentPodWithPatch is a version of reconcileTempContentPod that uses StatusPatch for batched updates.
-func reconcileTempContentPodWithPatch(sessionNamespace, sessionName, tempPodName string, session *unstructured.Unstructured, statusPatch *StatusPatch) error {
- // Check if pod already exists
- tempPod, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Get(context.TODO(), tempPodName, v1.GetOptions{})
-
- if errors.IsNotFound(err) {
- // Create temp pod
- log.Printf("[TempPod] Creating temp content pod for workspace access: %s/%s", sessionNamespace, tempPodName)
-
- pvcName := fmt.Sprintf("ambient-workspace-%s", sessionName)
- appConfig := config.LoadConfig()
-
- pod := &corev1.Pod{
- ObjectMeta: v1.ObjectMeta{
- Name: tempPodName,
- Namespace: sessionNamespace,
- Labels: map[string]string{
- "app": "temp-content-service",
- "agentic-session": sessionName,
- },
- Annotations: map[string]string{
- "ambient-code.io/created-at": time.Now().UTC().Format(time.RFC3339),
- },
- OwnerReferences: []v1.OwnerReference{{
- APIVersion: session.GetAPIVersion(),
- Kind: session.GetKind(),
- Name: session.GetName(),
- UID: session.GetUID(),
- Controller: boolPtr(true),
- }},
- },
- Spec: corev1.PodSpec{
- RestartPolicy: corev1.RestartPolicyNever,
- TerminationGracePeriodSeconds: int64Ptr(0), // Enable instant termination
- Containers: []corev1.Container{{
- Name: "content",
- Image: appConfig.ContentServiceImage,
- ImagePullPolicy: appConfig.ImagePullPolicy,
- Env: []corev1.EnvVar{
- {Name: "CONTENT_SERVICE_MODE", Value: "true"},
- {Name: "STATE_BASE_DIR", Value: "/workspace"},
- },
- Ports: []corev1.ContainerPort{{ContainerPort: 8080, Name: "http"}},
- VolumeMounts: []corev1.VolumeMount{{
- Name: "workspace",
- MountPath: "/workspace",
- }},
- ReadinessProbe: &corev1.Probe{
- ProbeHandler: corev1.ProbeHandler{
- HTTPGet: &corev1.HTTPGetAction{
- Path: "/health",
- Port: intstr.FromString("http"),
- },
- },
- InitialDelaySeconds: 3,
- PeriodSeconds: 3,
- },
- }},
- Volumes: []corev1.Volume{{
- Name: "workspace",
- VolumeSource: corev1.VolumeSource{
- PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
- ClaimName: pvcName,
- },
- },
- }},
- },
- }
-
- if _, err := config.K8sClient.CoreV1().Pods(sessionNamespace).Create(context.TODO(), pod, v1.CreateOptions{}); err != nil {
- log.Printf("[TempPod] Failed to create temp pod: %v", err)
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "False",
- Reason: "CreationFailed",
- Message: fmt.Sprintf("Failed to create temp pod: %v", err),
- })
- return fmt.Errorf("failed to create temp pod: %w", err)
- }
-
- log.Printf("[TempPod] Created temp pod %s", tempPodName)
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "Unknown",
- Reason: "Provisioning",
- Message: "Temp content pod starting",
- })
- return nil
- }
-
- if err != nil {
- return fmt.Errorf("failed to check temp pod: %w", err)
- }
-
- // Temp pod exists, check readiness
- if tempPod.Status.Phase == corev1.PodRunning {
- ready := false
- for _, cond := range tempPod.Status.Conditions {
- if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
- ready = true
- break
- }
- }
-
- if ready {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "True",
- Reason: "Ready",
- Message: "Temp content pod is ready for workspace access",
- })
- } else {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "Unknown",
- Reason: "NotReady",
- Message: "Temp content pod not ready yet",
- })
- }
- } else if tempPod.Status.Phase == corev1.PodFailed {
- statusPatch.AddCondition(conditionUpdate{
- Type: conditionTempContentPodReady,
- Status: "False",
- Reason: "PodFailed",
- Message: fmt.Sprintf("Temp content pod failed: %s", tempPod.Status.Message),
- })
- }
-
- return nil
-}
-
// LEGACY: getBackendAPIURL removed - AG-UI migration
// Workflow and repo changes now call runner's REST endpoints directly
@@ -2632,6 +2325,5 @@ func regenerateRunnerToken(sessionNamespace, sessionName string, session *unstru
// Helper functions
var (
boolPtr = func(b bool) *bool { return &b }
- int32Ptr = func(i int32) *int32 { return &i }
int64Ptr = func(i int64) *int64 { return &i }
)
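getS3ConfigForProject, added above, resolves storage for the state-sync sidecar in three steps: per-project custom values from the ambient-non-vertex-integrations secret when STORAGE_MODE is "custom", operator defaults for endpoint and bucket, and shared MinIO credentials only when the default endpoint and bucket are in use. A condensed sketch of that precedence over plain string maps instead of Kubernetes secrets (the s3Config type and resolveS3Config helper are illustrative, not part of this patch):

package main

import (
	"errors"
	"fmt"
)

type s3Config struct {
	Endpoint, Bucket, AccessKey, SecretKey string
}

// resolveS3Config mirrors the operator's precedence: custom project settings win,
// otherwise operator defaults plus shared MinIO credentials are used.
func resolveS3Config(projectSecret, minioSecret map[string]string, defaults s3Config) (s3Config, error) {
	cfg := s3Config{}
	if projectSecret["STORAGE_MODE"] == "custom" {
		cfg.Endpoint = projectSecret["S3_ENDPOINT"]
		cfg.Bucket = projectSecret["S3_BUCKET"]
		cfg.AccessKey = projectSecret["S3_ACCESS_KEY"]
		cfg.SecretKey = projectSecret["S3_SECRET_KEY"]
	}
	if cfg.Endpoint == "" {
		cfg.Endpoint = defaults.Endpoint
	}
	if cfg.Bucket == "" {
		cfg.Bucket = defaults.Bucket
	}
	// Shared cluster storage: fall back to MinIO credentials only when the
	// operator-default endpoint and bucket are in use.
	usingDefaults := cfg.Endpoint == defaults.Endpoint && cfg.Bucket == defaults.Bucket
	if (cfg.AccessKey == "" || cfg.SecretKey == "") && usingDefaults {
		if cfg.AccessKey == "" {
			cfg.AccessKey = minioSecret["access-key"]
		}
		if cfg.SecretKey == "" {
			cfg.SecretKey = minioSecret["secret-key"]
		}
	}
	if cfg.Endpoint == "" || cfg.Bucket == "" || cfg.AccessKey == "" || cfg.SecretKey == "" {
		return s3Config{}, errors.New("incomplete S3 configuration")
	}
	return cfg, nil
}

func main() {
	defaults := s3Config{Endpoint: "http://minio.ambient-code.svc:9000", Bucket: "ambient-sessions"}
	cfg, err := resolveS3Config(map[string]string{}, map[string]string{"access-key": "admin", "secret-key": "changeme123"}, defaults)
	fmt.Println(cfg, err)
}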
diff --git a/components/operator/internal/services/infrastructure.go b/components/operator/internal/services/infrastructure.go
index bed30920a..e33481f89 100644
--- a/components/operator/internal/services/infrastructure.go
+++ b/components/operator/internal/services/infrastructure.go
@@ -51,36 +51,10 @@ func EnsureContentService(namespace string) error {
return nil
}
-// EnsureSessionWorkspacePVC creates a per-session PVC owned by the AgenticSession to avoid multi-attach conflicts
+// EnsureSessionWorkspacePVC is deprecated - sessions now use EmptyDir with S3 state persistence
+// Kept for backward compatibility but returns nil immediately
func EnsureSessionWorkspacePVC(namespace, pvcName string, ownerRefs []v1.OwnerReference) error {
- // Check if PVC exists
- if _, err := config.K8sClient.CoreV1().PersistentVolumeClaims(namespace).Get(context.TODO(), pvcName, v1.GetOptions{}); err == nil {
- return nil
- } else if !errors.IsNotFound(err) {
- return err
- }
-
- pvc := &corev1.PersistentVolumeClaim{
- ObjectMeta: v1.ObjectMeta{
- Name: pvcName,
- Namespace: namespace,
- Labels: map[string]string{"app": "ambient-workspace", "agentic-session": pvcName},
- OwnerReferences: ownerRefs,
- },
- Spec: corev1.PersistentVolumeClaimSpec{
- AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
- Resources: corev1.VolumeResourceRequirements{
- Requests: corev1.ResourceList{
- corev1.ResourceStorage: resource.MustParse("5Gi"),
- },
- },
- },
- }
- if _, err := config.K8sClient.CoreV1().PersistentVolumeClaims(namespace).Create(context.TODO(), pvc, v1.CreateOptions{}); err != nil {
- if errors.IsAlreadyExists(err) {
- return nil
- }
- return err
- }
+ // DEPRECATED: Per-session PVCs have been replaced with EmptyDir + S3 state sync
+ // This function is kept for backward compatibility but does nothing
return nil
}
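With EnsureSessionWorkspacePVC reduced to a no-op, the session workspace is ephemeral and durability comes from the init-hydrate container and the state-sync sidecar. A minimal sketch of the EmptyDir-backed workspace volume this implies (the exact volume wiring lives in the handlers package and is not shown in full here; the 5Gi size limit is an assumption carried over from the old PVC request):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// workspaceVolume builds the ephemeral workspace that replaces the per-session PVC.
// State under .claude/, artifacts/, and file-uploads/ is persisted to S3 by the
// state-sync sidecar and restored by the init-hydrate container.
func workspaceVolume() corev1.Volume {
	sizeLimit := resource.MustParse("5Gi") // illustrative cap, matching the old PVC request
	return corev1.Volume{
		Name: "workspace",
		VolumeSource: corev1.VolumeSource{
			EmptyDir: &corev1.EmptyDirVolumeSource{SizeLimit: &sizeLimit},
		},
	}
}

func main() {
	fmt.Printf("%+v\n", workspaceVolume())
}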
diff --git a/components/operator/main.go b/components/operator/main.go
index df9c31821..3eb47a231 100644
--- a/components/operator/main.go
+++ b/components/operator/main.go
@@ -1,16 +1,28 @@
package main
import (
+ "context"
+ "flag"
"log"
"os"
+ "strconv"
+
+ "k8s.io/apimachinery/pkg/runtime"
+ utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+ clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+ ctrl "sigs.k8s.io/controller-runtime"
+ "sigs.k8s.io/controller-runtime/pkg/healthz"
+ ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
+ "sigs.k8s.io/controller-runtime/pkg/log/zap"
+ metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"ambient-code-operator/internal/config"
+ "ambient-code-operator/internal/controller"
"ambient-code-operator/internal/handlers"
"ambient-code-operator/internal/preflight"
)
// Build-time metadata (set via -ldflags -X during build)
-// These are embedded directly in the binary, so they're always accurate
var (
GitCommit = "unknown"
GitBranch = "unknown"
@@ -18,60 +30,156 @@ var (
BuildDate = "unknown"
)
-func logBuildInfo() {
- log.Println("==============================================")
- log.Println("Agentic Session Operator - Build Information")
- log.Println("==============================================")
- log.Printf("Version: %s", GitVersion)
- log.Printf("Commit: %s", GitCommit)
- log.Printf("Branch: %s", GitBranch)
- log.Printf("Repository: %s", getEnvOrDefault("GIT_REPO", "unknown"))
- log.Printf("Built: %s", BuildDate)
- log.Printf("Built by: %s", getEnvOrDefault("BUILD_USER", "unknown"))
- log.Println("==============================================")
-}
+var (
+ scheme = runtime.NewScheme()
+)
-func getEnvOrDefault(key, defaultValue string) string {
- if value := os.Getenv(key); value != "" {
- return value
- }
- return defaultValue
+func init() {
+ utilruntime.Must(clientgoscheme.AddToScheme(scheme))
}
func main() {
+ // Parse command line flags
+ var metricsAddr string
+ var enableLeaderElection bool
+ var probeAddr string
+ var maxConcurrentReconciles int
+
+ flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
+ flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
+ flag.BoolVar(&enableLeaderElection, "leader-elect", false,
+ "Enable leader election for controller manager. "+
+ "Enabling this will ensure there is only one active controller manager.")
+ flag.IntVar(&maxConcurrentReconciles, "max-concurrent-reconciles", 10,
+ "Maximum number of concurrent Reconciles which can be run. Higher values allow more throughput but consume more resources.")
+ flag.Parse()
+
+ // Allow environment variable override for max concurrent reconciles
+ if envVal := os.Getenv("MAX_CONCURRENT_RECONCILES"); envVal != "" {
+ if v, err := strconv.Atoi(envVal); err == nil && v > 0 {
+ maxConcurrentReconciles = v
+ }
+ }
+
+ // Set up logging
+ opts := zap.Options{
+ Development: os.Getenv("DEV_MODE") == "true",
+ }
+ ctrllog.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
+
+ logger := ctrllog.Log.WithName("setup")
+
// Log build information
logBuildInfo()
+ logger.Info("Starting Agentic Session Operator",
+ "maxConcurrentReconciles", maxConcurrentReconciles,
+ "leaderElection", enableLeaderElection,
+ )
- // Initialize Kubernetes clients
+ // Initialize Kubernetes clients (needed for namespace/projectsettings handlers and config)
if err := config.InitK8sClients(); err != nil {
- log.Fatalf("Failed to initialize Kubernetes clients: %v", err)
+ logger.Error(err, "Failed to initialize Kubernetes clients")
+ os.Exit(1)
}
// Load application configuration
appConfig := config.LoadConfig()
- log.Printf("Agentic Session Operator starting in namespace: %s", appConfig.Namespace)
- log.Printf("Using ambient-code runner image: %s", appConfig.AmbientCodeRunnerImage)
+ logger.Info("Configuration loaded",
+ "namespace", appConfig.Namespace,
+ "backendNamespace", appConfig.BackendNamespace,
+ "runnerImage", appConfig.AmbientCodeRunnerImage,
+ )
+
+ // Initialize OpenTelemetry metrics
+ shutdownMetrics, err := controller.InitMetrics(context.Background())
+ if err != nil {
+ logger.Error(err, "Failed to initialize OpenTelemetry metrics, continuing without metrics")
+ } else {
+ defer shutdownMetrics()
+ }
// Validate Vertex AI configuration at startup if enabled
if os.Getenv("CLAUDE_CODE_USE_VERTEX") == "1" {
if err := preflight.ValidateVertexConfig(appConfig.Namespace); err != nil {
- log.Fatalf("Vertex AI validation failed: %v", err)
+ logger.Error(err, "Vertex AI validation failed")
+ os.Exit(1)
}
}
- // Start watching AgenticSession resources
- go handlers.WatchAgenticSessions()
+ // Create controller-runtime manager with increased QPS/Burst to avoid client-side throttling
+ // Default is QPS=5, Burst=10 which causes delays when handling multiple sessions
+ restConfig := ctrl.GetConfigOrDie()
+ restConfig.QPS = 100
+ restConfig.Burst = 200
- // Start watching for managed namespaces
- go handlers.WatchNamespaces()
+ mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
+ Scheme: scheme,
+ Metrics: metricsserver.Options{BindAddress: metricsAddr},
+ HealthProbeBindAddress: probeAddr,
+ LeaderElection: enableLeaderElection,
+ LeaderElectionID: "ambient-code-operator.ambient-code.io",
+ })
+ if err != nil {
+ logger.Error(err, "Unable to create manager")
+ os.Exit(1)
+ }
+
+ // Set up AgenticSession controller with concurrent reconcilers
+ agenticSessionReconciler := controller.NewAgenticSessionReconciler(
+ mgr.GetClient(),
+ maxConcurrentReconciles,
+ )
+ if err := agenticSessionReconciler.SetupWithManager(mgr); err != nil {
+ logger.Error(err, "Unable to create AgenticSession controller")
+ os.Exit(1)
+ }
+ logger.Info("AgenticSession controller registered",
+ "maxConcurrentReconciles", maxConcurrentReconciles,
+ )
- // Start watching ProjectSettings resources
+ // Add health check endpoints
+ if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
+ logger.Error(err, "Unable to set up health check")
+ os.Exit(1)
+ }
+ if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+ logger.Error(err, "Unable to set up ready check")
+ os.Exit(1)
+ }
+
+ // Start namespace and project settings watchers (these remain as watch loops for now)
+ // Note: These could be migrated to controller-runtime controllers in the future
+ go handlers.WatchNamespaces()
go handlers.WatchProjectSettings()
- // Start cleanup of expired temporary content pods
- go handlers.CleanupExpiredTempContentPods()
+ logger.Info("Starting manager with controller-runtime",
+ "maxConcurrentReconciles", maxConcurrentReconciles,
+ )
- // Keep the operator running
- select {}
+ // Start the manager (blocks until stopped)
+ if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+ logger.Error(err, "Problem running manager")
+ os.Exit(1)
+ }
+}
+
+func logBuildInfo() {
+ log.Println("==============================================")
+ log.Println("Agentic Session Operator - Build Information")
+ log.Println("==============================================")
+ log.Printf("Version: %s", GitVersion)
+ log.Printf("Commit: %s", GitCommit)
+ log.Printf("Branch: %s", GitBranch)
+ log.Printf("Repository: %s", getEnvOrDefault("GIT_REPO", "unknown"))
+ log.Printf("Built: %s", BuildDate)
+ log.Printf("Built by: %s", getEnvOrDefault("BUILD_USER", "unknown"))
+ log.Println("==============================================")
+}
+
+func getEnvOrDefault(key, defaultValue string) string {
+ if value := os.Getenv(key); value != "" {
+ return value
+ }
+ return defaultValue
}
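main.go now delegates AgenticSession reconciliation to controller-runtime, with -max-concurrent-reconciles (or the MAX_CONCURRENT_RECONCILES env var) bounding parallelism. The internal/controller package itself is not part of this diff; the sketch below shows how that option is typically wired through SetupWithManager when the controller watches the CR as unstructured (reconciler fields and GVK wiring are assumptions based on the APIVersion used elsewhere in this patch):

package controller

import (
	"context"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/runtime/schema"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	ctrlcontroller "sigs.k8s.io/controller-runtime/pkg/controller"
)

// AgenticSessionReconciler reconciles AgenticSession resources; sketch only.
type AgenticSessionReconciler struct {
	Client                  client.Client
	MaxConcurrentReconciles int
}

func (r *AgenticSessionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// Real reconciliation logic lives in the operator's internal/controller package.
	return ctrl.Result{}, nil
}

// SetupWithManager registers the controller and caps concurrent reconciles.
func (r *AgenticSessionReconciler) SetupWithManager(mgr ctrl.Manager) error {
	session := &unstructured.Unstructured{}
	session.SetGroupVersionKind(schema.GroupVersionKind{
		Group:   "vteam.ambient-code",
		Version: "v1alpha1",
		Kind:    "AgenticSession",
	})
	return ctrl.NewControllerManagedBy(mgr).
		For(session).
		WithOptions(ctrlcontroller.Options{MaxConcurrentReconciles: r.MaxConcurrentReconciles}).
		Complete(r)
}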
diff --git a/components/runners/claude-code-runner/adapter.py b/components/runners/claude-code-runner/adapter.py
index 419e493d2..4b4e8c30f 100644
--- a/components/runners/claude-code-runner/adapter.py
+++ b/components/runners/claude-code-runner/adapter.py
@@ -87,13 +87,11 @@ async def initialize(self, context: RunnerContext):
# Copy Google OAuth credentials from mounted Secret to writable workspace location
await self._setup_google_credentials()
- # Prepare workspace from input repo if provided
- async for event in self._prepare_workspace():
- yield event
-
- # Initialize workflow if ACTIVE_WORKFLOW env vars are set
- async for event in self._initialize_workflow_if_set():
- yield event
+ # Workspace is already prepared by init container (hydrate.sh)
+ # - Repos cloned to /workspace/repos/
+ # - Workflows cloned to /workspace/workflows/
+ # - State hydrated from S3 to .claude/, artifacts/, file-uploads/
+ logger.info("Workspace prepared by init container, validating...")
# Validate prerequisite files exist for phase-based commands
try:
@@ -361,9 +359,11 @@ async def _run_claude_agent_sdk(
)
obs._pending_initial_prompt = prompt
- # Check if continuing from previous session
- parent_session_id = self.context.get_env('PARENT_SESSION_ID', '').strip()
- is_continuation = bool(parent_session_id)
+ # Check if this is a resume session via IS_RESUME env var
+ # This is set by the operator when restarting a stopped/completed/failed session
+ is_continuation = self.context.get_env('IS_RESUME', '').strip().lower() == 'true'
+ if is_continuation:
+ logger.info("IS_RESUME=true - treating as continuation")
# Determine cwd and additional dirs
repos_cfg = self._get_repos_config()
@@ -790,11 +790,12 @@ def _setup_workflow_paths(self, active_workflow_url: str, repos_cfg: list) -> tu
logger.warning(f"Failed to derive workflow name: {e}, using default")
cwd_path = str(Path(self.context.workspace_path) / "workflows" / "default")
- # Add all repos as additional directories
+ # Add all repos as additional directories (repos are in /workspace/repos/{name})
+ repos_base = Path(self.context.workspace_path) / "repos"
for r in repos_cfg:
name = (r.get('name') or '').strip()
if name:
- repo_path = str(Path(self.context.workspace_path) / name)
+ repo_path = str(repos_base / name)
if repo_path not in add_dirs:
add_dirs.append(repo_path)
@@ -810,8 +811,14 @@ def _setup_workflow_paths(self, active_workflow_url: str, repos_cfg: list) -> tu
return cwd_path, add_dirs, derived_name
def _setup_multi_repo_paths(self, repos_cfg: list) -> tuple[str, list]:
- """Setup paths for multi-repo mode."""
+ """Setup paths for multi-repo mode.
+
+ Repos are cloned to /workspace/repos/{name} by both:
+ - hydrate.sh (init container)
+ - clone_repo_at_runtime() (runtime addition)
+ """
add_dirs = []
+ repos_base = Path(self.context.workspace_path) / "repos"
main_name = (os.getenv('MAIN_REPO_NAME') or '').strip()
if not main_name:
@@ -824,13 +831,15 @@ def _setup_multi_repo_paths(self, repos_cfg: list) -> tuple[str, list]:
idx_val = 0
main_name = (repos_cfg[idx_val].get('name') or '').strip()
- cwd_path = str(Path(self.context.workspace_path) / main_name) if main_name else self.context.workspace_path
+ # Main repo path is /workspace/repos/{name}
+ cwd_path = str(repos_base / main_name) if main_name else self.context.workspace_path
for r in repos_cfg:
name = (r.get('name') or '').strip()
if not name:
continue
- p = str(Path(self.context.workspace_path) / name)
+ # All repos are in /workspace/repos/{name}
+ p = str(repos_base / name)
if p != cwd_path:
add_dirs.append(p)
@@ -898,160 +907,34 @@ async def _setup_vertex_credentials(self) -> dict:
}
async def _prepare_workspace(self) -> AsyncIterator[BaseEvent]:
- """Clone input repo/branch into workspace and configure git remotes."""
+ """Validate workspace prepared by init container.
+
+ The init-hydrate container now handles:
+ - Downloading state from S3 (.claude/, artifacts/, file-uploads/)
+ - Cloning repos to /workspace/repos/
+ - Cloning workflows to /workspace/workflows/
+
+ Runner just validates and logs what's ready.
+ """
workspace = Path(self.context.workspace_path)
- workspace.mkdir(parents=True, exist_ok=True)
-
- parent_session_id = self.context.get_env('PARENT_SESSION_ID', '').strip()
- reusing_workspace = bool(parent_session_id)
-
- logger.info(f"Workspace preparation: parent_session_id={parent_session_id[:8] if parent_session_id else 'None'}, reusing={reusing_workspace}")
-
- repos_cfg = self._get_repos_config()
- if repos_cfg:
- async for event in self._prepare_multi_repo_workspace(workspace, repos_cfg, reusing_workspace):
- yield event
- return
-
- # Single-repo legacy flow
- input_repo = os.getenv("INPUT_REPO_URL", "").strip()
- if not input_repo:
- logger.info("No INPUT_REPO_URL configured, skipping single-repo setup")
- return
-
- input_branch = os.getenv("INPUT_BRANCH", "").strip() or "main"
- output_repo = os.getenv("OUTPUT_REPO_URL", "").strip()
-
- token = await self._fetch_token_for_url(input_repo)
- workspace_has_git = (workspace / ".git").exists()
-
- try:
- if not workspace_has_git:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": "📥 Cloning input repository..."}
- )
- clone_url = self._url_with_token(input_repo, token) if token else input_repo
- await self._run_cmd(["git", "clone", "--branch", input_branch, "--single-branch", clone_url, str(workspace)], cwd=str(workspace.parent))
- await self._run_cmd(["git", "remote", "set-url", "origin", clone_url], cwd=str(workspace), ignore_errors=True)
- elif reusing_workspace:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": "✓ Preserving workspace (continuation)"}
- )
- await self._run_cmd(["git", "remote", "set-url", "origin", self._url_with_token(input_repo, token) if token else input_repo], cwd=str(workspace), ignore_errors=True)
- else:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": "🔄 Resetting workspace to clean state"}
- )
- await self._run_cmd(["git", "remote", "set-url", "origin", self._url_with_token(input_repo, token) if token else input_repo], cwd=str(workspace))
- await self._run_cmd(["git", "fetch", "origin", input_branch], cwd=str(workspace))
- await self._run_cmd(["git", "checkout", input_branch], cwd=str(workspace))
- await self._run_cmd(["git", "reset", "--hard", f"origin/{input_branch}"], cwd=str(workspace))
-
- # Git identity
- user_name = os.getenv("GIT_USER_NAME", "").strip() or "Ambient Code Bot"
- user_email = os.getenv("GIT_USER_EMAIL", "").strip() or "bot@ambient-code.local"
- await self._run_cmd(["git", "config", "user.name", user_name], cwd=str(workspace))
- await self._run_cmd(["git", "config", "user.email", user_email], cwd=str(workspace))
-
- if output_repo:
- out_url = self._url_with_token(output_repo, token) if token else output_repo
- await self._run_cmd(["git", "remote", "remove", "output"], cwd=str(workspace), ignore_errors=True)
- await self._run_cmd(["git", "remote", "add", "output", out_url], cwd=str(workspace))
-
- except Exception as e:
- logger.error(f"Failed to prepare workspace: {e}")
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"Workspace preparation failed: {e}"}
- )
-
- # Create artifacts directory
- try:
- artifacts_dir = workspace / "artifacts"
- artifacts_dir.mkdir(parents=True, exist_ok=True)
- except Exception as e:
- logger.warning(f"Failed to create artifacts directory: {e}")
-
- async def _prepare_multi_repo_workspace(
- self, workspace: Path, repos_cfg: list, reusing_workspace: bool
- ) -> AsyncIterator[BaseEvent]:
- """Prepare workspace for multi-repo mode."""
- try:
- for r in repos_cfg:
- name = (r.get('name') or '').strip()
- inp = r.get('input') or {}
- url = (inp.get('url') or '').strip()
- branch = (inp.get('branch') or '').strip() or 'main'
- if not name or not url:
- continue
-
- repo_dir = workspace / name
- token = await self._fetch_token_for_url(url)
- repo_exists = repo_dir.exists() and (repo_dir / ".git").exists()
-
- if not repo_exists:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"📥 Cloning {name}..."}
- )
- clone_url = self._url_with_token(url, token) if token else url
- await self._run_cmd(["git", "clone", "--branch", branch, "--single-branch", clone_url, str(repo_dir)], cwd=str(workspace))
- await self._run_cmd(["git", "remote", "set-url", "origin", clone_url], cwd=str(repo_dir), ignore_errors=True)
- elif reusing_workspace:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"✓ Preserving {name} (continuation)"}
- )
- await self._run_cmd(["git", "remote", "set-url", "origin", self._url_with_token(url, token) if token else url], cwd=str(repo_dir), ignore_errors=True)
- else:
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"🔄 Resetting {name} to clean state"}
- )
- await self._run_cmd(["git", "remote", "set-url", "origin", self._url_with_token(url, token) if token else url], cwd=str(repo_dir), ignore_errors=True)
- await self._run_cmd(["git", "fetch", "origin", branch], cwd=str(repo_dir))
- await self._run_cmd(["git", "checkout", branch], cwd=str(repo_dir))
- await self._run_cmd(["git", "reset", "--hard", f"origin/{branch}"], cwd=str(repo_dir))
-
- # Git identity
- user_name = os.getenv("GIT_USER_NAME", "").strip() or "Ambient Code Bot"
- user_email = os.getenv("GIT_USER_EMAIL", "").strip() or "bot@ambient-code.local"
- await self._run_cmd(["git", "config", "user.name", user_name], cwd=str(repo_dir))
- await self._run_cmd(["git", "config", "user.email", user_email], cwd=str(repo_dir))
-
- # Configure output remote
- out = r.get('output') or {}
- out_url_raw = (out.get('url') or '').strip()
- if out_url_raw:
- out_url = self._url_with_token(out_url_raw, token) if token else out_url_raw
- await self._run_cmd(["git", "remote", "remove", "output"], cwd=str(repo_dir), ignore_errors=True)
- await self._run_cmd(["git", "remote", "add", "output", out_url], cwd=str(repo_dir))
+ logger.info(f"Validating workspace at {workspace}")
+
+ # Check what was hydrated
+ hydrated_paths = []
+ for path_name in [".claude", "artifacts", "file-uploads"]:
+ path_dir = workspace / path_name
+ if path_dir.exists():
+ file_count = len([f for f in path_dir.rglob("*") if f.is_file()])
+ if file_count > 0:
+ hydrated_paths.append(f"{path_name} ({file_count} files)")
+
+ if hydrated_paths:
+ logger.info(f"Hydrated from S3: {', '.join(hydrated_paths)}")
+ else:
+ logger.info("No state hydrated (fresh session)")
+
+ # No further preparation needed - init container did the work
- except Exception as e:
- logger.error(f"Failed to prepare multi-repo workspace: {e}")
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"Workspace preparation failed: {e}"}
- )
async def _validate_prerequisites(self):
"""Validate prerequisite files exist for phase-based slash commands."""
@@ -1086,14 +969,11 @@ async def _validate_prerequisites(self):
break
async def _initialize_workflow_if_set(self) -> AsyncIterator[BaseEvent]:
- """Initialize workflow on startup if ACTIVE_WORKFLOW env vars are set."""
+ """Validate workflow was cloned by init container."""
active_workflow_url = (os.getenv('ACTIVE_WORKFLOW_GIT_URL') or '').strip()
if not active_workflow_url:
return
- active_workflow_branch = (os.getenv('ACTIVE_WORKFLOW_BRANCH') or 'main').strip()
- active_workflow_path = (os.getenv('ACTIVE_WORKFLOW_PATH') or '').strip()
-
try:
owner, repo, _ = self._parse_owner_repo(active_workflow_url)
derived_name = repo or ''
@@ -1105,79 +985,24 @@ async def _initialize_workflow_if_set(self) -> AsyncIterator[BaseEvent]:
derived_name = (derived_name or '').removesuffix('.git').strip()
if not derived_name:
- logger.warning("Could not derive workflow name from URL, skipping initialization")
- return
-
- workflow_dir = Path(self.context.workspace_path) / "workflows" / derived_name
-
- if workflow_dir.exists():
- logger.info(f"Workflow {derived_name} already exists, skipping initialization")
+ logger.warning("Could not derive workflow name from URL")
return
- logger.info(f"Initializing workflow {derived_name} from CR spec on startup")
- async for event in self._clone_workflow_repository(active_workflow_url, active_workflow_branch, active_workflow_path, derived_name):
- yield event
+ # Check for the cloned workflow (hydrate.sh places it at workflows/<name>; also accept a -clone-temp dir)
+ workspace = Path(self.context.workspace_path)
+ workflow_temp_dir = workspace / "workflows" / f"{derived_name}-clone-temp"
+ workflow_dir = workspace / "workflows" / derived_name
+
+ if workflow_temp_dir.exists():
+ logger.info(f"Workflow {derived_name} cloned by init container at {workflow_temp_dir.name}")
+ elif workflow_dir.exists():
+ logger.info(f"Workflow {derived_name} available at {workflow_dir.name}")
+ else:
+ logger.warning(f"Workflow {derived_name} not found (init container may have failed to clone)")
except Exception as e:
- logger.error(f"Failed to initialize workflow on startup: {e}")
+ logger.error(f"Failed to validate workflow: {e}")
- async def _clone_workflow_repository(
- self, git_url: str, branch: str, path: str, workflow_name: str
- ) -> AsyncIterator[BaseEvent]:
- """Clone workflow repository."""
- workspace = Path(self.context.workspace_path)
- workflow_dir = workspace / "workflows" / workflow_name
- temp_clone_dir = workspace / "workflows" / f"{workflow_name}-clone-temp"
-
- if workflow_dir.exists():
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"✓ Workflow {workflow_name} already loaded"}
- )
- return
-
- token = await self._fetch_token_for_url(git_url)
-
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"📥 Cloning workflow {workflow_name}..."}
- )
-
- clone_url = self._url_with_token(git_url, token) if token else git_url
- await self._run_cmd(["git", "clone", "--branch", branch, "--single-branch", clone_url, str(temp_clone_dir)], cwd=str(workspace))
-
- if path and path.strip():
- subdir_path = temp_clone_dir / path.strip()
- if subdir_path.exists() and subdir_path.is_dir():
- shutil.copytree(subdir_path, workflow_dir)
- shutil.rmtree(temp_clone_dir)
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"✓ Extracted workflow from: {path}"}
- )
- else:
- temp_clone_dir.rename(workflow_dir)
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"⚠️ Path '{path}' not found, using full repository"}
- )
- else:
- temp_clone_dir.rename(workflow_dir)
-
- yield RawEvent(
- type=EventType.RAW,
- thread_id=self._current_thread_id or self.context.session_id,
- run_id=self._current_run_id or "init",
- event={"type": "system_log", "message": f"✅ Workflow {workflow_name} ready"}
- )
async def _run_cmd(self, cmd, cwd=None, capture_stdout=False, ignore_errors=False):
"""Run a subprocess command asynchronously."""
@@ -1457,9 +1282,10 @@ def _build_workspace_context_prompt(self, repos_cfg, workflow_name, artifacts_pa
if repos_cfg:
prompt += "## Available Code Repositories\n"
+ prompt += "Location: repos/\n"
for i, repo in enumerate(repos_cfg):
name = repo.get('name', f'repo-{i}')
- prompt += f"- {name}/\n"
+ prompt += f"- repos/{name}/\n"
prompt += "\nThese repositories contain source code you can read or modify.\n\n"
if ambient_config.get("systemPrompt"):
diff --git a/components/runners/claude-code-runner/main.py b/components/runners/claude-code-runner/main.py
index 7f14b1663..412ce70bd 100644
--- a/components/runners/claude-code-runner/main.py
+++ b/components/runners/claude-code-runner/main.py
@@ -97,17 +97,20 @@ async def lifespan(app: FastAPI):
logger.info("Adapter initialized - fresh client will be created for each run")
- # Check if this is a continuation (has parent session)
- # PARENT_SESSION_ID is set when continuing from another session
- parent_session_id = os.getenv("PARENT_SESSION_ID", "").strip()
+ # Check if this is a resume session via IS_RESUME env var
+ # This is set by the operator when restarting a stopped/completed/failed session
+ is_resume = os.getenv("IS_RESUME", "").strip().lower() == "true"
+ if is_resume:
+ logger.info("IS_RESUME=true - this is a resumed session, will skip INITIAL_PROMPT")
- # Check for INITIAL_PROMPT and auto-execute (only if no parent session)
+ # Check for INITIAL_PROMPT and auto-execute (only if not a resume)
initial_prompt = os.getenv("INITIAL_PROMPT", "").strip()
- if initial_prompt and not parent_session_id:
- logger.info(f"INITIAL_PROMPT detected ({len(initial_prompt)} chars), will auto-execute after 3s delay")
+ if initial_prompt and not is_resume:
+ delay = os.getenv("INITIAL_PROMPT_DELAY_SECONDS", "1")
+ logger.info(f"INITIAL_PROMPT detected ({len(initial_prompt)} chars), will auto-execute after {delay}s delay")
asyncio.create_task(auto_execute_initial_prompt(initial_prompt, session_id))
- elif initial_prompt:
- logger.info(f"INITIAL_PROMPT detected but has parent session ({parent_session_id[:12]}...) - skipping")
+ elif initial_prompt and is_resume:
+ logger.info("INITIAL_PROMPT detected but IS_RESUME=true - skipping (this is a resume)")
logger.info(f"AG-UI server ready for session {session_id}")
@@ -120,17 +123,19 @@ async def lifespan(app: FastAPI):
async def auto_execute_initial_prompt(prompt: str, session_id: str):
"""Auto-execute INITIAL_PROMPT by POSTing to backend after short delay.
- The 3-second delay gives the runner time to fully start. Backend has retry
- logic to handle if Service DNS isn't ready yet.
+ The delay gives the runner service time to register in DNS. Backend has retry
+ logic to handle if Service DNS isn't ready yet, so this can be short.
- Only called for fresh sessions (no PARENT_SESSION_ID set).
+ Only called for fresh sessions (IS_RESUME is not set).
"""
import uuid
import aiohttp
- # Give runner time to fully start before backend tries to reach us
- logger.info("Waiting 3s before auto-executing INITIAL_PROMPT (allow Service DNS to propagate)...")
- await asyncio.sleep(3)
+ # Configurable delay (default 1s, was 3s)
+ # Backend has retry logic, so we don't need to wait long
+ delay_seconds = float(os.getenv("INITIAL_PROMPT_DELAY_SECONDS", "1"))
+ logger.info(f"Waiting {delay_seconds}s before auto-executing INITIAL_PROMPT (allow Service DNS to propagate)...")
+ await asyncio.sleep(delay_seconds)
logger.info("Auto-executing INITIAL_PROMPT via backend POST...")
@@ -222,12 +227,10 @@ async def event_generator():
try:
logger.info("Event generator started")
- # Initialize adapter on first run (yields setup events)
+ # Initialize adapter on first run
if not _adapter_initialized:
logger.info("First run - initializing adapter with workspace preparation")
- async for event in adapter.initialize(context):
- logger.debug(f"Yielding initialization event: {event.type}")
- yield encoder.encode(event)
+ await adapter.initialize(context)
logger.info("Adapter initialization complete")
_adapter_initialized = True
@@ -283,6 +286,105 @@ async def interrupt_run():
raise HTTPException(status_code=500, detail=str(e))
+async def clone_workflow_at_runtime(git_url: str, branch: str, subpath: str) -> tuple[bool, str]:
+ """
+ Clone a workflow repository at runtime.
+
+ This mirrors the logic in hydrate.sh but runs when workflows are changed
+ after the pod has started.
+
+ Returns:
+ (success, workflow_dir_path) tuple
+ """
+ import tempfile
+ import shutil
+ from pathlib import Path
+
+ if not git_url:
+ return False, ""
+
+ # Derive workflow name from URL
+ workflow_name = git_url.split("/")[-1].removesuffix(".git")
+ workspace_path = os.getenv("WORKSPACE_PATH", "/workspace")
+ workflow_final = Path(workspace_path) / "workflows" / workflow_name
+
+ logger.info(f"Cloning workflow '{workflow_name}' from {git_url}@{branch}")
+ if subpath:
+ logger.info(f" Subpath: {subpath}")
+
+ # Create temp directory for clone
+ temp_dir = Path(tempfile.mkdtemp(prefix="workflow-clone-"))
+
+ try:
+ # Build git clone command with optional auth token
+ github_token = os.getenv("GITHUB_TOKEN", "").strip()
+ gitlab_token = os.getenv("GITLAB_TOKEN", "").strip()
+
+ # Determine which token to use based on URL
+ clone_url = git_url
+ if github_token and "github" in git_url.lower():
+ clone_url = git_url.replace("https://", f"https://x-access-token:{github_token}@")
+ logger.info("Using GITHUB_TOKEN for workflow authentication")
+ elif gitlab_token and "gitlab" in git_url.lower():
+ clone_url = git_url.replace("https://", f"https://oauth2:{gitlab_token}@")
+ logger.info("Using GITLAB_TOKEN for workflow authentication")
+
+ # Clone the repository
+ process = await asyncio.create_subprocess_exec(
+ "git", "clone", "--branch", branch, "--single-branch", "--depth", "1",
+ clone_url, str(temp_dir),
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE
+ )
+ stdout, stderr = await process.communicate()
+
+ if process.returncode != 0:
+ # Redact tokens from error message
+ error_msg = stderr.decode()
+ if github_token:
+ error_msg = error_msg.replace(github_token, "***REDACTED***")
+ if gitlab_token:
+ error_msg = error_msg.replace(gitlab_token, "***REDACTED***")
+ logger.error(f"Failed to clone workflow: {error_msg}")
+ return False, ""
+
+ logger.info("Clone successful, processing...")
+
+ # Handle subpath extraction
+ if subpath:
+ subpath_full = temp_dir / subpath
+ if subpath_full.exists() and subpath_full.is_dir():
+ logger.info(f"Extracting subpath: {subpath}")
+ # Remove existing workflow dir if exists
+ if workflow_final.exists():
+ shutil.rmtree(workflow_final)
+ # Create parent dirs and copy subpath
+ workflow_final.parent.mkdir(parents=True, exist_ok=True)
+ shutil.copytree(subpath_full, workflow_final)
+ logger.info(f"Workflow extracted to {workflow_final}")
+ else:
+ logger.warning(f"Subpath '{subpath}' not found, using entire repo")
+ if workflow_final.exists():
+ shutil.rmtree(workflow_final)
+ shutil.move(str(temp_dir), str(workflow_final))
+ else:
+ # No subpath - use entire repo
+ if workflow_final.exists():
+ shutil.rmtree(workflow_final)
+ shutil.move(str(temp_dir), str(workflow_final))
+
+ logger.info(f"Workflow '{workflow_name}' ready at {workflow_final}")
+ return True, str(workflow_final)
+
+ except Exception as e:
+ logger.error(f"Error cloning workflow: {e}")
+ return False, ""
+ finally:
+ # Cleanup temp directory if it still exists
+ if temp_dir.exists():
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+
@app.post("/workflow")
async def change_workflow(request: Request):
"""
@@ -302,6 +404,13 @@ async def change_workflow(request: Request):
logger.info(f"Workflow change request: {git_url}@{branch} (path: {path})")
+ # Clone the workflow repository at runtime
+ # This is needed because the init container only runs once at pod startup
+ if git_url:
+ success, workflow_path = await clone_workflow_at_runtime(git_url, branch, path)
+ if not success:
+ logger.warning("Failed to clone workflow, will use default workflow directory")
+
# Update environment variables
os.environ["ACTIVE_WORKFLOW_GIT_URL"] = git_url
os.environ["ACTIVE_WORKFLOW_BRANCH"] = branch
@@ -315,12 +424,106 @@ async def change_workflow(request: Request):
# Trigger a new run to greet user with workflow context
# This runs in background via backend POST
- import asyncio
asyncio.create_task(trigger_workflow_greeting(git_url, branch, path))
return {"message": "Workflow updated", "gitUrl": git_url, "branch": branch, "path": path}
+async def clone_repo_at_runtime(git_url: str, branch: str, name: str) -> tuple[bool, str]:
+ """
+ Clone a repository at runtime.
+
+ This mirrors the logic in hydrate.sh but runs when repos are added
+ after the pod has started.
+
+ Args:
+ git_url: Git repository URL
+ branch: Branch to clone
+ name: Name for the cloned directory (derived from URL if empty)
+
+ Returns:
+ (success, repo_dir_path) tuple
+ """
+ import tempfile
+ import shutil
+ from pathlib import Path
+
+ if not git_url:
+ return False, ""
+
+ # Derive repo name from URL if not provided
+ if not name:
+ name = git_url.split("/")[-1].removesuffix(".git")
+
+ # Repos are stored in /workspace/repos/{name} (matching hydrate.sh)
+ workspace_path = os.getenv("WORKSPACE_PATH", "/workspace")
+ repos_dir = Path(workspace_path) / "repos"
+ repos_dir.mkdir(parents=True, exist_ok=True)
+ repo_final = repos_dir / name
+
+ logger.info(f"Cloning repo '{name}' from {git_url}@{branch}")
+
+ # Skip if already cloned
+ if repo_final.exists():
+ logger.info(f"Repo '{name}' already exists at {repo_final}, skipping clone")
+ return True, str(repo_final)
+
+ # Create temp directory for clone
+ temp_dir = Path(tempfile.mkdtemp(prefix="repo-clone-"))
+
+ try:
+ # Build git clone command with optional auth token
+ github_token = os.getenv("GITHUB_TOKEN", "").strip()
+ gitlab_token = os.getenv("GITLAB_TOKEN", "").strip()
+
+ # Determine which token to use based on URL
+ clone_url = git_url
+ if github_token and "github" in git_url.lower():
+ # Add GitHub token to URL
+ clone_url = git_url.replace("https://", f"https://x-access-token:{github_token}@")
+ logger.info("Using GITHUB_TOKEN for authentication")
+ elif gitlab_token and "gitlab" in git_url.lower():
+ # Add GitLab token to URL
+ clone_url = git_url.replace("https://", f"https://oauth2:{gitlab_token}@")
+ logger.info("Using GITLAB_TOKEN for authentication")
+
+ # Clone the repository
+ process = await asyncio.create_subprocess_exec(
+ "git", "clone", "--branch", branch, "--single-branch", "--depth", "1",
+ clone_url, str(temp_dir),
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE
+ )
+ stdout, stderr = await process.communicate()
+
+ if process.returncode != 0:
+ # Redact tokens from error message
+ error_msg = stderr.decode()
+ if github_token:
+ error_msg = error_msg.replace(github_token, "***REDACTED***")
+ if gitlab_token:
+ error_msg = error_msg.replace(gitlab_token, "***REDACTED***")
+ logger.error(f"Failed to clone repo: {error_msg}")
+ return False, ""
+
+ logger.info("Clone successful, moving to final location...")
+
+ # Move to final location
+ repo_final.parent.mkdir(parents=True, exist_ok=True)
+ shutil.move(str(temp_dir), str(repo_final))
+
+ logger.info(f"Repo '{name}' ready at {repo_final}")
+ return True, str(repo_final)
+
+ except Exception as e:
+ logger.error(f"Error cloning repo: {e}")
+ return False, ""
+ finally:
+ # Cleanup temp directory if it still exists
+ if temp_dir.exists():
+ shutil.rmtree(temp_dir, ignore_errors=True)
+
+
async def trigger_workflow_greeting(git_url: str, branch: str, path: str):
"""Trigger workflow greeting after workflow change."""
import uuid
@@ -385,7 +588,7 @@ async def trigger_workflow_greeting(git_url: str, branch: str, path: str):
@app.post("/repos/add")
async def add_repo(request: Request):
"""
- Add repository - triggers Claude SDK client restart.
+ Add repository - clones repo and triggers Claude SDK client restart.
Accepts: {"url": "...", "branch": "...", "name": "..."}
"""
@@ -395,7 +598,23 @@ async def add_repo(request: Request):
raise HTTPException(status_code=503, detail="Adapter not initialized")
body = await request.json()
- logger.info(f"Add repo request: {body}")
+ url = body.get("url", "")
+ branch = body.get("branch", "main")
+ name = body.get("name", "")
+
+ logger.info(f"Add repo request: url={url}, branch={branch}, name={name}")
+
+ if not url:
+ raise HTTPException(status_code=400, detail="Repository URL is required")
+
+ # Derive name from URL if not provided
+ if not name:
+ name = url.split("/")[-1].removesuffix(".git")
+
+ # Clone the repository at runtime
+ success, repo_path = await clone_repo_at_runtime(url, branch, name)
+ if not success:
+ raise HTTPException(status_code=500, detail=f"Failed to clone repository: {url}")
# Update REPOS_JSON env var
repos_json = os.getenv("REPOS_JSON", "[]")
@@ -406,22 +625,81 @@ async def add_repo(request: Request):
# Add new repo
repos.append({
- "name": body.get("name", ""),
+ "name": name,
"input": {
- "url": body.get("url", ""),
- "branch": body.get("branch", "main")
+ "url": url,
+ "branch": branch
}
})
os.environ["REPOS_JSON"] = json.dumps(repos)
- # Reset adapter state
+ # Reset adapter state to force reinitialization on next run
_adapter_initialized = False
adapter._first_run = True
- logger.info(f"Repo added, adapter will reinitialize on next run")
+ logger.info(f"Repo '{name}' added and cloned, adapter will reinitialize on next run")
+
+ # Trigger a notification to Claude about the new repository
+ asyncio.create_task(trigger_repo_added_notification(name, url))
+
+ return {"message": "Repository added", "name": name, "path": repo_path}
+
+
+async def trigger_repo_added_notification(repo_name: str, repo_url: str):
+ """Notify Claude that a repository has been added."""
+ import uuid
+ import aiohttp
+
+ # Wait a moment for repo to be fully ready
+ await asyncio.sleep(1)
+
+ logger.info(f"Triggering repo added notification for: {repo_name}")
+
+ try:
+ backend_url = os.getenv("BACKEND_API_URL", "").rstrip("/")
+ project_name = os.getenv("AGENTIC_SESSION_NAMESPACE", "").strip()
+ session_id = context.session_id if context else "unknown"
+
+ if not backend_url or not project_name:
+ logger.error("Cannot trigger repo notification: BACKEND_API_URL or PROJECT_NAME not set")
+ return
+
+ url = f"{backend_url}/projects/{project_name}/agentic-sessions/{session_id}/agui/run"
+
+ notification = f"The repository '{repo_name}' has been added to your workspace. You can now access it at the path 'repos/{repo_name}/'. Please acknowledge this to the user and let them know you can now read and work with files in this repository."
+
+ payload = {
+ "threadId": session_id,
+ "runId": str(uuid.uuid4()),
+ "messages": [{
+ "id": str(uuid.uuid4()),
+ "role": "user",
+ "content": notification,
+ "metadata": {
+ "hidden": True,
+ "autoSent": True,
+ "source": "repo_added"
+ }
+ }]
+ }
+
+ bot_token = os.getenv("BOT_TOKEN", "").strip()
+ headers = {"Content-Type": "application/json"}
+ if bot_token:
+ headers["Authorization"] = f"Bearer {bot_token}"
+
+ async with aiohttp.ClientSession() as session:
+ async with session.post(url, json=payload, headers=headers) as resp:
+ if resp.status == 200:
+ result = await resp.json()
+ logger.info(f"Repo notification sent: {result}")
+ else:
+ error_text = await resp.text()
+ logger.error(f"Repo notification failed: {resp.status} - {error_text}")
- return {"message": "Repository added"}
+ except Exception as e:
+ logger.error(f"Failed to trigger repo notification: {e}")
@app.post("/repos/remove")
diff --git a/components/runners/state-sync/Dockerfile b/components/runners/state-sync/Dockerfile
new file mode 100644
index 000000000..b0214ff6a
--- /dev/null
+++ b/components/runners/state-sync/Dockerfile
@@ -0,0 +1,21 @@
+FROM alpine:3.19
+
+# Install rclone, git, and utilities
+RUN apk add --no-cache \
+ rclone \
+ git \
+ bash \
+ curl \
+ jq \
+ ca-certificates
+
+# Copy scripts
+COPY hydrate.sh /usr/local/bin/hydrate.sh
+COPY sync.sh /usr/local/bin/sync.sh
+
+# Make scripts executable
+RUN chmod +x /usr/local/bin/hydrate.sh /usr/local/bin/sync.sh
+
+# Default to sync.sh (used by sidecar)
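+# The hydration init container is expected to override this entrypoint and run
+# /usr/local/bin/hydrate.sh instead; the sidecar keeps the default.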
+ENTRYPOINT ["/usr/local/bin/sync.sh"]
+
diff --git a/components/runners/state-sync/hydrate.sh b/components/runners/state-sync/hydrate.sh
new file mode 100644
index 000000000..4c33d2ada
--- /dev/null
+++ b/components/runners/state-sync/hydrate.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+# hydrate.sh - Init container script to download session state from S3
+
+set -e
+
+# Configuration from environment
+S3_ENDPOINT="${S3_ENDPOINT:-http://minio.ambient-code.svc:9000}"
+S3_BUCKET="${S3_BUCKET:-ambient-sessions}"
+NAMESPACE="${NAMESPACE:-default}"
+SESSION_NAME="${SESSION_NAME:-unknown}"
+
+# Sanitize inputs to prevent path traversal
+NAMESPACE="${NAMESPACE//[^a-zA-Z0-9-]/}"
+SESSION_NAME="${SESSION_NAME//[^a-zA-Z0-9-]/}"
+
+# Paths to sync (must match sync.sh)
+# Note: .claude uses /app/.claude (SubPath mount), others use /workspace
+SYNC_PATHS=(
+ "artifacts"
+ "file-uploads"
+)
+CLAUDE_DATA_PATH="/app/.claude"
+
+# Error handler
+error_exit() {
+ echo "ERROR: $1" >&2
+ exit 1
+}
+
+# Configure rclone for S3
+setup_rclone() {
+ # Use explicit /tmp path since HOME may not be set in container
+ mkdir -p /tmp/.config/rclone || error_exit "Failed to create rclone config directory"
+ cat > /tmp/.config/rclone/rclone.conf << EOF
+[s3]
+type = s3
+provider = Other
+access_key_id = ${AWS_ACCESS_KEY_ID}
+secret_access_key = ${AWS_SECRET_ACCESS_KEY}
+endpoint = ${S3_ENDPOINT}
+acl = private
+EOF
+ if [ $? -ne 0 ]; then
+ error_exit "Failed to write rclone configuration"
+ fi
+ # Protect config file with credentials
+ chmod 600 /tmp/.config/rclone/rclone.conf || error_exit "Failed to secure rclone config"
+}
+
+echo "========================================="
+echo "Ambient Code Session State Hydration"
+echo "========================================="
+echo "Session: ${NAMESPACE}/${SESSION_NAME}"
+echo "S3 Endpoint: ${S3_ENDPOINT}"
+echo "S3 Bucket: ${S3_BUCKET}"
+echo "========================================="
+
+# Create workspace structure
+echo "Creating workspace structure..."
+# .claude is mounted at /app/.claude via SubPath (same location as runner container)
+mkdir -p "${CLAUDE_DATA_PATH}" || error_exit "Failed to create .claude directory"
+mkdir -p /workspace/artifacts || error_exit "Failed to create artifacts directory"
+mkdir -p /workspace/file-uploads || error_exit "Failed to create file-uploads directory"
+mkdir -p /workspace/repos || error_exit "Failed to create repos directory"
+
+# Set permissions on created directories (not root workspace which may be owned by different user)
+# Use 755 instead of 777 - readable by all, writable only by owner
+chmod 755 "${CLAUDE_DATA_PATH}" /workspace/artifacts /workspace/file-uploads /workspace/repos 2>/dev/null || true
+
+# Check if S3 is configured
+if [ -z "${S3_ENDPOINT}" ] || [ -z "${S3_BUCKET}" ] || [ -z "${AWS_ACCESS_KEY_ID}" ] || [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then
+ echo "S3 not configured - using ephemeral storage only (no state persistence)"
+ echo "========================================="
+ exit 0
+fi
+
+# Setup rclone
+echo "Setting up rclone..."
+setup_rclone
+
+S3_PATH="s3:${S3_BUCKET}/${NAMESPACE}/${SESSION_NAME}"
+
+# Test S3 connection
+echo "Testing S3 connection..."
+if ! rclone --config /tmp/.config/rclone/rclone.conf lsd "s3:${S3_BUCKET}/" --max-depth 1 2>&1; then
+ error_exit "Failed to connect to S3 at ${S3_ENDPOINT}. Check endpoint and credentials."
+fi
+echo "S3 connection successful"
+
+# Check if session state exists in S3
+echo "Checking for existing session state in S3..."
+if rclone --config /tmp/.config/rclone/rclone.conf lsf "${S3_PATH}/" 2>/dev/null | grep -q .; then
+ echo "Found existing session state, downloading from S3..."
+
+ # Download .claude data to /app/.claude (SubPath mount matches runner container)
+ if rclone --config /tmp/.config/rclone/rclone.conf lsf "${S3_PATH}/.claude/" 2>/dev/null | grep -q .; then
+ echo " Downloading .claude/..."
+ rclone --config /tmp/.config/rclone/rclone.conf copy "${S3_PATH}/.claude/" "${CLAUDE_DATA_PATH}/" \
+ --copy-links \
+ --transfers 8 \
+ --fast-list \
+ --progress 2>&1 || echo " Warning: failed to download .claude"
+ else
+ echo " No data for .claude/"
+ fi
+
+ # Download other sync paths to /workspace
+ for path in "${SYNC_PATHS[@]}"; do
+ if rclone --config /tmp/.config/rclone/rclone.conf lsf "${S3_PATH}/${path}/" 2>/dev/null | grep -q .; then
+ echo " Downloading ${path}/..."
+ rclone --config /tmp/.config/rclone/rclone.conf copy "${S3_PATH}/${path}/" "/workspace/${path}/" \
+ --copy-links \
+ --transfers 8 \
+ --fast-list \
+ --progress 2>&1 || echo " Warning: failed to download ${path}"
+ else
+ echo " No data for ${path}/"
+ fi
+ done
+
+ echo "State hydration complete!"
+else
+ echo "No existing state found, starting fresh session"
+fi
+
+# Set permissions on subdirectories (EmptyDir root may not be chmodable)
+echo "Setting permissions on subdirectories..."
+chmod -R 755 "${CLAUDE_DATA_PATH}" /workspace/artifacts /workspace/file-uploads /workspace/repos 2>/dev/null || true
+
+# ========================================
+# Clone repositories and workflows
+# ========================================
+echo "========================================="
+echo "Setting up repositories and workflows..."
+echo "========================================="
+
+# Disable errexit for git clones (failures are non-fatal for private repos without auth)
+set +e
+
+# Set HOME for git config (alpine doesn't set it by default)
+export HOME=/tmp
+
+# Git identity
+GIT_USER_NAME="${GIT_USER_NAME:-Ambient Code Bot}"
+GIT_USER_EMAIL="${GIT_USER_EMAIL:-bot@ambient-code.local}"
+git config --global user.name "$GIT_USER_NAME" || echo "Warning: failed to set git user.name"
+git config --global user.email "$GIT_USER_EMAIL" || echo "Warning: failed to set git user.email"
+
+# Mark workspace as safe (in case runner needs it)
+git config --global --add safe.directory /workspace 2>/dev/null || true
+
+# Clone repos from REPOS_JSON
+if [ -n "$REPOS_JSON" ] && [ "$REPOS_JSON" != "null" ] && [ "$REPOS_JSON" != "" ]; then
+ echo "Cloning repositories from spec..."
+ # Parse JSON array and clone each repo
+ REPO_COUNT=$(echo "$REPOS_JSON" | jq -e 'if type == "array" then length else 0 end' 2>/dev/null || echo "0")
+ echo "Found $REPO_COUNT repositories to clone"
+ if [ "$REPO_COUNT" -gt 0 ]; then
+ i=0
+ while [ $i -lt $REPO_COUNT ]; do
+ # Repos may be flat ({url, branch}) or nested under "input" (the format the runner stores)
+ REPO_URL=$(echo "$REPOS_JSON" | jq -r ".[$i].input.url // .[$i].url // empty" 2>/dev/null || echo "")
+ REPO_BRANCH=$(echo "$REPOS_JSON" | jq -r ".[$i].input.branch // .[$i].branch // \"main\"" 2>/dev/null || echo "main")
+
+ # Prefer the configured repo name; otherwise derive it from the URL
+ REPO_NAME=$(echo "$REPOS_JSON" | jq -r ".[$i].name // empty" 2>/dev/null || echo "")
+ [ -z "$REPO_NAME" ] && REPO_NAME=$(basename "$REPO_URL" .git 2>/dev/null || echo "")
+
+ if [ -n "$REPO_NAME" ] && [ -n "$REPO_URL" ] && [ "$REPO_URL" != "null" ]; then
+ REPO_DIR="/workspace/repos/$REPO_NAME"
+ echo " Cloning $REPO_NAME (branch: $REPO_BRANCH)..."
+
+ # Mark repo directory as safe
+ git config --global --add safe.directory "$REPO_DIR" 2>/dev/null || true
+
+ # Clone repository (for private repos, runner will handle token injection)
+ if git clone --branch "$REPO_BRANCH" --single-branch "$REPO_URL" "$REPO_DIR" 2>&1; then
+ echo " ✓ Cloned $REPO_NAME"
+ else
+ echo " ⚠ Failed to clone $REPO_NAME (may require authentication)"
+ fi
+ fi
+ i=$((i + 1))
+ done
+ fi
+else
+ echo "No repositories configured in spec"
+fi
+
+# Clone workflow repository
+if [ -n "$ACTIVE_WORKFLOW_GIT_URL" ] && [ "$ACTIVE_WORKFLOW_GIT_URL" != "null" ]; then
+ WORKFLOW_BRANCH="${ACTIVE_WORKFLOW_BRANCH:-main}"
+ WORKFLOW_PATH="${ACTIVE_WORKFLOW_PATH:-}"
+
+ echo "Cloning workflow repository..."
+ echo " URL: $ACTIVE_WORKFLOW_GIT_URL"
+ echo " Branch: $WORKFLOW_BRANCH"
+ if [ -n "$WORKFLOW_PATH" ]; then
+ echo " Subpath: $WORKFLOW_PATH"
+ fi
+
+ # Derive workflow name from URL
+ WORKFLOW_NAME=$(basename "$ACTIVE_WORKFLOW_GIT_URL" .git)
+ WORKFLOW_FINAL="/workspace/workflows/${WORKFLOW_NAME}"
+ WORKFLOW_TEMP="/tmp/workflow-clone-$$"
+
+ git config --global --add safe.directory "$WORKFLOW_FINAL" 2>/dev/null || true
+
+ # Clone to temp location
+ if git clone --branch "$WORKFLOW_BRANCH" --single-branch "$ACTIVE_WORKFLOW_GIT_URL" "$WORKFLOW_TEMP" 2>&1; then
+ echo " Clone successful, processing..."
+
+ # Extract subpath if specified
+ if [ -n "$WORKFLOW_PATH" ]; then
+ SUBPATH_FULL="$WORKFLOW_TEMP/$WORKFLOW_PATH"
+ echo " Checking for subpath: $SUBPATH_FULL"
+ ls -la "$SUBPATH_FULL" 2>&1 || echo " Subpath does not exist"
+
+ if [ -d "$SUBPATH_FULL" ]; then
+ echo " Extracting subpath: $WORKFLOW_PATH"
+ mkdir -p "$(dirname "$WORKFLOW_FINAL")"
+ cp -r "$SUBPATH_FULL" "$WORKFLOW_FINAL"
+ rm -rf "$WORKFLOW_TEMP"
+ echo " ✓ Workflow extracted from subpath to /workspace/workflows/${WORKFLOW_NAME}"
+ else
+ echo " ⚠ Subpath '$WORKFLOW_PATH' not found in cloned repo"
+ echo " Available paths in repo:"
+ find "$WORKFLOW_TEMP" -maxdepth 3 -type d | head -10
+ echo " Using entire repo instead"
+ mv "$WORKFLOW_TEMP" "$WORKFLOW_FINAL"
+ echo " ✓ Workflow ready at /workspace/workflows/${WORKFLOW_NAME}"
+ fi
+ else
+ # No subpath - use entire repo
+ mv "$WORKFLOW_TEMP" "$WORKFLOW_FINAL"
+ echo " ✓ Workflow ready at /workspace/workflows/${WORKFLOW_NAME}"
+ fi
+ else
+ echo " ⚠ Failed to clone workflow"
+ rm -rf "$WORKFLOW_TEMP" 2>/dev/null || true
+ fi
+fi
+
+echo "========================================="
+echo "Workspace initialized successfully"
+echo "========================================="
+exit 0
+
diff --git a/components/runners/state-sync/sync.sh b/components/runners/state-sync/sync.sh
new file mode 100644
index 000000000..401ef30d1
--- /dev/null
+++ b/components/runners/state-sync/sync.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+# sync.sh - Sidecar script to sync session state to S3 every N seconds
+
+set -e
+
+# Configuration from environment
+S3_ENDPOINT="${S3_ENDPOINT:-http://minio.ambient-code.svc:9000}"
+S3_BUCKET="${S3_BUCKET:-ambient-sessions}"
+NAMESPACE="${NAMESPACE:-default}"
+SESSION_NAME="${SESSION_NAME:-unknown}"
+SYNC_INTERVAL="${SYNC_INTERVAL:-60}"
+MAX_SYNC_SIZE="${MAX_SYNC_SIZE:-1073741824}" # 1GB default
+
+# Sanitize inputs to prevent path traversal
+NAMESPACE="${NAMESPACE//[^a-zA-Z0-9-]/}"
+SESSION_NAME="${SESSION_NAME//[^a-zA-Z0-9-]/}"
+
+# Paths to sync (non-git content)
+# Note: .claude uses /app/.claude (SubPath mount), others use /workspace
+SYNC_PATHS=(
+ "artifacts"
+ "file-uploads"
+)
+CLAUDE_DATA_PATH="/app/.claude"
+
+# Patterns to exclude from sync
+EXCLUDE_PATTERNS=(
+ "repos/**" # Git handles this
+ "node_modules/**"
+ ".venv/**"
+ "__pycache__/**"
+ ".cache/**"
+ "*.pyc"
+ "target/**"
+ "dist/**"
+ "build/**"
+ ".git/**"
+ ".claude/debug/**" # Debug logs with symlinks that break rclone
+)
+
+# Configure rclone for S3
+setup_rclone() {
+ # Use explicit /tmp path since HOME may not be set in container
+ mkdir -p /tmp/.config/rclone
+ cat > /tmp/.config/rclone/rclone.conf << EOF
+[s3]
+type = s3
+provider = Other
+access_key_id = ${AWS_ACCESS_KEY_ID}
+secret_access_key = ${AWS_SECRET_ACCESS_KEY}
+endpoint = ${S3_ENDPOINT}
+acl = private
+EOF
+ # Protect config file with credentials
+ chmod 600 /tmp/.config/rclone/rclone.conf
+}
+
+# Check total size before sync
+check_size() {
+ local total=0
+
+ # Check .claude directory size (at /app/.claude via SubPath)
+ if [ -d "${CLAUDE_DATA_PATH}" ]; then
+ size=$(du -sb "${CLAUDE_DATA_PATH}" 2>/dev/null | cut -f1 || echo 0)
+ total=$((total + size))
+ fi
+
+ # Check other paths in /workspace
+ for path in "${SYNC_PATHS[@]}"; do
+ if [ -d "/workspace/${path}" ]; then
+ size=$(du -sb "/workspace/${path}" 2>/dev/null | cut -f1 || echo 0)
+ total=$((total + size))
+ fi
+ done
+
+ if [ $total -gt $MAX_SYNC_SIZE ]; then
+ echo "WARNING: Sync size (${total} bytes) exceeds limit (${MAX_SYNC_SIZE} bytes)"
+ echo "Some files may be skipped"
+ return 1
+ fi
+ return 0
+}
+
+# Sync workspace state to S3
+sync_to_s3() {
+ local s3_path="s3:${S3_BUCKET}/${NAMESPACE}/${SESSION_NAME}"
+
+ echo "[$(date -Iseconds)] Starting sync to S3..."
+
+ local synced=0
+
+ # Sync .claude data from /app/.claude (SubPath mount matches runner container)
+ if [ -d "${CLAUDE_DATA_PATH}" ]; then
+ echo " Syncing .claude/..."
+ if rclone --config /tmp/.config/rclone/rclone.conf sync "${CLAUDE_DATA_PATH}" "${s3_path}/.claude/" \
+ --checksum \
+ --copy-links \
+ --transfers 4 \
+ --fast-list \
+ --stats-one-line \
+ --max-size ${MAX_SYNC_SIZE} \
+ $(printf -- '--exclude %s ' "${EXCLUDE_PATTERNS[@]}") \
+ 2>&1; then
+ synced=$((synced + 1))
+ else
+ echo " Warning: sync of .claude had errors"
+ fi
+ fi
+
+ # Sync other paths from /workspace
+ for path in "${SYNC_PATHS[@]}"; do
+ if [ -d "/workspace/${path}" ]; then
+ echo " Syncing ${path}/..."
+ if rclone --config /tmp/.config/rclone/rclone.conf sync "/workspace/${path}" "${s3_path}/${path}/" \
+ --checksum \
+ --copy-links \
+ --transfers 4 \
+ --fast-list \
+ --stats-one-line \
+ --max-size ${MAX_SYNC_SIZE} \
+ $(printf -- '--exclude %s ' "${EXCLUDE_PATTERNS[@]}") \
+ 2>&1; then
+ synced=$((synced + 1))
+ else
+ echo " Warning: sync of ${path} had errors"
+ fi
+ fi
+ done
+
+ # Save metadata
+ echo "{\"lastSync\": \"$(date -Iseconds)\", \"session\": \"${SESSION_NAME}\", \"namespace\": \"${NAMESPACE}\", \"pathsSynced\": ${synced}}" > /tmp/metadata.json
+ rclone --config /tmp/.config/rclone/rclone.conf copy /tmp/metadata.json "${s3_path}/" 2>&1 || true
+
+ echo "[$(date -Iseconds)] Sync complete (${synced} paths synced)"
+}
+
+# Final sync on shutdown
+final_sync() {
+ echo ""
+ echo "========================================="
+ echo "[$(date -Iseconds)] SIGTERM received, performing final sync..."
+ echo "========================================="
+ sync_to_s3
+ echo "========================================="
+ echo "[$(date -Iseconds)] Final sync complete, exiting"
+ echo "========================================="
+ exit 0
+}
+
+# Main
+echo "========================================="
+echo "Ambient Code State Sync Sidecar"
+echo "========================================="
+echo "Session: ${NAMESPACE}/${SESSION_NAME}"
+echo "S3 Endpoint: ${S3_ENDPOINT}"
+echo "S3 Bucket: ${S3_BUCKET}"
+echo "Sync interval: ${SYNC_INTERVAL}s"
+echo "Max sync size: ${MAX_SYNC_SIZE} bytes"
+echo "========================================="
+
+# Check if S3 is configured
+if [ -z "${S3_ENDPOINT}" ] || [ -z "${S3_BUCKET}" ] || [ -z "${AWS_ACCESS_KEY_ID}" ] || [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then
+ echo "S3 not configured - state sync disabled (ephemeral storage only)"
+ echo "Session will not persist across pod restarts"
+ echo "========================================="
+ # Sleep forever - keep sidecar alive but do nothing
+ while true; do
+ sleep 3600
+ done
+fi
+
+setup_rclone
+trap 'final_sync' SIGTERM SIGINT
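+# Note: final_sync runs during the pod's termination grace period, so the session
+# pod needs a terminationGracePeriodSeconds long enough for the last upload to finish.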
+
+# Initial delay to let workspace populate
+echo "Waiting 30s for workspace to populate..."
+sleep 30
+
+# Main sync loop
+while true; do
+ check_size || echo "Size check warning (continuing anyway)"
+ sync_to_s3 || echo "Sync failed, will retry in ${SYNC_INTERVAL}s..."
+ sleep ${SYNC_INTERVAL}
+done
+
diff --git a/docs/minio-quickstart.md b/docs/minio-quickstart.md
new file mode 100644
index 000000000..26fbe2a5c
--- /dev/null
+++ b/docs/minio-quickstart.md
@@ -0,0 +1,297 @@
+# MinIO Quickstart for Ambient Code
+
+## Overview
+
+MinIO provides in-cluster S3-compatible storage for Ambient Code session state, artifacts, and uploads. This guide shows you how to deploy and configure MinIO.
+
+## Quick Setup
+
+### 1. Deploy MinIO
+
+```bash
+# Create MinIO credentials secret
+cd components/manifests/base
+cp minio-credentials-secret.yaml.example minio-credentials-secret.yaml
+
+# Edit credentials (change admin/changeme123 to secure values)
+vi minio-credentials-secret.yaml
+
+# Apply the secret
+kubectl apply -f minio-credentials-secret.yaml -n ambient-code
+
+# MinIO deployment is included in base manifests, so deploy normally
+make deploy NAMESPACE=ambient-code
+```
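+
+Before moving on, it can be worth confirming the MinIO pieces actually came up (a quick verification sketch; adjust the namespace if yours differs):
+
+```bash
+# Deployment, Service, and data PVC from the base manifests
+kubectl get deployment/minio service/minio pvc/minio-data -n ambient-code
+kubectl rollout status deployment/minio -n ambient-code --timeout=120s
+```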
+
+### 2. Create Bucket
+
+```bash
+# Run automated setup
+make setup-minio NAMESPACE=ambient-code
+
+# Or manually:
+kubectl port-forward svc/minio 9001:9001 -n ambient-code &
+open http://localhost:9001
+# Login with credentials, create bucket "ambient-sessions"
+```
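+
+If you prefer to stay on the CLI, the bucket can be created with `mc` inside the MinIO pod. This sketches what `make setup-minio` does; replace `admin`/`changeme123` with your actual root credentials:
+
+```bash
+MINIO_POD=$(kubectl get pod -l app=minio -n ambient-code -o jsonpath='{.items[0].metadata.name}')
+# Point the in-pod mc client at the local API, then create the bucket
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc alias set local http://localhost:9000 admin changeme123
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc mb local/ambient-sessions
+```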
+
+### 3. Configure Project
+
+Navigate to project settings in the UI and configure:
+
+| Field | Value |
+|-------|-------|
+| **Enable S3 Storage** | ✅ Checked |
+| **S3_ENDPOINT** | `http://minio.ambient-code.svc:9000` |
+| **S3_BUCKET** | `ambient-sessions` |
+| **S3_REGION** | `us-east-1` (not used by MinIO, but a required field) |
+| **S3_ACCESS_KEY** | Your MinIO root user |
+| **S3_SECRET_KEY** | Your MinIO root password |
+
+Click **Save Integration Secrets**.
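+
+New sessions in this project should then start with these S3 settings available to the state-sync containers as environment variables. One way to spot-check (a sketch: the init container name `init-hydrate` comes from the troubleshooting section below; `{session-pod}` and `{namespace}` are placeholders):
+
+```bash
+# Show the S3-related environment passed to the hydration init container
+kubectl get pod {session-pod} -n {namespace} -o json \
+  | jq '.spec.initContainers[]? | select(.name == "init-hydrate") | .env'
+```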
+
+## Accessing MinIO Console
+
+### Option 1: Port Forward
+
+```bash
+make minio-console NAMESPACE=ambient-code
+# Opens at http://localhost:9001
+```
+
+### Option 2: Create Route (OpenShift)
+
+```bash
+oc create route edge minio-console \
+ --service=minio \
+ --port=9001 \
+ -n ambient-code
+
+# Get URL
+oc get route minio-console -n ambient-code -o jsonpath='{.spec.host}'
+```
+
+## Viewing Session Artifacts
+
+### Via MinIO Console
+
+1. Open MinIO console: `make minio-console`
+2. Navigate to "Buckets" → "ambient-sessions"
+3. Browse: `{namespace}/{session-name}/`
+ - `.claude/` - Session history
+ - `artifacts/` - Generated files
+ - `file-uploads/` - User uploads
+
+### Via MinIO Client (mc)
+
+```bash
+# Install mc
+brew install minio/stable/mc
+
+# Configure alias
+kubectl port-forward svc/minio 9000:9000 -n ambient-code &
+mc alias set ambient http://localhost:9000 admin changeme123
+
+# List sessions
+mc ls ambient/ambient-sessions/
+
+# List session artifacts
+mc ls ambient/ambient-sessions/my-project/session-abc/artifacts/
+
+# Download artifacts
+mc cp --recursive ambient/ambient-sessions/my-project/session-abc/artifacts/ ./local-dir/
+
+# Download session history
+mc cp --recursive ambient/ambient-sessions/my-project/session-abc/.claude/ ./.claude/
+```
+
+### Via kubectl exec
+
+```bash
+# Get MinIO pod
+MINIO_POD=$(kubectl get pod -l app=minio -n ambient-code -o jsonpath='{.items[0].metadata.name}')
+
+# List sessions
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc ls local/ambient-sessions/
+
+# Download file
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc cp "local/ambient-sessions/my-project/session-abc/artifacts/report.pdf" /tmp/
+kubectl cp "ambient-code/${MINIO_POD}:/tmp/report.pdf" ./report.pdf
+```
+
+## Management Commands
+
+```bash
+# Check MinIO status
+make minio-status NAMESPACE=ambient-code
+
+# View MinIO logs
+make minio-logs NAMESPACE=ambient-code
+
+# Port forward to MinIO API (for mc commands)
+kubectl port-forward svc/minio 9000:9000 -n ambient-code
+```
+
+## Bucket Lifecycle Management
+
+### Set Auto-Delete Policy
+
+Keep storage costs down by auto-deleting old sessions:
+
+```bash
+# Create lifecycle policy
+cat > /tmp/lifecycle.json << 'EOF'
+{
+ "Rules": [
+ {
+ "ID": "expire-old-sessions",
+ "Status": "Enabled",
+ "Expiration": {
+ "Days": 30
+ }
+ }
+ ]
+}
+EOF
+
+# Apply policy
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc ilm import "local/ambient-sessions" /tmp/lifecycle.json
+```
+
+### Monitor Storage Usage
+
+```bash
+# Check bucket size
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc du local/ambient-sessions
+
+# List largest sessions
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc du --depth 2 local/ambient-sessions | sort -n -r | head -10
+```
+
+## Backup and Restore
+
+### Backup MinIO Data
+
+```bash
+# Backup to local directory
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc mirror local/ambient-sessions /tmp/backup/
+kubectl cp "ambient-code/${MINIO_POD}:/tmp/backup" ./minio-backup/
+
+# Or use external mc client
+mc mirror ambient/ambient-sessions ./minio-backup/
+```
+
+### Restore from Backup
+
+```bash
+# Copy backup to pod
+kubectl cp ./minio-backup/ "ambient-code/${MINIO_POD}:/tmp/restore"
+
+# Restore
+kubectl exec -n ambient-code "${MINIO_POD}" -- mc mirror /tmp/restore local/ambient-sessions
+```
+
+## Troubleshooting
+
+### MinIO Pod Not Starting
+
+```bash
+# Check events
+kubectl get events -n ambient-code --sort-by='.lastTimestamp' | grep minio
+
+# Check PVC
+kubectl get pvc minio-data -n ambient-code
+
+# Check pod logs
+kubectl logs -f deployment/minio -n ambient-code
+```
+
+### Can't Access MinIO Console
+
+```bash
+# Check service
+kubectl get svc minio -n ambient-code
+
+# Test connection from within cluster
+kubectl run -it --rm debug --image=curlimages/curl --restart=Never -n ambient-code -- \
+ curl -v http://minio.ambient-code.svc:9000/minio/health/live
+```
+
+### Session Init Failing
+
+```bash
+# Check session pod init container logs
+kubectl logs {session-pod} -c init-hydrate -n {namespace}
+
+# Common issues:
+# - Wrong S3 endpoint (check project settings)
+# - Bucket doesn't exist (create in MinIO console)
+# - Wrong credentials (verify in project settings)
+```
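+
+If hydration looks fine but state still does not persist across restarts, check the sync sidecar next (a sketch; the sidecar container name is an assumption, confirm it against your session pod spec):
+
+```bash
+# sync.sh logs each periodic sync and the final sync triggered by SIGTERM
+kubectl logs {session-pod} -c state-sync -n {namespace} --tail=50
+```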
+
+## Production Considerations
+
+### High Availability
+
+For production, deploy MinIO in distributed mode:
+
+```bash
+# Use MinIO Operator
+kubectl apply -k "github.com/minio/operator"
+```
+
+if kubectl get secret minio-credentials -n "${NAMESPACE}" >/dev/null 2>&1; then
+ MINIO_USER=$(kubectl get secret minio-credentials -n "${NAMESPACE}" -o jsonpath='{.data.root-user}' | base64 -d)
+ MINIO_PASSWORD=$(kubectl get secret minio-credentials -n "${NAMESPACE}" -o jsonpath='{.data.root-password}' | base64 -d)
+else
+ echo "ERROR: minio-credentials secret not found in namespace ${NAMESPACE}"
+ echo "Please create it first:"
+ echo " 1. Copy components/manifests/base/minio-credentials-secret.yaml.example to minio-credentials-secret.yaml"
+ echo " 2. Edit with secure credentials"
+ echo " 3. kubectl apply -f minio-credentials-secret.yaml -n ${NAMESPACE}"
+ exit 1
+fi
+
+echo "========================================="
+echo "MinIO Setup for Ambient Code Platform"
+echo "========================================="
+echo "Namespace: ${NAMESPACE}"
+echo "Bucket: ${BUCKET_NAME}"
+echo "========================================="
+
+# Check if MinIO is deployed
+echo "Checking MinIO deployment..."
+if ! kubectl get deployment minio -n "${NAMESPACE}" >/dev/null 2>&1; then
+ echo "Error: MinIO deployment not found in namespace ${NAMESPACE}"
+ echo "Deploy MinIO first: kubectl apply -f components/manifests/base/minio-deployment.yaml"
+ exit 1
+fi
+
+# Wait for MinIO to be ready
+echo "Waiting for MinIO to be ready..."
+kubectl wait --for=condition=ready pod -l app=minio -n "${NAMESPACE}" --timeout=120s
+
+# Get MinIO pod name
+MINIO_POD=$(kubectl get pod -l app=minio -n "${NAMESPACE}" -o jsonpath='{.items[0].metadata.name}')
+echo "MinIO pod: ${MINIO_POD}"
+
+# Set up MinIO client alias
+echo "Configuring MinIO client..."
+kubectl exec -n "${NAMESPACE}" "${MINIO_POD}" -- mc alias set local http://localhost:9000 "${MINIO_USER}" "${MINIO_PASSWORD}"
+
+# Create bucket if it doesn't exist
+echo "Creating bucket: ${BUCKET_NAME}..."
+if kubectl exec -n "${NAMESPACE}" "${MINIO_POD}" -- mc ls "local/${BUCKET_NAME}" >/dev/null 2>&1; then
+ echo "Bucket ${BUCKET_NAME} already exists"
+else
+ kubectl exec -n "${NAMESPACE}" "${MINIO_POD}" -- mc mb "local/${BUCKET_NAME}"
+ echo "Created bucket: ${BUCKET_NAME}"
+fi
+
+# Set bucket to private (default)
+echo "Setting bucket policy..."
+kubectl exec -n "${NAMESPACE}" "${MINIO_POD}" -- mc anonymous set none "local/${BUCKET_NAME}"
+
+# Enable versioning (optional - helps with recovery)
+echo "Enabling versioning..."
+kubectl exec -n "${NAMESPACE}" "${MINIO_POD}" -- mc version enable "local/${BUCKET_NAME}"
+
+# Show bucket info
+echo ""
+echo "========================================="
+echo "MinIO Setup Complete!"
+echo "========================================="
+echo "Bucket: ${BUCKET_NAME}"
+echo "Endpoint: http://minio.${NAMESPACE}.svc:9000"
+echo ""
+echo "MinIO Console Access:"
+echo " kubectl port-forward svc/minio 9001:9001 -n ${NAMESPACE}"
+echo " Then open: http://localhost:9001"
+echo " Login: ${MINIO_USER} / ${MINIO_PASSWORD}"
+echo ""
+echo "Configure in Project Settings:"
+echo " S3_ENDPOINT: http://minio.${NAMESPACE}.svc:9000"
+echo " S3_BUCKET: ${BUCKET_NAME}"
+echo " S3_ACCESS_KEY: ${MINIO_USER}"
+echo " S3_SECRET_KEY: ${MINIO_PASSWORD}"
+echo "========================================="
+