From 7f36121b6f0cf8d2f14b47e2594a25c7c4bad12e Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 13 Feb 2026 13:15:40 +0100 Subject: [PATCH 1/2] feat: add Langfuse retention management with ClickHouse TTL and hard delete CronJobs --- infrastructure/README.md | 25 +++++++ infrastructure/rag/templates/_helpers.tpl | 31 ++++++++ .../templates/langfuse-retention-cronjob.yaml | 70 ++++++++++++++++++ ...angfuse-retention-hard-delete-cronjob.yaml | 74 +++++++++++++++++++ infrastructure/rag/values.yaml | 31 ++++++++ 5 files changed, 231 insertions(+) create mode 100644 infrastructure/rag/templates/langfuse-retention-cronjob.yaml create mode 100644 infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml diff --git a/infrastructure/README.md b/infrastructure/README.md index 27d2e3d9..10b37418 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -188,6 +188,31 @@ For local development you can let Tilt generate Langfuse init secrets automatica - Tilt runs Kustomize on `infrastructure/kustomize/langfuse` and applies the resulting `langfuse-init-secrets` (hash disabled) before Helm resources. - This is dev-only. For production, create/manage secrets with your secret manager and set `secretKeyRef.name` in `values.yaml` to your managed secret. +**Langfuse Trace Retention via ClickHouse TTL (without Enterprise)** +If you want automatic deletion (for example after 1 year) without Langfuse Enterprise data-retention management, enable the chart-level retention CronJob: + +```yaml +langfuseRetention: + enabled: true + retentionDays: 365 + schedule: "15 */6 * * *" + hardDelete: + enabled: true + schedule: "30 3 * * *" + mutationSync: 0 + clickhouse: + database: "default" + onCluster: true + clusterName: "default" +``` + +Notes: +- ClickHouse connection/auth for retention jobs is taken from `langfuse.clickhouse.*` (same source as Langfuse itself). +- The CronJob applies idempotent `ALTER TABLE ... MODIFY TTL` statements on Langfuse tables (`traces`, `observations`, `scores`). +- If `hardDelete.enabled=true`, an additional CronJob executes deterministic `ALTER TABLE ... DELETE WHERE ...` mutations. +- Deletion is then handled by ClickHouse background merges (not instant at the exact cutoff timestamp). +- Avoid applying TTL blindly to every table. Some tables are views/metadata and should not be retention-trimmed. + ### 1.2 Qdrant The deployment of the Qdrant can be disabled by setting the following value in the helm-chart: diff --git a/infrastructure/rag/templates/_helpers.tpl b/infrastructure/rag/templates/_helpers.tpl index d41cfe23..d2e866eb 100644 --- a/infrastructure/rag/templates/_helpers.tpl +++ b/infrastructure/rag/templates/_helpers.tpl @@ -78,3 +78,34 @@ {{- toYaml $data -}} {{- end }} {{- end -}} + +{{/* Shared ClickHouse env for Langfuse retention CronJobs. */}} +{{- define "rag.langfuseRetentionClickhouseEnv" -}} +{{- $chHost := default (printf "%s-clickhouse" .Release.Name | trunc 63 | trimSuffix "-") .Values.langfuse.clickhouse.host -}} +{{- $chUser := default "default" .Values.langfuse.clickhouse.auth.username -}} +{{- $chPasswordSecretName := default (printf "%s-clickhouse" .Release.Name | trunc 63 | trimSuffix "-") .Values.langfuse.clickhouse.auth.existingSecret -}} +{{- $chPasswordKey := .Values.langfuse.clickhouse.auth.existingSecretKey -}} +{{- $chNativePort := default 9000 .Values.langfuse.clickhouse.nativePort -}} +- name: CLICKHOUSE_HOST + value: {{ $chHost | quote }} +- name: CLICKHOUSE_PORT + value: {{ $chNativePort | quote }} +- name: CLICKHOUSE_USER + value: {{ $chUser | quote }} +- name: CLICKHOUSE_DATABASE + value: {{ .Values.langfuseRetention.clickhouse.database | quote }} +- name: CLICKHOUSE_ON_CLUSTER + value: {{ ternary "true" "false" .Values.langfuseRetention.clickhouse.onCluster | quote }} +- name: CLICKHOUSE_CLUSTER_NAME + value: {{ .Values.langfuseRetention.clickhouse.clusterName | quote }} +- name: RETENTION_DAYS + value: {{ .Values.langfuseRetention.retentionDays | quote }} +- name: CLICKHOUSE_PASSWORD_LITERAL + value: {{ .Values.langfuse.clickhouse.auth.password | quote }} +- name: CLICKHOUSE_PASSWORD + valueFrom: + secretKeyRef: + name: {{ $chPasswordSecretName | quote }} + key: {{ default "CLICKHOUSE_PASSWORD" $chPasswordKey | quote }} + optional: true +{{- end -}} diff --git a/infrastructure/rag/templates/langfuse-retention-cronjob.yaml b/infrastructure/rag/templates/langfuse-retention-cronjob.yaml new file mode 100644 index 00000000..7553dc5f --- /dev/null +++ b/infrastructure/rag/templates/langfuse-retention-cronjob.yaml @@ -0,0 +1,70 @@ +{{- if and .Values.features.langfuse.enabled .Values.langfuseRetention.enabled }} +{{- $retentionImage := printf "%s:%s" .Values.langfuseRetention.image.repository .Values.langfuseRetention.image.tag -}} +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ printf "%s-langfuse-retention" .Release.Name | trunc 63 | trimSuffix "-" }} + labels: + app.kubernetes.io/name: rag + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + schedule: {{ .Values.langfuseRetention.schedule | quote }} + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + template: + metadata: + labels: + app.kubernetes.io/name: rag + app.kubernetes.io/instance: {{ .Release.Name }} + spec: + restartPolicy: OnFailure + containers: + - name: apply-clickhouse-ttl + image: {{ $retentionImage | quote }} + imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }} + command: + - /bin/bash + - -ec + args: + - | + set -euo pipefail + + PASSWORD="${CLICKHOUSE_PASSWORD:-}" + if [ -z "${PASSWORD}" ]; then + PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}" + fi + + if [ -z "${PASSWORD}" ]; then + echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret." + exit 1 + fi + + ON_CLUSTER_CLAUSE="" + if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then + ON_CLUSTER_CLAUSE=" ON CLUSTER ${CLICKHOUSE_CLUSTER_NAME}" + fi + + TABLE_ROWS="$(cat <<'EOF_TABLES' + {{- range .Values.langfuseRetention.clickhouse.tables }} + {{ .name }} {{ .timestampColumn }} + {{- end }} + EOF_TABLES + )" + + while IFS=$'\t' read -r table ts_col; do + [ -z "${table}" ] && continue + + echo "Applying TTL=${RETENTION_DAYS}d to ${CLICKHOUSE_DATABASE}.${table} (${ts_col})" + clickhouse-client \ + --host "${CLICKHOUSE_HOST}" \ + --port "${CLICKHOUSE_PORT}" \ + --user "${CLICKHOUSE_USER}" \ + --password "${PASSWORD}" \ + --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL toDateTime(${ts_col}) + toIntervalDay(${RETENTION_DAYS})" + done <<< "${TABLE_ROWS}" + env: +{{ include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }} +{{- end }} diff --git a/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml b/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml new file mode 100644 index 00000000..73b65eca --- /dev/null +++ b/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml @@ -0,0 +1,74 @@ +{{- if and .Values.features.langfuse.enabled .Values.langfuseRetention.hardDelete.enabled }} +{{- $retentionImage := printf "%s:%s" .Values.langfuseRetention.image.repository .Values.langfuseRetention.image.tag -}} +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ printf "%s-langfuse-retention-delete" .Release.Name | trunc 63 | trimSuffix "-" }} + labels: + app.kubernetes.io/name: rag + app.kubernetes.io/instance: {{ .Release.Name }} +spec: + schedule: {{ .Values.langfuseRetention.hardDelete.schedule | quote }} + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + template: + metadata: + labels: + app.kubernetes.io/name: rag + app.kubernetes.io/instance: {{ .Release.Name }} + spec: + restartPolicy: OnFailure + containers: + - name: delete-expired-rows + image: {{ $retentionImage | quote }} + imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }} + command: + - /bin/bash + - -ec + args: + - | + set -euo pipefail + + PASSWORD="${CLICKHOUSE_PASSWORD:-}" + if [ -z "${PASSWORD}" ]; then + PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}" + fi + + if [ -z "${PASSWORD}" ]; then + echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret." + exit 1 + fi + + ON_CLUSTER_CLAUSE="" + if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then + ON_CLUSTER_CLAUSE=" ON CLUSTER ${CLICKHOUSE_CLUSTER_NAME}" + fi + + TABLE_ROWS="$(cat <<'EOF_TABLES' + {{- range .Values.langfuseRetention.clickhouse.tables }} + {{ .name }} {{ .timestampColumn }} + {{- end }} + EOF_TABLES + )" + + CUTOFF_UNIX="$(( $(date -u +%s) - RETENTION_DAYS * 86400 ))" + + while IFS=$'\t' read -r table ts_col; do + [ -z "${table}" ] && continue + + echo "Deleting rows older than ${RETENTION_DAYS}d from ${CLICKHOUSE_DATABASE}.${table} (${ts_col})" + clickhouse-client \ + --host "${CLICKHOUSE_HOST}" \ + --port "${CLICKHOUSE_PORT}" \ + --user "${CLICKHOUSE_USER}" \ + --password "${PASSWORD}" \ + --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE toDateTime(${ts_col}) < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}" + done <<< "${TABLE_ROWS}" + env: + - name: MUTATION_SYNC + value: {{ .Values.langfuseRetention.hardDelete.mutationSync | quote }} +{{ include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }} +{{- end }} diff --git a/infrastructure/rag/values.yaml b/infrastructure/rag/values.yaml index 3e1213d0..0a369baa 100644 --- a/infrastructure/rag/values.yaml +++ b/infrastructure/rag/values.yaml @@ -734,6 +734,37 @@ langfuse: name: "" key: "" +# Optional: enforce a ClickHouse TTL for Langfuse traces without Enterprise data retention management. +# This runs as a CronJob and applies idempotent ALTER TABLE ... MODIFY TTL commands. +langfuseRetention: + enabled: false + retentionDays: 365 + schedule: "15 */6 * * *" + # Optional deterministic deletion in addition to TTL. + # Uses ALTER TABLE ... DELETE WHERE ... and can run nightly. + hardDelete: + enabled: false + schedule: "30 3 * * *" + # ClickHouse mutations_sync setting: + # 0 = async (default), 1 = wait for local completion, 2 = wait for all replicas. + mutationSync: 0 + image: + repository: "bitnamilegacy/clickhouse" + tag: "25.2.1-debian-12-r0" + pullPolicy: IfNotPresent + clickhouse: + # Connection/auth are taken from langfuse.clickhouse.*. + database: "default" + onCluster: true + clusterName: "default" + tables: + - name: "traces" + timestampColumn: "timestamp" + - name: "observations" + timestampColumn: "event_ts" + - name: "scores" + timestampColumn: "timestamp" + minio: image: repository: bitnamilegacy/minio From 29785700caf839c0782b7da5e666c6b87b6e18ea Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 13 Feb 2026 14:17:05 +0100 Subject: [PATCH 2/2] fix(chart): address open PR review comments for Langfuse retention jobs --- infrastructure/README.md | 6 ++- .../templates/langfuse-retention-cronjob.yaml | 41 +++++++++++++++---- ...angfuse-retention-hard-delete-cronjob.yaml | 40 ++++++++++++++---- infrastructure/rag/values.yaml | 21 +++++++++- 4 files changed, 87 insertions(+), 21 deletions(-) diff --git a/infrastructure/README.md b/infrastructure/README.md index 10b37418..b10f5d8c 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -201,13 +201,15 @@ langfuseRetention: schedule: "30 3 * * *" mutationSync: 0 clickhouse: - database: "default" - onCluster: true + database: "default" # set this to the same DB your Langfuse deployment uses + onCluster: false # true only for clustered ClickHouse setups clusterName: "default" ``` Notes: - ClickHouse connection/auth for retention jobs is taken from `langfuse.clickhouse.*` (same source as Langfuse itself). +- Make sure `langfuseRetention.clickhouse.database` matches your Langfuse ClickHouse database, not just the chart default. +- Set `langfuseRetention.clickhouse.onCluster=true` only when your ClickHouse deployment is clustered and `clusterName` exists. - The CronJob applies idempotent `ALTER TABLE ... MODIFY TTL` statements on Langfuse tables (`traces`, `observations`, `scores`). - If `hardDelete.enabled=true`, an additional CronJob executes deterministic `ALTER TABLE ... DELETE WHERE ...` mutations. - Deletion is then handled by ClickHouse background merges (not instant at the exact cutoff timestamp). diff --git a/infrastructure/rag/templates/langfuse-retention-cronjob.yaml b/infrastructure/rag/templates/langfuse-retention-cronjob.yaml index 7553dc5f..30d1f5bd 100644 --- a/infrastructure/rag/templates/langfuse-retention-cronjob.yaml +++ b/infrastructure/rag/templates/langfuse-retention-cronjob.yaml @@ -20,11 +20,24 @@ spec: app.kubernetes.io/name: rag app.kubernetes.io/instance: {{ .Release.Name }} spec: + securityContext: + runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }} + runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }} + {{- if .Values.shared.imagePullSecret }} + imagePullSecrets: + - name: {{ .Values.shared.imagePullSecret.name }} + {{- end }} restartPolicy: OnFailure containers: - name: apply-clickhouse-ttl image: {{ $retentionImage | quote }} imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }} + securityContext: + allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }} + {{- with .Values.langfuseRetention.resources }} + resources: +{{ toYaml . | nindent 16 }} + {{- end }} command: - /bin/bash - -ec @@ -32,15 +45,12 @@ spec: - | set -euo pipefail - PASSWORD="${CLICKHOUSE_PASSWORD:-}" - if [ -z "${PASSWORD}" ]; then - PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}" - fi - - if [ -z "${PASSWORD}" ]; then + if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret." exit 1 fi + export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}" + unset CLICKHOUSE_PASSWORD_LITERAL ON_CLUSTER_CLAUSE="" if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then @@ -54,16 +64,29 @@ spec: EOF_TABLES )" + IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$' + while IFS=$'\t' read -r table ts_col; do [ -z "${table}" ] && continue + if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then + echo "Invalid table identifier: ${table}" + exit 1 + fi + if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then + echo "Invalid timestamp column identifier: ${ts_col}" + exit 1 + fi + echo "Applying TTL=${RETENTION_DAYS}d to ${CLICKHOUSE_DATABASE}.${table} (${ts_col})" - clickhouse-client \ + if ! clickhouse-client \ --host "${CLICKHOUSE_HOST}" \ --port "${CLICKHOUSE_PORT}" \ --user "${CLICKHOUSE_USER}" \ - --password "${PASSWORD}" \ - --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL toDateTime(${ts_col}) + toIntervalDay(${RETENTION_DAYS})" + --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL ${ts_col} + toIntervalDay(${RETENTION_DAYS})"; then + echo "Failed applying TTL on ${CLICKHOUSE_DATABASE}.${table}" + exit 1 + fi done <<< "${TABLE_ROWS}" env: {{ include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }} diff --git a/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml b/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml index 73b65eca..a6e7a373 100644 --- a/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml +++ b/infrastructure/rag/templates/langfuse-retention-hard-delete-cronjob.yaml @@ -20,11 +20,24 @@ spec: app.kubernetes.io/name: rag app.kubernetes.io/instance: {{ .Release.Name }} spec: + securityContext: + runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }} + runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }} + {{- if .Values.shared.imagePullSecret }} + imagePullSecrets: + - name: {{ .Values.shared.imagePullSecret.name }} + {{- end }} restartPolicy: OnFailure containers: - name: delete-expired-rows image: {{ $retentionImage | quote }} imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }} + securityContext: + allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }} + {{- with .Values.langfuseRetention.resources }} + resources: +{{ toYaml . | nindent 16 }} + {{- end }} command: - /bin/bash - -ec @@ -32,15 +45,12 @@ spec: - | set -euo pipefail - PASSWORD="${CLICKHOUSE_PASSWORD:-}" - if [ -z "${PASSWORD}" ]; then - PASSWORD="${CLICKHOUSE_PASSWORD_LITERAL:-}" - fi - - if [ -z "${PASSWORD}" ]; then + if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret." exit 1 fi + export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}" + unset CLICKHOUSE_PASSWORD_LITERAL ON_CLUSTER_CLAUSE="" if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then @@ -55,17 +65,29 @@ spec: )" CUTOFF_UNIX="$(( $(date -u +%s) - RETENTION_DAYS * 86400 ))" + IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$' while IFS=$'\t' read -r table ts_col; do [ -z "${table}" ] && continue + if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then + echo "Invalid table identifier: ${table}" + exit 1 + fi + if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then + echo "Invalid timestamp column identifier: ${ts_col}" + exit 1 + fi + echo "Deleting rows older than ${RETENTION_DAYS}d from ${CLICKHOUSE_DATABASE}.${table} (${ts_col})" - clickhouse-client \ + if ! clickhouse-client \ --host "${CLICKHOUSE_HOST}" \ --port "${CLICKHOUSE_PORT}" \ --user "${CLICKHOUSE_USER}" \ - --password "${PASSWORD}" \ - --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE toDateTime(${ts_col}) < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}" + --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE ${ts_col} < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}"; then + echo "Failed deleting expired rows from ${CLICKHOUSE_DATABASE}.${table}" + exit 1 + fi done <<< "${TABLE_ROWS}" env: - name: MUTATION_SYNC diff --git a/infrastructure/rag/values.yaml b/infrastructure/rag/values.yaml index 0a369baa..ecb066e1 100644 --- a/infrastructure/rag/values.yaml +++ b/infrastructure/rag/values.yaml @@ -740,6 +740,21 @@ langfuseRetention: enabled: false retentionDays: 365 schedule: "15 */6 * * *" + podSecurityContext: + runAsUser: 1001 + runAsNonRoot: true + securityContext: + allowPrivilegeEscalation: false + # Optional resources for both retention CronJobs. + # Example: + # resources: + # requests: + # cpu: 100m + # memory: 128Mi + # limits: + # cpu: 500m + # memory: 512Mi + resources: {} # Optional deterministic deletion in addition to TTL. # Uses ALTER TABLE ... DELETE WHERE ... and can run nightly. hardDelete: @@ -754,10 +769,14 @@ langfuseRetention: pullPolicy: IfNotPresent clickhouse: # Connection/auth are taken from langfuse.clickhouse.*. + # Align this with the database Langfuse actually uses in ClickHouse. database: "default" - onCluster: true + # Set to true only for clustered ClickHouse deployments where clusterName exists. + # Keep false for single-node/non-clustered deployments. + onCluster: false clusterName: "default" tables: + # timestampColumn should be a Date/DateTime/DateTime64 column in the target table. - name: "traces" timestampColumn: "timestamp" - name: "observations"