Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions infrastructure/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,33 @@ For local development you can let Tilt generate Langfuse init secrets automatica
- Tilt runs Kustomize on `infrastructure/kustomize/langfuse` and applies the resulting `langfuse-init-secrets` (hash disabled) before Helm resources.
- This is dev-only. For production, create/manage secrets with your secret manager and set `secretKeyRef.name` in `values.yaml` to your managed secret.

**Langfuse Trace Retention via ClickHouse TTL (without Enterprise)**
If you want automatic deletion (for example after 1 year) without Langfuse Enterprise data-retention management, enable the chart-level retention CronJob:

```yaml
langfuseRetention:
  enabled: true
  retentionDays: 365
  schedule: "15 */6 * * *"
  hardDelete:
    enabled: true
    schedule: "30 3 * * *"
    mutationSync: 0
  clickhouse:
    database: "default" # set this to the same DB your Langfuse deployment uses
    onCluster: false # true only for clustered ClickHouse setups
    clusterName: "default"
```

Notes:
- ClickHouse connection/auth for retention jobs is taken from `langfuse.clickhouse.*` (same source as Langfuse itself).
- Make sure `langfuseRetention.clickhouse.database` matches your Langfuse ClickHouse database, not just the chart default.
- Set `langfuseRetention.clickhouse.onCluster=true` only when your ClickHouse deployment is clustered and `clusterName` exists.
- The CronJob applies idempotent `ALTER TABLE ... MODIFY TTL` statements on Langfuse tables (`traces`, `observations`, `scores`).
- If `hardDelete.enabled=true`, an additional CronJob executes deterministic `ALTER TABLE ... DELETE WHERE ...` mutations.
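- Rows expired by the TTL are removed by ClickHouse background merges, so deletion is not instant at the exact cutoff timestamp. With `hardDelete.mutationSync: 0` the delete mutations also run asynchronously.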
- Avoid applying TTL blindly to every table. Some tables are views/metadata and should not be retention-trimmed.

### 1.2 Qdrant

The deployment of Qdrant can be disabled by setting the following value in the Helm chart:
Expand Down
31 changes: 31 additions & 0 deletions infrastructure/rag/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,34 @@
{{- toYaml $data -}}
{{- end }}
{{- end -}}

{{/*
Shared ClickHouse env for Langfuse retention CronJobs.

Renders a list of container env entries at column 0; callers are expected to
indent the result (the CronJobs in this chart use `nindent 16`).

Connection/auth values come from `.Values.langfuse.clickhouse.*`:
  - host            -> defaults to "<release>-clickhouse" (truncated to 63 chars)
  - auth.username   -> defaults to "default"
  - nativePort      -> defaults to 9000
  - auth.password   -> exposed as CLICKHOUSE_PASSWORD_LITERAL (may be empty)
  - auth.existingSecret / auth.existingSecretKey
                    -> exposed as CLICKHOUSE_PASSWORD via secretKeyRef; the
                       secret name defaults to "<release>-clickhouse" and the
                       key to "CLICKHOUSE_PASSWORD"

Retention-specific values come from `.Values.langfuseRetention.*`
(database, onCluster, clusterName, retentionDays).
*/}}
{{- define "rag.langfuseRetentionClickhouseEnv" -}}
{{- $chHost := default (printf "%s-clickhouse" .Release.Name | trunc 63 | trimSuffix "-") .Values.langfuse.clickhouse.host -}}
{{- $chUser := default "default" .Values.langfuse.clickhouse.auth.username -}}
{{- $chPasswordSecretName := default (printf "%s-clickhouse" .Release.Name | trunc 63 | trimSuffix "-") .Values.langfuse.clickhouse.auth.existingSecret -}}
{{- $chPasswordKey := .Values.langfuse.clickhouse.auth.existingSecretKey -}}
{{- $chNativePort := default 9000 .Values.langfuse.clickhouse.nativePort -}}
- name: CLICKHOUSE_HOST
  value: {{ $chHost | quote }}
- name: CLICKHOUSE_PORT
  value: {{ $chNativePort | quote }}
- name: CLICKHOUSE_USER
  value: {{ $chUser | quote }}
- name: CLICKHOUSE_DATABASE
  value: {{ .Values.langfuseRetention.clickhouse.database | quote }}
- name: CLICKHOUSE_ON_CLUSTER
  value: {{ ternary "true" "false" .Values.langfuseRetention.clickhouse.onCluster | quote }}
- name: CLICKHOUSE_CLUSTER_NAME
  value: {{ .Values.langfuseRetention.clickhouse.clusterName | quote }}
- name: RETENTION_DAYS
  value: {{ .Values.langfuseRetention.retentionDays | quote }}
{{- /*
  `quote` renders nothing for an unset value, which would leave a bare
  `value:` (YAML null). Default to an explicit empty string instead; the
  retention scripts treat an empty literal as "not provided".
*/}}
- name: CLICKHOUSE_PASSWORD_LITERAL
  value: {{ default "" .Values.langfuse.clickhouse.auth.password | quote }}
{{- /*
  The secret reference is optional; the retention scripts check at runtime
  that at least one of CLICKHOUSE_PASSWORD / CLICKHOUSE_PASSWORD_LITERAL is set.
*/}}
- name: CLICKHOUSE_PASSWORD
  valueFrom:
    secretKeyRef:
      name: {{ $chPasswordSecretName | quote }}
      key: {{ default "CLICKHOUSE_PASSWORD" $chPasswordKey | quote }}
      optional: true
{{- end -}}
93 changes: 93 additions & 0 deletions infrastructure/rag/templates/langfuse-retention-cronjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{{- if and .Values.features.langfuse.enabled .Values.langfuseRetention.enabled }}
{{- /*
  Langfuse retention CronJob (TTL).

  Rendered only when both `features.langfuse.enabled` and
  `langfuseRetention.enabled` are true. On every run it issues one
  `ALTER TABLE ... MODIFY TTL <timestampColumn> + toIntervalDay(<retentionDays>)`
  per entry in `langfuseRetention.clickhouse.tables`.
  Connection settings are injected by the "rag.langfuseRetentionClickhouseEnv" helper.
*/}}
{{- $retentionImage := printf "%s:%s" .Values.langfuseRetention.image.repository .Values.langfuseRetention.image.tag -}}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ printf "%s-langfuse-retention" .Release.Name | trunc 63 | trimSuffix "-" }}
  labels:
    app.kubernetes.io/name: rag
    app.kubernetes.io/instance: {{ .Release.Name }}
spec:
  schedule: {{ .Values.langfuseRetention.schedule | quote }}
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app.kubernetes.io/name: rag
            app.kubernetes.io/instance: {{ .Release.Name }}
        spec:
          securityContext:
            runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }}
            runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }}
          {{- if .Values.shared.imagePullSecret }}
          imagePullSecrets:
            - name: {{ .Values.shared.imagePullSecret.name }}
          {{- end }}
          restartPolicy: OnFailure
          containers:
            - name: apply-clickhouse-ttl
              image: {{ $retentionImage | quote }}
              imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }}
              securityContext:
                allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }}
              {{- with .Values.langfuseRetention.resources }}
              resources:
                {{- toYaml . | nindent 16 }}
              {{- end }}
              command:
                - /bin/bash
                - -ec
              args:
                - |
                  set -euo pipefail

                  # Prefer the secret-provided password; fall back to the literal from values.
                  if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then
                    echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret."
                    exit 1
                  fi
                  # No --password flag is passed below; clickhouse-client is expected to
                  # pick the password up from the exported CLICKHOUSE_PASSWORD variable.
                  export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}"
                  unset CLICKHOUSE_PASSWORD_LITERAL

                  # RETENTION_DAYS is interpolated into SQL below. Reject anything that is
                  # not a positive integer so that 0, an empty value or a typo can never
                  # produce a TTL that expires all rows or a malformed statement.
                  POSITIVE_INT_REGEX='^[1-9][0-9]*$'
                  if ! [[ "${RETENTION_DAYS:-}" =~ ${POSITIVE_INT_REGEX} ]]; then
                    echo "Invalid retentionDays (expected a positive integer): ${RETENTION_DAYS:-}"
                    exit 1
                  fi

                  ON_CLUSTER_CLAUSE=""
                  if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then
                    ON_CLUSTER_CLAUSE=" ON CLUSTER ${CLICKHOUSE_CLUSTER_NAME}"
                  fi

                  # One "<table><TAB><timestampColumn>" row per configured table. The tab is
                  # emitted explicitly by the template so it always matches the IFS used by
                  # the read loop below.
                  TABLE_ROWS="$(cat <<'EOF_TABLES'
                  {{- range .Values.langfuseRetention.clickhouse.tables }}
                  {{ printf "%s\t%s" .name .timestampColumn }}
                  {{- end }}
                  EOF_TABLES
                  )"

                  # Table and column names are interpolated unquoted into SQL, so only
                  # plain identifiers are accepted.
                  IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$'

                  while IFS=$'\t' read -r table ts_col; do
                    [ -z "${table}" ] && continue

                    if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then
                      echo "Invalid table identifier: ${table}"
                      exit 1
                    fi
                    if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then
                      echo "Invalid timestamp column identifier: ${ts_col}"
                      exit 1
                    fi

                    echo "Applying TTL=${RETENTION_DAYS}d to ${CLICKHOUSE_DATABASE}.${table} (${ts_col})"
                    if ! clickhouse-client \
                      --host "${CLICKHOUSE_HOST}" \
                      --port "${CLICKHOUSE_PORT}" \
                      --user "${CLICKHOUSE_USER}" \
                      --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} MODIFY TTL ${ts_col} + toIntervalDay(${RETENTION_DAYS})"; then
                      echo "Failed applying TTL on ${CLICKHOUSE_DATABASE}.${table}"
                      exit 1
                    fi
                  done <<< "${TABLE_ROWS}"
              env:
                {{- include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
{{- if and .Values.features.langfuse.enabled .Values.langfuseRetention.enabled .Values.langfuseRetention.hardDelete.enabled }}
{{- /*
  Langfuse retention CronJob (hard delete).

  Rendered only when `features.langfuse.enabled`, `langfuseRetention.enabled`
  and `langfuseRetention.hardDelete.enabled` are all true, so the destructive
  job can never be deployed while retention as a whole is switched off.
  On every run it issues one
  `ALTER TABLE ... DELETE WHERE <timestampColumn> < toDateTime(<cutoff>)`
  per entry in `langfuseRetention.clickhouse.tables`, where the cutoff is
  "now minus retentionDays" computed in the container at run time.
  Connection settings are injected by the "rag.langfuseRetentionClickhouseEnv" helper.
*/}}
{{- $retentionImage := printf "%s:%s" .Values.langfuseRetention.image.repository .Values.langfuseRetention.image.tag -}}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ printf "%s-langfuse-retention-delete" .Release.Name | trunc 63 | trimSuffix "-" }}
  labels:
    app.kubernetes.io/name: rag
    app.kubernetes.io/instance: {{ .Release.Name }}
spec:
  schedule: {{ .Values.langfuseRetention.hardDelete.schedule | quote }}
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app.kubernetes.io/name: rag
            app.kubernetes.io/instance: {{ .Release.Name }}
        spec:
          securityContext:
            runAsUser: {{ .Values.langfuseRetention.podSecurityContext.runAsUser }}
            runAsNonRoot: {{ .Values.langfuseRetention.podSecurityContext.runAsNonRoot }}
          {{- if .Values.shared.imagePullSecret }}
          imagePullSecrets:
            - name: {{ .Values.shared.imagePullSecret.name }}
          {{- end }}
          restartPolicy: OnFailure
          containers:
            - name: delete-expired-rows
              image: {{ $retentionImage | quote }}
              imagePullPolicy: {{ .Values.langfuseRetention.image.pullPolicy | quote }}
              securityContext:
                allowPrivilegeEscalation: {{ .Values.langfuseRetention.securityContext.allowPrivilegeEscalation }}
              {{- with .Values.langfuseRetention.resources }}
              resources:
                {{- toYaml . | nindent 16 }}
              {{- end }}
              command:
                - /bin/bash
                - -ec
              args:
                - |
                  set -euo pipefail

                  # Prefer the secret-provided password; fall back to the literal from values.
                  if [ -z "${CLICKHOUSE_PASSWORD:-}" ] && [ -z "${CLICKHOUSE_PASSWORD_LITERAL:-}" ]; then
                    echo "No ClickHouse password found. Check langfuse.clickhouse.auth settings and secret."
                    exit 1
                  fi
                  # No --password flag is passed below; clickhouse-client is expected to
                  # pick the password up from the exported CLICKHOUSE_PASSWORD variable.
                  export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-${CLICKHOUSE_PASSWORD_LITERAL:-}}"
                  unset CLICKHOUSE_PASSWORD_LITERAL

                  # RETENTION_DAYS feeds shell arithmetic and the DELETE cutoff. Reject
                  # anything that is not a positive integer: 0 or an empty value would put
                  # the cutoff at "now" and delete every row, and a leading zero would be
                  # read as octal by the shell.
                  POSITIVE_INT_REGEX='^[1-9][0-9]*$'
                  if ! [[ "${RETENTION_DAYS:-}" =~ ${POSITIVE_INT_REGEX} ]]; then
                    echo "Invalid retentionDays (expected a positive integer): ${RETENTION_DAYS:-}"
                    exit 1
                  fi

                  # mutations_sync: 0 = async, 1 = wait locally, 2 = wait for all replicas.
                  if ! [[ "${MUTATION_SYNC:-}" =~ ^[012]$ ]]; then
                    echo "Invalid hardDelete.mutationSync (expected 0, 1 or 2): ${MUTATION_SYNC:-}"
                    exit 1
                  fi

                  ON_CLUSTER_CLAUSE=""
                  if [ "${CLICKHOUSE_ON_CLUSTER}" = "true" ]; then
                    ON_CLUSTER_CLAUSE=" ON CLUSTER ${CLICKHOUSE_CLUSTER_NAME}"
                  fi

                  # One "<table><TAB><timestampColumn>" row per configured table. The tab is
                  # emitted explicitly by the template so it always matches the IFS used by
                  # the read loop below.
                  TABLE_ROWS="$(cat <<'EOF_TABLES'
                  {{- range .Values.langfuseRetention.clickhouse.tables }}
                  {{ printf "%s\t%s" .name .timestampColumn }}
                  {{- end }}
                  EOF_TABLES
                  )"

                  # Single cutoff (Unix seconds, UTC) shared by all tables in this run.
                  CUTOFF_UNIX="$(( $(date -u +%s) - RETENTION_DAYS * 86400 ))"

                  # Table and column names are interpolated unquoted into SQL, so only
                  # plain identifiers are accepted.
                  IDENTIFIER_REGEX='^[A-Za-z_][A-Za-z0-9_]*$'

                  while IFS=$'\t' read -r table ts_col; do
                    [ -z "${table}" ] && continue

                    if ! [[ "${table}" =~ ${IDENTIFIER_REGEX} ]]; then
                      echo "Invalid table identifier: ${table}"
                      exit 1
                    fi
                    if ! [[ "${ts_col}" =~ ${IDENTIFIER_REGEX} ]]; then
                      echo "Invalid timestamp column identifier: ${ts_col}"
                      exit 1
                    fi

                    echo "Deleting rows older than ${RETENTION_DAYS}d from ${CLICKHOUSE_DATABASE}.${table} (${ts_col})"
                    if ! clickhouse-client \
                      --host "${CLICKHOUSE_HOST}" \
                      --port "${CLICKHOUSE_PORT}" \
                      --user "${CLICKHOUSE_USER}" \
                      --query "ALTER TABLE ${CLICKHOUSE_DATABASE}.${table}${ON_CLUSTER_CLAUSE} DELETE WHERE ${ts_col} < toDateTime(${CUTOFF_UNIX}) SETTINGS mutations_sync = ${MUTATION_SYNC}"; then
                      echo "Failed deleting expired rows from ${CLICKHOUSE_DATABASE}.${table}"
                      exit 1
                    fi
                  done <<< "${TABLE_ROWS}"
              env:
                - name: MUTATION_SYNC
                  value: {{ .Values.langfuseRetention.hardDelete.mutationSync | quote }}
                {{- include "rag.langfuseRetentionClickhouseEnv" . | nindent 16 }}
{{- end }}
50 changes: 50 additions & 0 deletions infrastructure/rag/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,56 @@ langfuse:
name: ""
key: ""

# Langfuse trace retention via ClickHouse TTL (optional; no Enterprise data
# retention management required). When enabled, a CronJob periodically applies
# idempotent ALTER TABLE ... MODIFY TTL commands.
langfuseRetention:
  enabled: false
  # Rows older than this many days are subject to retention.
  retentionDays: 365
  # Cron schedule of the TTL CronJob.
  schedule: '15 */6 * * *'

  # Image used by both retention CronJobs.
  image:
    repository: 'bitnamilegacy/clickhouse'
    tag: '25.2.1-debian-12-r0'
    pullPolicy: IfNotPresent

  # Pod-level and container-level security settings for both retention CronJobs.
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: 1001
  securityContext:
    allowPrivilegeEscalation: false

  # Resource requests/limits shared by both retention CronJobs (optional).
  # Example:
  # resources:
  #   requests:
  #     cpu: 100m
  #     memory: 128Mi
  #   limits:
  #     cpu: 500m
  #     memory: 512Mi
  resources: {}

  # Deterministic deletion on top of the TTL (optional).
  # Issues ALTER TABLE ... DELETE WHERE ... and can run nightly.
  hardDelete:
    enabled: false
    schedule: '30 3 * * *'
    # Value for the ClickHouse mutations_sync setting:
    #   0 = async (default)
    #   1 = wait for local completion
    #   2 = wait for all replicas
    mutationSync: 0

  clickhouse:
    # Connection and auth settings are read from langfuse.clickhouse.*.
    # Set this to the database Langfuse actually uses in ClickHouse.
    database: 'default'
    # Enable only for clustered ClickHouse deployments where clusterName exists;
    # leave false for single-node/non-clustered deployments.
    onCluster: false
    clusterName: 'default'
    # Tables to trim. timestampColumn should be a Date/DateTime/DateTime64
    # column in the target table.
    tables:
      - name: 'traces'
        timestampColumn: 'timestamp'
      - name: 'observations'
        timestampColumn: 'event_ts'
      - name: 'scores'
        timestampColumn: 'timestamp'

minio:
image:
repository: bitnamilegacy/minio
Expand Down