From 13a4f5c4a2a959bfbae11634bd8f8dbe47e00805 Mon Sep 17 00:00:00 2001 From: Sangjin Moon <1128msj@naver.com> Date: Fri, 13 Feb 2026 11:10:37 +0900 Subject: [PATCH 1/3] fix: use strict RFC 2397 regex in _parse_base64_data_uri to avoid misidentifying SSE data _parse_base64_data_uri previously used a loose startswith("data:") check, which caused SSE data (e.g., "data: {...}") to be incorrectly processed as base64 data URIs, resulting in spurious error logs. Replace the manual parsing with a strict regex that requires the full data:[<mediatype>][;params];base64,<data> format. Non-matching inputs now return (None, None) cleanly without error logging. Closes https://github.com/langfuse/langfuse/issues/5659 --- langfuse/media.py | 36 ++++++------ tests/test_issue_5659.py | 121 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 19 deletions(-) create mode 100644 tests/test_issue_5659.py diff --git a/langfuse/media.py b/langfuse/media.py index 6691785af..2df707dca 100644 --- a/langfuse/media.py +++ b/langfuse/media.py @@ -190,33 +190,31 @@ def parse_reference_string(reference_string: str) -> ParsedMediaReference: content_type=cast(MediaContentType, parsed_data["type"]), ) + # Strict regex for RFC 2397 base64 data URIs: data:[<mediatype>][;params];base64,<data> + _BASE64_DATA_URI_RE = re.compile( + r"^data:" + r"(?P<content_type>[a-zA-Z0-9][a-zA-Z0-9!#$&\-^_.+]*/[a-zA-Z0-9][a-zA-Z0-9!#$&\-^_.+]*)?" + r"(?:;[^;,]+)*" # optional parameters (e.g., ;charset=utf-8) + r";base64," + r"(?P<data>[A-Za-z0-9+/\r\n]+=*)\s*$" + ) + def _parse_base64_data_uri( self, data: str ) -> Tuple[Optional[bytes], Optional[MediaContentType]]: # Example data URI: data:image/jpeg;base64,/9j/4AAQ... 
- try: - if not data or not isinstance(data, str): - raise ValueError("Data URI is not a string") - - if not data.startswith("data:"): - raise ValueError("Data URI does not start with 'data:'") - - header, actual_data = data[5:].split(",", 1) - if not header or not actual_data: - raise ValueError("Invalid URI") + if not data or not isinstance(data, str): + return None, None - # Split header into parts and check for base64 - header_parts = header.split(";") - if "base64" not in header_parts: - raise ValueError("Data is not base64 encoded") + match = self._BASE64_DATA_URI_RE.match(data) + if not match: + return None, None - # Content type is the first part - content_type = header_parts[0] - if not content_type: - raise ValueError("Content type is empty") + try: + content_type = match.group("content_type") or "text/plain" + actual_data = match.group("data") return base64.b64decode(actual_data), cast(MediaContentType, content_type) - except Exception as e: self._log.error("Error parsing base64 data URI", exc_info=e) diff --git a/tests/test_issue_5659.py b/tests/test_issue_5659.py new file mode 100644 index 000000000..b2bc723ee --- /dev/null +++ b/tests/test_issue_5659.py @@ -0,0 +1,121 @@ +"""Test for issue #5659: _parse_base64_data_uri misidentifies SSE data as base64 media.""" + +import base64 +import logging + +from langfuse.media import LangfuseMedia + + +def _make_media(): + """Create a LangfuseMedia instance for testing _parse_base64_data_uri.""" + return LangfuseMedia( + content_bytes=b"dummy", content_type="application/octet-stream" + ) + + +def test_sse_data_is_not_parsed_as_base64(caplog): + """Verify SSE data strings return (None, None) without error logging.""" + media = _make_media() + with caplog.at_level(logging.ERROR, logger="langfuse.media"): + result = media._parse_base64_data_uri("data: {'foo': 'bar'}") + + assert result == (None, None) + assert caplog.records == [], ( + f"Expected no error logs, got: {[r.message for r in caplog.records]}" + ) + + 
+def test_sse_data_with_json(caplog): + """Verify SSE data with JSON payload returns (None, None) without error logging.""" + media = _make_media() + with caplog.at_level(logging.ERROR, logger="langfuse.media"): + result = media._parse_base64_data_uri( + 'data: {"event": "message", "data": "hello"}' + ) + + assert result == (None, None) + assert caplog.records == [], ( + f"Expected no error logs, got: {[r.message for r in caplog.records]}" + ) + + +def test_valid_base64_data_uri_still_works(): + """Verify a proper base64 data URI is parsed correctly.""" + original_bytes = b"hello world" + encoded = base64.b64encode(original_bytes).decode("utf-8") + data_uri = f"data:text/plain;base64,{encoded}" + + media = _make_media() + content_bytes, content_type = media._parse_base64_data_uri(data_uri) + + assert content_bytes == original_bytes + assert content_type == "text/plain" + + +def test_data_uri_without_base64_returns_none(caplog): + """Verify a data URI without ;base64 encoding returns (None, None).""" + media = _make_media() + with caplog.at_level(logging.ERROR, logger="langfuse.media"): + result = media._parse_base64_data_uri("data:text/plain,hello") + + assert result == (None, None) + assert caplog.records == [] + + +def test_empty_string_returns_none(caplog): + """Verify an empty string returns (None, None) without error logging.""" + media = _make_media() + with caplog.at_level(logging.ERROR, logger="langfuse.media"): + result = media._parse_base64_data_uri("") + + assert result == (None, None) + assert caplog.records == [] + + +def test_non_data_uri_returns_none(caplog): + """Verify a regular string returns (None, None) without error logging.""" + media = _make_media() + with caplog.at_level(logging.ERROR, logger="langfuse.media"): + result = media._parse_base64_data_uri("just a regular string") + + assert result == (None, None) + assert caplog.records == [] + + +def test_valid_image_data_uri(): + """Verify a valid image data URI parses correctly.""" + 
pixel_bytes = b"\x89PNG\r\n" + encoded = base64.b64encode(pixel_bytes).decode("utf-8") + data_uri = f"data:image/png;base64,{encoded}" + + media = _make_media() + content_bytes, content_type = media._parse_base64_data_uri(data_uri) + + assert content_bytes == pixel_bytes + assert content_type == "image/png" + + +def test_data_uri_with_mime_params(): + """Verify a data URI with extra MIME parameters (e.g. charset) parses correctly.""" + original_bytes = b"hello world" + encoded = base64.b64encode(original_bytes).decode("utf-8") + data_uri = f"data:text/plain;charset=utf-8;base64,{encoded}" + + media = _make_media() + content_bytes, content_type = media._parse_base64_data_uri(data_uri) + + assert content_bytes == original_bytes + assert content_type == "text/plain" + + +def test_data_uri_without_mime_type(): + """Verify a data URI without MIME type defaults to text/plain per RFC 2397.""" + original_bytes = b"hello world" + encoded = base64.b64encode(original_bytes).decode("utf-8") + data_uri = f"data:;base64,{encoded}" + + media = _make_media() + content_bytes, content_type = media._parse_base64_data_uri(data_uri) + + assert content_bytes == original_bytes + assert content_type == "text/plain" From e29c3f43d5199928aae8e226705a2f8d73966b15 Mon Sep 17 00:00:00 2001 From: Sangjin Moon Date: Wed, 18 Feb 2026 13:34:24 +0900 Subject: [PATCH 2/3] fix: restore error logging for invalid base64 data URIs Add back error log when _parse_base64_data_uri receives a string that does not match the RFC 2397 regex, so callers get feedback on malformed input. Update tests to focus on the core fix (no false decoding) rather than log suppression. 
Co-Authored-By: Claude Opus 4.6 --- langfuse/media.py | 1 + tests/test_issue_5659.py | 36 +++++++++++------------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/langfuse/media.py b/langfuse/media.py index 2df707dca..14438303f 100644 --- a/langfuse/media.py +++ b/langfuse/media.py @@ -208,6 +208,7 @@ def _parse_base64_data_uri( match = self._BASE64_DATA_URI_RE.match(data) if not match: + self._log.error("Invalid base64 data URI: does not match RFC 2397 format") return None, None try: diff --git a/tests/test_issue_5659.py b/tests/test_issue_5659.py index b2bc723ee..74636d1d5 100644 --- a/tests/test_issue_5659.py +++ b/tests/test_issue_5659.py @@ -13,30 +13,20 @@ def _make_media(): ) -def test_sse_data_is_not_parsed_as_base64(caplog): - """Verify SSE data strings return (None, None) without error logging.""" +def test_sse_data_is_not_parsed_as_base64(): + """Verify SSE data strings return (None, None) and are not decoded as media.""" media = _make_media() - with caplog.at_level(logging.ERROR, logger="langfuse.media"): - result = media._parse_base64_data_uri("data: {'foo': 'bar'}") + result = media._parse_base64_data_uri("data: {'foo': 'bar'}") assert result == (None, None) - assert caplog.records == [], ( - f"Expected no error logs, got: {[r.message for r in caplog.records]}" - ) -def test_sse_data_with_json(caplog): - """Verify SSE data with JSON payload returns (None, None) without error logging.""" +def test_sse_data_with_json(): + """Verify SSE data with JSON payload returns (None, None) and is not decoded as media.""" media = _make_media() - with caplog.at_level(logging.ERROR, logger="langfuse.media"): - result = media._parse_base64_data_uri( - 'data: {"event": "message", "data": "hello"}' - ) + result = media._parse_base64_data_uri('data: {"event": "message", "data": "hello"}') assert result == (None, None) - assert caplog.records == [], ( - f"Expected no error logs, got: {[r.message for r in caplog.records]}" - ) def 
test_valid_base64_data_uri_still_works(): @@ -52,14 +42,12 @@ def test_valid_base64_data_uri_still_works(): assert content_type == "text/plain" -def test_data_uri_without_base64_returns_none(caplog): +def test_data_uri_without_base64_returns_none(): """Verify a data URI without ;base64 encoding returns (None, None).""" media = _make_media() - with caplog.at_level(logging.ERROR, logger="langfuse.media"): - result = media._parse_base64_data_uri("data:text/plain,hello") + result = media._parse_base64_data_uri("data:text/plain,hello") assert result == (None, None) - assert caplog.records == [] def test_empty_string_returns_none(caplog): @@ -72,14 +60,12 @@ def test_empty_string_returns_none(caplog): assert caplog.records == [] -def test_non_data_uri_returns_none(caplog): - """Verify a regular string returns (None, None) without error logging.""" +def test_non_data_uri_returns_none(): + """Verify a regular string returns (None, None).""" media = _make_media() - with caplog.at_level(logging.ERROR, logger="langfuse.media"): - result = media._parse_base64_data_uri("just a regular string") + result = media._parse_base64_data_uri("just a regular string") assert result == (None, None) - assert caplog.records == [] def test_valid_image_data_uri(): From 28277c2319018140692d336fdb809e025d0cd3e6 Mon Sep 17 00:00:00 2001 From: Sangjin Moon Date: Wed, 18 Feb 2026 13:44:54 +0900 Subject: [PATCH 3/3] fix: add error log for non-string input in _parse_base64_data_uri --- langfuse/media.py | 1 + tests/test_issue_5659.py | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/langfuse/media.py b/langfuse/media.py index 14438303f..8f417674e 100644 --- a/langfuse/media.py +++ b/langfuse/media.py @@ -204,6 +204,7 @@ def _parse_base64_data_uri( ) -> Tuple[Optional[bytes], Optional[MediaContentType]]: # Example data URI: data:image/jpeg;base64,/9j/4AAQ... 
if not data or not isinstance(data, str): + self._log.error("Invalid base64 data URI: data is not a string") return None, None match = self._BASE64_DATA_URI_RE.match(data) diff --git a/tests/test_issue_5659.py b/tests/test_issue_5659.py index 74636d1d5..1edb0e251 100644 --- a/tests/test_issue_5659.py +++ b/tests/test_issue_5659.py @@ -1,7 +1,6 @@ """Test for issue #5659: _parse_base64_data_uri misidentifies SSE data as base64 media.""" import base64 -import logging from langfuse.media import LangfuseMedia @@ -50,14 +49,12 @@ def test_data_uri_without_base64_returns_none(): assert result == (None, None) -def test_empty_string_returns_none(caplog): - """Verify an empty string returns (None, None) without error logging.""" +def test_empty_string_returns_none(): + """Verify an empty string returns (None, None).""" media = _make_media() - with caplog.at_level(logging.ERROR, logger="langfuse.media"): - result = media._parse_base64_data_uri("") + result = media._parse_base64_data_uri("") assert result == (None, None) - assert caplog.records == [] def test_non_data_uri_returns_none():