From e6d2e86af098e5c2ce40fbd5032ce837361c0c4a Mon Sep 17 00:00:00 2001
From: majiayu000 <1835304752@qq.com>
Date: Thu, 8 Jan 2026 17:16:09 +0800
Subject: [PATCH] fix: respect <base> tag when resolving relative links in
 html2text

Fixes #1680

The html2text module was ignoring the HTML <base> tag when resolving
relative links. According to HTML standards, when a document contains
a <base> tag, relative URLs should be resolved against the base URL
specified in the tag's href attribute.

Added handling for the <base> tag in handle_tag() to update self.baseurl
when encountered, ensuring subsequent relative links are resolved
correctly.

Signed-off-by: majiayu000 <1835304752@qq.com>
---
 crawl4ai/html2text/__init__.py        |  6 +++
 tests/unit/test_html2text_base_tag.py | 70 +++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 tests/unit/test_html2text_base_tag.py
diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py
index ca15b4534..fbdf7ecdc 100644
--- a/crawl4ai/html2text/__init__.py
+++ b/crawl4ai/html2text/__init__.py
@@ -400,6 +400,12 @@ def handle_tag(
             else:
                 self.quiet -= 1
 
+        if tag == "base" and start:
+            # Update baseurl from <base href="..."> tag
+            href = attrs.get("href")
+            if href:
+                self.baseurl = urlparse.urljoin(self.baseurl, href)
+
         if tag == "style":
             if start:
                 self.style += 1
diff --git a/tests/unit/test_html2text_base_tag.py b/tests/unit/test_html2text_base_tag.py
new file mode 100644
index 000000000..0f38bede9
--- /dev/null
+++ b/tests/unit/test_html2text_base_tag.py
@@ -0,0 +1,70 @@
+"""
+Test that html2text respects the <base> tag when resolving relative links.
+
+According to HTML standards, when an HTML document contains a <base> tag,
+relative links should be resolved against the URL specified in the <base>
+tag's href attribute, not the original page URL.
+
+These tests inspect the source (AST parsing plus substring checks) to verify
+the fix is present, since html2text's internal imports need a full package setup.
+"""
+
+import ast
+import os
+
+
+def get_function_source(filepath: str, function_name: str) -> "str | None":
+    """Return the source of a function named function_name in filepath, or None."""
+    # Decode as UTF-8 explicitly so the result does not depend on the locale.
+    with open(filepath, 'r', encoding='utf-8') as f:
+        source = f.read()
+
+    for node in ast.walk(ast.parse(source)):
+        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+            if node.name == function_name:
+                return ast.get_source_segment(source, node)
+    return None
+
+
+def test_handle_tag_has_base_tag_support():
+    """Check that handle_tag's source tests for the base tag and assigns self.baseurl."""
+    html2text_path = os.path.join(
+        os.path.dirname(__file__), '..', '..', 'crawl4ai', 'html2text', '__init__.py'
+    )
+
+    source = get_function_source(html2text_path, 'handle_tag')
+    assert source is not None, "Could not find handle_tag function"
+
+    # Verify the function handles base tag (either quoting style)
+    assert 'tag == "base"' in source or "tag == 'base'" in source, \
+        "handle_tag should check for base tag"
+
+    # Require an assignment: a bare mention of self.baseurl would not prove an update
+    assert 'self.baseurl = ' in source, \
+        "handle_tag should update self.baseurl when processing base tag"
+
+    # Weak check: only confirms that 'href' appears somewhere in handle_tag
+    assert 'href' in source, \
+        "handle_tag should get href attribute from base tag"
+
+
+def test_base_tag_code_structure():
+    """Check that the module source contains the base tag test and uses urljoin."""
+    html2text_path = os.path.join(
+        os.path.dirname(__file__), '..', '..', 'crawl4ai', 'html2text', '__init__.py'
+    )
+
+    with open(html2text_path, 'r', encoding='utf-8') as f:
+        source = f.read()
+
+    # Accept either quoting style, matching test_handle_tag_has_base_tag_support
+    assert 'tag == "base"' in source or "tag == 'base'" in source, \
+        "html2text should handle base tag"
+    assert 'urlparse.urljoin' in source, \
+        "html2text should use urljoin for URL resolution"
+
+
+if __name__ == "__main__":
+    for check in (test_handle_tag_has_base_tag_support, test_base_tag_code_structure):
+        check()
+    print("All html2text base tag tests passed!")