From 257b51529cab7723f40636b7a825e2f80ee285df Mon Sep 17 00:00:00 2001 From: Sid0004 Date: Tue, 28 Oct 2025 13:42:54 +0530 Subject: [PATCH 1/2] tests: add comprehensive unit tests for linktree module (fixes #300) --- run_tests.py | 16 ++ tests/conftest.py | 35 ++++ tests/test_linktree_extra.py | 158 +++++++++++++++ tests/test_linktree_tree.py | 363 +++++++++++++++++++++++++++++++++++ 4 files changed, 572 insertions(+) create mode 100644 run_tests.py create mode 100644 tests/conftest.py create mode 100644 tests/test_linktree_extra.py create mode 100644 tests/test_linktree_tree.py diff --git a/run_tests.py b/run_tests.py new file mode 100644 index 00000000..1dcd373f --- /dev/null +++ b/run_tests.py @@ -0,0 +1,16 @@ +r"""Helper to run pytest with the `src` layout on systems where the package isn't installed. + +Usage: + .venv\Scripts\python.exe run_tests.py -q +""" +import sys + +# Ensure the src directory is first on sys.path so `import torbot` works +sys.path.insert(0, "src") + +import pytest + + +if __name__ == "__main__": + args = sys.argv[1:] or ["-q"] + raise SystemExit(pytest.main(args)) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..328483d7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,35 @@ +"""pytest configuration and test-time shims for TorBot tests. + +This module provides a lightweight stub for the NLP classifier so tests can +run without installing the full scientific stack (numpy, scikit-learn, etc.). +The stub is only active during test runs and does not affect production code. +""" +import sys +import types + + +def _install_nlp_stub(): + """Install a minimal stub for torbot.modules.nlp.main during tests.""" + mod_name = "torbot.modules.nlp.main" + if mod_name in sys.modules: + return + + # Create a minimal module with classify function + stub = types.ModuleType(mod_name) + + def classify(data): + """Lightweight test-only classifier. + + Returns a deterministic classification without requiring ML libraries. + Real implementation uses sklearn pipeline with training data. + """ + _ = data # unused in stub + return ["unknown", 0.0] + + # Use setattr to avoid linter complaints about dynamic attributes + setattr(stub, "classify", classify) + sys.modules[mod_name] = stub + + +# Install stub before any test imports +_install_nlp_stub() diff --git a/tests/test_linktree_extra.py b/tests/test_linktree_extra.py new file mode 100644 index 00000000..3252b55a --- /dev/null +++ b/tests/test_linktree_extra.py @@ -0,0 +1,158 @@ +"""Additional edge-case tests for linktree parsing functions. + +These tests cover corner cases and error conditions for the parsing helpers. 
+"""
+from bs4 import BeautifulSoup
+import pytest
+
+from torbot.modules.linktree import (
+    parse_hostname,
+    parse_links,
+    parse_emails,
+    parse_phone_numbers,
+)
+
+
+def test_parse_hostname_raises_on_invalid_url() -> None:
+    """Ensure parse_hostname raises exception for URLs without hostname."""
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("not-a-valid-url")
+
+
+def test_parse_hostname_handles_various_schemes() -> None:
+    """Verify parse_hostname works with http, https, and onion domains."""
+    assert parse_hostname("https://www.example.com/path") == "www.example.com"
+    assert parse_hostname("http://test.onion") == "test.onion"
+    assert parse_hostname("https://sub.domain.co.uk:8080/") == "sub.domain.co.uk"
+
+
+def test_parse_links_filters_only_valid_full_urls() -> None:
+    """Ensure parse_links returns only absolute http(s) URLs."""
+    html = """
+    <html><body>
+    <a href="/relative/page">relative</a>
+    <a href="https://valid.example/path">valid</a>
+    <a href="http://also.valid.test/">valid2</a>
+    <a href="javascript:void(0)">js</a>
+    <a href="https://valid.example/path">valid-duplicate</a>
+    </body></html>
+    """
+
+    links = parse_links(html)
+    # only absolute http(s) URLs should be returned, duplicates preserved
+    assert links == [
+        "https://valid.example/path",
+        "http://also.valid.test/",
+        "https://valid.example/path",
+    ]
+
+
+def test_parse_links_empty_html() -> None:
+    """Test parse_links with HTML containing no anchor tags."""
+    html = "<html><body><p>No links here</p></body></html>"
+    links = parse_links(html)
+    assert links == []
+
+
+def test_parse_links_anchor_without_href() -> None:
+    """Ensure parse_links handles anchor tags without href attribute."""
+    html = """
+    <html><body>
+    <a>No href</a>
+    <a name="top">Named anchor</a>
+    <a href="https://valid.com">Valid</a>
+    </body></html>
+    """
+    links = parse_links(html)
+    assert links == ["https://valid.com"]
+
+
+def test_parse_emails_ignores_invalid_and_returns_unique() -> None:
+    """Verify parse_emails filters invalid emails and removes duplicates."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="mailto:good@example.com">good</a>
+        <a href="mailto:good@example.com">good-dup</a>
+        <a href="mailto:not-an-email">bad</a>
+        <a href="mailto:good@example.com?subject=hello">withparams</a>
+        <a href="https://example.com">not-mailto</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+
+    emails = parse_emails(doc)
+    # duplicates removed, invalid emails rejected
+    # Note: current impl splits on 'mailto:' so params might be included
+    # We test actual behavior here
+    assert "good@example.com" in emails
+    assert len([e for e in emails if e == "good@example.com"]) == 1  # no duplicates
+
+
+def test_parse_emails_empty_page() -> None:
+    """Test parse_emails with no mailto links."""
+    doc = BeautifulSoup("<html><body><p>No emails</p></body></html>", "html.parser")
+    emails = parse_emails(doc)
+    assert emails == []
+
+
+def test_parse_emails_malformed_mailto() -> None:
+    """Ensure malformed mailto links are filtered out."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="mailto:">empty</a>
+        <a href="mailto:invalid">invalid</a>
+        <a href="mailto:valid@test.com">valid</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+    emails = parse_emails(doc)
+    # Only valid email should be extracted
+    assert emails == ["valid@test.com"]
+
+
+def test_parse_phone_numbers_only_accepts_possible_international_numbers() -> None:
+    """Verify parse_phone_numbers validates international format."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="tel:+14155552671">us</a>
+        <a href="tel:4155552671">no-plus</a>
+        <a href="tel:+442071838750">uk</a>
+        <a href="tel:12345">invalid</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+
+    numbers = parse_phone_numbers(doc)
+    # only the properly formatted international numbers (with +) are considered possible
+    assert sorted(numbers) == ["+14155552671", "+442071838750"]
+
+
+def test_parse_phone_numbers_empty_page() -> None:
+    """Test parse_phone_numbers with no tel: links."""
+    doc = BeautifulSoup("<html><body><p>No phones</p></body></html>", "html.parser")
+    numbers = parse_phone_numbers(doc)
+    assert numbers == []
+
+
+def test_parse_phone_numbers_removes_duplicates() -> None:
+    """Ensure duplicate phone numbers are deduplicated."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="tel:+14155551234">call</a>
+        <a href="tel:+14155551234">call again</a>
+        <a href="tel:+14155559999">other</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+    numbers = parse_phone_numbers(doc)
+    assert len(numbers) == 2
+    assert "+14155551234" in numbers
+    assert "+14155559999" in numbers
diff --git a/tests/test_linktree_tree.py b/tests/test_linktree_tree.py
new file mode 100644
index 00000000..bcc0a129
--- /dev/null
+++ b/tests/test_linktree_tree.py
@@ -0,0 +1,363 @@
+tree = LinkTree("https://example.com", depth=0, client=client)  # type: ignore[arg-type]
+"""Comprehensive tests for LinkTree and LinkNode classes.
+
+These tests verify:
+- Creating nodes from HTTP responses (with/without title)
+- Building trees with depth limits
+- Duplicate node handling
+- Saving to JSON and text formats
+- Classification and contact extraction integration
+"""
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+from bs4 import BeautifulSoup
+
+from torbot.modules.linktree import LinkTree, LinkNode, parse_hostname
+
+
+class FakeResponse:
+    """Mock HTTP response for testing."""
+
+    def __init__(self, url: str, text: str, status_code: int = 200):
+        self.url = url
+        self.text = text
+        self.status_code = status_code
+
+
+class FakeClient:
+    """Mock httpx.Client that returns pre-configured responses."""
+
+    def __init__(self, responses: dict[str, FakeResponse]):
+        self.responses = responses
+        self.call_count = {}
+
+    def get(self, url: str) -> FakeResponse:
+        self.call_count[url] = self.call_count.get(url, 0) + 1
+        if url in self.responses:
+            return self.responses[url]
+        # Return a minimal response for unknown URLs
+        return FakeResponse(url, "404", 404)
+
+
+def test_parse_hostname_extracts_domain():
+    """Verify parse_hostname extracts hostname from various URL formats."""
+    assert parse_hostname("https://example.com/path") == "example.com"
+    assert parse_hostname("http://sub.domain.com:8080/") == "sub.domain.com"
+    assert parse_hostname("https://test.onion") == "test.onion"
+
+
+def test_parse_hostname_raises_on_invalid():
+    """Ensure parse_hostname raises when URL has no hostname."""
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("not-a-url")
+
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("file:///local/path")
+
+
+def test_linknode_initialization():
+    """Verify LinkNode stores all required fields correctly."""
+    node = LinkNode(
+        title="Test Page",
+        url="https://example.com",
+        status=200,
+        classification="blog",
+        accuracy=0.85,
+        numbers=["+14155551234"],
+        emails=["test@example.com"],
+    )
+
+    assert node.tag == "Test Page"
+    assert node.identifier == "https://example.com"
+    assert node.status == 200
+    assert node.classification == "blog"
+    assert node.accuracy == 0.85
+    assert node.numbers == ["+14155551234"]
+    assert node.emails == ["test@example.com"]
+
+
+def test_linktree_creates_root_node_with_title():
+    """Test that LinkTree creates a root node using the page title."""
+    html = """
+    <html>
+    <head><title>Example Site</title></head>
+    <body>
+    <a href="https://example.com/page1">Page 1</a>
+    </body>
+    </html>
+    """
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com")
+    assert root is not None
+    assert root.tag == "Example Site"
+    assert root.data.status == 200
+
+
+def test_linktree_creates_root_node_without_title():
+    """Test that LinkTree falls back to hostname when there is no <title> tag."""
+    html = "<html><body><p>No title here</p></body></html>"
+
+    client = FakeClient({
+        "https://test.onion": FakeResponse("https://test.onion", html, 200),
+    })
+
+    tree = LinkTree("https://test.onion", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://test.onion")
+    assert root is not None
+    assert root.tag == "test.onion"
+
+
+def test_linktree_extracts_contacts_from_page():
+    """Verify LinkTree extracts emails and phone numbers from page."""
+    html = """
+    <html>
+    <head><title>Contact Page</title></head>
+    <body>
+    <a href="mailto:info@example.com">Email us</a>
+    <a href="tel:+14155551234">Call us</a>
+    <a href="mailto:support@example.com">Support</a>
+    </body>
+    </html>
+    """
+
+    client = FakeClient({
+        "https://example.com/contact": FakeResponse(
+            "https://example.com/contact", html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com/contact", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com/contact")
+    assert root is not None
+    assert len(root.data.emails) == 2
+    assert "info@example.com" in root.data.emails
+    assert "support@example.com" in root.data.emails
+    assert "+14155551234" in root.data.numbers
+
+
+def test_linktree_builds_tree_to_depth_1():
+    """Test that LinkTree builds children up to specified depth."""
+    root_html = """
+    <html>
+    <head><title>Root</title></head>
+    <body>
+    <a href="https://example.com/child1">Child 1</a>
+    <a href="https://example.com/child2">Child 2</a>
+    </body>
+    </html>
+    """
+
+    child1_html = "<html><head><title>Child 1</title></head><body></body></html>"
+    child2_html = "<html><head><title>Child 2</title></head><body></body></html>"
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", root_html, 200),
+        "https://example.com/child1": FakeResponse(
+            "https://example.com/child1", child1_html, 200
+        ),
+        "https://example.com/child2": FakeResponse(
+            "https://example.com/child2", child2_html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Verify root exists
+    root = tree.get_node("https://example.com")
+    assert root is not None
+
+    # Verify children exist
+    child1 = tree.get_node("https://example.com/child1")
+    child2 = tree.get_node("https://example.com/child2")
+    assert child1 is not None
+    assert child2 is not None
+    assert child1.tag == "Child 1"
+    assert child2.tag == "Child 2"
+
+    # Verify tree structure
+    assert tree.parent("https://example.com/child1").identifier == "https://example.com"
+    assert tree.parent("https://example.com/child2").identifier == "https://example.com"
+
+
+def test_linktree_respects_depth_limit():
+    """Ensure LinkTree stops recursion at the specified depth."""
+    root_html = '<html><head><title>Root</title></head><body><a href="https://example.com/level1">L1</a></body></html>'
+    level1_html = '<html><head><title>Level 1</title></head><body><a href="https://example.com/level2">L2</a></body></html>'
+    level2_html = '<html><head><title>Level 2</title></head><body><a href="https://example.com/level3">L3</a></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", root_html, 200),
+        "https://example.com/level1": FakeResponse(
+            "https://example.com/level1", level1_html, 200
+        ),
+        "https://example.com/level2": FakeResponse(
+            "https://example.com/level2", level2_html, 200
+        ),
+        "https://example.com/level3": FakeResponse(
+            "https://example.com/level3",
+            '<html><head><title>Level 3</title></head><body></body></html>',
+            200,
+        ),
+    })
+
+    # Build tree with depth=2 (root + 2 levels)
+    tree = LinkTree("https://example.com", depth=2, client=client)
+    tree.load()
+
+    # Root, level1, and level2 should exist
+    assert tree.get_node("https://example.com") is not None
+    assert tree.get_node("https://example.com/level1") is not None
+    assert tree.get_node("https://example.com/level2") is not None
+
+    # Level3 should NOT exist (depth limit)
+    assert tree.get_node("https://example.com/level3") is None
+
+
+def test_linktree_handles_duplicate_links():
+    """Verify duplicate URLs don't cause errors or duplicate nodes."""
+    html_with_dup = """
+    <html>
+    <head><title>Page with duplicates</title></head><body>
+    <a href="https://example.com/page">Link 1</a>
+    <a href="https://example.com/page">Link 2 (same URL)</a>
+    <a href="https://example.com/other">Other</a>
+    </body></html>
+    """
+
+    page_html = "<html><head><title>Target Page</title></head><body></body></html>"
+    other_html = "<html><head><title>Other Page</title></head><body></body></html>"
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html_with_dup, 200),
+        "https://example.com/page": FakeResponse(
+            "https://example.com/page", page_html, 200
+        ),
+        "https://example.com/other": FakeResponse(
+            "https://example.com/other", other_html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Should have 3 nodes total: root + 2 unique children
+    all_nodes = tree.all_nodes()
+    assert len(all_nodes) == 3
+
+    # Duplicate should have been attempted once (first add), then skipped
+    assert tree.get_node("https://example.com/page") is not None
+    assert tree.get_node("https://example.com/other") is not None
+
+
+def test_linktree_save_json_creates_file():
+    """Test saveJSON writes a valid JSON file with tree structure."""
+    html = '<html><head><title>Test JSON Save</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Patch project_root_directory to use temp dir
+        with patch("torbot.modules.linktree.project_root_directory", tmpdir):
+            tree.saveJSON()
+
+        # Check that JSON file was created
+        json_files = list(Path(tmpdir).glob("*.json"))
+        assert len(json_files) == 1
+
+        # Verify JSON file is not empty and is valid JSON
+        with open(json_files[0]) as f:
+            content = f.read()
+            assert len(content) > 0
+            # The file should contain valid JSON (even if simple string from treelib)
+            data = json.loads(content)
+            # Verify file was created successfully (content is non-empty valid JSON)
+            assert data is not None
+
+
+def test_linktree_save_text_creates_file():
+    """Test save creates a text file representation of the tree."""
+    html = '<html><head><title>Test Text Save</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with patch("torbot.modules.linktree.project_root_directory", tmpdir):
+            tree.save()
+
+        # Check that text file was created
+        txt_files = list(Path(tmpdir).glob("*.txt"))
+        assert len(txt_files) == 1
+
+        # Verify file is not empty
+        assert txt_files[0].stat().st_size > 0
+
+
+def test_linktree_handles_non_200_status():
+    """Verify LinkTree records non-200 status codes correctly."""
+    html = '<html><head><title>Not Found</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com/missing": FakeResponse(
+            "https://example.com/missing", html, 404
+        ),
+    })
+
+    tree = LinkTree("https://example.com/missing", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com/missing")
+    assert root is not None
+    assert root.data.status == 404
+
+
+def test_linktree_filters_invalid_links():
+    """Ensure only valid absolute URLs are added as children."""
+    html = """
+    <html>
+    <head><title>Root</title></head><body>
+    <a href="https://valid.com">Valid</a>
+    <a href="/relative">Relative</a>
+    <a href="javascript:void(0)">JS</a>
+    <a href="mailto:test@example.com">Email</a>
+    <a href="#section">Fragment</a>
+    </body></html>
+    """
+
+    valid_html = '<html><head><title>Valid</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+        "https://valid.com": FakeResponse("https://valid.com", valid_html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Should have 2 nodes: root + 1 valid child
+    all_nodes = tree.all_nodes()
+    assert len(all_nodes) == 2
+    assert tree.get_node("https://valid.com") is not None

From 792f9b6699484a206907a4c12e832891efe8dcee Mon Sep 17 00:00:00 2001
From: Sid0004
Date: Wed, 29 Oct 2025 09:50:31 +0530
Subject: [PATCH 2/2] Fix NameError in test_linktree_tree.py - Remove errant code from module level

---
 tests/test_linktree_tree.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_linktree_tree.py b/tests/test_linktree_tree.py
index bcc0a129..cd64a8ea 100644
--- a/tests/test_linktree_tree.py
+++ b/tests/test_linktree_tree.py
@@ -1,4 +1,3 @@
-tree = LinkTree("https://example.com", depth=0, client=client)  # type: ignore[arg-type]
 """Comprehensive tests for LinkTree and LinkNode classes.
 
 These tests verify:
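
Note (not part of either patch above): the conftest shim registers its stub in sys.modules under the real module name, so anything imported from torbot.modules.nlp.main during a test run resolves to the deterministic classifier. A minimal sketch of that behaviour, assuming the stub has already been installed (for example, by pytest collecting tests/conftest.py):

    import sys
    import types

    # conftest._install_nlp_stub() registers the stub under this exact name.
    stub = sys.modules.get("torbot.modules.nlp.main")
    assert isinstance(stub, types.ModuleType)

    # The stubbed classifier always returns the same pair, which keeps any
    # classification recorded on test nodes deterministic.
    assert stub.classify("<html>any markup</html>") == ["unknown", 0.0]

The suite itself can be run with `python run_tests.py -q` from the repository root; run_tests.py prepends `src` to sys.path before handing its arguments to pytest.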