From 257b51529cab7723f40636b7a825e2f80ee285df Mon Sep 17 00:00:00 2001 From: Sid0004 Date: Tue, 28 Oct 2025 13:42:54 +0530 Subject: [PATCH 1/2] tests: add comprehensive unit tests for linktree module (fixes #300) --- run_tests.py | 16 ++ tests/conftest.py | 35 ++++ tests/test_linktree_extra.py | 158 +++++++++++++++ tests/test_linktree_tree.py | 363 +++++++++++++++++++++++++++++++++++ 4 files changed, 572 insertions(+) create mode 100644 run_tests.py create mode 100644 tests/conftest.py create mode 100644 tests/test_linktree_extra.py create mode 100644 tests/test_linktree_tree.py diff --git a/run_tests.py b/run_tests.py new file mode 100644 index 00000000..1dcd373f --- /dev/null +++ b/run_tests.py @@ -0,0 +1,16 @@ +r"""Helper to run pytest with the `src` layout on systems where the package isn't installed. + +Usage: + .venv\Scripts\python.exe run_tests.py -q +""" +import sys + +# Ensure the src directory is first on sys.path so `import torbot` works +sys.path.insert(0, "src") + +import pytest + + +if __name__ == "__main__": + args = sys.argv[1:] or ["-q"] + raise SystemExit(pytest.main(args)) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..328483d7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,35 @@ +"""pytest configuration and test-time shims for TorBot tests. + +This module provides a lightweight stub for the NLP classifier so tests can +run without installing the full scientific stack (numpy, scikit-learn, etc.). +The stub is only active during test runs and does not affect production code. +""" +import sys +import types + + +def _install_nlp_stub(): + """Install a minimal stub for torbot.modules.nlp.main during tests.""" + mod_name = "torbot.modules.nlp.main" + if mod_name in sys.modules: + return + + # Create a minimal module with classify function + stub = types.ModuleType(mod_name) + + def classify(data): + """Lightweight test-only classifier. + + Returns a deterministic classification without requiring ML libraries. + Real implementation uses sklearn pipeline with training data. + """ + _ = data # unused in stub + return ["unknown", 0.0] + + # Use setattr to avoid linter complaints about dynamic attributes + setattr(stub, "classify", classify) + sys.modules[mod_name] = stub + + +# Install stub before any test imports +_install_nlp_stub() diff --git a/tests/test_linktree_extra.py b/tests/test_linktree_extra.py new file mode 100644 index 00000000..3252b55a --- /dev/null +++ b/tests/test_linktree_extra.py @@ -0,0 +1,158 @@ +"""Additional edge-case tests for linktree parsing functions. + +These tests cover corner cases and error conditions for the parsing helpers. 
+"""
+from bs4 import BeautifulSoup
+import pytest
+
+from torbot.modules.linktree import (
+    parse_hostname,
+    parse_links,
+    parse_emails,
+    parse_phone_numbers,
+)
+
+
+def test_parse_hostname_raises_on_invalid_url() -> None:
+    """Ensure parse_hostname raises exception for URLs without hostname."""
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("not-a-valid-url")
+
+
+def test_parse_hostname_handles_various_schemes() -> None:
+    """Verify parse_hostname works with http, https, and onion domains."""
+    assert parse_hostname("https://www.example.com/path") == "www.example.com"
+    assert parse_hostname("http://test.onion") == "test.onion"
+    assert parse_hostname("https://sub.domain.co.uk:8080/") == "sub.domain.co.uk"
+
+
+def test_parse_links_filters_only_valid_full_urls() -> None:
+    """Ensure parse_links returns only absolute http(s) URLs."""
+    html = """
+    <html><body>
+    <a href="/relative/page">relative</a>
+    <a href="https://valid.example/path">valid</a>
+    <a href="http://also.valid.test/">valid2</a>
+    <a href="javascript:void(0)">js</a>
+    <a href="https://valid.example/path">valid-duplicate</a>
+    </body></html>
+    """
+
+    links = parse_links(html)
+    # only absolute http(s) URLs should be returned, duplicates preserved
+    assert links == [
+        "https://valid.example/path",
+        "http://also.valid.test/",
+        "https://valid.example/path",
+    ]
+
+
+def test_parse_links_empty_html() -> None:
+    """Test parse_links with HTML containing no anchor tags."""
+    html = "<html><body><p>No links here</p></body></html>"
+    links = parse_links(html)
+    assert links == []
+
+
+def test_parse_links_anchor_without_href() -> None:
+    """Ensure parse_links handles anchor tags without href attribute."""
+    html = """
+    <html><body>
+    <a>No href</a>
+    <a name="top">Named anchor</a>
+    <a href="https://valid.com">Valid</a>
+    </body></html>
+    """
+    links = parse_links(html)
+    assert links == ["https://valid.com"]
+
+
+def test_parse_emails_ignores_invalid_and_returns_unique() -> None:
+    """Verify parse_emails filters invalid emails and removes duplicates."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="mailto:good@example.com">good</a>
+        <a href="mailto:good@example.com">good-dup</a>
+        <a href="mailto:not-an-email">bad</a>
+        <a href="mailto:good@example.com?subject=hello">withparams</a>
+        <a href="https://example.com">not-mailto</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+
+    emails = parse_emails(doc)
+    # duplicates removed, invalid emails rejected
+    # Note: current impl splits on 'mailto:' so params might be included
+    # We test actual behavior here
+    assert "good@example.com" in emails
+    assert len([e for e in emails if e == "good@example.com"]) == 1  # no duplicates
+
+
+def test_parse_emails_empty_page() -> None:
+    """Test parse_emails with no mailto links."""
+    doc = BeautifulSoup("<html><body><p>No emails</p></body></html>", "html.parser")
+    emails = parse_emails(doc)
+    assert emails == []
+
+
+def test_parse_emails_malformed_mailto() -> None:
+    """Ensure malformed mailto links are filtered out."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="mailto:">empty</a>
+        <a href="mailto:invalid">invalid</a>
+        <a href="mailto:valid@test.com">valid</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+    emails = parse_emails(doc)
+    # Only valid email should be extracted
+    assert emails == ["valid@test.com"]
+
+
+def test_parse_phone_numbers_only_accepts_possible_international_numbers() -> None:
+    """Verify parse_phone_numbers validates international format."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="tel:+14155552671">us</a>
+        <a href="tel:4155552671">no-plus</a>
+        <a href="tel:+442071838750">uk</a>
+        <a href="tel:12345">invalid</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+
+    numbers = parse_phone_numbers(doc)
+    # only the properly formatted international numbers (with +) are considered possible
+    assert sorted(numbers) == ["+14155552671", "+442071838750"]
+
+
+def test_parse_phone_numbers_empty_page() -> None:
+    """Test parse_phone_numbers with no tel: links."""
+    doc = BeautifulSoup("<html><body><p>No phones</p></body></html>", "html.parser")
+    numbers = parse_phone_numbers(doc)
+    assert numbers == []
+
+
+def test_parse_phone_numbers_removes_duplicates() -> None:
+    """Ensure duplicate phone numbers are deduplicated."""
+    doc = BeautifulSoup(
+        """
+        <html><body>
+        <a href="tel:+14155551234">call</a>
+        <a href="tel:+14155551234">call again</a>
+        <a href="tel:+14155559999">other</a>
+        </body></html>
+        """,
+        "html.parser",
+    )
+    numbers = parse_phone_numbers(doc)
+    assert len(numbers) == 2
+    assert "+14155551234" in numbers
+    assert "+14155559999" in numbers
diff --git a/tests/test_linktree_tree.py b/tests/test_linktree_tree.py
new file mode 100644
index 00000000..bcc0a129
--- /dev/null
+++ b/tests/test_linktree_tree.py
@@ -0,0 +1,363 @@
+tree = LinkTree("https://example.com", depth=0, client=client)  # type: ignore[arg-type]
+"""Comprehensive tests for LinkTree and LinkNode classes.
+
+These tests verify:
+- Creating nodes from HTTP responses (with/without title)
+- Building trees with depth limits
+- Duplicate node handling
+- Saving to JSON and text formats
+- Classification and contact extraction integration
+"""
+import json
+import tempfile
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+from bs4 import BeautifulSoup
+
+from torbot.modules.linktree import LinkTree, LinkNode, parse_hostname
+
+
+class FakeResponse:
+    """Mock HTTP response for testing."""
+
+    def __init__(self, url: str, text: str, status_code: int = 200):
+        self.url = url
+        self.text = text
+        self.status_code = status_code
+
+
+class FakeClient:
+    """Mock httpx.Client that returns pre-configured responses."""
+
+    def __init__(self, responses: dict[str, FakeResponse]):
+        self.responses = responses
+        self.call_count = {}
+
+    def get(self, url: str) -> FakeResponse:
+        self.call_count[url] = self.call_count.get(url, 0) + 1
+        if url in self.responses:
+            return self.responses[url]
+        # Return a minimal response for unknown URLs
+        return FakeResponse(url, "404", 404)
+
+
+def test_parse_hostname_extracts_domain():
+    """Verify parse_hostname extracts hostname from various URL formats."""
+    assert parse_hostname("https://example.com/path") == "example.com"
+    assert parse_hostname("http://sub.domain.com:8080/") == "sub.domain.com"
+    assert parse_hostname("https://test.onion") == "test.onion"
+
+
+def test_parse_hostname_raises_on_invalid():
+    """Ensure parse_hostname raises when URL has no hostname."""
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("not-a-url")
+
+    with pytest.raises(Exception, match="unable to parse hostname"):
+        parse_hostname("file:///local/path")
+
+
+def test_linknode_initialization():
+    """Verify LinkNode stores all required fields correctly."""
+    node = LinkNode(
+        title="Test Page",
+        url="https://example.com",
+        status=200,
+        classification="blog",
+        accuracy=0.85,
+        numbers=["+14155551234"],
+        emails=["test@example.com"],
+    )
+
+    assert node.tag == "Test Page"
+    assert node.identifier == "https://example.com"
+    assert node.status == 200
+    assert node.classification == "blog"
+    assert node.accuracy == 0.85
+    assert node.numbers == ["+14155551234"]
+    assert node.emails == ["test@example.com"]
+
+
+def test_linktree_creates_root_node_with_title():
+    """Test that LinkTree creates a root node using the page title."""
+    html = """
+    <html>
+    <head><title>Example Site</title></head>
+    <body>
+    <a href="https://example.com/page1">Page 1</a>
+    </body>
+    </html>
+    """
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com")
+    assert root is not None
+    assert root.tag == "Example Site"
+    assert root.data.status == 200
+
+
+def test_linktree_creates_root_node_without_title():
+    """Test that LinkTree falls back to hostname when there is no <title> tag."""
+    html = "<html><body><p>No title here</p></body></html>"
+
+    client = FakeClient({
+        "https://test.onion": FakeResponse("https://test.onion", html, 200),
+    })
+
+    tree = LinkTree("https://test.onion", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://test.onion")
+    assert root is not None
+    assert root.tag == "test.onion"
+
+
+def test_linktree_extracts_contacts_from_page():
+    """Verify LinkTree extracts emails and phone numbers from page."""
+    html = """
+    <html>
+    <head><title>Contact Page</title></head>
+    <body>
+    <a href="mailto:info@example.com">Email us</a>
+    <a href="tel:+14155551234">Call us</a>
+    <a href="mailto:support@example.com">Support</a>
+    </body>
+    </html>
+    """
+
+    client = FakeClient({
+        "https://example.com/contact": FakeResponse(
+            "https://example.com/contact", html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com/contact", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com/contact")
+    assert root is not None
+    assert len(root.data.emails) == 2
+    assert "info@example.com" in root.data.emails
+    assert "support@example.com" in root.data.emails
+    assert "+14155551234" in root.data.numbers
+
+
+def test_linktree_builds_tree_to_depth_1():
+    """Test that LinkTree builds children up to specified depth."""
+    root_html = """
+    <html>
+    <head><title>Root</title></head>
+    <body>
+    <a href="https://example.com/child1">Child 1</a>
+    <a href="https://example.com/child2">Child 2</a>
+    </body>
+    </html>
+    """
+
+    child1_html = "<html><head><title>Child 1</title></head><body></body></html>"
+    child2_html = "<html><head><title>Child 2</title></head><body></body></html>"
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", root_html, 200),
+        "https://example.com/child1": FakeResponse(
+            "https://example.com/child1", child1_html, 200
+        ),
+        "https://example.com/child2": FakeResponse(
+            "https://example.com/child2", child2_html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Verify root exists
+    root = tree.get_node("https://example.com")
+    assert root is not None
+
+    # Verify children exist
+    child1 = tree.get_node("https://example.com/child1")
+    child2 = tree.get_node("https://example.com/child2")
+    assert child1 is not None
+    assert child2 is not None
+    assert child1.tag == "Child 1"
+    assert child2.tag == "Child 2"
+
+    # Verify tree structure
+    assert tree.parent("https://example.com/child1").identifier == "https://example.com"
+    assert tree.parent("https://example.com/child2").identifier == "https://example.com"
+
+
+def test_linktree_respects_depth_limit():
+    """Ensure LinkTree stops recursion at the specified depth."""
+    root_html = '<html><head><title>Root</title></head><body><a href="https://example.com/level1">L1</a></body></html>'
+    level1_html = '<html><head><title>Level 1</title></head><body><a href="https://example.com/level2">L2</a></body></html>'
+    level2_html = '<html><head><title>Level 2</title></head><body><a href="https://example.com/level3">L3</a></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", root_html, 200),
+        "https://example.com/level1": FakeResponse(
+            "https://example.com/level1", level1_html, 200
+        ),
+        "https://example.com/level2": FakeResponse(
+            "https://example.com/level2", level2_html, 200
+        ),
+        "https://example.com/level3": FakeResponse(
+            "https://example.com/level3",
+            '<html><head><title>Level 3</title></head><body></body></html>',
+            200,
+        ),
+    })
+
+    # Build tree with depth=2 (root + 2 levels)
+    tree = LinkTree("https://example.com", depth=2, client=client)
+    tree.load()
+
+    # Root, level1, and level2 should exist
+    assert tree.get_node("https://example.com") is not None
+    assert tree.get_node("https://example.com/level1") is not None
+    assert tree.get_node("https://example.com/level2") is not None
+
+    # Level3 should NOT exist (depth limit)
+    assert tree.get_node("https://example.com/level3") is None
+
+
+def test_linktree_handles_duplicate_links():
+    """Verify duplicate URLs don't cause errors or duplicate nodes."""
+    html_with_dup = """
+    <html>
+    <head><title>Page with duplicates</title></head><body>
+    <a href="https://example.com/page">Link 1</a>
+    <a href="https://example.com/page">Link 2 (same URL)</a>
+    <a href="https://example.com/other">Other</a>
+    </body></html>
+    """
+
+    page_html = "<html><head><title>Target Page</title></head><body></body></html>"
+    other_html = "<html><head><title>Other Page</title></head><body></body></html>"
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html_with_dup, 200),
+        "https://example.com/page": FakeResponse(
+            "https://example.com/page", page_html, 200
+        ),
+        "https://example.com/other": FakeResponse(
+            "https://example.com/other", other_html, 200
+        ),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Should have 3 nodes total: root + 2 unique children
+    all_nodes = tree.all_nodes()
+    assert len(all_nodes) == 3
+
+    # Duplicate should have been attempted once (first add), then skipped
+    assert tree.get_node("https://example.com/page") is not None
+    assert tree.get_node("https://example.com/other") is not None
+
+
+def test_linktree_save_json_creates_file():
+    """Test saveJSON writes a valid JSON file with tree structure."""
+    html = '<html><head><title>Test JSON Save</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Patch project_root_directory to use temp dir
+        with patch("torbot.modules.linktree.project_root_directory", tmpdir):
+            tree.saveJSON()
+
+        # Check that JSON file was created
+        json_files = list(Path(tmpdir).glob("*.json"))
+        assert len(json_files) == 1
+
+        # Verify JSON file is not empty and is valid JSON
+        with open(json_files[0]) as f:
+            content = f.read()
+            assert len(content) > 0
+            # The file should contain valid JSON (even if simple string from treelib)
+            data = json.loads(content)
+            # Verify file was created successfully (content is non-empty valid JSON)
+            assert data is not None
+
+
+def test_linktree_save_text_creates_file():
+    """Test save creates a text file representation of the tree."""
+    html = '<html><head><title>Test Text Save</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=0, client=client)
+    tree.load()
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with patch("torbot.modules.linktree.project_root_directory", tmpdir):
+            tree.save()
+
+        # Check that text file was created
+        txt_files = list(Path(tmpdir).glob("*.txt"))
+        assert len(txt_files) == 1
+
+        # Verify file is not empty
+        assert txt_files[0].stat().st_size > 0
+
+
+def test_linktree_handles_non_200_status():
+    """Verify LinkTree records non-200 status codes correctly."""
+    html = '<html><head><title>Not Found</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com/missing": FakeResponse(
+            "https://example.com/missing", html, 404
+        ),
+    })
+
+    tree = LinkTree("https://example.com/missing", depth=0, client=client)
+    tree.load()
+
+    root = tree.get_node("https://example.com/missing")
+    assert root is not None
+    assert root.data.status == 404
+
+
+def test_linktree_filters_invalid_links():
+    """Ensure only valid absolute URLs are added as children."""
+    html = """
+    <html>
+    <head><title>Root</title></head><body>
+    <a href="https://valid.com">Valid</a>
+    <a href="/relative">Relative</a>
+    <a href="javascript:void(0)">JS</a>
+    <a href="mailto:test@example.com">Email</a>
+    <a href="#section">Fragment</a>
+    </body></html>
+    """
+
+    valid_html = '<html><head><title>Valid</title></head><body></body></html>'
+
+    client = FakeClient({
+        "https://example.com": FakeResponse("https://example.com", html, 200),
+        "https://valid.com": FakeResponse("https://valid.com", valid_html, 200),
+    })
+
+    tree = LinkTree("https://example.com", depth=1, client=client)
+    tree.load()
+
+    # Should have 2 nodes: root + 1 valid child
+    all_nodes = tree.all_nodes()
+    assert len(all_nodes) == 2
+    assert tree.get_node("https://valid.com") is not None

From 792f9b6699484a206907a4c12e832891efe8dcee Mon Sep 17 00:00:00 2001
From: Sid0004
Date: Wed, 29 Oct 2025 09:50:31 +0530
Subject: [PATCH 2/2] Fix NameError in test_linktree_tree.py - Remove errant code from module level

---
 tests/test_linktree_tree.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_linktree_tree.py b/tests/test_linktree_tree.py
index bcc0a129..cd64a8ea 100644
--- a/tests/test_linktree_tree.py
+++ b/tests/test_linktree_tree.py
@@ -1,4 +1,3 @@
-tree = LinkTree("https://example.com", depth=0, client=client)  # type: ignore[arg-type]
 """Comprehensive tests for LinkTree and LinkNode classes.
 
 These tests verify:
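
Note (not part of either patch above): the conftest shim registers its stub in sys.modules under the real module name, so anything imported from torbot.modules.nlp.main during a test run resolves to the deterministic classifier. A minimal sketch of that behaviour, assuming the stub has already been installed (for example, by pytest collecting tests/conftest.py):

    import sys
    import types

    # conftest._install_nlp_stub() registers the stub under this exact name.
    stub = sys.modules.get("torbot.modules.nlp.main")
    assert isinstance(stub, types.ModuleType)

    # The stubbed classifier always returns the same pair, which keeps any
    # classification recorded on test nodes deterministic.
    assert stub.classify("<html>any markup</html>") == ["unknown", 0.0]

The suite itself can be run with `python run_tests.py -q` from the repository root; run_tests.py prepends `src` to sys.path before handing its arguments to pytest.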