|
1 | 1 | import { Readability } from '@mozilla/readability'; |
2 | 2 | import { JSDOM } from 'jsdom'; |
3 | 3 | import { Page } from 'playwright'; |
| 4 | +import { ToolContext } from '../../../core/types.js'; |
4 | 5 |
|
/**
 * Maximum length of the string returned by `filterPageContent` before it is
 * cut and suffixed with a truncation marker.
 *
 * The value is 11 * 1024 = 11264. It is compared against `String.length`, so
 * it counts UTF-16 code units, not bytes. (An earlier comment described this
 * as a "10KB limit", which did not match the value.)
 */
const OUTPUT_LIMIT = 11 * 1024;
6 | 7 |
|
/**
 * Returns the raw HTML content of the page without any processing.
 *
 * @param page - Page whose current document is serialized.
 * @returns Whatever `page.content()` resolves to, unchanged.
 */
async function getRawDOM(page: Page): Promise<string> {
  // No intermediate variable needed: the promise is returned as-is and any
  // rejection propagates to the caller.
  return page.content();
}
13 | 15 |
|
/** System prompt instructing the LLM to reduce a page to its main content as markdown. */
const CONTENT_EXTRACTION_SYSTEM_PROMPT = `You are an expert at extracting the main content from web pages.
Given the HTML content of a webpage, extract only the main informative content.
Format the extracted content as clean, well-structured markdown.
Ignore headers, footers, navigation, sidebars, ads, and other non-content elements.
Preserve the important headings, paragraphs, lists, and other content structures.
Do not include any explanations or descriptions about what you're doing.
Just return the extracted content as markdown.`;

/**
 * Uses an LLM to extract the main content from a page and format it as markdown.
 *
 * Falls back to the page's raw HTML when:
 * - `context` has no `provider` or `model`,
 * - the LLM reply is empty or whitespace-only, or
 * - anything throws while fetching the page or calling the LLM.
 *
 * @param page - Page to read the HTML and URL from.
 * @param context - Supplies the LLM settings (`provider`, `model`, `apiKey`,
 *   `baseUrl`) and the `logger` used for warnings, errors and token usage.
 * @returns Markdown produced by the LLM, or raw HTML on fallback.
 */
async function getSmartMarkdownContent(page: Page, context: ToolContext): Promise<string> {
  const { provider, model, apiKey, baseUrl } = context;

  // Check the configuration first so the page is not serialized twice when
  // no LLM is available.
  if (!provider || !model) {
    context.logger.warn('LLM provider or model not available, falling back to raw DOM');
    return getRawDOM(page);
  }

  // Kept outside the try block so every fallback path can reuse HTML that was
  // already fetched instead of asking the page for it again.
  let html: string | undefined;

  try {
    html = await page.content();
    const url = page.url();

    // Loaded lazily, as in the original code.
    // NOTE(review): presumably this avoids a load-time dependency on the
    // provider module — confirm before converting to a static import.
    const { createProvider } = await import('../../../core/llm/provider.js');
    const llmProvider = createProvider(provider, model, { apiKey, baseUrl });

    // NOTE(review): the full, unbounded HTML is sent to the model. Large pages
    // may exceed the model's input limit, and the HTML is untrusted text that
    // could try to override the system prompt. Consider bounding/sanitizing it.
    const response = await llmProvider.generateText({
      messages: [
        { role: 'system', content: CONTENT_EXTRACTION_SYSTEM_PROMPT },
        { role: 'user', content: `URL: ${url}\n\nHTML content:\n${html}` },
      ],
      temperature: 0.3,
      maxTokens: 4000,
    });

    // Log usage before inspecting the text: the tokens were spent even if the
    // reply turns out to be unusable.
    context.logger.debug(
      `Token usage for content extraction: ${JSON.stringify(response.tokenUsage)}`,
    );

    const markdown = response.text;
    if (!markdown || !markdown.trim()) {
      context.logger.warn('LLM returned empty content, falling back to raw DOM');
      return html;
    }

    return markdown;
  } catch (error) {
    context.logger.error('Error using LLM for content extraction:', error);
    // Reuse the HTML if it was fetched before the failure; otherwise the
    // failure was the fetch itself, so try once more via the raw path.
    return html ?? (await getRawDOM(page));
  }
}
55 | 89 |
|
56 | | -/** |
57 | | - * Processes the page by removing invisible elements and non-visual tags |
58 | | - */ |
59 | | -async function getSimpleProcessedDOM(page: Page): Promise<string> { |
60 | | - const domContent = await page.evaluate(() => { |
61 | | - const clone = document.documentElement; |
62 | | - |
63 | | - const elements = clone.querySelectorAll('*'); |
64 | | - |
65 | | - const elementsToRemove: Element[] = []; |
66 | | - elements.forEach((element) => { |
67 | | - const computedStyle = window.getComputedStyle(element); |
68 | | - const isVisible = |
69 | | - computedStyle.display !== 'none' && |
70 | | - computedStyle.visibility !== 'hidden' && |
71 | | - computedStyle.opacity !== '0'; |
72 | | - |
73 | | - if (!isVisible) { |
74 | | - elementsToRemove.push(element); |
75 | | - } |
76 | | - }); |
77 | | - |
78 | | - const nonVisualTags = clone.querySelectorAll( |
79 | | - 'noscript, iframe, link[rel="stylesheet"], meta, svg, img, symbol, path, style, script', |
80 | | - ); |
81 | | - nonVisualTags.forEach((element) => elementsToRemove.push(element)); |
82 | | - |
83 | | - elementsToRemove.forEach((element) => element.remove()); |
84 | | - |
85 | | - return clone.outerHTML; |
86 | | - }); |
87 | | - |
88 | | - return domContent.replace(/\n/g, '').replace(/\s+/g, ' '); |
89 | | -} |
90 | | - |
/**
 * Gets the rendered DOM of a page with the specified processing method.
 *
 * - `'smartMarkdown'` with a `context`: LLM-based markdown extraction.
 * - `'smartMarkdown'` without a `context`: warns on the console and returns raw HTML.
 * - `'raw'` (or any other value arriving at runtime): raw HTML.
 *
 * Output longer than `OUTPUT_LIMIT` is cut and suffixed with `...(truncated)`.
 *
 * @param page - Page to read from.
 * @param pageFilter - Processing method to apply.
 * @param context - Required for `'smartMarkdown'`; ignored for `'raw'`.
 * @returns The processed page content, possibly truncated.
 */
export async function filterPageContent(
  page: Page,
  pageFilter: 'raw' | 'smartMarkdown',
  context?: ToolContext,
): Promise<string> {
  let result: string;

  if (pageFilter === 'smartMarkdown' && context) {
    result = await getSmartMarkdownContent(page, context);
  } else {
    if (pageFilter === 'smartMarkdown') {
      // No context means no logger either, so the console is the only channel.
      console.warn('ToolContext required for smartMarkdown filter but not provided, falling back to raw mode');
    }
    result = await getRawDOM(page);
  }

  // Defensive: the declared type is string, but the value ultimately comes
  // from external sources (page / LLM provider), so guard against null/undefined.
  const text = result ?? '';
  if (text.length <= OUTPUT_LIMIT) {
    return text;
  }

  // Do not cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it so the output stays well-formed.
  let end = OUTPUT_LIMIT;
  const lastUnit = text.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }

  return text.slice(0, end) + '...(truncated)';
}
0 commit comments