diff --git a/examples/pdf-server/server.test.ts b/examples/pdf-server/server.test.ts
new file mode 100644
index 00000000..92ef4535
--- /dev/null
+++ b/examples/pdf-server/server.test.ts
@@ -0,0 +1,180 @@
+import { describe, it, expect, beforeEach, afterEach, spyOn } from "bun:test";
+import {
+  createPdfCache,
+  CACHE_INACTIVITY_TIMEOUT_MS,
+  CACHE_MAX_LIFETIME_MS,
+  CACHE_MAX_PDF_SIZE_BYTES,
+  type PdfCache,
+} from "./server";
+
+describe("PDF Cache with Timeouts", () => {
+  let pdfCache: PdfCache;
+
+  beforeEach(() => {
+    // Each test gets its own session-local cache
+    pdfCache = createPdfCache();
+  });
+
+  afterEach(() => {
+    pdfCache.clearCache();
+  });
+
+  describe("cache configuration", () => {
+    it("should have 10 second inactivity timeout", () => {
+      expect(CACHE_INACTIVITY_TIMEOUT_MS).toBe(10_000);
+    });
+
+    it("should have 60 second max lifetime timeout", () => {
+      expect(CACHE_MAX_LIFETIME_MS).toBe(60_000);
+    });
+
+    it("should have 50MB max PDF size limit", () => {
+      expect(CACHE_MAX_PDF_SIZE_BYTES).toBe(50 * 1024 * 1024);
+    });
+  });
+
+  describe("cache management", () => {
+    it("should start with empty cache", () => {
+      expect(pdfCache.getCacheSize()).toBe(0);
+    });
+
+    it("should clear all entries", () => {
+      pdfCache.clearCache();
+      expect(pdfCache.getCacheSize()).toBe(0);
+    });
+
+    it("should isolate caches between sessions", () => {
+      // Create two independent cache instances
+      const cache1 = createPdfCache();
+      const cache2 = createPdfCache();
+
+      // They should be independent (both start empty)
+      expect(cache1.getCacheSize()).toBe(0);
+      expect(cache2.getCacheSize()).toBe(0);
+    });
+  });
+
+  describe("readPdfRange caching behavior", () => {
+    const testUrl = "https://arxiv.org/pdf/test-pdf";
+    const testData = new Uint8Array([0x25, 0x50, 0x44, 0x46]); // %PDF header
+
+    it("should cache full body when server returns HTTP 200", async () => {
+      // Mock fetch to return HTTP 200 (full body, no range support)
+      const mockFetch = spyOn(globalThis, "fetch").mockResolvedValueOnce(
+        new Response(testData, {
+          status: 200,
+          headers: { "Content-Type": "application/pdf" },
+        }),
+      );
+
+      try {
+        // First request - should fetch and cache
+        const result1 = await pdfCache.readPdfRange(testUrl, 0, 1024);
+        expect(result1.data).toEqual(testData);
+        expect(result1.totalBytes).toBe(testData.length);
+        expect(pdfCache.getCacheSize()).toBe(1);
+
+        // Second request - should serve from cache (no new fetch)
+        const result2 = await pdfCache.readPdfRange(testUrl, 0, 1024);
+        expect(result2.data).toEqual(testData);
+        expect(mockFetch).toHaveBeenCalledTimes(1); // Only one fetch call
+      } finally {
+        mockFetch.mockRestore();
+      }
+    });
+
+    it("should not cache when server returns HTTP 206 (range supported)", async () => {
+      const chunkData = new Uint8Array([0x25, 0x50]); // First 2 bytes
+
+      const mockFetch = spyOn(globalThis, "fetch").mockResolvedValue(
+        new Response(chunkData, {
+          status: 206,
+          headers: {
+            "Content-Type": "application/pdf",
+            "Content-Range": "bytes 0-1/100",
+          },
+        }),
+      );
+
+      try {
+        await pdfCache.readPdfRange(testUrl, 0, 2);
+        expect(pdfCache.getCacheSize()).toBe(0); // Not cached when 206
+      } finally {
+        mockFetch.mockRestore();
+      }
+    });
+
+    it("should slice cached data for subsequent range requests", async () => {
+      const fullData = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+
+      const mockFetch = spyOn(globalThis, "fetch").mockResolvedValueOnce(
+        new Response(fullData, { status: 200 }),
+      );
+
+      try {
+        // First request caches full body
+        await pdfCache.readPdfRange(testUrl, 0, 1024);
+        expect(pdfCache.getCacheSize()).toBe(1);
+
+        // Subsequent request gets slice from cache
+        const result = await pdfCache.readPdfRange(testUrl, 2, 3);
+        expect(result.data).toEqual(new Uint8Array([3, 4, 5]));
+        expect(result.totalBytes).toBe(10);
+        expect(mockFetch).toHaveBeenCalledTimes(1);
+      } finally {
+        mockFetch.mockRestore();
+      }
+    });
+
+    it("should reject PDFs larger than max size limit", async () => {
+      const hugeUrl = "https://arxiv.org/pdf/huge-pdf";
+      // Create data larger than the limit
+      const hugeData = new Uint8Array(CACHE_MAX_PDF_SIZE_BYTES + 1);
+
+      const mockFetch = spyOn(globalThis, "fetch").mockResolvedValueOnce(
+        new Response(hugeData, {
+          status: 200,
+          headers: { "Content-Type": "application/pdf" },
+        }),
+      );
+
+      try {
+        await expect(pdfCache.readPdfRange(hugeUrl, 0, 1024)).rejects.toThrow(
+          /PDF too large to cache/,
+        );
+        expect(pdfCache.getCacheSize()).toBe(0); // Should not be cached
+      } finally {
+        mockFetch.mockRestore();
+      }
+    });
+
+    it("should reject when Content-Length header exceeds limit", async () => {
+      const headerUrl = "https://arxiv.org/pdf/huge-pdf-header";
+      const smallData = new Uint8Array([1, 2, 3, 4]);
+
+      const mockFetch = spyOn(globalThis, "fetch").mockResolvedValueOnce(
+        new Response(smallData, {
+          status: 200,
+          headers: {
+            "Content-Type": "application/pdf",
+            "Content-Length": String(CACHE_MAX_PDF_SIZE_BYTES + 1),
+          },
+        }),
+      );
+
+      try {
+        await expect(pdfCache.readPdfRange(headerUrl, 0, 1024)).rejects.toThrow(
+          /PDF too large to cache/,
+        );
+        expect(pdfCache.getCacheSize()).toBe(0);
+      } finally {
+        mockFetch.mockRestore();
+      }
+    });
+  });
+
+  // Note: Timer-based tests (inactivity/max lifetime) would require
+  // using fake timers, which can be complex with async code.
+  // The timeout behavior is straightforward and can be verified
+  // through manual testing or E2E tests.
+});
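Editor's note, not part of the patch: the eviction paths flagged above can be covered without fake timers by paying real wall-clock time. A minimal sketch follows, reusing the imports from server.test.ts and assuming bun:test accepts a jest-style numeric timeout as the third argument to it():

// Slow (~10s) but dependency-free check of the inactivity eviction path.
it("should evict a cached entry after the inactivity timeout", async () => {
  const cache = createPdfCache();
  const mockFetch = spyOn(globalThis, "fetch").mockResolvedValueOnce(
    new Response(new Uint8Array([0x25, 0x50, 0x44, 0x46]), { status: 200 }),
  );

  try {
    await cache.readPdfRange("https://arxiv.org/pdf/test-pdf", 0, 1024);
    expect(cache.getCacheSize()).toBe(1);

    // Wait out the inactivity window without touching the entry
    await new Promise((resolve) =>
      setTimeout(resolve, CACHE_INACTIVITY_TIMEOUT_MS + 500),
    );
    expect(cache.getCacheSize()).toBe(0);
  } finally {
    mockFetch.mockRestore();
    cache.clearCache();
  }
}, CACHE_INACTIVITY_TIMEOUT_MS + 5_000);

The max-lifetime path could be covered the same way at the cost of a ~60s test; both belong in an opt-in E2E suite rather than the default run.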
diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts
index 40de9aff..46f54036 100644
--- a/examples/pdf-server/server.ts
+++ b/examples/pdf-server/server.ts
@@ -33,6 +33,15 @@ export const DEFAULT_PDF = "https://arxiv.org/pdf/1706.03762"; // Attention Is A
 export const MAX_CHUNK_BYTES = 512 * 1024; // 512KB max per request
 export const RESOURCE_URI = "ui://pdf-viewer/mcp-app.html";
 
+/** Inactivity timeout: clear cache entry if not accessed for this long */
+export const CACHE_INACTIVITY_TIMEOUT_MS = 10_000; // 10 seconds
+
+/** Max lifetime: clear cache entry after this time regardless of access */
+export const CACHE_MAX_LIFETIME_MS = 60_000; // 60 seconds
+
+/** Max size for cached PDFs (defensive limit to prevent memory exhaustion) */
+export const CACHE_MAX_PDF_SIZE_BYTES = 50 * 1024 * 1024; // 50MB
+
 /** Allowed remote origins (security allowlist) */
 export const allowedRemoteOrigins = new Set([
   "https://agrirxiv.org",
@@ -126,67 +135,211 @@ export function validateUrl(url: string): { valid: boolean; error?: string } {
 }
 
 // =============================================================================
-// Range Request Helpers
+// Session-Local PDF Cache
 // =============================================================================
 
-export async function readPdfRange(
-  url: string,
-  offset: number,
-  byteCount: number,
-): Promise<{ data: Uint8Array; totalBytes: number }> {
-  const normalized = isArxivUrl(url) ? normalizeArxivUrl(url) : url;
-  const clampedByteCount = Math.min(byteCount, MAX_CHUNK_BYTES);
+/**
+ * Cache entry for remote PDFs from servers that don't support Range requests.
+ * Tracks both inactivity and max lifetime for automatic cleanup.
+ */
+interface CacheEntry {
+  /** The cached PDF data */
+  data: Uint8Array;
+  /** Timestamp when entry was created (for max lifetime) */
+  createdAt: number;
+  /** Timer that fires after CACHE_INACTIVITY_TIMEOUT_MS of no access */
+  inactivityTimer: ReturnType<typeof setTimeout>;
+  /** Timer that fires after CACHE_MAX_LIFETIME_MS from creation */
+  maxLifetimeTimer: ReturnType<typeof setTimeout>;
+}
 
-  if (isFileUrl(normalized)) {
-    const filePath = fileUrlToPath(normalized);
-    const stats = await fs.promises.stat(filePath);
-    const totalBytes = stats.size;
+/**
+ * Session-local PDF cache utilities.
+ * Each call to createPdfCache() creates an independent cache instance.
+ */
+export interface PdfCache {
+  /** Read a range of bytes from a PDF, using cache for servers without Range support */
+  readPdfRange(
+    url: string,
+    offset: number,
+    byteCount: number,
+  ): Promise<{ data: Uint8Array; totalBytes: number }>;
+  /** Get current number of cached entries */
+  getCacheSize(): number;
+  /** Clear all cached entries and their timers */
+  clearCache(): void;
+}
 
-    // Clamp to file bounds
+/**
+ * Creates a session-local PDF cache with automatic timeout-based cleanup.
+ *
+ * When a remote server returns HTTP 200 (full body) instead of 206 (partial),
+ * the full response is cached so subsequent chunk requests don't re-download.
+ *
+ * Entries are automatically cleared after:
+ * - CACHE_INACTIVITY_TIMEOUT_MS of no access (resets on each access)
+ * - CACHE_MAX_LIFETIME_MS from creation (absolute timeout)
+ */
+export function createPdfCache(): PdfCache {
+  const cache = new Map<string, CacheEntry>();
+
+  /** Delete a cache entry and clear its timers */
+  function deleteCacheEntry(url: string): void {
+    const entry = cache.get(url);
+    if (entry) {
+      clearTimeout(entry.inactivityTimer);
+      clearTimeout(entry.maxLifetimeTimer);
+      cache.delete(url);
+    }
+  }
+
+  /** Get cached data and refresh the inactivity timer */
+  function getCacheEntry(url: string): Uint8Array | undefined {
+    const entry = cache.get(url);
+    if (!entry) return undefined;
+
+    // Refresh inactivity timer on access
+    clearTimeout(entry.inactivityTimer);
+    entry.inactivityTimer = setTimeout(() => {
+      deleteCacheEntry(url);
+    }, CACHE_INACTIVITY_TIMEOUT_MS);
+
+    return entry.data;
+  }
+
+  /** Add data to cache with both inactivity and max lifetime timers */
+  function setCacheEntry(url: string, data: Uint8Array): void {
+    // Clear any existing entry first
+    deleteCacheEntry(url);
+
+    const entry: CacheEntry = {
+      data,
+      createdAt: Date.now(),
+      inactivityTimer: setTimeout(() => {
+        deleteCacheEntry(url);
+      }, CACHE_INACTIVITY_TIMEOUT_MS),
+      maxLifetimeTimer: setTimeout(() => {
+        deleteCacheEntry(url);
+      }, CACHE_MAX_LIFETIME_MS),
+    };
+
+    cache.set(url, entry);
+  }
+
+  /** Slice a cached or freshly-fetched full body to the requested range. */
+  function sliceToChunk(
+    fullData: Uint8Array,
+    offset: number,
+    clampedByteCount: number,
+  ): { data: Uint8Array; totalBytes: number } {
+    const totalBytes = fullData.length;
     const start = Math.min(offset, totalBytes);
     const end = Math.min(start + clampedByteCount, totalBytes);
+    return { data: fullData.slice(start, end), totalBytes };
+  }
 
-    if (start >= totalBytes) {
-      return { data: new Uint8Array(0), totalBytes };
+  async function readPdfRange(
+    url: string,
+    offset: number,
+    byteCount: number,
+  ): Promise<{ data: Uint8Array; totalBytes: number }> {
+    const normalized = isArxivUrl(url) ? normalizeArxivUrl(url) : url;
+    const clampedByteCount = Math.min(byteCount, MAX_CHUNK_BYTES);
+
+    if (isFileUrl(normalized)) {
+      const filePath = fileUrlToPath(normalized);
+      const stats = await fs.promises.stat(filePath);
+      const totalBytes = stats.size;
+
+      // Clamp to file bounds
+      const start = Math.min(offset, totalBytes);
+      const end = Math.min(start + clampedByteCount, totalBytes);
+
+      if (start >= totalBytes) {
+        return { data: new Uint8Array(0), totalBytes };
+      }
+
+      // Read range from local file
+      const buffer = Buffer.alloc(end - start);
+      const fd = await fs.promises.open(filePath, "r");
+      try {
+        await fd.read(buffer, 0, end - start, start);
+      } finally {
+        await fd.close();
+      }
+
+      return { data: new Uint8Array(buffer), totalBytes };
     }
 
-    // Read range from local file
-    const buffer = Buffer.alloc(end - start);
-    const fd = await fs.promises.open(filePath, "r");
-    try {
-      await fd.read(buffer, 0, end - start, start);
-    } finally {
-      await fd.close();
+    // Serve from cache if we previously downloaded the full body
+    const cached = getCacheEntry(normalized);
+    if (cached) {
+      return sliceToChunk(cached, offset, clampedByteCount);
     }
 
-    return { data: new Uint8Array(buffer), totalBytes };
-  }
+    // Remote URL - Range request
+    const response = await fetch(normalized, {
+      headers: {
+        Range: `bytes=${offset}-${offset + clampedByteCount - 1}`,
+      },
+    });
 
-  // Remote URL - Range request
-  const response = await fetch(normalized, {
-    headers: {
-      Range: `bytes=${offset}-${offset + clampedByteCount - 1}`,
-    },
-  });
+    if (!response.ok && response.status !== 206) {
+      throw new Error(
+        `Range request failed: ${response.status} ${response.statusText}`,
+      );
+    }
 
-  if (!response.ok && response.status !== 206) {
-    throw new Error(
-      `Range request failed: ${response.status} ${response.statusText}`,
-    );
-  }
+    // HTTP 200 means the server ignored our Range header and sent the full body.
+    // Cache it so subsequent chunk requests don't re-download, then slice.
+    if (response.status === 200) {
+      // Check Content-Length header first as a preliminary size check
+      const contentLength = response.headers.get("content-length");
+      if (contentLength) {
+        const declaredSize = parseInt(contentLength, 10);
+        if (declaredSize > CACHE_MAX_PDF_SIZE_BYTES) {
+          throw new Error(
+            `PDF too large to cache: ${declaredSize} bytes exceeds ${CACHE_MAX_PDF_SIZE_BYTES} byte limit`,
+          );
+        }
+      }
+
+      const fullData = new Uint8Array(await response.arrayBuffer());
+
+      // Check actual size (may differ from Content-Length)
+      if (fullData.length > CACHE_MAX_PDF_SIZE_BYTES) {
+        throw new Error(
+          `PDF too large to cache: ${fullData.length} bytes exceeds ${CACHE_MAX_PDF_SIZE_BYTES} byte limit`,
+        );
+      }
 
-  // Parse total size from Content-Range header
-  const contentRange = response.headers.get("content-range");
-  let totalBytes = 0;
-  if (contentRange) {
-    const match = contentRange.match(/bytes \d+-\d+\/(\d+)/);
-    if (match) {
-      totalBytes = parseInt(match[1], 10);
+      setCacheEntry(normalized, fullData);
+      return sliceToChunk(fullData, offset, clampedByteCount);
     }
+
+    // HTTP 206 Partial Content - parse total size from Content-Range header
+    const contentRange = response.headers.get("content-range");
+    let totalBytes = 0;
+    if (contentRange) {
+      const match = contentRange.match(/bytes \d+-\d+\/(\d+)/);
+      if (match) {
+        totalBytes = parseInt(match[1], 10);
+      }
+    }
+
+    const data = new Uint8Array(await response.arrayBuffer());
+    return { data, totalBytes };
   }
 
-  const data = new Uint8Array(await response.arrayBuffer());
-  return { data, totalBytes };
+  return {
+    readPdfRange,
+    getCacheSize: () => cache.size,
+    clearCache: () => {
+      for (const url of [...cache.keys()]) {
+        deleteCacheEntry(url);
+      }
+    },
+  };
 }
 
 // =============================================================================
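Editor's note, not part of the patch: a sketch of the cache API introduced by the hunk above, assuming top-level await and an allowlisted URL (DEFAULT_PDF from this file). Whether the entry lands in the cache depends entirely on the origin's answer to the Range header:

const { readPdfRange, getCacheSize, clearCache } = createPdfCache();

// First 16 bytes of the PDF; the server's status code decides the path taken.
const { data, totalBytes } = await readPdfRange(
  "https://arxiv.org/pdf/1706.03762",
  0,
  16,
);
console.log(totalBytes, data.length); // total size reported, bytes returned

// After a 206 nothing was cached and every chunk is a fresh ranged fetch;
// after a 200 the full body was cached, so this second call does no network I/O.
await readPdfRange("https://arxiv.org/pdf/1706.03762", 16, 16);
console.log(getCacheSize()); // 0 after a 206, 1 after a 200

clearCache();

Note the role of the two timers: each read resets the 10s inactivity timer, so a steadily polling client would otherwise pin the PDF in memory forever; the 60s max-lifetime timer bounds that, and the next read after eviction simply re-downloads and re-caches.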
@@ -196,6 +349,9 @@ export async function readPdfRange(
 export function createServer(): McpServer {
   const server = new McpServer({ name: "PDF Server", version: "2.0.0" });
 
+  // Create session-local cache (isolated per server instance)
+  const { readPdfRange } = createPdfCache();
+
   // Tool: list_pdfs - List available PDFs (local files + allowed origins)
   server.tool(
     "list_pdfs",
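Editor's note, not part of the patch: tool handlers are unchanged by this refactor, since the destructured readPdfRange has the same signature as the old module-level function. A hypothetical chunk-reading tool (name and schema invented for illustration; assumes zod is imported as z, as is usual for McpServer tools) would call it like this:

server.tool(
  "read_pdf_chunk", // hypothetical tool, not one registered by this server
  {
    url: z.string(),
    offset: z.number(),
    byteCount: z.number(),
  },
  async ({ url, offset, byteCount }) => {
    // Served from the session-local cache when the origin ignores Range headers
    const { data, totalBytes } = await readPdfRange(url, offset, byteCount);
    return {
      content: [
        {
          type: "text",
          text: JSON.stringify({
            totalBytes,
            chunkBase64: Buffer.from(data).toString("base64"),
          }),
        },
      ],
    };
  },
);

Because each createServer() call builds its own cache, two concurrent sessions never share cached PDF bytes, which is what the "should isolate caches between sessions" test above pins down.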