Skip to content

Commit ea09fce

Browse files
authored
fix(build): consolidate pdf parsing dependencies, remove extraneous html deps (#1212)
* fix(build): consolidate pdf parsing dependencies, remove extraneous html deps
* add types
1 parent 9ccb760 commit ea09fce

File tree

8 files changed

+41
-608
lines changed

8 files changed

+41
-608
lines changed

apps/sim/app/api/files/parse/route.ts

Lines changed: 4 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -553,22 +553,11 @@ function handleGenericBuffer(
553553
*/
554554
async function parseBufferAsPdf(buffer: Buffer) {
555555
try {
556-
try {
557-
const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
558-
const parser = new PdfParser()
559-
logger.info('Using main PDF parser for buffer')
560-
561-
if (parser.parseBuffer) {
562-
return await parser.parseBuffer(buffer)
563-
}
564-
throw new Error('PDF parser does not support buffer parsing')
565-
} catch (error) {
566-
logger.warn('Main PDF parser failed, using raw parser for buffer:', error)
567-
const { RawPdfParser } = await import('@/lib/file-parsers/raw-pdf-parser')
568-
const rawParser = new RawPdfParser()
556+
const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
557+
const parser = new PdfParser()
558+
logger.info('Using main PDF parser for buffer')
569559

570-
return await rawParser.parseBuffer(buffer)
571-
}
560+
return await parser.parseBuffer(buffer)
572561
} catch (error) {
573562
throw new Error(`PDF parsing failed: ${(error as Error).message}`)
574563
}

apps/sim/lib/file-parsers/index.test.ts

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -141,17 +141,6 @@ describe('File Parsers', () => {
141141
})),
142142
}))
143143

144-
vi.doMock('@/lib/file-parsers/raw-pdf-parser', () => ({
145-
RawPdfParser: vi.fn().mockImplementation(() => ({
146-
parseFile: vi.fn().mockResolvedValue({
147-
content: 'Raw parsed PDF content',
148-
metadata: {
149-
pageCount: 3,
150-
},
151-
}),
152-
})),
153-
}))
154-
155144
vi.doMock('@/lib/file-parsers/txt-parser', () => ({
156145
TxtParser: vi.fn().mockImplementation(() => ({
157146
parseFile: mockTxtParseFile,

apps/sim/lib/file-parsers/index.ts

Lines changed: 5 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,5 @@
11
import { existsSync } from 'fs'
2-
import { readFile } from 'fs/promises'
32
import path from 'path'
4-
import { RawPdfParser } from '@/lib/file-parsers/raw-pdf-parser'
53
import type { FileParseResult, FileParser, SupportedFileType } from '@/lib/file-parsers/types'
64
import { createLogger } from '@/lib/logs/console/logger'
75

@@ -18,42 +16,12 @@ function getParserInstances(): Record<string, FileParser> {
1816

1917
try {
2018
try {
21-
logger.info('Attempting to load PDF parser...')
22-
try {
23-
const { PdfParser } = require('@/lib/file-parsers/pdf-parser')
24-
parserInstances.pdf = new PdfParser()
25-
logger.info('PDF parser loaded successfully')
26-
} catch (pdfLibError) {
27-
logger.error('Failed to load primary PDF parser:', pdfLibError)
28-
logger.info('Falling back to raw PDF parser')
29-
parserInstances.pdf = new RawPdfParser()
30-
logger.info('Raw PDF parser loaded successfully')
31-
}
19+
logger.info('Loading PDF parser...')
20+
const { PdfParser } = require('@/lib/file-parsers/pdf-parser')
21+
parserInstances.pdf = new PdfParser()
22+
logger.info('PDF parser loaded successfully')
3223
} catch (error) {
33-
logger.error('Failed to load any PDF parser:', error)
34-
parserInstances.pdf = {
35-
async parseFile(filePath: string): Promise<FileParseResult> {
36-
const buffer = await readFile(filePath)
37-
return {
38-
content: `PDF parsing is not available. File size: ${buffer.length} bytes`,
39-
metadata: {
40-
info: { Error: 'PDF parsing unavailable' },
41-
pageCount: 0,
42-
version: 'unknown',
43-
},
44-
}
45-
},
46-
async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
47-
return {
48-
content: `PDF parsing is not available. File size: ${buffer.length} bytes`,
49-
metadata: {
50-
info: { Error: 'PDF parsing unavailable' },
51-
pageCount: 0,
52-
version: 'unknown',
53-
},
54-
}
55-
},
56-
}
24+
logger.error('Failed to load PDF parser:', error)
5725
}
5826

5927
try {

apps/sim/lib/file-parsers/pdf-parser.ts

Lines changed: 18 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,9 @@
11
import { readFile } from 'fs/promises'
2-
import { PDFDocument } from 'pdf-lib'
2+
import pdfParse from 'pdf-parse'
33
import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
44
import { createLogger } from '@/lib/logs/console/logger'
5-
import { RawPdfParser } from './raw-pdf-parser'
65

76
const logger = createLogger('PdfParser')
8-
const rawPdfParser = new RawPdfParser()
97

108
export class PdfParser implements FileParser {
119
async parseFile(filePath: string): Promise<FileParseResult> {
@@ -31,68 +29,23 @@ export class PdfParser implements FileParser {
3129
try {
3230
logger.info('Starting to parse buffer, size:', dataBuffer.length)
3331

34-
try {
35-
logger.info('Attempting to parse with pdf-lib library...')
36-
37-
logger.info('Starting PDF parsing...')
38-
const pdfDoc = await PDFDocument.load(dataBuffer)
39-
const pages = pdfDoc.getPages()
40-
const pageCount = pages.length
41-
42-
logger.info('PDF parsed successfully with pdf-lib, pages:', pageCount)
43-
44-
const metadata: Record<string, any> = {
45-
pageCount,
46-
}
47-
48-
try {
49-
const title = pdfDoc.getTitle()
50-
const author = pdfDoc.getAuthor()
51-
const subject = pdfDoc.getSubject()
52-
const creator = pdfDoc.getCreator()
53-
const producer = pdfDoc.getProducer()
54-
const creationDate = pdfDoc.getCreationDate()
55-
const modificationDate = pdfDoc.getModificationDate()
56-
57-
if (title) metadata.title = title
58-
if (author) metadata.author = author
59-
if (subject) metadata.subject = subject
60-
if (creator) metadata.creator = creator
61-
if (producer) metadata.producer = producer
62-
if (creationDate) metadata.creationDate = creationDate.toISOString()
63-
if (modificationDate) metadata.modificationDate = modificationDate.toISOString()
64-
} catch (metadataError) {
65-
logger.warn('Could not extract PDF metadata:', metadataError)
66-
}
67-
68-
logger.info(
69-
'pdf-lib loaded successfully, but text extraction requires fallback to raw parser'
70-
)
71-
const rawResult = await rawPdfParser.parseBuffer(dataBuffer)
72-
73-
return {
74-
content: rawResult.content,
75-
metadata: {
76-
...rawResult.metadata,
77-
...metadata,
78-
source: 'pdf-lib + raw-parser',
79-
},
80-
}
81-
} catch (pdfLibError: unknown) {
82-
logger.error('PDF-lib library failed:', pdfLibError)
83-
84-
logger.info('Falling back to raw PDF parser...')
85-
const rawResult = await rawPdfParser.parseBuffer(dataBuffer)
86-
87-
return {
88-
...rawResult,
89-
metadata: {
90-
...rawResult.metadata,
91-
fallback: true,
92-
source: 'raw-parser-only',
93-
error: (pdfLibError as Error).message || 'Unknown error',
94-
},
95-
}
32+
const pdfData = await pdfParse(dataBuffer)
33+
34+
logger.info(
35+
'PDF parsed successfully, pages:',
36+
pdfData.numpages,
37+
'text length:',
38+
pdfData.text.length
39+
)
40+
41+
return {
42+
content: pdfData.text,
43+
metadata: {
44+
pageCount: pdfData.numpages,
45+
info: pdfData.info,
46+
version: pdfData.version,
47+
source: 'pdf-parse',
48+
},
9649
}
9750
} catch (error) {
9851
logger.error('Error parsing buffer:', error)

0 commit comments

Comments
 (0)