11import { readFile } from 'fs/promises'
2- import { PDFDocument } from 'pdf-lib '
2+ import pdfParse from 'pdf-parse '
33import type { FileParseResult , FileParser } from '@/lib/file-parsers/types'
44import { createLogger } from '@/lib/logs/console/logger'
5- import { RawPdfParser } from './raw-pdf-parser'
65
76const logger = createLogger ( 'PdfParser' )
8- const rawPdfParser = new RawPdfParser ( )
97
108export class PdfParser implements FileParser {
119 async parseFile ( filePath : string ) : Promise < FileParseResult > {
@@ -31,68 +29,23 @@ export class PdfParser implements FileParser {
3129 try {
3230 logger . info ( 'Starting to parse buffer, size:' , dataBuffer . length )
3331
34- try {
35- logger . info ( 'Attempting to parse with pdf-lib library...' )
36-
37- logger . info ( 'Starting PDF parsing...' )
38- const pdfDoc = await PDFDocument . load ( dataBuffer )
39- const pages = pdfDoc . getPages ( )
40- const pageCount = pages . length
41-
42- logger . info ( 'PDF parsed successfully with pdf-lib, pages:' , pageCount )
43-
44- const metadata : Record < string , any > = {
45- pageCount,
46- }
47-
48- try {
49- const title = pdfDoc . getTitle ( )
50- const author = pdfDoc . getAuthor ( )
51- const subject = pdfDoc . getSubject ( )
52- const creator = pdfDoc . getCreator ( )
53- const producer = pdfDoc . getProducer ( )
54- const creationDate = pdfDoc . getCreationDate ( )
55- const modificationDate = pdfDoc . getModificationDate ( )
56-
57- if ( title ) metadata . title = title
58- if ( author ) metadata . author = author
59- if ( subject ) metadata . subject = subject
60- if ( creator ) metadata . creator = creator
61- if ( producer ) metadata . producer = producer
62- if ( creationDate ) metadata . creationDate = creationDate . toISOString ( )
63- if ( modificationDate ) metadata . modificationDate = modificationDate . toISOString ( )
64- } catch ( metadataError ) {
65- logger . warn ( 'Could not extract PDF metadata:' , metadataError )
66- }
67-
68- logger . info (
69- 'pdf-lib loaded successfully, but text extraction requires fallback to raw parser'
70- )
71- const rawResult = await rawPdfParser . parseBuffer ( dataBuffer )
72-
73- return {
74- content : rawResult . content ,
75- metadata : {
76- ...rawResult . metadata ,
77- ...metadata ,
78- source : 'pdf-lib + raw-parser' ,
79- } ,
80- }
81- } catch ( pdfLibError : unknown ) {
82- logger . error ( 'PDF-lib library failed:' , pdfLibError )
83-
84- logger . info ( 'Falling back to raw PDF parser...' )
85- const rawResult = await rawPdfParser . parseBuffer ( dataBuffer )
86-
87- return {
88- ...rawResult ,
89- metadata : {
90- ...rawResult . metadata ,
91- fallback : true ,
92- source : 'raw-parser-only' ,
93- error : ( pdfLibError as Error ) . message || 'Unknown error' ,
94- } ,
95- }
32+ const pdfData = await pdfParse ( dataBuffer )
33+
34+ logger . info (
35+ 'PDF parsed successfully, pages:' ,
36+ pdfData . numpages ,
37+ 'text length:' ,
38+ pdfData . text . length
39+ )
40+
41+ return {
42+ content : pdfData . text ,
43+ metadata : {
44+ pageCount : pdfData . numpages ,
45+ info : pdfData . info ,
46+ version : pdfData . version ,
47+ source : 'pdf-parse' ,
48+ } ,
9649 }
9750 } catch ( error ) {
9851 logger . error ( 'Error parsing buffer:' , error )
0 commit comments