diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index cd87989a..8955a3ff 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -22,7 +22,7 @@ def parse_single_page(mindee_client, input_source) end def parse_multi_page(mindee_client, input_source) - pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) + pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(input_source) invoice_splitter_response = mindee_client.enqueue_and_parse( input_source, Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, diff --git a/lib/mindee.rb b/lib/mindee.rb index ca57a06f..3dfe29ed 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -4,6 +4,14 @@ require 'mindee/extraction' module Mindee + # Mindee internal error module. + module Errors + end + + # Custom extraction module + module Extraction + end + # Mindee internal http module. module HTTP end @@ -29,10 +37,6 @@ module ImageCompressor end end - # Custom extraction module - module Extraction - end - # Parsing internals and fields. module Parsing # Common fields and functions. diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb index 8ed4a69d..d83737dc 100644 --- a/lib/mindee/client.rb +++ b/lib/mindee/client.rb @@ -166,6 +166,7 @@ def enqueue_and_parse( delay_sec: 1.5, max_retries: 80 ) + validate_async_params(initial_delay_sec, delay_sec, max_retries) enqueue_res = enqueue( input_source, product_class, @@ -187,7 +188,8 @@ def enqueue_and_parse( end if queue_res.job.status != Mindee::Parsing::Common::JobStatus::COMPLETED elapsed = initial_delay_sec + (polling_attempts * delay_sec) - raise "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)" + raise Errors::MindeeAPIError, + "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)" end queue_res @@ -242,8 +244,8 @@ def execute_workflow( # @return [Mindee::Parsing::Common::ApiResponse] def load_prediction(product_class, local_response) Mindee::Parsing::Common::ApiResponse.new(product_class, local_response.as_hash, local_response.as_hash.to_json) - rescue KeyError - raise 'No prediction found in local response.' + rescue KeyError, Errors::MindeeAPIError + raise Errors::MindeeInputError, 'No prediction found in local response.' end # Load a document from an absolute path, as a string. @@ -314,11 +316,18 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) min_delay_sec = 1 min_initial_delay_sec = 1 min_retries = 2 - raise "Cannot set auto-poll delay to less than #{min_delay_sec} second(s)" if delay_sec < min_delay_sec + if delay_sec < min_delay_sec + raise ArgumentError, + "Cannot set auto-poll delay to less than #{min_delay_sec} second(s)" + end if initial_delay_sec < min_initial_delay_sec - raise "Cannot set initial parsing delay to less than #{min_initial_delay_sec} second(s)" + raise ArgumentError, + "Cannot set initial parsing delay to less than #{min_initial_delay_sec} second(s)" end - raise "Cannot set auto-poll retries to less than #{min_retries}" if max_retries < min_retries + return unless max_retries < min_retries + + raise ArgumentError, + "Cannot set auto-poll retries to less than #{min_retries}" end # Creates an endpoint with the given values. Raises an error if the endpoint is invalid. @@ -333,8 +342,9 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) # @param version [String] For custom endpoints, version of the product. # @return [Mindee::HTTP::Endpoint] def initialize_endpoint(product_class, endpoint_name: '', account_name: '', version: '') - if (endpoint_name.nil? || endpoint_name.empty?) && product_class == Mindee::Product::Custom::CustomV1 - raise 'Missing argument endpoint_name when using custom class' + if (endpoint_name.nil? || endpoint_name.empty?) && + [Mindee::Product::Custom::CustomV1, Mindee::Product::Generated::GeneratedV1].include?(product_class) + raise Errors::MindeeConfigurationError, 'Missing argument endpoint_name when using custom class' end endpoint_name = fix_endpoint_name(product_class, endpoint_name) diff --git a/lib/mindee/errors.rb b/lib/mindee/errors.rb new file mode 100644 index 00000000..59117d6c --- /dev/null +++ b/lib/mindee/errors.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +require_relative 'errors/mindee_error' +require_relative 'errors/mindee_http_error' +require_relative 'errors/mindee_input_error' diff --git a/lib/mindee/errors/mindee_error.rb b/lib/mindee/errors/mindee_error.rb new file mode 100644 index 00000000..0f0cee39 --- /dev/null +++ b/lib/mindee/errors/mindee_error.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +module Mindee + module Errors + # Base class for all custom mindee errors. + class MindeeError < StandardError; end + + # Errors relating to library issues. + class MindeeAPIError < MindeeError; end + + # Errors relating to misuse of the library. + class MindeeConfigurationError < MindeeError; end + + # Errors relating to geometric manipulation issues. + class MindeeGeometryError < MindeeError; end + end +end diff --git a/lib/mindee/errors/mindee_http_error.rb b/lib/mindee/errors/mindee_http_error.rb new file mode 100644 index 00000000..1694ef2c --- /dev/null +++ b/lib/mindee/errors/mindee_http_error.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +require_relative 'mindee_error' + +module Mindee + module Errors + # API HttpError + class MindeeHTTPError < MindeeError + # @return [String] + attr_reader :status_code + # @return [Integer] + attr_reader :api_code + # @return [String] + attr_reader :api_details + # @return [String] + attr_reader :api_message + + # @param http_error [Hash] + # @param url [String] + # @param code [Integer] + def initialize(http_error, url, code) + @status_code = code + @api_code = http_error['code'] + @api_details = http_error['details'] + @api_message = http_error['message'] + super("#{url} #{@status_code} HTTP error: #{@api_details} - #{@api_message}") + end + end + + # Base class for all client-side errors. + class MindeeHTTPClientError < MindeeHTTPError; end + + # Base class for all server-side errors. + class MindeeHTTPServerError < MindeeHTTPError; end + end +end diff --git a/lib/mindee/errors/mindee_input_error.rb b/lib/mindee/errors/mindee_input_error.rb new file mode 100644 index 00000000..267fea25 --- /dev/null +++ b/lib/mindee/errors/mindee_input_error.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +module Mindee + module Errors + # Base class for errors relating to input documents. + class MindeeInputError < MindeeError; end + + # Errors relating to sources (documents) handling. + class MindeeSourceError < MindeeInputError; end + + # Errors relating to mime type issues. + class MindeeMimeTypeError < MindeeSourceError + # @return [String] + attr_reader :invalid_mimetype + + # @param mime_type [String] + def initialize(mime_type) + @invalid_mimetype = mime_type + super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \ + "#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}") + end + end + + # Errors relating to the handling of images. + class MindeeImageError < MindeeInputError; end + + # Errors relating to the handling of PDF documents. + class MindeePDFError < MindeeInputError; end + end +end diff --git a/lib/mindee/extraction.rb b/lib/mindee/extraction.rb index fdc0cbaf..fddb170d 100644 --- a/lib/mindee/extraction.rb +++ b/lib/mindee/extraction.rb @@ -2,5 +2,3 @@ require_relative 'extraction/tax_extractor' require_relative 'extraction/multi_receipts_extractor' -require_relative 'extraction/common' -require_relative 'extraction/pdf_extractor' diff --git a/lib/mindee/extraction/common.rb b/lib/mindee/extraction/common.rb deleted file mode 100644 index 509d4d4f..00000000 --- a/lib/mindee/extraction/common.rb +++ /dev/null @@ -1,4 +0,0 @@ -# frozen_string_literal: true - -require_relative 'common/extracted_image' -require_relative 'common/image_extractor' diff --git a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb index 6b5bc729..eff201b2 100644 --- a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +++ b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -require_relative '../common/image_extractor' +require_relative '../../image/image_extractor' module Mindee # Image Extraction Module. @@ -15,13 +15,16 @@ def self.extract_receipts(input_source, inference) # @return [Array] Individual extracted receipts as an array of ExtractedMultiReceiptsImage. images = [] - raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts + unless inference.prediction.receipts + raise Errors::MindeeInputError, + 'No possible receipts candidates found for Multi-Receipts extraction.' + end (0...input_source.count_pdf_pages).each do |page_id| receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box) images.concat( - Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1, - receipt_positions) + Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1, + receipt_positions) ) end diff --git a/lib/mindee/extraction/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor.rb deleted file mode 100644 index 3d44dd98..00000000 --- a/lib/mindee/extraction/pdf_extractor.rb +++ /dev/null @@ -1,4 +0,0 @@ -# frozen_string_literal: true - -require_relative 'pdf_extractor/pdf_extractor' -require_relative 'pdf_extractor/extracted_pdf' diff --git a/lib/mindee/http.rb b/lib/mindee/http.rb index d4b3375e..649b58cb 100644 --- a/lib/mindee/http.rb +++ b/lib/mindee/http.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true require_relative 'http/endpoint' -require_relative 'http/error' +require_relative 'http/error_handler' require_relative 'http/workflow_endpoint' diff --git a/lib/mindee/http/endpoint.rb b/lib/mindee/http/endpoint.rb index 7b2cfe75..e1e31c04 100644 --- a/lib/mindee/http/endpoint.rb +++ b/lib/mindee/http/endpoint.rb @@ -2,7 +2,7 @@ require 'json' require 'net/http' -require_relative 'error' +require_relative 'error_handler' require_relative '../version' require_relative 'response_validation' @@ -65,7 +65,7 @@ def predict(input_source, all_words, full_text, close_file, cropper) return [hashed_response, response.body] if ResponseValidation.valid_sync_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -83,7 +83,7 @@ def predict_async(input_source, all_words, full_text, close_file, cropper) return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -97,7 +97,7 @@ def parse_async(job_id) return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -201,9 +201,9 @@ def document_queue_req(job_id) def check_api_key return unless @api_key.nil? || @api_key.empty? - raise "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ - "check your Client Configuration.\n" \ - 'You can set this using the ' \ + raise Errors::MindeeAPIError, + "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ + "check your Client Configuration.\nYou can set this using the " \ "'#{HTTP::API_KEY_ENV_NAME}' environment variable." end end diff --git a/lib/mindee/http/error.rb b/lib/mindee/http/error_handler.rb similarity index 74% rename from lib/mindee/http/error.rb rename to lib/mindee/http/error_handler.rb index 84a7368c..b4ade5ac 100644 --- a/lib/mindee/http/error.rb +++ b/lib/mindee/http/error_handler.rb @@ -1,11 +1,12 @@ # frozen_string_literal: true require 'json' +require_relative '../errors/mindee_http_error' module Mindee module HTTP # Mindee HTTP error module. - module Error + module ErrorHandler module_function # Extracts the HTTP error from the response hash, or the job error if there is one. @@ -80,44 +81,13 @@ def handle_error(url, response) error_obj = create_error_obj(parsed_hash) case code when 400..499 - MindeeHttpClientError.new(error_obj, url, code) + Errors::MindeeHTTPClientError.new(error_obj, url, code) when 500..599 - MindeeHttpServerError.new(error_obj, url, code) + Errors::MindeeHTTPServerError.new(error_obj, url, code) else - MindeeHttpError.new(error_obj, url, code) + Errors::MindeeHTTPError.new(error_obj, url, code) end end - - # API HttpError - class MindeeHttpError < StandardError - # @return [String] - attr_reader :status_code - # @return [Integer] - attr_reader :api_code - # @return [String] - attr_reader :api_details - # @return [String] - attr_reader :api_message - - # @param http_error [Hash] - # @param url [String] - # @param code [Integer] - def initialize(http_error, url, code) - @status_code = code - @api_code = http_error['code'] - @api_details = http_error['details'] - @api_message = http_error['message'] - super("#{url} #{@status_code} HTTP error: #{@api_details} - #{@api_message}") - end - end - - # API client HttpError - class MindeeHttpClientError < MindeeHttpError - end - - # API server HttpError - class MindeeHttpServerError < MindeeHttpError - end end end end diff --git a/lib/mindee/http/workflow_endpoint.rb b/lib/mindee/http/workflow_endpoint.rb index 79e08897..594b5acf 100644 --- a/lib/mindee/http/workflow_endpoint.rb +++ b/lib/mindee/http/workflow_endpoint.rb @@ -2,7 +2,7 @@ require 'json' require 'net/http' -require_relative 'error' +require_relative 'error_handler' module Mindee module HTTP @@ -37,7 +37,7 @@ def execute_workflow(input_source, full_text, document_alias, priority, public_u return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -81,9 +81,9 @@ def workflow_execution_req_post(input_source, document_alias, priority, full_tex def check_api_key return unless @api_key.nil? || @api_key.empty? - raise "Missing API key. Check your Client Configuration.\n" \ - 'You can set this using the ' \ - "'#{HTTP::API_KEY_ENV_NAME}' environment variable." + raise Errors::MindeeConfigurationError, "Missing API key. Check your Client Configuration.\n" \ + "You can set this using the '#{HTTP::API_KEY_ENV_NAME}'" \ + 'environment variable.' end end end diff --git a/lib/mindee/image.rb b/lib/mindee/image.rb index 51406f83..5664bc8f 100644 --- a/lib/mindee/image.rb +++ b/lib/mindee/image.rb @@ -1,4 +1,6 @@ # frozen_string_literal: true +require_relative 'image/extracted_image' require_relative 'image/image_compressor' +require_relative 'image/image_extractor' require_relative 'image/image_utils' diff --git a/lib/mindee/extraction/common/extracted_image.rb b/lib/mindee/image/extracted_image.rb similarity index 64% rename from lib/mindee/extraction/common/extracted_image.rb rename to lib/mindee/image/extracted_image.rb index 44a4fb07..9292ba2b 100644 --- a/lib/mindee/extraction/common/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -1,16 +1,16 @@ # frozen_string_literal: true -require_relative '../../input/sources' +require_relative '../input/sources' module Mindee # Image Extraction Module. - module Extraction + module Image # Generic class for image extraction. class ExtractedImage - # Id of the page the image was extracted from. + # ID of the page the image was extracted from. attr_reader :page_id - # Id of the element on a given page. + # ID of the element on a given page. attr_reader :element_id # Buffer object of the file's content. @@ -28,11 +28,12 @@ def initialize(input_source, page_id, element_id) @buffer = StringIO.new(input_source.io_stream.read) @buffer.rewind extension = if input_source.pdf? - 'jpg' + '.jpg' else File.extname(input_source.filename) end - @internal_file_name = "#{input_source.filename}_p#{page_id}_#{element_id}.#{extension}" + base_name = File.basename(input_source.filename, File.extname(input_source.filename)) + @internal_file_name = "#{base_name}_p#{page_id}_#{element_id}#{extension}" @page_id = page_id @element_id = element_id.nil? ? 0 : element_id end @@ -43,21 +44,22 @@ def initialize(input_source, page_id, element_id) # @param file_format [String, nil] Optional MiniMagick-compatible format for the file. Inferred from file # extension if not provided. # @raise [MindeeError] If an invalid path or filename is provided. - def save_to_file(output_path, file_format = nil) - resolved_path = Pathname.new(output_path).realpath + def write_to_file(output_path, file_format = nil) + resolved_path = Pathname.new(File.expand_path(output_path)) if file_format.nil? - raise ArgumentError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? + raise Errors::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? file_format = resolved_path.extname.delete('.').upcase end - @buffer.rewind - image = MiniMagick::Image.read(@buffer) - image.format file_format.downcase - image.write resolved_path.to_s - rescue TypeError - raise 'Invalid path/filename provided.' - rescue StandardError - raise "Could not save file #{Pathname.new(output_path).basename}." + begin + @buffer.rewind + image = MiniMagick::Image.read(@buffer) + image.format file_format.downcase + image.write resolved_path.to_s + rescue StandardError + raise Errors::MindeeImageError, "Could not save file '#{output_path}'. " \ + 'Is the provided file path valid?.' + end end # Return the file as a Mindee-compatible BufferInput source. diff --git a/lib/mindee/extraction/common/image_extractor.rb b/lib/mindee/image/image_extractor.rb similarity index 97% rename from lib/mindee/extraction/common/image_extractor.rb rename to lib/mindee/image/image_extractor.rb index acd2bd7a..a360f906 100644 --- a/lib/mindee/extraction/common/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -4,12 +4,12 @@ require 'origami' require 'stringio' require 'tempfile' -require_relative '../../input/sources' +require_relative '../input/sources' require_relative 'extracted_image' module Mindee # Image Extraction Module. - module Extraction + module Image # Image Extraction wrapper class. module ImageExtractor def self.attach_image_as_new_file(input_buffer, format: 'jpg') @@ -35,7 +35,7 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg') # @param [Integer] page_id ID of the Page to extract from. # @param [Array>, Array] polygons List of coordinates # to extract. - # @return [Array] Extracted Images. + # @return [Array] Extracted Images. def self.extract_multiple_images_from_source(input_source, page_id, polygons) new_stream = load_input_source_pdf_page_as_image(input_source, page_id) new_stream.seek(0) @@ -49,7 +49,7 @@ def self.extract_multiple_images_from_source(input_source, page_id, polygons) # @param [StringIO] pdf_stream Buffer of the PDF. # @param [Integer] page_id Page ID. # @param [Array] polygons - # @return [Array] Extracted Images. + # @return [Array] Extracted Images. def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons) extracted_elements = [] diff --git a/lib/mindee/image/image_utils.rb b/lib/mindee/image/image_utils.rb index d839cdf5..a7b970a9 100644 --- a/lib/mindee/image/image_utils.rb +++ b/lib/mindee/image/image_utils.rb @@ -36,7 +36,8 @@ def self.to_image(image) elsif image.is_a?(MiniMagick::Image) image else - raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead." + img_class = image.class ? image.class.to_s : 'unknown format' + raise Errors::MindeeImageError, "Expected an I/O object or a MiniMagick::Image. '#{img_class}' given instead." end end @@ -59,7 +60,7 @@ def self.image_to_stringio(image, format = 'JPEG') # @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same. # @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same. def self.calculate_new_dimensions(original, max_width: nil, max_height: nil) - raise 'Provided image could not be processed for resizing.' if original.nil? + raise Errors::MindeeImageError, 'Provided image could not be processed for resizing.' if original.nil? return [original.width, original.height] if max_width.nil? && max_height.nil? diff --git a/lib/mindee/input/local_response.rb b/lib/mindee/input/local_response.rb index 88736f97..fafe3fdf 100644 --- a/lib/mindee/input/local_response.rb +++ b/lib/mindee/input/local_response.rb @@ -27,7 +27,7 @@ def initialize(input_file) end @file.rewind else - raise "Incompatible type for input '#{input_file.class}'." + raise Errors::MindeeInputError, "Incompatible type for input '#{input_file.class}'." end end @@ -38,7 +38,7 @@ def as_hash file_str = @file.read JSON.parse(file_str, object_class: Hash) rescue JSON::ParserError - raise "File is not a valid dict. #{file_str}" + raise Errors::MindeeInputError, "File is not a valid dict. #{file_str}" end # Processes the secret key @@ -56,7 +56,7 @@ def get_hmac_signature(secret_key) @file.rewind mac = OpenSSL::HMAC.hexdigest(algorithm, self.class.process_secret_key(secret_key), @file.read) rescue StandardError - raise 'Could not get HMAC signature from payload.' + raise Errors::MindeeInputError, 'Could not get HMAC signature from payload.' end mac end diff --git a/lib/mindee/input/sources/local_input_source.rb b/lib/mindee/input/sources/local_input_source.rb index 1d2dadea..48939cfc 100644 --- a/lib/mindee/input/sources/local_input_source.rb +++ b/lib/mindee/input/sources/local_input_source.rb @@ -20,29 +20,6 @@ module Source 'image/webp', ].freeze - # Standard error for invalid mime types - class MimeTypeError < StandardError - end - - # Error sent if the file's mimetype isn't allowed - class InvalidMimeTypeError < MimeTypeError - # @return [String] - attr_reader :invalid_mimetype - - # @param mime_type [String] - def initialize(mime_type) - @invalid_mimetype = mime_type - super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}") - end - end - - # Error sent if a pdf file couldn't be fixed - class UnfixablePDFError < MimeTypeError - def initialize - super("Corrupted PDF couldn't be repaired.") - end - end - # Base class for loading documents. class LocalInputSource # @return [String] @@ -72,7 +49,7 @@ def initialize(io_stream, filename, fix_pdf: false) return if ALLOWED_MIME_TYPES.include? @file_mimetype end - raise InvalidMimeTypeError, @file_mimetype.to_s + raise Errors::MindeeMimeTypeError, @file_mimetype.to_s end # Attempts to fix pdf files if mimetype is rejected. @@ -81,7 +58,7 @@ def initialize(io_stream, filename, fix_pdf: false) # @param stream [StringIO] def rescue_broken_pdf(stream) stream.gets('%PDF-') - raise UnfixablePDFError if stream.eof? || stream.pos > 500 + raise Errors::MindeePDFError if stream.eof? || stream.pos > 500 stream.pos = stream.pos - 5 data = stream.read diff --git a/lib/mindee/input/sources/url_input_source.rb b/lib/mindee/input/sources/url_input_source.rb index 5b4a9b13..989e8461 100644 --- a/lib/mindee/input/sources/url_input_source.rb +++ b/lib/mindee/input/sources/url_input_source.rb @@ -13,7 +13,7 @@ class UrlInputSource attr_reader :url def initialize(url) - raise 'URL must be HTTPS' unless url.start_with? 'https://' + raise Errors::MindeeInputError, 'URL must be HTTPS' unless url.start_with? 'https://' @url = url end @@ -27,7 +27,7 @@ def initialize(url) # @param token [String, nil] Optional token for JWT-based authentication. # @param max_redirects [Integer] Maximum amount of redirects to follow. # @return [String] The full path of the saved file. - def save_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3) + def write_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3) response_body = fetch_file_content(username: username, password: password, token: token, max_redirects: max_redirects) @@ -72,9 +72,9 @@ def fetch_file_content(username: nil, password: nil, token: nil, max_redirects: response = make_request(uri, request, max_redirects) if response.code.to_i > 299 - raise "Failed to download file: HTTP status code #{response.code}" + raise Errors::MindeeAPIError, "Failed to download file: HTTP status code #{response.code}" elsif response.code.to_i < 200 - raise "Failed to download file: Invalid response code #{response.code}." + raise Errors::MindeeAPIError, "Failed to download file: Invalid response code #{response.code}." end response.body @@ -100,7 +100,7 @@ def make_request(uri, request, max_redirects) response = http.request(request) if response.is_a?(Net::HTTPRedirection) && max_redirects.positive? location = response['location'] - raise 'No location in redirection header.' if location.nil? + raise Errors::MindeeInputError, 'No location in redirection header.' if location.nil? new_uri = URI.parse(location) request = Net::HTTP::Get.new(new_uri) diff --git a/lib/mindee/parsing/common/api_response.rb b/lib/mindee/parsing/common/api_response.rb index f8bfc3e5..74fe3a24 100644 --- a/lib/mindee/parsing/common/api_response.rb +++ b/lib/mindee/parsing/common/api_response.rb @@ -108,7 +108,7 @@ class ApiResponse # @param raw_http [String] def initialize(product_class, http_response, raw_http) @raw_http = raw_http.to_s - raise 'Invalid response format.' unless http_response.key?('api_request') + raise Errors::MindeeAPIError, 'Invalid response format.' unless http_response.key?('api_request') @api_request = Mindee::Parsing::Common::ApiRequest.new(http_response['api_request']) diff --git a/lib/mindee/pdf.rb b/lib/mindee/pdf.rb index ab262fd7..48b05d99 100644 --- a/lib/mindee/pdf.rb +++ b/lib/mindee/pdf.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative 'pdf/extracted_pdf' require_relative 'pdf/pdf_compressor' +require_relative 'pdf/pdf_extractor' require_relative 'pdf/pdf_processor' require_relative 'pdf/pdf_tools' diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb similarity index 65% rename from lib/mindee/extraction/pdf_extractor/extracted_pdf.rb rename to lib/mindee/pdf/extracted_pdf.rb index e8ade1e8..a8160bf1 100644 --- a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -1,11 +1,11 @@ # frozen_string_literal: true module Mindee - # Pdf Extraction Module. - module Extraction - module PdfExtractor + # PDF Extraction Module. + module PDF + module PDFExtractor # An extracted sub-Pdf. - class ExtractedPdf + class ExtractedPDF # Byte contents of the pdf # @return [StreamIO] attr_reader :pdf_bytes @@ -26,17 +26,20 @@ def initialize(pdf_bytes, filename) def page_count current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes) current_pdf.pages.size - rescue TypeError - raise 'Could not retrieve page count from Extracted PDF object.' + rescue TypeError, Origami::InvalidPDFError + raise Errors::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.' end # Writes the contents of the current PDF object to a file. # @param output_path [String] Path to write to. - def write_to_file(output_path) - raise 'Provided path is not a file' if File.directory?(destination) - raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path)) - - if File.extname(output_path).downcase == '.pdf' + # @param override [Boolean] Whether to override the destination file. + def write_to_file(output_path, override: false) + raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) + raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?( + File.expand_path('..', output_path) + ) && !override + + if File.extname(output_path).downcase == 'pdf' base_path = File.expand_path('..', output_path) output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path) end diff --git a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb b/lib/mindee/pdf/pdf_extractor.rb similarity index 82% rename from lib/mindee/extraction/pdf_extractor/pdf_extractor.rb rename to lib/mindee/pdf/pdf_extractor.rb index 14b429bb..39826c49 100644 --- a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +++ b/lib/mindee/pdf/pdf_extractor.rb @@ -2,18 +2,18 @@ module Mindee # Pdf Extraction Module. - module Extraction + module PDF # Pdf Extraction class. - module PdfExtractor + module PDFExtractor # Pdf extraction class. - class PdfExtractor + class PDFExtractor # @param local_input [Mindee::Input::Source::LocalInputSource] def initialize(local_input) @filename = local_input.filename if local_input.pdf? @source_pdf = local_input.io_stream else - pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream) + pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream) io_buffer = StringIO.new pdf_image.save(io_buffer) @@ -40,24 +40,27 @@ def cut_pages(page_indexes) # Extract the sub-documents from the main pdf, based on the given list of page indexes. # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. - # @return [Array] The buffer containing the new Pdf. + # @return [Array] The buffer containing the new Pdf. def extract_sub_documents(page_indexes) extracted_pdfs = [] extension = File.extname(@filename) basename = File.basename(@filename, extension) page_indexes.each do |page_index_list| if page_index_list.empty? || page_index_list.nil? - raise "Empty indexes aren't allowed for extraction #{page_index_list}" + raise Errors::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}" end page_index_list.each do |page_index| - raise "Index #{page_index} is out of range." if (page_index > page_count) || page_index.negative? + if (page_index > page_count) || page_index.negative? + raise Errors::MindeePDFError, + "Index #{page_index} is out of range." + end end formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s field_filename = "#{basename}_#{format('%03d', (page_index_list[0] + 1))}-#{formatted_max_index}#{extension}" - extracted_pdf = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), - field_filename) + extracted_pdf = Mindee::PDF::PDFExtractor::ExtractedPDF.new(cut_pages(page_index_list), + field_filename) extracted_pdfs << extracted_pdf end extracted_pdfs @@ -69,9 +72,9 @@ def extract_sub_documents(page_indexes) # Extracts invoices as complete PDFs from the document. # @param page_indexes [Array, InvoiceSplitterV1PageGroup>] # @param strict [Boolean] - # @return [Array] + # @return [Array] def extract_invoices(page_indexes, strict: false) - raise 'No indexes provided.' if page_indexes.empty? + raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.empty? unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup) return extract_sub_documents(page_indexes) end diff --git a/lib/mindee/pdf/pdf_processor.rb b/lib/mindee/pdf/pdf_processor.rb index 1e7e981a..f5cf0400 100644 --- a/lib/mindee/pdf/pdf_processor.rb +++ b/lib/mindee/pdf/pdf_processor.rb @@ -28,13 +28,12 @@ def self.parse(io_stream, options) all_pages = (0..pages_count - 1).to_a - case options[:operation] - when :KEEP_ONLY + if options[:operation] == :KEEP_ONLY pages_to_remove = indexes_from_keep(options[:page_indexes], all_pages) - when :REMOVE + elsif options[:operation] == :REMOVE pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages) else - raise "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" + raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options[:operation]}'" end current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a diff --git a/spec/client_spec.rb b/spec/client_spec.rb index 51c3479f..2e3716e2 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -63,5 +63,60 @@ mindee_client.load_prediction(Mindee::Product::Invoice::InvoiceV4, local_resp) expect(mindee_client).to_not be_nil end + + it 'should not load an invalid local response' do + local_resp = Mindee::Input::LocalResponse.new("#{DATA_DIR}/geometry/polygon.json") + expect do + mindee_client.load_prediction(Mindee::Product::Invoice::InvoiceV4, local_resp) + end.to raise_error Mindee::Errors::MindeeInputError + end + + it 'should not validate improper async parameters' do + file_data = File.binread("#{DATA_DIR}/file_types/receipt.jpg") + input_source = mindee_client.source_from_bytes(file_data, 'receipt.jpg') + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + max_retries: 0 + ) + end.to raise_error ArgumentError + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + initial_delay_sec: 0.5 + ) + end.to raise_error ArgumentError + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + delay_sec: 0.5 + ) + end.to raise_error ArgumentError + end + + it 'should not initialize an invalid endpoint' do + expect do + mindee_client.send( + :initialize_endpoint, + Mindee::Product::Generated::GeneratedV1, + endpoint_name: nil, + account_name: 'account_name', + version: 'version' + ) + end.to raise_error Mindee::Errors::MindeeConfigurationError + + expect do + mindee_client.send( + :initialize_endpoint, + Mindee::Product::Generated::GeneratedV1, + endpoint_name: '', + account_name: 'account_name', + version: 'version' + ) + end.to raise_error Mindee::Errors::MindeeConfigurationError + end end end diff --git a/spec/extraction/invoice_splitter_extraction_integration.rb b/spec/extraction/invoice_splitter_extraction_integration.rb index f6401d32..e32b8ad0 100644 --- a/spec/extraction/invoice_splitter_extraction_integration.rb +++ b/spec/extraction/invoice_splitter_extraction_integration.rb @@ -30,7 +30,7 @@ def prepare_invoice_return(rst_file_path, invoice_prediction) ) inference = response.document.inference - pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(invoice_splitter_input) + pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(invoice_splitter_input) expect(pdf_extractor.page_count).to eq(2) extracted_pdfs_strict = pdf_extractor.extract_invoices(inference.prediction.invoice_page_groups, strict: true) diff --git a/spec/extraction/multi_receipts_extractor_spec.rb b/spec/extraction/multi_receipts_extractor_spec.rb index 654e4c57..a3876347 100644 --- a/spec/extraction/multi_receipts_extractor_spec.rb +++ b/spec/extraction/multi_receipts_extractor_spec.rb @@ -5,8 +5,19 @@ require 'mindee/extraction' require_relative '../data' -describe Mindee::Extraction do - include Mindee::Extraction +describe Mindee::Extraction::MultiReceiptsExtractor do + include Mindee::Image + let(:empty_inference) do + double('Inference', prediction: double('Prediction', receipts: nil), pages: []) + end + + let(:valid_inference_with_no_receipts) do + double('Inference', prediction: double('Prediction', receipts: []), pages: []) + end + + let(:empty_input_source) do + double('InputSource', count_pdf_pages: 0) + end let(:multi_receipts_single_page_path) do File.join(DATA_DIR, 'products', 'multi_receipts_detector', 'default_sample.jpg') end @@ -111,4 +122,20 @@ expect(extracted_receipts[4].as_source.filename).to end_with('jpg') end end + + context 'when no receipts are found in inference' do + it 'raises a MindeeInputError' do + expect do + described_class.extract_receipts(empty_input_source, empty_inference) + end.to raise_error(Mindee::Errors::MindeeInputError, + 'No possible receipts candidates found for Multi-Receipts extraction.') + end + end + + context 'when input source has no pages' do + it 'returns an empty array' do + extracted_receipts = described_class.extract_receipts(empty_input_source, valid_inference_with_no_receipts) + expect(extracted_receipts).to eq([]) + end + end end diff --git a/spec/http/error_spec.rb b/spec/http/error_handler_spec.rb similarity index 81% rename from spec/http/error_spec.rb rename to spec/http/error_handler_spec.rb index 45154587..2b2c0a3b 100644 --- a/spec/http/error_spec.rb +++ b/spec/http/error_handler_spec.rb @@ -4,7 +4,7 @@ require 'json' require_relative 'mock_http_response' -describe Mindee::HTTP::Error do +describe Mindee::HTTP::ErrorHandler do context 'An HTTP call' do it 'should make an invalid API sync parse call raising an exception' do mindee_client1 = Mindee::Client.new(api_key: 'invalid-api-key') @@ -13,7 +13,7 @@ doc_class = Mindee::Product::Receipt::ReceiptV5 expect do mindee_client1.parse(input_source, doc_class, all_words: false, close_file: true) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end it 'should make an invalid API async enqueue call raising an exception' do @@ -23,7 +23,7 @@ doc_class = Mindee::Product::Invoice::InvoiceV4 expect do mindee_client1.enqueue(input_source, doc_class) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end it 'should make an invalid API async parse call raising an exception' do @@ -31,7 +31,7 @@ doc_class = Mindee::Product::InvoiceSplitter::InvoiceSplitterV1 expect do mindee_client1.parse_queued('invalid-job-id', doc_class) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end # NOTE: No reliable UT each HTTP error for ruby as the only semi-reliable http mock lib (Webmock) isn't compatible @@ -41,10 +41,10 @@ it 'should fail on a 400 response with object' do file = File.read("#{DATA_DIR}/errors/error_400_no_details.json") error_obj = MockHTTPResponse.new('1.0', '400', 'Some scary message here', file) - error400 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error400 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error400 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error400.status_code).to eq(400) expect(error400.api_code).to eq('SomeCode') expect(error400.api_message).to eq('Some scary message here') @@ -54,10 +54,10 @@ it 'should fail on a 401 response with object' do file = File.read("#{DATA_DIR}/errors/error_401_invalid_token.json") error_obj = MockHTTPResponse.new('1.0', '401', 'Authorization required', file) - error401 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error401 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error401 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error401.status_code).to eq(401) expect(error401.api_code).to eq('Unauthorized') expect(error401.api_message).to eq('Authorization required') @@ -67,10 +67,10 @@ it 'should fail on a 429 response with object' do file = File.read("#{DATA_DIR}/errors/error_429_too_many_requests.json") error_obj = MockHTTPResponse.new('1.0', '429', 'Too many requests', file) - error429 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error429 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error429 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error429.status_code).to eq(429) expect(error429.api_code).to eq('TooManyRequests') expect(error429.api_message).to eq('Too many requests') @@ -80,10 +80,10 @@ it 'should fail on a 500 response with object' do file = File.read("#{DATA_DIR}/errors/error_500_inference_fail.json") error_obj = MockHTTPResponse.new('1.0', '500', 'Inference failed', file) - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('failure') expect(error500.api_message).to eq('Inference failed') @@ -93,10 +93,10 @@ it 'should fail on a 500 HTML response' do file = File.read("#{DATA_DIR}/errors/error_50x.html") error_obj = MockHTTPResponse.new('1.0', '500', '', file) - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('UnknownError') expect(error500.api_message).to eq('Server sent back an unexpected reply.') @@ -111,10 +111,10 @@ expect(hashed_obj.dig('job', 'status')).to eq('failed') expect(Mindee::HTTP::ResponseValidation.valid_async_response?(error_obj)).to be(false) Mindee::HTTP::ResponseValidation.clean_request! error_obj - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('ServerError') expect(error500.api_message).to eq('An error occurred') diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb new file mode 100644 index 00000000..d01d7d67 --- /dev/null +++ b/spec/image/extracted_image_spec.rb @@ -0,0 +1,119 @@ +# frozen_string_literal: true + +require 'mindee' +require 'pathname' +require 'fileutils' +require 'mini_magick' +require_relative '../data' + +describe Mindee::Image::ExtractedImage do + let(:file_path) do + File.join(DATA_DIR, 'products', 'invoices', 'default_sample.jpg') + end + let(:input_source) do + Mindee::Input::Source::PathInputSource.new(file_path) + end + let(:page_id) { 1 } + let(:element_id) { 42 } + let(:output_dir) { "#{DATA_DIR}/output" } + + describe '#initialize' do + it 'initializes with correct attributes' do + extracted_image = described_class.new(input_source, page_id, element_id) + + expect(extracted_image.page_id).to eq(page_id) + expect(extracted_image.element_id).to eq(element_id) + expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + + # NOTE: ruby messes up the formatting of binary strings, I don't think it worth it to correct this behavior, but + # the result is that we have to remove them from the comparisons. + input_source.io_stream.rewind + source_content = extracted_image.buffer.read.gsub("\r", '').gsub("\n", '') + input_content = input_source.io_stream.read.gsub("\r", '').gsub("\n", '') + + expect(source_content).to eq(input_content) + + input_source.io_stream.rewind + end + + it 'defaults element_id to 0 if nil is provided' do + extracted_image = described_class.new(input_source, page_id, nil) + + expect(extracted_image.element_id).to eq(0) + end + + it 'appends .jpg extension for PDF input sources' do + allow(input_source).to receive(:pdf?).and_return(true) + + extracted_image = described_class.new(input_source, page_id, element_id) + + expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + end + end + + describe '#write_to_file' do + it 'saves the buffer to a file with the correct format' do + extracted_image = described_class.new(input_source, page_id, element_id) + output_path = "#{output_dir}/output_test.jpg" + + extracted_image.write_to_file(output_path) + + expect(File.exist?(output_path)).to be true + expect(File.size(output_path)).to be > 0 + end + + it 'raises an error if file format is invalid' do + extracted_image = described_class.new(input_source, page_id, element_id) + invalid_output_path = "#{output_dir}/output_test" + + expect do + extracted_image.write_to_file(invalid_output_path) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Invalid file format}) + end + + it 'raises an error if the file cannot be saved' do + extracted_image = described_class.new(input_source, page_id, element_id) + invalid_output_path = '/invalid/path/output_test.jpg' + + expect do + extracted_image.write_to_file(invalid_output_path) + end.to raise_error(Mindee::Errors::MindeeImageError) + end + end + + describe '#as_source' do + it 'returns a BytesInputSource with the correct content and filename' do + extracted_image = described_class.new(input_source, page_id, element_id) + + source = extracted_image.as_source + + expect(source).to be_a(Mindee::Input::Source::BytesInputSource) + expect(source.filename).to eq('default_sample_p1_42.jpg') + source.io_stream.rewind + + input_source.io_stream.rewind + source_content = source.io_stream.read.gsub("\r", '').gsub("\n", '') + input_content = input_source.io_stream.read.gsub("\r", '').gsub("\n", '') + + expect(source_content).to eq(input_content) + + input_source.io_stream.rewind + end + + it 'should raise an error when MiniMagick fails during save' do + allow(MiniMagick::Image).to receive(:read).and_raise(StandardError) + + extracted_image = Mindee::Image::ExtractedImage.new(input_source, 1, 2) + + Tempfile.create(['output', '.jpg']) do |tempfile| + expect do + extracted_image.write_to_file(tempfile.path, 'jpg') + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) + end + end + + after(:each) do + FileUtils.rm_f("#{output_dir}/output_test.jpg") + end + end +end diff --git a/spec/image/image_compressor_spec.rb b/spec/image/image_compressor_spec.rb new file mode 100644 index 00000000..81c1b134 --- /dev/null +++ b/spec/image/image_compressor_spec.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +require 'mindee' + +require_relative '../data' + +describe Mindee::Image::ImageCompressor do + describe 'Image Quality Compression' do + let(:input_receipt_path) { "#{DATA_DIR}/file_types/receipt.jpg" } + let(:output_dir) { "#{DATA_DIR}/output/" } + + it 'should compress the image from input source' do + receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) + receipt_input.compress!(quality: 80) # NOTE: base jpg quality is ~81 + + FileUtils.mkdir_p(File.dirname("#{output_dir}compress_indirect.jpg")) + File.write("#{output_dir}compress_indirect.jpg", receipt_input.io_stream.read) + + initial_file_size = File.size(input_receipt_path) + compressed_file_size = File.size(output_dir) + + expect(compressed_file_size).to be < initial_file_size + end + + it 'should compress the image with various quality levels' do + receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) + + compresses = [ + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 100), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream), # default quality + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 50), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 10), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 1), + ] + + output_files = [ + "#{output_dir}/compress100.jpg", + "#{output_dir}/compress85.jpg", + "#{output_dir}/compress50.jpg", + "#{output_dir}/compress10.jpg", + "#{output_dir}/compress1.jpg", + ] + + compresses.zip(output_files).each do |compressed, output_file| + File.write(output_file, compressed.read) + end + + initial_file_size = File.size(input_receipt_path) + rendered_file_sizes = output_files.map { |file| File.size(file) } + + expect(initial_file_size).to be < rendered_file_sizes[0] + expect(initial_file_size).to be < rendered_file_sizes[1] + expect(rendered_file_sizes[1]).to be > rendered_file_sizes[2] + expect(rendered_file_sizes[2]).to be > rendered_file_sizes[3] + expect(rendered_file_sizes[3]).to be > rendered_file_sizes[4] + end + + after(:each) do + FileUtils.rm_f("#{output_dir}/compress100.jpg") + FileUtils.rm_f("#{output_dir}/compress85.jpg") + FileUtils.rm_f("#{output_dir}/compress50.jpg") + FileUtils.rm_f("#{output_dir}/compress10.jpg") + FileUtils.rm_f("#{output_dir}/compress1.jpg") + FileUtils.rm_f("#{output_dir}/compress_indirect.jpg") + end + end +end diff --git a/spec/extraction/image_extractor_spec.rb b/spec/image/image_extractor_spec.rb similarity index 82% rename from spec/extraction/image_extractor_spec.rb rename to spec/image/image_extractor_spec.rb index ff4c9667..0fb4efac 100644 --- a/spec/extraction/image_extractor_spec.rb +++ b/spec/image/image_extractor_spec.rb @@ -5,8 +5,8 @@ require 'mindee/extraction' require_relative '../data' -describe Mindee::Extraction do - include Mindee::Extraction +describe Mindee::Image do + include Mindee::Image let(:barcode_path) do File.join(DATA_DIR, 'products', 'barcode_reader', 'default_sample.jpg') end @@ -23,10 +23,10 @@ barcodes2 = inference.prediction.codes_2d.map(&:polygon) input_source = Mindee::Input::Source::PathInputSource.new(barcode_path) - extracted_barcodes_1d = Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, 1, - barcodes1) - extracted_barcodes_2d = Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, 1, - barcodes2) + extracted_barcodes_1d = Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, 1, + barcodes1) + extracted_barcodes_2d = Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, 1, + barcodes2) expect(extracted_barcodes_1d.size).to eq(1) expect(extracted_barcodes_2d.size).to eq(2) diff --git a/spec/image/image_utils_spec.rb b/spec/image/image_utils_spec.rb new file mode 100644 index 00000000..7526e7c3 --- /dev/null +++ b/spec/image/image_utils_spec.rb @@ -0,0 +1,77 @@ +# frozen_string_literal: true + +# spec/image_utils_spec.rb +require 'rspec' +require 'mini_magick' +require 'stringio' +require 'mindee' + +describe Mindee::Image::ImageUtils do + let(:sample_image_path) { "#{DATA_DIR}/file_types/receipt.jpg" } + let(:sample_image) { MiniMagick::Image.open(sample_image_path) } + + describe 'Image utility module' do + it 'Should convert StringIO to MiniMagick::Image' do + string_io = StringIO.new(File.read(sample_image_path)) + result = Mindee::Image::ImageUtils.to_image(string_io) + expect(result).to be_a(MiniMagick::Image) + end + + it 'Should return the same MiniMagick::Image object if passed as input' do + result = Mindee::Image::ImageUtils.to_image(sample_image) + expect(result).to eq(sample_image) + end + + it 'Should raise an error for invalid input types' do + expect do + Mindee::Image::ImageUtils.to_image(123) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Expected an I/O object or a MiniMagick::Image}) + end + + it 'Should convert MiniMagick image to StringIO' do + result = Mindee::Image::ImageUtils.image_to_stringio(sample_image) + expect(result).to be_a(StringIO) + end + + it 'Should set the format of the image correctly' do + result = Mindee::Image::ImageUtils.image_to_stringio(sample_image, 'PNG') + expect(result.string[1..3]).to eq('PNG') + end + + it 'Should return original dimensions if no max_width or max_height is provided' do + result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image) + expect(result).to eq([sample_image.width, sample_image.height]) + end + + it 'Should calculate new dimensions based on max_width and max_height' do + result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image, max_width: 100, max_height: 100) + expect(result[0]).to be <= 100 + expect(result[1]).to be <= 100 + end + + it 'Should raise an error if the original image is nil' do + expect do + Mindee::Image::ImageUtils.calculate_new_dimensions(nil) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Provided image could not be processed for resizing}) + end + + it 'Should return dimensions from media box if provided' do + media_box = [0, 0, 300, 400] + result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, media_box) + expect(result).to eq([300, 400]) + end + + it 'Should fall back to image dimensions if media box is nil or empty' do + result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, nil) + expect(result).to eq([sample_image.width.to_i, sample_image.height.to_i]) + end + + it 'Should raise an error if the PDF stream is invalid' do + invalid_pdf_stream = StringIO.new('invalid data') + # Adjust based on actual error raised by MiniMagick for invalid data. + expect do + Mindee::Image::ImageUtils.pdf_to_magick_image(invalid_pdf_stream, 75) + end.to raise_error(MiniMagick::Error) + end + end +end diff --git a/spec/input/local_response_spec.rb b/spec/input/local_response_spec.rb index 31b3b132..058973c2 100644 --- a/spec/input/local_response_spec.rb +++ b/spec/input/local_response_spec.rb @@ -53,5 +53,18 @@ expect(response.get_hmac_signature(dummy_secret_key)).to eq(signature) end end + + it 'should trigger an error when something invalid is passed' do + expect do + Mindee::Input::LocalResponse.new(123) + end.to raise_error Mindee::Errors::MindeeInputError + end + + it 'should trigger an error when the payload is not hashable' do + local_response = Mindee::Input::LocalResponse.new('Your mother was a hamster.') + expect do + local_response.as_hash + end.to raise_error Mindee::Errors::MindeeInputError + end end end diff --git a/spec/input/files_handling_spec.rb b/spec/input/sources/files_handling_spec.rb similarity index 99% rename from spec/input/files_handling_spec.rb rename to spec/input/sources/files_handling_spec.rb index b38da352..4c068d89 100644 --- a/spec/input/files_handling_spec.rb +++ b/spec/input/sources/files_handling_spec.rb @@ -2,7 +2,7 @@ require 'mindee/input/sources' require 'base64' -require_relative '../data' +require_relative '../../data' describe Mindee::Input::Source::LocalInputSource do context 'An jpg input file' do diff --git a/spec/input/sources/sources_spec.rb b/spec/input/sources/sources_spec.rb new file mode 100644 index 00000000..885a768d --- /dev/null +++ b/spec/input/sources/sources_spec.rb @@ -0,0 +1,83 @@ +# frozen_string_literal: true + +require 'mindee' +require 'mindee/input/sources' +require 'mindee/errors' +require 'pdf-reader' + +require_relative '../../data' + +describe Mindee::Input::Source do + context 'An image input file' do + it 'should load a JPEG from a path' do + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.jpg')) + expect(input.file_mimetype).to eq('image/jpeg') + + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.jpga')) + expect(input.file_mimetype).to eq('image/jpeg') + end + + it 'should load a TIFF from a path' do + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.tif')) + expect(input.file_mimetype).to eq('image/tiff') + + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.tiff')) + expect(input.file_mimetype).to eq('image/tiff') + end + + it 'should load a HEIC from a path' do + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.heic')) + expect(input.file_mimetype).to eq('image/heic') + end + end + + context 'A PDF input file' do + it 'should load a multi-page PDF from a path' do + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice.pdf')) + expect(input.file_mimetype).to eq('application/pdf') + expect(input.pdf?).to eq(true) + + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice.pdf')) + expect(input.file_mimetype).to eq('application/pdf') + expect(input.pdf?).to eq(true) + + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice_10p.pdf')) + expect(input.file_mimetype).to eq('application/pdf') + expect(input.pdf?).to eq(true) + + input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice_10p.pdf')) + expect(input.file_mimetype).to eq('application/pdf') + expect(input.pdf?).to eq(true) + end + end + + context 'A broken fixable PDF' do + mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') + it 'Should not raise a mime error' do + expect do + mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_fixable.pdf", fix_pdf: true) + end.not_to raise_error + end + end + + context 'A broken unfixable PDF' do + mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') + it 'Should raise an error' do + expect do + mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_unfixable.pdf", fix_pdf: true) + end.to raise_error Mindee::Errors::MindeePDFError + end + end + + context 'A broken fixable invoice PDF' do + mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') + it 'Should send correct results' do + source_doc_original = mindee_client.source_from_path("#{DATA_DIR}/products/invoices/invoice.pdf") + expect do + source_doc_fixed = mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_invoice.pdf", + fix_pdf: true) + expect(source_doc_fixed.read_document[1].to_s).to eq(source_doc_original.read_document[1].to_s) + end.not_to raise_error + end + end +end diff --git a/spec/input/url_input_source_integration.rb b/spec/input/sources/url_input_source_integration.rb similarity index 100% rename from spec/input/url_input_source_integration.rb rename to spec/input/sources/url_input_source_integration.rb diff --git a/spec/input/url_input_source_spec.rb b/spec/input/sources/url_input_source_spec.rb similarity index 81% rename from spec/input/url_input_source_spec.rb rename to spec/input/sources/url_input_source_spec.rb index 12cc4b8d..d2a86f63 100644 --- a/spec/input/url_input_source_spec.rb +++ b/spec/input/sources/url_input_source_spec.rb @@ -2,7 +2,7 @@ require 'rspec' require 'mindee' -require_relative '../http/mock_http_response' +require_relative '../../http/mock_http_response' RSpec.describe Mindee::Input::Source::UrlInputSource do let(:valid_url) { 'https://validurl/some/file.jpg' } @@ -20,7 +20,7 @@ context 'with invalid URL' do it 'raises an error for invalid URLs' do - expect { described_class.new(invalid_url) }.to raise_error('URL must be HTTPS') + expect { described_class.new(invalid_url) }.to raise_error(Mindee::Errors::MindeeInputError) end end end @@ -59,12 +59,14 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '404', 'Not Found', '') } it 'raises an error' do - expect { url_input_source.as_local_input_source }.to raise_error(RuntimeError, %r{Failed to download file}) + expect do + url_input_source.as_local_input_source + end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) end end end - describe '#save_to_file' do + describe '#write_to_file' do let(:url_input_source) { described_class.new(valid_url) } let(:url_input_source_no_filename) { described_class.new(valid_url_no_filename) } @@ -77,23 +79,23 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '200', 'OK', 'file content') } it 'generates a valid filename when not provided' do - output_file_path = url_input_source_no_filename.save_to_file(output_dir) + output_file_path = url_input_source_no_filename.write_to_file(output_dir) expect(output_file_path).to match(%r{mindee_temp_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_[a-z0-9]{8}\.tmp}) end it 'saves the file with the provided filename' do - result = url_input_source.save_to_file('/tmp', filename: 'file.pdf') + result = url_input_source.write_to_file('/tmp', filename: 'file.pdf') expect(result).to eq('/tmp/file.pdf') expect(File).to have_received(:write).with('/tmp/file.pdf', 'file content') end it 'uses a custom filename when provided' do - result = url_input_source.save_to_file('/tmp', filename: 'custom.pdf') + result = url_input_source.write_to_file('/tmp', filename: 'custom.pdf') expect(result).to eq('/tmp/custom.pdf') end it 'handles authentication' do - result = url_input_source_no_filename.save_to_file('/tmp', username: 'user', password: 'pass') + result = url_input_source_no_filename.write_to_file('/tmp', username: 'user', password: 'pass') expect(result).to match(%r{/tmp/mindee_temp_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_[a-z0-9]{8}\.tmp}) end end @@ -102,7 +104,9 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '404', 'Not Found', '') } it 'raises an error' do - expect { url_input_source.save_to_file('/tmp') }.to raise_error(RuntimeError, %r{Failed to download file}) + expect do + url_input_source.write_to_file('/tmp') + end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) end end end diff --git a/spec/input/sources_spec.rb b/spec/input/sources_spec.rb deleted file mode 100644 index 72c2244b..00000000 --- a/spec/input/sources_spec.rb +++ /dev/null @@ -1,231 +0,0 @@ -# frozen_string_literal: true - -require 'mindee' -require 'mindee/input/sources' -require 'pdf-reader' - -require_relative '../data' - -describe Mindee::Input::Source do - context 'An image input file' do - it 'should load a JPEG from a path' do - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.jpg')) - expect(input.file_mimetype).to eq('image/jpeg') - - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.jpga')) - expect(input.file_mimetype).to eq('image/jpeg') - end - - it 'should load a TIFF from a path' do - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.tif')) - expect(input.file_mimetype).to eq('image/tiff') - - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.tiff')) - expect(input.file_mimetype).to eq('image/tiff') - end - - it 'should load a HEIC from a path' do - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'file_types/receipt.heic')) - expect(input.file_mimetype).to eq('image/heic') - end - end - - context 'A PDF input file' do - it 'should load a multi-page PDF from a path' do - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice.pdf')) - expect(input.file_mimetype).to eq('application/pdf') - expect(input.pdf?).to eq(true) - - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice.pdf')) - expect(input.file_mimetype).to eq('application/pdf') - expect(input.pdf?).to eq(true) - - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice_10p.pdf')) - expect(input.file_mimetype).to eq('application/pdf') - expect(input.pdf?).to eq(true) - - input = Mindee::Input::Source::PathInputSource.new(File.join(DATA_DIR, 'products/invoices/invoice_10p.pdf')) - expect(input.file_mimetype).to eq('application/pdf') - expect(input.pdf?).to eq(true) - end - end - - context 'A broken fixable PDF' do - mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') - it 'Should not raise a mime error' do - expect do - mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_fixable.pdf", fix_pdf: true) - end.not_to raise_error - end - end - - context 'A broken unfixable PDF' do - mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') - it 'Should raise an error' do - expect do - mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_unfixable.pdf", fix_pdf: true) - end.to raise_error Mindee::Input::Source::UnfixablePDFError - end - end - - context 'A broken fixable invoice PDF' do - mindee_client = Mindee::Client.new(api_key: 'invalid-api-key') - it 'Should send correct results' do - source_doc_original = mindee_client.source_from_path("#{DATA_DIR}/products/invoices/invoice.pdf") - expect do - source_doc_fixed = mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_invoice.pdf", - fix_pdf: true) - expect(source_doc_fixed.read_document[1].to_s).to eq(source_doc_original.read_document[1].to_s) - end.not_to raise_error - end - end - - describe 'Image Quality Compression' do - let(:input_receipt_path) { "#{DATA_DIR}/file_types/receipt.jpg" } - let(:output_dir) { "#{DATA_DIR}/output/" } - - it 'should compress the image from input source' do - receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) - receipt_input.compress!(quality: 80) # NOTE: base jpg quality is ~81 - - FileUtils.mkdir_p(File.dirname("#{output_dir}compress_indirect.jpg")) - File.write("#{output_dir}compress_indirect.jpg", receipt_input.io_stream.read) - - initial_file_size = File.size(input_receipt_path) - compressed_file_size = File.size(output_dir) - - expect(compressed_file_size).to be < initial_file_size - end - - it 'should compress the image with various quality levels' do - receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) - - compresses = [ - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 100), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream), # default quality - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 50), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 10), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 1), - ] - - output_files = [ - "#{output_dir}/compress100.jpg", - "#{output_dir}/compress85.jpg", - "#{output_dir}/compress50.jpg", - "#{output_dir}/compress10.jpg", - "#{output_dir}/compress1.jpg", - ] - - compresses.zip(output_files).each do |compressed, output_file| - File.write(output_file, compressed.read) - end - - initial_file_size = File.size(input_receipt_path) - rendered_file_sizes = output_files.map { |file| File.size(file) } - - expect(initial_file_size).to be < rendered_file_sizes[0] - expect(initial_file_size).to be < rendered_file_sizes[1] - expect(rendered_file_sizes[1]).to be > rendered_file_sizes[2] - expect(rendered_file_sizes[2]).to be > rendered_file_sizes[3] - expect(rendered_file_sizes[3]).to be > rendered_file_sizes[4] - end - - after(:each) do - FileUtils.rm_f("#{output_dir}/compress100.jpg") - FileUtils.rm_f("#{output_dir}/compress85.jpg") - FileUtils.rm_f("#{output_dir}/compress50.jpg") - FileUtils.rm_f("#{output_dir}/compress10.jpg") - FileUtils.rm_f("#{output_dir}/compress1.jpg") - FileUtils.rm_f("#{output_dir}/compress_indirect.jpg") - end - end - - describe 'The PDF text detection method' do - it 'should detect text pdf in a PDF file.' do - text_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/pdf/multipage.pdf") - expect(Mindee::PDF::PDFTools.source_text?(text_input.io_stream)).to be(true) - end - - it 'should not detect text pdf in an empty PDF file.' do - no_text_input = Mindee::Input::Source::PathInputSource.new( - "#{DATA_DIR}/file_types/pdf/blank_1.pdf" - ) - expect(Mindee::PDF::PDFTools.source_text?(no_text_input.io_stream)).to be(false) - end - - it 'should not detect text pdf in an image file.' do - image_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/receipt.jpg") - expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false) - end - end - - describe 'PDF compression' do - it 'should compress from an input source' do - input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" - output_file_path = "#{DATA_DIR}/output/compress_indirect.pdf" - pdf_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoice_splitter/default_sample.pdf") - pdf_input.compress!(quality: 50) - File.write(output_file_path, pdf_input.io_stream.read) - expect(File.size(output_file_path)).to be < File.size(input_file_path) - end - - it 'should compress from the compressor' do - input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" - output_file_paths = { - 85 => "#{DATA_DIR}/output/compressed_direct_85.pdf", - 75 => "#{DATA_DIR}/output/compressed_direct_75.pdf", - 50 => "#{DATA_DIR}/output/compressed_direct_50.pdf", - 10 => "#{DATA_DIR}/output/compressed_direct_10.pdf", - } - pdf = File.open(input_file_path) - output_file_paths.each_pair do |key, value| - compressed_pdf = Mindee::PDF::PDFCompressor.compress_pdf(pdf, quality: key) - compressed_pdf.rewind - File.write(value, compressed_pdf.read) - end - expect(File.size(input_file_path)).to be > File.size(output_file_paths[85]) - expect(File.size(output_file_paths[75])).to be < File.size(output_file_paths[85]) - expect(File.size(output_file_paths[50])).to be < File.size(output_file_paths[75]) - expect(File.size(output_file_paths[10])).to be < File.size(output_file_paths[50]) - end - - after(:each) do - output_dir = "#{DATA_DIR}/output" - FileUtils.rm_f("#{output_dir}/compressed_direct_85.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_75.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_50.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_10.pdf") - FileUtils.rm_f("#{output_dir}/compress_indirect.pdf") - end - end - - describe 'source text PDF compression' do - it 'should compress if forced' do - input_file_path = "#{DATA_DIR}/file_types/pdf/multipage.pdf" - output_file_path = "#{DATA_DIR}/output/compress_with_text.pdf" - pdf_input = Mindee::Input::Source::PathInputSource.new(input_file_path) - pdf_input.compress!(quality: 50, force_source_text: true, disable_source_text: false) - File.write(output_file_path, pdf_input.io_stream.read) - expect(File.size(output_file_path)).to be > File.size(input_file_path) - - pdf_input.io_stream.rewind - reader = PDFReader::Reader.new(pdf_input.io_stream) - - text = '' - reader.pages.each do |original_page| - receiver = PDFReader::Reader::PageTextReceiver.new - original_page.walk(receiver) - - receiver.runs.each do |text_run| - text += text_run.text - end - end - expect(text).to eq('*' * 650) - end - - after(:each) do - output_dir = "#{DATA_DIR}/output" - FileUtils.rm_f("#{output_dir}/compress_with_text.pdf") - end - end -end diff --git a/spec/fields/date_field_spec.rb b/spec/parsing/standard/date_field_spec.rb similarity index 100% rename from spec/fields/date_field_spec.rb rename to spec/parsing/standard/date_field_spec.rb diff --git a/spec/fields/string_field_spec.rb b/spec/parsing/standard/string_field_spec.rb similarity index 100% rename from spec/fields/string_field_spec.rb rename to spec/parsing/standard/string_field_spec.rb diff --git a/spec/pdf/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb new file mode 100644 index 00000000..afccf91c --- /dev/null +++ b/spec/pdf/extracted_pdf_spec.rb @@ -0,0 +1,94 @@ +# frozen_string_literal: true + +require 'mindee' +require 'rspec' + +describe Mindee::PDF::PDFExtractor::ExtractedPDF do + let(:product_data_dir) { File.join(DATA_DIR, 'products') } + let(:output_dir) { File.join(DATA_DIR, 'output') } + let(:file_types_dir) { File.join(DATA_DIR, 'file_types') } + let(:valid_pdf_path) { "#{product_data_dir}/invoices/invoice.pdf" } + let(:invalid_pdf_path) { "#{file_types_dir}/receipt.txt" } + let(:output_path) { "#{output_dir}/sample_output.pdf" } + + before do + allow(File).to receive(:directory?).and_return(false) + allow(File).to receive(:exist?).and_return(true) + allow(File).to receive(:extname).and_return('.pdf') + allow(File).to receive(:write) + end + + describe '#initialize' do + it 'initializes with valid pdf bytes and filename' do + pdf_stream = File.open(valid_pdf_path, 'r') + extracted_pdf = described_class.new(pdf_stream, 'invoice.pdf') + + expect(extracted_pdf.pdf_bytes).to eq(pdf_stream) + expect(extracted_pdf.filename).to eq('invoice.pdf') + end + end + + describe '#page_count' do + it 'raises an error for invalid PDF content' do + jpg_stream = File.open(invalid_pdf_path, 'r') + pdf_wrapper = described_class.new(jpg_stream, 'dummy.pdf') + + expect do + pdf_wrapper.page_count + end.to raise_error Mindee::Errors::MindeePDFError, %r{Could not retrieve page count} + end + + it 'returns the correct page count for a valid PDF' do + pdf_stream = File.open(valid_pdf_path, 'r') + allow(Mindee::PDF::PdfProcessor).to receive(:open_pdf).and_return(double(pages: [1, 2, 3])) + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect(pdf_wrapper.page_count).to eq(3) + end + end + + describe '#write_to_file' do + it 'writes the PDF bytes to a specified file path' do + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect { pdf_wrapper.write_to_file(output_path) }.not_to raise_error + expect(File).to have_received(:write).with(output_path, pdf_stream) + end + + it 'raises an error if the output path is a directory' do + allow(File).to receive(:directory?).and_return(true) + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect do + pdf_wrapper.write_to_file(output_path) + end.to raise_error Mindee::Errors::MindeePDFError, %r{Provided path is not a file} + end + + it 'raises an error if the save path is invalid' do + allow(File).to receive(:exist?).and_return(false) + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect do + pdf_wrapper.write_to_file(output_path) + end.to raise_error Mindee::Errors::MindeePDFError, %r{Invalid save path provided} + end + end + + describe '#as_input_source' do + it 'returns a BytesInputSource object with correct attributes' do + pdf_stream = StringIO.new('pdf content') + input_source_double = double('BytesInputSource', content: 'pdf content', filename: 'invoice.pdf') + + allow(Mindee::Input::Source::BytesInputSource).to receive(:new).and_return(input_source_double) + + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + input_source = pdf_wrapper.as_input_source + + expect(input_source.content).to eq('pdf content') + expect(input_source.filename).to eq('invoice.pdf') + end + end +end diff --git a/spec/pdf/pdf_compressor_spec.rb b/spec/pdf/pdf_compressor_spec.rb new file mode 100644 index 00000000..bb36d87c --- /dev/null +++ b/spec/pdf/pdf_compressor_spec.rb @@ -0,0 +1,96 @@ +# frozen_string_literal: true + +require 'mindee' + +require_relative '../data' + +describe Mindee::PDF::PDFCompressor do + describe 'The PDF text detection method' do + it 'should detect text pdf in a PDF file.' do + text_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/pdf/multipage.pdf") + expect(Mindee::PDF::PDFTools.source_text?(text_input.io_stream)).to be(true) + end + + it 'should not detect text pdf in an empty PDF file.' do + no_text_input = Mindee::Input::Source::PathInputSource.new( + "#{DATA_DIR}/file_types/pdf/blank_1.pdf" + ) + expect(Mindee::PDF::PDFTools.source_text?(no_text_input.io_stream)).to be(false) + end + + it 'should not detect text pdf in an image file.' do + image_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/receipt.jpg") + expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false) + end + end + + describe 'PDF compression' do + it 'should compress from an input source' do + input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" + output_file_path = "#{DATA_DIR}/output/compress_indirect.pdf" + pdf_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoice_splitter/default_sample.pdf") + pdf_input.compress!(quality: 50) + File.write(output_file_path, pdf_input.io_stream.read) + expect(File.size(output_file_path)).to be < File.size(input_file_path) + end + + it 'should compress from the compressor' do + input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" + output_file_paths = { + 85 => "#{DATA_DIR}/output/compressed_direct_85.pdf", + 75 => "#{DATA_DIR}/output/compressed_direct_75.pdf", + 50 => "#{DATA_DIR}/output/compressed_direct_50.pdf", + 10 => "#{DATA_DIR}/output/compressed_direct_10.pdf", + } + pdf = File.open(input_file_path) + output_file_paths.each_pair do |key, value| + compressed_pdf = Mindee::PDF::PDFCompressor.compress_pdf(pdf, quality: key) + compressed_pdf.rewind + File.write(value, compressed_pdf.read) + end + expect(File.size(input_file_path)).to be > File.size(output_file_paths[85]) + expect(File.size(output_file_paths[75])).to be < File.size(output_file_paths[85]) + expect(File.size(output_file_paths[50])).to be < File.size(output_file_paths[75]) + expect(File.size(output_file_paths[10])).to be < File.size(output_file_paths[50]) + end + + after(:each) do + output_dir = "#{DATA_DIR}/output" + FileUtils.rm_f("#{output_dir}/compressed_direct_85.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_75.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_50.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_10.pdf") + FileUtils.rm_f("#{output_dir}/compress_indirect.pdf") + end + end + + describe 'source text PDF compression' do + it 'should compress if forced' do + input_file_path = "#{DATA_DIR}/file_types/pdf/multipage.pdf" + output_file_path = "#{DATA_DIR}/output/compress_with_text.pdf" + pdf_input = Mindee::Input::Source::PathInputSource.new(input_file_path) + pdf_input.compress!(quality: 50, force_source_text: true, disable_source_text: false) + File.write(output_file_path, pdf_input.io_stream.read) + expect(File.size(output_file_path)).to be > File.size(input_file_path) + + pdf_input.io_stream.rewind + reader = PDFReader::Reader.new(pdf_input.io_stream) + + text = '' + reader.pages.each do |original_page| + receiver = PDFReader::Reader::PageTextReceiver.new + original_page.walk(receiver) + + receiver.runs.each do |text_run| + text += text_run.text + end + end + expect(text).to eq('*' * 650) + end + + after(:each) do + output_dir = "#{DATA_DIR}/output" + FileUtils.rm_f("#{output_dir}/compress_with_text.pdf") + end + end +end diff --git a/spec/extraction/pdf_extractor_spec.rb b/spec/pdf/pdf_extractor_spec.rb similarity index 91% rename from spec/extraction/pdf_extractor_spec.rb rename to spec/pdf/pdf_extractor_spec.rb index e8ca5e2c..6de412f0 100644 --- a/spec/extraction/pdf_extractor_spec.rb +++ b/spec/pdf/pdf_extractor_spec.rb @@ -20,13 +20,13 @@ jpg_input = Mindee::Input::Source::PathInputSource.new(invoice_default_sample_path) expect(jpg_input.pdf?).to eq(false) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(jpg_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(jpg_input) expect(extractor.page_count).to eq(1) end it 'should extract invoices from a PDF (no strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) @@ -45,7 +45,7 @@ it 'should extract invoices from a PDF (strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) expect(loaded_prediction.invoice_page_groups.length).to eq(3) diff --git a/spec/input/pdf_processing_spec.rb b/spec/pdf/pdf_processor_spec.rb similarity index 89% rename from spec/input/pdf_processing_spec.rb rename to spec/pdf/pdf_processor_spec.rb index 4feae54c..0ef77fe1 100644 --- a/spec/input/pdf_processing_spec.rb +++ b/spec/pdf/pdf_processor_spec.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -require 'mindee/pdf' +require 'mindee' require_relative '../data' @@ -108,5 +108,18 @@ def open_pdf(io_stream) new_pdf = open_pdf(new_stream) expect(new_pdf.pages.size).to eq(9) end + + it 'Should fail on invalid operation' do + io_stream = File.open(filepath, 'rb') + io_stream.seek(0) + options = { + page_indexes: [1], + operation: :broken, + on_min_pages: 0, + } + expect do + Mindee::PDF::PdfProcessor.parse(io_stream, options) + end.to raise_error ArgumentError + end end end