From 109cd64d69703fff3430147e2ac8bd3f2df41f27 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Wed, 22 Jan 2025 16:09:08 +0100 Subject: [PATCH 01/16] tmp --- lib/mindee.rb | 4 ++++ lib/mindee/errors.rb | 0 lib/mindee/errors/http_errors.rb | 15 +++++++++++++++ lib/mindee/errors/mindee_error.rb | 6 ++++++ 4 files changed, 25 insertions(+) create mode 100644 lib/mindee/errors.rb create mode 100644 lib/mindee/errors/http_errors.rb create mode 100644 lib/mindee/errors/mindee_error.rb diff --git a/lib/mindee.rb b/lib/mindee.rb index ca57a06f..600d40a6 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -4,6 +4,10 @@ require 'mindee/extraction' module Mindee + # Mindee internal error module. + module Errors + end + # Mindee internal http module. module HTTP end diff --git a/lib/mindee/errors.rb b/lib/mindee/errors.rb new file mode 100644 index 00000000..e69de29b diff --git a/lib/mindee/errors/http_errors.rb b/lib/mindee/errors/http_errors.rb new file mode 100644 index 00000000..6332a4d9 --- /dev/null +++ b/lib/mindee/errors/http_errors.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +module Errors + # Base class for all http-related errors. + class MindeeHTTPError < MindeeError + # @return [Integer] + attr_reader :status_code + # @return [String, Nil] + attr_reader :api_code + # @return [Hash] + attr_reader :api_details + # @return [Hash] + attr_reader :api_message + end +end diff --git a/lib/mindee/errors/mindee_error.rb b/lib/mindee/errors/mindee_error.rb new file mode 100644 index 00000000..7d7cbd25 --- /dev/null +++ b/lib/mindee/errors/mindee_error.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true + +module Errors + # Base class for all custom mindee errors. + class MindeeError < StandardError; end +end From b7b923d3ec6208750a853cb986fe2ab08a85c3e8 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 23 Jan 2025 15:24:01 +0100 Subject: [PATCH 02/16] :boom: refactor error handling module --- lib/mindee/client.rb | 20 +++++++--- lib/mindee/errors.rb | 5 +++ lib/mindee/errors/http_errors.rb | 15 ------- lib/mindee/errors/mindee_error.rb | 17 ++++++-- lib/mindee/errors/mindee_http_error.rb | 36 +++++++++++++++++ lib/mindee/errors/mindee_input_error.rb | 29 ++++++++++++++ .../extraction/common/extracted_image.rb | 6 +-- .../multi_receipts_extractor.rb | 5 ++- .../extraction/pdf_extractor/extracted_pdf.rb | 7 ++-- .../extraction/pdf_extractor/pdf_extractor.rb | 9 +++-- lib/mindee/http.rb | 2 +- lib/mindee/http/endpoint.rb | 11 ++--- .../http/{error.rb => error_handler.rb} | 40 +++---------------- lib/mindee/http/workflow_endpoint.rb | 10 ++--- lib/mindee/image/image_utils.rb | 5 ++- lib/mindee/input/local_response.rb | 6 +-- .../input/sources/local_input_source.rb | 27 +------------ lib/mindee/input/sources/url_input_source.rb | 8 ++-- lib/mindee/parsing/common/api_response.rb | 2 +- lib/mindee/pdf/pdf_processor.rb | 2 +- .../{error_spec.rb => error_handler_spec.rb} | 32 +++++++-------- spec/input/sources_spec.rb | 3 +- spec/input/url_input_source_spec.rb | 10 +++-- 23 files changed, 171 insertions(+), 136 deletions(-) delete mode 100644 lib/mindee/errors/http_errors.rb create mode 100644 lib/mindee/errors/mindee_http_error.rb create mode 100644 lib/mindee/errors/mindee_input_error.rb rename lib/mindee/http/{error.rb => error_handler.rb} (74%) rename spec/http/{error_spec.rb => error_handler_spec.rb} (81%) diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb index 8ed4a69d..54770cd3 100644 --- a/lib/mindee/client.rb +++ b/lib/mindee/client.rb @@ -187,7 +187,8 @@ def enqueue_and_parse( end if queue_res.job.status != Mindee::Parsing::Common::JobStatus::COMPLETED elapsed = initial_delay_sec + (polling_attempts * delay_sec) - raise "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)" + raise Errors::MindeeAPIError, + "Asynchronous parsing request timed out after #{elapsed} seconds (#{polling_attempts} tries)" end queue_res @@ -243,7 +244,7 @@ def execute_workflow( def load_prediction(product_class, local_response) Mindee::Parsing::Common::ApiResponse.new(product_class, local_response.as_hash, local_response.as_hash.to_json) rescue KeyError - raise 'No prediction found in local response.' + raise Errors::MindeeError, 'No prediction found in local response.' end # Load a document from an absolute path, as a string. @@ -314,11 +315,18 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) min_delay_sec = 1 min_initial_delay_sec = 1 min_retries = 2 - raise "Cannot set auto-poll delay to less than #{min_delay_sec} second(s)" if delay_sec < min_delay_sec + if delay_sec < min_delay_sec + raise Errors::MindeeUserError, + "Cannot set auto-poll delay to less than #{min_delay_sec} second(s)" + end if initial_delay_sec < min_initial_delay_sec - raise "Cannot set initial parsing delay to less than #{min_initial_delay_sec} second(s)" + raise Errors::MindeeUserError, + "Cannot set initial parsing delay to less than #{min_initial_delay_sec} second(s)" end - raise "Cannot set auto-poll retries to less than #{min_retries}" if max_retries < min_retries + return unless max_retries < min_retries + + raise Errors::MindeeUserError, + "Cannot set auto-poll retries to less than #{min_retries}" end # Creates an endpoint with the given values. Raises an error if the endpoint is invalid. @@ -334,7 +342,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) # @return [Mindee::HTTP::Endpoint] def initialize_endpoint(product_class, endpoint_name: '', account_name: '', version: '') if (endpoint_name.nil? || endpoint_name.empty?) && product_class == Mindee::Product::Custom::CustomV1 - raise 'Missing argument endpoint_name when using custom class' + raise Errors::MindeeUserError, 'Missing argument endpoint_name when using custom class' end endpoint_name = fix_endpoint_name(product_class, endpoint_name) diff --git a/lib/mindee/errors.rb b/lib/mindee/errors.rb index e69de29b..59117d6c 100644 --- a/lib/mindee/errors.rb +++ b/lib/mindee/errors.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +require_relative 'errors/mindee_error' +require_relative 'errors/mindee_http_error' +require_relative 'errors/mindee_input_error' diff --git a/lib/mindee/errors/http_errors.rb b/lib/mindee/errors/http_errors.rb deleted file mode 100644 index 6332a4d9..00000000 --- a/lib/mindee/errors/http_errors.rb +++ /dev/null @@ -1,15 +0,0 @@ -# frozen_string_literal: true - -module Errors - # Base class for all http-related errors. - class MindeeHTTPError < MindeeError - # @return [Integer] - attr_reader :status_code - # @return [String, Nil] - attr_reader :api_code - # @return [Hash] - attr_reader :api_details - # @return [Hash] - attr_reader :api_message - end -end diff --git a/lib/mindee/errors/mindee_error.rb b/lib/mindee/errors/mindee_error.rb index 7d7cbd25..1b5ad928 100644 --- a/lib/mindee/errors/mindee_error.rb +++ b/lib/mindee/errors/mindee_error.rb @@ -1,6 +1,17 @@ # frozen_string_literal: true -module Errors - # Base class for all custom mindee errors. - class MindeeError < StandardError; end +module Mindee + module Errors + # Base class for all custom mindee errors. + class MindeeError < StandardError; end + + # Errors relating to library issues. + class MindeeAPIError < MindeeError; end + + # Errors relating to misuse of the library. + class MindeeUserError < MindeeError; end + + # Errors relating to geometric manipulation issues. + class MindeeGeometryError < MindeeError; end + end end diff --git a/lib/mindee/errors/mindee_http_error.rb b/lib/mindee/errors/mindee_http_error.rb new file mode 100644 index 00000000..1694ef2c --- /dev/null +++ b/lib/mindee/errors/mindee_http_error.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +require_relative 'mindee_error' + +module Mindee + module Errors + # API HttpError + class MindeeHTTPError < MindeeError + # @return [String] + attr_reader :status_code + # @return [Integer] + attr_reader :api_code + # @return [String] + attr_reader :api_details + # @return [String] + attr_reader :api_message + + # @param http_error [Hash] + # @param url [String] + # @param code [Integer] + def initialize(http_error, url, code) + @status_code = code + @api_code = http_error['code'] + @api_details = http_error['details'] + @api_message = http_error['message'] + super("#{url} #{@status_code} HTTP error: #{@api_details} - #{@api_message}") + end + end + + # Base class for all client-side errors. + class MindeeHTTPClientError < MindeeHTTPError; end + + # Base class for all server-side errors. + class MindeeHTTPServerError < MindeeHTTPError; end + end +end diff --git a/lib/mindee/errors/mindee_input_error.rb b/lib/mindee/errors/mindee_input_error.rb new file mode 100644 index 00000000..281dd727 --- /dev/null +++ b/lib/mindee/errors/mindee_input_error.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +module Mindee + module Errors + # Base class for errors relating to input documents. + class MindeeInputError < MindeeError; end + + # Errors relating to sources (documents) handling. + class MindeeSourceError < MindeeInputError; end + + # Errors relating to mime type issues. + class MindeeMimeTypeError < MindeeSourceError + # @return [String] + attr_reader :invalid_mimetype + + # @param mime_type [String] + def initialize(mime_type) + @invalid_mimetype = mime_type + super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}") + end + end + + # Errors relating to the handling of images. + class MindeeImageError < MindeeInputError; end + + # Errors relating to the handling of PDF documents. + class MindeePDFError < MindeeInputError; end + end +end diff --git a/lib/mindee/extraction/common/extracted_image.rb b/lib/mindee/extraction/common/extracted_image.rb index 44a4fb07..6ff9d79c 100644 --- a/lib/mindee/extraction/common/extracted_image.rb +++ b/lib/mindee/extraction/common/extracted_image.rb @@ -46,7 +46,7 @@ def initialize(input_source, page_id, element_id) def save_to_file(output_path, file_format = nil) resolved_path = Pathname.new(output_path).realpath if file_format.nil? - raise ArgumentError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? + raise Errors::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? file_format = resolved_path.extname.delete('.').upcase end @@ -55,9 +55,9 @@ def save_to_file(output_path, file_format = nil) image.format file_format.downcase image.write resolved_path.to_s rescue TypeError - raise 'Invalid path/filename provided.' + raise Errors::MindeeImageError, 'Invalid path/filename provided.' rescue StandardError - raise "Could not save file #{Pathname.new(output_path).basename}." + raise Errors::MindeeImageError, "Could not save file #{Pathname.new(output_path).basename}." end # Return the file as a Mindee-compatible BufferInput source. diff --git a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb index 6b5bc729..b3884789 100644 --- a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +++ b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb @@ -15,7 +15,10 @@ def self.extract_receipts(input_source, inference) # @return [Array] Individual extracted receipts as an array of ExtractedMultiReceiptsImage. images = [] - raise 'No possible receipts candidates found for MultiReceipts extraction.' unless inference.prediction.receipts + unless inference.prediction.receipts + raise Errors::MindeeInputError, + 'No possible receipts candidates found for Multi-Receipts extraction.' + end (0...input_source.count_pdf_pages).each do |page_id| receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box) diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb index e8ade1e8..17baa73f 100644 --- a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +++ b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb @@ -27,14 +27,15 @@ def page_count current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes) current_pdf.pages.size rescue TypeError - raise 'Could not retrieve page count from Extracted PDF object.' + raise Errors::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.' end # Writes the contents of the current PDF object to a file. # @param output_path [String] Path to write to. def write_to_file(output_path) - raise 'Provided path is not a file' if File.directory?(destination) - raise 'Invalid save path provided' unless File.exist?(File.expand_path('..', output_path)) + raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(destination) + raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?(File.expand_path('..', + output_path)) if File.extname(output_path).downcase == '.pdf' base_path = File.expand_path('..', output_path) diff --git a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb index 14b429bb..5fa960d1 100644 --- a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +++ b/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb @@ -47,11 +47,14 @@ def extract_sub_documents(page_indexes) basename = File.basename(@filename, extension) page_indexes.each do |page_index_list| if page_index_list.empty? || page_index_list.nil? - raise "Empty indexes aren't allowed for extraction #{page_index_list}" + raise Errors::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}" end page_index_list.each do |page_index| - raise "Index #{page_index} is out of range." if (page_index > page_count) || page_index.negative? + if (page_index > page_count) || page_index.negative? + raise Errors::MindeePDFError, + "Index #{page_index} is out of range." + end end formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s field_filename = "#{basename}_#{format('%03d', @@ -71,7 +74,7 @@ def extract_sub_documents(page_indexes) # @param strict [Boolean] # @return [Array] def extract_invoices(page_indexes, strict: false) - raise 'No indexes provided.' if page_indexes.empty? + raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.empty? unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup) return extract_sub_documents(page_indexes) end diff --git a/lib/mindee/http.rb b/lib/mindee/http.rb index d4b3375e..649b58cb 100644 --- a/lib/mindee/http.rb +++ b/lib/mindee/http.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true require_relative 'http/endpoint' -require_relative 'http/error' +require_relative 'http/error_handler' require_relative 'http/workflow_endpoint' diff --git a/lib/mindee/http/endpoint.rb b/lib/mindee/http/endpoint.rb index 7b2cfe75..1c570cf3 100644 --- a/lib/mindee/http/endpoint.rb +++ b/lib/mindee/http/endpoint.rb @@ -2,7 +2,7 @@ require 'json' require 'net/http' -require_relative 'error' +require_relative 'error_handler' require_relative '../version' require_relative 'response_validation' @@ -65,7 +65,7 @@ def predict(input_source, all_words, full_text, close_file, cropper) return [hashed_response, response.body] if ResponseValidation.valid_sync_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -83,7 +83,7 @@ def predict_async(input_source, all_words, full_text, close_file, cropper) return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -97,7 +97,7 @@ def parse_async(job_id) return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -201,7 +201,8 @@ def document_queue_req(job_id) def check_api_key return unless @api_key.nil? || @api_key.empty? - raise "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ + raise Errors::MindeeAPIError, + "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ "check your Client Configuration.\n" \ 'You can set this using the ' \ "'#{HTTP::API_KEY_ENV_NAME}' environment variable." diff --git a/lib/mindee/http/error.rb b/lib/mindee/http/error_handler.rb similarity index 74% rename from lib/mindee/http/error.rb rename to lib/mindee/http/error_handler.rb index 84a7368c..b4ade5ac 100644 --- a/lib/mindee/http/error.rb +++ b/lib/mindee/http/error_handler.rb @@ -1,11 +1,12 @@ # frozen_string_literal: true require 'json' +require_relative '../errors/mindee_http_error' module Mindee module HTTP # Mindee HTTP error module. - module Error + module ErrorHandler module_function # Extracts the HTTP error from the response hash, or the job error if there is one. @@ -80,44 +81,13 @@ def handle_error(url, response) error_obj = create_error_obj(parsed_hash) case code when 400..499 - MindeeHttpClientError.new(error_obj, url, code) + Errors::MindeeHTTPClientError.new(error_obj, url, code) when 500..599 - MindeeHttpServerError.new(error_obj, url, code) + Errors::MindeeHTTPServerError.new(error_obj, url, code) else - MindeeHttpError.new(error_obj, url, code) + Errors::MindeeHTTPError.new(error_obj, url, code) end end - - # API HttpError - class MindeeHttpError < StandardError - # @return [String] - attr_reader :status_code - # @return [Integer] - attr_reader :api_code - # @return [String] - attr_reader :api_details - # @return [String] - attr_reader :api_message - - # @param http_error [Hash] - # @param url [String] - # @param code [Integer] - def initialize(http_error, url, code) - @status_code = code - @api_code = http_error['code'] - @api_details = http_error['details'] - @api_message = http_error['message'] - super("#{url} #{@status_code} HTTP error: #{@api_details} - #{@api_message}") - end - end - - # API client HttpError - class MindeeHttpClientError < MindeeHttpError - end - - # API server HttpError - class MindeeHttpServerError < MindeeHttpError - end end end end diff --git a/lib/mindee/http/workflow_endpoint.rb b/lib/mindee/http/workflow_endpoint.rb index 79e08897..cc2e0c79 100644 --- a/lib/mindee/http/workflow_endpoint.rb +++ b/lib/mindee/http/workflow_endpoint.rb @@ -2,7 +2,7 @@ require 'json' require 'net/http' -require_relative 'error' +require_relative 'error_handler' module Mindee module HTTP @@ -37,7 +37,7 @@ def execute_workflow(input_source, full_text, document_alias, priority, public_u return [hashed_response, response.body] if ResponseValidation.valid_async_response?(response) ResponseValidation.clean_request!(response) - error = Error.handle_error(@url_name, response) + error = ErrorHandler.handle_error(@url_name, response) raise error end @@ -81,9 +81,9 @@ def workflow_execution_req_post(input_source, document_alias, priority, full_tex def check_api_key return unless @api_key.nil? || @api_key.empty? - raise "Missing API key. Check your Client Configuration.\n" \ - 'You can set this using the ' \ - "'#{HTTP::API_KEY_ENV_NAME}' environment variable." + raise Errors::MindeeUserError, "Missing API key. Check your Client Configuration.\n" \ + 'You can set this using the ' \ + "'#{HTTP::API_KEY_ENV_NAME}' environment variable." end end end diff --git a/lib/mindee/image/image_utils.rb b/lib/mindee/image/image_utils.rb index d839cdf5..a7b970a9 100644 --- a/lib/mindee/image/image_utils.rb +++ b/lib/mindee/image/image_utils.rb @@ -36,7 +36,8 @@ def self.to_image(image) elsif image.is_a?(MiniMagick::Image) image else - raise "Expected an I/O object or a MiniMagick::Image. '#{image.class}' given instead." + img_class = image.class ? image.class.to_s : 'unknown format' + raise Errors::MindeeImageError, "Expected an I/O object or a MiniMagick::Image. '#{img_class}' given instead." end end @@ -59,7 +60,7 @@ def self.image_to_stringio(image, format = 'JPEG') # @param max_width [Integer] Maximum width. If not specified, the horizontal ratio will remain the same. # @param max_height [Integer] Maximum height. If not specified, the vertical ratio will remain the same. def self.calculate_new_dimensions(original, max_width: nil, max_height: nil) - raise 'Provided image could not be processed for resizing.' if original.nil? + raise Errors::MindeeImageError, 'Provided image could not be processed for resizing.' if original.nil? return [original.width, original.height] if max_width.nil? && max_height.nil? diff --git a/lib/mindee/input/local_response.rb b/lib/mindee/input/local_response.rb index 88736f97..fafe3fdf 100644 --- a/lib/mindee/input/local_response.rb +++ b/lib/mindee/input/local_response.rb @@ -27,7 +27,7 @@ def initialize(input_file) end @file.rewind else - raise "Incompatible type for input '#{input_file.class}'." + raise Errors::MindeeInputError, "Incompatible type for input '#{input_file.class}'." end end @@ -38,7 +38,7 @@ def as_hash file_str = @file.read JSON.parse(file_str, object_class: Hash) rescue JSON::ParserError - raise "File is not a valid dict. #{file_str}" + raise Errors::MindeeInputError, "File is not a valid dict. #{file_str}" end # Processes the secret key @@ -56,7 +56,7 @@ def get_hmac_signature(secret_key) @file.rewind mac = OpenSSL::HMAC.hexdigest(algorithm, self.class.process_secret_key(secret_key), @file.read) rescue StandardError - raise 'Could not get HMAC signature from payload.' + raise Errors::MindeeInputError, 'Could not get HMAC signature from payload.' end mac end diff --git a/lib/mindee/input/sources/local_input_source.rb b/lib/mindee/input/sources/local_input_source.rb index 1d2dadea..48939cfc 100644 --- a/lib/mindee/input/sources/local_input_source.rb +++ b/lib/mindee/input/sources/local_input_source.rb @@ -20,29 +20,6 @@ module Source 'image/webp', ].freeze - # Standard error for invalid mime types - class MimeTypeError < StandardError - end - - # Error sent if the file's mimetype isn't allowed - class InvalidMimeTypeError < MimeTypeError - # @return [String] - attr_reader :invalid_mimetype - - # @param mime_type [String] - def initialize(mime_type) - @invalid_mimetype = mime_type - super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}") - end - end - - # Error sent if a pdf file couldn't be fixed - class UnfixablePDFError < MimeTypeError - def initialize - super("Corrupted PDF couldn't be repaired.") - end - end - # Base class for loading documents. class LocalInputSource # @return [String] @@ -72,7 +49,7 @@ def initialize(io_stream, filename, fix_pdf: false) return if ALLOWED_MIME_TYPES.include? @file_mimetype end - raise InvalidMimeTypeError, @file_mimetype.to_s + raise Errors::MindeeMimeTypeError, @file_mimetype.to_s end # Attempts to fix pdf files if mimetype is rejected. @@ -81,7 +58,7 @@ def initialize(io_stream, filename, fix_pdf: false) # @param stream [StringIO] def rescue_broken_pdf(stream) stream.gets('%PDF-') - raise UnfixablePDFError if stream.eof? || stream.pos > 500 + raise Errors::MindeePDFError if stream.eof? || stream.pos > 500 stream.pos = stream.pos - 5 data = stream.read diff --git a/lib/mindee/input/sources/url_input_source.rb b/lib/mindee/input/sources/url_input_source.rb index 5b4a9b13..8bc03897 100644 --- a/lib/mindee/input/sources/url_input_source.rb +++ b/lib/mindee/input/sources/url_input_source.rb @@ -13,7 +13,7 @@ class UrlInputSource attr_reader :url def initialize(url) - raise 'URL must be HTTPS' unless url.start_with? 'https://' + raise Errors::MindeeInputError, 'URL must be HTTPS' unless url.start_with? 'https://' @url = url end @@ -72,9 +72,9 @@ def fetch_file_content(username: nil, password: nil, token: nil, max_redirects: response = make_request(uri, request, max_redirects) if response.code.to_i > 299 - raise "Failed to download file: HTTP status code #{response.code}" + raise Errors::MindeeAPIError, "Failed to download file: HTTP status code #{response.code}" elsif response.code.to_i < 200 - raise "Failed to download file: Invalid response code #{response.code}." + raise Errors::MindeeAPIError, "Failed to download file: Invalid response code #{response.code}." end response.body @@ -100,7 +100,7 @@ def make_request(uri, request, max_redirects) response = http.request(request) if response.is_a?(Net::HTTPRedirection) && max_redirects.positive? location = response['location'] - raise 'No location in redirection header.' if location.nil? + raise Errors::MindeeInputError, 'No location in redirection header.' if location.nil? new_uri = URI.parse(location) request = Net::HTTP::Get.new(new_uri) diff --git a/lib/mindee/parsing/common/api_response.rb b/lib/mindee/parsing/common/api_response.rb index f8bfc3e5..74fe3a24 100644 --- a/lib/mindee/parsing/common/api_response.rb +++ b/lib/mindee/parsing/common/api_response.rb @@ -108,7 +108,7 @@ class ApiResponse # @param raw_http [String] def initialize(product_class, http_response, raw_http) @raw_http = raw_http.to_s - raise 'Invalid response format.' unless http_response.key?('api_request') + raise Errors::MindeeAPIError, 'Invalid response format.' unless http_response.key?('api_request') @api_request = Mindee::Parsing::Common::ApiRequest.new(http_response['api_request']) diff --git a/lib/mindee/pdf/pdf_processor.rb b/lib/mindee/pdf/pdf_processor.rb index 1e7e981a..575329df 100644 --- a/lib/mindee/pdf/pdf_processor.rb +++ b/lib/mindee/pdf/pdf_processor.rb @@ -34,7 +34,7 @@ def self.parse(io_stream, options) when :REMOVE pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages) else - raise "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" + raise Errors::MindeeUserError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" end current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a diff --git a/spec/http/error_spec.rb b/spec/http/error_handler_spec.rb similarity index 81% rename from spec/http/error_spec.rb rename to spec/http/error_handler_spec.rb index 45154587..2b2c0a3b 100644 --- a/spec/http/error_spec.rb +++ b/spec/http/error_handler_spec.rb @@ -4,7 +4,7 @@ require 'json' require_relative 'mock_http_response' -describe Mindee::HTTP::Error do +describe Mindee::HTTP::ErrorHandler do context 'An HTTP call' do it 'should make an invalid API sync parse call raising an exception' do mindee_client1 = Mindee::Client.new(api_key: 'invalid-api-key') @@ -13,7 +13,7 @@ doc_class = Mindee::Product::Receipt::ReceiptV5 expect do mindee_client1.parse(input_source, doc_class, all_words: false, close_file: true) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end it 'should make an invalid API async enqueue call raising an exception' do @@ -23,7 +23,7 @@ doc_class = Mindee::Product::Invoice::InvoiceV4 expect do mindee_client1.enqueue(input_source, doc_class) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end it 'should make an invalid API async parse call raising an exception' do @@ -31,7 +31,7 @@ doc_class = Mindee::Product::InvoiceSplitter::InvoiceSplitterV1 expect do mindee_client1.parse_queued('invalid-job-id', doc_class) - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError end # NOTE: No reliable UT each HTTP error for ruby as the only semi-reliable http mock lib (Webmock) isn't compatible @@ -41,10 +41,10 @@ it 'should fail on a 400 response with object' do file = File.read("#{DATA_DIR}/errors/error_400_no_details.json") error_obj = MockHTTPResponse.new('1.0', '400', 'Some scary message here', file) - error400 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error400 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error400 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error400.status_code).to eq(400) expect(error400.api_code).to eq('SomeCode') expect(error400.api_message).to eq('Some scary message here') @@ -54,10 +54,10 @@ it 'should fail on a 401 response with object' do file = File.read("#{DATA_DIR}/errors/error_401_invalid_token.json") error_obj = MockHTTPResponse.new('1.0', '401', 'Authorization required', file) - error401 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error401 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error401 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error401.status_code).to eq(401) expect(error401.api_code).to eq('Unauthorized') expect(error401.api_message).to eq('Authorization required') @@ -67,10 +67,10 @@ it 'should fail on a 429 response with object' do file = File.read("#{DATA_DIR}/errors/error_429_too_many_requests.json") error_obj = MockHTTPResponse.new('1.0', '429', 'Too many requests', file) - error429 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error429 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error429 - end.to raise_error Mindee::HTTP::Error::MindeeHttpClientError + end.to raise_error Mindee::Errors::MindeeHTTPClientError expect(error429.status_code).to eq(429) expect(error429.api_code).to eq('TooManyRequests') expect(error429.api_message).to eq('Too many requests') @@ -80,10 +80,10 @@ it 'should fail on a 500 response with object' do file = File.read("#{DATA_DIR}/errors/error_500_inference_fail.json") error_obj = MockHTTPResponse.new('1.0', '500', 'Inference failed', file) - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('failure') expect(error500.api_message).to eq('Inference failed') @@ -93,10 +93,10 @@ it 'should fail on a 500 HTML response' do file = File.read("#{DATA_DIR}/errors/error_50x.html") error_obj = MockHTTPResponse.new('1.0', '500', '', file) - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('UnknownError') expect(error500.api_message).to eq('Server sent back an unexpected reply.') @@ -111,10 +111,10 @@ expect(hashed_obj.dig('job', 'status')).to eq('failed') expect(Mindee::HTTP::ResponseValidation.valid_async_response?(error_obj)).to be(false) Mindee::HTTP::ResponseValidation.clean_request! error_obj - error500 = Mindee::HTTP::Error.handle_error('dummy-url', error_obj) + error500 = Mindee::HTTP::ErrorHandler.handle_error('dummy-url', error_obj) expect do raise error500 - end.to raise_error Mindee::HTTP::Error::MindeeHttpServerError + end.to raise_error Mindee::Errors::MindeeHTTPServerError expect(error500.status_code).to eq(500) expect(error500.api_code).to eq('ServerError') expect(error500.api_message).to eq('An error occurred') diff --git a/spec/input/sources_spec.rb b/spec/input/sources_spec.rb index 72c2244b..71274f4f 100644 --- a/spec/input/sources_spec.rb +++ b/spec/input/sources_spec.rb @@ -2,6 +2,7 @@ require 'mindee' require 'mindee/input/sources' +require 'mindee/errors' require 'pdf-reader' require_relative '../data' @@ -64,7 +65,7 @@ it 'Should raise an error' do expect do mindee_client.source_from_path("#{DATA_DIR}/file_types/pdf/broken_unfixable.pdf", fix_pdf: true) - end.to raise_error Mindee::Input::Source::UnfixablePDFError + end.to raise_error Mindee::Errors::MindeePDFError end end diff --git a/spec/input/url_input_source_spec.rb b/spec/input/url_input_source_spec.rb index 12cc4b8d..393e6a41 100644 --- a/spec/input/url_input_source_spec.rb +++ b/spec/input/url_input_source_spec.rb @@ -20,7 +20,7 @@ context 'with invalid URL' do it 'raises an error for invalid URLs' do - expect { described_class.new(invalid_url) }.to raise_error('URL must be HTTPS') + expect { described_class.new(invalid_url) }.to raise_error(Mindee::Errors::MindeeInputError) end end end @@ -59,7 +59,9 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '404', 'Not Found', '') } it 'raises an error' do - expect { url_input_source.as_local_input_source }.to raise_error(RuntimeError, %r{Failed to download file}) + expect do + url_input_source.as_local_input_source + end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) end end end @@ -102,7 +104,9 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '404', 'Not Found', '') } it 'raises an error' do - expect { url_input_source.save_to_file('/tmp') }.to raise_error(RuntimeError, %r{Failed to download file}) + expect do + url_input_source.save_to_file('/tmp') + end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) end end end From 622214023ab4c85135379324ecbaad2720fc472c Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 23 Jan 2025 15:41:31 +0100 Subject: [PATCH 03/16] fix ugly lines --- lib/mindee/http/endpoint.rb | 3 +-- lib/mindee/http/workflow_endpoint.rb | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/mindee/http/endpoint.rb b/lib/mindee/http/endpoint.rb index 1c570cf3..e1e31c04 100644 --- a/lib/mindee/http/endpoint.rb +++ b/lib/mindee/http/endpoint.rb @@ -203,8 +203,7 @@ def check_api_key raise Errors::MindeeAPIError, "Missing API key for product \"'#{@url_name}' v#{@version}\" (belonging to \"#{@owner}\"), " \ - "check your Client Configuration.\n" \ - 'You can set this using the ' \ + "check your Client Configuration.\nYou can set this using the " \ "'#{HTTP::API_KEY_ENV_NAME}' environment variable." end end diff --git a/lib/mindee/http/workflow_endpoint.rb b/lib/mindee/http/workflow_endpoint.rb index cc2e0c79..9dc3632c 100644 --- a/lib/mindee/http/workflow_endpoint.rb +++ b/lib/mindee/http/workflow_endpoint.rb @@ -82,8 +82,7 @@ def check_api_key return unless @api_key.nil? || @api_key.empty? raise Errors::MindeeUserError, "Missing API key. Check your Client Configuration.\n" \ - 'You can set this using the ' \ - "'#{HTTP::API_KEY_ENV_NAME}' environment variable." + "You can set this using the '#{HTTP::API_KEY_ENV_NAME}' environment variable." end end end From f6c968b5db3ff3a97ecfc9a7c615be98de2c32cb Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Thu, 23 Jan 2025 18:02:46 +0100 Subject: [PATCH 04/16] update some tests --- lib/mindee/client.rb | 14 ++++++----- spec/client_spec.rb | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb index 54770cd3..99df9b39 100644 --- a/lib/mindee/client.rb +++ b/lib/mindee/client.rb @@ -166,6 +166,7 @@ def enqueue_and_parse( delay_sec: 1.5, max_retries: 80 ) + validate_async_params(initial_delay_sec, delay_sec, max_retries) enqueue_res = enqueue( input_source, product_class, @@ -243,8 +244,8 @@ def execute_workflow( # @return [Mindee::Parsing::Common::ApiResponse] def load_prediction(product_class, local_response) Mindee::Parsing::Common::ApiResponse.new(product_class, local_response.as_hash, local_response.as_hash.to_json) - rescue KeyError - raise Errors::MindeeError, 'No prediction found in local response.' + rescue KeyError, Errors::MindeeAPIError + raise Errors::MindeeInputError, 'No prediction found in local response.' end # Load a document from an absolute path, as a string. @@ -316,16 +317,16 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) min_initial_delay_sec = 1 min_retries = 2 if delay_sec < min_delay_sec - raise Errors::MindeeUserError, + raise ArgumentError, "Cannot set auto-poll delay to less than #{min_delay_sec} second(s)" end if initial_delay_sec < min_initial_delay_sec - raise Errors::MindeeUserError, + raise ArgumentError, "Cannot set initial parsing delay to less than #{min_initial_delay_sec} second(s)" end return unless max_retries < min_retries - raise Errors::MindeeUserError, + raise ArgumentError, "Cannot set auto-poll retries to less than #{min_retries}" end @@ -341,7 +342,8 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) # @param version [String] For custom endpoints, version of the product. # @return [Mindee::HTTP::Endpoint] def initialize_endpoint(product_class, endpoint_name: '', account_name: '', version: '') - if (endpoint_name.nil? || endpoint_name.empty?) && product_class == Mindee::Product::Custom::CustomV1 + if (endpoint_name.nil? || endpoint_name.empty?) && + [Mindee::Product::Custom::CustomV1, Mindee::Product::Generated::GeneratedV1].include?(product_class) raise Errors::MindeeUserError, 'Missing argument endpoint_name when using custom class' end diff --git a/spec/client_spec.rb b/spec/client_spec.rb index 51c3479f..cde48365 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -63,5 +63,60 @@ mindee_client.load_prediction(Mindee::Product::Invoice::InvoiceV4, local_resp) expect(mindee_client).to_not be_nil end + + it 'should not load an invalid local response' do + local_resp = Mindee::Input::LocalResponse.new("#{DATA_DIR}/geometry/polygon.json") + expect do + mindee_client.load_prediction(Mindee::Product::Invoice::InvoiceV4, local_resp) + end.to raise_error Mindee::Errors::MindeeInputError + end + + it 'should not validate improper async parameters' do + file_data = File.binread("#{DATA_DIR}/file_types/receipt.jpg") + input_source = mindee_client.source_from_bytes(file_data, 'receipt.jpg') + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + max_retries: 0 + ) + end.to raise_error ArgumentError + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + initial_delay_sec: 0.5 + ) + end.to raise_error ArgumentError + expect do + mindee_client.enqueue_and_parse( + input_source, + Mindee::Product::Invoice::InvoiceV4, + delay_sec: 0.5 + ) + end.to raise_error ArgumentError + end + + it 'should not initialize an invalid endpoint' do + expect do + mindee_client.send( + :initialize_endpoint, + Mindee::Product::Generated::GeneratedV1, + endpoint_name: nil, + account_name: 'account_name', + version: 'version' + ) + end.to raise_error Mindee::Errors::MindeeUserError + + expect do + mindee_client.send( + :initialize_endpoint, + Mindee::Product::Generated::GeneratedV1, + endpoint_name: '', + account_name: 'account_name', + version: 'version' + ) + end.to raise_error Mindee::Errors::MindeeUserError + end end end From 5db992f154a113670a43f877a35e2b4633976dad Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 24 Jan 2025 11:46:46 +0100 Subject: [PATCH 05/16] add test for image module --- lib/mindee/client.rb | 2 +- lib/mindee/errors/mindee_error.rb | 2 +- lib/mindee/http/workflow_endpoint.rb | 5 +- lib/mindee/pdf/pdf_processor.rb | 2 +- spec/client_spec.rb | 4 +- spec/image/image_utils_spec.rb | 85 ++++++++++++++++++++++++++++ 6 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 spec/image/image_utils_spec.rb diff --git a/lib/mindee/client.rb b/lib/mindee/client.rb index 99df9b39..d83737dc 100644 --- a/lib/mindee/client.rb +++ b/lib/mindee/client.rb @@ -344,7 +344,7 @@ def validate_async_params(initial_delay_sec, delay_sec, max_retries) def initialize_endpoint(product_class, endpoint_name: '', account_name: '', version: '') if (endpoint_name.nil? || endpoint_name.empty?) && [Mindee::Product::Custom::CustomV1, Mindee::Product::Generated::GeneratedV1].include?(product_class) - raise Errors::MindeeUserError, 'Missing argument endpoint_name when using custom class' + raise Errors::MindeeConfigurationError, 'Missing argument endpoint_name when using custom class' end endpoint_name = fix_endpoint_name(product_class, endpoint_name) diff --git a/lib/mindee/errors/mindee_error.rb b/lib/mindee/errors/mindee_error.rb index 1b5ad928..0f0cee39 100644 --- a/lib/mindee/errors/mindee_error.rb +++ b/lib/mindee/errors/mindee_error.rb @@ -9,7 +9,7 @@ class MindeeError < StandardError; end class MindeeAPIError < MindeeError; end # Errors relating to misuse of the library. - class MindeeUserError < MindeeError; end + class MindeeConfigurationError < MindeeError; end # Errors relating to geometric manipulation issues. class MindeeGeometryError < MindeeError; end diff --git a/lib/mindee/http/workflow_endpoint.rb b/lib/mindee/http/workflow_endpoint.rb index 9dc3632c..594b5acf 100644 --- a/lib/mindee/http/workflow_endpoint.rb +++ b/lib/mindee/http/workflow_endpoint.rb @@ -81,8 +81,9 @@ def workflow_execution_req_post(input_source, document_alias, priority, full_tex def check_api_key return unless @api_key.nil? || @api_key.empty? - raise Errors::MindeeUserError, "Missing API key. Check your Client Configuration.\n" \ - "You can set this using the '#{HTTP::API_KEY_ENV_NAME}' environment variable." + raise Errors::MindeeConfigurationError, "Missing API key. Check your Client Configuration.\n" \ + "You can set this using the '#{HTTP::API_KEY_ENV_NAME}'" \ + 'environment variable.' end end end diff --git a/lib/mindee/pdf/pdf_processor.rb b/lib/mindee/pdf/pdf_processor.rb index 575329df..8716e236 100644 --- a/lib/mindee/pdf/pdf_processor.rb +++ b/lib/mindee/pdf/pdf_processor.rb @@ -34,7 +34,7 @@ def self.parse(io_stream, options) when :REMOVE pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages) else - raise Errors::MindeeUserError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" + raise Errors::MindeeConfigurationError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" end current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a diff --git a/spec/client_spec.rb b/spec/client_spec.rb index cde48365..2e3716e2 100644 --- a/spec/client_spec.rb +++ b/spec/client_spec.rb @@ -106,7 +106,7 @@ account_name: 'account_name', version: 'version' ) - end.to raise_error Mindee::Errors::MindeeUserError + end.to raise_error Mindee::Errors::MindeeConfigurationError expect do mindee_client.send( @@ -116,7 +116,7 @@ account_name: 'account_name', version: 'version' ) - end.to raise_error Mindee::Errors::MindeeUserError + end.to raise_error Mindee::Errors::MindeeConfigurationError end end end diff --git a/spec/image/image_utils_spec.rb b/spec/image/image_utils_spec.rb new file mode 100644 index 00000000..126d575f --- /dev/null +++ b/spec/image/image_utils_spec.rb @@ -0,0 +1,85 @@ +# frozen_string_literal: true + +# spec/image_utils_spec.rb +require 'rspec' +require 'mini_magick' +require 'stringio' +require 'mindee' + +describe Mindee::Image::ImageUtils do + let(:sample_image_path) { "#{DATA_DIR}/file_types/receipt.jpg" } + let(:sample_image) { MiniMagick::Image.open(sample_image_path) } + + describe 'Image utility module' do + it 'converts StringIO to MiniMagick::Image' do + string_io = StringIO.new(File.read(sample_image_path)) + result = Mindee::Image::ImageUtils.to_image(string_io) + expect(result).to be_a(MiniMagick::Image) + end + + it 'returns the same MiniMagick::Image object if passed as input' do + result = Mindee::Image::ImageUtils.to_image(sample_image) + expect(result).to eq(sample_image) + end + + it 'raises an error for invalid input types' do + expect do + Mindee::Image::ImageUtils.to_image(123) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Expected an I/O object or a MiniMagick::Image}) + end + end + + describe '.image_to_stringio' do + it 'converts MiniMagick image to StringIO' do + result = Mindee::Image::ImageUtils.image_to_stringio(sample_image) + expect(result).to be_a(StringIO) + end + + it 'sets the format of the image correctly' do + result = Mindee::Image::ImageUtils.image_to_stringio(sample_image, 'PNG') + expect(result.string[1..3]).to eq('PNG') + end + end + + describe '.calculate_new_dimensions' do + it 'returns original dimensions if no max_width or max_height is provided' do + result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image) + expect(result).to eq([sample_image.width, sample_image.height]) + end + + it 'calculates new dimensions based on max_width and max_height' do + result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image, max_width: 100, max_height: 100) + expect(result[0]).to be <= 100 + expect(result[1]).to be <= 100 + end + + it 'raises an error if the original image is nil' do + expect do + Mindee::Image::ImageUtils.calculate_new_dimensions(nil) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Provided image could not be processed for resizing}) + end + end + + describe '.calculate_dimensions_from_media_box' do + it 'returns dimensions from media box if provided' do + media_box = [0, 0, 300, 400] + result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, media_box) + expect(result).to eq([300, 400]) + end + + it 'falls back to image dimensions if media box is nil or empty' do + result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, nil) + expect(result).to eq([sample_image.width.to_i, sample_image.height.to_i]) + end + end + + describe '.pdf_to_magick_image' do + it 'raises an error if the PDF stream is invalid' do + invalid_pdf_stream = StringIO.new('invalid data') + # Adjust based on actual error raised by MiniMagick for invalid data. + expect do + Mindee::Image::ImageUtils.pdf_to_magick_image(invalid_pdf_stream, 75) + end.to raise_error(MiniMagick::Error) + end + end +end From f31114d4cf6c037e8a893466c7e3fb0db3f46b5d Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 24 Jan 2025 16:54:00 +0100 Subject: [PATCH 06/16] add more test --- lib/mindee/errors/mindee_input_error.rb | 3 +- .../extraction/common/extracted_image.rb | 12 +- .../extraction/pdf_extractor/extracted_pdf.rb | 2 +- lib/mindee/pdf/pdf_processor.rb | 7 +- spec/extraction/extracted_image_spec.rb | 106 ++++++++++++++++++ spec/extraction/extracted_pdf_spec.rb | 17 +++ spec/image/image_utils_spec.rb | 30 ++--- spec/input/local_response_spec.rb | 13 +++ .../{ => sources}/files_handling_spec.rb | 2 +- spec/input/{ => sources}/sources_spec.rb | 2 +- .../url_input_source_integration.rb | 0 .../{ => sources}/url_input_source_spec.rb | 2 +- spec/{input => pdf}/pdf_processing_spec.rb | 13 +++ 13 files changed, 175 insertions(+), 34 deletions(-) create mode 100644 spec/extraction/extracted_image_spec.rb create mode 100644 spec/extraction/extracted_pdf_spec.rb rename spec/input/{ => sources}/files_handling_spec.rb (99%) rename spec/input/{ => sources}/sources_spec.rb (99%) rename spec/input/{ => sources}/url_input_source_integration.rb (100%) rename spec/input/{ => sources}/url_input_source_spec.rb (98%) rename spec/{input => pdf}/pdf_processing_spec.rb (90%) diff --git a/lib/mindee/errors/mindee_input_error.rb b/lib/mindee/errors/mindee_input_error.rb index 281dd727..267fea25 100644 --- a/lib/mindee/errors/mindee_input_error.rb +++ b/lib/mindee/errors/mindee_input_error.rb @@ -16,7 +16,8 @@ class MindeeMimeTypeError < MindeeSourceError # @param mime_type [String] def initialize(mime_type) @invalid_mimetype = mime_type - super("'#{@invalid_mimetype}' mime type not allowed, must be one of #{ALLOWED_MIME_TYPES.join(', ')}") + super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \ + "#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}") end end diff --git a/lib/mindee/extraction/common/extracted_image.rb b/lib/mindee/extraction/common/extracted_image.rb index 6ff9d79c..ea21d3eb 100644 --- a/lib/mindee/extraction/common/extracted_image.rb +++ b/lib/mindee/extraction/common/extracted_image.rb @@ -28,11 +28,12 @@ def initialize(input_source, page_id, element_id) @buffer = StringIO.new(input_source.io_stream.read) @buffer.rewind extension = if input_source.pdf? - 'jpg' + '.jpg' else File.extname(input_source.filename) end - @internal_file_name = "#{input_source.filename}_p#{page_id}_#{element_id}.#{extension}" + base_name = File.basename(input_source.filename, File.extname(input_source.filename)) + @internal_file_name = "#{base_name}_p#{page_id}_#{element_id}#{extension}" @page_id = page_id @element_id = element_id.nil? ? 0 : element_id end @@ -44,7 +45,7 @@ def initialize(input_source, page_id, element_id) # extension if not provided. # @raise [MindeeError] If an invalid path or filename is provided. def save_to_file(output_path, file_format = nil) - resolved_path = Pathname.new(output_path).realpath + resolved_path = Pathname.new(File.expand_path(output_path)) if file_format.nil? raise Errors::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? @@ -54,10 +55,9 @@ def save_to_file(output_path, file_format = nil) image = MiniMagick::Image.read(@buffer) image.format file_format.downcase image.write resolved_path.to_s - rescue TypeError - raise Errors::MindeeImageError, 'Invalid path/filename provided.' rescue StandardError - raise Errors::MindeeImageError, "Could not save file #{Pathname.new(output_path).basename}." + raise Errors::MindeeImageError, "Could not save file '#{output_path}'. " \ + 'Is the provided file path valid?.' end # Return the file as a Mindee-compatible BufferInput source. diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb index 17baa73f..b8235705 100644 --- a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +++ b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb @@ -26,7 +26,7 @@ def initialize(pdf_bytes, filename) def page_count current_pdf = Mindee::PDF::PdfProcessor.open_pdf(pdf_bytes) current_pdf.pages.size - rescue TypeError + rescue TypeError, Origami::InvalidPDFError raise Errors::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.' end diff --git a/lib/mindee/pdf/pdf_processor.rb b/lib/mindee/pdf/pdf_processor.rb index 8716e236..f5cf0400 100644 --- a/lib/mindee/pdf/pdf_processor.rb +++ b/lib/mindee/pdf/pdf_processor.rb @@ -28,13 +28,12 @@ def self.parse(io_stream, options) all_pages = (0..pages_count - 1).to_a - case options[:operation] - when :KEEP_ONLY + if options[:operation] == :KEEP_ONLY pages_to_remove = indexes_from_keep(options[:page_indexes], all_pages) - when :REMOVE + elsif options[:operation] == :REMOVE pages_to_remove = indexes_from_remove(options[:page_indexes], all_pages) else - raise Errors::MindeeConfigurationError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{behavior}'" + raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options[:operation]}'" end current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a diff --git a/spec/extraction/extracted_image_spec.rb b/spec/extraction/extracted_image_spec.rb new file mode 100644 index 00000000..ea252f6d --- /dev/null +++ b/spec/extraction/extracted_image_spec.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true + +require 'mindee' +require 'tempfile' +require 'pathname' +require 'stringio' +require 'mini_magick' +require_relative '../data' + +describe Mindee::Extraction::ExtractedImage do + let(:dummy_io_content) { 'This is a test file content.' } + let(:input_source) do + Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoices/default_sample.jpg") + end + + context 'An extracted image' do + it 'should initialize correctly with valid inputs' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + expect(extracted_image.buffer).to_not be(nil) + expect(extracted_image.page_id).to eq(page_id) + expect(extracted_image.element_id).to eq(element_id) + expect(extracted_image.internal_file_name).to eq("default_sample_p#{page_id}_#{element_id}.jpg") + end + + it 'should handle nil element_id by setting it to 0' do + page_id = 1 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, nil) + + expect(extracted_image.element_id).to eq(0) + end + + it 'should save the buffer to a file with valid format' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + random_sequence = Array.new(8) { rand(0..9) }.join + extracted_image.save_to_file("#{DATA_DIR}/output/temp-#{random_sequence}.jpg", 'jpg') + expect(File.exist?("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to be(true) + expect(File.read("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to_not be_empty + File.delete("#{DATA_DIR}/output/temp-#{random_sequence}.jpg") + expect(File.exist?("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to be(false) + end + + it 'should infer file format from extension if not provided' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + Tempfile.create(['output', '.png']) do |tempfile| + extracted_image.save_to_file(tempfile.path) + expect(File.exist?(tempfile.path)).to be(true) + expect(File.read(tempfile.path)).to_not be_empty + end + end + + it 'should raise an error for invalid file format during save' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + Tempfile.create(['output', '.']) do |tempfile| + expect do + extracted_image.save_to_file(tempfile.path) + end.to raise_error(Mindee::Errors::MindeeImageError) + end + end + + it 'should raise an error for invalid path during save' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + invalid_path = '/invalid/path/output.jpg' + expect do + extracted_image.save_to_file(invalid_path) + end.to raise_error(Mindee::Errors::MindeeImageError) + end + + it 'should return a valid source object from as_source' do + page_id = 1 + element_id = 2 + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + + source_object = extracted_image.as_source + + expect(source_object).to be_a(Mindee::Input::Source::BytesInputSource) + expect(source_object.filename).to eq(extracted_image.internal_file_name) + end + + it 'should raise an error when MiniMagick fails during save' do + allow(MiniMagick::Image).to receive(:read).and_raise(StandardError) + + extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, 1, 2) + + Tempfile.create(['output', '.jpg']) do |tempfile| + expect do + extracted_image.save_to_file(tempfile.path, 'jpg') + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) + end + end + end +end diff --git a/spec/extraction/extracted_pdf_spec.rb b/spec/extraction/extracted_pdf_spec.rb new file mode 100644 index 00000000..25a176ab --- /dev/null +++ b/spec/extraction/extracted_pdf_spec.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +require 'mindee' +require 'rspec' + +describe 'Invoice extraction' do + let(:product_data_dir) { File.join(DATA_DIR, 'products') } + + it 'should extract a PDF from an image' do + jpg_stream = File.open("#{product_data_dir}/invoices/default_sample.jpg", "r") + pdf_wrapper = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(jpg_stream, "dummy.pdf") + expect do + pdf_wrapper.page_count + end.to raise_error Mindee::Errors::MindeePDFError + end + +end \ No newline at end of file diff --git a/spec/image/image_utils_spec.rb b/spec/image/image_utils_spec.rb index 126d575f..7526e7c3 100644 --- a/spec/image/image_utils_spec.rb +++ b/spec/image/image_utils_spec.rb @@ -11,70 +11,62 @@ let(:sample_image) { MiniMagick::Image.open(sample_image_path) } describe 'Image utility module' do - it 'converts StringIO to MiniMagick::Image' do + it 'Should convert StringIO to MiniMagick::Image' do string_io = StringIO.new(File.read(sample_image_path)) result = Mindee::Image::ImageUtils.to_image(string_io) expect(result).to be_a(MiniMagick::Image) end - it 'returns the same MiniMagick::Image object if passed as input' do + it 'Should return the same MiniMagick::Image object if passed as input' do result = Mindee::Image::ImageUtils.to_image(sample_image) expect(result).to eq(sample_image) end - it 'raises an error for invalid input types' do + it 'Should raise an error for invalid input types' do expect do Mindee::Image::ImageUtils.to_image(123) end.to raise_error(Mindee::Errors::MindeeImageError, %r{Expected an I/O object or a MiniMagick::Image}) end - end - describe '.image_to_stringio' do - it 'converts MiniMagick image to StringIO' do + it 'Should convert MiniMagick image to StringIO' do result = Mindee::Image::ImageUtils.image_to_stringio(sample_image) expect(result).to be_a(StringIO) end - it 'sets the format of the image correctly' do + it 'Should set the format of the image correctly' do result = Mindee::Image::ImageUtils.image_to_stringio(sample_image, 'PNG') expect(result.string[1..3]).to eq('PNG') end - end - describe '.calculate_new_dimensions' do - it 'returns original dimensions if no max_width or max_height is provided' do + it 'Should return original dimensions if no max_width or max_height is provided' do result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image) expect(result).to eq([sample_image.width, sample_image.height]) end - it 'calculates new dimensions based on max_width and max_height' do + it 'Should calculate new dimensions based on max_width and max_height' do result = Mindee::Image::ImageUtils.calculate_new_dimensions(sample_image, max_width: 100, max_height: 100) expect(result[0]).to be <= 100 expect(result[1]).to be <= 100 end - it 'raises an error if the original image is nil' do + it 'Should raise an error if the original image is nil' do expect do Mindee::Image::ImageUtils.calculate_new_dimensions(nil) end.to raise_error(Mindee::Errors::MindeeImageError, %r{Provided image could not be processed for resizing}) end - end - describe '.calculate_dimensions_from_media_box' do - it 'returns dimensions from media box if provided' do + it 'Should return dimensions from media box if provided' do media_box = [0, 0, 300, 400] result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, media_box) expect(result).to eq([300, 400]) end - it 'falls back to image dimensions if media box is nil or empty' do + it 'Should fall back to image dimensions if media box is nil or empty' do result = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(sample_image, nil) expect(result).to eq([sample_image.width.to_i, sample_image.height.to_i]) end - end - describe '.pdf_to_magick_image' do - it 'raises an error if the PDF stream is invalid' do + it 'Should raise an error if the PDF stream is invalid' do invalid_pdf_stream = StringIO.new('invalid data') # Adjust based on actual error raised by MiniMagick for invalid data. expect do diff --git a/spec/input/local_response_spec.rb b/spec/input/local_response_spec.rb index 31b3b132..058973c2 100644 --- a/spec/input/local_response_spec.rb +++ b/spec/input/local_response_spec.rb @@ -53,5 +53,18 @@ expect(response.get_hmac_signature(dummy_secret_key)).to eq(signature) end end + + it 'should trigger an error when something invalid is passed' do + expect do + Mindee::Input::LocalResponse.new(123) + end.to raise_error Mindee::Errors::MindeeInputError + end + + it 'should trigger an error when the payload is not hashable' do + local_response = Mindee::Input::LocalResponse.new('Your mother was a hamster.') + expect do + local_response.as_hash + end.to raise_error Mindee::Errors::MindeeInputError + end end end diff --git a/spec/input/files_handling_spec.rb b/spec/input/sources/files_handling_spec.rb similarity index 99% rename from spec/input/files_handling_spec.rb rename to spec/input/sources/files_handling_spec.rb index b38da352..4c068d89 100644 --- a/spec/input/files_handling_spec.rb +++ b/spec/input/sources/files_handling_spec.rb @@ -2,7 +2,7 @@ require 'mindee/input/sources' require 'base64' -require_relative '../data' +require_relative '../../data' describe Mindee::Input::Source::LocalInputSource do context 'An jpg input file' do diff --git a/spec/input/sources_spec.rb b/spec/input/sources/sources_spec.rb similarity index 99% rename from spec/input/sources_spec.rb rename to spec/input/sources/sources_spec.rb index 71274f4f..a539d98e 100644 --- a/spec/input/sources_spec.rb +++ b/spec/input/sources/sources_spec.rb @@ -5,7 +5,7 @@ require 'mindee/errors' require 'pdf-reader' -require_relative '../data' +require_relative '../../data' describe Mindee::Input::Source do context 'An image input file' do diff --git a/spec/input/url_input_source_integration.rb b/spec/input/sources/url_input_source_integration.rb similarity index 100% rename from spec/input/url_input_source_integration.rb rename to spec/input/sources/url_input_source_integration.rb diff --git a/spec/input/url_input_source_spec.rb b/spec/input/sources/url_input_source_spec.rb similarity index 98% rename from spec/input/url_input_source_spec.rb rename to spec/input/sources/url_input_source_spec.rb index 393e6a41..1357dedf 100644 --- a/spec/input/url_input_source_spec.rb +++ b/spec/input/sources/url_input_source_spec.rb @@ -2,7 +2,7 @@ require 'rspec' require 'mindee' -require_relative '../http/mock_http_response' +require_relative '../../http/mock_http_response' RSpec.describe Mindee::Input::Source::UrlInputSource do let(:valid_url) { 'https://validurl/some/file.jpg' } diff --git a/spec/input/pdf_processing_spec.rb b/spec/pdf/pdf_processing_spec.rb similarity index 90% rename from spec/input/pdf_processing_spec.rb rename to spec/pdf/pdf_processing_spec.rb index 4feae54c..93092911 100644 --- a/spec/input/pdf_processing_spec.rb +++ b/spec/pdf/pdf_processing_spec.rb @@ -108,5 +108,18 @@ def open_pdf(io_stream) new_pdf = open_pdf(new_stream) expect(new_pdf.pages.size).to eq(9) end + + it 'Should fail on invalid operation' do + io_stream = File.open(filepath, 'rb') + io_stream.seek(0) + options = { + page_indexes: [1], + operation: :broken, + on_min_pages: 0, + } + expect do + Mindee::PDF::PdfProcessor.parse(io_stream, options) + end.to raise_error ArgumentError + end end end From 7aa4fb5e652bd054e5d6de455740f2f9d71a43eb Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 27 Jan 2025 13:22:14 +0100 Subject: [PATCH 07/16] refactor image module --- examples/auto_invoice_splitter_extraction.rb | 2 +- ...auto_multi_receipts_detector_extraction.rb | 2 +- lib/mindee.rb | 2 +- lib/mindee/extraction.rb | 1 - lib/mindee/extraction/common.rb | 4 - .../multi_receipts_extractor.rb | 8 +- .../extraction/pdf_extractor/extracted_pdf.rb | 2 +- .../extraction/pdf_extractor/pdf_extractor.rb | 12 +- .../extraction/tax_extractor/ocr_extractor.rb | 2 +- .../extraction/tax_extractor/tax_extractor.rb | 2 +- lib/mindee/image.rb | 2 + .../common => image}/extracted_image.rb | 24 +-- .../common => image}/image_extractor.rb | 8 +- spec/extraction/extracted_image_spec.rb | 18 +-- spec/extraction/extracted_pdf_spec.rb | 7 +- spec/extraction/image_extractor_spec.rb | 12 +- ...invoice_splitter_extraction_integration.rb | 2 +- .../multi_receipts_extractor_spec.rb | 8 +- spec/extraction/pdf_extractor_spec.rb | 6 +- spec/extraction/tax_extractor_spec.rb | 2 +- spec/image/extracted_image_spec.rb | 107 +++++++++++++ spec/image/image_compressor_spec.rb | 67 ++++++++ spec/input/sources/sources_spec.rb | 149 ------------------ spec/pdf/pdf_compressor_spec.rb | 96 +++++++++++ ...ocessing_spec.rb => pdf_processor_spec.rb} | 2 +- 25 files changed, 333 insertions(+), 214 deletions(-) delete mode 100644 lib/mindee/extraction/common.rb rename lib/mindee/{extraction/common => image}/extracted_image.rb (80%) rename lib/mindee/{extraction/common => image}/image_extractor.rb (97%) create mode 100644 spec/image/extracted_image_spec.rb create mode 100644 spec/image/image_compressor_spec.rb create mode 100644 spec/pdf/pdf_compressor_spec.rb rename spec/pdf/{pdf_processing_spec.rb => pdf_processor_spec.rb} (99%) diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index cd87989a..8bd85970 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -22,7 +22,7 @@ def parse_single_page(mindee_client, input_source) end def parse_multi_page(mindee_client, input_source) - pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(input_source) + pdf_extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(input_source) invoice_splitter_response = mindee_client.enqueue_and_parse( input_source, Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, diff --git a/examples/auto_multi_receipts_detector_extraction.rb b/examples/auto_multi_receipts_detector_extraction.rb index 97d95930..bcd55304 100644 --- a/examples/auto_multi_receipts_detector_extraction.rb +++ b/examples/auto_multi_receipts_detector_extraction.rb @@ -13,7 +13,7 @@ def multi_receipts_detection(file_path, mindee_client) close_file: false ) - images = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) + images = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) images.each do |sub_image| # Optional: Save the files locally # sub_image.write_to_file("/path/to/my/extracted/file/folder") diff --git a/lib/mindee.rb b/lib/mindee.rb index 600d40a6..38c19a51 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -34,7 +34,7 @@ module ImageCompressor end # Custom extraction module - module Extraction + module Image end # Parsing internals and fields. diff --git a/lib/mindee/extraction.rb b/lib/mindee/extraction.rb index fdc0cbaf..1a5eec78 100644 --- a/lib/mindee/extraction.rb +++ b/lib/mindee/extraction.rb @@ -2,5 +2,4 @@ require_relative 'extraction/tax_extractor' require_relative 'extraction/multi_receipts_extractor' -require_relative 'extraction/common' require_relative 'extraction/pdf_extractor' diff --git a/lib/mindee/extraction/common.rb b/lib/mindee/extraction/common.rb deleted file mode 100644 index 509d4d4f..00000000 --- a/lib/mindee/extraction/common.rb +++ /dev/null @@ -1,4 +0,0 @@ -# frozen_string_literal: true - -require_relative 'common/extracted_image' -require_relative 'common/image_extractor' diff --git a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb index b3884789..6eb516be 100644 --- a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +++ b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb @@ -1,10 +1,10 @@ # frozen_string_literal: true -require_relative '../common/image_extractor' +require_relative '../../image/image_extractor' module Mindee # Image Extraction Module. - module Extraction + module Image # Multi-receipts extraction class wrapper. class MultiReceiptsExtractor def self.extract_receipts(input_source, inference) @@ -23,8 +23,8 @@ def self.extract_receipts(input_source, inference) (0...input_source.count_pdf_pages).each do |page_id| receipt_positions = inference.pages[page_id].prediction.receipts.map(&:bounding_box) images.concat( - Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1, - receipt_positions) + Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, page_id + 1, + receipt_positions) ) end diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb index b8235705..80f75c74 100644 --- a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +++ b/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb @@ -2,7 +2,7 @@ module Mindee # Pdf Extraction Module. - module Extraction + module Image module PdfExtractor # An extracted sub-Pdf. class ExtractedPdf diff --git a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb index 5fa960d1..c9fae43a 100644 --- a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +++ b/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb @@ -2,7 +2,7 @@ module Mindee # Pdf Extraction Module. - module Extraction + module Image # Pdf Extraction class. module PdfExtractor # Pdf extraction class. @@ -13,7 +13,7 @@ def initialize(local_input) if local_input.pdf? @source_pdf = local_input.io_stream else - pdf_image = Extraction::ImageExtractor.attach_image_as_new_file(local_input.io_stream) + pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream) io_buffer = StringIO.new pdf_image.save(io_buffer) @@ -40,7 +40,7 @@ def cut_pages(page_indexes) # Extract the sub-documents from the main pdf, based on the given list of page indexes. # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. - # @return [Array] The buffer containing the new Pdf. + # @return [Array] The buffer containing the new Pdf. def extract_sub_documents(page_indexes) extracted_pdfs = [] extension = File.extname(@filename) @@ -59,8 +59,8 @@ def extract_sub_documents(page_indexes) formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s field_filename = "#{basename}_#{format('%03d', (page_index_list[0] + 1))}-#{formatted_max_index}#{extension}" - extracted_pdf = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), - field_filename) + extracted_pdf = Mindee::Image::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), + field_filename) extracted_pdfs << extracted_pdf end extracted_pdfs @@ -72,7 +72,7 @@ def extract_sub_documents(page_indexes) # Extracts invoices as complete PDFs from the document. # @param page_indexes [Array, InvoiceSplitterV1PageGroup>] # @param strict [Boolean] - # @return [Array] + # @return [Array] def extract_invoices(page_indexes, strict: false) raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.empty? unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup) diff --git a/lib/mindee/extraction/tax_extractor/ocr_extractor.rb b/lib/mindee/extraction/tax_extractor/ocr_extractor.rb index 94397c6c..3490ca69 100644 --- a/lib/mindee/extraction/tax_extractor/ocr_extractor.rb +++ b/lib/mindee/extraction/tax_extractor/ocr_extractor.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true module Mindee - module Extraction + module Image # Generic extractor class class OcrExtractor # Checks for a list of possible matches in a string & returns the index of the first found candidate. diff --git a/lib/mindee/extraction/tax_extractor/tax_extractor.rb b/lib/mindee/extraction/tax_extractor/tax_extractor.rb index c5e1da55..dabae498 100644 --- a/lib/mindee/extraction/tax_extractor/tax_extractor.rb +++ b/lib/mindee/extraction/tax_extractor/tax_extractor.rb @@ -5,7 +5,7 @@ # rubocop:disable Metrics/ClassLength module Mindee - module Extraction + module Image # Tax extractor class class TaxExtractor < OcrExtractor # Extracts the most relevant candidate. diff --git a/lib/mindee/image.rb b/lib/mindee/image.rb index 51406f83..5664bc8f 100644 --- a/lib/mindee/image.rb +++ b/lib/mindee/image.rb @@ -1,4 +1,6 @@ # frozen_string_literal: true +require_relative 'image/extracted_image' require_relative 'image/image_compressor' +require_relative 'image/image_extractor' require_relative 'image/image_utils' diff --git a/lib/mindee/extraction/common/extracted_image.rb b/lib/mindee/image/extracted_image.rb similarity index 80% rename from lib/mindee/extraction/common/extracted_image.rb rename to lib/mindee/image/extracted_image.rb index ea21d3eb..8f25824e 100644 --- a/lib/mindee/extraction/common/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -1,16 +1,16 @@ # frozen_string_literal: true -require_relative '../../input/sources' +require_relative '../input/sources' module Mindee # Image Extraction Module. - module Extraction + module Image # Generic class for image extraction. class ExtractedImage - # Id of the page the image was extracted from. + # ID of the page the image was extracted from. attr_reader :page_id - # Id of the element on a given page. + # ID of the element on a given page. attr_reader :element_id # Buffer object of the file's content. @@ -51,13 +51,15 @@ def save_to_file(output_path, file_format = nil) file_format = resolved_path.extname.delete('.').upcase end - @buffer.rewind - image = MiniMagick::Image.read(@buffer) - image.format file_format.downcase - image.write resolved_path.to_s - rescue StandardError - raise Errors::MindeeImageError, "Could not save file '#{output_path}'. " \ - 'Is the provided file path valid?.' + begin + @buffer.rewind + image = MiniMagick::Image.read(@buffer) + image.format file_format.downcase + image.write resolved_path.to_s + rescue StandardError + raise Errors::MindeeImageError, "Could not save file '#{output_path}'. " \ + 'Is the provided file path valid?.' + end end # Return the file as a Mindee-compatible BufferInput source. diff --git a/lib/mindee/extraction/common/image_extractor.rb b/lib/mindee/image/image_extractor.rb similarity index 97% rename from lib/mindee/extraction/common/image_extractor.rb rename to lib/mindee/image/image_extractor.rb index acd2bd7a..a360f906 100644 --- a/lib/mindee/extraction/common/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -4,12 +4,12 @@ require 'origami' require 'stringio' require 'tempfile' -require_relative '../../input/sources' +require_relative '../input/sources' require_relative 'extracted_image' module Mindee # Image Extraction Module. - module Extraction + module Image # Image Extraction wrapper class. module ImageExtractor def self.attach_image_as_new_file(input_buffer, format: 'jpg') @@ -35,7 +35,7 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg') # @param [Integer] page_id ID of the Page to extract from. # @param [Array>, Array] polygons List of coordinates # to extract. - # @return [Array] Extracted Images. + # @return [Array] Extracted Images. def self.extract_multiple_images_from_source(input_source, page_id, polygons) new_stream = load_input_source_pdf_page_as_image(input_source, page_id) new_stream.seek(0) @@ -49,7 +49,7 @@ def self.extract_multiple_images_from_source(input_source, page_id, polygons) # @param [StringIO] pdf_stream Buffer of the PDF. # @param [Integer] page_id Page ID. # @param [Array] polygons - # @return [Array] Extracted Images. + # @return [Array] Extracted Images. def self.extract_images_from_polygons(input_source, pdf_stream, page_id, polygons) extracted_elements = [] diff --git a/spec/extraction/extracted_image_spec.rb b/spec/extraction/extracted_image_spec.rb index ea252f6d..551b7495 100644 --- a/spec/extraction/extracted_image_spec.rb +++ b/spec/extraction/extracted_image_spec.rb @@ -7,7 +7,7 @@ require 'mini_magick' require_relative '../data' -describe Mindee::Extraction::ExtractedImage do +describe Mindee::Image::ExtractedImage do let(:dummy_io_content) { 'This is a test file content.' } let(:input_source) do Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoices/default_sample.jpg") @@ -17,7 +17,7 @@ it 'should initialize correctly with valid inputs' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) expect(extracted_image.buffer).to_not be(nil) expect(extracted_image.page_id).to eq(page_id) @@ -27,7 +27,7 @@ it 'should handle nil element_id by setting it to 0' do page_id = 1 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, nil) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, nil) expect(extracted_image.element_id).to eq(0) end @@ -35,7 +35,7 @@ it 'should save the buffer to a file with valid format' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) random_sequence = Array.new(8) { rand(0..9) }.join extracted_image.save_to_file("#{DATA_DIR}/output/temp-#{random_sequence}.jpg", 'jpg') @@ -48,7 +48,7 @@ it 'should infer file format from extension if not provided' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) Tempfile.create(['output', '.png']) do |tempfile| extracted_image.save_to_file(tempfile.path) @@ -60,7 +60,7 @@ it 'should raise an error for invalid file format during save' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) Tempfile.create(['output', '.']) do |tempfile| expect do @@ -72,7 +72,7 @@ it 'should raise an error for invalid path during save' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) invalid_path = '/invalid/path/output.jpg' expect do @@ -83,7 +83,7 @@ it 'should return a valid source object from as_source' do page_id = 1 element_id = 2 - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, page_id, element_id) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) source_object = extracted_image.as_source @@ -94,7 +94,7 @@ it 'should raise an error when MiniMagick fails during save' do allow(MiniMagick::Image).to receive(:read).and_raise(StandardError) - extracted_image = Mindee::Extraction::ExtractedImage.new(input_source, 1, 2) + extracted_image = Mindee::Image::ExtractedImage.new(input_source, 1, 2) Tempfile.create(['output', '.jpg']) do |tempfile| expect do diff --git a/spec/extraction/extracted_pdf_spec.rb b/spec/extraction/extracted_pdf_spec.rb index 25a176ab..8967971d 100644 --- a/spec/extraction/extracted_pdf_spec.rb +++ b/spec/extraction/extracted_pdf_spec.rb @@ -7,11 +7,10 @@ let(:product_data_dir) { File.join(DATA_DIR, 'products') } it 'should extract a PDF from an image' do - jpg_stream = File.open("#{product_data_dir}/invoices/default_sample.jpg", "r") - pdf_wrapper = Mindee::Extraction::PdfExtractor::ExtractedPdf.new(jpg_stream, "dummy.pdf") + jpg_stream = File.open("#{product_data_dir}/invoices/default_sample.jpg", 'r') + pdf_wrapper = Mindee::Image::PdfExtractor::ExtractedPdf.new(jpg_stream, 'dummy.pdf') expect do pdf_wrapper.page_count end.to raise_error Mindee::Errors::MindeePDFError end - -end \ No newline at end of file +end diff --git a/spec/extraction/image_extractor_spec.rb b/spec/extraction/image_extractor_spec.rb index ff4c9667..0fb4efac 100644 --- a/spec/extraction/image_extractor_spec.rb +++ b/spec/extraction/image_extractor_spec.rb @@ -5,8 +5,8 @@ require 'mindee/extraction' require_relative '../data' -describe Mindee::Extraction do - include Mindee::Extraction +describe Mindee::Image do + include Mindee::Image let(:barcode_path) do File.join(DATA_DIR, 'products', 'barcode_reader', 'default_sample.jpg') end @@ -23,10 +23,10 @@ barcodes2 = inference.prediction.codes_2d.map(&:polygon) input_source = Mindee::Input::Source::PathInputSource.new(barcode_path) - extracted_barcodes_1d = Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, 1, - barcodes1) - extracted_barcodes_2d = Mindee::Extraction::ImageExtractor.extract_multiple_images_from_source(input_source, 1, - barcodes2) + extracted_barcodes_1d = Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, 1, + barcodes1) + extracted_barcodes_2d = Mindee::Image::ImageExtractor.extract_multiple_images_from_source(input_source, 1, + barcodes2) expect(extracted_barcodes_1d.size).to eq(1) expect(extracted_barcodes_2d.size).to eq(2) diff --git a/spec/extraction/invoice_splitter_extraction_integration.rb b/spec/extraction/invoice_splitter_extraction_integration.rb index f6401d32..2dc3bc15 100644 --- a/spec/extraction/invoice_splitter_extraction_integration.rb +++ b/spec/extraction/invoice_splitter_extraction_integration.rb @@ -30,7 +30,7 @@ def prepare_invoice_return(rst_file_path, invoice_prediction) ) inference = response.document.inference - pdf_extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(invoice_splitter_input) + pdf_extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(invoice_splitter_input) expect(pdf_extractor.page_count).to eq(2) extracted_pdfs_strict = pdf_extractor.extract_invoices(inference.prediction.invoice_page_groups, strict: true) diff --git a/spec/extraction/multi_receipts_extractor_spec.rb b/spec/extraction/multi_receipts_extractor_spec.rb index 654e4c57..10117159 100644 --- a/spec/extraction/multi_receipts_extractor_spec.rb +++ b/spec/extraction/multi_receipts_extractor_spec.rb @@ -5,8 +5,8 @@ require 'mindee/extraction' require_relative '../data' -describe Mindee::Extraction do - include Mindee::Extraction +describe Mindee::Image do + include Mindee::Image let(:multi_receipts_single_page_path) do File.join(DATA_DIR, 'products', 'multi_receipts_detector', 'default_sample.jpg') end @@ -28,7 +28,7 @@ input_sample = Mindee::Input::Source::PathInputSource.new(multi_receipts_single_page_path) response = load_json(multi_receipts_single_page_json_path, 'complete.json') doc = Mindee::Product::MultiReceiptsDetector::MultiReceiptsDetectorV1.new(response['document']['inference']) - extracted_receipts = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_sample, doc) + extracted_receipts = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_sample, doc) expect(extracted_receipts.size).to eq(6) @@ -76,7 +76,7 @@ input_sample = Mindee::Input::Source::PathInputSource.new(multi_receipts_multi_page_path) response = load_json(multi_receipts_multi_page_json_path, 'multipage_sample.json') doc = Mindee::Product::MultiReceiptsDetector::MultiReceiptsDetectorV1.new(response['document']['inference']) - extracted_receipts = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_sample, doc) + extracted_receipts = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_sample, doc) expect(extracted_receipts.size).to eq(5) diff --git a/spec/extraction/pdf_extractor_spec.rb b/spec/extraction/pdf_extractor_spec.rb index e8ca5e2c..80d113fc 100644 --- a/spec/extraction/pdf_extractor_spec.rb +++ b/spec/extraction/pdf_extractor_spec.rb @@ -20,13 +20,13 @@ jpg_input = Mindee::Input::Source::PathInputSource.new(invoice_default_sample_path) expect(jpg_input.pdf?).to eq(false) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(jpg_input) + extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(jpg_input) expect(extractor.page_count).to eq(1) end it 'should extract invoices from a PDF (no strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) @@ -45,7 +45,7 @@ it 'should extract invoices from a PDF (strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Extraction::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) expect(loaded_prediction.invoice_page_groups.length).to eq(3) diff --git a/spec/extraction/tax_extractor_spec.rb b/spec/extraction/tax_extractor_spec.rb index 285a975e..6ad6541f 100644 --- a/spec/extraction/tax_extractor_spec.rb +++ b/spec/extraction/tax_extractor_spec.rb @@ -13,7 +13,7 @@ ocr = Mindee::Parsing::Common::Ocr::Ocr.new( response['document']['ocr'] ) - found_tax = Mindee::Extraction::TaxExtractor.extract_custom_tax(ocr, ['Tax'], 0, 20) + found_tax = Mindee::Image::TaxExtractor.extract_custom_tax(ocr, ['Tax'], 0, 20) expect(found_tax.code).to eq('Tax') expect(found_tax.rate).to eq(8) expect(found_tax.value).to eq(nil) diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb new file mode 100644 index 00000000..eae115b5 --- /dev/null +++ b/spec/image/extracted_image_spec.rb @@ -0,0 +1,107 @@ +# frozen_string_literal: true + +require 'mindee' +require 'pathname' +require 'fileutils' +require 'mini_magick' +require_relative '../data' + +describe Mindee::Image::ExtractedImage do + let(:file_path) do + File.join(DATA_DIR, 'products', 'invoices', 'default_sample.jpg') + end + let(:input_source) do + Mindee::Input::Source::PathInputSource.new(file_path) + end + let(:page_id) { 1 } + let(:element_id) { 42 } + let(:output_dir) { "#{DATA_DIR}/output" } + + describe '#initialize' do + it 'initializes with correct attributes' do + extracted_image = described_class.new(input_source, page_id, element_id) + + expect(extracted_image.page_id).to eq(page_id) + expect(extracted_image.element_id).to eq(element_id) + expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + + # NOTE: ruby messes up the formatting of binary strings, I don't think it worth it to correct this behavior, but + # the result is that we have to remove them from the comparisons. + input_source.io_stream.rewind + source_content = extracted_image.buffer.read.gsub("\r", '').gsub("\n", '') + input_content = input_source.io_stream.read.gsub("\r", '').gsub("\n", '') + + expect(source_content).to eq(input_content) + + input_source.io_stream.rewind + end + + it 'defaults element_id to 0 if nil is provided' do + extracted_image = described_class.new(input_source, page_id, nil) + + expect(extracted_image.element_id).to eq(0) + end + + it 'appends .jpg extension for PDF input sources' do + allow(input_source).to receive(:pdf?).and_return(true) + + extracted_image = described_class.new(input_source, page_id, element_id) + + expect(extracted_image.internal_file_name).to eq('default_sample_p1_42.jpg') + end + end + + describe '#save_to_file' do + it 'saves the buffer to a file with the correct format' do + extracted_image = described_class.new(input_source, page_id, element_id) + output_path = "#{output_dir}/output_test.jpg" + + extracted_image.save_to_file(output_path) + + expect(File.exist?(output_path)).to be true + expect(File.size(output_path)).to be > 0 + end + + it 'raises an error if file format is invalid' do + extracted_image = described_class.new(input_source, page_id, element_id) + invalid_output_path = "#{output_dir}/output_test" + + expect do + extracted_image.save_to_file(invalid_output_path) + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Invalid file format}) + end + + it 'raises an error if the file cannot be saved' do + extracted_image = described_class.new(input_source, page_id, element_id) + invalid_output_path = '/invalid/path/output_test.jpg' + + expect do + extracted_image.save_to_file(invalid_output_path) + end.to raise_error(Mindee::Errors::MindeeImageError) + end + end + + describe '#as_source' do + it 'returns a BytesInputSource with the correct content and filename' do + extracted_image = described_class.new(input_source, page_id, element_id) + + source = extracted_image.as_source + + expect(source).to be_a(Mindee::Input::Source::BytesInputSource) + expect(source.filename).to eq('default_sample_p1_42.jpg') + source.io_stream.rewind + + input_source.io_stream.rewind + source_content = source.io_stream.read.gsub("\r", '').gsub("\n", '') + input_content = input_source.io_stream.read.gsub("\r", '').gsub("\n", '') + + expect(source_content).to eq(input_content) + + input_source.io_stream.rewind + end + + after(:each) do + # FileUtils.rm_f("#{output_dir}/compress100.jpg") + end + end +end diff --git a/spec/image/image_compressor_spec.rb b/spec/image/image_compressor_spec.rb new file mode 100644 index 00000000..81c1b134 --- /dev/null +++ b/spec/image/image_compressor_spec.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +require 'mindee' + +require_relative '../data' + +describe Mindee::Image::ImageCompressor do + describe 'Image Quality Compression' do + let(:input_receipt_path) { "#{DATA_DIR}/file_types/receipt.jpg" } + let(:output_dir) { "#{DATA_DIR}/output/" } + + it 'should compress the image from input source' do + receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) + receipt_input.compress!(quality: 80) # NOTE: base jpg quality is ~81 + + FileUtils.mkdir_p(File.dirname("#{output_dir}compress_indirect.jpg")) + File.write("#{output_dir}compress_indirect.jpg", receipt_input.io_stream.read) + + initial_file_size = File.size(input_receipt_path) + compressed_file_size = File.size(output_dir) + + expect(compressed_file_size).to be < initial_file_size + end + + it 'should compress the image with various quality levels' do + receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) + + compresses = [ + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 100), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream), # default quality + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 50), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 10), + Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 1), + ] + + output_files = [ + "#{output_dir}/compress100.jpg", + "#{output_dir}/compress85.jpg", + "#{output_dir}/compress50.jpg", + "#{output_dir}/compress10.jpg", + "#{output_dir}/compress1.jpg", + ] + + compresses.zip(output_files).each do |compressed, output_file| + File.write(output_file, compressed.read) + end + + initial_file_size = File.size(input_receipt_path) + rendered_file_sizes = output_files.map { |file| File.size(file) } + + expect(initial_file_size).to be < rendered_file_sizes[0] + expect(initial_file_size).to be < rendered_file_sizes[1] + expect(rendered_file_sizes[1]).to be > rendered_file_sizes[2] + expect(rendered_file_sizes[2]).to be > rendered_file_sizes[3] + expect(rendered_file_sizes[3]).to be > rendered_file_sizes[4] + end + + after(:each) do + FileUtils.rm_f("#{output_dir}/compress100.jpg") + FileUtils.rm_f("#{output_dir}/compress85.jpg") + FileUtils.rm_f("#{output_dir}/compress50.jpg") + FileUtils.rm_f("#{output_dir}/compress10.jpg") + FileUtils.rm_f("#{output_dir}/compress1.jpg") + FileUtils.rm_f("#{output_dir}/compress_indirect.jpg") + end + end +end diff --git a/spec/input/sources/sources_spec.rb b/spec/input/sources/sources_spec.rb index a539d98e..885a768d 100644 --- a/spec/input/sources/sources_spec.rb +++ b/spec/input/sources/sources_spec.rb @@ -80,153 +80,4 @@ end.not_to raise_error end end - - describe 'Image Quality Compression' do - let(:input_receipt_path) { "#{DATA_DIR}/file_types/receipt.jpg" } - let(:output_dir) { "#{DATA_DIR}/output/" } - - it 'should compress the image from input source' do - receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) - receipt_input.compress!(quality: 80) # NOTE: base jpg quality is ~81 - - FileUtils.mkdir_p(File.dirname("#{output_dir}compress_indirect.jpg")) - File.write("#{output_dir}compress_indirect.jpg", receipt_input.io_stream.read) - - initial_file_size = File.size(input_receipt_path) - compressed_file_size = File.size(output_dir) - - expect(compressed_file_size).to be < initial_file_size - end - - it 'should compress the image with various quality levels' do - receipt_input = Mindee::Input::Source::PathInputSource.new(input_receipt_path) - - compresses = [ - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 100), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream), # default quality - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 50), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 10), - Mindee::Image::ImageCompressor.compress_image(receipt_input.io_stream, quality: 1), - ] - - output_files = [ - "#{output_dir}/compress100.jpg", - "#{output_dir}/compress85.jpg", - "#{output_dir}/compress50.jpg", - "#{output_dir}/compress10.jpg", - "#{output_dir}/compress1.jpg", - ] - - compresses.zip(output_files).each do |compressed, output_file| - File.write(output_file, compressed.read) - end - - initial_file_size = File.size(input_receipt_path) - rendered_file_sizes = output_files.map { |file| File.size(file) } - - expect(initial_file_size).to be < rendered_file_sizes[0] - expect(initial_file_size).to be < rendered_file_sizes[1] - expect(rendered_file_sizes[1]).to be > rendered_file_sizes[2] - expect(rendered_file_sizes[2]).to be > rendered_file_sizes[3] - expect(rendered_file_sizes[3]).to be > rendered_file_sizes[4] - end - - after(:each) do - FileUtils.rm_f("#{output_dir}/compress100.jpg") - FileUtils.rm_f("#{output_dir}/compress85.jpg") - FileUtils.rm_f("#{output_dir}/compress50.jpg") - FileUtils.rm_f("#{output_dir}/compress10.jpg") - FileUtils.rm_f("#{output_dir}/compress1.jpg") - FileUtils.rm_f("#{output_dir}/compress_indirect.jpg") - end - end - - describe 'The PDF text detection method' do - it 'should detect text pdf in a PDF file.' do - text_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/pdf/multipage.pdf") - expect(Mindee::PDF::PDFTools.source_text?(text_input.io_stream)).to be(true) - end - - it 'should not detect text pdf in an empty PDF file.' do - no_text_input = Mindee::Input::Source::PathInputSource.new( - "#{DATA_DIR}/file_types/pdf/blank_1.pdf" - ) - expect(Mindee::PDF::PDFTools.source_text?(no_text_input.io_stream)).to be(false) - end - - it 'should not detect text pdf in an image file.' do - image_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/receipt.jpg") - expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false) - end - end - - describe 'PDF compression' do - it 'should compress from an input source' do - input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" - output_file_path = "#{DATA_DIR}/output/compress_indirect.pdf" - pdf_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoice_splitter/default_sample.pdf") - pdf_input.compress!(quality: 50) - File.write(output_file_path, pdf_input.io_stream.read) - expect(File.size(output_file_path)).to be < File.size(input_file_path) - end - - it 'should compress from the compressor' do - input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" - output_file_paths = { - 85 => "#{DATA_DIR}/output/compressed_direct_85.pdf", - 75 => "#{DATA_DIR}/output/compressed_direct_75.pdf", - 50 => "#{DATA_DIR}/output/compressed_direct_50.pdf", - 10 => "#{DATA_DIR}/output/compressed_direct_10.pdf", - } - pdf = File.open(input_file_path) - output_file_paths.each_pair do |key, value| - compressed_pdf = Mindee::PDF::PDFCompressor.compress_pdf(pdf, quality: key) - compressed_pdf.rewind - File.write(value, compressed_pdf.read) - end - expect(File.size(input_file_path)).to be > File.size(output_file_paths[85]) - expect(File.size(output_file_paths[75])).to be < File.size(output_file_paths[85]) - expect(File.size(output_file_paths[50])).to be < File.size(output_file_paths[75]) - expect(File.size(output_file_paths[10])).to be < File.size(output_file_paths[50]) - end - - after(:each) do - output_dir = "#{DATA_DIR}/output" - FileUtils.rm_f("#{output_dir}/compressed_direct_85.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_75.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_50.pdf") - FileUtils.rm_f("#{output_dir}/compressed_direct_10.pdf") - FileUtils.rm_f("#{output_dir}/compress_indirect.pdf") - end - end - - describe 'source text PDF compression' do - it 'should compress if forced' do - input_file_path = "#{DATA_DIR}/file_types/pdf/multipage.pdf" - output_file_path = "#{DATA_DIR}/output/compress_with_text.pdf" - pdf_input = Mindee::Input::Source::PathInputSource.new(input_file_path) - pdf_input.compress!(quality: 50, force_source_text: true, disable_source_text: false) - File.write(output_file_path, pdf_input.io_stream.read) - expect(File.size(output_file_path)).to be > File.size(input_file_path) - - pdf_input.io_stream.rewind - reader = PDFReader::Reader.new(pdf_input.io_stream) - - text = '' - reader.pages.each do |original_page| - receiver = PDFReader::Reader::PageTextReceiver.new - original_page.walk(receiver) - - receiver.runs.each do |text_run| - text += text_run.text - end - end - expect(text).to eq('*' * 650) - end - - after(:each) do - output_dir = "#{DATA_DIR}/output" - FileUtils.rm_f("#{output_dir}/compress_with_text.pdf") - end - end end diff --git a/spec/pdf/pdf_compressor_spec.rb b/spec/pdf/pdf_compressor_spec.rb new file mode 100644 index 00000000..bb36d87c --- /dev/null +++ b/spec/pdf/pdf_compressor_spec.rb @@ -0,0 +1,96 @@ +# frozen_string_literal: true + +require 'mindee' + +require_relative '../data' + +describe Mindee::PDF::PDFCompressor do + describe 'The PDF text detection method' do + it 'should detect text pdf in a PDF file.' do + text_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/pdf/multipage.pdf") + expect(Mindee::PDF::PDFTools.source_text?(text_input.io_stream)).to be(true) + end + + it 'should not detect text pdf in an empty PDF file.' do + no_text_input = Mindee::Input::Source::PathInputSource.new( + "#{DATA_DIR}/file_types/pdf/blank_1.pdf" + ) + expect(Mindee::PDF::PDFTools.source_text?(no_text_input.io_stream)).to be(false) + end + + it 'should not detect text pdf in an image file.' do + image_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/file_types/receipt.jpg") + expect(Mindee::PDF::PDFTools.source_text?(image_input.io_stream)).to be(false) + end + end + + describe 'PDF compression' do + it 'should compress from an input source' do + input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" + output_file_path = "#{DATA_DIR}/output/compress_indirect.pdf" + pdf_input = Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoice_splitter/default_sample.pdf") + pdf_input.compress!(quality: 50) + File.write(output_file_path, pdf_input.io_stream.read) + expect(File.size(output_file_path)).to be < File.size(input_file_path) + end + + it 'should compress from the compressor' do + input_file_path = "#{DATA_DIR}/products/invoice_splitter/default_sample.pdf" + output_file_paths = { + 85 => "#{DATA_DIR}/output/compressed_direct_85.pdf", + 75 => "#{DATA_DIR}/output/compressed_direct_75.pdf", + 50 => "#{DATA_DIR}/output/compressed_direct_50.pdf", + 10 => "#{DATA_DIR}/output/compressed_direct_10.pdf", + } + pdf = File.open(input_file_path) + output_file_paths.each_pair do |key, value| + compressed_pdf = Mindee::PDF::PDFCompressor.compress_pdf(pdf, quality: key) + compressed_pdf.rewind + File.write(value, compressed_pdf.read) + end + expect(File.size(input_file_path)).to be > File.size(output_file_paths[85]) + expect(File.size(output_file_paths[75])).to be < File.size(output_file_paths[85]) + expect(File.size(output_file_paths[50])).to be < File.size(output_file_paths[75]) + expect(File.size(output_file_paths[10])).to be < File.size(output_file_paths[50]) + end + + after(:each) do + output_dir = "#{DATA_DIR}/output" + FileUtils.rm_f("#{output_dir}/compressed_direct_85.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_75.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_50.pdf") + FileUtils.rm_f("#{output_dir}/compressed_direct_10.pdf") + FileUtils.rm_f("#{output_dir}/compress_indirect.pdf") + end + end + + describe 'source text PDF compression' do + it 'should compress if forced' do + input_file_path = "#{DATA_DIR}/file_types/pdf/multipage.pdf" + output_file_path = "#{DATA_DIR}/output/compress_with_text.pdf" + pdf_input = Mindee::Input::Source::PathInputSource.new(input_file_path) + pdf_input.compress!(quality: 50, force_source_text: true, disable_source_text: false) + File.write(output_file_path, pdf_input.io_stream.read) + expect(File.size(output_file_path)).to be > File.size(input_file_path) + + pdf_input.io_stream.rewind + reader = PDFReader::Reader.new(pdf_input.io_stream) + + text = '' + reader.pages.each do |original_page| + receiver = PDFReader::Reader::PageTextReceiver.new + original_page.walk(receiver) + + receiver.runs.each do |text_run| + text += text_run.text + end + end + expect(text).to eq('*' * 650) + end + + after(:each) do + output_dir = "#{DATA_DIR}/output" + FileUtils.rm_f("#{output_dir}/compress_with_text.pdf") + end + end +end diff --git a/spec/pdf/pdf_processing_spec.rb b/spec/pdf/pdf_processor_spec.rb similarity index 99% rename from spec/pdf/pdf_processing_spec.rb rename to spec/pdf/pdf_processor_spec.rb index 93092911..0ef77fe1 100644 --- a/spec/pdf/pdf_processing_spec.rb +++ b/spec/pdf/pdf_processor_spec.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true -require 'mindee/pdf' +require 'mindee' require_relative '../data' From 75b55d95ef45bc38c489a4ecd6767fa39fece352 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 27 Jan 2025 15:47:38 +0100 Subject: [PATCH 08/16] refactor pdf module --- examples/auto_invoice_splitter_extraction.rb | 2 +- lib/mindee/extraction.rb | 1 - lib/mindee/extraction/pdf_extractor.rb | 4 - lib/mindee/pdf.rb | 2 + .../pdf_extractor => pdf}/extracted_pdf.rb | 6 +- .../pdf_extractor => pdf}/pdf_extractor.rb | 14 +-- spec/extraction/extracted_image_spec.rb | 106 ------------------ ...invoice_splitter_extraction_integration.rb | 2 +- spec/image/extracted_image_spec.rb | 12 ++ .../image_extractor_spec.rb | 0 .../{extraction => pdf}/extracted_pdf_spec.rb | 2 +- .../{extraction => pdf}/pdf_extractor_spec.rb | 6 +- 12 files changed, 30 insertions(+), 127 deletions(-) delete mode 100644 lib/mindee/extraction/pdf_extractor.rb rename lib/mindee/{extraction/pdf_extractor => pdf}/extracted_pdf.rb (96%) rename lib/mindee/{extraction/pdf_extractor => pdf}/pdf_extractor.rb (92%) delete mode 100644 spec/extraction/extracted_image_spec.rb rename spec/{extraction => image}/image_extractor_spec.rb (100%) rename spec/{extraction => pdf}/extracted_pdf_spec.rb (81%) rename spec/{extraction => pdf}/pdf_extractor_spec.rb (91%) diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index 8bd85970..8955a3ff 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -22,7 +22,7 @@ def parse_single_page(mindee_client, input_source) end def parse_multi_page(mindee_client, input_source) - pdf_extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(input_source) + pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(input_source) invoice_splitter_response = mindee_client.enqueue_and_parse( input_source, Mindee::Product::InvoiceSplitter::InvoiceSplitterV1, diff --git a/lib/mindee/extraction.rb b/lib/mindee/extraction.rb index 1a5eec78..fddb170d 100644 --- a/lib/mindee/extraction.rb +++ b/lib/mindee/extraction.rb @@ -2,4 +2,3 @@ require_relative 'extraction/tax_extractor' require_relative 'extraction/multi_receipts_extractor' -require_relative 'extraction/pdf_extractor' diff --git a/lib/mindee/extraction/pdf_extractor.rb b/lib/mindee/extraction/pdf_extractor.rb deleted file mode 100644 index 3d44dd98..00000000 --- a/lib/mindee/extraction/pdf_extractor.rb +++ /dev/null @@ -1,4 +0,0 @@ -# frozen_string_literal: true - -require_relative 'pdf_extractor/pdf_extractor' -require_relative 'pdf_extractor/extracted_pdf' diff --git a/lib/mindee/pdf.rb b/lib/mindee/pdf.rb index ab262fd7..48b05d99 100644 --- a/lib/mindee/pdf.rb +++ b/lib/mindee/pdf.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative 'pdf/extracted_pdf' require_relative 'pdf/pdf_compressor' +require_relative 'pdf/pdf_extractor' require_relative 'pdf/pdf_processor' require_relative 'pdf/pdf_tools' diff --git a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb similarity index 96% rename from lib/mindee/extraction/pdf_extractor/extracted_pdf.rb rename to lib/mindee/pdf/extracted_pdf.rb index 80f75c74..c35ae677 100644 --- a/lib/mindee/extraction/pdf_extractor/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -2,10 +2,10 @@ module Mindee # Pdf Extraction Module. - module Image - module PdfExtractor + module PDF + module PDFExtractor # An extracted sub-Pdf. - class ExtractedPdf + class ExtractedPDF # Byte contents of the pdf # @return [StreamIO] attr_reader :pdf_bytes diff --git a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb b/lib/mindee/pdf/pdf_extractor.rb similarity index 92% rename from lib/mindee/extraction/pdf_extractor/pdf_extractor.rb rename to lib/mindee/pdf/pdf_extractor.rb index c9fae43a..39826c49 100644 --- a/lib/mindee/extraction/pdf_extractor/pdf_extractor.rb +++ b/lib/mindee/pdf/pdf_extractor.rb @@ -2,11 +2,11 @@ module Mindee # Pdf Extraction Module. - module Image + module PDF # Pdf Extraction class. - module PdfExtractor + module PDFExtractor # Pdf extraction class. - class PdfExtractor + class PDFExtractor # @param local_input [Mindee::Input::Source::LocalInputSource] def initialize(local_input) @filename = local_input.filename @@ -40,7 +40,7 @@ def cut_pages(page_indexes) # Extract the sub-documents from the main pdf, based on the given list of page indexes. # @param page_indexes [Array>] List of page number to use for merging in the original Pdf. - # @return [Array] The buffer containing the new Pdf. + # @return [Array] The buffer containing the new Pdf. def extract_sub_documents(page_indexes) extracted_pdfs = [] extension = File.extname(@filename) @@ -59,8 +59,8 @@ def extract_sub_documents(page_indexes) formatted_max_index = format('%03d', page_index_list[page_index_list.length - 1] + 1).to_s field_filename = "#{basename}_#{format('%03d', (page_index_list[0] + 1))}-#{formatted_max_index}#{extension}" - extracted_pdf = Mindee::Image::PdfExtractor::ExtractedPdf.new(cut_pages(page_index_list), - field_filename) + extracted_pdf = Mindee::PDF::PDFExtractor::ExtractedPDF.new(cut_pages(page_index_list), + field_filename) extracted_pdfs << extracted_pdf end extracted_pdfs @@ -72,7 +72,7 @@ def extract_sub_documents(page_indexes) # Extracts invoices as complete PDFs from the document. # @param page_indexes [Array, InvoiceSplitterV1PageGroup>] # @param strict [Boolean] - # @return [Array] + # @return [Array] def extract_invoices(page_indexes, strict: false) raise Errors::MindeePDFError, 'No indexes provided.' if page_indexes.empty? unless page_indexes[0].is_a?(Mindee::Product::InvoiceSplitter::InvoiceSplitterV1PageGroup) diff --git a/spec/extraction/extracted_image_spec.rb b/spec/extraction/extracted_image_spec.rb deleted file mode 100644 index 551b7495..00000000 --- a/spec/extraction/extracted_image_spec.rb +++ /dev/null @@ -1,106 +0,0 @@ -# frozen_string_literal: true - -require 'mindee' -require 'tempfile' -require 'pathname' -require 'stringio' -require 'mini_magick' -require_relative '../data' - -describe Mindee::Image::ExtractedImage do - let(:dummy_io_content) { 'This is a test file content.' } - let(:input_source) do - Mindee::Input::Source::PathInputSource.new("#{DATA_DIR}/products/invoices/default_sample.jpg") - end - - context 'An extracted image' do - it 'should initialize correctly with valid inputs' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - expect(extracted_image.buffer).to_not be(nil) - expect(extracted_image.page_id).to eq(page_id) - expect(extracted_image.element_id).to eq(element_id) - expect(extracted_image.internal_file_name).to eq("default_sample_p#{page_id}_#{element_id}.jpg") - end - - it 'should handle nil element_id by setting it to 0' do - page_id = 1 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, nil) - - expect(extracted_image.element_id).to eq(0) - end - - it 'should save the buffer to a file with valid format' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - random_sequence = Array.new(8) { rand(0..9) }.join - extracted_image.save_to_file("#{DATA_DIR}/output/temp-#{random_sequence}.jpg", 'jpg') - expect(File.exist?("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to be(true) - expect(File.read("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to_not be_empty - File.delete("#{DATA_DIR}/output/temp-#{random_sequence}.jpg") - expect(File.exist?("#{DATA_DIR}/output/temp-#{random_sequence}.jpg")).to be(false) - end - - it 'should infer file format from extension if not provided' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - Tempfile.create(['output', '.png']) do |tempfile| - extracted_image.save_to_file(tempfile.path) - expect(File.exist?(tempfile.path)).to be(true) - expect(File.read(tempfile.path)).to_not be_empty - end - end - - it 'should raise an error for invalid file format during save' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - Tempfile.create(['output', '.']) do |tempfile| - expect do - extracted_image.save_to_file(tempfile.path) - end.to raise_error(Mindee::Errors::MindeeImageError) - end - end - - it 'should raise an error for invalid path during save' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - invalid_path = '/invalid/path/output.jpg' - expect do - extracted_image.save_to_file(invalid_path) - end.to raise_error(Mindee::Errors::MindeeImageError) - end - - it 'should return a valid source object from as_source' do - page_id = 1 - element_id = 2 - extracted_image = Mindee::Image::ExtractedImage.new(input_source, page_id, element_id) - - source_object = extracted_image.as_source - - expect(source_object).to be_a(Mindee::Input::Source::BytesInputSource) - expect(source_object.filename).to eq(extracted_image.internal_file_name) - end - - it 'should raise an error when MiniMagick fails during save' do - allow(MiniMagick::Image).to receive(:read).and_raise(StandardError) - - extracted_image = Mindee::Image::ExtractedImage.new(input_source, 1, 2) - - Tempfile.create(['output', '.jpg']) do |tempfile| - expect do - extracted_image.save_to_file(tempfile.path, 'jpg') - end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) - end - end - end -end diff --git a/spec/extraction/invoice_splitter_extraction_integration.rb b/spec/extraction/invoice_splitter_extraction_integration.rb index 2dc3bc15..e32b8ad0 100644 --- a/spec/extraction/invoice_splitter_extraction_integration.rb +++ b/spec/extraction/invoice_splitter_extraction_integration.rb @@ -30,7 +30,7 @@ def prepare_invoice_return(rst_file_path, invoice_prediction) ) inference = response.document.inference - pdf_extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(invoice_splitter_input) + pdf_extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(invoice_splitter_input) expect(pdf_extractor.page_count).to eq(2) extracted_pdfs_strict = pdf_extractor.extract_invoices(inference.prediction.invoice_page_groups, strict: true) diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb index eae115b5..f064ec65 100644 --- a/spec/image/extracted_image_spec.rb +++ b/spec/image/extracted_image_spec.rb @@ -100,6 +100,18 @@ input_source.io_stream.rewind end + it 'should raise an error when MiniMagick fails during save' do + allow(MiniMagick::Image).to receive(:read).and_raise(StandardError) + + extracted_image = Mindee::Image::ExtractedImage.new(input_source, 1, 2) + + Tempfile.create(['output', '.jpg']) do |tempfile| + expect do + extracted_image.save_to_file(tempfile.path, 'jpg') + end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) + end + end + after(:each) do # FileUtils.rm_f("#{output_dir}/compress100.jpg") end diff --git a/spec/extraction/image_extractor_spec.rb b/spec/image/image_extractor_spec.rb similarity index 100% rename from spec/extraction/image_extractor_spec.rb rename to spec/image/image_extractor_spec.rb diff --git a/spec/extraction/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb similarity index 81% rename from spec/extraction/extracted_pdf_spec.rb rename to spec/pdf/extracted_pdf_spec.rb index 8967971d..772ba3c6 100644 --- a/spec/extraction/extracted_pdf_spec.rb +++ b/spec/pdf/extracted_pdf_spec.rb @@ -8,7 +8,7 @@ it 'should extract a PDF from an image' do jpg_stream = File.open("#{product_data_dir}/invoices/default_sample.jpg", 'r') - pdf_wrapper = Mindee::Image::PdfExtractor::ExtractedPdf.new(jpg_stream, 'dummy.pdf') + pdf_wrapper = Mindee::PDF::PDFExtractor::ExtractedPDF.new(jpg_stream, 'dummy.pdf') expect do pdf_wrapper.page_count end.to raise_error Mindee::Errors::MindeePDFError diff --git a/spec/extraction/pdf_extractor_spec.rb b/spec/pdf/pdf_extractor_spec.rb similarity index 91% rename from spec/extraction/pdf_extractor_spec.rb rename to spec/pdf/pdf_extractor_spec.rb index 80d113fc..6de412f0 100644 --- a/spec/extraction/pdf_extractor_spec.rb +++ b/spec/pdf/pdf_extractor_spec.rb @@ -20,13 +20,13 @@ jpg_input = Mindee::Input::Source::PathInputSource.new(invoice_default_sample_path) expect(jpg_input.pdf?).to eq(false) - extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(jpg_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(jpg_input) expect(extractor.page_count).to eq(1) end it 'should extract invoices from a PDF (no strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) @@ -45,7 +45,7 @@ it 'should extract invoices from a PDF (strict mode)' do pdf_input = Mindee::Input::Source::PathInputSource.new(invoice_splitter_5p_path) - extractor = Mindee::Image::PdfExtractor::PdfExtractor.new(pdf_input) + extractor = Mindee::PDF::PDFExtractor::PDFExtractor.new(pdf_input) expect(extractor.page_count).to eq(5) expect(loaded_prediction.invoice_page_groups.length).to eq(3) From a58a7664d2a2e804d056baf5f64ffe99eef87185 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Mon, 27 Jan 2025 17:10:41 +0100 Subject: [PATCH 09/16] add proper test removal --- spec/image/extracted_image_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb index f064ec65..7af996dd 100644 --- a/spec/image/extracted_image_spec.rb +++ b/spec/image/extracted_image_spec.rb @@ -113,7 +113,7 @@ end after(:each) do - # FileUtils.rm_f("#{output_dir}/compress100.jpg") + FileUtils.rm_f("#{output_dir}/output_test.jpg") end end end From d93f9e52828edf3f50ae64f0ad170156af37b1bd Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 10:31:42 +0100 Subject: [PATCH 10/16] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bea4438c..5a273b6a 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ job_id = enqueue_response.job.id local_response = Mindee::Input::LocalResponse.new(request.body.string) -# You can also use a File object as the input. +# You can also load the json from a local path. # FILE_PATH = File.join('path', 'to', 'file.json').freeze # local_response = Mindee::Input::LocalResponse.new(FILE_PATH); From bb2c1a5d9923018631856c73dde2122a6e262c6b Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 10:47:39 +0100 Subject: [PATCH 11/16] refix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a273b6a..bea4438c 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ job_id = enqueue_response.job.id local_response = Mindee::Input::LocalResponse.new(request.body.string) -# You can also load the json from a local path. +# You can also use a File object as the input. # FILE_PATH = File.join('path', 'to', 'file.json').freeze # local_response = Mindee::Input::LocalResponse.new(FILE_PATH); From d8aa791e5d2654266d94cfa3857922b3097299e3 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 11:21:26 +0100 Subject: [PATCH 12/16] add more coverage for extracted_pdf --- lib/mindee/pdf/extracted_pdf.rb | 14 ++--- spec/pdf/extracted_pdf_spec.rb | 92 ++++++++++++++++++++++++++++++--- 2 files changed, 93 insertions(+), 13 deletions(-) diff --git a/lib/mindee/pdf/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb index c35ae677..22387cf9 100644 --- a/lib/mindee/pdf/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -32,12 +32,14 @@ def page_count # Writes the contents of the current PDF object to a file. # @param output_path [String] Path to write to. - def write_to_file(output_path) - raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(destination) - raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?(File.expand_path('..', - output_path)) - - if File.extname(output_path).downcase == '.pdf' + # @param override [Boolean] Whether to override the destination file. + def write_to_file(output_path, override: false) + raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) + raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?( + File.expand_path('..', output_path) + ) && !override + + if File.extname(output_path).downcase == 'pdf' base_path = File.expand_path('..', output_path) output_path = File.expand_path("#{File.basename(output_path)}.pdf", base_path) end diff --git a/spec/pdf/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb index 772ba3c6..afccf91c 100644 --- a/spec/pdf/extracted_pdf_spec.rb +++ b/spec/pdf/extracted_pdf_spec.rb @@ -3,14 +3,92 @@ require 'mindee' require 'rspec' -describe 'Invoice extraction' do +describe Mindee::PDF::PDFExtractor::ExtractedPDF do let(:product_data_dir) { File.join(DATA_DIR, 'products') } + let(:output_dir) { File.join(DATA_DIR, 'output') } + let(:file_types_dir) { File.join(DATA_DIR, 'file_types') } + let(:valid_pdf_path) { "#{product_data_dir}/invoices/invoice.pdf" } + let(:invalid_pdf_path) { "#{file_types_dir}/receipt.txt" } + let(:output_path) { "#{output_dir}/sample_output.pdf" } - it 'should extract a PDF from an image' do - jpg_stream = File.open("#{product_data_dir}/invoices/default_sample.jpg", 'r') - pdf_wrapper = Mindee::PDF::PDFExtractor::ExtractedPDF.new(jpg_stream, 'dummy.pdf') - expect do - pdf_wrapper.page_count - end.to raise_error Mindee::Errors::MindeePDFError + before do + allow(File).to receive(:directory?).and_return(false) + allow(File).to receive(:exist?).and_return(true) + allow(File).to receive(:extname).and_return('.pdf') + allow(File).to receive(:write) + end + + describe '#initialize' do + it 'initializes with valid pdf bytes and filename' do + pdf_stream = File.open(valid_pdf_path, 'r') + extracted_pdf = described_class.new(pdf_stream, 'invoice.pdf') + + expect(extracted_pdf.pdf_bytes).to eq(pdf_stream) + expect(extracted_pdf.filename).to eq('invoice.pdf') + end + end + + describe '#page_count' do + it 'raises an error for invalid PDF content' do + jpg_stream = File.open(invalid_pdf_path, 'r') + pdf_wrapper = described_class.new(jpg_stream, 'dummy.pdf') + + expect do + pdf_wrapper.page_count + end.to raise_error Mindee::Errors::MindeePDFError, %r{Could not retrieve page count} + end + + it 'returns the correct page count for a valid PDF' do + pdf_stream = File.open(valid_pdf_path, 'r') + allow(Mindee::PDF::PdfProcessor).to receive(:open_pdf).and_return(double(pages: [1, 2, 3])) + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect(pdf_wrapper.page_count).to eq(3) + end + end + + describe '#write_to_file' do + it 'writes the PDF bytes to a specified file path' do + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect { pdf_wrapper.write_to_file(output_path) }.not_to raise_error + expect(File).to have_received(:write).with(output_path, pdf_stream) + end + + it 'raises an error if the output path is a directory' do + allow(File).to receive(:directory?).and_return(true) + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect do + pdf_wrapper.write_to_file(output_path) + end.to raise_error Mindee::Errors::MindeePDFError, %r{Provided path is not a file} + end + + it 'raises an error if the save path is invalid' do + allow(File).to receive(:exist?).and_return(false) + pdf_stream = File.open(valid_pdf_path, 'r') + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + + expect do + pdf_wrapper.write_to_file(output_path) + end.to raise_error Mindee::Errors::MindeePDFError, %r{Invalid save path provided} + end + end + + describe '#as_input_source' do + it 'returns a BytesInputSource object with correct attributes' do + pdf_stream = StringIO.new('pdf content') + input_source_double = double('BytesInputSource', content: 'pdf content', filename: 'invoice.pdf') + + allow(Mindee::Input::Source::BytesInputSource).to receive(:new).and_return(input_source_double) + + pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') + input_source = pdf_wrapper.as_input_source + + expect(input_source.content).to eq('pdf content') + expect(input_source.filename).to eq('invoice.pdf') + end end end From 5aefe420b853c8e0b8973034b7b78b913d116348 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 11:29:20 +0100 Subject: [PATCH 13/16] add even more coverage --- ...auto_multi_receipts_detector_extraction.rb | 2 +- .../multi_receipts_extractor.rb | 2 +- .../extraction/tax_extractor/ocr_extractor.rb | 2 +- .../extraction/tax_extractor/tax_extractor.rb | 2 +- .../multi_receipts_extractor_spec.rb | 33 +++++++++++++++++-- spec/extraction/tax_extractor_spec.rb | 2 +- .../standard}/date_field_spec.rb | 0 .../standard}/string_field_spec.rb | 0 8 files changed, 35 insertions(+), 8 deletions(-) rename spec/{fields => parsing/standard}/date_field_spec.rb (100%) rename spec/{fields => parsing/standard}/string_field_spec.rb (100%) diff --git a/examples/auto_multi_receipts_detector_extraction.rb b/examples/auto_multi_receipts_detector_extraction.rb index bcd55304..97d95930 100644 --- a/examples/auto_multi_receipts_detector_extraction.rb +++ b/examples/auto_multi_receipts_detector_extraction.rb @@ -13,7 +13,7 @@ def multi_receipts_detection(file_path, mindee_client) close_file: false ) - images = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) + images = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) images.each do |sub_image| # Optional: Save the files locally # sub_image.write_to_file("/path/to/my/extracted/file/folder") diff --git a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb index 6eb516be..eff201b2 100644 --- a/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb +++ b/lib/mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.rb @@ -4,7 +4,7 @@ module Mindee # Image Extraction Module. - module Image + module Extraction # Multi-receipts extraction class wrapper. class MultiReceiptsExtractor def self.extract_receipts(input_source, inference) diff --git a/lib/mindee/extraction/tax_extractor/ocr_extractor.rb b/lib/mindee/extraction/tax_extractor/ocr_extractor.rb index 3490ca69..94397c6c 100644 --- a/lib/mindee/extraction/tax_extractor/ocr_extractor.rb +++ b/lib/mindee/extraction/tax_extractor/ocr_extractor.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true module Mindee - module Image + module Extraction # Generic extractor class class OcrExtractor # Checks for a list of possible matches in a string & returns the index of the first found candidate. diff --git a/lib/mindee/extraction/tax_extractor/tax_extractor.rb b/lib/mindee/extraction/tax_extractor/tax_extractor.rb index dabae498..c5e1da55 100644 --- a/lib/mindee/extraction/tax_extractor/tax_extractor.rb +++ b/lib/mindee/extraction/tax_extractor/tax_extractor.rb @@ -5,7 +5,7 @@ # rubocop:disable Metrics/ClassLength module Mindee - module Image + module Extraction # Tax extractor class class TaxExtractor < OcrExtractor # Extracts the most relevant candidate. diff --git a/spec/extraction/multi_receipts_extractor_spec.rb b/spec/extraction/multi_receipts_extractor_spec.rb index 10117159..a3876347 100644 --- a/spec/extraction/multi_receipts_extractor_spec.rb +++ b/spec/extraction/multi_receipts_extractor_spec.rb @@ -5,8 +5,19 @@ require 'mindee/extraction' require_relative '../data' -describe Mindee::Image do +describe Mindee::Extraction::MultiReceiptsExtractor do include Mindee::Image + let(:empty_inference) do + double('Inference', prediction: double('Prediction', receipts: nil), pages: []) + end + + let(:valid_inference_with_no_receipts) do + double('Inference', prediction: double('Prediction', receipts: []), pages: []) + end + + let(:empty_input_source) do + double('InputSource', count_pdf_pages: 0) + end let(:multi_receipts_single_page_path) do File.join(DATA_DIR, 'products', 'multi_receipts_detector', 'default_sample.jpg') end @@ -28,7 +39,7 @@ input_sample = Mindee::Input::Source::PathInputSource.new(multi_receipts_single_page_path) response = load_json(multi_receipts_single_page_json_path, 'complete.json') doc = Mindee::Product::MultiReceiptsDetector::MultiReceiptsDetectorV1.new(response['document']['inference']) - extracted_receipts = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_sample, doc) + extracted_receipts = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_sample, doc) expect(extracted_receipts.size).to eq(6) @@ -76,7 +87,7 @@ input_sample = Mindee::Input::Source::PathInputSource.new(multi_receipts_multi_page_path) response = load_json(multi_receipts_multi_page_json_path, 'multipage_sample.json') doc = Mindee::Product::MultiReceiptsDetector::MultiReceiptsDetectorV1.new(response['document']['inference']) - extracted_receipts = Mindee::Image::MultiReceiptsExtractor.extract_receipts(input_sample, doc) + extracted_receipts = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_sample, doc) expect(extracted_receipts.size).to eq(5) @@ -111,4 +122,20 @@ expect(extracted_receipts[4].as_source.filename).to end_with('jpg') end end + + context 'when no receipts are found in inference' do + it 'raises a MindeeInputError' do + expect do + described_class.extract_receipts(empty_input_source, empty_inference) + end.to raise_error(Mindee::Errors::MindeeInputError, + 'No possible receipts candidates found for Multi-Receipts extraction.') + end + end + + context 'when input source has no pages' do + it 'returns an empty array' do + extracted_receipts = described_class.extract_receipts(empty_input_source, valid_inference_with_no_receipts) + expect(extracted_receipts).to eq([]) + end + end end diff --git a/spec/extraction/tax_extractor_spec.rb b/spec/extraction/tax_extractor_spec.rb index 6ad6541f..285a975e 100644 --- a/spec/extraction/tax_extractor_spec.rb +++ b/spec/extraction/tax_extractor_spec.rb @@ -13,7 +13,7 @@ ocr = Mindee::Parsing::Common::Ocr::Ocr.new( response['document']['ocr'] ) - found_tax = Mindee::Image::TaxExtractor.extract_custom_tax(ocr, ['Tax'], 0, 20) + found_tax = Mindee::Extraction::TaxExtractor.extract_custom_tax(ocr, ['Tax'], 0, 20) expect(found_tax.code).to eq('Tax') expect(found_tax.rate).to eq(8) expect(found_tax.value).to eq(nil) diff --git a/spec/fields/date_field_spec.rb b/spec/parsing/standard/date_field_spec.rb similarity index 100% rename from spec/fields/date_field_spec.rb rename to spec/parsing/standard/date_field_spec.rb diff --git a/spec/fields/string_field_spec.rb b/spec/parsing/standard/string_field_spec.rb similarity index 100% rename from spec/fields/string_field_spec.rb rename to spec/parsing/standard/string_field_spec.rb From 580842018c527748ddd972b2596b5f8cc75abde2 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 11:32:54 +0100 Subject: [PATCH 14/16] unify file save syntac --- examples/auto_invoice_splitter_extraction.rb | 2 +- examples/auto_multi_receipts_detector_extraction.rb | 2 +- lib/mindee/pdf/extracted_pdf.rb | 2 +- spec/pdf/extracted_pdf_spec.rb | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index 8955a3ff..442119b5 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -33,7 +33,7 @@ def parse_multi_page(mindee_client, input_source) extracted_pdfs.each do |extracted_pdf| # Optional: Save the files locally - # extracted_pdf.write_to_file("output/path") + # extracted_pdf.save_to_file("output/path") invoice_result = mindee_client.parse( extracted_pdf.as_input_source, diff --git a/examples/auto_multi_receipts_detector_extraction.rb b/examples/auto_multi_receipts_detector_extraction.rb index 97d95930..51a3e843 100644 --- a/examples/auto_multi_receipts_detector_extraction.rb +++ b/examples/auto_multi_receipts_detector_extraction.rb @@ -16,7 +16,7 @@ def multi_receipts_detection(file_path, mindee_client) images = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) images.each do |sub_image| # Optional: Save the files locally - # sub_image.write_to_file("/path/to/my/extracted/file/folder") + # sub_image.save_to_file("/path/to/my/extracted/file/folder") result_receipt = mindee_client.parse( sub_image.as_source, diff --git a/lib/mindee/pdf/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb index 22387cf9..dc6a0f6c 100644 --- a/lib/mindee/pdf/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -33,7 +33,7 @@ def page_count # Writes the contents of the current PDF object to a file. # @param output_path [String] Path to write to. # @param override [Boolean] Whether to override the destination file. - def write_to_file(output_path, override: false) + def save_to_file(output_path, override: false) raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?( File.expand_path('..', output_path) diff --git a/spec/pdf/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb index afccf91c..d3e12696 100644 --- a/spec/pdf/extracted_pdf_spec.rb +++ b/spec/pdf/extracted_pdf_spec.rb @@ -47,12 +47,12 @@ end end - describe '#write_to_file' do + describe '#save_to_file' do it 'writes the PDF bytes to a specified file path' do pdf_stream = File.open(valid_pdf_path, 'r') pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') - expect { pdf_wrapper.write_to_file(output_path) }.not_to raise_error + expect { pdf_wrapper.save_to_file(output_path) }.not_to raise_error expect(File).to have_received(:write).with(output_path, pdf_stream) end @@ -62,7 +62,7 @@ pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') expect do - pdf_wrapper.write_to_file(output_path) + pdf_wrapper.save_to_file(output_path) end.to raise_error Mindee::Errors::MindeePDFError, %r{Provided path is not a file} end @@ -72,7 +72,7 @@ pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') expect do - pdf_wrapper.write_to_file(output_path) + pdf_wrapper.save_to_file(output_path) end.to raise_error Mindee::Errors::MindeePDFError, %r{Invalid save path provided} end end From 5e08f81fea630cd73e0a6828a4283a1cd880b599 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 11:34:58 +0100 Subject: [PATCH 15/16] fix extraction module naming --- lib/mindee.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/mindee.rb b/lib/mindee.rb index 38c19a51..3dfe29ed 100644 --- a/lib/mindee.rb +++ b/lib/mindee.rb @@ -8,6 +8,10 @@ module Mindee module Errors end + # Custom extraction module + module Extraction + end + # Mindee internal http module. module HTTP end @@ -33,10 +37,6 @@ module ImageCompressor end end - # Custom extraction module - module Image - end - # Parsing internals and fields. module Parsing # Common fields and functions. From 5491ac5ed157ec278250e1969f00e6cb7f84cfbb Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Tue, 28 Jan 2025 13:53:21 +0100 Subject: [PATCH 16/16] switch save_to_file to write_to_file --- examples/auto_invoice_splitter_extraction.rb | 2 +- examples/auto_multi_receipts_detector_extraction.rb | 2 +- lib/mindee/image/extracted_image.rb | 2 +- lib/mindee/input/sources/url_input_source.rb | 2 +- lib/mindee/pdf/extracted_pdf.rb | 4 ++-- spec/image/extracted_image_spec.rb | 10 +++++----- spec/input/sources/url_input_source_spec.rb | 12 ++++++------ spec/pdf/extracted_pdf_spec.rb | 8 ++++---- 8 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/auto_invoice_splitter_extraction.rb b/examples/auto_invoice_splitter_extraction.rb index 442119b5..8955a3ff 100644 --- a/examples/auto_invoice_splitter_extraction.rb +++ b/examples/auto_invoice_splitter_extraction.rb @@ -33,7 +33,7 @@ def parse_multi_page(mindee_client, input_source) extracted_pdfs.each do |extracted_pdf| # Optional: Save the files locally - # extracted_pdf.save_to_file("output/path") + # extracted_pdf.write_to_file("output/path") invoice_result = mindee_client.parse( extracted_pdf.as_input_source, diff --git a/examples/auto_multi_receipts_detector_extraction.rb b/examples/auto_multi_receipts_detector_extraction.rb index 51a3e843..97d95930 100644 --- a/examples/auto_multi_receipts_detector_extraction.rb +++ b/examples/auto_multi_receipts_detector_extraction.rb @@ -16,7 +16,7 @@ def multi_receipts_detection(file_path, mindee_client) images = Mindee::Extraction::MultiReceiptsExtractor.extract_receipts(input_source, result_split.document.inference) images.each do |sub_image| # Optional: Save the files locally - # sub_image.save_to_file("/path/to/my/extracted/file/folder") + # sub_image.write_to_file("/path/to/my/extracted/file/folder") result_receipt = mindee_client.parse( sub_image.as_source, diff --git a/lib/mindee/image/extracted_image.rb b/lib/mindee/image/extracted_image.rb index 8f25824e..9292ba2b 100644 --- a/lib/mindee/image/extracted_image.rb +++ b/lib/mindee/image/extracted_image.rb @@ -44,7 +44,7 @@ def initialize(input_source, page_id, element_id) # @param file_format [String, nil] Optional MiniMagick-compatible format for the file. Inferred from file # extension if not provided. # @raise [MindeeError] If an invalid path or filename is provided. - def save_to_file(output_path, file_format = nil) + def write_to_file(output_path, file_format = nil) resolved_path = Pathname.new(File.expand_path(output_path)) if file_format.nil? raise Errors::MindeeImageError, 'Invalid file format.' if resolved_path.extname.delete('.').empty? diff --git a/lib/mindee/input/sources/url_input_source.rb b/lib/mindee/input/sources/url_input_source.rb index 8bc03897..989e8461 100644 --- a/lib/mindee/input/sources/url_input_source.rb +++ b/lib/mindee/input/sources/url_input_source.rb @@ -27,7 +27,7 @@ def initialize(url) # @param token [String, nil] Optional token for JWT-based authentication. # @param max_redirects [Integer] Maximum amount of redirects to follow. # @return [String] The full path of the saved file. - def save_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3) + def write_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3) response_body = fetch_file_content(username: username, password: password, token: token, max_redirects: max_redirects) diff --git a/lib/mindee/pdf/extracted_pdf.rb b/lib/mindee/pdf/extracted_pdf.rb index dc6a0f6c..a8160bf1 100644 --- a/lib/mindee/pdf/extracted_pdf.rb +++ b/lib/mindee/pdf/extracted_pdf.rb @@ -1,7 +1,7 @@ # frozen_string_literal: true module Mindee - # Pdf Extraction Module. + # PDF Extraction Module. module PDF module PDFExtractor # An extracted sub-Pdf. @@ -33,7 +33,7 @@ def page_count # Writes the contents of the current PDF object to a file. # @param output_path [String] Path to write to. # @param override [Boolean] Whether to override the destination file. - def save_to_file(output_path, override: false) + def write_to_file(output_path, override: false) raise Errors::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path) raise Errors::MindeePDFError, 'Invalid save path provided' unless File.exist?( File.expand_path('..', output_path) diff --git a/spec/image/extracted_image_spec.rb b/spec/image/extracted_image_spec.rb index 7af996dd..d01d7d67 100644 --- a/spec/image/extracted_image_spec.rb +++ b/spec/image/extracted_image_spec.rb @@ -51,12 +51,12 @@ end end - describe '#save_to_file' do + describe '#write_to_file' do it 'saves the buffer to a file with the correct format' do extracted_image = described_class.new(input_source, page_id, element_id) output_path = "#{output_dir}/output_test.jpg" - extracted_image.save_to_file(output_path) + extracted_image.write_to_file(output_path) expect(File.exist?(output_path)).to be true expect(File.size(output_path)).to be > 0 @@ -67,7 +67,7 @@ invalid_output_path = "#{output_dir}/output_test" expect do - extracted_image.save_to_file(invalid_output_path) + extracted_image.write_to_file(invalid_output_path) end.to raise_error(Mindee::Errors::MindeeImageError, %r{Invalid file format}) end @@ -76,7 +76,7 @@ invalid_output_path = '/invalid/path/output_test.jpg' expect do - extracted_image.save_to_file(invalid_output_path) + extracted_image.write_to_file(invalid_output_path) end.to raise_error(Mindee::Errors::MindeeImageError) end end @@ -107,7 +107,7 @@ Tempfile.create(['output', '.jpg']) do |tempfile| expect do - extracted_image.save_to_file(tempfile.path, 'jpg') + extracted_image.write_to_file(tempfile.path, 'jpg') end.to raise_error(Mindee::Errors::MindeeImageError, %r{Could not save file}) end end diff --git a/spec/input/sources/url_input_source_spec.rb b/spec/input/sources/url_input_source_spec.rb index 1357dedf..d2a86f63 100644 --- a/spec/input/sources/url_input_source_spec.rb +++ b/spec/input/sources/url_input_source_spec.rb @@ -66,7 +66,7 @@ end end - describe '#save_to_file' do + describe '#write_to_file' do let(:url_input_source) { described_class.new(valid_url) } let(:url_input_source_no_filename) { described_class.new(valid_url_no_filename) } @@ -79,23 +79,23 @@ let(:mock_response) { MockHTTPResponse.new('1.1', '200', 'OK', 'file content') } it 'generates a valid filename when not provided' do - output_file_path = url_input_source_no_filename.save_to_file(output_dir) + output_file_path = url_input_source_no_filename.write_to_file(output_dir) expect(output_file_path).to match(%r{mindee_temp_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_[a-z0-9]{8}\.tmp}) end it 'saves the file with the provided filename' do - result = url_input_source.save_to_file('/tmp', filename: 'file.pdf') + result = url_input_source.write_to_file('/tmp', filename: 'file.pdf') expect(result).to eq('/tmp/file.pdf') expect(File).to have_received(:write).with('/tmp/file.pdf', 'file content') end it 'uses a custom filename when provided' do - result = url_input_source.save_to_file('/tmp', filename: 'custom.pdf') + result = url_input_source.write_to_file('/tmp', filename: 'custom.pdf') expect(result).to eq('/tmp/custom.pdf') end it 'handles authentication' do - result = url_input_source_no_filename.save_to_file('/tmp', username: 'user', password: 'pass') + result = url_input_source_no_filename.write_to_file('/tmp', username: 'user', password: 'pass') expect(result).to match(%r{/tmp/mindee_temp_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_[a-z0-9]{8}\.tmp}) end end @@ -105,7 +105,7 @@ it 'raises an error' do expect do - url_input_source.save_to_file('/tmp') + url_input_source.write_to_file('/tmp') end.to raise_error(Mindee::Errors::MindeeAPIError, %r{Failed to download file}) end end diff --git a/spec/pdf/extracted_pdf_spec.rb b/spec/pdf/extracted_pdf_spec.rb index d3e12696..afccf91c 100644 --- a/spec/pdf/extracted_pdf_spec.rb +++ b/spec/pdf/extracted_pdf_spec.rb @@ -47,12 +47,12 @@ end end - describe '#save_to_file' do + describe '#write_to_file' do it 'writes the PDF bytes to a specified file path' do pdf_stream = File.open(valid_pdf_path, 'r') pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') - expect { pdf_wrapper.save_to_file(output_path) }.not_to raise_error + expect { pdf_wrapper.write_to_file(output_path) }.not_to raise_error expect(File).to have_received(:write).with(output_path, pdf_stream) end @@ -62,7 +62,7 @@ pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') expect do - pdf_wrapper.save_to_file(output_path) + pdf_wrapper.write_to_file(output_path) end.to raise_error Mindee::Errors::MindeePDFError, %r{Provided path is not a file} end @@ -72,7 +72,7 @@ pdf_wrapper = described_class.new(pdf_stream, 'invoice.pdf') expect do - pdf_wrapper.save_to_file(output_path) + pdf_wrapper.write_to_file(output_path) end.to raise_error Mindee::Errors::MindeePDFError, %r{Invalid save path provided} end end