diff --git a/docs/code_samples/default_v2.txt b/docs/code_samples/default_v2.txt index 8acb3123..3a972c00 100644 --- a/docs/code_samples/default_v2.txt +++ b/docs/code_samples/default_v2.txt @@ -8,7 +8,7 @@ model_id = 'MY_MODEL_ID' mindee_client = Mindee::ClientV2.new(api_key: api_key) # Set inference parameters -params = Mindee::Input::InferenceParameters.new( +inference_params = Mindee::Input::InferenceParameters.new( # ID of the model, required. model_id, # If set to `true`, will enable Retrieval-Augmented Generation. @@ -21,7 +21,7 @@ input_source = Mindee::Input::Source::PathInputSource.new(input_path) # Send for processing response = mindee_client.enqueue_and_get_inference( input_source, - params # Note: this parameter can also be provided as a Hash. + inference_params # Note: this parameter can also be provided as a Hash. ) # Print a brief summary of the parsed data diff --git a/lib/mindee/errors/mindee_input_error.rb b/lib/mindee/errors/mindee_input_error.rb index 267fea25..6554198c 100644 --- a/lib/mindee/errors/mindee_input_error.rb +++ b/lib/mindee/errors/mindee_input_error.rb @@ -16,7 +16,7 @@ class MindeeMimeTypeError < MindeeSourceError # @param mime_type [String] def initialize(mime_type) @invalid_mimetype = mime_type - super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \ + super("'#{@invalid_mimetype}' mime type not allowed, must be one of " \ "#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}") end end diff --git a/lib/mindee/image/image_extractor.rb b/lib/mindee/image/image_extractor.rb index bb059980..68517855 100644 --- a/lib/mindee/image/image_extractor.rb +++ b/lib/mindee/image/image_extractor.rb @@ -18,12 +18,10 @@ module ImageExtractor # @return [Origami::PDF] A PdfDocument handle. def self.attach_image_as_new_file(input_buffer, format: 'jpg') magick_image = MiniMagick::Image.read(input_buffer) - # NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't - # converted. + # NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output. magick_image.format(format) original_density = magick_image.resolution - scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for - # the pdf otherwise the resulting image shrinks. + scale_factor = original_density[0].to_f / 4.166666 # Convert from default 300 DPI to 72. magick_image.format('pdf', 0, { density: scale_factor.to_s }) Origami::PDF.read(StringIO.new(magick_image.to_blob)) end @@ -32,8 +30,7 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg') # # @param [Input::Source::LocalInputSource] input_source # @param [Integer] page_id ID of the Page to extract from. - # @param [Array>, Array] polygons List of coordinates. - # to extract. + # @param [Array>, Array] polygons List of coordinates to extract. # @return [Array] Extracted Images. def self.extract_multiple_images_from_source(input_source, page_id, polygons) new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id) diff --git a/lib/mindee/input/sources/local_input_source.rb b/lib/mindee/input/sources/local_input_source.rb index ca3a5f9a..18095290 100644 --- a/lib/mindee/input/sources/local_input_source.rb +++ b/lib/mindee/input/sources/local_input_source.rb @@ -47,8 +47,7 @@ def initialize(io_stream, filename, repair_pdf: false) end if filename.end_with?('.pdf') && repair_pdf - rescue_broken_pdf(@io_stream) - @file_mimetype = Marcel::MimeType.for @io_stream + fix_pdf! logger.debug("Loaded new input #{@filename} from #{self.class}") return if ALLOWED_MIME_TYPES.include? @file_mimetype @@ -57,27 +56,40 @@ def initialize(io_stream, filename, repair_pdf: false) raise Errors::MindeeMimeTypeError, @file_mimetype.to_s end - # Attempts to fix pdf files if mimetype is rejected. - # "Broken PDFs" are often a result of third-party injecting invalid headers. - # This attempts to remove them and send the file - # @param stream [StringIO, File] - def rescue_broken_pdf(stream) - stream.gets('%PDF-') - raise Errors::MindeePDFError if stream.eof? || stream.pos > 500 - - stream.pos = stream.pos - 5 - data = stream.read - @io_stream.close - - @io_stream = StringIO.new - @io_stream << data + # @deprecated See {#fix_pdf!} or {#self.fix_pdf} instead. + def rescue_broken_pdf(_) + fix_pdf! end - # Shorthand for pdf mimetype validation. + # Shorthand for PDF mimetype validation. def pdf? @file_mimetype.to_s == 'application/pdf' end + # Attempts to fix the PDF data in the file. + # @param maximum_offset [Integer] Maximum offset to look for the PDF header. + # @return [void] + # @raise [Mindee::Errors::MindeePDFError] + def fix_pdf!(maximum_offset: 500) + @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset) + @io_stream.rewind + @file_mimetype = Marcel::MimeType.for @io_stream + end + + # Attempt to fix the PDF data in the given stream. + # @param stream [StringIO] The stream to fix. + # @param maximum_offset [Integer] Maximum offset to look for the PDF header. + # @return [StringIO] The fixed stream. + # @raise [Mindee::Errors::MindeePDFError] + def self.fix_pdf(stream, maximum_offset: 500) + out_stream = StringIO.new + stream.gets('%PDF-') + raise Errors::MindeePDFError if stream.eof? || stream.pos > maximum_offset + + stream.pos = stream.pos - 5 + out_stream << stream.read + end + # Cuts a PDF file according to provided options. # @param options [PageOptions, nil] Page cutting/merge options: # diff --git a/lib/mindee/parsing/v2/field/list_field.rb b/lib/mindee/parsing/v2/field/list_field.rb index 1e6d9d69..d69a024e 100644 --- a/lib/mindee/parsing/v2/field/list_field.rb +++ b/lib/mindee/parsing/v2/field/list_field.rb @@ -74,7 +74,7 @@ def [](index) end # Iterator for Enumerator inheritance. - # NOTE: Untyped due to incomplete support in steep. + # NOTE: Untyped due to incomplete support in current supported version of RBS. def each(&block) return to_enum(:each) unless block_given? diff --git a/lib/mindee/parsing/v2/field/simple_field.rb b/lib/mindee/parsing/v2/field/simple_field.rb index aee80f63..e2c8cbf4 100644 --- a/lib/mindee/parsing/v2/field/simple_field.rb +++ b/lib/mindee/parsing/v2/field/simple_field.rb @@ -26,7 +26,6 @@ def to_s if @value.is_a?(TrueClass) || @value.is_a?(FalseClass) @value ? 'True' : 'False' elsif @value.is_a?(Integer) || @value.is_a?(Float) - # NOTE: explicitly typing because steep is very, very dumb num = @value # @type var num: Integer | Float format_numeric_value(num) else diff --git a/sig/custom/mini_magick.rbs b/sig/custom/mini_magick.rbs index 78111ac2..bf125c1a 100644 --- a/sig/custom/mini_magick.rbs +++ b/sig/custom/mini_magick.rbs @@ -1,5 +1,5 @@ # Stub for the mini_magick library. -# Note: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough +# NOTE: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough # to match the rules we have on the repo, hence the existence of this file and the overrides present below. module MiniMagick class Image diff --git a/sig/mindee/input/sources/local_input_source.rbs b/sig/mindee/input/sources/local_input_source.rbs index c51e9553..9bfeaefb 100644 --- a/sig/mindee/input/sources/local_input_source.rbs +++ b/sig/mindee/input/sources/local_input_source.rbs @@ -8,10 +8,14 @@ module Mindee attr_reader filename: String attr_reader io_stream: StringIO | File def initialize: (StringIO | File, String, ?repair_pdf: bool) -> void + + def fix_pdf!: (?maximum_offset: Integer) -> void + def self.fix_pdf: (StringIO | File, ?maximum_offset: Integer) -> StringIO + def logger: () -> Logger - def rescue_broken_pdf: (StringIO | File) -> (StringIO | File) + def rescue_broken_pdf: (untyped) -> void def pdf?: -> bool def apply_page_options: (PageOptions) -> StringIO? def process_pdf: (PageOptions) -> StringIO? diff --git a/sig/mindee/parsing/v2/field/list_field.rbs b/sig/mindee/parsing/v2/field/list_field.rbs index 2383d129..2dac57b7 100644 --- a/sig/mindee/parsing/v2/field/list_field.rbs +++ b/sig/mindee/parsing/v2/field/list_field.rbs @@ -12,7 +12,6 @@ module Mindee def empty?: -> bool def size: -> Integer def length: -> Integer - # NOTE: Steep is incapable of handling typing of `each` when multiple types are used. def each: () { (untyped) -> untyped } -> untyped def []: (Integer) -> (BaseField) end diff --git a/spec/parsing/v2/inference_spec.rb b/spec/parsing/v2/inference_spec.rb index 561bb542..5c1e9108 100644 --- a/spec/parsing/v2/inference_spec.rb +++ b/spec/parsing/v2/inference_spec.rb @@ -98,6 +98,12 @@ def load_v2_inference(resource_path) expect(fields['line_items'][0]).to be_a(object_field) expect(fields['line_items'][0]['quantity'].value).to eq(1.0) + expect(fields).to have_key('line_items') + expect(fields['line_items']).not_to be_nil + expect(fields['line_items']).to be_a(list_field) + expect(fields['line_items'][0]).to be_a(object_field) + expect(fields['line_items'][0]['quantity'].value).to eq(1.0) + tax_item_obj = first_tax_item expect(tax_item_obj.fields.size).to eq(3) diff --git a/spec/test_utilities.rb b/spec/test_utilities.rb index 107d34f3..92eb0f8f 100644 --- a/spec/test_utilities.rb +++ b/spec/test_utilities.rb @@ -17,7 +17,7 @@ def self.get_id(rst_str) end # Implementation of the levenshtein algorithm from here: https://rosettacode.org/wiki/Levenshtein_distance#Ruby - # Note: removes the downcase operation since we care about case in the return strings. + # Without the downcase operation since we care about case in the return strings. # @param [String] ref_string First String. # @param [String] target_string Second String. # @return [Integer] Levenshtein distance between the strings.