Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/code_samples/default_v2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ model_id = 'MY_MODEL_ID'
mindee_client = Mindee::ClientV2.new(api_key: api_key)

# Set inference parameters
params = Mindee::Input::InferenceParameters.new(
inference_params = Mindee::Input::InferenceParameters.new(
# ID of the model, required.
model_id,
# If set to `true`, will enable Retrieval-Augmented Generation.
Expand All @@ -21,7 +21,7 @@ input_source = Mindee::Input::Source::PathInputSource.new(input_path)
# Send for processing
response = mindee_client.enqueue_and_get_inference(
input_source,
params # Note: this parameter can also be provided as a Hash.
inference_params # Note: this parameter can also be provided as a Hash.
)

# Print a brief summary of the parsed data
Expand Down
2 changes: 1 addition & 1 deletion lib/mindee/errors/mindee_input_error.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class MindeeMimeTypeError < MindeeSourceError
# @param mime_type [String]
def initialize(mime_type)
@invalid_mimetype = mime_type
super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \
super("'#{@invalid_mimetype}' mime type not allowed, must be one of " \
"#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}")
end
end
Expand Down
9 changes: 3 additions & 6 deletions lib/mindee/image/image_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,10 @@ module ImageExtractor
# @return [Origami::PDF] A PdfDocument handle.
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
magick_image = MiniMagick::Image.read(input_buffer)
# NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
# converted.
# NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output.
magick_image.format(format)
original_density = magick_image.resolution
scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
# the pdf otherwise the resulting image shrinks.
scale_factor = original_density[0].to_f / 4.166666 # Convert from default 300 DPI to 72.
magick_image.format('pdf', 0, { density: scale_factor.to_s })
Origami::PDF.read(StringIO.new(magick_image.to_blob))
end
Expand All @@ -32,8 +30,7 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg')
#
# @param [Input::Source::LocalInputSource] input_source
# @param [Integer] page_id ID of the Page to extract from.
# @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates.
# to extract.
# @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates to extract.
# @return [Array<Image::ExtractedImage>] Extracted Images.
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id)
Expand Down
46 changes: 29 additions & 17 deletions lib/mindee/input/sources/local_input_source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ def initialize(io_stream, filename, repair_pdf: false)
end

if filename.end_with?('.pdf') && repair_pdf
rescue_broken_pdf(@io_stream)
@file_mimetype = Marcel::MimeType.for @io_stream
fix_pdf!

logger.debug("Loaded new input #{@filename} from #{self.class}")
return if ALLOWED_MIME_TYPES.include? @file_mimetype
Expand All @@ -57,27 +56,40 @@ def initialize(io_stream, filename, repair_pdf: false)
raise Errors::MindeeMimeTypeError, @file_mimetype.to_s
end

# Attempts to fix pdf files if mimetype is rejected.
# "Broken PDFs" are often a result of third-party injecting invalid headers.
# This attempts to remove them and send the file
# @param stream [StringIO, File]
def rescue_broken_pdf(stream)
stream.gets('%PDF-')
raise Errors::MindeePDFError if stream.eof? || stream.pos > 500

stream.pos = stream.pos - 5
data = stream.read
@io_stream.close

@io_stream = StringIO.new
@io_stream << data
# Legacy entry point kept for backward compatibility.
# The argument is ignored; the call is forwarded to {#fix_pdf!} with its default options.
# @deprecated Use {#fix_pdf!} or {LocalInputSource.fix_pdf} instead.
# @param _stream [Object] Ignored.
def rescue_broken_pdf(_stream)
  fix_pdf!
end

# Shorthand for pdf mimetype validation.
# Tells whether the detected mimetype of this source is PDF.
# @return [Boolean] `true` when the stored mimetype, as a string, is exactly 'application/pdf'.
def pdf?
  mimetype = @file_mimetype.to_s
  mimetype.eql?('application/pdf')
end

# Repairs the PDF data held by this source, in place.
# The current stream is replaced by the one returned from {LocalInputSource.fix_pdf}, rewound,
# and the mimetype is detected again from the new stream.
# NOTE(review): the stream being replaced is not closed here - confirm that callers own its lifetime.
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
# @return [void]
# @raise [Mindee::Errors::MindeePDFError]
def fix_pdf!(maximum_offset: 500)
  repaired = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
  repaired.rewind
  @io_stream = repaired
  @file_mimetype = Marcel::MimeType.for(repaired)
end

# Attempts to fix the PDF data in the given stream by discarding everything that precedes
# the first `%PDF-` marker.
# The input stream is scanned from its current position and is not closed.
# @param stream [StringIO, File] The stream to fix.
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
# @return [StringIO] The fixed stream, rewound to its beginning so it can be read directly.
# @raise [Mindee::Errors::MindeePDFError] If no `%PDF-` marker is found, if nothing follows it,
#   or if the position just past the marker exceeds `maximum_offset`.
def self.fix_pdf(stream, maximum_offset: 500)
  # Consume everything up to and including the first '%PDF-' marker.
  stream.gets('%PDF-')
  raise Errors::MindeePDFError if stream.eof? || stream.pos > maximum_offset

  # Step back over the 5-byte marker so that the header is part of the output.
  stream.pos = stream.pos - 5
  out_stream = StringIO.new
  out_stream << stream.read
  # Writing leaves the cursor at the end of the data; rewind so callers can read it immediately.
  out_stream.rewind
  out_stream
end

# Cuts a PDF file according to provided options.
# @param options [PageOptions, nil] Page cutting/merge options:
#
Expand Down
2 changes: 1 addition & 1 deletion lib/mindee/parsing/v2/field/list_field.rb
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def [](index)
end

# Iterator for Enumerator inheritance.
# NOTE: Untyped due to incomplete support in steep.
# NOTE: Untyped due to incomplete support in the currently supported version of RBS.
def each(&block)
return to_enum(:each) unless block_given?

Expand Down
1 change: 0 additions & 1 deletion lib/mindee/parsing/v2/field/simple_field.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def to_s
if @value.is_a?(TrueClass) || @value.is_a?(FalseClass)
@value ? 'True' : 'False'
elsif @value.is_a?(Integer) || @value.is_a?(Float)
# NOTE: explicitly typing because steep is very, very dumb
num = @value # @type var num: Integer | Float
format_numeric_value(num)
else
Expand Down
2 changes: 1 addition & 1 deletion sig/custom/mini_magick.rbs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Stub for the mini_magick library.
# Note: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
# NOTE: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
# to match the rules we have on the repo, hence the existence of this file and the overrides present below.
module MiniMagick
class Image
Expand Down
6 changes: 5 additions & 1 deletion sig/mindee/input/sources/local_input_source.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@ module Mindee
attr_reader filename: String
attr_reader io_stream: StringIO | File
def initialize: (StringIO | File, String, ?repair_pdf: bool) -> void

def fix_pdf!: (?maximum_offset: Integer) -> void
def self.fix_pdf: (StringIO | File, ?maximum_offset: Integer) -> StringIO

def logger: () -> Logger


def rescue_broken_pdf: (StringIO | File) -> (StringIO | File)
def rescue_broken_pdf: (untyped) -> void
def pdf?: -> bool
def apply_page_options: (PageOptions) -> StringIO?
def process_pdf: (PageOptions) -> StringIO?
Expand Down
1 change: 0 additions & 1 deletion sig/mindee/parsing/v2/field/list_field.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ module Mindee
def empty?: -> bool
def size: -> Integer
def length: -> Integer
# NOTE: Steep is incapable of handling typing of `each` when multiple types are used.
def each: () { (untyped) -> untyped } -> untyped
def []: (Integer) -> (BaseField)
end
Expand Down
6 changes: 6 additions & 0 deletions spec/parsing/v2/inference_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,12 @@ def load_v2_inference(resource_path)
expect(fields['line_items'][0]).to be_a(object_field)
expect(fields['line_items'][0]['quantity'].value).to eq(1.0)

expect(fields).to have_key('line_items')
expect(fields['line_items']).not_to be_nil
expect(fields['line_items']).to be_a(list_field)
expect(fields['line_items'][0]).to be_a(object_field)
expect(fields['line_items'][0]['quantity'].value).to eq(1.0)

tax_item_obj = first_tax_item
expect(tax_item_obj.fields.size).to eq(3)

Expand Down
2 changes: 1 addition & 1 deletion spec/test_utilities.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def self.get_id(rst_str)
end

# Implementation of the levenshtein algorithm from here: https://rosettacode.org/wiki/Levenshtein_distance#Ruby
# Note: removes the downcase operation since we care about case in the return strings.
# Without the downcase operation since we care about case in the return strings.
# @param [String] ref_string First String.
# @param [String] target_string Second String.
# @return [Integer] Levenshtein distance between the strings.
Expand Down
Loading