From 243153290e3566df6a9c6d7f4cd435ee2bd81ce6 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 19 Dec 2025 14:41:50 +0100 Subject: [PATCH 1/4] :sparkles: add support for dataschema parameter --- lib/mindee/input.rb | 1 + lib/mindee/input/data_schema.rb | 126 ++++++++++++++++++ lib/mindee/input/inference_parameters.rb | 9 +- .../parsing/v2/inference_active_options.rb | 22 +++ sig/mindee/input/data_schema.rbs | 34 +++++ sig/mindee/input/inference_parameters.rbs | 4 +- .../parsing/v2/inference_active_options.rbs | 8 ++ spec/data | 2 +- spec/v2/client_v2_integration.rb | 52 +++++++- spec/v2/input/inference_parameter_spec.rb | 46 +++++++ spec/v2/input/local_response_v2_spec.rb | 2 +- 11 files changed, 299 insertions(+), 7 deletions(-) create mode 100644 lib/mindee/input/data_schema.rb create mode 100644 sig/mindee/input/data_schema.rbs create mode 100644 spec/v2/input/inference_parameter_spec.rb diff --git a/lib/mindee/input.rb b/lib/mindee/input.rb index 8afb9bfa..9bbf38a8 100644 --- a/lib/mindee/input.rb +++ b/lib/mindee/input.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true +require_relative 'input/data_schema' require_relative 'input/inference_parameters' require_relative 'input/polling_options' require_relative 'input/sources' diff --git a/lib/mindee/input/data_schema.rb b/lib/mindee/input/data_schema.rb new file mode 100644 index 00000000..1aca73a9 --- /dev/null +++ b/lib/mindee/input/data_schema.rb @@ -0,0 +1,126 @@ +# frozen_string_literal: true + +module Mindee + module Input + # Data Schema Field. + class DataSchemaField + # @return [String] Display name for the field, also impacts inference results. + attr_reader :title + # @return [String] Name of the field in the data schema. + attr_reader :name + # @return [Boolean] Whether this field can contain multiple values. + attr_reader :is_array + # @return [String] Data type of the field. 
+ attr_reader :type + # @return [Array, nil] Allowed values when type is `classification`. Leave empty for other types. + attr_reader :classification_values + # @return [Boolean, nil] Whether to remove duplicate values in the array. + # Only applicable if `is_array` is True. + attr_reader :unique_values + # @return [String, nil] Detailed description of what this field represents. + attr_reader :description + # @return [String, nil] Optional extraction guidelines. + attr_reader :guidelines + # @return [Array, nil] Nested fields. + attr_reader :nested_fields + + # @param field [Hash] + def initialize(field) + field.transform_keys!(&:to_sym) + @name = field[:name] + @title = field[:title] + @is_array = field[:is_array] + @type = field[:type] + @classification_values = field[:classification_values] + @unique_values = field[:unique_values] + @description = field[:description] + @guidelines = field[:guidelines] + @nested_fields = field[:nested_fields] + end + + # @return [Hash] + def to_hash + out = { + name: @name, + title: @title, + is_array: @is_array, + type: @type, + } # @type var out: Hash[Symbol, untyped] + out[:classification_values] = @classification_values unless @classification_values.nil? + out[:unique_values] = @unique_values unless @unique_values.nil? + out[:description] = @description unless @description.nil? + out[:guidelines] = @guidelines unless @guidelines.nil? + out[:nested_fields] = @nested_fields unless @nested_fields.nil? + out + end + + # @return [String] + def to_s + to_hash.to_json + end + end + + # The structure to completely replace the data schema of the model. + class DataSchemaReplace + # @return [Array<DataSchemaField>] Fields that make up the replacement data schema. + attr_reader :fields + + # @param data_schema_replace [Hash] + def initialize(data_schema_replace) + data_schema_replace.transform_keys!(&:to_sym) + fields_list = data_schema_replace[:fields] + raise Mindee::Errors::MindeeError, 'Invalid Data Schema provided.' 
+ if fields_list.nil? + raise TypeError, 'Data Schema replacement fields cannot be empty.' if fields_list.empty? + + @fields = fields_list.map { |field| DataSchemaField.new(field) } + end + + # @return [Hash] + def to_hash + { fields: @fields.map(&:to_hash) } + end + + # @return [String] + def to_s + to_hash.to_json + end + end + + # Modify the Data Schema. + class DataSchema + # @return [Mindee::Input::DataSchemaReplace] + attr_reader :replace + + # @param data_schema [Hash, String, DataSchema] + def initialize(data_schema) + case data_schema + when String + parsed = JSON.parse(data_schema.to_s, object_class: Hash) + parsed.transform_keys!(&:to_sym) + @replace = DataSchemaReplace.new(parsed[:replace]) + when Hash + data_schema.transform_keys!(&:to_sym) + @replace = if data_schema[:replace].is_a?(DataSchemaReplace) + data_schema[:replace] + else + DataSchemaReplace.new(data_schema[:replace]) + end + when DataSchema + @replace = data_schema.replace + else + raise TypeError, 'Invalid Data Schema provided.' + end + end + + # @return [Hash] + def to_hash + { replace: @replace.to_hash } + end + + # @return [String] + def to_s + to_hash.to_json + end + end + end +end diff --git a/lib/mindee/input/inference_parameters.rb b/lib/mindee/input/inference_parameters.rb index c79987c8..1a38fca4 100644 --- a/lib/mindee/input/inference_parameters.rb +++ b/lib/mindee/input/inference_parameters.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require_relative 'data_schema' + module Mindee module Input # Parameters to set when sending a file for inference. @@ -35,6 +37,9 @@ class InferenceParameters # @return [PollingOptions] Options for polling. Set only if having timeout issues. attr_reader :polling_options + # @return [DataSchema, nil] Data schema override, or nil when none was provided. + attr_reader :data_schema + # @return [Boolean, nil] Whether to close the file after parsing. 
attr_reader :close_file @@ -58,7 +63,8 @@ def initialize( webhook_ids: nil, text_context: nil, polling_options: nil, - close_file: true + close_file: true, + data_schema: nil ) raise Errors::MindeeInputError, 'Model ID is required.' if model_id.empty? || model_id.nil? @@ -72,6 +78,7 @@ def initialize( @text_context = text_context @polling_options = get_clean_polling_options(polling_options) @close_file = close_file.nil? || close_file + @data_schema = DataSchema.new(data_schema) unless data_schema.nil? # rubocop:enable Metrics/ParameterLists end diff --git a/lib/mindee/parsing/v2/inference_active_options.rb b/lib/mindee/parsing/v2/inference_active_options.rb index 3dad5b00..47d88473 100644 --- a/lib/mindee/parsing/v2/inference_active_options.rb +++ b/lib/mindee/parsing/v2/inference_active_options.rb @@ -3,6 +3,23 @@ module Mindee module Parsing module V2 + # Data schema options activated during the inference. + class DataSchemaActiveOption + # @return [Boolean] + attr_reader :replace + + # @param server_response [Hash] + def initialize(server_response) + @replace = server_response[:replace] || server_response['replace'] + end + + # String representation. + # @return [String] + def to_s + "Data Schema\n-----------\n:Replace: #{@replace ? 'True' : 'False'}" + end + end + # Options which were activated during the inference. class InferenceActiveOptions # @return [Boolean] Whether the Raw Text feature was activated. @@ -15,6 +32,8 @@ class InferenceActiveOptions attr_reader :rag # @return [Boolean] Whether the text context feature was activated. attr_reader :text_context + # @return [DataSchemaActiveOption] + attr_reader :data_schema # @param server_response [Hash] Raw JSON parsed into a Hash. 
def initialize(server_response) @@ -23,6 +42,7 @@ def initialize(server_response) @confidence = server_response['confidence'] @rag = server_response['rag'] @text_context = server_response['text_context'] + @data_schema = DataSchemaActiveOption.new(server_response['data_schema']) end # String representation. @@ -35,6 +55,8 @@ def to_s ":Polygon: #{@polygon ? 'True' : 'False'}", ":Confidence: #{@confidence ? 'True' : 'False'}", ":RAG: #{@rag ? 'True' : 'False'}", + ":Text Context: #{@text_context ? 'True' : 'False'}\n", + @data_schema.to_s, '', ] parts.join("\n") diff --git a/sig/mindee/input/data_schema.rbs b/sig/mindee/input/data_schema.rbs new file mode 100644 index 00000000..d7c99d99 --- /dev/null +++ b/sig/mindee/input/data_schema.rbs @@ -0,0 +1,34 @@ +module Mindee + module Input + class DataSchemaField + attr_reader title: String + attr_reader name: String + attr_reader is_array: bool + attr_reader type: String + attr_reader classification_values: String|nil + attr_reader unique_values: bool|nil + attr_reader description: String|nil + attr_reader guidelines: String|nil + attr_reader nested_fields: Array[Hash[String|Symbol, untyped]]|nil + + def initialize: (Hash[Symbol, untyped]) -> void + def to_hash: () -> Hash[Symbol, untyped] + def to_string: () -> String + end + + class DataSchemaReplace + attr_reader fields: Array[DataSchemaField] + def initialize: (Hash[Symbol, untyped]) -> void + def to_hash: () -> Hash[Symbol, untyped] + def to_string: () -> String + end + + class DataSchema + attr_reader replace: DataSchemaReplace + + def initialize: (Hash[String|Symbol, untyped]|String|DataSchema) -> void + def to_hash: () -> Hash[Symbol, untyped] + def to_s: -> String + end + end +end diff --git a/sig/mindee/input/inference_parameters.rbs b/sig/mindee/input/inference_parameters.rbs index d8b3f173..1e929c5f 100644 --- a/sig/mindee/input/inference_parameters.rbs +++ b/sig/mindee/input/inference_parameters.rbs @@ -12,6 +12,7 @@ module Mindee attr_reader raw_text: 
bool? attr_reader text_context: String? attr_reader webhook_ids: Array[String]? + attr_reader data_schema: DataSchema? def initialize: ( String, @@ -23,7 +24,8 @@ module Mindee ?text_context: String?, ?webhook_ids: Array[String]?, ?polling_options: Hash[Symbol | String, untyped] | PollingOptions?, - ?close_file: bool? + ?close_file: bool?, + ?data_schema: DataSchema|String|Hash[Symbol | String, untyped]? ) -> void def self.from_hash: (params: Hash[String | Symbol, untyped]) -> InferenceParameters diff --git a/sig/mindee/parsing/v2/inference_active_options.rbs b/sig/mindee/parsing/v2/inference_active_options.rbs index 2dd4b626..f0cc0296 100644 --- a/sig/mindee/parsing/v2/inference_active_options.rbs +++ b/sig/mindee/parsing/v2/inference_active_options.rbs @@ -1,14 +1,22 @@ module Mindee module Parsing module V2 + class DataSchemaActiveOption + attr_reader replace: bool + + def initialize: (Hash[Symbol |string, untyped]) -> void + def to_s: () -> String + end class InferenceActiveOptions attr_reader confidence: bool attr_reader polygon: bool attr_reader rag: bool attr_reader raw_text: bool attr_reader text_context: bool + attr_reader data_schema: DataSchemaActiveOption def initialize: (Hash[String | Symbol, untyped]) -> void + def to_s: () -> String end end end diff --git a/spec/data b/spec/data index f86f3eaf..0c51e1d3 160000 --- a/spec/data +++ b/spec/data @@ -1 +1 @@ -Subproject commit f86f3eaf540f0babeb3d4f1a458d764856a2170b +Subproject commit 0c51e1d3e2258404c44280f25f4951ba6fe27324 diff --git a/spec/v2/client_v2_integration.rb b/spec/v2/client_v2_integration.rb index 53af04f6..884a8505 100644 --- a/spec/v2/client_v2_integration.rb +++ b/spec/v2/client_v2_integration.rb @@ -23,7 +23,7 @@ raw_text: true, polygon: false, confidence: false, - file_alias: 'ruby-integration-test', + file_alias: 'rb_integration_test', polling_options: polling, text_context: 'this is a test' ) @@ -72,7 +72,7 @@ polygon: false, confidence: false, rag: false, - file_alias: 
'ruby-integration-test' + file_alias: 'rb_integration_test' ) response = client.enqueue_and_get_inference(input, inference_params) @@ -191,7 +191,7 @@ polygon: false, confidence: false, rag: false, - file_alias: 'ruby-integration-test' + file_alias: 'rb_integration_test' ) client.enqueue_and_get_inference(input, inference_params) end.to raise_error(Mindee::Errors::MindeeHTTPErrorV2) { |e| @@ -216,4 +216,50 @@ expect(response.inference).not_to be_nil end end + + context 'A Data Schema Override' do + it 'Overrides successfully' do + data_schema_replace = File.read(File.join(V2_DATA_DIR, 'inference', 'data_schema_replace_param.json')) + input = Mindee::Input::Source::PathInputSource.new(blank_pdf_url) + + inference_params = Mindee::Input::InferenceParameters.new( + model_id, + raw_text: false, + polygon: false, + confidence: false, + rag: false, + file_alias: 'rb_integration_data_schema_replace', + data_schema: data_schema_replace + ) + + response = client.enqueue_and_get_inference(input, inference_params) + expect(response).not_to be_nil + + model = response.inference.model + expect(model).not_to be_nil + expect(model).to be_a(Mindee::Parsing::V2::InferenceModel) + expect(model.id).to eq(model_id) + + active_options = response.inference.active_options + expect(active_options).not_to be_nil + expect(active_options).to be_a(Mindee::Parsing::V2::InferenceActiveOptions) + expect(active_options.raw_text).to eq(false) + expect(active_options.polygon).to eq(false) + expect(active_options.confidence).to eq(false) + expect(active_options.rag).to eq(false) + expect(active_options.text_context).to eq(false) + expect(active_options.data_schema).to_not be_nil + expect(active_options.data_schema.replace).to eq(true) + + result = response.inference.result + expect(result).not_to be_nil + + expect(result.raw_text).to be_nil + + fields = result.fields + expect(fields).not_to be_nil + expect(fields['test_replace']).not_to be_nil + expect(fields['test_replace'].value).to eq('a test 
value') + end + end end diff --git a/spec/v2/input/inference_parameter_spec.rb b/spec/v2/input/inference_parameter_spec.rb new file mode 100644 index 00000000..f3e10a6d --- /dev/null +++ b/spec/v2/input/inference_parameter_spec.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require 'mindee/input/inference_parameters' +require 'mindee/input/data_schema' + +describe Mindee::Input::InferenceParameters do + let(:extracted_schema_content) { File.read(File.join(V2_DATA_DIR, 'inference', 'data_schema_replace_param.json')) } + let(:extracted_schema_hash) { JSON.parse(extracted_schema_content) } + let(:extracted_schema_str) { extracted_schema_hash.to_json } + let(:extracted_schema_object) { Mindee::Input::DataSchema.new(extracted_schema_hash) } + + describe 'Data Schema' do + describe "shouldn't replace when unset" do + it 'should initialize with a data schema' do + param = Mindee::Input::InferenceParameters.new( + 'dummy-model' + ) + expect(param.data_schema).to be_nil + end + + it 'should initialize with string' do + param = Mindee::Input::InferenceParameters.new( + 'dummy-model', + data_schema: extracted_schema_str + ) + expect(param.data_schema.to_s).to eq(extracted_schema_str) + end + + it 'should initialize with hash' do + param = Mindee::Input::InferenceParameters.new( + 'dummy-model', + data_schema: extracted_schema_hash + ) + expect(param.data_schema.to_s).to eq(extracted_schema_str) + end + + it 'should initialize with DataSchema object' do + param = Mindee::Input::InferenceParameters.new( + 'dummy-model', + data_schema: extracted_schema_object + ) + expect(param.data_schema.to_s).to eq(extracted_schema_str) + end + end + end +end diff --git a/spec/v2/input/local_response_v2_spec.rb b/spec/v2/input/local_response_v2_spec.rb index a10fa1bc..ccdf8d88 100644 --- a/spec/v2/input/local_response_v2_spec.rb +++ b/spec/v2/input/local_response_v2_spec.rb @@ -5,7 +5,7 @@ def assert_local_response(local_response) dummy_secret_key = 
'ogNjY44MhvKPGTtVsI8zG82JqWQa68woYQH' - signature = 'b82a515c832fd2c4f4ce3a7e6f53c12e8d10e19223f6cf0e3a9809a7a3da26be' + signature = '1df388c992d87897fe61dfc56c444c58fc3c7369c31e2b5fd20d867695e93e85' expect(local_response.file).to_not be(nil) expect(local_response.valid_hmac_signature?( dummy_secret_key, 'invalid signature' From 6822e3e561732aa9cff310f49adb7fe2c1729589 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 19 Dec 2025 15:07:10 +0100 Subject: [PATCH 2/4] fix typo --- spec/v2/client_v2_integration.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/v2/client_v2_integration.rb b/spec/v2/client_v2_integration.rb index 884a8505..6be430bf 100644 --- a/spec/v2/client_v2_integration.rb +++ b/spec/v2/client_v2_integration.rb @@ -220,7 +220,7 @@ context 'A Data Schema Override' do it 'Overrides successfully' do data_schema_replace = File.read(File.join(V2_DATA_DIR, 'inference', 'data_schema_replace_param.json')) - input = Mindee::Input::Source::PathInputSource.new(blank_pdf_url) + input = Mindee::Input::Source::PathInputSource.new(File.join(FILE_TYPES_DIR, 'pdf', 'blank_1.pdf')) inference_params = Mindee::Input::InferenceParameters.new( model_id, From b19a0cdc36214e75e3b3bb8d2f4b2df3abe00098 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 19 Dec 2025 18:01:39 +0100 Subject: [PATCH 3/4] add the only part that actually makes all this work --- lib/mindee/http/mindee_api_v2.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/mindee/http/mindee_api_v2.rb b/lib/mindee/http/mindee_api_v2.rb index 4b838d2c..192e9164 100644 --- a/lib/mindee/http/mindee_api_v2.rb +++ b/lib/mindee/http/mindee_api_v2.rb @@ -122,6 +122,7 @@ def enqueue_form_options(form_data, params) form_data.push(['confidence', params.confidence.to_s]) unless params.confidence.nil? 
form_data.push ['file_alias', params.file_alias] if params.file_alias form_data.push ['text_context', params.text_context] if params.text_context + form_data.push ['data_schema', params.data_schema.to_s] if params.text_context unless params.webhook_ids.nil? || params.webhook_ids.empty? form_data.push ['webhook_ids', params.webhook_ids.join(',')] end From 506be504b484367a6a7e3a70a7322958dbf48ffd Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Fri, 19 Dec 2025 18:08:55 +0100 Subject: [PATCH 4/4] fix typo --- lib/mindee/http/mindee_api_v2.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mindee/http/mindee_api_v2.rb b/lib/mindee/http/mindee_api_v2.rb index 192e9164..8465f03d 100644 --- a/lib/mindee/http/mindee_api_v2.rb +++ b/lib/mindee/http/mindee_api_v2.rb @@ -122,7 +122,7 @@ def enqueue_form_options(form_data, params) form_data.push(['confidence', params.confidence.to_s]) unless params.confidence.nil? form_data.push ['file_alias', params.file_alias] if params.file_alias form_data.push ['text_context', params.text_context] if params.text_context - form_data.push ['data_schema', params.data_schema.to_s] if params.text_context + form_data.push ['data_schema', params.data_schema.to_s] if params.data_schema unless params.webhook_ids.nil? || params.webhook_ids.empty? form_data.push ['webhook_ids', params.webhook_ids.join(',')] end