Commit e8eee09

Merge pull request #108 from kazmer97/feat/agentic-multi-step-extraction
Agentic Extraction extra tools
2 parents 6796d84 + 3c47cc5 commit e8eee09

File tree

1 file changed: +58 −4 lines changed


lib/idp_common_pkg/idp_common/extraction/agentic_idp.py

Lines changed: 58 additions & 4 deletions
@@ -242,7 +242,15 @@ def apply_json_patches(
             "patches_applied": len(patches),
         }

-    return extraction_tool, apply_json_patches
+    @tool
+    def make_buffer_data_final_extraction(agent: Agent) -> str:
+        valid_extraction = model_class(**agent.state.get("intermediate_extraction"))
+
+        agent.state.set("current_extraction", valid_extraction.model_dump())
+
+        return f"Successfully made the existing extraction the same as the buffer data {str(valid_extraction.model_dump())[100:]}..."
+
+    return extraction_tool, apply_json_patches, make_buffer_data_final_extraction


 @tool
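
For context, a minimal, framework-free sketch (not part of the commit) of the validate-then-promote pattern that make_buffer_data_final_extraction applies: the buffered dict is copied into "current_extraction" only if it validates against the dynamic Pydantic model. The Invoice model and plain-dict state below are illustrative stand-ins for model_class and agent.state.

# Sketch only: Invoice and state are hypothetical stand-ins for the dynamic
# model_class and the agent's state store used in the diff above.
from pydantic import BaseModel, ValidationError


class Invoice(BaseModel):
    invoice_number: str
    total: float


state = {"intermediate_extraction": {"invoice_number": "INV-001", "total": 123.45}}

try:
    valid = Invoice(**state["intermediate_extraction"])  # raises ValidationError if the buffer is incomplete
    state["current_extraction"] = valid.model_dump()  # promote only validated data
except ValidationError as exc:
    print(f"Buffer not ready yet: {exc}")
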
@@ -255,6 +263,45 @@ def view_existing_extraction(agent: Agent) -> str:
     return agent.state.get("current_extraction")


+@tool
+def write_buffer_date(data: dict[str, Any], agent: Agent) -> str:
+    """
+    Use this tool when the extraction is too large to do in a single step, this is a buffer where you can save intermediate data that wouldn't pass validation yet.
+
+    """
+    agent.state.set("intermediate_extraction", data)
+    logger.info("Saving intermediate data", extra={"intermediate_extraction": data})
+    return f"Saved data: {str(data)[:100]}.... "
+
+
+@tool
+def view_buffer_data(agent: Agent) -> str:
+    """View the intermediate buffer data with this tool, this data is not a validated extraction, but intermediate state for you to work with."""
+
+    return agent.state.get("intermediate_extraction")
+
+
+@tool
+def patch_buffer_data(patches: list[dict[str, Any]], agent: Agent) -> str:
+    """Update the intermediate_extraction data inside the buffer, this is not validated yet
+
+    Apply JSON patches to fix or update the extracted data.
+
+    Args:
+        patches: List of JSON patch operations (RFC 6902 format)
+        reasoning: Explanation of what the patches fix
+
+    """
+
+    patched_data = apply_patches_to_data(
+        existing_data=agent.state.get("intermediate_extraction"), patches=patches
+    )
+
+    agent.state.set("intermediate_extraction", patched_data)
+
+    return f"Successfully patched {str(patched_data)[100:]}...."
+
+
 SYSTEM_PROMPT = """
 You are a useful assistant that helps turn unstructured data into structured data using the provided tools.

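
patch_buffer_data applies RFC 6902 patches to the buffered dict via apply_patches_to_data, which is defined elsewhere in agentic_idp.py and not shown in this diff. A minimal sketch of that operation, assuming a helper equivalent to the jsonpatch package (the package choice is an assumption; only the patch format comes from the docstring above):

# Sketch only: jsonpatch is assumed here purely for illustration of the
# RFC 6902 operations that patch_buffer_data accepts; buffer contents are invented.
import jsonpatch

buffer = {"document_name": "report.pdf", "table_rows": [{"col_a": "1"}]}

patches = [
    {"op": "replace", "path": "/document_name", "value": "report_final.pdf"},
    {"op": "add", "path": "/table_rows/-", "value": {"col_a": "2"}},
]

patched = jsonpatch.apply_patch(buffer, patches)  # returns a new dict; the original is untouched
print(patched)
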
@@ -263,6 +310,7 @@ def view_existing_extraction(agent: Agent) -> str:
 2. When updating existing data or fixing validation errors, use JSON patch operations via the apply_json_patches tool
 3. JSON patches allow precise, targeted updates without losing correct data
 4. If the document is large and the extraction request can't be done in one go, create a valid extraction object and interate with jsonpatch until you completed the entire extraction!
+5. Use intermediate data buffer if you can't extract a valid data object in a single step.

 IMPORTANT:
 YOU MUST perform a batched extraction if there are more than 50 fields to extract.
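
Guideline 5 points the model at the buffer tools added above. As a rough, hypothetical illustration of the intended multi-step flow (tool names are from this commit, the call order and arguments are invented):

# Sketch only: each tuple stands for one tool invocation the agent might make
# when a document is too large for a single extraction pass.
tool_call_sequence = [
    ("write_buffer_date", {"data": {"table_rows": []}}),  # seed the buffer
    ("patch_buffer_data", {"patches": [{"op": "add", "path": "/table_rows/-", "value": {"col_a": "1"}}]}),
    ("view_buffer_data", {}),  # inspect progress
    ("make_buffer_data_final_extraction", {}),  # validate and promote
]
for name, args in tool_call_sequence:
    print(f"{name}({args})")
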
@@ -392,12 +440,18 @@ async def structured_output_async(
     )

     # Create the dynamic extraction tool for this specific model
-    extraction_tool, apply_json_patches = create_dynamic_extraction_tool_and_patch_tool(
+    dynamic_extraction_tools = create_dynamic_extraction_tool_and_patch_tool(
         data_format
     )

     # Prepare tools list
-    tools = [extraction_tool, apply_json_patches, view_existing_extraction]
+    tools = [
+        *dynamic_extraction_tools,
+        view_existing_extraction,
+        patch_buffer_data,
+        view_buffer_data,
+        write_buffer_date,
+    ]

     # Create agent with system prompt and tools
     schema_json = json.dumps(data_format.model_json_schema(), indent=2)
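
The call-site change mirrors the factory's new return value: a tuple of three tools that is splatted into the tools list instead of being destructured into two names. A minimal, framework-free sketch of that pattern (placeholder functions only, not the real tool objects):

# Sketch only: plain functions stand in for the @tool-decorated callables.
def make_tools():
    def extraction_tool(): ...
    def apply_json_patches(): ...
    def make_buffer_data_final_extraction(): ...
    return extraction_tool, apply_json_patches, make_buffer_data_final_extraction


def view_existing_extraction(): ...


dynamic_tools = make_tools()
tools = [*dynamic_tools, view_existing_extraction]
print([t.__name__ for t in tools])
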
@@ -916,7 +970,7 @@ class DocumentRow(BaseModel):
 class DocumentFormat(BaseModel):
     document_name: str
     document_text: str
-    table_rows: list[DocumentRow]
+    table_rows: list[DocumentRow] = Field(gt=500)

 with open(file_path, "rb") as f:
     data = f.read()
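
For comparison, a minimal Pydantic sketch (not from the commit) of field constraints: min_length/max_length are the constraints Pydantic documents for sequence fields such as table_rows, while gt/lt are documented for numeric fields. The models below are illustrative only.

# Sketch only: illustrative models showing a documented length constraint on a list field.
from pydantic import BaseModel, Field


class DocumentRow(BaseModel):
    col_a: str


class DocumentFormat(BaseModel):
    document_name: str
    document_text: str
    table_rows: list[DocumentRow] = Field(min_length=1)  # require at least one row


doc = DocumentFormat(
    document_name="example.pdf",
    document_text="...",
    table_rows=[DocumentRow(col_a="value")],
)
print(len(doc.table_rows))
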
