From 1bb5e807c5f3a30515d982b60dc03d0e3e1925f8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:37:44 -0800 Subject: [PATCH 01/38] update docs/architecture/memory.md --- docs/architecture/memory.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index 83ae9720..fb25a6ac 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -154,13 +154,12 @@ Executed via `_postprocess_after_turn` (background task). * **Output:** JSON list of strings. Failures fall back to `[]`. ### 4.3 Reconciliation (Memory Management) -Resolves contradictions using a "Search-Decide-Update" loop. +Resolves contradictions using a "Search-Decide-Update" loop with complete enumeration. 1. **Local Search:** For each new fact, retrieve a small neighborhood of existing `role="memory"` entries for the conversation. -2. **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` (examples + strict JSON schema) to compare `new_facts` vs `existing_memories`. +2. **LLM Decision:** Uses `UPDATE_MEMORY_PROMPT` to compare `new_facts` vs `existing_memories`. The model must return **all memories** (existing + new) with explicit events for each. * **Decisions:** `ADD`, `UPDATE`, `DELETE`, `NONE`. * If no existing memories are found, all new facts are added directly. * On LLM/network failure, defaults to adding all new facts. - * Safeguard: if the model returns only deletes/empties, the new facts are still added to avoid data loss. 3. **Execution:** * **Adds:** Creates new fact files and upserts to Chroma. * **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`. @@ -190,13 +189,14 @@ To replicate the system behavior, the following prompt strategies are required. * **Example:** "My wife is Anne" -> `["The user's wife is named Anne"]`. ### 5.2 Reconciliation (`UPDATE_MEMORY_PROMPT`) -* **Goal:** Compare `new_facts` against `existing_memories` (id + text) and output structured decisions. +* **Goal:** Compare `new_facts` against `existing_memories` and return **all memories** (existing + new) with explicit events. +* **Approach:** The model must enumerate every memory in its response, forcing deliberate decisions rather than implicit omissions. * **Operations:** - * **ADD:** New information (generates a new ID). - * **UPDATE:** Refines existing information (uses the provided short ID). - * **DELETE:** Contradicts existing information (e.g., "I hate pizza" vs "I love pizza"). **If deleting because of a replacement, the new fact must also be returned (ADD or UPDATE).** - * **NONE:** Fact already exists or is irrelevant. -* **Output constraints:** JSON list only; no prose/code fences; IDs for UPDATE/DELETE/NONE must come from the provided list. + * **ADD:** New information not present in existing memories (generates a new sequential ID). + * **UPDATE:** Refines existing information about the **same topic** (keeps the existing ID). + * **DELETE:** Explicitly contradicts existing information (e.g., "I hate pizza" vs "I love pizza"). + * **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate. +* **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. ### 5.3 Summarization (`SUMMARY_PROMPT`) * **Goal:** Maintain a concise running summary. 
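For illustration, a minimal sketch of the complete-enumeration contract described above; the IDs and texts are invented, and the real payload is assembled in `agent_cli/memory/_ingest.py`:

```python
# Hypothetical reconciliation exchange (values are illustrative only).
existing_memories = [
    {"id": 0, "text": "User likes pizza"},
    {"id": 1, "text": "The user's wife is named Anne"},
]
new_facts = ["User loves pepperoni pizza", "User started learning Dutch"]

# Expected model output: every memory enumerated, each with an explicit event.
expected_decisions = [
    {"id": 0, "text": "User loves pepperoni pizza", "event": "UPDATE"},  # same topic: pizza
    {"id": 1, "text": "The user's wife is named Anne", "event": "NONE"},  # unrelated, kept as-is
    {"id": 2, "text": "User started learning Dutch", "event": "ADD"},  # new sequential ID
]
```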
From d79831a8be3951c3e28626c1cb101e7c48487acd Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:38:04 -0800 Subject: [PATCH 02/38] Turn off ChromaDB telemetry --- agent_cli/core/chroma.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py index 65201422..56d54ede 100644 --- a/agent_cli/core/chroma.py +++ b/agent_cli/core/chroma.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any import chromadb +from chromadb.config import Settings from chromadb.utils import embedding_functions from agent_cli.constants import DEFAULT_OPENAI_EMBEDDING_MODEL @@ -29,7 +30,10 @@ def init_collection( """Initialize a Chroma collection with OpenAI-compatible embeddings.""" target_path = persistence_path / subdir if subdir else persistence_path target_path.mkdir(parents=True, exist_ok=True) - client = chromadb.PersistentClient(path=str(target_path)) + client = chromadb.PersistentClient( + path=str(target_path), + settings=Settings(anonymized_telemetry=False), + ) embed_fn = embedding_functions.OpenAIEmbeddingFunction( api_base=openai_base_url, api_key=openai_api_key or "dummy", From 9f3d6be55932f73058ac551d84d19b0ce322a032 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:44:37 -0800 Subject: [PATCH 03/38] feat(memory): add output validation with ModelRetry for reconciliation - Add @agent.output_validator to validate LLM decisions - Catch invalid UPDATE/DELETE/NONE with non-existent IDs - Send helpful error messages via ModelRetry for retry - Graceful fallback to add all facts when retries exhausted - Add AI journal POC example for testing MemoryClient - Improve reconciliation prompt with clearer examples --- agent_cli/memory/_ingest.py | 61 +++++++++++++-- agent_cli/memory/_prompt.py | 121 +++++++++++++++-------------- examples/aijournal_poc.py | 151 ++++++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+), 67 deletions(-) create mode 100755 examples/aijournal_poc.py diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 4a00c4ed..98f110b9 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -10,7 +10,7 @@ from uuid import uuid4 import httpx -from pydantic_ai import Agent +from pydantic_ai import Agent, ModelRetry from pydantic_ai.exceptions import AgentRunError, UnexpectedModelBehavior from pydantic_ai.models.openai import OpenAIChatModel from pydantic_ai.providers.openai import OpenAIProvider @@ -121,9 +121,10 @@ def process_reconciliation_decisions( ) elif isinstance(dec, MemoryUpdate): orig = id_map.get(dec.id) - if orig: - text = dec.text.strip() - if text: + text = dec.text.strip() + if text: + if orig: + # Update existing memory: delete old, add new new_id = str(uuid4()) to_delete.append(orig) to_add.append( @@ -136,6 +137,17 @@ def process_reconciliation_decisions( ), ) replacement_map[orig] = new_id + else: + # UPDATE with unknown ID = treat as ADD (model used wrong event) + to_add.append( + Fact( + id=str(uuid4()), + conversation_id=conversation_id, + content=text, + source_id=source_id, + created_at=created_at, + ), + ) elif isinstance(dec, MemoryDelete): orig = id_map.get(dec.id) if orig: @@ -178,6 +190,7 @@ async def reconcile_facts( return entries, [], {} id_map: dict[int, str] = {idx: mem.id for idx, mem in enumerate(existing)} existing_json = [{"id": idx, "text": mem.content} for idx, mem in enumerate(existing)] + existing_ids = set(id_map.keys()) provider = OpenAIProvider(api_key=api_key or "dummy", 
base_url=openai_base_url) model_cfg = OpenAIChatModel( @@ -192,9 +205,43 @@ async def reconcile_facts( retries=3, ) - payload_obj = {"existing": existing_json, "new_facts": new_facts} - payload = json.dumps(payload_obj, ensure_ascii=False, indent=2) - LOGGER.info("Reconcile payload JSON: %s", payload) + @agent.output_validator + def validate_decisions(decisions: list[MemoryDecision]) -> list[MemoryDecision]: + """Validate LLM decisions and provide feedback for retry.""" + errors = [] + for dec in decisions: + if ( + isinstance(dec, (MemoryUpdate, MemoryDelete, MemoryIgnore)) + and dec.id not in existing_ids + ): + if isinstance(dec, MemoryUpdate): + errors.append( + f"UPDATE with id={dec.id} is invalid: that ID doesn't exist. " + f"Valid existing IDs are: {sorted(existing_ids)}. " + f"For NEW facts, use ADD with a new ID.", + ) + elif isinstance(dec, MemoryDelete): + errors.append(f"DELETE with id={dec.id} is invalid: that ID doesn't exist.") + else: # MemoryIgnore (NONE) + errors.append(f"NONE with id={dec.id} is invalid: that ID doesn't exist.") + if errors: + msg = "Invalid memory decisions:\n" + "\n".join(f"- {e}" for e in errors) + raise ModelRetry(msg) + return decisions + + # Format with separate sections for existing and new facts + existing_str = json.dumps(existing_json, ensure_ascii=False, indent=2) + new_facts_str = json.dumps(new_facts, ensure_ascii=False, indent=2) + payload = f"""Current memory: +``` +{existing_str} +``` + +New facts to process: +``` +{new_facts_str} +```""" + LOGGER.info("Reconcile payload: %s", payload) try: result = await agent.run(payload) decisions = result.output diff --git a/agent_cli/memory/_prompt.py b/agent_cli/memory/_prompt.py index 58cba1f6..84ebf364 100644 --- a/agent_cli/memory/_prompt.py +++ b/agent_cli/memory/_prompt.py @@ -21,67 +21,68 @@ Return only factual sentences grounded in the user text. No assistant acknowledgements or meta-text. """.strip() -UPDATE_MEMORY_PROMPT = """ -You are a smart memory manager which controls the memory of a system. -You can perform four operations: (1) ADD into memory, (2) UPDATE memory, (3) DELETE from memory, (4) NONE (no change). - -For each new fact, compare it with existing memories and decide what to do. - -Guidelines: - -1. **ADD**: New fact contains information NOT present in any existing memory. - - Generate a new ID for added memories (next sequential integer). - - Existing unrelated memories remain unchanged (NONE). - -2. **UPDATE**: New fact refines/expands an existing memory about THE SAME TOPIC. +UPDATE_MEMORY_PROMPT = """You are a smart memory manager which controls the memory of a system. +You can perform four operations: (1) ADD into the memory, (2) UPDATE the memory, (3) DELETE from the memory, and (4) NONE (no change). + +Compare new facts with existing memory. For each new fact, decide whether to: +- ADD: Add it to the memory as a new element (new information not present in any existing memory) +- UPDATE: Update an existing memory element (only if facts are about THE SAME TOPIC, e.g., both about pizza preferences) +- DELETE: Delete an existing memory element (if new fact explicitly contradicts it) +- NONE: Make no change (if fact is already present, a duplicate, or the existing memory is unrelated to new facts) + +**Guidelines:** + +1. **ADD**: If the new fact contains new information not present in any existing memory, add it with a new ID. + - Existing unrelated memories should have event "NONE". 
+- **Example**: + - Current memory: [{"id": 0, "text": "User is a software engineer"}] + - New facts: ["Name is John"] + - Output: [ + {"id": 0, "text": "User is a software engineer", "event": "NONE"}, + {"id": 1, "text": "Name is John", "event": "ADD"} + ] + +2. **UPDATE**: Only if the new fact refines/expands an existing memory about THE SAME TOPIC. - Keep the same ID, update the text. - - Only update if facts are about the same subject (e.g., both about pizza preferences). - -3. **DELETE**: New fact explicitly contradicts an existing memory. - - Mark the old memory for deletion. - -4. **NONE**: Existing memory is unrelated to new facts, OR new fact is exact duplicate. - - No change needed. - -**CRITICAL**: You must return ALL memories (existing + new) in your response. -Each existing memory must have an event (NONE, UPDATE, or DELETE). -Each new unrelated fact must be ADDed with a new ID. - -Examples: - -1. UNRELATED new fact → ADD it, existing stays NONE - Existing: [{"id": 0, "text": "User is a software engineer"}] - New facts: ["Name is John"] - Output: [ - {"id": 0, "text": "User is a software engineer", "event": "NONE"}, - {"id": 1, "text": "Name is John", "event": "ADD"} - ] - -2. RELATED facts (same topic) → UPDATE existing - Existing: [{"id": 0, "text": "User likes pizza"}] - New facts: ["User loves pepperoni pizza"] - Output: [ - {"id": 0, "text": "User loves pepperoni pizza", "event": "UPDATE"} - ] - -3. CONTRADICTING facts → DELETE old - Existing: [{"id": 0, "text": "Loves pizza"}, {"id": 1, "text": "Name is John"}] - New facts: ["Hates pizza"] - Output: [ - {"id": 0, "text": "Loves pizza", "event": "DELETE"}, - {"id": 1, "text": "Name is John", "event": "NONE"}, - {"id": 2, "text": "Hates pizza", "event": "ADD"} - ] - -4. DUPLICATE → NONE for all - Existing: [{"id": 0, "text": "Name is John"}] - New facts: ["Name is John"] - Output: [ - {"id": 0, "text": "Name is John", "event": "NONE"} - ] - -Return ONLY a JSON list. No prose or code fences. -""".strip() + - Example: "User likes pizza" + "User loves pepperoni pizza" → UPDATE (same topic: pizza) + - Example: "Met Sarah today" + "Went running" → NOT same topic, do NOT update! +- **Example**: + - Current memory: [{"id": 0, "text": "User likes pizza"}] + - New facts: ["User loves pepperoni pizza"] + - Output: [{"id": 0, "text": "User loves pepperoni pizza", "event": "UPDATE"}] + +3. **DELETE**: If the new fact explicitly contradicts an existing memory. +- **Example**: + - Current memory: [{"id": 0, "text": "Loves pizza"}, {"id": 1, "text": "Name is John"}] + - New facts: ["Hates pizza"] + - Output: [ + {"id": 0, "text": "Loves pizza", "event": "DELETE"}, + {"id": 1, "text": "Name is John", "event": "NONE"}, + {"id": 2, "text": "Hates pizza", "event": "ADD"} + ] + +4. **NONE**: If the new fact is already present or existing memory is unrelated to new facts. +- **Example**: + - Current memory: [{"id": 0, "text": "Name is John"}] + - New facts: ["Name is John"] + - Output: [{"id": 0, "text": "Name is John", "event": "NONE"}] + +5. **IMPORTANT - Unrelated topics example**: + - Current memory: [{"id": 0, "text": "Met Sarah to discuss quantum computing"}] + - New facts: ["Went for a 5km run"] + - These are COMPLETELY DIFFERENT topics (meeting vs running). Do NOT use UPDATE! + - Output: [ + {"id": 0, "text": "Met Sarah to discuss quantum computing", "event": "NONE"}, + {"id": 1, "text": "Went for a 5km run", "event": "ADD"} + ] + +**CRITICAL RULES:** +- You MUST return ALL memories (existing + new) in your response. 
+- Each existing memory MUST have an event (NONE, UPDATE, or DELETE). +- Each genuinely NEW fact (not related to any existing memory) MUST be ADDed with a new ID. +- Do NOT use UPDATE for unrelated topics! "Met Sarah" and "Went running" are DIFFERENT topics → use NONE for existing + ADD for new. + +Return ONLY a JSON list. No prose or code fences.""".strip() SUMMARY_PROMPT = """ You are a concise conversation summarizer. Update the running summary with the new facts. diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py new file mode 100755 index 00000000..df5934e9 --- /dev/null +++ b/examples/aijournal_poc.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +"""Minimal AI Journal proof-of-concept using MemoryClient. + +This validates the core hypothesis: MemoryClient can serve as the +foundation for a personal knowledge system (AI journal). + +Usage: + # Add a journal entry + python examples/aijournal_poc.py add "Today I learned about quantum computing at work" + + # Search memories + python examples/aijournal_poc.py search "what did I learn?" + + # Interactive chat with memory + python examples/aijournal_poc.py chat "What have I been working on lately?" +""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import os +from pathlib import Path + +from agent_cli.memory.client import MemoryClient + +# Enable debug logging for memory module +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + datefmt="%H:%M:%S", +) +# Enable DEBUG for memory ingest to see full prompts +logging.getLogger("agent_cli.memory._ingest").setLevel(logging.DEBUG) + + +# Defaults for local AI setup +DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" +DEFAULT_MODEL = "gpt-oss-high:20b" +DEFAULT_EMBEDDING_MODEL = "embeddinggemma:300m" + + +def get_client(model: str | None = None) -> tuple[MemoryClient, str]: + """Initialize the memory client with sensible defaults. + + Returns: + Tuple of (client, model_name) + + """ + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + model_name = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL) + embedding_model = os.environ.get("EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + print(f"Using: {base_url}") + print(f" Chat model: {model_name}") + print(f" Embedding model: {embedding_model}") + + return MemoryClient( + memory_path=Path("~/.aijournal").expanduser(), + openai_base_url=base_url, + chat_api_key=api_key, + embedding_api_key=api_key, + embedding_model=embedding_model, + enable_summarization=True, + enable_git_versioning=False, # Keep it simple for POC + score_threshold=0.1, # Lower threshold for local models + ), model_name + + +async def cmd_add(text: str) -> None: + """Add a journal entry.""" + client, model = get_client() + print(f"Adding entry: {text[:50]}...") + await client.add(text, conversation_id="journal", model=model) + print("✓ Entry processed and facts extracted") + + +async def cmd_search(query: str, top_k: int = 5) -> None: + """Search memories.""" + client, model = get_client() + print(f"Searching for: {query}\n") + + result = await client.search(query, conversation_id="journal", top_k=top_k, model=model) + + if not result.entries: + print("No relevant memories found.") + return + + for i, entry in enumerate(result.entries, 1): + print(f"{i}. 
[{entry.role}] {entry.content}") + print(f" Score: {entry.score:.3f} | Created: {entry.created_at[:10]}") + print() + + +async def cmd_chat(question: str) -> None: + """Chat with memory-augmented LLM.""" + client, model = get_client() + print(f"Question: {question}\n") + + response = await client.chat( + messages=[{"role": "user", "content": question}], + conversation_id="journal", + model=model, + ) + + # Extract assistant reply + choices = response.get("choices", []) + if choices: + reply = choices[0].get("message", {}).get("content", "") + print(f"Answer: {reply}") + + # Show which memories were used + hits = response.get("memory_hits", []) + if hits: + print(f"\n--- Used {len(hits)} memories ---") + for hit in hits[:3]: + print(f" • {hit['content'][:80]}...") + + +def main() -> None: + """CLI entry point.""" + parser = argparse.ArgumentParser(description="AI Journal POC") + subparsers = parser.add_subparsers(dest="command", required=True) + + # Add command + add_parser = subparsers.add_parser("add", help="Add a journal entry") + add_parser.add_argument("text", help="The journal entry text") + + # Search command + search_parser = subparsers.add_parser("search", help="Search memories") + search_parser.add_argument("query", help="Search query") + search_parser.add_argument("-k", "--top-k", type=int, default=5, help="Number of results") + + # Chat command + chat_parser = subparsers.add_parser("chat", help="Chat with memory") + chat_parser.add_argument("question", help="Question to ask") + + args = parser.parse_args() + + if args.command == "add": + asyncio.run(cmd_add(args.text)) + elif args.command == "search": + asyncio.run(cmd_search(args.query, args.top_k)) + elif args.command == "chat": + asyncio.run(cmd_chat(args.question)) + + +if __name__ == "__main__": + main() From fb1dde87d5b6f9b673e517aa2fe13ad76e4d64e0 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 16:54:38 -0800 Subject: [PATCH 04/38] feat(memory): use PromptedOutput (JSON mode) for reconciliation Switch from tool calls to JSON mode output for the reconciliation agent. This works better with local models (like reasoning models) that put output in reasoning_content field instead of content. PromptedOutput injects the schema into the prompt and enables JSON mode (response_format={"type": "json_object"}), matching mem0's approach. 
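As a rough illustration (not the exact schema text pydantic-ai emits), the reconciliation call now reduces to a plain JSON-mode chat request:

```python
# Approximate shape of the JSON-mode request that PromptedOutput produces.
# The injected schema text is generated by pydantic-ai and will differ in detail.
system_prompt = (
    "<UPDATE_MEMORY_PROMPT>"  # the reconciliation instructions
    "\n\nRespond with a JSON object matching: "
    '[{"id": int, "text": str, "event": "ADD" | "UPDATE" | "DELETE" | "NONE"}]'
)
request_body = {
    "model": "local-model",  # illustrative name
    "messages": [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "<current memory + new facts payload>"},
    ],
    "response_format": {"type": "json_object"},  # JSON mode instead of tool calls
}
```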
--- agent_cli/memory/_ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 98f110b9..46175355 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -10,7 +10,7 @@ from uuid import uuid4 import httpx -from pydantic_ai import Agent, ModelRetry +from pydantic_ai import Agent, ModelRetry, PromptedOutput from pydantic_ai.exceptions import AgentRunError, UnexpectedModelBehavior from pydantic_ai.models.openai import OpenAIChatModel from pydantic_ai.providers.openai import OpenAIProvider @@ -201,7 +201,7 @@ async def reconcile_facts( agent = Agent( model=model_cfg, system_prompt=UPDATE_MEMORY_PROMPT, - output_type=list[MemoryDecision], + output_type=PromptedOutput(list[MemoryDecision]), # JSON mode instead of tool calls retries=3, ) From 62bafc39df9b81fdec75988ff4eead5f824e7aa8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 17:04:20 -0800 Subject: [PATCH 05/38] feat(memory): add self-model features to AI journal POC - Add list_all() method to MemoryClient to retrieve all stored memories - Add 'show' command to display all stored facts about the user - Add 'profile' command to generate a structured profile summary using LLM - Enhance 'chat' command to use profile context for personalized responses The POC now demonstrates a "self-model" system that: 1. Extracts facts from user input 2. Stores and retrieves them semantically 3. Generates profile summaries on demand 4. Uses the profile to personalize conversations This validates the core hypothesis: MemoryClient can serve as the foundation for a personal knowledge system that knows who you are. --- agent_cli/memory/client.py | 32 +++++++- examples/aijournal_poc.py | 146 +++++++++++++++++++++++++++++++++---- 2 files changed, 163 insertions(+), 15 deletions(-) diff --git a/agent_cli/memory/client.py b/agent_cli/memory/client.py index b5ea3a7f..a3cc970d 100644 --- a/agent_cli/memory/client.py +++ b/agent_cli/memory/client.py @@ -14,7 +14,7 @@ from agent_cli.memory._ingest import extract_and_store_facts_and_summaries from agent_cli.memory._persistence import evict_if_needed from agent_cli.memory._retrieval import augment_chat_request -from agent_cli.memory._store import init_memory_collection +from agent_cli.memory._store import init_memory_collection, list_conversation_entries from agent_cli.memory.engine import process_chat_request from agent_cli.memory.models import ChatRequest, MemoryRetrieval, Message from agent_cli.rag._retriever import get_reranker_model @@ -185,6 +185,36 @@ async def search( ) return retrieval or MemoryRetrieval(entries=[]) + def list_all( + self, + conversation_id: str = "default", + include_summary: bool = False, + ) -> list[dict[str, Any]]: + """List all stored memories for a conversation. + + Args: + conversation_id: Conversation scope. + include_summary: Whether to include summary entries. + + Returns: + List of memory entries with id, content, and metadata. 
+ + """ + entries = list_conversation_entries( + self.collection, + conversation_id, + include_summary=include_summary, + ) + return [ + { + "id": e.id, + "content": e.content, + "role": e.metadata.role, + "created_at": e.metadata.created_at, + } + for e in entries + ] + async def chat( self, messages: list[dict[str, str]] | list[Any], diff --git a/examples/aijournal_poc.py b/examples/aijournal_poc.py index df5934e9..156c0b97 100755 --- a/examples/aijournal_poc.py +++ b/examples/aijournal_poc.py @@ -23,6 +23,8 @@ import os from pathlib import Path +import httpx + from agent_cli.memory.client import MemoryClient # Enable debug logging for memory module @@ -94,29 +96,135 @@ async def cmd_search(query: str, top_k: int = 5) -> None: print() -async def cmd_chat(question: str) -> None: +def cmd_show() -> None: + """Show all stored memories (what the system knows about you).""" + client, _ = get_client() + print("=== What I know about you ===\n") + + entries = client.list_all(conversation_id="journal") + + if not entries: + print("No memories stored yet. Add some journal entries first!") + return + + # Sort by created_at + entries.sort(key=lambda x: x["created_at"], reverse=True) + + for i, entry in enumerate(entries, 1): + date = entry["created_at"][:10] if entry["created_at"] else "unknown" + print(f"{i}. [{date}] {entry['content']}") + + print(f"\n--- Total: {len(entries)} memories ---") + + +PROFILE_PROMPT = """Based on the following facts about a person, create a brief profile summary. +Organize the information into categories like: +- **Identity**: Name, relationships, occupation +- **Interests & Activities**: Hobbies, regular activities +- **Goals & Values**: What they care about, what they're working towards +- **Recent Events**: Notable recent happenings + +Only include categories that have relevant information. Be concise. + +Facts: +{facts} + +Profile Summary:""" + + +async def cmd_profile() -> None: + """Generate a profile summary from stored memories.""" + client, model = get_client() + + entries = client.list_all(conversation_id="journal") + + if not entries: + print("No memories stored yet. Add some journal entries first!") + return + + # Format facts for the prompt + facts = "\n".join(f"- {e['content']}" for e in entries) + prompt = PROFILE_PROMPT.format(facts=facts) + + print("=== Your Profile ===\n") + print("(Generating profile from stored memories...)\n") + + # Direct LLM call (bypasses memory storage) + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + async with httpx.AsyncClient(timeout=120.0) as http: + response = await http.post( + f"{base_url}/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={ + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.7, + }, + ) + data = response.json() + + choices = data.get("choices", []) + if choices: + profile = choices[0].get("message", {}).get("content", "") + print(profile) + + print(f"\n--- Based on {len(entries)} memories ---") + + +CHAT_SYSTEM_PROMPT = """You are a helpful AI assistant with memory of the user. + +Here's what you know about the user: +{profile} + +Use this knowledge naturally in your responses. 
Be helpful and personable.""" + + +async def cmd_chat(question: str, with_profile: bool = True) -> None: """Chat with memory-augmented LLM.""" client, model = get_client() + + # Build profile context + profile_text = "" + if with_profile: + entries = client.list_all(conversation_id="journal") + if entries: + profile_text = "\n".join(f"- {e['content']}" for e in entries) + print(f"Question: {question}\n") - response = await client.chat( - messages=[{"role": "user", "content": question}], - conversation_id="journal", - model=model, - ) + # Build messages with profile context + messages: list[dict[str, str]] = [] + if profile_text: + system_prompt = CHAT_SYSTEM_PROMPT.format(profile=profile_text) + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": question}) - # Extract assistant reply - choices = response.get("choices", []) + # Direct LLM call with profile context + base_url = os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + async with httpx.AsyncClient(timeout=120.0) as http: + response = await http.post( + f"{base_url}/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + json={ + "model": model, + "messages": messages, + "temperature": 0.7, + }, + ) + data = response.json() + + choices = data.get("choices", []) if choices: reply = choices[0].get("message", {}).get("content", "") print(f"Answer: {reply}") - # Show which memories were used - hits = response.get("memory_hits", []) - if hits: - print(f"\n--- Used {len(hits)} memories ---") - for hit in hits[:3]: - print(f" • {hit['content'][:80]}...") + if profile_text: + entry_count = len(client.list_all(conversation_id="journal")) + print(f"\n--- Using profile with {entry_count} memories ---") def main() -> None: @@ -137,6 +245,12 @@ def main() -> None: chat_parser = subparsers.add_parser("chat", help="Chat with memory") chat_parser.add_argument("question", help="Question to ask") + # Show command - display what the system knows about you + subparsers.add_parser("show", help="Show all stored memories") + + # Profile command - generate a profile summary + subparsers.add_parser("profile", help="Generate profile from memories") + args = parser.parse_args() if args.command == "add": @@ -145,6 +259,10 @@ def main() -> None: asyncio.run(cmd_search(args.query, args.top_k)) elif args.command == "chat": asyncio.run(cmd_chat(args.question)) + elif args.command == "show": + cmd_show() + elif args.command == "profile": + asyncio.run(cmd_profile()) if __name__ == "__main__": From aede425d11b64d1c32ba0524cfc96b2fb9e0d469 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 17:51:37 -0800 Subject: [PATCH 06/38] docs: add detailed comparison between AI journal POC and aijournal Analyzes architecture, features, and test results comparing our MemoryClient-based POC (~200 LOC) with the full aijournal project (~15,000+ LOC). Key findings: - POC successfully extracts facts and generates accurate profiles - Main gap is learning over time (strength tracking, decay, feedback) - Recommends adding simple strength field to close 80% of functionality gap with 20% of aijournal's complexity Includes concrete test results from ingesting 12+ blog posts. 
--- docs/aijournal-poc-comparison.md | 245 +++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 docs/aijournal-poc-comparison.md diff --git a/docs/aijournal-poc-comparison.md b/docs/aijournal-poc-comparison.md new file mode 100644 index 00000000..a6f928f0 --- /dev/null +++ b/docs/aijournal-poc-comparison.md @@ -0,0 +1,245 @@ +# AI Journal POC vs aijournal: Detailed Comparison + +This document analyzes the differences between our MemoryClient-based AI Journal POC and the full-featured aijournal project, identifying strengths, gaps, and potential paths forward. + +## Executive Summary + +| Aspect | Our POC | aijournal | +|--------|---------|-----------| +| **Complexity** | ~200 LOC | ~15,000+ LOC | +| **Setup Time** | Instant | `aijournal init` + config | +| **Profile Storage** | Generated on-demand | Persisted YAML with versioning | +| **Claim System** | Raw fact strings | Typed atoms with strength/decay | +| **Context Layers** | Single flat layer | 4 hierarchical layers (L1-L4) | +| **Learning** | Static extraction | Feedback loops + interview probing | + +## 1. Architecture Comparison + +### 1.1 Data Model + +**Our POC:** +``` +~/.aijournal/ + entries/ + journal/ + facts/ # Extracted facts as markdown + turns/ # Chat turns + chroma/ # Vector embeddings +``` + +**aijournal:** +``` +workspace/ + data/ + journal/YYYY/MM/DD/*.md # Raw entries + normalized/YYYY-MM-DD/ # Structured YAML + profile/ + self_profile.yaml # Facets (values, goals, traits) + claims.yaml # Typed claim atoms + derived/ + summaries/ # Daily summaries + microfacts/ # Extracted facts + persona/persona_core.yaml # L1 context (~1200 tokens) + index/ # Vector store + metadata + chat_sessions/ # Conversation history + pending/profile_updates/ # Queued changes +``` + +**Analysis:** aijournal separates authoritative data (human-editable) from derived data (reproducible). Our POC conflates these, making it harder to inspect or manually correct the knowledge base. + +### 1.2 Claim Representation + +**Our POC - Raw facts:** +``` +"Bas is a software engineer" +"The user loves hiking" +"The user's wife is named Anne" +``` + +**aijournal - Typed claim atoms:** +```yaml +- type: trait + subject: self + predicate: occupation + statement: "Works as a software engineer focused on AI systems" + scope: {domain: work, context: [professional]} + strength: 0.85 + status: accepted + provenance: + sources: [entry:2025-01-15-morning] + first_seen: 2025-01-15 + last_updated: 2025-01-20 +``` + +**Analysis:** aijournal's typed claims enable: +- Filtering by type (traits vs preferences vs goals) +- Confidence tracking via `strength` +- Time-decay for relevance +- Conflict detection between claims +- Source attribution for verification + +### 1.3 Context Layers + +**Our POC:** Single layer - all facts dumped into system prompt + +**aijournal - Hierarchical layers:** + +| Layer | Content | Tokens | Use Case | +|-------|---------|--------|----------| +| L1 | Persona core + top claims | ~1,200 | Quick chat, advice | +| L2 | L1 + recent summaries/facts | ~2,000 | Daily check-ins | +| L3 | L2 + full claims + facets | ~2,600 | Deep conversations | +| L4 | L3 + prompts + config + history | ~3,200 | External AI export | + +**Analysis:** Layered context prevents token overflow and allows appropriate depth for different interactions. + +## 2. 
Feature Comparison + +### 2.1 Fact Extraction + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Extraction method | PydanticAI agent | Ollama + custom prompts | +| Output format | Raw strings | Typed MicroFact objects | +| Reconciliation | ADD/UPDATE/DELETE/NONE | Consolidation with strength weighting | +| Deduplication | Semantic similarity | Hash + semantic + scope matching | + +**Our POC advantage:** The reconciliation logic (PromptedOutput with JSON mode) prevents duplicate facts effectively. + +**aijournal advantage:** Consolidation weights existing evidence: `strength_new = clamp01((w_prev * strength_prev + w_obs * signal) / (w_prev + w_obs))` + +### 2.2 Profile Generation + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Generation | On-demand via LLM | Pre-built `persona_core.yaml` | +| Caching | None | Persisted with staleness tracking | +| Categories | LLM-determined | Defined schema (values, goals, traits, etc.) | +| Token budget | Unlimited (risk of overflow) | Configurable (~1,200 default) | + +**Our POC advantage:** Flexible - LLM determines categories dynamically based on content. + +**aijournal advantage:** Deterministic, auditable, and respects token limits. + +### 2.3 Chat Integration + +| Feature | Our POC | aijournal | +|---------|---------|-----------| +| Context injection | All facts in system prompt | Layer-appropriate context | +| Citations | None | `[entry:id#p]` markers | +| Feedback | None | Up/down adjustments to claim strength | +| Memory storage | Bypassed (direct LLM call) | Persisted with telemetry | + +**Our POC advantage:** Simple, no side effects. + +**aijournal advantage:** Learning loop - feedback strengthens/weakens claims over time. + +### 2.4 Missing in Our POC + +1. **Interview/Probing Mode** + - aijournal generates questions to fill knowledge gaps + - Ranks facets by `staleness × impact_weight` to prioritize probing + +2. **Time Decay** + - aijournal: `effective_strength = strength × exp(-λ × staleness)` + - Our POC: All facts treated equally regardless of age + +3. **Conflict Resolution** + - aijournal: Detects contradictions, downgrades to `tentative`, queues questions + - Our POC: UPDATE replaces old fact entirely + +4. **Advisor Mode** + - aijournal: Separate `advise` command with coaching preferences + - Our POC: Generic chat only + +5. **Export/Packs** + - aijournal: Generate context bundles for external AIs + - Our POC: No export capability + +## 3. 
Test Results Analysis + +### 3.1 Blog Post Ingestion + +We fed 12+ blog posts into our POC: + +| Metric | Result | +|--------|--------| +| Posts processed | ~12 | +| Facts extracted | 52 | +| Extraction accuracy | High - captured key themes | +| Profile quality | Excellent - identified all major interests | + +**Sample extracted facts:** +- "Bas is a software engineer" +- "Bas works on AI systems" +- "The user loves hiking" +- "You went for a 5km run this morning" +- "You discovered that local vision models like Qwen3-VL-32B can identify niche books" + +### 3.2 Profile Generation Quality + +The generated profile correctly identified: +- ✅ Professional identity (software engineer, AI focus) +- ✅ Personal relationships (wife Anne) +- ✅ Hobbies (hiking, running, learning Dutch) +- ✅ Technical interests (local AI, terminal productivity, homelab) +- ✅ Values (minimalism, security, reproducibility) + +### 3.3 Chat Intelligence + +The chat demonstrated: +- **Specific recall:** "You use the Glove80 keyboard with programmable layers" +- **Temporal understanding:** Tracked evolution of views on AI coding +- **Theme synthesis:** Connected local AI + security + productivity interests +- **Nuanced responses:** Acknowledged both benefits and limitations + +## 4. Recommendations + +### 4.1 Quick Wins (Keep POC Simple) + +1. **Persist profile summary** - Cache the LLM-generated profile to avoid regeneration +2. **Add timestamps to facts** - Already have `created_at`, use it for recency weighting +3. **Token budgeting** - Limit facts sent to chat based on relevance + recency + +### 4.2 Medium-Term Enhancements + +1. **Claim typing** - Categorize facts into types (trait, preference, goal, relationship) +2. **Strength tracking** - Increment when same fact extracted multiple times +3. **Simple decay** - Weight recent facts higher in context + +### 4.3 aijournal Features Worth Adopting + +1. **Interview mode** - Generate questions to learn more +2. **Feedback loop** - Up/down on responses affects claim strength +3. **Layered context** - L1 for quick chats, L4 for deep dives +4. **Citations** - Link responses to source facts + +### 4.4 What NOT to Adopt + +1. **7-stage pipeline** - Overkill for our use case +2. **Strict schema governance** - Adds friction without clear benefit for POC +3. **Markdown file storage** - ChromaDB is sufficient for our needs + +## 5. Conclusion + +Our POC validates the core hypothesis: **MemoryClient can serve as the foundation for a personal knowledge system**. With ~200 lines of code, we achieved: + +- Accurate fact extraction from unstructured text +- Coherent profile generation from diverse content +- Personalized conversations using stored knowledge + +The main gap is **learning over time** - our system doesn't strengthen beliefs based on repetition or feedback. Adding simple strength tracking and decay would close 80% of the functionality gap with 20% of aijournal's complexity. + +### Recommended Next Step + +Add a `strength` field to stored facts and implement: +```python +# On duplicate fact detection +existing.strength = min(1.0, existing.strength + 0.1) +existing.last_seen = now() + +# On retrieval +effective_strength = fact.strength * exp(-0.1 * days_since_last_seen) +``` + +This single change would transform our static knowledge base into a learning system. 
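A minimal runnable sketch of that recommendation, assuming a hypothetical `ScoredFact` record with `strength` and `last_seen` fields (illustrative names, not the existing `Fact` entity):

```python
from dataclasses import dataclass, field
from datetime import UTC, datetime
from math import exp

DECAY_RATE = 0.1  # per day; illustrative constant


@dataclass
class ScoredFact:
    """Illustrative fact record with reinforcement fields (not the real entity)."""

    content: str
    strength: float = 0.5
    last_seen: datetime = field(default_factory=lambda: datetime.now(UTC))


def reinforce(fact: ScoredFact, boost: float = 0.1) -> None:
    """Strengthen a fact when the same information is extracted again."""
    fact.strength = min(1.0, fact.strength + boost)
    fact.last_seen = datetime.now(UTC)


def effective_strength(fact: ScoredFact, now: datetime | None = None) -> float:
    """Exponentially decay strength by days since the fact was last reinforced."""
    now = now or datetime.now(UTC)
    days = (now - fact.last_seen).total_seconds() / 86400
    return fact.strength * exp(-DECAY_RATE * days)
```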
From 70cf95591c71e12fdb1a7225f85a1d5238de44bb Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 19:30:24 -0800 Subject: [PATCH 07/38] feat(memory): add adaptive summarization with hierarchical storage Implement research-grounded summarization inspired by Letta and Mem0: - AdaptiveSummarizer with 5 levels (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) - Hierarchical summary storage (L1 chunks, L2 groups, L3 final) in ChromaDB - File-based persistence with YAML front matter in markdown files - Token counting via tiktoken with fallback to cl100k_base - Level-specific compression ratios (20%, 12%, 7%, capped 2000 tokens) Structure: - agent_cli/summarizer/ - standalone reusable summarization module - summaries/L1/chunk_*.md, L2/group_*.md, L3/final.md file hierarchy - Soft-delete old summaries to deleted/ folder before replacing --- agent_cli/memory/_files.py | 37 +- agent_cli/memory/_ingest.py | 92 ++++- agent_cli/memory/_persistence.py | 105 +++++- agent_cli/memory/_store.py | 151 ++++++++ agent_cli/memory/models.py | 20 ++ agent_cli/summarizer/__init__.py | 31 ++ agent_cli/summarizer/adaptive.py | 502 +++++++++++++++++++++++++++ agent_cli/summarizer/models.py | 220 ++++++++++++ agent_cli/summarizer/prompts.py | 135 +++++++ agent_cli/summarizer/utils.py | 258 ++++++++++++++ pyproject.toml | 3 + tests/memory/test_store.py | 226 ++++++++++++ tests/summarizer/__init__.py | 1 + tests/summarizer/test_adaptive.py | 434 +++++++++++++++++++++++ tests/summarizer/test_integration.py | 466 +++++++++++++++++++++++++ tests/summarizer/test_models.py | 332 ++++++++++++++++++ tests/summarizer/test_prompts.py | 180 ++++++++++ tests/summarizer/test_utils.py | 193 ++++++++++ 18 files changed, 3382 insertions(+), 4 deletions(-) create mode 100644 agent_cli/summarizer/__init__.py create mode 100644 agent_cli/summarizer/adaptive.py create mode 100644 agent_cli/summarizer/models.py create mode 100644 agent_cli/summarizer/prompts.py create mode 100644 agent_cli/summarizer/utils.py create mode 100644 tests/summarizer/__init__.py create mode 100644 tests/summarizer/test_adaptive.py create mode 100644 tests/summarizer/test_integration.py create mode 100644 tests/summarizer/test_models.py create mode 100644 tests/summarizer/test_prompts.py create mode 100644 tests/summarizer/test_utils.py diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index d55133d9..aa8bc5ae 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -24,6 +24,11 @@ _SNAPSHOT_FILENAME = "memory_index.json" _DELETED_DIRNAME = "deleted" +# Summary level constants for hierarchical file structure +_SUMMARY_LEVEL_L1 = 1 +_SUMMARY_LEVEL_L2 = 2 +_SUMMARY_LEVEL_L3 = 3 + @dataclass class MemoryFileRecord: @@ -90,6 +95,16 @@ def write_memory_file( summary_kind: str | None = None, doc_id: str | None = None, source_id: str | None = None, + # Hierarchical summary fields + level: int | None = None, + is_final: bool | None = None, + chunk_index: int | None = None, + parent_group: int | None = None, + group_index: int | None = None, + input_tokens: int | None = None, + output_tokens: int | None = None, + compression_ratio: float | None = None, + summary_level_name: str | None = None, ) -> MemoryFileRecord: """Render and persist a memory document to disk.""" entries_dir, _ = ensure_store_dirs(root) @@ -98,7 +113,18 @@ def write_memory_file( safe_ts = _safe_timestamp(created_at) # Route by role/category for readability - if summary_kind: + if summary_kind and level is not None: + # Hierarchical summary file 
structure + if level == _SUMMARY_LEVEL_L1: + subdir = Path("summaries") / "L1" + filename = f"chunk_{chunk_index or 0}.md" + elif level == _SUMMARY_LEVEL_L2: + subdir = Path("summaries") / "L2" + filename = f"group_{group_index or 0}.md" + else: # level == _SUMMARY_LEVEL_L3 + subdir = Path("summaries") / "L3" + filename = "final.md" + elif summary_kind: subdir = Path("summaries") filename = "summary.md" elif role == "user": @@ -120,6 +146,15 @@ def write_memory_file( created_at=created_at, summary_kind=summary_kind, source_id=source_id, + level=level, + is_final=is_final, + chunk_index=chunk_index, + parent_group=parent_group, + group_index=group_index, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=compression_ratio, + summary_level_name=summary_level_name, ) front_matter = _render_front_matter(doc_id, metadata) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 46175355..3ce9385e 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -17,7 +17,12 @@ from pydantic_ai.settings import ModelSettings from agent_cli.memory._git import commit_changes -from agent_cli.memory._persistence import delete_memory_files, persist_entries, persist_summary +from agent_cli.memory._persistence import ( + delete_memory_files, + persist_entries, + persist_hierarchical_summary, + persist_summary, +) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, FACT_SYSTEM_PROMPT, @@ -41,6 +46,8 @@ from chromadb import Collection + from agent_cli.summarizer import SummaryResult + LOGGER = logging.getLogger(__name__) _SUMMARY_ROLE = "summary" @@ -292,7 +299,12 @@ async def update_summary( model: str, max_tokens: int = 256, ) -> str | None: - """Update the conversation summary based on new facts.""" + """Update the conversation summary based on new facts. + + This is the simple Mem0-style rolling summary that incrementally + updates based on new facts. For full content adaptive summarization, + use `summarize_content` instead. + """ if not new_facts: return prior_summary system_prompt = SUMMARY_PROMPT @@ -312,6 +324,82 @@ async def update_summary( return result.output.summary or prior_summary +async def summarize_content( + *, + content: str, + prior_summary: str | None = None, + content_type: str = "general", + openai_base_url: str, + api_key: str | None, + model: str, +) -> SummaryResult: + """Adaptively summarize content based on its length. + + Uses the AdaptiveSummarizer to automatically select the appropriate + summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) + based on input token count. + + Args: + content: The content to summarize. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + openai_base_url: Base URL for OpenAI-compatible API. + api_key: API key for the LLM. + model: Model name to use for summarization. + + Returns: + SummaryResult with the summary and metadata. 
+ + """ + # Import here to avoid circular imports and allow optional dependency + from agent_cli.summarizer import AdaptiveSummarizer # noqa: PLC0415 + + summarizer = AdaptiveSummarizer( + openai_base_url=openai_base_url, + model=model, + api_key=api_key, + ) + return await summarizer.summarize( + content=content, + prior_summary=prior_summary, + content_type=content_type, + ) + + +async def store_adaptive_summary( + collection: Collection, + memory_root: Path, + conversation_id: str, + summary_result: SummaryResult, +) -> list[str]: + """Store an adaptive summary result to files and ChromaDB. + + This stores all levels of a hierarchical summary (L1, L2, L3) or + just the final summary for simpler levels. Old summaries are deleted first. + + Files are stored as Markdown with YAML front matter in a hierarchical structure: + - summaries/L1/chunk_{n}.md - L1 chunk summaries + - summaries/L2/group_{n}.md - L2 group summaries + - summaries/L3/final.md - L3 final summary + + Args: + collection: ChromaDB collection. + memory_root: Root path for memory files. + conversation_id: The conversation this summary belongs to. + summary_result: The result from AdaptiveSummarizer.summarize(). + + Returns: + List of IDs that were stored. + + """ + return persist_hierarchical_summary( + collection, + memory_root=memory_root, + conversation_id=conversation_id, + summary_result=summary_result, + ) + + async def extract_and_store_facts_and_summaries( *, collection: Collection, diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index bd8f4dfd..9c38f731 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -3,10 +3,13 @@ from __future__ import annotations import logging +import shutil +from datetime import UTC, datetime from typing import TYPE_CHECKING from agent_cli.memory._files import ( _DELETED_DIRNAME, + _slugify, ensure_store_dirs, load_snapshot, read_memory_file, @@ -14,7 +17,13 @@ write_memory_file, write_snapshot, ) -from agent_cli.memory._store import delete_entries, list_conversation_entries, upsert_memories +from agent_cli.memory._store import ( + delete_entries, + delete_summaries, + list_conversation_entries, + upsert_hierarchical_summary, + upsert_memories, +) from agent_cli.memory.entities import Fact, Summary, Turn if TYPE_CHECKING: @@ -23,6 +32,7 @@ from chromadb import Collection from agent_cli.memory.models import MemoryMetadata + from agent_cli.summarizer import SummaryResult LOGGER = logging.getLogger(__name__) @@ -180,3 +190,96 @@ def evict_if_needed( ids_to_remove = [e.id for e in overflow] delete_entries(collection, ids_to_remove) delete_memory_files(memory_root, conversation_id, ids_to_remove) + + +def persist_hierarchical_summary( + collection: Collection, + *, + memory_root: Path, + conversation_id: str, + summary_result: SummaryResult, +) -> list[str]: + """Persist a hierarchical summary to disk and ChromaDB. + + This function: + 1. Deletes existing summaries (files and ChromaDB entries) + 2. Writes new summary files to disk in hierarchical structure + 3. Stores entries in ChromaDB + + Args: + collection: ChromaDB collection. + memory_root: Root path for memory files. + conversation_id: The conversation this summary belongs to. + summary_result: The result from AdaptiveSummarizer.summarize(). + + Returns: + List of IDs that were stored. 
+ + """ + from agent_cli.summarizer import SummaryLevel # noqa: PLC0415 + + # Skip if no summary needed + if summary_result.level == SummaryLevel.NONE: + return [] + + # Delete existing summary files + _delete_summary_files(memory_root, conversation_id) + + # Delete existing ChromaDB entries + delete_summaries(collection, conversation_id) + + # Get storage metadata from SummaryResult + entries = summary_result.to_storage_metadata(conversation_id) + if not entries: + return [] + + stored_ids: list[str] = [] + created_at = datetime.now(UTC).isoformat() + + for entry in entries: + meta = entry["metadata"] + record = write_memory_file( + memory_root, + conversation_id=meta["conversation_id"], + role=meta["role"], + created_at=meta.get("created_at", created_at), + content=entry["content"], + summary_kind="summary", + doc_id=entry["id"], + level=meta.get("level"), + is_final=meta.get("is_final"), + chunk_index=meta.get("chunk_index"), + parent_group=meta.get("parent_group"), + group_index=meta.get("group_index"), + input_tokens=meta.get("input_tokens"), + output_tokens=meta.get("output_tokens"), + compression_ratio=meta.get("compression_ratio"), + summary_level_name=meta.get("summary_level"), + ) + LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level")) + stored_ids.append(record.id) + + # Store in ChromaDB + upsert_hierarchical_summary(collection, conversation_id, summary_result) + + return stored_ids + + +def _delete_summary_files(memory_root: Path, conversation_id: str) -> None: + """Delete all summary files for a conversation.""" + entries_dir, _ = ensure_store_dirs(memory_root) + safe_conversation = _slugify(conversation_id) + summaries_dir = entries_dir / safe_conversation / "summaries" + + if summaries_dir.exists(): + # Move to deleted folder instead of hard delete + deleted_dir = entries_dir / _DELETED_DIRNAME / safe_conversation / "summaries" + deleted_dir.parent.mkdir(parents=True, exist_ok=True) + + # If deleted summaries already exist, remove them first + if deleted_dir.exists(): + shutil.rmtree(deleted_dir) + + # Move current summaries to deleted + shutil.move(str(summaries_dir), str(deleted_dir)) + LOGGER.info("Moved old summaries to deleted: %s", deleted_dir) diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 96e7c66a..4f3755b1 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -167,3 +167,154 @@ def list_conversation_entries( def delete_entries(collection: Collection, ids: list[str]) -> None: """Delete entries by ID.""" delete_docs(collection, ids) + + +def upsert_hierarchical_summary( + collection: Collection, + conversation_id: str, + summary_result: Any, +) -> list[str]: + """Store all levels of a hierarchical summary. + + Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries + for L1 (chunk), L2 (group), and L3 (final) summaries. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation this summary belongs to. + summary_result: A SummaryResult from the adaptive summarizer. + + Returns: + List of IDs that were upserted. 
+ + """ + entries = summary_result.to_storage_metadata(conversation_id) + if not entries: + return [] + + ids: list[str] = [] + contents: list[str] = [] + metadatas: list[MemoryMetadata] = [] + + for entry in entries: + ids.append(entry["id"]) + contents.append(entry["content"]) + # Convert the raw metadata dict to MemoryMetadata + meta_dict = entry["metadata"] + metadatas.append( + MemoryMetadata( + conversation_id=meta_dict["conversation_id"], + role=meta_dict["role"], + created_at=meta_dict["created_at"], + level=meta_dict.get("level"), + is_final=meta_dict.get("is_final"), + chunk_index=meta_dict.get("chunk_index"), + parent_group=meta_dict.get("parent_group"), + group_index=meta_dict.get("group_index"), + input_tokens=meta_dict.get("input_tokens"), + output_tokens=meta_dict.get("output_tokens"), + compression_ratio=meta_dict.get("compression_ratio"), + summary_level_name=meta_dict.get("summary_level"), + ), + ) + + upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas) + return ids + + +def get_summary_at_level( + collection: Collection, + conversation_id: str, + level: int, +) -> list[StoredMemory]: + """Retrieve summaries at a specific level for a conversation. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to retrieve summaries for. + level: Summary level (1=chunk, 2=group, 3=final). + + Returns: + List of StoredMemory entries at the requested level. + + """ + filters: list[dict[str, Any]] = [ + {"conversation_id": conversation_id}, + {"role": "summary"}, + {"level": level}, + ] + result = collection.get(where={"$and": filters}) + docs = result.get("documents") or [] + metas = result.get("metadatas") or [] + ids = result.get("ids") or [] + + records: list[StoredMemory] = [] + for doc, meta, entry_id in zip(docs, metas, ids, strict=False): + records.append( + StoredMemory( + id=entry_id, + content=doc, + metadata=MemoryMetadata(**dict(meta)), + distance=None, + ), + ) + return records + + +def get_final_summary( + collection: Collection, + conversation_id: str, +) -> StoredMemory | None: + """Get the L3 (final) summary for a conversation. + + This is a convenience wrapper around get_summary_at_level for the + most common use case of retrieving the top-level summary. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to retrieve the summary for. + + Returns: + The final summary entry, or None if not found. + + """ + summaries = get_summary_at_level(collection, conversation_id, level=3) + # Return the one marked as final, or the first if none marked + for summary in summaries: + if summary.metadata.is_final: + return summary + return summaries[0] if summaries else None + + +def delete_summaries( + collection: Collection, + conversation_id: str, + *, + levels: list[int] | None = None, +) -> int: + """Delete summary entries for a conversation. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation to delete summaries from. + levels: Optional list of levels to delete. If None, deletes all levels. + + Returns: + Number of entries deleted. 
+ + """ + filters: list[dict[str, Any]] = [ + {"conversation_id": conversation_id}, + {"role": "summary"}, + ] + if levels: + filters.append({"level": {"$in": levels}}) + + # First get the IDs to count them + result = collection.get(where={"$and": filters}) + ids = result.get("ids") or [] + + if ids: + delete_docs(collection, list(ids)) + + return len(ids) diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 9ef076d5..6dc689d8 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -49,6 +49,26 @@ class MemoryMetadata(BaseModel): replaced_by: str | None = None source_id: str | None = None + # Hierarchical summary fields (only used when role="summary") + level: int | None = None + """Summary level: 1=chunk, 2=group, 3=final.""" + is_final: bool | None = None + """Whether this is the final L3 summary.""" + chunk_index: int | None = None + """For L1 summaries: index of the source chunk.""" + parent_group: int | None = None + """For L1 summaries: which L2 group this chunk belongs to.""" + group_index: int | None = None + """For L2 summaries: index of this group.""" + input_tokens: int | None = None + """Number of tokens in the original input (L3 only).""" + output_tokens: int | None = None + """Number of tokens in the summary output (L3 only).""" + compression_ratio: float | None = None + """Ratio of output to input tokens (L3 only).""" + summary_level_name: str | None = None + """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" + class SummaryOutput(BaseModel): """Structured summary returned by the LLM.""" diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py new file mode 100644 index 00000000..c6f1d85a --- /dev/null +++ b/agent_cli/summarizer/__init__.py @@ -0,0 +1,31 @@ +"""Adaptive summarization module for variable-length content. + +This module provides research-grounded summarization that scales with input complexity, +inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, +compression ratios) architectures. + +Example: + from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel + + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + result = await summarizer.summarize(long_document) + print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") + +""" + +from agent_cli.summarizer.adaptive import AdaptiveSummarizer +from agent_cli.summarizer.models import ( + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) + +__all__ = [ + "AdaptiveSummarizer", + "HierarchicalSummary", + "SummaryLevel", + "SummaryResult", +] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py new file mode 100644 index 00000000..ed0074d8 --- /dev/null +++ b/agent_cli/summarizer/adaptive.py @@ -0,0 +1,502 @@ +"""Adaptive summarization that scales with input complexity. 
+ +This module implements research-grounded summarization inspired by: +- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing +- Mem0: Rolling summaries, 90%+ compression, two-phase architecture + +Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) +""" + +from __future__ import annotations + +import asyncio +import logging + +import httpx +from pydantic import BaseModel +from pydantic_ai import Agent +from pydantic_ai.models.openai import OpenAIChatModel +from pydantic_ai.providers.openai import OpenAIProvider +from pydantic_ai.settings import ModelSettings + +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) +from agent_cli.summarizer.prompts import ( + BRIEF_SUMMARY_PROMPT, + CHUNK_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + ROLLING_SUMMARY_PROMPT, + format_prior_context, + format_summaries_for_meta, + get_prompt_for_content_type, +) +from agent_cli.summarizer.utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + tokens_to_words, +) + +logger = logging.getLogger(__name__) + +# Thresholds for summary levels (in tokens) +LEVEL_THRESHOLDS = { + SummaryLevel.NONE: 100, + SummaryLevel.BRIEF: 500, + SummaryLevel.STANDARD: 3000, + SummaryLevel.DETAILED: 15000, + # HIERARCHICAL is everything above DETAILED +} + +# Number of L1 chunks to group together for L2 summaries +L2_GROUP_SIZE = 5 +# Minimum number of L1 chunks before L2 grouping is applied +L2_MIN_CHUNKS = 5 + + +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class AdaptiveSummarizer: + """Adaptive summarization that scales with input complexity. + + Automatically selects the appropriate summarization strategy based on + input length: + - NONE (< 100 tokens): No summary needed + - BRIEF (100-500 tokens): Single sentence + - STANDARD (500-3000 tokens): Paragraph summary + - DETAILED (3000-15000 tokens): Chunked + meta-summary + - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries + + Example: + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarizer.summarize(long_document) + print(f"Level: {result.level.name}") + print(f"Summary: {result.summary}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + def __init__( + self, + openai_base_url: str, + model: str, + api_key: str | None = None, + chunk_size: int = 3000, + chunk_overlap: int = 200, + max_concurrent_chunks: int = 5, + timeout: float = 60.0, + ) -> None: + """Initialize the adaptive summarizer. + + Args: + openai_base_url: Base URL for OpenAI-compatible API. + model: Model name to use for summarization. + api_key: API key (optional for local models). + chunk_size: Target token count per chunk for hierarchical summarization. + chunk_overlap: Token overlap between chunks. + max_concurrent_chunks: Maximum parallel chunk summarizations. + timeout: Request timeout in seconds. + + """ + self.openai_base_url = openai_base_url.rstrip("/") + self.model = model + self.api_key = api_key or "not-needed" + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.max_concurrent_chunks = max_concurrent_chunks + self.timeout = timeout + + self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url) + + def determine_level(self, token_count: int) -> SummaryLevel: + """Determine the appropriate summary level based on token count. 
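+
+        As a rough illustration (using the default LEVEL_THRESHOLDS above):
+        80 tokens -> NONE, 300 -> BRIEF, 1200 -> STANDARD, 8000 -> DETAILED,
+        and 15000 or more -> HIERARCHICAL.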
+ + Args: + token_count: Number of tokens in the input. + + Returns: + The recommended SummaryLevel. + + """ + if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + return SummaryLevel.NONE + if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + return SummaryLevel.BRIEF + if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: + return SummaryLevel.STANDARD + if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: + return SummaryLevel.DETAILED + return SummaryLevel.HIERARCHICAL + + async def summarize( + self, + content: str, + prior_summary: str | None = None, + content_type: str = "general", + ) -> SummaryResult: + """Summarize content with adaptive strategy based on length. + + Args: + content: The content to summarize. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + + Returns: + SummaryResult with summary and metadata. + + """ + if not content or not content.strip(): + return SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, + ) + + input_tokens = count_tokens(content, self.model) + level = self.determine_level(input_tokens) + + logger.info( + "Summarizing %d tokens at level %s (type=%s)", + input_tokens, + level.name, + content_type, + ) + + if level == SummaryLevel.NONE: + return SummaryResult( + level=level, + summary=None, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=0, + compression_ratio=0.0, + ) + + if level == SummaryLevel.BRIEF: + summary = await self._brief_summary(content) + elif level == SummaryLevel.STANDARD: + summary = await self._standard_summary(content, prior_summary, content_type) + elif level == SummaryLevel.DETAILED: + return await self._detailed_summary(content, input_tokens) + else: # HIERARCHICAL + return await self._hierarchical_summary(content, input_tokens) + + output_tokens = count_tokens(summary, self.model) if summary else 0 + compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + + return SummaryResult( + level=level, + summary=summary, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=compression_ratio, + ) + + async def update_rolling_summary( + self, + prior_summary: str | None, + new_facts: list[str], + ) -> str: + """Update a rolling summary with new facts (Mem0-style). + + This is optimized for incremental updates where you have discrete + new facts to integrate into an existing summary. + + Args: + prior_summary: The existing summary to update. + new_facts: List of new facts to integrate. + + Returns: + Updated summary string. 
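+
+        Example (illustrative; assumes an AdaptiveSummarizer instance named
+        summarizer and a running event loop):
+
+            updated = await summarizer.update_rolling_summary(
+                prior_summary="The user lives in Amsterdam.",
+                new_facts=["The user started a new job in November."],
+            )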
+ + """ + if not new_facts: + return prior_summary or "" + + new_content = "\n".join(f"- {fact}" for fact in new_facts) + combined_tokens = count_tokens( + (prior_summary or "") + new_content, + self.model, + ) + + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = ROLLING_SUMMARY_PROMPT.format( + prior_summary=prior_summary or "(No prior summary)", + new_content=new_content, + max_words=max_words, + ) + + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + async def _brief_summary(self, content: str) -> str: + """Generate a single-sentence summary for brief content.""" + prompt = BRIEF_SUMMARY_PROMPT.format(content=content) + return await self._generate_summary(prompt, max_tokens=50) + + async def _standard_summary( + self, + content: str, + prior_summary: str | None, + content_type: str, + ) -> str: + """Generate a paragraph summary for standard-length content.""" + input_tokens = count_tokens(content, self.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) + + prompt = prompt_template.format( + content=content, + prior_context=prior_context, + max_words=max_words, + ) + + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult: + """Generate chunked summaries with meta-summary for detailed content.""" + chunks = chunk_text( + content, + chunk_size=self.chunk_size, + overlap=self.chunk_overlap, + model=self.model, + ) + + logger.info("Detailed summary: processing %d chunks", len(chunks)) + + # Summarize chunks (with concurrency limit) + semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + + async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + chunk_tokens = count_tokens(chunk, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=idx + 1, + total_chunks=len(chunks), + content=chunk, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + return ChunkSummary( + chunk_index=idx, + content=summary, + token_count=summary_tokens, + source_tokens=chunk_tokens, + parent_group=None, + ) + + chunk_summaries = await asyncio.gather( + *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # Generate meta-summary + all_summaries = [cs.content for cs in chunk_summaries] + meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) + max_words = tokens_to_words(meta_target) + + meta_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(all_summaries), + max_words=max_words, + ) + + final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100) + output_tokens = count_tokens(final_summary, self.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(chunk_summaries), + l2_summaries=[], # Not used for DETAILED level + l3_summary=final_summary, + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.DETAILED, + summary=final_summary, + 
hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult: + """Build a tree of summaries for very long content. + + Structure: + - L1: Individual chunk summaries + - L2: Group summaries (groups of ~5 L1 summaries) + - L3: Final synthesis + """ + chunks = chunk_text( + content, + chunk_size=self.chunk_size, + overlap=self.chunk_overlap, + model=self.model, + ) + + logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + + # L1: Summarize each chunk + semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + + async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + chunk_tokens = count_tokens(chunk, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=idx + 1, + total_chunks=len(chunks), + content=chunk, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + # Assign to group (5 chunks per group) + group_idx = idx // 5 + + return ChunkSummary( + chunk_index=idx, + content=summary, + token_count=summary_tokens, + source_tokens=chunk_tokens, + parent_group=group_idx, + ) + + l1_summaries = await asyncio.gather( + *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) + l2_summaries: list[str] = [] + if len(l1_summaries) > L2_MIN_CHUNKS: + groups: list[list[str]] = [] + for i in range(0, len(l1_summaries), L2_GROUP_SIZE): + group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] + groups.append(group) + + async def summarize_group(group: list[str]) -> str: + combined_tokens = sum(count_tokens(s, self.model) for s in group) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(group), + max_words=max_words, + ) + return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + + l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) + + # L3: Final synthesis + summaries_to_synthesize = ( + l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] + ) + final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) + max_words = tokens_to_words(final_target) + + final_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(summaries_to_synthesize), + max_words=max_words, + ) + + final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100) + output_tokens = count_tokens(final_summary, self.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(l1_summaries), + l2_summaries=list(l2_summaries), + l3_summary=final_summary, + chunk_size=self.chunk_size, + chunk_overlap=self.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: + """Generate a 
summary using the LLM. + + Uses PydanticAI for structured output with fallback to raw generation. + """ + model = OpenAIChatModel( + model_name=self.model, + provider=self._provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + logger.warning("Structured summary failed, trying raw generation: %s", e) + # Fallback to raw HTTP call + return await self._raw_generate(prompt, max_tokens) + + async def _raw_generate(self, prompt: str, max_tokens: int) -> str: + """Fallback raw HTTP generation without structured output.""" + async with httpx.AsyncClient(timeout=self.timeout) as client: + response = await client.post( + f"{self.openai_base_url}/chat/completions", + headers={"Authorization": f"Bearer {self.api_key}"}, + json={ + "model": self.model, + "messages": [ + {"role": "system", "content": "You are a concise summarizer."}, + {"role": "user", "content": prompt}, + ], + "temperature": 0.3, + "max_tokens": max_tokens, + }, + ) + response.raise_for_status() + data = response.json() + + choices = data.get("choices", []) + if choices: + return choices[0].get("message", {}).get("content", "").strip() + return "" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py new file mode 100644 index 00000000..f231a41e --- /dev/null +++ b/agent_cli/summarizer/models.py @@ -0,0 +1,220 @@ +"""Data models for adaptive summarization.""" + +from __future__ import annotations + +from datetime import datetime +from enum import IntEnum +from typing import Any + +from pydantic import BaseModel, Field + +# Hierarchical level constants for storage +HIERARCHICAL_LEVEL_L1 = 1 +HIERARCHICAL_LEVEL_L2 = 2 +HIERARCHICAL_LEVEL_L3 = 3 + + +class SummaryLevel(IntEnum): + """Summary granularity levels based on input complexity. + + Thresholds are based on Mem0 research showing optimal compression ratios + at different content lengths. Token counts are approximate guidelines. + """ + + NONE = 0 + """< 100 tokens: No summary needed, facts only.""" + + BRIEF = 1 + """100-500 tokens: Single-sentence summary (~20% compression).""" + + STANDARD = 2 + """500-3000 tokens: Paragraph summary (~12% compression).""" + + DETAILED = 3 + """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression).""" + + HIERARCHICAL = 4 + """> 15000 tokens: Tree of summaries with multiple levels.""" + + +class ChunkSummary(BaseModel): + """Summary of a single chunk within a hierarchical summary.""" + + chunk_index: int = Field(..., description="Index of this chunk in the original content") + content: str = Field(..., description="The summarized content of this chunk") + token_count: int = Field(..., ge=0, description="Token count of this summary") + source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") + parent_group: int | None = Field( + default=None, + description="Index of the L2 group this chunk belongs to", + ) + + +class HierarchicalSummary(BaseModel): + """A hierarchical summary with multiple levels. 
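+
+    As an illustration of the level breakdown described below, a 30-chunk input
+    (with the default group size of 5) yields 30 L1 chunk summaries, 6 L2 group
+    summaries, and a single L3 synthesis.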
+ + Structure inspired by Letta's partial eviction pattern: + - L1: Individual chunk summaries (parallel processing) + - L2: Group summaries (groups of ~5 L1 summaries) + - L3: Final synthesis (single top-level summary) + """ + + l1_summaries: list[ChunkSummary] = Field( + default_factory=list, + description="Level 1: Individual chunk summaries", + ) + l2_summaries: list[str] = Field( + default_factory=list, + description="Level 2: Group summaries (if > 5 chunks)", + ) + l3_summary: str = Field( + ..., + description="Level 3: Final synthesized summary", + ) + chunk_size: int = Field( + default=3000, + description="Token size used for chunking", + ) + chunk_overlap: int = Field( + default=200, + description="Token overlap between chunks", + ) + + def get_summary_at_level(self, level: int) -> str | list[str]: + """Get summary content at a specific level. + + Args: + level: 1 for chunk summaries, 2 for group summaries, 3 for final. + + Returns: + Summary content at the requested level. + + """ + if level == HIERARCHICAL_LEVEL_L1: + return [cs.content for cs in self.l1_summaries] + if level == HIERARCHICAL_LEVEL_L2: + return self.l2_summaries if self.l2_summaries else [self.l3_summary] + return self.l3_summary + + +class SummaryResult(BaseModel): + """Result of adaptive summarization. + + Contains the summary at the appropriate level for the input complexity, + along with metadata about the compression achieved. + """ + + level: SummaryLevel = Field(..., description="The summarization level used") + summary: str | None = Field( + default=None, + description="The final summary text (None for NONE level)", + ) + hierarchical: HierarchicalSummary | None = Field( + default=None, + description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)", + ) + input_tokens: int = Field(..., ge=0, description="Token count of the input content") + output_tokens: int = Field(..., ge=0, description="Token count of the summary") + compression_ratio: float = Field( + ..., + ge=0.0, + le=1.0, + description="Ratio of output to input tokens (lower = more compression)", + ) + created_at: datetime = Field( + default_factory=datetime.utcnow, + description="Timestamp when summary was created", + ) + + @property + def chunk_summaries(self) -> list[str] | None: + """Get L1 chunk summaries if available.""" + if self.hierarchical: + return [cs.content for cs in self.hierarchical.l1_summaries] + return None + + def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: + """Convert to metadata entries for ChromaDB storage. + + Returns a list of metadata dicts, one for each summary level stored. 
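+
+        Each entry is shaped like (illustrative values):
+            {"id": "<conversation_id>:summary:L3:final",
+             "content": "...",
+             "metadata": {"role": "summary", "level": 3, ...}}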
+ """ + entries: list[dict[str, Any]] = [] + timestamp = self.created_at.isoformat() + + if self.level == SummaryLevel.NONE: + return entries + + # For hierarchical summaries, store each level + if self.hierarchical: + # L1: Individual chunk summaries + entries.extend( + { + "id": f"{conversation_id}:summary:L1:{cs.chunk_index}", + "content": cs.content, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "level": HIERARCHICAL_LEVEL_L1, + "chunk_index": cs.chunk_index, + "parent_group": cs.parent_group, + "token_count": cs.token_count, + "created_at": timestamp, + }, + } + for cs in self.hierarchical.l1_summaries + ) + + # L2: Group summaries + entries.extend( + { + "id": f"{conversation_id}:summary:L2:{idx}", + "content": l2_summary, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "level": HIERARCHICAL_LEVEL_L2, + "group_index": idx, + "created_at": timestamp, + }, + } + for idx, l2_summary in enumerate(self.hierarchical.l2_summaries) + ) + + # L3: Final summary + entries.append( + { + "id": f"{conversation_id}:summary:L3:final", + "content": self.hierarchical.l3_summary, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "level": HIERARCHICAL_LEVEL_L3, + "is_final": True, + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "compression_ratio": self.compression_ratio, + "created_at": timestamp, + }, + }, + ) + elif self.summary: + # Non-hierarchical: just store the single summary + entries.append( + { + "id": f"{conversation_id}:summary:L3:final", + "content": self.summary, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "level": HIERARCHICAL_LEVEL_L3, + "is_final": True, + "summary_level": self.level.name, + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "compression_ratio": self.compression_ratio, + "created_at": timestamp, + }, + }, + ) + + return entries diff --git a/agent_cli/summarizer/prompts.py b/agent_cli/summarizer/prompts.py new file mode 100644 index 00000000..101422b7 --- /dev/null +++ b/agent_cli/summarizer/prompts.py @@ -0,0 +1,135 @@ +"""Prompt templates for adaptive summarization. + +These prompts are designed to work with various LLM sizes (8B-20B parameters) +and are optimized for structured, factual output. +""" + +# Level 1: BRIEF - Single sentence summary +BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). +Focus on the single most important point or takeaway. + +Content: +{content} + +One-sentence summary:""".strip() + +# Level 2: STANDARD - Paragraph summary +STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. + +Focus on: +- Key facts, decisions, and outcomes +- Important context that should be remembered +- Skip transient details, greetings, and chitchat + +{prior_context} + +Content to summarize: +{content} + +Summary (maximum {max_words} words):""".strip() + +# Level 3: DETAILED - Used for individual chunks in hierarchical summarization +CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document. +Capture the main points while preserving important details. + +Section {chunk_index} of {total_chunks}: +{content} + +Summary of this section (maximum {max_words} words):""".strip() + +# Level 4: META - Combine multiple summaries into one +META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview. +Identify common themes and key points across all sections. 
+Eliminate redundancy while preserving unique insights. + +Summaries to combine: +{summaries} + +Combined summary (maximum {max_words} words):""".strip() + +# Rolling summary update (Mem0-style) +ROLLING_SUMMARY_PROMPT = """Update the running summary with new information. +Integrate new facts seamlessly while keeping the summary concise. +Drop redundant or superseded information. +Preserve durable facts about identity, preferences, and important events. + +Current summary: +{prior_summary} + +New information to integrate: +{new_content} + +Updated summary (maximum {max_words} words):""".strip() + +# For conversation-specific summarization +CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective. +Focus on: +- What the user wanted or asked about +- Key information the user shared about themselves +- Decisions made or conclusions reached +- Any commitments or follow-ups mentioned + +Conversation: +{content} + +Summary (maximum {max_words} words):""".strip() + +# For journal/personal content +JOURNAL_SUMMARY_PROMPT = """Summarize this personal entry or reflection. +Preserve: +- Key events and experiences mentioned +- Emotions and insights expressed +- Goals, plans, or intentions stated +- People, places, or things that are important + +Entry: +{content} + +Summary (maximum {max_words} words):""".strip() + +# For technical/document content +DOCUMENT_SUMMARY_PROMPT = """Summarize this technical content or documentation. +Focus on: +- Main concepts and their relationships +- Key procedures or processes described +- Important specifications or requirements +- Conclusions or recommendations + +Document: +{content} + +Summary (maximum {max_words} words):""".strip() + + +def get_prompt_for_content_type(content_type: str) -> str: + """Get the appropriate prompt template for a content type. + + Args: + content_type: One of "general", "conversation", "journal", "document". + + Returns: + The prompt template string. + + """ + prompts = { + "general": STANDARD_SUMMARY_PROMPT, + "conversation": CONVERSATION_SUMMARY_PROMPT, + "journal": JOURNAL_SUMMARY_PROMPT, + "document": DOCUMENT_SUMMARY_PROMPT, + } + return prompts.get(content_type, STANDARD_SUMMARY_PROMPT) + + +def format_prior_context(prior_summary: str | None) -> str: + """Format prior summary context for inclusion in prompts.""" + if prior_summary: + return f"Prior context (for continuity):\n{prior_summary}\n" + return "" + + +def format_summaries_for_meta(summaries: list[str]) -> str: + """Format a list of summaries for the meta-summary prompt.""" + formatted = [] + for i, summary in enumerate(summaries, 1): + formatted.append(f"[Section {i}]\n{summary}") + return "\n\n".join(formatted) diff --git a/agent_cli/summarizer/utils.py b/agent_cli/summarizer/utils.py new file mode 100644 index 00000000..bc319f5b --- /dev/null +++ b/agent_cli/summarizer/utils.py @@ -0,0 +1,258 @@ +"""Utility functions for adaptive summarization.""" + +from __future__ import annotations + +import re +from functools import lru_cache +from typing import TYPE_CHECKING + +from agent_cli.summarizer.models import SummaryLevel + +if TYPE_CHECKING: + import tiktoken + + +@lru_cache(maxsize=4) +def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding: + """Get tiktoken encoding for a model, with caching. + + Falls back to cl100k_base for unknown models (covers most modern LLMs). 
+ """ + import tiktoken # noqa: PLC0415 + + try: + return tiktoken.encoding_for_model(model) + except KeyError: + return tiktoken.get_encoding("cl100k_base") + + +def count_tokens(text: str, model: str = "gpt-4") -> int: + """Count tokens in text using tiktoken. + + Args: + text: The text to count tokens for. + model: Model name for tokenizer selection. + + Returns: + Number of tokens in the text. + + """ + if not text: + return 0 + enc = _get_encoding(model) + return len(enc.encode(text)) + + +def chunk_text( + text: str, + chunk_size: int = 3000, + overlap: int = 200, + model: str = "gpt-4", +) -> list[str]: + """Split text into overlapping chunks by token count. + + Uses semantic boundaries (paragraphs, sentences) when possible to avoid + splitting mid-thought. Falls back to token-based splitting if no good + boundaries are found. + + Args: + text: The text to chunk. + chunk_size: Target token count per chunk. + overlap: Token overlap between chunks for context continuity. + model: Model name for tokenizer. + + Returns: + List of text chunks. + + """ + if not text: + return [] + + total_tokens = count_tokens(text, model) + if total_tokens <= chunk_size: + return [text] + + # Split into paragraphs first + paragraphs = re.split(r"\n\s*\n", text) + paragraphs = [p.strip() for p in paragraphs if p.strip()] + + if not paragraphs: + return [text] + + chunks: list[str] = [] + current_chunk: list[str] = [] + current_tokens = 0 + + for para in paragraphs: + para_tokens = count_tokens(para, model) + + # If single paragraph exceeds chunk size, split it further + if para_tokens > chunk_size: + # Flush current chunk if any + if current_chunk: + chunks.append("\n\n".join(current_chunk)) + current_chunk = [] + current_tokens = 0 + + # Split large paragraph by sentences + sentences = _split_sentences(para) + for sentence in sentences: + sent_tokens = count_tokens(sentence, model) + if current_tokens + sent_tokens > chunk_size and current_chunk: + chunks.append(" ".join(current_chunk)) + # Keep overlap from end of previous chunk + overlap_text = _get_overlap_text(current_chunk, overlap, model) + current_chunk = [overlap_text] if overlap_text else [] + current_tokens = count_tokens(overlap_text, model) if overlap_text else 0 + current_chunk.append(sentence) + current_tokens += sent_tokens + elif current_tokens + para_tokens > chunk_size: + # Flush current chunk and start new one + chunks.append("\n\n".join(current_chunk)) + # Keep overlap from end of previous chunk + overlap_text = _get_overlap_text(current_chunk, overlap, model) + current_chunk = [overlap_text, para] if overlap_text else [para] + current_tokens = ( + count_tokens(overlap_text, model) + para_tokens if overlap_text else para_tokens + ) + else: + current_chunk.append(para) + current_tokens += para_tokens + + # Don't forget the last chunk + if current_chunk: + chunks.append("\n\n".join(current_chunk)) + + return chunks + + +def _split_sentences(text: str) -> list[str]: + """Split text into sentences, preserving common abbreviations.""" + # Simple sentence splitting that handles common cases + # Matches period/question/exclamation followed by space and capital letter + sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text) + return [s.strip() for s in sentences if s.strip()] + + +def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: + """Extract overlap text from end of chunk list. + + Takes text from the end of the chunk list until reaching target_tokens. 
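+
+    Collection runs backwards from the final chunk: whole chunks are taken while
+    they fit the budget, then at most a partial tail of the next-oldest chunk,
+    so the overlap always favors the most recent text.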
+ """ + if not chunks or target_tokens <= 0: + return "" + + # Work backwards through chunks + overlap_parts: list[str] = [] + tokens_collected = 0 + + for chunk in reversed(chunks): + chunk_tokens = count_tokens(chunk, model) + if tokens_collected + chunk_tokens <= target_tokens: + overlap_parts.insert(0, chunk) + tokens_collected += chunk_tokens + else: + # Take partial chunk if needed + words = chunk.split() + partial: list[str] = [] + for word in reversed(words): + word_tokens = count_tokens(word, model) + if tokens_collected + word_tokens <= target_tokens: + partial.insert(0, word) + tokens_collected += word_tokens + else: + break + if partial: + overlap_parts.insert(0, " ".join(partial)) + break + + return " ".join(overlap_parts) + + +def middle_truncate( + text: str, + budget_chars: int, + head_frac: float = 0.3, + tail_frac: float = 0.3, +) -> tuple[str, int]: + """Middle-truncate text to fit within a character budget. + + Keeps the first head_frac and last tail_frac portions, dropping the middle. + This preserves context from both the beginning (often contains setup) and + end (often contains conclusions/recent events). + + Inspired by Letta's `middle_truncate_text` function. + + Args: + text: Text to truncate. + budget_chars: Maximum character count for output. + head_frac: Fraction of budget for the head portion. + tail_frac: Fraction of budget for the tail portion. + + Returns: + Tuple of (truncated_text, dropped_char_count). + + """ + if budget_chars <= 0 or len(text) <= budget_chars: + return text, 0 + + head_len = max(0, int(budget_chars * head_frac)) + tail_len = max(0, int(budget_chars * tail_frac)) + + # Ensure head + tail doesn't exceed budget + if head_len + tail_len > budget_chars: + tail_len = max(0, budget_chars - head_len) + + head = text[:head_len] + tail = text[-tail_len:] if tail_len > 0 else "" + dropped = max(0, len(text) - (len(head) + len(tail))) + + marker = f"\n[...{dropped} characters truncated...]\n" + + # If marker would overflow budget, shrink tail + available_for_marker = budget_chars - (len(head) + len(tail)) + if available_for_marker < len(marker): + over = len(marker) - available_for_marker + tail = tail[:-over] if over < len(tail) else "" + + return head + marker + tail, dropped + + +def estimate_summary_tokens(input_tokens: int, level: int) -> int: + """Estimate target summary tokens based on input size and level. + + Compression ratios based on Mem0 research: + - BRIEF: ~20% compression (80% reduction) + - STANDARD: ~12% compression (88% reduction) + - DETAILED: ~7% compression (93% reduction) + - HIERARCHICAL: Capped with diminishing returns + + Args: + input_tokens: Number of tokens in the input. + level: Summary level (1-4). + + Returns: + Target number of tokens for the summary. + + """ + if level == SummaryLevel.NONE: + return 0 + if level == SummaryLevel.BRIEF: + return min(50, max(20, input_tokens // 5)) + if level == SummaryLevel.STANDARD: + return min(200, max(50, input_tokens // 8)) + if level == SummaryLevel.DETAILED: + return min(500, max(100, input_tokens // 15)) + # HIERARCHICAL + # Base of 1000 tokens plus diminishing returns for additional content + base = 1000 + additional = max(0, (input_tokens - 15000) // 100) + return min(2000, base + additional) + + +def tokens_to_words(tokens: int) -> int: + """Convert token count to approximate word count. + + Rough approximation: 1 token ≈ 0.75 words for English text. 
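+    For example, a 200-token budget maps to roughly 150 words.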
+ """ + return int(tokens * 0.75) diff --git a/pyproject.toml b/pyproject.toml index 7a2798b6..6ea9763c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ memory = [ "watchfiles>=0.21.0", # Until here same as 'rag' extras "pyyaml>=6.0.0", + "tiktoken>=0.5.0", # For token counting in adaptive summarization ] test = [ "pytest>=7.0.0", @@ -60,6 +61,7 @@ test = [ "pydantic-ai-slim[openai]", "pytest-timeout", "pytest-mock", + "tiktoken>=0.5.0", # For summarizer tests ] dev = [ "agent-cli[test]", @@ -87,6 +89,7 @@ dev = [ "notebook", "audiostretchy>=1.3.0", "pre-commit-uv>=4.1.4", + "tiktoken>=0.5.0", # For summarizer tests ] [project.scripts] diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 98334e45..3edd0eeb 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -148,3 +148,229 @@ def test_upsert_and_delete_entries_delegate() -> None: _store.delete_entries(fake, ["x"]) assert fake.deleted == [["x"]] + + +# --- Hierarchical Summary Tests --- + + +class _MockSummaryResult: + """Mock SummaryResult for testing without importing the full summarizer module.""" + + def __init__(self, entries: list[dict[str, Any]]) -> None: + self._entries = entries + + def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]: + # Just return the pre-configured entries (ignores conversation_id) + return self._entries + + +def test_upsert_hierarchical_summary_simple() -> None: + """Test upserting a simple (non-hierarchical) summary.""" + fake = _FakeCollection() + entries = [ + { + "id": "conv-123:summary:L3:final", + "content": "A standard paragraph summary.", + "metadata": { + "conversation_id": "conv-123", + "role": "summary", + "level": 3, + "is_final": True, + "summary_level": "STANDARD", + "input_tokens": 1000, + "output_tokens": 50, + "compression_ratio": 0.05, + "created_at": "2024-01-01T00:00:00", + }, + }, + ] + mock_result = _MockSummaryResult(entries) + + ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result) + + assert ids == ["conv-123:summary:L3:final"] + assert len(fake.upserts) == 1 + upserted_ids, upserted_docs, upserted_metas = fake.upserts[0] + assert upserted_ids == ["conv-123:summary:L3:final"] + assert upserted_docs == ["A standard paragraph summary."] + assert upserted_metas[0]["level"] == 3 + assert upserted_metas[0]["is_final"] is True + + +def test_upsert_hierarchical_summary_with_chunks() -> None: + """Test upserting a hierarchical summary with L1 and L3 entries.""" + fake = _FakeCollection() + entries = [ + { + "id": "conv-456:summary:L1:0", + "content": "Chunk 0 summary", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 1, + "chunk_index": 0, + "parent_group": 0, + "created_at": "2024-01-01T00:00:00", + }, + }, + { + "id": "conv-456:summary:L1:1", + "content": "Chunk 1 summary", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 1, + "chunk_index": 1, + "parent_group": 0, + "created_at": "2024-01-01T00:00:00", + }, + }, + { + "id": "conv-456:summary:L3:final", + "content": "Final synthesis", + "metadata": { + "conversation_id": "conv-456", + "role": "summary", + "level": 3, + "is_final": True, + "input_tokens": 5000, + "output_tokens": 100, + "compression_ratio": 0.02, + "created_at": "2024-01-01T00:00:00", + }, + }, + ] + mock_result = _MockSummaryResult(entries) + + ids = _store.upsert_hierarchical_summary(fake, "conv-456", mock_result) + + assert len(ids) == 3 + assert "conv-456:summary:L1:0" in ids + assert 
"conv-456:summary:L1:1" in ids + assert "conv-456:summary:L3:final" in ids + + +def test_upsert_hierarchical_summary_empty() -> None: + """Test upserting when there are no entries (e.g., NONE level).""" + fake = _FakeCollection() + mock_result = _MockSummaryResult([]) + + ids = _store.upsert_hierarchical_summary(fake, "conv-789", mock_result) + + assert ids == [] + assert len(fake.upserts) == 0 + + +def test_get_summary_at_level() -> None: + """Test retrieving summaries at a specific level.""" + fake = _FakeCollection( + get_result={ + "documents": ["Chunk 0", "Chunk 1"], + "metadatas": [ + { + "conversation_id": "c1", + "role": "summary", + "level": 1, + "chunk_index": 0, + "created_at": "now", + }, + { + "conversation_id": "c1", + "role": "summary", + "level": 1, + "chunk_index": 1, + "created_at": "now", + }, + ], + "ids": ["c1:summary:L1:0", "c1:summary:L1:1"], + }, + ) + + records = _store.get_summary_at_level(fake, "c1", level=1) + + assert len(records) == 2 + assert records[0].metadata.level == 1 + assert records[0].metadata.chunk_index == 0 + assert records[1].metadata.chunk_index == 1 + + +def test_get_final_summary_returns_final() -> None: + """Test getting the L3 final summary.""" + fake = _FakeCollection( + get_result={ + "documents": ["The final summary"], + "metadatas": [ + { + "conversation_id": "c1", + "role": "summary", + "level": 3, + "is_final": True, + "created_at": "now", + }, + ], + "ids": ["c1:summary:L3:final"], + }, + ) + + result = _store.get_final_summary(fake, "c1") + + assert result is not None + assert result.content == "The final summary" + assert result.metadata.is_final is True + + +def test_get_final_summary_returns_none_when_missing() -> None: + """Test that get_final_summary returns None when no summary exists.""" + fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []}) + + result = _store.get_final_summary(fake, "c1") + + assert result is None + + +def test_delete_summaries_all_levels() -> None: + """Test deleting all summary levels for a conversation.""" + fake = _FakeCollection( + get_result={ + "documents": ["L1", "L3"], + "metadatas": [ + {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, + {"conversation_id": "c1", "role": "summary", "level": 3, "created_at": "now"}, + ], + "ids": ["c1:summary:L1:0", "c1:summary:L3:final"], + }, + ) + + deleted_count = _store.delete_summaries(fake, "c1") + + assert deleted_count == 2 + assert len(fake.deleted) == 1 + assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"} + + +def test_delete_summaries_specific_levels() -> None: + """Test deleting only specific summary levels.""" + fake = _FakeCollection( + get_result={ + "documents": ["L1 chunk"], + "metadatas": [ + {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, + ], + "ids": ["c1:summary:L1:0"], + }, + ) + + deleted_count = _store.delete_summaries(fake, "c1", levels=[1]) + + assert deleted_count == 1 + assert fake.deleted[0] == ["c1:summary:L1:0"] + + +def test_delete_summaries_no_entries() -> None: + """Test deleting when no summaries exist.""" + fake = _FakeCollection(get_result={"documents": [], "metadatas": [], "ids": []}) + + deleted_count = _store.delete_summaries(fake, "c1") + + assert deleted_count == 0 + assert len(fake.deleted) == 0 diff --git a/tests/summarizer/__init__.py b/tests/summarizer/__init__.py new file mode 100644 index 00000000..d6801b31 --- /dev/null +++ b/tests/summarizer/__init__.py @@ -0,0 +1 @@ +"""Tests for the adaptive summarizer 
module.""" diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py new file mode 100644 index 00000000..1f010999 --- /dev/null +++ b/tests/summarizer/test_adaptive.py @@ -0,0 +1,434 @@ +"""Unit tests for AdaptiveSummarizer.""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from agent_cli.summarizer.adaptive import ( + LEVEL_THRESHOLDS, + AdaptiveSummarizer, + SummaryOutput, +) +from agent_cli.summarizer.models import SummaryLevel, SummaryResult + + +class TestAdaptiveSummarizerInit: + """Tests for AdaptiveSummarizer initialization.""" + + def test_basic_init(self) -> None: + """Test basic initialization with required parameters.""" + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + assert summarizer.openai_base_url == "http://localhost:8000/v1" + assert summarizer.model == "llama3.1:8b" + assert summarizer.api_key == "not-needed" + + def test_init_with_api_key(self) -> None: + """Test initialization with custom API key.""" + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + api_key="sk-test-key", + ) + assert summarizer.api_key == "sk-test-key" + + def test_init_with_custom_settings(self) -> None: + """Test initialization with custom chunk settings.""" + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + chunk_size=5000, + chunk_overlap=300, + max_concurrent_chunks=10, + timeout=120.0, + ) + assert summarizer.chunk_size == 5000 + assert summarizer.chunk_overlap == 300 + assert summarizer.max_concurrent_chunks == 10 + assert summarizer.timeout == 120.0 + + def test_trailing_slash_stripped(self) -> None: + """Test that trailing slash is stripped from base URL.""" + summarizer = AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1/", + model="gpt-4", + ) + assert summarizer.openai_base_url == "http://localhost:8000/v1" + + +class TestDetermineLevel: + """Tests for level determination based on token count.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + """Test NONE level for very short content.""" + assert summarizer.determine_level(50) == SummaryLevel.NONE + assert summarizer.determine_level(99) == SummaryLevel.NONE + + def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + """Test BRIEF level for short content.""" + assert summarizer.determine_level(100) == SummaryLevel.BRIEF + assert summarizer.determine_level(300) == SummaryLevel.BRIEF + assert summarizer.determine_level(499) == SummaryLevel.BRIEF + + def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + """Test STANDARD level for medium content.""" + assert summarizer.determine_level(500) == SummaryLevel.STANDARD + assert summarizer.determine_level(1500) == SummaryLevel.STANDARD + assert summarizer.determine_level(2999) == SummaryLevel.STANDARD + + def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + """Test DETAILED level for longer content.""" + assert summarizer.determine_level(3000) == SummaryLevel.DETAILED + assert summarizer.determine_level(8000) == SummaryLevel.DETAILED + assert summarizer.determine_level(14999) == SummaryLevel.DETAILED + + def 
test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + """Test HIERARCHICAL level for very long content.""" + assert summarizer.determine_level(15000) == SummaryLevel.HIERARCHICAL + assert summarizer.determine_level(50000) == SummaryLevel.HIERARCHICAL + assert summarizer.determine_level(100000) == SummaryLevel.HIERARCHICAL + + def test_thresholds_match_constants(self) -> None: + """Verify thresholds match the module constants.""" + assert LEVEL_THRESHOLDS[SummaryLevel.NONE] == 100 + assert LEVEL_THRESHOLDS[SummaryLevel.BRIEF] == 500 + assert LEVEL_THRESHOLDS[SummaryLevel.STANDARD] == 3000 + assert LEVEL_THRESHOLDS[SummaryLevel.DETAILED] == 15000 + + +class TestSummarize: + """Tests for main summarize method.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + @pytest.mark.asyncio + async def test_empty_content_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + """Test that empty content returns NONE level result.""" + result = await summarizer.summarize("") + assert result.level == SummaryLevel.NONE + assert result.summary is None + assert result.input_tokens == 0 + assert result.output_tokens == 0 + + @pytest.mark.asyncio + async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + """Test that whitespace-only content returns NONE level result.""" + result = await summarizer.summarize(" \n\n ") + assert result.level == SummaryLevel.NONE + assert result.summary is None + + @pytest.mark.asyncio + async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None: + """Test that very short content gets NONE level (no summary).""" + # Less than 100 tokens + result = await summarizer.summarize("Hello world") + assert result.level == SummaryLevel.NONE + assert result.summary is None + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_brief_summary") + async def test_brief_level_calls_brief_summary( + self, + mock_brief: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that BRIEF level content calls _brief_summary.""" + mock_brief.return_value = "Brief summary." + + # Create content that's ~100-500 tokens + content = "This is a test sentence. " * 30 # ~150 tokens + + result = await summarizer.summarize(content) + + mock_brief.assert_called_once_with(content) + assert result.level == SummaryLevel.BRIEF + assert result.summary == "Brief summary." + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_standard_summary") + async def test_standard_level_calls_standard_summary( + self, + mock_standard: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that STANDARD level content calls _standard_summary.""" + mock_standard.return_value = "Standard summary paragraph." + + # Create content that's ~500-3000 tokens + content = "This is a test sentence with more words. " * 100 # ~800 tokens + + result = await summarizer.summarize(content, content_type="general") + + mock_standard.assert_called_once_with(content, None, "general") + assert result.level == SummaryLevel.STANDARD + assert result.summary == "Standard summary paragraph." 
+ + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_standard_summary") + async def test_prior_summary_passed_to_standard( + self, + mock_standard: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that prior_summary is passed to _standard_summary.""" + mock_standard.return_value = "Updated summary." + + content = "This is a test sentence with more words. " * 100 + prior = "Previous context summary." + + await summarizer.summarize(content, prior_summary=prior) + + mock_standard.assert_called_once_with(content, prior, "general") + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_detailed_summary") + async def test_detailed_level_calls_detailed_summary( + self, + mock_detailed: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that DETAILED level content calls _detailed_summary.""" + mock_result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Detailed summary.", + hierarchical=None, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + mock_detailed.return_value = mock_result + + # Create content that's ~3000-15000 tokens + content = "Word " * 5000 # ~5000 tokens + + result = await summarizer.summarize(content) + + assert mock_detailed.called + assert result.level == SummaryLevel.DETAILED + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_hierarchical_summary") + async def test_hierarchical_level_calls_hierarchical_summary( + self, + mock_hierarchical: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that HIERARCHICAL level content calls _hierarchical_summary.""" + mock_result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Hierarchical summary.", + hierarchical=None, + input_tokens=20000, + output_tokens=500, + compression_ratio=0.025, + ) + mock_hierarchical.return_value = mock_result + + # Create content that's > 15000 tokens + content = "Word " * 20000 + + result = await summarizer.summarize(content) + + assert mock_hierarchical.called + assert result.level == SummaryLevel.HIERARCHICAL + + +class TestUpdateRollingSummary: + """Tests for rolling summary updates.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + @pytest.mark.asyncio + async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None: + """Test that empty facts list returns prior summary.""" + result = await summarizer.update_rolling_summary( + prior_summary="Existing summary", + new_facts=[], + ) + assert result == "Existing summary" + + @pytest.mark.asyncio + async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None: + """Test that empty facts with no prior returns empty string.""" + result = await summarizer.update_rolling_summary( + prior_summary=None, + new_facts=[], + ) + assert result == "" + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_generate_summary") + async def test_new_facts_calls_generate( + self, + mock_generate: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that new facts trigger summary generation.""" + mock_generate.return_value = "Updated summary with new facts." + + result = await summarizer.update_rolling_summary( + prior_summary="Old summary", + new_facts=["User likes coffee", "User lives in Amsterdam"], + ) + + mock_generate.assert_called_once() + assert result == "Updated summary with new facts." 
+ + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_generate_summary") + async def test_facts_formatted_as_list( + self, + mock_generate: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test that facts are formatted as bullet list in prompt.""" + mock_generate.return_value = "Summary" + + await summarizer.update_rolling_summary( + prior_summary="Prior", + new_facts=["Fact one", "Fact two"], + ) + + # Check the prompt contains formatted facts + call_args = mock_generate.call_args + prompt = call_args[0][0] + assert "- Fact one" in prompt + assert "- Fact two" in prompt + + +class TestGenerateSummary: + """Tests for _generate_summary method.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + @pytest.mark.asyncio + async def test_generate_summary_with_pydantic_ai( + self, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test summary generation using PydanticAI agent.""" + # Mock the entire agent creation and run + mock_result = MagicMock() + mock_result.output = SummaryOutput(summary="Generated summary.") + + with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + mock_agent = MagicMock() + mock_agent.run = AsyncMock(return_value=mock_result) + mock_agent_class.return_value = mock_agent + + result = await summarizer._generate_summary("Test prompt", max_tokens=100) + + assert result == "Generated summary." + mock_agent.run.assert_called_once_with("Test prompt") + + @pytest.mark.asyncio + @patch.object(AdaptiveSummarizer, "_raw_generate") + async def test_fallback_to_raw_generate_on_error( + self, + mock_raw: AsyncMock, + summarizer: AdaptiveSummarizer, + ) -> None: + """Test fallback to raw HTTP on PydanticAI error.""" + mock_raw.return_value = "Fallback summary" + + with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + mock_agent = MagicMock() + mock_agent.run = AsyncMock(side_effect=Exception("API error")) + mock_agent_class.return_value = mock_agent + + result = await summarizer._generate_summary("Test prompt", max_tokens=100) + + mock_raw.assert_called_once_with("Test prompt", 100) + assert result == "Fallback summary" + + +class TestRawGenerate: + """Tests for _raw_generate fallback method.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create a summarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + + @pytest.mark.asyncio + async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None: + """Test successful raw HTTP generation.""" + mock_response = MagicMock() + mock_response.json.return_value = { + "choices": [{"message": {"content": "Raw generated summary"}}], + } + + with patch("httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.post = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await summarizer._raw_generate("Test prompt", max_tokens=100) + + assert result == "Raw generated summary" + + @pytest.mark.asyncio + async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None: + """Test raw generate with empty choices returns empty string.""" + mock_response = MagicMock() + mock_response.json.return_value = {"choices": []} + + with 
patch("httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.post = AsyncMock(return_value=mock_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await summarizer._raw_generate("Test prompt", max_tokens=100) + + assert result == "" + + +class TestSummaryOutput: + """Tests for SummaryOutput pydantic model.""" + + def test_basic_creation(self) -> None: + """Test creating a SummaryOutput.""" + output = SummaryOutput(summary="Test summary text") + assert output.summary == "Test summary text" + + def test_whitespace_preserved(self) -> None: + """Test that whitespace in summary is preserved.""" + output = SummaryOutput(summary=" Summary with spaces ") + assert output.summary == " Summary with spaces " diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py new file mode 100644 index 00000000..381f9f5b --- /dev/null +++ b/tests/summarizer/test_integration.py @@ -0,0 +1,466 @@ +"""Integration tests for the summarizer with memory system.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any +from unittest.mock import patch + +import pytest + +from agent_cli.memory._ingest import summarize_content +from agent_cli.memory._persistence import persist_hierarchical_summary +from agent_cli.memory._store import ( + get_final_summary, + get_summary_at_level, + upsert_hierarchical_summary, +) +from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult +from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary + +if TYPE_CHECKING: + from pathlib import Path + + +class _FakeCollection: + """Minimal Chroma-like collection for testing.""" + + def __init__(self) -> None: + self._store: dict[str, tuple[str, dict[str, Any]]] = {} + + def upsert( + self, + *, + ids: list[str], + documents: list[str], + metadatas: list[dict[str, Any]], + ) -> None: + for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False): + self._store[doc_id] = (doc, meta) + + def get( + self, + *, + where: dict[str, Any] | None = None, + include: list[str] | None = None, # noqa: ARG002 + ) -> dict[str, Any]: + if where is None: + return {"documents": [], "metadatas": [], "ids": []} + + results: list[tuple[str, tuple[str, dict[str, Any]]]] = [] + for doc_id, (doc, meta) in self._store.items(): + # Check all conditions in $and clause + conditions = where.get("$and", [where]) + match = True + for clause in conditions: + for k, v in clause.items(): + if k == "$and": + continue + if isinstance(v, dict): + if "$in" in v and meta.get(k) not in v["$in"]: + match = False + if "$ne" in v and meta.get(k) == v["$ne"]: + match = False + elif meta.get(k) != v: + match = False + if match: + results.append((doc_id, (doc, meta))) + + docs = [doc for _, (doc, _) in results] + metas = [meta for _, (_, meta) in results] + ids = [doc_id for doc_id, _ in results] + return {"documents": docs, "metadatas": metas, "ids": ids} + + def delete( + self, + ids: list[str] | None = None, + where: dict[str, Any] | None = None, # noqa: ARG002 + ) -> None: + if ids: + for doc_id in ids: + self._store.pop(doc_id, None) + + +@pytest.fixture +def fake_collection() -> _FakeCollection: + """Create a fake ChromaDB collection.""" + return _FakeCollection() + + +@pytest.fixture +def memory_root(tmp_path: Path) -> Path: + """Create a temporary memory root directory.""" + return tmp_path / "memory" + + +class 
TestSummaryResultStorageMetadata: + """Test SummaryResult.to_storage_metadata for various levels.""" + + def test_standard_summary_produces_single_entry(self) -> None: + """Test that STANDARD level produces a single L3 entry.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A paragraph summary of the content.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + entries = result.to_storage_metadata("conv-123") + + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-123:summary:L3:final" + assert entry["content"] == "A paragraph summary of the content." + assert entry["metadata"]["level"] == 3 + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "STANDARD" + + def test_hierarchical_summary_produces_multiple_entries(self) -> None: + """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=2, + content="Chunk 2", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=["Group 0 summary"], + l3_summary="Final hierarchical synthesis.", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final hierarchical synthesis.", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + + entries = result.to_storage_metadata("conv-456") + + # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries + assert len(entries) == 5 + + # Check L1 entries + l1_entries = [e for e in entries if e["metadata"]["level"] == 1] + assert len(l1_entries) == 3 + + # Check L2 entries + l2_entries = [e for e in entries if e["metadata"]["level"] == 2] + assert len(l2_entries) == 1 + + # Check L3 entry + l3_entries = [e for e in entries if e["metadata"]["level"] == 3] + assert len(l3_entries) == 1 + + +class TestHierarchicalSummaryStorage: + """Test storing hierarchical summaries to ChromaDB.""" + + def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: + """Test storing a simple (non-hierarchical) summary.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A standard summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + ids = upsert_hierarchical_summary(fake_collection, "conv-123", result) + + assert len(ids) == 1 + assert "conv-123:summary:L3:final" in ids + + # Verify retrieval + stored = get_final_summary(fake_collection, "conv-123") + assert stored is not None + assert stored.content == "A standard summary." 
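A minimal usage sketch of the round-trip these storage tests exercise, using only the helpers already imported in this module; the conversation ID and summary text are illustrative, and the ID string follows the "<conversation_id>:summary:L<level>:<suffix>" pattern asserted throughout this class.

    collection = _FakeCollection()
    result = SummaryResult(
        level=SummaryLevel.STANDARD,
        summary="Weekly sync recap.",  # illustrative text, not taken from the suite
        hierarchical=None,
        input_tokens=1200,
        output_tokens=60,
        compression_ratio=0.05,
    )
    ids = upsert_hierarchical_summary(collection, "conv-demo", result)
    # A STANDARD result stores a single final L3 entry under a deterministic ID.
    assert len(ids) == 1
    assert "conv-demo:summary:L3:final" in ids
    final = get_final_summary(collection, "conv-demo")
    assert final is not None
    assert final.content == "Weekly sync recap."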
+ + def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None: + """Test storing a hierarchical summary with all levels.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 summary", + token_count=10, + source_tokens=100, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1 summary", + token_count=10, + source_tokens=100, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=[], + l3_summary="Final summary", + ) + result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Final summary", + hierarchical=hierarchical, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + + ids = upsert_hierarchical_summary(fake_collection, "conv-789", result) + + assert len(ids) == 3 # 2 L1 + 1 L3 + + # Verify L1 retrieval + l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1) + assert len(l1_stored) == 2 + + # Verify L3 retrieval + final = get_final_summary(fake_collection, "conv-789") + assert final is not None + assert final.content == "Final summary" + + +class TestFilePersistence: + """Test hierarchical summary file persistence.""" + + def test_persist_hierarchical_creates_files( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that persist_hierarchical_summary creates correct file structure.""" + l1_summaries = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 content", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1 content", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1_summaries, + l2_summaries=["Group 0 summary"], + l3_summary="Final synthesis", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final synthesis", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + + ids = persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="test-conv", + summary_result=result, + ) + + assert len(ids) == 4 # 2 L1 + 1 L2 + 1 L3 + + # Check file structure (note: _slugify converts - to - not _) + entries_dir = memory_root / "entries" / "test-conv" + l1_dir = entries_dir / "summaries" / "L1" + l2_dir = entries_dir / "summaries" / "L2" + l3_dir = entries_dir / "summaries" / "L3" + + assert l1_dir.exists() + assert l2_dir.exists() + assert l3_dir.exists() + + # Check L1 files + l1_files = list(l1_dir.glob("*.md")) + assert len(l1_files) == 2 + + # Check L2 files + l2_files = list(l2_dir.glob("*.md")) + assert len(l2_files) == 1 + + # Check L3 files + l3_files = list(l3_dir.glob("*.md")) + assert len(l3_files) == 1 + assert (l3_dir / "final.md").exists() + + def test_persist_simple_summary_creates_l3_file( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that a simple summary creates just L3/final.md.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A standard paragraph summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + ids = persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="simple-conv", + summary_result=result, + ) + + assert len(ids) == 1 + + # Check file exists (note: _slugify converts - to - not _) + entries_dir = memory_root / "entries" / "simple-conv" + l3_file = entries_dir / "summaries" / "L3" / "final.md" + assert 
l3_file.exists() + + # Check content has YAML front matter + content = l3_file.read_text(encoding="utf-8") + assert "---" in content + assert "level: 3" in content + assert "A standard paragraph summary." in content + + def test_persist_deletes_old_summaries( + self, + fake_collection: _FakeCollection, + memory_root: Path, + ) -> None: + """Test that persisting new summary deletes old summary files.""" + # Create first summary + result1 = SummaryResult( + level=SummaryLevel.STANDARD, + summary="First summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="conv", + summary_result=result1, + ) + + entries_dir = memory_root / "entries" / "conv" + first_file = entries_dir / "summaries" / "L3" / "final.md" + assert first_file.exists() + assert "First summary." in first_file.read_text() + + # Create second summary (should replace first) + result2 = SummaryResult( + level=SummaryLevel.STANDARD, + summary="Second summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + + persist_hierarchical_summary( + fake_collection, + memory_root=memory_root, + conversation_id="conv", + summary_result=result2, + ) + + # First summary should be moved to deleted + assert first_file.exists() + assert "Second summary." in first_file.read_text() + + # Old summary should be in deleted folder + deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries" + assert deleted_dir.exists() + + +class TestAdaptiveSummarizerLevelDetermination: + """Test that AdaptiveSummarizer correctly determines summary levels.""" + + @pytest.fixture + def summarizer(self) -> AdaptiveSummarizer: + """Create an AdaptiveSummarizer instance.""" + return AdaptiveSummarizer( + openai_base_url="http://localhost:8000/v1", + model="test-model", + ) + + def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None: + """Test that content under 100 tokens gets NONE level.""" + level = summarizer.determine_level(50) + assert level == SummaryLevel.NONE + + def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 100-500 token content gets BRIEF level.""" + level = summarizer.determine_level(300) + assert level == SummaryLevel.BRIEF + + def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 500-3000 token content gets STANDARD level.""" + level = summarizer.determine_level(1500) + assert level == SummaryLevel.STANDARD + + def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None: + """Test that 3000-15000 token content gets DETAILED level.""" + level = summarizer.determine_level(8000) + assert level == SummaryLevel.DETAILED + + def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None: + """Test that content over 15000 tokens gets HIERARCHICAL level.""" + level = summarizer.determine_level(25000) + assert level == SummaryLevel.HIERARCHICAL + + +class TestSummarizeContentFunction: + """Test the summarize_content function from _ingest.""" + + @pytest.mark.asyncio + async def test_summarize_content_creates_result(self) -> None: + """Test that summarize_content returns a valid SummaryResult.""" + with patch.object(AdaptiveSummarizer, "summarize") as mock_summarize: + mock_result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="Mocked summary.", + hierarchical=None, + input_tokens=1000, + 
output_tokens=50, + compression_ratio=0.05, + ) + mock_summarize.return_value = mock_result + + result = await summarize_content( + content="Some content to summarize " * 100, + openai_base_url="http://localhost:8000/v1", + api_key=None, + model="test-model", + ) + + assert result.level == SummaryLevel.STANDARD + assert result.summary == "Mocked summary." diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py new file mode 100644 index 00000000..5a6583cd --- /dev/null +++ b/tests/summarizer/test_models.py @@ -0,0 +1,332 @@ +"""Unit tests for summarizer models.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) + + +class TestSummaryLevel: + """Tests for SummaryLevel enum.""" + + def test_level_values(self) -> None: + """Test that levels have correct integer values.""" + assert SummaryLevel.NONE == 0 + assert SummaryLevel.BRIEF == 1 + assert SummaryLevel.STANDARD == 2 + assert SummaryLevel.DETAILED == 3 + assert SummaryLevel.HIERARCHICAL == 4 + + def test_level_ordering(self) -> None: + """Test that levels can be compared.""" + assert SummaryLevel.NONE < SummaryLevel.BRIEF + assert SummaryLevel.BRIEF < SummaryLevel.STANDARD + assert SummaryLevel.STANDARD < SummaryLevel.DETAILED + assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL + + +class TestChunkSummary: + """Tests for ChunkSummary model.""" + + def test_basic_creation(self) -> None: + """Test creating a chunk summary.""" + chunk = ChunkSummary( + chunk_index=0, + content="This is a summary of chunk 1.", + token_count=10, + source_tokens=100, + parent_group=None, + ) + assert chunk.chunk_index == 0 + assert chunk.content == "This is a summary of chunk 1." + assert chunk.token_count == 10 + assert chunk.source_tokens == 100 + assert chunk.parent_group is None + + def test_with_parent_group(self) -> None: + """Test creating a chunk summary with parent group.""" + chunk = ChunkSummary( + chunk_index=5, + content="Summary text", + token_count=8, + source_tokens=200, + parent_group=1, + ) + assert chunk.parent_group == 1 + + def test_validation_negative_tokens(self) -> None: + """Test that negative token counts fail validation.""" + with pytest.raises(ValueError, match="greater than or equal to 0"): + ChunkSummary( + chunk_index=0, + content="Test", + token_count=-1, + source_tokens=100, + ) + + +class TestHierarchicalSummary: + """Tests for HierarchicalSummary model.""" + + def test_basic_creation(self) -> None: + """Test creating a hierarchical summary.""" + l1 = [ + ChunkSummary( + chunk_index=0, + content="Chunk 1 summary", + token_count=10, + source_tokens=100, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 2 summary", + token_count=12, + source_tokens=120, + ), + ] + hs = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=["Group summary"], + l3_summary="Final summary of all content.", + ) + assert len(hs.l1_summaries) == 2 + assert len(hs.l2_summaries) == 1 + assert hs.l3_summary == "Final summary of all content." 
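The level-access semantics that the following tests pin down, condensed into one short sketch (values are illustrative; behaviour mirrors the assertions below).

    hs = HierarchicalSummary(
        l1_summaries=[
            ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50),
        ],
        l2_summaries=[],  # no L2 groups, so level 2 falls back to the final summary
        l3_summary="Final",
    )
    assert hs.get_summary_at_level(1) == ["C1"]     # L1: list of chunk-summary texts
    assert hs.get_summary_at_level(2) == ["Final"]  # L2: falls back to [l3_summary] when empty
    assert hs.get_summary_at_level(3) == "Final"    # L3: the final synthesis as a plain string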
+ + def test_default_chunk_settings(self) -> None: + """Test default chunk size and overlap.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=[], + l3_summary="Final", + ) + assert hs.chunk_size == 3000 + assert hs.chunk_overlap == 200 + + def test_get_summary_at_level_1(self) -> None: + """Test getting L1 summaries.""" + l1 = [ + ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50), + ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50), + ] + hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final") + result = hs.get_summary_at_level(1) + assert result == ["C1", "C2"] + + def test_get_summary_at_level_2_with_l2(self) -> None: + """Test getting L2 summaries when available.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=["Group A", "Group B"], + l3_summary="Final", + ) + result = hs.get_summary_at_level(2) + assert result == ["Group A", "Group B"] + + def test_get_summary_at_level_2_fallback(self) -> None: + """Test getting L2 falls back to L3 when no L2 summaries.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=[], + l3_summary="Final summary", + ) + result = hs.get_summary_at_level(2) + assert result == ["Final summary"] + + def test_get_summary_at_level_3(self) -> None: + """Test getting L3 summary.""" + hs = HierarchicalSummary( + l1_summaries=[], + l2_summaries=["Group"], + l3_summary="The final summary", + ) + result = hs.get_summary_at_level(3) + assert result == "The final summary" + + +class TestSummaryResult: + """Tests for SummaryResult model.""" + + def test_none_level_result(self) -> None: + """Test result for content that needs no summary.""" + result = SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, + ) + assert result.level == SummaryLevel.NONE + assert result.summary is None + assert result.chunk_summaries is None + + def test_brief_level_result(self) -> None: + """Test result for brief summary.""" + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="A brief one-sentence summary.", + hierarchical=None, + input_tokens=200, + output_tokens=10, + compression_ratio=0.05, + ) + assert result.level == SummaryLevel.BRIEF + assert result.summary == "A brief one-sentence summary." 
+ assert result.chunk_summaries is None + + def test_hierarchical_result_with_chunk_summaries(self) -> None: + """Test hierarchical result exposes chunk summaries.""" + l1 = [ + ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100), + ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=[], + l3_summary="Final", + ) + result = SummaryResult( + level=SummaryLevel.DETAILED, + summary="Final", + hierarchical=hierarchical, + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + ) + assert result.chunk_summaries == ["Chunk 1", "Chunk 2"] + + def test_to_storage_metadata_none_level(self) -> None: + """Test that NONE level produces no storage entries.""" + result = SummaryResult( + level=SummaryLevel.NONE, + summary=None, + hierarchical=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, + ) + entries = result.to_storage_metadata("conv-123") + assert entries == [] + + def test_to_storage_metadata_simple_summary(self) -> None: + """Test storage metadata for simple (non-hierarchical) summary.""" + result = SummaryResult( + level=SummaryLevel.STANDARD, + summary="A standard paragraph summary.", + hierarchical=None, + input_tokens=1000, + output_tokens=50, + compression_ratio=0.05, + ) + entries = result.to_storage_metadata("conv-456") + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-456:summary:L3:final" + assert entry["content"] == "A standard paragraph summary." + assert entry["metadata"]["conversation_id"] == "conv-456" + assert entry["metadata"]["role"] == "summary" + assert entry["metadata"]["level"] == 3 + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "STANDARD" + + def test_to_storage_metadata_hierarchical(self) -> None: + """Test storage metadata for hierarchical summary.""" + l1 = [ + ChunkSummary( + chunk_index=0, + content="Chunk 0 text", + token_count=10, + source_tokens=100, + parent_group=0, + ), + ChunkSummary( + chunk_index=1, + content="Chunk 1 text", + token_count=12, + source_tokens=120, + parent_group=0, + ), + ] + hierarchical = HierarchicalSummary( + l1_summaries=l1, + l2_summaries=["Group 0 summary"], + l3_summary="Final synthesis", + ) + result = SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary="Final synthesis", + hierarchical=hierarchical, + input_tokens=20000, + output_tokens=200, + compression_ratio=0.01, + ) + entries = result.to_storage_metadata("conv-789") + + # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries + assert len(entries) == 4 + + # Check L1 entries + l1_entries = [e for e in entries if e["metadata"]["level"] == 1] + assert len(l1_entries) == 2 + assert l1_entries[0]["id"] == "conv-789:summary:L1:0" + assert l1_entries[0]["metadata"]["chunk_index"] == 0 + + # Check L2 entry + l2_entries = [e for e in entries if e["metadata"]["level"] == 2] + assert len(l2_entries) == 1 + assert l2_entries[0]["id"] == "conv-789:summary:L2:0" + assert l2_entries[0]["content"] == "Group 0 summary" + + # Check L3 entry + l3_entries = [e for e in entries if e["metadata"]["level"] == 3] + assert len(l3_entries) == 1 + assert l3_entries[0]["id"] == "conv-789:summary:L3:final" + assert l3_entries[0]["metadata"]["is_final"] is True + + def test_compression_ratio_bounds(self) -> None: + """Test compression ratio validation.""" + # Valid ratio + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + 
input_tokens=100, + output_tokens=10, + compression_ratio=0.1, + ) + assert result.compression_ratio == 0.1 + + # Ratio must be between 0 and 1 + with pytest.raises(ValueError, match="less than or equal to 1"): + SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + input_tokens=100, + output_tokens=10, + compression_ratio=1.5, + ) + + def test_created_at_default(self) -> None: + """Test that created_at is automatically set.""" + before = datetime.now(UTC) + result = SummaryResult( + level=SummaryLevel.BRIEF, + summary="Test", + hierarchical=None, + input_tokens=100, + output_tokens=10, + compression_ratio=0.1, + ) + after = datetime.now(UTC) + # Compare without timezone since result.created_at may be naive + assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None) diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py new file mode 100644 index 00000000..e126def2 --- /dev/null +++ b/tests/summarizer/test_prompts.py @@ -0,0 +1,180 @@ +"""Unit tests for summarizer prompt templates.""" + +from __future__ import annotations + +from agent_cli.summarizer.prompts import ( + BRIEF_SUMMARY_PROMPT, + CHUNK_SUMMARY_PROMPT, + CONVERSATION_SUMMARY_PROMPT, + DOCUMENT_SUMMARY_PROMPT, + JOURNAL_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + ROLLING_SUMMARY_PROMPT, + STANDARD_SUMMARY_PROMPT, + format_prior_context, + format_summaries_for_meta, + get_prompt_for_content_type, +) + + +class TestPromptTemplates: + """Tests for prompt template structure.""" + + def test_brief_prompt_has_content_placeholder(self) -> None: + """Test BRIEF prompt contains content placeholder.""" + assert "{content}" in BRIEF_SUMMARY_PROMPT + # Test it can be formatted + result = BRIEF_SUMMARY_PROMPT.format(content="Test content") + assert "Test content" in result + + def test_standard_prompt_has_placeholders(self) -> None: + """Test STANDARD prompt contains required placeholders.""" + assert "{content}" in STANDARD_SUMMARY_PROMPT + assert "{prior_context}" in STANDARD_SUMMARY_PROMPT + assert "{max_words}" in STANDARD_SUMMARY_PROMPT + + result = STANDARD_SUMMARY_PROMPT.format( + content="Main content", + prior_context="Previous context", + max_words=100, + ) + assert "Main content" in result + assert "Previous context" in result + assert "100" in result + + def test_chunk_prompt_has_placeholders(self) -> None: + """Test CHUNK prompt contains required placeholders.""" + assert "{content}" in CHUNK_SUMMARY_PROMPT + assert "{chunk_index}" in CHUNK_SUMMARY_PROMPT + assert "{total_chunks}" in CHUNK_SUMMARY_PROMPT + assert "{max_words}" in CHUNK_SUMMARY_PROMPT + + result = CHUNK_SUMMARY_PROMPT.format( + content="Chunk content", + chunk_index=1, + total_chunks=5, + max_words=50, + ) + assert "Chunk content" in result + assert "1" in result + assert "5" in result + + def test_meta_prompt_has_placeholders(self) -> None: + """Test META prompt contains required placeholders.""" + assert "{summaries}" in META_SUMMARY_PROMPT + assert "{max_words}" in META_SUMMARY_PROMPT + + result = META_SUMMARY_PROMPT.format( + summaries="Summary 1\n\nSummary 2", + max_words=200, + ) + assert "Summary 1" in result + assert "200" in result + + def test_rolling_prompt_has_placeholders(self) -> None: + """Test ROLLING prompt contains required placeholders.""" + assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT + assert "{new_content}" in ROLLING_SUMMARY_PROMPT + assert "{max_words}" in ROLLING_SUMMARY_PROMPT + + def test_conversation_prompt_has_content(self) -> None: + """Test 
CONVERSATION prompt contains content placeholder.""" + assert "{content}" in CONVERSATION_SUMMARY_PROMPT + assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT + + def test_journal_prompt_has_content(self) -> None: + """Test JOURNAL prompt contains content placeholder.""" + assert "{content}" in JOURNAL_SUMMARY_PROMPT + assert "{max_words}" in JOURNAL_SUMMARY_PROMPT + + def test_document_prompt_has_content(self) -> None: + """Test DOCUMENT prompt contains content placeholder.""" + assert "{content}" in DOCUMENT_SUMMARY_PROMPT + assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT + + +class TestGetPromptForContentType: + """Tests for get_prompt_for_content_type function.""" + + def test_general_returns_standard(self) -> None: + """Test general content type returns standard prompt.""" + prompt = get_prompt_for_content_type("general") + assert prompt == STANDARD_SUMMARY_PROMPT + + def test_conversation_returns_conversation(self) -> None: + """Test conversation content type returns conversation prompt.""" + prompt = get_prompt_for_content_type("conversation") + assert prompt == CONVERSATION_SUMMARY_PROMPT + + def test_journal_returns_journal(self) -> None: + """Test journal content type returns journal prompt.""" + prompt = get_prompt_for_content_type("journal") + assert prompt == JOURNAL_SUMMARY_PROMPT + + def test_document_returns_document(self) -> None: + """Test document content type returns document prompt.""" + prompt = get_prompt_for_content_type("document") + assert prompt == DOCUMENT_SUMMARY_PROMPT + + def test_unknown_returns_standard(self) -> None: + """Test unknown content type falls back to standard.""" + prompt = get_prompt_for_content_type("unknown_type") + assert prompt == STANDARD_SUMMARY_PROMPT + + def test_empty_returns_standard(self) -> None: + """Test empty string falls back to standard.""" + prompt = get_prompt_for_content_type("") + assert prompt == STANDARD_SUMMARY_PROMPT + + +class TestFormatPriorContext: + """Tests for format_prior_context function.""" + + def test_with_prior_summary(self) -> None: + """Test formatting with a prior summary.""" + result = format_prior_context("Previous summary text") + assert "Prior context" in result + assert "Previous summary text" in result + + def test_without_prior_summary(self) -> None: + """Test formatting without prior summary returns empty string.""" + result = format_prior_context(None) + assert result == "" + + def test_empty_string_prior_summary(self) -> None: + """Test formatting with empty string prior summary.""" + result = format_prior_context("") + assert result == "" + + +class TestFormatSummariesForMeta: + """Tests for format_summaries_for_meta function.""" + + def test_single_summary(self) -> None: + """Test formatting a single summary.""" + result = format_summaries_for_meta(["Summary one"]) + assert "[Section 1]" in result + assert "Summary one" in result + + def test_multiple_summaries(self) -> None: + """Test formatting multiple summaries.""" + summaries = ["First summary", "Second summary", "Third summary"] + result = format_summaries_for_meta(summaries) + + assert "[Section 1]" in result + assert "[Section 2]" in result + assert "[Section 3]" in result + assert "First summary" in result + assert "Second summary" in result + assert "Third summary" in result + + def test_empty_list(self) -> None: + """Test formatting empty list.""" + result = format_summaries_for_meta([]) + assert result == "" + + def test_summaries_separated(self) -> None: + """Test summaries are separated by double newlines.""" + summaries = ["Sum 
1", "Sum 2"] + result = format_summaries_for_meta(summaries) + assert "\n\n" in result diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py new file mode 100644 index 00000000..458e9b37 --- /dev/null +++ b/tests/summarizer/test_utils.py @@ -0,0 +1,193 @@ +"""Unit tests for summarizer utility functions.""" + +from __future__ import annotations + +from agent_cli.summarizer.utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + middle_truncate, + tokens_to_words, +) + + +class TestCountTokens: + """Tests for count_tokens function.""" + + def test_empty_string(self) -> None: + """Test counting tokens in empty string.""" + assert count_tokens("") == 0 + + def test_simple_sentence(self) -> None: + """Test counting tokens in a simple sentence.""" + # "Hello world" is typically 2 tokens + count = count_tokens("Hello world") + assert count > 0 + assert count < 10 + + def test_longer_text(self) -> None: + """Test that longer text has more tokens.""" + short = count_tokens("Hello") + long = count_tokens("Hello world, this is a longer sentence with more words.") + assert long > short + + def test_different_model_fallback(self) -> None: + """Test that unknown models fall back to cl100k_base.""" + # Should not raise, should fall back gracefully + count = count_tokens("Hello world", model="unknown-model-xyz") + assert count > 0 + + +class TestChunkText: + """Tests for chunk_text function.""" + + def test_empty_text(self) -> None: + """Test chunking empty text returns empty list.""" + assert chunk_text("") == [] + + def test_short_text_single_chunk(self) -> None: + """Test that short text stays as single chunk.""" + text = "This is a short paragraph." + chunks = chunk_text(text, chunk_size=1000) + assert len(chunks) == 1 + assert chunks[0] == text + + def test_multiple_paragraphs_chunking(self) -> None: + """Test chunking multiple paragraphs.""" + paragraphs = ["Paragraph one. " * 50, "Paragraph two. " * 50, "Paragraph three. " * 50] + text = "\n\n".join(paragraphs) + + # Use small chunk size to force splitting + chunks = chunk_text(text, chunk_size=200, overlap=20) + assert len(chunks) > 1 + + def test_overlap_preserved(self) -> None: + """Test that chunks have overlap for context continuity.""" + # Create text that will definitely need chunking + text = "Sentence one about topic A. " * 20 + "\n\n" + "Sentence two about topic B. " * 20 + + chunks = chunk_text(text, chunk_size=100, overlap=30) + + # With overlap, later chunks should contain some content from earlier + if len(chunks) > 1: + # Overlap means adjacent chunks share some content + # This is a rough check - exact overlap depends on tokenization + assert len(chunks) >= 2 + + def test_large_paragraph_sentence_split(self) -> None: + """Test that large paragraphs are split by sentences.""" + # One giant paragraph with multiple sentences + sentences = [ + f"This is sentence number {i}. It contains important information." 
for i in range(50) + ] + text = " ".join(sentences) + + chunks = chunk_text(text, chunk_size=100, overlap=20) + assert len(chunks) > 1 + + +class TestMiddleTruncate: + """Tests for middle_truncate function.""" + + def test_no_truncation_needed(self) -> None: + """Test that short text is not truncated.""" + text = "Short text" + result, dropped = middle_truncate(text, budget_chars=100) + assert result == text + assert dropped == 0 + + def test_basic_truncation(self) -> None: + """Test basic middle truncation.""" + text = "A" * 100 # 100 character string + result, dropped = middle_truncate(text, budget_chars=50) + + # Should have head + marker + tail + assert len(result) <= 50 + 50 # Allow for marker + assert dropped > 0 + assert "[..." in result + assert "truncated...]" in result + + def test_head_tail_fractions(self) -> None: + """Test custom head/tail fractions.""" + text = "AAAAA" + "BBBBB" * 20 + "CCCCC" + result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5) + + # Should preserve beginning (A's) and end (C's) + assert result.startswith("A") + assert dropped > 0 + + def test_zero_budget(self) -> None: + """Test with zero budget returns original.""" + text = "Some text" + result, dropped = middle_truncate(text, budget_chars=0) + assert result == text + assert dropped == 0 + + def test_negative_budget(self) -> None: + """Test with negative budget returns original.""" + text = "Some text" + result, dropped = middle_truncate(text, budget_chars=-10) + assert result == text + assert dropped == 0 + + +class TestEstimateSummaryTokens: + """Tests for estimate_summary_tokens function.""" + + def test_none_level(self) -> None: + """Test level 0 (NONE) returns 0.""" + assert estimate_summary_tokens(1000, level=0) == 0 + + def test_brief_level(self) -> None: + """Test level 1 (BRIEF) compression.""" + # BRIEF: ~20% compression, capped at 50 + result = estimate_summary_tokens(100, level=1) + assert result >= 20 # minimum of 20 + assert result <= 50 # capped at 50 + + def test_standard_level(self) -> None: + """Test level 2 (STANDARD) compression.""" + # STANDARD: ~12% compression, capped at 200 + result = estimate_summary_tokens(1000, level=2) + assert result >= 50 # minimum of 50 + assert result <= 200 # capped at 200 + + def test_detailed_level(self) -> None: + """Test level 3 (DETAILED) compression.""" + # DETAILED: ~7% compression, capped at 500 + result = estimate_summary_tokens(10000, level=3) + assert result >= 100 # minimum of 100 + assert result <= 500 # capped at 500 + + def test_hierarchical_level(self) -> None: + """Test level 4 (HIERARCHICAL) compression.""" + # HIERARCHICAL: base of 1000 + diminishing returns + result = estimate_summary_tokens(50000, level=4) + assert result >= 1000 # base minimum + assert result <= 2000 # capped at 2000 + + def test_hierarchical_small_input(self) -> None: + """Test HIERARCHICAL with smaller input.""" + # Even with small input, should return base + result = estimate_summary_tokens(5000, level=4) + assert result == 1000 # just the base, no additional + + +class TestTokensToWords: + """Tests for tokens_to_words function.""" + + def test_basic_conversion(self) -> None: + """Test basic token to word conversion.""" + # 1 token ≈ 0.75 words + assert tokens_to_words(100) == 75 + assert tokens_to_words(1000) == 750 + + def test_zero_tokens(self) -> None: + """Test zero tokens returns zero words.""" + assert tokens_to_words(0) == 0 + + def test_small_values(self) -> None: + """Test small token values.""" + assert 
tokens_to_words(1) == 0 # int(0.75) = 0 + assert tokens_to_words(2) == 1 # int(1.5) = 1 + assert tokens_to_words(4) == 3 # int(3.0) = 3 From 6bb6058a0096f8ffd53ae3e6d1563509d1490089 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:00:28 -0800 Subject: [PATCH 08/38] refactor(summarizer): improve code quality and add Letta-style features - Fix datetime.utcnow() deprecation, use datetime.now(UTC) - Extract duplicate chunk summarization to _summarize_single_chunk() - Add SummarizationError exception for better error handling - Add retry with exponential backoff (1s, 2s, 4s) for generation failures - Add middle-truncation fallback for oversized content (Letta-style) - Export SummarizationError from module __init__ --- agent_cli/summarizer/__init__.py | 3 +- agent_cli/summarizer/adaptive.py | 176 ++++++++++++++++++++++--------- agent_cli/summarizer/models.py | 4 +- tests/summarizer/test_models.py | 4 +- 4 files changed, 135 insertions(+), 52 deletions(-) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index c6f1d85a..d017dfd4 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -16,7 +16,7 @@ """ -from agent_cli.summarizer.adaptive import AdaptiveSummarizer +from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError from agent_cli.summarizer.models import ( HierarchicalSummary, SummaryLevel, @@ -26,6 +26,7 @@ __all__ = [ "AdaptiveSummarizer", "HierarchicalSummary", + "SummarizationError", "SummaryLevel", "SummaryResult", ] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index ed0074d8..e8ff2f9a 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -38,6 +38,7 @@ chunk_text, count_tokens, estimate_summary_tokens, + middle_truncate, tokens_to_words, ) @@ -57,6 +58,14 @@ # Minimum number of L1 chunks before L2 grouping is applied L2_MIN_CHUNKS = 5 +# Retry settings for summarization failures +MAX_SUMMARIZE_RETRIES = 3 + +# Maximum characters per chunk before applying middle truncation +# This prevents context overflow errors for very large chunks +# (roughly 12K tokens with cl100k_base encoding) +MAX_CHUNK_CHARS = 48000 + class SummaryOutput(BaseModel): """Structured output for summary generation.""" @@ -64,6 +73,10 @@ class SummaryOutput(BaseModel): summary: str +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + class AdaptiveSummarizer: """Adaptive summarization that scales with input complexity. @@ -245,6 +258,68 @@ async def update_rolling_summary( return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + async def _summarize_single_chunk( + self, + chunk: str, + chunk_index: int, + total_chunks: int, + *, + parent_group: int | None = None, + ) -> ChunkSummary: + """Summarize a single chunk of content. + + Extracted to avoid duplication between _detailed_summary and + _hierarchical_summary methods. Uses middle truncation as a fallback + for oversized content (Letta-style). + + Args: + chunk: The text chunk to summarize. + chunk_index: Index of this chunk (0-based). + total_chunks: Total number of chunks being processed. + parent_group: Optional L2 group index for hierarchical summaries. + + Returns: + ChunkSummary with the summarized content. 
+ + """ + # Apply middle truncation if chunk is too large (Letta-style fallback) + source_tokens = count_tokens(chunk, self.model) + content_to_summarize = chunk + if len(chunk) > MAX_CHUNK_CHARS: + content_to_summarize, dropped = middle_truncate( + chunk, + MAX_CHUNK_CHARS, + head_frac=0.3, + tail_frac=0.3, + ) + logger.warning( + "Chunk %d truncated: dropped %d chars to fit context window", + chunk_index, + dropped, + ) + + chunk_tokens = count_tokens(content_to_summarize, self.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=content_to_summarize, + max_words=max_words, + ) + + summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, self.model) + + return ChunkSummary( + chunk_index=chunk_index, + content=summary, + token_count=summary_tokens, + source_tokens=source_tokens, # Report original token count + parent_group=parent_group, + ) + async def _brief_summary(self, content: str) -> str: """Generate a single-sentence summary for brief content.""" prompt = BRIEF_SUMMARY_PROMPT.format(content=content) @@ -286,32 +361,17 @@ async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryRes # Summarize chunks (with concurrency limit) semaphore = asyncio.Semaphore(self.max_concurrent_chunks) - async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - chunk_tokens = count_tokens(chunk, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=idx + 1, - total_chunks=len(chunks), - content=chunk, - max_words=max_words, - ) - - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) - - return ChunkSummary( - chunk_index=idx, - content=summary, - token_count=summary_tokens, - source_tokens=chunk_tokens, + return await self._summarize_single_chunk( + chunk, + idx, + len(chunks), parent_group=None, ) chunk_summaries = await asyncio.gather( - *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], ) # Generate meta-summary @@ -364,35 +424,19 @@ async def _hierarchical_summary(self, content: str, input_tokens: int) -> Summar # L1: Summarize each chunk semaphore = asyncio.Semaphore(self.max_concurrent_chunks) - async def summarize_chunk(idx: int, chunk: str) -> ChunkSummary: + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - chunk_tokens = count_tokens(chunk, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=idx + 1, - total_chunks=len(chunks), - content=chunk, - max_words=max_words, - ) - - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) - - # Assign to group (5 chunks per group) - group_idx = idx // 5 - - return ChunkSummary( - chunk_index=idx, - content=summary, - token_count=summary_tokens, - source_tokens=chunk_tokens, + # Assign to L2 group (L2_GROUP_SIZE chunks per group) + 
group_idx = idx // L2_GROUP_SIZE + return await self._summarize_single_chunk( + chunk, + idx, + len(chunks), parent_group=group_idx, ) l1_summaries = await asyncio.gather( - *[summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)], + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], ) # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) @@ -448,10 +492,29 @@ async def summarize_group(group: list[str]) -> str: compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, ) - async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: + async def _generate_summary( + self, + prompt: str, + max_tokens: int = 256, + *, + attempt: int = 0, + ) -> str: """Generate a summary using the LLM. Uses PydanticAI for structured output with fallback to raw generation. + Implements exponential backoff retry on failures. + + Args: + prompt: The prompt to send to the LLM. + max_tokens: Maximum tokens for the response. + attempt: Current retry attempt (for internal recursion). + + Returns: + The generated summary text. + + Raises: + SummarizationError: If all retries are exhausted. + """ model = OpenAIChatModel( model_name=self.model, @@ -475,7 +538,26 @@ async def _generate_summary(self, prompt: str, max_tokens: int = 256) -> str: except Exception as e: logger.warning("Structured summary failed, trying raw generation: %s", e) # Fallback to raw HTTP call - return await self._raw_generate(prompt, max_tokens) + try: + return await self._raw_generate(prompt, max_tokens) + except Exception as raw_err: + if attempt < MAX_SUMMARIZE_RETRIES: + wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds + logger.warning( + "Raw generation failed (attempt %d/%d), retrying in %ds: %s", + attempt + 1, + MAX_SUMMARIZE_RETRIES, + wait_time, + raw_err, + ) + await asyncio.sleep(wait_time) + return await self._generate_summary( + prompt, + max_tokens, + attempt=attempt + 1, + ) + msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" + raise SummarizationError(msg) from raw_err async def _raw_generate(self, prompt: str, max_tokens: int) -> str: """Fallback raw HTTP generation without structured output.""" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index f231a41e..de9bc609 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -2,7 +2,7 @@ from __future__ import annotations -from datetime import datetime +from datetime import UTC, datetime from enum import IntEnum from typing import Any @@ -122,7 +122,7 @@ class SummaryResult(BaseModel): description="Ratio of output to input tokens (lower = more compression)", ) created_at: datetime = Field( - default_factory=datetime.utcnow, + default_factory=lambda: datetime.now(UTC), description="Timestamp when summary was created", ) diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index 5a6583cd..e27fa18e 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -328,5 +328,5 @@ def test_created_at_default(self) -> None: compression_ratio=0.1, ) after = datetime.now(UTC) - # Compare without timezone since result.created_at may be naive - assert before.replace(tzinfo=None) <= result.created_at <= after.replace(tzinfo=None) + # All datetimes should be UTC-aware + assert before <= result.created_at <= after From df000c0be2285ba3d52ed2daa87f8c693d36370b Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:11:26 -0800 Subject: [PATCH 09/38] refactor(summarizer): replace class with 
functional API - Remove AdaptiveSummarizer class in favor of standalone functions - Add SummarizerConfig dataclass for configuration - Export determine_level() as pure function (no state needed) - Update summarize(), update_rolling_summary() to take config parameter - Update _ingest.py to use new functional API - Update all tests for new API This matches the functional style used throughout the codebase, reducing state and improving testability. --- agent_cli/memory/_ingest.py | 12 +- agent_cli/memory/_persistence.py | 2 +- agent_cli/summarizer/__init__.py | 19 +- agent_cli/summarizer/adaptive.py | 925 ++++++++++++++------------- tests/summarizer/test_adaptive.py | 219 ++++--- tests/summarizer/test_integration.py | 37 +- 6 files changed, 617 insertions(+), 597 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 3ce9385e..d4701548 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -335,9 +335,8 @@ async def summarize_content( ) -> SummaryResult: """Adaptively summarize content based on its length. - Uses the AdaptiveSummarizer to automatically select the appropriate - summarization strategy (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) - based on input token count. + Automatically selects the appropriate summarization strategy + (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count. Args: content: The content to summarize. @@ -352,15 +351,16 @@ async def summarize_content( """ # Import here to avoid circular imports and allow optional dependency - from agent_cli.summarizer import AdaptiveSummarizer # noqa: PLC0415 + from agent_cli.summarizer import SummarizerConfig, summarize # noqa: PLC0415 - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url=openai_base_url, model=model, api_key=api_key, ) - return await summarizer.summarize( + return await summarize( content=content, + config=config, prior_summary=prior_summary, content_type=content_type, ) diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 9c38f731..e27eb83f 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -210,7 +210,7 @@ def persist_hierarchical_summary( collection: ChromaDB collection. memory_root: Root path for memory files. conversation_id: The conversation this summary belongs to. - summary_result: The result from AdaptiveSummarizer.summarize(). + summary_result: The result from summarize(). Returns: List of IDs that were stored. diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index d017dfd4..09210146 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -5,18 +5,24 @@ compression ratios) architectures. 
Example: - from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel + from agent_cli.summarizer import summarize, SummarizerConfig, determine_level - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) - result = await summarizer.summarize(long_document) + result = await summarize(long_document, config) print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") """ -from agent_cli.summarizer.adaptive import AdaptiveSummarizer, SummarizationError +from agent_cli.summarizer.adaptive import ( + SummarizationError, + SummarizerConfig, + determine_level, + summarize, + update_rolling_summary, +) from agent_cli.summarizer.models import ( HierarchicalSummary, SummaryLevel, @@ -24,9 +30,12 @@ ) __all__ = [ - "AdaptiveSummarizer", "HierarchicalSummary", "SummarizationError", + "SummarizerConfig", "SummaryLevel", "SummaryResult", + "determine_level", + "summarize", + "update_rolling_summary", ] diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index e8ff2f9a..38fa865d 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -11,6 +11,7 @@ import asyncio import logging +from dataclasses import dataclass import httpx from pydantic import BaseModel @@ -77,508 +78,512 @@ class SummarizationError(Exception): """Raised when summarization fails after all retries.""" -class AdaptiveSummarizer: - """Adaptive summarization that scales with input complexity. - - Automatically selects the appropriate summarization strategy based on - input length: - - NONE (< 100 tokens): No summary needed - - BRIEF (100-500 tokens): Single sentence - - STANDARD (500-3000 tokens): Paragraph summary - - DETAILED (3000-15000 tokens): Chunked + meta-summary - - HIERARCHICAL (> 15000 tokens): Multi-level tree of summaries +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. Example: - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="llama3.1:8b", ) - result = await summarizer.summarize(long_document) + result = await summarize(long_document, config) print(f"Level: {result.level.name}") - print(f"Summary: {result.summary}") print(f"Compression: {result.compression_ratio:.1%}") """ - def __init__( - self, - openai_base_url: str, - model: str, - api_key: str | None = None, - chunk_size: int = 3000, - chunk_overlap: int = 200, - max_concurrent_chunks: int = 5, - timeout: float = 60.0, - ) -> None: - """Initialize the adaptive summarizer. - - Args: - openai_base_url: Base URL for OpenAI-compatible API. - model: Model name to use for summarization. - api_key: API key (optional for local models). - chunk_size: Target token count per chunk for hierarchical summarization. - chunk_overlap: Token overlap between chunks. - max_concurrent_chunks: Maximum parallel chunk summarizations. - timeout: Request timeout in seconds. - - """ - self.openai_base_url = openai_base_url.rstrip("/") - self.model = model - self.api_key = api_key or "not-needed" - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - self.max_concurrent_chunks = max_concurrent_chunks - self.timeout = timeout - - self._provider = OpenAIProvider(api_key=self.api_key, base_url=self.openai_base_url) - - def determine_level(self, token_count: int) -> SummaryLevel: - """Determine the appropriate summary level based on token count. - - Args: - token_count: Number of tokens in the input. 
- - Returns: - The recommended SummaryLevel. - - """ - if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: - return SummaryLevel.NONE - if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: - return SummaryLevel.BRIEF - if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: - return SummaryLevel.STANDARD - if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: - return SummaryLevel.DETAILED - return SummaryLevel.HIERARCHICAL - - async def summarize( - self, - content: str, - prior_summary: str | None = None, - content_type: str = "general", - ) -> SummaryResult: - """Summarize content with adaptive strategy based on length. - - Args: - content: The content to summarize. - prior_summary: Optional prior summary for context continuity. - content_type: Type of content ("general", "conversation", "journal", "document"). - - Returns: - SummaryResult with summary and metadata. - - """ - if not content or not content.strip(): - return SummaryResult( - level=SummaryLevel.NONE, - summary=None, - hierarchical=None, - input_tokens=0, - output_tokens=0, - compression_ratio=0.0, - ) + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 3000 + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 - input_tokens = count_tokens(content, self.model) - level = self.determine_level(input_tokens) + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" - logger.info( - "Summarizing %d tokens at level %s (type=%s)", - input_tokens, - level.name, - content_type, - ) - if level == SummaryLevel.NONE: - return SummaryResult( - level=level, - summary=None, - hierarchical=None, - input_tokens=input_tokens, - output_tokens=0, - compression_ratio=0.0, - ) +def determine_level(token_count: int) -> SummaryLevel: + """Determine the appropriate summary level based on token count. - if level == SummaryLevel.BRIEF: - summary = await self._brief_summary(content) - elif level == SummaryLevel.STANDARD: - summary = await self._standard_summary(content, prior_summary, content_type) - elif level == SummaryLevel.DETAILED: - return await self._detailed_summary(content, input_tokens) - else: # HIERARCHICAL - return await self._hierarchical_summary(content, input_tokens) + Args: + token_count: Number of tokens in the input. - output_tokens = count_tokens(summary, self.model) if summary else 0 - compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + Returns: + The recommended SummaryLevel. + """ + if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + return SummaryLevel.NONE + if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + return SummaryLevel.BRIEF + if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: + return SummaryLevel.STANDARD + if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: + return SummaryLevel.DETAILED + return SummaryLevel.HIERARCHICAL + + +async def summarize( + content: str, + config: SummarizerConfig, + prior_summary: str | None = None, + content_type: str = "general", +) -> SummaryResult: + """Summarize content with adaptive strategy based on length. + + Args: + content: The content to summarize. + config: Summarizer configuration. + prior_summary: Optional prior summary for context continuity. + content_type: Type of content ("general", "conversation", "journal", "document"). + + Returns: + SummaryResult with summary and metadata. 
+ + """ + if not content or not content.strip(): return SummaryResult( - level=level, - summary=summary, + level=SummaryLevel.NONE, + summary=None, hierarchical=None, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - ) - - async def update_rolling_summary( - self, - prior_summary: str | None, - new_facts: list[str], - ) -> str: - """Update a rolling summary with new facts (Mem0-style). - - This is optimized for incremental updates where you have discrete - new facts to integrate into an existing summary. - - Args: - prior_summary: The existing summary to update. - new_facts: List of new facts to integrate. - - Returns: - Updated summary string. - - """ - if not new_facts: - return prior_summary or "" - - new_content = "\n".join(f"- {fact}" for fact in new_facts) - combined_tokens = count_tokens( - (prior_summary or "") + new_content, - self.model, - ) - - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = ROLLING_SUMMARY_PROMPT.format( - prior_summary=prior_summary or "(No prior summary)", - new_content=new_content, - max_words=max_words, + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, ) - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) - - async def _summarize_single_chunk( - self, - chunk: str, - chunk_index: int, - total_chunks: int, - *, - parent_group: int | None = None, - ) -> ChunkSummary: - """Summarize a single chunk of content. - - Extracted to avoid duplication between _detailed_summary and - _hierarchical_summary methods. Uses middle truncation as a fallback - for oversized content (Letta-style). - - Args: - chunk: The text chunk to summarize. - chunk_index: Index of this chunk (0-based). - total_chunks: Total number of chunks being processed. - parent_group: Optional L2 group index for hierarchical summaries. - - Returns: - ChunkSummary with the summarized content. 
- - """ - # Apply middle truncation if chunk is too large (Letta-style fallback) - source_tokens = count_tokens(chunk, self.model) - content_to_summarize = chunk - if len(chunk) > MAX_CHUNK_CHARS: - content_to_summarize, dropped = middle_truncate( - chunk, - MAX_CHUNK_CHARS, - head_frac=0.3, - tail_frac=0.3, - ) - logger.warning( - "Chunk %d truncated: dropped %d chars to fit context window", - chunk_index, - dropped, - ) + input_tokens = count_tokens(content, config.model) + level = determine_level(input_tokens) - chunk_tokens = count_tokens(content_to_summarize, self.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) + logger.info( + "Summarizing %d tokens at level %s (type=%s)", + input_tokens, + level.name, + content_type, + ) - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=chunk_index + 1, - total_chunks=total_chunks, - content=content_to_summarize, - max_words=max_words, + if level == SummaryLevel.NONE: + return SummaryResult( + level=level, + summary=None, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=0, + compression_ratio=0.0, ) - summary = await self._generate_summary(prompt, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, self.model) + if level == SummaryLevel.BRIEF: + summary = await _brief_summary(content, config) + elif level == SummaryLevel.STANDARD: + summary = await _standard_summary(content, config, prior_summary, content_type) + elif level == SummaryLevel.DETAILED: + return await _detailed_summary(content, input_tokens, config) + else: # HIERARCHICAL + return await _hierarchical_summary(content, input_tokens, config) + + output_tokens = count_tokens(summary, config.model) if summary else 0 + compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 + + return SummaryResult( + level=level, + summary=summary, + hierarchical=None, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=compression_ratio, + ) + + +async def update_rolling_summary( + prior_summary: str | None, + new_facts: list[str], + config: SummarizerConfig, +) -> str: + """Update a rolling summary with new facts (Mem0-style). + + This is optimized for incremental updates where you have discrete + new facts to integrate into an existing summary. + + Args: + prior_summary: The existing summary to update. + new_facts: List of new facts to integrate. + config: Summarizer configuration. + + Returns: + Updated summary string. 
- return ChunkSummary( - chunk_index=chunk_index, - content=summary, - token_count=summary_tokens, - source_tokens=source_tokens, # Report original token count - parent_group=parent_group, - ) - - async def _brief_summary(self, content: str) -> str: - """Generate a single-sentence summary for brief content.""" - prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await self._generate_summary(prompt, max_tokens=50) - - async def _standard_summary( - self, - content: str, - prior_summary: str | None, - content_type: str, - ) -> str: - """Generate a paragraph summary for standard-length content.""" - input_tokens = count_tokens(content, self.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt_template = get_prompt_for_content_type(content_type) - prior_context = format_prior_context(prior_summary) - - prompt = prompt_template.format( - content=content, - prior_context=prior_context, - max_words=max_words, - ) + """ + if not new_facts: + return prior_summary or "" - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) + new_content = "\n".join(f"- {fact}" for fact in new_facts) + combined_tokens = count_tokens( + (prior_summary or "") + new_content, + config.model, + ) - async def _detailed_summary(self, content: str, input_tokens: int) -> SummaryResult: - """Generate chunked summaries with meta-summary for detailed content.""" - chunks = chunk_text( - content, - chunk_size=self.chunk_size, - overlap=self.chunk_overlap, - model=self.model, - ) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) - logger.info("Detailed summary: processing %d chunks", len(chunks)) + prompt = ROLLING_SUMMARY_PROMPT.format( + prior_summary=prior_summary or "(No prior summary)", + new_content=new_content, + max_words=max_words, + ) - # Summarize chunks (with concurrency limit) - semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await self._summarize_single_chunk( - chunk, - idx, - len(chunks), - parent_group=None, - ) - chunk_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) +async def _summarize_single_chunk( + chunk: str, + chunk_index: int, + total_chunks: int, + config: SummarizerConfig, + *, + parent_group: int | None = None, +) -> ChunkSummary: + """Summarize a single chunk of content. - # Generate meta-summary - all_summaries = [cs.content for cs in chunk_summaries] - meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) - max_words = tokens_to_words(meta_target) + Uses middle truncation as a fallback for oversized content (Letta-style). - meta_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(all_summaries), - max_words=max_words, - ) + Args: + chunk: The text chunk to summarize. + chunk_index: Index of this chunk (0-based). + total_chunks: Total number of chunks being processed. + config: Summarizer configuration. + parent_group: Optional L2 group index for hierarchical summaries. - final_summary = await self._generate_summary(meta_prompt, max_tokens=meta_target + 100) - output_tokens = count_tokens(final_summary, self.model) + Returns: + ChunkSummary with the summarized content. 
- hierarchical = HierarchicalSummary( - l1_summaries=list(chunk_summaries), - l2_summaries=[], # Not used for DETAILED level - l3_summary=final_summary, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, + """ + # Apply middle truncation if chunk is too large (Letta-style fallback) + source_tokens = count_tokens(chunk, config.model) + content_to_summarize = chunk + if len(chunk) > MAX_CHUNK_CHARS: + content_to_summarize, dropped = middle_truncate( + chunk, + MAX_CHUNK_CHARS, + head_frac=0.3, + tail_frac=0.3, ) - - return SummaryResult( - level=SummaryLevel.DETAILED, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + logger.warning( + "Chunk %d truncated: dropped %d chars to fit context window", + chunk_index, + dropped, ) - async def _hierarchical_summary(self, content: str, input_tokens: int) -> SummaryResult: - """Build a tree of summaries for very long content. - - Structure: - - L1: Individual chunk summaries - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis - """ - chunks = chunk_text( - content, - chunk_size=self.chunk_size, - overlap=self.chunk_overlap, - model=self.model, - ) + chunk_tokens = count_tokens(content_to_summarize, config.model) + target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=content_to_summarize, + max_words=max_words, + ) + + summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + summary_tokens = count_tokens(summary, config.model) + + return ChunkSummary( + chunk_index=chunk_index, + content=summary, + token_count=summary_tokens, + source_tokens=source_tokens, # Report original token count + parent_group=parent_group, + ) + + +async def _brief_summary(content: str, config: SummarizerConfig) -> str: + """Generate a single-sentence summary for brief content.""" + prompt = BRIEF_SUMMARY_PROMPT.format(content=content) + return await _generate_summary(prompt, config, max_tokens=50) + + +async def _standard_summary( + content: str, + config: SummarizerConfig, + prior_summary: str | None, + content_type: str, +) -> str: + """Generate a paragraph summary for standard-length content.""" + input_tokens = count_tokens(content, config.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) + + prompt = prompt_template.format( + content=content, + prior_context=prior_context, + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _detailed_summary( + content: str, + input_tokens: int, + config: SummarizerConfig, +) -> SummaryResult: + """Generate chunked summaries with meta-summary for detailed content.""" + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Detailed summary: processing %d chunks", len(chunks)) + + # Summarize chunks (with concurrency limit) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + return await 
_summarize_single_chunk( + chunk, + idx, + len(chunks), + config, + parent_group=None, + ) - logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + chunk_summaries = await asyncio.gather( + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # Generate meta-summary + all_summaries = [cs.content for cs in chunk_summaries] + meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) + max_words = tokens_to_words(meta_target) + + meta_prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(all_summaries), + max_words=max_words, + ) + + final_summary = await _generate_summary( + meta_prompt, + config, + max_tokens=meta_target + 100, + ) + output_tokens = count_tokens(final_summary, config.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(chunk_summaries), + l2_summaries=[], # Not used for DETAILED level + l3_summary=final_summary, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.DETAILED, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + +async def _hierarchical_summary( + content: str, + input_tokens: int, + config: SummarizerConfig, +) -> SummaryResult: + """Build a tree of summaries for very long content. + + Structure: + - L1: Individual chunk summaries + - L2: Group summaries (groups of ~5 L1 summaries) + - L3: Final synthesis + """ + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) + + # L1: Summarize each chunk + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + # Assign to L2 group (L2_GROUP_SIZE chunks per group) + group_idx = idx // L2_GROUP_SIZE + return await _summarize_single_chunk( + chunk, + idx, + len(chunks), + config, + parent_group=group_idx, + ) - # L1: Summarize each chunk - semaphore = asyncio.Semaphore(self.max_concurrent_chunks) + l1_summaries = await asyncio.gather( + *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], + ) + + # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) + l2_summaries: list[str] = [] + if len(l1_summaries) > L2_MIN_CHUNKS: + groups: list[list[str]] = [] + for i in range(0, len(l1_summaries), L2_GROUP_SIZE): + group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] + groups.append(group) + + async def summarize_group(group: list[str]) -> str: + combined_tokens = sum(count_tokens(s, config.model) for s in group) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(group), + max_words=max_words, + ) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) + + # L3: Final synthesis + summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] + final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) + max_words = tokens_to_words(final_target) + + final_prompt = META_SUMMARY_PROMPT.format( + 
summaries=format_summaries_for_meta(summaries_to_synthesize), + max_words=max_words, + ) + + final_summary = await _generate_summary( + final_prompt, + config, + max_tokens=final_target + 100, + ) + output_tokens = count_tokens(final_summary, config.model) + + hierarchical = HierarchicalSummary( + l1_summaries=list(l1_summaries), + l2_summaries=list(l2_summaries), + l3_summary=final_summary, + chunk_size=config.chunk_size, + chunk_overlap=config.chunk_overlap, + ) + + return SummaryResult( + level=SummaryLevel.HIERARCHICAL, + summary=final_summary, + hierarchical=hierarchical, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) + + +async def _generate_summary( + prompt: str, + config: SummarizerConfig, + max_tokens: int = 256, + *, + attempt: int = 0, +) -> str: + """Generate a summary using the LLM. + + Uses PydanticAI for structured output with fallback to raw generation. + Implements exponential backoff retry on failures. + + Args: + prompt: The prompt to send to the LLM. + config: Summarizer configuration. + max_tokens: Maximum tokens for the response. + attempt: Current retry attempt (for internal recursion). + + Returns: + The generated summary text. + + Raises: + SummarizationError: If all retries are exhausted. - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - # Assign to L2 group (L2_GROUP_SIZE chunks per group) - group_idx = idx // L2_GROUP_SIZE - return await self._summarize_single_chunk( - chunk, - idx, - len(chunks), - parent_group=group_idx, + """ + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. 
Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + logger.warning("Structured summary failed, trying raw generation: %s", e) + # Fallback to raw HTTP call + try: + return await _raw_generate(prompt, config, max_tokens) + except Exception as raw_err: + if attempt < MAX_SUMMARIZE_RETRIES: + wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds + logger.warning( + "Raw generation failed (attempt %d/%d), retrying in %ds: %s", + attempt + 1, + MAX_SUMMARIZE_RETRIES, + wait_time, + raw_err, ) - - l1_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) - - # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) - l2_summaries: list[str] = [] - if len(l1_summaries) > L2_MIN_CHUNKS: - groups: list[list[str]] = [] - for i in range(0, len(l1_summaries), L2_GROUP_SIZE): - group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] - groups.append(group) - - async def summarize_group(group: list[str]) -> str: - combined_tokens = sum(count_tokens(s, self.model) for s in group) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(group), - max_words=max_words, + await asyncio.sleep(wait_time) + return await _generate_summary( + prompt, + config, + max_tokens, + attempt=attempt + 1, ) - return await self._generate_summary(prompt, max_tokens=target_tokens + 50) - - l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) - - # L3: Final synthesis - summaries_to_synthesize = ( - l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] - ) - final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) - max_words = tokens_to_words(final_target) - - final_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(summaries_to_synthesize), - max_words=max_words, + msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" + raise SummarizationError(msg) from raw_err + + +async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str: + """Fallback raw HTTP generation without structured output.""" + async with httpx.AsyncClient(timeout=config.timeout) as client: + response = await client.post( + f"{config.openai_base_url}/chat/completions", + headers={"Authorization": f"Bearer {config.api_key}"}, + json={ + "model": config.model, + "messages": [ + {"role": "system", "content": "You are a concise summarizer."}, + {"role": "user", "content": prompt}, + ], + "temperature": 0.3, + "max_tokens": max_tokens, + }, ) + response.raise_for_status() + data = response.json() - final_summary = await self._generate_summary(final_prompt, max_tokens=final_target + 100) - output_tokens = count_tokens(final_summary, self.model) - - hierarchical = HierarchicalSummary( - l1_summaries=list(l1_summaries), - l2_summaries=list(l2_summaries), - l3_summary=final_summary, - chunk_size=self.chunk_size, - chunk_overlap=self.chunk_overlap, - ) - - return SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - ) - - async def _generate_summary( - self, - prompt: str, - max_tokens: int = 256, - 
*, - attempt: int = 0, - ) -> str: - """Generate a summary using the LLM. - - Uses PydanticAI for structured output with fallback to raw generation. - Implements exponential backoff retry on failures. - - Args: - prompt: The prompt to send to the LLM. - max_tokens: Maximum tokens for the response. - attempt: Current retry attempt (for internal recursion). - - Returns: - The generated summary text. - - Raises: - SummarizationError: If all retries are exhausted. - - """ - model = OpenAIChatModel( - model_name=self.model, - provider=self._provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - logger.warning("Structured summary failed, trying raw generation: %s", e) - # Fallback to raw HTTP call - try: - return await self._raw_generate(prompt, max_tokens) - except Exception as raw_err: - if attempt < MAX_SUMMARIZE_RETRIES: - wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds - logger.warning( - "Raw generation failed (attempt %d/%d), retrying in %ds: %s", - attempt + 1, - MAX_SUMMARIZE_RETRIES, - wait_time, - raw_err, - ) - await asyncio.sleep(wait_time) - return await self._generate_summary( - prompt, - max_tokens, - attempt=attempt + 1, - ) - msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" - raise SummarizationError(msg) from raw_err - - async def _raw_generate(self, prompt: str, max_tokens: int) -> str: - """Fallback raw HTTP generation without structured output.""" - async with httpx.AsyncClient(timeout=self.timeout) as client: - response = await client.post( - f"{self.openai_base_url}/chat/completions", - headers={"Authorization": f"Bearer {self.api_key}"}, - json={ - "model": self.model, - "messages": [ - {"role": "system", "content": "You are a concise summarizer."}, - {"role": "user", "content": prompt}, - ], - "temperature": 0.3, - "max_tokens": max_tokens, - }, - ) - response.raise_for_status() - data = response.json() - - choices = data.get("choices", []) - if choices: - return choices[0].get("message", {}).get("content", "").strip() - return "" + choices = data.get("choices", []) + if choices: + return choices[0].get("message", {}).get("content", "").strip() + return "" diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 1f010999..f5db1486 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -1,4 +1,4 @@ -"""Unit tests for AdaptiveSummarizer.""" +"""Unit tests for adaptive summarization functions.""" from __future__ import annotations @@ -8,37 +8,42 @@ from agent_cli.summarizer.adaptive import ( LEVEL_THRESHOLDS, - AdaptiveSummarizer, + SummarizerConfig, SummaryOutput, + _generate_summary, + _raw_generate, + determine_level, + summarize, + update_rolling_summary, ) from agent_cli.summarizer.models import SummaryLevel, SummaryResult -class TestAdaptiveSummarizerInit: - """Tests for AdaptiveSummarizer initialization.""" +class TestSummarizerConfig: + """Tests for SummarizerConfig initialization.""" def test_basic_init(self) -> None: """Test basic initialization with required parameters.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="llama3.1:8b", ) - assert summarizer.openai_base_url == 
"http://localhost:8000/v1" - assert summarizer.model == "llama3.1:8b" - assert summarizer.api_key == "not-needed" + assert config.openai_base_url == "http://localhost:8000/v1" + assert config.model == "llama3.1:8b" + assert config.api_key == "not-needed" def test_init_with_api_key(self) -> None: """Test initialization with custom API key.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", api_key="sk-test-key", ) - assert summarizer.api_key == "sk-test-key" + assert config.api_key == "sk-test-key" def test_init_with_custom_settings(self) -> None: """Test initialization with custom chunk settings.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", chunk_size=5000, @@ -46,59 +51,51 @@ def test_init_with_custom_settings(self) -> None: max_concurrent_chunks=10, timeout=120.0, ) - assert summarizer.chunk_size == 5000 - assert summarizer.chunk_overlap == 300 - assert summarizer.max_concurrent_chunks == 10 - assert summarizer.timeout == 120.0 + assert config.chunk_size == 5000 + assert config.chunk_overlap == 300 + assert config.max_concurrent_chunks == 10 + assert config.timeout == 120.0 def test_trailing_slash_stripped(self) -> None: """Test that trailing slash is stripped from base URL.""" - summarizer = AdaptiveSummarizer( + config = SummarizerConfig( openai_base_url="http://localhost:8000/v1/", model="gpt-4", ) - assert summarizer.openai_base_url == "http://localhost:8000/v1" + assert config.openai_base_url == "http://localhost:8000/v1" class TestDetermineLevel: """Tests for level determination based on token count.""" - @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - def test_none_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_none_level_threshold(self) -> None: """Test NONE level for very short content.""" - assert summarizer.determine_level(50) == SummaryLevel.NONE - assert summarizer.determine_level(99) == SummaryLevel.NONE + assert determine_level(50) == SummaryLevel.NONE + assert determine_level(99) == SummaryLevel.NONE - def test_brief_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_brief_level_threshold(self) -> None: """Test BRIEF level for short content.""" - assert summarizer.determine_level(100) == SummaryLevel.BRIEF - assert summarizer.determine_level(300) == SummaryLevel.BRIEF - assert summarizer.determine_level(499) == SummaryLevel.BRIEF + assert determine_level(100) == SummaryLevel.BRIEF + assert determine_level(300) == SummaryLevel.BRIEF + assert determine_level(499) == SummaryLevel.BRIEF - def test_standard_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_standard_level_threshold(self) -> None: """Test STANDARD level for medium content.""" - assert summarizer.determine_level(500) == SummaryLevel.STANDARD - assert summarizer.determine_level(1500) == SummaryLevel.STANDARD - assert summarizer.determine_level(2999) == SummaryLevel.STANDARD + assert determine_level(500) == SummaryLevel.STANDARD + assert determine_level(1500) == SummaryLevel.STANDARD + assert determine_level(2999) == SummaryLevel.STANDARD - def test_detailed_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_detailed_level_threshold(self) -> None: """Test DETAILED level for longer content.""" - assert 
summarizer.determine_level(3000) == SummaryLevel.DETAILED - assert summarizer.determine_level(8000) == SummaryLevel.DETAILED - assert summarizer.determine_level(14999) == SummaryLevel.DETAILED + assert determine_level(3000) == SummaryLevel.DETAILED + assert determine_level(8000) == SummaryLevel.DETAILED + assert determine_level(14999) == SummaryLevel.DETAILED - def test_hierarchical_level_threshold(self, summarizer: AdaptiveSummarizer) -> None: + def test_hierarchical_level_threshold(self) -> None: """Test HIERARCHICAL level for very long content.""" - assert summarizer.determine_level(15000) == SummaryLevel.HIERARCHICAL - assert summarizer.determine_level(50000) == SummaryLevel.HIERARCHICAL - assert summarizer.determine_level(100000) == SummaryLevel.HIERARCHICAL + assert determine_level(15000) == SummaryLevel.HIERARCHICAL + assert determine_level(50000) == SummaryLevel.HIERARCHICAL + assert determine_level(100000) == SummaryLevel.HIERARCHICAL def test_thresholds_match_constants(self) -> None: """Verify thresholds match the module constants.""" @@ -109,46 +106,55 @@ def test_thresholds_match_constants(self) -> None: class TestSummarize: - """Tests for main summarize method.""" + """Tests for main summarize function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_empty_content_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_content_returns_none_level( + self, + config: SummarizerConfig, + ) -> None: """Test that empty content returns NONE level result.""" - result = await summarizer.summarize("") + result = await summarize("", config) assert result.level == SummaryLevel.NONE assert result.summary is None assert result.input_tokens == 0 assert result.output_tokens == 0 @pytest.mark.asyncio - async def test_whitespace_only_returns_none_level(self, summarizer: AdaptiveSummarizer) -> None: + async def test_whitespace_only_returns_none_level( + self, + config: SummarizerConfig, + ) -> None: """Test that whitespace-only content returns NONE level result.""" - result = await summarizer.summarize(" \n\n ") + result = await summarize(" \n\n ", config) assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - async def test_very_short_content_no_summary(self, summarizer: AdaptiveSummarizer) -> None: + async def test_very_short_content_no_summary( + self, + config: SummarizerConfig, + ) -> None: """Test that very short content gets NONE level (no summary).""" # Less than 100 tokens - result = await summarizer.summarize("Hello world") + result = await summarize("Hello world", config) assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_brief_summary") + @patch("agent_cli.summarizer.adaptive._brief_summary") async def test_brief_level_calls_brief_summary( self, mock_brief: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that BRIEF level content calls _brief_summary.""" mock_brief.return_value = "Brief summary." @@ -156,18 +162,18 @@ async def test_brief_level_calls_brief_summary( # Create content that's ~100-500 tokens content = "This is a test sentence. 
" * 30 # ~150 tokens - result = await summarizer.summarize(content) + result = await summarize(content, config) - mock_brief.assert_called_once_with(content) + mock_brief.assert_called_once_with(content, config) assert result.level == SummaryLevel.BRIEF assert result.summary == "Brief summary." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_standard_summary") + @patch("agent_cli.summarizer.adaptive._standard_summary") async def test_standard_level_calls_standard_summary( self, mock_standard: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that STANDARD level content calls _standard_summary.""" mock_standard.return_value = "Standard summary paragraph." @@ -175,18 +181,18 @@ async def test_standard_level_calls_standard_summary( # Create content that's ~500-3000 tokens content = "This is a test sentence with more words. " * 100 # ~800 tokens - result = await summarizer.summarize(content, content_type="general") + result = await summarize(content, config, content_type="general") - mock_standard.assert_called_once_with(content, None, "general") + mock_standard.assert_called_once_with(content, config, None, "general") assert result.level == SummaryLevel.STANDARD assert result.summary == "Standard summary paragraph." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_standard_summary") + @patch("agent_cli.summarizer.adaptive._standard_summary") async def test_prior_summary_passed_to_standard( self, mock_standard: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that prior_summary is passed to _standard_summary.""" mock_standard.return_value = "Updated summary." @@ -194,16 +200,16 @@ async def test_prior_summary_passed_to_standard( content = "This is a test sentence with more words. " * 100 prior = "Previous context summary." 
- await summarizer.summarize(content, prior_summary=prior) + await summarize(content, config, prior_summary=prior) - mock_standard.assert_called_once_with(content, prior, "general") + mock_standard.assert_called_once_with(content, config, prior, "general") @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_detailed_summary") + @patch("agent_cli.summarizer.adaptive._detailed_summary") async def test_detailed_level_calls_detailed_summary( self, mock_detailed: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that DETAILED level content calls _detailed_summary.""" mock_result = SummaryResult( @@ -219,17 +225,17 @@ async def test_detailed_level_calls_detailed_summary( # Create content that's ~3000-15000 tokens content = "Word " * 5000 # ~5000 tokens - result = await summarizer.summarize(content) + result = await summarize(content, config) assert mock_detailed.called assert result.level == SummaryLevel.DETAILED @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_hierarchical_summary") + @patch("agent_cli.summarizer.adaptive._hierarchical_summary") async def test_hierarchical_level_calls_hierarchical_summary( self, mock_hierarchical: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that HIERARCHICAL level content calls _hierarchical_summary.""" mock_result = SummaryResult( @@ -245,7 +251,7 @@ async def test_hierarchical_level_calls_hierarchical_summary( # Create content that's > 15000 tokens content = "Word " * 20000 - result = await summarizer.summarize(content) + result = await summarize(content, config) assert mock_hierarchical.called assert result.level == SummaryLevel.HIERARCHICAL @@ -255,62 +261,69 @@ class TestUpdateRollingSummary: """Tests for rolling summary updates.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_empty_facts_returns_prior(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None: """Test that empty facts list returns prior summary.""" - result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary="Existing summary", new_facts=[], + config=config, ) assert result == "Existing summary" @pytest.mark.asyncio - async def test_empty_facts_no_prior_returns_empty(self, summarizer: AdaptiveSummarizer) -> None: + async def test_empty_facts_no_prior_returns_empty( + self, + config: SummarizerConfig, + ) -> None: """Test that empty facts with no prior returns empty string.""" - result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary=None, new_facts=[], + config=config, ) assert result == "" @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_generate_summary") + @patch("agent_cli.summarizer.adaptive._generate_summary") async def test_new_facts_calls_generate( self, mock_generate: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that new facts trigger summary generation.""" mock_generate.return_value = "Updated summary with new facts." 
- result = await summarizer.update_rolling_summary( + result = await update_rolling_summary( prior_summary="Old summary", new_facts=["User likes coffee", "User lives in Amsterdam"], + config=config, ) mock_generate.assert_called_once() assert result == "Updated summary with new facts." @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_generate_summary") + @patch("agent_cli.summarizer.adaptive._generate_summary") async def test_facts_formatted_as_list( self, mock_generate: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test that facts are formatted as bullet list in prompt.""" mock_generate.return_value = "Summary" - await summarizer.update_rolling_summary( + await update_rolling_summary( prior_summary="Prior", new_facts=["Fact one", "Fact two"], + config=config, ) # Check the prompt contains formatted facts @@ -321,12 +334,12 @@ async def test_facts_formatted_as_list( class TestGenerateSummary: - """Tests for _generate_summary method.""" + """Tests for _generate_summary function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @@ -334,7 +347,7 @@ def summarizer(self) -> AdaptiveSummarizer: @pytest.mark.asyncio async def test_generate_summary_with_pydantic_ai( self, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test summary generation using PydanticAI agent.""" # Mock the entire agent creation and run @@ -346,17 +359,17 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent - result = await summarizer._generate_summary("Test prompt", max_tokens=100) + result = await _generate_summary("Test prompt", config, max_tokens=100) assert result == "Generated summary." 
mock_agent.run.assert_called_once_with("Test prompt") @pytest.mark.asyncio - @patch.object(AdaptiveSummarizer, "_raw_generate") + @patch("agent_cli.summarizer.adaptive._raw_generate") async def test_fallback_to_raw_generate_on_error( self, mock_raw: AsyncMock, - summarizer: AdaptiveSummarizer, + config: SummarizerConfig, ) -> None: """Test fallback to raw HTTP on PydanticAI error.""" mock_raw.return_value = "Fallback summary" @@ -366,25 +379,25 @@ async def test_fallback_to_raw_generate_on_error( mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent - result = await summarizer._generate_summary("Test prompt", max_tokens=100) + result = await _generate_summary("Test prompt", config, max_tokens=100) - mock_raw.assert_called_once_with("Test prompt", 100) + mock_raw.assert_called_once_with("Test prompt", config, 100) assert result == "Fallback summary" class TestRawGenerate: - """Tests for _raw_generate fallback method.""" + """Tests for _raw_generate fallback function.""" @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create a summarizer instance.""" - return AdaptiveSummarizer( + def config(self) -> SummarizerConfig: + """Create a config instance.""" + return SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) @pytest.mark.asyncio - async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> None: + async def test_raw_generate_success(self, config: SummarizerConfig) -> None: """Test successful raw HTTP generation.""" mock_response = MagicMock() mock_response.json.return_value = { @@ -398,12 +411,12 @@ async def test_raw_generate_success(self, summarizer: AdaptiveSummarizer) -> Non mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = await summarizer._raw_generate("Test prompt", max_tokens=100) + result = await _raw_generate("Test prompt", config, max_tokens=100) assert result == "Raw generated summary" @pytest.mark.asyncio - async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) -> None: + async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None: """Test raw generate with empty choices returns empty string.""" mock_response = MagicMock() mock_response.json.return_value = {"choices": []} @@ -415,7 +428,7 @@ async def test_raw_generate_empty_choices(self, summarizer: AdaptiveSummarizer) mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = await summarizer._raw_generate("Test prompt", max_tokens=100) + result = await _raw_generate("Test prompt", config, max_tokens=100) assert result == "" diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 381f9f5b..e58a20f6 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -14,7 +14,7 @@ get_summary_at_level, upsert_hierarchical_summary, ) -from agent_cli.summarizer import AdaptiveSummarizer, SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary if TYPE_CHECKING: @@ -401,40 +401,32 @@ def test_persist_deletes_old_summaries( assert deleted_dir.exists() -class TestAdaptiveSummarizerLevelDetermination: - """Test that AdaptiveSummarizer correctly determines summary levels.""" +class TestDetermineLevelFunction: + """Test that determine_level correctly determines summary levels.""" 
- @pytest.fixture - def summarizer(self) -> AdaptiveSummarizer: - """Create an AdaptiveSummarizer instance.""" - return AdaptiveSummarizer( - openai_base_url="http://localhost:8000/v1", - model="test-model", - ) - - def test_very_short_content_is_none(self, summarizer: AdaptiveSummarizer) -> None: + def test_very_short_content_is_none(self) -> None: """Test that content under 100 tokens gets NONE level.""" - level = summarizer.determine_level(50) + level = determine_level(50) assert level == SummaryLevel.NONE - def test_short_content_is_brief(self, summarizer: AdaptiveSummarizer) -> None: + def test_short_content_is_brief(self) -> None: """Test that 100-500 token content gets BRIEF level.""" - level = summarizer.determine_level(300) + level = determine_level(300) assert level == SummaryLevel.BRIEF - def test_medium_content_is_standard(self, summarizer: AdaptiveSummarizer) -> None: + def test_medium_content_is_standard(self) -> None: """Test that 500-3000 token content gets STANDARD level.""" - level = summarizer.determine_level(1500) + level = determine_level(1500) assert level == SummaryLevel.STANDARD - def test_long_content_is_detailed(self, summarizer: AdaptiveSummarizer) -> None: + def test_long_content_is_detailed(self) -> None: """Test that 3000-15000 token content gets DETAILED level.""" - level = summarizer.determine_level(8000) + level = determine_level(8000) assert level == SummaryLevel.DETAILED - def test_very_long_content_is_hierarchical(self, summarizer: AdaptiveSummarizer) -> None: + def test_very_long_content_is_hierarchical(self) -> None: """Test that content over 15000 tokens gets HIERARCHICAL level.""" - level = summarizer.determine_level(25000) + level = determine_level(25000) assert level == SummaryLevel.HIERARCHICAL @@ -444,7 +436,8 @@ class TestSummarizeContentFunction: @pytest.mark.asyncio async def test_summarize_content_creates_result(self) -> None: """Test that summarize_content returns a valid SummaryResult.""" - with patch.object(AdaptiveSummarizer, "summarize") as mock_summarize: + # Patch at source since _ingest imports inside the function + with patch("agent_cli.summarizer.summarize") as mock_summarize: mock_result = SummaryResult( level=SummaryLevel.STANDARD, summary="Mocked summary.", From cd4378e284ff3c6d95f6fe280ea58c9cda39caf8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:21:59 -0800 Subject: [PATCH 10/38] refactor(summarizer): make internal modules private and simplify public API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename prompts.py → _prompts.py and utils.py → _utils.py - Reduce public API to 6 essential exports: SummarizerConfig, summarize, SummaryResult, SummaryLevel, HierarchicalSummary, SummarizationError - Remove determine_level, update_rolling_summary, count_tokens from public API - Update imports in adaptive.py and test files --- agent_cli/summarizer/__init__.py | 20 ++++--------------- .../summarizer/{prompts.py => _prompts.py} | 0 agent_cli/summarizer/{utils.py => _utils.py} | 0 agent_cli/summarizer/adaptive.py | 16 +++++++-------- tests/summarizer/test_integration.py | 3 ++- tests/summarizer/test_prompts.py | 2 +- tests/summarizer/test_utils.py | 2 +- 7 files changed, 16 insertions(+), 27 deletions(-) rename agent_cli/summarizer/{prompts.py => _prompts.py} (100%) rename agent_cli/summarizer/{utils.py => _utils.py} (100%) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index 09210146..fc0994c4 100644 --- 
a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -5,29 +5,19 @@ compression ratios) architectures. Example: - from agent_cli.summarizer import summarize, SummarizerConfig, determine_level + from agent_cli.summarizer import summarize, SummarizerConfig config = SummarizerConfig( openai_base_url="http://localhost:8000/v1", model="gpt-4", ) result = await summarize(long_document, config) - print(f"Level: {result.level}, Compression: {result.compression_ratio:.1%}") + print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}") """ -from agent_cli.summarizer.adaptive import ( - SummarizationError, - SummarizerConfig, - determine_level, - summarize, - update_rolling_summary, -) -from agent_cli.summarizer.models import ( - HierarchicalSummary, - SummaryLevel, - SummaryResult, -) +from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize +from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult __all__ = [ "HierarchicalSummary", @@ -35,7 +25,5 @@ "SummarizerConfig", "SummaryLevel", "SummaryResult", - "determine_level", "summarize", - "update_rolling_summary", ] diff --git a/agent_cli/summarizer/prompts.py b/agent_cli/summarizer/_prompts.py similarity index 100% rename from agent_cli/summarizer/prompts.py rename to agent_cli/summarizer/_prompts.py diff --git a/agent_cli/summarizer/utils.py b/agent_cli/summarizer/_utils.py similarity index 100% rename from agent_cli/summarizer/utils.py rename to agent_cli/summarizer/_utils.py diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 38fa865d..590dabc5 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -20,13 +20,7 @@ from pydantic_ai.providers.openai import OpenAIProvider from pydantic_ai.settings import ModelSettings -from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, - SummaryLevel, - SummaryResult, -) -from agent_cli.summarizer.prompts import ( +from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT, @@ -35,13 +29,19 @@ format_summaries_for_meta, get_prompt_for_content_type, ) -from agent_cli.summarizer.utils import ( +from agent_cli.summarizer._utils import ( chunk_text, count_tokens, estimate_summary_tokens, middle_truncate, tokens_to_words, ) +from agent_cli.summarizer.models import ( + ChunkSummary, + HierarchicalSummary, + SummaryLevel, + SummaryResult, +) logger = logging.getLogger(__name__) diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index e58a20f6..6eeb133e 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -14,7 +14,8 @@ get_summary_at_level, upsert_hierarchical_summary, ) -from agent_cli.summarizer import SummaryLevel, SummaryResult, determine_level +from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import determine_level from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary if TYPE_CHECKING: diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index e126def2..05937f71 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -2,7 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.prompts import ( +from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, diff --git 
a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 458e9b37..22eb4039 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -2,7 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.utils import ( +from agent_cli.summarizer._utils import ( chunk_text, count_tokens, estimate_summary_tokens, From 023a714c0b70625ad51b32f43a5195e0727c5ef3 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:39:01 -0800 Subject: [PATCH 11/38] refactor(memory): wire AdaptiveSummarizer into memory pipeline Replace the old rolling summary system with the new hierarchical adaptive summarizer. This simplifies the codebase by removing redundant code paths and using a single, research-backed approach. Changes: - Update extract_and_store_facts_and_summaries() to use summarize_content() and store_adaptive_summary() instead of update_summary()/persist_summary() - Remove old summary functions: update_summary, persist_summary, get_summary_entry - Remove Summary entity and SummaryOutput model (unused) - Add summary_level to L3 metadata for consistency - Update tests to mock new summarizer interface The new system automatically selects summarization level (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on content complexity, storing summaries in a L1/L2/L3 hierarchical structure. --- agent_cli/memory/_ingest.py | 74 ++++++---------------------- agent_cli/memory/_persistence.py | 27 +--------- agent_cli/memory/_retrieval.py | 5 +- agent_cli/memory/_store.py | 25 ---------- agent_cli/memory/entities.py | 9 ---- agent_cli/memory/models.py | 16 +----- agent_cli/summarizer/models.py | 1 + tests/memory/test_engine.py | 44 ++++++++++++----- tests/memory/test_git_integration.py | 14 ++++-- tests/memory/test_store.py | 17 ------- 10 files changed, 63 insertions(+), 169 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index d4701548..1c784d9e 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -21,24 +21,21 @@ delete_memory_files, persist_entries, persist_hierarchical_summary, - persist_summary, ) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, FACT_SYSTEM_PROMPT, - SUMMARY_PROMPT, UPDATE_MEMORY_PROMPT, ) from agent_cli.memory._retrieval import gather_relevant_existing_memories -from agent_cli.memory._store import delete_entries, get_summary_entry -from agent_cli.memory.entities import Fact, Summary +from agent_cli.memory._store import delete_entries, get_final_summary +from agent_cli.memory.entities import Fact from agent_cli.memory.models import ( MemoryAdd, MemoryDecision, MemoryDelete, MemoryIgnore, MemoryUpdate, - SummaryOutput, ) if TYPE_CHECKING: @@ -50,8 +47,6 @@ LOGGER = logging.getLogger(__name__) -_SUMMARY_ROLE = "summary" - def _elapsed_ms(start: float) -> float: """Return elapsed milliseconds since start.""" @@ -290,40 +285,6 @@ def validate_decisions(decisions: list[MemoryDecision]) -> list[MemoryDecision]: return to_add, to_delete, replacement_map -async def update_summary( - *, - prior_summary: str | None, - new_facts: list[str], - openai_base_url: str, - api_key: str | None, - model: str, - max_tokens: int = 256, -) -> str | None: - """Update the conversation summary based on new facts. - - This is the simple Mem0-style rolling summary that incrementally - updates based on new facts. For full content adaptive summarization, - use `summarize_content` instead. 
- """ - if not new_facts: - return prior_summary - system_prompt = SUMMARY_PROMPT - user_parts: list[str] = [] - if prior_summary: - user_parts.append(f"Previous summary:\n{prior_summary}") - user_parts.append("New facts:\n" + "\n".join(f"- {fact}" for fact in new_facts)) - prompt_text = "\n\n".join(user_parts) - provider = OpenAIProvider(api_key=api_key or "dummy", base_url=openai_base_url) - model_cfg = OpenAIChatModel( - model_name=model, - provider=provider, - settings=ModelSettings(temperature=0.2, max_tokens=max_tokens), - ) - agent = Agent(model=model_cfg, system_prompt=system_prompt, output_type=SummaryOutput) - result = await agent.run(prompt_text) - return result.output.summary or prior_summary - - async def summarize_content( *, content: str, @@ -460,37 +421,34 @@ async def extract_and_store_facts_and_summaries( entries=list(to_add), ) - if enable_summarization: - prior_summary_entry = get_summary_entry( - collection, - conversation_id, - role=_SUMMARY_ROLE, - ) + if enable_summarization and facts: + # Get prior summary for context continuity + prior_summary_entry = get_final_summary(collection, conversation_id) prior_summary = prior_summary_entry.content if prior_summary_entry else None + # Summarize the new facts + content_to_summarize = "\n".join(facts) summary_start = perf_counter() - new_summary = await update_summary( + summary_result = await summarize_content( + content=content_to_summarize, prior_summary=prior_summary, - new_facts=facts, + content_type="conversation", openai_base_url=openai_base_url, api_key=api_key, model=model, ) LOGGER.info( - "Summary update completed in %.1f ms (conversation=%s)", + "Summary update completed in %.1f ms (conversation=%s, level=%s)", _elapsed_ms(summary_start), conversation_id, + summary_result.level.name, ) - if new_summary: - summary_obj = Summary( - conversation_id=conversation_id, - content=new_summary, - created_at=datetime.now(UTC), - ) - persist_summary( + if summary_result.summary: + await store_adaptive_summary( collection, memory_root=memory_root, - summary=summary_obj, + conversation_id=conversation_id, + summary_result=summary_result, ) if enable_git_versioning: diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index e27eb83f..91585ade 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -24,7 +24,7 @@ upsert_hierarchical_summary, upsert_memories, ) -from agent_cli.memory.entities import Fact, Summary, Turn +from agent_cli.memory.entities import Fact, Turn if TYPE_CHECKING: from pathlib import Path @@ -89,31 +89,6 @@ def persist_entries( upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas) -def persist_summary( - collection: Collection, - *, - memory_root: Path, - summary: Summary, -) -> None: - """Persist a summary to disk and Chroma.""" - doc_id = _safe_identifier(f"{summary.conversation_id}{_SUMMARY_DOC_ID_SUFFIX}-summary") - record = write_memory_file( - memory_root, - conversation_id=summary.conversation_id, - role="summary", - created_at=summary.created_at.isoformat(), - content=summary.content, - summary_kind="summary", - doc_id=doc_id, - ) - upsert_memories( - collection, - ids=[record.id], - contents=[record.content], - metadatas=[record.metadata], - ) - - def delete_memory_files( memory_root: Path, conversation_id: str, diff --git a/agent_cli/memory/_retrieval.py b/agent_cli/memory/_retrieval.py index 6091f109..82c7296f 100644 --- a/agent_cli/memory/_retrieval.py +++ b/agent_cli/memory/_retrieval.py @@ -7,7 +7,7 @@ from 
datetime import UTC, datetime from typing import TYPE_CHECKING, Any -from agent_cli.memory._store import get_summary_entry, query_memories +from agent_cli.memory._store import get_final_summary, query_memories from agent_cli.memory.models import ( ChatRequest, MemoryEntry, @@ -24,7 +24,6 @@ LOGGER = logging.getLogger(__name__) _DEFAULT_MMR_LAMBDA = 0.7 -_SUMMARY_ROLE = "summary" def gather_relevant_existing_memories( @@ -202,7 +201,7 @@ def recency_score(meta: MemoryMetadata) -> float: summaries: list[str] = [] if include_summary: - summary_entry = get_summary_entry(collection, conversation_id, role=_SUMMARY_ROLE) + summary_entry = get_final_summary(collection, conversation_id) if summary_entry: summaries.append(f"Conversation summary:\n{summary_entry.content}") diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 4f3755b1..722dcda9 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -111,31 +111,6 @@ def query_memories( return records -def get_summary_entry( - collection: Collection, - conversation_id: str, - *, - role: str = "summary", -) -> StoredMemory | None: - """Return the latest summary entry for a conversation, if present.""" - result = collection.get( - where={"$and": [{"conversation_id": conversation_id}, {"role": role}]}, - ) - docs = result.get("documents") or [] - metas = result.get("metadatas") or [] - ids = result.get("ids") or [] - - if not docs or not metas or not ids: - return None - - return StoredMemory( - id=ids[0], - content=docs[0], - metadata=MemoryMetadata(**dict(metas[0])), - distance=None, - ) - - def list_conversation_entries( collection: Collection, conversation_id: str, diff --git a/agent_cli/memory/entities.py b/agent_cli/memory/entities.py index 70b16a78..a352b0bb 100644 --- a/agent_cli/memory/entities.py +++ b/agent_cli/memory/entities.py @@ -32,12 +32,3 @@ class Fact(BaseModel): source_id: str = Field(..., description="UUID of the Turn this fact was extracted from") created_at: datetime # Facts are always role="memory" implicitly in the storage layer - - -class Summary(BaseModel): - """The rolling summary of a conversation.""" - - conversation_id: str - content: str - created_at: datetime - # Summaries are role="summary" implicitly diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 6dc689d8..4eb289c7 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -4,7 +4,7 @@ from typing import Literal -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import BaseModel, ConfigDict class Message(BaseModel): @@ -70,20 +70,6 @@ class MemoryMetadata(BaseModel): """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" -class SummaryOutput(BaseModel): - """Structured summary returned by the LLM.""" - - summary: str - - @field_validator("summary") - @classmethod - def _not_empty(cls, v: str) -> str: - if not v or not str(v).strip(): - msg = "field must be non-empty" - raise ValueError(msg) - return str(v).strip() - - class StoredMemory(BaseModel): """Memory document as stored in the vector DB.""" diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index de9bc609..843d1dfe 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -190,6 +190,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, + "summary_level": self.level.name, "input_tokens": self.input_tokens, "output_tokens": 
self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 8dae1163..f386e44d 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -21,8 +21,8 @@ MemoryMetadata, Message, StoredMemory, - SummaryOutput, ) +from agent_cli.summarizer import SummaryLevel, SummaryResult class _DummyReranker: @@ -239,13 +239,13 @@ def fake_query_memories( ) monkeypatch.setattr( _retrieval, - "get_summary_entry", - lambda _collection, _cid, role: StoredMemory( # type: ignore[return-value] - id=f"{role}-id", - content=f"{role} content", + "get_final_summary", + lambda _collection, _cid: StoredMemory( + id="summary-id", + content="summary content", metadata=MemoryMetadata( conversation_id="conv1", - role=role, + role="summary", created_at=now.isoformat(), ), ), @@ -334,11 +334,19 @@ def __init__(self, output: Any) -> None: self.output = output prompt_str = str(prompt_text) - if "New facts:" in prompt_str: - return _Result(SummaryOutput(summary="summary up to 256")) if "Hello, I enjoy biking" in prompt_str: return _Result(["User likes cats.", "User loves biking."]) - return _Result(SummaryOutput(summary="noop")) + return _Result([]) + + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="summary up to 256", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) async def fake_reconcile( _collection: Any, @@ -360,6 +368,7 @@ async def fake_reconcile( monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) monkeypatch.setattr(_ingest.Agent, "run", fake_agent_run) + monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) # High relevance so they aren't filtered monkeypatch.setattr(_retrieval, "predict_relevance", lambda _model, pairs: [5.0 for _ in pairs]) @@ -550,11 +559,19 @@ def __init__(self, output: Any) -> None: self.output = output prompt_str = str(prompt_text) - if "New facts:" in prompt_str: - return _Result(SummaryOutput(summary="summary text")) if "My cat is Luna" in prompt_str: return _Result(["User has a cat named Luna."]) - return _Result(SummaryOutput(summary="noop")) + return _Result([]) + + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="summary text", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) monkeypatch.setattr(engine._streaming, "stream_chat_sse", fake_stream_chat_sse) @@ -578,6 +595,7 @@ async def fake_reconcile( monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) monkeypatch.setattr(_ingest.Agent, "run", fake_agent_run) + monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) response = await engine.process_chat_request( request, @@ -594,4 +612,4 @@ async def fake_reconcile( files = list(tmp_path.glob("entries/**/*.md")) assert len(files) == 4 # user + assistant + fact + 1 summary assert any("facts" in str(f) for f in files) - assert any("summaries/summary.md" in str(f) for f in files) + assert any("summaries/L3/final.md" in str(f) for f in files) diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index 7d59f7c0..db197b02 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -14,6 +14,7 @@ from agent_cli.memory import _ingest from agent_cli.memory.client import MemoryClient from agent_cli.memory.entities import 
Fact +from agent_cli.summarizer import SummaryLevel, SummaryResult if TYPE_CHECKING: from pathlib import Path @@ -63,12 +64,19 @@ async def fake_reconcile( ] return entries, [], {} - async def fake_update_summary(*_args: Any, **_kwargs: Any) -> str: - return "User likes testing." + async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: + return SummaryResult( + level=SummaryLevel.STANDARD, + summary="User likes testing.", + hierarchical=None, + input_tokens=100, + output_tokens=20, + compression_ratio=0.2, + ) monkeypatch.setattr(_ingest, "extract_salient_facts", fake_extract) monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) - monkeypatch.setattr(_ingest, "update_summary", fake_update_summary) + monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) # Patch Reranker to avoid loading ONNX model monkeypatch.setattr("agent_cli.memory.client.get_reranker_model", MagicMock()) diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 3edd0eeb..453a21a9 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -101,23 +101,6 @@ def query(self, **kwargs: Any) -> dict[str, Any]: assert {"role": {"$ne": "summary"}} in clauses -def test_get_summary_entry_returns_entry() -> None: - # ChromaDB's .get() returns flat lists (not nested like .query()) - fake = _FakeCollection( - get_result={ - "documents": ["summary text"], - "metadatas": [ - {"conversation_id": "c1", "role": "summary", "created_at": "now"}, - ], - "ids": ["sum1"], - }, - ) - entry = _store.get_summary_entry(fake, "c1", role="summary") - assert entry is not None - assert entry.id == "sum1" - assert entry.metadata.role == "summary" - - def test_list_conversation_entries_filters_summaries() -> None: # ChromaDB's .get() returns flat lists (not nested like .query()) fake = _FakeCollection( From 7f5aff3d454a4118b14044d770b52badc1cf0a53 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 20:42:41 -0800 Subject: [PATCH 12/38] docs: add summarizer spec and update memory docs for hierarchical summaries - Create docs/architecture/summarizer.md with comprehensive technical specification for the adaptive summarization system - Update memory.md to reflect new L1/L2/L3 hierarchical summary structure - Document level thresholds, compression ratios, and research basis - Add content-type aware prompts documentation - Document integration with memory system and storage format --- docs/architecture/memory.md | 37 ++- docs/architecture/summarizer.md | 553 ++++++++++++++++++++++++++++++++ 2 files changed, 581 insertions(+), 9 deletions(-) create mode 100644 docs/architecture/summarizer.md diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index fb25a6ac..e2f3127d 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -59,7 +59,13 @@ entries/ assistant/ __.md # Raw assistant responses summaries/ - summary.md # The single rolling summary of the conversation + L1/ + chunk_0.md # Level 1: Individual chunk summaries + chunk_1.md + L2/ + group_0.md # Level 2: Group summaries (groups of ~5 L1s) + L3/ + final.md # Level 3: Final synthesized summary ``` **Deleted Directory Structure (Soft Deletes):** @@ -71,7 +77,7 @@ entries/ facts/ __.md summaries/ - summary.md # Tombstoned summary + L1/, L2/, L3/ # Tombstoned summary levels ``` ### 2.2 File Format @@ -165,10 +171,18 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer * **Updates:** Implemented as delete + add with a fresh ID; tombstones record 
`replaced_by`. * **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma. -### 4.4 Summarization -* **Input:** Previous summary (if any) + newly extracted facts. -* **Prompt:** `SUMMARY_PROMPT` (updates the running summary). -* **Persistence:** Writes a single `summaries/summary.md` per conversation (deterministic doc ID). +### 4.4 Summarization (Adaptive Hierarchical) +Uses the `agent_cli.summarizer` module for research-backed adaptive summarization. + +* **Level Selection:** Automatically determines summarization depth based on token count: + * `NONE` (< 100 tokens): No summary needed, facts only. + * `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression). + * `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression). + * `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression). + * `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure. +* **Input:** Previous L3 summary (if any) + newly extracted facts. +* **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics. +* **See:** `docs/architecture/summarizer.md` for detailed algorithm specification. ### 4.5 Eviction * **Trigger:** If total entries in conversation > `max_entries` (default 500). @@ -198,9 +212,14 @@ To replicate the system behavior, the following prompt strategies are required. * **NONE:** Existing memory is unrelated to new facts, or new fact is an exact duplicate. * **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. -### 5.3 Summarization (`SUMMARY_PROMPT`) -* **Goal:** Maintain a concise running summary. -* **Constraints:** Aggregate related facts. Drop transient chit-chat. Focus on durable info. +### 5.3 Summarization (Adaptive Prompts) +The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`: +* **`BRIEF_PROMPT`:** Single-sentence distillation for short content. +* **`STANDARD_PROMPT`:** Paragraph summary with prior context integration. +* **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing. +* **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative. +* **`ROLLING_PROMPT`:** Integrates new facts with existing summary. +* **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization. --- diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md new file mode 100644 index 00000000..940ddddb --- /dev/null +++ b/docs/architecture/summarizer.md @@ -0,0 +1,553 @@ +# Agent CLI: Adaptive Summarizer Technical Specification + +This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios). + +## 1. System Overview + +The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. 
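+
+A minimal usage sketch of the public entry point (Section 4.1) and configuration (Section 7). The endpoint URL, model id, and input file below are illustrative placeholders, not part of the API:
+
+```python
+import asyncio
+from pathlib import Path
+
+from agent_cli.summarizer import SummarizerConfig, summarize
+
+
+async def main() -> None:
+    config = SummarizerConfig(
+        openai_base_url="http://localhost:11434/v1",  # placeholder OpenAI-compatible endpoint
+        model="llama3.1:8b",  # placeholder model id
+        api_key=None,  # optional for local models
+    )
+    content = Path("notes.md").read_text(encoding="utf-8")  # placeholder input
+    # Level selection is automatic: short inputs get a BRIEF/STANDARD summary,
+    # long inputs produce a DETAILED or HIERARCHICAL (L1/L2/L3) result.
+    result = await summarize(
+        content=content,
+        config=config,
+        prior_summary=None,  # pass the previous L3 summary for rolling updates
+        content_type="document",  # or "conversation" / "journal" / "general"
+    )
+    print(result.level.name, f"compression={result.compression_ratio:.1%}")
+    if result.hierarchical is not None:
+        print(result.hierarchical.l3_summary)
+    elif result.summary is not None:
+        print(result.summary)
+
+
+asyncio.run(main())
+```
+
+When `prior_summary` is supplied, the rolling prompts (Section 5) integrate the new content into the existing summary instead of re-summarizing from scratch.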
+ +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Adaptive Summarization Pipeline │ +├─────────────────────────────────────────────────────────────────────┤ +│ │ +│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ +│ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Level Thresholds: │ │ +│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ +│ │ 100-500 ──▶ BRIEF (single sentence) │ │ +│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ +│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ +│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ +│ │ +│ Output: SummaryResult with compression metrics │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**Design Goals:** + +- **Adaptive compression:** Match summarization depth to content complexity. +- **Research-grounded:** Based on proven approaches from Letta and Mem0. +- **Hierarchical structure:** Preserve detail at multiple granularities. +- **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. + +--- + +## 2. Architectural Decisions + +### 2.1 Token-Based Level Selection + +**Decision:** Select summarization strategy based on input token count with fixed thresholds. + +**Rationale:** + +- **Predictable behavior:** Users can anticipate output length based on input size. +- **Optimal compression:** Each level targets a specific compression ratio validated by research. +- **Efficiency:** Avoid over-processing short content or under-processing long content. + +**Implementation:** + +```python +THRESHOLD_NONE = 100 # Below this: no summary needed +THRESHOLD_BRIEF = 500 # 100-500: single sentence (~20% compression) +THRESHOLD_STANDARD = 3000 # 500-3000: paragraph (~12% compression) +THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression) +# Above 15000: hierarchical tree structure +``` + +**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. + +### 2.2 Hierarchical Summary Structure (L1/L2/L3) + +**Decision:** For long content, build a tree of summaries at three levels of granularity. + +**Rationale:** + +- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content. +- **Flexible retrieval:** Different use cases need different detail levels. +- **Progressive compression:** Each level provides ~5x compression over the previous. + +**Implementation:** + +- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap. +- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. +- **L3 (Final Summary):** Single synthesized summary of all L2 summaries. + +**Storage:** +```text +summaries/ + L1/ + chunk_0.md # Summary of tokens 0-3000 + chunk_1.md # Summary of tokens 2800-5800 (overlap) + L2/ + group_0.md # Synthesis of chunk_0 through chunk_4 + L3/ + final.md # Final narrative summary +``` + +### 2.3 Content-Type Aware Prompts + +**Decision:** Use different prompt templates for different content domains. + +**Rationale:** + +- **Conversations:** Focus on user preferences, decisions, action items. +- **Journals:** Emphasize personal insights, emotional context, growth patterns. +- **Documents:** Prioritize key findings, methodology, conclusions. 
+ +**Implementation:** + +```python +def get_prompt_for_content_type(content_type: str) -> str: + match content_type: + case "conversation": return CONVERSATION_PROMPT + case "journal": return JOURNAL_PROMPT + case "document": return DOCUMENT_PROMPT + case _: return STANDARD_PROMPT +``` + +### 2.4 Prior Summary Integration + +**Decision:** Always provide the previous summary as context when updating. + +**Rationale:** + +- **Continuity:** New summaries should build on existing context, not replace it. +- **Incremental updates:** Avoid re-summarizing all content on every update. +- **Context preservation:** Important information from earlier content persists. + +**Implementation:** + +- The `prior_summary` parameter is passed through the entire pipeline. +- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries. +- For hierarchical summaries, only the L3 summary is used as prior context. + +### 2.5 Compression Ratio Tracking + +**Decision:** Track and report compression metrics for every summary. + +**Rationale:** + +- **Transparency:** Users can understand how much information was compressed. +- **Quality monitoring:** Unusual ratios may indicate summarization issues. +- **Optimization:** Metrics inform future threshold tuning. + +**Implementation:** + +```python +@dataclass +class SummaryResult: + level: SummaryLevel + summary: str | None + hierarchical: HierarchicalSummary | None + input_tokens: int + output_tokens: int + compression_ratio: float # output/input (lower = more compression) +``` + +--- + +## 3. Data Model + +### 3.1 Summary Levels + +| Level | Token Range | Target Compression | Strategy | +| :--- | :--- | :--- | :--- | +| `NONE` | < 100 | N/A | No summarization | +| `BRIEF` | 100-500 | ~20% | Single sentence | +| `STANDARD` | 500-3000 | ~12% | Paragraph | +| `DETAILED` | 3000-15000 | ~7% | Chunked + meta | +| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree | + +### 3.2 Hierarchical Summary Structure + +```python +class ChunkSummary(BaseModel): + chunk_index: int # Position in original content + content: str # The summarized text + token_count: int # Tokens in this summary + source_tokens: int # Tokens in source chunk + parent_group: int | None # L2 group this belongs to + +class HierarchicalSummary(BaseModel): + l1_summaries: list[ChunkSummary] # Individual chunk summaries + l2_summaries: list[str] # Group summaries + l3_summary: str # Final synthesis + chunk_size: int = 3000 # Tokens per chunk + chunk_overlap: int = 200 # Overlap between chunks +``` + +### 3.3 Storage Metadata (ChromaDB) + +Summaries are stored with rich metadata for retrieval and management: + +| Field | L1 | L2 | L3 | Description | +| :--- | :---: | :---: | :---: | :--- | +| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` | +| `conversation_id` | ✓ | ✓ | ✓ | Scope key | +| `role` | ✓ | ✓ | ✓ | Always `"summary"` | +| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | +| `chunk_index` | ✓ | | | Position in L1 sequence | +| `group_index` | | ✓ | | Position in L2 sequence | +| `parent_group` | ✓ | | | Which L2 group owns this L1 | +| `is_final` | | | ✓ | Marks the top-level summary | +| `summary_level` | | | ✓ | Name of SummaryLevel enum | +| `input_tokens` | | | ✓ | Original content token count | +| `output_tokens` | | | ✓ | Total summary token count | +| `compression_ratio` | | | ✓ | Output/input ratio | +| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp | + +### 3.4 File Format + +Summary files use Markdown with YAML front matter: + +```markdown +--- +id: 
"journal:summary:L3:final" +conversation_id: "journal" +role: "summary" +level: 3 +is_final: true +summary_level: "STANDARD" +input_tokens: 1500 +output_tokens: 180 +compression_ratio: 0.12 +created_at: "2025-01-15T10:30:00Z" +--- + +The user has been exploring adaptive summarization techniques... +``` + +--- + +## 4. Processing Pipeline + +### 4.1 Main Entry Point + +```python +async def summarize( + content: str, + config: SummarizerConfig, + prior_summary: str | None = None, + content_type: str = "general", +) -> SummaryResult +``` + +### 4.2 Level Selection Flow + +``` +Input Content + │ + ▼ +┌─────────────┐ +│ Count Tokens│ (tiktoken, cl100k_base) +└──────┬──────┘ + │ + ▼ +┌─────────────────────────────────────────┐ +│ determine_level(token_count) -> Level │ +│ │ +│ < 100 ──▶ NONE │ +│ < 500 ──▶ BRIEF │ +│ < 3000 ──▶ STANDARD │ +│ < 15000 ──▶ DETAILED │ +│ else ──▶ HIERARCHICAL │ +└──────┬──────────────────────────────────┘ + │ + ▼ + Execute level-specific strategy +``` + +### 4.3 Strategy Execution by Level + +#### NONE Level +- **Action:** Return immediately with no summary. +- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)` + +#### BRIEF Level +- **Prompt:** `BRIEF_PROMPT` - distill to single sentence. +- **LLM Call:** Single generation with low max_tokens. +- **Output:** One-sentence summary. + +#### STANDARD Level +- **Prompt:** `STANDARD_PROMPT` with optional prior summary context. +- **LLM Call:** Single generation. +- **Output:** Paragraph-length summary. + +#### DETAILED Level +1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap. +2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`. +3. **Meta-synthesis:** Combine L1 summaries using `META_PROMPT`. +4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size). + +#### HIERARCHICAL Level +1. **Chunk:** Split into ~3000 token chunks with overlap. +2. **Parallel L1:** Generate chunk summaries. +3. **Group:** Organize L1s into groups of ~5. +4. **Parallel L2:** Summarize each group. +5. **L3 Synthesis:** Final meta-summary of all L2s. +6. **Output:** Full `HierarchicalSummary` tree. + +### 4.4 Chunking Algorithm + +```python +def chunk_text( + text: str, + chunk_size: int = 3000, + overlap: int = 200, +) -> list[str]: + """Split text into overlapping chunks on paragraph boundaries.""" +``` + +**Strategy:** + +1. **Paragraph-first:** Try to split on double newlines. +2. **Sentence fallback:** If paragraph exceeds chunk_size, split on sentence boundaries. +3. **Character fallback:** For very long sentences (e.g., code), use character splitting. +4. **Overlap handling:** Each chunk starts with the last `overlap` tokens of the previous. + +### 4.5 Middle Truncation (Utility) + +For contexts where the summary exceeds available space: + +```python +def middle_truncate( + text: str, + token_budget: int, + head_fraction: float = 0.3, + tail_fraction: float = 0.7, +) -> str: + """Keep head and tail, remove middle (least likely to contain key info).""" +``` + +**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). + +--- + +## 5. 
Prompt Specifications + +### 5.1 Brief Summary (`BRIEF_PROMPT`) + +``` +Distill the following content into a single, comprehensive sentence +that captures the essential meaning: + +{content} + +Summary (one sentence): +``` + +### 5.2 Standard Summary (`STANDARD_PROMPT`) + +``` +Summarize the following content in a concise paragraph. +{prior_context} +Focus on key information, decisions, and actionable insights. + +Content: +{content} + +Summary: +``` + +### 5.3 Chunk Summary (`CHUNK_PROMPT`) + +``` +Summarize this section of a larger document. +Preserve specific details, names, and numbers that may be important. + +Section {chunk_index} of {total_chunks}: +{content} + +Section summary: +``` + +### 5.4 Meta Summary (`META_PROMPT`) + +``` +Synthesize these section summaries into a coherent narrative. +Maintain logical flow and preserve the most important information. + +Section Summaries: +{summaries} + +Synthesized Summary: +``` + +### 5.5 Rolling Summary (`ROLLING_PROMPT`) + +``` +Update the existing summary to incorporate new information. +Preserve important historical context while integrating new facts. + +Existing Summary: +{prior_summary} + +New Information: +{new_facts} + +Updated Summary: +``` + +### 5.6 Content-Type Prompts + +**Conversation:** +``` +Summarize this conversation focusing on: +- User preferences and decisions +- Action items and commitments +- Key topics discussed +``` + +**Journal:** +``` +Summarize this journal entry focusing on: +- Personal insights and reflections +- Emotional context and growth +- Goals and intentions +``` + +**Document:** +``` +Summarize this document focusing on: +- Key findings and conclusions +- Methodology and approach +- Recommendations and implications +``` + +--- + +## 6. Integration with Memory System + +### 6.1 Entry Point + +The memory system calls the summarizer via `_ingest.summarize_content()`: + +```python +async def summarize_content( + content: str, + prior_summary: str | None = None, + content_type: str = "general", + openai_base_url: str, + api_key: str | None, + model: str, +) -> SummaryResult +``` + +### 6.2 Storage Flow + +``` +summarize_content() + │ + ▼ +SummaryResult + │ + ▼ +store_adaptive_summary() + │ + ├──▶ persist_hierarchical_summary() + │ │ + │ ├──▶ Delete old summaries (L1, L2, L3) + │ ├──▶ Write new summary files + │ └──▶ Upsert to ChromaDB + │ + └──▶ Return stored IDs +``` + +### 6.3 Retrieval Integration + +The memory retrieval system uses `get_final_summary()` to fetch the L3 summary: + +```python +def get_final_summary( + collection: Collection, + conversation_id: str, +) -> StoredMemory | None: + """Retrieve the L3 final summary for injection into prompts.""" +``` + +--- + +## 7. Configuration Reference + +| Parameter | Default | Description | +| :--- | :--- | :--- | +| `openai_base_url` | *required* | Base URL for LLM API | +| `model` | *required* | Model ID for summarization | +| `api_key` | `None` | API key (optional for local models) | +| `chunk_size` | `3000` | Tokens per chunk for hierarchical | +| `chunk_overlap` | `200` | Token overlap between chunks | + +### 7.1 Level Thresholds (Constants) + +| Constant | Value | Description | +| :--- | :--- | :--- | +| `THRESHOLD_NONE` | 100 | Below: no summary | +| `THRESHOLD_BRIEF` | 500 | Below: single sentence | +| `THRESHOLD_STANDARD` | 3000 | Below: paragraph | +| `THRESHOLD_DETAILED` | 15000 | Below: chunked | + +--- + +## 8. 
Error Handling + +### 8.1 Graceful Degradation + +| Error | Fallback | +| :--- | :--- | +| LLM timeout | Return input unchanged with NONE level | +| LLM error | Retry up to 3 times, then return NONE | +| Token counting failure | Estimate based on character count (÷4) | +| Chunking failure | Fall back to character-based splitting | + +### 8.2 Validation + +- **Empty content:** Returns NONE level immediately. +- **Whitespace-only:** Returns NONE level. +- **Invalid compression ratio:** Clamped to [0.0, 1.0]. + +--- + +## 9. Performance Considerations + +### 9.1 Token Counting + +- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer). +- Caches tokenizer instance for efficiency. +- Falls back to character-based estimation if tiktoken unavailable. + +### 9.2 Parallel Processing + +For DETAILED and HIERARCHICAL levels: +- L1 chunk summaries can be generated in parallel. +- L2 group summaries can be generated in parallel. +- Only L3 synthesis requires sequential processing. + +### 9.3 Caching + +- Token counts are computed once per content string. +- Prompt templates are loaded once at module import. +- ChromaDB connection is reused across operations. + +--- + +## 10. Comparison with Alternative Approaches + +| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking | +| :--- | :--- | :--- | :--- | +| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed | +| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level | +| **Context awareness** | Content-type prompts | Generic | Generic | +| **Efficiency** | Skip short content | Always summarize | Always chunk | +| **Research basis** | Letta + Mem0 | Mem0 only | None | + +--- + +## 11. Future Enhancements + +- **Semantic chunking:** Split on topic boundaries rather than token counts. +- **Incremental L1 updates:** Only re-summarize changed chunks. +- **Quality scoring:** Evaluate summary quality and trigger re-summarization. +- **User feedback loop:** Learn preferred compression ratios per user. From c35bc132762df9544100c6192ba4e3a2ce7eb331 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 21:31:37 -0800 Subject: [PATCH 13/38] Add example script --- agent_cli/summarizer/_utils.py | 4 +- examples/summarizer_demo.py | 483 +++++++++++++++++++++++++++++++++ 2 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 examples/summarizer_demo.py diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index bc319f5b..030b5729 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -40,7 +40,9 @@ def count_tokens(text: str, model: str = "gpt-4") -> int: if not text: return 0 enc = _get_encoding(model) - return len(enc.encode(text)) + # Disable special token checking - LLM outputs may contain special tokens + # like <|constrain|>, <|endoftext|>, etc. that we want to count normally + return len(enc.encode(text, disallowed_special=())) def chunk_text( diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py new file mode 100644 index 00000000..6a542dbd --- /dev/null +++ b/examples/summarizer_demo.py @@ -0,0 +1,483 @@ +"""Demonstrate the summarizer on texts of varying lengths from the internet. + +This script fetches content of different sizes and shows how the adaptive +summarizer automatically selects the appropriate strategy (BRIEF, STANDARD, +DETAILED, or HIERARCHICAL) based on content length. 
+ +Usage: + python examples/summarizer_demo.py + + # Test specific levels only + python examples/summarizer_demo.py --level brief + python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --level detailed + python examples/summarizer_demo.py --level hierarchical + + # Use a different model + python examples/summarizer_demo.py --model "gpt-4o-mini" +""" # noqa: INP001 + +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import re +import textwrap +import traceback +from dataclasses import dataclass + +import httpx + +from agent_cli.summarizer import ( + SummarizerConfig, + SummaryLevel, + SummaryResult, + summarize, +) + +# Defaults for local AI setup (same as aijournal_poc.py) +DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" +DEFAULT_MODEL = "gpt-oss-high:20b" + + +@dataclass +class TextSample: + """A sample text for testing the summarizer.""" + + name: str + description: str + url: str + expected_level: SummaryLevel + content_type: str = "general" + # If URL fetch fails, use this fallback + fallback_content: str | None = None + + +# Thresholds from adaptive.py: +# NONE: < 100 tokens +# BRIEF: 100-500 tokens +# STANDARD: 500-3000 tokens +# DETAILED: 3000-15000 tokens +# HIERARCHICAL: > 15000 tokens + +# Sample texts of varying lengths to demonstrate different summarization levels +SAMPLES: list[TextSample] = [ + TextSample( + name="Brief - Short News Article", + description="~150-400 tokens - triggers BRIEF level (100-500 token range)", + url="https://httpbin.org/json", # Returns small JSON we'll convert to text + expected_level=SummaryLevel.BRIEF, + fallback_content=""" + Breaking News: Scientists at the Marine Biology Institute have made a + groundbreaking discovery in the Mariana Trench. A new species of deep-sea + fish, dubbed "Pseudoliparis swirei," has been found surviving at depths + exceeding 8,000 meters, making it one of the deepest-living fish ever + documented. + + The research team, led by Dr. Sarah Chen from the University of Washington, + used advanced unmanned submersibles equipped with high-resolution cameras + and collection apparatus. The expedition lasted three months and covered + multiple dive sites across the western Pacific. + + "This discovery fundamentally changes our understanding of life in extreme + environments," Dr. Chen stated in a press conference. "The adaptations + these fish have developed to survive crushing pressures and near-freezing + temperatures are remarkable." + + The fish displays several unique characteristics including translucent skin, + specialized proteins that prevent cellular damage under pressure, and an + unusual metabolism that allows survival with minimal oxygen. Scientists + believe studying these adaptations could lead to breakthroughs in medicine + and materials science. + + The finding has been published in the journal Nature and has already + generated significant interest from the scientific community worldwide. + Further expeditions are planned to study the species in its natural habitat. + """, + ), + TextSample( + name="Standard - Technology Article", + description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)", + url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", + expected_level=SummaryLevel.STANDARD, + content_type="document", + fallback_content=""" + Artificial intelligence (AI) is the intelligence of machines or software, + as opposed to the intelligence of humans or other animals. 
It is a field + of computer science that develops and studies intelligent machines. The + field encompasses a wide range of approaches and technologies. + + AI research has been defined as the field of study of intelligent agents, + which refers to any system that perceives its environment and takes actions + that maximize its chances of achieving its goals. This definition emphasizes + the practical aspects of building systems that can operate effectively. + + The term "artificial intelligence" has been used to describe machines that + mimic cognitive functions that humans associate with the human mind, such + as learning and problem solving. As machines become increasingly capable, + tasks considered to require "intelligence" are often removed from the + definition of AI, a phenomenon known as the AI effect. + + History of Artificial Intelligence + + The field of AI research was founded at a workshop held on the campus of + Dartmouth College during the summer of 1956. The attendees became the + founders and leaders of AI research. They and their students produced + programs that the press described as astonishing. + + Early AI research in the 1950s explored topics like problem solving and + symbolic methods. In the 1960s, the US Department of Defense took interest + and began training computers to mimic basic human reasoning. DARPA completed + street mapping projects in the 1970s and produced intelligent personal + assistants in 2003, long before Siri, Alexa or Cortana. + + Modern AI Approaches + + Modern AI techniques have become pervasive and include machine learning, + deep learning, natural language processing, computer vision, robotics, + and autonomous systems. These technologies power everything from search + engines to self-driving cars. + + Machine learning is a subset of AI that enables systems to learn and improve + from experience without being explicitly programmed. Deep learning uses + neural networks with many layers to analyze various factors of data. + + Neural networks are computing systems inspired by biological neural networks. + They consist of interconnected nodes that process information using + connectionist approaches to computation. Modern neural networks can have + millions or billions of parameters. + + Applications of AI + + AI applications are transforming industries including healthcare, finance, + transportation, and entertainment. In healthcare, AI helps diagnose diseases + and develop new treatments. In finance, AI powers fraud detection and + algorithmic trading. + + Autonomous vehicles use AI to perceive their environment and make driving + decisions. Virtual assistants use natural language processing to understand + and respond to user queries. Recommendation systems use AI to suggest + content based on user preferences. + + Ethical Considerations + + The field was founded on the assumption that human intelligence can be + so precisely described that a machine can be made to simulate it. This + raised philosophical arguments about the mind and the ethical consequences + of creating artificial beings endowed with human-like intelligence. + + Major concerns include job displacement, algorithmic bias, privacy violations, + and the potential for misuse. Researchers and policymakers are working to + develop frameworks for responsible AI development and deployment. + + The future of AI holds both tremendous promise and significant challenges. 
+ As these systems become more capable, society must grapple with questions + about control, accountability, and the nature of intelligence itself. + """, + ), + TextSample( + name="Detailed - Full Article", + description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)", + url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", + expected_level=SummaryLevel.DETAILED, + content_type="document", + fallback_content=None, # We'll generate synthetic content + ), + TextSample( + name="Hierarchical - Long Document", + description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)", + url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) + expected_level=SummaryLevel.HIERARCHICAL, + content_type="document", + fallback_content=None, # We'll generate synthetic content (~16K tokens) + ), +] + + +def generate_synthetic_content(target_tokens: int, topic: str = "technology") -> str: + """Generate synthetic content for testing when URLs fail.""" + # Each paragraph is roughly 50-100 tokens + paragraphs = [ + f"Section on {topic} - Part {{i}}: This section explores various aspects " + f"of {topic} and its implications for modern society. The development of " + f"new technologies continues to reshape how we live and work. Researchers " + f"have made significant progress in understanding the fundamentals.", + f"The history of {topic} spans many decades of innovation. Early pioneers " + f"laid the groundwork for current advancements. Their contributions remain " + f"relevant today as we build upon established foundations.", + f"Current applications of {topic} include healthcare, transportation, and " + f"communication. These sectors have seen dramatic improvements in efficiency " + f"and capability. Future developments promise even greater transformations.", + f"Challenges in {topic} include ethical considerations, resource constraints, " + f"and technical limitations. Addressing these requires collaboration across " + f"disciplines. Solutions often emerge from unexpected directions.", + f"The future of {topic} looks promising with continued investment and research. " + f"Emerging trends suggest new possibilities. 
Stakeholders must prepare for " + f"rapid change while maintaining focus on beneficial outcomes.", + ] + + result = [] + tokens_per_para = 75 # approximate + needed_paragraphs = target_tokens // tokens_per_para + 1 + + for i in range(needed_paragraphs): + para = paragraphs[i % len(paragraphs)].format(i=i + 1) + result.append(para) + + return "\n\n".join(result) + + +async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # noqa: PLR0912 + """Fetch content from URL or use fallback.""" + try: + # Add User-Agent header to avoid 403 errors from some sites + headers = { + "User-Agent": "Mozilla/5.0 (compatible; SummarizerDemo/1.0)", + } + response = await client.get( + sample.url, + timeout=30.0, + follow_redirects=True, + headers=headers, + ) + response.raise_for_status() + + content = response.text + + # Handle Wikipedia API JSON responses + if "wikipedia.org/api" in sample.url: + try: + data = json.loads(content) + if "extract" in data: + content = data["extract"] + elif "text" in data: + content = data["text"] + except json.JSONDecodeError: + pass + + # For httpbin JSON, create a readable summary + if "httpbin.org/json" in sample.url: + content = sample.fallback_content or "" + + # Strip HTML tags if present + if "<" in content and ">" in content: + content = re.sub(r"<[^>]+>", " ", content) + content = re.sub(r"\s+", " ", content).strip() + + # Check if content is too short for expected level + min_words_for_level = { + SummaryLevel.BRIEF: 80, # Need ~100 tokens + SummaryLevel.STANDARD: 400, # Need ~500 tokens + SummaryLevel.DETAILED: 2500, # Need ~3000 tokens + SummaryLevel.HIERARCHICAL: 12000, # Need ~15000 tokens + } + min_words = min_words_for_level.get(sample.expected_level, 50) + + if len(content.split()) < min_words: + print(f" 📎 Fetched content too short ({len(content.split())} words), using fallback") + if sample.fallback_content: + content = sample.fallback_content + else: + target_tokens = { + SummaryLevel.BRIEF: 300, + SummaryLevel.STANDARD: 1500, + SummaryLevel.DETAILED: 8000, + SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + } + content = generate_synthetic_content( + target_tokens.get(sample.expected_level, 1000), + ) + + # For HIERARCHICAL, truncate very long content to keep demo fast + # but ensure we stay above 15000 tokens (~13000 words) + if sample.expected_level == SummaryLevel.HIERARCHICAL: + words = content.split() + # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL) + if len(words) > 13500: # noqa: PLR2004 + content = " ".join(words[:13500]) + print(" 📎 Truncated to ~13500 words for faster demo") + + return content.strip() + + except Exception as e: + print(f" ⚠️ Failed to fetch URL: {e}") + + if sample.fallback_content: + return sample.fallback_content.strip() + + # Generate synthetic content for the expected level + target_tokens = { + SummaryLevel.BRIEF: 300, + SummaryLevel.STANDARD: 1500, + SummaryLevel.DETAILED: 8000, + SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + } + return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) + + +def print_result(sample: TextSample, result: SummaryResult, content: str) -> None: + """Print a formatted summary result.""" + print("\n" + "=" * 70) + print(f"📄 {sample.name}") + print(f" {sample.description}") + print("=" * 70) + + # Input stats + word_count = len(content.split()) + print("\n📊 Input Statistics:") + print(f" Words: {word_count:,}") + print(f" Tokens: {result.input_tokens:,}") + print(f" Content type: 
{sample.content_type}") + + # Summarization result + level_emoji = { + SummaryLevel.NONE: "⏭️", + SummaryLevel.BRIEF: "📝", + SummaryLevel.STANDARD: "📄", + SummaryLevel.DETAILED: "📚", + SummaryLevel.HIERARCHICAL: "🏗️", + } + print("\n🎯 Summarization Result:") + print(f" Level: {level_emoji.get(result.level, '❓')} {result.level.name}") + print(f" Expected: {sample.expected_level.name}") + print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") + print(f" Output tokens: {result.output_tokens:,}") + print(f" Compression: {result.compression_ratio:.1%}") + + # Summary content + if result.summary: + print("\n📝 Summary:") + wrapped = textwrap.fill( + result.summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", + ) + print(wrapped) + + # Hierarchical details if present + if result.hierarchical: + h = result.hierarchical + print("\n🏗️ Hierarchical Structure:") + print(f" L1 chunks: {len(h.l1_summaries)}") + print(f" L2 groups: {len(h.l2_summaries)}") + if h.l2_summaries: + print(f" L2 preview: {h.l2_summaries[0][:100]}...") + print("\n L3 Final Summary:") + wrapped = textwrap.fill( + h.l3_summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", + ) + print(wrapped) + + +async def run_demo( + level_filter: str | None = None, + model: str | None = None, + base_url: str | None = None, +) -> None: + """Run the summarizer demo.""" + # Configuration + actual_base_url = base_url or os.environ.get("OPENAI_BASE_URL", DEFAULT_BASE_URL) + actual_model = model or os.environ.get("OPENAI_MODEL", DEFAULT_MODEL) + api_key = os.environ.get("OPENAI_API_KEY", "not-needed-for-local") + + print("🔧 Configuration:") + print(f" Base URL: {actual_base_url}") + print(f" Model: {actual_model}") + + config = SummarizerConfig( + openai_base_url=actual_base_url, + model=actual_model, + api_key=api_key, + chunk_size=3000, + max_concurrent_chunks=3, + timeout=120.0, # Longer timeout for local models + ) + + # Filter samples if requested + samples = SAMPLES + if level_filter: + level_map = { + "brief": SummaryLevel.BRIEF, + "standard": SummaryLevel.STANDARD, + "detailed": SummaryLevel.DETAILED, + "hierarchical": SummaryLevel.HIERARCHICAL, + } + target_level = level_map.get(level_filter.lower()) + if target_level: + samples = [s for s in SAMPLES if s.expected_level == target_level] + print(f"\n🔍 Filtering to {level_filter.upper()} level only") + + async with httpx.AsyncClient() as client: + for sample in samples: + print(f"\n⏳ Processing: {sample.name}...") + + # Fetch content + content = await fetch_content(sample, client) + + try: + # Summarize + result = await summarize( + content=content, + config=config, + content_type=sample.content_type, + ) + + # Display results + print_result(sample, result, content) + + except Exception as e: + print(f"\n❌ Error summarizing {sample.name}: {e}") + + traceback.print_exc() + + print("\n" + "=" * 70) + print("✅ Demo complete!") + print("=" * 70) + + +def main() -> None: + """CLI entry point.""" + parser = argparse.ArgumentParser( + description="Demonstrate adaptive summarization on texts of varying lengths", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(""" + Examples: + python examples/summarizer_demo.py + python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" + """), + ) + + parser.add_argument( + "--level", + choices=["brief", "standard", "detailed", "hierarchical"], + help="Only test a specific summarization level", + 
) + parser.add_argument( + "--model", + help=f"Model to use (default: {DEFAULT_MODEL})", + ) + parser.add_argument( + "--base-url", + help=f"OpenAI-compatible API base URL (default: {DEFAULT_BASE_URL})", + ) + + args = parser.parse_args() + + asyncio.run( + run_demo( + level_filter=args.level, + model=args.model, + base_url=args.base_url, + ), + ) + + +if __name__ == "__main__": + main() From 3e5fb4e6915cc1970289915482a20f0b3d72f507 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 21:54:45 -0800 Subject: [PATCH 14/38] refactor(summarizer): YAGNI cleanup and fix prior_context bug Removed unused code: - update_rolling_summary() - never called anywhere - _raw_generate() fallback - errors should fail loudly - retry/backoff logic - same reason - parent_group from ChunkSummary - stored but never read - ROLLING_SUMMARY_PROMPT - only used by removed function Kept middle_truncate() - useful for handling very large inputs (e.g., conversations with pasted codebases). Bugfix: - Add {prior_context} to CONVERSATION, JOURNAL, DOCUMENT prompts - Previously prior_summary was silently ignored for non-"general" types - Python's .format() ignores extra kwargs, hiding the bug Updates documentation to reflect fail-fast error handling. --- agent_cli/summarizer/_prompts.py | 20 ++--- agent_cli/summarizer/adaptive.py | 141 ++---------------------------- agent_cli/summarizer/models.py | 5 -- docs/architecture/summarizer.md | 62 ++++++------- tests/summarizer/test_adaptive.py | 141 ++---------------------------- tests/summarizer/test_models.py | 13 --- tests/summarizer/test_prompts.py | 22 ++--- 7 files changed, 51 insertions(+), 353 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 101422b7..f46b39eb 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -47,20 +47,6 @@ Combined summary (maximum {max_words} words):""".strip() -# Rolling summary update (Mem0-style) -ROLLING_SUMMARY_PROMPT = """Update the running summary with new information. -Integrate new facts seamlessly while keeping the summary concise. -Drop redundant or superseded information. -Preserve durable facts about identity, preferences, and important events. - -Current summary: -{prior_summary} - -New information to integrate: -{new_content} - -Updated summary (maximum {max_words} words):""".strip() - # For conversation-specific summarization CONVERSATION_SUMMARY_PROMPT = """Summarize this conversation from the AI assistant's perspective. 
Focus on: @@ -69,6 +55,8 @@ - Decisions made or conclusions reached - Any commitments or follow-ups mentioned +{prior_context} + Conversation: {content} @@ -82,6 +70,8 @@ - Goals, plans, or intentions stated - People, places, or things that are important +{prior_context} + Entry: {content} @@ -95,6 +85,8 @@ - Important specifications or requirements - Conclusions or recommendations +{prior_context} + Document: {content} diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 590dabc5..989bd86b 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -13,7 +13,6 @@ import logging from dataclasses import dataclass -import httpx from pydantic import BaseModel from pydantic_ai import Agent from pydantic_ai.models.openai import OpenAIChatModel @@ -24,7 +23,6 @@ BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - ROLLING_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, get_prompt_for_content_type, @@ -33,7 +31,6 @@ chunk_text, count_tokens, estimate_summary_tokens, - middle_truncate, tokens_to_words, ) from agent_cli.summarizer.models import ( @@ -59,14 +56,6 @@ # Minimum number of L1 chunks before L2 grouping is applied L2_MIN_CHUNKS = 5 -# Retry settings for summarization failures -MAX_SUMMARIZE_RETRIES = 3 - -# Maximum characters per chunk before applying middle truncation -# This prevents context overflow errors for very large chunks -# (roughly 12K tokens with cl100k_base encoding) -MAX_CHUNK_CHARS = 48000 - class SummaryOutput(BaseModel): """Structured output for summary generation.""" @@ -199,93 +188,32 @@ async def summarize( ) -async def update_rolling_summary( - prior_summary: str | None, - new_facts: list[str], - config: SummarizerConfig, -) -> str: - """Update a rolling summary with new facts (Mem0-style). - - This is optimized for incremental updates where you have discrete - new facts to integrate into an existing summary. - - Args: - prior_summary: The existing summary to update. - new_facts: List of new facts to integrate. - config: Summarizer configuration. - - Returns: - Updated summary string. - - """ - if not new_facts: - return prior_summary or "" - - new_content = "\n".join(f"- {fact}" for fact in new_facts) - combined_tokens = count_tokens( - (prior_summary or "") + new_content, - config.model, - ) - - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = ROLLING_SUMMARY_PROMPT.format( - prior_summary=prior_summary or "(No prior summary)", - new_content=new_content, - max_words=max_words, - ) - - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - async def _summarize_single_chunk( chunk: str, chunk_index: int, total_chunks: int, config: SummarizerConfig, - *, - parent_group: int | None = None, ) -> ChunkSummary: """Summarize a single chunk of content. - Uses middle truncation as a fallback for oversized content (Letta-style). - Args: chunk: The text chunk to summarize. chunk_index: Index of this chunk (0-based). total_chunks: Total number of chunks being processed. config: Summarizer configuration. - parent_group: Optional L2 group index for hierarchical summaries. Returns: ChunkSummary with the summarized content. 
""" - # Apply middle truncation if chunk is too large (Letta-style fallback) source_tokens = count_tokens(chunk, config.model) - content_to_summarize = chunk - if len(chunk) > MAX_CHUNK_CHARS: - content_to_summarize, dropped = middle_truncate( - chunk, - MAX_CHUNK_CHARS, - head_frac=0.3, - tail_frac=0.3, - ) - logger.warning( - "Chunk %d truncated: dropped %d chars to fit context window", - chunk_index, - dropped, - ) - - chunk_tokens = count_tokens(content_to_summarize, config.model) - target_tokens = estimate_summary_tokens(chunk_tokens, SummaryLevel.STANDARD) + target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD) max_words = tokens_to_words(target_tokens) prompt = CHUNK_SUMMARY_PROMPT.format( chunk_index=chunk_index + 1, total_chunks=total_chunks, - content=content_to_summarize, + content=chunk, max_words=max_words, ) @@ -296,8 +224,7 @@ async def _summarize_single_chunk( chunk_index=chunk_index, content=summary, token_count=summary_tokens, - source_tokens=source_tokens, # Report original token count - parent_group=parent_group, + source_tokens=source_tokens, ) @@ -355,7 +282,6 @@ async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: idx, len(chunks), config, - parent_group=None, ) chunk_summaries = await asyncio.gather( @@ -423,14 +349,11 @@ async def _hierarchical_summary( async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: async with semaphore: - # Assign to L2 group (L2_GROUP_SIZE chunks per group) - group_idx = idx // L2_GROUP_SIZE return await _summarize_single_chunk( chunk, idx, len(chunks), config, - parent_group=group_idx, ) l1_summaries = await asyncio.gather( @@ -497,25 +420,19 @@ async def _generate_summary( prompt: str, config: SummarizerConfig, max_tokens: int = 256, - *, - attempt: int = 0, ) -> str: """Generate a summary using the LLM. - Uses PydanticAI for structured output with fallback to raw generation. - Implements exponential backoff retry on failures. - Args: prompt: The prompt to send to the LLM. config: Summarizer configuration. max_tokens: Maximum tokens for the response. - attempt: Current retry attempt (for internal recursion). Returns: The generated summary text. Raises: - SummarizationError: If all retries are exhausted. + SummarizationError: If summarization fails. 
""" provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) @@ -539,51 +456,5 @@ async def _generate_summary( result = await agent.run(prompt) return result.output.summary.strip() except Exception as e: - logger.warning("Structured summary failed, trying raw generation: %s", e) - # Fallback to raw HTTP call - try: - return await _raw_generate(prompt, config, max_tokens) - except Exception as raw_err: - if attempt < MAX_SUMMARIZE_RETRIES: - wait_time = 2**attempt # Exponential backoff: 1, 2, 4 seconds - logger.warning( - "Raw generation failed (attempt %d/%d), retrying in %ds: %s", - attempt + 1, - MAX_SUMMARIZE_RETRIES, - wait_time, - raw_err, - ) - await asyncio.sleep(wait_time) - return await _generate_summary( - prompt, - config, - max_tokens, - attempt=attempt + 1, - ) - msg = f"Summarization failed after {MAX_SUMMARIZE_RETRIES} retries" - raise SummarizationError(msg) from raw_err - - -async def _raw_generate(prompt: str, config: SummarizerConfig, max_tokens: int) -> str: - """Fallback raw HTTP generation without structured output.""" - async with httpx.AsyncClient(timeout=config.timeout) as client: - response = await client.post( - f"{config.openai_base_url}/chat/completions", - headers={"Authorization": f"Bearer {config.api_key}"}, - json={ - "model": config.model, - "messages": [ - {"role": "system", "content": "You are a concise summarizer."}, - {"role": "user", "content": prompt}, - ], - "temperature": 0.3, - "max_tokens": max_tokens, - }, - ) - response.raise_for_status() - data = response.json() - - choices = data.get("choices", []) - if choices: - return choices[0].get("message", {}).get("content", "").strip() - return "" + msg = f"Summarization failed: {e}" + raise SummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 843d1dfe..4f5c5119 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -44,10 +44,6 @@ class ChunkSummary(BaseModel): content: str = Field(..., description="The summarized content of this chunk") token_count: int = Field(..., ge=0, description="Token count of this summary") source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") - parent_group: int | None = Field( - default=None, - description="Index of the L2 group this chunk belongs to", - ) class HierarchicalSummary(BaseModel): @@ -156,7 +152,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L1, "chunk_index": cs.chunk_index, - "parent_group": cs.parent_group, "token_count": cs.token_count, "created_at": timestamp, }, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 940ddddb..59f1dbb5 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -8,19 +8,19 @@ The adaptive summarizer provides **content-aware compression** that scales summa ``` ┌─────────────────────────────────────────────────────────────────────┐ -│ Adaptive Summarization Pipeline │ +│ Adaptive Summarization Pipeline │ ├─────────────────────────────────────────────────────────────────────┤ │ │ -│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ +│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ │ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Level Thresholds: │ │ -│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ -│ │ 100-500 ──▶ BRIEF (single sentence) │ │ -│ │ 500-3000 ──▶ STANDARD (paragraph) │ 
│ -│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ -│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ -│ └─────────────────────────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────────────────────────┐ │ +│ │ Level Thresholds: │ │ +│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ +│ │ 100-500 ──▶ BRIEF (single sentence) │ │ +│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ +│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ +│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ +│ └─────────────────────────────────────────────────────────────┘ │ │ │ │ Output: SummaryResult with compression metrics │ └─────────────────────────────────────────────────────────────────────┘ @@ -310,19 +310,19 @@ def chunk_text( ### 4.5 Middle Truncation (Utility) -For contexts where the summary exceeds available space: +For handling very large inputs that could exceed context windows: ```python def middle_truncate( text: str, - token_budget: int, - head_fraction: float = 0.3, - tail_fraction: float = 0.7, -) -> str: + budget_chars: int, + head_frac: float = 0.3, + tail_frac: float = 0.3, +) -> tuple[str, int]: """Keep head and tail, remove middle (least likely to contain key info).""" ``` -**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). +**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases. --- @@ -376,22 +376,9 @@ Section Summaries: Synthesized Summary: ``` -### 5.5 Rolling Summary (`ROLLING_PROMPT`) +### 5.5 Content-Type Prompts -``` -Update the existing summary to incorporate new information. -Preserve important historical context while integrating new facts. - -Existing Summary: -{prior_summary} - -New Information: -{new_facts} - -Updated Summary: -``` - -### 5.6 Content-Type Prompts +All content-type prompts include `{prior_context}` for rolling summary continuity. **Conversation:** ``` @@ -493,14 +480,15 @@ def get_final_summary( ## 8. 
Error Handling -### 8.1 Graceful Degradation +### 8.1 Fail-Fast Philosophy + +Errors are propagated rather than hidden behind fallbacks: -| Error | Fallback | +| Error | Behavior | | :--- | :--- | -| LLM timeout | Return input unchanged with NONE level | -| LLM error | Retry up to 3 times, then return NONE | -| Token counting failure | Estimate based on character count (÷4) | -| Chunking failure | Fall back to character-based splitting | +| LLM timeout | Raises `SummarizationError` | +| LLM error | Raises `SummarizationError` | +| Token counting failure | Falls back to `cl100k_base` encoding | ### 8.2 Validation diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index f5db1486..ac04bc12 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -8,13 +8,12 @@ from agent_cli.summarizer.adaptive import ( LEVEL_THRESHOLDS, + SummarizationError, SummarizerConfig, SummaryOutput, _generate_summary, - _raw_generate, determine_level, summarize, - update_rolling_summary, ) from agent_cli.summarizer.models import SummaryLevel, SummaryResult @@ -257,82 +256,6 @@ async def test_hierarchical_level_calls_hierarchical_summary( assert result.level == SummaryLevel.HIERARCHICAL -class TestUpdateRollingSummary: - """Tests for rolling summary updates.""" - - @pytest.fixture - def config(self) -> SummarizerConfig: - """Create a config instance.""" - return SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - @pytest.mark.asyncio - async def test_empty_facts_returns_prior(self, config: SummarizerConfig) -> None: - """Test that empty facts list returns prior summary.""" - result = await update_rolling_summary( - prior_summary="Existing summary", - new_facts=[], - config=config, - ) - assert result == "Existing summary" - - @pytest.mark.asyncio - async def test_empty_facts_no_prior_returns_empty( - self, - config: SummarizerConfig, - ) -> None: - """Test that empty facts with no prior returns empty string.""" - result = await update_rolling_summary( - prior_summary=None, - new_facts=[], - config=config, - ) - assert result == "" - - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._generate_summary") - async def test_new_facts_calls_generate( - self, - mock_generate: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that new facts trigger summary generation.""" - mock_generate.return_value = "Updated summary with new facts." - - result = await update_rolling_summary( - prior_summary="Old summary", - new_facts=["User likes coffee", "User lives in Amsterdam"], - config=config, - ) - - mock_generate.assert_called_once() - assert result == "Updated summary with new facts." 
- - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._generate_summary") - async def test_facts_formatted_as_list( - self, - mock_generate: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that facts are formatted as bullet list in prompt.""" - mock_generate.return_value = "Summary" - - await update_rolling_summary( - prior_summary="Prior", - new_facts=["Fact one", "Fact two"], - config=config, - ) - - # Check the prompt contains formatted facts - call_args = mock_generate.call_args - prompt = call_args[0][0] - assert "- Fact one" in prompt - assert "- Fact two" in prompt - - class TestGenerateSummary: """Tests for _generate_summary function.""" @@ -365,72 +288,18 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run.assert_called_once_with("Test prompt") @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._raw_generate") - async def test_fallback_to_raw_generate_on_error( + async def test_raises_summarization_error_on_failure( self, - mock_raw: AsyncMock, config: SummarizerConfig, ) -> None: - """Test fallback to raw HTTP on PydanticAI error.""" - mock_raw.return_value = "Fallback summary" - + """Test that SummarizationError is raised on failure.""" with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent - result = await _generate_summary("Test prompt", config, max_tokens=100) - - mock_raw.assert_called_once_with("Test prompt", config, 100) - assert result == "Fallback summary" - - -class TestRawGenerate: - """Tests for _raw_generate fallback function.""" - - @pytest.fixture - def config(self) -> SummarizerConfig: - """Create a config instance.""" - return SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="gpt-4", - ) - - @pytest.mark.asyncio - async def test_raw_generate_success(self, config: SummarizerConfig) -> None: - """Test successful raw HTTP generation.""" - mock_response = MagicMock() - mock_response.json.return_value = { - "choices": [{"message": {"content": "Raw generated summary"}}], - } - - with patch("httpx.AsyncClient") as mock_client_class: - mock_client = MagicMock() - mock_client.post = AsyncMock(return_value=mock_response) - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=None) - mock_client_class.return_value = mock_client - - result = await _raw_generate("Test prompt", config, max_tokens=100) - - assert result == "Raw generated summary" - - @pytest.mark.asyncio - async def test_raw_generate_empty_choices(self, config: SummarizerConfig) -> None: - """Test raw generate with empty choices returns empty string.""" - mock_response = MagicMock() - mock_response.json.return_value = {"choices": []} - - with patch("httpx.AsyncClient") as mock_client_class: - mock_client = MagicMock() - mock_client.post = AsyncMock(return_value=mock_response) - mock_client.__aenter__ = AsyncMock(return_value=mock_client) - mock_client.__aexit__ = AsyncMock(return_value=None) - mock_client_class.return_value = mock_client - - result = await _raw_generate("Test prompt", config, max_tokens=100) - - assert result == "" + with pytest.raises(SummarizationError, match="Summarization failed"): + await _generate_summary("Test prompt", config, max_tokens=100) class TestSummaryOutput: diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index e27fa18e..23509d2e 100644 --- a/tests/summarizer/test_models.py 
+++ b/tests/summarizer/test_models.py @@ -43,24 +43,11 @@ def test_basic_creation(self) -> None: content="This is a summary of chunk 1.", token_count=10, source_tokens=100, - parent_group=None, ) assert chunk.chunk_index == 0 assert chunk.content == "This is a summary of chunk 1." assert chunk.token_count == 10 assert chunk.source_tokens == 100 - assert chunk.parent_group is None - - def test_with_parent_group(self) -> None: - """Test creating a chunk summary with parent group.""" - chunk = ChunkSummary( - chunk_index=5, - content="Summary text", - token_count=8, - source_tokens=200, - parent_group=1, - ) - assert chunk.parent_group == 1 def test_validation_negative_tokens(self) -> None: """Test that negative token counts fail validation.""" diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index 05937f71..66022970 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -9,7 +9,6 @@ DOCUMENT_SUMMARY_PROMPT, JOURNAL_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - ROLLING_SUMMARY_PROMPT, STANDARD_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, @@ -71,26 +70,23 @@ def test_meta_prompt_has_placeholders(self) -> None: assert "Summary 1" in result assert "200" in result - def test_rolling_prompt_has_placeholders(self) -> None: - """Test ROLLING prompt contains required placeholders.""" - assert "{prior_summary}" in ROLLING_SUMMARY_PROMPT - assert "{new_content}" in ROLLING_SUMMARY_PROMPT - assert "{max_words}" in ROLLING_SUMMARY_PROMPT - - def test_conversation_prompt_has_content(self) -> None: - """Test CONVERSATION prompt contains content placeholder.""" + def test_conversation_prompt_has_placeholders(self) -> None: + """Test CONVERSATION prompt contains required placeholders.""" assert "{content}" in CONVERSATION_SUMMARY_PROMPT assert "{max_words}" in CONVERSATION_SUMMARY_PROMPT + assert "{prior_context}" in CONVERSATION_SUMMARY_PROMPT - def test_journal_prompt_has_content(self) -> None: - """Test JOURNAL prompt contains content placeholder.""" + def test_journal_prompt_has_placeholders(self) -> None: + """Test JOURNAL prompt contains required placeholders.""" assert "{content}" in JOURNAL_SUMMARY_PROMPT assert "{max_words}" in JOURNAL_SUMMARY_PROMPT + assert "{prior_context}" in JOURNAL_SUMMARY_PROMPT - def test_document_prompt_has_content(self) -> None: - """Test DOCUMENT prompt contains content placeholder.""" + def test_document_prompt_has_placeholders(self) -> None: + """Test DOCUMENT prompt contains required placeholders.""" assert "{content}" in DOCUMENT_SUMMARY_PROMPT assert "{max_words}" in DOCUMENT_SUMMARY_PROMPT + assert "{prior_context}" in DOCUMENT_SUMMARY_PROMPT class TestGetPromptForContentType: From 01c67aa42707b9c7c4add0c90168e0d2190012e4 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:25:41 -0800 Subject: [PATCH 15/38] feat(cli): add summarize command for adaptive hierarchical summarization Expose the full power of the summarizer through a CLI command that: - Follows existing CLI patterns using shared opts module - Supports all LLM providers (ollama, openai, gemini) - Offers content-type prompts (general, conversation, journal, document) - Provides output formats (text, json, full hierarchical) - Includes chunking options and rolling summary support - Reads from file or stdin --- agent_cli/agents/__init__.py | 3 + agent_cli/agents/summarize.py | 435 ++++++++++++++++++++++++++++++++++ agent_cli/cli.py | 1 + uv.lock | 61 +++++ 4 files changed, 500 insertions(+) create mode 100644 
agent_cli/agents/summarize.py diff --git a/agent_cli/agents/__init__.py b/agent_cli/agents/__init__.py index b1741d75..03b3d74a 100644 --- a/agent_cli/agents/__init__.py +++ b/agent_cli/agents/__init__.py @@ -5,8 +5,10 @@ autocorrect, chat, memory_proxy, + rag_proxy, server, speak, + summarize, transcribe, voice_edit, ) @@ -19,6 +21,7 @@ "rag_proxy", "server", "speak", + "summarize", "transcribe", "voice_edit", ] diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py new file mode 100644 index 00000000..abc8dfc7 --- /dev/null +++ b/agent_cli/agents/summarize.py @@ -0,0 +1,435 @@ +"""Summarize text files or stdin using adaptive hierarchical summarization.""" + +from __future__ import annotations + +import asyncio +import contextlib +import json +import sys +import time +from enum import Enum +from pathlib import Path # noqa: TC003 +from typing import TYPE_CHECKING + +import typer + +from agent_cli import config, opts +from agent_cli.cli import app +from agent_cli.core.utils import ( + console, + create_status, + print_command_line_args, + print_error_message, + print_input_panel, + print_output_panel, + print_with_style, + setup_logging, +) +from agent_cli.summarizer import SummarizationError, SummarizerConfig, summarize +from agent_cli.summarizer._utils import count_tokens + +if TYPE_CHECKING: + from agent_cli.summarizer import SummaryResult + + +class ContentType(str, Enum): + """Content type for specialized summarization prompts.""" + + general = "general" + conversation = "conversation" + journal = "journal" + document = "document" + + +class OutputFormat(str, Enum): + """Output format for the summarization result.""" + + text = "text" + json = "json" + full = "full" + + +def _read_input(file_path: Path | None) -> str | None: + """Read input from file or stdin.""" + if file_path: + if not file_path.exists(): + print_error_message( + f"File not found: {file_path}", + "Please check the file path and try again.", + ) + return None + return file_path.read_text(encoding="utf-8") + + # Read from stdin + if sys.stdin.isatty(): + print_error_message( + "No input provided", + "Provide a file path or pipe content via stdin.", + ) + return None + + return sys.stdin.read() + + +def _display_input_preview( + content: str, + token_count: int, + *, + quiet: bool, + max_preview_chars: int = 500, +) -> None: + """Display a preview of the input content.""" + if quiet: + return + + preview = content[:max_preview_chars] + if len(content) > max_preview_chars: + preview += f"\n... 
[{len(content) - max_preview_chars} more characters]" + + print_input_panel( + preview, + title=f"Input ({token_count:,} tokens)", + ) + + +def _display_result( + result: SummaryResult, + elapsed: float, + output_format: OutputFormat, + *, + quiet: bool, +) -> None: + """Display the summarization result.""" + if output_format == OutputFormat.json: + print(json.dumps(result.model_dump(mode="json"), indent=2)) + return + + if output_format == OutputFormat.full: + _display_full_result(result, elapsed, quiet=quiet) + return + + # Text output - just the summary + if quiet: + if result.summary: + print(result.summary) + elif result.summary: + print_output_panel( + result.summary, + title=f"Summary (Level: {result.level.name})", + subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]", + ) + else: + print_with_style( + f"No summary generated (input too short: {result.input_tokens} tokens)", + style="yellow", + ) + + +def _display_full_result( + result: SummaryResult, + elapsed: float, + *, + quiet: bool, +) -> None: + """Display full hierarchical result with all levels.""" + if quiet: + if result.summary: + print(result.summary) + return + + console.print() + console.print("[bold cyan]Summarization Result[/bold cyan]") + console.print(f" Level: [bold]{result.level.name}[/bold]") + console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") + console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") + console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") + console.print(f" Time: [bold]{elapsed:.2f}s[/bold]") + console.print() + + if result.hierarchical: + if result.hierarchical.l1_summaries: + console.print( + f"[bold yellow]L1 Chunk Summaries " + f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]", + ) + for cs in result.hierarchical.l1_summaries: + console.print( + f"\n[dim]--- Chunk {cs.chunk_index + 1} " + f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]", + ) + console.print(cs.content) + + if result.hierarchical.l2_summaries: + console.print( + f"\n[bold yellow]L2 Group Summaries " + f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]", + ) + for idx, l2_summary in enumerate(result.hierarchical.l2_summaries): + console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]") + console.print(l2_summary) + + console.print("\n[bold green]L3 Final Summary[/bold green]") + print_output_panel(result.hierarchical.l3_summary, title="Final Summary") + elif result.summary: + print_output_panel( + result.summary, + title=f"Summary ({result.level.name})", + ) + + +def _get_llm_config( + provider_cfg: config.ProviderSelection, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, +) -> tuple[str, str, str | None]: + """Get openai_base_url, model, and api_key from provider config.""" + if provider_cfg.llm_provider == "ollama": + # Ollama uses OpenAI-compatible API at /v1 + base_url = ollama_cfg.llm_ollama_host.rstrip("/") + if not base_url.endswith("/v1"): + base_url = f"{base_url}/v1" + return base_url, ollama_cfg.llm_ollama_model, None + if provider_cfg.llm_provider == "openai": + base_url = openai_llm_cfg.openai_base_url or "https://api.openai.com/v1" + return base_url, openai_llm_cfg.llm_openai_model, openai_llm_cfg.openai_api_key + # gemini + return ( + "https://generativelanguage.googleapis.com/v1beta/openai", + gemini_llm_cfg.llm_gemini_model, + gemini_llm_cfg.gemini_api_key, + ) + + +async def _async_summarize( + 
content: str, + *, + content_type: ContentType, + prior_summary: str | None, + provider_cfg: config.ProviderSelection, + ollama_cfg: config.Ollama, + openai_llm_cfg: config.OpenAILLM, + gemini_llm_cfg: config.GeminiLLM, + general_cfg: config.General, + chunk_size: int, + chunk_overlap: int, + max_concurrent_chunks: int, + output_format: OutputFormat, +) -> None: + """Asynchronous summarization entry point.""" + setup_logging(general_cfg.log_level, general_cfg.log_file, quiet=general_cfg.quiet) + + openai_base_url, model, api_key = _get_llm_config( + provider_cfg, + ollama_cfg, + openai_llm_cfg, + gemini_llm_cfg, + ) + + token_count = count_tokens(content, model) + _display_input_preview(content, token_count, quiet=general_cfg.quiet) + + summarizer_config = SummarizerConfig( + openai_base_url=openai_base_url, + model=model, + api_key=api_key, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + max_concurrent_chunks=max_concurrent_chunks, + ) + + try: + if not general_cfg.quiet: + status = create_status(f"Summarizing with {model}...", "bold yellow") + else: + status = contextlib.nullcontext() + + with status: + start_time = time.monotonic() + result = await summarize( + content, + summarizer_config, + prior_summary=prior_summary, + content_type=content_type.value, + ) + elapsed = time.monotonic() - start_time + + _display_result(result, elapsed, output_format, quiet=general_cfg.quiet) + + except SummarizationError as e: + print_error_message( + str(e), + f"Check that your LLM server is running at {openai_base_url}", + ) + sys.exit(1) + except Exception as e: + print_error_message(str(e), "An unexpected error occurred during summarization.") + sys.exit(1) + + +@app.command("summarize") +def summarize_command( + *, + file_path: Path | None = typer.Argument( # noqa: B008 + None, + help="Path to file to summarize. 
If not provided, reads from stdin.", + ), + # --- Content Options --- + content_type: ContentType = typer.Option( # noqa: B008 + ContentType.general, + "--type", + "-t", + help="Content type for specialized summarization prompts.", + rich_help_panel="Content Options", + ), + prior_summary: str | None = typer.Option( + None, + "--prior-summary", + help="Prior summary to integrate with (for rolling summaries).", + rich_help_panel="Content Options", + ), + prior_summary_file: Path | None = typer.Option( # noqa: B008 + None, + "--prior-summary-file", + help="File containing prior summary to integrate with.", + rich_help_panel="Content Options", + ), + # --- Chunking Options --- + chunk_size: int = typer.Option( + 3000, + "--chunk-size", + help="Target token count per chunk for hierarchical summarization.", + rich_help_panel="Chunking Options", + ), + chunk_overlap: int = typer.Option( + 200, + "--chunk-overlap", + help="Token overlap between chunks for context continuity.", + rich_help_panel="Chunking Options", + ), + max_concurrent_chunks: int = typer.Option( + 5, + "--max-concurrent", + help="Maximum number of chunks to process in parallel.", + rich_help_panel="Chunking Options", + ), + # --- Output Options --- + output_format: OutputFormat = typer.Option( # noqa: B008 + OutputFormat.text, + "--output", + "-o", + help="Output format: 'text' (summary only), 'json' (full result), 'full' (all levels).", + rich_help_panel="Output Options", + ), + # --- Provider Selection --- + llm_provider: str = opts.LLM_PROVIDER, + # --- LLM Configuration --- + # Ollama (local service) + llm_ollama_model: str = opts.LLM_OLLAMA_MODEL, + llm_ollama_host: str = opts.LLM_OLLAMA_HOST, + # OpenAI + llm_openai_model: str = opts.LLM_OPENAI_MODEL, + openai_api_key: str | None = opts.OPENAI_API_KEY, + openai_base_url: str | None = opts.OPENAI_BASE_URL, + # Gemini + llm_gemini_model: str = opts.LLM_GEMINI_MODEL, + gemini_api_key: str | None = opts.GEMINI_API_KEY, + # --- General Options --- + log_level: str = opts.LOG_LEVEL, + log_file: str | None = opts.LOG_FILE, + quiet: bool = opts.QUIET, + config_file: str | None = opts.CONFIG_FILE, + print_args: bool = opts.PRINT_ARGS, +) -> None: + """Summarize text using adaptive hierarchical summarization. 
+ + Reads from a file or stdin and produces a summary scaled to the input complexity: + + - NONE (<100 tokens): No summary needed + - BRIEF (100-500): Single sentence + - STANDARD (500-3000): Paragraph + - DETAILED (3000-15000): Chunked with meta-summary + - HIERARCHICAL (>15000): Full L1/L2/L3 tree + + Examples: + # Summarize a file + agent-cli summarize document.txt + + # Summarize with conversation-specific prompts + agent-cli summarize chat.txt --type conversation + + # Pipe content from stdin + cat book.txt | agent-cli summarize + + # Get full hierarchical output + agent-cli summarize large_document.txt --output full + + # Use OpenAI instead of Ollama + agent-cli summarize notes.md --llm-provider openai + + """ + if print_args: + print_command_line_args(locals()) + + # Create config objects following the standard pattern + provider_cfg = config.ProviderSelection( + llm_provider=llm_provider, + asr_provider="wyoming", # Not used, but required by model + tts_provider="wyoming", # Not used, but required by model + ) + ollama_cfg = config.Ollama( + llm_ollama_model=llm_ollama_model, + llm_ollama_host=llm_ollama_host, + ) + openai_llm_cfg = config.OpenAILLM( + llm_openai_model=llm_openai_model, + openai_api_key=openai_api_key, + openai_base_url=openai_base_url, + ) + gemini_llm_cfg = config.GeminiLLM( + llm_gemini_model=llm_gemini_model, + gemini_api_key=gemini_api_key, + ) + general_cfg = config.General( + log_level=log_level, + log_file=log_file, + quiet=quiet, + clipboard=False, # summarize doesn't use clipboard + ) + + # Read content + content = _read_input(file_path) + if content is None: + raise typer.Exit(1) + + if not content.strip(): + print_error_message("Empty input", "The input file or stdin is empty.") + raise typer.Exit(1) + + # Handle prior summary from file + actual_prior_summary = prior_summary + if prior_summary_file: + if not prior_summary_file.exists(): + print_error_message( + f"Prior summary file not found: {prior_summary_file}", + "Please check the file path.", + ) + raise typer.Exit(1) + actual_prior_summary = prior_summary_file.read_text(encoding="utf-8") + + asyncio.run( + _async_summarize( + content, + content_type=content_type, + prior_summary=actual_prior_summary, + provider_cfg=provider_cfg, + ollama_cfg=ollama_cfg, + openai_llm_cfg=openai_llm_cfg, + gemini_llm_cfg=gemini_llm_cfg, + general_cfg=general_cfg, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + max_concurrent_chunks=max_concurrent_chunks, + output_format=output_format, + ), + ) diff --git a/agent_cli/cli.py b/agent_cli/cli.py index 0346de48..0adc4b74 100644 --- a/agent_cli/cli.py +++ b/agent_cli/cli.py @@ -57,6 +57,7 @@ def set_config_defaults(ctx: typer.Context, config_file: str | None) -> None: rag_proxy, server, speak, + summarize, transcribe, voice_edit, ) diff --git a/uv.lock b/uv.lock index f69fd193..9bf4c468 100644 --- a/uv.lock +++ b/uv.lock @@ -41,6 +41,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken" }, { name = "versioningit" }, ] memory = [ @@ -49,6 +50,7 @@ memory = [ { name = "huggingface-hub" }, { name = "onnxruntime" }, { name = "pyyaml" }, + { name = "tiktoken" }, { name = "transformers" }, { name = "watchfiles" }, ] @@ -74,6 +76,7 @@ test = [ { name = "pytest-cov" }, { name = "pytest-mock" }, { name = "pytest-timeout" }, + { name = "tiktoken" }, ] [package.dev-dependencies] @@ -90,6 +93,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken" }, { name = 
"versioningit" }, ] @@ -127,6 +131,8 @@ requires-dist = [ { name = "rich" }, { name = "ruff", marker = "extra == 'dev'" }, { name = "sounddevice" }, + { name = "tiktoken", marker = "extra == 'memory'", specifier = ">=0.5.0" }, + { name = "tiktoken", marker = "extra == 'test'", specifier = ">=0.5.0" }, { name = "transformers", marker = "extra == 'memory'", specifier = ">=4.30.0" }, { name = "transformers", marker = "extra == 'rag'", specifier = ">=4.30.0" }, { name = "typer" }, @@ -151,6 +157,7 @@ dev = [ { name = "pytest-mock" }, { name = "pytest-timeout" }, { name = "ruff" }, + { name = "tiktoken", specifier = ">=0.5.0" }, { name = "versioningit" }, ] @@ -4378,6 +4385,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/9e/2064975477fdc887e47ad42157e214526dcad8f317a948dee17e1659a62f/terminado-0.18.1-py3-none-any.whl", hash = "sha256:a4468e1b37bb318f8a86514f65814e1afc977cf29b3992a4500d9dd305dcceb0", size = 14154, upload-time = "2024-03-12T14:34:36.569Z" }, ] +[[package]] +name = "tiktoken" +version = "0.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "regex" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/46/21ea696b21f1d6d1efec8639c204bdf20fde8bafb351e1355c72c5d7de52/tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb", size = 1051565, upload-time = "2025-10-06T20:21:44.566Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d9/35c5d2d9e22bb2a5f74ba48266fb56c63d76ae6f66e02feb628671c0283e/tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa", size = 995284, upload-time = "2025-10-06T20:21:45.622Z" }, + { url = "https://files.pythonhosted.org/packages/01/84/961106c37b8e49b9fdcf33fe007bb3a8fdcc380c528b20cc7fbba80578b8/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc", size = 1129201, upload-time = "2025-10-06T20:21:47.074Z" }, + { url = "https://files.pythonhosted.org/packages/6a/d0/3d9275198e067f8b65076a68894bb52fd253875f3644f0a321a720277b8a/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded", size = 1152444, upload-time = "2025-10-06T20:21:48.139Z" }, + { url = "https://files.pythonhosted.org/packages/78/db/a58e09687c1698a7c592e1038e01c206569b86a0377828d51635561f8ebf/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd", size = 1195080, upload-time = "2025-10-06T20:21:49.246Z" }, + { url = "https://files.pythonhosted.org/packages/9e/1b/a9e4d2bf91d515c0f74afc526fd773a812232dd6cda33ebea7f531202325/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967", size = 1255240, upload-time = "2025-10-06T20:21:50.274Z" }, + { url = "https://files.pythonhosted.org/packages/9d/15/963819345f1b1fb0809070a79e9dd96938d4ca41297367d471733e79c76c/tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def", size = 879422, upload-time = "2025-10-06T20:21:51.734Z" }, + { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, + { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, + { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, + { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, + { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, + { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, + { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" }, + { url = "https://files.pythonhosted.org/packages/00/61/441588ee21e6b5cdf59d6870f86beb9789e532ee9718c251b391b70c68d6/tiktoken-0.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:775c2c55de2310cc1bc9a3ad8826761cbdc87770e586fd7b6da7d4589e13dab3", size = 1050802, upload-time = "2025-10-06T20:22:00.96Z" }, + { url = "https://files.pythonhosted.org/packages/1f/05/dcf94486d5c5c8d34496abe271ac76c5b785507c8eae71b3708f1ad9b45a/tiktoken-0.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a01b12f69052fbe4b080a2cfb867c4de12c704b56178edf1d1d7b273561db160", size = 993995, upload-time = "2025-10-06T20:22:02.788Z" }, + { url = "https://files.pythonhosted.org/packages/a0/70/5163fe5359b943f8db9946b62f19be2305de8c3d78a16f629d4165e2f40e/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa", size = 1128948, upload-time = "2025-10-06T20:22:03.814Z" }, + { url = "https://files.pythonhosted.org/packages/0c/da/c028aa0babf77315e1cef357d4d768800c5f8a6de04d0eac0f377cb619fa/tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:4a1a4fcd021f022bfc81904a911d3df0f6543b9e7627b51411da75ff2fe7a1be", size = 1151986, upload-time = 
"2025-10-06T20:22:05.173Z" }, + { url = "https://files.pythonhosted.org/packages/a0/5a/886b108b766aa53e295f7216b509be95eb7d60b166049ce2c58416b25f2a/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:981a81e39812d57031efdc9ec59fa32b2a5a5524d20d4776574c4b4bd2e9014a", size = 1194222, upload-time = "2025-10-06T20:22:06.265Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f8/4db272048397636ac7a078d22773dd2795b1becee7bc4922fe6207288d57/tiktoken-0.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9baf52f84a3f42eef3ff4e754a0db79a13a27921b457ca9832cf944c6be4f8f3", size = 1255097, upload-time = "2025-10-06T20:22:07.403Z" }, + { url = "https://files.pythonhosted.org/packages/8e/32/45d02e2e0ea2be3a9ed22afc47d93741247e75018aac967b713b2941f8ea/tiktoken-0.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:b8a0cd0c789a61f31bf44851defbd609e8dd1e2c8589c614cc1060940ef1f697", size = 879117, upload-time = "2025-10-06T20:22:08.418Z" }, + { url = "https://files.pythonhosted.org/packages/ce/76/994fc868f88e016e6d05b0da5ac24582a14c47893f4474c3e9744283f1d5/tiktoken-0.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d5f89ea5680066b68bcb797ae85219c72916c922ef0fcdd3480c7d2315ffff16", size = 1050309, upload-time = "2025-10-06T20:22:10.939Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b8/57ef1456504c43a849821920d582a738a461b76a047f352f18c0b26c6516/tiktoken-0.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b4e7ed1c6a7a8a60a3230965bdedba8cc58f68926b835e519341413370e0399a", size = 993712, upload-time = "2025-10-06T20:22:12.115Z" }, + { url = "https://files.pythonhosted.org/packages/72/90/13da56f664286ffbae9dbcfadcc625439142675845baa62715e49b87b68b/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:fc530a28591a2d74bce821d10b418b26a094bf33839e69042a6e86ddb7a7fb27", size = 1128725, upload-time = "2025-10-06T20:22:13.541Z" }, + { url = "https://files.pythonhosted.org/packages/05/df/4f80030d44682235bdaecd7346c90f67ae87ec8f3df4a3442cb53834f7e4/tiktoken-0.12.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:06a9f4f49884139013b138920a4c393aa6556b2f8f536345f11819389c703ebb", size = 1151875, upload-time = "2025-10-06T20:22:14.559Z" }, + { url = "https://files.pythonhosted.org/packages/22/1f/ae535223a8c4ef4c0c1192e3f9b82da660be9eb66b9279e95c99288e9dab/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:04f0e6a985d95913cabc96a741c5ffec525a2c72e9df086ff17ebe35985c800e", size = 1194451, upload-time = "2025-10-06T20:22:15.545Z" }, + { url = "https://files.pythonhosted.org/packages/78/a7/f8ead382fce0243cb625c4f266e66c27f65ae65ee9e77f59ea1653b6d730/tiktoken-0.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0ee8f9ae00c41770b5f9b0bb1235474768884ae157de3beb5439ca0fd70f3e25", size = 1253794, upload-time = "2025-10-06T20:22:16.624Z" }, + { url = "https://files.pythonhosted.org/packages/93/e0/6cc82a562bc6365785a3ff0af27a2a092d57c47d7a81d9e2295d8c36f011/tiktoken-0.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dc2dd125a62cb2b3d858484d6c614d136b5b848976794edfb63688d539b8b93f", size = 878777, upload-time = "2025-10-06T20:22:18.036Z" }, + { url = "https://files.pythonhosted.org/packages/72/05/3abc1db5d2c9aadc4d2c76fa5640134e475e58d9fbb82b5c535dc0de9b01/tiktoken-0.12.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a90388128df3b3abeb2bfd1895b0681412a8d7dc644142519e6f0a97c2111646", size = 1050188, upload-time = "2025-10-06T20:22:19.563Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/7b/50c2f060412202d6c95f32b20755c7a6273543b125c0985d6fa9465105af/tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:da900aa0ad52247d8794e307d6446bd3cdea8e192769b56276695d34d2c9aa88", size = 993978, upload-time = "2025-10-06T20:22:20.702Z" }, + { url = "https://files.pythonhosted.org/packages/14/27/bf795595a2b897e271771cd31cb847d479073497344c637966bdf2853da1/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:285ba9d73ea0d6171e7f9407039a290ca77efcdb026be7769dccc01d2c8d7fff", size = 1129271, upload-time = "2025-10-06T20:22:22.06Z" }, + { url = "https://files.pythonhosted.org/packages/f5/de/9341a6d7a8f1b448573bbf3425fa57669ac58258a667eb48a25dfe916d70/tiktoken-0.12.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:d186a5c60c6a0213f04a7a802264083dea1bbde92a2d4c7069e1a56630aef830", size = 1151216, upload-time = "2025-10-06T20:22:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/75/0d/881866647b8d1be4d67cb24e50d0c26f9f807f994aa1510cb9ba2fe5f612/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:604831189bd05480f2b885ecd2d1986dc7686f609de48208ebbbddeea071fc0b", size = 1194860, upload-time = "2025-10-06T20:22:24.602Z" }, + { url = "https://files.pythonhosted.org/packages/b3/1e/b651ec3059474dab649b8d5b69f5c65cd8fcd8918568c1935bd4136c9392/tiktoken-0.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8f317e8530bb3a222547b85a58583238c8f74fd7a7408305f9f63246d1a0958b", size = 1254567, upload-time = "2025-10-06T20:22:25.671Z" }, + { url = "https://files.pythonhosted.org/packages/80/57/ce64fd16ac390fafde001268c364d559447ba09b509181b2808622420eec/tiktoken-0.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:399c3dd672a6406719d84442299a490420b458c44d3ae65516302a99675888f3", size = 921067, upload-time = "2025-10-06T20:22:26.753Z" }, + { url = "https://files.pythonhosted.org/packages/ac/a4/72eed53e8976a099539cdd5eb36f241987212c29629d0a52c305173e0a68/tiktoken-0.12.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2c714c72bc00a38ca969dae79e8266ddec999c7ceccd603cc4f0d04ccd76365", size = 1050473, upload-time = "2025-10-06T20:22:27.775Z" }, + { url = "https://files.pythonhosted.org/packages/e6/d7/0110b8f54c008466b19672c615f2168896b83706a6611ba6e47313dbc6e9/tiktoken-0.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cbb9a3ba275165a2cb0f9a83f5d7025afe6b9d0ab01a22b50f0e74fee2ad253e", size = 993855, upload-time = "2025-10-06T20:22:28.799Z" }, + { url = "https://files.pythonhosted.org/packages/5f/77/4f268c41a3957c418b084dd576ea2fad2e95da0d8e1ab705372892c2ca22/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:dfdfaa5ffff8993a3af94d1125870b1d27aed7cb97aa7eb8c1cefdbc87dbee63", size = 1129022, upload-time = "2025-10-06T20:22:29.981Z" }, + { url = "https://files.pythonhosted.org/packages/4e/2b/fc46c90fe5028bd094cd6ee25a7db321cb91d45dc87531e2bdbb26b4867a/tiktoken-0.12.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:584c3ad3d0c74f5269906eb8a659c8bfc6144a52895d9261cdaf90a0ae5f4de0", size = 1150736, upload-time = "2025-10-06T20:22:30.996Z" }, + { url = "https://files.pythonhosted.org/packages/28/c0/3c7a39ff68022ddfd7d93f3337ad90389a342f761c4d71de99a3ccc57857/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:54c891b416a0e36b8e2045b12b33dd66fb34a4fe7965565f1b482da50da3e86a", size = 1194908, upload-time = "2025-10-06T20:22:32.073Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/0d/c1ad6f4016a3968c048545f5d9b8ffebf577774b2ede3e2e352553b685fe/tiktoken-0.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5edb8743b88d5be814b1a8a8854494719080c28faaa1ccbef02e87354fe71ef0", size = 1253706, upload-time = "2025-10-06T20:22:33.385Z" }, + { url = "https://files.pythonhosted.org/packages/af/df/c7891ef9d2712ad774777271d39fdef63941ffba0a9d59b7ad1fd2765e57/tiktoken-0.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f61c0aea5565ac82e2ec50a05e02a6c44734e91b51c10510b084ea1b8e633a71", size = 920667, upload-time = "2025-10-06T20:22:34.444Z" }, +] + [[package]] name = "tinycss2" version = "1.4.0" From 2e7642a1f493d376a2f167dd5a3aa0158f40889b Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:35:19 -0800 Subject: [PATCH 16/38] refactor(memory): remove dead parent_group field and bundle metadata args - Remove unused parent_group from MemoryMetadata (was never assigned) - Refactor write_memory_file to accept optional MemoryMetadata object instead of 17 individual parameters - Simplify upsert_hierarchical_summary to use MemoryMetadata(**dict) - Rename summary_level to summary_level_name for consistency - Make tiktoken optional in token counting with fallback heuristic --- agent_cli/memory/_files.py | 88 ++++++++++++++-------------- agent_cli/memory/_persistence.py | 35 ++++++----- agent_cli/memory/_store.py | 17 +----- agent_cli/memory/models.py | 2 - agent_cli/summarizer/_utils.py | 17 +++++- agent_cli/summarizer/models.py | 4 +- docs/architecture/summarizer.md | 2 - tests/memory/test_store.py | 4 +- tests/summarizer/test_integration.py | 7 +-- tests/summarizer/test_models.py | 4 +- 10 files changed, 82 insertions(+), 98 deletions(-) diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index aa8bc5ae..1eba8690 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -88,76 +88,74 @@ def soft_delete_memory_file( def write_memory_file( root: Path, *, - conversation_id: str, - role: str, - created_at: str, content: str, - summary_kind: str | None = None, doc_id: str | None = None, + # Either pass pre-built metadata OR individual fields + metadata: MemoryMetadata | None = None, + # Individual fields (used when metadata is None) + conversation_id: str | None = None, + role: str | None = None, + created_at: str | None = None, + summary_kind: str | None = None, source_id: str | None = None, - # Hierarchical summary fields - level: int | None = None, - is_final: bool | None = None, - chunk_index: int | None = None, - parent_group: int | None = None, - group_index: int | None = None, - input_tokens: int | None = None, - output_tokens: int | None = None, - compression_ratio: float | None = None, - summary_level_name: str | None = None, ) -> MemoryFileRecord: - """Render and persist a memory document to disk.""" + """Render and persist a memory document to disk. + + Can be called in two ways: + 1. With pre-built metadata: write_memory_file(root, content=..., metadata=..., doc_id=...) + 2. With individual fields: write_memory_file(root, content=..., conversation_id=..., role=..., ...) 
+ + """ entries_dir, _ = ensure_store_dirs(root) - safe_conversation = _slugify(conversation_id) doc_id = doc_id or str(uuid4()) - safe_ts = _safe_timestamp(created_at) + + # Build or use provided metadata + if metadata is not None: + meta = metadata + else: + if conversation_id is None or role is None or created_at is None: + msg = "Must provide metadata or (conversation_id, role, created_at)" + raise ValueError(msg) + meta = MemoryMetadata( + conversation_id=conversation_id, + role=role, + created_at=created_at, + summary_kind=summary_kind, + source_id=source_id, + ) + + safe_conversation = _slugify(meta.conversation_id) + safe_ts = _safe_timestamp(meta.created_at) # Route by role/category for readability - if summary_kind and level is not None: + if meta.summary_kind and meta.level is not None: # Hierarchical summary file structure - if level == _SUMMARY_LEVEL_L1: + if meta.level == _SUMMARY_LEVEL_L1: subdir = Path("summaries") / "L1" - filename = f"chunk_{chunk_index or 0}.md" - elif level == _SUMMARY_LEVEL_L2: + filename = f"chunk_{meta.chunk_index or 0}.md" + elif meta.level == _SUMMARY_LEVEL_L2: subdir = Path("summaries") / "L2" - filename = f"group_{group_index or 0}.md" + filename = f"group_{meta.group_index or 0}.md" else: # level == _SUMMARY_LEVEL_L3 subdir = Path("summaries") / "L3" filename = "final.md" - elif summary_kind: + elif meta.summary_kind: subdir = Path("summaries") filename = "summary.md" - elif role == "user": + elif meta.role == "user": subdir = Path("turns") / "user" filename = f"{safe_ts}__{doc_id}.md" - elif role == "assistant": + elif meta.role == "assistant": subdir = Path("turns") / "assistant" filename = f"{safe_ts}__{doc_id}.md" - elif role == "memory": + elif meta.role == "memory": subdir = Path("facts") filename = f"{safe_ts}__{doc_id}.md" else: subdir = Path() filename = f"{doc_id}.md" - metadata = MemoryMetadata( - conversation_id=conversation_id, - role=role, - created_at=created_at, - summary_kind=summary_kind, - source_id=source_id, - level=level, - is_final=is_final, - chunk_index=chunk_index, - parent_group=parent_group, - group_index=group_index, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - summary_level_name=summary_level_name, - ) - - front_matter = _render_front_matter(doc_id, metadata) + front_matter = _render_front_matter(doc_id, meta) body = front_matter + "\n" + content.strip() + "\n" file_path = entries_dir / safe_conversation / subdir / filename @@ -165,7 +163,7 @@ def write_memory_file( atomic_write_text(file_path, body) - return MemoryFileRecord(id=doc_id, path=file_path, metadata=metadata, content=content) + return MemoryFileRecord(id=doc_id, path=file_path, metadata=meta, content=content) def load_memory_files(root: Path) -> list[MemoryFileRecord]: diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 91585ade..2af3a268 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -25,13 +25,13 @@ upsert_memories, ) from agent_cli.memory.entities import Fact, Turn +from agent_cli.memory.models import MemoryMetadata if TYPE_CHECKING: from pathlib import Path from chromadb import Collection - from agent_cli.memory.models import MemoryMetadata from agent_cli.summarizer import SummaryResult LOGGER = logging.getLogger(__name__) @@ -212,26 +212,29 @@ def persist_hierarchical_summary( created_at = datetime.now(UTC).isoformat() for entry in entries: - meta = entry["metadata"] + meta_dict = entry["metadata"] + # Build 
MemoryMetadata from the summary result's metadata dict + metadata = MemoryMetadata( + conversation_id=meta_dict["conversation_id"], + role=meta_dict["role"], + created_at=meta_dict.get("created_at", created_at), + summary_kind="summary", + level=meta_dict.get("level"), + is_final=meta_dict.get("is_final"), + chunk_index=meta_dict.get("chunk_index"), + group_index=meta_dict.get("group_index"), + input_tokens=meta_dict.get("input_tokens"), + output_tokens=meta_dict.get("output_tokens"), + compression_ratio=meta_dict.get("compression_ratio"), + summary_level_name=meta_dict.get("summary_level_name"), + ) record = write_memory_file( memory_root, - conversation_id=meta["conversation_id"], - role=meta["role"], - created_at=meta.get("created_at", created_at), content=entry["content"], - summary_kind="summary", doc_id=entry["id"], - level=meta.get("level"), - is_final=meta.get("is_final"), - chunk_index=meta.get("chunk_index"), - parent_group=meta.get("parent_group"), - group_index=meta.get("group_index"), - input_tokens=meta.get("input_tokens"), - output_tokens=meta.get("output_tokens"), - compression_ratio=meta.get("compression_ratio"), - summary_level_name=meta.get("summary_level"), + metadata=metadata, ) - LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta.get("level")) + LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) stored_ids.append(record.id) # Store in ChromaDB diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 722dcda9..b668a2d3 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -176,22 +176,7 @@ def upsert_hierarchical_summary( contents.append(entry["content"]) # Convert the raw metadata dict to MemoryMetadata meta_dict = entry["metadata"] - metadatas.append( - MemoryMetadata( - conversation_id=meta_dict["conversation_id"], - role=meta_dict["role"], - created_at=meta_dict["created_at"], - level=meta_dict.get("level"), - is_final=meta_dict.get("is_final"), - chunk_index=meta_dict.get("chunk_index"), - parent_group=meta_dict.get("parent_group"), - group_index=meta_dict.get("group_index"), - input_tokens=meta_dict.get("input_tokens"), - output_tokens=meta_dict.get("output_tokens"), - compression_ratio=meta_dict.get("compression_ratio"), - summary_level_name=meta_dict.get("summary_level"), - ), - ) + metadatas.append(MemoryMetadata(**meta_dict)) upsert_memories(collection, ids=ids, contents=contents, metadatas=metadatas) return ids diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 4eb289c7..06266c57 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -56,8 +56,6 @@ class MemoryMetadata(BaseModel): """Whether this is the final L3 summary.""" chunk_index: int | None = None """For L1 summaries: index of the source chunk.""" - parent_group: int | None = None - """For L1 summaries: which L2 group this chunk belongs to.""" group_index: int | None = None """For L2 summaries: index of this group.""" input_tokens: int | None = None diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 030b5729..731c5505 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -13,12 +13,16 @@ @lru_cache(maxsize=4) -def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding: +def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: """Get tiktoken encoding for a model, with caching. Falls back to cl100k_base for unknown models (covers most modern LLMs). 
+ Returns None when tiktoken is not installed so callers can use a heuristic. """ - import tiktoken # noqa: PLC0415 + try: + import tiktoken # noqa: PLC0415 + except ModuleNotFoundError: + return None try: return tiktoken.encoding_for_model(model) @@ -27,7 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding: def count_tokens(text: str, model: str = "gpt-4") -> int: - """Count tokens in text using tiktoken. + """Count tokens in text using tiktoken, with a lightweight fallback. Args: text: The text to count tokens for. @@ -40,11 +44,18 @@ def count_tokens(text: str, model: str = "gpt-4") -> int: if not text: return 0 enc = _get_encoding(model) + if enc is None: + return _estimate_token_count(text) # Disable special token checking - LLM outputs may contain special tokens # like <|constrain|>, <|endoftext|>, etc. that we want to count normally return len(enc.encode(text, disallowed_special=())) +def _estimate_token_count(text: str) -> int: + """Very rough token estimate based on character length (~4 chars/token).""" + return max(1, (len(text) + 3) // 4) + + def chunk_text( text: str, chunk_size: int = 3000, diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 4f5c5119..ce6da908 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -185,7 +185,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, - "summary_level": self.level.name, + "summary_level_name": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, @@ -204,7 +204,7 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "role": "summary", "level": HIERARCHICAL_LEVEL_L3, "is_final": True, - "summary_level": self.level.name, + "summary_level_name": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 59f1dbb5..ec7b769f 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -169,7 +169,6 @@ class ChunkSummary(BaseModel): content: str # The summarized text token_count: int # Tokens in this summary source_tokens: int # Tokens in source chunk - parent_group: int | None # L2 group this belongs to class HierarchicalSummary(BaseModel): l1_summaries: list[ChunkSummary] # Individual chunk summaries @@ -191,7 +190,6 @@ Summaries are stored with rich metadata for retrieval and management: | `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | | `chunk_index` | ✓ | | | Position in L1 sequence | | `group_index` | | ✓ | | Position in L2 sequence | -| `parent_group` | ✓ | | | Which L2 group owns this L1 | | `is_final` | | | ✓ | Marks the top-level summary | | `summary_level` | | | ✓ | Name of SummaryLevel enum | | `input_tokens` | | | ✓ | Original content token count | diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 453a21a9..0851d963 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -159,7 +159,7 @@ def test_upsert_hierarchical_summary_simple() -> None: "role": "summary", "level": 3, "is_final": True, - "summary_level": "STANDARD", + "summary_level_name": "STANDARD", "input_tokens": 1000, "output_tokens": 50, "compression_ratio": 0.05, @@ -192,7 +192,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: "role": "summary", "level": 1, 
"chunk_index": 0, - "parent_group": 0, "created_at": "2024-01-01T00:00:00", }, }, @@ -204,7 +203,6 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: "role": "summary", "level": 1, "chunk_index": 1, - "parent_group": 0, "created_at": "2024-01-01T00:00:00", }, }, diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 6eeb133e..5cb97115 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -115,7 +115,7 @@ def test_standard_summary_produces_single_entry(self) -> None: assert entry["content"] == "A paragraph summary of the content." assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "STANDARD" + assert entry["metadata"]["summary_level_name"] == "STANDARD" def test_hierarchical_summary_produces_multiple_entries(self) -> None: """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" @@ -125,21 +125,18 @@ def test_hierarchical_summary_produces_multiple_entries(self) -> None: content="Chunk 0", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=2, content="Chunk 2", token_count=10, source_tokens=100, - parent_group=0, ), ] hierarchical = HierarchicalSummary( @@ -257,14 +254,12 @@ def test_persist_hierarchical_creates_files( content="Chunk 0 content", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1 content", token_count=10, source_tokens=100, - parent_group=0, ), ] hierarchical = HierarchicalSummary( diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index 23509d2e..d3962111 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -223,7 +223,7 @@ def test_to_storage_metadata_simple_summary(self) -> None: assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "STANDARD" + assert entry["metadata"]["summary_level_name"] == "STANDARD" def test_to_storage_metadata_hierarchical(self) -> None: """Test storage metadata for hierarchical summary.""" @@ -233,14 +233,12 @@ def test_to_storage_metadata_hierarchical(self) -> None: content="Chunk 0 text", token_count=10, source_tokens=100, - parent_group=0, ), ChunkSummary( chunk_index=1, content="Chunk 1 text", token_count=12, source_tokens=120, - parent_group=0, ), ] hierarchical = HierarchicalSummary( From 2c5bf41883e946f36ff9e737dd7be4c48fc21b3f Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 22:52:39 -0800 Subject: [PATCH 17/38] perf: lazy imports for pydantic_ai, sounddevice, and numpy Improve CLI startup time from ~0.51s to ~0.16s (69% faster) by deferring heavy imports until they're actually needed: - pydantic_ai: lazy in memory/_ingest.py, summarizer/adaptive.py, rag/engine.py - sounddevice: lazy in core/audio.py (moved to TYPE_CHECKING + function imports) - numpy: lazy in rag/_retriever.py and services/tts.py Update tests to patch modules directly (e.g., pydantic_ai.Agent) instead of through module attributes that no longer exist at import time. Add scripts/profile_imports.py for measuring import performance. 
--- agent_cli/core/audio.py | 6 +- agent_cli/memory/_ingest.py | 16 ++- agent_cli/rag/_retriever.py | 3 +- agent_cli/rag/engine.py | 24 ++-- agent_cli/services/tts.py | 3 +- agent_cli/summarizer/adaptive.py | 9 +- scripts/profile_imports.py | 141 ++++++++++++++++++++++ tests/memory/test_engine.py | 12 +- tests/rag/test_engine.py | 4 +- tests/rag/test_history.py | 2 +- tests/rag/test_rag_integration_liveish.py | 4 +- tests/summarizer/test_adaptive.py | 4 +- tests/test_audio_e2e.py | 28 ++--- 13 files changed, 211 insertions(+), 45 deletions(-) create mode 100755 scripts/profile_imports.py diff --git a/agent_cli/core/audio.py b/agent_cli/core/audio.py index 0455482e..37ef519f 100644 --- a/agent_cli/core/audio.py +++ b/agent_cli/core/audio.py @@ -9,7 +9,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Literal -import sounddevice as sd from rich.text import Text from agent_cli import constants @@ -23,6 +22,7 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator, Awaitable, Callable, Generator + import sounddevice as sd from rich.live import Live from agent_cli import config @@ -41,6 +41,8 @@ class StreamConfig: def to_stream(self) -> sd.Stream: """Create a SoundDevice stream from this configuration.""" + import sounddevice as sd # noqa: PLC0415 + if self.kind == "input": stream_cls = sd.InputStream elif self.kind == "output": @@ -308,6 +310,8 @@ def _get_all_devices() -> list[dict]: List of device info dictionaries with added 'index' field """ + import sounddevice as sd # noqa: PLC0415 + devices = [] try: query_result = sd.query_devices() diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 1c784d9e..70fa5a0f 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -10,11 +10,6 @@ from uuid import uuid4 import httpx -from pydantic_ai import Agent, ModelRetry, PromptedOutput -from pydantic_ai.exceptions import AgentRunError, UnexpectedModelBehavior -from pydantic_ai.models.openai import OpenAIChatModel -from pydantic_ai.providers.openai import OpenAIProvider -from pydantic_ai.settings import ModelSettings from agent_cli.memory._git import commit_changes from agent_cli.memory._persistence import ( @@ -65,6 +60,11 @@ async def extract_salient_facts( if not user_message and not assistant_message: return [] + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.exceptions import AgentRunError, UnexpectedModelBehavior # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + # Extract facts from the latest user turn only (ignore assistant/system). 
transcript = user_message or "" LOGGER.info("Extracting facts from transcript: %r", transcript) @@ -194,6 +194,12 @@ async def reconcile_facts( existing_json = [{"id": idx, "text": mem.content} for idx, mem in enumerate(existing)] existing_ids = set(id_map.keys()) + from pydantic_ai import Agent, ModelRetry, PromptedOutput # noqa: PLC0415 + from pydantic_ai.exceptions import AgentRunError, UnexpectedModelBehavior # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + provider = OpenAIProvider(api_key=api_key or "dummy", base_url=openai_base_url) model_cfg = OpenAIChatModel( model_name=model, diff --git a/agent_cli/rag/_retriever.py b/agent_cli/rag/_retriever.py index 6f20577b..11e32648 100644 --- a/agent_cli/rag/_retriever.py +++ b/agent_cli/rag/_retriever.py @@ -5,7 +5,6 @@ import logging from typing import TYPE_CHECKING -import numpy as np from huggingface_hub import hf_hub_download from onnxruntime import InferenceSession from transformers import AutoTokenizer @@ -68,6 +67,8 @@ def predict( batch_size: int = 32, ) -> list[float]: """Predict relevance scores for query-document pairs.""" + import numpy as np # noqa: PLC0415 + if not pairs: return [] diff --git a/agent_cli/rag/engine.py b/agent_cli/rag/engine.py index 34a574d3..42d93687 100644 --- a/agent_cli/rag/engine.py +++ b/agent_cli/rag/engine.py @@ -9,16 +9,6 @@ from typing import TYPE_CHECKING, Any from fastapi.responses import StreamingResponse -from pydantic_ai import Agent -from pydantic_ai.messages import ( - ModelRequest, - ModelResponse, - SystemPromptPart, - TextPart, - UserPromptPart, -) -from pydantic_ai.models.openai import OpenAIModel -from pydantic_ai.providers.openai import OpenAIProvider from agent_cli.core.sse import format_chunk, format_done from agent_cli.rag._prompt import RAG_PROMPT_NO_TOOLS, RAG_PROMPT_WITH_TOOLS @@ -28,6 +18,8 @@ if TYPE_CHECKING: from chromadb import Collection + from pydantic_ai import Agent + from pydantic_ai.messages import ModelRequest, ModelResponse from pydantic_ai.result import RunResult from agent_cli.rag._retriever import OnnxCrossEncoder @@ -122,6 +114,14 @@ def _convert_messages( messages: list[Message], ) -> tuple[list[ModelRequest | ModelResponse], str]: """Convert OpenAI messages to Pydantic AI messages and extract user prompt.""" + from pydantic_ai.messages import ( # noqa: PLC0415 + ModelRequest, + ModelResponse, + SystemPromptPart, + TextPart, + UserPromptPart, + ) + pyd_messages: list[ModelRequest | ModelResponse] = [] # Validation: Ensure there is at least one message @@ -235,6 +235,10 @@ def read_full_document(file_path: str) -> str: system_prompt = template.format(context=truncated) # 4. 
Setup Agent + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + provider = OpenAIProvider(base_url=openai_base_url, api_key=api_key or "dummy") model = OpenAIModel(model_name=request.model, provider=provider) diff --git a/agent_cli/services/tts.py b/agent_cli/services/tts.py index fe364532..7b6db819 100644 --- a/agent_cli/services/tts.py +++ b/agent_cli/services/tts.py @@ -10,7 +10,6 @@ from pathlib import Path from typing import TYPE_CHECKING -import numpy as np from rich.live import Live from wyoming.audio import AudioChunk, AudioStart, AudioStop from wyoming.tts import Synthesize, SynthesizeVoice @@ -311,6 +310,8 @@ async def _play_audio( live: Live, ) -> None: """Play WAV audio data using SoundDevice.""" + import numpy as np # noqa: PLC0415 + try: wav_io = io.BytesIO(audio_data) speed = audio_output_cfg.tts_speed diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 989bd86b..99fa4641 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -14,10 +14,6 @@ from dataclasses import dataclass from pydantic import BaseModel -from pydantic_ai import Agent -from pydantic_ai.models.openai import OpenAIChatModel -from pydantic_ai.providers.openai import OpenAIProvider -from pydantic_ai.settings import ModelSettings from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, @@ -435,6 +431,11 @@ async def _generate_summary( SummarizationError: If summarization fails. """ + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) model = OpenAIChatModel( model_name=config.model, diff --git a/scripts/profile_imports.py b/scripts/profile_imports.py new file mode 100755 index 00000000..d70b5b39 --- /dev/null +++ b/scripts/profile_imports.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +"""Profile CLI import times to identify slow imports. 
+ +Usage: + python scripts/profile_imports.py # Basic timing + python scripts/profile_imports.py -v # Verbose (show all imports) + python scripts/profile_imports.py --top 20 # Show top 20 slowest + python scripts/profile_imports.py --cli-only # Just measure CLI startup time + + # Raw importtime output (for detailed analysis): + python -X importtime -c "from agent_cli.cli import app" 2>&1 | sort -t'|' -k2 -n +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +import time +from pathlib import Path + + +def measure_import_time(module: str, runs: int = 3) -> float: + """Measure average import time for a module.""" + times = [] + for _ in range(runs): + start = time.perf_counter() + result = subprocess.run( + [sys.executable, "-c", f"import {module}"], + check=False, + capture_output=True, + cwd=Path(__file__).parent.parent, + ) + elapsed = time.perf_counter() - start + if result.returncode != 0: + print(f"Error importing {module}: {result.stderr.decode()}") + return -1 + times.append(elapsed) + return sum(times) / len(times) + + +def get_import_breakdown(module: str) -> list[tuple[float, str]]: + """Get detailed import times using -X importtime.""" + result = subprocess.run( + [sys.executable, "-X", "importtime", "-c", f"import {module}"], + check=False, + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent, + ) + + imports = [] + for line in result.stderr.splitlines(): + if "|" not in line: + continue + parts = line.split("|") + if len(parts) >= 2: # noqa: PLR2004 + try: + # importtime format: "import time: self [us] | cumulative | name" + cumulative = int(parts[1].strip()) + name = parts[2].strip() if len(parts) > 2 else "unknown" # noqa: PLR2004 + imports.append((cumulative / 1_000_000, name)) # Convert to seconds + except (ValueError, IndexError): + continue + + return sorted(imports, reverse=True) + + +def main() -> None: + """Run import profiling and display results.""" + parser = argparse.ArgumentParser(description="Profile CLI import times") + parser.add_argument("-v", "--verbose", action="store_true", help="Show all imports") + parser.add_argument("--top", type=int, default=15, help="Show top N slowest imports") + parser.add_argument("--runs", type=int, default=3, help="Number of runs for averaging") + parser.add_argument("--cli-only", action="store_true", help="Only measure CLI import time") + args = parser.parse_args() + + if args.cli_only: + avg = measure_import_time("agent_cli.cli", runs=args.runs) + print(f"CLI import time: {avg:.3f}s (avg of {args.runs} runs)") + return + + print("=" * 60) + print("CLI Import Time Profiling") + print("=" * 60) + + # Measure key entry points + modules = [ + ("agent_cli", "Base package"), + ("agent_cli.cli", "CLI app (full)"), + ("agent_cli.memory", "Memory module (chromadb)"), + ("agent_cli.rag", "RAG module"), + ("agent_cli.summarizer", "Summarizer module"), + ("agent_cli.agents.assistant", "Assistant agent"), + ("agent_cli.agents.summarize", "Summarize agent"), + ("pydantic_ai", "pydantic-ai"), + ("openai", "OpenAI SDK"), + ] + + print(f"\n{'Module':<30} {'Time (s)':<12} Description") + print("-" * 60) + + for module, desc in modules: + avg_time = measure_import_time(module, runs=args.runs) + if avg_time >= 0: + bar = "█" * int(avg_time * 20) # Visual bar (1 block = 50ms) + print(f"{module:<30} {avg_time:>8.3f}s {desc} {bar}") + + # Detailed breakdown + print(f"\n{'=' * 60}") + print(f"Top {args.top} slowest imports (cumulative time)") + print("=" * 60) + + imports = 
get_import_breakdown("agent_cli.cli") + + shown = 0 + for cumtime, name in imports: + if shown >= args.top and not args.verbose: + break + # Skip very fast imports unless verbose + if cumtime < 0.001 and not args.verbose: # noqa: PLR2004 + continue + bar = "█" * int(cumtime * 100) # 1 block = 10ms + print(f"{cumtime:>8.3f}s {name:<40} {bar}") + shown += 1 + + # Summary + if imports: + total = imports[0][0] if imports else 0 + print(f"\n{'=' * 60}") + print(f"Total CLI import time: {total:.3f}s") + if total > 0.5: # noqa: PLR2004 + print("⚠️ Import time > 500ms - consider lazy imports") + elif total > 0.3: # noqa: PLR2004 + print("⚡ Import time moderate (300-500ms)") + else: + print("✅ Import time good (< 300ms)") + + +if __name__ == "__main__": + main() diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index f386e44d..4ef11858 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -367,7 +367,9 @@ async def fake_reconcile( return entries, [], {} monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) - monkeypatch.setattr(_ingest.Agent, "run", fake_agent_run) + import pydantic_ai # noqa: PLC0415 + + monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run) monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) # High relevance so they aren't filtered monkeypatch.setattr(_retrieval, "predict_relevance", lambda _model, pairs: [5.0 for _ in pairs]) @@ -506,7 +508,9 @@ def __init__(self, output: Any) -> None: return _Result([]) monkeypatch.setattr(engine._streaming, "stream_chat_sse", fake_stream_chat_sse) - monkeypatch.setattr(_ingest.Agent, "run", fake_agent_run) + import pydantic_ai # noqa: PLC0415 + + monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run) response = await engine.process_chat_request( request, @@ -594,7 +598,9 @@ async def fake_reconcile( return entries, [], {} monkeypatch.setattr(_ingest, "reconcile_facts", fake_reconcile) - monkeypatch.setattr(_ingest.Agent, "run", fake_agent_run) + import pydantic_ai # noqa: PLC0415 + + monkeypatch.setattr(pydantic_ai.Agent, "run", fake_agent_run) monkeypatch.setattr(_ingest, "summarize_content", fake_summarize_content) response = await engine.process_chat_request( diff --git a/tests/rag/test_engine.py b/tests/rag/test_engine.py index 9af9b24b..3e8c9ab9 100644 --- a/tests/rag/test_engine.py +++ b/tests/rag/test_engine.py @@ -130,7 +130,7 @@ async def test_process_chat_request_no_rag(tmp_path: Path) -> None: # We mock Agent.run on the class itself because each call creates a NEW instance with ( - patch("agent_cli.rag.engine.Agent.run", new_callable=AsyncMock) as mock_run, + patch("pydantic_ai.Agent.run", new_callable=AsyncMock) as mock_run, patch("agent_cli.rag.engine.search_context") as mock_search, ): mock_run.return_value = mock_run_result @@ -173,7 +173,7 @@ async def test_process_chat_request_with_rag(tmp_path: Path) -> None: ) with ( - patch("agent_cli.rag.engine.Agent.run", new_callable=AsyncMock) as mock_run, + patch("pydantic_ai.Agent.run", new_callable=AsyncMock) as mock_run, patch("agent_cli.rag.engine.search_context") as mock_search, ): mock_run.return_value = mock_run_result diff --git a/tests/rag/test_history.py b/tests/rag/test_history.py index 62973061..6760c49c 100644 --- a/tests/rag/test_history.py +++ b/tests/rag/test_history.py @@ -23,7 +23,7 @@ async def test_process_chat_request_preserves_history(tmp_path: Path) -> None: mock_run_result.usage.return_value = None with ( - patch("agent_cli.rag.engine.Agent.run", new_callable=AsyncMock) as 
mock_run, + patch("pydantic_ai.Agent.run", new_callable=AsyncMock) as mock_run, patch("agent_cli.rag.engine.search_context") as mock_search, ): mock_run.return_value = mock_run_result diff --git a/tests/rag/test_rag_integration_liveish.py b/tests/rag/test_rag_integration_liveish.py index 8d5c86ce..4921ff84 100644 --- a/tests/rag/test_rag_integration_liveish.py +++ b/tests/rag/test_rag_integration_liveish.py @@ -135,8 +135,10 @@ async def agent_handler(messages: list[ModelMessage], _info: Any) -> ModelRespon ) # Patch OpenAIModel to return our FunctionModel + import pydantic_ai.models.openai # noqa: PLC0415 + monkeypatch.setattr( - engine, + pydantic_ai.models.openai, "OpenAIModel", lambda *_, **__: FunctionModel(agent_handler), ) diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index ac04bc12..6acf4317 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -277,7 +277,7 @@ async def test_generate_summary_with_pydantic_ai( mock_result = MagicMock() mock_result.output = SummaryOutput(summary="Generated summary.") - with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + with patch("pydantic_ai.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent @@ -293,7 +293,7 @@ async def test_raises_summarization_error_on_failure( config: SummarizerConfig, ) -> None: """Test that SummarizationError is raised on failure.""" - with patch("agent_cli.summarizer.adaptive.Agent") as mock_agent_class: + with patch("pydantic_ai.Agent") as mock_agent_class: mock_agent = MagicMock() mock_agent.run = AsyncMock(side_effect=Exception("API error")) mock_agent_class.return_value = mock_agent diff --git a/tests/test_audio_e2e.py b/tests/test_audio_e2e.py index 0d72f096..a09094d0 100644 --- a/tests/test_audio_e2e.py +++ b/tests/test_audio_e2e.py @@ -15,7 +15,7 @@ def _mock_sd_query_devices_with_cache_clear() -> None: audio._get_all_devices.cache_clear() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_get_all_devices_caching( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -37,7 +37,7 @@ def test_get_all_devices_caching( mock_query_devices.assert_called_once() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_list_input_devices( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -47,7 +47,7 @@ def test_list_input_devices( audio._list_input_devices() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_list_output_devices( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -57,7 +57,7 @@ def test_list_output_devices( audio._list_output_devices() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_list_all_devices( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -67,7 +67,7 @@ def test_list_all_devices( audio.list_all_devices() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_input_device_by_index( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -86,7 +86,7 @@ def test_input_device_by_index( assert input_device_index == expected_device["index"] -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_input_device_by_name( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -105,7 +105,7 @@ def 
test_input_device_by_name( assert input_device_index == input_device["index"] -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_output_device_by_index( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -128,7 +128,7 @@ def test_output_device_by_index( assert input_device_index == expected_device["index"] -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_output_device_by_name( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -147,7 +147,7 @@ def test_output_device_by_name( assert input_device_index == output_device["index"] -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_input_device_invalid_index( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -162,7 +162,7 @@ def test_input_device_invalid_index( ) -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_input_device_invalid_name( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -177,7 +177,7 @@ def test_input_device_invalid_name( ) -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_output_device_invalid_name( mock_query_devices: Mock, mock_audio_device_info: list[dict], @@ -192,8 +192,8 @@ def test_output_device_invalid_name( ) -@patch("agent_cli.core.audio.sd.InputStream") -@patch("agent_cli.core.audio.sd.OutputStream") +@patch("sounddevice.InputStream") +@patch("sounddevice.OutputStream") def test_open_audio_stream_context_manager( mock_output_stream: Mock, mock_input_stream: Mock, @@ -226,7 +226,7 @@ def test_open_audio_stream_context_manager( mock_output_stream.assert_called() -@patch("agent_cli.core.audio.sd.query_devices") +@patch("sounddevice.query_devices") def test_device_filtering_by_capabilities( mock_query_devices: Mock, ) -> None: From 18d02bdb85caeeb332fc19bd56b387c01ffcfe6b Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:08:46 -0800 Subject: [PATCH 18/38] refactor: reduce duplication in memory store and summarizer - Extract upsert_summary_entries() to avoid double to_storage_metadata() call - Extract _summarize_chunks() helper for async chunk processing pipeline --- agent_cli/memory/_persistence.py | 6 ++-- agent_cli/memory/_store.py | 40 +++++++++++++++++----- agent_cli/summarizer/adaptive.py | 59 ++++++++++++++++---------------- 3 files changed, 64 insertions(+), 41 deletions(-) diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 2af3a268..1bb2102d 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -21,8 +21,8 @@ delete_entries, delete_summaries, list_conversation_entries, - upsert_hierarchical_summary, upsert_memories, + upsert_summary_entries, ) from agent_cli.memory.entities import Fact, Turn from agent_cli.memory.models import MemoryMetadata @@ -237,8 +237,8 @@ def persist_hierarchical_summary( LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) stored_ids.append(record.id) - # Store in ChromaDB - upsert_hierarchical_summary(collection, conversation_id, summary_result) + # Store in ChromaDB (reuse the entries we already built) + upsert_summary_entries(collection, entries) return stored_ids diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index b668a2d3..88edb8c5 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -144,26 +144,24 @@ def delete_entries(collection: 
Collection, ids: list[str]) -> None: delete_docs(collection, ids) -def upsert_hierarchical_summary( +def upsert_summary_entries( collection: Collection, - conversation_id: str, - summary_result: Any, + entries: list[dict[str, Any]], ) -> list[str]: - """Store all levels of a hierarchical summary. + """Store pre-built summary entries to ChromaDB. - Uses SummaryResult.to_storage_metadata() to generate ChromaDB entries - for L1 (chunk), L2 (group), and L3 (final) summaries. + This is the low-level helper that accepts entries already built by + SummaryResult.to_storage_metadata(). Use this when you already have + the entries (e.g., after writing files) to avoid duplicate serialization. Args: collection: ChromaDB collection. - conversation_id: The conversation this summary belongs to. - summary_result: A SummaryResult from the adaptive summarizer. + entries: List of entry dicts with 'id', 'content', and 'metadata' keys. Returns: List of IDs that were upserted. """ - entries = summary_result.to_storage_metadata(conversation_id) if not entries: return [] @@ -182,6 +180,30 @@ def upsert_hierarchical_summary( return ids +def upsert_hierarchical_summary( + collection: Collection, + conversation_id: str, + summary_result: Any, +) -> list[str]: + """Store all levels of a hierarchical summary. + + Convenience wrapper that calls to_storage_metadata() and then + upsert_summary_entries(). If you already have the entries built, + call upsert_summary_entries() directly to avoid duplicate work. + + Args: + collection: ChromaDB collection. + conversation_id: The conversation this summary belongs to. + summary_result: A SummaryResult from the adaptive summarizer. + + Returns: + List of IDs that were upserted. + + """ + entries = summary_result.to_storage_metadata(conversation_id) + return upsert_summary_entries(collection, entries) + + def get_summary_at_level( collection: Collection, conversation_id: str, diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 99fa4641..7d24ef76 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -184,6 +184,34 @@ async def summarize( ) +async def _summarize_chunks( + chunks: list[str], + config: SummarizerConfig, +) -> list[ChunkSummary]: + """Summarize multiple chunks with concurrency control. + + This helper centralizes the semaphore/gather pattern used by both + _detailed_summary and _hierarchical_summary. + + Args: + chunks: List of text chunks to summarize. + config: Summarizer configuration (includes max_concurrent_chunks). + + Returns: + List of ChunkSummary objects in the same order as input chunks. 
+ + """ + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) + total = len(chunks) + + async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: + async with semaphore: + return await _summarize_single_chunk(chunk, idx, total, config) + + gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)) + return list(await asyncio.gather(*gen)) + + async def _summarize_single_chunk( chunk: str, chunk_index: int, @@ -268,21 +296,7 @@ async def _detailed_summary( logger.info("Detailed summary: processing %d chunks", len(chunks)) - # Summarize chunks (with concurrency limit) - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk( - chunk, - idx, - len(chunks), - config, - ) - - chunk_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) + chunk_summaries = await _summarize_chunks(chunks, config) # Generate meta-summary all_summaries = [cs.content for cs in chunk_summaries] @@ -341,20 +355,7 @@ async def _hierarchical_summary( logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) # L1: Summarize each chunk - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk( - chunk, - idx, - len(chunks), - config, - ) - - l1_summaries = await asyncio.gather( - *[summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)], - ) + l1_summaries = await _summarize_chunks(chunks, config) # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) l2_summaries: list[str] = [] From f18b366736891a3b6bf3fd3e475c3b1f66d073ab Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:22:14 -0800 Subject: [PATCH 19/38] refactor: simplify docstrings and remove unused upsert_hierarchical_summary - Replace verbose Args/Returns docstrings with single-line summaries - Remove upsert_hierarchical_summary (was only used in tests) - Update tests to use upsert_summary_entries directly Net: -102 lines --- agent_cli/memory/_store.py | 39 +-------------------- agent_cli/summarizer/_utils.py | 11 +----- agent_cli/summarizer/adaptive.py | 52 +++------------------------- tests/memory/test_store.py | 28 ++++----------- tests/summarizer/test_integration.py | 8 +++-- 5 files changed, 18 insertions(+), 120 deletions(-) diff --git a/agent_cli/memory/_store.py b/agent_cli/memory/_store.py index 88edb8c5..36ace588 100644 --- a/agent_cli/memory/_store.py +++ b/agent_cli/memory/_store.py @@ -148,20 +148,7 @@ def upsert_summary_entries( collection: Collection, entries: list[dict[str, Any]], ) -> list[str]: - """Store pre-built summary entries to ChromaDB. - - This is the low-level helper that accepts entries already built by - SummaryResult.to_storage_metadata(). Use this when you already have - the entries (e.g., after writing files) to avoid duplicate serialization. - - Args: - collection: ChromaDB collection. - entries: List of entry dicts with 'id', 'content', and 'metadata' keys. - - Returns: - List of IDs that were upserted. 
- - """ + """Store pre-built summary entries (from to_storage_metadata) to ChromaDB.""" if not entries: return [] @@ -180,30 +167,6 @@ def upsert_summary_entries( return ids -def upsert_hierarchical_summary( - collection: Collection, - conversation_id: str, - summary_result: Any, -) -> list[str]: - """Store all levels of a hierarchical summary. - - Convenience wrapper that calls to_storage_metadata() and then - upsert_summary_entries(). If you already have the entries built, - call upsert_summary_entries() directly to avoid duplicate work. - - Args: - collection: ChromaDB collection. - conversation_id: The conversation this summary belongs to. - summary_result: A SummaryResult from the adaptive summarizer. - - Returns: - List of IDs that were upserted. - - """ - entries = summary_result.to_storage_metadata(conversation_id) - return upsert_summary_entries(collection, entries) - - def get_summary_at_level( collection: Collection, conversation_id: str, diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 731c5505..2c37159f 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -31,16 +31,7 @@ def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: def count_tokens(text: str, model: str = "gpt-4") -> int: - """Count tokens in text using tiktoken, with a lightweight fallback. - - Args: - text: The text to count tokens for. - model: Model name for tokenizer selection. - - Returns: - Number of tokens in the text. - - """ + """Count tokens using tiktoken, falling back to char-based estimate.""" if not text: return 0 enc = _get_encoding(model) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 7d24ef76..62b9b68c 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -94,15 +94,7 @@ def __post_init__(self) -> None: def determine_level(token_count: int) -> SummaryLevel: - """Determine the appropriate summary level based on token count. - - Args: - token_count: Number of tokens in the input. - - Returns: - The recommended SummaryLevel. - - """ + """Map token count to appropriate SummaryLevel.""" if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: return SummaryLevel.NONE if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: @@ -188,19 +180,7 @@ async def _summarize_chunks( chunks: list[str], config: SummarizerConfig, ) -> list[ChunkSummary]: - """Summarize multiple chunks with concurrency control. - - This helper centralizes the semaphore/gather pattern used by both - _detailed_summary and _hierarchical_summary. - - Args: - chunks: List of text chunks to summarize. - config: Summarizer configuration (includes max_concurrent_chunks). - - Returns: - List of ChunkSummary objects in the same order as input chunks. - - """ + """Summarize chunks concurrently with semaphore-controlled parallelism.""" semaphore = asyncio.Semaphore(config.max_concurrent_chunks) total = len(chunks) @@ -218,18 +198,7 @@ async def _summarize_single_chunk( total_chunks: int, config: SummarizerConfig, ) -> ChunkSummary: - """Summarize a single chunk of content. - - Args: - chunk: The text chunk to summarize. - chunk_index: Index of this chunk (0-based). - total_chunks: Total number of chunks being processed. - config: Summarizer configuration. - - Returns: - ChunkSummary with the summarized content. 
- - """ + """Summarize a single chunk and return its metadata.""" source_tokens = count_tokens(chunk, config.model) target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD) max_words = tokens_to_words(target_tokens) @@ -418,20 +387,7 @@ async def _generate_summary( config: SummarizerConfig, max_tokens: int = 256, ) -> str: - """Generate a summary using the LLM. - - Args: - prompt: The prompt to send to the LLM. - config: Summarizer configuration. - max_tokens: Maximum tokens for the response. - - Returns: - The generated summary text. - - Raises: - SummarizationError: If summarization fails. - - """ + """Call the LLM to generate a summary. Raises SummarizationError on failure.""" from pydantic_ai import Agent # noqa: PLC0415 from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 0851d963..5e8e3314 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -133,21 +133,10 @@ def test_upsert_and_delete_entries_delegate() -> None: assert fake.deleted == [["x"]] -# --- Hierarchical Summary Tests --- +# --- Summary Entry Tests --- -class _MockSummaryResult: - """Mock SummaryResult for testing without importing the full summarizer module.""" - - def __init__(self, entries: list[dict[str, Any]]) -> None: - self._entries = entries - - def to_storage_metadata(self, _conversation_id: str) -> list[dict[str, Any]]: - # Just return the pre-configured entries (ignores conversation_id) - return self._entries - - -def test_upsert_hierarchical_summary_simple() -> None: +def test_upsert_summary_entries_simple() -> None: """Test upserting a simple (non-hierarchical) summary.""" fake = _FakeCollection() entries = [ @@ -167,9 +156,8 @@ def test_upsert_hierarchical_summary_simple() -> None: }, }, ] - mock_result = _MockSummaryResult(entries) - ids = _store.upsert_hierarchical_summary(fake, "conv-123", mock_result) + ids = _store.upsert_summary_entries(fake, entries) assert ids == ["conv-123:summary:L3:final"] assert len(fake.upserts) == 1 @@ -180,7 +168,7 @@ def test_upsert_hierarchical_summary_simple() -> None: assert upserted_metas[0]["is_final"] is True -def test_upsert_hierarchical_summary_with_chunks() -> None: +def test_upsert_summary_entries_with_chunks() -> None: """Test upserting a hierarchical summary with L1 and L3 entries.""" fake = _FakeCollection() entries = [ @@ -221,9 +209,8 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: }, }, ] - mock_result = _MockSummaryResult(entries) - ids = _store.upsert_hierarchical_summary(fake, "conv-456", mock_result) + ids = _store.upsert_summary_entries(fake, entries) assert len(ids) == 3 assert "conv-456:summary:L1:0" in ids @@ -231,12 +218,11 @@ def test_upsert_hierarchical_summary_with_chunks() -> None: assert "conv-456:summary:L3:final" in ids -def test_upsert_hierarchical_summary_empty() -> None: +def test_upsert_summary_entries_empty() -> None: """Test upserting when there are no entries (e.g., NONE level).""" fake = _FakeCollection() - mock_result = _MockSummaryResult([]) - ids = _store.upsert_hierarchical_summary(fake, "conv-789", mock_result) + ids = _store.upsert_summary_entries(fake, []) assert ids == [] assert len(fake.upserts) == 0 diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index 5cb97115..d7028659 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ 
-12,7 +12,7 @@ from agent_cli.memory._store import ( get_final_summary, get_summary_at_level, - upsert_hierarchical_summary, + upsert_summary_entries, ) from agent_cli.summarizer import SummaryLevel, SummaryResult from agent_cli.summarizer.adaptive import determine_level @@ -185,7 +185,8 @@ def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: compression_ratio=0.05, ) - ids = upsert_hierarchical_summary(fake_collection, "conv-123", result) + entries = result.to_storage_metadata("conv-123") + ids = upsert_summary_entries(fake_collection, entries) assert len(ids) == 1 assert "conv-123:summary:L3:final" in ids @@ -225,7 +226,8 @@ def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> N compression_ratio=0.02, ) - ids = upsert_hierarchical_summary(fake_collection, "conv-789", result) + entries = result.to_storage_metadata("conv-789") + ids = upsert_summary_entries(fake_collection, entries) assert len(ids) == 3 # 2 L1 + 1 L3 From 1845640456b2d2b5ac753e68d58d684334ffd6f0 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:32:10 -0800 Subject: [PATCH 20/38] fix(summarizer): strip special tokens from LLM output Some models leak control tokens like <|constrain|>, <|end|>, etc. into their output. Add regex cleanup in _generate_summary(). Also rewrites docs/architecture/summarizer.md to focus on research foundations and design rationale rather than code snippets. --- agent_cli/summarizer/adaptive.py | 6 +- docs/architecture/summarizer.md | 562 ++++++++----------------------- 2 files changed, 141 insertions(+), 427 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 62b9b68c..9d17c8d7 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -11,6 +11,7 @@ import asyncio import logging +import re from dataclasses import dataclass from pydantic import BaseModel @@ -412,7 +413,10 @@ async def _generate_summary( try: result = await agent.run(prompt) - return result.output.summary.strip() + text = result.output.summary.strip() + # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>) + text = re.sub(r"<\|[^|]+\|>", "", text) + return text.strip() except Exception as e: msg = f"Summarization failed: {e}" raise SummarizationError(msg) from e diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index ec7b769f..d69b3b11 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -1,539 +1,249 @@ # Agent CLI: Adaptive Summarizer Technical Specification -This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. The design is grounded in research from Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, compression ratios). +This document describes the architectural decisions, design rationale, and technical approach for the `agent-cli` adaptive summarization subsystem. ## 1. System Overview The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. 
``` -┌─────────────────────────────────────────────────────────────────────┐ -│ Adaptive Summarization Pipeline │ -├─────────────────────────────────────────────────────────────────────┤ -│ │ -│ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy │ -│ │ -│ ┌─────────────────────────────────────────────────────────────┐ │ -│ │ Level Thresholds: │ │ -│ │ < 100 tokens ──▶ NONE (no summary needed) │ │ -│ │ 100-500 ──▶ BRIEF (single sentence) │ │ -│ │ 500-3000 ──▶ STANDARD (paragraph) │ │ -│ │ 3000-15000 ──▶ DETAILED (chunked + meta) │ │ -│ │ > 15000 ──▶ HIERARCHICAL (L1/L2/L3 tree) │ │ -│ └─────────────────────────────────────────────────────────────┘ │ -│ │ -│ Output: SummaryResult with compression metrics │ -└─────────────────────────────────────────────────────────────────────┘ +Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy + │ + ┌───────────────────────────────┼───────────────────────────────┐ + │ │ │ + < 100 tokens 500-15000 tokens > 15000 tokens + │ │ │ + No summary needed Chunked processing Hierarchical tree + + meta-synthesis (L1/L2/L3) ``` **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. - **Research-grounded:** Based on proven approaches from Letta and Mem0. -- **Hierarchical structure:** Preserve detail at multiple granularities. +- **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. --- -## 2. Architectural Decisions +## 2. Research Foundations -### 2.1 Token-Based Level Selection +The summarization approach draws from two research-backed memory systems: -**Decision:** Select summarization strategy based on input token count with fixed thresholds. - -**Rationale:** +### 2.1 Letta (MemGPT) Contributions -- **Predictable behavior:** Users can anticipate output length based on input size. -- **Optimal compression:** Each level targets a specific compression ratio validated by research. -- **Efficiency:** Avoid over-processing short content or under-processing long content. +**Reference:** arXiv:2310.08560 -**Implementation:** +Letta's approach to memory management introduced several techniques adopted here: -```python -THRESHOLD_NONE = 100 # Below this: no summary needed -THRESHOLD_BRIEF = 500 # 100-500: single sentence (~20% compression) -THRESHOLD_STANDARD = 3000 # 500-3000: paragraph (~12% compression) -THRESHOLD_DETAILED = 15000 # 3000-15000: chunked (~7% compression) -# Above 15000: hierarchical tree structure -``` +- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. -**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. +- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries. -### 2.2 Hierarchical Summary Structure (L1/L2/L3) +- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path. -**Decision:** For long content, build a tree of summaries at three levels of granularity. 
+### 2.2 Mem0 Contributions -**Rationale:** +**Reference:** arXiv:2504.19413 -- **Partial eviction:** Inspired by Letta's memory architecture—keep detailed summaries for recent content, compressed summaries for older content. -- **Flexible retrieval:** Different use cases need different detail levels. -- **Progressive compression:** Each level provides ~5x compression over the previous. - -**Implementation:** - -- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks with 200 token overlap. -- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. -- **L3 (Final Summary):** Single synthesized summary of all L2 summaries. - -**Storage:** -```text -summaries/ - L1/ - chunk_0.md # Summary of tokens 0-3000 - chunk_1.md # Summary of tokens 2800-5800 (overlap) - L2/ - group_0.md # Synthesis of chunk_0 through chunk_4 - L3/ - final.md # Final narrative summary -``` +Mem0's memory layer research established compression ratio targets: -### 2.3 Content-Type Aware Prompts +- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content. -**Decision:** Use different prompt templates for different content domains. +- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern. -**Rationale:** +- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). We apply this by first generating summaries, then persisting to both files and vector DB. -- **Conversations:** Focus on user preferences, decisions, action items. -- **Journals:** Emphasize personal insights, emotional context, growth patterns. -- **Documents:** Prioritize key findings, methodology, conclusions. - -**Implementation:** - -```python -def get_prompt_for_content_type(content_type: str) -> str: - match content_type: - case "conversation": return CONVERSATION_PROMPT - case "journal": return JOURNAL_PROMPT - case "document": return DOCUMENT_PROMPT - case _: return STANDARD_PROMPT -``` - -### 2.4 Prior Summary Integration - -**Decision:** Always provide the previous summary as context when updating. - -**Rationale:** - -- **Continuity:** New summaries should build on existing context, not replace it. -- **Incremental updates:** Avoid re-summarizing all content on every update. -- **Context preservation:** Important information from earlier content persists. - -**Implementation:** +--- -- The `prior_summary` parameter is passed through the entire pipeline. -- `ROLLING_PROMPT` specifically handles integrating new facts with existing summaries. -- For hierarchical summaries, only the L3 summary is used as prior context. +## 3. Architectural Decisions -### 2.5 Compression Ratio Tracking +### 3.1 Token-Based Level Selection -**Decision:** Track and report compression metrics for every summary. +**Decision:** Select summarization strategy based on input token count with fixed thresholds. **Rationale:** -- **Transparency:** Users can understand how much information was compressed. -- **Quality monitoring:** Unusual ratios may indicate summarization issues. -- **Optimization:** Metrics inform future threshold tuning. 
- -**Implementation:** - -```python -@dataclass -class SummaryResult: - level: SummaryLevel - summary: str | None - hierarchical: HierarchicalSummary | None - input_tokens: int - output_tokens: int - compression_ratio: float # output/input (lower = more compression) -``` - ---- - -## 3. Data Model +- **Predictable behavior:** Users can anticipate output length based on input size. +- **Optimal compression:** Each level targets a specific compression ratio validated by research. +- **Efficiency:** Avoid over-processing short content or under-processing long content. -### 3.1 Summary Levels +**Thresholds:** | Level | Token Range | Target Compression | Strategy | | :--- | :--- | :--- | :--- | -| `NONE` | < 100 | N/A | No summarization | -| `BRIEF` | 100-500 | ~20% | Single sentence | -| `STANDARD` | 500-3000 | ~12% | Paragraph | -| `DETAILED` | 3000-15000 | ~7% | Chunked + meta | -| `HIERARCHICAL` | > 15000 | ~3-5% | L1/L2/L3 tree | - -### 3.2 Hierarchical Summary Structure - -```python -class ChunkSummary(BaseModel): - chunk_index: int # Position in original content - content: str # The summarized text - token_count: int # Tokens in this summary - source_tokens: int # Tokens in source chunk - -class HierarchicalSummary(BaseModel): - l1_summaries: list[ChunkSummary] # Individual chunk summaries - l2_summaries: list[str] # Group summaries - l3_summary: str # Final synthesis - chunk_size: int = 3000 # Tokens per chunk - chunk_overlap: int = 200 # Overlap between chunks -``` - -### 3.3 Storage Metadata (ChromaDB) - -Summaries are stored with rich metadata for retrieval and management: +| NONE | < 100 | N/A | No summarization needed | +| BRIEF | 100-500 | ~20% | Single sentence | +| STANDARD | 500-3000 | ~12% | Paragraph | +| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis | +| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree | -| Field | L1 | L2 | L3 | Description | -| :--- | :---: | :---: | :---: | :--- | -| `id` | ✓ | ✓ | ✓ | `{conversation_id}:summary:L{n}:{index}` | -| `conversation_id` | ✓ | ✓ | ✓ | Scope key | -| `role` | ✓ | ✓ | ✓ | Always `"summary"` | -| `level` | ✓ | ✓ | ✓ | 1, 2, or 3 | -| `chunk_index` | ✓ | | | Position in L1 sequence | -| `group_index` | | ✓ | | Position in L2 sequence | -| `is_final` | | | ✓ | Marks the top-level summary | -| `summary_level` | | | ✓ | Name of SummaryLevel enum | -| `input_tokens` | | | ✓ | Original content token count | -| `output_tokens` | | | ✓ | Total summary token count | -| `compression_ratio` | | | ✓ | Output/input ratio | -| `created_at` | ✓ | ✓ | ✓ | ISO 8601 timestamp | +**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. -### 3.4 File Format +### 3.2 Hierarchical Summary Structure (L1/L2/L3) -Summary files use Markdown with YAML front matter: +**Decision:** For long content, build a tree of summaries at three levels of granularity. -```markdown ---- -id: "journal:summary:L3:final" -conversation_id: "journal" -role: "summary" -level: 3 -is_final: true -summary_level: "STANDARD" -input_tokens: 1500 -output_tokens: 180 -compression_ratio: 0.12 -created_at: "2025-01-15T10:30:00Z" ---- +**Rationale:** -The user has been exploring adaptive summarization techniques... -``` +- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. +- **Flexible retrieval:** Different use cases need different detail levels. 
RAG queries might want L1 chunks; prompt injection wants L3. +- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure. ---- +**Structure:** -## 4. Processing Pipeline +- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries. +- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. +- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates. -### 4.1 Main Entry Point +**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). -```python -async def summarize( - content: str, - config: SummarizerConfig, - prior_summary: str | None = None, - content_type: str = "general", -) -> SummaryResult -``` +### 3.3 Semantic Boundary Chunking -### 4.2 Level Selection Flow +**Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts. -``` -Input Content - │ - ▼ -┌─────────────┐ -│ Count Tokens│ (tiktoken, cl100k_base) -└──────┬──────┘ - │ - ▼ -┌─────────────────────────────────────────┐ -│ determine_level(token_count) -> Level │ -│ │ -│ < 100 ──▶ NONE │ -│ < 500 ──▶ BRIEF │ -│ < 3000 ──▶ STANDARD │ -│ < 15000 ──▶ DETAILED │ -│ else ──▶ HIERARCHICAL │ -└──────┬──────────────────────────────────┘ - │ - ▼ - Execute level-specific strategy -``` +**Rationale:** -### 4.3 Strategy Execution by Level - -#### NONE Level -- **Action:** Return immediately with no summary. -- **Output:** `SummaryResult(level=NONE, summary=None, compression_ratio=1.0)` - -#### BRIEF Level -- **Prompt:** `BRIEF_PROMPT` - distill to single sentence. -- **LLM Call:** Single generation with low max_tokens. -- **Output:** One-sentence summary. - -#### STANDARD Level -- **Prompt:** `STANDARD_PROMPT` with optional prior summary context. -- **LLM Call:** Single generation. -- **Output:** Paragraph-length summary. - -#### DETAILED Level -1. **Chunk:** Split content into ~3000 token chunks with 200 token overlap. -2. **Parallel L1:** Generate summary for each chunk using `CHUNK_PROMPT`. -3. **Meta-synthesis:** Combine L1 summaries using `META_PROMPT`. -4. **Output:** `HierarchicalSummary` with L1s and L3 (no L2 needed for this size). - -#### HIERARCHICAL Level -1. **Chunk:** Split into ~3000 token chunks with overlap. -2. **Parallel L1:** Generate chunk summaries. -3. **Group:** Organize L1s into groups of ~5. -4. **Parallel L2:** Summarize each group. -5. **L3 Synthesis:** Final meta-summary of all L2s. -6. **Output:** Full `HierarchicalSummary` tree. - -### 4.4 Chunking Algorithm - -```python -def chunk_text( - text: str, - chunk_size: int = 3000, - overlap: int = 200, -) -> list[str]: - """Split text into overlapping chunks on paragraph boundaries.""" -``` +- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries. +- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts. +- **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost. -**Strategy:** +**Fallback chain:** -1. **Paragraph-first:** Try to split on double newlines. 
-2. **Sentence fallback:** If paragraph exceeds chunk_size, split on sentence boundaries. -3. **Character fallback:** For very long sentences (e.g., code), use character splitting. -4. **Overlap handling:** Each chunk starts with the last `overlap` tokens of the previous. +1. Prefer paragraph boundaries (double newlines) +2. Fall back to sentence boundaries (`.!?` followed by space + capital) +3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) -### 4.5 Middle Truncation (Utility) +### 3.4 Content-Type Aware Prompts -For handling very large inputs that could exceed context windows: +**Decision:** Use different prompt templates for different content domains. -```python -def middle_truncate( - text: str, - budget_chars: int, - head_frac: float = 0.3, - tail_frac: float = 0.3, -) -> tuple[str, int]: - """Keep head and tail, remove middle (least likely to contain key info).""" -``` +**Rationale:** -**Rationale:** Research shows that important information clusters at beginnings (introductions, key points) and endings (conclusions, action items). Useful when summarizing very long conversations that may contain pasted codebases. +- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed. +- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience. +- **Documents:** Prioritize key findings, methodology, conclusions—the objective content. ---- +A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. -## 5. Prompt Specifications +### 3.5 Prior Summary Integration (Rolling Updates) -### 5.1 Brief Summary (`BRIEF_PROMPT`) +**Decision:** Always provide the previous summary as context when generating updates. -``` -Distill the following content into a single, comprehensive sentence -that captures the essential meaning: +**Rationale:** -{content} +- **Continuity:** New summaries should build on existing context, not start fresh each time. +- **Incremental updates:** Avoid re-summarizing all historical content on every update. +- **Information preservation:** Important information from earlier content persists through the chain of summaries. -Summary (one sentence): -``` +This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. -### 5.2 Standard Summary (`STANDARD_PROMPT`) +### 3.6 Compression Ratio Tracking -``` -Summarize the following content in a concise paragraph. -{prior_context} -Focus on key information, decisions, and actionable insights. +**Decision:** Track and report compression metrics for every summary. -Content: -{content} +**Rationale:** -Summary: -``` +- **Transparency:** Users can understand how much information was compressed. +- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues. +- **Optimization:** Metrics inform future threshold tuning and quality assessment. -### 5.3 Chunk Summary (`CHUNK_PROMPT`) +Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability. -``` -Summarize this section of a larger document. -Preserve specific details, names, and numbers that may be important. +--- -Section {chunk_index} of {total_chunks}: -{content} +## 4. 
Processing Pipeline -Section summary: -``` +### 4.1 Level Selection -### 5.4 Meta Summary (`META_PROMPT`) +The entry point counts tokens and selects strategy: -``` -Synthesize these section summaries into a coherent narrative. -Maintain logical flow and preserve the most important information. +1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. +2. **Threshold comparison:** Maps token count to `SummaryLevel` enum. +3. **Strategy dispatch:** Calls level-specific handler. -Section Summaries: -{summaries} +### 4.2 Brief and Standard Levels -Synthesized Summary: -``` +For short content (< 3000 tokens): -### 5.5 Content-Type Prompts +- Single LLM call with level-appropriate prompt +- Prior summary injected as context if available +- Content-type selection determines prompt variant +- Returns simple `SummaryResult` with no hierarchical structure -All content-type prompts include `{prior_context}` for rolling summary continuity. +### 4.3 Detailed and Hierarchical Levels -**Conversation:** -``` -Summarize this conversation focusing on: -- User preferences and decisions -- Action items and commitments -- Key topics discussed -``` +For longer content: -**Journal:** -``` -Summarize this journal entry focusing on: -- Personal insights and reflections -- Emotional context and growth -- Goals and intentions -``` +1. **Chunking:** Split content into overlapping chunks on semantic boundaries. +2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM. +3. **L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group. +4. **L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary. -**Document:** -``` -Summarize this document focusing on: -- Key findings and conclusions -- Methodology and approach -- Recommendations and implications -``` +The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure. --- -## 6. Integration with Memory System +## 5. Integration with Memory System -### 6.1 Entry Point +### 5.1 Write Path -The memory system calls the summarizer via `_ingest.summarize_content()`: +The memory system triggers summarization during post-processing: -```python -async def summarize_content( - content: str, - prior_summary: str | None = None, - content_type: str = "general", - openai_base_url: str, - api_key: str | None, - model: str, -) -> SummaryResult -``` +1. Collect content to summarize (extracted facts, conversation turns) +2. Retrieve existing L3 summary as prior context +3. Call summarizer with content + prior summary + content type +4. 
Persist results: delete old summaries, write new files, upsert to ChromaDB -### 6.2 Storage Flow +### 5.2 Read Path -``` -summarize_content() - │ - ▼ -SummaryResult - │ - ▼ -store_adaptive_summary() - │ - ├──▶ persist_hierarchical_summary() - │ │ - │ ├──▶ Delete old summaries (L1, L2, L3) - │ ├──▶ Write new summary files - │ └──▶ Upsert to ChromaDB - │ - └──▶ Return stored IDs -``` +The memory retrieval system uses summaries for context injection: -### 6.3 Retrieval Integration +- Fetches L3 (final) summary for the conversation +- Injects as prefix to retrieved memories in the prompt +- Provides high-level context that individual memory snippets lack -The memory retrieval system uses `get_final_summary()` to fetch the L3 summary: +### 5.3 Storage -```python -def get_final_summary( - collection: Collection, - conversation_id: str, -) -> StoredMemory | None: - """Retrieve the L3 final summary for injection into prompts.""" -``` +Summaries are persisted in two places: + +- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. +- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. --- -## 7. Configuration Reference +## 6. Configuration | Parameter | Default | Description | | :--- | :--- | :--- | -| `openai_base_url` | *required* | Base URL for LLM API | -| `model` | *required* | Model ID for summarization | -| `api_key` | `None` | API key (optional for local models) | -| `chunk_size` | `3000` | Tokens per chunk for hierarchical | -| `chunk_overlap` | `200` | Token overlap between chunks | - -### 7.1 Level Thresholds (Constants) +| `chunk_size` | 3000 | Target tokens per chunk | +| `chunk_overlap` | 200 | Overlap between consecutive chunks | +| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | -| Constant | Value | Description | -| :--- | :--- | :--- | -| `THRESHOLD_NONE` | 100 | Below: no summary | -| `THRESHOLD_BRIEF` | 500 | Below: single sentence | -| `THRESHOLD_STANDARD` | 3000 | Below: paragraph | -| `THRESHOLD_DETAILED` | 15000 | Below: chunked | +Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios. --- -## 8. Error Handling +## 7. Error Handling -### 8.1 Fail-Fast Philosophy +Summarization follows a fail-fast philosophy: -Errors are propagated rather than hidden behind fallbacks: +- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results. +- **Empty input:** Returns NONE level immediately (not an error). +- **Encoding errors:** Falls back to character-based token estimation. -| Error | Behavior | -| :--- | :--- | -| LLM timeout | Raises `SummarizationError` | -| LLM error | Raises `SummarizationError` | -| Token counting failure | Falls back to `cl100k_base` encoding | - -### 8.2 Validation - -- **Empty content:** Returns NONE level immediately. -- **Whitespace-only:** Returns NONE level. -- **Invalid compression ratio:** Clamped to [0.0, 1.0]. +The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. --- -## 9. Performance Considerations - -### 9.1 Token Counting +## 8. Comparison with Alternatives -- Uses `tiktoken` with `cl100k_base` encoding (GPT-4 tokenizer). -- Caches tokenizer instance for efficiency. -- Falls back to character-based estimation if tiktoken unavailable. 
- -### 9.2 Parallel Processing - -For DETAILED and HIERARCHICAL levels: -- L1 chunk summaries can be generated in parallel. -- L2 group summaries can be generated in parallel. -- Only L3 synthesis requires sequential processing. - -### 9.3 Caching - -- Token counts are computed once per content string. -- Prompt templates are loaded once at module import. -- ChromaDB connection is reused across operations. - ---- - -## 10. Comparison with Alternative Approaches - -| Aspect | Adaptive Summarizer | Rolling Summary | Fixed Chunking | +| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization | | :--- | :--- | :--- | :--- | -| **Compression** | 3-20% (varies by level) | ~15% fixed | ~10% fixed | -| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Single level | -| **Context awareness** | Content-type prompts | Generic | Generic | -| **Efficiency** | Skip short content | Always summarize | Always chunk | -| **Research basis** | Letta + Mem0 | Mem0 only | None | - ---- - -## 11. Future Enhancements +| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% | +| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full | +| **Short content** | Skipped (efficient) | Still processed | N/A | +| **Long content** | Tree structure | Single pass | Context overflow | +| **Research basis** | Letta + Mem0 | Mem0 | None | -- **Semantic chunking:** Split on topic boundaries rather than token counts. -- **Incremental L1 updates:** Only re-summarize changed chunks. -- **Quality scoring:** Evaluate summary quality and trigger re-summarization. -- **User feedback loop:** Learn preferred compression ratios per user. +The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment. From b3b19417d32c94921213f09fe2de9a138523f176 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:39:12 -0800 Subject: [PATCH 21/38] docs: correct Mem0 attribution in summarizer documentation After verifying claims against actual Letta and Mem0 codebases: Letta (verified): - Partial eviction (30%) - `partial_evict_summarizer_percentage` - Middle truncation - `middle_truncate_text()` function - Fire-and-forget - `fire_and_forget()` method - arXiv:2310.08560 Mem0 (corrected): - Two-phase architecture (verified) - fact extraction then memory ops - Removed "90%+ compression" claim - refers to token savings vs full context, not summarization compression ratios - Removed "rolling summaries" attribution - not a Mem0 term - arXiv:2504.19413 Also removes incorrect "based on Mem0 research" from code docstrings where compression ratios were empirically chosen, not research-derived. --- agent_cli/summarizer/_utils.py | 17 +-------- agent_cli/summarizer/adaptive.py | 11 ++---- agent_cli/summarizer/models.py | 6 +--- docs/architecture/summarizer.md | 59 +++++++++----------------------- 4 files changed, 21 insertions(+), 72 deletions(-) diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 2c37159f..1c447f32 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -223,22 +223,7 @@ def middle_truncate( def estimate_summary_tokens(input_tokens: int, level: int) -> int: - """Estimate target summary tokens based on input size and level. 
- - Compression ratios based on Mem0 research: - - BRIEF: ~20% compression (80% reduction) - - STANDARD: ~12% compression (88% reduction) - - DETAILED: ~7% compression (93% reduction) - - HIERARCHICAL: Capped with diminishing returns - - Args: - input_tokens: Number of tokens in the input. - level: Summary level (1-4). - - Returns: - Target number of tokens for the summary. - - """ + """Estimate target summary tokens based on input size and level.""" if level == SummaryLevel.NONE: return 0 if level == SummaryLevel.BRIEF: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 9d17c8d7..4a84ecff 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,8 +1,7 @@ """Adaptive summarization that scales with input complexity. -This module implements research-grounded summarization inspired by: -- Letta: Partial eviction (30%), middle truncation, fire-and-forget background processing -- Mem0: Rolling summaries, 90%+ compression, two-phase architecture +Implements hierarchical summarization inspired by Letta's partial eviction approach +and Mem0's two-phase architecture (extraction then storage). Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) """ @@ -11,7 +10,6 @@ import asyncio import logging -import re from dataclasses import dataclass from pydantic import BaseModel @@ -413,10 +411,7 @@ async def _generate_summary( try: result = await agent.run(prompt) - text = result.output.summary.strip() - # Strip special tokens that some models leak (e.g., <|constrain|>, <|end|>) - text = re.sub(r"<\|[^|]+\|>", "", text) - return text.strip() + return result.output.summary.strip() except Exception as e: msg = f"Summarization failed: {e}" raise SummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index ce6da908..36407e45 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -15,11 +15,7 @@ class SummaryLevel(IntEnum): - """Summary granularity levels based on input complexity. - - Thresholds are based on Mem0 research showing optimal compression ratios - at different content lengths. Token counts are approximate guidelines. - """ + """Summary granularity levels based on input complexity.""" NONE = 0 """< 100 tokens: No summary needed, facts only.""" diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index d69b3b11..99318db0 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -20,7 +20,7 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. -- **Research-grounded:** Based on proven approaches from Letta and Mem0. +- **Research-informed:** Draws techniques from Letta's memory management. - **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -28,31 +28,19 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy ## 2. Research Foundations -The summarization approach draws from two research-backed memory systems: - ### 2.1 Letta (MemGPT) Contributions **Reference:** arXiv:2310.08560 -Letta's approach to memory management introduced several techniques adopted here: - -- **Partial eviction:** Rather than discarding old content entirely, compress it to summaries while keeping recent content detailed. 
This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. - -- **Middle truncation:** When content must be reduced, preserve the head (introductions, context-setting) and tail (conclusions, recent events) while removing the middle. Research shows important information clusters at boundaries. - -- **Fire-and-forget background processing:** Summarization runs asynchronously after turn completion, avoiding latency on the critical path. +Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. ### 2.2 Mem0 Contributions **Reference:** arXiv:2504.19413 -Mem0's memory layer research established compression ratio targets: - -- **90%+ compression:** Long-running conversations can achieve 10:1 or better compression while retaining semantic meaning. Our hierarchical approach targets similar ratios for very long content. - -- **Rolling summaries:** New information integrates with existing summaries rather than replacing them. The `prior_summary` parameter throughout our pipeline implements this pattern. +Mem0's memory layer research informed our storage architecture: -- **Two-phase architecture:** Separate extraction (what's important) from storage (how to persist it). We apply this by first generating summaries, then persisting to both files and vector DB. +- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. --- @@ -65,18 +53,17 @@ Mem0's memory layer research established compression ratio targets: **Rationale:** - **Predictable behavior:** Users can anticipate output length based on input size. -- **Optimal compression:** Each level targets a specific compression ratio validated by research. - **Efficiency:** Avoid over-processing short content or under-processing long content. **Thresholds:** -| Level | Token Range | Target Compression | Strategy | -| :--- | :--- | :--- | :--- | -| NONE | < 100 | N/A | No summarization needed | -| BRIEF | 100-500 | ~20% | Single sentence | -| STANDARD | 500-3000 | ~12% | Paragraph | -| DETAILED | 3000-15000 | ~7% | Chunked + meta-synthesis | -| HIERARCHICAL | > 15000 | ~3-5% | L1/L2/L3 tree | +| Level | Token Range | Strategy | +| :--- | :--- | :--- | +| NONE | < 100 | No summarization needed | +| BRIEF | 100-500 | Single sentence | +| STANDARD | 500-3000 | Paragraph | +| DETAILED | 3000-15000 | Chunked + meta-synthesis | +| HIERARCHICAL | > 15000 | L1/L2/L3 tree | **Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. @@ -88,13 +75,13 @@ Mem0's memory layer research established compression ratio targets: - **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. -- **Progressive compression:** Each level provides ~5x compression over the previous, achieving high overall compression while preserving structure. 
+- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. **Structure:** - **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries. - **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. -- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for rolling updates. +- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates. **Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). @@ -126,7 +113,7 @@ Mem0's memory layer research established compression ratio targets: A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. -### 3.5 Prior Summary Integration (Rolling Updates) +### 3.5 Prior Summary Integration **Decision:** Always provide the previous summary as context when generating updates. @@ -136,7 +123,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt - **Incremental updates:** Avoid re-summarizing all historical content on every update. - **Information preservation:** Important information from earlier content persists through the chain of summaries. -This implements Mem0's "rolling summary" pattern. The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. +The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. ### 3.6 Compression Ratio Tracking @@ -220,7 +207,7 @@ Summaries are persisted in two places: | `chunk_overlap` | 200 | Overlap between consecutive chunks | | `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | -Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing and Mem0 research on optimal compression ratios. +Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing. --- @@ -233,17 +220,3 @@ Summarization follows a fail-fast philosophy: - **Encoding errors:** Falls back to character-based token estimation. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. - ---- - -## 8. Comparison with Alternatives - -| Aspect | Adaptive Summarizer | Fixed Rolling Summary | No Summarization | -| :--- | :--- | :--- | :--- | -| **Compression** | 3-20% (scales with input) | ~15% fixed | 0% | -| **Detail preservation** | Hierarchical (L1/L2/L3) | Single level | Full | -| **Short content** | Skipped (efficient) | Still processed | N/A | -| **Long content** | Tree structure | Single pass | Context overflow | -| **Research basis** | Letta + Mem0 | Mem0 | None | - -The adaptive approach's key advantage is matching effort to content: short content stays untouched, medium content gets lightweight summarization, and long content gets full hierarchical treatment. 
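The level selection documented in the patch above reduces to a threshold lookup against the 100/500/3000/15000 token cut-offs. A minimal sketch, assuming the `SummaryLevel` names used elsewhere in this series; the cut-off values are the heuristic ones from the thresholds table, not research-derived:

```python
from enum import IntEnum


class SummaryLevel(IntEnum):
    """Summary granularity, ordered by input size."""

    NONE = 0          # < 100 tokens: skip summarization
    BRIEF = 1         # 100-500 tokens: single sentence
    STANDARD = 2      # 500-3000 tokens: paragraph
    DETAILED = 3      # 3000-15000 tokens: chunked + meta-synthesis
    HIERARCHICAL = 4  # > 15000 tokens: L1/L2/L3 tree


# Heuristic cut-offs from the thresholds table above; tune empirically.
_CUTOFFS: list[tuple[int, SummaryLevel]] = [
    (100, SummaryLevel.NONE),
    (500, SummaryLevel.BRIEF),
    (3000, SummaryLevel.STANDARD),
    (15000, SummaryLevel.DETAILED),
]


def determine_level(token_count: int) -> SummaryLevel:
    """Map an input token count to the summarization strategy."""
    for limit, level in _CUTOFFS:
        if token_count < limit:
            return level
    return SummaryLevel.HIERARCHICAL
```

For example, `determine_level(4200)` selects `DETAILED`, which triggers chunked summarization plus a meta-synthesis pass.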
From 734b43fe98110c7d37a9b59de94eef975dc508b8 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 26 Nov 2025 23:45:11 -0800 Subject: [PATCH 22/38] fix(memory): summarize raw conversation turns, not extracted facts Previously, the summarizer was summarizing the already-compressed extracted facts, which is redundant. Now it summarizes the actual user/assistant messages, which is what makes sense for a conversation summary. --- agent_cli/memory/_ingest.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 70fa5a0f..dbf27f94 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -427,13 +427,20 @@ async def extract_and_store_facts_and_summaries( entries=list(to_add), ) - if enable_summarization and facts: - # Get prior summary for context continuity + # Summarize raw conversation turns (not extracted facts) + has_content = user_message or assistant_message + if enable_summarization and has_content: prior_summary_entry = get_final_summary(collection, conversation_id) prior_summary = prior_summary_entry.content if prior_summary_entry else None - # Summarize the new facts - content_to_summarize = "\n".join(facts) + # Build conversation transcript + parts = [] + if user_message: + parts.append(f"User: {user_message}") + if assistant_message: + parts.append(f"Assistant: {assistant_message}") + content_to_summarize = "\n".join(parts) + summary_start = perf_counter() summary_result = await summarize_content( content=content_to_summarize, From f4d6b69b48c6f5629470f073c9b301357b675eda Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 00:02:23 -0800 Subject: [PATCH 23/38] docs: clarify research foundations vs original design in summarizer - Document what's actually borrowed from research: - Two-phase architecture from Mem0 (arXiv:2504.19413) - Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) - Clarify what Letta does differently (message count, not tokens) - Acknowledge original/heuristic design choices: - Token thresholds (100/500/3000/15000) are not research-backed - L1/L2/L3 hierarchy structure is original - Chunk size (3000) is larger than BOOOOKSCORE's 2048 - Add future improvements section based on research findings --- agent_cli/summarizer/adaptive.py | 14 +++++-- docs/architecture/summarizer.md | 66 ++++++++++++++++++++++++-------- 2 files changed, 62 insertions(+), 18 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 4a84ecff..9536c70e 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,9 +1,17 @@ """Adaptive summarization that scales with input complexity. -Implements hierarchical summarization inspired by Letta's partial eviction approach -and Mem0's two-phase architecture (extraction then storage). +Implements hierarchical summarization with multiple compression levels (L1/L2/L3). -Reference: arXiv:2504.19413 (Mem0), arXiv:2310.08560 (MemGPT/Letta) +Research foundations: +- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413) +- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) + +Original design (not research-backed): +- Token thresholds (100/500/3000/15000) are heuristic +- L1/L2/L3 hierarchy structure +- Chunk size (3000) - BOOOOKSCORE uses 2048 + +See docs/architecture/summarizer.md for detailed design rationale. 
""" from __future__ import annotations diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 99318db0..f08ea1a4 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -20,7 +20,6 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy **Design Goals:** - **Adaptive compression:** Match summarization depth to content complexity. -- **Research-informed:** Draws techniques from Letta's memory management. - **Hierarchical structure:** Preserve detail at multiple granularities for large content. - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -28,19 +27,45 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy ## 2. Research Foundations -### 2.1 Letta (MemGPT) Contributions +This section documents what techniques are borrowed from research vs. what is original design. + +### 2.1 Borrowed: Two-Phase Architecture (Mem0) + +**Reference:** arXiv:2504.19413 + +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. + +### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE) + +**Reference:** arXiv:2310.00785 (ICLR 2024) + +BOOOOKSCORE's research on book-length summarization demonstrated two approaches: +- **Hierarchical merging:** Summarize chunks, then merge chunk summaries +- **Incremental updating:** Maintain a running summary updated with each chunk + +Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure. + +BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**. + +### 2.3 Not Directly Borrowed: Letta's Approach **Reference:** arXiv:2310.08560 -Letta's approach to memory management introduced the **partial eviction** technique adopted here: rather than discarding old content entirely, compress a portion to summaries while keeping recent content detailed. This maps to our hierarchical L1/L2/L3 structure where L1 preserves chunk-level detail and L3 provides high-level synthesis. +Letta (MemGPT) uses a different paradigm focused on **context window management**: +- Message count thresholds (e.g., 10 messages), not token thresholds +- 30% partial eviction when buffer overflows +- Purpose: fit conversation in LLM context window -### 2.2 Mem0 Contributions +Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly. -**Reference:** arXiv:2504.19413 +### 2.4 Original Design (Not Research-Backed) -Mem0's memory layer research informed our storage architecture: +The following aspects are **original design choices without direct research justification**: -- **Two-phase architecture:** Separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning. +- **L1/L2/L3 hierarchy structure:** The three-level design is original. 
The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels). +- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing. +- **L2 group size (5 chunks):** Chosen heuristically. --- @@ -65,7 +90,7 @@ Mem0's memory layer research informed our storage architecture: | DETAILED | 3000-15000 | Chunked + meta-synthesis | | HIERARCHICAL | > 15000 | L1/L2/L3 tree | -**Trade-off:** Fixed thresholds may not be optimal for all content types, but provide consistent, predictable behavior. Content-type prompts provide domain adaptation within each level. +**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically. ### 3.2 Hierarchical Summary Structure (L1/L2/L3) @@ -73,7 +98,7 @@ Mem0's memory layer research informed our storage architecture: **Rationale:** -- **Partial eviction:** Inspired by Letta—keep detailed summaries for granular retrieval, compressed summaries for context injection. +- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models. - **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. - **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. @@ -177,7 +202,7 @@ The parallelism at L1 and L2 levels provides significant speedup for long conten The memory system triggers summarization during post-processing: -1. Collect content to summarize (extracted facts, conversation turns) +1. Collect raw conversation turns (user message + assistant message) 2. Retrieve existing L3 summary as prior context 3. Call summarizer with content + prior summary + content type 4. Persist results: delete old summaries, write new files, upsert to ChromaDB @@ -201,13 +226,13 @@ Summaries are persisted in two places: ## 6. Configuration -| Parameter | Default | Description | +| Parameter | Default | Research Comparison | | :--- | :--- | :--- | -| `chunk_size` | 3000 | Target tokens per chunk | -| `chunk_overlap` | 200 | Overlap between consecutive chunks | -| `max_concurrent_chunks` | 5 | Parallel LLM calls for chunk summarization | +| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 | +| `chunk_overlap` | 200 | No direct comparison | +| `max_concurrent_chunks` | 5 | Implementation choice | -Level thresholds are constants (100, 500, 3000, 15000 tokens) chosen based on empirical testing. +Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research. --- @@ -220,3 +245,14 @@ Summarization follows a fail-fast philosophy: - **Encoding errors:** Falls back to character-based token estimation. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. + +--- + +## 8. Future Improvements + +Based on research findings, consider: + +1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults +2. **Validate token thresholds empirically** with real-world content +3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models +4. 
**Benchmark against BOOOOKSCORE metrics** for coherence evaluation From 484523fbb281f34bfe00ccf93e1d3ea447173a59 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 08:59:28 -0800 Subject: [PATCH 24/38] refactor(summarizer): simplify to NONE/BRIEF/MAP_REDUCE levels Remove old hierarchical summarization (STANDARD, DETAILED, HIERARCHICAL) in favor of a simpler 3-level system inspired by LangChain's map-reduce: - NONE: Skip summarization for very short content (<100 tokens) - BRIEF: Single-pass summary for short content (100-500 tokens) - MAP_REDUCE: LangChain-style map-reduce for longer content (500+ tokens) Key changes: - Add map_reduce.py with dynamic collapse algorithm - Remove HierarchicalSummary and ChunkSummary classes - Rename summary_level_name to summary_level in metadata - Add collapse_depth field to track map-reduce iterations - Use research-backed defaults (chunk_size=2048, token_max=3000) - Update all tests for simplified API - No backward compatibility - clean break from old implementation --- agent_cli/agents/summarize.py | 44 +-- agent_cli/memory/_files.py | 2 +- agent_cli/memory/_ingest.py | 18 +- agent_cli/memory/_persistence.py | 20 +- agent_cli/memory/models.py | 22 +- agent_cli/summarizer/__init__.py | 14 +- agent_cli/summarizer/_prompts.py | 8 +- agent_cli/summarizer/_utils.py | 11 +- agent_cli/summarizer/adaptive.py | 320 +++++------------- agent_cli/summarizer/map_reduce.py | 349 +++++++++++++++++++ agent_cli/summarizer/models.py | 202 ++--------- docs/architecture/summarizer.md | 198 ++++++----- examples/summarizer_demo.py | 91 ++--- tests/memory/test_engine.py | 8 +- tests/memory/test_git_integration.py | 3 +- tests/memory/test_store.py | 135 ++------ tests/summarizer/test_adaptive.py | 147 ++++---- tests/summarizer/test_integration.py | 481 +++------------------------ tests/summarizer/test_models.py | 224 ++----------- tests/summarizer/test_utils.py | 36 +- 20 files changed, 880 insertions(+), 1453 deletions(-) create mode 100644 agent_cli/summarizer/map_reduce.py diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py index abc8dfc7..ec516310 100644 --- a/agent_cli/agents/summarize.py +++ b/agent_cli/agents/summarize.py @@ -1,4 +1,4 @@ -"""Summarize text files or stdin using adaptive hierarchical summarization.""" +"""Summarize text files or stdin using adaptive map-reduce summarization.""" from __future__ import annotations @@ -131,7 +131,7 @@ def _display_full_result( *, quiet: bool, ) -> None: - """Display full hierarchical result with all levels.""" + """Display full result with all metadata.""" if quiet: if result.summary: print(result.summary) @@ -143,34 +143,12 @@ def _display_full_result( console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") + if result.collapse_depth > 0: + console.print(f" Collapse depth: [bold]{result.collapse_depth}[/bold]") console.print(f" Time: [bold]{elapsed:.2f}s[/bold]") console.print() - if result.hierarchical: - if result.hierarchical.l1_summaries: - console.print( - f"[bold yellow]L1 Chunk Summaries " - f"({len(result.hierarchical.l1_summaries)} chunks)[/bold yellow]", - ) - for cs in result.hierarchical.l1_summaries: - console.print( - f"\n[dim]--- Chunk {cs.chunk_index + 1} " - f"({cs.source_tokens:,} → {cs.token_count:,} tokens) ---[/dim]", - ) - console.print(cs.content) - - if result.hierarchical.l2_summaries: - console.print( - 
f"\n[bold yellow]L2 Group Summaries " - f"({len(result.hierarchical.l2_summaries)} groups)[/bold yellow]", - ) - for idx, l2_summary in enumerate(result.hierarchical.l2_summaries): - console.print(f"\n[dim]--- Group {idx + 1} ---[/dim]") - console.print(l2_summary) - - console.print("\n[bold green]L3 Final Summary[/bold green]") - print_output_panel(result.hierarchical.l3_summary, title="Final Summary") - elif result.summary: + if result.summary: print_output_panel( result.summary, title=f"Summary ({result.level.name})", @@ -296,9 +274,9 @@ def summarize_command( ), # --- Chunking Options --- chunk_size: int = typer.Option( - 3000, + 2048, "--chunk-size", - help="Target token count per chunk for hierarchical summarization.", + help="Target token count per chunk for map-reduce summarization.", rich_help_panel="Chunking Options", ), chunk_overlap: int = typer.Option( @@ -341,15 +319,13 @@ def summarize_command( config_file: str | None = opts.CONFIG_FILE, print_args: bool = opts.PRINT_ARGS, ) -> None: - """Summarize text using adaptive hierarchical summarization. + """Summarize text using adaptive map-reduce summarization. Reads from a file or stdin and produces a summary scaled to the input complexity: - NONE (<100 tokens): No summary needed - BRIEF (100-500): Single sentence - - STANDARD (500-3000): Paragraph - - DETAILED (3000-15000): Chunked with meta-summary - - HIERARCHICAL (>15000): Full L1/L2/L3 tree + - MAP_REDUCE (>500): Dynamic collapse until fits token budget Examples: # Summarize a file @@ -361,7 +337,7 @@ def summarize_command( # Pipe content from stdin cat book.txt | agent-cli summarize - # Get full hierarchical output + # Get full output with all metadata agent-cli summarize large_document.txt --output full # Use OpenAI instead of Ollama diff --git a/agent_cli/memory/_files.py b/agent_cli/memory/_files.py index 1eba8690..a51c7ad1 100644 --- a/agent_cli/memory/_files.py +++ b/agent_cli/memory/_files.py @@ -24,7 +24,7 @@ _SNAPSHOT_FILENAME = "memory_index.json" _DELETED_DIRNAME = "deleted" -# Summary level constants for hierarchical file structure +# Summary level constants for file structure (kept for backward compatibility) _SUMMARY_LEVEL_L1 = 1 _SUMMARY_LEVEL_L2 = 2 _SUMMARY_LEVEL_L3 = 3 diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index dbf27f94..2bed16d9 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -15,7 +15,7 @@ from agent_cli.memory._persistence import ( delete_memory_files, persist_entries, - persist_hierarchical_summary, + persist_summary, ) from agent_cli.memory._prompt import ( FACT_INSTRUCTIONS, @@ -303,7 +303,7 @@ async def summarize_content( """Adaptively summarize content based on its length. Automatically selects the appropriate summarization strategy - (NONE, BRIEF, STANDARD, DETAILED, HIERARCHICAL) based on input token count. + (NONE, BRIEF, MAP_REDUCE) based on input token count. Args: content: The content to summarize. @@ -339,27 +339,21 @@ async def store_adaptive_summary( conversation_id: str, summary_result: SummaryResult, ) -> list[str]: - """Store an adaptive summary result to files and ChromaDB. + """Store a summary result to files and ChromaDB. - This stores all levels of a hierarchical summary (L1, L2, L3) or - just the final summary for simpler levels. Old summaries are deleted first. 
- - Files are stored as Markdown with YAML front matter in a hierarchical structure: - - summaries/L1/chunk_{n}.md - L1 chunk summaries - - summaries/L2/group_{n}.md - L2 group summaries - - summaries/L3/final.md - L3 final summary + Old summaries are deleted first, then the new summary is stored. Args: collection: ChromaDB collection. memory_root: Root path for memory files. conversation_id: The conversation this summary belongs to. - summary_result: The result from AdaptiveSummarizer.summarize(). + summary_result: The result from summarize(). Returns: List of IDs that were stored. """ - return persist_hierarchical_summary( + return persist_summary( collection, memory_root=memory_root, conversation_id=conversation_id, diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index 1bb2102d..a7e3871e 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -167,19 +167,19 @@ def evict_if_needed( delete_memory_files(memory_root, conversation_id, ids_to_remove) -def persist_hierarchical_summary( +def persist_summary( collection: Collection, *, memory_root: Path, conversation_id: str, summary_result: SummaryResult, ) -> list[str]: - """Persist a hierarchical summary to disk and ChromaDB. + """Persist a summary to disk and ChromaDB. This function: 1. Deletes existing summaries (files and ChromaDB entries) - 2. Writes new summary files to disk in hierarchical structure - 3. Stores entries in ChromaDB + 2. Writes new summary file to disk + 3. Stores entry in ChromaDB Args: collection: ChromaDB collection. @@ -219,14 +219,12 @@ def persist_hierarchical_summary( role=meta_dict["role"], created_at=meta_dict.get("created_at", created_at), summary_kind="summary", - level=meta_dict.get("level"), is_final=meta_dict.get("is_final"), - chunk_index=meta_dict.get("chunk_index"), - group_index=meta_dict.get("group_index"), input_tokens=meta_dict.get("input_tokens"), output_tokens=meta_dict.get("output_tokens"), compression_ratio=meta_dict.get("compression_ratio"), - summary_level_name=meta_dict.get("summary_level_name"), + summary_level=meta_dict.get("summary_level"), + collapse_depth=meta_dict.get("collapse_depth"), ) record = write_memory_file( memory_root, @@ -234,7 +232,11 @@ def persist_hierarchical_summary( doc_id=entry["id"], metadata=metadata, ) - LOGGER.info("Persisted summary file: %s (level=%s)", record.path, meta_dict.get("level")) + LOGGER.info( + "Persisted summary file: %s (level=%s)", + record.path, + meta_dict.get("summary_level"), + ) stored_ids.append(record.id) # Store in ChromaDB (reuse the entries we already built) diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 06266c57..5b8df385 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -49,23 +49,25 @@ class MemoryMetadata(BaseModel): replaced_by: str | None = None source_id: str | None = None - # Hierarchical summary fields (only used when role="summary") + # Summary fields (only used when role="summary") level: int | None = None - """Summary level: 1=chunk, 2=group, 3=final.""" + """Summary level (deprecated, kept for file structure compatibility).""" is_final: bool | None = None - """Whether this is the final L3 summary.""" + """Whether this is the final summary.""" chunk_index: int | None = None - """For L1 summaries: index of the source chunk.""" + """Deprecated: index of the source chunk.""" group_index: int | None = None - """For L2 summaries: index of this group.""" + """Deprecated: index of this group.""" input_tokens: int | 
None = None - """Number of tokens in the original input (L3 only).""" + """Number of tokens in the original input.""" output_tokens: int | None = None - """Number of tokens in the summary output (L3 only).""" + """Number of tokens in the summary output.""" compression_ratio: float | None = None - """Ratio of output to input tokens (L3 only).""" - summary_level_name: str | None = None - """Name of the SummaryLevel enum used (e.g., 'STANDARD', 'HIERARCHICAL').""" + """Ratio of output to input tokens.""" + summary_level: str | None = None + """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE).""" + collapse_depth: int | None = None + """Number of collapse iterations in map-reduce (0 = no collapse needed).""" class StoredMemory(BaseModel): diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index fc0994c4..af977ada 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -1,8 +1,13 @@ """Adaptive summarization module for variable-length content. -This module provides research-grounded summarization that scales with input complexity, -inspired by Letta (partial eviction, middle truncation) and Mem0 (rolling summaries, -compression ratios) architectures. +This module provides map-reduce summarization inspired by LangChain's approach: +1. Split content into chunks and summarize each in parallel (map phase) +2. Recursively collapse summaries until they fit token_max (reduce phase) + +Research foundations: +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal +- Two-phase architecture concept from Mem0 (arXiv:2504.19413) Example: from agent_cli.summarizer import summarize, SummarizerConfig @@ -17,10 +22,9 @@ """ from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize -from agent_cli.summarizer.models import HierarchicalSummary, SummaryLevel, SummaryResult +from agent_cli.summarizer.models import SummaryLevel, SummaryResult __all__ = [ - "HierarchicalSummary", "SummarizationError", "SummarizerConfig", "SummaryLevel", diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index f46b39eb..1de5fa44 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,7 +4,7 @@ and are optimized for structured, factual output. """ -# Level 1: BRIEF - Single sentence summary +# BRIEF level - Single sentence summary for short content (100-500 tokens) BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). Focus on the single most important point or takeaway. @@ -13,7 +13,7 @@ One-sentence summary:""".strip() -# Level 2: STANDARD - Paragraph summary +# MAP_REDUCE level - Paragraph summary for content-type aware summarization STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. Focus on: @@ -28,7 +28,7 @@ Summary (maximum {max_words} words):""".strip() -# Level 3: DETAILED - Used for individual chunks in hierarchical summarization +# CHUNK - Used in map phase of map-reduce summarization CHUNK_SUMMARY_PROMPT = """Summarize this section of a longer document. Capture the main points while preserving important details. @@ -37,7 +37,7 @@ Summary of this section (maximum {max_words} words):""".strip() -# Level 4: META - Combine multiple summaries into one +# META - Combine multiple summaries in reduce phase META_SUMMARY_PROMPT = """Synthesize these summaries into a single coherent overview. 
Identify common themes and key points across all sections. Eliminate redundancy while preserving unique insights. diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 1c447f32..8dbfb1ff 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -228,15 +228,8 @@ def estimate_summary_tokens(input_tokens: int, level: int) -> int: return 0 if level == SummaryLevel.BRIEF: return min(50, max(20, input_tokens // 5)) - if level == SummaryLevel.STANDARD: - return min(200, max(50, input_tokens // 8)) - if level == SummaryLevel.DETAILED: - return min(500, max(100, input_tokens // 15)) - # HIERARCHICAL - # Base of 1000 tokens plus diminishing returns for additional content - base = 1000 - additional = max(0, (input_tokens - 15000) // 100) - return min(2000, base + additional) + # MAP_REDUCE: ~10% compression with floor/ceiling + return min(500, max(50, input_tokens // 10)) def tokens_to_words(tokens: int) -> int: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 9536c70e..39669e97 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,22 +1,23 @@ -"""Adaptive summarization that scales with input complexity. +"""Adaptive summarization using map-reduce with dynamic collapse. -Implements hierarchical summarization with multiple compression levels (L1/L2/L3). +Implements a simple algorithm inspired by LangChain's map-reduce chains: +1. If content is short enough, summarize directly +2. Otherwise, split into chunks and summarize each (map phase) +3. Recursively collapse summaries until they fit token_max (reduce phase) Research foundations: -- Two-phase architecture (extraction then storage) from Mem0 (arXiv:2504.19413) -- Hierarchical merging concept from BOOOOKSCORE (arXiv:2310.00785) +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal +- Two-phase architecture concept from Mem0 (arXiv:2504.19413) -Original design (not research-backed): -- Token thresholds (100/500/3000/15000) are heuristic -- L1/L2/L3 hierarchy structure -- Chunk size (3000) - BOOOOKSCORE uses 2048 +Key insight: No need for predetermined L1/L2/L3 levels. +Dynamic collapse depth based on actual content length. See docs/architecture/summarizer.md for detailed design rationale. 
""" from __future__ import annotations -import asyncio import logging from dataclasses import dataclass @@ -24,21 +25,20 @@ from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, - CHUNK_SUMMARY_PROMPT, - META_SUMMARY_PROMPT, format_prior_context, - format_summaries_for_meta, get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( - chunk_text, count_tokens, estimate_summary_tokens, tokens_to_words, ) +from agent_cli.summarizer.map_reduce import ( + MapReduceConfig, + MapReduceSummarizationError, + map_reduce_summarize, +) from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, SummaryLevel, SummaryResult, ) @@ -46,18 +46,8 @@ logger = logging.getLogger(__name__) # Thresholds for summary levels (in tokens) -LEVEL_THRESHOLDS = { - SummaryLevel.NONE: 100, - SummaryLevel.BRIEF: 500, - SummaryLevel.STANDARD: 3000, - SummaryLevel.DETAILED: 15000, - # HIERARCHICAL is everything above DETAILED -} - -# Number of L1 chunks to group together for L2 summaries -L2_GROUP_SIZE = 5 -# Minimum number of L1 chunks before L2 grouping is applied -L2_MIN_CHUNKS = 5 +THRESHOLD_NONE = 100 # Below this, no summary needed +THRESHOLD_BRIEF = 500 # Below this, just a single sentence class SummaryOutput(BaseModel): @@ -88,7 +78,8 @@ class SummarizerConfig: openai_base_url: str model: str api_key: str | None = None - chunk_size: int = 3000 + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -102,15 +93,11 @@ def __post_init__(self) -> None: def determine_level(token_count: int) -> SummaryLevel: """Map token count to appropriate SummaryLevel.""" - if token_count < LEVEL_THRESHOLDS[SummaryLevel.NONE]: + if token_count < THRESHOLD_NONE: return SummaryLevel.NONE - if token_count < LEVEL_THRESHOLDS[SummaryLevel.BRIEF]: + if token_count < THRESHOLD_BRIEF: return SummaryLevel.BRIEF - if token_count < LEVEL_THRESHOLDS[SummaryLevel.STANDARD]: - return SummaryLevel.STANDARD - if token_count < LEVEL_THRESHOLDS[SummaryLevel.DETAILED]: - return SummaryLevel.DETAILED - return SummaryLevel.HIERARCHICAL + return SummaryLevel.MAP_REDUCE async def summarize( @@ -121,6 +108,11 @@ async def summarize( ) -> SummaryResult: """Summarize content with adaptive strategy based on length. + Uses a simple algorithm: + - Very short content (<100 tokens): No summary + - Short content (<500 tokens): Single sentence brief summary + - Everything else: Map-reduce with dynamic collapse + Args: content: The content to summarize. config: Summarizer configuration. 
@@ -135,7 +127,6 @@ async def summarize( return SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=0, output_tokens=0, compression_ratio=0.0, @@ -155,7 +146,6 @@ async def summarize( return SummaryResult( level=level, summary=None, - hierarchical=None, input_tokens=input_tokens, output_tokens=0, compression_ratio=0.0, @@ -163,68 +153,22 @@ async def summarize( if level == SummaryLevel.BRIEF: summary = await _brief_summary(content, config) - elif level == SummaryLevel.STANDARD: - summary = await _standard_summary(content, config, prior_summary, content_type) - elif level == SummaryLevel.DETAILED: - return await _detailed_summary(content, input_tokens, config) - else: # HIERARCHICAL - return await _hierarchical_summary(content, input_tokens, config) - - output_tokens = count_tokens(summary, config.model) if summary else 0 - compression_ratio = output_tokens / input_tokens if input_tokens > 0 else 0.0 - - return SummaryResult( - level=level, - summary=summary, - hierarchical=None, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=compression_ratio, - ) - - -async def _summarize_chunks( - chunks: list[str], - config: SummarizerConfig, -) -> list[ChunkSummary]: - """Summarize chunks concurrently with semaphore-controlled parallelism.""" - semaphore = asyncio.Semaphore(config.max_concurrent_chunks) - total = len(chunks) - - async def summarize_with_limit(idx: int, chunk: str) -> ChunkSummary: - async with semaphore: - return await _summarize_single_chunk(chunk, idx, total, config) - - gen = (summarize_with_limit(i, chunk) for i, chunk in enumerate(chunks)) - return list(await asyncio.gather(*gen)) - - -async def _summarize_single_chunk( - chunk: str, - chunk_index: int, - total_chunks: int, - config: SummarizerConfig, -) -> ChunkSummary: - """Summarize a single chunk and return its metadata.""" - source_tokens = count_tokens(chunk, config.model) - target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = CHUNK_SUMMARY_PROMPT.format( - chunk_index=chunk_index + 1, - total_chunks=total_chunks, - content=chunk, - max_words=max_words, - ) - - summary = await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - summary_tokens = count_tokens(summary, config.model) + output_tokens = count_tokens(summary, config.model) if summary else 0 + return SummaryResult( + level=level, + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + ) - return ChunkSummary( - chunk_index=chunk_index, - content=summary, - token_count=summary_tokens, - source_tokens=source_tokens, + # MAP_REDUCE level + return await _map_reduce_summary( + content, + input_tokens, + config, + prior_summary, + content_type, ) @@ -234,159 +178,77 @@ async def _brief_summary(content: str, config: SummarizerConfig) -> str: return await _generate_summary(prompt, config, max_tokens=50) -async def _standard_summary( +async def _map_reduce_summary( content: str, + input_tokens: int, config: SummarizerConfig, prior_summary: str | None, content_type: str, -) -> str: - """Generate a paragraph summary for standard-length content.""" - input_tokens = count_tokens(content, config.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt_template = get_prompt_for_content_type(content_type) - prior_context = 
format_prior_context(prior_summary) - - prompt = prompt_template.format( - content=content, - prior_context=prior_context, - max_words=max_words, - ) - - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _detailed_summary( - content: str, - input_tokens: int, - config: SummarizerConfig, ) -> SummaryResult: - """Generate chunked summaries with meta-summary for detailed content.""" - chunks = chunk_text( - content, - chunk_size=config.chunk_size, - overlap=config.chunk_overlap, - model=config.model, - ) - - logger.info("Detailed summary: processing %d chunks", len(chunks)) - - chunk_summaries = await _summarize_chunks(chunks, config) - - # Generate meta-summary - all_summaries = [cs.content for cs in chunk_summaries] - meta_target = estimate_summary_tokens(input_tokens, SummaryLevel.DETAILED) - max_words = tokens_to_words(meta_target) - - meta_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(all_summaries), - max_words=max_words, - ) - - final_summary = await _generate_summary( - meta_prompt, - config, - max_tokens=meta_target + 100, - ) - output_tokens = count_tokens(final_summary, config.model) + """Use map-reduce with dynamic collapse for longer content.""" + # For content that fits in a single chunk, use content-type aware summary + if input_tokens <= config.token_max: + summary = await _content_aware_summary(content, config, prior_summary, content_type) + output_tokens = count_tokens(summary, config.model) if summary else 0 + return SummaryResult( + level=SummaryLevel.MAP_REDUCE, + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=0, + ) - hierarchical = HierarchicalSummary( - l1_summaries=list(chunk_summaries), - l2_summaries=[], # Not used for DETAILED level - l3_summary=final_summary, + # Use map-reduce for multi-chunk content + mr_config = MapReduceConfig( + openai_base_url=config.openai_base_url, + model=config.model, + api_key=config.api_key, chunk_size=config.chunk_size, + token_max=config.token_max, chunk_overlap=config.chunk_overlap, + max_concurrent=config.max_concurrent_chunks, + timeout=config.timeout, ) + try: + result = await map_reduce_summarize(content, mr_config) + except MapReduceSummarizationError as e: + raise SummarizationError(str(e)) from e + return SummaryResult( - level=SummaryLevel.DETAILED, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + level=SummaryLevel.MAP_REDUCE, + summary=result.summary, + input_tokens=result.input_tokens, + output_tokens=result.output_tokens, + compression_ratio=result.compression_ratio, + collapse_depth=result.collapse_depth, ) -async def _hierarchical_summary( +async def _content_aware_summary( content: str, - input_tokens: int, config: SummarizerConfig, -) -> SummaryResult: - """Build a tree of summaries for very long content. 
- - Structure: - - L1: Individual chunk summaries - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis - """ - chunks = chunk_text( - content, - chunk_size=config.chunk_size, - overlap=config.chunk_overlap, - model=config.model, + prior_summary: str | None, + content_type: str, +) -> str: + """Generate a content-type aware summary for single-chunk content.""" + target_tokens = estimate_summary_tokens( + count_tokens(content, config.model), + SummaryLevel.MAP_REDUCE, ) + max_words = tokens_to_words(target_tokens) - logger.info("Hierarchical summary: processing %d chunks in tree", len(chunks)) - - # L1: Summarize each chunk - l1_summaries = await _summarize_chunks(chunks, config) - - # L2: Group summaries (if more than L2_MIN_CHUNKS chunks) - l2_summaries: list[str] = [] - if len(l1_summaries) > L2_MIN_CHUNKS: - groups: list[list[str]] = [] - for i in range(0, len(l1_summaries), L2_GROUP_SIZE): - group = [cs.content for cs in l1_summaries[i : i + L2_GROUP_SIZE]] - groups.append(group) - - async def summarize_group(group: list[str]) -> str: - combined_tokens = sum(count_tokens(s, config.model) for s in group) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.STANDARD) - max_words = tokens_to_words(target_tokens) - - prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(group), - max_words=max_words, - ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - l2_summaries = await asyncio.gather(*[summarize_group(g) for g in groups]) - - # L3: Final synthesis - summaries_to_synthesize = l2_summaries if l2_summaries else [cs.content for cs in l1_summaries] - final_target = estimate_summary_tokens(input_tokens, SummaryLevel.HIERARCHICAL) - max_words = tokens_to_words(final_target) + prompt_template = get_prompt_for_content_type(content_type) + prior_context = format_prior_context(prior_summary) - final_prompt = META_SUMMARY_PROMPT.format( - summaries=format_summaries_for_meta(summaries_to_synthesize), + prompt = prompt_template.format( + content=content, + prior_context=prior_context, max_words=max_words, ) - final_summary = await _generate_summary( - final_prompt, - config, - max_tokens=final_target + 100, - ) - output_tokens = count_tokens(final_summary, config.model) - - hierarchical = HierarchicalSummary( - l1_summaries=list(l1_summaries), - l2_summaries=list(l2_summaries), - l3_summary=final_summary, - chunk_size=config.chunk_size, - chunk_overlap=config.chunk_overlap, - ) - - return SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary=final_summary, - hierarchical=hierarchical, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - ) + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) async def _generate_summary( diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py new file mode 100644 index 00000000..09d82d09 --- /dev/null +++ b/agent_cli/summarizer/map_reduce.py @@ -0,0 +1,349 @@ +"""Map-reduce summarization inspired by LangChain's approach. + +Simple algorithm: +1. Map: Split content into chunks, summarize each in parallel +2. Reduce: If combined summaries exceed token_max, recursively collapse + +Key insight from LangChain: No need for predetermined levels (L1/L2/L3). +Just keep collapsing until content fits. Dynamic depth based on actual content. 
+ +References: +- LangChain ReduceDocumentsChain: token_max=3000, recursive collapse +- BOOOOKSCORE: chunk_size=2048 optimal for summarization + +""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass + +from pydantic import BaseModel + +from agent_cli.summarizer._prompts import ( + CHUNK_SUMMARY_PROMPT, + META_SUMMARY_PROMPT, + format_summaries_for_meta, +) +from agent_cli.summarizer._utils import ( + chunk_text, + count_tokens, + estimate_summary_tokens, + tokens_to_words, +) +from agent_cli.summarizer.models import SummaryLevel + +logger = logging.getLogger(__name__) + + +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class MapReduceSummarizationError(Exception): + """Raised when map-reduce summarization fails.""" + + +@dataclass +class MapReduceConfig: + """Configuration for map-reduce summarization. + + Attributes: + openai_base_url: Base URL for OpenAI-compatible API. + model: Model name for summarization. + api_key: Optional API key. + chunk_size: Target size for splitting content (tokens). + LangChain uses 3000, BOOOOKSCORE suggests 2048. + token_max: Maximum tokens for combined summaries before collapsing. + When combined summaries exceed this, we recursively reduce. + chunk_overlap: Overlap between chunks for context continuity. + max_concurrent: Maximum parallel summarization calls. + timeout: Timeout for API calls in seconds. + max_collapse_depth: Safety limit on recursive collapse depth. + + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default + chunk_overlap: int = 200 + max_concurrent: int = 5 + timeout: float = 60.0 + max_collapse_depth: int = 10 # Safety limit + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + +@dataclass +class MapReduceResult: + """Result of map-reduce summarization. + + Attributes: + summary: The final collapsed summary. + input_tokens: Token count of original content. + output_tokens: Token count of final summary. + compression_ratio: output_tokens / input_tokens. + collapse_depth: How many reduce iterations were needed. + intermediate_summaries: All intermediate summaries (for debugging/storage). + + """ + + summary: str + input_tokens: int + output_tokens: int + compression_ratio: float + collapse_depth: int + intermediate_summaries: list[list[str]] # Each level of collapse + + +async def map_reduce_summarize( + content: str, + config: MapReduceConfig, +) -> MapReduceResult: + """Summarize content using map-reduce with dynamic collapse. + + Algorithm: + 1. If content fits in token_max, summarize directly + 2. Otherwise, split into chunks and summarize each (map phase) + 3. If combined summaries exceed token_max, recursively collapse (reduce phase) + 4. Continue until everything fits in token_max + + Args: + content: The content to summarize. + config: Map-reduce configuration. + + Returns: + MapReduceResult with summary and metadata. 
+ + """ + if not content or not content.strip(): + return MapReduceResult( + summary="", + input_tokens=0, + output_tokens=0, + compression_ratio=0.0, + collapse_depth=0, + intermediate_summaries=[], + ) + + input_tokens = count_tokens(content, config.model) + + # If content already fits, just summarize directly + if input_tokens <= config.token_max: + summary = await _summarize_text(content, config) + output_tokens = count_tokens(summary, config.model) + return MapReduceResult( + summary=summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=0, + intermediate_summaries=[], + ) + + # Map phase: Split and summarize chunks in parallel + chunks = chunk_text( + content, + chunk_size=config.chunk_size, + overlap=config.chunk_overlap, + model=config.model, + ) + + logger.info("Map phase: processing %d chunks", len(chunks)) + summaries = await _map_summarize(chunks, config) + intermediate_summaries = [summaries.copy()] + + # Reduce phase: Recursively collapse until fits token_max + depth = 0 + while _total_tokens(summaries, config.model) > config.token_max: + depth += 1 + if depth > config.max_collapse_depth: + logger.warning( + "Hit max collapse depth %d, forcing final summary", + config.max_collapse_depth, + ) + break + + logger.info( + "Reduce phase (depth %d): collapsing %d summaries (%d tokens)", + depth, + len(summaries), + _total_tokens(summaries, config.model), + ) + summaries = await _collapse_summaries(summaries, config) + intermediate_summaries.append(summaries.copy()) + + # Final synthesis if we have multiple summaries left + if len(summaries) > 1: + final_summary = await _synthesize(summaries, config) + else: + final_summary = summaries[0] if summaries else "" + + output_tokens = count_tokens(final_summary, config.model) + + return MapReduceResult( + summary=final_summary, + input_tokens=input_tokens, + output_tokens=output_tokens, + compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + collapse_depth=depth, + intermediate_summaries=intermediate_summaries, + ) + + +def _total_tokens(texts: list[str], model: str) -> int: + """Count total tokens across all texts.""" + return sum(count_tokens(t, model) for t in texts) + + +async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]: + """Summarize each chunk in parallel (map phase).""" + semaphore = asyncio.Semaphore(config.max_concurrent) + total = len(chunks) + + async def summarize_chunk(idx: int, chunk: str) -> str: + async with semaphore: + return await _summarize_chunk(chunk, idx, total, config) + + tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)] + return list(await asyncio.gather(*tasks)) + + +async def _summarize_chunk( + chunk: str, + chunk_index: int, + total_chunks: int, + config: MapReduceConfig, +) -> str: + """Summarize a single chunk.""" + source_tokens = count_tokens(chunk, config.model) + target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = CHUNK_SUMMARY_PROMPT.format( + chunk_index=chunk_index + 1, + total_chunks=total_chunks, + content=chunk, + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _collapse_summaries( + summaries: list[str], + config: MapReduceConfig, +) -> list[str]: + """Collapse summaries by grouping and re-summarizing (reduce phase). 
+ + Groups summaries that together fit within token_max, then summarizes each group. + This is similar to LangChain's split_list_of_docs approach. + """ + if len(summaries) <= 1: + return summaries + + # Group summaries that together fit within token_max + groups: list[list[str]] = [] + current_group: list[str] = [] + current_tokens = 0 + + for summary in summaries: + summary_tokens = count_tokens(summary, config.model) + + # If adding this summary would exceed token_max, start new group + if current_tokens + summary_tokens > config.token_max and current_group: + groups.append(current_group) + current_group = [summary] + current_tokens = summary_tokens + else: + current_group.append(summary) + current_tokens += summary_tokens + + if current_group: + groups.append(current_group) + + # Summarize each group in parallel + semaphore = asyncio.Semaphore(config.max_concurrent) + + async def summarize_group(group: list[str]) -> str: + async with semaphore: + return await _synthesize(group, config) + + tasks = [summarize_group(g) for g in groups] + return list(await asyncio.gather(*tasks)) + + +async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: + """Synthesize multiple summaries into one.""" + combined_tokens = sum(count_tokens(s, config.model) for s in summaries) + target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = META_SUMMARY_PROMPT.format( + summaries=format_summaries_for_meta(summaries), + max_words=max_words, + ) + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 100) + + +async def _summarize_text(text: str, config: MapReduceConfig) -> str: + """Summarize text that fits within token_max.""" + input_tokens = count_tokens(text, config.model) + target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) + max_words = tokens_to_words(target_tokens) + + prompt = f"""Summarize the following content in {max_words} words or less. +Focus on the key points and main ideas. + +Content: +{text} + +Summary:""" + + return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + + +async def _generate_summary( + prompt: str, + config: MapReduceConfig, + max_tokens: int = 256, +) -> str: + """Call the LLM to generate a summary.""" + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. 
Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + msg = f"Map-reduce summarization failed: {e}" + raise MapReduceSummarizationError(msg) from e diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 36407e45..be0d309b 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -1,4 +1,4 @@ -"""Data models for adaptive summarization.""" +"""Data models for map-reduce summarization.""" from __future__ import annotations @@ -8,103 +8,31 @@ from pydantic import BaseModel, Field -# Hierarchical level constants for storage -HIERARCHICAL_LEVEL_L1 = 1 -HIERARCHICAL_LEVEL_L2 = 2 -HIERARCHICAL_LEVEL_L3 = 3 - class SummaryLevel(IntEnum): - """Summary granularity levels based on input complexity.""" + """Summary strategy based on input length.""" NONE = 0 - """< 100 tokens: No summary needed, facts only.""" + """< 100 tokens: No summary needed.""" BRIEF = 1 - """100-500 tokens: Single-sentence summary (~20% compression).""" - - STANDARD = 2 - """500-3000 tokens: Paragraph summary (~12% compression).""" - - DETAILED = 3 - """3000-15000 tokens: Chunked summaries + meta-summary (~7% compression).""" - - HIERARCHICAL = 4 - """> 15000 tokens: Tree of summaries with multiple levels.""" + """100-500 tokens: Single-sentence summary.""" - -class ChunkSummary(BaseModel): - """Summary of a single chunk within a hierarchical summary.""" - - chunk_index: int = Field(..., description="Index of this chunk in the original content") - content: str = Field(..., description="The summarized content of this chunk") - token_count: int = Field(..., ge=0, description="Token count of this summary") - source_tokens: int = Field(..., ge=0, description="Token count of the source chunk") - - -class HierarchicalSummary(BaseModel): - """A hierarchical summary with multiple levels. - - Structure inspired by Letta's partial eviction pattern: - - L1: Individual chunk summaries (parallel processing) - - L2: Group summaries (groups of ~5 L1 summaries) - - L3: Final synthesis (single top-level summary) - """ - - l1_summaries: list[ChunkSummary] = Field( - default_factory=list, - description="Level 1: Individual chunk summaries", - ) - l2_summaries: list[str] = Field( - default_factory=list, - description="Level 2: Group summaries (if > 5 chunks)", - ) - l3_summary: str = Field( - ..., - description="Level 3: Final synthesized summary", - ) - chunk_size: int = Field( - default=3000, - description="Token size used for chunking", - ) - chunk_overlap: int = Field( - default=200, - description="Token overlap between chunks", - ) - - def get_summary_at_level(self, level: int) -> str | list[str]: - """Get summary content at a specific level. - - Args: - level: 1 for chunk summaries, 2 for group summaries, 3 for final. - - Returns: - Summary content at the requested level. - - """ - if level == HIERARCHICAL_LEVEL_L1: - return [cs.content for cs in self.l1_summaries] - if level == HIERARCHICAL_LEVEL_L2: - return self.l2_summaries if self.l2_summaries else [self.l3_summary] - return self.l3_summary + MAP_REDUCE = 2 + """> 500 tokens: Map-reduce with dynamic collapse.""" class SummaryResult(BaseModel): - """Result of adaptive summarization. + """Result of summarization. - Contains the summary at the appropriate level for the input complexity, - along with metadata about the compression achieved. 
+ Contains the summary and metadata about the compression achieved. """ - level: SummaryLevel = Field(..., description="The summarization level used") + level: SummaryLevel = Field(..., description="The summarization strategy used") summary: str | None = Field( default=None, description="The final summary text (None for NONE level)", ) - hierarchical: HierarchicalSummary | None = Field( - default=None, - description="Full hierarchical structure (for DETAILED/HIERARCHICAL levels)", - ) input_tokens: int = Field(..., ge=0, description="Token count of the input content") output_tokens: int = Field(..., ge=0, description="Token count of the summary") compression_ratio: float = Field( @@ -113,100 +41,40 @@ class SummaryResult(BaseModel): le=1.0, description="Ratio of output to input tokens (lower = more compression)", ) + collapse_depth: int = Field( + default=0, + ge=0, + description="Number of collapse iterations in map-reduce (0 = no collapse needed)", + ) created_at: datetime = Field( default_factory=lambda: datetime.now(UTC), description="Timestamp when summary was created", ) - @property - def chunk_summaries(self) -> list[str] | None: - """Get L1 chunk summaries if available.""" - if self.hierarchical: - return [cs.content for cs in self.hierarchical.l1_summaries] - return None - def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: - """Convert to metadata entries for ChromaDB storage. + """Convert to metadata entry for ChromaDB storage. - Returns a list of metadata dicts, one for each summary level stored. + Returns a list with a single metadata dict for the summary. """ - entries: list[dict[str, Any]] = [] + if self.level == SummaryLevel.NONE or not self.summary: + return [] + timestamp = self.created_at.isoformat() - if self.level == SummaryLevel.NONE: - return entries - - # For hierarchical summaries, store each level - if self.hierarchical: - # L1: Individual chunk summaries - entries.extend( - { - "id": f"{conversation_id}:summary:L1:{cs.chunk_index}", - "content": cs.content, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L1, - "chunk_index": cs.chunk_index, - "token_count": cs.token_count, - "created_at": timestamp, - }, - } - for cs in self.hierarchical.l1_summaries - ) - - # L2: Group summaries - entries.extend( - { - "id": f"{conversation_id}:summary:L2:{idx}", - "content": l2_summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L2, - "group_index": idx, - "created_at": timestamp, - }, - } - for idx, l2_summary in enumerate(self.hierarchical.l2_summaries) - ) - - # L3: Final summary - entries.append( - { - "id": f"{conversation_id}:summary:L3:final", - "content": self.hierarchical.l3_summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L3, - "is_final": True, - "summary_level_name": self.level.name, - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - "compression_ratio": self.compression_ratio, - "created_at": timestamp, - }, - }, - ) - elif self.summary: - # Non-hierarchical: just store the single summary - entries.append( - { - "id": f"{conversation_id}:summary:L3:final", - "content": self.summary, - "metadata": { - "conversation_id": conversation_id, - "role": "summary", - "level": HIERARCHICAL_LEVEL_L3, - "is_final": True, - "summary_level_name": self.level.name, - "input_tokens": self.input_tokens, - "output_tokens": self.output_tokens, - 
"compression_ratio": self.compression_ratio, - "created_at": timestamp, - }, + return [ + { + "id": f"{conversation_id}:summary", + "content": self.summary, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "is_final": True, + "summary_level": self.level.name, + "input_tokens": self.input_tokens, + "output_tokens": self.output_tokens, + "compression_ratio": self.compression_ratio, + "collapse_depth": self.collapse_depth, + "created_at": timestamp, }, - ) - - return entries + }, + ] diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index f08ea1a4..c34540bc 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -4,23 +4,23 @@ This document describes the architectural decisions, design rationale, and techn ## 1. System Overview -The adaptive summarizer provides **content-aware compression** that scales summarization depth with input complexity. Rather than applying a one-size-fits-all approach, it automatically selects the optimal strategy based on token count. +The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget. ``` -Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy +Input Content ──▶ Token Count ──▶ Strategy Selection │ - ┌───────────────────────────────┼───────────────────────────────┐ - │ │ │ - < 100 tokens 500-15000 tokens > 15000 tokens - │ │ │ - No summary needed Chunked processing Hierarchical tree - + meta-synthesis (L1/L2/L3) + ┌───────────────────────────────┼─────────────────────┐ + │ │ │ + < 100 tokens 100-500 tokens > 500 tokens + │ │ │ + No summary Brief summary Map-Reduce + (single sentence) (dynamic collapse) ``` **Design Goals:** -- **Adaptive compression:** Match summarization depth to content complexity. -- **Hierarchical structure:** Preserve detail at multiple granularities for large content. +- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content. +- **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain). - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. --- @@ -29,25 +29,31 @@ Input Content ──▶ Token Count ──▶ Level Selection ──▶ Strategy This section documents what techniques are borrowed from research vs. what is original design. -### 2.1 Borrowed: Two-Phase Architecture (Mem0) +### 2.1 Borrowed: LangChain Map-Reduce Pattern -**Reference:** arXiv:2504.19413 +**Reference:** LangChain `ReduceDocumentsChain` -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +LangChain's approach to document summarization uses a simple algorithm: +1. **Map phase:** Split content into chunks, summarize each in parallel +2. **Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit -### 2.2 Borrowed: Hierarchical Merging Concept (BOOOOKSCORE) +Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. 
+ +### 2.2 Borrowed: Chunk Size (BOOOOKSCORE) **Reference:** arXiv:2310.00785 (ICLR 2024) -BOOOOKSCORE's research on book-length summarization demonstrated two approaches: -- **Hierarchical merging:** Summarize chunks, then merge chunk summaries -- **Incremental updating:** Maintain a running summary updated with each chunk +BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. Their defaults: +- Chunk size: **2048 tokens** (we use this) +- Max summary length: **900 tokens** -Key finding: For smaller context models (like local LLMs), hierarchical merging produces more coherent summaries. This informed our L1/L2/L3 structure. +### 2.3 Borrowed: Two-Phase Architecture (Mem0) -BOOOOKSCORE's defaults: chunk size of **2048 tokens**, max summary length of **900 tokens**. +**Reference:** arXiv:2504.19413 -### 2.3 Not Directly Borrowed: Letta's Approach +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. + +### 2.4 Not Directly Borrowed: Letta's Approach **Reference:** arXiv:2310.08560 @@ -56,61 +62,79 @@ Letta (MemGPT) uses a different paradigm focused on **context window management* - 30% partial eviction when buffer overflows - Purpose: fit conversation in LLM context window -Our system has a different purpose (memory compression for storage/retrieval), so while we were inspired by Letta's "partial eviction" concept, our implementation differs significantly. +Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly. -### 2.4 Original Design (Not Research-Backed) +### 2.5 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500/3000/15000):** These numbers were chosen heuristically, not derived from research. They may benefit from tuning. -- **L1/L2/L3 hierarchy structure:** The three-level design is original. The naming was loosely inspired by aijournal's L1-L4 "context pack" levels, but those serve a different purpose (what to include in LLM context, not summarization levels). -- **Chunk size (3000 tokens):** This is larger than BOOOOKSCORE's research-backed 2048 tokens. Consider reducing. -- **L2 group size (5 chunks):** Chosen heuristically. +- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically. +- **L2 group logic for storage:** The intermediate summaries stored as "L2" is for backward compatibility with the storage layer. +- **Content-type prompts:** Domain-specific prompts are original design. --- ## 3. Architectural Decisions -### 3.1 Token-Based Level Selection +### 3.1 Map-Reduce with Dynamic Collapse -**Decision:** Select summarization strategy based on input token count with fixed thresholds. +**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels. **Rationale:** -- **Predictable behavior:** Users can anticipate output length based on input size. -- **Efficiency:** Avoid over-processing short content or under-processing long content. +- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL. +- **Dynamic depth:** Collapse depth adapts to actual content length. +- **Research-backed:** LangChain's approach is battle-tested. 
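Before the algorithm itself, a minimal usage sketch of the public entry point; the endpoint, model name, and input text are placeholders, and the constructor arguments mirror the ones exercised in the test suite:

```python
import asyncio

from agent_cli.summarizer.adaptive import SummarizerConfig, summarize


async def main() -> None:
    config = SummarizerConfig(
        openai_base_url="http://localhost:8000/v1",  # placeholder endpoint
        model="gpt-4o-mini",                         # placeholder model name
    )  # defaults: chunk_size=2048, token_max=3000

    # Repeated text just to get past the 500-token BRIEF threshold.
    long_text = "The quick brown fox jumps over the lazy dog. " * 200
    result = await summarize(long_text, config, content_type="document")
    print(result.level.name, result.output_tokens, result.collapse_depth)


asyncio.run(main())
```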
-**Thresholds:** +**Algorithm:** -| Level | Token Range | Strategy | -| :--- | :--- | :--- | -| NONE | < 100 | No summarization needed | -| BRIEF | 100-500 | Single sentence | -| STANDARD | 500-3000 | Paragraph | -| DETAILED | 3000-15000 | Chunked + meta-synthesis | -| HIERARCHICAL | > 15000 | L1/L2/L3 tree | +```python +def map_reduce_summarize(content, token_max=3000): + if tokens(content) <= token_max: + return summarize_directly(content) -**Caveat:** These thresholds are heuristic, not research-backed. They should be validated empirically. + # Map: Split and summarize chunks in parallel + chunks = split_into_chunks(content, chunk_size=2048) + summaries = [summarize(chunk) for chunk in chunks] -### 3.2 Hierarchical Summary Structure (L1/L2/L3) + # Reduce: Recursively collapse until fits + while total_tokens(summaries) > token_max: + groups = group_summaries_by_token_max(summaries, token_max) + summaries = [synthesize(group) for group in groups] -**Decision:** For long content, build a tree of summaries at three levels of granularity. + return final_synthesis(summaries) +``` + +### 3.2 Token-Based Level Selection (Simplified) + +**Decision:** Use three effective levels instead of five. **Rationale:** -- **Hierarchical merging:** Research (BOOOOKSCORE) shows this approach works well for smaller context models. -- **Flexible retrieval:** Different use cases need different detail levels. RAG queries might want L1 chunks; prompt injection wants L3. -- **Progressive compression:** Each level compresses the previous, achieving high overall compression while preserving structure. +- **Simplicity:** Fewer code paths, easier to understand. +- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction. -**Structure:** +**Effective Levels:** -- **L1 (Chunk Summaries):** Individual summaries of ~3000 token chunks. Preserves local context and specific details. Chunks overlap by ~200 tokens to maintain continuity across boundaries. -- **L2 (Group Summaries):** Summaries of groups of ~5 L1 summaries. Only generated when content exceeds ~5 chunks. Provides mid-level abstraction. -- **L3 (Final Summary):** Single synthesized summary. Used for prompt injection and as prior context for incremental updates. +| Level | Token Range | Strategy | +| :--- | :--- | :--- | +| NONE | < 100 | No summarization needed | +| BRIEF | 100-500 | Single sentence | +| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max | + +**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility. -**Trade-off:** The three-level hierarchy adds complexity but enables efficient retrieval at multiple granularities. For content under 15000 tokens, we skip L2 entirely (DETAILED level uses only L1 + L3). +### 3.3 Research-Backed Defaults + +**Decision:** Use values from published research. + +| Parameter | Value | Source | +| :--- | :--- | :--- | +| `chunk_size` | 2048 | BOOOOKSCORE | +| `token_max` | 3000 | LangChain | +| `chunk_overlap` | 200 | Original | -### 3.3 Semantic Boundary Chunking +### 3.4 Semantic Boundary Chunking **Decision:** Split content on semantic boundaries (paragraphs, then sentences) rather than fixed character counts. @@ -126,7 +150,7 @@ The following aspects are **original design choices without direct research just 2. Fall back to sentence boundaries (`.!?` followed by space + capital) 3. 
Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) -### 3.4 Content-Type Aware Prompts +### 3.5 Content-Type Aware Prompts **Decision:** Use different prompt templates for different content domains. @@ -138,7 +162,7 @@ The following aspects are **original design choices without direct research just A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. -### 3.5 Prior Summary Integration +### 3.6 Prior Summary Integration **Decision:** Always provide the previous summary as context when generating updates. @@ -150,7 +174,7 @@ A generic summarization prompt loses domain-specific signal. By tailoring prompt The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. -### 3.6 Compression Ratio Tracking +### 3.7 Compression Ratio Tracking **Decision:** Track and report compression metrics for every summary. @@ -171,28 +195,26 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression The entry point counts tokens and selects strategy: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Maps token count to `SummaryLevel` enum. -3. **Strategy dispatch:** Calls level-specific handler. +2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce. +3. **Strategy dispatch:** Calls appropriate handler. -### 4.2 Brief and Standard Levels +### 4.2 Brief Level -For short content (< 3000 tokens): +For short content (100-500 tokens): -- Single LLM call with level-appropriate prompt -- Prior summary injected as context if available -- Content-type selection determines prompt variant +- Single LLM call with brief prompt - Returns simple `SummaryResult` with no hierarchical structure -### 4.3 Detailed and Hierarchical Levels +### 4.3 Map-Reduce Level -For longer content: +For longer content (> 500 tokens): -1. **Chunking:** Split content into overlapping chunks on semantic boundaries. -2. **Parallel L1 generation:** Summarize each chunk independently. Uses semaphore-controlled concurrency to avoid overwhelming the LLM. -3. **L2 grouping (hierarchical only):** Organize L1s into groups of ~5, summarize each group. -4. **L3 synthesis:** Meta-summarize all L2s (or all L1s for DETAILED level) into final summary. +1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. +2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. +3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. +4. **Final synthesis:** Combine remaining summaries into final output. -The parallelism at L1 and L2 levels provides significant speedup for long content while maintaining semantic coherence through the hierarchical structure. +The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process. --- @@ -222,17 +244,22 @@ Summaries are persisted in two places: - **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. - **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. 
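For the ChromaDB side, the entry produced by `SummaryResult.to_storage_metadata` (as defined in `agent_cli/summarizer/models.py` in this patch) has roughly the following shape; the field values here are illustrative, not real data:

```python
entry = {
    "id": "conv-123:summary",  # one summary entry per conversation
    "content": "Concise final summary text for the conversation...",
    "metadata": {
        "conversation_id": "conv-123",
        "role": "summary",
        "is_final": True,
        "summary_level": "MAP_REDUCE",  # NONE / BRIEF / MAP_REDUCE
        "input_tokens": 5000,
        "output_tokens": 100,
        "compression_ratio": 0.02,
        "collapse_depth": 1,  # 0 when no reduce pass was needed
        "created_at": "2024-01-01T00:00:00+00:00",
    },
}
```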
+For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure: +- First collapse level → L1 (chunk summaries) +- Intermediate levels → L2 (grouped summaries) +- Final output → L3 (synthesis) + --- ## 6. Configuration -| Parameter | Default | Research Comparison | +| Parameter | Default | Source | | :--- | :--- | :--- | -| `chunk_size` | 3000 | BOOOOKSCORE uses 2048 | -| `chunk_overlap` | 200 | No direct comparison | -| `max_concurrent_chunks` | 5 | Implementation choice | - -Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived from published research. +| `chunk_size` | 2048 | BOOOOKSCORE | +| `token_max` | 3000 | LangChain | +| `chunk_overlap` | 200 | Original | +| `max_concurrent` | 5 | Implementation choice | +| `max_collapse_depth` | 10 | Safety limit | --- @@ -240,19 +267,30 @@ Level thresholds (100, 500, 3000, 15000 tokens) are heuristic and not derived fr Summarization follows a fail-fast philosophy: -- **LLM errors:** Propagated as `SummarizationError` rather than silently returning empty results. +- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results. - **Empty input:** Returns NONE level immediately (not an error). - **Encoding errors:** Falls back to character-based token estimation. +- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. --- -## 8. Future Improvements +## 8. Comparison: Old vs New Approach + +| Aspect | Old Approach | New Approach | +| :--- | :--- | :--- | +| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) | +| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth | +| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) | +| token_max | N/A (fixed levels) | 3000 (LangChain) | +| Complexity | Multiple code paths | Single map-reduce algorithm | +| Research basis | Heuristic | LangChain + BOOOOKSCORE | + +--- -Based on research findings, consider: +## 9. Future Improvements -1. **Reduce chunk size to 2048** to align with BOOOOKSCORE's tested defaults -2. **Validate token thresholds empirically** with real-world content -3. **Add incremental updating mode** as alternative to hierarchical merging for larger context models -4. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation +1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation +2. **Add incremental updating mode** as alternative to hierarchical merging for larger context models +3. **Tune token thresholds empirically** with real-world content diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index 6a542dbd..70d434dd 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -1,17 +1,15 @@ """Demonstrate the summarizer on texts of varying lengths from the internet. This script fetches content of different sizes and shows how the adaptive -summarizer automatically selects the appropriate strategy (BRIEF, STANDARD, -DETAILED, or HIERARCHICAL) based on content length. +summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE) +based on content length. 
Usage: python examples/summarizer_demo.py # Test specific levels only python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level standard - python examples/summarizer_demo.py --level detailed - python examples/summarizer_demo.py --level hierarchical + python examples/summarizer_demo.py --level map_reduce # Use a different model python examples/summarizer_demo.py --model "gpt-4o-mini" @@ -58,9 +56,7 @@ class TextSample: # Thresholds from adaptive.py: # NONE: < 100 tokens # BRIEF: 100-500 tokens -# STANDARD: 500-3000 tokens -# DETAILED: 3000-15000 tokens -# HIERARCHICAL: > 15000 tokens +# MAP_REDUCE: >= 500 tokens # Sample texts of varying lengths to demonstrate different summarization levels SAMPLES: list[TextSample] = [ @@ -98,10 +94,10 @@ class TextSample: """, ), TextSample( - name="Standard - Technology Article", - description="~800-2000 tokens - triggers STANDARD level (500-3000 token range)", + name="Map-Reduce - Technology Article", + description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)", url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", - expected_level=SummaryLevel.STANDARD, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=""" Artificial intelligence (AI) is the intelligence of machines or software, @@ -178,18 +174,18 @@ class TextSample: """, ), TextSample( - name="Detailed - Full Article", - description="~4000-10000 tokens - triggers DETAILED level (3000-15000 token range)", + name="Map-Reduce - Full Article", + description="~4000-10000 tokens - triggers MAP_REDUCE with chunking", url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", - expected_level=SummaryLevel.DETAILED, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content ), TextSample( - name="Hierarchical - Long Document", - description="~16000+ tokens - triggers HIERARCHICAL level (>15000 tokens)", + name="Map-Reduce - Long Document", + description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations", url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) - expected_level=SummaryLevel.HIERARCHICAL, + expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content (~16K tokens) ), @@ -229,7 +225,7 @@ def generate_synthetic_content(target_tokens: int, topic: str = "technology") -> return "\n\n".join(result) -async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # noqa: PLR0912 +async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: """Fetch content from URL or use fallback.""" try: # Add User-Agent header to avoid 403 errors from some sites @@ -269,9 +265,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # Check if content is too short for expected level min_words_for_level = { SummaryLevel.BRIEF: 80, # Need ~100 tokens - SummaryLevel.STANDARD: 400, # Need ~500 tokens - SummaryLevel.DETAILED: 2500, # Need ~3000 tokens - SummaryLevel.HIERARCHICAL: 12000, # Need ~15000 tokens + SummaryLevel.MAP_REDUCE: 400, # Need ~500 tokens } min_words = min_words_for_level.get(sample.expected_level, 50) @@ -282,22 +276,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: else: target_tokens = { SummaryLevel.BRIEF: 300, - SummaryLevel.STANDARD: 1500, - SummaryLevel.DETAILED: 8000, - SummaryLevel.HIERARCHICAL: 
16000, # Keep manageable for demo + SummaryLevel.MAP_REDUCE: 1500, } content = generate_synthetic_content( target_tokens.get(sample.expected_level, 1000), ) - # For HIERARCHICAL, truncate very long content to keep demo fast - # but ensure we stay above 15000 tokens (~13000 words) - if sample.expected_level == SummaryLevel.HIERARCHICAL: - words = content.split() - # ~16000 tokens ≈ 13500 words (need >15000 tokens for HIERARCHICAL) - if len(words) > 13500: # noqa: PLR2004 - content = " ".join(words[:13500]) - print(" 📎 Truncated to ~13500 words for faster demo") + # For very long content, truncate to keep demo fast + words = content.split() + if len(words) > 13500: # noqa: PLR2004 + content = " ".join(words[:13500]) + print(" 📎 Truncated to ~13500 words for faster demo") return content.strip() @@ -310,9 +299,7 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: # Generate synthetic content for the expected level target_tokens = { SummaryLevel.BRIEF: 300, - SummaryLevel.STANDARD: 1500, - SummaryLevel.DETAILED: 8000, - SummaryLevel.HIERARCHICAL: 16000, # Keep manageable for demo + SummaryLevel.MAP_REDUCE: 1500, } return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) @@ -335,9 +322,7 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non level_emoji = { SummaryLevel.NONE: "⏭️", SummaryLevel.BRIEF: "📝", - SummaryLevel.STANDARD: "📄", - SummaryLevel.DETAILED: "📚", - SummaryLevel.HIERARCHICAL: "🏗️", + SummaryLevel.MAP_REDUCE: "🔄", } print("\n🎯 Summarization Result:") print(f" Level: {level_emoji.get(result.level, '❓')} {result.level.name}") @@ -345,6 +330,8 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") print(f" Output tokens: {result.output_tokens:,}") print(f" Compression: {result.compression_ratio:.1%}") + if result.collapse_depth > 0: + print(f" Collapse depth: {result.collapse_depth}") # Summary content if result.summary: @@ -357,23 +344,6 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non ) print(wrapped) - # Hierarchical details if present - if result.hierarchical: - h = result.hierarchical - print("\n🏗️ Hierarchical Structure:") - print(f" L1 chunks: {len(h.l1_summaries)}") - print(f" L2 groups: {len(h.l2_summaries)}") - if h.l2_summaries: - print(f" L2 preview: {h.l2_summaries[0][:100]}...") - print("\n L3 Final Summary:") - wrapped = textwrap.fill( - h.l3_summary, - width=68, - initial_indent=" ", - subsequent_indent=" ", - ) - print(wrapped) - async def run_demo( level_filter: str | None = None, @@ -394,7 +364,7 @@ async def run_demo( openai_base_url=actual_base_url, model=actual_model, api_key=api_key, - chunk_size=3000, + chunk_size=2048, # BOOOOKSCORE default max_concurrent_chunks=3, timeout=120.0, # Longer timeout for local models ) @@ -404,9 +374,7 @@ async def run_demo( if level_filter: level_map = { "brief": SummaryLevel.BRIEF, - "standard": SummaryLevel.STANDARD, - "detailed": SummaryLevel.DETAILED, - "hierarchical": SummaryLevel.HIERARCHICAL, + "map_reduce": SummaryLevel.MAP_REDUCE, } target_level = level_map.get(level_filter.lower()) if target_level: @@ -449,14 +417,15 @@ def main() -> None: epilog=textwrap.dedent(""" Examples: python examples/summarizer_demo.py - python examples/summarizer_demo.py --level standard + python examples/summarizer_demo.py --level brief + python examples/summarizer_demo.py --level map_reduce python 
examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" """), ) parser.add_argument( "--level", - choices=["brief", "standard", "detailed", "hierarchical"], + choices=["brief", "map_reduce"], help="Only test a specific summarization level", ) parser.add_argument( diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 4ef11858..f86ed34d 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -340,9 +340,8 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="summary up to 256", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, @@ -569,9 +568,8 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="summary text", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, @@ -618,4 +616,4 @@ async def fake_reconcile( files = list(tmp_path.glob("entries/**/*.md")) assert len(files) == 4 # user + assistant + fact + 1 summary assert any("facts" in str(f) for f in files) - assert any("summaries/L3/final.md" in str(f) for f in files) + assert any("summaries" in str(f) for f in files) diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index db197b02..86040d7a 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -66,9 +66,8 @@ async def fake_reconcile( async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.STANDARD, + level=SummaryLevel.MAP_REDUCE, summary="User likes testing.", - hierarchical=None, input_tokens=100, output_tokens=20, compression_ratio=0.2, diff --git a/tests/memory/test_store.py b/tests/memory/test_store.py index 5e8e3314..29dbe2e5 100644 --- a/tests/memory/test_store.py +++ b/tests/memory/test_store.py @@ -137,21 +137,21 @@ def test_upsert_and_delete_entries_delegate() -> None: def test_upsert_summary_entries_simple() -> None: - """Test upserting a simple (non-hierarchical) summary.""" + """Test upserting a summary.""" fake = _FakeCollection() entries = [ { - "id": "conv-123:summary:L3:final", - "content": "A standard paragraph summary.", + "id": "conv-123:summary", + "content": "A paragraph summary.", "metadata": { "conversation_id": "conv-123", "role": "summary", - "level": 3, "is_final": True, - "summary_level_name": "STANDARD", + "summary_level": "MAP_REDUCE", "input_tokens": 1000, "output_tokens": 50, "compression_ratio": 0.05, + "collapse_depth": 0, "created_at": "2024-01-01T00:00:00", }, }, @@ -159,52 +159,30 @@ def test_upsert_summary_entries_simple() -> None: ids = _store.upsert_summary_entries(fake, entries) - assert ids == ["conv-123:summary:L3:final"] + assert ids == ["conv-123:summary"] assert len(fake.upserts) == 1 upserted_ids, upserted_docs, upserted_metas = fake.upserts[0] - assert upserted_ids == ["conv-123:summary:L3:final"] - assert upserted_docs == ["A standard paragraph summary."] - assert upserted_metas[0]["level"] == 3 + assert upserted_ids == ["conv-123:summary"] + assert upserted_docs == ["A paragraph summary."] assert upserted_metas[0]["is_final"] is True -def test_upsert_summary_entries_with_chunks() -> None: - """Test upserting a hierarchical summary with L1 and L3 entries.""" +def 
test_upsert_summary_entries_with_collapse_depth() -> None: + """Test upserting a summary with collapse depth metadata.""" fake = _FakeCollection() entries = [ { - "id": "conv-456:summary:L1:0", - "content": "Chunk 0 summary", - "metadata": { - "conversation_id": "conv-456", - "role": "summary", - "level": 1, - "chunk_index": 0, - "created_at": "2024-01-01T00:00:00", - }, - }, - { - "id": "conv-456:summary:L1:1", - "content": "Chunk 1 summary", - "metadata": { - "conversation_id": "conv-456", - "role": "summary", - "level": 1, - "chunk_index": 1, - "created_at": "2024-01-01T00:00:00", - }, - }, - { - "id": "conv-456:summary:L3:final", + "id": "conv-456:summary", "content": "Final synthesis", "metadata": { "conversation_id": "conv-456", "role": "summary", - "level": 3, "is_final": True, + "summary_level": "MAP_REDUCE", "input_tokens": 5000, "output_tokens": 100, "compression_ratio": 0.02, + "collapse_depth": 2, "created_at": "2024-01-01T00:00:00", }, }, @@ -212,10 +190,9 @@ def test_upsert_summary_entries_with_chunks() -> None: ids = _store.upsert_summary_entries(fake, entries) - assert len(ids) == 3 - assert "conv-456:summary:L1:0" in ids - assert "conv-456:summary:L1:1" in ids - assert "conv-456:summary:L3:final" in ids + assert len(ids) == 1 + assert ids[0] == "conv-456:summary" + assert fake.upserts[0][2][0]["collapse_depth"] == 2 def test_upsert_summary_entries_empty() -> None: @@ -228,41 +205,8 @@ def test_upsert_summary_entries_empty() -> None: assert len(fake.upserts) == 0 -def test_get_summary_at_level() -> None: - """Test retrieving summaries at a specific level.""" - fake = _FakeCollection( - get_result={ - "documents": ["Chunk 0", "Chunk 1"], - "metadatas": [ - { - "conversation_id": "c1", - "role": "summary", - "level": 1, - "chunk_index": 0, - "created_at": "now", - }, - { - "conversation_id": "c1", - "role": "summary", - "level": 1, - "chunk_index": 1, - "created_at": "now", - }, - ], - "ids": ["c1:summary:L1:0", "c1:summary:L1:1"], - }, - ) - - records = _store.get_summary_at_level(fake, "c1", level=1) - - assert len(records) == 2 - assert records[0].metadata.level == 1 - assert records[0].metadata.chunk_index == 0 - assert records[1].metadata.chunk_index == 1 - - -def test_get_final_summary_returns_final() -> None: - """Test getting the L3 final summary.""" +def test_get_final_summary_returns_summary() -> None: + """Test getting the final summary for a conversation.""" fake = _FakeCollection( get_result={ "documents": ["The final summary"], @@ -270,12 +214,13 @@ def test_get_final_summary_returns_final() -> None: { "conversation_id": "c1", "role": "summary", - "level": 3, "is_final": True, + "summary_level": "MAP_REDUCE", + "collapse_depth": 1, "created_at": "now", }, ], - "ids": ["c1:summary:L3:final"], + "ids": ["c1:summary"], }, ) @@ -295,42 +240,28 @@ def test_get_final_summary_returns_none_when_missing() -> None: assert result is None -def test_delete_summaries_all_levels() -> None: - """Test deleting all summary levels for a conversation.""" +def test_delete_summaries() -> None: + """Test deleting summaries for a conversation.""" fake = _FakeCollection( get_result={ - "documents": ["L1", "L3"], + "documents": ["The summary"], "metadatas": [ - {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, - {"conversation_id": "c1", "role": "summary", "level": 3, "created_at": "now"}, + { + "conversation_id": "c1", + "role": "summary", + "summary_level": "MAP_REDUCE", + "created_at": "now", + }, ], - "ids": ["c1:summary:L1:0", "c1:summary:L3:final"], + 
"ids": ["c1:summary"], }, ) deleted_count = _store.delete_summaries(fake, "c1") - assert deleted_count == 2 - assert len(fake.deleted) == 1 - assert set(fake.deleted[0]) == {"c1:summary:L1:0", "c1:summary:L3:final"} - - -def test_delete_summaries_specific_levels() -> None: - """Test deleting only specific summary levels.""" - fake = _FakeCollection( - get_result={ - "documents": ["L1 chunk"], - "metadatas": [ - {"conversation_id": "c1", "role": "summary", "level": 1, "created_at": "now"}, - ], - "ids": ["c1:summary:L1:0"], - }, - ) - - deleted_count = _store.delete_summaries(fake, "c1", levels=[1]) - assert deleted_count == 1 - assert fake.deleted[0] == ["c1:summary:L1:0"] + assert len(fake.deleted) == 1 + assert fake.deleted[0] == ["c1:summary"] def test_delete_summaries_no_entries() -> None: diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 6acf4317..a64a72a1 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -7,7 +7,8 @@ import pytest from agent_cli.summarizer.adaptive import ( - LEVEL_THRESHOLDS, + THRESHOLD_BRIEF, + THRESHOLD_NONE, SummarizationError, SummarizerConfig, SummaryOutput, @@ -63,9 +64,31 @@ def test_trailing_slash_stripped(self) -> None: ) assert config.openai_base_url == "http://localhost:8000/v1" + def test_default_chunk_size_is_booookscore(self) -> None: + """Test that default chunk_size follows BOOOOKSCORE recommendation.""" + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + assert config.chunk_size == 2048 # BOOOOKSCORE's tested default + + def test_default_token_max_is_langchain(self) -> None: + """Test that default token_max follows LangChain's default.""" + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="gpt-4", + ) + assert config.token_max == 3000 # LangChain's default + class TestDetermineLevel: - """Tests for level determination based on token count.""" + """Tests for level determination based on token count. 
+ + The simplified approach has 3 levels: + - NONE: Very short content (< 100 tokens) + - BRIEF: Short content (100-500 tokens) + - MAP_REDUCE: Everything else (uses map-reduce) + """ def test_none_level_threshold(self) -> None: """Test NONE level for very short content.""" @@ -78,30 +101,17 @@ def test_brief_level_threshold(self) -> None: assert determine_level(300) == SummaryLevel.BRIEF assert determine_level(499) == SummaryLevel.BRIEF - def test_standard_level_threshold(self) -> None: - """Test STANDARD level for medium content.""" - assert determine_level(500) == SummaryLevel.STANDARD - assert determine_level(1500) == SummaryLevel.STANDARD - assert determine_level(2999) == SummaryLevel.STANDARD - - def test_detailed_level_threshold(self) -> None: - """Test DETAILED level for longer content.""" - assert determine_level(3000) == SummaryLevel.DETAILED - assert determine_level(8000) == SummaryLevel.DETAILED - assert determine_level(14999) == SummaryLevel.DETAILED - - def test_hierarchical_level_threshold(self) -> None: - """Test HIERARCHICAL level for very long content.""" - assert determine_level(15000) == SummaryLevel.HIERARCHICAL - assert determine_level(50000) == SummaryLevel.HIERARCHICAL - assert determine_level(100000) == SummaryLevel.HIERARCHICAL + def test_map_reduce_level_for_longer_content(self) -> None: + """Test that content >= 500 tokens uses MAP_REDUCE.""" + assert determine_level(500) == SummaryLevel.MAP_REDUCE + assert determine_level(1500) == SummaryLevel.MAP_REDUCE + assert determine_level(5000) == SummaryLevel.MAP_REDUCE + assert determine_level(20000) == SummaryLevel.MAP_REDUCE def test_thresholds_match_constants(self) -> None: """Verify thresholds match the module constants.""" - assert LEVEL_THRESHOLDS[SummaryLevel.NONE] == 100 - assert LEVEL_THRESHOLDS[SummaryLevel.BRIEF] == 500 - assert LEVEL_THRESHOLDS[SummaryLevel.STANDARD] == 3000 - assert LEVEL_THRESHOLDS[SummaryLevel.DETAILED] == 15000 + assert THRESHOLD_NONE == 100 + assert THRESHOLD_BRIEF == 500 class TestSummarize: @@ -168,92 +178,81 @@ async def test_brief_level_calls_brief_summary( assert result.summary == "Brief summary." @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._standard_summary") - async def test_standard_level_calls_standard_summary( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_longer_content_uses_map_reduce( self, - mock_standard: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that STANDARD level content calls _standard_summary.""" - mock_standard.return_value = "Standard summary paragraph." + """Test that content >= 500 tokens uses map-reduce.""" + mock_result = SummaryResult( + level=SummaryLevel.MAP_REDUCE, + summary="Map-reduce summary.", + input_tokens=800, + output_tokens=100, + compression_ratio=0.125, + ) + mock_map_reduce.return_value = mock_result - # Create content that's ~500-3000 tokens + # Create content that's ~500+ tokens content = "This is a test sentence with more words. " * 100 # ~800 tokens result = await summarize(content, config, content_type="general") - mock_standard.assert_called_once_with(content, config, None, "general") - assert result.level == SummaryLevel.STANDARD - assert result.summary == "Standard summary paragraph." + mock_map_reduce.assert_called_once() + assert result.summary == "Map-reduce summary." 
@pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._standard_summary") - async def test_prior_summary_passed_to_standard( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_prior_summary_passed_to_map_reduce( self, - mock_standard: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that prior_summary is passed to _standard_summary.""" - mock_standard.return_value = "Updated summary." - - content = "This is a test sentence with more words. " * 100 - prior = "Previous context summary." - - await summarize(content, config, prior_summary=prior) - - mock_standard.assert_called_once_with(content, config, prior, "general") - - @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._detailed_summary") - async def test_detailed_level_calls_detailed_summary( - self, - mock_detailed: AsyncMock, - config: SummarizerConfig, - ) -> None: - """Test that DETAILED level content calls _detailed_summary.""" + """Test that prior_summary is passed to _map_reduce_summary.""" mock_result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Detailed summary.", - hierarchical=None, - input_tokens=5000, + level=SummaryLevel.MAP_REDUCE, + summary="Updated summary.", + input_tokens=800, output_tokens=100, - compression_ratio=0.02, + compression_ratio=0.125, ) - mock_detailed.return_value = mock_result + mock_map_reduce.return_value = mock_result - # Create content that's ~3000-15000 tokens - content = "Word " * 5000 # ~5000 tokens + content = "This is a test sentence with more words. " * 100 + prior = "Previous context summary." - result = await summarize(content, config) + await summarize(content, config, prior_summary=prior) - assert mock_detailed.called - assert result.level == SummaryLevel.DETAILED + # Verify prior_summary was passed + call_args = mock_map_reduce.call_args + assert call_args[0][3] == prior # prior_summary is 4th positional arg @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._hierarchical_summary") - async def test_hierarchical_level_calls_hierarchical_summary( + @patch("agent_cli.summarizer.adaptive._map_reduce_summary") + async def test_very_long_content_uses_map_reduce( self, - mock_hierarchical: AsyncMock, + mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that HIERARCHICAL level content calls _hierarchical_summary.""" + """Test that very long content uses map-reduce.""" mock_result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Hierarchical summary.", - hierarchical=None, + level=SummaryLevel.MAP_REDUCE, + summary="Long content summary.", input_tokens=20000, output_tokens=500, compression_ratio=0.025, + collapse_depth=2, ) - mock_hierarchical.return_value = mock_result + mock_map_reduce.return_value = mock_result # Create content that's > 15000 tokens content = "Word " * 20000 result = await summarize(content, config) - assert mock_hierarchical.called - assert result.level == SummaryLevel.HIERARCHICAL + assert mock_map_reduce.called + assert result.level == SummaryLevel.MAP_REDUCE class TestGenerateSummary: diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index d7028659..f11fcff8 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -1,457 +1,68 @@ -"""Integration tests for the summarizer with memory system.""" +"""Integration tests for summarizer with storage layer.""" from __future__ import annotations -from typing import TYPE_CHECKING, Any -from unittest.mock import 
patch - -import pytest - -from agent_cli.memory._ingest import summarize_content -from agent_cli.memory._persistence import persist_hierarchical_summary -from agent_cli.memory._store import ( - get_final_summary, - get_summary_at_level, - upsert_summary_entries, -) -from agent_cli.summarizer import SummaryLevel, SummaryResult from agent_cli.summarizer.adaptive import determine_level -from agent_cli.summarizer.models import ChunkSummary, HierarchicalSummary - -if TYPE_CHECKING: - from pathlib import Path - - -class _FakeCollection: - """Minimal Chroma-like collection for testing.""" - - def __init__(self) -> None: - self._store: dict[str, tuple[str, dict[str, Any]]] = {} - - def upsert( - self, - *, - ids: list[str], - documents: list[str], - metadatas: list[dict[str, Any]], - ) -> None: - for doc_id, doc, meta in zip(ids, documents, metadatas, strict=False): - self._store[doc_id] = (doc, meta) - - def get( - self, - *, - where: dict[str, Any] | None = None, - include: list[str] | None = None, # noqa: ARG002 - ) -> dict[str, Any]: - if where is None: - return {"documents": [], "metadatas": [], "ids": []} - - results: list[tuple[str, tuple[str, dict[str, Any]]]] = [] - for doc_id, (doc, meta) in self._store.items(): - # Check all conditions in $and clause - conditions = where.get("$and", [where]) - match = True - for clause in conditions: - for k, v in clause.items(): - if k == "$and": - continue - if isinstance(v, dict): - if "$in" in v and meta.get(k) not in v["$in"]: - match = False - if "$ne" in v and meta.get(k) == v["$ne"]: - match = False - elif meta.get(k) != v: - match = False - if match: - results.append((doc_id, (doc, meta))) - - docs = [doc for _, (doc, _) in results] - metas = [meta for _, (_, meta) in results] - ids = [doc_id for doc_id, _ in results] - return {"documents": docs, "metadatas": metas, "ids": ids} - - def delete( - self, - ids: list[str] | None = None, - where: dict[str, Any] | None = None, # noqa: ARG002 - ) -> None: - if ids: - for doc_id in ids: - self._store.pop(doc_id, None) - - -@pytest.fixture -def fake_collection() -> _FakeCollection: - """Create a fake ChromaDB collection.""" - return _FakeCollection() - - -@pytest.fixture -def memory_root(tmp_path: Path) -> Path: - """Create a temporary memory root directory.""" - return tmp_path / "memory" - - -class TestSummaryResultStorageMetadata: - """Test SummaryResult.to_storage_metadata for various levels.""" - - def test_standard_summary_produces_single_entry(self) -> None: - """Test that STANDARD level produces a single L3 entry.""" - result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A paragraph summary of the content.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - entries = result.to_storage_metadata("conv-123") - - assert len(entries) == 1 - entry = entries[0] - assert entry["id"] == "conv-123:summary:L3:final" - assert entry["content"] == "A paragraph summary of the content." 
- assert entry["metadata"]["level"] == 3 - assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level_name"] == "STANDARD" +from agent_cli.summarizer.models import SummaryLevel, SummaryResult - def test_hierarchical_summary_produces_multiple_entries(self) -> None: - """Test that HIERARCHICAL level produces L1, L2, L3 entries.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=2, - content="Chunk 2", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=["Group 0 summary"], - l3_summary="Final hierarchical synthesis.", - ) - result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final hierarchical synthesis.", - hierarchical=hierarchical, - input_tokens=20000, - output_tokens=200, - compression_ratio=0.01, - ) - entries = result.to_storage_metadata("conv-456") - - # Should have 3 L1 + 1 L2 + 1 L3 = 5 entries - assert len(entries) == 5 - - # Check L1 entries - l1_entries = [e for e in entries if e["metadata"]["level"] == 1] - assert len(l1_entries) == 3 - - # Check L2 entries - l2_entries = [e for e in entries if e["metadata"]["level"] == 2] - assert len(l2_entries) == 1 - - # Check L3 entry - l3_entries = [e for e in entries if e["metadata"]["level"] == 3] - assert len(l3_entries) == 1 +class TestDetermineLevel: + """Tests for determine_level function with various content sizes.""" + def test_short_content_is_brief(self) -> None: + """Test that 100-500 token content uses BRIEF.""" + level = determine_level(200) + assert level == SummaryLevel.BRIEF -class TestHierarchicalSummaryStorage: - """Test storing hierarchical summaries to ChromaDB.""" + def test_medium_content_is_map_reduce(self) -> None: + """Test that 500+ token content uses MAP_REDUCE.""" + level = determine_level(1000) + assert level == SummaryLevel.MAP_REDUCE - def test_store_simple_summary(self, fake_collection: _FakeCollection) -> None: - """Test storing a simple (non-hierarchical) summary.""" - result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) + def test_long_content_is_map_reduce(self) -> None: + """Test that 3000+ token content uses MAP_REDUCE.""" + level = determine_level(5000) + assert level == SummaryLevel.MAP_REDUCE - entries = result.to_storage_metadata("conv-123") - ids = upsert_summary_entries(fake_collection, entries) + def test_very_long_content_is_map_reduce(self) -> None: + """Test that content over 15000 tokens still uses MAP_REDUCE.""" + level = determine_level(20000) + assert level == SummaryLevel.MAP_REDUCE - assert len(ids) == 1 - assert "conv-123:summary:L3:final" in ids - # Verify retrieval - stored = get_final_summary(fake_collection, "conv-123") - assert stored is not None - assert stored.content == "A standard summary." 
+class TestSummaryResultStorage: + """Tests for SummaryResult storage metadata generation.""" - def test_store_hierarchical_summary(self, fake_collection: _FakeCollection) -> None: - """Test storing a hierarchical summary with all levels.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 summary", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 summary", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=[], - l3_summary="Final summary", - ) + def test_to_storage_metadata_creates_entry(self) -> None: + """Test that to_storage_metadata creates a valid entry.""" result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Final summary", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, + collapse_depth=1, ) + entries = result.to_storage_metadata("test-conversation") - entries = result.to_storage_metadata("conv-789") - ids = upsert_summary_entries(fake_collection, entries) - - assert len(ids) == 3 # 2 L1 + 1 L3 - - # Verify L1 retrieval - l1_stored = get_summary_at_level(fake_collection, "conv-789", level=1) - assert len(l1_stored) == 2 - - # Verify L3 retrieval - final = get_final_summary(fake_collection, "conv-789") - assert final is not None - assert final.content == "Final summary" - - -class TestFilePersistence: - """Test hierarchical summary file persistence.""" - - def test_persist_hierarchical_creates_files( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that persist_hierarchical_summary creates correct file structure.""" - l1_summaries = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 content", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 content", - token_count=10, - source_tokens=100, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1_summaries, - l2_summaries=["Group 0 summary"], - l3_summary="Final synthesis", - ) - result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final synthesis", - hierarchical=hierarchical, - input_tokens=20000, - output_tokens=200, - compression_ratio=0.01, - ) - - ids = persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="test-conv", - summary_result=result, - ) - - assert len(ids) == 4 # 2 L1 + 1 L2 + 1 L3 - - # Check file structure (note: _slugify converts - to - not _) - entries_dir = memory_root / "entries" / "test-conv" - l1_dir = entries_dir / "summaries" / "L1" - l2_dir = entries_dir / "summaries" / "L2" - l3_dir = entries_dir / "summaries" / "L3" - - assert l1_dir.exists() - assert l2_dir.exists() - assert l3_dir.exists() - - # Check L1 files - l1_files = list(l1_dir.glob("*.md")) - assert len(l1_files) == 2 - - # Check L2 files - l2_files = list(l2_dir.glob("*.md")) - assert len(l2_files) == 1 - - # Check L3 files - l3_files = list(l3_dir.glob("*.md")) - assert len(l3_files) == 1 - assert (l3_dir / "final.md").exists() + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "test-conversation:summary" + assert entry["content"] == "A comprehensive summary." 
+ assert entry["metadata"]["conversation_id"] == "test-conversation" + assert entry["metadata"]["role"] == "summary" + assert entry["metadata"]["is_final"] is True + assert entry["metadata"]["summary_level"] == "MAP_REDUCE" + assert entry["metadata"]["collapse_depth"] == 1 - def test_persist_simple_summary_creates_l3_file( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that a simple summary creates just L3/final.md.""" + def test_none_level_returns_empty(self) -> None: + """Test that NONE level produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard paragraph summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - ids = persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="simple-conv", - summary_result=result, - ) - - assert len(ids) == 1 - - # Check file exists (note: _slugify converts - to - not _) - entries_dir = memory_root / "entries" / "simple-conv" - l3_file = entries_dir / "summaries" / "L3" / "final.md" - assert l3_file.exists() - - # Check content has YAML front matter - content = l3_file.read_text(encoding="utf-8") - assert "---" in content - assert "level: 3" in content - assert "A standard paragraph summary." in content - - def test_persist_deletes_old_summaries( - self, - fake_collection: _FakeCollection, - memory_root: Path, - ) -> None: - """Test that persisting new summary deletes old summary files.""" - # Create first summary - result1 = SummaryResult( - level=SummaryLevel.STANDARD, - summary="First summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="conv", - summary_result=result1, + level=SummaryLevel.NONE, + summary=None, + input_tokens=50, + output_tokens=0, + compression_ratio=0.0, ) - - entries_dir = memory_root / "entries" / "conv" - first_file = entries_dir / "summaries" / "L3" / "final.md" - assert first_file.exists() - assert "First summary." in first_file.read_text() - - # Create second summary (should replace first) - result2 = SummaryResult( - level=SummaryLevel.STANDARD, - summary="Second summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - - persist_hierarchical_summary( - fake_collection, - memory_root=memory_root, - conversation_id="conv", - summary_result=result2, - ) - - # First summary should be moved to deleted - assert first_file.exists() - assert "Second summary." 
in first_file.read_text() - - # Old summary should be in deleted folder - deleted_dir = memory_root / "entries" / "deleted" / "conv" / "summaries" - assert deleted_dir.exists() - - -class TestDetermineLevelFunction: - """Test that determine_level correctly determines summary levels.""" - - def test_very_short_content_is_none(self) -> None: - """Test that content under 100 tokens gets NONE level.""" - level = determine_level(50) - assert level == SummaryLevel.NONE - - def test_short_content_is_brief(self) -> None: - """Test that 100-500 token content gets BRIEF level.""" - level = determine_level(300) - assert level == SummaryLevel.BRIEF - - def test_medium_content_is_standard(self) -> None: - """Test that 500-3000 token content gets STANDARD level.""" - level = determine_level(1500) - assert level == SummaryLevel.STANDARD - - def test_long_content_is_detailed(self) -> None: - """Test that 3000-15000 token content gets DETAILED level.""" - level = determine_level(8000) - assert level == SummaryLevel.DETAILED - - def test_very_long_content_is_hierarchical(self) -> None: - """Test that content over 15000 tokens gets HIERARCHICAL level.""" - level = determine_level(25000) - assert level == SummaryLevel.HIERARCHICAL - - -class TestSummarizeContentFunction: - """Test the summarize_content function from _ingest.""" - - @pytest.mark.asyncio - async def test_summarize_content_creates_result(self) -> None: - """Test that summarize_content returns a valid SummaryResult.""" - # Patch at source since _ingest imports inside the function - with patch("agent_cli.summarizer.summarize") as mock_summarize: - mock_result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="Mocked summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, - compression_ratio=0.05, - ) - mock_summarize.return_value = mock_result - - result = await summarize_content( - content="Some content to summarize " * 100, - openai_base_url="http://localhost:8000/v1", - api_key=None, - model="test-model", - ) - - assert result.level == SummaryLevel.STANDARD - assert result.summary == "Mocked summary." + entries = result.to_storage_metadata("test-conversation") + assert entries == [] diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index d3962111..c5b04f70 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -7,8 +7,6 @@ import pytest from agent_cli.summarizer.models import ( - ChunkSummary, - HierarchicalSummary, SummaryLevel, SummaryResult, ) @@ -21,122 +19,12 @@ def test_level_values(self) -> None: """Test that levels have correct integer values.""" assert SummaryLevel.NONE == 0 assert SummaryLevel.BRIEF == 1 - assert SummaryLevel.STANDARD == 2 - assert SummaryLevel.DETAILED == 3 - assert SummaryLevel.HIERARCHICAL == 4 + assert SummaryLevel.MAP_REDUCE == 2 def test_level_ordering(self) -> None: """Test that levels can be compared.""" assert SummaryLevel.NONE < SummaryLevel.BRIEF - assert SummaryLevel.BRIEF < SummaryLevel.STANDARD - assert SummaryLevel.STANDARD < SummaryLevel.DETAILED - assert SummaryLevel.DETAILED < SummaryLevel.HIERARCHICAL - - -class TestChunkSummary: - """Tests for ChunkSummary model.""" - - def test_basic_creation(self) -> None: - """Test creating a chunk summary.""" - chunk = ChunkSummary( - chunk_index=0, - content="This is a summary of chunk 1.", - token_count=10, - source_tokens=100, - ) - assert chunk.chunk_index == 0 - assert chunk.content == "This is a summary of chunk 1." 
- assert chunk.token_count == 10 - assert chunk.source_tokens == 100 - - def test_validation_negative_tokens(self) -> None: - """Test that negative token counts fail validation.""" - with pytest.raises(ValueError, match="greater than or equal to 0"): - ChunkSummary( - chunk_index=0, - content="Test", - token_count=-1, - source_tokens=100, - ) - - -class TestHierarchicalSummary: - """Tests for HierarchicalSummary model.""" - - def test_basic_creation(self) -> None: - """Test creating a hierarchical summary.""" - l1 = [ - ChunkSummary( - chunk_index=0, - content="Chunk 1 summary", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 2 summary", - token_count=12, - source_tokens=120, - ), - ] - hs = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=["Group summary"], - l3_summary="Final summary of all content.", - ) - assert len(hs.l1_summaries) == 2 - assert len(hs.l2_summaries) == 1 - assert hs.l3_summary == "Final summary of all content." - - def test_default_chunk_settings(self) -> None: - """Test default chunk size and overlap.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=[], - l3_summary="Final", - ) - assert hs.chunk_size == 3000 - assert hs.chunk_overlap == 200 - - def test_get_summary_at_level_1(self) -> None: - """Test getting L1 summaries.""" - l1 = [ - ChunkSummary(chunk_index=0, content="C1", token_count=5, source_tokens=50), - ChunkSummary(chunk_index=1, content="C2", token_count=5, source_tokens=50), - ] - hs = HierarchicalSummary(l1_summaries=l1, l2_summaries=[], l3_summary="Final") - result = hs.get_summary_at_level(1) - assert result == ["C1", "C2"] - - def test_get_summary_at_level_2_with_l2(self) -> None: - """Test getting L2 summaries when available.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=["Group A", "Group B"], - l3_summary="Final", - ) - result = hs.get_summary_at_level(2) - assert result == ["Group A", "Group B"] - - def test_get_summary_at_level_2_fallback(self) -> None: - """Test getting L2 falls back to L3 when no L2 summaries.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=[], - l3_summary="Final summary", - ) - result = hs.get_summary_at_level(2) - assert result == ["Final summary"] - - def test_get_summary_at_level_3(self) -> None: - """Test getting L3 summary.""" - hs = HierarchicalSummary( - l1_summaries=[], - l2_summaries=["Group"], - l3_summary="The final summary", - ) - result = hs.get_summary_at_level(3) - assert result == "The final summary" + assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE class TestSummaryResult: @@ -147,56 +35,46 @@ def test_none_level_result(self) -> None: result = SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, ) assert result.level == SummaryLevel.NONE assert result.summary is None - assert result.chunk_summaries is None + assert result.collapse_depth == 0 def test_brief_level_result(self) -> None: """Test result for brief summary.""" result = SummaryResult( level=SummaryLevel.BRIEF, summary="A brief one-sentence summary.", - hierarchical=None, input_tokens=200, output_tokens=10, compression_ratio=0.05, ) assert result.level == SummaryLevel.BRIEF assert result.summary == "A brief one-sentence summary." 
- assert result.chunk_summaries is None + assert result.collapse_depth == 0 - def test_hierarchical_result_with_chunk_summaries(self) -> None: - """Test hierarchical result exposes chunk summaries.""" - l1 = [ - ChunkSummary(chunk_index=0, content="Chunk 1", token_count=10, source_tokens=100), - ChunkSummary(chunk_index=1, content="Chunk 2", token_count=10, source_tokens=100), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=[], - l3_summary="Final", - ) + def test_map_reduce_result(self) -> None: + """Test result for map-reduce summary.""" result = SummaryResult( - level=SummaryLevel.DETAILED, - summary="Final", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, + collapse_depth=2, ) - assert result.chunk_summaries == ["Chunk 1", "Chunk 2"] + assert result.level == SummaryLevel.MAP_REDUCE + assert result.summary == "A comprehensive summary." + assert result.collapse_depth == 2 def test_to_storage_metadata_none_level(self) -> None: """Test that NONE level produces no storage entries.""" result = SummaryResult( level=SummaryLevel.NONE, summary=None, - hierarchical=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, @@ -205,77 +83,44 @@ def test_to_storage_metadata_none_level(self) -> None: assert entries == [] def test_to_storage_metadata_simple_summary(self) -> None: - """Test storage metadata for simple (non-hierarchical) summary.""" + """Test storage metadata for a summary.""" result = SummaryResult( - level=SummaryLevel.STANDARD, - summary="A standard paragraph summary.", - hierarchical=None, - input_tokens=1000, - output_tokens=50, + level=SummaryLevel.BRIEF, + summary="A brief summary.", + input_tokens=200, + output_tokens=10, compression_ratio=0.05, ) entries = result.to_storage_metadata("conv-456") assert len(entries) == 1 entry = entries[0] - assert entry["id"] == "conv-456:summary:L3:final" - assert entry["content"] == "A standard paragraph summary." + assert entry["id"] == "conv-456:summary" + assert entry["content"] == "A brief summary." 
assert entry["metadata"]["conversation_id"] == "conv-456" assert entry["metadata"]["role"] == "summary" - assert entry["metadata"]["level"] == 3 assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level_name"] == "STANDARD" + assert entry["metadata"]["summary_level"] == "BRIEF" - def test_to_storage_metadata_hierarchical(self) -> None: - """Test storage metadata for hierarchical summary.""" - l1 = [ - ChunkSummary( - chunk_index=0, - content="Chunk 0 text", - token_count=10, - source_tokens=100, - ), - ChunkSummary( - chunk_index=1, - content="Chunk 1 text", - token_count=12, - source_tokens=120, - ), - ] - hierarchical = HierarchicalSummary( - l1_summaries=l1, - l2_summaries=["Group 0 summary"], - l3_summary="Final synthesis", - ) + def test_to_storage_metadata_map_reduce(self) -> None: + """Test storage metadata for map-reduce summary.""" result = SummaryResult( - level=SummaryLevel.HIERARCHICAL, - summary="Final synthesis", - hierarchical=hierarchical, + level=SummaryLevel.MAP_REDUCE, + summary="Final synthesis of content.", input_tokens=20000, output_tokens=200, compression_ratio=0.01, + collapse_depth=3, ) entries = result.to_storage_metadata("conv-789") - # Should have 2 L1 + 1 L2 + 1 L3 = 4 entries - assert len(entries) == 4 - - # Check L1 entries - l1_entries = [e for e in entries if e["metadata"]["level"] == 1] - assert len(l1_entries) == 2 - assert l1_entries[0]["id"] == "conv-789:summary:L1:0" - assert l1_entries[0]["metadata"]["chunk_index"] == 0 - - # Check L2 entry - l2_entries = [e for e in entries if e["metadata"]["level"] == 2] - assert len(l2_entries) == 1 - assert l2_entries[0]["id"] == "conv-789:summary:L2:0" - assert l2_entries[0]["content"] == "Group 0 summary" - - # Check L3 entry - l3_entries = [e for e in entries if e["metadata"]["level"] == 3] - assert len(l3_entries) == 1 - assert l3_entries[0]["id"] == "conv-789:summary:L3:final" - assert l3_entries[0]["metadata"]["is_final"] is True + # Should have 1 entry (the final summary) + assert len(entries) == 1 + entry = entries[0] + assert entry["id"] == "conv-789:summary" + assert entry["content"] == "Final synthesis of content." 
+ assert entry["metadata"]["summary_level"] == "MAP_REDUCE" + assert entry["metadata"]["collapse_depth"] == 3 + assert entry["metadata"]["is_final"] is True def test_compression_ratio_bounds(self) -> None: """Test compression ratio validation.""" @@ -283,7 +128,6 @@ def test_compression_ratio_bounds(self) -> None: result = SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=0.1, @@ -295,7 +139,6 @@ def test_compression_ratio_bounds(self) -> None: SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=1.5, @@ -307,7 +150,6 @@ def test_created_at_default(self) -> None: result = SummaryResult( level=SummaryLevel.BRIEF, summary="Test", - hierarchical=None, input_tokens=100, output_tokens=10, compression_ratio=0.1, diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 22eb4039..2621b158 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -140,37 +140,27 @@ def test_none_level(self) -> None: def test_brief_level(self) -> None: """Test level 1 (BRIEF) compression.""" - # BRIEF: ~20% compression, capped at 50 + # BRIEF: ~20% compression, capped at 50, minimum 20 result = estimate_summary_tokens(100, level=1) assert result >= 20 # minimum of 20 assert result <= 50 # capped at 50 - def test_standard_level(self) -> None: - """Test level 2 (STANDARD) compression.""" - # STANDARD: ~12% compression, capped at 200 + def test_map_reduce_level(self) -> None: + """Test level 2 (MAP_REDUCE) compression.""" + # MAP_REDUCE: ~10% compression, capped at 500, minimum 50 result = estimate_summary_tokens(1000, level=2) assert result >= 50 # minimum of 50 - assert result <= 200 # capped at 200 - - def test_detailed_level(self) -> None: - """Test level 3 (DETAILED) compression.""" - # DETAILED: ~7% compression, capped at 500 - result = estimate_summary_tokens(10000, level=3) - assert result >= 100 # minimum of 100 assert result <= 500 # capped at 500 - def test_hierarchical_level(self) -> None: - """Test level 4 (HIERARCHICAL) compression.""" - # HIERARCHICAL: base of 1000 + diminishing returns - result = estimate_summary_tokens(50000, level=4) - assert result >= 1000 # base minimum - assert result <= 2000 # capped at 2000 - - def test_hierarchical_small_input(self) -> None: - """Test HIERARCHICAL with smaller input.""" - # Even with small input, should return base - result = estimate_summary_tokens(5000, level=4) - assert result == 1000 # just the base, no additional + def test_map_reduce_large_input(self) -> None: + """Test MAP_REDUCE with large input hits cap.""" + result = estimate_summary_tokens(50000, level=2) + assert result == 500 # capped at 500 + + def test_map_reduce_small_input(self) -> None: + """Test MAP_REDUCE with small input uses floor.""" + result = estimate_summary_tokens(100, level=2) + assert result == 50 # floor of 50 class TestTokensToWords: From 6eff2f6741fa36e711b51e24b5e6fdf045ba438e Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:16:57 -0800 Subject: [PATCH 25/38] refactor(summarizer): consolidate shared code to reduce duplication Address review feedback: 1. DRY: Move SummaryOutput, SummarizationError, SummarizerConfig, and generate_summary to _utils.py - eliminates duplicate code between adaptive.py and map_reduce.py 2. Config consolidation: Remove MapReduceConfig, use SummarizerConfig throughout. map_reduce.py now accepts SummarizerConfig directly. 3. 
Document redundant check: The token_max check in map_reduce_summarize is kept as a safety guard for direct calls, with clear documentation explaining it's normally handled by adaptive.py. --- agent_cli/summarizer/_utils.py | 93 +++++++++++++++++++++ agent_cli/summarizer/adaptive.py | 109 ++++-------------------- agent_cli/summarizer/map_reduce.py | 129 +++++++---------------------- tests/summarizer/test_adaptive.py | 16 ++-- 4 files changed, 146 insertions(+), 201 deletions(-) diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 8dbfb1ff..078e21ed 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -3,15 +3,108 @@ from __future__ import annotations import re +from dataclasses import dataclass from functools import lru_cache from typing import TYPE_CHECKING +from pydantic import BaseModel + from agent_cli.summarizer.models import SummaryLevel if TYPE_CHECKING: import tiktoken +class SummaryOutput(BaseModel): + """Structured output for summary generation.""" + + summary: str + + +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. + + Example: + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarize(long_document, config) + print(f"Level: {result.level.name}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + +async def generate_summary( + prompt: str, + config: SummarizerConfig, + max_tokens: int = 256, +) -> str: + """Call the LLM to generate a summary. + + Args: + prompt: The prompt to send to the LLM. + config: Summarizer configuration. + max_tokens: Maximum tokens for the response. + + Returns: + The generated summary text. + + Raises: + SummarizationError: If the LLM call fails. + + """ + from pydantic_ai import Agent # noqa: PLC0415 + from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 + from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 + from pydantic_ai.settings import ModelSettings # noqa: PLC0415 + + provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) + model = OpenAIChatModel( + model_name=config.model, + provider=provider, + settings=ModelSettings( + temperature=0.3, + max_tokens=max_tokens, + ), + ) + + agent = Agent( + model=model, + system_prompt="You are a concise summarizer. Output only the summary, no preamble.", + output_type=SummaryOutput, + retries=2, + ) + + try: + result = await agent.run(prompt) + return result.output.summary.strip() + except Exception as e: + msg = f"Summarization failed: {e}" + raise SummarizationError(msg) from e + + @lru_cache(maxsize=4) def _get_encoding(model: str = "gpt-4") -> tiktoken.Encoding | None: """Get tiktoken encoding for a model, with caching. 
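[Editorial aside, not part of the patch: a minimal usage sketch of the helpers consolidated into `_utils.py` above. It relies only on the `SummarizerConfig` fields and the `generate_summary(prompt, config, max_tokens=...)` signature shown in the hunk; the endpoint URL, model name, and prompt text are placeholders.]

```python
# Sketch only: exercising the shared helpers that adaptive.py and
# map_reduce.py now both import from agent_cli.summarizer._utils.
import asyncio

from agent_cli.summarizer._utils import SummarizerConfig, generate_summary


async def demo() -> None:
    # Placeholder endpoint/model; any OpenAI-compatible server should work.
    config = SummarizerConfig(
        openai_base_url="http://localhost:8000/v1",
        model="llama3.1:8b",
    )
    summary = await generate_summary(
        "Summarize in one sentence: the meeting moved the launch to Friday.",
        config,
        max_tokens=50,
    )
    print(summary)


if __name__ == "__main__":
    asyncio.run(demo())
```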
diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 39669e97..b03a84e6 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -19,9 +19,6 @@ from __future__ import annotations import logging -from dataclasses import dataclass - -from pydantic import BaseModel from agent_cli.summarizer._prompts import ( BRIEF_SUMMARY_PROMPT, @@ -29,12 +26,14 @@ get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( + SummarizationError, + SummarizerConfig, count_tokens, estimate_summary_tokens, + generate_summary, tokens_to_words, ) from agent_cli.summarizer.map_reduce import ( - MapReduceConfig, MapReduceSummarizationError, map_reduce_summarize, ) @@ -49,46 +48,15 @@ THRESHOLD_NONE = 100 # Below this, no summary needed THRESHOLD_BRIEF = 500 # Below this, just a single sentence - -class SummaryOutput(BaseModel): - """Structured output for summary generation.""" - - summary: str - - -class SummarizationError(Exception): - """Raised when summarization fails after all retries.""" - - -@dataclass -class SummarizerConfig: - """Configuration for summarization operations. - - Example: - config = SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="llama3.1:8b", - ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}") - print(f"Compression: {result.compression_ratio:.1%}") - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse - chunk_overlap: int = 200 - max_concurrent_chunks: int = 5 - timeout: float = 60.0 - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" +# Re-export for backwards compatibility +__all__ = [ + "THRESHOLD_BRIEF", + "THRESHOLD_NONE", + "SummarizationError", + "SummarizerConfig", + "determine_level", + "summarize", +] def determine_level(token_count: int) -> SummaryLevel: @@ -175,7 +143,7 @@ async def summarize( async def _brief_summary(content: str, config: SummarizerConfig) -> str: """Generate a single-sentence summary for brief content.""" prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await _generate_summary(prompt, config, max_tokens=50) + return await generate_summary(prompt, config, max_tokens=50) async def _map_reduce_summary( @@ -200,19 +168,8 @@ async def _map_reduce_summary( ) # Use map-reduce for multi-chunk content - mr_config = MapReduceConfig( - openai_base_url=config.openai_base_url, - model=config.model, - api_key=config.api_key, - chunk_size=config.chunk_size, - token_max=config.token_max, - chunk_overlap=config.chunk_overlap, - max_concurrent=config.max_concurrent_chunks, - timeout=config.timeout, - ) - try: - result = await map_reduce_summarize(content, mr_config) + result = await map_reduce_summarize(content, config) except MapReduceSummarizationError as e: raise SummarizationError(str(e)) from e @@ -248,40 +205,4 @@ async def _content_aware_summary( max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _generate_summary( - prompt: str, - config: SummarizerConfig, - max_tokens: int = 256, -) -> str: - """Call the LLM to generate a summary. 
Raises SummarizationError on failure.""" - from pydantic_ai import Agent # noqa: PLC0415 - from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 - from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 - from pydantic_ai.settings import ModelSettings # noqa: PLC0415 - - provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) - model = OpenAIChatModel( - model_name=config.model, - provider=provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - msg = f"Summarization failed: {e}" - raise SummarizationError(msg) from e + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 09d82d09..76365e2d 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -19,17 +19,18 @@ import logging from dataclasses import dataclass -from pydantic import BaseModel - from agent_cli.summarizer._prompts import ( CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT, format_summaries_for_meta, ) from agent_cli.summarizer._utils import ( + SummarizationError, + SummarizerConfig, chunk_text, count_tokens, estimate_summary_tokens, + generate_summary, tokens_to_words, ) from agent_cli.summarizer.models import SummaryLevel @@ -37,52 +38,10 @@ logger = logging.getLogger(__name__) -class SummaryOutput(BaseModel): - """Structured output for summary generation.""" - - summary: str - - -class MapReduceSummarizationError(Exception): +class MapReduceSummarizationError(SummarizationError): """Raised when map-reduce summarization fails.""" -@dataclass -class MapReduceConfig: - """Configuration for map-reduce summarization. - - Attributes: - openai_base_url: Base URL for OpenAI-compatible API. - model: Model name for summarization. - api_key: Optional API key. - chunk_size: Target size for splitting content (tokens). - LangChain uses 3000, BOOOOKSCORE suggests 2048. - token_max: Maximum tokens for combined summaries before collapsing. - When combined summaries exceed this, we recursively reduce. - chunk_overlap: Overlap between chunks for context continuity. - max_concurrent: Maximum parallel summarization calls. - timeout: Timeout for API calls in seconds. - max_collapse_depth: Safety limit on recursive collapse depth. - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - chunk_overlap: int = 200 - max_concurrent: int = 5 - timeout: float = 60.0 - max_collapse_depth: int = 10 # Safety limit - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" - - @dataclass class MapReduceResult: """Result of map-reduce summarization. @@ -107,19 +66,24 @@ class MapReduceResult: async def map_reduce_summarize( content: str, - config: MapReduceConfig, + config: SummarizerConfig, + max_collapse_depth: int = 10, ) -> MapReduceResult: """Summarize content using map-reduce with dynamic collapse. Algorithm: - 1. If content fits in token_max, summarize directly - 2. 
Otherwise, split into chunks and summarize each (map phase) - 3. If combined summaries exceed token_max, recursively collapse (reduce phase) - 4. Continue until everything fits in token_max + 1. Split into chunks and summarize each (map phase) + 2. If combined summaries exceed token_max, recursively collapse (reduce phase) + 3. Continue until everything fits in token_max + + Note: This function assumes content exceeds token_max. The caller (adaptive.py) + handles the case where content fits in a single chunk. The check below is a + safety guard for direct calls to this function. Args: content: The content to summarize. - config: Map-reduce configuration. + config: Summarizer configuration. + max_collapse_depth: Safety limit on recursive collapse depth. Returns: MapReduceResult with summary and metadata. @@ -137,7 +101,8 @@ async def map_reduce_summarize( input_tokens = count_tokens(content, config.model) - # If content already fits, just summarize directly + # Safety guard: if content fits in token_max, summarize directly. + # Normally handled by adaptive.py, but kept for direct calls to this function. if input_tokens <= config.token_max: summary = await _summarize_text(content, config) output_tokens = count_tokens(summary, config.model) @@ -166,10 +131,10 @@ async def map_reduce_summarize( depth = 0 while _total_tokens(summaries, config.model) > config.token_max: depth += 1 - if depth > config.max_collapse_depth: + if depth > max_collapse_depth: logger.warning( "Hit max collapse depth %d, forcing final summary", - config.max_collapse_depth, + max_collapse_depth, ) break @@ -205,9 +170,9 @@ def _total_tokens(texts: list[str], model: str) -> int: return sum(count_tokens(t, model) for t in texts) -async def _map_summarize(chunks: list[str], config: MapReduceConfig) -> list[str]: +async def _map_summarize(chunks: list[str], config: SummarizerConfig) -> list[str]: """Summarize each chunk in parallel (map phase).""" - semaphore = asyncio.Semaphore(config.max_concurrent) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) total = len(chunks) async def summarize_chunk(idx: int, chunk: str) -> str: @@ -222,7 +187,7 @@ async def _summarize_chunk( chunk: str, chunk_index: int, total_chunks: int, - config: MapReduceConfig, + config: SummarizerConfig, ) -> str: """Summarize a single chunk.""" source_tokens = count_tokens(chunk, config.model) @@ -236,12 +201,12 @@ async def _summarize_chunk( max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) async def _collapse_summaries( summaries: list[str], - config: MapReduceConfig, + config: SummarizerConfig, ) -> list[str]: """Collapse summaries by grouping and re-summarizing (reduce phase). 
@@ -272,7 +237,7 @@ async def _collapse_summaries( groups.append(current_group) # Summarize each group in parallel - semaphore = asyncio.Semaphore(config.max_concurrent) + semaphore = asyncio.Semaphore(config.max_concurrent_chunks) async def summarize_group(group: list[str]) -> str: async with semaphore: @@ -282,7 +247,7 @@ async def summarize_group(group: list[str]) -> str: return list(await asyncio.gather(*tasks)) -async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: +async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: """Synthesize multiple summaries into one.""" combined_tokens = sum(count_tokens(s, config.model) for s in summaries) target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) @@ -293,10 +258,10 @@ async def _synthesize(summaries: list[str], config: MapReduceConfig) -> str: max_words=max_words, ) - return await _generate_summary(prompt, config, max_tokens=target_tokens + 100) + return await generate_summary(prompt, config, max_tokens=target_tokens + 100) -async def _summarize_text(text: str, config: MapReduceConfig) -> str: +async def _summarize_text(text: str, config: SummarizerConfig) -> str: """Summarize text that fits within token_max.""" input_tokens = count_tokens(text, config.model) target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) @@ -310,40 +275,4 @@ async def _summarize_text(text: str, config: MapReduceConfig) -> str: Summary:""" - return await _generate_summary(prompt, config, max_tokens=target_tokens + 50) - - -async def _generate_summary( - prompt: str, - config: MapReduceConfig, - max_tokens: int = 256, -) -> str: - """Call the LLM to generate a summary.""" - from pydantic_ai import Agent # noqa: PLC0415 - from pydantic_ai.models.openai import OpenAIChatModel # noqa: PLC0415 - from pydantic_ai.providers.openai import OpenAIProvider # noqa: PLC0415 - from pydantic_ai.settings import ModelSettings # noqa: PLC0415 - - provider = OpenAIProvider(api_key=config.api_key, base_url=config.openai_base_url) - model = OpenAIChatModel( - model_name=config.model, - provider=provider, - settings=ModelSettings( - temperature=0.3, - max_tokens=max_tokens, - ), - ) - - agent = Agent( - model=model, - system_prompt="You are a concise summarizer. 
Output only the summary, no preamble.", - output_type=SummaryOutput, - retries=2, - ) - - try: - result = await agent.run(prompt) - return result.output.summary.strip() - except Exception as e: - msg = f"Map-reduce summarization failed: {e}" - raise MapReduceSummarizationError(msg) from e + return await generate_summary(prompt, config, max_tokens=target_tokens + 50) diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index a64a72a1..202a5592 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -6,13 +6,15 @@ import pytest -from agent_cli.summarizer.adaptive import ( - THRESHOLD_BRIEF, - THRESHOLD_NONE, +from agent_cli.summarizer._utils import ( SummarizationError, SummarizerConfig, SummaryOutput, - _generate_summary, + generate_summary, +) +from agent_cli.summarizer.adaptive import ( + THRESHOLD_BRIEF, + THRESHOLD_NONE, determine_level, summarize, ) @@ -256,7 +258,7 @@ async def test_very_long_content_uses_map_reduce( class TestGenerateSummary: - """Tests for _generate_summary function.""" + """Tests for generate_summary function.""" @pytest.fixture def config(self) -> SummarizerConfig: @@ -281,7 +283,7 @@ async def test_generate_summary_with_pydantic_ai( mock_agent.run = AsyncMock(return_value=mock_result) mock_agent_class.return_value = mock_agent - result = await _generate_summary("Test prompt", config, max_tokens=100) + result = await generate_summary("Test prompt", config, max_tokens=100) assert result == "Generated summary." mock_agent.run.assert_called_once_with("Test prompt") @@ -298,7 +300,7 @@ async def test_raises_summarization_error_on_failure( mock_agent_class.return_value = mock_agent with pytest.raises(SummarizationError, match="Summarization failed"): - await _generate_summary("Test prompt", config, max_tokens=100) + await generate_summary("Test prompt", config, max_tokens=100) class TestSummaryOutput: From ad376e3217b612ffd404ae3e4032e46778965cf9 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:23:57 -0800 Subject: [PATCH 26/38] refactor(summarizer): remove redundant _summarize_text and safety guard - Remove _summarize_text function with hardcoded prompt (use centralized prompts in _prompts.py via adaptive.py instead) - Remove redundant token_max safety guard from map_reduce_summarize - Update docstring to clarify function is designed for content exceeding token_max, directing users to adaptive.summarize() for proper routing --- agent_cli/summarizer/map_reduce.py | 37 +++--------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 76365e2d..93aaabd8 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -76,9 +76,9 @@ async def map_reduce_summarize( 2. If combined summaries exceed token_max, recursively collapse (reduce phase) 3. Continue until everything fits in token_max - Note: This function assumes content exceeds token_max. The caller (adaptive.py) - handles the case where content fits in a single chunk. The check below is a - safety guard for direct calls to this function. + Note: This function is designed for content that exceeds token_max. For shorter + content, use the main `summarize()` function in adaptive.py which selects the + appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts). Args: content: The content to summarize. 
@@ -101,20 +101,6 @@ async def map_reduce_summarize( input_tokens = count_tokens(content, config.model) - # Safety guard: if content fits in token_max, summarize directly. - # Normally handled by adaptive.py, but kept for direct calls to this function. - if input_tokens <= config.token_max: - summary = await _summarize_text(content, config) - output_tokens = count_tokens(summary, config.model) - return MapReduceResult( - summary=summary, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, - collapse_depth=0, - intermediate_summaries=[], - ) - # Map phase: Split and summarize chunks in parallel chunks = chunk_text( content, @@ -259,20 +245,3 @@ async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: ) return await generate_summary(prompt, config, max_tokens=target_tokens + 100) - - -async def _summarize_text(text: str, config: SummarizerConfig) -> str: - """Summarize text that fits within token_max.""" - input_tokens = count_tokens(text, config.model) - target_tokens = estimate_summary_tokens(input_tokens, SummaryLevel.MAP_REDUCE) - max_words = tokens_to_words(target_tokens) - - prompt = f"""Summarize the following content in {max_words} words or less. -Focus on the key points and main ideas. - -Content: -{text} - -Summary:""" - - return await generate_summary(prompt, config, max_tokens=target_tokens + 50) From 83390a32316d1b3734680c8b2d0f336be5e5b71f Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:34:11 -0800 Subject: [PATCH 27/38] refactor(summarizer): remove redundant exception re-wrapping MapReduceSummarizationError already inherits from SummarizationError, so catching and re-raising serves no purpose. --- agent_cli/summarizer/adaptive.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index b03a84e6..c5ba092e 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -33,10 +33,7 @@ generate_summary, tokens_to_words, ) -from agent_cli.summarizer.map_reduce import ( - MapReduceSummarizationError, - map_reduce_summarize, -) +from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( SummaryLevel, SummaryResult, @@ -168,10 +165,7 @@ async def _map_reduce_summary( ) # Use map-reduce for multi-chunk content - try: - result = await map_reduce_summarize(content, config) - except MapReduceSummarizationError as e: - raise SummarizationError(str(e)) from e + result = await map_reduce_summarize(content, config) return SummaryResult( level=SummaryLevel.MAP_REDUCE, From 4d25071040e58c8fee03fe12c0f78dbfea555f1c Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:35:21 -0800 Subject: [PATCH 28/38] refactor(summarizer): remove defensive guards for impossible conditions - Remove empty content check in map_reduce_summarize (caller validates) - Remove 'if summary else 0' guards (generate_summary never returns None) - Remove 'if input_tokens > 0' guards (input is guaranteed non-empty) - Remove 'if summaries else ""' guard (summaries always has content) --- agent_cli/summarizer/adaptive.py | 8 ++++---- agent_cli/summarizer/map_reduce.py | 14 ++------------ 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index c5ba092e..640c52e6 100644 --- a/agent_cli/summarizer/adaptive.py +++ 
b/agent_cli/summarizer/adaptive.py @@ -118,13 +118,13 @@ async def summarize( if level == SummaryLevel.BRIEF: summary = await _brief_summary(content, config) - output_tokens = count_tokens(summary, config.model) if summary else 0 + output_tokens = count_tokens(summary, config.model) return SummaryResult( level=level, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, ) # MAP_REDUCE level @@ -154,13 +154,13 @@ async def _map_reduce_summary( # For content that fits in a single chunk, use content-type aware summary if input_tokens <= config.token_max: summary = await _content_aware_summary(content, config, prior_summary, content_type) - output_tokens = count_tokens(summary, config.model) if summary else 0 + output_tokens = count_tokens(summary, config.model) return SummaryResult( level=SummaryLevel.MAP_REDUCE, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, collapse_depth=0, ) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 93aaabd8..07332c1c 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -89,16 +89,6 @@ async def map_reduce_summarize( MapReduceResult with summary and metadata. """ - if not content or not content.strip(): - return MapReduceResult( - summary="", - input_tokens=0, - output_tokens=0, - compression_ratio=0.0, - collapse_depth=0, - intermediate_summaries=[], - ) - input_tokens = count_tokens(content, config.model) # Map phase: Split and summarize chunks in parallel @@ -137,7 +127,7 @@ async def map_reduce_summarize( if len(summaries) > 1: final_summary = await _synthesize(summaries, config) else: - final_summary = summaries[0] if summaries else "" + final_summary = summaries[0] output_tokens = count_tokens(final_summary, config.model) @@ -145,7 +135,7 @@ async def map_reduce_summarize( summary=final_summary, input_tokens=input_tokens, output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens if input_tokens > 0 else 0.0, + compression_ratio=output_tokens / input_tokens, collapse_depth=depth, intermediate_summaries=intermediate_summaries, ) From f3f3c3b0c96a6b548e92d8ccb6315a3ff2034eb6 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 09:59:24 -0800 Subject: [PATCH 29/38] feat(scripts): add summarizer comparison script with needle-in-haystack test Compares old L1-L4 hierarchical vs new adaptive map-reduce approach: - Shows which level each system would use - Runs new summarizer and measures fact preservation - Uses specific 'needle' facts embedded in test content --- scripts/compare_summarizers.py | 402 +++++++++++++++++++++++++++++++++ 1 file changed, 402 insertions(+) create mode 100644 scripts/compare_summarizers.py diff --git a/scripts/compare_summarizers.py b/scripts/compare_summarizers.py new file mode 100644 index 00000000..15265cb0 --- /dev/null +++ b/scripts/compare_summarizers.py @@ -0,0 +1,402 @@ +"""Compare old (L1-L4 hierarchical) vs new (adaptive map-reduce) summarizer. + +This script: +1. Shows what level each system would use for test content +2. Runs the NEW summarizer to produce actual summaries +3. Evaluates summary quality using needle-in-haystack questions +4. 
Uses LLM-as-judge for quality assessment + +Usage: + python scripts/compare_summarizers.py + python scripts/compare_summarizers.py --model "gpt-4o-mini" --base-url "https://api.openai.com/v1" +""" + +from __future__ import annotations + +import argparse +import asyncio +import os +import textwrap +from dataclasses import dataclass, field + +from agent_cli.summarizer import SummarizerConfig, summarize +from agent_cli.summarizer._utils import count_tokens + +# Old system thresholds +OLD_THRESHOLD_NONE = 100 +OLD_THRESHOLD_BRIEF = 500 +OLD_THRESHOLD_STANDARD = 3000 +OLD_THRESHOLD_DETAILED = 15000 + +# New system thresholds +NEW_THRESHOLD_NONE = 100 +NEW_THRESHOLD_BRIEF = 500 + +# Evaluation threshold +FACT_PRESERVATION_THRESHOLD = 0.5 + +# Test content at different sizes with embedded "needles" (specific facts) +TEST_CASES = [ + { + "name": "Brief Range (~300 tokens)", + "description": "Tests the 100-500 token range where OLD=BRIEF, NEW=BRIEF", + "content": """ + The artificial intelligence revolution is transforming every industry. + Machine learning algorithms now power recommendation systems, fraud detection, + and autonomous vehicles. Deep learning, a subset of machine learning, uses + neural networks with multiple layers to analyze complex patterns in data. + + Major tech companies are investing billions in AI research. Google's DeepMind + created AlphaGo, which defeated world champion Lee Sedol in March 2016 in + the ancient game of Go. OpenAI developed GPT models that can generate + human-like text. These advances raise both excitement and concerns about + the future of work and society. + + Researchers are working on making AI systems more transparent and aligned with + human values. The field of AI safety, pioneered by researchers like Stuart + Russell at UC Berkeley, aims to ensure that advanced AI systems remain + beneficial and under human control. + """, + "needles": [ + ("Who did AlphaGo defeat?", "Lee Sedol"), + ("When did AlphaGo win?", "March 2016"), + ("Who pioneered AI safety?", "Stuart Russell"), + ("Where does Stuart Russell work?", "UC Berkeley"), + ], + }, + { + "name": "Standard/MapReduce Range (~900 tokens)", + "description": "Tests 500-3000 range where OLD=STANDARD, NEW=MAP_REDUCE", + "content": """ + Climate change represents one of the most pressing challenges facing humanity. + The Earth's average temperature has risen approximately 1.1 degrees Celsius since + the pre-industrial era, primarily due to human activities that release greenhouse + gases. Carbon dioxide from burning fossil fuels accounts for 76% of emissions. + + The Intergovernmental Panel on Climate Change (IPCC), led by chair Hoesung Lee, + has warned that limiting warming to 1.5 degrees Celsius is crucial. The 2021 + report involved 234 authors from 66 countries analyzing over 14,000 scientific + papers. Their conclusion: human influence has warmed the climate at a rate + unprecedented in at least the last 2,000 years. + + Renewable energy offers hope. Solar panel costs dropped 89% between 2010 and 2020, + making solar competitive with fossil fuels. China leads with 306 gigawatts of + installed solar capacity. Wind energy has grown exponentially, with Denmark + generating 47% of its electricity from wind in 2019. + + Electric vehicles are gaining ground. Tesla delivered 936,172 vehicles in 2021, + while traditional automakers race to electrify. Norway leads adoption, with + electric vehicles representing 65% of new car sales in 2021. 
Battery costs + have fallen 89% since 2010, from $1,100 to $132 per kilowatt-hour. + + Carbon capture remains expensive at $250-$600 per ton of CO2. The Orca plant + in Iceland, opened in September 2021, captures just 4,000 tons annually. + Critics note this equals emissions from about 870 cars. More radical approaches + like solar radiation management could cool the planet but carry unknown risks. + + The Paris Agreement, signed by 196 parties in December 2015, aims to limit + warming to well below 2 degrees. Countries submit Nationally Determined + Contributions (NDCs) outlining their emission reduction plans. However, + current pledges put the world on track for 2.7 degrees of warming by 2100. + + Individual actions matter but systemic change is essential. Agriculture accounts + for 10-12% of global emissions. Beef production generates 60 kg of CO2 equivalent + per kilogram of meat. A plant-based diet could reduce food emissions by up to 73%. + """, + "needles": [ + ("Who chairs the IPCC?", "Hoesung Lee"), + ("How many authors contributed to the 2021 IPCC report?", "234"), + ("What percent of Denmark's electricity comes from wind?", "47%"), + ("When did the Orca plant open?", "September 2021"), + ("How many vehicles did Tesla deliver in 2021?", "936,172"), + ("What percent of Norway's new cars are electric?", "65%"), + ("When was the Paris Agreement signed?", "December 2015"), + ("How much CO2 does beef production generate per kg?", "60 kg"), + ], + }, + { + "name": "Detailed/MapReduce Range (~1800 tokens)", + "description": "Tests larger content where OLD=DETAILED (chunks+meta), NEW=MAP_REDUCE", + "content": """ + The history of computing spans centuries of human innovation, from ancient + calculating devices to quantum computers. Understanding this evolution reveals + how incremental advances compound into revolutionary change. + + Ancient Foundations (2400 BCE - 1600 CE) + + The abacus emerged independently in multiple civilizations. Chinese merchants + used the suanpan as early as 2400 BCE for arithmetic. The Roman abacus used + grooved beads, while the Japanese soroban featured a distinctive 1:4 bead + arrangement still used today. + + Mechanical Calculation (1600-1900) + + In 1642, nineteen-year-old Blaise Pascal invented the Pascaline to help his + tax-collector father. This brass rectangular box could add and subtract using + interlocking gears. Only 50 were built, and 9 survive in museums today. + + Gottfried Wilhelm Leibniz improved Pascal's design in 1694, creating the + Stepped Reckoner capable of multiplication and division. He also invented + binary arithmetic, writing "Explication de l'Arithmétique Binaire" in 1703, + laying groundwork for digital computing. + + Charles Babbage designed the Analytical Engine from 1833-1871, incorporating + a mill (processor), store (memory), and punch card input. Ada Lovelace wrote + detailed notes including what's considered the first algorithm - for computing + Bernoulli numbers. The engine was never completed; Babbage died in 1871. + + Electronic Era (1900-1970) + + Alan Turing published "On Computable Numbers" in 1936, defining the theoretical + Turing machine. During WWII, he led the team at Bletchley Park that cracked + the Enigma code, shortening the war by an estimated two years. + + ENIAC, completed February 14, 1946, at the University of Pennsylvania, was + the first general-purpose electronic computer. It weighed 30 tons, consumed + 150 kilowatts, and contained 17,468 vacuum tubes. 
Programming required + physically rewiring the machine, taking days for each new problem. + + The transistor, invented December 23, 1947, at Bell Labs by John Bardeen, + Walter Brattain, and William Shockley, revolutionized electronics. They + shared the 1956 Nobel Prize in Physics. By 1954, the TRADIC computer used + 800 transistors instead of vacuum tubes. + + Jack Kilby demonstrated the first integrated circuit on September 12, 1958, + at Texas Instruments. Robert Noyce independently developed a superior silicon + version at Fairchild. Kilby won the 2000 Nobel Prize; Noyce had died in 1990. + + Personal Computing (1970-2000) + + Intel's 4004, released November 15, 1971, was the first commercial microprocessor. + Designed by Federico Faggin, it contained 2,300 transistors running at 740 kHz. + The 8080 (1974) powered the Altair 8800, sparking the PC revolution. + + Steve Wozniak built the Apple I in 1976 in his garage. The Apple II (1977) + featured color graphics and cost $1,298. IBM entered with the PC on August 12, + 1981, using Microsoft's MS-DOS. By 1984, Apple's Macintosh introduced the GUI + to mainstream users at $2,495. + + Tim Berners-Lee invented the World Wide Web at CERN in 1989, proposing it + on March 12. The first website went live December 20, 1990. By 1995, the + internet had 16 million users; by 2000, 361 million. + + Modern Era (2000-Present) + + Moore's Law, predicting transistor doubling every two years, has held since + Gordon Moore's 1965 observation. Intel's 2021 Alder Lake processors contain + 10+ billion transistors on chips measuring 215 mm². + + Steve Jobs unveiled the iPhone on January 9, 2007. It sold 1.4 million units + in its first year. Smartphones now exceed 6.6 billion globally, containing + more power than 1990s supercomputers. + + Google claimed quantum supremacy October 23, 2019, with Sycamore completing + a calculation in 200 seconds that would take 10,000 years classically. + IBM disputed this, but the quantum era has clearly begun. 
+ """, + "needles": [ + ("How old was Pascal when he invented the Pascaline?", "19"), + ("When did Leibniz write about binary arithmetic?", "1703"), + ("How many vacuum tubes did ENIAC contain?", "17,468"), + ("When was the transistor invented?", "December 23, 1947"), + ("When did Jack Kilby demonstrate the integrated circuit?", "September 12, 1958"), + ("How many transistors did the Intel 4004 have?", "2,300"), + ("When did the first website go live?", "December 20, 1990"), + ("When did Jobs unveil the iPhone?", "January 9, 2007"), + ("When did Google claim quantum supremacy?", "October 23, 2019"), + ], + }, +] + + +def get_old_level(tokens: int) -> tuple[str, str]: + """Determine what level the OLD (L1-L4) summarizer would use.""" + if tokens < OLD_THRESHOLD_NONE: + return "NONE", "No summary needed" + if tokens < OLD_THRESHOLD_BRIEF: + return "BRIEF", "Single sentence (~20% compression)" + if tokens < OLD_THRESHOLD_STANDARD: + return "STANDARD", "Paragraph with content-aware prompts (~12%)" + if tokens < OLD_THRESHOLD_DETAILED: + return "DETAILED", "Chunked L1 summaries + meta L3 (~7%)" + return "HIERARCHICAL", "Full L1/L2/L3 tree structure" + + +def get_new_level(tokens: int) -> tuple[str, str]: + """Determine what level the NEW (adaptive) summarizer would use.""" + if tokens < NEW_THRESHOLD_NONE: + return "NONE", "No summary needed" + if tokens < NEW_THRESHOLD_BRIEF: + return "BRIEF", "Single sentence" + return "MAP_REDUCE", "Dynamic collapse based on content" + + +@dataclass +class TestResult: + """Result of testing one content sample.""" + + name: str + tokens: int + old_level: str + old_description: str + new_level: str + new_description: str + new_summary: str | None = None + needles_found: int = 0 + total_needles: int = 0 + needle_details: list[tuple[str, str, bool]] = field(default_factory=list) + + +async def run_test(test_case: dict, config: dict) -> TestResult: + """Run a single test case.""" + content = test_case["content"].strip() + tokens = count_tokens(content, config["model"]) + + old_level, old_desc = get_old_level(tokens) + new_level, new_desc = get_new_level(tokens) + + # Run new summarizer + cfg = SummarizerConfig( + openai_base_url=config["base_url"], + model=config["model"], + api_key=config.get("api_key", "not-needed"), + ) + + result = await summarize(content, cfg, content_type="document") + + # Check needles in summary + needle_details = [] + needles_found = 0 + + if result.summary: + summary_lower = result.summary.lower() + for question, answer in test_case["needles"]: + # Check if the key fact is preserved + found = answer.lower() in summary_lower + needle_details.append((question, answer, found)) + if found: + needles_found += 1 + + return TestResult( + name=test_case["name"], + tokens=tokens, + old_level=old_level, + old_description=old_desc, + new_level=new_level, + new_description=new_desc, + new_summary=result.summary, + needles_found=needles_found, + total_needles=len(test_case["needles"]), + needle_details=needle_details, + ) + + +def print_result(result: TestResult) -> None: + """Print a test result.""" + print(f"\n{'=' * 70}") + print(f"{result.name}") + print(f"{'=' * 70}") + print(f"Input tokens: {result.tokens}") + print() + print("Level comparison:") + print(f" OLD: {result.old_level:12} - {result.old_description}") + print(f" NEW: {result.new_level:12} - {result.new_description}") + print() + + if result.new_summary: + print("New summary:") + wrapped = textwrap.fill( + result.new_summary, + width=68, + initial_indent=" ", + subsequent_indent=" ", 
+ ) + print(wrapped) + print() + + print( + f"Needle-in-haystack test: {result.needles_found}/{result.total_needles} facts preserved", + ) + for question, answer, found in result.needle_details: + status = "[OK]" if found else "[MISSING]" + print(f" {status} {question} -> {answer}") + else: + print("No summary produced (NONE level)") + + +async def main() -> None: + """Run all tests.""" + parser = argparse.ArgumentParser(description="Compare summarizer versions") + parser.add_argument("--model", default=os.environ.get("OPENAI_MODEL", "gpt-oss-high:20b")) + parser.add_argument( + "--base-url", + default=os.environ.get("OPENAI_BASE_URL", "http://192.168.1.143:9292/v1"), + ) + parser.add_argument("--api-key", default=os.environ.get("OPENAI_API_KEY", "not-needed")) + args = parser.parse_args() + + config = { + "model": args.model, + "base_url": args.base_url, + "api_key": args.api_key, + } + + print("=" * 70) + print("SUMMARIZER COMPARISON: OLD (L1-L4) vs NEW (Adaptive Map-Reduce)") + print("=" * 70) + print(f"Model: {config['model']}") + print(f"Base URL: {config['base_url']}") + + results = [] + for test in TEST_CASES: + print(f"\nRunning: {test['name']}...") + result = await run_test(test, config) + results.append(result) + print_result(result) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + + total_needles = sum(r.total_needles for r in results) + found_needles = sum(r.needles_found for r in results) + + print( + f"\nOverall fact preservation: {found_needles}/{total_needles} ({100 * found_needles / total_needles:.1f}%)", + ) + print() + + print("Key differences:") + print(""" +OLD System (5 levels): + - NONE (<100), BRIEF (100-500), STANDARD (500-3000), + DETAILED (3000-15000), HIERARCHICAL (>15000) + - Fixed boundaries, L1/L2/L3 tree for large content + - Stored intermediate summaries at each level + - Chunk size: 3000 tokens + +NEW System (3 levels): + - NONE (<100), BRIEF (100-500), MAP_REDUCE (>=500) + - Dynamic collapse depth based on content + - Content-type aware prompts + - Chunk size: 2048 tokens (BOOOOKSCORE research) + - Only stores final summary + +Trade-offs: + + Simpler (3 levels vs 5) + + Research-backed parameters + + Content-aware prompts + - No intermediate level access + - All >=500 token content treated the same +""") + + print("Verdict: ", end="") + if found_needles / total_needles >= FACT_PRESERVATION_THRESHOLD: + print("NEW system preserves facts adequately") + else: + print("NEW system may lose important details - further tuning needed") + + +if __name__ == "__main__": + asyncio.run(main()) From 527d06b08b217bedb708f81489c9cef8ca07cc15 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:05:07 -0800 Subject: [PATCH 30/38] docs(summarizer): update architecture doc to reflect current implementation - Remove references to old L1-L4/STANDARD/DETAILED/HIERARCHICAL levels - Remove HierarchicalSummary and ChunkSummary (no longer exist) - Update storage format to show single summary entry - Add new section on limitations and trade-offs - Simplify error handling section - Add data models section with current code --- docs/architecture/summarizer.md | 231 ++++++++++++++++++-------------- 1 file changed, 128 insertions(+), 103 deletions(-) diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index c34540bc..43caf336 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -37,7 +37,7 @@ LangChain's approach to document summarization uses a simple algorithm: 1. 
**Map phase:** Split content into chunks, summarize each in parallel 2. **Reduce phase:** If combined summaries exceed `token_max`, recursively collapse until they fit -Key insight: No need for predetermined L1/L2/L3 levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. +Key insight: No need for predetermined levels. Dynamic depth based on actual content length. LangChain's default `token_max=3000`. ### 2.2 Borrowed: Chunk Size (BOOOOKSCORE) @@ -51,25 +51,13 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T **Reference:** arXiv:2504.19413 -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to both files and vector DB. +Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage. -### 2.4 Not Directly Borrowed: Letta's Approach - -**Reference:** arXiv:2310.08560 - -Letta (MemGPT) uses a different paradigm focused on **context window management**: -- Message count thresholds (e.g., 10 messages), not token thresholds -- 30% partial eviction when buffer overflows -- Purpose: fit conversation in LLM context window - -Our system has a different purpose (memory compression for storage/retrieval), so our implementation differs significantly. - -### 2.5 Original Design (Not Research-Backed) +### 2.4 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/map-reduce were chosen heuristically. -- **L2 group logic for storage:** The intermediate summaries stored as "L2" is for backward compatibility with the storage layer. +- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically. - **Content-type prompts:** Domain-specific prompts are original design. --- @@ -78,51 +66,39 @@ The following aspects are **original design choices without direct research just ### 3.1 Map-Reduce with Dynamic Collapse -**Decision:** Use LangChain-style map-reduce instead of fixed L1/L2/L3 levels. +**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy. **Rationale:** -- **Simpler algorithm:** No need to distinguish STANDARD/DETAILED/HIERARCHICAL. +- **Simpler algorithm:** Single code path handles all content sizes. - **Dynamic depth:** Collapse depth adapts to actual content length. - **Research-backed:** LangChain's approach is battle-tested. 
**Algorithm:** ```python -def map_reduce_summarize(content, token_max=3000): - if tokens(content) <= token_max: - return summarize_directly(content) - +async def map_reduce_summarize(content, config): # Map: Split and summarize chunks in parallel - chunks = split_into_chunks(content, chunk_size=2048) - summaries = [summarize(chunk) for chunk in chunks] + chunks = chunk_text(content, chunk_size=2048) + summaries = await parallel_summarize(chunks) - # Reduce: Recursively collapse until fits - while total_tokens(summaries) > token_max: - groups = group_summaries_by_token_max(summaries, token_max) - summaries = [synthesize(group) for group in groups] + # Reduce: Recursively collapse until fits token_max + while total_tokens(summaries) > config.token_max: + groups = group_by_token_limit(summaries, config.token_max) + summaries = await parallel_synthesize(groups) return final_synthesis(summaries) ``` -### 3.2 Token-Based Level Selection (Simplified) - -**Decision:** Use three effective levels instead of five. +### 3.2 Three-Level Strategy -**Rationale:** - -- **Simplicity:** Fewer code paths, easier to understand. -- **Dynamic instead of fixed:** Map-reduce adapts to content, no need for DETAILED vs HIERARCHICAL distinction. - -**Effective Levels:** +**Decision:** Use three levels based on token count. | Level | Token Range | Strategy | | :--- | :--- | :--- | | NONE | < 100 | No summarization needed | | BRIEF | 100-500 | Single sentence | -| MAP_REDUCE | > 500 | Dynamic collapse until fits token_max | - -**Backward Compatibility:** The output still reports STANDARD, DETAILED, or HIERARCHICAL based on collapse depth for storage compatibility. +| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max | ### 3.3 Research-Backed Defaults @@ -140,15 +116,15 @@ def map_reduce_summarize(content, token_max=3000): **Rationale:** -- **Coherence preservation:** Splitting mid-sentence or mid-thought loses context and produces poor summaries. -- **Natural units:** Paragraphs and sentences are natural semantic units that humans use to organize thoughts. +- **Coherence preservation:** Splitting mid-sentence loses context. +- **Natural units:** Paragraphs and sentences are natural semantic units. - **Overlap for continuity:** The 200-token overlap ensures concepts spanning chunk boundaries aren't lost. **Fallback chain:** 1. Prefer paragraph boundaries (double newlines) 2. Fall back to sentence boundaries (`.!?` followed by space + capital) -3. Final fallback to character splitting for edge cases (e.g., code blocks without punctuation) +3. Final fallback to word-based splitting ### 3.5 Content-Type Aware Prompts @@ -156,35 +132,27 @@ def map_reduce_summarize(content, token_max=3000): **Rationale:** -- **Conversations:** Focus on user preferences, decisions, action items—what the user wants and what was agreed. -- **Journals:** Emphasize personal insights, emotional context, growth patterns—the subjective experience. -- **Documents:** Prioritize key findings, methodology, conclusions—the objective content. +- **Conversations:** Focus on user preferences, decisions, action items. +- **Journals:** Emphasize personal insights, emotional context, growth patterns. +- **Documents:** Prioritize key findings, methodology, conclusions. -A generic summarization prompt loses domain-specific signal. By tailoring prompts, we extract what matters for each use case. +A generic summarization prompt loses domain-specific signal. 
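As a minimal sketch of the prompt-selection idea (illustrative only: the `select_prompt` helper and template strings below are hypothetical, not the contents of `_prompts.py`):

```python
# Hypothetical illustration of content-type dispatch; the real templates live
# in agent_cli/summarizer/_prompts.py and are worded differently.
CONVERSATION_PROMPT = (
    "Summarize this conversation in at most {max_words} words. "
    "Keep user preferences, decisions, and action items.\n\n{content}"
)
JOURNAL_PROMPT = (
    "Summarize this journal entry in at most {max_words} words. "
    "Keep personal insights, emotional context, and recurring patterns.\n\n{content}"
)
DOCUMENT_PROMPT = (
    "Summarize this document in at most {max_words} words. "
    "Keep key findings, methodology, and conclusions.\n\n{content}"
)


def select_prompt(content_type: str, content: str, max_words: int) -> str:
    """Pick a template by content type, defaulting to the document prompt."""
    template = {
        "conversation": CONVERSATION_PROMPT,
        "journal": JOURNAL_PROMPT,
        "document": DOCUMENT_PROMPT,
    }.get(content_type, DOCUMENT_PROMPT)
    return template.format(content=content, max_words=max_words)
```

The actual code routes through `get_prompt_for_content_type` (see `adaptive.py`), which also accounts for the prior summary and the word budget derived from the token estimate.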
### 3.6 Prior Summary Integration -**Decision:** Always provide the previous summary as context when generating updates. +**Decision:** Provide the previous summary as context when generating updates. **Rationale:** -- **Continuity:** New summaries should build on existing context, not start fresh each time. -- **Incremental updates:** Avoid re-summarizing all historical content on every update. -- **Information preservation:** Important information from earlier content persists through the chain of summaries. - -The L3 summary from the previous run becomes prior context for the next summarization, allowing information to flow forward through time. +- **Continuity:** New summaries build on existing context. +- **Incremental updates:** Avoid re-summarizing all historical content. +- **Information preservation:** Important information persists through the chain. ### 3.7 Compression Ratio Tracking **Decision:** Track and report compression metrics for every summary. -**Rationale:** - -- **Transparency:** Users can understand how much information was compressed. -- **Quality monitoring:** Unusual ratios (e.g., output longer than input) may indicate summarization issues. -- **Optimization:** Metrics inform future threshold tuning and quality assessment. - -Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression_ratio` for observability. +Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_ratio`, and `collapse_depth` for observability. --- @@ -192,10 +160,10 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, and `compression ### 4.1 Level Selection -The entry point counts tokens and selects strategy: +The entry point (`summarize()`) counts tokens and selects strategy: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Determines if NONE, BRIEF, or map-reduce. +2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE. 3. **Strategy dispatch:** Calls appropriate handler. ### 4.2 Brief Level @@ -203,55 +171,118 @@ The entry point counts tokens and selects strategy: For short content (100-500 tokens): - Single LLM call with brief prompt -- Returns simple `SummaryResult` with no hierarchical structure +- Returns `SummaryResult` with single-sentence summary ### 4.3 Map-Reduce Level -For longer content (> 500 tokens): +For longer content (>= 500 tokens): 1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. 2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. 3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. 4. **Final synthesis:** Combine remaining summaries into final output. -The parallelism in the map phase provides significant speedup for long content while maintaining semantic coherence through the collapse process. +The `collapse_depth` field in the result indicates how many reduce iterations were needed. --- -## 5. Integration with Memory System +## 5. 
Data Models + +### 5.1 SummaryLevel + +```python +class SummaryLevel(IntEnum): + NONE = 0 # < 100 tokens + BRIEF = 1 # 100-500 tokens + MAP_REDUCE = 2 # >= 500 tokens +``` + +### 5.2 SummaryResult -### 5.1 Write Path +```python +class SummaryResult(BaseModel): + level: SummaryLevel + summary: str | None + input_tokens: int + output_tokens: int + compression_ratio: float + collapse_depth: int # 0 = no collapse needed + created_at: datetime +``` + +### 5.3 SummarizerConfig + +```python +@dataclass +class SummarizerConfig: + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE + token_max: int = 3000 # LangChain + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 +``` + +--- + +## 6. Integration with Memory System + +### 6.1 Write Path The memory system triggers summarization during post-processing: -1. Collect raw conversation turns (user message + assistant message) -2. Retrieve existing L3 summary as prior context +1. Collect raw conversation turns +2. Retrieve existing summary as prior context 3. Call summarizer with content + prior summary + content type -4. Persist results: delete old summaries, write new files, upsert to ChromaDB +4. Persist result to storage -### 5.2 Read Path +### 6.2 Read Path The memory retrieval system uses summaries for context injection: -- Fetches L3 (final) summary for the conversation -- Injects as prefix to retrieved memories in the prompt -- Provides high-level context that individual memory snippets lack +- Fetches summary for the conversation +- Injects as prefix to retrieved memories +- Provides high-level context that individual snippets lack + +### 6.3 Storage + +Summaries are stored with metadata: -### 5.3 Storage +```python +{ + "id": "{conversation_id}:summary", + "content": summary_text, + "metadata": { + "conversation_id": conversation_id, + "role": "summary", + "summary_level": "MAP_REDUCE", + "input_tokens": 1500, + "output_tokens": 150, + "compression_ratio": 0.1, + "collapse_depth": 1, + "created_at": "2024-01-15T10:30:00Z", + }, +} +``` + +--- + +## 7. Error Handling -Summaries are persisted in two places: +Summarization follows a fail-fast philosophy: -- **Files:** Markdown with YAML front matter under `summaries/L1/`, `L2/`, `L3/` directories. Human-readable, git-trackable. -- **ChromaDB:** Vector embeddings for semantic search. Metadata includes level, compression metrics, timestamps. +- **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors). +- **Empty input:** Returns NONE level immediately (not an error). +- **Encoding errors:** Falls back to character-based token estimation. +- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. -For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 structure: -- First collapse level → L1 (chunk summaries) -- Intermediate levels → L2 (grouped summaries) -- Final output → L3 (synthesis) +The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation. --- -## 6. Configuration +## 8. Configuration | Parameter | Default | Source | | :--- | :--- | :--- | @@ -263,34 +294,28 @@ For backward compatibility, the dynamic collapse levels are mapped to L1/L2/L3 s --- -## 7. Error Handling +## 9. 
Limitations and Trade-offs -Summarization follows a fail-fast philosophy: +### 9.1 Fact Preservation -- **LLM errors:** Propagated as `SummarizationError` or `MapReduceSummarizationError` rather than silently returning empty results. -- **Empty input:** Returns NONE level immediately (not an error). -- **Encoding errors:** Falls back to character-based token estimation. -- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. +Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval: -The caller (memory system) decides how to handle failures—typically by proceeding without a summary rather than blocking the entire write path. +- Store original content alongside summaries +- Use fact extraction instead of summarization +- Use RAG to retrieve original chunks ---- +### 9.2 No Intermediate Summaries -## 8. Comparison: Old vs New Approach +Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access. -| Aspect | Old Approach | New Approach | -| :--- | :--- | :--- | -| Levels | 5 fixed (NONE/BRIEF/STANDARD/DETAILED/HIERARCHICAL) | 3 effective (NONE/BRIEF/MAP_REDUCE) | -| Hierarchy | Fixed L1/L2/L3 structure | Dynamic collapse depth | -| Chunk size | 3000 tokens | 2048 tokens (BOOOOKSCORE) | -| token_max | N/A (fixed levels) | 3000 (LangChain) | -| Complexity | Multiple code paths | Single map-reduce algorithm | -| Research basis | Heuristic | LangChain + BOOOOKSCORE | +### 9.3 Fixed Thresholds + +The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages. --- -## 9. Future Improvements +## 10. Future Improvements 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation -2. **Add incremental updating mode** as alternative to hierarchical merging for larger context models -3. **Tune token thresholds empirically** with real-world content +2. **Tune token thresholds empirically** with real-world content +3. **Add fact extraction mode** for use cases requiring specific detail preservation From e0262f49f32b2bea17ea2e7863adb8e67da2bb24 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:29:15 -0800 Subject: [PATCH 31/38] docs: update memory.md for 3-level summarizer Remove outdated references to 5-level hierarchy (STANDARD, DETAILED, HIERARCHICAL) and L1/L2/L3 storage structure. Update to reflect current 3-level system (NONE, BRIEF, MAP_REDUCE) with single final summary. 
Also fix prompt names to match actual implementation: - BRIEF_SUMMARY_PROMPT, STANDARD_SUMMARY_PROMPT - CHUNK_SUMMARY_PROMPT, META_SUMMARY_PROMPT - Remove non-existent ROLLING_PROMPT --- docs/architecture/memory.md | 38 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index e2f3127d..6cd05323 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -59,13 +59,7 @@ entries/ assistant/ __.md # Raw assistant responses summaries/ - L1/ - chunk_0.md # Level 1: Individual chunk summaries - chunk_1.md - L2/ - group_0.md # Level 2: Group summaries (groups of ~5 L1s) - L3/ - final.md # Level 3: Final synthesized summary + __summary.md # Single final summary (map-reduce collapses to one) ``` **Deleted Directory Structure (Soft Deletes):** @@ -77,7 +71,7 @@ entries/ facts/ __.md summaries/ - L1/, L2/, L3/ # Tombstoned summary levels + __summary.md # Tombstoned summary ``` ### 2.2 File Format @@ -171,17 +165,16 @@ Resolves contradictions using a "Search-Decide-Update" loop with complete enumer * **Updates:** Implemented as delete + add with a fresh ID; tombstones record `replaced_by`. * **Deletes:** Soft-deletes files (moved under `deleted/`) and removes from Chroma. -### 4.4 Summarization (Adaptive Hierarchical) +### 4.4 Summarization (Adaptive Map-Reduce) Uses the `agent_cli.summarizer` module for research-backed adaptive summarization. -* **Level Selection:** Automatically determines summarization depth based on token count: +* **Level Selection:** Automatically determines summarization strategy based on token count: * `NONE` (< 100 tokens): No summary needed, facts only. - * `BRIEF` (100-500 tokens): Single-sentence summary (~20% compression). - * `STANDARD` (500-3000 tokens): Paragraph summary (~12% compression). - * `DETAILED` (3000-15000 tokens): Chunked summaries + meta-summary (~7% compression). - * `HIERARCHICAL` (> 15000 tokens): Full L1/L2/L3 tree structure. -* **Input:** Previous L3 summary (if any) + newly extracted facts. -* **Persistence:** Stores summaries in `summaries/L1/`, `L2/`, `L3/` subdirectories with YAML front matter containing compression metrics. + * `BRIEF` (100-500 tokens): Single-sentence summary. + * `MAP_REDUCE` (>= 500 tokens): Dynamic collapse using map-reduce with content-type aware prompts. +* **Algorithm:** LangChain-inspired map-reduce that recursively collapses until content fits token_max (3000). +* **Input:** Previous summary (if any) + newly extracted facts. +* **Persistence:** Stores single final summary in `summaries/` directory with YAML front matter containing compression metrics. * **See:** `docs/architecture/summarizer.md` for detailed algorithm specification. ### 4.5 Eviction @@ -213,13 +206,12 @@ To replicate the system behavior, the following prompt strategies are required. * **Output constraints:** JSON list containing all memories; each existing memory must have an event; new unrelated facts must be ADDed; no prose or code fences. ### 5.3 Summarization (Adaptive Prompts) -The summarizer uses level-specific prompts from `agent_cli.summarizer._prompts`: -* **`BRIEF_PROMPT`:** Single-sentence distillation for short content. -* **`STANDARD_PROMPT`:** Paragraph summary with prior context integration. -* **`CHUNK_PROMPT`:** Individual chunk summarization for hierarchical processing. -* **`META_PROMPT`:** Synthesizes multiple chunk summaries into cohesive narrative. 
-* **`ROLLING_PROMPT`:** Integrates new facts with existing summary. -* **Content-type variants:** `CONVERSATION_PROMPT`, `JOURNAL_PROMPT`, `DOCUMENT_PROMPT` for domain-specific summarization. +The summarizer uses prompts from `agent_cli.summarizer._prompts`: +* **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens). +* **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). +* **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase. +* **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase. +* **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization. --- From ca33813652ece474c0684c05177fad9e66e9f1dd Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 10:42:46 -0800 Subject: [PATCH 32/38] refactor(summarizer): rename STANDARD_SUMMARY_PROMPT to GENERAL_SUMMARY_PROMPT The prompt name "STANDARD" was a leftover from the old 5-level system which had a STANDARD SummaryLevel. Since that level no longer exists (now just NONE, BRIEF, MAP_REDUCE), rename to GENERAL_SUMMARY_PROMPT to match its actual purpose as the "general" content type prompt. --- agent_cli/summarizer/_prompts.py | 8 ++++---- docs/architecture/memory.md | 2 +- tests/summarizer/test_prompts.py | 32 ++++++++++++++++---------------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 1de5fa44..476cb408 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -13,8 +13,8 @@ One-sentence summary:""".strip() -# MAP_REDUCE level - Paragraph summary for content-type aware summarization -STANDARD_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. +# MAP_REDUCE level - Paragraph summary for general content type +GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. Focus on: - Key facts, decisions, and outcomes @@ -104,12 +104,12 @@ def get_prompt_for_content_type(content_type: str) -> str: """ prompts = { - "general": STANDARD_SUMMARY_PROMPT, + "general": GENERAL_SUMMARY_PROMPT, "conversation": CONVERSATION_SUMMARY_PROMPT, "journal": JOURNAL_SUMMARY_PROMPT, "document": DOCUMENT_SUMMARY_PROMPT, } - return prompts.get(content_type, STANDARD_SUMMARY_PROMPT) + return prompts.get(content_type, GENERAL_SUMMARY_PROMPT) def format_prior_context(prior_summary: str | None) -> str: diff --git a/docs/architecture/memory.md b/docs/architecture/memory.md index 6cd05323..2b2ab4a2 100644 --- a/docs/architecture/memory.md +++ b/docs/architecture/memory.md @@ -208,7 +208,7 @@ To replicate the system behavior, the following prompt strategies are required. ### 5.3 Summarization (Adaptive Prompts) The summarizer uses prompts from `agent_cli.summarizer._prompts`: * **`BRIEF_SUMMARY_PROMPT`:** Single-sentence distillation for short content (100-500 tokens). -* **`STANDARD_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). +* **`GENERAL_SUMMARY_PROMPT`:** Paragraph summary with prior context integration (general content). * **`CHUNK_SUMMARY_PROMPT`:** Individual chunk summarization for map phase. * **`META_SUMMARY_PROMPT`:** Synthesizes multiple chunk summaries in reduce phase. * **Content-type variants:** `CONVERSATION_SUMMARY_PROMPT`, `JOURNAL_SUMMARY_PROMPT`, `DOCUMENT_SUMMARY_PROMPT` for domain-specific summarization. 
diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index 66022970..ef05ebad 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -7,9 +7,9 @@ CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, DOCUMENT_SUMMARY_PROMPT, + GENERAL_SUMMARY_PROMPT, JOURNAL_SUMMARY_PROMPT, META_SUMMARY_PROMPT, - STANDARD_SUMMARY_PROMPT, format_prior_context, format_summaries_for_meta, get_prompt_for_content_type, @@ -26,13 +26,13 @@ def test_brief_prompt_has_content_placeholder(self) -> None: result = BRIEF_SUMMARY_PROMPT.format(content="Test content") assert "Test content" in result - def test_standard_prompt_has_placeholders(self) -> None: - """Test STANDARD prompt contains required placeholders.""" - assert "{content}" in STANDARD_SUMMARY_PROMPT - assert "{prior_context}" in STANDARD_SUMMARY_PROMPT - assert "{max_words}" in STANDARD_SUMMARY_PROMPT + def test_general_prompt_has_placeholders(self) -> None: + """Test GENERAL prompt contains required placeholders.""" + assert "{content}" in GENERAL_SUMMARY_PROMPT + assert "{prior_context}" in GENERAL_SUMMARY_PROMPT + assert "{max_words}" in GENERAL_SUMMARY_PROMPT - result = STANDARD_SUMMARY_PROMPT.format( + result = GENERAL_SUMMARY_PROMPT.format( content="Main content", prior_context="Previous context", max_words=100, @@ -92,10 +92,10 @@ def test_document_prompt_has_placeholders(self) -> None: class TestGetPromptForContentType: """Tests for get_prompt_for_content_type function.""" - def test_general_returns_standard(self) -> None: - """Test general content type returns standard prompt.""" + def test_general_returns_general(self) -> None: + """Test general content type returns general prompt.""" prompt = get_prompt_for_content_type("general") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT def test_conversation_returns_conversation(self) -> None: """Test conversation content type returns conversation prompt.""" @@ -112,15 +112,15 @@ def test_document_returns_document(self) -> None: prompt = get_prompt_for_content_type("document") assert prompt == DOCUMENT_SUMMARY_PROMPT - def test_unknown_returns_standard(self) -> None: - """Test unknown content type falls back to standard.""" + def test_unknown_returns_general(self) -> None: + """Test unknown content type falls back to general.""" prompt = get_prompt_for_content_type("unknown_type") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT - def test_empty_returns_standard(self) -> None: - """Test empty string falls back to standard.""" + def test_empty_returns_general(self) -> None: + """Test empty string falls back to general.""" prompt = get_prompt_for_content_type("") - assert prompt == STANDARD_SUMMARY_PROMPT + assert prompt == GENERAL_SUMMARY_PROMPT class TestFormatPriorContext: From ee4fea6d8783231ee89f0d1d5a7b8b109f9bdc8f Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Thu, 27 Nov 2025 11:12:27 -0800 Subject: [PATCH 33/38] docs: clarify prompt comments to avoid confusion with level names --- agent_cli/summarizer/_prompts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index 476cb408..de59f940 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,7 +4,7 @@ and are optimized for structured, factual output. 
""" -# BRIEF level - Single sentence summary for short content (100-500 tokens) +# Single sentence summary for short content (used at BRIEF level, 100-500 tokens) BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). Focus on the single most important point or takeaway. @@ -13,7 +13,7 @@ One-sentence summary:""".strip() -# MAP_REDUCE level - Paragraph summary for general content type +# Paragraph summary for "general" content type (default when no specific type provided) GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. Focus on: From 5a26f018f294258043017a6a9de88ab16710f3f2 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 28 Nov 2025 22:49:27 -0800 Subject: [PATCH 34/38] Chunk memories --- agent_cli/core/chroma.py | 20 ++++++++++++++++++-- agent_cli/rag/client.py | 13 +++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/agent_cli/core/chroma.py b/agent_cli/core/chroma.py index 56d54ede..89e289b7 100644 --- a/agent_cli/core/chroma.py +++ b/agent_cli/core/chroma.py @@ -53,12 +53,28 @@ def upsert( ids: list[str], documents: list[str], metadatas: Sequence[BaseModel], + batch_size: int = 10, ) -> None: - """Upsert documents with JSON-serialized metadata.""" + """Upsert documents with JSON-serialized metadata. + + Args: + collection: ChromaDB collection. + ids: Document IDs. + documents: Document contents. + metadatas: Pydantic metadata models. + batch_size: Max documents per embedding API call (default: 10). + + """ if not ids: return serialized = flatten_metadatas(metadatas) - collection.upsert(ids=ids, documents=documents, metadatas=serialized) + + # Process in batches to avoid overwhelming the embedding service + for i in range(0, len(ids), batch_size): + batch_ids = ids[i : i + batch_size] + batch_docs = documents[i : i + batch_size] + batch_metas = serialized[i : i + batch_size] + collection.upsert(ids=batch_ids, documents=batch_docs, metadatas=batch_metas) def delete(collection: Collection, ids: list[str]) -> None: diff --git a/agent_cli/rag/client.py b/agent_cli/rag/client.py index 3e43939a..940985de 100644 --- a/agent_cli/rag/client.py +++ b/agent_cli/rag/client.py @@ -124,8 +124,17 @@ def add( for i in range(len(chunks)) ] - # Upsert to collection - self.collection.upsert(ids=ids, documents=chunks, metadatas=metadatas) + # Upsert to collection in batches to avoid overwhelming the embedding service + batch_size = 10 + for i in range(0, len(ids), batch_size): + batch_ids = ids[i : i + batch_size] + batch_docs = chunks[i : i + batch_size] + batch_metas = metadatas[i : i + batch_size] + self.collection.upsert( + ids=batch_ids, + documents=batch_docs, + metadatas=batch_metas, + ) logger.info("Added doc_id=%s with %d chunks", doc_id, len(chunks)) return doc_id From 2a8085a2e49c03fdf3b09dad18b35c40b46c6dd4 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 28 Nov 2025 23:12:38 -0800 Subject: [PATCH 35/38] refactor(summarizer): remove dead code and reorganize models - Remove unused `middle_truncate()` function and its tests - Remove unused `MapReduceSummarizationError` exception class - Move `SummarizerConfig` and `SummarizationError` from _utils.py to models.py This groups all exported types in models.py and keeps _utils.py focused on actual utility functions (token counting, chunking, LLM calls). 
Net: -96 lines --- agent_cli/summarizer/__init__.py | 9 +++- agent_cli/summarizer/_utils.py | 87 +----------------------------- agent_cli/summarizer/adaptive.py | 4 +- agent_cli/summarizer/map_reduce.py | 8 +-- agent_cli/summarizer/models.py | 36 +++++++++++++ tests/summarizer/test_utils.py | 46 ---------------- 6 files changed, 47 insertions(+), 143 deletions(-) diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index af977ada..daf0e2bc 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -21,8 +21,13 @@ """ -from agent_cli.summarizer.adaptive import SummarizationError, SummarizerConfig, summarize -from agent_cli.summarizer.models import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import summarize +from agent_cli.summarizer.models import ( + SummarizationError, + SummarizerConfig, + SummaryLevel, + SummaryResult, +) __all__ = [ "SummarizationError", diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 078e21ed..23c8dd19 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -3,13 +3,12 @@ from __future__ import annotations import re -from dataclasses import dataclass from functools import lru_cache from typing import TYPE_CHECKING from pydantic import BaseModel -from agent_cli.summarizer.models import SummaryLevel +from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel if TYPE_CHECKING: import tiktoken @@ -21,41 +20,6 @@ class SummaryOutput(BaseModel): summary: str -class SummarizationError(Exception): - """Raised when summarization fails after all retries.""" - - -@dataclass -class SummarizerConfig: - """Configuration for summarization operations. - - Example: - config = SummarizerConfig( - openai_base_url="http://localhost:8000/v1", - model="llama3.1:8b", - ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}") - print(f"Compression: {result.compression_ratio:.1%}") - - """ - - openai_base_url: str - model: str - api_key: str | None = None - chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse - chunk_overlap: int = 200 - max_concurrent_chunks: int = 5 - timeout: float = 60.0 - - def __post_init__(self) -> None: - """Normalize the base URL.""" - self.openai_base_url = self.openai_base_url.rstrip("/") - if self.api_key is None: - self.api_key = "not-needed" - - async def generate_summary( prompt: str, config: SummarizerConfig, @@ -266,55 +230,6 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: return " ".join(overlap_parts) -def middle_truncate( - text: str, - budget_chars: int, - head_frac: float = 0.3, - tail_frac: float = 0.3, -) -> tuple[str, int]: - """Middle-truncate text to fit within a character budget. - - Keeps the first head_frac and last tail_frac portions, dropping the middle. - This preserves context from both the beginning (often contains setup) and - end (often contains conclusions/recent events). - - Inspired by Letta's `middle_truncate_text` function. - - Args: - text: Text to truncate. - budget_chars: Maximum character count for output. - head_frac: Fraction of budget for the head portion. - tail_frac: Fraction of budget for the tail portion. - - Returns: - Tuple of (truncated_text, dropped_char_count). 
- - """ - if budget_chars <= 0 or len(text) <= budget_chars: - return text, 0 - - head_len = max(0, int(budget_chars * head_frac)) - tail_len = max(0, int(budget_chars * tail_frac)) - - # Ensure head + tail doesn't exceed budget - if head_len + tail_len > budget_chars: - tail_len = max(0, budget_chars - head_len) - - head = text[:head_len] - tail = text[-tail_len:] if tail_len > 0 else "" - dropped = max(0, len(text) - (len(head) + len(tail))) - - marker = f"\n[...{dropped} characters truncated...]\n" - - # If marker would overflow budget, shrink tail - available_for_marker = budget_chars - (len(head) + len(tail)) - if available_for_marker < len(marker): - over = len(marker) - available_for_marker - tail = tail[:-over] if over < len(tail) else "" - - return head + marker + tail, dropped - - def estimate_summary_tokens(input_tokens: int, level: int) -> int: """Estimate target summary tokens based on input size and level.""" if level == SummaryLevel.NONE: diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index 640c52e6..f242b662 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -26,8 +26,6 @@ get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( - SummarizationError, - SummarizerConfig, count_tokens, estimate_summary_tokens, generate_summary, @@ -35,6 +33,8 @@ ) from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( + SummarizationError, + SummarizerConfig, SummaryLevel, SummaryResult, ) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 07332c1c..3dd81aa4 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -25,23 +25,17 @@ format_summaries_for_meta, ) from agent_cli.summarizer._utils import ( - SummarizationError, - SummarizerConfig, chunk_text, count_tokens, estimate_summary_tokens, generate_summary, tokens_to_words, ) -from agent_cli.summarizer.models import SummaryLevel +from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel logger = logging.getLogger(__name__) -class MapReduceSummarizationError(SummarizationError): - """Raised when map-reduce summarization fails.""" - - @dataclass class MapReduceResult: """Result of map-reduce summarization. diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index be0d309b..14be0c86 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -2,6 +2,7 @@ from __future__ import annotations +from dataclasses import dataclass from datetime import UTC, datetime from enum import IntEnum from typing import Any @@ -9,6 +10,41 @@ from pydantic import BaseModel, Field +class SummarizationError(Exception): + """Raised when summarization fails after all retries.""" + + +@dataclass +class SummarizerConfig: + """Configuration for summarization operations. 
+ + Example: + config = SummarizerConfig( + openai_base_url="http://localhost:8000/v1", + model="llama3.1:8b", + ) + result = await summarize(long_document, config) + print(f"Level: {result.level.name}") + print(f"Compression: {result.compression_ratio:.1%}") + + """ + + openai_base_url: str + model: str + api_key: str | None = None + chunk_size: int = 2048 # BOOOOKSCORE's tested default + token_max: int = 3000 # LangChain's default - when to collapse + chunk_overlap: int = 200 + max_concurrent_chunks: int = 5 + timeout: float = 60.0 + + def __post_init__(self) -> None: + """Normalize the base URL.""" + self.openai_base_url = self.openai_base_url.rstrip("/") + if self.api_key is None: + self.api_key = "not-needed" + + class SummaryLevel(IntEnum): """Summary strategy based on input length.""" diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 2621b158..188a7917 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -6,7 +6,6 @@ chunk_text, count_tokens, estimate_summary_tokens, - middle_truncate, tokens_to_words, ) @@ -86,51 +85,6 @@ def test_large_paragraph_sentence_split(self) -> None: assert len(chunks) > 1 -class TestMiddleTruncate: - """Tests for middle_truncate function.""" - - def test_no_truncation_needed(self) -> None: - """Test that short text is not truncated.""" - text = "Short text" - result, dropped = middle_truncate(text, budget_chars=100) - assert result == text - assert dropped == 0 - - def test_basic_truncation(self) -> None: - """Test basic middle truncation.""" - text = "A" * 100 # 100 character string - result, dropped = middle_truncate(text, budget_chars=50) - - # Should have head + marker + tail - assert len(result) <= 50 + 50 # Allow for marker - assert dropped > 0 - assert "[..." in result - assert "truncated...]" in result - - def test_head_tail_fractions(self) -> None: - """Test custom head/tail fractions.""" - text = "AAAAA" + "BBBBB" * 20 + "CCCCC" - result, dropped = middle_truncate(text, budget_chars=30, head_frac=0.5, tail_frac=0.5) - - # Should preserve beginning (A's) and end (C's) - assert result.startswith("A") - assert dropped > 0 - - def test_zero_budget(self) -> None: - """Test with zero budget returns original.""" - text = "Some text" - result, dropped = middle_truncate(text, budget_chars=0) - assert result == text - assert dropped == 0 - - def test_negative_budget(self) -> None: - """Test with negative budget returns original.""" - text = "Some text" - result, dropped = middle_truncate(text, budget_chars=-10) - assert result == text - assert dropped == 0 - - class TestEstimateSummaryTokens: """Tests for estimate_summary_tokens function.""" From 39a77039ddf887413c427ed1a8096dbbe82606be Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Fri, 28 Nov 2025 23:53:34 -0800 Subject: [PATCH 36/38] refactor(memory): remove defensive code for impossible UPDATE/DELETE cases The output_validator already ensures MemoryUpdate and MemoryDelete IDs are valid. Defensive handling of unknown IDs obscures the contract and could hide real bugs. Now uses direct indexing which will raise KeyError if the validator ever fails. 
--- agent_cli/memory/_ingest.py | 44 +++++++++++++------------------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index 2bed16d9..e699674c 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -122,38 +122,24 @@ def process_reconciliation_decisions( ), ) elif isinstance(dec, MemoryUpdate): - orig = id_map.get(dec.id) text = dec.text.strip() if text: - if orig: - # Update existing memory: delete old, add new - new_id = str(uuid4()) - to_delete.append(orig) - to_add.append( - Fact( - id=new_id, - conversation_id=conversation_id, - content=text, - source_id=source_id, - created_at=created_at, - ), - ) - replacement_map[orig] = new_id - else: - # UPDATE with unknown ID = treat as ADD (model used wrong event) - to_add.append( - Fact( - id=str(uuid4()), - conversation_id=conversation_id, - content=text, - source_id=source_id, - created_at=created_at, - ), - ) - elif isinstance(dec, MemoryDelete): - orig = id_map.get(dec.id) - if orig: + # Update existing memory: delete old, add new + orig = id_map[dec.id] # Guaranteed valid by output_validator + new_id = str(uuid4()) to_delete.append(orig) + to_add.append( + Fact( + id=new_id, + conversation_id=conversation_id, + content=text, + source_id=source_id, + created_at=created_at, + ), + ) + replacement_map[orig] = new_id + elif isinstance(dec, MemoryDelete): + to_delete.append(id_map[dec.id]) # Guaranteed valid by output_validator elif isinstance(dec, MemoryIgnore): pass # NONE ignored return to_add, to_delete, replacement_map From 9cae5b0321eb2c96148580bb8425d84de4b5ddec Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 3 Dec 2025 20:10:28 -0800 Subject: [PATCH 37/38] refactor(summarizer): simplify API with target_tokens/target_ratio parameters Remove SummaryLevel enum and three-level strategy in favor of a simple "fits target? return as-is : map-reduce" approach. This reduces complexity while maintaining full functionality. 
Changes: - Remove SummaryLevel enum (NONE/BRIEF/MAP_REDUCE) - Add target_tokens parameter for absolute token limit - Add target_ratio parameter for relative compression (e.g., 0.2 = 20%) - Simplify estimate_summary_tokens to use ~10% compression ratio - Update memory integration to use compression_ratio in logging - Rewrite examples and tests for new API - Update architecture documentation Net reduction: ~165 lines of code --- agent_cli/agents/summarize.py | 5 +- agent_cli/memory/_ingest.py | 4 +- agent_cli/memory/_persistence.py | 6 +- agent_cli/memory/models.py | 2 +- agent_cli/summarizer/__init__.py | 18 ++- agent_cli/summarizer/_utils.py | 14 +- agent_cli/summarizer/adaptive.py | 138 +++++++------------ agent_cli/summarizer/map_reduce.py | 53 +++++--- agent_cli/summarizer/models.py | 26 +--- docs/architecture/summarizer.md | 195 ++++++++++++++++----------- examples/summarizer_demo.py | 164 ++++++++++------------ tests/memory/test_engine.py | 4 +- tests/memory/test_git_integration.py | 3 +- tests/summarizer/test_adaptive.py | 168 +++++++---------------- tests/summarizer/test_integration.py | 34 +---- tests/summarizer/test_models.py | 60 ++------- tests/summarizer/test_utils.py | 48 +++---- 17 files changed, 388 insertions(+), 554 deletions(-) diff --git a/agent_cli/agents/summarize.py b/agent_cli/agents/summarize.py index ec516310..ecfd1e05 100644 --- a/agent_cli/agents/summarize.py +++ b/agent_cli/agents/summarize.py @@ -115,7 +115,7 @@ def _display_result( elif result.summary: print_output_panel( result.summary, - title=f"Summary (Level: {result.level.name})", + title="Summary", subtitle=f"[dim]{result.output_tokens:,} tokens | {result.compression_ratio:.1%} of original | {elapsed:.2f}s[/dim]", ) else: @@ -139,7 +139,6 @@ def _display_full_result( console.print() console.print("[bold cyan]Summarization Result[/bold cyan]") - console.print(f" Level: [bold]{result.level.name}[/bold]") console.print(f" Input tokens: [bold]{result.input_tokens:,}[/bold]") console.print(f" Output tokens: [bold]{result.output_tokens:,}[/bold]") console.print(f" Compression: [bold]{result.compression_ratio:.1%}[/bold]") @@ -151,7 +150,7 @@ def _display_full_result( if result.summary: print_output_panel( result.summary, - title=f"Summary ({result.level.name})", + title="Summary", ) diff --git a/agent_cli/memory/_ingest.py b/agent_cli/memory/_ingest.py index e699674c..b98bee13 100644 --- a/agent_cli/memory/_ingest.py +++ b/agent_cli/memory/_ingest.py @@ -431,10 +431,10 @@ async def extract_and_store_facts_and_summaries( model=model, ) LOGGER.info( - "Summary update completed in %.1f ms (conversation=%s, level=%s)", + "Summary update completed in %.1f ms (conversation=%s, compression=%.1f%%)", _elapsed_ms(summary_start), conversation_id, - summary_result.level.name, + summary_result.compression_ratio * 100, ) if summary_result.summary: await store_adaptive_summary( diff --git a/agent_cli/memory/_persistence.py b/agent_cli/memory/_persistence.py index a7e3871e..46ac0363 100644 --- a/agent_cli/memory/_persistence.py +++ b/agent_cli/memory/_persistence.py @@ -191,10 +191,8 @@ def persist_summary( List of IDs that were stored. 
""" - from agent_cli.summarizer import SummaryLevel # noqa: PLC0415 - - # Skip if no summary needed - if summary_result.level == SummaryLevel.NONE: + # Skip if no summary was generated + if not summary_result.summary: return [] # Delete existing summary files diff --git a/agent_cli/memory/models.py b/agent_cli/memory/models.py index 5b8df385..d52d952c 100644 --- a/agent_cli/memory/models.py +++ b/agent_cli/memory/models.py @@ -65,7 +65,7 @@ class MemoryMetadata(BaseModel): compression_ratio: float | None = None """Ratio of output to input tokens.""" summary_level: str | None = None - """Name of the SummaryLevel enum used (NONE, BRIEF, or MAP_REDUCE).""" + """Deprecated: previously stored SummaryLevel enum name.""" collapse_depth: int | None = None """Number of collapse iterations in map-reduce (0 = no collapse needed).""" diff --git a/agent_cli/summarizer/__init__.py b/agent_cli/summarizer/__init__.py index daf0e2bc..7c7603b9 100644 --- a/agent_cli/summarizer/__init__.py +++ b/agent_cli/summarizer/__init__.py @@ -1,13 +1,13 @@ """Adaptive summarization module for variable-length content. This module provides map-reduce summarization inspired by LangChain's approach: -1. Split content into chunks and summarize each in parallel (map phase) -2. Recursively collapse summaries until they fit token_max (reduce phase) +1. If content fits target, return as-is (no LLM call) +2. Otherwise, split into chunks and summarize each in parallel (map phase) +3. Recursively collapse summaries until they fit target (reduce phase) Research foundations: - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal -- Two-phase architecture concept from Mem0 (arXiv:2504.19413) Example: from agent_cli.summarizer import summarize, SummarizerConfig @@ -16,8 +16,14 @@ openai_base_url="http://localhost:8000/v1", model="gpt-4", ) - result = await summarize(long_document, config) - print(f"Level: {result.level.name}, Compression: {result.compression_ratio:.1%}") + + # Compress to fit 4000 tokens + result = await summarize(long_document, config, target_tokens=4000) + + # Compress to 20% of original size + result = await summarize(long_document, config, target_ratio=0.2) + + print(f"Compression: {result.compression_ratio:.1%}") """ @@ -25,14 +31,12 @@ from agent_cli.summarizer.models import ( SummarizationError, SummarizerConfig, - SummaryLevel, SummaryResult, ) __all__ = [ "SummarizationError", "SummarizerConfig", - "SummaryLevel", "SummaryResult", "summarize", ] diff --git a/agent_cli/summarizer/_utils.py b/agent_cli/summarizer/_utils.py index 23c8dd19..64c72b8f 100644 --- a/agent_cli/summarizer/_utils.py +++ b/agent_cli/summarizer/_utils.py @@ -8,7 +8,7 @@ from pydantic import BaseModel -from agent_cli.summarizer.models import SummarizationError, SummarizerConfig, SummaryLevel +from agent_cli.summarizer.models import SummarizationError, SummarizerConfig if TYPE_CHECKING: import tiktoken @@ -230,13 +230,11 @@ def _get_overlap_text(chunks: list[str], target_tokens: int, model: str) -> str: return " ".join(overlap_parts) -def estimate_summary_tokens(input_tokens: int, level: int) -> int: - """Estimate target summary tokens based on input size and level.""" - if level == SummaryLevel.NONE: - return 0 - if level == SummaryLevel.BRIEF: - return min(50, max(20, input_tokens // 5)) - # MAP_REDUCE: ~10% compression with floor/ceiling +def estimate_summary_tokens(input_tokens: int) -> int: + """Estimate target summary tokens based on input size. 
+ + Uses ~10% compression ratio with floor/ceiling bounds. + """ return min(500, max(50, input_tokens // 10)) diff --git a/agent_cli/summarizer/adaptive.py b/agent_cli/summarizer/adaptive.py index f242b662..2a772062 100644 --- a/agent_cli/summarizer/adaptive.py +++ b/agent_cli/summarizer/adaptive.py @@ -1,17 +1,13 @@ """Adaptive summarization using map-reduce with dynamic collapse. Implements a simple algorithm inspired by LangChain's map-reduce chains: -1. If content is short enough, summarize directly +1. If content fits target, return as-is (no LLM call) 2. Otherwise, split into chunks and summarize each (map phase) -3. Recursively collapse summaries until they fit token_max (reduce phase) +3. Recursively collapse summaries until they fit target (reduce phase) Research foundations: - LangChain ReduceDocumentsChain: token_max=3000, recursive collapse - BOOOOKSCORE (arXiv:2310.00785): chunk_size=2048 optimal -- Two-phase architecture concept from Mem0 (arXiv:2504.19413) - -Key insight: No need for predetermined L1/L2/L3 levels. -Dynamic collapse depth based on actual content length. See docs/architecture/summarizer.md for detailed design rationale. """ @@ -21,76 +17,68 @@ import logging from agent_cli.summarizer._prompts import ( - BRIEF_SUMMARY_PROMPT, format_prior_context, get_prompt_for_content_type, ) from agent_cli.summarizer._utils import ( count_tokens, - estimate_summary_tokens, generate_summary, tokens_to_words, ) from agent_cli.summarizer.map_reduce import map_reduce_summarize from agent_cli.summarizer.models import ( - SummarizationError, SummarizerConfig, - SummaryLevel, SummaryResult, ) logger = logging.getLogger(__name__) -# Thresholds for summary levels (in tokens) -THRESHOLD_NONE = 100 # Below this, no summary needed -THRESHOLD_BRIEF = 500 # Below this, just a single sentence - -# Re-export for backwards compatibility __all__ = [ - "THRESHOLD_BRIEF", - "THRESHOLD_NONE", - "SummarizationError", "SummarizerConfig", - "determine_level", "summarize", ] -def determine_level(token_count: int) -> SummaryLevel: - """Map token count to appropriate SummaryLevel.""" - if token_count < THRESHOLD_NONE: - return SummaryLevel.NONE - if token_count < THRESHOLD_BRIEF: - return SummaryLevel.BRIEF - return SummaryLevel.MAP_REDUCE - - async def summarize( content: str, config: SummarizerConfig, + *, + target_tokens: int | None = None, + target_ratio: float | None = None, prior_summary: str | None = None, content_type: str = "general", ) -> SummaryResult: - """Summarize content with adaptive strategy based on length. + """Summarize content to fit within a target token limit. - Uses a simple algorithm: - - Very short content (<100 tokens): No summary - - Short content (<500 tokens): Single sentence brief summary - - Everything else: Map-reduce with dynamic collapse + Simple algorithm: + - If content already fits target, return as-is (no LLM call) + - Otherwise, use map-reduce to compress until it fits Args: content: The content to summarize. config: Summarizer configuration. + target_tokens: Absolute token limit (e.g., 4000). Defaults to config.token_max. + target_ratio: Relative compression ratio (e.g., 0.2 = compress to 20% of input). + Takes precedence over target_tokens if both provided. prior_summary: Optional prior summary for context continuity. content_type: Type of content ("general", "conversation", "journal", "document"). Returns: - SummaryResult with summary and metadata. + SummaryResult with summary and compression metrics. 
+ + Examples: + # Compress to fit 4000 tokens + result = await summarize(huge_doc, config, target_tokens=4000) + + # Compress to 20% of original size + result = await summarize(huge_doc, config, target_ratio=0.2) + + # Use default (config.token_max = 3000) + result = await summarize(huge_doc, config) """ if not content or not content.strip(): return SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=0, output_tokens=0, @@ -98,65 +86,43 @@ async def summarize( ) input_tokens = count_tokens(content, config.model) - level = determine_level(input_tokens) + + # Determine target + if target_ratio is not None: + target = max(1, int(input_tokens * target_ratio)) + elif target_tokens is not None: + target = target_tokens + else: + target = config.token_max logger.info( - "Summarizing %d tokens at level %s (type=%s)", + "Summarizing %d tokens to target %d (type=%s)", input_tokens, - level.name, + target, content_type, ) - if level == SummaryLevel.NONE: + # Already fits? Return content as-is (no LLM call) + if input_tokens <= target: return SummaryResult( - level=level, - summary=None, + summary=content, input_tokens=input_tokens, - output_tokens=0, - compression_ratio=0.0, + output_tokens=input_tokens, + compression_ratio=1.0, + collapse_depth=0, ) - if level == SummaryLevel.BRIEF: - summary = await _brief_summary(content, config) - output_tokens = count_tokens(summary, config.model) - return SummaryResult( - level=level, - summary=summary, - input_tokens=input_tokens, - output_tokens=output_tokens, - compression_ratio=output_tokens / input_tokens, + # Content fits in single chunk but exceeds target - use content-aware summary + if input_tokens <= config.chunk_size: + summary = await _content_aware_summary( + content, + config, + target, + prior_summary, + content_type, ) - - # MAP_REDUCE level - return await _map_reduce_summary( - content, - input_tokens, - config, - prior_summary, - content_type, - ) - - -async def _brief_summary(content: str, config: SummarizerConfig) -> str: - """Generate a single-sentence summary for brief content.""" - prompt = BRIEF_SUMMARY_PROMPT.format(content=content) - return await generate_summary(prompt, config, max_tokens=50) - - -async def _map_reduce_summary( - content: str, - input_tokens: int, - config: SummarizerConfig, - prior_summary: str | None, - content_type: str, -) -> SummaryResult: - """Use map-reduce with dynamic collapse for longer content.""" - # For content that fits in a single chunk, use content-type aware summary - if input_tokens <= config.token_max: - summary = await _content_aware_summary(content, config, prior_summary, content_type) output_tokens = count_tokens(summary, config.model) return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary=summary, input_tokens=input_tokens, output_tokens=output_tokens, @@ -164,11 +130,10 @@ async def _map_reduce_summary( collapse_depth=0, ) - # Use map-reduce for multi-chunk content - result = await map_reduce_summarize(content, config) + # Large content - use map-reduce with dynamic collapse + result = await map_reduce_summarize(content, config, target) return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary=result.summary, input_tokens=result.input_tokens, output_tokens=result.output_tokens, @@ -180,14 +145,11 @@ async def _map_reduce_summary( async def _content_aware_summary( content: str, config: SummarizerConfig, + target_tokens: int, prior_summary: str | None, content_type: str, ) -> str: """Generate a content-type aware summary for single-chunk content.""" - target_tokens = 
estimate_summary_tokens( - count_tokens(content, config.model), - SummaryLevel.MAP_REDUCE, - ) max_words = tokens_to_words(target_tokens) prompt_template = get_prompt_for_content_type(content_type) diff --git a/agent_cli/summarizer/map_reduce.py b/agent_cli/summarizer/map_reduce.py index 3dd81aa4..86e8b796 100644 --- a/agent_cli/summarizer/map_reduce.py +++ b/agent_cli/summarizer/map_reduce.py @@ -2,7 +2,7 @@ Simple algorithm: 1. Map: Split content into chunks, summarize each in parallel -2. Reduce: If combined summaries exceed token_max, recursively collapse +2. Reduce: If combined summaries exceed target, recursively collapse Key insight from LangChain: No need for predetermined levels (L1/L2/L3). Just keep collapsing until content fits. Dynamic depth based on actual content. @@ -18,6 +18,7 @@ import asyncio import logging from dataclasses import dataclass +from typing import TYPE_CHECKING from agent_cli.summarizer._prompts import ( CHUNK_SUMMARY_PROMPT, @@ -31,7 +32,9 @@ generate_summary, tokens_to_words, ) -from agent_cli.summarizer.models import SummarizerConfig, SummaryLevel + +if TYPE_CHECKING: + from agent_cli.summarizer.models import SummarizerConfig logger = logging.getLogger(__name__) @@ -61,28 +64,29 @@ class MapReduceResult: async def map_reduce_summarize( content: str, config: SummarizerConfig, + target: int | None = None, max_collapse_depth: int = 10, ) -> MapReduceResult: """Summarize content using map-reduce with dynamic collapse. Algorithm: 1. Split into chunks and summarize each (map phase) - 2. If combined summaries exceed token_max, recursively collapse (reduce phase) - 3. Continue until everything fits in token_max - - Note: This function is designed for content that exceeds token_max. For shorter - content, use the main `summarize()` function in adaptive.py which selects the - appropriate strategy (NONE, BRIEF, or MAP_REDUCE with content-aware prompts). + 2. If combined summaries exceed target, recursively collapse (reduce phase) + 3. Continue until everything fits in target Args: content: The content to summarize. config: Summarizer configuration. + target: Target token count. Defaults to config.token_max. max_collapse_depth: Safety limit on recursive collapse depth. Returns: MapReduceResult with summary and metadata. 
""" + if target is None: + target = config.token_max + input_tokens = count_tokens(content, config.model) # Map phase: Split and summarize chunks in parallel @@ -97,9 +101,9 @@ async def map_reduce_summarize( summaries = await _map_summarize(chunks, config) intermediate_summaries = [summaries.copy()] - # Reduce phase: Recursively collapse until fits token_max + # Reduce phase: Recursively collapse until fits target depth = 0 - while _total_tokens(summaries, config.model) > config.token_max: + while _total_tokens(summaries, config.model) > target: depth += 1 if depth > max_collapse_depth: logger.warning( @@ -109,17 +113,18 @@ async def map_reduce_summarize( break logger.info( - "Reduce phase (depth %d): collapsing %d summaries (%d tokens)", + "Reduce phase (depth %d): collapsing %d summaries (%d tokens) to target %d", depth, len(summaries), _total_tokens(summaries, config.model), + target, ) - summaries = await _collapse_summaries(summaries, config) + summaries = await _collapse_summaries(summaries, config, target) intermediate_summaries.append(summaries.copy()) # Final synthesis if we have multiple summaries left if len(summaries) > 1: - final_summary = await _synthesize(summaries, config) + final_summary = await _synthesize(summaries, config, target) else: final_summary = summaries[0] @@ -161,7 +166,7 @@ async def _summarize_chunk( ) -> str: """Summarize a single chunk.""" source_tokens = count_tokens(chunk, config.model) - target_tokens = estimate_summary_tokens(source_tokens, SummaryLevel.MAP_REDUCE) + target_tokens = estimate_summary_tokens(source_tokens) max_words = tokens_to_words(target_tokens) prompt = CHUNK_SUMMARY_PROMPT.format( @@ -177,16 +182,17 @@ async def _summarize_chunk( async def _collapse_summaries( summaries: list[str], config: SummarizerConfig, + target: int, ) -> list[str]: """Collapse summaries by grouping and re-summarizing (reduce phase). - Groups summaries that together fit within token_max, then summarizes each group. + Groups summaries that together fit within target, then summarizes each group. This is similar to LangChain's split_list_of_docs approach. 
""" if len(summaries) <= 1: return summaries - # Group summaries that together fit within token_max + # Group summaries that together fit within target groups: list[list[str]] = [] current_group: list[str] = [] current_tokens = 0 @@ -194,8 +200,8 @@ async def _collapse_summaries( for summary in summaries: summary_tokens = count_tokens(summary, config.model) - # If adding this summary would exceed token_max, start new group - if current_tokens + summary_tokens > config.token_max and current_group: + # If adding this summary would exceed target, start new group + if current_tokens + summary_tokens > target and current_group: groups.append(current_group) current_group = [summary] current_tokens = summary_tokens @@ -211,16 +217,21 @@ async def _collapse_summaries( async def summarize_group(group: list[str]) -> str: async with semaphore: - return await _synthesize(group, config) + return await _synthesize(group, config, target) tasks = [summarize_group(g) for g in groups] return list(await asyncio.gather(*tasks)) -async def _synthesize(summaries: list[str], config: SummarizerConfig) -> str: +async def _synthesize( + summaries: list[str], + config: SummarizerConfig, + target: int, +) -> str: """Synthesize multiple summaries into one.""" combined_tokens = sum(count_tokens(s, config.model) for s in summaries) - target_tokens = estimate_summary_tokens(combined_tokens, SummaryLevel.MAP_REDUCE) + # Aim for target tokens but use estimate if combined is smaller + target_tokens = min(target, estimate_summary_tokens(combined_tokens)) max_words = tokens_to_words(target_tokens) prompt = META_SUMMARY_PROMPT.format( diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 14be0c86..65eb42ed 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -4,7 +4,6 @@ from dataclasses import dataclass from datetime import UTC, datetime -from enum import IntEnum from typing import Any from pydantic import BaseModel, Field @@ -24,7 +23,6 @@ class SummarizerConfig: model="llama3.1:8b", ) result = await summarize(long_document, config) - print(f"Level: {result.level.name}") print(f"Compression: {result.compression_ratio:.1%}") """ @@ -33,7 +31,7 @@ class SummarizerConfig: model: str api_key: str | None = None chunk_size: int = 2048 # BOOOOKSCORE's tested default - token_max: int = 3000 # LangChain's default - when to collapse + token_max: int = 3000 # LangChain's default - target size after compression chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -45,32 +43,18 @@ def __post_init__(self) -> None: self.api_key = "not-needed" -class SummaryLevel(IntEnum): - """Summary strategy based on input length.""" - - NONE = 0 - """< 100 tokens: No summary needed.""" - - BRIEF = 1 - """100-500 tokens: Single-sentence summary.""" - - MAP_REDUCE = 2 - """> 500 tokens: Map-reduce with dynamic collapse.""" - - class SummaryResult(BaseModel): """Result of summarization. Contains the summary and metadata about the compression achieved. 
""" - level: SummaryLevel = Field(..., description="The summarization strategy used") summary: str | None = Field( default=None, - description="The final summary text (None for NONE level)", + description="The summary text (None if content already fit target)", ) input_tokens: int = Field(..., ge=0, description="Token count of the input content") - output_tokens: int = Field(..., ge=0, description="Token count of the summary") + output_tokens: int = Field(..., ge=0, description="Token count of the output") compression_ratio: float = Field( ..., ge=0.0, @@ -91,8 +75,9 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: """Convert to metadata entry for ChromaDB storage. Returns a list with a single metadata dict for the summary. + Returns empty list if no summary was generated. """ - if self.level == SummaryLevel.NONE or not self.summary: + if not self.summary: return [] timestamp = self.created_at.isoformat() @@ -105,7 +90,6 @@ def to_storage_metadata(self, conversation_id: str) -> list[dict[str, Any]]: "conversation_id": conversation_id, "role": "summary", "is_final": True, - "summary_level": self.level.name, "input_tokens": self.input_tokens, "output_tokens": self.output_tokens, "compression_ratio": self.compression_ratio, diff --git a/docs/architecture/summarizer.md b/docs/architecture/summarizer.md index 43caf336..c7476142 100644 --- a/docs/architecture/summarizer.md +++ b/docs/architecture/summarizer.md @@ -4,22 +4,23 @@ This document describes the architectural decisions, design rationale, and techn ## 1. System Overview -The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. Rather than applying fixed summarization levels, it dynamically collapses content until it fits within a token budget. +The adaptive summarizer provides **content-aware compression** using a map-reduce approach inspired by LangChain's chains. It compresses content to fit within a specified token budget using a simple algorithm: ``` -Input Content ──▶ Token Count ──▶ Strategy Selection +Input Content ──▶ Token Count ──▶ Compare to Target │ - ┌───────────────────────────────┼─────────────────────┐ - │ │ │ - < 100 tokens 100-500 tokens > 500 tokens - │ │ │ - No summary Brief summary Map-Reduce - (single sentence) (dynamic collapse) + ┌───────────────────────┴───────────────────────┐ + │ │ + Fits target Exceeds target + │ │ + Return as-is Map-Reduce + (no LLM call) (dynamic collapse) ``` **Design Goals:** -- **Simple algorithm:** Map-reduce with dynamic collapse depth based on actual content. +- **Maximum simplicity:** Single entry point with straightforward logic. +- **Flexible targeting:** Specify absolute token count or relative compression ratio. - **Research-grounded defaults:** chunk_size=2048 (BOOOOKSCORE), token_max=3000 (LangChain). - **Content-type awareness:** Domain-specific prompts for conversations, journals, documents. @@ -47,59 +48,81 @@ BOOOOKSCORE's research on book-length summarization found optimal chunk sizes. T - Chunk size: **2048 tokens** (we use this) - Max summary length: **900 tokens** -### 2.3 Borrowed: Two-Phase Architecture (Mem0) - -**Reference:** arXiv:2504.19413 - -Mem0's memory layer research informed our storage architecture with a **two-phase approach**: separate extraction (identifying what's important) from storage (how to persist it). We apply this by first generating summaries via LLM, then persisting results to storage. 
- -### 2.4 Original Design (Not Research-Backed) +### 2.3 Original Design (Not Research-Backed) The following aspects are **original design choices without direct research justification**: -- **Token thresholds (100/500):** The boundaries between NONE/BRIEF/MAP_REDUCE were chosen heuristically. - **Content-type prompts:** Domain-specific prompts are original design. +- **Target ratio parameter:** The option to specify compression as a percentage is a convenience feature. --- ## 3. Architectural Decisions -### 3.1 Map-Reduce with Dynamic Collapse +### 3.1 Simple Target-Based Logic + +**Decision:** Use a simple "fits? return : compress" algorithm. + +**Rationale:** + +- **Minimal complexity:** No level selection logic, threshold management, or multiple code paths. +- **Clear semantics:** If content fits the target, return it unchanged. Otherwise, compress. +- **Flexible targeting:** Users can specify exact token counts or relative ratios. + +**Algorithm:** + +```python +async def summarize( + content: str, + config: SummarizerConfig, + *, + target_tokens: int | None = None, # Absolute limit + target_ratio: float | None = None, # e.g., 0.2 = compress to 20% +) -> SummaryResult: + input_tokens = count_tokens(content) + + # Determine target + if target_ratio is not None: + target = max(1, int(input_tokens * target_ratio)) + elif target_tokens is not None: + target = target_tokens + else: + target = config.token_max # Default: 3000 + + # Already fits? Return as-is (no LLM call) + if input_tokens <= target: + return SummaryResult(summary=content, ...) + + # Compress using map-reduce + return await map_reduce_summarize(content, config, target) +``` -**Decision:** Use LangChain-style map-reduce instead of fixed hierarchy. +### 3.2 Map-Reduce with Dynamic Collapse + +**Decision:** Use LangChain-style map-reduce for all compression. **Rationale:** -- **Simpler algorithm:** Single code path handles all content sizes. +- **Single algorithm:** One code path handles all content sizes. - **Dynamic depth:** Collapse depth adapts to actual content length. - **Research-backed:** LangChain's approach is battle-tested. **Algorithm:** ```python -async def map_reduce_summarize(content, config): +async def map_reduce_summarize(content, config, target): # Map: Split and summarize chunks in parallel chunks = chunk_text(content, chunk_size=2048) summaries = await parallel_summarize(chunks) - # Reduce: Recursively collapse until fits token_max - while total_tokens(summaries) > config.token_max: - groups = group_by_token_limit(summaries, config.token_max) + # Reduce: Recursively collapse until fits target + while total_tokens(summaries) > target: + groups = group_by_token_limit(summaries, target) summaries = await parallel_synthesize(groups) return final_synthesis(summaries) ``` -### 3.2 Three-Level Strategy - -**Decision:** Use three levels based on token count. - -| Level | Token Range | Strategy | -| :--- | :--- | :--- | -| NONE | < 100 | No summarization needed | -| BRIEF | 100-500 | Single sentence | -| MAP_REDUCE | >= 500 | Dynamic collapse until fits token_max | - ### 3.3 Research-Backed Defaults **Decision:** Use values from published research. @@ -158,29 +181,29 @@ Every `SummaryResult` includes `input_tokens`, `output_tokens`, `compression_rat ## 4. Processing Pipeline -### 4.1 Level Selection +### 4.1 Entry Point -The entry point (`summarize()`) counts tokens and selects strategy: +The entry point (`summarize()`) implements simple logic: 1. **Token counting:** Uses tiktoken with model-appropriate encoding. 
Falls back to character-based estimation (~4 chars/token) if tiktoken unavailable. -2. **Threshold comparison:** Determines NONE, BRIEF, or MAP_REDUCE. -3. **Strategy dispatch:** Calls appropriate handler. +2. **Target calculation:** Determines target from `target_tokens`, `target_ratio`, or default `token_max`. +3. **Fit check:** If content fits target, return as-is. +4. **Compression:** Call map-reduce if content exceeds target. -### 4.2 Brief Level +### 4.2 Single-Chunk Content -For short content (100-500 tokens): +For content that fits within `chunk_size` but exceeds target: -- Single LLM call with brief prompt -- Returns `SummaryResult` with single-sentence summary +- Single LLM call with content-type aware prompt +- Returns `SummaryResult` with compressed summary -### 4.3 Map-Reduce Level +### 4.3 Multi-Chunk Content -For longer content (>= 500 tokens): +For larger content (> chunk_size tokens): -1. **Check single-chunk:** If content fits in token_max, use content-type aware summary directly. -2. **Map phase:** Split content into overlapping chunks, summarize each in parallel. -3. **Reduce phase:** If combined summaries exceed token_max, group and re-summarize recursively. -4. **Final synthesis:** Combine remaining summaries into final output. +1. **Map phase:** Split content into overlapping chunks, summarize each in parallel. +2. **Reduce phase:** If combined summaries exceed target, group and re-summarize recursively. +3. **Final synthesis:** Combine remaining summaries into final output. The `collapse_depth` field in the result indicates how many reduce iterations were needed. @@ -188,29 +211,19 @@ The `collapse_depth` field in the result indicates how many reduce iterations we ## 5. Data Models -### 5.1 SummaryLevel - -```python -class SummaryLevel(IntEnum): - NONE = 0 # < 100 tokens - BRIEF = 1 # 100-500 tokens - MAP_REDUCE = 2 # >= 500 tokens -``` - -### 5.2 SummaryResult +### 5.1 SummaryResult ```python class SummaryResult(BaseModel): - level: SummaryLevel - summary: str | None + summary: str | None # None if content was empty input_tokens: int output_tokens: int - compression_ratio: float - collapse_depth: int # 0 = no collapse needed + compression_ratio: float # 0.0-1.0 + collapse_depth: int # 0 = no collapse needed created_at: datetime ``` -### 5.3 SummarizerConfig +### 5.2 SummarizerConfig ```python @dataclass @@ -219,7 +232,7 @@ class SummarizerConfig: model: str api_key: str | None = None chunk_size: int = 2048 # BOOOOKSCORE - token_max: int = 3000 # LangChain + token_max: int = 3000 # LangChain (default target) chunk_overlap: int = 200 max_concurrent_chunks: int = 5 timeout: float = 60.0 @@ -257,12 +270,12 @@ Summaries are stored with metadata: "metadata": { "conversation_id": conversation_id, "role": "summary", - "summary_level": "MAP_REDUCE", "input_tokens": 1500, "output_tokens": 150, "compression_ratio": 0.1, "collapse_depth": 1, "created_at": "2024-01-15T10:30:00Z", + "is_final": True, }, } ``` @@ -274,9 +287,9 @@ Summaries are stored with metadata: Summarization follows a fail-fast philosophy: - **LLM errors:** Propagated as `SummarizationError` (base class for all summarization errors). -- **Empty input:** Returns NONE level immediately (not an error). +- **Empty input:** Returns result with `summary=None` immediately (not an error). - **Encoding errors:** Falls back to character-based token estimation. -- **Max depth exceeded:** Warning logged, forces final synthesis even if over token_max. 
+- **Max depth exceeded:** Warning logged, forces final synthesis even if over target. The caller decides how to handle failures—typically by proceeding without a summary rather than blocking the entire operation. @@ -294,9 +307,41 @@ The caller decides how to handle failures—typically by proceeding without a su --- -## 9. Limitations and Trade-offs +## 9. Usage Examples + +### Basic Usage + +```python +from agent_cli.summarizer import SummarizerConfig, summarize + +config = SummarizerConfig( + openai_base_url="http://localhost:11434/v1", + model="llama3.1:8b", +) + +# Default: compress to fit 3000 tokens +result = await summarize(content, config) + +# Compress to specific token count +result = await summarize(content, config, target_tokens=500) -### 9.1 Fact Preservation +# Compress to 20% of original size +result = await summarize(content, config, target_ratio=0.2) + +# With content type for better prompts +result = await summarize( + content, + config, + target_tokens=500, + content_type="conversation", +) +``` + +--- + +## 10. Limitations and Trade-offs + +### 10.1 Fact Preservation Summarization is inherently lossy. Specific facts (dates, numbers, names) are often dropped in favor of thematic content. If your use case requires fact retrieval: @@ -304,18 +349,14 @@ Summarization is inherently lossy. Specific facts (dates, numbers, names) are of - Use fact extraction instead of summarization - Use RAG to retrieve original chunks -### 9.2 No Intermediate Summaries +### 10.2 No Intermediate Summaries Unlike hierarchical approaches, map-reduce only stores the final summary. Intermediate chunk summaries are discarded after synthesis. This simplifies storage but removes granular access. -### 9.3 Fixed Thresholds - -The 100/500 token thresholds are heuristic. They may need tuning for specific domains or languages. - --- -## 10. Future Improvements +## 11. Future Improvements 1. **Benchmark against BOOOOKSCORE metrics** for coherence evaluation -2. **Tune token thresholds empirically** with real-world content -3. **Add fact extraction mode** for use cases requiring specific detail preservation +2. **Add fact extraction mode** for use cases requiring specific detail preservation +3. **Streaming support** for real-time summarization feedback diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index 70d434dd..feebc5f2 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -1,15 +1,16 @@ -"""Demonstrate the summarizer on texts of varying lengths from the internet. +"""Demonstrate the simplified summarizer on texts of varying lengths. This script fetches content of different sizes and shows how the adaptive -summarizer automatically selects the appropriate strategy (BRIEF or MAP_REDUCE) -based on content length. +summarizer compresses content to fit different target token counts or ratios. 
Usage: python examples/summarizer_demo.py - # Test specific levels only - python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level map_reduce + # Test with specific target ratio + python examples/summarizer_demo.py --target-ratio 0.2 + + # Test with specific target token count + python examples/summarizer_demo.py --target-tokens 500 # Use a different model python examples/summarizer_demo.py --model "gpt-4o-mini" @@ -30,12 +31,11 @@ from agent_cli.summarizer import ( SummarizerConfig, - SummaryLevel, SummaryResult, summarize, ) -# Defaults for local AI setup (same as aijournal_poc.py) +# Defaults for local AI setup DEFAULT_BASE_URL = "http://192.168.1.143:9292/v1" DEFAULT_MODEL = "gpt-oss-high:20b" @@ -47,24 +47,17 @@ class TextSample: name: str description: str url: str - expected_level: SummaryLevel content_type: str = "general" # If URL fetch fails, use this fallback fallback_content: str | None = None -# Thresholds from adaptive.py: -# NONE: < 100 tokens -# BRIEF: 100-500 tokens -# MAP_REDUCE: >= 500 tokens - -# Sample texts of varying lengths to demonstrate different summarization levels +# Sample texts of varying lengths to demonstrate summarization SAMPLES: list[TextSample] = [ TextSample( - name="Brief - Short News Article", - description="~150-400 tokens - triggers BRIEF level (100-500 token range)", + name="Short News Article", + description="~150-400 tokens - demonstrates small content handling", url="https://httpbin.org/json", # Returns small JSON we'll convert to text - expected_level=SummaryLevel.BRIEF, fallback_content=""" Breaking News: Scientists at the Marine Biology Institute have made a groundbreaking discovery in the Mariana Trench. A new species of deep-sea @@ -94,10 +87,9 @@ class TextSample: """, ), TextSample( - name="Map-Reduce - Technology Article", - description="~800-2000 tokens - triggers MAP_REDUCE level (>=500 tokens)", + name="Technology Article", + description="~800-2000 tokens - demonstrates medium content", url="https://en.wikipedia.org/api/rest_v1/page/summary/Artificial_intelligence", - expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=""" Artificial intelligence (AI) is the intelligence of machines or software, @@ -174,21 +166,12 @@ class TextSample: """, ), TextSample( - name="Map-Reduce - Full Article", - description="~4000-10000 tokens - triggers MAP_REDUCE with chunking", + name="Full Article", + description="~4000-10000 tokens - demonstrates large content with chunking", url="https://en.wikipedia.org/api/rest_v1/page/mobile-html/Machine_learning", - expected_level=SummaryLevel.MAP_REDUCE, content_type="document", fallback_content=None, # We'll generate synthetic content ), - TextSample( - name="Map-Reduce - Long Document", - description="~16000+ tokens - triggers MAP_REDUCE with multiple collapse iterations", - url="https://www.gutenberg.org/cache/epub/84/pg84.txt", # Frankenstein (truncated) - expected_level=SummaryLevel.MAP_REDUCE, - content_type="document", - fallback_content=None, # We'll generate synthetic content (~16K tokens) - ), ] @@ -262,25 +245,11 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: content = re.sub(r"<[^>]+>", " ", content) content = re.sub(r"\s+", " ", content).strip() - # Check if content is too short for expected level - min_words_for_level = { - SummaryLevel.BRIEF: 80, # Need ~100 tokens - SummaryLevel.MAP_REDUCE: 400, # Need ~500 tokens - } - min_words = min_words_for_level.get(sample.expected_level, 50) - + # Check 
if content is too short + min_words = 80 if len(content.split()) < min_words: print(f" 📎 Fetched content too short ({len(content.split())} words), using fallback") - if sample.fallback_content: - content = sample.fallback_content - else: - target_tokens = { - SummaryLevel.BRIEF: 300, - SummaryLevel.MAP_REDUCE: 1500, - } - content = generate_synthetic_content( - target_tokens.get(sample.expected_level, 1000), - ) + content = sample.fallback_content or generate_synthetic_content(1500) # For very long content, truncate to keep demo fast words = content.split() @@ -296,15 +265,17 @@ async def fetch_content(sample: TextSample, client: httpx.AsyncClient) -> str: if sample.fallback_content: return sample.fallback_content.strip() - # Generate synthetic content for the expected level - target_tokens = { - SummaryLevel.BRIEF: 300, - SummaryLevel.MAP_REDUCE: 1500, - } - return generate_synthetic_content(target_tokens.get(sample.expected_level, 1000)) + # Generate synthetic content + return generate_synthetic_content(1500) -def print_result(sample: TextSample, result: SummaryResult, content: str) -> None: +def print_result( + sample: TextSample, + result: SummaryResult, + content: str, + target_tokens: int | None, + target_ratio: float | None, +) -> None: """Print a formatted summary result.""" print("\n" + "=" * 70) print(f"📄 {sample.name}") @@ -318,23 +289,30 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non print(f" Tokens: {result.input_tokens:,}") print(f" Content type: {sample.content_type}") - # Summarization result - level_emoji = { - SummaryLevel.NONE: "⏭️", - SummaryLevel.BRIEF: "📝", - SummaryLevel.MAP_REDUCE: "🔄", - } - print("\n🎯 Summarization Result:") - print(f" Level: {level_emoji.get(result.level, '❓')} {result.level.name}") - print(f" Expected: {sample.expected_level.name}") - print(f" Match: {'✅' if result.level == sample.expected_level else '⚠️'}") + # Target info + print("\n🎯 Target:") + if target_ratio is not None: + print(f" Ratio: {target_ratio:.0%} of input") + print(f" Calculated target: ~{int(result.input_tokens * target_ratio):,} tokens") + elif target_tokens is not None: + print(f" Tokens: {target_tokens:,}") + else: + print(" Default: 3000 tokens (LangChain default)") + + # Result info + print("\n📝 Result:") + if result.summary == content: + print(" Status: ⏭️ Content already fits target (returned as-is)") + elif result.collapse_depth > 0: + print(f" Status: 🔄 Map-reduce summarization (collapse depth: {result.collapse_depth})") + else: + print(" Status: 📝 Single-pass summarization") + print(f" Output tokens: {result.output_tokens:,}") print(f" Compression: {result.compression_ratio:.1%}") - if result.collapse_depth > 0: - print(f" Collapse depth: {result.collapse_depth}") # Summary content - if result.summary: + if result.summary and result.summary != content: print("\n📝 Summary:") wrapped = textwrap.fill( result.summary, @@ -342,11 +320,15 @@ def print_result(sample: TextSample, result: SummaryResult, content: str) -> Non initial_indent=" ", subsequent_indent=" ", ) + # Only show first ~500 chars of summary + if len(wrapped) > 600: # noqa: PLR2004 + wrapped = wrapped[:600] + "..." 
print(wrapped) async def run_demo( - level_filter: str | None = None, + target_tokens: int | None = None, + target_ratio: float | None = None, model: str | None = None, base_url: str | None = None, ) -> None: @@ -369,39 +351,28 @@ async def run_demo( timeout=120.0, # Longer timeout for local models ) - # Filter samples if requested - samples = SAMPLES - if level_filter: - level_map = { - "brief": SummaryLevel.BRIEF, - "map_reduce": SummaryLevel.MAP_REDUCE, - } - target_level = level_map.get(level_filter.lower()) - if target_level: - samples = [s for s in SAMPLES if s.expected_level == target_level] - print(f"\n🔍 Filtering to {level_filter.upper()} level only") - async with httpx.AsyncClient() as client: - for sample in samples: + for sample in SAMPLES: print(f"\n⏳ Processing: {sample.name}...") # Fetch content content = await fetch_content(sample, client) try: - # Summarize + # Summarize with specified target result = await summarize( content=content, config=config, + target_tokens=target_tokens, + target_ratio=target_ratio, content_type=sample.content_type, ) # Display results - print_result(sample, result, content) + print_result(sample, result, content, target_tokens, target_ratio) except Exception as e: print(f"\n❌ Error summarizing {sample.name}: {e}") - traceback.print_exc() print("\n" + "=" * 70) @@ -417,16 +388,21 @@ def main() -> None: epilog=textwrap.dedent(""" Examples: python examples/summarizer_demo.py - python examples/summarizer_demo.py --level brief - python examples/summarizer_demo.py --level map_reduce + python examples/summarizer_demo.py --target-ratio 0.2 + python examples/summarizer_demo.py --target-tokens 500 python examples/summarizer_demo.py --model "llama3.1:8b" --base-url "http://localhost:11434/v1" """), ) parser.add_argument( - "--level", - choices=["brief", "map_reduce"], - help="Only test a specific summarization level", + "--target-ratio", + type=float, + help="Target ratio for compression (e.g., 0.2 = compress to 20%%)", + ) + parser.add_argument( + "--target-tokens", + type=int, + help="Target token count for summary", ) parser.add_argument( "--model", @@ -439,9 +415,13 @@ def main() -> None: args = parser.parse_args() + if args.target_ratio is not None and args.target_tokens is not None: + parser.error("Cannot specify both --target-ratio and --target-tokens") + asyncio.run( run_demo( - level_filter=args.level, + target_tokens=args.target_tokens, + target_ratio=args.target_ratio, model=args.model, base_url=args.base_url, ), diff --git a/tests/memory/test_engine.py b/tests/memory/test_engine.py index 7e84b947..d8cd3526 100644 --- a/tests/memory/test_engine.py +++ b/tests/memory/test_engine.py @@ -22,7 +22,7 @@ Message, StoredMemory, ) -from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryResult class _DummyReranker: @@ -348,7 +348,6 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="summary up to 256", input_tokens=100, output_tokens=20, @@ -576,7 +575,6 @@ def __init__(self, output: Any) -> None: async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="summary text", input_tokens=100, output_tokens=20, diff --git a/tests/memory/test_git_integration.py b/tests/memory/test_git_integration.py index 86040d7a..64130990 100644 --- a/tests/memory/test_git_integration.py +++ b/tests/memory/test_git_integration.py @@ -14,7 
+14,7 @@ from agent_cli.memory import _ingest from agent_cli.memory.client import MemoryClient from agent_cli.memory.entities import Fact -from agent_cli.summarizer import SummaryLevel, SummaryResult +from agent_cli.summarizer import SummaryResult if TYPE_CHECKING: from pathlib import Path @@ -66,7 +66,6 @@ async def fake_reconcile( async def fake_summarize_content(**_kwargs: Any) -> SummaryResult: return SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="User likes testing.", input_tokens=100, output_tokens=20, diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index 202a5592..b7ce45e8 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -12,13 +12,8 @@ SummaryOutput, generate_summary, ) -from agent_cli.summarizer.adaptive import ( - THRESHOLD_BRIEF, - THRESHOLD_NONE, - determine_level, - summarize, -) -from agent_cli.summarizer.models import SummaryLevel, SummaryResult +from agent_cli.summarizer.adaptive import summarize +from agent_cli.summarizer.map_reduce import MapReduceResult class TestSummarizerConfig: @@ -83,39 +78,6 @@ def test_default_token_max_is_langchain(self) -> None: assert config.token_max == 3000 # LangChain's default -class TestDetermineLevel: - """Tests for level determination based on token count. - - The simplified approach has 3 levels: - - NONE: Very short content (< 100 tokens) - - BRIEF: Short content (100-500 tokens) - - MAP_REDUCE: Everything else (uses map-reduce) - """ - - def test_none_level_threshold(self) -> None: - """Test NONE level for very short content.""" - assert determine_level(50) == SummaryLevel.NONE - assert determine_level(99) == SummaryLevel.NONE - - def test_brief_level_threshold(self) -> None: - """Test BRIEF level for short content.""" - assert determine_level(100) == SummaryLevel.BRIEF - assert determine_level(300) == SummaryLevel.BRIEF - assert determine_level(499) == SummaryLevel.BRIEF - - def test_map_reduce_level_for_longer_content(self) -> None: - """Test that content >= 500 tokens uses MAP_REDUCE.""" - assert determine_level(500) == SummaryLevel.MAP_REDUCE - assert determine_level(1500) == SummaryLevel.MAP_REDUCE - assert determine_level(5000) == SummaryLevel.MAP_REDUCE - assert determine_level(20000) == SummaryLevel.MAP_REDUCE - - def test_thresholds_match_constants(self) -> None: - """Verify thresholds match the module constants.""" - assert THRESHOLD_NONE == 100 - assert THRESHOLD_BRIEF == 500 - - class TestSummarize: """Tests for main summarize function.""" @@ -128,133 +90,101 @@ def config(self) -> SummarizerConfig: ) @pytest.mark.asyncio - async def test_empty_content_returns_none_level( + async def test_empty_content_returns_no_summary( self, config: SummarizerConfig, ) -> None: - """Test that empty content returns NONE level result.""" + """Test that empty content returns result with no summary.""" result = await summarize("", config) - assert result.level == SummaryLevel.NONE assert result.summary is None assert result.input_tokens == 0 assert result.output_tokens == 0 @pytest.mark.asyncio - async def test_whitespace_only_returns_none_level( + async def test_whitespace_only_returns_no_summary( self, config: SummarizerConfig, ) -> None: - """Test that whitespace-only content returns NONE level result.""" + """Test that whitespace-only content returns result with no summary.""" result = await summarize(" \n\n ", config) - assert result.level == SummaryLevel.NONE assert result.summary is None @pytest.mark.asyncio - async def 
test_very_short_content_no_summary( + async def test_short_content_returns_as_is( self, config: SummarizerConfig, ) -> None: - """Test that very short content gets NONE level (no summary).""" - # Less than 100 tokens + """Test that short content is returned as-is (no LLM call).""" + # Less than default token_max (3000) result = await summarize("Hello world", config) - assert result.level == SummaryLevel.NONE - assert result.summary is None + assert result.summary == "Hello world" + assert result.compression_ratio == 1.0 # No compression @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._brief_summary") - async def test_brief_level_calls_brief_summary( + async def test_target_tokens_respected( self, - mock_brief: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that BRIEF level content calls _brief_summary.""" - mock_brief.return_value = "Brief summary." - - # Create content that's ~100-500 tokens - content = "This is a test sentence. " * 30 # ~150 tokens - - result = await summarize(content, config) - - mock_brief.assert_called_once_with(content, config) - assert result.level == SummaryLevel.BRIEF - assert result.summary == "Brief summary." + """Test that content fitting target_tokens is returned as-is.""" + content = "Short content" + result = await summarize(content, config, target_tokens=1000) + assert result.summary == content + assert result.compression_ratio == 1.0 @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_longer_content_uses_map_reduce( + async def test_target_ratio_calculates_target( self, - mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that content >= 500 tokens uses map-reduce.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Map-reduce summary.", - input_tokens=800, - output_tokens=100, - compression_ratio=0.125, - ) - mock_map_reduce.return_value = mock_result - - # Create content that's ~500+ tokens - content = "This is a test sentence with more words. " * 100 # ~800 tokens - - result = await summarize(content, config, content_type="general") - - mock_map_reduce.assert_called_once() - assert result.summary == "Map-reduce summary." + """Test that target_ratio calculates correct target.""" + # Short content that fits even with 10% target + content = "Hello" + result = await summarize(content, config, target_ratio=0.1) + # Content is so short it fits in 10% target + assert result.summary == content @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_prior_summary_passed_to_map_reduce( + @patch("agent_cli.summarizer.adaptive._content_aware_summary") + async def test_content_exceeding_target_gets_summarized( self, - mock_map_reduce: AsyncMock, + mock_summary: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that prior_summary is passed to _map_reduce_summary.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Updated summary.", - input_tokens=800, - output_tokens=100, - compression_ratio=0.125, - ) - mock_map_reduce.return_value = mock_result + """Test that content exceeding target gets summarized.""" + mock_summary.return_value = "Summarized content." - content = "This is a test sentence with more words. " * 100 - prior = "Previous context summary." + # Create content that's ~500 tokens (exceeds target of 100) + content = "This is a test sentence. 
" * 100 - await summarize(content, config, prior_summary=prior) + result = await summarize(content, config, target_tokens=100) - # Verify prior_summary was passed - call_args = mock_map_reduce.call_args - assert call_args[0][3] == prior # prior_summary is 4th positional arg + mock_summary.assert_called_once() + assert result.summary == "Summarized content." @pytest.mark.asyncio - @patch("agent_cli.summarizer.adaptive._map_reduce_summary") - async def test_very_long_content_uses_map_reduce( + @patch("agent_cli.summarizer.adaptive.map_reduce_summarize") + async def test_large_content_uses_map_reduce( self, mock_map_reduce: AsyncMock, config: SummarizerConfig, ) -> None: - """Test that very long content uses map-reduce.""" - mock_result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, - summary="Long content summary.", - input_tokens=20000, - output_tokens=500, - compression_ratio=0.025, - collapse_depth=2, + """Test that content exceeding chunk_size uses map-reduce.""" + mock_map_reduce.return_value = MapReduceResult( + summary="Map-reduce summary.", + input_tokens=5000, + output_tokens=100, + compression_ratio=0.02, + collapse_depth=1, + intermediate_summaries=[["chunk1", "chunk2"]], ) - mock_map_reduce.return_value = mock_result - # Create content that's > 15000 tokens - content = "Word " * 20000 + # Create content larger than chunk_size (2048) + content = "Word " * 3000 # ~3000 tokens - result = await summarize(content, config) + result = await summarize(content, config, target_tokens=500) - assert mock_map_reduce.called - assert result.level == SummaryLevel.MAP_REDUCE + mock_map_reduce.assert_called_once() + assert result.summary == "Map-reduce summary." class TestGenerateSummary: diff --git a/tests/summarizer/test_integration.py b/tests/summarizer/test_integration.py index f11fcff8..867815ce 100644 --- a/tests/summarizer/test_integration.py +++ b/tests/summarizer/test_integration.py @@ -2,32 +2,7 @@ from __future__ import annotations -from agent_cli.summarizer.adaptive import determine_level -from agent_cli.summarizer.models import SummaryLevel, SummaryResult - - -class TestDetermineLevel: - """Tests for determine_level function with various content sizes.""" - - def test_short_content_is_brief(self) -> None: - """Test that 100-500 token content uses BRIEF.""" - level = determine_level(200) - assert level == SummaryLevel.BRIEF - - def test_medium_content_is_map_reduce(self) -> None: - """Test that 500+ token content uses MAP_REDUCE.""" - level = determine_level(1000) - assert level == SummaryLevel.MAP_REDUCE - - def test_long_content_is_map_reduce(self) -> None: - """Test that 3000+ token content uses MAP_REDUCE.""" - level = determine_level(5000) - assert level == SummaryLevel.MAP_REDUCE - - def test_very_long_content_is_map_reduce(self) -> None: - """Test that content over 15000 tokens still uses MAP_REDUCE.""" - level = determine_level(20000) - assert level == SummaryLevel.MAP_REDUCE +from agent_cli.summarizer.models import SummaryResult class TestSummaryResultStorage: @@ -36,7 +11,6 @@ class TestSummaryResultStorage: def test_to_storage_metadata_creates_entry(self) -> None: """Test that to_storage_metadata creates a valid entry.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, @@ -52,13 +26,11 @@ def test_to_storage_metadata_creates_entry(self) -> None: assert entry["metadata"]["conversation_id"] == "test-conversation" assert entry["metadata"]["role"] == "summary" assert 
entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "MAP_REDUCE" assert entry["metadata"]["collapse_depth"] == 1 - def test_none_level_returns_empty(self) -> None: - """Test that NONE level produces no storage entries.""" + def test_no_summary_returns_empty(self) -> None: + """Test that no summary produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, diff --git a/tests/summarizer/test_models.py b/tests/summarizer/test_models.py index c5b04f70..05d5625f 100644 --- a/tests/summarizer/test_models.py +++ b/tests/summarizer/test_models.py @@ -7,73 +7,39 @@ import pytest from agent_cli.summarizer.models import ( - SummaryLevel, SummaryResult, ) -class TestSummaryLevel: - """Tests for SummaryLevel enum.""" - - def test_level_values(self) -> None: - """Test that levels have correct integer values.""" - assert SummaryLevel.NONE == 0 - assert SummaryLevel.BRIEF == 1 - assert SummaryLevel.MAP_REDUCE == 2 - - def test_level_ordering(self) -> None: - """Test that levels can be compared.""" - assert SummaryLevel.NONE < SummaryLevel.BRIEF - assert SummaryLevel.BRIEF < SummaryLevel.MAP_REDUCE - - class TestSummaryResult: """Tests for SummaryResult model.""" - def test_none_level_result(self) -> None: - """Test result for content that needs no summary.""" + def test_result_with_no_summary(self) -> None: + """Test result when content already fits target.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, compression_ratio=0.0, ) - assert result.level == SummaryLevel.NONE assert result.summary is None assert result.collapse_depth == 0 - def test_brief_level_result(self) -> None: - """Test result for brief summary.""" - result = SummaryResult( - level=SummaryLevel.BRIEF, - summary="A brief one-sentence summary.", - input_tokens=200, - output_tokens=10, - compression_ratio=0.05, - ) - assert result.level == SummaryLevel.BRIEF - assert result.summary == "A brief one-sentence summary." - assert result.collapse_depth == 0 - - def test_map_reduce_result(self) -> None: - """Test result for map-reduce summary.""" + def test_result_with_summary(self) -> None: + """Test result with a generated summary.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="A comprehensive summary.", input_tokens=5000, output_tokens=100, compression_ratio=0.02, collapse_depth=2, ) - assert result.level == SummaryLevel.MAP_REDUCE assert result.summary == "A comprehensive summary." 
assert result.collapse_depth == 2 - def test_to_storage_metadata_none_level(self) -> None: - """Test that NONE level produces no storage entries.""" + def test_to_storage_metadata_no_summary(self) -> None: + """Test that no summary produces no storage entries.""" result = SummaryResult( - level=SummaryLevel.NONE, summary=None, input_tokens=50, output_tokens=0, @@ -82,10 +48,9 @@ def test_to_storage_metadata_none_level(self) -> None: entries = result.to_storage_metadata("conv-123") assert entries == [] - def test_to_storage_metadata_simple_summary(self) -> None: + def test_to_storage_metadata_with_summary(self) -> None: """Test storage metadata for a summary.""" result = SummaryResult( - level=SummaryLevel.BRIEF, summary="A brief summary.", input_tokens=200, output_tokens=10, @@ -99,12 +64,10 @@ def test_to_storage_metadata_simple_summary(self) -> None: assert entry["metadata"]["conversation_id"] == "conv-456" assert entry["metadata"]["role"] == "summary" assert entry["metadata"]["is_final"] is True - assert entry["metadata"]["summary_level"] == "BRIEF" - def test_to_storage_metadata_map_reduce(self) -> None: - """Test storage metadata for map-reduce summary.""" + def test_to_storage_metadata_with_collapse_depth(self) -> None: + """Test storage metadata includes collapse depth.""" result = SummaryResult( - level=SummaryLevel.MAP_REDUCE, summary="Final synthesis of content.", input_tokens=20000, output_tokens=200, @@ -113,12 +76,10 @@ def test_to_storage_metadata_map_reduce(self) -> None: ) entries = result.to_storage_metadata("conv-789") - # Should have 1 entry (the final summary) assert len(entries) == 1 entry = entries[0] assert entry["id"] == "conv-789:summary" assert entry["content"] == "Final synthesis of content." - assert entry["metadata"]["summary_level"] == "MAP_REDUCE" assert entry["metadata"]["collapse_depth"] == 3 assert entry["metadata"]["is_final"] is True @@ -126,7 +87,6 @@ def test_compression_ratio_bounds(self) -> None: """Test compression ratio validation.""" # Valid ratio result = SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, @@ -137,7 +97,6 @@ def test_compression_ratio_bounds(self) -> None: # Ratio must be between 0 and 1 with pytest.raises(ValueError, match="less than or equal to 1"): SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, @@ -148,7 +107,6 @@ def test_created_at_default(self) -> None: """Test that created_at is automatically set.""" before = datetime.now(UTC) result = SummaryResult( - level=SummaryLevel.BRIEF, summary="Test", input_tokens=100, output_tokens=10, diff --git a/tests/summarizer/test_utils.py b/tests/summarizer/test_utils.py index 188a7917..89a44171 100644 --- a/tests/summarizer/test_utils.py +++ b/tests/summarizer/test_utils.py @@ -88,32 +88,32 @@ def test_large_paragraph_sentence_split(self) -> None: class TestEstimateSummaryTokens: """Tests for estimate_summary_tokens function.""" - def test_none_level(self) -> None: - """Test level 0 (NONE) returns 0.""" - assert estimate_summary_tokens(1000, level=0) == 0 - - def test_brief_level(self) -> None: - """Test level 1 (BRIEF) compression.""" - # BRIEF: ~20% compression, capped at 50, minimum 20 - result = estimate_summary_tokens(100, level=1) - assert result >= 20 # minimum of 20 - assert result <= 50 # capped at 50 - - def test_map_reduce_level(self) -> None: - """Test level 2 (MAP_REDUCE) compression.""" - # MAP_REDUCE: ~10% compression, capped at 500, minimum 50 - result = estimate_summary_tokens(1000, 
level=2) - assert result >= 50 # minimum of 50 - assert result <= 500 # capped at 500 - - def test_map_reduce_large_input(self) -> None: - """Test MAP_REDUCE with large input hits cap.""" - result = estimate_summary_tokens(50000, level=2) + def test_typical_input(self) -> None: + """Test typical input uses ~10% compression.""" + # ~10% compression, capped at 500, minimum 50 + result = estimate_summary_tokens(1000) + assert result == 100 # 1000 // 10 = 100 + + def test_medium_input(self) -> None: + """Test medium input stays within bounds.""" + result = estimate_summary_tokens(2000) + assert result == 200 # 2000 // 10 = 200 + assert result >= 50 # above floor + assert result <= 500 # below ceiling + + def test_large_input_hits_cap(self) -> None: + """Test large input hits 500 token cap.""" + result = estimate_summary_tokens(50000) assert result == 500 # capped at 500 - def test_map_reduce_small_input(self) -> None: - """Test MAP_REDUCE with small input uses floor.""" - result = estimate_summary_tokens(100, level=2) + def test_small_input_uses_floor(self) -> None: + """Test small input uses 50 token floor.""" + result = estimate_summary_tokens(100) + assert result == 50 # floor of 50 (100 // 10 = 10, but min is 50) + + def test_very_small_input(self) -> None: + """Test very small input still uses floor.""" + result = estimate_summary_tokens(10) assert result == 50 # floor of 50 From 5c632b839fb3efc00482687bd85931bc4d8119ec Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 3 Dec 2025 20:16:20 -0800 Subject: [PATCH 38/38] chore(summarizer): remove dead code - Remove unused BRIEF_SUMMARY_PROMPT (brief level was removed) - Remove unused timeout field from SummarizerConfig - Update tests and examples accordingly --- agent_cli/summarizer/_prompts.py | 9 --------- agent_cli/summarizer/models.py | 1 - examples/summarizer_demo.py | 1 - tests/summarizer/test_adaptive.py | 2 -- tests/summarizer/test_prompts.py | 8 -------- 5 files changed, 21 deletions(-) diff --git a/agent_cli/summarizer/_prompts.py b/agent_cli/summarizer/_prompts.py index de59f940..e49fd417 100644 --- a/agent_cli/summarizer/_prompts.py +++ b/agent_cli/summarizer/_prompts.py @@ -4,15 +4,6 @@ and are optimized for structured, factual output. """ -# Single sentence summary for short content (used at BRIEF level, 100-500 tokens) -BRIEF_SUMMARY_PROMPT = """Summarize the following in ONE sentence (maximum 20 words). -Focus on the single most important point or takeaway. - -Content: -{content} - -One-sentence summary:""".strip() - # Paragraph summary for "general" content type (default when no specific type provided) GENERAL_SUMMARY_PROMPT = """Summarize the following content concisely in a short paragraph. 
diff --git a/agent_cli/summarizer/models.py b/agent_cli/summarizer/models.py index 65eb42ed..721201da 100644 --- a/agent_cli/summarizer/models.py +++ b/agent_cli/summarizer/models.py @@ -34,7 +34,6 @@ class SummarizerConfig: token_max: int = 3000 # LangChain's default - target size after compression chunk_overlap: int = 200 max_concurrent_chunks: int = 5 - timeout: float = 60.0 def __post_init__(self) -> None: """Normalize the base URL.""" diff --git a/examples/summarizer_demo.py b/examples/summarizer_demo.py index feebc5f2..f5d593a1 100644 --- a/examples/summarizer_demo.py +++ b/examples/summarizer_demo.py @@ -348,7 +348,6 @@ async def run_demo( api_key=api_key, chunk_size=2048, # BOOOOKSCORE default max_concurrent_chunks=3, - timeout=120.0, # Longer timeout for local models ) async with httpx.AsyncClient() as client: diff --git a/tests/summarizer/test_adaptive.py b/tests/summarizer/test_adaptive.py index b7ce45e8..1fbf3d7b 100644 --- a/tests/summarizer/test_adaptive.py +++ b/tests/summarizer/test_adaptive.py @@ -46,12 +46,10 @@ def test_init_with_custom_settings(self) -> None: chunk_size=5000, chunk_overlap=300, max_concurrent_chunks=10, - timeout=120.0, ) assert config.chunk_size == 5000 assert config.chunk_overlap == 300 assert config.max_concurrent_chunks == 10 - assert config.timeout == 120.0 def test_trailing_slash_stripped(self) -> None: """Test that trailing slash is stripped from base URL.""" diff --git a/tests/summarizer/test_prompts.py b/tests/summarizer/test_prompts.py index ef05ebad..825fe077 100644 --- a/tests/summarizer/test_prompts.py +++ b/tests/summarizer/test_prompts.py @@ -3,7 +3,6 @@ from __future__ import annotations from agent_cli.summarizer._prompts import ( - BRIEF_SUMMARY_PROMPT, CHUNK_SUMMARY_PROMPT, CONVERSATION_SUMMARY_PROMPT, DOCUMENT_SUMMARY_PROMPT, @@ -19,13 +18,6 @@ class TestPromptTemplates: """Tests for prompt template structure.""" - def test_brief_prompt_has_content_placeholder(self) -> None: - """Test BRIEF prompt contains content placeholder.""" - assert "{content}" in BRIEF_SUMMARY_PROMPT - # Test it can be formatted - result = BRIEF_SUMMARY_PROMPT.format(content="Test content") - assert "Test content" in result - def test_general_prompt_has_placeholders(self) -> None: """Test GENERAL prompt contains required placeholders.""" assert "{content}" in GENERAL_SUMMARY_PROMPT