From 65fcb5ed629f0ca1dfa1bd060a084786e52b47a2 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Wed, 7 Jan 2026 10:22:53 -0800 Subject: [PATCH 01/13] Additional tests cleanup and skipping when running on github actions. --- test/backends/test_huggingface.py | 7 +++ test/backends/test_huggingface_tools.py | 7 +++ test/backends/test_vllm.py | 6 +++ test/backends/test_vllm_tools.py | 6 +++ test/backends/test_watsonx.py | 6 +++ test/conftest.py | 49 +++++++++++++++++++++ test/stdlib_intrinsics/test_rag/test_rag.py | 6 +++ 7 files changed, 87 insertions(+) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 328b6068..7be4e552 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -1,6 +1,7 @@ import asyncio from copy import copy import faulthandler +import os import random import time from typing import Any, Coroutine @@ -11,6 +12,12 @@ import torch from typing_extensions import Annotated +# Skip entire module in CI since 17/18 tests are qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping HuggingFace tests in CI - mostly qualitative tests", +) + from mellea import MelleaSession from mellea.backends.adapters.adapter import GraniteCommonAdapter from mellea.backends.cache import SimpleLRUCache diff --git a/test/backends/test_huggingface_tools.py b/test/backends/test_huggingface_tools.py index 0df5f3dc..5fca6d10 100644 --- a/test/backends/test_huggingface_tools.py +++ b/test/backends/test_huggingface_tools.py @@ -1,7 +1,14 @@ +import os import pydantic import pytest from typing_extensions import Annotated +# Skip entire module in CI since the single test is qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping HuggingFace tools tests in CI - qualitative test", +) + import mellea.backends.model_ids as model_ids from mellea import MelleaSession from mellea.backends.cache import SimpleLRUCache diff --git a/test/backends/test_vllm.py b/test/backends/test_vllm.py index cfcda8c2..c396b916 100644 --- a/test/backends/test_vllm.py +++ b/test/backends/test_vllm.py @@ -4,6 +4,12 @@ import pytest from typing_extensions import Annotated +# Skip entire module in CI since all 8 tests are qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping vLLM tests in CI - all qualitative tests", +) + from mellea import MelleaSession from mellea.backends.vllm import LocalVLLMBackend from mellea.backends.types import ModelOption diff --git a/test/backends/test_vllm_tools.py b/test/backends/test_vllm_tools.py index 69c824b2..76101754 100644 --- a/test/backends/test_vllm_tools.py +++ b/test/backends/test_vllm_tools.py @@ -3,6 +3,12 @@ import pytest from typing_extensions import Annotated +# Skip entire module in CI since the single test is qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping vLLM tools tests in CI - qualitative test", +) + from mellea import MelleaSession from mellea.backends.vllm import LocalVLLMBackend from mellea.backends.types import ModelOption diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index 08615973..9a17ea97 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -5,6 +5,12 @@ import pydantic import pytest +# Skip entire module in CI since 8/9 tests are qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping Watsonx tests 
in CI - mostly qualitative tests", +) + from mellea import MelleaSession from mellea.backends.formatter import TemplateFormatter from mellea.backends.types import ModelOption diff --git a/test/conftest.py b/test/conftest.py index 4b799d50..19f713a3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,3 +1,4 @@ +import gc import os import pytest @@ -9,6 +10,7 @@ def gh_run() -> int: def pytest_runtest_setup(item): + """Skip qualitative tests when running in CI environment.""" # Runs tests *not* marked with `@pytest.mark.qualitative` to run normally. if not item.get_closest_marker("qualitative"): return @@ -19,3 +21,50 @@ def pytest_runtest_setup(item): pytest.skip( reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." ) + + +@pytest.fixture(autouse=True, scope="function") +def aggressive_cleanup(): + """Aggressive memory cleanup after each test to prevent OOM on CI runners.""" + yield + # Only run aggressive cleanup in CI where memory is constrained + if int(os.environ.get("CICD", 0)) != 1: + return + + # Cleanup after each test + gc.collect() + gc.collect() + + # If torch is available, clear CUDA cache + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + except ImportError: + pass + + +@pytest.fixture(autouse=True, scope="module") +def cleanup_module_fixtures(): + """Cleanup module-scoped fixtures to free memory between test modules.""" + yield + # Only run aggressive cleanup in CI where memory is constrained + if int(os.environ.get("CICD", 0)) != 1: + return + + # Cleanup after module + gc.collect() + gc.collect() + gc.collect() + + # If torch is available, clear CUDA cache + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + except ImportError: + pass diff --git a/test/stdlib_intrinsics/test_rag/test_rag.py b/test/stdlib_intrinsics/test_rag/test_rag.py index 47b13e02..66fb833f 100644 --- a/test/stdlib_intrinsics/test_rag/test_rag.py +++ b/test/stdlib_intrinsics/test_rag/test_rag.py @@ -13,6 +13,12 @@ from mellea.stdlib.chat import Message from mellea.stdlib.intrinsics import rag +# Skip entire module in CI since all 7 tests are qualitative +pytestmark = pytest.mark.skipif( + int(os.environ.get("CICD", 0)) == 1, + reason="Skipping RAG tests in CI - all qualitative tests", +) + DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" """Location of data files for the tests in this file.""" From f22afe6f24a7c7b983cfe1002de0085938db9524 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Thu, 8 Jan 2026 11:08:18 -0800 Subject: [PATCH 02/13] moving all tests to IBM_GRANITE_4_MICRO_3B --- .github/workflows/quality.yml | 17 +++------ test/stdlib_basics/test_functional.py | 15 +------- test/stdlib_basics/test_majority_voting.py | 17 +++------ test/stdlib_basics/test_model_output_thunk.py | 14 +------ test/stdlib_basics/test_sampling_ctx.py | 3 +- test/stdlib_basics/test_session.py | 37 +++---------------- test/stdlib_basics/test_vision_ollama.py | 10 ++--- test/stdlib_basics/test_vision_openai.py | 7 ++-- 8 files changed, 30 insertions(+), 90 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 75777260..776e09e7 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -3,10 +3,9 @@ name: Verify Code Quality on: workflow_call: - concurrency: - group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || 
github.ref_name }} - cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} + cancel-in-progress: true env: CICD: 1 @@ -15,10 +14,10 @@ env: jobs: quality: runs-on: ubuntu-latest - timeout-minutes: 90 # TODO: need to reduce this after we figure out our testing strategy. + timeout-minutes: 90 # TODO: need to reduce this after we figure out our testing strategy. strategy: matrix: - python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. + python-version: ["3.10", "3.11", "3.12"] # Need to add 3.13 once we resolve outlines issues. steps: - uses: actions/checkout@v4 - name: Free disk space @@ -39,18 +38,14 @@ jobs: - name: Check style and run tests run: pre-commit run --all-files - name: Send failure message pre-commit - if: failure() # This step will only run if a previous step failed + if: failure() # This step will only run if a previous step failed run: echo "The quality verification failed. Please run precommit " - name: Install Ollama run: curl -fsSL https://ollama.com/install.sh | sh - name: Start serving ollama run: nohup ollama serve & - - name: Pull Llama 3.2:1b model - run: ollama pull llama3.2:1b - - name: Run Tests run: uv run -m pytest -v test - name: Send failure message tests - if: failure() # This step will only run if a previous step failed + if: failure() # This step will only run if a previous step failed run: echo "Tests failed. Please verify that tests are working locally." - diff --git a/test/stdlib_basics/test_functional.py b/test/stdlib_basics/test_functional.py index 4dbfb9e0..86bf9c81 100644 --- a/test/stdlib_basics/test_functional.py +++ b/test/stdlib_basics/test_functional.py @@ -3,25 +3,14 @@ from mellea.backends.types import ModelOption from mellea.stdlib.base import ModelOutputThunk from mellea.stdlib.chat import Message -from mellea.stdlib.functional import instruct, aact, avalidate, ainstruct +from mellea.stdlib.functional import aact, ainstruct, avalidate, instruct from mellea.stdlib.requirement import req from mellea.stdlib.session import start_session @pytest.fixture(scope="module") def m_session(gh_run): - if gh_run == 1: - m = start_session( - "ollama", - model_id="llama3.2:1b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) - else: - m = start_session( - "ollama", - model_id="granite3.3:8b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) + m = start_session(model_options={ModelOption.MAX_NEW_TOKENS: 5}) yield m del m diff --git a/test/stdlib_basics/test_majority_voting.py b/test/stdlib_basics/test_majority_voting.py index 56cc3389..eeec7b7c 100644 --- a/test/stdlib_basics/test_majority_voting.py +++ b/test/stdlib_basics/test_majority_voting.py @@ -1,25 +1,18 @@ +import pytest + +from mellea import MelleaSession, start_session from mellea.backends import ModelOption -from mellea import start_session, MelleaSession from mellea.stdlib.requirement import check, req, simple_validate from mellea.stdlib.sampling.majority_voting import ( - MBRDRougeLStrategy, MajorityVotingStrategyForMath, + MBRDRougeLStrategy, ) -import pytest - from mellea.stdlib.sampling.types import SamplingResult @pytest.fixture(scope="module") def m_session(gh_run): - if gh_run == 1: - m = start_session( - "ollama", - model_id="llama3.2:1b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) - else: - m = start_session("ollama", model_id="llama3.2:1b") + m = start_session(model_options={ModelOption.MAX_NEW_TOKENS: 5}) yield m del m diff 
--git a/test/stdlib_basics/test_model_output_thunk.py b/test/stdlib_basics/test_model_output_thunk.py index 6f562812..8878c4b2 100644 --- a/test/stdlib_basics/test_model_output_thunk.py +++ b/test/stdlib_basics/test_model_output_thunk.py @@ -1,4 +1,5 @@ import copy + import pytest from mellea.backends.types import ModelOption @@ -10,18 +11,7 @@ # backend, but it simplifies test setup. @pytest.fixture(scope="module") def m_session(gh_run): - if gh_run == 1: - m = start_session( - "ollama", - model_id="llama3.2:1b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) - else: - m = start_session( - "ollama", - model_id="granite3.3:8b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) + m = start_session(model_options={ModelOption.MAX_NEW_TOKENS: 5}) yield m del m diff --git a/test/stdlib_basics/test_sampling_ctx.py b/test/stdlib_basics/test_sampling_ctx.py index 362730d6..496d3689 100644 --- a/test/stdlib_basics/test_sampling_ctx.py +++ b/test/stdlib_basics/test_sampling_ctx.py @@ -1,7 +1,8 @@ import pytest + from mellea import start_session from mellea.backends import ModelOption -from mellea.stdlib.base import ChatContext, ModelOutputThunk, Context +from mellea.stdlib.base import ChatContext, Context, ModelOutputThunk from mellea.stdlib.requirement import Requirement from mellea.stdlib.sampling import ( MultiTurnStrategy, diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 6694246c..beb4fc78 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -7,24 +7,13 @@ from mellea.backends.types import ModelOption from mellea.stdlib.base import ChatContext, ModelOutputThunk from mellea.stdlib.chat import Message -from mellea.stdlib.session import start_session, MelleaSession +from mellea.stdlib.session import MelleaSession, start_session # We edit the context type in the async tests below. Don't change the scope here. 
-@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def m_session(gh_run): - if gh_run == 1: - m = start_session( - "ollama", - model_id="llama3.2:1b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) - else: - m = start_session( - "ollama", - model_id="granite3.3:8b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) + m = start_session(model_options={ModelOption.MAX_NEW_TOKENS: 5}) yield m del m @@ -39,23 +28,9 @@ def test_start_session_watsonx(gh_run): assert response.value is not None -def test_start_session_openai_with_kwargs(gh_run): - if gh_run == 1: - m = start_session( - "openai", - model_id="llama3.2:1b", - base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", - api_key="ollama", - ) - else: - m = start_session( - "openai", - model_id="granite3.3:8b", - base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", - api_key="ollama", - ) - initial_ctx = m.ctx - response = m.instruct("testing") +def test_start_session_openai_with_kwargs(m_session): + initial_ctx = m_session.ctx + response = m_session.instruct("testing") assert isinstance(response, ModelOutputThunk) assert response.value is not None assert initial_ctx is not m.ctx diff --git a/test/stdlib_basics/test_vision_ollama.py b/test/stdlib_basics/test_vision_ollama.py index eae4e87b..1d7b5caf 100644 --- a/test/stdlib_basics/test_vision_ollama.py +++ b/test/stdlib_basics/test_vision_ollama.py @@ -3,10 +3,10 @@ from io import BytesIO import numpy as np -from PIL import Image import pytest +from PIL import Image -from mellea import start_session, MelleaSession +from mellea import MelleaSession, start_session from mellea.backends import ModelOption from mellea.stdlib.base import ImageBlock, ModelOutputThunk from mellea.stdlib.chat import Message @@ -16,11 +16,7 @@ @pytest.fixture(scope="module") def m_session(gh_run): if gh_run == 1: - m = start_session( - "ollama", - model_id="llama3.2:1b", - model_options={ModelOption.MAX_NEW_TOKENS: 5}, - ) + m = start_session(model_options={ModelOption.MAX_NEW_TOKENS: 5}) else: m = start_session( "ollama", diff --git a/test/stdlib_basics/test_vision_openai.py b/test/stdlib_basics/test_vision_openai.py index c922acd5..22f3e73b 100644 --- a/test/stdlib_basics/test_vision_openai.py +++ b/test/stdlib_basics/test_vision_openai.py @@ -3,11 +3,12 @@ from io import BytesIO import numpy as np -from PIL import Image import pytest +from PIL import Image -from mellea import start_session, MelleaSession +from mellea import MelleaSession, start_session from mellea.backends import ModelOption +from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B from mellea.stdlib.base import ImageBlock, ModelOutputThunk from mellea.stdlib.chat import Message from mellea.stdlib.instruction import Instruction @@ -18,7 +19,7 @@ def m_session(gh_run): if gh_run == 1: m = start_session( "openai", - model_id="llama3.2:1b", + model_id=IBM_GRANITE_4_MICRO_3B, base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", model_options={ModelOption.MAX_NEW_TOKENS: 5}, From 8b328a0c77123f9449bf6461754bad2a220f328e Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 9 Jan 2026 09:17:49 -0800 Subject: [PATCH 03/13] changing granite4 micro ollama name to latest --- mellea/backends/model_ids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 3ffdb4b8..c7bb277e 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py @@ -28,7 +28,7 
@@ class ModelIdentifier: IBM_GRANITE_4_MICRO_3B = ModelIdentifier( hf_model_name="ibm-granite/granite-4.0-micro", - ollama_name="ibm/granite4:micro", + ollama_name="granite4:micro", watsonx_name="ibm/granite-4-h-small", ) # todo: watsonx model is different from ollama model - should be same. From 8431f28eae5d7e0c99b146afff0533cd64f1e108 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 9 Jan 2026 09:42:10 -0800 Subject: [PATCH 04/13] Adding step to download granite micro --- .github/workflows/quality.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 776e09e7..ad82d084 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -44,6 +44,8 @@ jobs: run: curl -fsSL https://ollama.com/install.sh | sh - name: Start serving ollama run: nohup ollama serve & + - name: Pull model granite4:micro + run: ollama pull granite4:micro - name: Run Tests run: uv run -m pytest -v test - name: Send failure message tests From 2f995253f6cbe75b777d617e9e1fc1a4181189df Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 9 Jan 2026 09:42:58 -0800 Subject: [PATCH 05/13] Minor changes to make tests run --- mellea/backends/model_ids.py | 1 + test/stdlib_basics/test_session.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index c7bb277e..7f68165f 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py @@ -29,6 +29,7 @@ class ModelIdentifier: IBM_GRANITE_4_MICRO_3B = ModelIdentifier( hf_model_name="ibm-granite/granite-4.0-micro", ollama_name="granite4:micro", + openai_name="granite4:micro", # setting this just for testing purposes. watsonx_name="ibm/granite-4-h-small", ) # todo: watsonx model is different from ollama model - should be same. 
diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index beb4fc78..ea493318 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -33,7 +33,7 @@ def test_start_session_openai_with_kwargs(m_session): response = m_session.instruct("testing") assert isinstance(response, ModelOutputThunk) assert response.value is not None - assert initial_ctx is not m.ctx + assert initial_ctx is not m_session.ctx async def test_aact(m_session): From 3823a22f40e09c40448d127152306f767131ea2f Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 9 Jan 2026 10:25:47 -0800 Subject: [PATCH 06/13] Fixing some wayward tests still in llama --- test/backends/test_openai_ollama.py | 22 ++++++------------- test/stdlib_basics/test_contextual_session.py | 7 ++---- test/stdlib_basics/test_genslot.py | 8 ++++--- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 57ca3281..8b3b7c06 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -10,7 +10,7 @@ from mellea import MelleaSession from mellea.backends.formatter import TemplateFormatter -from mellea.backends.model_ids import META_LLAMA_3_2_1B +from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B from mellea.backends.openai import OpenAIBackend from mellea.backends.types import ModelOption from mellea.stdlib.base import CBlock, ChatContext, ModelOutputThunk, SimpleContext @@ -19,20 +19,12 @@ @pytest.fixture(scope="module") def backend(gh_run: int): """Shared OpenAI backend configured for Ollama.""" - if gh_run == 1: - return OpenAIBackend( - model_id=META_LLAMA_3_2_1B.ollama_name, # type: ignore - formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B.hf_model_name), # type: ignore - base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", - api_key="ollama", - ) - else: - return OpenAIBackend( - model_id="granite3.3:8b", - formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), - base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", - api_key="ollama", - ) + return OpenAIBackend( + model_id=IBM_GRANITE_4_MICRO_3B.ollama_name, # type: ignore + formatter=TemplateFormatter(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name), # type: ignore + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", + api_key="ollama", + ) @pytest.fixture(scope="function") diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index a401f117..98a8dc2e 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -4,7 +4,7 @@ # import pytest # # from mellea import chat, generative, instruct, query, start_session, transform, validate -# from mellea.backends.model_ids import IBM_GRANITE_3_3_8B, META_LLAMA_3_2_1B +# from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B # from mellea.stdlib.base import ModelOutputThunk # from mellea.stdlib.mify import MifiedProtocol, mify # from mellea.stdlib.requirement import req @@ -13,10 +13,7 @@ # # @pytest.fixture(scope="module") # def model_id(gh_run: int): -# if gh_run == 1: -# return META_LLAMA_3_2_1B -# else: -# return IBM_GRANITE_3_3_8B +# return IBM_GRANITE_4_MICRO_3B # # # @generative diff --git a/test/stdlib_basics/test_genslot.py b/test/stdlib_basics/test_genslot.py index e7e0bfb3..984a8140 100644 --- a/test/stdlib_basics/test_genslot.py +++ 
b/test/stdlib_basics/test_genslot.py
@@ -1,8 +1,10 @@
 import asyncio
-import pytest
 from typing import Literal
+
+import pytest
+
 from mellea import generative, start_session
-from mellea.backends.model_ids import META_LLAMA_3_2_1B
+from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B
 from mellea.backends.ollama import OllamaModelBackend
 from mellea.stdlib.base import ChatContext, Context
 from mellea.stdlib.genslot import (
@@ -21,7 +23,7 @@ def backend(gh_run: int):
     """Shared backend."""
     if gh_run == 1:
         return OllamaModelBackend(
-            model_id=META_LLAMA_3_2_1B.ollama_name  # type: ignore
+            model_id=IBM_GRANITE_4_MICRO_3B.ollama_name  # type: ignore
         )
     else:
         return OllamaModelBackend(model_id="granite3.3:8b")

From fee236fd7d738c89085bd25e42b5712831d7d1f3 Mon Sep 17 00:00:00 2001
From: Avinash Balakrishnan
Date: Mon, 12 Jan 2026 08:39:48 -0800
Subject: [PATCH 07/13] DRYing conftest

---
 test/conftest.py | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/test/conftest.py b/test/conftest.py
index 19f713a3..10c96e74 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -23,48 +23,36 @@ def pytest_runtest_setup(item):
     )
 
 
-@pytest.fixture(autouse=True, scope="function")
-def aggressive_cleanup():
-    """Aggressive memory cleanup after each test to prevent OOM on CI runners."""
+def memory_cleaner():
+    """Yield once, then aggressively free memory; use via `yield from` in fixtures."""
     yield
     # Only run aggressive cleanup in CI where memory is constrained
    if int(os.environ.get("CICD", 0)) != 1:
         return
 
-    # Cleanup after each test
+    # Multiple GC passes to break up reference cycles
+    gc.collect()
     gc.collect()
     gc.collect()
 
     # If torch is available, clear CUDA cache
     try:
         import torch
 
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
     except ImportError:
         pass
 
 
+@pytest.fixture(autouse=True, scope="function")
+def aggressive_cleanup():
+    """Aggressive memory cleanup after each test to prevent OOM on CI runners."""
+    yield from memory_cleaner()  # drive the generator so its teardown actually runs
+
+
 @pytest.fixture(autouse=True, scope="module")
 def cleanup_module_fixtures():
     """Cleanup module-scoped fixtures to free memory between test modules."""
-    yield
-    # Only run aggressive cleanup in CI where memory is constrained
-    if int(os.environ.get("CICD", 0)) != 1:
-        return
-
-    # Cleanup after module
-    gc.collect()
-    gc.collect()
-    gc.collect()
-
-    # If torch is available, clear CUDA cache
-    try:
-        import torch
-
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-    except ImportError:
-        pass
+    yield from memory_cleaner()

From 4d85f1711e9dd5acad02f1d61bc7566dc75c5810 Mon Sep 17 00:00:00 2001
From: Avinash Balakrishnan
Date: Thu, 15 Jan 2026 13:25:26 -0800
Subject: [PATCH 08/13] Adding metadata to notebooks to help with automated testing.
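
Each tagged setup cell now carries a `skip-execution` tag in its cell
metadata, e.g. (taken from the diffs below):

    "metadata": {
        "id": "VDaTfltQY3Fl",
        "tags": ["skip-execution"]
    }

nbclient-based runners such as nbmake honor this tag, so cells that set up
the environment can be skipped during automated notebook runs.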
--- .../compositionality_with_generative_slots.ipynb | 10 ++++++++-- docs/examples/notebooks/context_example.ipynb | 10 ++++++++-- docs/examples/notebooks/document_mobject.ipynb | 10 ++++++++-- docs/examples/notebooks/example.ipynb | 10 ++++++++-- docs/examples/notebooks/georgia_tech.ipynb | 15 ++++++++++----- .../notebooks/instruct_validate_repair.ipynb | 10 ++++++++-- docs/examples/notebooks/m_serve_example.ipynb | 10 ++++++++-- docs/examples/notebooks/mcp_example.ipynb | 10 ++++++++-- .../notebooks/model_options_example.ipynb | 10 ++++++++-- .../examples/notebooks/sentiment_classifier.ipynb | 10 ++++++++-- docs/examples/notebooks/simple_email.ipynb | 10 ++++++++-- docs/examples/notebooks/table_mobject.ipynb | 10 ++++++++-- 12 files changed, 98 insertions(+), 27 deletions(-) diff --git a/docs/examples/notebooks/compositionality_with_generative_slots.ipynb b/docs/examples/notebooks/compositionality_with_generative_slots.ipynb index 6f7f4ed4..478a5332 100644 --- a/docs/examples/notebooks/compositionality_with_generative_slots.ipynb +++ b/docs/examples/notebooks/compositionality_with_generative_slots.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/context_example.ipynb b/docs/examples/notebooks/context_example.ipynb index ec5d03fa..1c0d3ef5 100644 --- a/docs/examples/notebooks/context_example.ipynb +++ b/docs/examples/notebooks/context_example.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/document_mobject.ipynb b/docs/examples/notebooks/document_mobject.ipynb index 55c7a2b7..8846f841 100644 --- a/docs/examples/notebooks/document_mobject.ipynb +++ b/docs/examples/notebooks/document_mobject.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/example.ipynb b/docs/examples/notebooks/example.ipynb index 21877e45..275de1ce 100644 --- a/docs/examples/notebooks/example.ipynb +++ b/docs/examples/notebooks/example.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/georgia_tech.ipynb b/docs/examples/notebooks/georgia_tech.ipynb index 3b349881..08422fb4 100644 --- a/docs/examples/notebooks/georgia_tech.ipynb +++ 
b/docs/examples/notebooks/georgia_tech.ipynb @@ -28,7 +28,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "6fDEbLHL_hkK" + "id": "6fDEbLHL_hkK", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -134,14 +137,14 @@ " strategy=RejectionSamplingStrategy(loop_budget=5),\n", " user_variables={\"name\": name, \"notes\": notes},\n", " return_sampling_results=True,\n", - " )\n", + " ) # type: ignore\n", " if email_candidate.success:\n", " return str(email_candidate.result)\n", " else:\n", " return email_candidate.sample_generations[0].value\n", "\n", "\n", - "m = mellea_org.start_session()\n", + "m = mellea.start_session()\n", "print(\n", " write_email(\n", " m,\n", @@ -556,11 +559,13 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "mellea-public", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "name": "python", + "version": "3.12.10" } }, "nbformat": 4, diff --git a/docs/examples/notebooks/instruct_validate_repair.ipynb b/docs/examples/notebooks/instruct_validate_repair.ipynb index 14896c2b..7144d539 100644 --- a/docs/examples/notebooks/instruct_validate_repair.ipynb +++ b/docs/examples/notebooks/instruct_validate_repair.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/m_serve_example.ipynb b/docs/examples/notebooks/m_serve_example.ipynb index 871349f7..729b75bf 100644 --- a/docs/examples/notebooks/m_serve_example.ipynb +++ b/docs/examples/notebooks/m_serve_example.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/mcp_example.ipynb b/docs/examples/notebooks/mcp_example.ipynb index 50c6233b..565c128d 100644 --- a/docs/examples/notebooks/mcp_example.ipynb +++ b/docs/examples/notebooks/mcp_example.ipynb @@ -26,7 +26,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -58,7 +61,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/model_options_example.ipynb b/docs/examples/notebooks/model_options_example.ipynb index a706c05a..0216010c 100644 --- a/docs/examples/notebooks/model_options_example.ipynb +++ b/docs/examples/notebooks/model_options_example.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git 
a/docs/examples/notebooks/sentiment_classifier.ipynb b/docs/examples/notebooks/sentiment_classifier.ipynb index e1cd70bd..dc2dec4d 100644 --- a/docs/examples/notebooks/sentiment_classifier.ipynb +++ b/docs/examples/notebooks/sentiment_classifier.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/simple_email.ipynb b/docs/examples/notebooks/simple_email.ipynb index f80f1663..3662fcb5 100644 --- a/docs/examples/notebooks/simple_email.ipynb +++ b/docs/examples/notebooks/simple_email.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ diff --git a/docs/examples/notebooks/table_mobject.ipynb b/docs/examples/notebooks/table_mobject.ipynb index 94289994..bf963f46 100644 --- a/docs/examples/notebooks/table_mobject.ipynb +++ b/docs/examples/notebooks/table_mobject.ipynb @@ -25,7 +25,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "VDaTfltQY3Fl" + "id": "VDaTfltQY3Fl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ @@ -56,7 +59,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "9EurAUSz_1yl" + "id": "9EurAUSz_1yl", + "tags": [ + "skip-execution" + ] }, "outputs": [], "source": [ From 2a7c7cc35866ac33f892505f262963a3a6f56f82 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Thu, 15 Jan 2026 13:45:36 -0800 Subject: [PATCH 09/13] enabling notebooks to run with pytest docs --- docs/examples/conftest.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index 2fde57e2..2b1b4ea5 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -6,6 +6,18 @@ import pytest +# Enable nbmake for notebook testing when running pytest in this directory. +# This allows `pytest docs/` to automatically run notebooks via nbmake. 
+pytest_plugins = ["nbmake"] + + +def pytest_configure(config): + """Configure nbmake to run notebooks in docs/examples/notebooks/.""" + # Only enable nbmake if we're collecting from docs directory + if hasattr(config.option, "nbmake"): + config.option.nbmake = True + + examples_to_skip = { "101_example.py", "__init__.py", @@ -43,14 +55,6 @@ def pytest_collect_file(parent: pytest.Dir, file_path: pathlib.PosixPath): return ExampleFile.from_parent(parent, path=file_path) - # TODO: Support running jupyter notebooks: - # - use nbmake or directly use nbclient as documented below - # - install the nbclient package - # - run either using python api or jupyter execute - # - must replace background processes - # if file_path.suffix == ".ipynb": - # return ExampleFile.from_parent(parent, path=file_path) - class ExampleFile(pytest.File): def collect(self): From 9a73fa3e7ca9c249cfbe7031b5730ef1298ab5ee Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Thu, 15 Jan 2026 13:48:35 -0800 Subject: [PATCH 10/13] reverting cause plugins can only be initiated at top level --- docs/examples/conftest.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/docs/examples/conftest.py b/docs/examples/conftest.py index 2b1b4ea5..bef7dce6 100644 --- a/docs/examples/conftest.py +++ b/docs/examples/conftest.py @@ -1,4 +1,7 @@ -"""Allows you to use `pytest docs` to run the examples.""" +"""Allows you to use `pytest docs` to run the examples. + +To run notebooks, use: uv run --with 'mcp' pytest --nbmake docs/examples/notebooks/ +""" import pathlib import subprocess @@ -6,18 +9,6 @@ import pytest -# Enable nbmake for notebook testing when running pytest in this directory. -# This allows `pytest docs/` to automatically run notebooks via nbmake. 
-pytest_plugins = ["nbmake"] - - -def pytest_configure(config): - """Configure nbmake to run notebooks in docs/examples/notebooks/.""" - # Only enable nbmake if we're collecting from docs directory - if hasattr(config.option, "nbmake"): - config.option.nbmake = True - - examples_to_skip = { "101_example.py", "__init__.py", From 419fcd0b359d457b9e933fddf093373f8639cd7a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 16 Jan 2026 10:29:44 -0800 Subject: [PATCH 11/13] updating dev with nbmake --- pyproject.toml | 1 + uv.lock | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 913258b4..2431f6b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,7 @@ dev = [ "pytest-asyncio", "mypy>=1.17.0", "python-semantic-release~=7.32", + "nbmake>=1.5.5", ] notebook = [ diff --git a/uv.lock b/uv.lock index b49e0bd1..8d1f179a 100644 --- a/uv.lock +++ b/uv.lock @@ -3284,6 +3284,7 @@ watsonx = [ dev = [ { name = "isort" }, { name = "mypy" }, + { name = "nbmake" }, { name = "pdm" }, { name = "pre-commit" }, { name = "pylint" }, @@ -3350,6 +3351,7 @@ provides-extras = ["hf", "vllm", "litellm", "watsonx", "docling", "all"] dev = [ { name = "isort", specifier = ">=6.0.0" }, { name = "mypy", specifier = ">=1.17.0" }, + { name = "nbmake", specifier = ">=1.5.5" }, { name = "pdm", specifier = ">=2.24.0" }, { name = "pre-commit", specifier = ">=4.2.0" }, { name = "pylint", specifier = ">=3.3.4" }, @@ -3818,6 +3820,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, ] +[[package]] +name = "nbmake" +version = "1.5.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ipykernel" }, + { name = "nbclient" }, + { name = "nbformat" }, + { name = "pygments" }, + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/9a/aae201cee5639e1d562b3843af8fd9f8d018bb323e776a2b973bdd5fc64b/nbmake-1.5.5.tar.gz", hash = "sha256:239dc868ea13a7c049746e2aba2c229bd0f6cdbc6bfa1d22f4c88638aa4c5f5c", size = 85929, upload-time = "2024-12-23T18:33:46.774Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/be/b257e12f9710819fde40adc972578bee6b72c5992da1bc8369bef2597756/nbmake-1.5.5-py3-none-any.whl", hash = "sha256:c6fbe6e48b60cacac14af40b38bf338a3b88f47f085c54ac5b8639ff0babaf4b", size = 12818, upload-time = "2024-12-23T18:33:44.566Z" }, +] + [[package]] name = "nest-asyncio" version = "1.6.0" From 8836f7ae8ededb7cd388bf0977d5b02e75377619 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 16 Jan 2026 10:52:16 -0800 Subject: [PATCH 12/13] removing openai name from granite micro --- mellea/backends/model_ids.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 9b585aea..6fd67fe2 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py @@ -28,7 +28,6 @@ class ModelIdentifier: IBM_GRANITE_4_MICRO_3B = ModelIdentifier( hf_model_name="ibm-granite/granite-4.0-micro", ollama_name="granite4:micro", - openai_name="granite4:micro", # setting this just for testing purposes. watsonx_name="ibm/granite-4-h-small", ) # todo: watsonx model is different from ollama model - should be same. 
From 956ba93442b889669389f78791dd04f436d2b398 Mon Sep 17 00:00:00 2001
From: Avinash Balakrishnan
Date: Fri, 16 Jan 2026 11:23:12 -0800
Subject: [PATCH 13/13] passing ollama_name string for granite micro in
 test_vision_openai

---
 test/backends/test_vision_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py
index dbe17b9c..9c958efe 100644
--- a/test/backends/test_vision_openai.py
+++ b/test/backends/test_vision_openai.py
@@ -18,7 +18,7 @@ def m_session(gh_run):
     if gh_run == 1:
         m = start_session(
             "openai",
-            model_id=IBM_GRANITE_4_MICRO_3B,
+            model_id=IBM_GRANITE_4_MICRO_3B.ollama_name,  # type: ignore
             base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1",
             api_key="ollama",
             model_options={ModelOption.MAX_NEW_TOKENS: 5},