promptfoo
diff --git a/‎test/AGENTS.md‎
Lines changed: 16 additions & 1 deletion b/‎test/AGENTS.md‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎test/assertions/python.test.ts‎
Lines changed: 6 additions & 0 deletions b/‎test/assertions/python.test.ts‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎test/commands/modelScan.test.ts‎
Lines changed: 125 additions & 5 deletions b/‎test/commands/modelScan.test.ts‎
Lines changed: 125 additions & 5 deletions
diff --git a/‎test/evaluator.test.ts‎
Lines changed: 12 additions & 0 deletions b/‎test/evaluator.test.ts‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎test/globalConfig/accounts.test.ts‎
Lines changed: 4 additions & 0 deletions b/‎test/globalConfig/accounts.test.ts‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎test/providers/watsonx.test.ts‎
Lines changed: 8 additions & 0 deletions b/‎test/providers/watsonx.test.ts‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎test/redteam/commands/generate.test.ts‎
Lines changed: 21 additions & 0 deletions b/‎test/redteam/commands/generate.test.ts‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎test/redteam/providers/iterative.test.ts‎
Lines changed: 7 additions & 0 deletions b/‎test/redteam/providers/iterative.test.ts‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎test/server/routes/modelAudit.test.ts‎
Lines changed: 9 additions & 3 deletions b/‎test/server/routes/modelAudit.test.ts‎
Lines changed: 9 additions & 3 deletions
@@ -26,7 +26,9 @@ npm run test:integration
 - **NEVER** increase test timeouts - fix the slow test
 - **NEVER** use `.only()` or `.skip()` in committed code
 - **ALWAYS** clean up mocks in `afterEach`
-- **ALWAYS** use `--randomize` to ensure test independence
+- Tests run in **random order by default** (configured in vitest.config.ts)
+  - Use `--sequence.shuffle=false` to disable shuffling when debugging specific failures
+  - Use `--sequence.seed=12345` to reproduce a specific order
 
 ## Writing Tests
 
@@ -67,6 +69,19 @@ axiosMock.post.mockResolvedValue({ data: { result: 'success' } });
 - Mock external dependencies but not the code being tested
 - Reset mocks between tests to prevent test pollution
 
+**Critical: Mock Isolation**
+
+`vi.clearAllMocks()` only clears call history, NOT mock implementations. Use `mockReset()` for full isolation:
+
+```typescript
+beforeEach(() => {
+  vi.clearAllMocks(); // Clears .mock.calls and .mock.results
+  vi.mocked(myMock).mockReset(); // Also resets implementations set via mockReturnValue/mockResolvedValue/mockImplementation
+});
+```
+
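+For `vi.hoisted()` mocks or mocks with `mockReturnValue()`, you MUST call `mockReset()` in `beforeEach` to ensure test isolation when tests run in random order. Because `mockReset()` also discards the mock's implementation, re-apply any default return value or implementation the tests rely on immediately after resetting.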
+
 ## Provider Testing
 
 Every provider needs tests covering:
 
@@ -378,6 +378,9 @@ describe('Python file references', { timeout: 15000 }, () => {
 
   it('should handle when python file assertions throw an error', async () => {
     const output = 'Expected output';
+    // Must mock path.resolve and path.extname to ensure test isolation
+    vi.mocked(path.resolve).mockReturnValue('/path/to/assert.py');
+    vi.mocked(path.extname).mockReturnValue('.py');
     vi.mocked(runPython).mockRejectedValue(
       new Error('The Python script `call_api` function must return a dict with an `output`'),
     );
@@ -437,6 +440,9 @@ describe('Python file references', { timeout: 15000 }, () => {
       ],
     };
 
+    // Must mock path.resolve and path.extname to ensure test isolation
+    vi.mocked(path.resolve).mockReturnValue('/path/to/assert.py');
+    vi.mocked(path.extname).mockReturnValue('.py');
     vi.mocked(runPython).mockResolvedValueOnce(pythonResult as any);
 
     const fileAssertion: Assertion = {
 
@@ -34,12 +34,36 @@ describe('modelScanCommand', () => {
   let program: Command;
   let mockExit: MockInstance;
 
-  beforeEach(() => {
+  beforeEach(async () => {
     program = new Command();
     mockExit = vi.spyOn(process, 'exit').mockImplementation(function () {
       return undefined as never;
     });
     vi.clearAllMocks();
+
+    // vi.clearAllMocks() above only wipes call history; mockReset() also removes implementations and return values left by earlier tests
+    vi.mocked(spawn).mockReset();
+    const { getModelAuditCurrentVersion } = await import('../../src/updates');
+    vi.mocked(getModelAuditCurrentVersion).mockReset();
+    vi.mocked(getModelAuditCurrentVersion).mockResolvedValue('0.2.16');
+
+    // Restore ModelAudit defaults: findByRevision resolves to null (no existing scan) and create resolves to a stub record with a fixed id
+    const ModelAudit = (await import('../../src/models/modelAudit')).default;
+    vi.mocked(ModelAudit.findByRevision).mockReset();
+    vi.mocked(ModelAudit.findByRevision).mockResolvedValue(null);
+    vi.mocked(ModelAudit.create).mockReset();
+    vi.mocked(ModelAudit.create).mockResolvedValue({ id: 'scan-abc-2025-01-01T00:00:00' } as any);
+
+    // Restore HuggingFace helper defaults: isHuggingFaceModel returns false; getHuggingFaceMetadata and parseHuggingFaceModel yield null
+    const { isHuggingFaceModel, getHuggingFaceMetadata, parseHuggingFaceModel } = await import(
+      '../../src/util/huggingfaceMetadata'
+    );
+    vi.mocked(isHuggingFaceModel).mockReset();
+    vi.mocked(isHuggingFaceModel).mockReturnValue(false);
+    vi.mocked(getHuggingFaceMetadata).mockReset();
+    vi.mocked(getHuggingFaceMetadata).mockResolvedValue(null);
+    vi.mocked(parseHuggingFaceModel).mockReset();
+    vi.mocked(parseHuggingFaceModel).mockReturnValue(null);
   });
 
   afterEach(() => {
@@ -299,12 +323,36 @@ describe('Re-scan on version change behavior', () => {
   let program: Command;
   let mockExit: MockInstance;
 
-  beforeEach(() => {
+  beforeEach(async () => {
     program = new Command();
     mockExit = vi.spyOn(process, 'exit').mockImplementation(function () {
       return undefined as never;
     });
     vi.clearAllMocks();
+
+    // vi.clearAllMocks() above only wipes call history; mockReset() also removes implementations and return values left by earlier tests
+    vi.mocked(spawn).mockReset();
+    const { getModelAuditCurrentVersion } = await import('../../src/updates');
+    vi.mocked(getModelAuditCurrentVersion).mockReset();
+    vi.mocked(getModelAuditCurrentVersion).mockResolvedValue('0.2.16');
+
+    // Restore ModelAudit defaults: findByRevision resolves to null (no existing scan) and create resolves to a stub record with a fixed id
+    const ModelAudit = (await import('../../src/models/modelAudit')).default;
+    vi.mocked(ModelAudit.findByRevision).mockReset();
+    vi.mocked(ModelAudit.findByRevision).mockResolvedValue(null);
+    vi.mocked(ModelAudit.create).mockReset();
+    vi.mocked(ModelAudit.create).mockResolvedValue({ id: 'scan-abc-2025-01-01T00:00:00' } as any);
+
+    // Restore HuggingFace helper defaults: isHuggingFaceModel returns false; getHuggingFaceMetadata and parseHuggingFaceModel yield null
+    const { isHuggingFaceModel, getHuggingFaceMetadata, parseHuggingFaceModel } = await import(
+      '../../src/util/huggingfaceMetadata'
+    );
+    vi.mocked(isHuggingFaceModel).mockReset();
+    vi.mocked(isHuggingFaceModel).mockReturnValue(false);
+    vi.mocked(getHuggingFaceMetadata).mockReset();
+    vi.mocked(getHuggingFaceMetadata).mockResolvedValue(null);
+    vi.mocked(parseHuggingFaceModel).mockReset();
+    vi.mocked(parseHuggingFaceModel).mockReturnValue(null);
   });
 
   afterEach(() => {
@@ -495,8 +543,32 @@ describe('Re-scan on version change behavior', () => {
 });
 
 describe('checkModelAuditInstalled', () => {
-  beforeEach(() => {
+  beforeEach(async () => {
     vi.clearAllMocks();
+
+    // vi.clearAllMocks() above only wipes call history; mockReset() also removes implementations and return values left by earlier tests
+    vi.mocked(spawn).mockReset();
+    const { getModelAuditCurrentVersion } = await import('../../src/updates');
+    vi.mocked(getModelAuditCurrentVersion).mockReset();
+    vi.mocked(getModelAuditCurrentVersion).mockResolvedValue('0.2.16');
+
+    // Restore ModelAudit defaults: findByRevision resolves to null (no existing scan) and create resolves to a stub record with a fixed id
+    const ModelAudit = (await import('../../src/models/modelAudit')).default;
+    vi.mocked(ModelAudit.findByRevision).mockReset();
+    vi.mocked(ModelAudit.findByRevision).mockResolvedValue(null);
+    vi.mocked(ModelAudit.create).mockReset();
+    vi.mocked(ModelAudit.create).mockResolvedValue({ id: 'scan-abc-2025-01-01T00:00:00' } as any);
+
+    // Restore HuggingFace helper defaults: isHuggingFaceModel returns false; getHuggingFaceMetadata and parseHuggingFaceModel yield null
+    const { isHuggingFaceModel, getHuggingFaceMetadata, parseHuggingFaceModel } = await import(
+      '../../src/util/huggingfaceMetadata'
+    );
+    vi.mocked(isHuggingFaceModel).mockReset();
+    vi.mocked(isHuggingFaceModel).mockReturnValue(false);
+    vi.mocked(getHuggingFaceMetadata).mockReset();
+    vi.mocked(getHuggingFaceMetadata).mockResolvedValue(null);
+    vi.mocked(parseHuggingFaceModel).mockReset();
+    vi.mocked(parseHuggingFaceModel).mockReturnValue(null);
   });
 
   it('should return installed: true and version when getModelAuditCurrentVersion returns version', async () => {
@@ -541,12 +613,36 @@ describe('Command Options Validation', () => {
   let program: Command;
   let mockExit: MockInstance;
 
-  beforeEach(() => {
+  beforeEach(async () => {
     program = new Command();
     mockExit = vi.spyOn(process, 'exit').mockImplementation(function () {
       return undefined as never;
     });
     vi.clearAllMocks();
+
+    // vi.clearAllMocks() above only wipes call history; mockReset() also removes implementations and return values left by earlier tests
+    vi.mocked(spawn).mockReset();
+    const { getModelAuditCurrentVersion } = await import('../../src/updates');
+    vi.mocked(getModelAuditCurrentVersion).mockReset();
+    vi.mocked(getModelAuditCurrentVersion).mockResolvedValue('0.2.16');
+
+    // Restore ModelAudit defaults: findByRevision resolves to null (no existing scan) and create resolves to a stub record with a fixed id
+    const ModelAudit = (await import('../../src/models/modelAudit')).default;
+    vi.mocked(ModelAudit.findByRevision).mockReset();
+    vi.mocked(ModelAudit.findByRevision).mockResolvedValue(null);
+    vi.mocked(ModelAudit.create).mockReset();
+    vi.mocked(ModelAudit.create).mockResolvedValue({ id: 'scan-abc-2025-01-01T00:00:00' } as any);
+
+    // Restore HuggingFace helper defaults: isHuggingFaceModel returns false; getHuggingFaceMetadata and parseHuggingFaceModel yield null
+    const { isHuggingFaceModel, getHuggingFaceMetadata, parseHuggingFaceModel } = await import(
+      '../../src/util/huggingfaceMetadata'
+    );
+    vi.mocked(isHuggingFaceModel).mockReset();
+    vi.mocked(isHuggingFaceModel).mockReturnValue(false);
+    vi.mocked(getHuggingFaceMetadata).mockReset();
+    vi.mocked(getHuggingFaceMetadata).mockResolvedValue(null);
+    vi.mocked(parseHuggingFaceModel).mockReset();
+    vi.mocked(parseHuggingFaceModel).mockReturnValue(null);
   });
 
   afterEach(() => {
@@ -766,12 +862,36 @@ describe('Temp file JSON output (CLI UI fix)', () => {
   let program: Command;
   let mockExit: MockInstance;
 
-  beforeEach(() => {
+  beforeEach(async () => {
     program = new Command();
     mockExit = vi.spyOn(process, 'exit').mockImplementation(function () {
       return undefined as never;
     });
     vi.clearAllMocks();
+
+    // vi.clearAllMocks() above only wipes call history; mockReset() also removes implementations and return values left by earlier tests
+    vi.mocked(spawn).mockReset();
+    const { getModelAuditCurrentVersion } = await import('../../src/updates');
+    vi.mocked(getModelAuditCurrentVersion).mockReset();
+    vi.mocked(getModelAuditCurrentVersion).mockResolvedValue('0.2.16');
+
+    // Restore ModelAudit defaults: findByRevision resolves to null (no existing scan) and create resolves to a stub record with a fixed id
+    const ModelAudit = (await import('../../src/models/modelAudit')).default;
+    vi.mocked(ModelAudit.findByRevision).mockReset();
+    vi.mocked(ModelAudit.findByRevision).mockResolvedValue(null);
+    vi.mocked(ModelAudit.create).mockReset();
+    vi.mocked(ModelAudit.create).mockResolvedValue({ id: 'scan-abc-2025-01-01T00:00:00' } as any);
+
+    // Restore HuggingFace helper defaults: isHuggingFaceModel returns false; getHuggingFaceMetadata and parseHuggingFaceModel yield null
+    const { isHuggingFaceModel, getHuggingFaceMetadata, parseHuggingFaceModel } = await import(
+      '../../src/util/huggingfaceMetadata'
+    );
+    vi.mocked(isHuggingFaceModel).mockReset();
+    vi.mocked(isHuggingFaceModel).mockReturnValue(false);
+    vi.mocked(getHuggingFaceMetadata).mockReset();
+    vi.mocked(getHuggingFaceMetadata).mockResolvedValue(null);
+    vi.mocked(parseHuggingFaceModel).mockReset();
+    vi.mocked(parseHuggingFaceModel).mockReturnValue(null);
   });
 
   afterEach(() => {
 
@@ -339,6 +339,9 @@ describe('evaluator', () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    // Reset runExtensionHook to default implementation (other tests may have overridden it)
+    vi.mocked(runExtensionHook).mockReset();
+    vi.mocked(runExtensionHook).mockImplementation((_extensions, _hookName, context) => context);
     // Reset cliState for each test to ensure clean state
     cliState.resume = false;
     cliState.basePath = '';
@@ -4032,6 +4035,9 @@ describe('evaluator defaultTest merging', () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    // Drop any runExtensionHook override left by another test, then reinstall the pass-through default that returns the hook context unchanged
+    vi.mocked(runExtensionHook).mockReset();
+    vi.mocked(runExtensionHook).mockImplementation((_extensions, _hookName, context) => context);
   });
 
   it('should merge defaultTest.options.provider with test case options', async () => {
@@ -4144,6 +4150,9 @@ describe('Evaluator with external defaultTest', () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    // Drop any runExtensionHook override left by another test, then reinstall the pass-through default that returns the hook context unchanged
+    vi.mocked(runExtensionHook).mockReset();
+    vi.mocked(runExtensionHook).mockImplementation((_extensions, _hookName, context) => context);
   });
 
   it('should handle string defaultTest gracefully', async () => {
@@ -4364,6 +4373,9 @@ describe('defaultTest normalization for extensions', () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    // Drop any runExtensionHook override left by another test, then reinstall the pass-through default that returns the hook context unchanged
+    vi.mocked(runExtensionHook).mockReset();
+    vi.mocked(runExtensionHook).mockImplementation((_extensions, _hookName, context) => context);
   });
 
   it('should initialize defaultTest when undefined and extensions are present', async () => {
 
@@ -152,6 +152,10 @@ describe('accounts', () => {
 
   describe('setUserEmail', () => {
     it('should write email to global config', () => {
+      // Must mock readGlobalConfig to ensure clean state (no leftover account properties from other tests)
+      vi.mocked(readGlobalConfig).mockReturnValue({
+        id: 'test-id',
+      });
       const email = 'test@example.com';
       setUserEmail(email);
       expect(writeGlobalConfigPartial).toHaveBeenCalledWith({
 
@@ -422,6 +422,14 @@ describe('WatsonXProvider', () => {
         return true;
       });
 
+      // Must mock WatsonXAI.newInstance to ensure test isolation
+      const mockedWatsonXAIClient: Partial<any> = {
+        generateText: vi.fn(),
+      };
+      vi.mocked(WatsonXAI.newInstance).mockImplementation(function () {
+        return mockedWatsonXAIClient as any;
+      });
+
       const provider = new WatsonXProvider(modelName, { config });
       const generateTextSpy = vi.spyOn(await provider.getClient(), 'generateText');
       const response = await provider.callApi(prompt);
 
@@ -1496,6 +1496,27 @@ describe('doGenerateRedteam', () => {
 describe('doGenerateRedteam with external defaultTest', () => {
   beforeEach(() => {
     vi.clearAllMocks();
+
+    // Reset resolveConfigs with default implementation (tests depend on this being set)
+    vi.mocked(configModule.resolveConfigs).mockReset();
+    vi.mocked(configModule.resolveConfigs).mockResolvedValue({
+      basePath: '/mock/path',
+      testSuite: {
+        providers: [
+          {
+            id: () => 'test-provider',
+            callApi: vi.fn().mockResolvedValue({ output: 'test output' }),
+            cleanup: vi.fn().mockResolvedValue(undefined),
+          },
+        ],
+        prompts: [],
+        tests: [],
+      },
+      config: {
+        redteam: {},
+      },
+    });
+
     vi.mocked(fs.existsSync).mockImplementation(function () {
       return false;
     });
 
@@ -47,6 +47,13 @@ describe('RedteamIterativeProvider', () => {
   beforeEach(() => {
     vi.clearAllMocks();
 
+    // Reset hoisted mocks to ensure test isolation
+    // mockReset clears both call history AND mock implementations
+    mockGetProvider.mockReset();
+    mockGetTargetResponse.mockReset();
+    mockCheckPenalizedPhrases.mockReset();
+    mockGetGraderById.mockReset();
+
     mockRedteamProvider = {
       id: vi.fn().mockReturnValue('mock-redteam'),
       callApi: vi
 
@@ -1,5 +1,7 @@
 import { spawn } from 'child_process';
 import fs from 'fs';
+import os from 'os';
+import path from 'path';
 import request from 'supertest';
 import { beforeEach, describe, expect, it, vi } from 'vitest';
 import { createApp } from '../../../src/server/server';
@@ -22,6 +24,10 @@ describe('Model Audit Routes', () => {
 
   beforeEach(() => {
     vi.clearAllMocks();
+    // Reset mock implementations so no test inherits another test's setup when tests run in random order.
+    // vi.clearAllMocks() only clears call history, not mockResolvedValue/mockReturnValue; each test sets the values it needs.
+    mockedCheckModelAuditInstalled.mockReset();
+    mockedSpawn.mockReset();
     app = createApp();
   });
 
@@ -31,7 +37,7 @@ describe('Model Audit Routes', () => {
       mockedCheckModelAuditInstalled.mockResolvedValue({ installed: true, version: '0.2.20' });
 
       // Create a temporary test file
-      const testFilePath = '/tmp/test-model-audit-scan.pkl';
+      const testFilePath = path.join(os.tmpdir(), 'test-model-audit-scan.pkl');
       fs.writeFileSync(testFilePath, 'test data');
 
       const mockScanOutput = JSON.stringify({
@@ -70,7 +76,7 @@ describe('Model Audit Routes', () => {
     it('should handle request with empty options object', async () => {
       mockedCheckModelAuditInstalled.mockResolvedValue({ installed: true, version: '0.2.20' });
 
-      const testFilePath = '/tmp/test-model-audit-scan-2.pkl';
+      const testFilePath = path.join(os.tmpdir(), 'test-model-audit-scan-2.pkl');
       fs.writeFileSync(testFilePath, 'test data');
 
       const mockScanOutput = JSON.stringify({
@@ -120,7 +126,7 @@ describe('Model Audit Routes', () => {
     it('should return 400 when modelaudit is not installed', async () => {
       mockedCheckModelAuditInstalled.mockResolvedValue({ installed: false, version: null });
 
-      const testFilePath = '/tmp/test-model-audit-not-installed.pkl';
+      const testFilePath = path.join(os.tmpdir(), 'test-model-audit-not-installed.pkl');
       fs.writeFileSync(testFilePath, 'test data');
 
       const response = await request(app)