Skip to content

Commit 8da0e83

Browse files
committed
feat(satellite): implement immediate recovery for connection errors
1 parent 71a68d2 commit 8da0e83

File tree

3 files changed

+215
-1
lines changed

3 files changed

+215
-1
lines changed

services/satellite/src/core/mcp-server-wrapper.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1227,11 +1227,23 @@ export class McpServerWrapper {
12271227
status_message: message
12281228
}, `Emitted status change to backend: ${status}`);
12291229
}
1230+
1231+
// Attempt immediate recovery for connection errors (not auth errors which need user intervention)
1232+
if (status === 'offline' || status === 'error') {
1233+
this.handleServerRecovery(serverName, serverSlug, config).catch(err => {
1234+
this.logger.debug({
1235+
operation: 'immediate_recovery_attempt_failed',
1236+
server_slug: serverSlug,
1237+
error: err instanceof Error ? err.message : String(err)
1238+
}, 'Immediate recovery attempt failed - will retry on next tool call or backend health check');
1239+
});
1240+
}
12301241
}
12311242

12321243
/**
12331244
* Handle server recovery - trigger tool re-discovery
1234-
* Called when tool succeeds but server was previously offline/error
1245+
* Called when tool succeeds but server was previously offline/error,
1246+
* OR immediately after tool failure (immediate recovery attempt)
12351247
*/
12361248
private async handleServerRecovery(
12371249
serverName: string,

services/satellite/src/server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,9 @@ export async function createServer() {
536536
// Set UnifiedToolDiscoveryManager for disabled tools tracking
537537
commandProcessor.setUnifiedToolDiscoveryManager(toolDiscoveryManager);
538538

539+
// Set RemoteToolDiscoveryManager for mcp_recovery command handling
540+
commandProcessor.setRemoteToolDiscoveryManager(remoteToolDiscoveryManager);
541+
539542
// Wire up backend status tracking callbacks for CommandProcessor and ProcessManager
540543
commandProcessor.setBackendStatusCallback(backendStatusCallback);
541544
processManager.setBackendStatusCallback(backendStatusCallback);

services/satellite/src/services/command-processor.ts

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { ProcessManager } from '../process/manager';
55
import { RuntimeState } from '../process/runtime-state';
66
import { StdioToolDiscoveryManager } from './stdio-tool-discovery-manager';
77
import { UnifiedToolDiscoveryManager } from './unified-tool-discovery-manager';
8+
import { RemoteToolDiscoveryManager } from './remote-tool-discovery-manager';
89
import { MCPServerConfig } from '../process/types';
910
import { maskUrlForLogging } from '../utils/log-masker';
1011
import type { EventBus } from './event-bus';
@@ -27,6 +28,7 @@ export class CommandProcessor {
2728
private runtimeState: RuntimeState | null;
2829
private stdioDiscoveryManager: StdioToolDiscoveryManager | null;
2930
private unifiedToolDiscoveryManager: UnifiedToolDiscoveryManager | null = null;
31+
private remoteToolDiscoveryManager: RemoteToolDiscoveryManager | null = null;
3032
private eventBus: EventBus | null = null;
3133
// eslint-disable-next-line @typescript-eslint/no-explicit-any
3234
private tokenIntrospectionService: any | null = null;
@@ -66,6 +68,13 @@ export class CommandProcessor {
6668
this.unifiedToolDiscoveryManager = manager;
6769
}
6870

71+
/**
 * Inject the RemoteToolDiscoveryManager instance.
 *
 * The stored manager is what the mcp_recovery handling relies on to
 * re-discover tools; until this is called the field stays `null`.
 *
 * @param manager - The remote tool discovery manager to store on this processor.
 */
setRemoteToolDiscoveryManager(manager: RemoteToolDiscoveryManager): void {
  this.remoteToolDiscoveryManager = manager;
}
77+
6978
/**
7079
* Set event bus for status event emission
7180
*/
@@ -260,6 +269,11 @@ export class CommandProcessor {
260269
return await this.handleUpdateToolStatus(command);
261270
}
262271

272+
// Check if this is an mcp_recovery event from backend health check
273+
if (payload.event === 'mcp_recovery') {
274+
return await this.handleMcpRecovery(command);
275+
}
276+
263277
// Default behavior: trigger configuration refresh
264278
this.logger.info({
265279
operation: 'command_configure',
@@ -406,6 +420,191 @@ export class CommandProcessor {
406420
}
407421
}
408422

423+
/**
 * Handle an `mcp_recovery` event by re-running tool discovery for the
 * HTTP/SSE server identified by the payload's `installation_id`.
 *
 * Called when the backend detects, via health check, that an HTTP MCP server
 * has recovered.
 *
 * Outcomes:
 * - `failed`    - `installation_id` is missing, no RemoteToolDiscoveryManager
 *                 has been set, or re-discovery throws.
 * - `completed` - no server with that installation id exists in the current
 *                 configuration, the server uses the stdio transport (nothing
 *                 to do here), or re-discovery succeeded.
 *
 * Status changes are emitted in the order: connecting -> discovering_tools ->
 * online, or the status derived from the error when discovery fails.
 *
 * @param command - The satellite command carrying the recovery payload.
 * @returns The command result describing what happened.
 */
private async handleMcpRecovery(command: SatelliteCommand): Promise<CommandResult> {
  const payload = command.payload;
  const installationId = payload.installation_id;
  const teamId = payload.team_id;

  this.logger.info({
    operation: 'mcp_recovery_received',
    command_id: command.id,
    installation_id: installationId,
    team_id: teamId
  }, `Processing MCP recovery command for installation ${installationId}`);

  // Guard: the payload must identify an installation.
  if (!installationId) {
    const missingIdMsg = 'Missing installation_id in mcp_recovery payload';
    this.logger.error({
      operation: 'mcp_recovery_validation_failed',
      command_id: command.id
    }, missingIdMsg);

    return {
      command_id: command.id,
      status: 'failed',
      error: missingIdMsg
    };
  }

  // Locate the first configured server that belongs to this installation.
  const servers = this.configManager.getCurrentConfiguration().servers;
  const match = Object.entries(servers).find(
    ([, candidate]) => candidate.installation_id === installationId
  );
  const serverName = match ? match[0] : null;
  const serverConfig = match ? match[1] : null;

  // Guard: an unknown installation is not an error for this satellite.
  if (!serverName || !serverConfig) {
    this.logger.warn({
      operation: 'mcp_recovery_server_not_found',
      command_id: command.id,
      installation_id: installationId
    }, `Server config not found for installation ${installationId} - may not be deployed to this satellite`);

    return {
      command_id: command.id,
      status: 'completed',
      result: {
        message: 'Server not found on this satellite',
        installation_id: installationId
      }
    };
  }

  // Guard: stdio servers are excluded - they're handled via process lifecycle.
  if (serverConfig.transport_type === 'stdio') {
    this.logger.debug({
      operation: 'mcp_recovery_skipped_stdio',
      command_id: command.id,
      installation_id: installationId,
      server_name: serverName
    }, 'Skipping recovery for stdio server - handled via process lifecycle');

    return {
      command_id: command.id,
      status: 'completed',
      result: {
        message: 'stdio servers do not require recovery re-discovery',
        installation_id: installationId,
        server_name: serverName
      }
    };
  }

  // Guard: re-discovery needs the remote discovery manager to be wired up.
  if (!this.remoteToolDiscoveryManager) {
    const noManagerMsg = 'RemoteToolDiscoveryManager not available for recovery handling';
    this.logger.error({
      operation: 'mcp_recovery_no_manager',
      command_id: command.id
    }, noManagerMsg);

    return {
      command_id: command.id,
      status: 'failed',
      error: noManagerMsg
    };
  }

  // Prefer the team id from the payload, then the server config, then a placeholder.
  const validatedTeamId = teamId || serverConfig.team_id || 'unknown';

  // Announce that the satellite is reconnecting.
  this.emitStatusChange(
    installationId,
    validatedTeamId,
    serverConfig.user_id || 'unknown',
    'connecting',
    'Server recovered, satellite initiating tool re-discovery'
  );

  try {
    this.emitStatusChange(
      installationId,
      validatedTeamId,
      serverConfig.user_id || 'unknown',
      'discovering_tools',
      'Re-discovering tools after server recovery'
    );

    // Time the re-discovery so the duration can be logged and returned.
    const startedAt = Date.now();
    const tools = await this.remoteToolDiscoveryManager.discoverServerTools(serverName);
    const discoveryTimeMs = Date.now() - startedAt;
    const toolCount = tools.length;

    this.emitStatusChange(
      installationId,
      validatedTeamId,
      serverConfig.user_id || 'unknown',
      'online',
      `Server recovered with ${toolCount} tools`
    );

    this.logger.info({
      operation: 'mcp_recovery_success',
      command_id: command.id,
      installation_id: installationId,
      server_name: serverName,
      tools_discovered: toolCount,
      discovery_time_ms: discoveryTimeMs
    }, `MCP recovery successful: ${serverName} with ${toolCount} tools (${discoveryTimeMs}ms)`);

    return {
      command_id: command.id,
      status: 'completed',
      result: {
        installation_id: installationId,
        server_name: serverName,
        tools_discovered: toolCount,
        discovery_time_ms: discoveryTimeMs
      }
    };
  } catch (err) {
    const errorMessage = err instanceof Error ? err.message : String(err);

    // Map the failure text to the status/message pair reported to the backend.
    const failure = RemoteToolDiscoveryManager.getStatusFromError(errorMessage);
    const resultingStatus = failure.status;

    this.emitStatusChange(
      installationId,
      validatedTeamId,
      serverConfig.user_id || 'unknown',
      resultingStatus,
      failure.message
    );

    this.logger.error({
      operation: 'mcp_recovery_failed',
      command_id: command.id,
      installation_id: installationId,
      server_name: serverName,
      error: errorMessage,
      resulting_status: resultingStatus
    }, `MCP recovery failed for ${serverName}: ${errorMessage}`);

    return {
      command_id: command.id,
      status: 'failed',
      error: errorMessage,
      result: {
        installation_id: installationId,
        server_name: serverName,
        resulting_status: resultingStatus
      }
    };
  }
}
607+
409608
/**
410609
* Handle spawn command - dispatches to HTTP or stdio handler based on transport_type
411610
*/

0 commit comments

Comments
 (0)