Skip to content

Commit c4ed6ca

Browse files
authored
Don't terminate unreachable SSH instances (#3568)
Fixes: #2531
1 parent: 7729127 · commit: c4ed6ca

File tree

2 files changed

+31
-2
lines changed

2 files changed

+31
-2
lines changed

src/dstack/_internal/server/background/tasks/process_instances.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -778,7 +778,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
778778
)
779779
return
780780

781-
if instance.termination_deadline is None:
781+
if not is_ssh_instance(instance) and instance.termination_deadline is None:
782782
instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET
783783

784784
if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
@@ -792,7 +792,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non
792792
switch_instance_status(session, instance, InstanceStatus.TERMINATING)
793793
elif instance.status.is_available():
794794
deadline = instance.termination_deadline
795-
if get_current_datetime() > deadline:
795+
if deadline is not None and get_current_datetime() > deadline:
796796
instance.termination_reason = InstanceTerminationReason.UNREACHABLE
797797
switch_instance_status(session, instance, InstanceStatus.TERMINATING)
798798

src/tests/_internal/server/background/tasks/test_process_instances.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -198,6 +198,7 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy
198198
session=session,
199199
project=project,
200200
status=InstanceStatus.IDLE,
201+
unreachable=False,
201202
)
202203
health_status = "SSH connection fail"
203204
with patch(
@@ -210,11 +211,39 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy
210211

211212
assert instance is not None
212213
assert instance.status == InstanceStatus.IDLE
214+
assert instance.unreachable
213215
assert instance.termination_deadline is not None
214216
assert instance.termination_deadline.replace(
215217
tzinfo=dt.timezone.utc
216218
) > get_current_datetime() + dt.timedelta(minutes=19)
217219

220+
@pytest.mark.asyncio
221+
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
222+
async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance(
223+
self, test_db, session: AsyncSession
224+
):
225+
project = await create_project(session=session)
226+
instance = await create_instance(
227+
session=session,
228+
project=project,
229+
status=InstanceStatus.IDLE,
230+
unreachable=False,
231+
remote_connection_info=get_remote_connection_info(),
232+
)
233+
health_status = "SSH connection fail"
234+
with patch(
235+
"dstack._internal.server.background.tasks.process_instances._check_instance_inner"
236+
) as healthcheck:
237+
healthcheck.return_value = InstanceCheck(reachable=False, message=health_status)
238+
await process_instances()
239+
240+
await session.refresh(instance)
241+
242+
assert instance is not None
243+
assert instance.status == InstanceStatus.IDLE
244+
assert instance.unreachable
245+
assert instance.termination_deadline is None
246+
218247
@pytest.mark.asyncio
219248
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True)
220249
async def test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession):

0 commit comments

Comments
 (0)