Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES/7315.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed bug where API process would miss heartbeats during long uploads.
47 changes: 45 additions & 2 deletions pulpcore/app/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from logging import getLogger
import os
import sys
import threading
import time

import click
import django
Expand All @@ -22,9 +24,40 @@


class PulpApiWorker(SyncWorker):
def __init__(self, *args, **kwargs):
    """Initialise the worker and reserve a slot for the heartbeat thread.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super().__init__(*args, **kwargs)
    # No thread exists yet at construction time; this attribute is the
    # placeholder that holds the heartbeat thread once one is created.
    self.heartbeat_thread = None

def _heartbeat_loop(self):
    """Run the heartbeat in a loop and exit the process if it fails.

    While ``self.alive`` is truthy, call ``self.heartbeat()`` and then
    sleep for ``self.heartbeat_interval`` seconds. Any exception raised by
    the heartbeat, or by the loop itself, is logged together with its
    traceback, and the whole process is terminated through ``os._exit``
    with ``Arbiter.WORKER_BOOT_ERROR`` as the exit status.

    ``os._exit`` is used instead of ``exit``/``sys.exit`` because this
    method is meant to run in a secondary thread, where raising
    ``SystemExit`` would only end that thread and leave the process alive.
    """
    try:
        while self.alive:
            try:
                self.heartbeat()
            except Exception as e:
                # Any unhandled exception in heartbeat should restart the
                # worker. logger.exception keeps the traceback, which would
                # otherwise be lost once the process is killed below.
                logger.exception(
                    f"Heartbeat thread encountered fatal error for worker {self.name}: {e}"
                )
                # Exit with error code to trigger worker restart.
                os._exit(Arbiter.WORKER_BOOT_ERROR)

            time.sleep(self.heartbeat_interval)
    except Exception as e:
        # Safety net: if even the loop itself fails (e.g. the sleep or the
        # liveness check raises), log with traceback and restart.
        logger.exception(f"Heartbeat thread loop failed for worker {self.name}: {e}")
        os._exit(Arbiter.WORKER_BOOT_ERROR)

def notify(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first I thought this was dead code. Now I think we have the opportunity to check in on the thread here.

super().notify()
self.heartbeat()

# Check if heartbeat thread is still alive
if not self.heartbeat_thread.is_alive():
logger.error(
f"Heartbeat thread died unexpectedly for worker {self.name}. "
"Exiting to trigger restart."
)
os._exit(Arbiter.WORKER_BOOT_ERROR)

def heartbeat(self):
try:
Expand All @@ -37,7 +70,8 @@ def heartbeat(self):
logger.debug(self.beat_msg)
except (InterfaceError, DatabaseError) as e:
logger.error(f"{self.fail_beat_msg} Exception: {str(e)}")
exit(Arbiter.WORKER_BOOT_ERROR)
# This will be caught by _heartbeat_loop and trigger os._exit
raise

def init_process(self):
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "pulpcore.app.settings")
Expand Down Expand Up @@ -76,6 +110,15 @@ def init_process(self):
logger.error(f"An API app with name {self.name} already exists in the database.")
exit(Arbiter.WORKER_BOOT_ERROR)

# Store heartbeat interval for the heartbeat thread
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Store heartbeat interval for the heartbeat thread
# Store the value of one third of the desired API application's time to live to the heartbeat interval on the current instance to be accessible by the heartbeat thread.

:mild sarcasm:

self.heartbeat_interval = settings.API_APP_TTL // 3

# Start heartbeat thread before entering the main loop
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now why don't we then put all of that stuff right before entering the main loop?

self.heartbeat_thread = threading.Thread(
target=self._heartbeat_loop, daemon=True, name=f"heartbeat-{self.name}"
)
self.heartbeat_thread.start()

super().init_process()

def run(self):
Expand Down
Loading