refactor: Use state machine for Windows daemon watchdog

This commit is contained in:
Nick Bolton
2025-03-03 11:03:21 +00:00
parent 4db77ba2f7
commit 44f72a29e9
2 changed files with 116 additions and 99 deletions

View File

@ -24,7 +24,8 @@
#include <Windows.h>
#define CURRENT_PROCESS_ID 0
#define MAXIMUM_WAIT_TIME 3
const auto kStartDelaySeconds = 1;
namespace {
std::string trimDesktopName(const std::string &nameFromTraces)
@ -56,30 +57,14 @@ const char g_activeDesktop[] = {"activeDesktop:"};
MSWindowsWatchdog::MSWindowsWatchdog(bool autoDetectCommand, bool foreground)
: m_thread(NULL),
m_autoDetectCommand(autoDetectCommand),
m_monitoring(true),
m_commandChanged(false),
m_outputWritePipe(nullptr),
m_outputReadPipe(nullptr),
m_elevateProcess(false),
m_processFailures(0),
m_processStarted(false),
m_startFailures(0),
m_fileLogOutputter(NULL),
m_ready(false),
m_foreground(foreground)
{
m_mutex = ARCH->newMutex();
m_condVar = ARCH->newCondVar();
}
MSWindowsWatchdog::~MSWindowsWatchdog()
{
if (m_condVar != NULL) {
ARCH->closeCondVar(m_condVar);
}
if (m_mutex != NULL) {
ARCH->closeMutex(m_mutex);
}
}
void MSWindowsWatchdog::startAsync()
@ -91,7 +76,7 @@ void MSWindowsWatchdog::startAsync()
void MSWindowsWatchdog::stop()
{
m_monitoring = false;
m_running = false;
if (!m_thread->wait(5)) {
LOG((CLOG_WARN "could not stop main thread"));
@ -194,60 +179,74 @@ void MSWindowsWatchdog::mainLoop(void *)
// Set the pipe to non-blocking mode, which allows us to stop the output reader thread immediately
// in order to speed up the shutdown process when the Windows service needs to stop.
DWORD mode = PIPE_NOWAIT;
if (!SetNamedPipeHandleState(m_outputReadPipe, &mode, nullptr, nullptr)) {
if (DWORD mode = PIPE_NOWAIT; !SetNamedPipeHandleState(m_outputReadPipe, &mode, nullptr, nullptr)) {
LOG_ERR("could not set pipe to non-blocking mode");
throw XArch(new XArchEvalWindows());
}
while (m_monitoring) {
try {
while (m_running) {
if (!m_command.empty() && !m_foreground && m_session.hasChanged()) {
LOG_DEBUG("session changed, queueing process start");
m_processState = ProcessState::StartPending;
m_nextStartTime.reset();
}
if (m_processStarted && getCommand().empty()) {
LOG((CLOG_DEBUG "process started but command is empty, shutting down"));
shutdownExistingProcesses();
continue;
switch (m_processState) {
using enum ProcessState;
case Idle:
LOG_DEBUG3("watchdog: process idle");
break;
case StartScheduled: {
LOG_DEBUG3("watchdog: process start scheduled");
if (m_nextStartTime.has_value() && m_nextStartTime.value() <= ARCH->time()) {
LOG_DEBUG("start time reached, queueing process start");
m_processState = StartPending;
}
} break;
if (m_processFailures > 1) {
// increasing backoff period, maximum of 10 seconds.
// only start sleeping at 1 second to avoid unnecessary delay when the process stops
// for the first failure (i.e. when the process just stopped running).
int timeout = m_processFailures < 10 ? m_processFailures : 10;
LOG_WARN("backing off after failure, wait=%ds, failures=%d", timeout, m_processFailures);
ARCH->sleep(timeout);
case StartPending: {
LOG_INFO("daemon starting new process");
try {
startProcess();
m_startFailures = 0;
m_processState = Running;
} catch (std::exception &e) { // NOSONAR - Catching all exceptions
handleStartError(e.what());
m_processState = StartPending;
} catch (...) { // NOSONAR - Catching remaining exceptions
handleStartError();
m_processState = StartPending;
}
} break;
if (!getCommand().empty()) {
bool startNeeded = false;
if (m_processFailures != 0) {
startNeeded = true;
} else if (!m_foreground && m_session.hasChanged()) {
startNeeded = true;
} else if (m_commandChanged) {
startNeeded = true;
}
if (startNeeded) {
startProcess();
}
} else {
// prevent backoff when no command is set.
m_processFailures = 0;
}
if (m_processStarted && !isProcessRunning()) {
m_processFailures++;
m_processStarted = false;
case Running: {
LOG_DEBUG3("watchdog: process running");
if (!isProcessRunning()) {
LOG((CLOG_WARN "detected application not running, pid=%d", m_process->info().dwProcessId));
m_processState = StartPending;
}
} break;
case StopPending: {
LOG_INFO("stopping running process");
if (m_process != nullptr) {
m_process->shutdown();
m_process.reset();
} else {
LOG_WARN("no process to stop");
}
shutdownExistingProcesses();
m_processState = Idle;
} break;
}
// TODO: This seems like a hack, why would we need to send the SAS function every loop iteration?
// This slows down both the process relaunch speed and the watchdog thread loop shut down time.
if (sendSasFunc != NULL) {
HANDLE sendSasEvent = CreateEvent(NULL, FALSE, FALSE, "Global\\SendSAS");
if (sendSasEvent != NULL) {
// use SendSAS event to wait for next session (timeout 1 second).
if (WaitForSingleObject(sendSasEvent, 1000) == WAIT_OBJECT_0) {
LOG_DEBUG("calling SendSAS from sas.dll");
@ -256,32 +255,23 @@ void MSWindowsWatchdog::mainLoop(void *)
CloseHandle(sendSasEvent);
} else {
LOG((CLOG_ERR "could not create SendSAS event"));
XArchEvalWindows e;
LOG_ERR("could not create SendSAS event");
}
// Sleep for only 100ms rather than 1 second so that the service can shut down faster.
ARCH->sleep(0.1);
} catch (std::exception &e) {
LOG((CLOG_CRIT "failed to launch, error: %s", e.what()));
m_processFailures++;
m_processStarted = false;
continue;
} catch (...) {
LOG((CLOG_CRIT "failed to launch, unknown error."));
m_processFailures++;
m_processStarted = false;
continue;
}
// Sleep for only 100ms rather than 1 second so that the service can shut down faster.
ARCH->sleep(0.1);
}
if (m_process != nullptr) {
LOG((CLOG_DEBUG "terminated running process on exit"));
m_process->shutdown();
m_process.reset();
m_processStarted = false;
}
shutdownExistingProcesses();
LOG((CLOG_DEBUG "watchdog main loop finished"));
}
@ -303,19 +293,14 @@ void MSWindowsWatchdog::setFileLogOutputter(FileLogOutputter *outputter)
void MSWindowsWatchdog::startProcess()
{
LOG_INFO("daemon starting new process");
if (m_command.empty()) {
throw XMSWindowsWatchdogError("cannot start process, command is empty");
}
m_commandChanged = false;
if (m_process != nullptr) {
LOG((CLOG_DEBUG "closing existing process to make way for new one"));
m_process->shutdown();
m_process.reset();
m_processStarted = false;
}
m_process = std::make_unique<deskflow::platform::MSWindowsProcess>(m_command, m_outputWritePipe, m_outputWritePipe);
@ -361,9 +346,6 @@ void MSWindowsWatchdog::startProcess()
throw XMSWindowsWatchdogError("process immediately stopped");
}
m_processStarted = true;
m_processFailures = 0;
LOG((CLOG_DEBUG "started core process from daemon"));
LOG(
(CLOG_DEBUG2 "process info, session=%i, elevated: %s, command=%s", m_session.getActiveSessionId(),
@ -372,13 +354,20 @@ void MSWindowsWatchdog::startProcess()
}
}
void MSWindowsWatchdog::setCommand(const std::string &command, bool elevate)
void MSWindowsWatchdog::setProcessConfig(const std::string &command, bool elevate)
{
LOG_DEBUG("command parameters updated");
LOG_DEBUG("watchdog process config updated");
m_command = command;
m_elevateProcess = elevate;
m_commandChanged = true;
m_processFailures = 0;
if (m_command.empty()) {
LOG_DEBUG("command cleared, queueing process stop");
m_processState = ProcessState::StopPending;
} else {
LOG_DEBUG("command changed, queueing process start");
m_processState = ProcessState::StartPending;
m_nextStartTime.reset();
}
}
std::string MSWindowsWatchdog::getCommand() const
@ -411,7 +400,7 @@ void MSWindowsWatchdog::outputLoop(void *)
// +1 char for \0
CHAR buffer[kOutputBufferSize + 1];
while (m_monitoring) {
while (m_running) {
DWORD bytesRead;
BOOL success = ReadFile(m_outputReadPipe, buffer, kOutputBufferSize, &bytesRead, NULL);
@ -482,7 +471,7 @@ HANDLE openProcessForKill(PROCESSENTRY32 entry)
void MSWindowsWatchdog::shutdownExistingProcesses()
{
LOG_DEBUG("shutting down existing processes");
LOG_DEBUG("shutting down any existing processes");
// first we need to take a snapshot of the running processes
HANDLE snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, CURRENT_PROCESS_ID);
@ -571,3 +560,24 @@ std::string MSWindowsWatchdog::runActiveDesktopUtility()
output.erase(output.find_last_not_of("\r\n") + 1);
return output;
}
void MSWindowsWatchdog::handleStartError(const std::string_view &message)
{
m_startFailures++;
if (!message.empty()) {
LOG_CRIT("failed to launch, error: %s", message.data());
} else {
LOG_CRIT("failed to launch, unknown error");
}
// When there has been more than one consecutive failure, slow down the retry rate.
if (m_startFailures > 1) {
m_nextStartTime = ARCH->time() + kStartDelaySeconds;
LOG_WARN("start failed %d times, delaying start", m_startFailures);
LOG_DEBUG("start delay, seconds=%d, time=%f", kStartDelaySeconds, m_nextStartTime.value());
} else {
LOG_INFO("retrying process start immediately");
m_nextStartTime.reset();
}
}

View File

@ -6,15 +6,14 @@
#pragma once
#include "arch/IArchMultithread.h"
#include "deskflow/XDeskflow.h"
#include "platform/MSWindowsProcess.h"
#include "platform/MSWindowsSession.h"
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <memory>
#include <optional>
#include <string>
class Thread;
@ -22,9 +21,18 @@ class FileLogOutputter;
class MSWindowsWatchdog
{
enum class ProcessState
{
Idle,
StartScheduled,
StartPending,
StopPending,
Running
};
public:
MSWindowsWatchdog(bool autoDetectCommand, bool foreground);
virtual ~MSWindowsWatchdog();
~MSWindowsWatchdog() = default;
void startAsync();
std::string getCommand() const;
@ -42,6 +50,7 @@ private:
void startProcess();
void sendSas();
void setStartupInfo(STARTUPINFO &si);
void handleStartError(const std::string_view &message = "");
/**
* @brief Re-run the process to get the active desktop name.
@ -56,23 +65,21 @@ private:
private:
Thread *m_thread;
bool m_autoDetectCommand;
std::string m_command;
bool m_monitoring;
bool m_commandChanged;
bool m_running;
HANDLE m_outputWritePipe;
HANDLE m_outputReadPipe;
Thread *m_outputThread;
bool m_elevateProcess;
MSWindowsSession m_session;
int m_processFailures;
bool m_processStarted;
int m_startFailures;
FileLogOutputter *m_fileLogOutputter;
ArchMutex m_mutex;
ArchCond m_condVar;
bool m_ready;
bool m_foreground;
std::string m_activeDesktop;
std::unique_ptr<deskflow::platform::MSWindowsProcess> m_process;
std::optional<double> m_nextStartTime = std::nullopt;
ProcessState m_processState = ProcessState::Idle;
std::string m_command = "";
};
//! Relauncher error