diff --git a/examples/example.c b/examples/example.c index fc3e10ea53..d6f22885b3 100644 --- a/examples/example.c +++ b/examples/example.c @@ -612,7 +612,28 @@ run_threads(thread_func_t func) } #endif -#if defined(SENTRY_PLATFORM_MACOS) +#if defined(SENTRY_PLATFORM_WINDOWS) +static unsigned __stdcall +app_hang_demo_thread(void *arg) +{ + (void)arg; + /* The first heartbeat latches this thread as the monitored target. Beat for + * 500 ms so the daemon sees a healthy baseline before the freeze. */ + for (int i = 0; i < 10; i++) { + sentry_app_hang_heartbeat(); + Sleep(50); + } + /* Add a couple of breadcrumbs before freezing so the captured app-hang + * event carries them (the daemon reads the breadcrumb ring files the host + * writes on each sentry_add_breadcrumb). */ + sentry_add_breadcrumb( + sentry_value_new_breadcrumb(NULL, "app-hang demo: about to freeze")); + sentry_add_breadcrumb(create_debug_crumb("app-hang demo breadcrumb")); + /* Freeze for 3x the configured timeout (3000 ms). */ + Sleep(3000); + return 0; +} +#elif defined(SENTRY_PLATFORM_MACOS) static void * app_hang_demo_thread(void *arg) { @@ -883,7 +904,7 @@ main(int argc, char **argv) options, SENTRY_CRASH_UPLOAD_MODE_ASYNC); } -#if defined(SENTRY_PLATFORM_MACOS) +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) if (has_arg(argc, argv, "app-hang")) { sentry_options_set_app_hang_enabled(options, 1); sentry_options_set_app_hang_timeout_ms(options, 1000); @@ -901,7 +922,7 @@ main(int argc, char **argv) return EXIT_FAILURE; } -#if defined(SENTRY_PLATFORM_MACOS) +#if defined(SENTRY_PLATFORM_WINDOWS) || defined(SENTRY_PLATFORM_MACOS) /* app-hang: spawn the demo thread BEFORE any other post-init work so it * begins heartbeating immediately. The thread freezes for 3x the timeout, * giving the daemon time to detect the hang and ship the envelope. We wait @@ -909,10 +930,19 @@ main(int argc, char **argv) * NOTE: this mode is intentionally exclusive – do not combine with crash/ * abort/etc. since those would terminate the process first. */ if (has_arg(argc, argv, "app-hang")) { +# if defined(SENTRY_PLATFORM_WINDOWS) + HANDLE t = (HANDLE)_beginthreadex( + NULL, 0, app_hang_demo_thread, NULL, 0, NULL); + if (t) { + WaitForSingleObject(t, INFINITE); + CloseHandle(t); + } +# else pthread_t t; if (0 == pthread_create(&t, NULL, app_hang_demo_thread, NULL)) { pthread_join(t, NULL); } +# endif sentry_close(); return EXIT_SUCCESS; } diff --git a/include/sentry.h b/include/sentry.h index d4b9c5a40e..2104353ada 100644 --- a/include/sentry.h +++ b/include/sentry.h @@ -1709,8 +1709,8 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_session_replay_duration( * The host process keeps running. * * Off by default. This setting only has an effect when using the `native` - * backend. In this initial release the feature is macOS-only; the call is a - * silent no-op on other platforms. + * backend. The feature is supported on macOS and Windows; the call is a silent + * no-op on other platforms. */ SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_enabled( sentry_options_t *opts, int enabled); @@ -1737,7 +1737,7 @@ SENTRY_EXPERIMENTAL_API void sentry_options_set_app_hang_timeout_ms( * No-op if * - app-hang detection is not enabled * - the native backend is not active - * - the platform is not macOS + * - the platform is neither macOS nor Windows */ SENTRY_EXPERIMENTAL_API void sentry_app_hang_heartbeat(void); diff --git a/src/backends/native/sentry_crash_context.h b/src/backends/native/sentry_crash_context.h index 9968afcbb3..f711ea9c27 100644 --- a/src/backends/native/sentry_crash_context.h +++ b/src/backends/native/sentry_crash_context.h @@ -327,15 +327,19 @@ typedef struct { uint32_t module_count; sentry_module_info_t modules[SENTRY_CRASH_MAX_MODULES]; - /* App-hang detection. + /* App-hang detection (Windows + macOS, native backend only). * * Sync model: * - app_hang_enabled, app_hang_timeout_ms: written by host before daemon * is signalled ready; read by daemon at startup. No further mutation. - * - app_hang_target_tid: latched once by host on first heartbeat. - * Daemon reads, never writes. - * - app_hang_last_heartbeat_ms: written on every heartbeat. - */ + * - app_hang_target_tid: latched once by host on first heartbeat via a + * compare-exchange (InterlockedCompareExchange64 on Windows, + * atomic_compare_exchange_strong on macOS). Daemon reads, never writes. + * - app_hang_last_heartbeat_ms: written on every heartbeat with a relaxed + * 64-bit store. Daemon reads with a relaxed load. Torn reads are not a + * correctness issue — the daemon compares against its remembered value + * from the previous tick. (On 64-bit Windows/macOS the aligned store is + * atomic; the tear note applies to 32-bit Windows.) */ bool app_hang_enabled; uint64_t app_hang_timeout_ms; volatile uint64_t app_hang_target_tid; diff --git a/src/backends/native/sentry_crash_daemon.c b/src/backends/native/sentry_crash_daemon.c index 9d56de75f4..cea9d07c1b 100644 --- a/src/backends/native/sentry_crash_daemon.c +++ b/src/backends/native/sentry_crash_daemon.c @@ -2937,7 +2937,159 @@ write_envelope_with_native_stacktrace(const sentry_options_t *options, return true; } -#if defined(SENTRY_PLATFORM_MACOS) +#if defined(SENTRY_PLATFORM_WINDOWS) +/** + * App-hang capture path (Windows). Suspends the latched target thread just long + * enough to snapshot its CONTEXT, then builds and submits an envelope using the + * same native-stacktrace path as crashes (with an AppHang event kind). + */ +static void +capture_and_send_app_hang(const sentry_options_t *options, + sentry_crash_ipc_t *ipc, uint64_t freeze_ms) +{ + /* NOTE (race, experimental Windows-only first cut): This function reads + * and mutates shmem fields (platform.context, threads[0], crashed_tid, + * num_threads) that are also written by the host's signal handler on a + * real crash. The daemon's main loop is single-threaded and the crash + * event has wait-priority 0, so we will not enter this function with a + * pending crash notification already signalled. The remaining narrow + * window is: the host crashes WHILE this function is running, the host's + * signal handler writes to shmem mid-capture, and we then send a + * partially-overwritten event. We accept this risk for the initial + * Windows-only implementation; mitigation (state check at entry / pause + * via an additional shmem flag) is tracked as follow-up work. */ + sentry_crash_context_t *ctx = ipc->shmem; + + /* Populate modules once per session if not already done. */ + if (ctx->module_count == 0) { + capture_modules_from_process(ctx); + } + + DWORD target_tid = (DWORD)ctx->app_hang_target_tid; + + /* Suspend the target thread and capture its CONTEXT. */ + HANDLE hThread = OpenThread(THREAD_GET_CONTEXT | THREAD_SUSPEND_RESUME + | THREAD_QUERY_INFORMATION, + FALSE, target_tid); + if (!hThread) { + SENTRY_DEBUGF("app-hang: OpenThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + return; + } + + DWORD suspend_count = SuspendThread(hThread); + if (suspend_count == (DWORD)-1) { + SENTRY_DEBUGF("app-hang: SuspendThread(%lu) failed: %lu", + (unsigned long)target_tid, GetLastError()); + CloseHandle(hThread); + return; + } + + CONTEXT thread_ctx; + memset(&thread_ctx, 0, sizeof(thread_ctx)); + thread_ctx.ContextFlags = CONTEXT_FULL; + if (!GetThreadContext(hThread, &thread_ctx)) { + SENTRY_DEBUGF( + "app-hang: GetThreadContext failed: %lu", GetLastError()); + ResumeThread(hThread); + CloseHandle(hThread); + return; + } + + /* Resume immediately; we have the snapshot we need. */ + ResumeThread(hThread); + CloseHandle(hThread); + + /* Place the snapshot in the "crashed thread" slot of the context so the + * existing event builder pulls a stacktrace out for the exception + * payload and the threads block. + * + * IMPORTANT: build_stacktrace_from_ctx() calls build_stacktrace_for_thread + * with thread_idx == SIZE_MAX, which on Windows reads from + * ctx->platform.context (NOT threads[0].context). We must populate both + * so the exception stacktrace uses the captured CONTEXT instead of an + * all-zero one (PC=0 -> StackWalk64 produces no frames). */ + ctx->platform.context = thread_ctx; + ctx->crashed_tid = target_tid; + ctx->platform.num_threads = 1; + ctx->platform.threads[0].thread_id = target_tid; + ctx->platform.threads[0].context = thread_ctx; + ctx->platform.threads[0].name[0] = '\0'; + + /* Build the per-event description with the freeze duration. `freeze_ms` is + * the time since the last heartbeat at detection, which is necessarily at + * least the configured timeout — hence "at least". */ + char value_buf[128]; + snprintf(value_buf, sizeof(value_buf), "App hung for at least %llu ms.", + (unsigned long long)freeze_ms); + + /* Build an envelope path next to the crash one. */ + char envelope_path[SENTRY_CRASH_MAX_PATH]; + int path_len = snprintf(envelope_path, sizeof(envelope_path), + "%s/sentry-app-hang-%lu-%llu.env", ctx->database_path, + (unsigned long)ctx->crashed_pid, + (unsigned long long)ctx->app_hang_last_heartbeat_ms); + + if (path_len < 0 || path_len >= (int)sizeof(envelope_path)) { + SENTRY_WARN("app-hang: envelope path truncated or invalid"); + return; + } + + /* Reuse the scope file the host keeps up-to-date via flush_scope so the + * app-hang event carries the same scope context as a crash event. The + * base event JSON is at ctx->event_path; the sibling run folder holds + * the attachments manifest, scope attachments, screenshot, and + * session replay — all pulled in by write_envelope_with_native_stacktrace + * when run_folder is non-NULL. */ + const char *event_file_path + = ctx->event_path[0] ? ctx->event_path : NULL; + sentry_path_t *run_folder = NULL; + if (event_file_path) { + sentry_path_t *ev_path = sentry__path_from_str(event_file_path); + if (ev_path) { + run_folder = sentry__path_dir(ev_path); + sentry__path_free(ev_path); + } + } + + /* App-hang event: overriding the exception type, handled, error level. + * The per-event value carries the freeze duration computed above. */ + sentry_value_t event = build_native_event(ctx, event_file_path, run_folder, + /*exception_type=*/"AppHang", + /*exception_value=*/value_buf, /*level=*/"error", + /*mechanism_type=*/"AppHang", /*handled=*/true); + + /* Surface the freeze duration as the event message too, so the issue + * title/summary reads "App hung for at least X ms." rather than the + * exception type alone. */ + sentry_value_set_by_key( + event, "message", sentry_value_new_string(value_buf)); + + bool ok = write_envelope_with_native_stacktrace( + options, envelope_path, ctx, event, /*minidump_path=*/NULL, run_folder); + + if (run_folder) { + sentry__path_free(run_folder); + } + + if (!ok) { + SENTRY_WARN("app-hang: failed to write envelope"); + return; + } + + /* Read envelope from disk and hand to transport. */ + sentry_path_t *env_path = sentry__path_from_str(envelope_path); + if (env_path) { + sentry_envelope_t *envelope = sentry__envelope_from_path(env_path); + if (envelope && options && options->transport) { + sentry__capture_envelope(options->transport, envelope, options); + } + sentry__path_remove(env_path); + sentry__path_free(env_path); + } +} + +#elif defined(SENTRY_PLATFORM_MACOS) /* Read `size` bytes at `addr` from another task into `buf`. Mirrors the * minidump writer's read_task_memory (mach_vm_read_overwrite). */ @@ -3421,7 +3573,7 @@ capture_and_send_app_hang(const sentry_options_t *options, sentry__path_free(env_path); } } -#endif /* SENTRY_PLATFORM_MACOS */ +#endif /* SENTRY_PLATFORM_WINDOWS / SENTRY_PLATFORM_MACOS */ /** * Manually write a Sentry envelope with event, minidump, and attachments. @@ -4392,16 +4544,105 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, #if defined(SENTRY_APP_HANG_HOST_SUPPORTED) /* Pre-populate crashed_pid so the app-hang path can reach the host - * out-of-process via task_for_pid. ctx->crashed_pid is otherwise only set - * by the host's crash handler; the crash handler re-sets it from the host - * context on a real crash — a no-op (same value). */ + * out-of-process (OpenProcess on Windows, task_for_pid on macOS). On + * Windows this also feeds capture_modules_from_process and + * walk_stack_with_dbghelp. ctx->crashed_pid is otherwise only set by the + * host's crash handler; the crash handler re-sets it from the host context + * on a real crash — a no-op (same value). */ ipc->shmem->crashed_pid = (pid_t)app_pid; #endif // Daemon main loop bool crash_processed = false; -#if defined(SENTRY_PLATFORM_MACOS) +#if defined(SENTRY_PLATFORM_WINDOWS) + /* App-hang detector state. Daemon-local; the daemon caches the timeout + * here so it does not race the host on subsequent shmem mutations. */ + const bool app_hang_enabled = ipc->shmem->app_hang_enabled; + const uint64_t app_hang_timeout_ms = ipc->shmem->app_hang_timeout_ms; + uint64_t last_fired_hb = 0; + + HANDLE timer = NULL; + if (app_hang_enabled) { + timer = CreateWaitableTimer(NULL, FALSE, NULL); + if (!timer) { + SENTRY_WARNF("app-hang: CreateWaitableTimer failed: %lu", + GetLastError()); + } else { + /* Negative dueTime: relative; 100ns units; -5_000_000 = 500 ms. + * Period 500 ms. */ + LARGE_INTEGER due_time; + due_time.QuadPart = -5000000LL; + if (!SetWaitableTimer( + timer, &due_time, 500, NULL, NULL, FALSE)) { + SENTRY_WARNF("app-hang: SetWaitableTimer failed: %lu", + GetLastError()); + CloseHandle(timer); + timer = NULL; + } + } + } + + /* Wait set: index 0 = crash event, index 1 = timer (optional). */ + HANDLE wait_handles[2]; + DWORD wait_count = 1; + wait_handles[0] = ipc->event_handle; + if (timer) { + wait_handles[1] = timer; + wait_count = 2; + } + + while (true) { + DWORD result = WaitForMultipleObjects(wait_count, wait_handles, + FALSE, SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS); + + if (result == WAIT_OBJECT_0) { + /* Crash notification — identical logic to the cross-platform + * path below. */ + SENTRY_DEBUG("Event signaled, checking crash state"); + long state = sentry__atomic_fetch(&ipc->shmem->state); + if (state == SENTRY_CRASH_STATE_CRASHED && !crash_processed) { + SENTRY_DEBUG("Crash notification received, processing"); + sentry__process_crash(options, ipc); + crash_processed = true; + SENTRY_DEBUG("Crash processed, daemon exiting"); + break; + } + SENTRY_DEBUG("Spurious notification or already processed"); + } else if (timer && result == WAIT_OBJECT_0 + 1) { + /* Timer tick — evaluate the app-hang heartbeat. */ + sentry_crash_context_t *shctx = ipc->shmem; + const uint64_t hb = shctx->app_hang_last_heartbeat_ms; + const uint64_t now = sentry__app_hang_now_ms(); + sentry_app_hang_decision_t d = sentry__app_hang_decide( + app_hang_enabled, hb, now, app_hang_timeout_ms, last_fired_hb); + if (d == SENTRY_APP_HANG_FIRE) { + capture_and_send_app_hang(options, ipc, now - hb); + /* Always advance last_fired_hb, even if capture failed — + * prevents a retry storm against a wedged thread. The next + * heartbeat advance re-arms detection naturally. */ + last_fired_hb = hb; + } + } else if (result == WAIT_TIMEOUT) { + /* Fall through to parent-liveness check below. */ + } else { + SENTRY_WARNF("daemon wait failed: %lu err=%lu", result, + GetLastError()); + break; + } + + if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { + SENTRY_DEBUG("Parent process exited without crash"); + break; + } + } + + if (timer) { + CancelWaitableTimer(timer); + CloseHandle(timer); + } +#else +# if defined(SENTRY_PLATFORM_MACOS) /* App-hang detector state. Daemon-local; the timeout is cached here so it * does not race the host on subsequent shmem mutations. When enabled, the * loop polls on a short cadence (so it can evaluate the heartbeat each @@ -4411,9 +4652,9 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, uint64_t last_fired_hb = 0; const int wait_timeout_ms = app_hang_enabled ? 500 : SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; -#else +# else const int wait_timeout_ms = SENTRY_CRASH_DAEMON_WAIT_TIMEOUT_MS; -#endif +# endif #if defined(SENTRY_PLATFORM_UNIX) /* Catch the SIGTERM that sentry_close sends on clean shutdown so the daemon @@ -4467,7 +4708,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, // If crash already processed, just ignore spurious notifications SENTRY_DEBUG("Spurious notification or already processed"); } -#if defined(SENTRY_PLATFORM_MACOS) +# if defined(SENTRY_PLATFORM_MACOS) else if (app_hang_enabled && !crash_processed) { /* No crash notification this wake (timeout or spurious) — evaluate * the app-hang heartbeat. */ @@ -4484,7 +4725,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, last_fired_hb = hb; } } -#endif +# endif // Check if parent is still alive (only if no crash processed yet) if (!crash_processed && !is_parent_alive(ipc->parent_handle)) { @@ -4500,6 +4741,7 @@ sentry__crash_daemon_main(pid_t app_pid, uint64_t app_tid, HANDLE event_handle, } #endif } +#endif SENTRY_DEBUG("Daemon exiting"); diff --git a/src/backends/sentry_backend_native.c b/src/backends/sentry_backend_native.c index 8284a4feca..c6cad015cb 100644 --- a/src/backends/sentry_backend_native.c +++ b/src/backends/sentry_backend_native.c @@ -851,7 +851,8 @@ native_backend_flush_scope( return; } - // Create event with current scope + // Create event with current scope. The daemon also reads this base event + // at app-hang time on Windows and macOS, so keep it current. sentry_value_t event = sentry_value_new_object(); sentry_value_set_by_key( event, "level", sentry__value_new_level(SENTRY_LEVEL_FATAL)); diff --git a/src/sentry_app_hang.c b/src/sentry_app_hang.c index 27162ebfff..f7c3a06634 100644 --- a/src/sentry_app_hang.c +++ b/src/sentry_app_hang.c @@ -12,9 +12,13 @@ #if defined(SENTRY_APP_HANG_HOST_SUPPORTED) # include "sentry_sync.h" -# include -# include -# include +# if defined(SENTRY_PLATFORM_WINDOWS) +# include +# elif defined(SENTRY_PLATFORM_MACOS) +# include +# include +# include +# endif #endif sentry_app_hang_decision_t @@ -89,6 +93,48 @@ sentry__app_hang_set_shmem(sentry_crash_context_t *ctx) sentry__mutex_unlock(&g_app_hang_lock); } +# if defined(SENTRY_PLATFORM_WINDOWS) + +uint64_t +sentry__app_hang_now_ms(void) +{ + ULONGLONG ticks_100ns = 0; + /* QueryUnbiasedInterruptTime is documented signal/SEH/wait-free; the + * same source is read on both sides of the IPC. */ + if (!QueryUnbiasedInterruptTime(&ticks_100ns)) { + return 0; + } + return (uint64_t)(ticks_100ns / 10000ULL); +} + +static void +app_hang_record_heartbeat(sentry_crash_context_t *ctx) +{ + DWORD current_tid = GetCurrentThreadId(); + + /* Self-register on the first heartbeat: CAS the current TID into the latch + * slot iff still unset — the first thread to heartbeat wins and becomes the + * monitored target. CAS (rather than a plain store) prevents a late call + * from a different thread from silently overwriting a prior latch. */ + InterlockedCompareExchange64((LONG64 volatile *)&ctx->app_hang_target_tid, + (LONG64)(uint64_t)current_tid, 0); + + /* Drop the heartbeat unless the latched thread is us, so a stray heartbeat + * from another thread cannot mask a frozen monitored thread. The non-atomic + * read can tear on x86; in that case the compare fails and we drop a + * heartbeat, which the daemon absorbs. */ + uint64_t latched = ctx->app_hang_target_tid; + if (latched == 0 || (DWORD)latched != current_tid) { + return; + } + + /* Relaxed 64-bit store. On x64 this is a single mov. On x86 the value + * may tear, but that is OK — see the comment in sentry_crash_context.h. */ + ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); +} + +# elif defined(SENTRY_PLATFORM_MACOS) + uint64_t sentry__app_hang_now_ms(void) { @@ -132,6 +178,8 @@ app_hang_record_heartbeat(sentry_crash_context_t *ctx) ctx->app_hang_last_heartbeat_ms = sentry__app_hang_now_ms(); } +# endif + void sentry_app_hang_heartbeat(void) { @@ -152,7 +200,7 @@ sentry_app_hang_heartbeat(void) void sentry_app_hang_heartbeat(void) { - /* No-op on non-macOS targets in this initial cut. */ + /* No-op on unsupported targets in this initial cut. */ } #endif diff --git a/src/sentry_app_hang.h b/src/sentry_app_hang.h index b2c95f5266..c14ff05bb0 100644 --- a/src/sentry_app_hang.h +++ b/src/sentry_app_hang.h @@ -7,9 +7,11 @@ #include /* The host-side heartbeat machinery (clock, latch, shmem registration) is - * available on the native backend on macOS. Windows, Linux, and other targets - * fall back to no-op stubs. */ -#if defined(SENTRY_PLATFORM_MACOS) && defined(SENTRY_BACKEND_NATIVE) + * available on the native backend on Windows (non-Xbox) and macOS. Linux and + * other targets fall back to no-op stubs. */ +#if (((defined(SENTRY_PLATFORM_WINDOWS) && !defined(SENTRY_PLATFORM_XBOX)) \ + || defined(SENTRY_PLATFORM_MACOS))) \ + && defined(SENTRY_BACKEND_NATIVE) # define SENTRY_APP_HANG_HOST_SUPPORTED 1 #endif diff --git a/tests/test_integration_native.py b/tests/test_integration_native.py index d0a696389c..f83d1bba04 100644 --- a/tests/test_integration_native.py +++ b/tests/test_integration_native.py @@ -1142,8 +1142,8 @@ def test_native_restart_on_crash(cmake, httpserver): @pytest.mark.skipif( - sys.platform != "darwin", - reason="app-hang detection is implemented on macOS", + sys.platform not in ("win32", "darwin"), + reason="app-hang detection is implemented on Windows and macOS", ) def test_native_app_hang(cmake, httpserver): """App hang detection emits exactly one AppHang event.