retrace: Added retrace watchdog

Retrace watchdog triggers abort() if replay of a single api call will
take more than 30 seconds.

BUG=b:209706870
BUG=b:203709492
TEST=tested on local dut

Change-Id: Ia90849e061e2fd63c92bd7c8f21f209e6038ce83
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/apitrace/+/3321677
Reviewed-by: Po-Hsien Wang <pwang@chromium.org>
Reviewed-by: Ilja Friedel <ihf@chromium.org>
Reviewed-by: David Riley <davidriley@chromium.org>
Commit-Queue: Robert Tarasov <tutankhamen@chromium.org>
Tested-by: Robert Tarasov <tutankhamen@chromium.org>
diff --git a/retrace/retrace_main.cpp b/retrace/retrace_main.cpp
index ab41dd4..bbbff2c 100644
--- a/retrace/retrace_main.cpp
+++ b/retrace/retrace_main.cpp
@@ -27,6 +27,7 @@
 
 
 #include <string.h>
+#include <atomic>
 #include <limits.h> // for CHAR_MAX
 #include <memory> // for unique_ptr
 #include <iostream>
@@ -35,6 +36,7 @@
 #include <fstream>
 #include <regex>
 #include <getopt.h>
+#include <time.h>
 #ifndef _WIN32
 #include <unistd.h> // for isatty()
 #endif
@@ -93,6 +95,7 @@
 bool dumpingState = false;
 bool dumpingSnapshots = false;
 bool dumpPerFrameStats = false;
+bool watchdogEnabled = false;
 
 // Per frame stats
 std::string perFrameStatsFile;
@@ -143,6 +146,83 @@
 static void
 takeSnapshot(unsigned call_no);
 
+/**
+ * Retrace watchdog.
+ *
+ * Retrace watchdog triggers abort() if retrace of a single api call
+ * will take more than {TimeoutInSec} seconds.
+ */
+class RetraceWatchdog
+{
+public:
+    enum {
+        TimeoutInSec = 30,
+        RunnerSleepInMills = 1000,
+    };
+
+    static uint32_t get_time_u32() {
+        return static_cast<uint32_t>(time(NULL));
+    }
+
+    static uint32_t get_duration_u32(uint32_t from, uint32_t to) {
+        if (to <= from) return 0;
+        return to - from;
+    }
+
+private:
+    typedef std::chrono::time_point<std::chrono::system_clock> TimePoint;
+
+    RetraceWatchdog()
+    : last_call_mark(get_time_u32()), want_exit(false) {
+        thread = os::thread(runnerThread, this);
+    }
+
+    std::thread thread;
+    std::atomic<uint64_t> last_call_mark;
+    std::atomic<bool> want_exit;
+
+    static void runnerThread(RetraceWatchdog* rw) {
+        while(!rw->want_exit) {
+            // Once a second check if progress has been made.
+            uint64_t mark = rw->last_call_mark;
+            uint32_t dur = get_duration_u32(
+                    static_cast<uint32_t>(mark&0xFFFFFFFF),
+                    get_time_u32()
+                );
+
+            if (dur >= static_cast<uint32_t>(TimeoutInSec)) {
+                fprintf(
+                    stderr,
+                    "Exception: Retrace watchdog timeout has occured at call %u!\n",
+                    static_cast<uint32_t>(mark >> 32));
+                abort();
+            }
+            os::sleep(RunnerSleepInMills * 1000);
+        }
+    }
+
+public:
+    ~RetraceWatchdog() {
+        want_exit = true;
+        if (thread.joinable()) {
+           thread.join();
+        }
+    }
+    // Assuming we use C++ 11 or newer therefore the following singleton
+    // implementation is thread safe
+    static RetraceWatchdog& Instance() {
+        static RetraceWatchdog inst;
+        return inst;
+    }
+
+	  void CallProcessed(uint32_t call_no) {
+      // Record time and monotonic number/id of last trace call.
+      last_call_mark = (static_cast<uint64_t>(call_no) << 32) | get_time_u32();
+    }
+
+    RetraceWatchdog(RetraceWatchdog const&) = delete;
+    void operator=(RetraceWatchdog const&) = delete;
+};
 
 /**
  * Called when we are about to present.
@@ -500,6 +580,8 @@
 
             bTimeout = isTimeout(os::getTime(), timeout, call);
             retraceCall(call);
+            if (watchdogEnabled)
+              RetraceWatchdog::Instance().CallProcessed(call->no);
             if (!call->reuse_call)
                 delete call;
             call = bTimeout ? NULL : parser->parse_call();
@@ -678,6 +760,8 @@
         while (!bTimeout && (call = parser->parse_call())) {
             bTimeout = isTimeout(os::getTime(), timeoutTime, call);
             retraceCall(call);
+            if (watchdogEnabled)
+                RetraceWatchdog::Instance().CallProcessed(call->no);
             if (!call->reuse_call)
                 delete call;
         }
@@ -775,6 +859,7 @@
         "      --dump-format=FORMAT dump state format (`json` or `ubjson`)\n"
         "  -w, --wait              waitOnFinish on final frame\n"
         "      --dump-per-frame-stats=out_file   dumps per frame retrace statistics in JSON format to the out_file\n"
+        "      --watchdog          invokes abort() if retrace of a single api call will take more than " << retrace::RetraceWatchdog::TimeoutInSec << " seconds\n"
         "      --timeout=N         specify retrace timeout in seconds\n"
         "      --per-frame-delay=MICROSECONDS   add extra delay after each frame (in addition to min-frame-duration)\n"
         "      --min-frame-duration=MICROSECONDS    specify minimum frame duration\n"
@@ -808,6 +893,7 @@
     MSAA_NO_RESOLVE_OPT,
     SB_OPT,
     DUMP_PER_FRAME_STATS_OPT,
+    WATCHDOG_OPT,
     TIMEOUT_OPT,
     PER_FRAME_DELAY_OPT,
     MIN_FRAME_DURATION_OPT,
@@ -865,6 +951,7 @@
     {"wait", no_argument, 0, 'w'},
     // begin of chromeos fork specific features
     {"dump-per-frame-stats", required_argument, 0, DUMP_PER_FRAME_STATS_OPT},
+    {"watchdog", no_argument, 0, WATCHDOG_OPT},
     {"timeout", required_argument, 0, TIMEOUT_OPT},
     {"per-frame-delay", required_argument, 0, PER_FRAME_DELAY_OPT},
     {"min-frame-duration", required_argument, 0, MIN_FRAME_DURATION_OPT},
@@ -1240,6 +1327,9 @@
                 dumpPerFrameStats = true;
             }
             break;
+        case WATCHDOG_OPT:
+            retrace::watchdogEnabled = true;
+            break;
         case TIMEOUT_OPT:
             timeoutSeconds = trace::intOption(optarg, -1);
             break;