blob: bd88c080ff4ae6bda6ef4d09a73459dec19e924c [file] [log] [blame]
From e0fed430cd04b0f952fbcc9d826ff33bb1c3a4b6 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Thu, 6 Jun 2019 10:25:08 -0700
Subject: [PATCH] CHROMIUM: hardlockup: detect hard lockups without NMIs using
secondary cpus
Emulate NMIs on systems where they are not available by using timer
interrupts on other cpus. Each cpu will use its softlockup hrtimer
to check that the next cpu is processing hrtimer interrupts by
verifying that a counter is increasing.
This patch is useful on systems where the hardlockup detector is not
available due to a lack of NMIs, for example most ARM SoCs.
Without this patch any cpu stuck with interrupts disabled can
cause a hardware watchdog reset with no debugging information,
but with this patch the kernel can detect the lockup and panic,
which can result in useful debugging info.
BUG=chromium:941638
TEST=see CL:Ibef3a87fb30ae679b398ac5d40263662904989fe
arm64 and x86_64 defconfigs still build
Signed-off-by: Colin Cross <ccross@android.com>
(mka@: ported from CrOS v4.4. Main delta is that upstream moved parts
of watchdog.c to watchdog_hld.c, also we now have most new code in
watchdog_buddy_cpu.c. A FROMLIST version with some comments can be
found at https://patchwork.kernel.org/patch/1967551/
/ https://lkml.kernel.org/r/1357941108-14138-1-git-send-email-ccross@android.com)
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Change-Id: I8d24aab8bb3a3b37e4492668a29b66516c66268b
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1662655
Tested-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Guenter Roeck <groeck@chromium.org>
Commit-Queue: Douglas Anderson <dianders@chromium.org>
[rebase54(groeck): Squashed:
FIXUP: CHROMIUM: hardlockup: detect hard lockups without NMIs using secondary cpus
]
Signed-off-by: Guenter Roeck <groeck@chromium.org>
[rebase510(groeck): Conflicts:
kernel/watchdog.c
Squash:
FIXUP: CHROMIUM: hardlockup: detect hard lockups without NMIs using secondary cpus
]
Signed-off-by: Guenter Roeck <groeck@chromium.org>
---
include/linux/nmi.h | 15 ++++-
kernel/Makefile | 1 +
kernel/watchdog.c | 24 +++++--
kernel/watchdog_buddy_cpu.c | 128 ++++++++++++++++++++++++++++++++++++
lib/Kconfig.debug | 21 +++++-
5 files changed, 179 insertions(+), 10 deletions(-)
create mode 100644 kernel/watchdog_buddy_cpu.c
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 750c7f395ca9..efc99fd47755 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,6 +45,8 @@ extern void touch_softlockup_watchdog(void);
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
+DECLARE_PER_CPU(unsigned long, hrtimer_interrupts);
+DECLARE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
extern int lockup_detector_online_cpu(unsigned int cpu);
extern int lockup_detector_offline_cpu(unsigned int cpu);
@@ -81,14 +83,14 @@ static inline void reset_hung_task_detector(void) { }
#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
-#if defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_CORE)
extern void hardlockup_detector_disable(void);
extern unsigned int hardlockup_panic;
#else
static inline void hardlockup_detector_disable(void) {}
#endif
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR_CORE)
# define NMI_WATCHDOG_SYSCTL_PERM 0644
#else
# define NMI_WATCHDOG_SYSCTL_PERM 0444
@@ -122,6 +124,12 @@ int watchdog_nmi_probe(void);
int watchdog_nmi_enable(unsigned int cpu);
void watchdog_nmi_disable(unsigned int cpu);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_BUDDY_CPU
+extern void buddy_cpu_touch_watchdog(void);
+#else
+static inline void buddy_cpu_touch_watchdog(void) {}
+#endif
+
/**
* touch_nmi_watchdog - restart NMI watchdog timeout.
*
@@ -132,6 +140,7 @@ void watchdog_nmi_disable(unsigned int cpu);
static inline void touch_nmi_watchdog(void)
{
arch_touch_nmi_watchdog();
+ buddy_cpu_touch_watchdog();
touch_softlockup_watchdog();
}
@@ -195,7 +204,7 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh);
#endif
#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
- defined(CONFIG_HARDLOCKUP_DETECTOR)
+ defined(CONFIG_HARDLOCKUP_DETECTOR_CORE)
void watchdog_update_hrtimer_threshold(u64 period);
#else
static inline void watchdog_update_hrtimer_threshold(u64 period) { }
diff --git a/kernel/Makefile b/kernel/Makefile
index 4df609be42d0..42f79d5e9071 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_BUDDY_CPU) += watchdog_buddy_cpu.o
obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7c397907d0e9..3ff11b20af3a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,7 +29,7 @@
static DEFINE_MUTEX(watchdog_mutex);
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_CORE) || defined(CONFIG_HAVE_NMI_WATCHDOG)
# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
# define NMI_WATCHDOG_DEFAULT 1
#else
@@ -47,7 +47,7 @@ static int __read_mostly nmi_watchdog_available;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_CORE
# ifdef CONFIG_SMP
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
@@ -85,7 +85,9 @@ static int __init hardlockup_panic_setup(char *str)
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_CORE */
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
* These functions can be overridden if an architecture implements its
@@ -106,6 +108,13 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
hardlockup_detector_perf_disable();
}
+#else
+
+int __weak watchdog_nmi_enable(unsigned int cpu) { return 0; }
+void __weak watchdog_nmi_disable(unsigned int cpu) { return; }
+
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
/* Return 0, if a NMI watchdog is available. Error code otherwise */
int __weak __init watchdog_nmi_probe(void)
{
@@ -179,8 +188,8 @@ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(unsigned long, watchdog_report_ts);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static unsigned long soft_lockup_nmi_warn;
static int __init nowatchdog_setup(char *str)
@@ -326,6 +335,8 @@ bool is_hardlockup(void)
return false;
}
+void __weak watchdog_check_hardlockup(void) {}
+
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
@@ -365,6 +376,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups */
+ watchdog_check_hardlockup();
+
/* kick the softlockup detector */
if (completion_done(this_cpu_ptr(&softlockup_completion))) {
reinit_completion(this_cpu_ptr(&softlockup_completion));
diff --git a/kernel/watchdog_buddy_cpu.c b/kernel/watchdog_buddy_cpu.c
new file mode 100644
index 000000000000..56d96c0a2825
--- /dev/null
+++ b/kernel/watchdog_buddy_cpu.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/nmi.h>
+#include <linux/percpu-defs.h>
+
+static DEFINE_PER_CPU(bool, watchdog_touch);
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static cpumask_t __read_mostly watchdog_cpus;
+
+int __init watchdog_nmi_probe(void)
+{
+ return 0;
+}
+
+notrace void buddy_cpu_touch_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_touch, true);
+}
+EXPORT_SYMBOL_GPL(buddy_cpu_touch_watchdog);
+
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+
+static int is_hardlockup_buddy_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+void watchdog_check_hardlockup(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ smp_rmb();
+
+ if (per_cpu(watchdog_touch, next_cpu) == true) {
+ per_cpu(watchdog_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_buddy_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d8eb185589ad..dd0879f28c28 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1041,6 +1041,9 @@ config HARDLOCKUP_DETECTOR_PERF
config HARDLOCKUP_CHECK_TIMESTAMP
bool
+config HARDLOCKUP_DETECTOR_CORE
+ bool
+
#
# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard
# lockup detector rather than the perf based detector.
@@ -1050,6 +1053,7 @@ config HARDLOCKUP_DETECTOR
depends on DEBUG_KERNEL && !S390
depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH
select LOCKUP_DETECTOR
+ select HARDLOCKUP_DETECTOR_CORE
select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF
select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH
help
@@ -1061,9 +1065,22 @@ config HARDLOCKUP_DETECTOR
chance to run. The current stack trace is displayed upon detection
and the system will stay locked up.
+config HARDLOCKUP_DETECTOR_BUDDY_CPU
+ bool "Buddy CPU hardlockup detector"
+ depends on DEBUG_KERNEL && SMP
+ depends on !HARDLOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
+ depends on !S390
+ select HARDLOCKUP_DETECTOR_CORE
+ select SOFTLOCKUP_DETECTOR
+ help
+ Say Y here to enable a hardlockup detector where CPUs check
+ each other for lockup. Each cpu uses its softlockup hrtimer
+ to check that the next cpu is processing hrtimer interrupts by
+ verifying that a counter is increasing.
+
config BOOTPARAM_HARDLOCKUP_PANIC
bool "Panic (Reboot) On Hard Lockups"
- depends on HARDLOCKUP_DETECTOR
+ depends on HARDLOCKUP_DETECTOR_CORE
help
Say Y here to enable the kernel to panic on "hard lockups",
which are bugs that cause the kernel to loop in kernel
@@ -1074,7 +1091,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC
config BOOTPARAM_HARDLOCKUP_PANIC_VALUE
int
- depends on HARDLOCKUP_DETECTOR
+ depends on HARDLOCKUP_DETECTOR_CORE
range 0 1
default 0 if !BOOTPARAM_HARDLOCKUP_PANIC
default 1 if BOOTPARAM_HARDLOCKUP_PANIC
--
2.17.1