blob: 65963bbb96036ed3ccf1a3e3820e7295fbebddf6 [file] [log] [blame]
# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
description "Reboot device upon repeated user interface crash."
author "chromium-os-dev@chromium.org"
# The ui job requires special respawning logic, as the system needs to respond
# differently to different kinds of exits.
#
# 0: Normal exit; respawn forever.
# 2: (Defined in session_manager_service.h) The browser is exiting too much too
# quickly. In some cases, such as a hung GPU, rebooting the device will allow
# the system to recover. Respawn up to TOO_CRASHY_LIMIT times in
# TOO_CRASHY_WINDOW_SECONDS before trying a reboot. Try this up to
# REBOOT_LIMIT times in REBOOT_WINDOW_SECONDS before giving up and waiting
# for an AU.
# All nonzero: respawn up to RESPAWN_LIMIT times in RESPAWN_WINDOW_SECONDS,
# then stop.
start on stopping ui
script
# Generic respawn throttle: how many restarts are tolerated inside the window,
# and where the restart timestamps are recorded.
RESPAWN_TIMESTAMP_FILE="/tmp/ui-respawn-timestamps"
RESPAWN_WINDOW_SECONDS="60"
RESPAWN_LIMIT="6"

# Exit status reported when the browser is exiting too much too quickly.
# Defined in session_manager_service.h
CHILD_EXITING_TOO_FAST="2"

# "Too crashy" throttle, consulted only for CHILD_EXITING_TOO_FAST exits.
TOO_CRASHY_TIMESTAMP_FILE="/tmp/ui-too-crashy-timestamps"
TOO_CRASHY_WINDOW_SECONDS="180"
TOO_CRASHY_LIMIT="1"

# Reboot throttle. The window is three "too crashy" windows long, and the
# timestamps are kept under REBOOT_TIMESTAMP_DIR.
REBOOT_TIMESTAMP_DIR="/var/lib/ui"
REBOOT_TIMESTAMP_FILE="${REBOOT_TIMESTAMP_DIR}/reboot-timestamps"
REBOOT_WINDOW_SECONDS="$(( TOO_CRASHY_WINDOW_SECONDS * 3 ))"
REBOOT_LIMIT="1"
# Given a file full of timestamps, appends a timestamp for 'now', and then
# trims the file so that it holds only timestamps that are no older than
# 'now' - |time|.
# Succeeds if there are at most |limit| timestamps remaining in |file|
# after this operation, fails otherwise.
#   file:  the name of the file containing timestamps (in s), one per line.
#   time:  the window of time (in s), prior to now, that we consider.
#   limit: the number of timestamps we're willing to allow.
# If the file cannot be trimmed, the untrimmed file is counted instead; if it
# cannot be read at all, it is treated as holding zero timestamps.
under_limit() {
  local file="$1"
  local time="$2"
  local limit="$3"
  local curtime
  local tfile
  local count

  curtime="$(date +%s)"
  echo "${curtime}" >>"${file}"  # To ensure the file exists.

  # The scratch file is created next to |file| so that the final mv is a
  # rename within one directory instead of a copy from another location.
  if tfile="$(mktemp "${file}.XXXXXX")"; then
    # The cutoff is handed to awk as a variable rather than being spliced
    # into the program text.
    if awk -v cutoff="$(( curtime - time ))" '$0 >= cutoff' "${file}" \
        >"${tfile}"; then
      mv -f "${tfile}" "${file}"
    else
      # Keep the existing timestamps rather than replacing them with
      # partial output.
      rm -f "${tfile}"
    fi
  fi

  count="$(wc -l <"${file}")" || count=0
  [ "${count}" -le "${limit}" ]
}
# Records a "too crashy" exit and succeeds when under_limit reports failure
# for the too-crashy timestamp file, window and limit.
over_ui_crash_limit() {
  if under_limit "${TOO_CRASHY_TIMESTAMP_FILE}" \
      "${TOO_CRASHY_WINDOW_SECONDS}" "${TOO_CRASHY_LIMIT}"; then
    return 1
  fi
  return 0
}
# Records a reboot attempt and reports the result of under_limit for the
# reboot timestamp file, window and limit.
under_reboot_limit() {
  under_limit \
    "${REBOOT_TIMESTAMP_FILE}" \
    "${REBOOT_WINDOW_SECONDS}" \
    "${REBOOT_LIMIT}"
}
# Records a respawn attempt and reports the result of under_limit for the
# respawn timestamp file, window and limit.
under_respawn_limit() {
  under_limit \
    "${RESPAWN_TIMESTAMP_FILE}" \
    "${RESPAWN_WINDOW_SECONDS}" \
    "${RESPAWN_LIMIT}"
}
# Create the directory where reboot timestamps (which must persist across
# reboots) will live.
mkdir -p "${REBOOT_TIMESTAMP_DIR}"

# If ${RESULT} = "ok", that means the job was stopped manually, so we want
# to avoid respawning. Otherwise, we want to kick in the logic.
if [ "${RESULT}" != "ok" ]; then
  if [ "${EXIT_STATUS}" = "0" ]; then
    start -n "${JOB}"  # start ui job again, always.
  else
    # The "exiting too fast" status gets a chance at a reboot before the
    # generic handling further down.
    if [ "${EXIT_STATUS}" = "${CHILD_EXITING_TOO_FAST}" ]; then
      if over_ui_crash_limit; then
        if under_reboot_limit; then
          logger -t "${UPSTART_JOB}" "Rebooting to mitigate crashiness."
          # NOTE(review): nothing exits after the reboot request, so if
          # reboot returns, the logging and generic respawn handling below
          # still run. Confirm this is intended.
          reboot
        else
          logger -t "${UPSTART_JOB}" "Rebooted too much; running respawn logic."
        fi
      else
        logger -t "${UPSTART_JOB}" "Haven't been too crashy too much yet."
      fi
    fi

    # Log how the job ended: exit status first, then signal, then unknown.
    # The tag is quoted so that an empty UPSTART_JOB cannot turn the message
    # into the argument of -t.
    if [ -n "${EXIT_STATUS}" ]; then
      logger -t "${UPSTART_JOB}" "${JOB} failed with exit status ${EXIT_STATUS}."
    elif [ -n "${EXIT_SIGNAL}" ]; then
      logger -t "${UPSTART_JOB}" "${JOB} died on signal ${EXIT_SIGNAL}."
    else
      logger -t "${UPSTART_JOB}" "${JOB} died for mysterious and unknown reasons."
    fi

    # Generic respawn handling.
    if under_respawn_limit; then
      logger -t "${UPSTART_JOB}" "Respawning ${JOB}."
      start -n "${JOB}"
    else
      logger -t "${UPSTART_JOB}" "Reached respawn limit for ${JOB}."
    fi
  fi
  # TODO(cmasone): http://crbug.com/266153 Show the user a splash
  # screen telling them to keep calm and carry on.
fi
end script