// SPDX-License-Identifier: GPL-2.0
/* Adaptive Rate Limiting Qdisc (ARL) is designed for home routers to eliminate
 * bufferbloat at the upstream CPE (Cable/DSL) modem. It prevents a bloated
 * queue from forming at the upstream CPE modem by rate shaping the throughput
 * to match the available bandwidth. Instead of using a preconfigured static
 * rate limit, it figures out the available bandwidth automatically and
 * adjusts the rate limit in real time by continuously monitoring latency
 * passively.
 * The latency measurements come from two sources: one is the RTT from the
 * kernel's TCP/IP stack, the other is the half-path RTT measured from routed
 * TCP streams. The minimum latency across all flows is used as the indication
 * of bufferbloat at the upstream CPE modem, because that is the common path
 * for all flows. ARL adjusts the rate limit dynamically based on the minimum
 * latency. If the throughput is less than the available bandwidth, no queue
 * builds up at the CPE device, so the minimum latency stays flat. On the
 * other hand, a spike in the minimum latency suggests a bloated queue in the
 * upstream CPE modem, indicating the current rate limit is above the
 * available bandwidth. In that case, ARL drains the queue and reduces the
 * rate limit.
 * ARL can be applied as the root qdisc of the WAN interface to prevent
 * upstream bufferbloat at the CPE modem. The queue is then managed locally at
 * the router by applying another qdisc, such as fq_codel, as the child qdisc.
 * To use ARL for downstream (ingress) traffic, create an IFB device, set up a
 * filter rule to redirect ingress traffic to the IFB device, then apply ARL
 * in ingress mode as the root qdisc of the IFB device.
 *
 * The passive latency measurement method for routed TCP streams is inspired
 * by:
 * Kathleen Nichols, "Listening to Networks",
 * http://netseminar.stanford.edu/seminars/02_02_17.pdf
 *
 * The rate shaping and some utility functions are from:
 * net/sched/sch_tbf.c
 *
 * Author: Kan Yan <kyan@google.com>
 */
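/* Example setup (an illustrative sketch, not part of the original code): the
 * commands below show one way to deploy ARL as described above. The "arl"
 * qdisc kind matches this module's ops; the "mode ingress" keyword and the
 * wan0/ifb0 device names are assumptions that depend on the matching
 * iproute2 patch and on local configuration.
 *
 *   # Egress: ARL as root qdisc on the WAN interface, fq_codel as child
 *   tc qdisc add dev wan0 root handle 1: arl
 *   tc qdisc add dev wan0 parent 1: fq_codel
 *
 *   # Ingress: redirect WAN ingress traffic to an IFB device, then apply
 *   # ARL in ingress mode as that device's root qdisc
 *   ip link add ifb0 type ifb
 *   ip link set ifb0 up
 *   tc qdisc add dev wan0 handle ffff: ingress
 *   tc filter add dev wan0 parent ffff: matchall \
 *           action mirred egress redirect dev ifb0
 *   tc qdisc add dev ifb0 root arl mode ingress
 */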
#include <linux/average.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/tcp.h>
#include <linux/types.h>
#include <linux/win_minmax.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/tcp.h>
#include "sch_arl.h"
#define ARL_SCALE 7 /* scaling shift: BIT(ARL_SCALE) == 128 */
#define ARL_BW_UNIT BIT(7) /* 128 bytes per second, approximately 1 kbit/s */
/* High gain to increase bw exponentially: doubles the BW in ~20 cycles */
static const int ARL_HIGH_GAIN = ARL_BW_UNIT * 1035 / 1000;
/* Drain gain: halves the rate in two cycles */
static const int ARL_DRAIN_GAIN = ARL_BW_UNIT * 707 / 1000;
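/* Sanity check of the gain arithmetic: after integer truncation the gains
 * evaluate to 132/128 ~= 1.031 and 90/128 ~= 0.703, so twenty high-gain
 * cycles give ~1.031^20 ~= 1.85x and two drain cycles give
 * ~0.703^2 ~= 0.49x, roughly matching the doubling/halving intent stated
 * above.
 */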
static bool arl_latency_sampling_enabled;
static int arl_dev_index = -1;
/* The rate for each phase is:
* base_rate + rate_delta * arl_rate_tbl[state][phase]
*/
static const int arl_rate_tbl[][ARL_CYCLE_LEN] = {
	{0, 0, 0, 0}, /* STABLE */
	{-1, -1, -1, 0}, /* DRAIN */
	{1, 0, 1, 0}, /* BW_PROBE */
	{-1, -1, -1, 0}, /* LATENCY_PROBE */
	{0, 0, 0, 0}, /* UNTHROTTLED */
	{0, 0, 0, 0}, /* IDLE: keeps table indexing in bounds when state is
		       * ARL_IDLE (assumes ARL_IDLE follows ARL_UNTHROTTLED
		       * in the state enum)
		       */
};
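/* Worked example (illustrative numbers): with base_rate = 1000 KB/s and
 * rate_delta = 100 KB/s, a BW_PROBE cycle ({1, 0, 1, 0}) runs its four
 * phases at 1100, 1000, 1100, 1000 KB/s, while a DRAIN cycle
 * ({-1, -1, -1, 0}) runs at 900, 900, 900, 1000 KB/s.
 */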
static void arl_bw_estimate_reset(struct arl_sched_data *q)
{
q->vars.bw_est_start_t = jiffies;
q->vars.bw_est_bytes_sent = 0;
}
static void arl_update_bw_estimate(struct arl_sched_data *q)
{
struct arl_vars *vars = &q->vars;
unsigned long now = jiffies, bw_avg;
if (!time_after(now, (vars->bw_est_start_t
+ msecs_to_jiffies(q->vars.phase_dur) - 1)))
return;
vars->last_bw_est = vars->bw_est;
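	/* bytes sent / elapsed ms => bw_est in KBytes/sec (all ARL rates are
	 * tracked in KB/s)
	 */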
	vars->bw_est = div_u64(vars->bw_est_bytes_sent * HZ,
			       (now - vars->bw_est_start_t) * 1000);
ewma_arl_bw_avg_add(&vars->bw_avg, vars->bw_est);
bw_avg = ewma_arl_bw_avg_read(&vars->bw_avg);
minmax_running_max(&vars->max_bw, msecs_to_jiffies(ARL_LT_WIN), now,
bw_avg);
if (bw_avg > q->stats.max_bw)
q->stats.max_bw = bw_avg;
arl_bw_estimate_reset(q);
}
static bool arl_is_latency_high(struct arl_sched_data *q)
{
u32 lt_min_hrtt, st_min_hrtt;
	/* return true only when there is a recent latency measurement */
if (time_after(jiffies, q->vars.last_latency_upd_t +
msecs_to_jiffies(q->vars.phase_dur * ARL_CYCLE_LEN * 2)))
return false;
lt_min_hrtt = minmax_get(&q->vars.lt_min_hrtt);
st_min_hrtt = minmax_get(&q->vars.st_min_hrtt);
/* consider latency is high if the short term smoothed latency is
* significantly (>latency_hysteresis) higher than
* max(ARL_LOW_LATENCY, lt_min_hrtt) or higher than the ARL parameter
* "max_latency".
*/
if ((st_min_hrtt < q->params.latency_hysteresis +
max_t(u32, ARL_LOW_LATENCY, lt_min_hrtt)) &&
minmax_get(&q->vars.min_hrtt) < q->params.max_latency)
return false;
else
return true;
}
/* Check if the bandwidth is fully used.
 * Return true if the measured throughput is above ~92% of the configured
 * rate, or within 400 kbit/s (50 KB/s) of it. E.g. with rate = 1250 KB/s,
 * the ~92% threshold is 1250 - (1250 * 10 >> 7) = 1153 KB/s.
 */
static bool arl_is_under_load(struct arl_sched_data *q)
{
u32 rate = q->vars.current_rate;
	return q->vars.bw_est > (rate - ((rate * 10) >> ARL_SCALE)) ||
	       q->vars.bw_est + 400 / 8 > rate;
}
static bool arl_is_throttling(struct arl_sched_data *q)
{
if (!q->qdisc)
return false;
	/* Consider ARL to be throttling the traffic if it causes significant
	 * backlog (sojourn time > 1/2 of the CoDel target).
	 */
	return psched_l2t_ns(&q->vars.rate, q->qdisc->qstats.backlog) >
	       q->params.target * NSEC_PER_USEC / 2;
}
/* Check if ARL should enter the DRAIN state. Periodically draining the queue
 * helps find the true minimum latency.
 */
static bool arl_check_drain(struct arl_sched_data *q)
{
if (q->vars.state != ARL_LATENCY_PROBE && q->vars.state != ARL_STABLE)
return false;
/* No need to DRAIN if the latency is low */
if (minmax_get(&q->vars.st_min_hrtt) < ARL_LOW_LATENCY)
return false;
/* No need to DRAIN unless it is under load */
if (!arl_is_under_load(q))
return false;
/* For INGRESS mode, if ARL is throttling the traffic, it is already
* draining the queue, so no need to enter the DRAIN mode.
*/
if (q->params.mode == ARL_INGRESS && arl_is_throttling(q) &&
!arl_is_latency_high(q))
return false;
if (minmax_get(&q->vars.min_hrtt) > q->params.max_latency)
return true;
	/* periodically enter DRAIN state for egress mode if it is under load */
	return ktime_ms_delta(q->vars.phase_start_t, q->vars.last_drain_t) >
	       ARL_DRAIN_INTERVAL;
}
static void arl_apply_new_rate(struct arl_sched_data *q, u64 next_rate)
{
u32 buffer;
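	/* next_rate is in KBytes/sec; psched_ratecfg_precompute() expects
	 * bytes/sec
	 */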
next_rate *= 1000;
psched_ratecfg_precompute(&q->vars.rate, &q->vars.cfg_rate, next_rate);
	/* The buffer is the burst size in ns; ensure it is large enough to
	 * transmit a max_size packet.
	 */
buffer = psched_l2t_ns(&q->vars.rate, q->params.max_size);
q->vars.buffer = max(buffer, q->params.buffer);
}
static void arl_change_state(struct arl_sched_data *q, int new_state)
{
struct arl_vars *vars = &q->vars;
u64 next_rate;
u32 bw, dur_min, dur_max;
vars->phase = 0;
vars->cycle_cnt = 0;
vars->phase_start_t = ktime_get();
vars->state_start_t = jiffies;
vars->latency_trend = 0;
vars->rate_factor = ARL_BW_UNIT;
if (q->params.mode == ARL_INGRESS) {
dur_min = ARL_INGRESS_PHASE_DUR_MIN;
dur_max = ARL_INGRESS_PHASE_DUR_MAX;
} else {
dur_min = ARL_PHASE_DUR_MIN;
dur_max = ARL_PHASE_DUR_MAX;
}
vars->phase_dur = clamp((u32)(2 * minmax_get(&vars->st_min_hrtt)
/ USEC_PER_MSEC), dur_min, dur_max);
if (vars->state == new_state)
return;
/* observed available bandwidth at the end of previous state */
bw = max_t(u32, ewma_arl_bw_avg_read(&vars->bw_avg), vars->bw_est);
if (vars->state == ARL_DRAIN || new_state == ARL_LATENCY_PROBE) {
		/* Leaving DRAIN state or entering LATENCY_PROBE: restore bw
		 * to the last stable bw measurement.
		 */
		bw = vars->last_stable_base_rate;
		if (arl_is_latency_high(q))
			vars->next_bw_probe_t = jiffies +
				msecs_to_jiffies(120 * MSEC_PER_SEC);
} else if (vars->state == ARL_BW_PROBE) {
		/* Use the bw from the previous cycle to avoid overshoot */
bw = max_t(u32, ewma_arl_bw_avg_read(&vars->bw_avg),
vars->last_bw_est);
} else if (q->params.mode == ARL_EGRESS) {
		/* For egress mode, reduce BW to offset the overshoot due to
		 * the increased BW while in the UNTHROTTLED state.
		 * This is not needed for ingress mode, as the measured BW
		 * should be the actual available bandwidth.
		 */
if (vars->state == ARL_UNTHROTTLED) {
bw -= (bw >> ARL_SCALE) * 5;
bw = min_t(u32, bw, vars->base_rate * 2);
} else {
bw = min_t(u32, bw, vars->base_rate);
if (arl_is_latency_high(q) ||
time_after(jiffies, vars->last_latency_upd_t +
msecs_to_jiffies(ARL_MT_WIN)))
bw -= (bw >> ARL_SCALE) * 2;
}
}
	/* adjust for overshoot */
bw -= (2 * bw >> ARL_SCALE);
/* New base rate for next state */
if (new_state != ARL_DRAIN && new_state != ARL_LATENCY_PROBE)
vars->base_rate = max(bw, vars->base_rate);
vars->last_stable_base_rate = bw;
/* set rate for next cycle */
vars->target_rate = vars->base_rate;
switch (new_state) {
case ARL_DRAIN:
vars->last_drain_t = ktime_get();
vars->phase_dur = minmax_get(&vars->st_min_hrtt) /
USEC_PER_MSEC;
vars->phase_dur = clamp(vars->phase_dur, ARL_DRAIN_DUR_MIN,
ARL_DRAIN_DUR_MAX);
vars->target_rate -= 5 * (vars->base_rate >> ARL_SCALE);
if (arl_is_latency_high(q)) {
			/* If latency is high, reduce the base rate to ~70%,
			 * so a [-1, -1, -1, 0] cycle can eliminate ~90% of an
			 * RTT's worth of queueing latency.
			 */
vars->target_rate = (vars->base_rate *
ARL_DRAIN_GAIN >> ARL_SCALE);
vars->base_rate -= 3 * (vars->base_rate >> ARL_SCALE);
}
break;
case ARL_BW_PROBE:
vars->target_rate = (vars->base_rate * ARL_HIGH_GAIN
>> ARL_SCALE);
break;
case ARL_LATENCY_PROBE:
vars->base_rate -= (vars->base_rate >> ARL_SCALE);
vars->target_rate -= 5 * (vars->base_rate >> ARL_SCALE);
break;
default:
break;
}
vars->base_rate = max_t(u32, vars->base_rate, q->params.min_rate);
vars->current_rate = vars->base_rate;
vars->rate_delta = abs(vars->target_rate - vars->base_rate);
vars->last_min_hrtt = minmax_get(&vars->st_min_hrtt);
vars->min_hrtt_last_cycle = vars->last_min_hrtt;
vars->state = new_state;
next_rate = vars->rate_delta * arl_rate_tbl[vars->state][vars->phase]
+ vars->current_rate;
arl_apply_new_rate(q, next_rate);
arl_bw_estimate_reset(q);
}
static void arl_update_phase(struct Qdisc *sch)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct arl_vars *vars = &q->vars;
u64 next_rate;
int latency;
u32 bw_avg;
bool is_under_load, is_latency_high, is_latency_current,
is_throttling;
is_under_load = arl_is_under_load(q);
is_throttling = arl_is_throttling(q);
/* Is latency high compared to long term minimum? */
is_latency_high = arl_is_latency_high(q);
is_latency_current = !time_after(jiffies, vars->last_latency_upd_t +
msecs_to_jiffies(vars->phase_dur *
ARL_CYCLE_LEN * 2));
if (arl_check_drain(q)) {
arl_change_state(q, ARL_DRAIN);
return;
}
vars->phase = (vars->phase == (ARL_CYCLE_LEN - 1)) ? 0 :
vars->phase + 1;
vars->phase_start_t = ktime_get();
next_rate = vars->rate_delta * arl_rate_tbl[vars->state][vars->phase]
+ vars->current_rate;
arl_apply_new_rate(q, next_rate);
latency = minmax_get(&vars->st_min_hrtt);
/* Update the latency_trend at the end of each phase for egress mode */
if (q->params.mode == ARL_EGRESS) {
if (!time_after(jiffies, vars->last_latency_upd_t +
msecs_to_jiffies(q->vars.phase_dur))) {
if (latency + q->params.latency_hysteresis / 2 <
min(vars->min_hrtt_last_cycle, vars->last_min_hrtt))
vars->latency_trend--;
else if (latency > q->params.latency_hysteresis / 2 +
min(vars->min_hrtt_last_cycle, vars->last_min_hrtt))
vars->latency_trend++;
}
} else if (vars->phase == 0) {
		/* For ingress mode, latency_trend counts for how many
		 * consecutive cycles the latency has been high.
		 */
if (is_latency_high)
vars->latency_trend++;
else
vars->latency_trend--;
}
if (latency < ARL_LOW_LATENCY || vars->latency_trend < 0)
vars->latency_trend = 0;
/* Update state for next cycle */
if (vars->phase != 0)
return;
arl_update_bw_estimate(q);
bw_avg = ewma_arl_bw_avg_read(&vars->bw_avg);
	/* If there is no recent latency measurement, stop adjusting rates in
	 * egress mode. In ingress mode, the BW still gets updated based on
	 * the measured incoming data rate.
	 */
if ((time_after(jiffies, vars->last_latency_upd_t +
msecs_to_jiffies(ARL_MT_WIN))) &&
q->params.mode == ARL_EGRESS) {
if (vars->state != ARL_STABLE)
arl_change_state(q, ARL_STABLE);
return;
}
if ((minmax_get(&q->vars.max_bw) > q->params.max_bw) &&
!is_latency_high) {
		/* The available BW is too high to worry about bufferbloat,
		 * so detach the rate limiter to avoid overhead.
		 */
arl_change_state(q, ARL_UNTHROTTLED);
return;
}
switch (vars->state) {
case ARL_STABLE:
if (vars->bw_est < q->params.min_rate && !is_under_load) {
arl_change_state(q, ARL_IDLE);
return;
}
if (q->params.mode == ARL_EGRESS) {
/* Exit stable state if latency increases under load */
if (is_latency_high) {
				/* Defer a few cycles before trying to reduce
				 * the rate. It may be just a short glitch, or
				 * the bloated queue may be in the other
				 * direction.
				 */
if (is_under_load || vars->cycle_cnt > 3) {
arl_change_state(q, ARL_LATENCY_PROBE);
return;
}
} else if (is_under_load) {
if (vars->latency_trend == 0 &&
vars->cycle_cnt > 5 &&
time_after(jiffies,
vars->next_bw_probe_t)) {
arl_change_state(q, ARL_BW_PROBE);
return;
}
} else {
vars->cycle_cnt = 0;
}
break;
}
		/* INGRESS mode */
if (is_latency_high) {
if (vars->latency_trend > 1) {
arl_change_state(q, ARL_LATENCY_PROBE);
return;
}
break;
}
		/* If the ingress queue is building up when the latency
		 * increases, then ARL is operating in the right direction:
		 * CoDel will do its work to shrink the queue. Otherwise, the
		 * current rate is too high and needs to be reduced.
		 */
if (!is_throttling) {
vars->last_drain_t = ktime_get();
vars->cycle_cnt = 0;
break;
}
if (vars->latency_trend > 0 || !is_latency_current)
break;
		/* Latency is low and the ingress queue is building up, so the
		 * rate can be increased to the observed bw.
		 */
if (vars->cycle_cnt > 5 && bw_avg > vars->base_rate -
2 * (vars->base_rate >> ARL_SCALE) &&
time_after(jiffies, vars->next_bw_probe_t)) {
arl_change_state(q, ARL_BW_PROBE);
return;
}
bw_avg -= 2 * (bw_avg >> ARL_SCALE);
vars->current_rate = max_t(u32, vars->current_rate, bw_avg);
break;
case ARL_BW_PROBE:
if (q->params.mode == ARL_EGRESS) {
/* Exit BW probe state if latency is increasing */
if (is_latency_high || vars->latency_trend > 2) {
arl_change_state(q, ARL_LATENCY_PROBE);
return;
}
/* Exit to stable state if the traffic is light */
if (!is_throttling || vars->latency_trend >= 1) {
arl_change_state(q, ARL_STABLE);
return;
}
			/* If BW has increased significantly (>30%) without a
			 * latency increase, switch to the UNTHROTTLED state
			 * to figure out the available BW quickly.
			 */
if (vars->cycle_cnt > 9) {
if (vars->bw_est >
vars->base_rate * 130 / 100) {
arl_change_state(q, ARL_UNTHROTTLED);
return;
}
}
} else {
if (vars->latency_trend > 0) {
arl_change_state(q, ARL_LATENCY_PROBE);
return;
}
if (!is_throttling || !is_latency_current) {
				/* For ingress, exit to stable state if ARL is
				 * not throttling the traffic or the latency
				 * measurement has been lost.
				 */
arl_change_state(q, ARL_STABLE);
return;
}
}
/* Update probe rate every 3 cycles */
if (vars->cycle_cnt % 3 == 2) {
/* Go to stable state if the measured bw stops
* increasing
*/
if (vars->bw_est < vars->base_rate) {
arl_change_state(q, ARL_STABLE);
return;
}
vars->current_rate = max_t(u32, vars->current_rate,
bw_avg);
vars->target_rate = vars->current_rate;
			/* Pause the rate_delta increase for one in every
			 * three cycles to observe the latency change. There
			 * can be some lag between a rate change and the
			 * resulting latency change.
			 */
vars->rate_factor = ARL_BW_UNIT;
			/* For ingress mode, do not increase the rate every
			 * cycle; only increase it based on an observed
			 * bandwidth increase.
			 */
vars->rate_delta = 0;
} else {
/* Switch to high gain if latency is stable. */
vars->rate_factor = ARL_HIGH_GAIN;
}
		/* In ingress mode, the rate is updated every cycle to the
		 * observed bandwidth.
		 */
if (q->params.mode == ARL_INGRESS) {
vars->current_rate = max(vars->current_rate,
vars->bw_est);
} else {
/* update rate for next cycle */
vars->target_rate = max_t(u32, vars->target_rate,
vars->current_rate);
vars->target_rate = vars->target_rate *
vars->rate_factor >> ARL_SCALE;
vars->rate_delta = vars->target_rate + 1 -
vars->current_rate;
vars->rate_delta = min_t(u32, vars->rate_delta,
vars->base_rate / 10);
}
break;
case ARL_LATENCY_PROBE:
if (!is_latency_high || vars->bw_est < q->params.min_rate) {
/* If latency is no longer high or cannot be further
* reduced, go back to stable state.
*/
if (is_under_load)
vars->base_rate -= (vars->base_rate >>
ARL_SCALE);
arl_change_state(q, ARL_STABLE);
return;
}
		/* If it is not just a short-term minor latency increase, the
		 * previous minor rate adjustments were not sufficient: the
		 * base_rate likely exceeds the available bandwidth, so go to
		 * the DRAIN state.
		 */
if (vars->bw_est > q->params.min_rate && is_latency_current &&
(minmax_get(&q->vars.st_min_hrtt) > q->params.max_latency &&
vars->cycle_cnt > 2)) {
vars->base_rate -= 3 * (vars->base_rate >> ARL_SCALE);
arl_change_state(q, ARL_DRAIN);
return;
}
/* update rate for next cycle */
if (vars->latency_trend >= 0 || is_latency_high)
vars->rate_factor = ARL_DRAIN_GAIN;
else
vars->rate_factor = ARL_BW_UNIT;
vars->target_rate = vars->target_rate *
vars->rate_factor >> ARL_SCALE;
vars->current_rate = clamp(vars->last_stable_base_rate,
vars->target_rate, vars->base_rate);
break;
case ARL_DRAIN:
if (!is_latency_high) {
arl_change_state(q, ARL_STABLE);
return;
}
vars->current_rate -= 5 * (vars->base_rate >> ARL_SCALE);
if (vars->last_stable_base_rate < vars->base_rate)
vars->base_rate = max(vars->last_stable_base_rate,
vars->current_rate);
arl_change_state(q, ARL_LATENCY_PROBE);
return;
case ARL_UNTHROTTLED:
if (minmax_get(&vars->max_bw) > q->params.max_bw ||
vars->bw_est < q->params.min_rate)
break;
if (is_latency_high || vars->latency_trend > 1 ||
!is_latency_current || vars->cycle_cnt > 10 ||
vars->bw_est > vars->base_rate * 2) {
arl_change_state(q, ARL_STABLE);
return;
}
break;
case ARL_IDLE:
if (vars->bw_est > q->params.min_rate || is_under_load) {
arl_change_state(q, ARL_STABLE);
return;
}
/* Restore the default rate when it has been idle for 20
* minutes.
*/
if (time_after(jiffies, vars->state_start_t + 20 * 60 * HZ)) {
if (vars->base_rate < q->params.rate)
vars->base_rate = q->params.rate;
arl_change_state(q, ARL_IDLE);
}
break;
}
/* state unchanged */
vars->cycle_cnt++;
vars->min_hrtt_last_cycle = minmax_get(&vars->st_min_hrtt);
if (q->params.mode == ARL_EGRESS)
vars->latency_trend = 0;
if (vars->base_rate < q->stats.min_rate || q->stats.min_rate == 0)
q->stats.min_rate = vars->base_rate;
next_rate = vars->rate_delta * arl_rate_tbl[vars->state][vars->phase]
+ vars->current_rate;
arl_apply_new_rate(q, next_rate);
}
static void arl_update(struct Qdisc *sch)
{
struct arl_sched_data *q = qdisc_priv(sch);
if (ktime_ms_delta(ktime_get(), q->vars.phase_start_t) <
q->vars.phase_dur)
return;
arl_update_phase(sch);
}
static void arl_params_init(struct arl_params *params)
{
params->max_size = 1600;
params->buffer = ARL_BUFFER_SIZE_DEFAULT * NSEC_PER_USEC;
params->max_bw = ARL_MAX_BW_DEFAULT;
params->min_rate = ARL_MIN_RATE_DEFAULT;
params->limit = 1000;
params->max_latency = ARL_MAX_LATENCY_DEFAULT;
params->latency_hysteresis = ARL_LAT_HYSTERESIS_DEFAULT;
params->mode = ARL_EGRESS;
params->target = 10000;
}
static void arl_vars_init(struct arl_sched_data *q)
{
struct arl_vars *vars = &q->vars;
vars->ts = ktime_get_ns();
vars->last_drain_t = ktime_get();
minmax_reset(&vars->lt_min_hrtt, jiffies, 5 * MSEC_PER_SEC);
minmax_reset(&vars->st_min_hrtt, jiffies, 5 * MSEC_PER_SEC);
minmax_reset(&vars->max_bw, jiffies, 0);
ewma_arl_bw_avg_init(&vars->bw_avg);
vars->cfg_rate.linklayer = TC_LINKLAYER_ETHERNET;
vars->base_rate = q->params.rate;
vars->current_rate = vars->base_rate;
vars->target_rate = vars->base_rate;
vars->tokens = q->params.buffer;
vars->buffer = q->params.buffer;
vars->next_bw_probe_t = jiffies;
vars->last_latency_upd_t = jiffies;
arl_bw_estimate_reset(q);
arl_change_state(q, ARL_BW_PROBE);
}
static void arl_update_latency_ct(struct arl_sched_data *q,
struct tcp_latency_sample *lat, u32 latency)
{
u32 s_hrtt, duration, s_hrtt_last = lat->s_hrtt_us;
if (latency > ARL_LATENCY_SAMPLE_TIMEOUT_US)
return;
if (s_hrtt_last > ARL_LATENCY_SAMPLE_TIMEOUT_US)
s_hrtt_last = latency;
/* s_hrtt_us = 3/4 old s_hrtt_us + 1/4 new sample */
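	/* e.g. old s_hrtt = 8000us, new sample = 4000us:
	 * (4 * 8000 + 4000 - 8000) / 4 = 7000us
	 */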
if (s_hrtt_last)
s_hrtt = s_hrtt_last * 4 + latency - s_hrtt_last;
else
s_hrtt = latency * 4;
s_hrtt = s_hrtt / 4;
if (s_hrtt > ARL_LATENCY_SAMPLE_TIMEOUT_US)
s_hrtt = latency;
lat->s_hrtt_us = s_hrtt;
	/* Ingress mode (downstream traffic) has fewer latency samples */
if (q->params.mode == ARL_INGRESS)
duration = q->vars.phase_dur * 4;
else
duration = q->vars.phase_dur;
minmax_running_min(&q->vars.st_min_hrtt, msecs_to_jiffies(duration),
jiffies, latency);
minmax_running_min(&q->vars.min_hrtt, msecs_to_jiffies(ARL_MT_WIN),
jiffies, s_hrtt);
minmax_running_min(&q->vars.lt_min_hrtt, msecs_to_jiffies(ARL_LT_WIN),
jiffies, s_hrtt);
q->vars.last_latency_upd_t = jiffies;
}
static void arl_update_latency(struct arl_sched_data *q, u32 latency)
{
u32 duration;
	/* Ingress mode (downstream traffic) has fewer latency samples */
if (q->params.mode == ARL_INGRESS)
duration = q->vars.phase_dur * 4;
else
duration = q->vars.phase_dur;
minmax_running_min(&q->vars.st_min_hrtt,
(msecs_to_jiffies(duration)), jiffies,
latency);
minmax_running_min(&q->vars.min_hrtt, msecs_to_jiffies(ARL_MT_WIN),
jiffies, latency);
minmax_running_min(&q->vars.lt_min_hrtt, msecs_to_jiffies(ARL_LT_WIN),
jiffies, latency);
q->vars.last_latency_upd_t = jiffies;
}
/* Latency measurement related utilities.
 * There are two sources of latency measurements:
 * 1) The kernel's RTT measurement for TCP sockets bound to the qdisc's
 *    interface.
 * 2) The half-path RTT measured by ARL for routed TCP sessions. The half
 *    path measured is router -> internet -> ACK back to the router.
 *
 * To measure the half-path RTT for routed TCP sessions:
 * For each routed TCP flow, one egress packet is sampled for latency
 * measurement. The sequence number extracted from the TCP header and the
 * dequeue time are stored in the TCP stream's conntrack entry. The latency
 * is measured from the time the packet is dequeued on the egress path to the
 * time the TCP ACK for that segment is received on the ingress path.
 */
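/* Timeline sketch (illustrative):
 *
 *   t0: data segment (seq S) dequeued on the egress path --> CPE --> internet
 *   t1: TCP ACK covering S received on the ingress path <-- CPE <-- internet
 *
 *   half-path RTT = t1 - t0 (the LAN-side part of the path is excluded)
 */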
static void arl_egress_mark_pkt(struct sk_buff *skb, u32 seq,
struct nf_conn *ct)
{
struct tcp_latency_sample *tcp_lat;
tcp_lat = &ct->proto.tcp.latency_sample;
tcp_lat->send_ts = ktime_get();
tcp_lat->last_seq = seq;
tcp_lat->last_hrtt = 0;
}
static struct tcphdr *arl_get_tcp_header_ipv4(struct sk_buff *skb,
void *buffer)
{
const struct iphdr *iph;
struct tcphdr *tcph;
u32 tcph_offset;
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
return NULL;
iph = ip_hdr(skb);
if (iph->protocol != IPPROTO_TCP)
return NULL;
tcph_offset = skb_network_offset(skb) + iph->ihl * 4;
if (tcph_offset > skb->len)
return NULL;
tcph = skb_header_pointer(skb, tcph_offset, sizeof(struct tcphdr),
buffer);
return tcph;
}
static struct tcphdr *arl_get_tcp_header_ipv6(struct sk_buff *skb,
void *buffer)
{
const struct ipv6hdr *ipv6h;
struct tcphdr *tcphdr;
u8 proto;
__be16 frag_off;
int tcphoff;
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
return NULL;
ipv6h = ipv6_hdr(skb);
if (ipv6h->version != 6)
return NULL;
if (ipv6_addr_is_multicast(&ipv6h->daddr) ||
ipv6_addr_is_multicast(&ipv6h->saddr))
return NULL;
proto = ipv6h->nexthdr;
tcphoff = ipv6_skip_exthdr(skb, skb_network_offset(skb) +
sizeof(*ipv6h), &proto, &frag_off);
if (tcphoff < 0 || proto != IPPROTO_TCP ||
((tcphoff + sizeof(struct tcphdr)) > skb->len))
return NULL;
tcphdr = skb_header_pointer(skb, tcphoff, sizeof(struct tcphdr),
buffer);
return tcphdr;
}
/* Find the conntrack entry for a packet that takes the shortcut path and has
 * no ct entry set in its skb.
 */
static struct nf_conn *arl_egress_find_ct_v4(struct sk_buff *skb,
struct tcphdr *tcph)
{
struct iphdr *iph;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct = NULL;
/* construct a tuple to lookup nf_conn. */
memset(&tuple, 0, sizeof(tuple));
iph = ip_hdr(skb);
tuple.dst.protonum = iph->protocol;
	/* The routed packet is transformed by NAPT, so use the CT entry from
	 * the reverse direction.
	 */
tuple.dst.dir = IP_CT_DIR_REPLY;
tuple.src.u3.ip = iph->daddr;
tuple.dst.u3.ip = iph->saddr;
tuple.src.l3num = AF_INET;
tuple.src.u.tcp.port = tcph->dest;
tuple.dst.u.tcp.port = tcph->source;
h = nf_conntrack_find_get(&init_net, &nf_ct_zone_dflt, &tuple);
if (unlikely(!h))
return ct;
ct = nf_ct_tuplehash_to_ctrack(h);
return ct;
}
static struct nf_conn *arl_egress_find_ct_v6(struct sk_buff *skb,
struct tcphdr *tcph)
{
struct ipv6hdr *ipv6h;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct = NULL;
/* construct a tuple to lookup nf_conn. */
memset(&tuple, 0, sizeof(tuple));
tuple.dst.dir = IP_CT_DIR_REPLY;
tuple.dst.protonum = IPPROTO_TCP;
ipv6h = ipv6_hdr(skb);
tuple.src.u3.in6 = ipv6h->daddr;
tuple.dst.u3.in6 = ipv6h->saddr;
tuple.src.l3num = AF_INET6;
tuple.dst.u.tcp.port = tcph->source;
tuple.src.u.tcp.port = tcph->dest;
h = nf_conntrack_find_get(&init_net, &nf_ct_zone_dflt, &tuple);
if (unlikely(!h))
return ct;
ct = nf_ct_tuplehash_to_ctrack(h);
return ct;
}
static void arl_sample_latency_egress(struct arl_sched_data *q,
struct sk_buff *skb)
{
struct tcphdr *tcph, tcphdr;
struct nf_conn *ct;
struct tcp_latency_sample *tcp_lat;
u32 latency_sampling;
u32 latency = 0;
if (!arl_latency_sampling_enabled)
return;
	/* Skip packets too small to carry Ethernet + IPv4 + TCP headers
	 * (14 + 20 + 20 = 54 bytes)
	 */
if (!skb || skb->len < 54)
return;
/* Skip bc/mc packets. */
if (unlikely(skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST))
return;
/* Only process TCP packets */
if (likely(htons(ETH_P_IP) == skb->protocol)) {
tcph = arl_get_tcp_header_ipv4(skb, &tcphdr);
if (!tcph)
return;
ct = arl_egress_find_ct_v4(skb, tcph);
} else if (likely(htons(ETH_P_IPV6) == skb->protocol)) {
tcph = arl_get_tcp_header_ipv6(skb, &tcphdr);
if (!tcph)
return;
ct = arl_egress_find_ct_v6(skb, tcph);
} else {
return;
}
if (unlikely(!ct))
return;
if (!nf_ct_is_confirmed(ct))
goto exit;
tcp_lat = &ct->proto.tcp.latency_sample;
latency_sampling = atomic_read(&tcp_lat->sampling_state);
if (unlikely(latency_sampling == ARL_SAMPLE_STATE_DONE)) {
latency = tcp_lat->last_hrtt;
if (atomic_cmpxchg(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_DONE,
ARL_SAMPLE_STATE_UPDATING)
!= ARL_SAMPLE_STATE_DONE)
goto exit;
if (latency) {
tcp_lat->last_hrtt = 0;
arl_update_latency_ct(q, tcp_lat, latency);
}
atomic_set(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_IDLE);
} else if (latency_sampling == ARL_SAMPLE_STATE_SAMPLING) {
s64 time_delta = ktime_us_delta(ktime_get(), tcp_lat->send_ts);
if (time_delta < ARL_LATENCY_SAMPLE_TIMEOUT_US)
goto exit;
if (atomic_cmpxchg(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_SAMPLING,
ARL_SAMPLE_STATE_IDLE) !=
ARL_SAMPLE_STATE_SAMPLING)
goto exit;
} else if (latency_sampling > ARL_SAMPLE_STATE_IDLE) {
goto exit;
}
/* Check if it should start sampling for latency again */
if (ntohl(tcph->seq) == tcp_lat->last_seq)
goto exit;
if (atomic_cmpxchg(&tcp_lat->sampling_state, ARL_SAMPLE_STATE_IDLE,
ARL_SAMPLE_STATE_UPDATING) != ARL_SAMPLE_STATE_IDLE)
goto exit;
arl_egress_mark_pkt(skb, ntohl(tcph->seq), ct);
atomic_set(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_SAMPLING);
exit:
nf_ct_put(ct);
}
/* Extract the half-path round trip time measured from routed TCP packets.
 * Return 0 on success, -1 otherwise.
 */
static int arl_update_hrtt(struct arl_sched_data *q, struct sk_buff *skb,
u32 ack_seq,
struct tcp_latency_sample *tcp_lat)
{
s64 time_delta;
	/* wrap-safe TCP sequence number comparison */
	if (before(ack_seq, tcp_lat->last_seq))
return -1;
time_delta = ktime_us_delta(ktime_get(), tcp_lat->send_ts);
if (time_delta > ARL_LATENCY_SAMPLE_TIMEOUT_US) {
atomic_set(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_IDLE);
return -1;
}
if (atomic_cmpxchg(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_SAMPLING,
ARL_SAMPLE_STATE_UPDATING) !=
ARL_SAMPLE_STATE_SAMPLING)
return -1;
tcp_lat->last_hrtt = time_delta;
arl_update_latency_ct(q, tcp_lat, time_delta);
atomic_set(&tcp_lat->sampling_state,
ARL_SAMPLE_STATE_DONE);
return 0;
}
static void arl_sample_latency_ingress_v4(struct arl_sched_data *q,
struct sk_buff *skb,
struct tcphdr *tcph)
{
struct nf_conn *ct;
struct nf_conntrack_tuple tuple;
struct iphdr *iph;
struct nf_conntrack_tuple_hash *h;
struct tcp_latency_sample *tcp_lat;
if (!skb || !skb->dev)
return;
if (!arl_latency_sampling_enabled)
return;
/* construct a tuple to lookup nf_conn. */
memset(&tuple, 0, sizeof(tuple));
tuple.dst.dir = IP_CT_DIR_REPLY;
tuple.dst.protonum = IPPROTO_TCP;
iph = ip_hdr(skb);
tuple.src.u3.ip = iph->saddr;
tuple.dst.u3.ip = iph->daddr;
tuple.src.l3num = AF_INET;
tuple.dst.u.tcp.port = tcph->dest;
tuple.src.u.tcp.port = tcph->source;
h = nf_conntrack_find_get(&init_net, &nf_ct_zone_dflt, &tuple);
if (unlikely(!h))
return;
ct = nf_ct_tuplehash_to_ctrack(h);
if (!ct)
goto exit;
tcp_lat = &ct->proto.tcp.latency_sample;
if (atomic_read(&tcp_lat->sampling_state) != ARL_SAMPLE_STATE_SAMPLING)
goto exit;
	arl_update_hrtt(q, skb, ntohl(tcph->ack_seq), tcp_lat);
exit:
nf_ct_put(ct);
}
static void arl_sample_latency_ingress_v6(struct arl_sched_data *q,
struct sk_buff *skb,
struct tcphdr *tcph)
{
struct nf_conn *ct;
struct nf_conntrack_tuple tuple;
struct ipv6hdr *ipv6h;
struct nf_conntrack_tuple_hash *h;
struct tcp_latency_sample *tcp_lat;
if (!skb || !skb->dev)
return;
if (!arl_latency_sampling_enabled)
return;
/* construct a tuple to lookup nf_conn. */
memset(&tuple, 0, sizeof(tuple));
tuple.dst.dir = IP_CT_DIR_REPLY;
tuple.dst.protonum = IPPROTO_TCP;
ipv6h = ipv6_hdr(skb);
tuple.src.u3.in6 = ipv6h->saddr;
tuple.dst.u3.in6 = ipv6h->daddr;
tuple.src.l3num = AF_INET6;
tuple.dst.u.tcp.port = tcph->dest;
tuple.src.u.tcp.port = tcph->source;
h = nf_conntrack_find_get(&init_net, &nf_ct_zone_dflt, &tuple);
if (unlikely(!h))
return;
ct = nf_ct_tuplehash_to_ctrack(h);
if (!ct)
goto exit;
tcp_lat = &ct->proto.tcp.latency_sample;
if (atomic_read(&tcp_lat->sampling_state) != ARL_SAMPLE_STATE_SAMPLING)
goto exit;
	arl_update_hrtt(q, skb, ntohl(tcph->ack_seq), tcp_lat);
exit:
nf_ct_put(ct);
}
static void arl_sample_latency_ingress(struct arl_sched_data *q,
struct sk_buff *skb)
{
struct tcphdr *tcph, tcphdr;
if (htons(ETH_P_IP) == skb->protocol) {
tcph = arl_get_tcp_header_ipv4(skb, &tcphdr);
if (!tcph)
return;
arl_sample_latency_ingress_v4(q, skb, tcph);
} else if (htons(ETH_P_IPV6) == skb->protocol) {
tcph = arl_get_tcp_header_ipv6(skb, &tcphdr);
if (!tcph)
return;
arl_sample_latency_ingress_v6(q, skb, tcph);
}
}
static void arl_dequeue_update(struct Qdisc *sch, struct sk_buff *skb)
{
struct arl_sched_data *q = qdisc_priv(sch);
qdisc_qstats_backlog_dec(sch, skb);
if (WARN_ONCE(sch->qstats.backlog > INT_MAX,
"backlog underflow %d %d\n", sch->qstats.backlog,
qdisc_pkt_len(skb)))
sch->qstats.backlog = 0;
sch->q.qlen--;
qdisc_bstats_update(sch, skb);
if (q->params.mode != ARL_EGRESS)
return;
q->vars.bw_est_bytes_sent += qdisc_pkt_len(skb);
arl_sample_latency_egress(q, skb);
}
static void arl_enqueue_update(struct Qdisc *sch, unsigned int len)
{
struct arl_sched_data *q = qdisc_priv(sch);
sch->qstats.backlog += len;
sch->q.qlen++;
if (q->params.mode != ARL_INGRESS)
return;
q->vars.bw_est_bytes_sent += len;
}
/* GSO packets may be too big and take more than the maximum number of tokens
 * to transmit. Segment GSO packets that are larger than max_size.
 */
static int gso_segment(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
int ret, nb;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
return qdisc_drop(skb, sch, to_free);
nb = 0;
while (segs) {
nskb = segs->next;
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
len += segs->len;
ret = qdisc_enqueue(segs, q->qdisc, to_free);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
} else {
nb++;
}
segs = nskb;
}
sch->q.qlen += nb;
if (nb > 1)
qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
sch->qstats.backlog += len;
consume_skb(skb);
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
static int arl_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct arl_sched_data *q = qdisc_priv(sch);
unsigned int len = qdisc_pkt_len(skb);
int ret;
if (qdisc_pkt_len(skb) > q->params.max_size) {
if (skb_is_gso(skb) &&
skb_gso_mac_seglen(skb) <= q->params.max_size)
return gso_segment(skb, sch, to_free);
return qdisc_drop(skb, sch, to_free);
}
ret = qdisc_enqueue(skb, q->qdisc, to_free);
if (unlikely(ret != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
return ret;
}
if (q->params.mode == ARL_INGRESS)
arl_sample_latency_ingress(q, skb);
arl_enqueue_update(sch, len);
return NET_XMIT_SUCCESS;
}
static struct sk_buff *arl_dequeue(struct Qdisc *sch)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
arl_update(sch);
skb = q->qdisc->ops->peek(q->qdisc);
if (skb) {
s64 now;
s64 toks;
unsigned int len = qdisc_pkt_len(skb);
if (WARN_ONCE(len > q->params.max_size,
"Oversized pkt! %u Bytes, max:%u\n", len,
q->params.max_size))
len = q->params.max_size - 1;
if (q->vars.state == ARL_UNTHROTTLED) {
skb = qdisc_dequeue_peeked(q->qdisc);
if (unlikely(!skb))
return NULL;
arl_dequeue_update(sch, skb);
return skb;
}
now = ktime_get_ns();
toks = min_t(s64, now - q->vars.ts, q->vars.buffer);
toks += q->vars.tokens;
if (toks > q->vars.buffer)
toks = q->vars.buffer;
toks -= (s64)psched_l2t_ns(&q->vars.rate, len);
if (toks >= 0) {
skb = qdisc_dequeue_peeked(q->qdisc);
if (unlikely(!skb))
return NULL;
q->vars.ts = now;
q->vars.tokens = toks;
arl_dequeue_update(sch, skb);
return skb;
}
qdisc_watchdog_schedule_ns(&q->wtd, now +
min_t(u32, (-toks), q->vars.buffer));
qdisc_qstats_overlimit(sch);
}
return NULL;
}
static void arl_reset(struct Qdisc *sch)
{
struct arl_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
q->vars.ts = ktime_get_ns();
q->vars.tokens = q->vars.buffer;
qdisc_watchdog_cancel(&q->wtd);
}
static const struct nla_policy arl_policy[TCA_ARL_MAX + 1] = {
[TCA_ARL_BUFFER] = { .type = NLA_U32 },
[TCA_ARL_MIN_RATE] = { .type = NLA_U64 },
[TCA_ARL_MAX_BW] = { .type = NLA_U64 },
[TCA_ARL_LIMIT] = { .type = NLA_U32 },
[TCA_ARL_MAX_LATENCY] = { .type = NLA_U32 },
[TCA_ARL_LATENCY_HYSTERESIS] = { .type = NLA_U32 },
[TCA_ARL_MODE] = { .type = NLA_U32 },
[TCA_ARL_CODEL_TARGET] = { .type = NLA_U32 },
};
static int arl_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct arl_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_ARL_MAX + 1];
struct Qdisc *child = NULL;
struct psched_ratecfg rate;
struct tc_ratespec rate_conf;
err = nla_parse_nested(tb, TCA_ARL_MAX, opt, arl_policy, NULL);
if (err < 0)
return err;
if (tb[TCA_ARL_BUFFER])
q->params.buffer = nla_get_u32(tb[TCA_ARL_BUFFER])
* NSEC_PER_USEC;
	/* Convert max_bw (and min_rate below) from bytes/sec to KBytes/sec */
if (tb[TCA_ARL_MAX_BW])
q->params.max_bw = div_u64(nla_get_u64(tb[TCA_ARL_MAX_BW]),
1000);
if (tb[TCA_ARL_MIN_RATE])
q->params.min_rate = div_u64(nla_get_u64(tb[TCA_ARL_MIN_RATE]),
1000);
if (tb[TCA_ARL_MODE])
q->params.mode = nla_get_u32(tb[TCA_ARL_MODE]);
	/* The default config assumes min_rate is 70% of the connection
	 * speed, so derive the default rate as min_rate / 0.7.
	 */
q->params.rate = div_u64(q->params.min_rate * 100, 70);
if (tb[TCA_ARL_LIMIT])
q->params.limit = nla_get_u32(tb[TCA_ARL_LIMIT]);
if (tb[TCA_ARL_MAX_LATENCY])
q->params.max_latency = nla_get_u32(tb[TCA_ARL_MAX_LATENCY]);
if (tb[TCA_ARL_LATENCY_HYSTERESIS])
q->params.latency_hysteresis =
nla_get_u32(tb[TCA_ARL_LATENCY_HYSTERESIS]);
if (q->params.max_latency < ARL_MAX_LATENCY_DEFAULT / 2)
q->params.max_latency = ARL_MAX_LATENCY_DEFAULT;
if (tb[TCA_ARL_CODEL_TARGET])
q->params.target = nla_get_u32(tb[TCA_ARL_CODEL_TARGET]);
arl_vars_init(q);
memset(&rate_conf, 0, sizeof(rate_conf));
rate_conf.linklayer = TC_LINKLAYER_ETHERNET;
psched_ratecfg_precompute(&rate, &rate_conf, q->params.rate * 1000);
memcpy(&q->vars.rate, &rate, sizeof(struct psched_ratecfg));
if (q->qdisc != &noop_qdisc) {
err = fifo_set_limit(q->qdisc, q->params.limit);
if (err)
goto done;
} else if (q->params.limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops,
q->params.limit);
if (IS_ERR(child)) {
err = PTR_ERR(child);
goto done;
}
/* child is fifo, no need to check for noop_qdisc */
qdisc_hash_add(child, true);
}
sch_tree_lock(sch);
if (child) {
qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
q->qdisc->qstats.backlog);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
sch_tree_unlock(sch);
done:
return err;
}
static u32 arl_get_rtt_from_sk(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u32 rtt = U32_MAX, last_ack;
if (sk->sk_state != TCP_ESTABLISHED)
return rtt;
last_ack = jiffies_to_msecs(jiffies - tp->rcv_tstamp);
if (last_ack > ARL_ST_WIN) /* Discard stale data */
return rtt;
rtt = tp->srtt_us >> 3;
return rtt;
}
static u32 arl_get_rtt(struct Qdisc *sch)
{
int i;
struct inet_hashinfo *hashinfo = &tcp_hashinfo;
u32 rtt, rtt_min = U32_MAX;
for (i = 0; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
struct sock *sk;
struct hlist_nulls_node *node;
rcu_read_lock();
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_family != AF_INET && sk->sk_family !=
AF_INET6)
continue;
if (inet_sk(sk)->rx_dst_ifindex != arl_dev_index)
continue;
rtt = arl_get_rtt_from_sk(sk);
if (rtt == U32_MAX)
continue;
if (rtt < rtt_min)
rtt_min = rtt;
}
rcu_read_unlock();
}
return rtt_min;
}
static void arl_timer_func(unsigned long data)
{
struct Qdisc *sch = (struct Qdisc *)data;
struct arl_sched_data *q = qdisc_priv(sch);
u32 rtt;
mod_timer(&q->arl_timer, jiffies + ARL_TIMER_INTERVAL);
rtt = arl_get_rtt(sch);
if (rtt != U32_MAX)
arl_update_latency(q, rtt);
}
static int arl_init(struct Qdisc *sch, struct nlattr *opt)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
arl_params_init(&q->params);
qdisc_watchdog_init(&q->wtd, sch);
q->qdisc = &noop_qdisc;
init_timer(&q->arl_timer);
q->arl_timer.expires = jiffies + ARL_TIMER_INTERVAL;
q->arl_timer.data = (unsigned long)sch;
q->arl_timer.function = arl_timer_func;
add_timer(&q->arl_timer);
if (opt) {
int err = arl_change(sch, opt);
if (err)
return err;
}
arl_latency_sampling_enabled = true;
if (q->params.mode == ARL_EGRESS)
arl_dev_index = dev->ifindex;
return 0;
}
static void arl_destroy(struct Qdisc *sch)
{
struct arl_sched_data *q = qdisc_priv(sch);
if (q->params.mode == ARL_EGRESS)
arl_dev_index = -1;
qdisc_watchdog_cancel(&q->wtd);
del_timer_sync(&q->arl_timer);
qdisc_destroy(q->qdisc);
}
static int arl_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
if ((nla_put_u32(skb, TCA_ARL_BUFFER,
q->params.buffer / NSEC_PER_USEC)) ||
(nla_put_u64_64bit(skb, TCA_ARL_MIN_RATE,
q->params.min_rate * 1000, TCA_ARL_PAD)) ||
(nla_put_u32(skb, TCA_ARL_LIMIT, q->params.limit)) ||
(nla_put_u64_64bit(skb, TCA_ARL_MAX_BW, q->params.max_bw * 1000,
TCA_ARL_PAD)) ||
(nla_put_u32(skb, TCA_ARL_MODE, q->params.mode)) ||
(nla_put_u32(skb, TCA_ARL_LATENCY_HYSTERESIS,
q->params.latency_hysteresis)) ||
(nla_put_u32(skb, TCA_ARL_CODEL_TARGET,
q->params.target)) ||
(nla_put_u32(skb, TCA_ARL_MAX_LATENCY, q->params.max_latency)))
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static int arl_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct arl_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
return 0;
}
static int arl_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct arl_sched_data *q = qdisc_priv(sch);
struct tc_arl_xstats st = { 0 };
	/* convert bw and rate from KBytes/sec to kbit/s */
st.max_bw = q->stats.max_bw * 8;
st.min_rate = q->stats.min_rate * 8;
st.base_rate = q->vars.base_rate * 8;
st.current_rate = q->vars.current_rate * 8;
st.latency = minmax_get(&q->vars.min_hrtt);
st.state = q->vars.state;
st.current_bw = q->vars.bw_est * 8;
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static int arl_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct arl_sched_data *q = qdisc_priv(sch);
if (!new)
new = &noop_qdisc;
*old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
static struct Qdisc *arl_leaf(struct Qdisc *sch, unsigned long arg)
{
struct arl_sched_data *q = qdisc_priv(sch);
return q->qdisc;
}
static unsigned long arl_find(struct Qdisc *sch, u32 classid)
{
return 1;
}
static void arl_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
static const struct Qdisc_class_ops arl_class_ops = {
.graft = arl_graft,
.leaf = arl_leaf,
.find = arl_find,
.walk = arl_walk,
.dump = arl_dump_class,
};
static struct Qdisc_ops arl_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &arl_class_ops,
.id = "arl",
.priv_size = sizeof(struct arl_sched_data),
.enqueue = arl_enqueue,
.dequeue = arl_dequeue,
.peek = qdisc_peek_dequeued,
.init = arl_init,
.reset = arl_reset,
.destroy = arl_destroy,
.change = arl_change,
.dump = arl_dump,
.dump_stats = arl_dump_stats,
.owner = THIS_MODULE,
};
static int __init arl_module_init(void)
{
return register_qdisc(&arl_qdisc_ops);
}
static void __exit arl_module_exit(void)
{
unregister_qdisc(&arl_qdisc_ops);
}
module_init(arl_module_init)
module_exit(arl_module_exit)
MODULE_DESCRIPTION("Adaptive Rate Limiting (ARL) queue discipline");
MODULE_AUTHOR("Kan Yan <kyan@google.com>");
MODULE_LICENSE("GPL");