Merge branch 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull irq updates from Ingo Molnar: "The changes in this cycle were: - Remove the irq timings/variance statistics code that tried to predict when the next interrupt would occur, which didn't work out as hoped and is replaced by another mechanism. - This new mechanism is the 'array suffix computation' estimate, which is superior to the previous one as it can detect not just a single periodic pattern, but independent periodic patterns along a log-2 scale of bucketing and exponential moving average. The comments are longer than the code - and it works better at predicting various complex interrupt patterns from real-world devices than the previous estimate. - avoid IRQ-work self-IPIs on the local CPU - fix work-list corruption in irq_set_affinity_notifier()" * 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: irq_work: Do not raise an IPI when queueing work on the local CPU genirq/devres: Use struct_size() in devm_kzalloc() genirq/timings: Add array suffix computation code genirq/timings: Remove variance computation code genirq: Prevent use-after-free and work list corruption
2019-05-06 13:45:04 -07:00 · 2019-05-06 13:45:04 -07:00 · 2f1835dffa
parent d90dcc1f14 471ba0e686
commit 2f1835dffa
4 changed files with 414 additions and 200 deletions
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
 			    irq_flow_handler_t handler)
 {
 	struct irq_chip_generic *gc;
-	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);

-	gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
 	if (gc)
 		irq_init_generic_chip(gc, name, num_ct,
 				      irq_base, reg_base, handler);
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	desc->affinity_notify = notify;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);

-	if (old_notify)
+	if (old_notify) {
+		cancel_work_sync(&old_notify->work);
 		kref_put(&old_notify->kref, old_notify->release);
+	}

 	return 0;
 }
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@ -9,6 +9,7 @@
 #include <linux/idr.h>
 #include <linux/irq.h>
 #include <linux/math64.h>
+#include <linux/log2.h>

 #include <trace/events/irq.h>

@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

 DEFINE_PER_CPU(struct irq_timings, irq_timings);

-struct irqt_stat {
-	u64	next_evt;
-	u64	last_ts;
-	u64	variance;
-	u32	avg;
-	u32	nr_samples;
-	int	anomalies;
-	int	valid;
-};
-
 static DEFINE_IDR(irqt_stats);

 void irq_timings_enable(void)
@ -40,75 +31,360 @@ void irq_timings_disable(void)
 	static_branch_disable(&irq_timing_enabled);
 }

-/**
- * irqs_update - update the irq timing statistics with a new timestamp
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
 *
- * @irqs: an irqt_stat struct pointer
- * @ts: the new timestamp
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
 *
- * The statistics are computed online, in other words, the code is
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. The experience
+ * showed the interrupts are often coming following a periodic
+ * pattern.
 *
- * The computation happens at idle time. When the CPU is not idle, the
- * interrupts' timestamps are stored in the circular buffer, when the
- * CPU goes idle and this routine is called, all the buffer's values
- * are injected in the statistical model continuying to extend the
- * statistics from the previous busy-idle cycle.
+ * The objective of the algorithm is to find out this periodic pattern
+ * in a fastest way and use its period to predict the next irq event.
 *
- * The observations showed a device will trigger a burst of periodic
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
 *
- * Basically, the purpose of the algorithm is to watch the periodic
- * interrupts and eliminate the peaks.
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
 *
- * An interrupt is considered periodically stable if the interval of
- * its occurences follow the normal distribution, thus the values
- * comply with:
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average
 *
- *      avg - 3 x stddev < value < avg + 3 x stddev
+ * 1. Suffix array
 *
- * Which can be simplified to:
+ * Suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance for the word 'banana', the suffixes will be: 'banana'
+ * 'anana' 'nana' 'ana' 'na' 'a'
 *
- *      -3 x stddev < value - avg < 3 x stddev
+ * Usually, the suffix array is sorted but for our purpose it is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem where we clearly define the boundaries of the
+ * search by a max period and min period.
 *
- *      abs(value - avg) < 3 x stddev
+ * The suffix array will build a suite of intervals of different
+ * length and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period because it is the length of
+ * the suite whatever its position in the buffer.
 *
- * In order to save a costly square root computation, we use the
- * variance. For the record, stddev = sqrt(variance). The equation
- * above becomes:
+ * 2. Log interval
 *
- *      abs(value - avg) < 3 x sqrt(variance)
+ * We saw the irq timings allow to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonibly assume the
+ * longer is the interval, the higher is the error for the next event
+ * and we can consider storing those interval values into an array
+ * where each slot in the array correspond to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
 *
- * And finally we square it:
+ * At the end we have an array of values where at each index defines a
+ * [2^index - 1, 2 ^ index] interval values allowing to store a large
+ * number of values inside a small array.
 *
- *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * For example, if we have the value 1123, then we store it at
+ * ilog2(1123) = 10 index value.
 *
- *      (value - avg) x (value - avg) < 9 x variance
+ * Storing those value at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * for values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
 *
- * Statistically speaking, any values out of this interval is
- * considered as an anomaly and is discarded. However, a normal
- * distribution appears when the number of samples is 30 (it is the
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
+ * 3. Exponential Moving Average
 *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula, is it is very simple and the
+ * reactivity of the average can be tuned with the factors called
+ * alpha.
+ *
+ * The higher the alphas are, the faster the average respond to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ *  -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ *	For each interval:
+ *		Compute the index = ilog2(interval)
+ *		Compute a new_ema(buffer[index], interval)
+ *		Store the index in a circular buffer
+ *
+ *	Compute the suffix array of the indexes
+ *
+ *	For each suffix:
+ *		If the suffix is reverse-found 3 times
+ *			Return suffix
+ *
+ *	Return Not found
+ *
+ * However we can not have endless suffix array to be build, it won't
+ * make sense and it will add an extra overhead, so we can restrict
+ * this to a maximum suffix length of 5 and a minimum suffix length of
+ * 2. The experience showed 5 is the majority of the maximum pattern
+ * period found for different devices.
+ *
+ * The result is a pattern finding less than 1us for an interrupt.
+ *
+ * Example based on real values:
+ *
+ * Example 1 : MMC write/read interrupt interval:
+ *
+ *	223947, 1240, 1384, 1386, 1386,
+ *	217416, 1236, 1384, 1386, 1387,
+ *	214719, 1241, 1386, 1387, 1384,
+ *	213696, 1234, 1384, 1386, 1388,
+ *	219904, 1240, 1385, 1389, 1385,
+ *	212240, 1240, 1386, 1386, 1386,
+ *	214415, 1236, 1384, 1386, 1387,
+ *	214276, 1234, 1384, 1388, ?
+ *
+ * For each element, apply ilog2(value)
+ *
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
+ *
+ * Max period of 5, we take the last (max_period * 3) 15 elements as
+ * we can be confident if the pattern repeats itself three times it is
+ * a repeating pattern.
+ *
+ *	             8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, 8,
+ *	15, 8, 8, 8, ?
+ *
+ * Suffixes are:
+ *
+ *  1) 8, 15, 8, 8, 8  <- max period
+ *  2) 8, 15, 8, 8
+ *  3) 8, 15, 8
+ *  4) 8, 15           <- min period
+ *
+ * From there we search the repeating pattern for each suffix.
+ *
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  |
+ *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  |
+ *                         8, 15, 8, 8, 8  |   |  |  |  |
+ *                                         8, 15, 8, 8, 8
+ *
+ * When moving the suffix, we found exactly 3 matches.
+ *
+ * The first suffix with period 5 is repeating.
+ *
+ * The next event is (3 * max_period) % suffix_period
+ *
+ * In this example, the result 0, so the next event is suffix[0] => 8
+ *
+ * However, 8 is the index in the array of exponential moving average
+ * which was calculated on the fly when storing the values, so the
+ * interval is ema[8] = 1366
+ *
+ *
+ * Example 2:
+ *
+ *	4, 3, 5, 100,
+ *	3, 3, 5, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 5, 3, 117,
+ *	4, 4, 5, 112,
+ *	4, 3, 4, 110,
+ *	3, 4, 5, 112,
+ *	4, 3, 4, 110
+ *
+ * ilog2
+ *
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Max period 5:
+ *	   0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4,
+ *	0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ *  1) 0, 0, 4, 0, 0
+ *  2) 0, 0, 4, 0
+ *  3) 0, 0, 4
+ *  4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  X
+ *         0, 0, 4, 0, 0, |  X
+ *                        0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
+ *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  |
+ *                     0, 0, 4, 0, |  |  |  |  |  |  |
+ *                                 0, 0, 4, 0, |  |  |
+ *                                             0  0  4
+ *
+ * Pattern is found 3 times, the remaining is 1 which results from
+ * (max_period * 3) % suffix_period. This value is the index in the
+ * suffix arrays. The suffix array for a period 4 has the value 4
+ * at index 1.
 */
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+#define EMA_ALPHA_VAL		64
+#define EMA_ALPHA_SHIFT		7
+
+#define PREDICTION_PERIOD_MIN	2
+#define PREDICTION_PERIOD_MAX	5
+#define PREDICTION_FACTOR	4
+#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */
+
+struct irqt_stat {
+	u64	last_ts;
+	u64	ema_time[PREDICTION_BUFFER_SIZE];
+	int	timings[IRQ_TIMINGS_SIZE];
+	int	circ_timings[IRQ_TIMINGS_SIZE];
+	int	count;
+};
+
+/*
+ * Exponential moving average computation
+ */
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
+{
+	s64 diff;
+
+	if (unlikely(!ema_old))
+		return value;
+
+	diff = (value - ema_old) * EMA_ALPHA_VAL;
+	/*
+	 * We can use a s64 type variable to be added with the u64
+	 * ema_old variable as this one will never have its topmost
+	 * bit set, it will be always smaller than 2^63 nanosec
+	 * interrupt interval (292 years).
+	 */
+	return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+	int i;
+
+	/*
+	 * The buffer contains the suite of intervals, in a ilog2
+	 * basis, we are looking for a repetition. We point the
+	 * beginning of the search three times the length of the
+	 * period beginning at the end of the buffer. We do that for
+	 * each suffix.
+	 */
+	for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+
+		int *begin = &buffer[len - (i * 3)];
+		int *ptr = begin;
+
+		/*
+		 * We look if the suite with period 'i' repeat
+		 * itself. If it is truncated at the end, as it
+		 * repeats we can use the period to find out the next
+		 * element.
+		 */
+		while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
+			ptr += i;
+			if (ptr >= &buffer[len])
+				return begin[((i * 3) % i)];
+		}
+	}
+
+	return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+	int index, i, period_max, count, start, min = INT_MAX;
+
+	if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+		irqs->count = irqs->last_ts = 0;
+		return U64_MAX;
+	}
+
+	/*
+	 * As we want to find three times the repetition, we need a
+	 * number of intervals greater or equal to three times the
+	 * maximum period, otherwise we truncate the max period.
+	 */
+	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+		PREDICTION_PERIOD_MAX : irqs->count / 3;
+
+	/*
+	 * If we don't have enough irq timings for this prediction,
+	 * just bail out.
+	 */
+	if (period_max <= PREDICTION_PERIOD_MIN)
+		return U64_MAX;
+
+	/*
+	 * 'count' will depends if the circular buffer wrapped or not
+	 */
+	count = irqs->count < IRQ_TIMINGS_SIZE ?
+		irqs->count : IRQ_TIMINGS_SIZE;
+
+	start = irqs->count < IRQ_TIMINGS_SIZE ?
+		0 : (irqs->count & IRQ_TIMINGS_MASK);
+
+	/*
+	 * Copy the content of the circular buffer into another buffer
+	 * in order to linearize the buffer instead of dealing with
+	 * wrapping indexes and shifted array which will be prone to
+	 * error and extremelly difficult to debug.
+	 */
+	for (i = 0; i < count; i++) {
+		int index = (start + i) & IRQ_TIMINGS_MASK;
+
+		irqs->timings[i] = irqs->circ_timings[index];
+		min = min_t(int, irqs->timings[i], min);
+	}
+
+	index = irq_timings_next_event_index(irqs->timings, count, period_max);
+	if (index < 0)
+		return irqs->last_ts + irqs->ema_time[min];
+
+	return irqs->last_ts + irqs->ema_time[index];
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
 {
 	u64 old_ts = irqs->last_ts;
-	u64 variance = 0;
 	u64 interval;
-	s64 diff;
+	int index;

 	/*
 	 * The timestamps are absolute time values, we need to compute
@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)
 	 * want as we need another timestamp to compute an interval.
 	 */
 	if (interval >= NSEC_PER_SEC) {
-		memset(irqs, 0, sizeof(*irqs));
-		irqs->last_ts = ts;
+		irqs->count = 0;
 		return;
 	}

 	/*
-	 * Pre-compute the delta with the average as the result is
-	 * used several times in this function.
+	 * Get the index in the ema table for this interrupt. The
+	 * PREDICTION_FACTOR increase the interval size for the array
+	 * of exponential average.
 	 */
-	diff = interval - irqs->avg;
+	index = likely(interval) ?
+		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;

 	/*
-	 * Increment the number of samples.
+	 * Store the index as an element of the pattern in another
+	 * circular array.
 	 */
-	irqs->nr_samples++;
+	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;

-	/*
-	 * Online variance divided by the number of elements if there
-	 * is more than one sample.  Normally the formula is division
-	 * by nr_samples - 1 but we assume the number of element will be
-	 * more than 32 and dividing by 32 instead of 31 is enough
-	 * precise.
-	 */
-	if (likely(irqs->nr_samples > 1))
-		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
+	irqs->ema_time[index] = irq_timings_ema_new(interval,
+						    irqs->ema_time[index]);

-	/*
-	 * The rule of thumb in statistics for the normal distribution
-	 * is having at least 30 samples in order to have the model to
-	 * apply. Values outside the interval are considered as an
-	 * anomaly.
-	 */
-	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
-		/*
-		 * After three consecutive anomalies, we reset the
-		 * stats as it is no longer stable enough.
-		 */
-		if (irqs->anomalies++ >= 3) {
-			memset(irqs, 0, sizeof(*irqs));
-			irqs->last_ts = ts;
-			return;
-		}
-	} else {
-		/*
-		 * The anomalies must be consecutives, so at this
-		 * point, we reset the anomalies counter.
-		 */
-		irqs->anomalies = 0;
-	}
-
-	/*
-	 * The interrupt is considered stable enough to try to predict
-	 * the next event on it.
-	 */
-	irqs->valid = 1;
-
-	/*
-	 * Online average algorithm:
-	 *
-	 *  new_average = average + ((value - average) / count)
-	 *
-	 * The variance computation depends on the new average
-	 * to be computed here first.
-	 *
-	 */
-	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
-
-	/*
-	 * Online variance algorithm:
-	 *
-	 *  new_variance = variance + (value - average) x (value - new_average)
-	 *
-	 * Warning: irqs->avg is updated with the line above, hence
-	 * 'interval - irqs->avg' is no longer equal to 'diff'
-	 */
-	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
-
-	/*
-	 * Update the next event
-	 */
-	irqs->next_evt = ts + irqs->avg;
+	irqs->count++;
 }

 /**
@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)
 	 */
 	lockdep_assert_irqs_disabled();

+	if (!irqts->count)
+		return next_evt;
+
 	/*
 	 * Number of elements in the circular buffer: If it happens it
 	 * was flushed before, then the number of elements could be
@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)
 	 * type but with the cost of extra computation in the
 	 * interrupt handler hot path. We choose efficiency.
 	 *
-	 * Inject measured irq/timestamp to the statistical model
-	 * while decrementing the counter because we consume the data
-	 * from our circular buffer.
+	 * Inject measured irq/timestamp to the pattern prediction
+	 * model while decrementing the counter because we consume the
+	 * data from our circular buffer.
 	 */
-	for (i = irqts->count & IRQ_TIMINGS_MASK,
-		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {

+	i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
+	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
+
+	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
 		irq = irq_timing_decode(irqts->values[i], &ts);
-
 		s = idr_find(&irqt_stats, irq);
-		if (s) {
-			irqs = this_cpu_ptr(s);
-			irqs_update(irqs, ts);
-		}
+		if (s)
+			irq_timings_store(irq, this_cpu_ptr(s), ts);
 	}

 	/*
@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)

 		irqs = this_cpu_ptr(s);

-		if (!irqs->valid)
-			continue;
+		ts = __irq_timings_next_event(irqs, i, now);
+		if (ts <= now)
+			return now;

-		if (irqs->next_evt <= now) {
-			irq = i;
-			next_evt = now;
-
-			/*
-			 * This interrupt mustn't use in the future
-			 * until new events occur and update the
-			 * statistics.
-			 */
-			irqs->valid = 0;
-			break;
-		}
-
-		if (irqs->next_evt < next_evt) {
-			irq = i;
-			next_evt = irqs->next_evt;
-		}
+		if (ts < next_evt)
+			next_evt = ts;
 	}

 	return next_evt;
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@ -56,34 +56,18 @@ void __weak arch_irq_work_raise(void)
 	 */
 }

-/*
- * Enqueue the irq_work @work on @cpu unless it's already pending
- * somewhere.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue_on(struct irq_work *work, int cpu)
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
 {
-	/* All work should have been flushed before going offline */
-	WARN_ON_ONCE(cpu_is_offline(cpu));
-
-#ifdef CONFIG_SMP
-
-	/* Arch remote IPI send/receive backend aren't NMI safe */
-	WARN_ON_ONCE(in_nmi());
-
-	/* Only queue if not already pending */
-	if (!irq_work_claim(work))
-		return false;
-
-	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
-		arch_send_call_function_single_ipi(cpu);
-
-#else /* #ifdef CONFIG_SMP */
-	irq_work_queue(work);
-#endif /* #else #ifdef CONFIG_SMP */
-
-	return true;
+	/* If the work is "lazy", handle it from next tick if any */
+	if (work->flags & IRQ_WORK_LAZY) {
+		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+		    tick_nohz_tick_stopped())
+			arch_irq_work_raise();
+	} else {
+		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+			arch_irq_work_raise();
+	}
 }

 /* Enqueue the irq work @work on the current CPU */
@ -95,23 +79,48 @@ bool irq_work_queue(struct irq_work *work)

 	/* Queue the entry and raise the IPI if needed. */
 	preempt_disable();
-
-	/* If the work is "lazy", handle it from next tick if any */
-	if (work->flags & IRQ_WORK_LAZY) {
-		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
-		    tick_nohz_tick_stopped())
-			arch_irq_work_raise();
-	} else {
-		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
-			arch_irq_work_raise();
-	}
-
+	__irq_work_queue_local(work);
 	preempt_enable();

 	return true;
 }
 EXPORT_SYMBOL_GPL(irq_work_queue);

+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+#ifndef CONFIG_SMP
+	return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
+	/* All work should have been flushed before going offline */
+	WARN_ON_ONCE(cpu_is_offline(cpu));
+
+	/* Only queue if not already pending */
+	if (!irq_work_claim(work))
+		return false;
+
+	preempt_disable();
+	if (cpu != smp_processor_id()) {
+		/* Arch remote IPI send/receive backend aren't NMI safe */
+		WARN_ON_ONCE(in_nmi());
+		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+			arch_send_call_function_single_ipi(cpu);
+	} else {
+		__irq_work_queue_local(work);
+	}
+	preempt_enable();
+
+	return true;
+#endif /* CONFIG_SMP */
+}
+
+
 bool irq_work_needs_cpu(void)
 {
 	struct llist_head *raised, *lazy;