enable 4.19.x-rt with preempt-rt Linux 4.19.25

Signed-off-by: Tiejun Chen <tiejun.china@gmail.com>
Tiejun Chen
2019-03-04 13:41:59 -08:00
parent daab1a1ecc
commit 92ebe10a62
269 changed files with 32215 additions and 1 deletion


@@ -1,5 +1,5 @@
 kernel:
-  image: linuxkit/kernel:4.14.87-rt
+  image: linuxkit/kernel:4.19.25-rt
   cmdline: "console=tty0"
 init:
   - linuxkit/init:a2166a6048ce041eebe005ab99454cfdeaa5c848


@@ -265,12 +265,14 @@ $(eval $(call kernel,4.20.13,4.20.x,$(EXTRA),$(DEBUG)))
 $(eval $(call kernel,4.19.26,4.19.x,$(EXTRA),$(DEBUG)))
 $(eval $(call kernel,4.14.104,4.14.x,$(EXTRA),$(DEBUG)))
 $(eval $(call kernel,4.14.104,4.14.x,,-dbg))
+$(eval $(call kernel,4.19.25,4.19.x,-rt,))
 $(eval $(call kernel,4.9.161,4.9.x,$(EXTRA),$(DEBUG)))
 else ifeq ($(ARCH),aarch64)
 $(eval $(call kernel,4.20.13,4.20.x,$(EXTRA),$(DEBUG)))
 $(eval $(call kernel,4.19.26,4.19.x,$(EXTRA),$(DEBUG)))
 $(eval $(call kernel,4.14.104,4.14.x,$(EXTRA),$(DEBUG)))
+$(eval $(call kernel,4.19.25,4.19.x,-rt,))
 else ifeq ($(ARCH),s390x)
 $(eval $(call kernel,4.20.13,4.20.x,$(EXTRA),$(DEBUG)))


@@ -0,0 +1,20 @@
CONFIG_SLUB_DEBUG=y
# CONFIG_SLUB_MEMCG_SYSFS_ON is not set
CONFIG_SLUB=y
# CONFIG_SLAB_FREELIST_HARDENED is not set
CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_RT_BASE=y
CONFIG_HAVE_PREEMPT_LAZY=y
CONFIG_PREEMPT_LAZY=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT__LL is not set
# CONFIG_PREEMPT_RTB is not set
CONFIG_PREEMPT_RT_FULL=y
CONFIG_PREEMPT_COUNT=y
# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_SLUB_STATS is not set
CONFIG_DEBUG_PREEMPT=y
# CONFIG_PREEMPT_TRACER is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000


@@ -0,0 +1,22 @@
CONFIG_RWSEM_GENERIC_SPINLOCK=y
# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
CONFIG_PREEMPT_RCU=y
CONFIG_TASKS_RCU=y
CONFIG_SLUB_DEBUG=y
# CONFIG_SLUB_MEMCG_SYSFS_ON is not set
CONFIG_SLUB=y
# CONFIG_SLAB_FREELIST_HARDENED is not set
CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_RT_BASE=y
CONFIG_HAVE_PREEMPT_LAZY=y
CONFIG_PREEMPT_LAZY=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT__LL is not set
# CONFIG_PREEMPT_RTB is not set
CONFIG_PREEMPT_RT_FULL=y
CONFIG_PREEMPT_COUNT=y
# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_SLUB_STATS is not set
CONFIG_DEBUG_PREEMPT=y
# CONFIG_PREEMPT_TRACER is not set


@@ -0,0 +1,202 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:18 +0200
Subject: [PATCH 1/7] ARM: at91: add TCB registers definitions
Add register and bit definitions for the timer counter blocks found on
Atmel ARM SoCs.
Tested-by: Alexander Dahl <ada@thorsis.com>
Tested-by: Andras Szemzo <szemzo.andras@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/soc/at91/atmel_tcb.h | 183 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 183 insertions(+)
create mode 100644 include/soc/at91/atmel_tcb.h
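For orientation, here is a minimal sketch (not part of the patch) of how these definitions compose, modelled on the TCB clocksource driver added later in this series; the helper name, base pointer and channel argument are illustrative:

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/of.h>
#include <soc/at91/atmel_tcb.h>

/* Start one TC channel free-running in waveform mode. */
static void tcb_example_start_freerun(void __iomem *base, int channel)
{
	writel(ATMEL_TC_CMR_TCLK(0)		/* divided MCK input (index 0 = MCK/2) */
	       | ATMEL_TC_CMR_WAVE		/* waveform mode, not capture */
	       | ATMEL_TC_CMR_WAVESEL_UP,	/* count up, free-running */
	       base + ATMEL_TC_CMR(channel));
	writel(0xff, base + ATMEL_TC_IDR(channel));	/* mask all channel irqs */
	writel(ATMEL_TC_CCR_CLKEN | ATMEL_TC_CCR_SWTRG,	/* enable clock and trigger */
	       base + ATMEL_TC_CCR(channel));
}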
--- /dev/null
+++ b/include/soc/at91/atmel_tcb.h
@@ -0,0 +1,183 @@
+//SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2018 Microchip */
+
+#ifndef __SOC_ATMEL_TCB_H
+#define __SOC_ATMEL_TCB_H
+
+/* Channel registers */
+#define ATMEL_TC_COFFS(c) ((c) * 0x40)
+#define ATMEL_TC_CCR(c) ATMEL_TC_COFFS(c)
+#define ATMEL_TC_CMR(c) (ATMEL_TC_COFFS(c) + 0x4)
+#define ATMEL_TC_SMMR(c) (ATMEL_TC_COFFS(c) + 0x8)
+#define ATMEL_TC_RAB(c) (ATMEL_TC_COFFS(c) + 0xc)
+#define ATMEL_TC_CV(c) (ATMEL_TC_COFFS(c) + 0x10)
+#define ATMEL_TC_RA(c) (ATMEL_TC_COFFS(c) + 0x14)
+#define ATMEL_TC_RB(c) (ATMEL_TC_COFFS(c) + 0x18)
+#define ATMEL_TC_RC(c) (ATMEL_TC_COFFS(c) + 0x1c)
+#define ATMEL_TC_SR(c) (ATMEL_TC_COFFS(c) + 0x20)
+#define ATMEL_TC_IER(c) (ATMEL_TC_COFFS(c) + 0x24)
+#define ATMEL_TC_IDR(c) (ATMEL_TC_COFFS(c) + 0x28)
+#define ATMEL_TC_IMR(c) (ATMEL_TC_COFFS(c) + 0x2c)
+#define ATMEL_TC_EMR(c) (ATMEL_TC_COFFS(c) + 0x30)
+
+/* Block registers */
+#define ATMEL_TC_BCR 0xc0
+#define ATMEL_TC_BMR 0xc4
+#define ATMEL_TC_QIER 0xc8
+#define ATMEL_TC_QIDR 0xcc
+#define ATMEL_TC_QIMR 0xd0
+#define ATMEL_TC_QISR 0xd4
+#define ATMEL_TC_FMR 0xd8
+#define ATMEL_TC_WPMR 0xe4
+
+/* CCR fields */
+#define ATMEL_TC_CCR_CLKEN BIT(0)
+#define ATMEL_TC_CCR_CLKDIS BIT(1)
+#define ATMEL_TC_CCR_SWTRG BIT(2)
+
+/* Common CMR fields */
+#define ATMEL_TC_CMR_TCLKS_MSK GENMASK(2, 0)
+#define ATMEL_TC_CMR_TCLK(x) (x)
+#define ATMEL_TC_CMR_XC(x) ((x) + 5)
+#define ATMEL_TC_CMR_CLKI BIT(3)
+#define ATMEL_TC_CMR_BURST_MSK GENMASK(5, 4)
+#define ATMEL_TC_CMR_BURST_XC(x) (((x) + 1) << 4)
+#define ATMEL_TC_CMR_WAVE BIT(15)
+
+/* Capture mode CMR fields */
+#define ATMEL_TC_CMR_LDBSTOP BIT(6)
+#define ATMEL_TC_CMR_LDBDIS BIT(7)
+#define ATMEL_TC_CMR_ETRGEDG_MSK GENMASK(9, 8)
+#define ATMEL_TC_CMR_ETRGEDG_NONE (0 << 8)
+#define ATMEL_TC_CMR_ETRGEDG_RISING (1 << 8)
+#define ATMEL_TC_CMR_ETRGEDG_FALLING (2 << 8)
+#define ATMEL_TC_CMR_ETRGEDG_BOTH (3 << 8)
+#define ATMEL_TC_CMR_ABETRG BIT(10)
+#define ATMEL_TC_CMR_CPCTRG BIT(14)
+#define ATMEL_TC_CMR_LDRA_MSK GENMASK(17, 16)
+#define ATMEL_TC_CMR_LDRA_NONE (0 << 16)
+#define ATMEL_TC_CMR_LDRA_RISING (1 << 16)
+#define ATMEL_TC_CMR_LDRA_FALLING (2 << 16)
+#define ATMEL_TC_CMR_LDRA_BOTH (3 << 16)
+#define ATMEL_TC_CMR_LDRB_MSK GENMASK(19, 18)
+#define ATMEL_TC_CMR_LDRB_NONE (0 << 18)
+#define ATMEL_TC_CMR_LDRB_RISING (1 << 18)
+#define ATMEL_TC_CMR_LDRB_FALLING (2 << 18)
+#define ATMEL_TC_CMR_LDRB_BOTH (3 << 18)
+#define ATMEL_TC_CMR_SBSMPLR_MSK GENMASK(22, 20)
+#define ATMEL_TC_CMR_SBSMPLR(x) ((x) << 20)
+
+/* Waveform mode CMR fields */
+#define ATMEL_TC_CMR_CPCSTOP BIT(6)
+#define ATMEL_TC_CMR_CPCDIS BIT(7)
+#define ATMEL_TC_CMR_EEVTEDG_MSK GENMASK(9, 8)
+#define ATMEL_TC_CMR_EEVTEDG_NONE (0 << 8)
+#define ATMEL_TC_CMR_EEVTEDG_RISING (1 << 8)
+#define ATMEL_TC_CMR_EEVTEDG_FALLING (2 << 8)
+#define ATMEL_TC_CMR_EEVTEDG_BOTH (3 << 8)
+#define ATMEL_TC_CMR_EEVT_MSK GENMASK(11, 10)
+#define ATMEL_TC_CMR_EEVT_XC(x) (((x) + 1) << 10)
+#define ATMEL_TC_CMR_ENETRG BIT(12)
+#define ATMEL_TC_CMR_WAVESEL_MSK GENMASK(14, 13)
+#define ATMEL_TC_CMR_WAVESEL_UP (0 << 13)
+#define ATMEL_TC_CMR_WAVESEL_UPDOWN (1 << 13)
+#define ATMEL_TC_CMR_WAVESEL_UPRC (2 << 13)
+#define ATMEL_TC_CMR_WAVESEL_UPDOWNRC (3 << 13)
+#define ATMEL_TC_CMR_ACPA_MSK GENMASK(17, 16)
+#define ATMEL_TC_CMR_ACPA(a) (ATMEL_TC_CMR_ACTION_##a << 16)
+#define ATMEL_TC_CMR_ACPC_MSK GENMASK(19, 18)
+#define ATMEL_TC_CMR_ACPC(a) (ATMEL_TC_CMR_ACTION_##a << 18)
+#define ATMEL_TC_CMR_AEEVT_MSK GENMASK(21, 20)
+#define ATMEL_TC_CMR_AEEVT(a) (ATMEL_TC_CMR_ACTION_##a << 20)
+#define ATMEL_TC_CMR_ASWTRG_MSK GENMASK(23, 22)
+#define ATMEL_TC_CMR_ASWTRG(a) (ATMEL_TC_CMR_ACTION_##a << 22)
+#define ATMEL_TC_CMR_BCPB_MSK GENMASK(25, 24)
+#define ATMEL_TC_CMR_BCPB(a) (ATMEL_TC_CMR_ACTION_##a << 24)
+#define ATMEL_TC_CMR_BCPC_MSK GENMASK(27, 26)
+#define ATMEL_TC_CMR_BCPC(a) (ATMEL_TC_CMR_ACTION_##a << 26)
+#define ATMEL_TC_CMR_BEEVT_MSK GENMASK(29, 28)
+#define ATMEL_TC_CMR_BEEVT(a) (ATMEL_TC_CMR_ACTION_##a << 28)
+#define ATMEL_TC_CMR_BSWTRG_MSK GENMASK(31, 30)
+#define ATMEL_TC_CMR_BSWTRG(a) (ATMEL_TC_CMR_ACTION_##a << 30)
+#define ATMEL_TC_CMR_ACTION_NONE 0
+#define ATMEL_TC_CMR_ACTION_SET 1
+#define ATMEL_TC_CMR_ACTION_CLEAR 2
+#define ATMEL_TC_CMR_ACTION_TOGGLE 3
+
+/* SMMR fields */
+#define ATMEL_TC_SMMR_GCEN BIT(0)
+#define ATMEL_TC_SMMR_DOWN BIT(1)
+
+/* SR/IER/IDR/IMR fields */
+#define ATMEL_TC_COVFS BIT(0)
+#define ATMEL_TC_LOVRS BIT(1)
+#define ATMEL_TC_CPAS BIT(2)
+#define ATMEL_TC_CPBS BIT(3)
+#define ATMEL_TC_CPCS BIT(4)
+#define ATMEL_TC_LDRAS BIT(5)
+#define ATMEL_TC_LDRBS BIT(6)
+#define ATMEL_TC_ETRGS BIT(7)
+#define ATMEL_TC_CLKSTA BIT(16)
+#define ATMEL_TC_MTIOA BIT(17)
+#define ATMEL_TC_MTIOB BIT(18)
+
+/* EMR fields */
+#define ATMEL_TC_EMR_TRIGSRCA_MSK GENMASK(1, 0)
+#define ATMEL_TC_EMR_TRIGSRCA_TIOA 0
+#define ATMEL_TC_EMR_TRIGSRCA_PWMX 1
+#define ATMEL_TC_EMR_TRIGSRCB_MSK GENMASK(5, 4)
+#define ATMEL_TC_EMR_TRIGSRCB_TIOB (0 << 4)
+#define ATMEL_TC_EMR_TRIGSRCB_PWM (1 << 4)
+#define ATMEL_TC_EMR_NOCLKDIV BIT(8)
+
+/* BCR fields */
+#define ATMEL_TC_BCR_SYNC BIT(0)
+
+/* BMR fields */
+#define ATMEL_TC_BMR_TCXC_MSK(c) GENMASK(((c) * 2) + 1, (c) * 2)
+#define ATMEL_TC_BMR_TCXC(x, c) ((x) << (2 * (c)))
+#define ATMEL_TC_BMR_QDEN BIT(8)
+#define ATMEL_TC_BMR_POSEN BIT(9)
+#define ATMEL_TC_BMR_SPEEDEN BIT(10)
+#define ATMEL_TC_BMR_QDTRANS BIT(11)
+#define ATMEL_TC_BMR_EDGPHA BIT(12)
+#define ATMEL_TC_BMR_INVA BIT(13)
+#define ATMEL_TC_BMR_INVB BIT(14)
+#define ATMEL_TC_BMR_INVIDX BIT(15)
+#define ATMEL_TC_BMR_SWAP BIT(16)
+#define ATMEL_TC_BMR_IDXPHB BIT(17)
+#define ATMEL_TC_BMR_AUTOC BIT(18)
+#define ATMEL_TC_MAXFILT_MSK GENMASK(25, 20)
+#define ATMEL_TC_MAXFILT(x) (((x) - 1) << 20)
+#define ATMEL_TC_MAXCMP_MSK GENMASK(29, 26)
+#define ATMEL_TC_MAXCMP(x) ((x) << 26)
+
+/* QEDC fields */
+#define ATMEL_TC_QEDC_IDX BIT(0)
+#define ATMEL_TC_QEDC_DIRCHG BIT(1)
+#define ATMEL_TC_QEDC_QERR BIT(2)
+#define ATMEL_TC_QEDC_MPE BIT(3)
+#define ATMEL_TC_QEDC_DIR BIT(8)
+
+/* FMR fields */
+#define ATMEL_TC_FMR_ENCF(x) BIT(x)
+
+/* WPMR fields */
+#define ATMEL_TC_WPMR_WPKEY (0x54494d << 8)
+#define ATMEL_TC_WPMR_WPEN BIT(0)
+
+static const u8 atmel_tc_divisors[5] = { 2, 8, 32, 128, 0, };
+
+static const struct of_device_id atmel_tcb_dt_ids[] = {
+ {
+ .compatible = "atmel,at91rm9200-tcb",
+ .data = (void *)16,
+ }, {
+ .compatible = "atmel,at91sam9x5-tcb",
+ .data = (void *)32,
+ }, {
+ /* sentinel */
+ }
+};
+
+#endif /* __SOC_ATMEL_TCB_H */


@@ -0,0 +1,473 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:19 +0200
Subject: [PATCH 2/7] clocksource/drivers: Add a new driver for the Atmel ARM
TC blocks
Add a driver for the Atmel Timer Counter Blocks. This driver provides a
clocksource and two clockevent devices.
One of the clockevent devices is linked to the clocksource counter and so it
will run at the same frequency. This will be used when there is only one TCB
channel available for timers.
The other clockevent device runs on a separate TCB channel when available.
This driver uses regmap and syscon to be able to probe early in the boot
and avoid having to switch on the TCB clocksource later. Using regmap also
means that unused TCB channels may be used by other drivers (PWM for
example). readl/writel are still used to access channel-specific registers
to avoid the performance impact of regmap (mainly locking).
Tested-by: Alexander Dahl <ada@thorsis.com>
Tested-by: Andras Szemzo <szemzo.andras@gmail.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/clocksource/Kconfig | 8
drivers/clocksource/Makefile | 3
drivers/clocksource/timer-atmel-tcb.c | 410 ++++++++++++++++++++++++++++++++++
3 files changed, 420 insertions(+), 1 deletion(-)
create mode 100644 drivers/clocksource/timer-atmel-tcb.c
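As a hedged illustration of the access split described above, assuming it sits alongside the driver code below (which defines the static 'tc' state and already pulls in the needed headers): block-wide registers shared with other TCB users go through the syscon regmap, while the hot per-channel counter read stays on plain MMIO.

/* Illustration only, not part of the patch. */
static void tcb_example_access(void)
{
	unsigned int bmr;
	u32 counter;

	/* Block register shared with other TCB users (PWM, ...):
	 * serialized through the regmap. */
	regmap_read(tc.regmap, ATMEL_TC_BMR, &bmr);

	/* Per-channel counter on the hot path: lock-free MMIO. */
	counter = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));

	pr_debug("BMR=%#x CV=%u\n", bmr, counter);
}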
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -404,6 +404,14 @@ config ATMEL_ST
help
Support for the Atmel ST timer.
+config ATMEL_ARM_TCB_CLKSRC
+ bool "Microchip ARM TC Block" if COMPILE_TEST
+ select REGMAP_MMIO
+ depends on GENERIC_CLOCKEVENTS
+ help
+ This enables build of clocksource and clockevent driver for
+ the integrated Timer Counter Blocks in Microchip ARM SoCs.
+
config CLKSRC_EXYNOS_MCT
bool "Exynos multi core timer driver" if COMPILE_TEST
depends on ARM || ARM64
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -3,7 +3,8 @@ obj-$(CONFIG_TIMER_OF) += timer-of.o
obj-$(CONFIG_TIMER_PROBE) += timer-probe.o
obj-$(CONFIG_ATMEL_PIT) += timer-atmel-pit.o
obj-$(CONFIG_ATMEL_ST) += timer-atmel-st.o
-obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o
+obj-$(CONFIG_ATMEL_TCB_CLKSRC) += tcb_clksrc.o
+obj-$(CONFIG_ATMEL_ARM_TCB_CLKSRC) += timer-atmel-tcb.o
obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o
obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o
obj-$(CONFIG_CS5535_CLOCK_EVENT_SRC) += cs5535-clockevt.o
--- /dev/null
+++ b/drivers/clocksource/timer-atmel-tcb.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/clk.h>
+#include <linux/clockchips.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/syscon.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/regmap.h>
+#include <linux/sched_clock.h>
+#include <soc/at91/atmel_tcb.h>
+
+struct atmel_tcb_clksrc {
+ struct clocksource clksrc;
+ struct clock_event_device clkevt;
+ struct regmap *regmap;
+ void __iomem *base;
+ struct clk *clk[2];
+ char name[20];
+ int channels[2];
+ int bits;
+ int irq;
+ struct {
+ u32 cmr;
+ u32 imr;
+ u32 rc;
+ bool clken;
+ } cache[2];
+ u32 bmr_cache;
+ bool registered;
+ bool clk_enabled;
+};
+
+static struct atmel_tcb_clksrc tc;
+
+static struct clk *tcb_clk_get(struct device_node *node, int channel)
+{
+ struct clk *clk;
+ char clk_name[] = "t0_clk";
+
+ clk_name[1] += channel;
+ clk = of_clk_get_by_name(node->parent, clk_name);
+ if (!IS_ERR(clk))
+ return clk;
+
+ return of_clk_get_by_name(node->parent, "t0_clk");
+}
+
+/*
+ * Clocksource and clockevent using the same channel(s)
+ */
+static u64 tc_get_cycles(struct clocksource *cs)
+{
+ u32 lower, upper;
+
+ do {
+ upper = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1]));
+ lower = readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
+ } while (upper != readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[1])));
+
+ return (upper << 16) | lower;
+}
+
+static u64 tc_get_cycles32(struct clocksource *cs)
+{
+ return readl_relaxed(tc.base + ATMEL_TC_CV(tc.channels[0]));
+}
+
+static u64 notrace tc_sched_clock_read(void)
+{
+ return tc_get_cycles(&tc.clksrc);
+}
+
+static u64 notrace tc_sched_clock_read32(void)
+{
+ return tc_get_cycles32(&tc.clksrc);
+}
+
+static int tcb_clkevt_next_event(unsigned long delta,
+ struct clock_event_device *d)
+{
+ u32 old, next, cur;
+
+ old = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
+ next = old + delta;
+ writel(next, tc.base + ATMEL_TC_RC(tc.channels[0]));
+ cur = readl(tc.base + ATMEL_TC_CV(tc.channels[0]));
+
+ /* check whether the delta elapsed while setting the register */
+ if ((next < old && cur < old && cur > next) ||
+ (next > old && (cur < old || cur > next))) {
+ /*
+ * Clear the CPCS bit in the status register to avoid
+ * generating a spurious interrupt next time a valid
+ * timer event is configured.
+ */
+ old = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
+ return -ETIME;
+ }
+
+ writel(ATMEL_TC_CPCS, tc.base + ATMEL_TC_IER(tc.channels[0]));
+
+ return 0;
+}
+
+static irqreturn_t tc_clkevt_irq(int irq, void *handle)
+{
+ unsigned int sr;
+
+ sr = readl(tc.base + ATMEL_TC_SR(tc.channels[0]));
+ if (sr & ATMEL_TC_CPCS) {
+ tc.clkevt.event_handler(&tc.clkevt);
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static int tcb_clkevt_oneshot(struct clock_event_device *dev)
+{
+ if (clockevent_state_oneshot(dev))
+ return 0;
+
+ /*
+ * Because both clockevent devices may share the same IRQ, we don't want
+ * the less likely one to stay requested
+ */
+ return request_irq(tc.irq, tc_clkevt_irq, IRQF_TIMER | IRQF_SHARED,
+ tc.name, &tc);
+}
+
+static int tcb_clkevt_shutdown(struct clock_event_device *dev)
+{
+ writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[0]));
+ if (tc.bits == 16)
+ writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[1]));
+
+ if (!clockevent_state_detached(dev))
+ free_irq(tc.irq, &tc);
+
+ return 0;
+}
+
+static void __init tcb_setup_dual_chan(struct atmel_tcb_clksrc *tc,
+ int mck_divisor_idx)
+{
+ /* first channel: waveform mode, input mclk/8, clock TIOA on overflow */
+ writel(mck_divisor_idx /* likely divide-by-8 */
+ | ATMEL_TC_CMR_WAVE
+ | ATMEL_TC_CMR_WAVESEL_UP /* free-run */
+ | ATMEL_TC_CMR_ACPA(SET) /* TIOA rises at 0 */
+ | ATMEL_TC_CMR_ACPC(CLEAR), /* (duty cycle 50%) */
+ tc->base + ATMEL_TC_CMR(tc->channels[0]));
+ writel(0x0000, tc->base + ATMEL_TC_RA(tc->channels[0]));
+ writel(0x8000, tc->base + ATMEL_TC_RC(tc->channels[0]));
+ writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0])); /* no irqs */
+ writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
+
+ /* second channel: waveform mode, input TIOA */
+ writel(ATMEL_TC_CMR_XC(tc->channels[1]) /* input: TIOA */
+ | ATMEL_TC_CMR_WAVE
+ | ATMEL_TC_CMR_WAVESEL_UP, /* free-run */
+ tc->base + ATMEL_TC_CMR(tc->channels[1]));
+ writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[1])); /* no irqs */
+ writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[1]));
+
+ /* chain both channel, we assume the previous channel */
+ regmap_write(tc->regmap, ATMEL_TC_BMR,
+ ATMEL_TC_BMR_TCXC(1 + tc->channels[1], tc->channels[1]));
+ /* then reset all the timers */
+ regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static void __init tcb_setup_single_chan(struct atmel_tcb_clksrc *tc,
+ int mck_divisor_idx)
+{
+ /* channel 0: waveform mode, input mclk/8 */
+ writel(mck_divisor_idx /* likely divide-by-8 */
+ | ATMEL_TC_CMR_WAVE
+ | ATMEL_TC_CMR_WAVESEL_UP, /* free-run */
+ tc->base + ATMEL_TC_CMR(tc->channels[0]));
+ writel(0xff, tc->base + ATMEL_TC_IDR(tc->channels[0])); /* no irqs */
+ writel(ATMEL_TC_CCR_CLKEN, tc->base + ATMEL_TC_CCR(tc->channels[0]));
+
+ /* then reset all the timers */
+ regmap_write(tc->regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static void tc_clksrc_suspend(struct clocksource *cs)
+{
+ int i;
+
+ for (i = 0; i < 1 + (tc.bits == 16); i++) {
+ tc.cache[i].cmr = readl(tc.base + ATMEL_TC_CMR(tc.channels[i]));
+ tc.cache[i].imr = readl(tc.base + ATMEL_TC_IMR(tc.channels[i]));
+ tc.cache[i].rc = readl(tc.base + ATMEL_TC_RC(tc.channels[i]));
+ tc.cache[i].clken = !!(readl(tc.base +
+ ATMEL_TC_SR(tc.channels[i])) &
+ ATMEL_TC_CLKSTA);
+ }
+
+ if (tc.bits == 16)
+ regmap_read(tc.regmap, ATMEL_TC_BMR, &tc.bmr_cache);
+}
+
+static void tc_clksrc_resume(struct clocksource *cs)
+{
+ int i;
+
+ for (i = 0; i < 1 + (tc.bits == 16); i++) {
+ /* Restore registers for the channel, RA and RB are not used */
+ writel(tc.cache[i].cmr, tc.base + ATMEL_TC_CMR(tc.channels[i]));
+ writel(tc.cache[i].rc, tc.base + ATMEL_TC_RC(tc.channels[i]));
+ writel(0, tc.base + ATMEL_TC_RA(tc.channels[i]));
+ writel(0, tc.base + ATMEL_TC_RB(tc.channels[i]));
+ /* Disable all the interrupts */
+ writel(0xff, tc.base + ATMEL_TC_IDR(tc.channels[i]));
+ /* Reenable interrupts that were enabled before suspending */
+ writel(tc.cache[i].imr, tc.base + ATMEL_TC_IER(tc.channels[i]));
+
+ /* Start the clock if it was used */
+ if (tc.cache[i].clken)
+ writel(ATMEL_TC_CCR_CLKEN, tc.base +
+ ATMEL_TC_CCR(tc.channels[i]));
+ }
+
+ /* in case of dual channel, chain channels */
+ if (tc.bits == 16)
+ regmap_write(tc.regmap, ATMEL_TC_BMR, tc.bmr_cache);
+ /* Finally, trigger all the channels*/
+ regmap_write(tc.regmap, ATMEL_TC_BCR, ATMEL_TC_BCR_SYNC);
+}
+
+static int __init tcb_clksrc_register(struct device_node *node,
+ struct regmap *regmap, void __iomem *base,
+ int channel, int channel1, int irq,
+ int bits)
+{
+ u32 rate, divided_rate = 0;
+ int best_divisor_idx = -1;
+ int i, err = -1;
+ u64 (*tc_sched_clock)(void);
+
+ tc.regmap = regmap;
+ tc.base = base;
+ tc.channels[0] = channel;
+ tc.channels[1] = channel1;
+ tc.irq = irq;
+ tc.bits = bits;
+
+ tc.clk[0] = tcb_clk_get(node, tc.channels[0]);
+ if (IS_ERR(tc.clk[0]))
+ return PTR_ERR(tc.clk[0]);
+ err = clk_prepare_enable(tc.clk[0]);
+ if (err) {
+ pr_debug("can't enable T0 clk\n");
+ goto err_clk;
+ }
+
+ /* How fast will we be counting? Pick something over 5 MHz. */
+ rate = (u32)clk_get_rate(tc.clk[0]);
+ for (i = 0; i < 5; i++) {
+ unsigned int divisor = atmel_tc_divisors[i];
+ unsigned int tmp;
+
+ if (!divisor)
+ continue;
+
+ tmp = rate / divisor;
+ pr_debug("TC: %u / %-3u [%d] --> %u\n", rate, divisor, i, tmp);
+ if (best_divisor_idx > 0) {
+ if (tmp < 5 * 1000 * 1000)
+ continue;
+ }
+ divided_rate = tmp;
+ best_divisor_idx = i;
+ }
+
+ if (tc.bits == 32) {
+ tc.clksrc.read = tc_get_cycles32;
+ tcb_setup_single_chan(&tc, best_divisor_idx);
+ tc_sched_clock = tc_sched_clock_read32;
+ snprintf(tc.name, sizeof(tc.name), "%s:%d",
+ kbasename(node->parent->full_name), tc.channels[0]);
+ } else {
+ tc.clk[1] = tcb_clk_get(node, tc.channels[1]);
+ if (IS_ERR(tc.clk[1]))
+ goto err_disable_t0;
+
+ err = clk_prepare_enable(tc.clk[1]);
+ if (err) {
+ pr_debug("can't enable T1 clk\n");
+ goto err_clk1;
+ }
+ tc.clksrc.read = tc_get_cycles,
+ tcb_setup_dual_chan(&tc, best_divisor_idx);
+ tc_sched_clock = tc_sched_clock_read;
+ snprintf(tc.name, sizeof(tc.name), "%s:%d,%d",
+ kbasename(node->parent->full_name), tc.channels[0],
+ tc.channels[1]);
+ }
+
+ pr_debug("%s at %d.%03d MHz\n", tc.name,
+ divided_rate / 1000000,
+ ((divided_rate + 500000) % 1000000) / 1000);
+
+ tc.clksrc.name = tc.name;
+ tc.clksrc.suspend = tc_clksrc_suspend;
+ tc.clksrc.resume = tc_clksrc_resume;
+ tc.clksrc.rating = 200;
+ tc.clksrc.mask = CLOCKSOURCE_MASK(32);
+ tc.clksrc.flags = CLOCK_SOURCE_IS_CONTINUOUS;
+
+ err = clocksource_register_hz(&tc.clksrc, divided_rate);
+ if (err)
+ goto err_disable_t1;
+
+ sched_clock_register(tc_sched_clock, 32, divided_rate);
+
+ tc.registered = true;
+
+ /* Set up and register clockevents */
+ tc.clkevt.name = tc.name;
+ tc.clkevt.cpumask = cpumask_of(0);
+ tc.clkevt.set_next_event = tcb_clkevt_next_event;
+ tc.clkevt.set_state_oneshot = tcb_clkevt_oneshot;
+ tc.clkevt.set_state_shutdown = tcb_clkevt_shutdown;
+ tc.clkevt.features = CLOCK_EVT_FEAT_ONESHOT;
+ tc.clkevt.rating = 125;
+
+ clockevents_config_and_register(&tc.clkevt, divided_rate, 1,
+ BIT(tc.bits) - 1);
+
+ return 0;
+
+err_disable_t1:
+ if (tc.bits == 16)
+ clk_disable_unprepare(tc.clk[1]);
+
+err_clk1:
+ if (tc.bits == 16)
+ clk_put(tc.clk[1]);
+
+err_disable_t0:
+ clk_disable_unprepare(tc.clk[0]);
+
+err_clk:
+ clk_put(tc.clk[0]);
+
+ pr_err("%s: unable to register clocksource/clockevent\n",
+ tc.clksrc.name);
+
+ return err;
+}
+
+static int __init tcb_clksrc_init(struct device_node *node)
+{
+ const struct of_device_id *match;
+ struct regmap *regmap;
+ void __iomem *tcb_base;
+ u32 channel;
+ int irq, err, chan1 = -1;
+ unsigned bits;
+
+ if (tc.registered)
+ return -ENODEV;
+
+ /*
+ * The regmap has to be used to access registers that are shared
+ * between channels on the same TCB but we keep direct IO access for
+ * the counters to avoid the impact on performance
+ */
+ regmap = syscon_node_to_regmap(node->parent);
+ if (IS_ERR(regmap))
+ return PTR_ERR(regmap);
+
+ tcb_base = of_iomap(node->parent, 0);
+ if (!tcb_base) {
+ pr_err("%s +%d %s\n", __FILE__, __LINE__, __func__);
+ return -ENXIO;
+ }
+
+ match = of_match_node(atmel_tcb_dt_ids, node->parent);
+ bits = (uintptr_t)match->data;
+
+ err = of_property_read_u32_index(node, "reg", 0, &channel);
+ if (err)
+ return err;
+
+ irq = of_irq_get(node->parent, channel);
+ if (irq < 0) {
+ irq = of_irq_get(node->parent, 0);
+ if (irq < 0)
+ return irq;
+ }
+
+ if (bits == 16) {
+ of_property_read_u32_index(node, "reg", 1, &chan1);
+ if (chan1 == -1) {
+ pr_err("%s: clocksource needs two channels\n",
+ node->parent->full_name);
+ return -EINVAL;
+ }
+ }
+
+ return tcb_clksrc_register(node, regmap, tcb_base, channel, chan1, irq,
+ bits);
+}
+TIMER_OF_DECLARE(atmel_tcb_clksrc, "atmel,tcb-timer", tcb_clksrc_init);


@@ -0,0 +1,264 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:20 +0200
Subject: [PATCH 3/7] clocksource/drivers: timer-atmel-tcb: add clockevent
device on separate channel
Add another clockevent device that uses a separate TCB channel when
available.
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/clocksource/timer-atmel-tcb.c | 217 +++++++++++++++++++++++++++++++++-
1 file changed, 212 insertions(+), 5 deletions(-)
--- a/drivers/clocksource/timer-atmel-tcb.c
+++ b/drivers/clocksource/timer-atmel-tcb.c
@@ -32,7 +32,7 @@ struct atmel_tcb_clksrc {
bool clk_enabled;
};
-static struct atmel_tcb_clksrc tc;
+static struct atmel_tcb_clksrc tc, tce;
static struct clk *tcb_clk_get(struct device_node *node, int channel)
{
@@ -48,6 +48,203 @@ static struct clk *tcb_clk_get(struct de
}
/*
+ * Clockevent device using its own channel
+ */
+
+static void tc_clkevt2_clk_disable(struct clock_event_device *d)
+{
+ clk_disable(tce.clk[0]);
+ tce.clk_enabled = false;
+}
+
+static void tc_clkevt2_clk_enable(struct clock_event_device *d)
+{
+ if (tce.clk_enabled)
+ return;
+ clk_enable(tce.clk[0]);
+ tce.clk_enabled = true;
+}
+
+static int tc_clkevt2_stop(struct clock_event_device *d)
+{
+ writel(0xff, tce.base + ATMEL_TC_IDR(tce.channels[0]));
+ writel(ATMEL_TC_CCR_CLKDIS, tce.base + ATMEL_TC_CCR(tce.channels[0]));
+
+ return 0;
+}
+
+static int tc_clkevt2_shutdown(struct clock_event_device *d)
+{
+ tc_clkevt2_stop(d);
+ if (!clockevent_state_detached(d))
+ tc_clkevt2_clk_disable(d);
+
+ return 0;
+}
+
+/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
+ * because using one of the divided clocks would usually mean the
+ * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
+ *
+ * A divided clock could be good for high resolution timers, since
+ * 30.5 usec resolution can seem "low".
+ */
+static int tc_clkevt2_set_oneshot(struct clock_event_device *d)
+{
+ if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
+ tc_clkevt2_stop(d);
+
+ tc_clkevt2_clk_enable(d);
+
+ /* slow clock, count up to RC, then irq and stop */
+ writel(ATMEL_TC_CMR_TCLK(4) | ATMEL_TC_CMR_CPCSTOP |
+ ATMEL_TC_CMR_WAVE | ATMEL_TC_CMR_WAVESEL_UPRC,
+ tce.base + ATMEL_TC_CMR(tce.channels[0]));
+ writel(ATMEL_TC_CPCS, tce.base + ATMEL_TC_IER(tce.channels[0]));
+
+ return 0;
+}
+
+static int tc_clkevt2_set_periodic(struct clock_event_device *d)
+{
+ if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
+ tc_clkevt2_stop(d);
+
+ /* By not making the gentime core emulate periodic mode on top
+ * of oneshot, we get lower overhead and improved accuracy.
+ */
+ tc_clkevt2_clk_enable(d);
+
+ /* slow clock, count up to RC, then irq and restart */
+ writel(ATMEL_TC_CMR_TCLK(4) | ATMEL_TC_CMR_WAVE |
+ ATMEL_TC_CMR_WAVESEL_UPRC,
+ tce.base + ATMEL_TC_CMR(tce.channels[0]));
+ writel((32768 + HZ / 2) / HZ, tce.base + ATMEL_TC_RC(tce.channels[0]));
+
+ /* Enable clock and interrupts on RC compare */
+ writel(ATMEL_TC_CPCS, tce.base + ATMEL_TC_IER(tce.channels[0]));
+ writel(ATMEL_TC_CCR_CLKEN | ATMEL_TC_CCR_SWTRG,
+ tce.base + ATMEL_TC_CCR(tce.channels[0]));
+
+ return 0;
+}
+
+static int tc_clkevt2_next_event(unsigned long delta,
+ struct clock_event_device *d)
+{
+ writel(delta, tce.base + ATMEL_TC_RC(tce.channels[0]));
+ writel(ATMEL_TC_CCR_CLKEN | ATMEL_TC_CCR_SWTRG,
+ tce.base + ATMEL_TC_CCR(tce.channels[0]));
+
+ return 0;
+}
+
+static irqreturn_t tc_clkevt2_irq(int irq, void *handle)
+{
+ unsigned int sr;
+
+ sr = readl(tce.base + ATMEL_TC_SR(tce.channels[0]));
+ if (sr & ATMEL_TC_CPCS) {
+ tce.clkevt.event_handler(&tce.clkevt);
+ return IRQ_HANDLED;
+ }
+
+ return IRQ_NONE;
+}
+
+static void tc_clkevt2_suspend(struct clock_event_device *d)
+{
+ tce.cache[0].cmr = readl(tce.base + ATMEL_TC_CMR(tce.channels[0]));
+ tce.cache[0].imr = readl(tce.base + ATMEL_TC_IMR(tce.channels[0]));
+ tce.cache[0].rc = readl(tce.base + ATMEL_TC_RC(tce.channels[0]));
+ tce.cache[0].clken = !!(readl(tce.base + ATMEL_TC_SR(tce.channels[0])) &
+ ATMEL_TC_CLKSTA);
+}
+
+static void tc_clkevt2_resume(struct clock_event_device *d)
+{
+ /* Restore registers for the channel, RA and RB are not used */
+ writel(tce.cache[0].cmr, tc.base + ATMEL_TC_CMR(tce.channels[0]));
+ writel(tce.cache[0].rc, tc.base + ATMEL_TC_RC(tce.channels[0]));
+ writel(0, tc.base + ATMEL_TC_RA(tce.channels[0]));
+ writel(0, tc.base + ATMEL_TC_RB(tce.channels[0]));
+ /* Disable all the interrupts */
+ writel(0xff, tc.base + ATMEL_TC_IDR(tce.channels[0]));
+ /* Reenable interrupts that were enabled before suspending */
+ writel(tce.cache[0].imr, tc.base + ATMEL_TC_IER(tce.channels[0]));
+
+ /* Start the clock if it was used */
+ if (tce.cache[0].clken)
+ writel(ATMEL_TC_CCR_CLKEN | ATMEL_TC_CCR_SWTRG,
+ tc.base + ATMEL_TC_CCR(tce.channels[0]));
+}
+
+static int __init tc_clkevt_register(struct device_node *node,
+ struct regmap *regmap, void __iomem *base,
+ int channel, int irq, int bits)
+{
+ int ret;
+ struct clk *slow_clk;
+
+ tce.regmap = regmap;
+ tce.base = base;
+ tce.channels[0] = channel;
+ tce.irq = irq;
+
+ slow_clk = of_clk_get_by_name(node->parent, "slow_clk");
+ if (IS_ERR(slow_clk))
+ return PTR_ERR(slow_clk);
+
+ ret = clk_prepare_enable(slow_clk);
+ if (ret)
+ return ret;
+
+ tce.clk[0] = tcb_clk_get(node, tce.channels[0]);
+ if (IS_ERR(tce.clk[0])) {
+ ret = PTR_ERR(tce.clk[0]);
+ goto err_slow;
+ }
+
+ snprintf(tce.name, sizeof(tce.name), "%s:%d",
+ kbasename(node->parent->full_name), channel);
+ tce.clkevt.cpumask = cpumask_of(0);
+ tce.clkevt.name = tce.name;
+ tce.clkevt.set_next_event = tc_clkevt2_next_event,
+ tce.clkevt.set_state_shutdown = tc_clkevt2_shutdown,
+ tce.clkevt.set_state_periodic = tc_clkevt2_set_periodic,
+ tce.clkevt.set_state_oneshot = tc_clkevt2_set_oneshot,
+ tce.clkevt.suspend = tc_clkevt2_suspend,
+ tce.clkevt.resume = tc_clkevt2_resume,
+ tce.clkevt.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
+ tce.clkevt.rating = 140;
+
+ /* try to enable clk to avoid future errors in mode change */
+ ret = clk_prepare_enable(tce.clk[0]);
+ if (ret)
+ goto err_slow;
+ clk_disable(tce.clk[0]);
+
+ clockevents_config_and_register(&tce.clkevt, 32768, 1,
+ CLOCKSOURCE_MASK(bits));
+
+ ret = request_irq(tce.irq, tc_clkevt2_irq, IRQF_TIMER | IRQF_SHARED,
+ tce.clkevt.name, &tce);
+ if (ret)
+ goto err_clk;
+
+ tce.registered = true;
+
+ return 0;
+
+err_clk:
+ clk_unprepare(tce.clk[0]);
+err_slow:
+ clk_disable_unprepare(slow_clk);
+
+ return ret;
+}
+
+/*
* Clocksource and clockevent using the same channel(s)
*/
static u64 tc_get_cycles(struct clocksource *cs)
@@ -363,7 +560,7 @@ static int __init tcb_clksrc_init(struct
int irq, err, chan1 = -1;
unsigned bits;
- if (tc.registered)
+ if (tc.registered && tce.registered)
return -ENODEV;
/*
@@ -395,12 +592,22 @@ static int __init tcb_clksrc_init(struct
return irq;
}
+ if (tc.registered)
+ return tc_clkevt_register(node, regmap, tcb_base, channel, irq,
+ bits);
+
if (bits == 16) {
of_property_read_u32_index(node, "reg", 1, &chan1);
if (chan1 == -1) {
- pr_err("%s: clocksource needs two channels\n",
- node->parent->full_name);
- return -EINVAL;
+ if (tce.registered) {
+ pr_err("%s: clocksource needs two channels\n",
+ node->parent->full_name);
+ return -EINVAL;
+ } else {
+ return tc_clkevt_register(node, regmap,
+ tcb_base, channel,
+ irq, bits);
+ }
}
}


@@ -0,0 +1,29 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:21 +0200
Subject: [PATCH 4/7] clocksource/drivers: atmel-pit: make option silent
To conform with the other option, make the ATMEL_PIT option silent so it
can be selected from the platform.
Tested-by: Alexander Dahl <ada@thorsis.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/clocksource/Kconfig | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -393,8 +393,11 @@ config ARMV7M_SYSTICK
This options enables support for the ARMv7M system timer unit
config ATMEL_PIT
+ bool "Microchip ARM Periodic Interval Timer (PIT)" if COMPILE_TEST
select TIMER_OF if OF
- def_bool SOC_AT91SAM9 || SOC_SAMA5
+ help
+ This enables build of clocksource and clockevent driver for
+ the integrated PIT in Microchip ARM SoCs.
config ATMEL_ST
bool "Atmel ST timer support" if COMPILE_TEST


@@ -0,0 +1,48 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:22 +0200
Subject: [PATCH 5/7] ARM: at91: Implement clocksource selection
Allow selecting and unselecting the PIT clocksource driver so it doesn't
have to be compiled when unused.
Tested-by: Alexander Dahl <ada@thorsis.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/mach-at91/Kconfig | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
--- a/arch/arm/mach-at91/Kconfig
+++ b/arch/arm/mach-at91/Kconfig
@@ -107,6 +107,31 @@ config SOC_AT91SAM9
AT91SAM9X35
AT91SAM9XE
+comment "Clocksource driver selection"
+
+config ATMEL_CLOCKSOURCE_PIT
+ bool "Periodic Interval Timer (PIT) support"
+ depends on SOC_AT91SAM9 || SOC_SAMA5
+ default SOC_AT91SAM9 || SOC_SAMA5
+ select ATMEL_PIT
+ help
+ Select this to get a clocksource based on the Atmel Periodic Interval
+ Timer. It has a relatively low resolution and the TC Block clocksource
+ should be preferred.
+
+config ATMEL_CLOCKSOURCE_TCB
+ bool "Timer Counter Blocks (TCB) support"
+ depends on SOC_AT91RM9200 || SOC_AT91SAM9 || SOC_SAMA5 || COMPILE_TEST
+ default SOC_AT91RM9200 || SOC_AT91SAM9 || SOC_SAMA5
+ depends on !ATMEL_TCLIB
+ select ATMEL_ARM_TCB_CLKSRC
+ help
+ Select this to get a high precision clocksource based on a
+ TC block with a 5+ MHz base clock rate.
+ On platforms with 16-bit counters, two timer channels are combined
+ to make a single 32-bit timer.
+ It can also be used as a clock event device supporting oneshot mode.
+
config HAVE_AT91_UTMI
bool


@@ -0,0 +1,34 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:23 +0200
Subject: [PATCH 6/7] ARM: configs: at91: use new TCB timer driver
Unselecting ATMEL_TCLIB switches the TCB timer driver from tcb_clksrc to
timer-atmel-tcb.
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/configs/at91_dt_defconfig | 1 -
arch/arm/configs/sama5_defconfig | 1 -
2 files changed, 2 deletions(-)
--- a/arch/arm/configs/at91_dt_defconfig
+++ b/arch/arm/configs/at91_dt_defconfig
@@ -64,7 +64,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=4
CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_ATMEL_TCLIB=y
CONFIG_ATMEL_SSC=y
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
--- a/arch/arm/configs/sama5_defconfig
+++ b/arch/arm/configs/sama5_defconfig
@@ -75,7 +75,6 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=4
CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_ATMEL_TCLIB=y
CONFIG_ATMEL_SSC=y
CONFIG_EEPROM_AT24=y
CONFIG_SCSI=y


@@ -0,0 +1,35 @@
From: Alexandre Belloni <alexandre.belloni@bootlin.com>
Date: Thu, 13 Sep 2018 13:30:24 +0200
Subject: [PATCH 7/7] ARM: configs: at91: unselect PIT
The PIT is no longer required to boot successfully and may actually be
harmful when preempt-rt is used, because the PIT interrupt is shared.
Disable it so the TCB clocksource is used.
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/configs/at91_dt_defconfig | 1 +
arch/arm/configs/sama5_defconfig | 1 +
2 files changed, 2 insertions(+)
--- a/arch/arm/configs/at91_dt_defconfig
+++ b/arch/arm/configs/at91_dt_defconfig
@@ -19,6 +19,7 @@ CONFIG_ARCH_MULTI_V5=y
CONFIG_ARCH_AT91=y
CONFIG_SOC_AT91RM9200=y
CONFIG_SOC_AT91SAM9=y
+# CONFIG_ATMEL_CLOCKSOURCE_PIT is not set
CONFIG_AEABI=y
CONFIG_UACCESS_WITH_MEMCPY=y
CONFIG_ZBOOT_ROM_TEXT=0x0
--- a/arch/arm/configs/sama5_defconfig
+++ b/arch/arm/configs/sama5_defconfig
@@ -20,6 +20,7 @@ CONFIG_ARCH_AT91=y
CONFIG_SOC_SAMA5D2=y
CONFIG_SOC_SAMA5D3=y
CONFIG_SOC_SAMA5D4=y
+# CONFIG_ATMEL_CLOCKSOURCE_PIT is not set
CONFIG_AEABI=y
CONFIG_UACCESS_WITH_MEMCPY=y
CONFIG_ZBOOT_ROM_TEXT=0x0


@@ -0,0 +1,162 @@
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 27 Jul 2018 13:38:54 +0100
Subject: [PATCH] irqchip/gic-v3-its: Move pending table allocation to init
time
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/irqchip/irq-gic-v3-its.c | 80 ++++++++++++++++++++++++-------------
include/linux/irqchip/arm-gic-v3.h | 1
2 files changed, 53 insertions(+), 28 deletions(-)
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -179,6 +179,7 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock);
static DEFINE_IDA(its_vpeid_ida);
#define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist))
+#define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu))
#define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
#define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K)
@@ -1628,7 +1629,7 @@ static void its_free_prop_table(struct p
get_order(LPI_PROPBASE_SZ));
}
-static int __init its_alloc_lpi_tables(void)
+static int __init its_alloc_lpi_prop_table(void)
{
phys_addr_t paddr;
@@ -1951,30 +1952,47 @@ static void its_free_pending_table(struc
get_order(max_t(u32, LPI_PENDBASE_SZ, SZ_64K)));
}
-static void its_cpu_init_lpis(void)
+static int __init allocate_lpi_tables(void)
{
- void __iomem *rbase = gic_data_rdist_rd_base();
- struct page *pend_page;
- u64 val, tmp;
+ int err, cpu;
- /* If we didn't allocate the pending table yet, do it now */
- pend_page = gic_data_rdist()->pend_page;
- if (!pend_page) {
- phys_addr_t paddr;
+ err = its_alloc_lpi_prop_table();
+ if (err)
+ return err;
+
+ /*
+ * We allocate all the pending tables anyway, as we may have a
+ * mix of RDs that have had LPIs enabled, and some that
+ * don't. We'll free the unused ones as each CPU comes online.
+ */
+ for_each_possible_cpu(cpu) {
+ struct page *pend_page;
pend_page = its_allocate_pending_table(GFP_NOWAIT);
if (!pend_page) {
- pr_err("Failed to allocate PENDBASE for CPU%d\n",
- smp_processor_id());
- return;
+ pr_err("Failed to allocate PENDBASE for CPU%d\n", cpu);
+ return -ENOMEM;
}
- paddr = page_to_phys(pend_page);
- pr_info("CPU%d: using LPI pending table @%pa\n",
- smp_processor_id(), &paddr);
- gic_data_rdist()->pend_page = pend_page;
+ gic_data_rdist_cpu(cpu)->pend_page = pend_page;
}
+ return 0;
+}
+
+static void its_cpu_init_lpis(void)
+{
+ void __iomem *rbase = gic_data_rdist_rd_base();
+ struct page *pend_page;
+ phys_addr_t paddr;
+ u64 val, tmp;
+
+ if (gic_data_rdist()->lpi_enabled)
+ return;
+
+ pend_page = gic_data_rdist()->pend_page;
+ paddr = page_to_phys(pend_page);
+
/* set PROPBASE */
val = (page_to_phys(gic_rdists->prop_page) |
GICR_PROPBASER_InnerShareable |
@@ -2026,6 +2044,10 @@ static void its_cpu_init_lpis(void)
/* Make sure the GIC has seen the above */
dsb(sy);
+ gic_data_rdist()->lpi_enabled = true;
+ pr_info("GICv3: CPU%d: using LPI pending table @%pa\n",
+ smp_processor_id(),
+ &paddr);
}
static void its_cpu_init_collection(struct its_node *its)
@@ -3521,16 +3543,6 @@ static int redist_disable_lpis(void)
u64 timeout = USEC_PER_SEC;
u64 val;
- /*
- * If coming via a CPU hotplug event, we don't need to disable
- * LPIs before trying to re-enable them. They are already
- * configured and all is well in the world. Detect this case
- * by checking the allocation of the pending table for the
- * current CPU.
- */
- if (gic_data_rdist()->pend_page)
- return 0;
-
if (!gic_rdists_supports_plpis()) {
pr_info("CPU%d: LPIs not supported\n", smp_processor_id());
return -ENXIO;
@@ -3540,7 +3552,18 @@ static int redist_disable_lpis(void)
if (!(val & GICR_CTLR_ENABLE_LPIS))
return 0;
- pr_warn("CPU%d: Booted with LPIs enabled, memory probably corrupted\n",
+ /*
+ * If coming via a CPU hotplug event, we don't need to disable
+ * LPIs before trying to re-enable them. They are already
+ * configured and all is well in the world.
+ */
+ if (gic_data_rdist()->lpi_enabled)
+ return 0;
+
+ /*
+ * From that point on, we only try to do some damage control.
+ */
+ pr_warn("GICv3: CPU%d: Booted with LPIs enabled, memory probably corrupted\n",
smp_processor_id());
add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3796,7 +3819,8 @@ int __init its_init(struct fwnode_handle
}
gic_rdists = rdists;
- err = its_alloc_lpi_tables();
+
+ err = allocate_lpi_tables();
if (err)
return err;
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -585,6 +585,7 @@ struct rdists {
void __iomem *rd_base;
struct page *pend_page;
phys_addr_t phys_base;
+ bool lpi_enabled;
} __percpu *rdist;
struct page *prop_page;
u64 flags;


@@ -0,0 +1,194 @@
From: Julia Cartwright <julia@ni.com>
Date: Fri, 28 Sep 2018 21:03:51 +0000
Subject: [PATCH] kthread: convert worker lock to raw spinlock
In order to enable the queuing of kthread work items from hardirq
context even when PREEMPT_RT_FULL is enabled, convert the worker
spin_lock to a raw_spin_lock.
This is only acceptable to do because the work performed under the lock
is well-bounded and minimal.
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Guenter Roeck <linux@roeck-us.net>
Reported-and-tested-by: Steffen Trumtrar <s.trumtrar@pengutronix.de>
Reported-by: Tim Sander <tim@krieglstein.org>
Signed-off-by: Julia Cartwright <julia@ni.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kthread.h | 2 +-
kernel/kthread.c | 42 +++++++++++++++++++++---------------------
2 files changed, 22 insertions(+), 22 deletions(-)
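For context, a hedged sketch of the use case this conversion enables: queuing kthread work directly from a hard interrupt handler, which would otherwise acquire a sleeping lock on PREEMPT_RT_FULL. The worker, work item and handler names below are hypothetical.

#include <linux/interrupt.h>
#include <linux/kthread.h>

static struct kthread_worker *example_worker;	/* from kthread_create_worker() */
static struct kthread_work example_work;	/* set up with kthread_init_work() */

static irqreturn_t example_hardirq(int irq, void *dev_id)
{
	/* kthread_queue_work() takes worker->lock; as a raw_spin_lock it
	 * may now be taken from hardirq context even on -rt. */
	kthread_queue_work(example_worker, &example_work);
	return IRQ_HANDLED;
}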
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -85,7 +85,7 @@ enum {
struct kthread_worker {
unsigned int flags;
- spinlock_t lock;
+ raw_spinlock_t lock;
struct list_head work_list;
struct list_head delayed_work_list;
struct task_struct *task;
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -599,7 +599,7 @@ void __kthread_init_worker(struct kthrea
struct lock_class_key *key)
{
memset(worker, 0, sizeof(struct kthread_worker));
- spin_lock_init(&worker->lock);
+ raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -641,21 +641,21 @@ int kthread_worker_fn(void *worker_ptr)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -812,12 +812,12 @@ bool kthread_queue_work(struct kthread_w
bool ret = false;
unsigned long flags;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -843,7 +843,7 @@ void kthread_delayed_work_timer_fn(struc
if (WARN_ON_ONCE(!worker))
return;
- spin_lock(&worker->lock);
+ raw_spin_lock(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -852,7 +852,7 @@ void kthread_delayed_work_timer_fn(struc
list_del_init(&work->node);
kthread_insert_work(worker, work, &worker->work_list);
- spin_unlock(&worker->lock);
+ raw_spin_unlock(&worker->lock);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
@@ -908,14 +908,14 @@ bool kthread_queue_delayed_work(struct k
unsigned long flags;
bool ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
__kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -951,7 +951,7 @@ void kthread_flush_work(struct kthread_w
if (!worker)
return;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -963,7 +963,7 @@ void kthread_flush_work(struct kthread_w
else
noop = true;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
@@ -996,9 +996,9 @@ static bool __kthread_cancel_work(struct
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, *flags);
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
del_timer_sync(&dwork->timer);
- spin_lock_irqsave(&worker->lock, *flags);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
@@ -1045,7 +1045,7 @@ bool kthread_mod_delayed_work(struct kth
unsigned long flags;
int ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
if (!work->worker)
@@ -1062,7 +1062,7 @@ bool kthread_mod_delayed_work(struct kth
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1076,7 +1076,7 @@ static bool __kthread_cancel_work_sync(s
if (!worker)
goto out;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -1090,13 +1090,13 @@ static bool __kthread_cancel_work_sync(s
* In the meantime, block any queuing by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
kthread_flush_work(work);
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
work->canceling--;
out_fast:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
return ret;
}


@@ -0,0 +1,131 @@
From: Horia Geantă <horia.geanta@nxp.com>
Date: Mon, 8 Oct 2018 14:09:37 +0300
Subject: [PATCH] crypto: caam/qi - simplify CGR allocation, freeing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
[Upstream commit 29e83c757006fd751966bdc53392bb22d74179c6]
CGRs (Congestion Groups) have to be freed by the same CPU that
initialized them.
This is why currently the driver takes special measures; however, using
set_cpus_allowed_ptr() is incorrect - as reported by Sebastian.
Instead of the generic solution of replacing set_cpus_allowed_ptr() with
work_on_cpu_safe(), we use the qman_delete_cgr_safe() QBMan API instead
of qman_delete_cgr() - which internally takes care of proper CGR
deletion.
Link: https://lkml.kernel.org/r/20181005125443.dfhd2asqktm22ney@linutronix.de
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
drivers/crypto/caam/qi.c | 43 ++++---------------------------------------
drivers/crypto/caam/qi.h | 2 +-
2 files changed, 5 insertions(+), 40 deletions(-)
--- a/drivers/crypto/caam/qi.c
+++ b/drivers/crypto/caam/qi.c
@@ -84,13 +84,6 @@ static u64 times_congested;
#endif
/*
- * CPU from where the module initialised. This is required because QMan driver
- * requires CGRs to be removed from same CPU from where they were originally
- * allocated.
- */
-static int mod_init_cpu;
-
-/*
* This is a a cache of buffers, from which the users of CAAM QI driver
* can allocate short (CAAM_QI_MEMCACHE_SIZE) buffers. It's faster than
* doing malloc on the hotpath.
@@ -492,12 +485,11 @@ void caam_drv_ctx_rel(struct caam_drv_ct
}
EXPORT_SYMBOL(caam_drv_ctx_rel);
-int caam_qi_shutdown(struct device *qidev)
+void caam_qi_shutdown(struct device *qidev)
{
- int i, ret;
+ int i;
struct caam_qi_priv *priv = dev_get_drvdata(qidev);
const cpumask_t *cpus = qman_affine_cpus();
- struct cpumask old_cpumask = current->cpus_allowed;
for_each_cpu(i, cpus) {
struct napi_struct *irqtask;
@@ -510,26 +502,12 @@ int caam_qi_shutdown(struct device *qide
dev_err(qidev, "Rsp FQ kill failed, cpu: %d\n", i);
}
- /*
- * QMan driver requires CGRs to be deleted from same CPU from where they
- * were instantiated. Hence we get the module removal execute from the
- * same CPU from where it was originally inserted.
- */
- set_cpus_allowed_ptr(current, get_cpu_mask(mod_init_cpu));
-
- ret = qman_delete_cgr(&priv->cgr);
- if (ret)
- dev_err(qidev, "Deletion of CGR failed: %d\n", ret);
- else
- qman_release_cgrid(priv->cgr.cgrid);
+ qman_delete_cgr_safe(&priv->cgr);
+ qman_release_cgrid(priv->cgr.cgrid);
kmem_cache_destroy(qi_cache);
- /* Now that we're done with the CGRs, restore the cpus allowed mask */
- set_cpus_allowed_ptr(current, &old_cpumask);
-
platform_device_unregister(priv->qi_pdev);
- return ret;
}
static void cgr_cb(struct qman_portal *qm, struct qman_cgr *cgr, int congested)
@@ -718,22 +696,11 @@ int caam_qi_init(struct platform_device
struct device *ctrldev = &caam_pdev->dev, *qidev;
struct caam_drv_private *ctrlpriv;
const cpumask_t *cpus = qman_affine_cpus();
- struct cpumask old_cpumask = current->cpus_allowed;
static struct platform_device_info qi_pdev_info = {
.name = "caam_qi",
.id = PLATFORM_DEVID_NONE
};
- /*
- * QMAN requires CGRs to be removed from same CPU+portal from where it
- * was originally allocated. Hence we need to note down the
- * initialisation CPU and use the same CPU for module exit.
- * We select the first CPU to from the list of portal owning CPUs.
- * Then we pin module init to this CPU.
- */
- mod_init_cpu = cpumask_first(cpus);
- set_cpus_allowed_ptr(current, get_cpu_mask(mod_init_cpu));
-
qi_pdev_info.parent = ctrldev;
qi_pdev_info.dma_mask = dma_get_mask(ctrldev);
qi_pdev = platform_device_register_full(&qi_pdev_info);
@@ -795,8 +762,6 @@ int caam_qi_init(struct platform_device
return -ENOMEM;
}
- /* Done with the CGRs; restore the cpus allowed mask */
- set_cpus_allowed_ptr(current, &old_cpumask);
#ifdef CONFIG_DEBUG_FS
debugfs_create_file("qi_congested", 0444, ctrlpriv->ctl,
&times_congested, &caam_fops_u64_ro);
--- a/drivers/crypto/caam/qi.h
+++ b/drivers/crypto/caam/qi.h
@@ -174,7 +174,7 @@ int caam_drv_ctx_update(struct caam_drv_
void caam_drv_ctx_rel(struct caam_drv_ctx *drv_ctx);
int caam_qi_init(struct platform_device *pdev);
-int caam_qi_shutdown(struct device *dev);
+void caam_qi_shutdown(struct device *dev);
/**
* qi_cache_alloc - Allocate buffers from CAAM-QI cache


@@ -0,0 +1,141 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 7 Jan 2019 13:52:31 +0100
Subject: [PATCH] sched/fair: Robustify CFS-bandwidth timer locking
Traditionally hrtimer callbacks were run with IRQs disabled, but with
the introduction of HRTIMER_MODE_SOFT it is possible they run from
SoftIRQ context, which does _NOT_ have IRQs disabled.
Allow for the CFS bandwidth timers (period_timer and slack_timer) to
be run from SoftIRQ context; this entails removing the assumption that
IRQs are already disabled from the locking.
While mainline doesn't strictly need this, -RT forces all timers not
explicitly marked with MODE_HARD into MODE_SOFT and trips over this.
And marking these timers as MODE_HARD doesn't make sense as they're
not required for RT operation and can potentially be quite expensive.
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Tom Putzeys <tom.putzeys@be.atlascopco.com>
Tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20190107125231.GE14122@hirez.programming.kicks-ass.net
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/sched/fair.c | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
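A minimal sketch of the locking pattern the patch adopts (names are illustrative, not from the kernel): a timer callback that may run in SoftIRQ context can no longer assume IRQs are disabled, so it must use the irqsave variants on any lock it shares with IRQs-off paths.

#include <linux/hrtimer.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static enum hrtimer_restart example_period_timer(struct hrtimer *timer)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... forward the timer and distribute runtime ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);

	return HRTIMER_RESTART;
}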
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4553,7 +4553,7 @@ static u64 distribute_cfs_runtime(struct
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4570,7 +4570,7 @@ static u64 distribute_cfs_runtime(struct
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4586,7 +4586,7 @@ static u64 distribute_cfs_runtime(struct
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4628,11 +4628,11 @@ static int do_sched_cfs_period_timer(str
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4741,17 +4741,18 @@ static __always_inline void return_cfs_r
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4762,18 +4763,18 @@ static void do_sched_cfs_slack_timer(str
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime -= min(runtime, cfs_b->runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4851,20 +4852,21 @@ static enum hrtimer_restart sched_cfs_pe
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}


@@ -0,0 +1,411 @@
From: Frank Rowand <frank.rowand@am.sony.com>
Date: Mon, 19 Sep 2011 14:51:14 -0700
Subject: arm: Convert arm boot_lock to raw
The arm boot_lock is used by the secondary processor startup code. The locking
task is the idle thread, which has idle->sched_class == &idle_sched_class.
idle_sched_class->enqueue_task == NULL, so if the idle task blocks on the
lock, the attempt to wake it when the lock becomes available will fail:
try_to_wake_up()
...
activate_task()
enqueue_task()
p->sched_class->enqueue_task(rq, p, flags)
Fix by converting boot_lock to a raw spin lock.
Signed-off-by: Frank Rowand <frank.rowand@am.sony.com>
Link: http://lkml.kernel.org/r/4E77B952.3010606@am.sony.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Tony Lindgren <tony@atomide.com>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org>
Tested-by: Krzysztof Kozlowski <krzk@kernel.org> [Exynos5422 Linaro PM-QA]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/mach-exynos/platsmp.c | 12 ++++++------
arch/arm/mach-hisi/platmcpm.c | 22 +++++++++++-----------
arch/arm/mach-omap2/omap-smp.c | 10 +++++-----
arch/arm/mach-prima2/platsmp.c | 10 +++++-----
arch/arm/mach-qcom/platsmp.c | 10 +++++-----
arch/arm/mach-spear/platsmp.c | 10 +++++-----
arch/arm/mach-sti/platsmp.c | 10 +++++-----
arch/arm/plat-versatile/platsmp.c | 10 +++++-----
8 files changed, 47 insertions(+), 47 deletions(-)
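To make the converted pattern easier to follow, here is a hedged sketch of the boot_lock handshake (function names are illustrative; the shape follows the exynos code below):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(boot_lock);

/* Boot CPU: hold the lock while poking the secondary CPU. */
static void example_boot_secondary(unsigned int cpu)
{
	raw_spin_lock(&boot_lock);
	/* ... write pen_release, send the wake-up IPI, wait for ack ... */
	raw_spin_unlock(&boot_lock);
}

/* Secondary CPU, running as the idle task: the lock/unlock pair only
 * waits for the boot CPU to finish. If boot_lock were a sleeping lock
 * on -rt, the idle task would block here and could never be woken,
 * because idle_sched_class has no enqueue_task(). */
static void example_secondary_init(unsigned int cpu)
{
	raw_spin_lock(&boot_lock);
	raw_spin_unlock(&boot_lock);
}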
--- a/arch/arm/mach-exynos/platsmp.c
+++ b/arch/arm/mach-exynos/platsmp.c
@@ -239,7 +239,7 @@ static void write_pen_release(int val)
sync_cache_w(&pen_release);
}
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
static void exynos_secondary_init(unsigned int cpu)
{
@@ -252,8 +252,8 @@ static void exynos_secondary_init(unsign
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
@@ -317,7 +317,7 @@ static int exynos_boot_secondary(unsigne
* Set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* The secondary processor is waiting to be released from
@@ -344,7 +344,7 @@ static int exynos_boot_secondary(unsigne
if (timeout == 0) {
printk(KERN_ERR "cpu1 power enable failed");
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return -ETIMEDOUT;
}
}
@@ -390,7 +390,7 @@ static int exynos_boot_secondary(unsigne
* calibrations, then wait for it to finish
*/
fail:
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return pen_release != -1 ? ret : 0;
}
--- a/arch/arm/mach-hisi/platmcpm.c
+++ b/arch/arm/mach-hisi/platmcpm.c
@@ -61,7 +61,7 @@
static void __iomem *sysctrl, *fabric;
static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
static u32 fabric_phys_addr;
/*
* [0]: bootwrapper physical address
@@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned
if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
return -EINVAL;
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
if (hip04_cpu_table[cluster][cpu])
goto out;
@@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned
out:
hip04_cpu_table[cluster][cpu]++;
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
return 0;
}
@@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l
cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
hip04_cpu_table[cluster][cpu]--;
if (hip04_cpu_table[cluster][cpu] == 1) {
/* A power_up request went ahead of us. */
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return;
} else if (hip04_cpu_table[cluster][cpu] > 1) {
pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
@@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l
}
last_man = hip04_cluster_is_down(cluster);
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
if (last_man) {
/* Since it's Cortex A15, disable L2 prefetching. */
asm volatile(
@@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l
cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
count = TIMEOUT_MSEC / POLL_MSEC;
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
for (tries = 0; tries < count; tries++) {
if (hip04_cpu_table[cluster][cpu])
goto err;
@@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l
data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
if (data & CORE_WFI_STATUS(cpu))
break;
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
/* Wait for clean L2 when the whole cluster is down. */
msleep(POLL_MSEC);
- spin_lock_irq(&boot_lock);
+ raw_spin_lock_irq(&boot_lock);
}
if (tries >= count)
goto err;
@@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l
goto err;
if (hip04_cluster_is_down(cluster))
hip04_set_snoop_filter(cluster, 0);
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
return 1;
err:
- spin_unlock_irq(&boot_lock);
+ raw_spin_unlock_irq(&boot_lock);
return 0;
}
#endif
--- a/arch/arm/mach-omap2/omap-smp.c
+++ b/arch/arm/mach-omap2/omap-smp.c
@@ -69,7 +69,7 @@ static const struct omap_smp_config omap
.startup_addr = omap5_secondary_startup,
};
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
void __iomem *omap4_get_scu_base(void)
{
@@ -177,8 +177,8 @@ static void omap4_secondary_init(unsigne
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
@@ -191,7 +191,7 @@ static int omap4_boot_secondary(unsigned
* Set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* Update the AuxCoreBoot0 with boot state for secondary core.
@@ -270,7 +270,7 @@ static int omap4_boot_secondary(unsigned
* Now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return 0;
}
--- a/arch/arm/mach-prima2/platsmp.c
+++ b/arch/arm/mach-prima2/platsmp.c
@@ -22,7 +22,7 @@
static void __iomem *clk_base;
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
static void sirfsoc_secondary_init(unsigned int cpu)
{
@@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsig
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
static const struct of_device_id clk_ids[] = {
@@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsign
/* make sure write buffer is drained */
mb();
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* The secondary processor is waiting to be released from
@@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsign
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return pen_release != -1 ? -ENOSYS : 0;
}
--- a/arch/arm/mach-qcom/platsmp.c
+++ b/arch/arm/mach-qcom/platsmp.c
@@ -46,7 +46,7 @@
extern void secondary_startup_arm(void);
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
#ifdef CONFIG_HOTPLUG_CPU
static void qcom_cpu_die(unsigned int cpu)
@@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
static int scss_release_secondary(unsigned int cpu)
@@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned
* set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* Send the secondary CPU a soft interrupt, thereby causing
@@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return ret;
}
--- a/arch/arm/mach-spear/platsmp.c
+++ b/arch/arm/mach-spear/platsmp.c
@@ -32,7 +32,7 @@ static void write_pen_release(int val)
sync_cache_w(&pen_release);
}
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
@@ -47,8 +47,8 @@ static void spear13xx_secondary_init(uns
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
@@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsi
* set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* The secondary processor is waiting to be released from
@@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsi
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return pen_release != -1 ? -ENOSYS : 0;
}
--- a/arch/arm/mach-sti/platsmp.c
+++ b/arch/arm/mach-sti/platsmp.c
@@ -35,7 +35,7 @@ static void write_pen_release(int val)
sync_cache_w(&pen_release);
}
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
static void sti_secondary_init(unsigned int cpu)
{
@@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
@@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned i
* set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* The secondary processor is waiting to be released from
@@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned i
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return pen_release != -1 ? -ENOSYS : 0;
}
--- a/arch/arm/plat-versatile/platsmp.c
+++ b/arch/arm/plat-versatile/platsmp.c
@@ -32,7 +32,7 @@ static void write_pen_release(int val)
sync_cache_w(&pen_release);
}
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
void versatile_secondary_init(unsigned int cpu)
{
@@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned i
/*
* Synchronise with the boot thread.
*/
- spin_lock(&boot_lock);
- spin_unlock(&boot_lock);
+ raw_spin_lock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
}
int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
@@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned in
* Set synchronisation state between this boot processor
* and the secondary one
*/
- spin_lock(&boot_lock);
+ raw_spin_lock(&boot_lock);
/*
* This is really belt and braces; we hold unintended secondary
@@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned in
* now the secondary core is starting up let it run its
* calibrations, then wait for it to finish
*/
- spin_unlock(&boot_lock);
+ raw_spin_unlock(&boot_lock);
return pen_release != -1 ? -ENOSYS : 0;
}

View File

@@ -0,0 +1,105 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 17 Jul 2018 18:25:31 +0200
Subject: [PATCH] x86/ioapic: Don't let setaffinity unmask threaded EOI
interrupt too early
There is an issue with threaded interrupts which are marked ONESHOT
and using the fasteoi handler.
if (IS_ONESHOT())
mask_irq();
....
....
cond_unmask_eoi_irq()
chip->irq_eoi();
So if setaffinity is pending then the interrupt will be moved and then
unmasked, which is wrong because it should be kept masked up to the point
where the threaded handler has finished. It is not a fatal problem: the
interrupt will just be able to fire before the threaded handler has finished,
but the irq masked state will be wrong for a bit.
The patch below should cure the issue. It also renames the horribly
misnamed functions so it becomes clear what they are supposed to do.
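In the same shorthand as above, the intended sequence after this change is roughly (a sketch, not the literal code):

moveit = ioapic_prepare_move(irq_data);     /* mask only if not already masked */
....
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
ioapic_finish_move(irq_data, moveit);       /* unmask only if the core did not mask it */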
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: add the body of the patch, use the same functions in both
ifdef paths (spotted by Andy Shevchenko)]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/kernel/apic/io_apic.c | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1722,19 +1722,20 @@ static bool io_apic_level_ack_pending(st
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
/* If we are moving the irq we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
- mask_ioapic_irq(data);
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
return true;
}
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- if (unlikely(masked)) {
+ if (unlikely(moveit)) {
/* Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
@@ -1763,15 +1764,17 @@ static inline void ioapic_irqd_unmask(st
*/
if (!io_apic_level_ack_pending(data->chip_data))
irq_move_masked_irq(data);
- unmask_ioapic_irq(data);
+ /* If the irq is masked in the core, leave it */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
}
#endif
@@ -1780,11 +1783,11 @@ static void ioapic_ack_level(struct irq_
{
struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
- bool masked;
+ bool moveit;
int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(irq_data);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -1839,7 +1842,7 @@ static void ioapic_ack_level(struct irq_
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- ioapic_irqd_unmask(irq_data, masked);
+ ioapic_finish_move(irq_data, moveit);
}
static void ioapic_ir_ack_level(struct irq_data *irq_data)

View File

@@ -0,0 +1,69 @@
From: Yang Shi <yang.shi@linaro.org>
Date: Thu, 10 Nov 2016 16:17:55 -0800
Subject: [PATCH] arm: kprobe: replace patch_lock to raw lock
When running kprobes on an -rt kernel, the bug below is caught:
BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:931
in_atomic(): 1, irqs_disabled(): 128, pid: 14, name: migration/0
INFO: lockdep is turned off.
irq event stamp: 238
hardirqs last enabled at (237): [<80b5aecc>] _raw_spin_unlock_irqrestore+0x88/0x90
hardirqs last disabled at (238): [<80b56d88>] __schedule+0xec/0x94c
softirqs last enabled at (0): [<80225584>] copy_process.part.5+0x30c/0x1994
softirqs last disabled at (0): [< (null)>] (null)
Preemption disabled at:[<802f2b98>] cpu_stopper_thread+0xc0/0x140
CPU: 0 PID: 14 Comm: migration/0 Tainted: G O 4.8.3-rt2 #1
Hardware name: Freescale LS1021A
[<80212e7c>] (unwind_backtrace) from [<8020cd2c>] (show_stack+0x20/0x24)
[<8020cd2c>] (show_stack) from [<80689e14>] (dump_stack+0xa0/0xcc)
[<80689e14>] (dump_stack) from [<8025a43c>] (___might_sleep+0x1b8/0x2a4)
[<8025a43c>] (___might_sleep) from [<80b5b324>] (rt_spin_lock+0x34/0x74)
[<80b5b324>] (rt_spin_lock) from [<80b5c31c>] (__patch_text_real+0x70/0xe8)
[<80b5c31c>] (__patch_text_real) from [<80b5c3ac>] (patch_text_stop_machine+0x18/0x20)
[<80b5c3ac>] (patch_text_stop_machine) from [<802f2920>] (multi_cpu_stop+0xfc/0x134)
[<802f2920>] (multi_cpu_stop) from [<802f2ba0>] (cpu_stopper_thread+0xc8/0x140)
[<802f2ba0>] (cpu_stopper_thread) from [<802563a4>] (smpboot_thread_fn+0x1a4/0x354)
[<802563a4>] (smpboot_thread_fn) from [<80251d38>] (kthread+0x104/0x11c)
[<80251d38>] (kthread) from [<80207f70>] (ret_from_fork+0x14/0x24)
Since patch_text_stop_machine() is called in stop_machine(), which disables
IRQs, a sleepable lock must not be used in this atomic context, so replace
patch_lock with a raw lock.
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/kernel/patch.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/arch/arm/kernel/patch.c
+++ b/arch/arm/kernel/patch.c
@@ -16,7 +16,7 @@ struct patch {
unsigned int insn;
};
-static DEFINE_SPINLOCK(patch_lock);
+static DEFINE_RAW_SPINLOCK(patch_lock);
static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
__acquires(&patch_lock)
@@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *a
return addr;
if (flags)
- spin_lock_irqsave(&patch_lock, *flags);
+ raw_spin_lock_irqsave(&patch_lock, *flags);
else
__acquire(&patch_lock);
@@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fi
clear_fixmap(fixmap);
if (flags)
- spin_unlock_irqrestore(&patch_lock, *flags);
+ raw_spin_unlock_irqrestore(&patch_lock, *flags);
else
__release(&patch_lock);
}

View File

@@ -0,0 +1,83 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 20 Sep 2013 14:31:54 +0200
Subject: arm/unwind: use a raw_spin_lock
Mostly, unwinding is done with irqs enabled; however, SLUB may call it with
irqs disabled while creating a new SLUB cache.
I had a system freeze while loading a module which called
kmem_cache_create() on init. That means SLUB's __slab_alloc() disabled
interrupts and then
->new_slab_objects()
->new_slab()
->setup_object()
->setup_object_debug()
->init_tracking()
->set_track()
->save_stack_trace()
->save_stack_trace_tsk()
->walk_stackframe()
->unwind_frame()
->unwind_find_idx()
=>spin_lock_irqsave(&unwind_lock);
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/kernel/unwind.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c
@@ -93,7 +93,7 @@ extern const struct unwind_idx __start_u
static const struct unwind_idx *__origin_unwind_idx;
extern const struct unwind_idx __stop_unwind_idx[];
-static DEFINE_SPINLOCK(unwind_lock);
+static DEFINE_RAW_SPINLOCK(unwind_lock);
static LIST_HEAD(unwind_tables);
/* Convert a prel31 symbol to an absolute address */
@@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_f
/* module unwind tables */
struct unwind_table *table;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
list_for_each_entry(table, &unwind_tables, list) {
if (addr >= table->begin_addr &&
addr < table->end_addr) {
@@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_f
break;
}
}
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
}
pr_debug("%s: idx = %p\n", __func__, idx);
@@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(un
tab->begin_addr = text_addr;
tab->end_addr = text_addr + text_size;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
list_add_tail(&tab->list, &unwind_tables);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
return tab;
}
@@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_tabl
if (!tab)
return;
- spin_lock_irqsave(&unwind_lock, flags);
+ raw_spin_lock_irqsave(&unwind_lock, flags);
list_del(&tab->list);
- spin_unlock_irqrestore(&unwind_lock, flags);
+ raw_spin_unlock_irqrestore(&unwind_lock, flags);
kfree(tab);
}

View File

@@ -0,0 +1,43 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 3 Jul 2018 18:19:48 +0200
Subject: [PATCH] cgroup: use irqsave in cgroup_rstat_flush_locked()
All callers of cgroup_rstat_flush_locked() acquire cgroup_rstat_lock
either with spin_lock_irq() or spin_lock_irqsave().
cgroup_rstat_flush_locked() itself acquires cgroup_rstat_cpu_lock which
is a raw_spin_lock. This lock is also acquired in cgroup_rstat_updated()
in IRQ context and therefore requires _irqsave() locking suffix in
cgroup_rstat_flush_locked().
Since there is no difference between spin_lock_t and raw_spin_lock_t
on !RT lockdep does not complain here. On RT lockdep complains because
the interrupts were not disabled here and a deadlock is possible.
Acquire the raw_spin_lock_t with disabled interrupts.
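Sketched in call-chain form, the ordering problem that lockdep flags on RT looks roughly like this (a simplified illustration, not a trace from the report):

cgroup_rstat_flush_locked()               /* previously: IRQs still enabled */
  raw_spin_lock(cpu_lock);
    <IRQ>
      cgroup_rstat_updated()              /* takes the same cpu_lock in IRQ context */
        raw_spin_lock_irqsave(cpu_lock, flags);   /* spins on a lock we already hold */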
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/cgroup/rstat.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -157,8 +157,9 @@ static void cgroup_rstat_flush_locked(st
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -170,7 +171,7 @@ static void cgroup_rstat_flush_locked(st
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||

View File

@@ -0,0 +1,53 @@
From: Clark Williams <williams@redhat.com>
Date: Tue, 3 Jul 2018 13:34:30 -0500
Subject: [PATCH] fscache: initialize cookie hash table raw spinlocks
The fscache cookie mechanism uses a hash table of hlist_bl_head structures. The
PREEMPT_RT patchset adds a raw spinlock to this structure, so on PREEMPT_RT the
structures get used uninitialized, causing warnings about bad magic numbers when
spinlock debugging is turned on.
Add and use an init function for the fscache cookie hash table so the bucket
locks are initialized before use.
Signed-off-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/fscache/cookie.c | 8 ++++++++
fs/fscache/main.c | 1 +
include/linux/fscache.h | 1 +
3 files changed, 10 insertions(+)
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -962,3 +962,11 @@ int __fscache_check_consistency(struct f
return -ESTALE;
}
EXPORT_SYMBOL(__fscache_check_consistency);
+
+void __init fscache_cookie_init(void)
+{
+ int i;
+
+ for (i = 0; i < (1 << fscache_cookie_hash_shift); i++)
+ INIT_HLIST_BL_HEAD(&fscache_cookie_hash[i]);
+}
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -149,6 +149,7 @@ static int __init fscache_init(void)
ret = -ENOMEM;
goto error_cookie_jar;
}
+ fscache_cookie_init();
fscache_root = kobject_create_and_add("fscache", kernel_kobj);
if (!fscache_root)
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -230,6 +230,7 @@ extern void __fscache_readpages_cancel(s
extern void __fscache_disable_cookie(struct fscache_cookie *, const void *, bool);
extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_t,
bool (*)(void *), void *);
+extern void fscache_cookie_init(void);
/**
* fscache_register_netfs - Register a filesystem as desiring caching services

View File

@@ -0,0 +1,33 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 29 Aug 2018 21:59:04 +0200
Subject: [PATCH] Drivers: hv: vmbus: include header for get_irq_regs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
On !RT the header file providing get_irq_regs() gets pulled in via other
header files. On RT it does not, and the build fails:
drivers/hv/vmbus_drv.c:975 implicit declaration of function get_irq_regs [-Werror=implicit-function-declaration]
drivers/hv/hv.c:115 implicit declaration of function get_irq_regs [-Werror=implicit-function-declaration]
Add the header file for get_irq_regs() to a common header so it can be used
by vmbus_drv.c and hv.c for their get_irq_regs() usage.
Reported-by: Bernhard Landauer <oberon@manjaro.org>
Reported-by: Ralf Ramsauer <ralf.ramsauer@oth-regensburg.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/hv/hyperv_vmbus.h | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -31,6 +31,7 @@
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include "hv_trace.h"

View File

@@ -0,0 +1,26 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 11 Oct 2018 16:39:59 +0200
Subject: [PATCH] percpu: include irqflags.h for raw_local_irq_save()
The percpu.h header file uses raw_local_irq_save() but does not include
irqflags.h for its definition. It compiles only because irqflags.h is pulled
in via another header file.
On -RT the build fails because raw_local_irq_save() is not defined.
Include irqflags.h in percpu.h.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/asm-generic/percpu.h | 1 +
1 file changed, 1 insertion(+)
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -5,6 +5,7 @@
#include <linux/compiler.h>
#include <linux/threads.h>
#include <linux/percpu-defs.h>
+#include <linux/irqflags.h>
#ifdef CONFIG_SMP

View File

@@ -0,0 +1,25 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 26 Jul 2018 15:06:10 +0200
Subject: [PATCH] efi: Allow efi=runtime
In case the option "efi=noruntime" is the default at build time, the user
can override that state with `efi=runtime' and allow runtime services again.
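With this change both directions work on the kernel command line; for example (illustrative command-line values only):

efi=noruntime    # disable EFI runtime services
efi=runtime      # re-enable them when 'noruntime' is the built-in default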
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/firmware/efi/efi.c | 3 +++
1 file changed, 3 insertions(+)
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -113,6 +113,9 @@ static int __init parse_efi_cmdline(char
if (parse_option_str(str, "noruntime"))
disable_runtime = true;
+ if (parse_option_str(str, "runtime"))
+ disable_runtime = false;
+
return 0;
}
early_param("efi", parse_efi_cmdline);

View File

@@ -0,0 +1,48 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 24 Jul 2018 14:48:55 +0200
Subject: [PATCH] x86/efi: drop task_lock() from efi_switch_mm()
efi_switch_mm() is a wrapper around switch_mm() which saves current's
->active_mm, sets the requested mm as ->active_mm and invokes
switch_mm().
I don't think that task_lock() is required during that procedure. It
protects ->mm which isn't changed here.
It needs to be mentioned that during the whole procedure (switch to
EFI's mm and back) preemption needs to be disabled. A context switch
at this point would reset the cr3 value based on current->mm. Also, this
function may not be invoked at the same time on a different CPU because
it would overwrite the efi_scratch.prev_mm information.
Remove task_lock() and also update the comment to reflect it.
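A minimal sketch of the calling convention the updated comment describes (the surrounding caller code is illustrative, not part of this patch):

preempt_disable();                      /* no context switch while the EFI mm is borrowed */
efi_switch_mm(&efi_mm);                 /* switch to the EFI page tables */
/* ... perform the EFI runtime call ... */
efi_switch_mm(efi_scratch.prev_mm);     /* switch back to the previous mm */
preempt_enable();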
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/platform/efi/efi_64.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -619,18 +619,16 @@ void __init efi_dump_pagetable(void)
/*
* Makes the calling thread switch to/from efi_mm context. Can be used
- * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well
- * as during efi runtime calls i.e current->active_mm == current_mm.
- * We are not mm_dropping()/mm_grabbing() any mm, because we are not
- * losing/creating any references.
+ * in a kernel thread and user context. Preemption needs to remain disabled
+ * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm
+ * can not change under us.
+ * It should be ensured that there are no concurrent calls to this function.
*/
void efi_switch_mm(struct mm_struct *mm)
{
- task_lock(current);
efi_scratch.prev_mm = current->active_mm;
current->active_mm = mm;
switch_mm(efi_scratch.prev_mm, mm, NULL);
- task_unlock(current);
}
#ifdef CONFIG_EFI_MIXED

View File

@@ -0,0 +1,71 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 26 Jul 2018 09:13:42 +0200
Subject: [PATCH] arm64: KVM: compute_layout before alternatives are applied
compute_layout() is invoked as part of an alternative fixup under
stop_machine() and takes a sleeping lock as part of get_random_long(), which
is not allowed in that atomic context.
Invoke compute_layout() before the alternatives are applied.
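In call-chain form, the fix moves the sleeping part out of the atomic section (a sketch of the hunks below):

apply_alternatives_all()
  kvm_compute_layout();                                  /* may take a sleeping lock: fine here */
  stop_machine(__apply_alternatives_multi_stop, ...);    /* atomic: no sleeping locks inside */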
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm64/include/asm/alternative.h | 6 ++++++
arch/arm64/kernel/alternative.c | 1 +
arch/arm64/kvm/va_layout.c | 7 +------
3 files changed, 8 insertions(+), 6 deletions(-)
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -35,6 +35,12 @@ void apply_alternatives_module(void *sta
static inline void apply_alternatives_module(void *start, size_t length) { }
#endif
+#ifdef CONFIG_KVM_ARM_HOST
+void kvm_compute_layout(void);
+#else
+static inline void kvm_compute_layout(void) { }
+#endif
+
#define ALTINSTR_ENTRY(feature,cb) \
" .word 661b - .\n" /* label */ \
" .if " __stringify(cb) " == 0\n" \
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -224,6 +224,7 @@ static int __apply_alternatives_multi_st
void __init apply_alternatives_all(void)
{
/* better not try code patching on a live SMP system */
+ kvm_compute_layout();
stop_machine(__apply_alternatives_multi_stop, NULL, cpu_online_mask);
}
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -33,7 +33,7 @@ static u8 tag_lsb;
static u64 tag_val;
static u64 va_mask;
-static void compute_layout(void)
+__init void kvm_compute_layout(void)
{
phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
u64 hyp_va_msb;
@@ -121,8 +121,6 @@ void __init kvm_update_va_mask(struct al
BUG_ON(nr_inst != 5);
- if (!has_vhe() && !va_mask)
- compute_layout();
for (i = 0; i < nr_inst; i++) {
u32 rd, rn, insn, oinsn;
@@ -167,9 +165,6 @@ void kvm_patch_vector_branch(struct alt_
return;
}
- if (!va_mask)
- compute_layout();
-
/*
* Compute HYP VA by using the same computation as kern_hyp_va()
*/

View File

@@ -0,0 +1,95 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 31 Aug 2018 14:16:30 +0200
Subject: [PATCH] of: allocate / free phandle cache outside of the devtree_lock
The phandle cache code allocates memory while holding devtree_lock which
is a raw_spinlock_t. Memory allocation (and free()) is not possible on
RT while a raw_spinlock_t is held.
Invoke the kfree() and kcalloc() while the lock is dropped.
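Reduced to a sketch, the pattern used below is: detach the cache under the raw lock and free it only after the lock is dropped (the `shadow' variable mirrors the one introduced in the hunks):

raw_spin_lock_irqsave(&devtree_lock, flags);
shadow = __of_free_phandle_cache();        /* unhook phandle_cache, return the old pointer */
raw_spin_unlock_irqrestore(&devtree_lock, flags);
kfree(shadow);                             /* freeing is safe again here, even on RT */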
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Cc: devicetree@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/of/base.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -130,31 +130,34 @@ static u32 phandle_cache_mask;
/*
* Caller must hold devtree_lock.
*/
-static void __of_free_phandle_cache(void)
+static struct device_node** __of_free_phandle_cache(void)
{
u32 cache_entries = phandle_cache_mask + 1;
u32 k;
+ struct device_node **shadow;
if (!phandle_cache)
- return;
+ return NULL;
for (k = 0; k < cache_entries; k++)
of_node_put(phandle_cache[k]);
- kfree(phandle_cache);
+ shadow = phandle_cache;
phandle_cache = NULL;
+ return shadow;
}
int of_free_phandle_cache(void)
{
unsigned long flags;
+ struct device_node **shadow;
raw_spin_lock_irqsave(&devtree_lock, flags);
- __of_free_phandle_cache();
+ shadow = __of_free_phandle_cache();
raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
+ kfree(shadow);
return 0;
}
#if !defined(CONFIG_MODULES)
@@ -189,10 +192,11 @@ void of_populate_phandle_cache(void)
u32 cache_entries;
struct device_node *np;
u32 phandles = 0;
+ struct device_node **shadow;
raw_spin_lock_irqsave(&devtree_lock, flags);
- __of_free_phandle_cache();
+ shadow = __of_free_phandle_cache();
for_each_of_allnodes(np)
if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL)
@@ -200,12 +204,14 @@ void of_populate_phandle_cache(void)
if (!phandles)
goto out;
+ raw_spin_unlock_irqrestore(&devtree_lock, flags);
cache_entries = roundup_pow_of_two(phandles);
phandle_cache_mask = cache_entries - 1;
phandle_cache = kcalloc(cache_entries, sizeof(*phandle_cache),
GFP_ATOMIC);
+ raw_spin_lock_irqsave(&devtree_lock, flags);
if (!phandle_cache)
goto out;
@@ -217,6 +223,7 @@ void of_populate_phandle_cache(void)
out:
raw_spin_unlock_irqrestore(&devtree_lock, flags);
+ kfree(shadow);
}
void __init of_core_init(void)

View File

@@ -0,0 +1,91 @@
From: Clark Williams <williams@redhat.com>
Date: Tue, 18 Sep 2018 10:29:31 -0500
Subject: [PATCH] mm/kasan: make quarantine_lock a raw_spinlock_t
The static lock quarantine_lock is used in quarantine.c to protect the
quarantine queue datastructures. It is taken inside quarantine queue
manipulation routines (quarantine_put(), quarantine_reduce() and
quarantine_remove_cache()), with IRQs disabled.
This is not a problem on a stock kernel but is problematic on an RT
kernel where spin locks are sleeping spinlocks, which can sleep and can
not be acquired with disabled interrupts.
Convert the quarantine_lock to a raw spinlock_t. The usage of
quarantine_lock is confined to quarantine.c and the work performed while
the lock is held is limited.
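The problematic nesting, sketched from quarantine_put() below: interrupts are already off when the queue lock is taken, which a sleeping lock on RT cannot tolerate.

local_irq_save(flags);                  /* interrupts disabled ... */
spin_lock(&quarantine_lock);            /* ... sleeping lock on RT: not allowed */
/* after this patch: */
raw_spin_lock(&quarantine_lock);        /* raw lock: fine with IRQs disabled */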
Signed-off-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/kasan/quarantine.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -103,7 +103,7 @@ static int quarantine_head;
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
-static DEFINE_SPINLOCK(quarantine_lock);
+static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
@@ -190,7 +190,7 @@ void quarantine_put(struct kasan_free_me
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- spin_lock(&quarantine_lock);
+ raw_spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -203,7 +203,7 @@ void quarantine_put(struct kasan_free_me
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock(&quarantine_lock);
+ raw_spin_unlock(&quarantine_lock);
}
local_irq_restore(flags);
@@ -230,7 +230,7 @@ void quarantine_reduce(void)
* expected case).
*/
srcu_idx = srcu_read_lock(&remove_cache_srcu);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
@@ -254,7 +254,7 @@ void quarantine_reduce(void)
quarantine_head = 0;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
srcu_read_unlock(&remove_cache_srcu, srcu_idx);
@@ -310,17 +310,17 @@ void quarantine_remove_cache(struct kmem
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
/* Scanning whole quarantine can take a while. */
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
cond_resched();
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);

View File

@@ -0,0 +1,43 @@
From: Paul E. McKenney <paulmck@linux.ibm.com>
Date: Mon, 29 Oct 2018 11:53:01 +0100
Subject: [PATCH] EXP rcu: Revert expedited GP parallelization cleverness
(Commit 258ba8e089db23f760139266c232f01bad73f85c from linux-rcu)
This commit reverts a series of commits starting with fcc635436501 ("rcu:
Make expedited GPs handle CPU 0 being offline") and its successors, thus
queueing each rcu_node structure's expedited grace-period initialization
work on the first CPU of that rcu_node structure.
Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/rcu/tree_exp.h | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -472,7 +472,6 @@ static void sync_rcu_exp_select_node_cpu
static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
smp_call_func_t func)
{
- int cpu;
struct rcu_node *rnp;
trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
@@ -494,13 +493,7 @@ static void sync_rcu_exp_select_cpus(str
continue;
}
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- preempt_disable();
- cpu = cpumask_next(rnp->grplo - 1, cpu_online_mask);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi))
- cpu = WORK_CPU_UNBOUND;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
- preempt_enable();
+ queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
rnp->exp_need_flush = true;
}

View File

@@ -0,0 +1,159 @@
From: He Zhe <zhe.he@windriver.com>
Date: Wed, 19 Dec 2018 16:30:57 +0100
Subject: [PATCH] kmemleak: Turn kmemleak_lock to raw spinlock on RT
kmemleak_lock, as a rwlock on RT, can possibly be held in atomic context and
causes the following BUG.
BUG: scheduling while atomic: migration/15/132/0x00000002
Preemption disabled at:
[<ffffffff8c927c11>] cpu_stopper_thread+0x71/0x100
CPU: 15 PID: 132 Comm: migration/15 Not tainted 4.19.0-rt1-preempt-rt #1
Call Trace:
schedule+0x3d/0xe0
__rt_spin_lock+0x26/0x30
__write_rt_lock+0x23/0x1a0
rt_write_lock+0x2a/0x30
find_and_remove_object+0x1e/0x80
delete_object_full+0x10/0x20
kmemleak_free+0x32/0x50
kfree+0x104/0x1f0
intel_pmu_cpu_dying+0x67/0x70
x86_pmu_dying_cpu+0x1a/0x30
cpuhp_invoke_callback+0x92/0x700
take_cpu_down+0x70/0xa0
multi_cpu_stop+0x62/0xc0
cpu_stopper_thread+0x79/0x100
smpboot_thread_fn+0x20f/0x2d0
kthread+0x121/0x140
And on v4.18 stable tree the following call trace, caused by grabbing
kmemleak_lock again, is also observed.
kernel BUG at kernel/locking/rtmutex.c:1048!
CPU: 5 PID: 689 Comm: mkfs.ext4 Not tainted 4.18.16-rt9-preempt-rt #1
Call Trace:
rt_write_lock+0x2a/0x30
create_object+0x17d/0x2b0
kmemleak_alloc+0x34/0x50
kmem_cache_alloc+0x146/0x220
mempool_alloc_slab+0x15/0x20
mempool_alloc+0x65/0x170
sg_pool_alloc+0x21/0x60
sg_alloc_table_chained+0x8b/0xb0
blk_flush_plug_list+0x204/0x230
schedule+0x87/0xe0
rt_write_lock+0x2a/0x30
create_object+0x17d/0x2b0
kmemleak_alloc+0x34/0x50
__kmalloc_node+0x1cd/0x340
alloc_request_size+0x30/0x70
mempool_alloc+0x65/0x170
get_request+0x4e3/0x8d0
blk_queue_bio+0x153/0x470
generic_make_request+0x1dc/0x3f0
submit_bio+0x49/0x140
kmemleak is an error-detecting feature, so we do not expect the same performance
as without it. As there are no raw rwlock helpers, we turn kmemleak_lock into
a raw spinlock.
Signed-off-by: He Zhe <zhe.he@windriver.com>
Cc: catalin.marinas@arm.com
Cc: bigeasy@linutronix.de
Cc: tglx@linutronix.de
Cc: rostedt@goodmis.org
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lkml.kernel.org/r/1542877459-144382-1-git-send-email-zhe.he@windriver.com
Link: https://lkml.kernel.org/r/20181218150744.GB20197@arrakis.emea.arm.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/kmemleak.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -26,7 +26,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * - kmemleak_lock (raw spinlock): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -197,7 +197,7 @@ static LIST_HEAD(gray_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
-static DEFINE_RWLOCK(kmemleak_lock);
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -491,9 +491,9 @@ static struct kmemleak_object *find_and_
struct kmemleak_object *object;
rcu_read_lock();
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -513,13 +513,13 @@ static struct kmemleak_object *find_and_
unsigned long flags;
struct kmemleak_object *object;
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object) {
rb_erase(&object->rb_node, &object_tree_root);
list_del_rcu(&object->object_list);
}
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -593,7 +593,7 @@ static struct kmemleak_object *create_ob
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
min_addr = min(min_addr, ptr);
max_addr = max(max_addr, ptr + size);
@@ -624,7 +624,7 @@ static struct kmemleak_object *create_ob
list_add_tail_rcu(&object->object_list, &object_list);
out:
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -1310,7 +1310,7 @@ static void scan_block(void *_start, voi
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1367,7 +1367,7 @@ static void scan_block(void *_start, voi
spin_unlock(&object->lock);
}
}
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*

View File

@@ -0,0 +1,127 @@
Date: Fri, 28 Oct 2016 23:05:11 +0200
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
To: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>,
linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
tglx@linutronix.de
Subject: NFSv4: replace seqcount_t with a seqlock_t
The raw_write_seqcount_begin() in nfs4_reclaim_open_state() bugs me
because it maps to preempt_disable() in -RT which I can't have at this
point. So I took a look at the code.
The lockdep part was removed in commit abbec2da13f0 ("NFS: Use
raw_write_seqcount_begin/end int nfs4_reclaim_open_state") because
lockdep complained. The whole seqcount thing was introduced in commit
c137afabe330 ("NFSv4: Allow the state manager to mark an open_owner as
being recovered").
The recovery thread runs only once.
write_seqlock() does not work on !RT because it disables preemption, and the
writer side here is preemptible (it has to remain so despite the fact that it
will block readers).
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/nfs/delegation.c | 4 ++--
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 4 ++--
fs/nfs/nfs4state.c | 22 ++++++++++++++++------
4 files changed, 21 insertions(+), 11 deletions(-)
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -152,11 +152,11 @@ static int nfs_delegation_claim_opens(st
sp = state->owner;
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ seq = read_seqbegin(&sp->so_reclaim_seqlock);
err = nfs4_open_delegation_recall(ctx, state, stateid, type);
if (!err)
err = nfs_delegation_claim_locks(ctx, state, stateid);
- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
put_nfs_open_context(ctx);
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -114,7 +114,7 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- seqcount_t so_reclaim_seqcount;
+ seqlock_t so_reclaim_seqlock;
struct mutex so_delegreturn_mutex;
};
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2859,7 +2859,7 @@ static int _nfs4_open_and_get_state(stru
unsigned int seq;
int ret;
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
ret = _nfs4_proc_open(opendata, ctx);
if (ret != 0)
@@ -2900,7 +2900,7 @@ static int _nfs4_open_and_get_state(stru
if (d_inode(dentry) == state->inode) {
nfs_inode_attach_open_context(ctx);
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ if (read_seqretry(&sp->so_reclaim_seqlock, seq))
nfs4_schedule_stateid_recovery(server, state);
}
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -511,7 +511,7 @@ nfs4_alloc_state_owner(struct nfs_server
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
- seqcount_init(&sp->so_reclaim_seqcount);
+ seqlock_init(&sp->so_reclaim_seqlock);
mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -1564,8 +1564,12 @@ static int nfs4_reclaim_open_state(struc
* recovering after a network partition or a reboot from a
* server that doesn't support a grace period.
*/
+#ifdef CONFIG_PREEMPT_RT_FULL
+ write_seqlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
+#endif
spin_lock(&sp->so_lock);
- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1652,14 +1656,20 @@ static int nfs4_reclaim_open_state(struc
spin_lock(&sp->so_lock);
goto restart;
}
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
return 0;
out_err:
nfs4_put_open_state(state);
- spin_lock(&sp->so_lock);
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
- spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
return status;
}

View File

@@ -0,0 +1,733 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 4 Apr 2017 12:50:16 +0200
Subject: [PATCH] kernel: sched: Provide a pointer to the valid CPU mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
In commit 4b53a3412d66 ("sched/core: Remove the tsk_nr_cpus_allowed()
wrapper") the tsk_nr_cpus_allowed() wrapper was removed. There was not
much difference in !RT but in RT we used this to implement
migrate_disable(). Within a migrate_disable() section the CPU mask is
restricted to a single CPU while the "normal" CPU mask remains untouched.
As an alternative implementation Ingo suggested to use
struct task_struct {
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
};
with
t->cpus_ptr = &t->cpus_mask;
In -RT we then can switch the cpus_ptr to
t->cpus_ptr = cpumask_of(task_cpu(p));
in a migration disabled region. The rules are simple:
- Code that 'uses' ->cpus_allowed would use the pointer.
- Code that 'modifies' ->cpus_allowed would use the direct mask.
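A minimal sketch of the two rules, using an illustrative task pointer `p' and helper (the real conversions follow in the hunks below):

/* 'uses' the mask: read through the pointer */
if (cpumask_test_cpu(cpu, p->cpus_ptr))
        do_something();                         /* illustrative only */

/* 'modifies' the mask: write the real mask, never through the pointer */
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);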
While converting the existing users I tried to stick with the rules
above, however… well, mostly CPUFREQ tries to temporarily switch the CPU
mask to do something on a certain CPU and then switches the mask back to
its original value. So in theory `cpus_ptr' could or should be used.
However if this is invoked in a migration disabled region (which is not
the case because it would require something like preempt_disable() and
set_cpus_allowed_ptr() might sleep so it can't be) then the "restore"
part would restore the wrong mask. So it only looks strange and I go for
the pointer…
Some drivers copy the cpumask without cpumask_copy() and others use
cpumask_copy but without alloc_cpumask_var(). I did not fix those as
part of this, could do this as a follow up…
So is this the way we want it?
Is the usage of `cpus_ptr' vs `cpus_mask' for the set + restore part
(see cpufreq users) what we want? At some point it looks like they
should use a different interface for what they are doing. I am not sure why
switching to a certain CPU is important, but maybe it could be done via a
workqueue from the CPUFREQ core (so we have a comment describing why we are
doing this and a get_online_cpus() to ensure that the CPU does not go
offline too early).
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/ia64/kernel/mca.c | 2 -
arch/mips/include/asm/switch_to.h | 4 +-
arch/mips/kernel/mips-mt-fpaff.c | 2 -
arch/mips/kernel/traps.c | 6 ++--
arch/powerpc/platforms/cell/spufs/sched.c | 2 -
arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c | 2 -
drivers/infiniband/hw/hfi1/affinity.c | 6 ++--
drivers/infiniband/hw/hfi1/sdma.c | 3 --
drivers/infiniband/hw/qib/qib_file_ops.c | 7 ++--
fs/proc/array.c | 4 +-
include/linux/sched.h | 5 ++-
init/init_task.c | 3 +-
kernel/cgroup/cpuset.c | 2 -
kernel/fork.c | 2 +
kernel/sched/core.c | 40 ++++++++++++++--------------
kernel/sched/cpudeadline.c | 4 +-
kernel/sched/cpupri.c | 4 +-
kernel/sched/deadline.c | 6 ++--
kernel/sched/fair.c | 32 +++++++++++-----------
kernel/sched/rt.c | 4 +-
kernel/trace/trace_hwlat.c | 2 -
lib/smp_processor_id.c | 2 -
samples/trace_events/trace-events-sample.c | 2 -
23 files changed, 74 insertions(+), 72 deletions(-)
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1824,7 +1824,7 @@ format_mca_init_stack(void *mca_data, un
ti->cpu = cpu;
p->stack = ti;
p->state = TASK_UNINTERRUPTIBLE;
- cpumask_set_cpu(cpu, &p->cpus_allowed);
+ cpumask_set_cpu(cpu, &p->cpus_mask);
INIT_LIST_HEAD(&p->tasks);
p->parent = p->real_parent = p->group_leader = p;
INIT_LIST_HEAD(&p->children);
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
* inline to try to keep the overhead down. If we have been forced to run on
* a "CPU" with an FPU because of a previous high level of FP computation,
* but did not actually use the FPU during the most recent time-slice (CU1
- * isn't set), we undo the restriction on cpus_allowed.
+ * isn't set), we undo the restriction on cpus_mask.
*
* We're not calling set_cpus_allowed() here, because we have no need to
* force prompt migration - we're already switching the current CPU to a
@@ -57,7 +57,7 @@ do { \
test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
(!(KSTK_STATUS(prev) & ST0_CU1))) { \
clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
- prev->cpus_allowed = prev->thread.user_cpus_allowed; \
+ prev->cpus_mask = prev->thread.user_cpus_allowed; \
} \
next->thread.emulated_fp = 0; \
} while(0)
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffi
if (retval)
goto out_unlock;
- cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed);
+ cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
cpumask_and(&mask, &allowed, cpu_active_mask);
out_unlock:
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1174,12 +1174,12 @@ static void mt_ase_fp_affinity(void)
* restricted the allowed set to exclude any CPUs with FPUs,
* we'll skip the procedure.
*/
- if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) {
+ if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
cpumask_t tmask;
current->thread.user_cpus_allowed
- = current->cpus_allowed;
- cpumask_and(&tmask, &current->cpus_allowed,
+ = current->cpus_mask;
+ cpumask_and(&tmask, &current->cpus_mask,
&mt_fpu_cpumask);
set_cpus_allowed_ptr(current, &tmask);
set_thread_flag(TIF_FPUBOUND);
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -141,7 +141,7 @@ void __spu_update_sched_info(struct spu_
* runqueue. The context will be rescheduled on the proper node
* if it is timesliced or preempted.
*/
- cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed);
+ cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
/* Save the current cpu id for spu interrupt routing. */
ctx->last_ran = raw_smp_processor_id();
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -1435,7 +1435,7 @@ static int pseudo_lock_dev_mmap(struct f
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
- if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -1037,7 +1037,7 @@ int hfi1_get_proc_affinity(int node)
struct hfi1_affinity_node *entry;
cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
const struct cpumask *node_mask,
- *proc_mask = &current->cpus_allowed;
+ *proc_mask = current->cpus_ptr;
struct hfi1_affinity_node_list *affinity = &node_affinity;
struct cpu_mask_set *set = &affinity->proc;
@@ -1045,7 +1045,7 @@ int hfi1_get_proc_affinity(int node)
* check whether process/context affinity has already
* been set
*/
- if (cpumask_weight(proc_mask) == 1) {
+ if (current->nr_cpus_allowed == 1) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
@@ -1056,7 +1056,7 @@ int hfi1_get_proc_affinity(int node)
cpu = cpumask_first(proc_mask);
cpumask_set_cpu(cpu, &set->used);
goto done;
- } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+ } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -855,14 +855,13 @@ struct sdma_engine *sdma_select_user_eng
{
struct sdma_rht_node *rht_node;
struct sdma_engine *sde = NULL;
- const struct cpumask *current_mask = &current->cpus_allowed;
unsigned long cpu_id;
/*
* To ensure that always the same sdma engine(s) will be
* selected make sure the process is pinned to this CPU only.
*/
- if (cpumask_weight(current_mask) != 1)
+ if (current->nr_cpus_allowed != 1)
goto out;
cpu_id = smp_processor_id();
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp
static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
{
struct qib_filedata *fd = fp->private_data;
- const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+ const unsigned int weight = current->nr_cpus_allowed;
const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
int local_cpu;
@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *
ret = find_free_ctxt(i_minor - 1, fp, uinfo);
else {
int unit;
- const unsigned int cpu = cpumask_first(&current->cpus_allowed);
- const unsigned int weight =
- cpumask_weight(&current->cpus_allowed);
+ const unsigned int cpu = cpumask_first(current->cpus_ptr);
+ const unsigned int weight = current->nr_cpus_allowed;
if (weight == 1 && !test_bit(cpu, qib_cpulist))
if (!find_hca(cpu, &unit) && unit >= 0)
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -381,9 +381,9 @@ static inline void task_context_switch_c
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Cpus_allowed:\t%*pb\n",
- cpumask_pr_args(&task->cpus_allowed));
+ cpumask_pr_args(task->cpus_ptr));
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
- cpumask_pr_args(&task->cpus_allowed));
+ cpumask_pr_args(task->cpus_ptr));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -660,7 +660,8 @@ struct task_struct {
unsigned int policy;
int nr_cpus_allowed;
- cpumask_t cpus_allowed;
+ const cpumask_t *cpus_ptr;
+ cpumask_t cpus_mask;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
@@ -1390,7 +1391,7 @@ extern struct pid *cad_pid;
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
-#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
+#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -71,7 +71,8 @@ struct task_struct init_task
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
- .cpus_allowed = CPU_MASK_ALL,
+ .cpus_ptr = &init_task.cpus_mask,
+ .cpus_mask = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2090,7 +2090,7 @@ static void cpuset_fork(struct task_stru
if (task_css_is_root(task, cpuset_cgrp_id))
return;
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -845,6 +845,8 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -876,7 +876,7 @@ static inline bool is_per_cpu_kthread(st
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
@@ -971,7 +971,7 @@ static int migration_cpu_stop(void *data
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1002,7 +1002,7 @@ static int migration_cpu_stop(void *data
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- cpumask_copy(&p->cpus_allowed, new_mask);
+ cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
@@ -1072,7 +1072,7 @@ static int __set_cpus_allowed_ptr(struct
goto out;
}
- if (cpumask_equal(&p->cpus_allowed, new_mask))
+ if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1235,10 +1235,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1280,10 +1280,10 @@ int migrate_swap(struct task_struct *cur
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1428,7 +1428,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1468,14 +1468,14 @@ static int select_fallback_rq(int cpu, s
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1519,7 +1519,7 @@ static int select_fallback_rq(int cpu, s
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1529,11 +1529,11 @@ int select_task_rq(struct task_struct *p
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(&p->cpus_allowed);
+ cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -2400,7 +2400,7 @@ void wake_up_new_task(struct task_struct
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
+ * - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -4273,7 +4273,7 @@ static int __sched_setscheduler(struct t
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
- if (!cpumask_subset(span, &p->cpus_allowed) ||
+ if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -4872,7 +4872,7 @@ long sched_getaffinity(pid_t pid, struct
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5452,7 +5452,7 @@ int task_can_attach(struct task_struct *
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5479,7 +5479,7 @@ int migrate_task_to(struct task_struct *
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5617,7 +5617,7 @@ static void migrate_tasks(struct rq *dea
put_prev_task(rq, next);
/*
- * Rules for changing task_struct::cpus_allowed are holding
+ * Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -124,14 +124,14 @@ int cpudl_find(struct cpudl *cp, struct
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -98,11 +98,11 @@ int cpupri_find(struct cpupri *cp, struc
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -539,7 +539,7 @@ static struct rq *dl_task_offline_migrat
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(str
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1630,7 +1630,7 @@ static void task_numa_compare(struct tas
* be incurred if the tasks were swapped.
*/
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
@@ -1727,7 +1727,7 @@ static void task_numa_find_cpu(struct ta
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
@@ -5712,7 +5712,7 @@ find_idlest_group(struct sched_domain *s
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
- &p->cpus_allowed))
+ p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5844,7 +5844,7 @@ find_idlest_group_cpu(struct sched_group
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5884,7 +5884,7 @@ static inline int find_idlest_cpu(struct
{
int new_cpu = cpu;
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
@@ -6001,7 +6001,7 @@ static int select_idle_core(struct task_
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
@@ -6035,7 +6035,7 @@ static int select_idle_smt(struct task_s
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -6098,7 +6098,7 @@ static int select_idle_cpu(struct task_s
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
@@ -6135,7 +6135,7 @@ static int select_idle_sibling(struct ta
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
available_idle_cpu(recent_used_cpu) &&
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6353,7 +6353,7 @@ select_task_rq_fair(struct task_struct *
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, &p->cpus_allowed);
+ && cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -7092,14 +7092,14 @@ int can_migrate_task(struct task_struct
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7119,7 +7119,7 @@ int can_migrate_task(struct task_struct
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7716,7 +7716,7 @@ check_cpu_capacity(struct rq *rq, struct
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8331,7 +8331,7 @@ static struct sched_group *find_busiest_
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
@@ -8727,7 +8727,7 @@ static int load_balance(int this_cpu, st
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1611,7 +1611,7 @@ static void put_prev_task_rt(struct rq *
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1748,7 +1748,7 @@ static struct rq *find_lock_lowest_rq(st
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
* of this thread, than stop migrating for the duration
* of the current test.
*/
- if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -22,7 +22,7 @@ notrace static unsigned int check_preemp
* Kernel threads bound to a single CPU can safely use
* smp_processor_id():
*/
- if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu)))
+ if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
goto out;
/*
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -33,7 +33,7 @@ static void simple_thread_func(int cnt)
/* Silly tracepoints */
trace_foo_bar("hello", cnt, array, random_strings[len],
- &current->cpus_allowed);
+ current->cpus_ptr);
trace_foo_with_template_simple("HELLO", cnt);
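The hunks above convert every reader from the embedded cpus_allowed mask to the new cpus_ptr pointer, which normally points at the task's own cpus_mask (see the init_task and fork hunks). A minimal stand-alone sketch of that indirection follows; the struct and helper names are simplified stand-ins, not kernel code.

/* Standalone model of the cpus_ptr indirection; illustrative only. */
#include <stdio.h>

struct cpumask { unsigned long bits; };

struct task {
	const struct cpumask *cpus_ptr;	/* what all readers dereference */
	struct cpumask cpus_mask;	/* the persistent affinity mask */
};

static int cpumask_test_cpu(int cpu, const struct cpumask *m)
{
	return (m->bits >> cpu) & 1UL;
}

int main(void)
{
	struct task t = { .cpus_mask = { .bits = 0xffUL } };
	struct cpumask only_cpu2 = { .bits = 1UL << 2 };

	t.cpus_ptr = &t.cpus_mask;	/* normal case, as in the init_task hunk */
	printf("cpu3 allowed: %d\n", cpumask_test_cpu(3, t.cpus_ptr));

	/* A later patch lets migrate_disable() repoint cpus_ptr at a
	 * single-CPU mask without touching the persistent cpus_mask. */
	t.cpus_ptr = &only_cpu2;
	printf("cpu3 allowed while pinned: %d\n",
	       cpumask_test_cpu(3, t.cpus_ptr));
	return 0;
}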

View File

@@ -0,0 +1,251 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 27 May 2017 19:02:06 +0200
Subject: kernel/sched/core: add migrate_disable()
---
include/linux/preempt.h | 23 ++++++++
include/linux/sched.h | 7 ++
include/linux/smp.h | 3 +
kernel/sched/core.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/debug.c | 4 +
5 files changed, 165 insertions(+), 2 deletions(-)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -185,6 +185,22 @@ do { \
#define preemptible() (preempt_count() == 0 && !irqs_disabled())
+#ifdef CONFIG_SMP
+
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+int __migrate_disabled(struct task_struct *p);
+
+#else
+#define migrate_disable() barrier()
+#define migrate_enable() barrier()
+static inline int __migrate_disabled(struct task_struct *p)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_PREEMPT
#define preempt_enable() \
do { \
@@ -253,6 +269,13 @@ do { \
#define preempt_enable_notrace() barrier()
#define preemptible() 0
+#define migrate_disable() barrier()
+#define migrate_enable() barrier()
+
+static inline int __migrate_disabled(struct task_struct *p)
+{
+ return 0;
+}
#endif /* CONFIG_PREEMPT_COUNT */
#ifdef MODULE
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -662,6 +662,13 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+ int migrate_disable;
+ int migrate_disable_update;
+# ifdef CONFIG_SCHED_DEBUG
+ int migrate_disable_atomic;
+# endif
+#endif
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -202,6 +202,9 @@ static inline int get_boot_cpu_id(void)
#define get_cpu() ({ preempt_disable(); smp_processor_id(); })
#define put_cpu() preempt_enable()
+#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
+#define put_cpu_light() migrate_enable()
+
/*
* Callback to arch code if there's nosmp or maxcpus=0 on the
* boot command line:
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1006,7 +1006,15 @@ void set_cpus_allowed_common(struct task
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+int __migrate_disabled(struct task_struct *p)
+{
+ return p->migrate_disable;
+}
+#endif
+
+static void __do_set_cpus_allowed_tail(struct task_struct *p,
+ const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
bool queued, running;
@@ -1035,6 +1043,20 @@ void do_set_cpus_allowed(struct task_str
set_curr_task(rq, p);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+ if (__migrate_disabled(p)) {
+ lockdep_assert_held(&p->pi_lock);
+
+ cpumask_copy(&p->cpus_mask, new_mask);
+ p->migrate_disable_update = 1;
+ return;
+ }
+#endif
+ __do_set_cpus_allowed_tail(p, new_mask);
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1093,9 +1115,16 @@ static int __set_cpus_allowed_ptr(struct
}
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
+ if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
goto out;
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+ if (__migrate_disabled(p)) {
+ p->migrate_disable_update = 1;
+ goto out;
+ }
+#endif
+
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
@@ -7058,3 +7087,100 @@ const u32 sched_prio_to_wmult[40] = {
};
#undef CREATE_TRACE_POINTS
+
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (in_atomic() || irqs_disabled()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic++;
+#endif
+ return;
+ }
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(p->migrate_disable_atomic);
+#endif
+
+ if (p->migrate_disable) {
+ p->migrate_disable++;
+ return;
+ }
+
+ preempt_disable();
+ p->migrate_disable = 1;
+
+ p->cpus_ptr = cpumask_of(smp_processor_id());
+ p->nr_cpus_allowed = 1;
+
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+ if (in_atomic() || irqs_disabled()) {
+#ifdef CONFIG_SCHED_DEBUG
+ p->migrate_disable_atomic--;
+#endif
+ return;
+ }
+
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(p->migrate_disable_atomic);
+#endif
+
+ WARN_ON_ONCE(p->migrate_disable <= 0);
+ if (p->migrate_disable > 1) {
+ p->migrate_disable--;
+ return;
+ }
+
+ preempt_disable();
+
+ p->cpus_ptr = &p->cpus_mask;
+ p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
+ p->migrate_disable = 0;
+
+ if (p->migrate_disable_update) {
+ struct rq *rq;
+ struct rq_flags rf;
+
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ __do_set_cpus_allowed_tail(p, &p->cpus_mask);
+ task_rq_unlock(rq, p, &rf);
+
+ p->migrate_disable_update = 0;
+
+ WARN_ON(smp_processor_id() != task_cpu(p));
+ if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ struct migration_arg arg;
+ unsigned int dest_cpu;
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
+ dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask);
+ arg.task = p;
+ arg.dest_cpu = dest_cpu;
+
+ preempt_enable();
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return;
+ }
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+#endif
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -978,6 +978,10 @@ void proc_sched_show_task(struct task_st
P(dl.runtime);
P(dl.deadline);
}
+#if defined(CONFIG_PREEMPT_COUNT) && defined(CONFIG_SMP)
+ P(migrate_disable);
+#endif
+ P(nr_cpus_allowed);
#undef PN_SCHEDSTAT
#undef PN
#undef __PN
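The helpers added above (migrate_disable()/migrate_enable() and the get_cpu_light()/put_cpu_light() wrappers) are meant to replace preempt-disable sections that only need CPU stability. A hedged sketch of the intended usage; the function name is hypothetical and any per-CPU data touched in such a window still needs its own serialization:

/* Illustrative only: CPU-stable but still preemptible section. */
#include <linux/preempt.h>
#include <linux/smp.h>

static int demo_current_cpu_stable(void)
{
	int cpu;

	cpu = get_cpu_light();	/* migrate_disable() + smp_processor_id() */
	/*
	 * 'cpu' stays valid here even though the task remains preemptible;
	 * per-CPU data accessed in this window still needs a (local) lock
	 * against other tasks running on the same CPU.
	 */
	put_cpu_light();	/* migrate_enable() */
	return cpu;
}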

View File

@@ -0,0 +1,31 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Oct 2018 17:34:50 +0200
Subject: [PATCH] sched/migrate_disable: Add export_symbol_gpl for
__migrate_disabled
Jonathan reported that lttng/modules can't use __migrate_disabled().
This function is only used by sched/core itself and the tracing
infrastructure to report the migrate counter (lttng probably does the
same). Since the rework of migrate_disable() it moved from sched.h to
preempt.h and became an exported function instead of a "static
inline" due to the header recursion of preempt vs sched.
Since the compiler inlines the function for sched/core usage, add an
EXPORT_SYMBOL_GPL to allow the module/LTTNG usage.
Reported-by: Jonathan Rajott <jonathan.rajotte-julien@efficios.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
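A hypothetical sketch of a module-side user of the now-exported __migrate_disabled(); this is not part of the patch, and the module name is made up for illustration:

/* Illustrative out-of-tree module reading the migrate counter. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/preempt.h>
#include <linux/sched.h>

static int __init migdis_demo_init(void)
{
	pr_info("migrate_disable depth of insmod task: %d\n",
		__migrate_disabled(current));
	return 0;
}

static void __exit migdis_demo_exit(void)
{
}

module_init(migdis_demo_init);
module_exit(migdis_demo_exit);
MODULE_LICENSE("GPL");	/* required, the symbol is EXPORT_SYMBOL_GPL */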
---
kernel/sched/core.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1011,6 +1011,7 @@ int __migrate_disabled(struct task_struc
{
return p->migrate_disable;
}
+EXPORT_SYMBOL_GPL(__migrate_disabled);
#endif
static void __do_set_cpus_allowed_tail(struct task_struct *p,

View File

@@ -0,0 +1,91 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 09 Mar 2016 10:51:06 +0100
Subject: arm: at91: do not disable/enable clocks in a row
Currently the driver will disable the clock and enable it one line later
if it is switching from periodic mode into one shot.
This back-to-back disable/enable can be avoided; it also causes a needless warning on -RT.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/clocksource/tcb_clksrc.c | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -126,6 +126,7 @@ static struct clocksource clksrc = {
struct tc_clkevt_device {
struct clock_event_device clkevt;
struct clk *clk;
+ bool clk_enabled;
void __iomem *regs;
};
@@ -143,6 +144,24 @@ static struct tc_clkevt_device *to_tc_cl
*/
static u32 timer_clock;
+static void tc_clk_disable(struct clock_event_device *d)
+{
+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
+
+ clk_disable(tcd->clk);
+ tcd->clk_enabled = false;
+}
+
+static void tc_clk_enable(struct clock_event_device *d)
+{
+ struct tc_clkevt_device *tcd = to_tc_clkevt(d);
+
+ if (tcd->clk_enabled)
+ return;
+ clk_enable(tcd->clk);
+ tcd->clk_enabled = true;
+}
+
static int tc_shutdown(struct clock_event_device *d)
{
struct tc_clkevt_device *tcd = to_tc_clkevt(d);
@@ -150,8 +169,14 @@ static int tc_shutdown(struct clock_even
writel(0xff, regs + ATMEL_TC_REG(2, IDR));
writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
+ return 0;
+}
+
+static int tc_shutdown_clk_off(struct clock_event_device *d)
+{
+ tc_shutdown(d);
if (!clockevent_state_detached(d))
- clk_disable(tcd->clk);
+ tc_clk_disable(d);
return 0;
}
@@ -164,7 +189,7 @@ static int tc_set_oneshot(struct clock_e
if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
tc_shutdown(d);
- clk_enable(tcd->clk);
+ tc_clk_enable(d);
/* slow clock, count up to RC, then irq and stop */
writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
@@ -186,7 +211,7 @@ static int tc_set_periodic(struct clock_
/* By not making the gentime core emulate periodic mode on top
* of oneshot, we get lower overhead and improved accuracy.
*/
- clk_enable(tcd->clk);
+ tc_clk_enable(d);
/* slow clock, count up to RC, then irq and restart */
writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
@@ -220,7 +245,7 @@ static struct tc_clkevt_device clkevt =
/* Should be lower than at91rm9200's system timer */
.rating = 125,
.set_next_event = tc_next_event,
- .set_state_shutdown = tc_shutdown,
+ .set_state_shutdown = tc_shutdown_clk_off,
.set_state_periodic = tc_set_periodic,
.set_state_oneshot = tc_set_oneshot,
},

View File

@@ -0,0 +1,157 @@
From: Benedikt Spranger <b.spranger@linutronix.de>
Date: Mon, 8 Mar 2010 18:57:04 +0100
Subject: clocksource: TCLIB: Allow higher clock rates for clock events
By default the TCLIB uses the 32 KiHz base clock rate for clock events.
Add a compile-time selection to allow a higher clock resolution.
(fixed up by Sami Pietikäinen <Sami.Pietikainen@wapice.com>)
Signed-off-by: Benedikt Spranger <b.spranger@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
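For reference, tc_set_periodic() below programs RC as (freq + HZ/2)/HZ, and the 16-bit counter bounds the longest one-shot delay to 0xffff/freq seconds (hence the "up to two seconds" comment removed below). A small arithmetic sketch; HZ and the divided clock rate are assumptions, not values from the patch:

/* Illustrative arithmetic only. */
#include <stdio.h>

int main(void)
{
	unsigned int hz = 100;			/* example kernel tick rate */
	unsigned int slow = 32768;		/* 32 KiHz slow clock */
	unsigned int fast = 132000000 / 32;	/* e.g. a divided master clock */

	/* RC register value as programmed by tc_set_periodic() */
	printf("RC @ 32 KiHz: %u\n", (slow + hz / 2) / hz);
	printf("RC @ fast   : %u\n", (fast + hz / 2) / hz);

	/* 16-bit counter: longest one-shot delay is 0xffff / freq seconds */
	printf("max delay @ 32 KiHz: %.1f s\n", 65535.0 / slow);
	printf("max delay @ fast   : %.4f s\n", 65535.0 / fast);
	return 0;
}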
---
drivers/clocksource/tcb_clksrc.c | 36 +++++++++++++++++++++---------------
drivers/misc/Kconfig | 12 ++++++++++--
2 files changed, 31 insertions(+), 17 deletions(-)
--- a/drivers/clocksource/tcb_clksrc.c
+++ b/drivers/clocksource/tcb_clksrc.c
@@ -25,8 +25,7 @@
* this 32 bit free-running counter. the second channel is not used.
*
* - The third channel may be used to provide a 16-bit clockevent
- * source, used in either periodic or oneshot mode. This runs
- * at 32 KiHZ, and can handle delays of up to two seconds.
+ * source, used in either periodic or oneshot mode.
*
* A boot clocksource and clockevent source are also currently needed,
* unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
@@ -127,6 +126,7 @@ struct tc_clkevt_device {
struct clock_event_device clkevt;
struct clk *clk;
bool clk_enabled;
+ u32 freq;
void __iomem *regs;
};
@@ -135,13 +135,6 @@ static struct tc_clkevt_device *to_tc_cl
return container_of(clkevt, struct tc_clkevt_device, clkevt);
}
-/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
- * because using one of the divided clocks would usually mean the
- * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
- *
- * A divided clock could be good for high resolution timers, since
- * 30.5 usec resolution can seem "low".
- */
static u32 timer_clock;
static void tc_clk_disable(struct clock_event_device *d)
@@ -191,7 +184,7 @@ static int tc_set_oneshot(struct clock_e
tc_clk_enable(d);
- /* slow clock, count up to RC, then irq and stop */
+ /* count up to RC, then irq and stop */
writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
@@ -213,10 +206,10 @@ static int tc_set_periodic(struct clock_
*/
tc_clk_enable(d);
- /* slow clock, count up to RC, then irq and restart */
+ /* count up to RC, then irq and restart */
writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
regs + ATMEL_TC_REG(2, CMR));
- writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
+ writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
/* Enable clock and interrupts on RC compare */
writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
@@ -243,7 +236,11 @@ static struct tc_clkevt_device clkevt =
.features = CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_ONESHOT,
/* Should be lower than at91rm9200's system timer */
+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
.rating = 125,
+#else
+ .rating = 200,
+#endif
.set_next_event = tc_next_event,
.set_state_shutdown = tc_shutdown_clk_off,
.set_state_periodic = tc_set_periodic,
@@ -265,8 +262,9 @@ static irqreturn_t ch2_irq(int irq, void
return IRQ_NONE;
}
-static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
+static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
{
+ unsigned divisor = atmel_tc_divisors[divisor_idx];
int ret;
struct clk *t2_clk = tc->clk[2];
int irq = tc->irq[2];
@@ -287,7 +285,11 @@ static int __init setup_clkevents(struct
clkevt.regs = tc->regs;
clkevt.clk = t2_clk;
- timer_clock = clk32k_divisor_idx;
+ timer_clock = divisor_idx;
+ if (!divisor)
+ clkevt.freq = 32768;
+ else
+ clkevt.freq = clk_get_rate(t2_clk) / divisor;
clkevt.clkevt.cpumask = cpumask_of(0);
@@ -298,7 +300,7 @@ static int __init setup_clkevents(struct
return ret;
}
- clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
+ clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
return ret;
}
@@ -435,7 +437,11 @@ static int __init tcb_clksrc_init(void)
goto err_disable_t1;
/* channel 2: periodic and oneshot timer support */
+#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
ret = setup_clkevents(tc, clk32k_divisor_idx);
+#else
+ ret = setup_clkevents(tc, best_divisor_idx);
+#endif
if (ret)
goto err_unregister_clksrc;
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -69,8 +69,7 @@ config ATMEL_TCB_CLKSRC
are combined to make a single 32-bit timer.
When GENERIC_CLOCKEVENTS is defined, the third timer channel
- may be used as a clock event device supporting oneshot mode
- (delays of up to two seconds) based on the 32 KiHz clock.
+ may be used as a clock event device supporting oneshot mode.
config ATMEL_TCB_CLKSRC_BLOCK
int
@@ -83,6 +82,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
TC can be used for other purposes, such as PWM generation and
interval timing.
+config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
+ bool "TC Block use 32 KiHz clock"
+ depends on ATMEL_TCB_CLKSRC
+ default y
+ help
+ Select this to use 32 KiHz base clock rate as TC block clock
+ source for clock events.
+
+
config DUMMY_IRQ
tristate "Dummy IRQ handler"
default n

View File

@@ -0,0 +1,156 @@
Subject: timekeeping: Split jiffies seqlock
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Feb 2013 22:36:59 +0100
Replace the jiffies_lock seqlock with a plain seqcount and a raw spinlock so
it can be taken in atomic context on RT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
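A condensed sketch of the resulting locking pattern, using jiffies_seq and jiffies_lock as introduced below (kernel context assumed; the real writers call do_timer()):

/* Illustrative reader/writer pattern after the conversion. */
#include <linux/jiffies.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>

extern raw_spinlock_t jiffies_lock;	/* declared in timekeeping.h below */
extern seqcount_t jiffies_seq;

static u64 demo_get_jiffies_64(void)
{
	unsigned int seq;
	u64 ret;

	do {					/* lockless, retry on writer overlap */
		seq = read_seqcount_begin(&jiffies_seq);
		ret = jiffies_64;
	} while (read_seqcount_retry(&jiffies_seq, seq));
	return ret;
}

static void demo_do_tick(void)
{
	raw_spin_lock(&jiffies_lock);		/* raw lock: legal in atomic context on RT */
	write_seqcount_begin(&jiffies_seq);
	jiffies_64 += 1;			/* stands in for do_timer(1) */
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
}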
---
kernel/time/jiffies.c | 7 ++++---
kernel/time/tick-common.c | 10 ++++++----
kernel/time/tick-sched.c | 19 ++++++++++++-------
kernel/time/timekeeping.c | 6 ++++--
kernel/time/timekeeping.h | 3 ++-
5 files changed, 28 insertions(+), 17 deletions(-)
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -74,7 +74,8 @@ static struct clocksource clocksource_ji
.max_cycles = 10,
};
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
@@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
ret = jiffies_64;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_ev
ktime_t next;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
next = tick_next_period;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -67,7 +67,8 @@ static void tick_do_update_jiffies64(kti
return;
/* Reevaluate with jiffies_lock held */
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, last_jiffies_update);
if (delta >= tick_period) {
@@ -90,10 +91,12 @@ static void tick_do_update_jiffies64(kti
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return;
}
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -104,12 +107,14 @@ static ktime_t tick_init_jiffy_update(vo
{
ktime_t period;
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return period;
}
@@ -652,10 +657,10 @@ static ktime_t tick_nohz_next_event(stru
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2417,8 +2417,10 @@ EXPORT_SYMBOL(hardpps);
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
do_timer(ticks);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -18,7 +18,8 @@ extern void timekeeping_resume(void);
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;
#define CS_NAME_LEN 32

View File

@@ -0,0 +1,31 @@
Subject: signal: Revert ptrace preempt magic
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 21 Sep 2011 19:57:12 +0200
Upstream commit '53da1d9456fe7f8 fix ptrace slowness' is nothing more
than a bandaid around the ptrace design trainwreck. It's not a
correctness issue, it's merely a cosmetic bandaid.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/signal.c | 8 --------
1 file changed, 8 deletions(-)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2094,15 +2094,7 @@ static void ptrace_stop(int exit_code, i
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
- preempt_enable_no_resched();
freezable_schedule();
} else {
/*

View File

@@ -0,0 +1,57 @@
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 5 Mar 2014 00:49:47 +0100
Subject: net: sched: Use msleep() instead of yield()
On PREEMPT_RT enabled systems interrupt handlers run as threads at prio 50
(by default). If a high priority userspace process tries to shut down a busy
network interface it might spin in a yield loop waiting for the device to
become idle. With the interrupt thread having a lower priority than the
looping process it might never be scheduled, resulting in a deadlock on UP
systems.
With Magic SysRq the following backtrace can be produced:
> test_app R running 0 174 168 0x00000000
> [<c02c7070>] (__schedule+0x220/0x3fc) from [<c02c7870>] (preempt_schedule_irq+0x48/0x80)
> [<c02c7870>] (preempt_schedule_irq+0x48/0x80) from [<c0008fa8>] (svc_preempt+0x8/0x20)
> [<c0008fa8>] (svc_preempt+0x8/0x20) from [<c001a984>] (local_bh_enable+0x18/0x88)
> [<c001a984>] (local_bh_enable+0x18/0x88) from [<c025316c>] (dev_deactivate_many+0x220/0x264)
> [<c025316c>] (dev_deactivate_many+0x220/0x264) from [<c023be04>] (__dev_close_many+0x64/0xd4)
> [<c023be04>] (__dev_close_many+0x64/0xd4) from [<c023be9c>] (__dev_close+0x28/0x3c)
> [<c023be9c>] (__dev_close+0x28/0x3c) from [<c023f7f0>] (__dev_change_flags+0x88/0x130)
> [<c023f7f0>] (__dev_change_flags+0x88/0x130) from [<c023f904>] (dev_change_flags+0x10/0x48)
> [<c023f904>] (dev_change_flags+0x10/0x48) from [<c024c140>] (do_setlink+0x370/0x7ec)
> [<c024c140>] (do_setlink+0x370/0x7ec) from [<c024d2f0>] (rtnl_newlink+0x2b4/0x450)
> [<c024d2f0>] (rtnl_newlink+0x2b4/0x450) from [<c024cfa0>] (rtnetlink_rcv_msg+0x158/0x1f4)
> [<c024cfa0>] (rtnetlink_rcv_msg+0x158/0x1f4) from [<c0256740>] (netlink_rcv_skb+0xac/0xc0)
> [<c0256740>] (netlink_rcv_skb+0xac/0xc0) from [<c024bbd8>] (rtnetlink_rcv+0x18/0x24)
> [<c024bbd8>] (rtnetlink_rcv+0x18/0x24) from [<c02561b8>] (netlink_unicast+0x13c/0x198)
> [<c02561b8>] (netlink_unicast+0x13c/0x198) from [<c025651c>] (netlink_sendmsg+0x264/0x2e0)
> [<c025651c>] (netlink_sendmsg+0x264/0x2e0) from [<c022af98>] (sock_sendmsg+0x78/0x98)
> [<c022af98>] (sock_sendmsg+0x78/0x98) from [<c022bb50>] (___sys_sendmsg.part.25+0x268/0x278)
> [<c022bb50>] (___sys_sendmsg.part.25+0x268/0x278) from [<c022cf08>] (__sys_sendmsg+0x48/0x78)
> [<c022cf08>] (__sys_sendmsg+0x48/0x78) from [<c0009320>] (ret_fast_syscall+0x0/0x2c)
This patch works around the problem by replacing yield() by msleep(1), giving
the interrupt thread time to finish, similar to other changes contained in the
rt patch set. Using wait_for_completion() instead would probably be a better
solution.
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
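The change boils down to sleeping briefly instead of busy-yielding while a lower-priority interrupt thread finishes; a generic sketch with a hypothetical busy check standing in for some_qdisc_is_busy():

/* Illustrative only: sleep-wait instead of a yield() loop. */
#include <linux/delay.h>
#include <linux/types.h>

extern bool demo_qdisc_busy(void);	/* hypothetical stand-in */

static void demo_wait_until_idle(void)
{
	while (demo_qdisc_busy())
		msleep(1);	/* sleeps, so a lower-priority irq thread can run */
}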
---
net/sched/sch_generic.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1184,7 +1184,7 @@ void dev_deactivate_many(struct list_hea
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
while (some_qdisc_is_busy(dev))
- yield();
+ msleep(1);
/* The new qdisc is assigned at this point so we can safely
* unwind stale skb lists and qdisc statistics
*/

View File

@@ -0,0 +1,30 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 27 Mar 2018 16:24:15 +0200
Subject: [PATCH] dm rq: remove BUG_ON(!irqs_disabled) check
In commit 052189a2ec95 ("dm: remove superfluous irq disablement in
dm_request_fn") the spin_lock_irq() was replaced with spin_lock() + a
check for disabled interrupts. Later the locking part was removed in
commit 2eb6e1e3aa87 ("dm: submit stacked requests in irq enabled
context") but the BUG_ON() check remained.
Since the original purpose for the "are-irqs-off" check is gone (the
->queue_lock has been removed), remove it.
Cc: Keith Busch <keith.busch@intel.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/md/dm-rq.c | 1 -
1 file changed, 1 deletion(-)
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -688,7 +688,6 @@ static void dm_old_request_fn(struct req
/* Establish tio->ti before queuing work (map_tio_request) */
tio->ti = ti;
kthread_queue_work(&md->kworker, &tio->work);
- BUG_ON(!irqs_disabled());
}
}

View File

@@ -0,0 +1,39 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Nov 2013 17:34:54 +0100
Subject: usb: do no disable interrupts in giveback
Since commit 94dfd7ed ("USB: HCD: support giveback of URB in tasklet
context") the USB code disables interrupts before invoking the complete
callback.
This should not be required the HCD completes the URBs either in hard-irq
context or in BH context. Lockdep may report false positives if one has two
HCDs (one completes in IRQ and the other in BH context) and is using the same
USB driver (device) with both HCDs. This is safe since the same URBs are never
mixed with those two HCDs.
Long term we should force all HCDs to complete in the same context.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/usb/core/hcd.c | 3 ---
1 file changed, 3 deletions(-)
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1738,7 +1738,6 @@ static void __usb_hcd_giveback_urb(struc
struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus);
struct usb_anchor *anchor = urb->anchor;
int status = urb->unlinked;
- unsigned long flags;
urb->hcpriv = NULL;
if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) &&
@@ -1766,9 +1765,7 @@ static void __usb_hcd_giveback_urb(struc
* and no one may trigger the above deadlock situation when
* running complete() in tasklet.
*/
- local_irq_save(flags);
urb->complete(urb);
- local_irq_restore(flags);
usb_anchor_resume_wakeups(anchor);
atomic_dec(&urb->use_count);

View File

@@ -0,0 +1,57 @@
Subject: rt: Provide PREEMPT_RT_BASE config switch
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 17 Jun 2011 12:39:57 +0200
Introduce PREEMPT_RT_BASE which enables parts of
PREEMPT_RT_FULL. It forces interrupt threading and enables some of the RT
substitutions for testing.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
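Later patches in the series key C code off these symbols as well (for example the arm64 crc32 patch uses IS_ENABLED(CONFIG_PREEMPT_RT_BASE)); a trivial hedged example of the pattern, with made-up helper names:

/* Illustrative: gating code on the new options. */
#include <linux/kconfig.h>
#include <linux/types.h>

static inline bool demo_rt_substitutions_enabled(void)
{
#ifdef CONFIG_PREEMPT_RT_BASE
	return true;		/* basic RT substitutions compiled in */
#else
	return false;
#endif
}

static inline bool demo_fully_preemptible(void)
{
	return IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
}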
---
kernel/Kconfig.preempt | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,10 @@
+config PREEMPT
+ bool
+ select PREEMPT_COUNT
+
+config PREEMPT_RT_BASE
+ bool
+ select PREEMPT
choice
prompt "Preemption Model"
@@ -34,10 +41,10 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
-config PREEMPT
+config PREEMPT__LL
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_COUNT
+ select PREEMPT
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
@@ -54,7 +61,15 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RTB
+ bool "Preemptible Kernel (Basic RT)"
+ select PREEMPT_RT_BASE
+ help
+ This option is basically the same as (Low-Latency Desktop) but
+ enables changes which are preliminary for the full preemptible
+ RT kernel.
+
endchoice
config PREEMPT_COUNT
- bool
\ No newline at end of file
+ bool

View File

@@ -0,0 +1,67 @@
Subject: cpumask: Disable CONFIG_CPUMASK_OFFSTACK for RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 14 Dec 2011 01:03:49 +0100
There are "valid" GFP_ATOMIC allocations such as
|BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:931
|in_atomic(): 1, irqs_disabled(): 0, pid: 2130, name: tar
|1 lock held by tar/2130:
| #0: (&mm->mmap_sem){++++++}, at: [<ffffffff811d4e89>] SyS_brk+0x39/0x190
|Preemption disabled at:[<ffffffff81063048>] flush_tlb_mm_range+0x28/0x350
|
|CPU: 1 PID: 2130 Comm: tar Tainted: G W 4.8.2-rt2+ #747
|Call Trace:
| [<ffffffff814d52dc>] dump_stack+0x86/0xca
| [<ffffffff810a26fb>] ___might_sleep+0x14b/0x240
| [<ffffffff819bc1d4>] rt_spin_lock+0x24/0x60
| [<ffffffff81194fba>] get_page_from_freelist+0x83a/0x11b0
| [<ffffffff81195e8b>] __alloc_pages_nodemask+0x15b/0x1190
| [<ffffffff811f0b81>] alloc_pages_current+0xa1/0x1f0
| [<ffffffff811f7df5>] new_slab+0x3e5/0x690
| [<ffffffff811fb0d5>] ___slab_alloc+0x495/0x660
| [<ffffffff811fb311>] __slab_alloc.isra.79+0x71/0xc0
| [<ffffffff811fb447>] __kmalloc_node+0xe7/0x240
| [<ffffffff814d4ee0>] alloc_cpumask_var_node+0x20/0x50
| [<ffffffff814d4f3e>] alloc_cpumask_var+0xe/0x10
| [<ffffffff810430c1>] native_send_call_func_ipi+0x21/0x130
| [<ffffffff8111c13f>] smp_call_function_many+0x22f/0x370
| [<ffffffff81062b64>] native_flush_tlb_others+0x1a4/0x3a0
| [<ffffffff8106309b>] flush_tlb_mm_range+0x7b/0x350
| [<ffffffff811c88e2>] tlb_flush_mmu_tlbonly+0x62/0xd0
| [<ffffffff811c9af4>] tlb_finish_mmu+0x14/0x50
| [<ffffffff811d1c84>] unmap_region+0xe4/0x110
| [<ffffffff811d3db3>] do_munmap+0x293/0x470
| [<ffffffff811d4f8c>] SyS_brk+0x13c/0x190
| [<ffffffff810032e2>] do_fast_syscall_32+0xb2/0x2f0
| [<ffffffff819be181>] entry_SYSENTER_compat+0x51/0x60
which forbid allocations at run-time.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/Kconfig | 2 +-
lib/Kconfig | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -934,7 +934,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
- select CPUMASK_OFFSTACK
+ select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
---help---
Enable maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -441,6 +441,7 @@ config CHECK_SIGNATURE
config CPUMASK_OFFSTACK
bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+ depends on !PREEMPT_RT_FULL
help
Use dynamic allocation for cpumask_var_t, instead of putting
them on the stack. This is a bit more expensive, but avoids

View File

@@ -0,0 +1,35 @@
Subject: jump-label: disable if stop_machine() is used
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 08 Jul 2015 17:14:48 +0200
Some architectures are using stop_machine() while switching the opcode which
leads to latency spikes.
The architectures which use stop_machine() atm:
- ARM stop machine
- s390 stop machine
The architectures which use other sorcery:
- MIPS
- X86
- powerpc
- sparc
- arm64
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: only ARM for now]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -51,7 +51,7 @@ config ARM
select HARDIRQS_SW_RESEND
select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
+ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)

View File

@@ -0,0 +1,33 @@
Subject: kconfig: Disable config options which are not RT compatible
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 24 Jul 2011 12:11:43 +0200
Disable stuff which is known to have issues on RT
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/Kconfig | 1 +
mm/Kconfig | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -28,6 +28,7 @@ config OPROFILE
tristate "OProfile system profiling"
depends on PROFILING
depends on HAVE_OPROFILE
+ depends on !PREEMPT_RT_FULL
select RING_BUFFER
select RING_BUFFER_ALLOW_SWAP
help
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -377,7 +377,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
select COMPACTION
select RADIX_TREE_MULTIORDER
help

View File

@@ -0,0 +1,28 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 17 Oct 2017 16:36:18 +0200
Subject: [PATCH] lockdep: disable self-test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The self-test wasn't always 100% accurate for RT. We disabled a few
tests which failed because they had a different semantic for RT. Some
still reported false positives. Now the selftest locks up the system
during boot and it needs to be investigated…
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
lib/Kconfig.debug | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1207,7 +1207,7 @@ config DEBUG_ATOMIC_SLEEP
config DEBUG_LOCKING_API_SELFTESTS
bool "Locking API boot-time self-tests"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !PREEMPT_RT_FULL
help
Say Y here if you want the kernel to run a short self-test during
bootup. The self-test checks whether common types of locking bugs

View File

@@ -0,0 +1,31 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:44:03 -0500
Subject: mm: Allow only slub on RT
Disable SLAB and SLOB on -RT. Only SLUB is adapted to -RT needs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
init/Kconfig | 2 ++
1 file changed, 2 insertions(+)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1634,6 +1634,7 @@ choice
config SLAB
bool "SLAB"
+ depends on !PREEMPT_RT_FULL
select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
The regular slab allocator that is established and known to work
@@ -1654,6 +1655,7 @@ config SLUB
config SLOB
depends on EXPERT
bool "SLOB (Simple Allocator)"
+ depends on !PREEMPT_RT_FULL
help
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 17 Jul 2011 21:51:45 +0200
Subject: locking: Disable spin on owner for RT
Drop spin on owner for mutex / rwsem. We are most likely not using it
but…
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/Kconfig.locks | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
config RWSEM_SPIN_ON_OWNER
def_bool y
- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
config LOCK_SPIN_ON_OWNER
def_bool y

View File

@@ -0,0 +1,24 @@
Subject: rcu: Disable RCU_FAST_NO_HZ on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 28 Oct 2012 13:26:09 +0000
This uses a timer_list timer from the irq disabled guts of the idle
code. Disable it for now to prevent wreckage.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/rcu/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -172,7 +172,7 @@ config RCU_FANOUT_LEAF
config RCU_FAST_NO_HZ
bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
+ depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
default n
help
This option permits CPUs to enter dynticks-idle state even if

View File

@@ -0,0 +1,27 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 21 Mar 2014 20:19:05 +0100
Subject: rcu: make RCU_BOOST default on RT
Since it is no longer invoked from the softirq, people run into OOM more
often if the priority of the RCU thread is too low. Making boosting
default on RT should help in those cases and it can be switched off if
someone knows better.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/rcu/Kconfig | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -190,8 +190,8 @@ config RCU_FAST_NO_HZ
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT_FULL
+ default y if PREEMPT_RT_FULL
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.

View File

@@ -0,0 +1,28 @@
Subject: sched: Disable CONFIG_RT_GROUP_SCHED on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Jul 2011 17:03:52 +0200
Carsten reported problems when running:
taskset 01 chrt -f 1 sleep 1
from within rc.local on a F15 machine. The task stays running and
never gets on the run queue because some of the run queues have
rt_throttled=1 which does not go away. Works fine from an ssh login
shell. Disabling CONFIG_RT_GROUP_SCHED solves that as well.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
init/Kconfig | 1 +
1 file changed, 1 insertion(+)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -781,6 +781,7 @@ config CFS_BANDWIDTH
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on CGROUP_SCHED
+ depends on !PREEMPT_RT_FULL
default n
help
This feature lets you explicitly allocate real CPU bandwidth

View File

@@ -0,0 +1,28 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 27 May 2017 19:02:06 +0200
Subject: net/core: disable NET_RX_BUSY_POLL
sk_busy_loop() does preempt_disable() followed by a few operations which can
take sleeping locks and may get long.
I _think_ that we could use preempt_disable_nort() (in sk_busy_loop()) instead
but after a successful cmpxchg(&napi->state, …) we would gain the resource
and could be scheduled out. At this point nobody knows who (which context) owns
it and so it could take a while until the state is released and napi_poll()
could be invoked again.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -275,7 +275,7 @@ config CGROUP_NET_CLASSID
config NET_RX_BUSY_POLL
bool
- default y
+ default y if !PREEMPT_RT_FULL
config BQL
bool

View File

@@ -0,0 +1,155 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 1 Dec 2017 10:42:03 +0100
Subject: [PATCH] arm*: disable NEON in kernel mode
NEON in kernel mode is used by the crypto algorithms and raid6 code.
While the raid6 code looks okay, the crypto algorithms do not: NEON
is enabled on first invocation and may allocate/free/map memory before
the NEON mode is disabled again.
This needs to be changed until it can be enabled.
On ARM, NEON in kernel mode can simply be disabled. On ARM64 it needs to
stay on due to possible EFI callbacks, so here I disable each algorithm.
Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/Kconfig | 2 +-
arch/arm64/crypto/Kconfig | 28 ++++++++++++++--------------
arch/arm64/crypto/crc32-ce-glue.c | 3 ++-
3 files changed, 17 insertions(+), 16 deletions(-)
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2160,7 +2160,7 @@ config NEON
config KERNEL_MODE_NEON
bool "Support for NEON in kernel mode"
- depends on NEON && AEABI
+ depends on NEON && AEABI && !PREEMPT_RT_BASE
help
Say Y to include support for NEON in kernel mode.
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -19,43 +19,43 @@ config CRYPTO_SHA512_ARM64
config CRYPTO_SHA1_ARM64_CE
tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_SHA1
config CRYPTO_SHA2_ARM64_CE
tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_SHA256_ARM64
config CRYPTO_SHA512_ARM64_CE
tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_SHA512_ARM64
config CRYPTO_SHA3_ARM64
tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_SHA3
config CRYPTO_SM3_ARM64_CE
tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_SM3
config CRYPTO_SM4_ARM64_CE
tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_ALGAPI
select CRYPTO_SM4
config CRYPTO_GHASH_ARM64_CE
tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
select CRYPTO_GF128MUL
select CRYPTO_AES
@@ -63,7 +63,7 @@ config CRYPTO_GHASH_ARM64_CE
config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
- depends on KERNEL_MODE_NEON && CRC_T10DIF
+ depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE
select CRYPTO_HASH
config CRYPTO_CRC32_ARM64_CE
@@ -77,13 +77,13 @@ config CRYPTO_AES_ARM64
config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64
config CRYPTO_AES_ARM64_CE_CCM
tristate "AES in CCM mode using ARMv8 Crypto Extensions"
- depends on ARM64 && KERNEL_MODE_NEON
+ depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
@@ -91,7 +91,7 @@ config CRYPTO_AES_ARM64_CE_CCM
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
@@ -99,7 +99,7 @@ config CRYPTO_AES_ARM64_CE_BLK
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64
select CRYPTO_AES
@@ -107,13 +107,13 @@ config CRYPTO_AES_ARM64_NEON_BLK
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
- depends on KERNEL_MODE_NEON
+ depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES_ARM64
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -208,7 +208,8 @@ static struct shash_alg crc32_pmull_algs
static int __init crc32_pmull_mod_init(void)
{
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) {
crc32_pmull_algs[0].update = crc32_pmull_update;
crc32_pmull_algs[1].update = crc32c_pmull_update;
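Not part of the patch: a minimal userspace sketch of the IS_ENABLED()-style compile-time gating used in the crc32-ce-glue.c hunk above. The CONFIG_* values and the IS_ENABLED() stand-in below are simplified assumptions for illustration, not the kernel's real Kconfig machinery.

#include <stdio.h>

/* Hypothetical config switches; in the kernel these come from Kconfig. */
#define CONFIG_KERNEL_MODE_NEON 1
#define CONFIG_PREEMPT_RT_BASE  1

/* Crude stand-in for the kernel's IS_ENABLED(): here the macros are plain 0/1. */
#define IS_ENABLED(x) (x)

static void register_pmull_update(void)
{
	printf("registering PMULL-accelerated update hooks\n");
}

int main(void)
{
	/*
	 * Mirrors the crc32-ce-glue.c change: the accelerated hooks are only
	 * wired up when NEON is available in kernel mode AND we are not on RT.
	 * The branch is resolved at compile time, so the disabled path costs
	 * nothing at run time.
	 */
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
	    !IS_ENABLED(CONFIG_PREEMPT_RT_BASE))
		register_pmull_update();
	else
		printf("falling back to the generic (non-NEON) implementation\n");
	return 0;
}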

View File

@@ -0,0 +1,26 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 14 Jul 2015 14:26:34 +0200
Subject: powerpc: Use generic rwsem on RT
Use generic code which uses rtmutex
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/Kconfig | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -105,10 +105,11 @@ config LOCKDEP_SUPPORT
config RWSEM_GENERIC_SPINLOCK
bool
+ default y if PREEMPT_RT_FULL
config RWSEM_XCHGADD_ALGORITHM
bool
- default y
+ default y if !PREEMPT_RT_FULL
config GENERIC_LOCKBREAK
bool

View File

@@ -0,0 +1,37 @@
From: Bogdan Purcareata <bogdan.purcareata@freescale.com>
Date: Fri, 24 Apr 2015 15:53:13 +0000
Subject: powerpc/kvm: Disable in-kernel MPIC emulation for PREEMPT_RT_FULL
While converting the openpic emulation code to use a raw_spinlock_t enables
guests to run on RT, there's still a performance issue. For interrupts sent in
directed delivery mode with a multiple CPU mask, the emulated openpic will loop
through all of the VCPUs, and for each VCPU it calls IRQ_check, which will loop
through all the pending interrupts for that VCPU. This is done while holding the
raw_lock, meaning that in all this time the interrupts and preemption are
disabled on the host Linux. A malicious user app can maximize both of these
numbers and cause a DoS.
This temporary fix is sent for two reasons. The first is so that users who want
to use the in-kernel MPIC emulation are aware of the potential latencies and can
make sure that their hardware MPIC and usage scenario do not involve interrupts
sent in directed delivery mode and that the number of possible pending
interrupts is kept small. Secondly, this should incentivize the development of a
proper openpic emulation that would be better suited for RT.
Acked-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Bogdan Purcareata <bogdan.purcareata@freescale.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/powerpc/kvm/Kconfig | 1 +
1 file changed, 1 insertion(+)
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -178,6 +178,7 @@ config KVM_E500MC
config KVM_MPIC
bool "KVM in-kernel MPIC emulation"
depends on KVM && E500
+ depends on !PREEMPT_RT_FULL
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING

View File

@@ -0,0 +1,22 @@
Subject: powerpc: Disable highmem on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Jul 2011 17:08:34 +0200
The current highmem handling on -RT is not compatible and needs fixups.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -398,7 +398,7 @@ menu "Kernel options"
config HIGHMEM
bool "High memory support"
- depends on PPC32
+ depends on PPC32 && !PREEMPT_RT_FULL
source kernel/Kconfig.hz

View File

@@ -0,0 +1,22 @@
Subject: mips: Disable highmem on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 18 Jul 2011 17:10:12 +0200
The current highmem handling on -RT is not compatible and needs fixups.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/mips/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2517,7 +2517,7 @@ config MIPS_CRC_SUPPORT
#
config HIGHMEM
bool "High Memory Support"
- depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
+ depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
config CPU_SUPPORTS_HIGHMEM
bool

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 26 Jul 2009 02:21:32 +0200
Subject: x86: Use generic rwsem_spinlocks on -rt
Simplifies the separation of anon_rw_semaphores and rw_semaphores for
-rt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/Kconfig | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -264,8 +264,11 @@ config ARCH_MAY_HAVE_PC_FDC
def_bool y
depends on ISA_DMA_API
+config RWSEM_GENERIC_SPINLOCK
+ def_bool PREEMPT_RT_FULL
+
config RWSEM_XCHGADD_ALGORITHM
- def_bool y
+ def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
config GENERIC_CALIBRATE_DELAY
def_bool y

View File

@@ -0,0 +1,34 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 23 Jan 2014 14:45:59 +0100
Subject: leds: trigger: disable CPU trigger on -RT
as it triggers:
|CPU: 0 PID: 0 Comm: swapper Not tainted 3.12.8-rt10 #141
|[<c0014aa4>] (unwind_backtrace+0x0/0xf8) from [<c0012788>] (show_stack+0x1c/0x20)
|[<c0012788>] (show_stack+0x1c/0x20) from [<c043c8dc>] (dump_stack+0x20/0x2c)
|[<c043c8dc>] (dump_stack+0x20/0x2c) from [<c004c5e8>] (__might_sleep+0x13c/0x170)
|[<c004c5e8>] (__might_sleep+0x13c/0x170) from [<c043f270>] (__rt_spin_lock+0x28/0x38)
|[<c043f270>] (__rt_spin_lock+0x28/0x38) from [<c043fa00>] (rt_read_lock+0x68/0x7c)
|[<c043fa00>] (rt_read_lock+0x68/0x7c) from [<c036cf74>] (led_trigger_event+0x2c/0x5c)
|[<c036cf74>] (led_trigger_event+0x2c/0x5c) from [<c036e0bc>] (ledtrig_cpu+0x54/0x5c)
|[<c036e0bc>] (ledtrig_cpu+0x54/0x5c) from [<c000ffd8>] (arch_cpu_idle_exit+0x18/0x1c)
|[<c000ffd8>] (arch_cpu_idle_exit+0x18/0x1c) from [<c00590b8>] (cpu_startup_entry+0xa8/0x234)
|[<c00590b8>] (cpu_startup_entry+0xa8/0x234) from [<c043b2cc>] (rest_init+0xb8/0xe0)
|[<c043b2cc>] (rest_init+0xb8/0xe0) from [<c061ebe0>] (start_kernel+0x2c4/0x380)
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/leds/trigger/Kconfig | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/leds/trigger/Kconfig
+++ b/drivers/leds/trigger/Kconfig
@@ -63,6 +63,7 @@ config LEDS_TRIGGER_BACKLIGHT
config LEDS_TRIGGER_CPU
bool "LED CPU Trigger"
+ depends on !PREEMPT_RT_BASE
help
This allows LEDs to be controlled by active CPUs. This shows
the active CPUs across an array of LEDs so you can see which

View File

@@ -0,0 +1,32 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 9 Apr 2015 15:23:01 +0200
Subject: cpufreq: drop K8's driver from being selected
Ralf posted a picture of a backtrace from
| powernowk8_target_fn() -> transition_frequency_fidvid() and then at the
| end:
| 932 policy = cpufreq_cpu_get(smp_processor_id());
| 933 cpufreq_cpu_put(policy);
crashing the system on -RT. I assumed that policy was a NULL pointer, but that
was ruled out. Since Ralf can't do any more investigation on this and I have
no machine with this hardware, I simply switch the driver off.
Reported-by: Ralf Mardorf <ralf.mardorf@alice-dsl.net>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/cpufreq/Kconfig.x86 | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -125,7 +125,7 @@ config X86_POWERNOW_K7_ACPI
config X86_POWERNOW_K8
tristate "AMD Opteron/Athlon64 PowerNow!"
- depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
+ depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
help
This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
Support for K10 and newer processors is now in acpi-cpufreq.

View File

@@ -0,0 +1,31 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 29 Aug 2013 11:48:57 +0200
Subject: md: disable bcache
It uses anon semaphores (the non-owner rwsem helpers):
|drivers/md/bcache/request.c: In function cached_dev_write_complete:
|drivers/md/bcache/request.c:1007:2: error: implicit declaration of function up_read_non_owner [-Werror=implicit-function-declaration]
| up_read_non_owner(&dc->writeback_lock);
| ^
|drivers/md/bcache/request.c: In function request_write:
|drivers/md/bcache/request.c:1033:2: error: implicit declaration of function down_read_non_owner [-Werror=implicit-function-declaration]
| down_read_non_owner(&dc->writeback_lock);
| ^
either we get rid of those or we have to introduce them…
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/md/bcache/Kconfig | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,6 +1,7 @@
config BCACHE
tristate "Block device as cache"
+ depends on !PREEMPT_RT_FULL
select CRC64
help
Allows a block device to be used as cache for other devices; uses

View File

@@ -0,0 +1,39 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 26 Jul 2018 15:03:16 +0200
Subject: [PATCH] efi: Disable runtime services on RT
Based on measurements, the EFI functions get_variable /
get_next_variable take up to 2us, which looks okay.
The functions get_time, set_time take around 10ms. Those 10ms are too
much. Even one ms would be too much.
Ard mentioned that SetVariable might even trigger larger latencies if
the firmware has to erase flash blocks on NOR.
The time functions are used by efi-rtc and can be triggered at
runtime (either via explicit read/write or NTP sync).
The variable write could be used by pstore.
These functions can be disabled without much of a loss. The poweroff /
reboot hooks may be provided by PSCI.
Disable EFI's runtime wrappers.
This was observed on "EFI v2.60 by SoftIron Overdrive 1000".
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/firmware/efi/efi.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -87,7 +87,7 @@ struct mm_struct efi_mm = {
struct workqueue_struct *efi_rts_wq;
-static bool disable_runtime;
+static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT_BASE);
static int __init setup_noefi(char *arg)
{
disable_runtime = true;

View File

@@ -0,0 +1,163 @@
Subject: printk: Add a printk kill switch
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 Jul 2011 17:58:40 +0200
Add a printk kill switch. This is used from the (NMI) watchdog to ensure
that it does not deadlock with the early printk code.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/printk.h | 2 +
kernel/printk/printk.c | 79 ++++++++++++++++++++++++++++++++++++-------------
kernel/watchdog_hld.c | 10 ++++++
3 files changed, 71 insertions(+), 20 deletions(-)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -140,9 +140,11 @@ struct va_format {
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
+extern void printk_kill(void);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
+static inline void printk_kill(void) { }
#endif
#ifdef CONFIG_PRINTK_NMI
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -405,6 +405,58 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
printk_safe_exit_irqrestore(flags); \
} while (0)
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+static void early_vprintk(const char *fmt, va_list ap)
+{
+ if (early_console) {
+ char buf[512];
+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
+
+ early_console->write(early_console, buf, n);
+ }
+}
+
+asmlinkage void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ early_vprintk(fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * This is independent of any log levels - a global
+ * kill switch that turns off all of printk.
+ *
+ * Used by the NMI watchdog if early-printk is enabled.
+ */
+static bool __read_mostly printk_killswitch;
+
+void printk_kill(void)
+{
+ printk_killswitch = true;
+}
+
+#ifdef CONFIG_PRINTK
+static int forced_early_printk(const char *fmt, va_list ap)
+{
+ if (!printk_killswitch)
+ return 0;
+ early_vprintk(fmt, ap);
+ return 1;
+}
+#endif
+
+#else
+static inline int forced_early_printk(const char *fmt, va_list ap)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -1897,6 +1949,13 @@ asmlinkage int vprintk_emit(int facility
bool in_sched = false;
unsigned long flags;
+ /*
+ * Fall back to early_printk if a debugging subsystem has
+ * killed printk output
+ */
+ if (unlikely(forced_early_printk(fmt, args)))
+ return 1;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -2037,26 +2096,6 @@ static bool suppress_message_printing(in
#endif /* CONFIG_PRINTK */
-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-asmlinkage __visible void early_printk(const char *fmt, ...)
-{
- va_list ap;
- char buf[512];
- int n;
-
- if (!early_console)
- return;
-
- va_start(ap, fmt);
- n = vscnprintf(buf, sizeof(buf), fmt, ap);
- va_end(ap);
-
- early_console->write(early_console, buf, n);
-}
-#endif
-
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
{
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -24,6 +24,8 @@ static DEFINE_PER_CPU(bool, hard_watchdo
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
+
static struct cpumask dead_events_mask;
static unsigned long hardlockup_allcpu_dumped;
@@ -134,6 +136,13 @@ static void watchdog_overflow_callback(s
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
+ /*
+ * If early-printk is enabled then make sure we do not
+ * lock up in printk() and kill console logging:
+ */
+ printk_kill();
+
+ raw_spin_lock(&watchdog_output_lock);
pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
print_modules();
@@ -151,6 +160,7 @@ static void watchdog_overflow_callback(s
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
trigger_allbutself_cpu_backtrace();
+ raw_spin_unlock(&watchdog_output_lock);
if (hardlockup_panic)
nmi_panic(regs, "Hard LOCKUP");
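Not part of the patch: a minimal userspace analogue of the kill-switch flow above, with stderr standing in for early_console->write(). Names mirror the patch but the code is a simplified sketch, not the kernel implementation.

#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace analogue of the patch's kill switch. */
static bool printk_killswitch;

static void early_vprintk(const char *fmt, va_list ap)
{
	/* Stand-in for early_console->write(): unbuffered output to stderr. */
	vfprintf(stderr, fmt, ap);
}

static void printk_kill(void)
{
	printk_killswitch = true;
}

static int forced_early_printk(const char *fmt, va_list ap)
{
	if (!printk_killswitch)
		return 0;
	early_vprintk(fmt, ap);
	return 1;
}

/* Simplified vprintk_emit(): normal path unless the kill switch is set. */
static void printk(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (!forced_early_printk(fmt, ap))
		vprintf(fmt, ap);
	va_end(ap);
}

int main(void)
{
	printk("regular path\n");
	printk_kill();          /* what the hard-lockup detector would do */
	printk("emergency path, bypassing the normal console machinery\n");
	return 0;
}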

View File

@@ -0,0 +1,31 @@
Subject: printk: Add "force_early_printk" boot param to help with debugging
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 02 Sep 2011 14:41:29 +0200
Gives me an option to screw printk and actually see what the machine
says.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1314967289.1301.11.camel@twins
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-ykb97nsfmobq44xketrxs977@git.kernel.org
---
kernel/printk/printk.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -435,6 +435,13 @@ asmlinkage void early_printk(const char
*/
static bool __read_mostly printk_killswitch;
+static int __init force_early_printk_setup(char *str)
+{
+ printk_killswitch = true;
+ return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
void printk_kill(void)
{
printk_killswitch = true;

View File

@@ -0,0 +1,47 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 24 Jul 2009 12:38:56 +0200
Subject: preempt: Provide preempt_*_(no)rt variants
RT needs a few preempt_disable/enable points which are not necessary
otherwise. Implement variants to avoid #ifdeffery.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/preempt.h | 18 +++++++++++++++++-
1 file changed, 17 insertions(+), 1 deletion(-)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -181,7 +181,11 @@ do { \
preempt_count_dec(); \
} while (0)
-#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
+#else
+# define preempt_enable_no_resched() preempt_enable()
+#endif
#define preemptible() (preempt_count() == 0 && !irqs_disabled())
@@ -298,6 +302,18 @@ do { \
set_preempt_need_resched(); \
} while (0)
+#ifdef CONFIG_PREEMPT_RT_FULL
+# define preempt_disable_rt() preempt_disable()
+# define preempt_enable_rt() preempt_enable()
+# define preempt_disable_nort() barrier()
+# define preempt_enable_nort() barrier()
+#else
+# define preempt_disable_rt() barrier()
+# define preempt_enable_rt() barrier()
+# define preempt_disable_nort() preempt_disable()
+# define preempt_enable_nort() preempt_enable()
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier;
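Not part of the patch: a userspace illustration of the *_nort()/*_rt() idea, showing how call sites stay free of #ifdefs while the variants collapse to either a real operation or a no-op. The CONFIG_PREEMPT_RT_FULL macro and preempt_count counter below are simplified assumptions for the sketch.

#include <stdio.h>

#define CONFIG_PREEMPT_RT_FULL 1

static int preempt_count;

#define preempt_disable()	(preempt_count++)
#define preempt_enable()	(preempt_count--)
#define barrier()		do { } while (0)

#if CONFIG_PREEMPT_RT_FULL
# define preempt_disable_nort()	barrier()
# define preempt_enable_nort()	barrier()
#else
# define preempt_disable_nort()	preempt_disable()
# define preempt_enable_nort()	preempt_enable()
#endif

int main(void)
{
	/* This section only needs preemption disabled on !RT kernels. */
	preempt_disable_nort();
	printf("preempt_count=%d (0 on RT, 1 otherwise)\n", preempt_count);
	preempt_enable_nort();
	return 0;
}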

View File

@@ -0,0 +1,62 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 8 Mar 2017 14:23:35 +0100
Subject: [PATCH] futex: workaround migrate_disable/enable in different context
migrate_disable()/migrate_enable() takes a different path in atomic() vs
!atomic() context. These little hacks ensure that we don't underflow or
overflow the migrate-disable counts while we lock the hb lock with interrupts
enabled and unlock it with interrupts disabled.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2856,6 +2856,14 @@ static int futex_lock_pi(u32 __user *uad
* before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ /*
+ * the migrate_disable() here disables migration in the in_atomic() fast
+ * path which is enabled again in the following spin_unlock(). We have
+ * one migrate_disable() pending in the slow-path which is reversed
+ * after the raw_spin_unlock_irq() where we leave the atomic context.
+ */
+ migrate_disable();
+
spin_unlock(q.lock_ptr);
/*
* __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
@@ -2864,6 +2872,7 @@ static int futex_lock_pi(u32 __user *uad
*/
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+ migrate_enable();
if (ret) {
if (ret == 1)
@@ -3012,11 +3021,21 @@ static int futex_unlock_pi(u32 __user *u
* rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ /*
+ * Magic trickery for now to make the RT migrate disable
+ * logic happy. The following spin_unlock() happens with
+ * interrupts disabled so the internal migrate_enable()
+ * won't undo the migrate_disable() which was issued when
+ * locking hb->lock.
+ */
+ migrate_disable();
spin_unlock(&hb->lock);
/* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
+ migrate_enable();
+
put_pi_state(pi_state);
/*

View File

@@ -0,0 +1,330 @@
Subject: rt: Add local irq locks
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 20 Jun 2011 09:03:47 +0200
Introduce locallock. For !RT this maps to preempt_disable()/
local_irq_disable() so there is not much that changes. For RT this will
map to a spinlock. This makes preemption possible, and the locked "resource"
gets the lockdep annotation it wouldn't have otherwise. The locks are
recursive for owner == current. Also, all lock users call migrate_disable(),
which ensures that the task is not migrated to another CPU while the lock
is held, even if the owner is preempted.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/locallock.h | 271 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/percpu.h | 29 ++++
2 files changed, 300 insertions(+)
--- /dev/null
+++ b/include/linux/locallock.h
@@ -0,0 +1,271 @@
+#ifndef _LINUX_LOCALLOCK_H
+#define _LINUX_LOCALLOCK_H
+
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+# define LL_WARN(cond) WARN_ON(cond)
+#else
+# define LL_WARN(cond) do { } while (0)
+#endif
+
+/*
+ * per cpu lock based substitute for local_irq_*()
+ */
+struct local_irq_lock {
+ spinlock_t lock;
+ struct task_struct *owner;
+ int nestcnt;
+ unsigned long flags;
+};
+
+#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
+ DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
+ .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
+
+#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
+ DECLARE_PER_CPU(struct local_irq_lock, lvar)
+
+#define local_irq_lock_init(lvar) \
+ do { \
+ int __cpu; \
+ for_each_possible_cpu(__cpu) \
+ spin_lock_init(&per_cpu(lvar, __cpu).lock); \
+ } while (0)
+
+static inline void __local_lock(struct local_irq_lock *lv)
+{
+ if (lv->owner != current) {
+ spin_lock(&lv->lock);
+ LL_WARN(lv->owner);
+ LL_WARN(lv->nestcnt);
+ lv->owner = current;
+ }
+ lv->nestcnt++;
+}
+
+#define local_lock(lvar) \
+ do { __local_lock(&get_local_var(lvar)); } while (0)
+
+#define local_lock_on(lvar, cpu) \
+ do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
+
+static inline int __local_trylock(struct local_irq_lock *lv)
+{
+ if (lv->owner != current && spin_trylock(&lv->lock)) {
+ LL_WARN(lv->owner);
+ LL_WARN(lv->nestcnt);
+ lv->owner = current;
+ lv->nestcnt = 1;
+ return 1;
+ } else if (lv->owner == current) {
+ lv->nestcnt++;
+ return 1;
+ }
+ return 0;
+}
+
+#define local_trylock(lvar) \
+ ({ \
+ int __locked; \
+ __locked = __local_trylock(&get_local_var(lvar)); \
+ if (!__locked) \
+ put_local_var(lvar); \
+ __locked; \
+ })
+
+static inline void __local_unlock(struct local_irq_lock *lv)
+{
+ LL_WARN(lv->nestcnt == 0);
+ LL_WARN(lv->owner != current);
+ if (--lv->nestcnt)
+ return;
+
+ lv->owner = NULL;
+ spin_unlock(&lv->lock);
+}
+
+#define local_unlock(lvar) \
+ do { \
+ __local_unlock(this_cpu_ptr(&lvar)); \
+ put_local_var(lvar); \
+ } while (0)
+
+#define local_unlock_on(lvar, cpu) \
+ do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
+
+static inline void __local_lock_irq(struct local_irq_lock *lv)
+{
+ spin_lock_irqsave(&lv->lock, lv->flags);
+ LL_WARN(lv->owner);
+ LL_WARN(lv->nestcnt);
+ lv->owner = current;
+ lv->nestcnt = 1;
+}
+
+#define local_lock_irq(lvar) \
+ do { __local_lock_irq(&get_local_var(lvar)); } while (0)
+
+#define local_lock_irq_on(lvar, cpu) \
+ do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
+
+static inline void __local_unlock_irq(struct local_irq_lock *lv)
+{
+ LL_WARN(!lv->nestcnt);
+ LL_WARN(lv->owner != current);
+ lv->owner = NULL;
+ lv->nestcnt = 0;
+ spin_unlock_irq(&lv->lock);
+}
+
+#define local_unlock_irq(lvar) \
+ do { \
+ __local_unlock_irq(this_cpu_ptr(&lvar)); \
+ put_local_var(lvar); \
+ } while (0)
+
+#define local_unlock_irq_on(lvar, cpu) \
+ do { \
+ __local_unlock_irq(&per_cpu(lvar, cpu)); \
+ } while (0)
+
+static inline int __local_lock_irqsave(struct local_irq_lock *lv)
+{
+ if (lv->owner != current) {
+ __local_lock_irq(lv);
+ return 0;
+ } else {
+ lv->nestcnt++;
+ return 1;
+ }
+}
+
+#define local_lock_irqsave(lvar, _flags) \
+ do { \
+ if (__local_lock_irqsave(&get_local_var(lvar))) \
+ put_local_var(lvar); \
+ _flags = __this_cpu_read(lvar.flags); \
+ } while (0)
+
+#define local_lock_irqsave_on(lvar, _flags, cpu) \
+ do { \
+ __local_lock_irqsave(&per_cpu(lvar, cpu)); \
+ _flags = per_cpu(lvar, cpu).flags; \
+ } while (0)
+
+static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
+ unsigned long flags)
+{
+ LL_WARN(!lv->nestcnt);
+ LL_WARN(lv->owner != current);
+ if (--lv->nestcnt)
+ return 0;
+
+ lv->owner = NULL;
+ spin_unlock_irqrestore(&lv->lock, lv->flags);
+ return 1;
+}
+
+#define local_unlock_irqrestore(lvar, flags) \
+ do { \
+ if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
+ put_local_var(lvar); \
+ } while (0)
+
+#define local_unlock_irqrestore_on(lvar, flags, cpu) \
+ do { \
+ __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
+ } while (0)
+
+#define local_spin_trylock_irq(lvar, lock) \
+ ({ \
+ int __locked; \
+ local_lock_irq(lvar); \
+ __locked = spin_trylock(lock); \
+ if (!__locked) \
+ local_unlock_irq(lvar); \
+ __locked; \
+ })
+
+#define local_spin_lock_irq(lvar, lock) \
+ do { \
+ local_lock_irq(lvar); \
+ spin_lock(lock); \
+ } while (0)
+
+#define local_spin_unlock_irq(lvar, lock) \
+ do { \
+ spin_unlock(lock); \
+ local_unlock_irq(lvar); \
+ } while (0)
+
+#define local_spin_lock_irqsave(lvar, lock, flags) \
+ do { \
+ local_lock_irqsave(lvar, flags); \
+ spin_lock(lock); \
+ } while (0)
+
+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
+ do { \
+ spin_unlock(lock); \
+ local_unlock_irqrestore(lvar, flags); \
+ } while (0)
+
+#define get_locked_var(lvar, var) \
+ (*({ \
+ local_lock(lvar); \
+ this_cpu_ptr(&var); \
+ }))
+
+#define put_locked_var(lvar, var) local_unlock(lvar);
+
+#define local_lock_cpu(lvar) \
+ ({ \
+ local_lock(lvar); \
+ smp_processor_id(); \
+ })
+
+#define local_unlock_cpu(lvar) local_unlock(lvar)
+
+#else /* PREEMPT_RT_BASE */
+
+#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
+#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
+
+static inline void local_irq_lock_init(int lvar) { }
+
+#define local_trylock(lvar) \
+ ({ \
+ preempt_disable(); \
+ 1; \
+ })
+
+#define local_lock(lvar) preempt_disable()
+#define local_unlock(lvar) preempt_enable()
+#define local_lock_irq(lvar) local_irq_disable()
+#define local_lock_irq_on(lvar, cpu) local_irq_disable()
+#define local_unlock_irq(lvar) local_irq_enable()
+#define local_unlock_irq_on(lvar, cpu) local_irq_enable()
+#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
+#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
+
+#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
+#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
+#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
+#define local_spin_lock_irqsave(lvar, lock, flags) \
+ spin_lock_irqsave(lock, flags)
+#define local_spin_unlock_irqrestore(lvar, lock, flags) \
+ spin_unlock_irqrestore(lock, flags)
+
+#define get_locked_var(lvar, var) get_cpu_var(var)
+#define put_locked_var(lvar, var) put_cpu_var(var)
+
+#define local_lock_cpu(lvar) get_cpu()
+#define local_unlock_cpu(lvar) put_cpu()
+
+#endif
+
+#endif
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -19,6 +19,35 @@
#define PERCPU_MODULE_RESERVE 0
#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+#define get_local_var(var) (*({ \
+ migrate_disable(); \
+ this_cpu_ptr(&var); }))
+
+#define put_local_var(var) do { \
+ (void)&(var); \
+ migrate_enable(); \
+} while (0)
+
+# define get_local_ptr(var) ({ \
+ migrate_disable(); \
+ this_cpu_ptr(var); })
+
+# define put_local_ptr(var) do { \
+ (void)(var); \
+ migrate_enable(); \
+} while (0)
+
+#else
+
+#define get_local_var(var) get_cpu_var(var)
+#define put_local_var(var) put_cpu_var(var)
+#define get_local_ptr(var) get_cpu_ptr(var)
+#define put_local_ptr(var) put_cpu_ptr(var)
+
+#endif
+
/* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
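Not part of the patch: a minimal userspace sketch of the owner/nestcnt scheme used by __local_lock()/__local_unlock() above, with a pthread mutex standing in for the RT spinlock. Purely illustrative; the struct and function names are ad-hoc, not kernel APIs.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

/*
 * Userspace analogue of struct local_irq_lock: a sleeping lock plus owner
 * tracking and a nesting count, so the owner may re-acquire it recursively.
 */
struct local_lock {
	pthread_mutex_t lock;
	pthread_t owner;
	int owned;
	int nestcnt;
};

static void local_lock_acquire(struct local_lock *lv)
{
	if (!lv->owned || !pthread_equal(lv->owner, pthread_self())) {
		pthread_mutex_lock(&lv->lock);
		lv->owner = pthread_self();
		lv->owned = 1;
	}
	lv->nestcnt++;
}

static void local_lock_release(struct local_lock *lv)
{
	assert(lv->nestcnt > 0);
	if (--lv->nestcnt)
		return;
	lv->owned = 0;
	pthread_mutex_unlock(&lv->lock);
}

int main(void)
{
	struct local_lock lv = { .lock = PTHREAD_MUTEX_INITIALIZER };

	local_lock_acquire(&lv);	/* outer section */
	local_lock_acquire(&lv);	/* nested section, no deadlock */
	printf("nestcnt=%d\n", lv.nestcnt);
	local_lock_release(&lv);
	local_lock_release(&lv);
	return 0;
}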

View File

@@ -0,0 +1,42 @@
From: Julia Cartwright <julia@ni.com>
Date: Mon, 7 May 2018 08:58:56 -0500
Subject: [PATCH] locallock: provide {get,put}_locked_ptr() variants
Provide a set of locallocked accessors for pointers to per-CPU data;
this is useful for dynamically-allocated per-CPU regions, for example.
These are symmetric with the {get,put}_cpu_ptr() per-CPU accessor
variants.
Signed-off-by: Julia Cartwright <julia@ni.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/locallock.h | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/include/linux/locallock.h
+++ b/include/linux/locallock.h
@@ -222,6 +222,14 @@ static inline int __local_unlock_irqrest
#define put_locked_var(lvar, var) local_unlock(lvar);
+#define get_locked_ptr(lvar, var) \
+ ({ \
+ local_lock(lvar); \
+ this_cpu_ptr(var); \
+ })
+
+#define put_locked_ptr(lvar, var) local_unlock(lvar);
+
#define local_lock_cpu(lvar) \
({ \
local_lock(lvar); \
@@ -262,6 +270,8 @@ static inline void local_irq_lock_init(i
#define get_locked_var(lvar, var) get_cpu_var(var)
#define put_locked_var(lvar, var) put_cpu_var(var)
+#define get_locked_ptr(lvar, var) get_cpu_ptr(var)
+#define put_locked_ptr(lvar, var) put_cpu_ptr(var)
#define local_lock_cpu(lvar) get_cpu()
#define local_unlock_cpu(lvar) put_cpu()

View File

@@ -0,0 +1,23 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 3 Jul 2009 08:44:34 -0500
Subject: mm/scatterlist: Do not disable irqs on RT
For -RT it is enough to keep pagefault disabled (which is currently handled by
kmap_atomic()).
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
lib/scatterlist.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -776,7 +776,7 @@ void sg_miter_stop(struct sg_mapping_ite
flush_kernel_dcache_page(miter->page);
if (miter->__flags & SG_MITER_ATOMIC) {
- WARN_ON_ONCE(preemptible());
+ WARN_ON_ONCE(!pagefault_disabled());
kunmap_atomic(miter->addr);
} else
kunmap(miter->page);

View File

@@ -0,0 +1,142 @@
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 14 Jul 2015 14:26:34 +0200
Subject: signal/x86: Delay calling signals in atomic
On x86_64 we must disable preemption before we enable interrupts
for stack faults, int3 and debugging, because the current task is using
a per CPU debug stack defined by the IST. If we schedule out, another task
can come in and use the same stack and cause the stack to be corrupted
and crash the kernel on return.
When CONFIG_PREEMPT_RT_FULL is enabled, spin_locks become mutexes, and
one of these is the spin lock used in signal handling.
Some of the debug code (int3) causes do_trap() to send a signal.
This function takes a spin lock that has been converted to a mutex
and thus may sleep. If this happens, the stack corruption described
above becomes possible.
Instead of sending the signal right away, for PREEMPT_RT and x86_64
the signal information is stored in the task's task_struct and
TIF_NOTIFY_RESUME is set. Then, on exit from the trap, the signal resume
code sends the signal once preemption is enabled.
[ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT_FULL to
ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/entry/common.c | 7 +++++++
arch/x86/include/asm/signal.h | 13 +++++++++++++
include/linux/sched.h | 4 ++++
kernel/signal.c | 37 +++++++++++++++++++++++++++++++++++--
4 files changed, 59 insertions(+), 2 deletions(-)
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -151,6 +151,13 @@ static void exit_to_usermode_loop(struct
if (cached_flags & _TIF_NEED_RESCHED)
schedule();
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (unlikely(current->forced_info.si_signo)) {
+ struct task_struct *t = current;
+ force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
+ t->forced_info.si_signo = 0;
+ }
+#endif
if (cached_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -28,6 +28,19 @@ typedef struct {
#define SA_IA32_ABI 0x02000000u
#define SA_X32_ABI 0x01000000u
+/*
+ * Because some traps use the IST stack, we must keep preemption
+ * disabled while calling do_trap(), but do_trap() may call
+ * force_sig_info() which will grab the signal spin_locks for the
+ * task, which in PREEMPT_RT_FULL are mutexes. By defining
+ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
+ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
+ * trap.
+ */
+#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64)
+#define ARCH_RT_DELAYS_SIGNAL_SEND
+#endif
+
#ifndef CONFIG_COMPAT
typedef sigset_t compat_sigset_t;
#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -881,6 +881,10 @@ struct task_struct {
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /* TODO: move me into ->restart_block ? */
+ struct siginfo forced_info;
+#endif
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1268,8 +1268,8 @@ int do_send_sig_info(int sig, struct sig
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
-int
-force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+static int
+do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
int ret, blocked, ignored;
@@ -1298,6 +1298,39 @@ force_sig_info(int sig, struct siginfo *
return ret;
}
+int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+/*
+ * On some archs, PREEMPT_RT has to delay sending a signal from a trap
+ * since it can not enable preemption, and the signal code's spin_locks
+ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
+ * send the signal on exit of the trap.
+ */
+#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
+ if (in_atomic()) {
+ if (WARN_ON_ONCE(t != current))
+ return 0;
+ if (WARN_ON_ONCE(t->forced_info.si_signo))
+ return 0;
+
+ if (is_si_special(info)) {
+ WARN_ON_ONCE(info != SEND_SIG_PRIV);
+ t->forced_info.si_signo = sig;
+ t->forced_info.si_errno = 0;
+ t->forced_info.si_code = SI_KERNEL;
+ t->forced_info.si_pid = 0;
+ t->forced_info.si_uid = 0;
+ } else {
+ t->forced_info = *info;
+ }
+
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+ return 0;
+ }
+#endif
+ return do_force_sig_info(sig, info, t);
+}
+
/*
* Nuke all other threads in the group.
*/
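Not part of the patch: a userspace analogue of the deferred-delivery idea, with a toy struct task standing in for task_struct and a flag standing in for TIF_NOTIFY_RESUME. A sketch of the control flow only, under those assumptions.

#include <stdbool.h>
#include <stdio.h>

/*
 * When "in atomic" context, record the signal in a per-task slot instead of
 * delivering it, and flush it later from a point where sleeping is allowed.
 */
struct task {
	bool in_atomic;
	int forced_signo;	/* 0 means "nothing pending" */
};

static void deliver_signal(struct task *t, int sig)
{
	printf("delivering signal %d (may sleep)\n", sig);
	(void)t;
}

static void force_sig(struct task *t, int sig)
{
	if (t->in_atomic) {
		/* Mirrors setting ->forced_info plus TIF_NOTIFY_RESUME. */
		t->forced_signo = sig;
		printf("atomic context: deferring signal %d\n", sig);
		return;
	}
	deliver_signal(t, sig);
}

static void exit_to_usermode(struct task *t)
{
	if (t->forced_signo) {
		deliver_signal(t, t->forced_signo);
		t->forced_signo = 0;
	}
}

int main(void)
{
	struct task t = { .in_atomic = true };

	force_sig(&t, 5);	/* e.g. SIGTRAP from an int3 handler */
	t.in_atomic = false;
	exit_to_usermode(&t);	/* deferred delivery happens here */
	return 0;
}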

View File

@@ -0,0 +1,42 @@
From: Yang Shi <yang.shi@linaro.org>
Date: Thu, 10 Dec 2015 10:58:51 -0800
Subject: x86/signal: delay calling signals on 32bit
When running some ptrace single-step tests on an x86-32 machine, the below problem
is triggered:
BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:917
in_atomic(): 1, irqs_disabled(): 0, pid: 1041, name: dummy2
Preemption disabled at:[<c100326f>] do_debug+0x1f/0x1a0
CPU: 10 PID: 1041 Comm: dummy2 Tainted: G W 4.1.13-rt13 #1
Call Trace:
[<c1aa8306>] dump_stack+0x46/0x5c
[<c1080517>] ___might_sleep+0x137/0x220
[<c1ab0eff>] rt_spin_lock+0x1f/0x80
[<c1064b5a>] do_force_sig_info+0x2a/0xc0
[<c106567d>] force_sig_info+0xd/0x10
[<c1010cff>] send_sigtrap+0x6f/0x80
[<c10033b1>] do_debug+0x161/0x1a0
[<c1ab2921>] debug_stack_correct+0x2e/0x35
This happens since 959274753857 ("x86, traps: Track entry into and exit
from IST context") which was merged in v4.1-rc1.
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/include/asm/signal.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -37,7 +37,7 @@ typedef struct {
* TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
* trap.
*/
-#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64)
+#if defined(CONFIG_PREEMPT_RT_FULL)
#define ARCH_RT_DELAYS_SIGNAL_SEND
#endif

View File

@@ -0,0 +1,184 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 Mar 2011 09:18:52 +0100
Subject: buffer_head: Replace bh_uptodate_lock for -rt
Wrap the bit_spin_lock calls into a separate inline and add the RT
replacements with a real spinlock.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
fs/buffer.c | 21 +++++++--------------
fs/ext4/page-io.c | 6 ++----
fs/ntfs/aops.c | 10 +++-------
include/linux/buffer_head.h | 34 ++++++++++++++++++++++++++++++++++
4 files changed, 46 insertions(+), 25 deletions(-)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -273,8 +273,7 @@ static void end_buffer_async_read(struct
* decide that the page is now completely done.
*/
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ flags = bh_uptodate_lock_irqsave(first);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -287,8 +286,7 @@ static void end_buffer_async_read(struct
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ bh_uptodate_unlock_irqrestore(first, flags);
/*
* If none of the buffers had errors and they are all
@@ -300,9 +298,7 @@ static void end_buffer_async_read(struct
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
- return;
+ bh_uptodate_unlock_irqrestore(first, flags);
}
/*
@@ -329,8 +325,7 @@ void end_buffer_async_write(struct buffe
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ flags = bh_uptodate_lock_irqsave(first);
clear_buffer_async_write(bh);
unlock_buffer(bh);
@@ -342,15 +337,12 @@ void end_buffer_async_write(struct buffe
}
tmp = tmp->b_this_page;
}
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ bh_uptodate_unlock_irqrestore(first, flags);
end_page_writeback(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
- return;
+ bh_uptodate_unlock_irqrestore(first, flags);
}
EXPORT_SYMBOL(end_buffer_async_write);
@@ -3360,6 +3352,7 @@ struct buffer_head *alloc_buffer_head(gf
struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
+ buffer_head_init_locks(ret);
preempt_disable();
__this_cpu_inc(bh_accounting.nr);
recalc_bh_state();
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *
* We check all buffers in the page under BH_Uptodate_Lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ flags = bh_uptodate_lock_irqsave(head);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *
if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ bh_uptodate_unlock_irqrestore(head, flags);
if (!under_io) {
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (data_page)
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -106,8 +106,7 @@ static void ntfs_end_buffer_async_read(s
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ flags = bh_uptodate_lock_irqsave(first);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -122,8 +121,7 @@ static void ntfs_end_buffer_async_read(s
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ bh_uptodate_unlock_irqrestore(first, flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
@@ -156,9 +154,7 @@ static void ntfs_end_buffer_async_read(s
unlock_page(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
- return;
+ bh_uptodate_unlock_irqrestore(first, flags);
}
/**
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -76,8 +76,42 @@ struct buffer_head {
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
+#ifdef CONFIG_PREEMPT_RT_BASE
+ spinlock_t b_uptodate_lock;
+#endif
};
+static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
+{
+ unsigned long flags;
+
+#ifndef CONFIG_PREEMPT_RT_BASE
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
+#else
+ spin_lock_irqsave(&bh->b_uptodate_lock, flags);
+#endif
+ return flags;
+}
+
+static inline void
+bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
+{
+#ifndef CONFIG_PREEMPT_RT_BASE
+ bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
+ local_irq_restore(flags);
+#else
+ spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
+#endif
+}
+
+static inline void buffer_head_init_locks(struct buffer_head *bh)
+{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ spin_lock_init(&bh->b_uptodate_lock);
+#endif
+}
+
/*
* macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
* and buffer_foo() functions.
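Not part of the patch: a userspace sketch of the helper-selection pattern above, where callers only see lock/unlock helpers and the backing primitive is chosen at compile time. Here pthread locks stand in for the bit spinlock and the RT spinlock, and the USE_RT_VARIANT switch is an assumption for illustration.

#include <pthread.h>
#include <stdio.h>

#define USE_RT_VARIANT 1

struct buf_head {
#if USE_RT_VARIANT
	pthread_mutex_t uptodate_lock;		/* "real" sleeping lock */
#else
	pthread_spinlock_t uptodate_lock;	/* spinning lock */
#endif
};

static void buf_head_init(struct buf_head *bh)
{
#if USE_RT_VARIANT
	pthread_mutex_init(&bh->uptodate_lock, NULL);
#else
	pthread_spin_init(&bh->uptodate_lock, PTHREAD_PROCESS_PRIVATE);
#endif
}

static void bh_uptodate_lock(struct buf_head *bh)
{
#if USE_RT_VARIANT
	pthread_mutex_lock(&bh->uptodate_lock);
#else
	pthread_spin_lock(&bh->uptodate_lock);
#endif
}

static void bh_uptodate_unlock(struct buf_head *bh)
{
#if USE_RT_VARIANT
	pthread_mutex_unlock(&bh->uptodate_lock);
#else
	pthread_spin_unlock(&bh->uptodate_lock);
#endif
}

int main(void)
{
	struct buf_head bh;

	buf_head_init(&bh);
	bh_uptodate_lock(&bh);
	printf("critical section under the selected lock variant\n");
	bh_uptodate_unlock(&bh);
	return 0;
}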

View File

@@ -0,0 +1,96 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 Mar 2011 10:11:25 +0100
Subject: fs: jbd/jbd2: Make state lock and journal head lock rt safe
bit_spin_locks break under RT.
Based on a previous patch from Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--
include/linux/buffer_head.h | 8 ++++++++
include/linux/jbd2.h | 24 ++++++++++++++++++++++++
2 files changed, 32 insertions(+)
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -78,6 +78,10 @@ struct buffer_head {
atomic_t b_count; /* users using this buffer_head */
#ifdef CONFIG_PREEMPT_RT_BASE
spinlock_t b_uptodate_lock;
+#if IS_ENABLED(CONFIG_JBD2)
+ spinlock_t b_state_lock;
+ spinlock_t b_journal_head_lock;
+#endif
#endif
};
@@ -109,6 +113,10 @@ static inline void buffer_head_init_lock
{
#ifdef CONFIG_PREEMPT_RT_BASE
spin_lock_init(&bh->b_uptodate_lock);
+#if IS_ENABLED(CONFIG_JBD2)
+ spin_lock_init(&bh->b_state_lock);
+ spin_lock_init(&bh->b_journal_head_lock);
+#endif
#endif
}
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh
static inline void jbd_lock_bh_state(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
bit_spin_lock(BH_State, &bh->b_state);
+#else
+ spin_lock(&bh->b_state_lock);
+#endif
}
static inline int jbd_trylock_bh_state(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
return bit_spin_trylock(BH_State, &bh->b_state);
+#else
+ return spin_trylock(&bh->b_state_lock);
+#endif
}
static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
return bit_spin_is_locked(BH_State, &bh->b_state);
+#else
+ return spin_is_locked(&bh->b_state_lock);
+#endif
}
static inline void jbd_unlock_bh_state(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
bit_spin_unlock(BH_State, &bh->b_state);
+#else
+ spin_unlock(&bh->b_state_lock);
+#endif
}
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
bit_spin_lock(BH_JournalHead, &bh->b_state);
+#else
+ spin_lock(&bh->b_journal_head_lock);
+#endif
}
static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
bit_spin_unlock(BH_JournalHead, &bh->b_state);
+#else
+ spin_unlock(&bh->b_journal_head_lock);
+#endif
}
#define J_ASSERT(assert) BUG_ON(!(assert))

View File

@@ -0,0 +1,114 @@
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 21 Jun 2013 15:07:25 -0400
Subject: list_bl: Make list head locking RT safe
As per changes in include/linux/jbd_common.h for avoiding the
bit_spin_locks on RT ("fs: jbd/jbd2: Make state lock and journal
head lock rt safe") we do the same thing here.
We use the non-atomic __set_bit and __clear_bit inside the scope of
the lock to preserve the ability of the existing LIST_DEBUG code to
use the zeroth bit in the sanity checks.
As a bit spinlock, we had no lockdep visibility into the usage
of the list head locking. Now, if we were to implement it as a
standard non-raw spinlock, we would see:
BUG: sleeping function called from invalid context at kernel/rtmutex.c:658
in_atomic(): 1, irqs_disabled(): 0, pid: 122, name: udevd
5 locks held by udevd/122:
#0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [<ffffffff811967e8>] lock_rename+0xe8/0xf0
#1: (rename_lock){+.+...}, at: [<ffffffff811a277c>] d_move+0x2c/0x60
#2: (&dentry->d_lock){+.+...}, at: [<ffffffff811a0763>] dentry_lock_for_move+0xf3/0x130
#3: (&dentry->d_lock/2){+.+...}, at: [<ffffffff811a0734>] dentry_lock_for_move+0xc4/0x130
#4: (&dentry->d_lock/3){+.+...}, at: [<ffffffff811a0747>] dentry_lock_for_move+0xd7/0x130
Pid: 122, comm: udevd Not tainted 3.4.47-rt62 #7
Call Trace:
[<ffffffff810b9624>] __might_sleep+0x134/0x1f0
[<ffffffff817a24d4>] rt_spin_lock+0x24/0x60
[<ffffffff811a0c4c>] __d_shrink+0x5c/0xa0
[<ffffffff811a1b2d>] __d_drop+0x1d/0x40
[<ffffffff811a24be>] __d_move+0x8e/0x320
[<ffffffff811a278e>] d_move+0x3e/0x60
[<ffffffff81199598>] vfs_rename+0x198/0x4c0
[<ffffffff8119b093>] sys_renameat+0x213/0x240
[<ffffffff817a2de5>] ? _raw_spin_unlock+0x35/0x60
[<ffffffff8107781c>] ? do_page_fault+0x1ec/0x4b0
[<ffffffff817a32ca>] ? retint_swapgs+0xe/0x13
[<ffffffff813eb0e6>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[<ffffffff8119b0db>] sys_rename+0x1b/0x20
[<ffffffff817a3b96>] system_call_fastpath+0x1a/0x1f
Since we are only taking the lock during short-lived list operations,
let's assume for now that it being raw won't be a significant latency
concern.
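Not part of the patch: a userspace sketch of the scheme described above, where a real lock provides mutual exclusion while bit 0 of the head pointer is still set and cleared (non-atomically, under that lock) so debug code inspecting the bit keeps working. The types and names are ad-hoc, not the kernel's.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct bl_head {
	uintptr_t first;		/* low bit doubles as "locked" flag */
	pthread_mutex_t lock;
};

static void bl_lock(struct bl_head *b)
{
	pthread_mutex_lock(&b->lock);
	b->first |= 1UL;		/* non-atomic __set_bit() analogue */
}

static void bl_unlock(struct bl_head *b)
{
	b->first &= ~1UL;		/* non-atomic __clear_bit() analogue */
	pthread_mutex_unlock(&b->lock);
}

static int bl_is_locked(const struct bl_head *b)
{
	return b->first & 1UL;
}

int main(void)
{
	struct bl_head h = { .first = 0, .lock = PTHREAD_MUTEX_INITIALIZER };

	bl_lock(&h);
	printf("locked bit visible: %d\n", bl_is_locked(&h));
	bl_unlock(&h);
	printf("locked bit cleared: %d\n", bl_is_locked(&h));
	return 0;
}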
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/list_bl.h | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -3,6 +3,7 @@
#define _LINUX_LIST_BL_H
#include <linux/list.h>
+#include <linux/spinlock.h>
#include <linux/bit_spinlock.h>
/*
@@ -33,13 +34,22 @@
struct hlist_bl_head {
struct hlist_bl_node *first;
+#ifdef CONFIG_PREEMPT_RT_BASE
+ raw_spinlock_t lock;
+#endif
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
-#define INIT_HLIST_BL_HEAD(ptr) \
- ((ptr)->first = NULL)
+
+static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h)
+{
+ h->first = NULL;
+#ifdef CONFIG_PREEMPT_RT_BASE
+ raw_spin_lock_init(&h->lock);
+#endif
+}
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
@@ -119,12 +129,26 @@ static inline void hlist_bl_del_init(str
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
bit_spin_lock(0, (unsigned long *)b);
+#else
+ raw_spin_lock(&b->lock);
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ __set_bit(0, (unsigned long *)b);
+#endif
+#endif
}
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
+#ifndef CONFIG_PREEMPT_RT_BASE
__bit_spin_unlock(0, (unsigned long *)b);
+#else
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ __clear_bit(0, (unsigned long *)b);
+#endif
+ raw_spin_unlock(&b->lock);
+#endif
}
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)

View File

@@ -0,0 +1,97 @@
From: Josh Cartwright <joshc@ni.com>
Date: Thu, 31 Mar 2016 00:04:25 -0500
Subject: [PATCH] list_bl: fixup bogus lockdep warning
At first glance, the use of 'static inline' seems appropriate for
INIT_HLIST_BL_HEAD().
However, when a 'static inline' function invocation is inlined by gcc,
all callers share any static local data declared within that inline
function.
This presents a problem for how lockdep classes are setup. raw_spinlocks, for
example, when CONFIG_DEBUG_SPINLOCK,
# define raw_spin_lock_init(lock) \
do { \
static struct lock_class_key __key; \
\
__raw_spin_lock_init((lock), #lock, &__key); \
} while (0)
When this macro is expanded into a 'static inline' caller, like
INIT_HLIST_BL_HEAD():
static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h)
{
h->first = NULL;
raw_spin_lock_init(&h->lock);
}
...the static local lock_class_key object is made a function static.
For compilation units which invoke INIT_HLIST_BL_HEAD() more than once,
all of the invocations share this same static local object.
This can lead to some very confusing lockdep splats (example below).
Solve this problem by forcing the INIT_HLIST_BL_HEAD() to be a macro,
which prevents the lockdep class object sharing.
=============================================
[ INFO: possible recursive locking detected ]
4.4.4-rt11 #4 Not tainted
---------------------------------------------
kswapd0/59 is trying to acquire lock:
(&h->lock#2){+.+.-.}, at: mb_cache_shrink_scan
but task is already holding lock:
(&h->lock#2){+.+.-.}, at: mb_cache_shrink_scan
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&h->lock#2);
lock(&h->lock#2);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by kswapd0/59:
#0: (shrinker_rwsem){+.+...}, at: rt_down_read_trylock
#1: (&h->lock#2){+.+.-.}, at: mb_cache_shrink_scan
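Not part of the patch: a small userspace program demonstrating the shared-static-local pitfall described above. The function and variable names are illustrative only; in the kernel the hidden static object is the lock_class_key inside raw_spin_lock_init().

#include <stdio.h>

/*
 * A static local inside a "static inline" function is one object shared by
 * every caller in the compilation unit.
 */
static inline int *shared_key(void)
{
	static int key;		/* ONE object for all inlined call sites */
	return &key;
}

int main(void)
{
	int *a = shared_key();	/* "init" of object A */
	int *b = shared_key();	/* "init" of object B */

	/* Both "initializations" handed back the same static object. */
	printf("same key object: %s\n", a == b ? "yes" : "no");
	return 0;
}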
Reported-by: Luis Claudio R. Goncalves <lclaudio@uudg.org>
Tested-by: Luis Claudio R. Goncalves <lclaudio@uudg.org>
Signed-off-by: Josh Cartwright <joshc@ni.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/list_bl.h | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -43,13 +43,15 @@ struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
-static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h)
-{
- h->first = NULL;
#ifdef CONFIG_PREEMPT_RT_BASE
- raw_spin_lock_init(&h->lock);
+#define INIT_HLIST_BL_HEAD(h) \
+do { \
+ (h)->first = NULL; \
+ raw_spin_lock_init(&(h)->lock); \
+} while (0)
+#else
+#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
#endif
-}
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{

View File

@@ -0,0 +1,37 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:57 -0500
Subject: genirq: Disable irqpoll on -rt
Creates long latencies for no value
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/irq/spurious.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable ir
static int __init irqfixup_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
+#ifdef CONFIG_PREEMPT_RT_BASE
+ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
+ return 1;
+#endif
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");

View File

@@ -0,0 +1,45 @@
Subject: genirq: Force interrupt thread on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 03 Apr 2011 11:57:29 +0200
Force threaded IRQs and optimize the code (force_irqthreads) with regard
to this.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/interrupt.h | 4 ++++
kernel/irq/manage.c | 2 ++
2 files changed, 6 insertions(+)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -427,7 +427,11 @@ extern int irq_set_irqchip_state(unsigne
bool state);
#ifdef CONFIG_IRQ_FORCED_THREADING
+# ifdef CONFIG_PREEMPT_RT_BASE
+# define force_irqthreads (true)
+# else
extern bool force_irqthreads;
+# endif
#else
#define force_irqthreads (0)
#endif
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -23,6 +23,7 @@
#include "internals.h"
#ifdef CONFIG_IRQ_FORCED_THREADING
+# ifndef CONFIG_PREEMPT_RT_BASE
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
@@ -32,6 +33,7 @@ static int __init setup_forced_irqthread
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
+# endif
#endif
static void __synchronize_hardirq(struct irq_desc *desc)

View File

@@ -0,0 +1,166 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 May 2018 15:24:20 +0200
Subject: [PATCH 1/4] Split IRQ-off and zone->lock while freeing pages from PCP
list #1
Split the IRQ-off section that accesses the PCP list from the zone->lock
section while freeing pages.
Introduce isolate_pcp_pages(), which separates the pages from the PCP
list onto a temporary list; the temporary list is then freed via
free_pcppages_bulk().
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/page_alloc.c | 82 +++++++++++++++++++++++++++++++++++---------------------
1 file changed, 52 insertions(+), 30 deletions(-)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1095,7 +1095,7 @@ static inline void prefetch_buddy(struct
}
/*
- * Frees a number of pages from the PCP lists
+ * Frees a number of pages which have been collected from the pcp lists.
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
@@ -1106,14 +1106,41 @@ static inline void prefetch_buddy(struct
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+ struct list_head *head)
+{
+ bool isolated_pageblocks;
+ struct page *page, *tmp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
+ /*
+ * Use safe version since after __free_one_page(),
+ * page->lru.next will not point to original list.
+ */
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ int mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
+
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp,
+ struct list_head *dst)
+
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
- bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
while (count) {
struct list_head *list;
@@ -1145,7 +1172,7 @@ static void free_pcppages_bulk(struct zo
if (bulkfree_pcp_prepare(page))
continue;
- list_add_tail(&page->lru, &head);
+ list_add_tail(&page->lru, dst);
/*
* We are going to put the page back to the global
@@ -1160,26 +1187,6 @@ static void free_pcppages_bulk(struct zo
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
-
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
-
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, 0, mt);
- trace_mm_page_pcpu_drain(page, 0, mt);
- }
- spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
@@ -2536,13 +2543,18 @@ void drain_zone_pages(struct zone *zone,
{
unsigned long flags;
int to_drain, batch;
+ LIST_HEAD(dst);
local_irq_save(flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
+ isolate_pcp_pages(to_drain, pcp, &dst);
+
local_irq_restore(flags);
+
+ if (to_drain > 0)
+ free_pcppages_bulk(zone, to_drain, &dst);
}
#endif
@@ -2558,14 +2570,21 @@ static void drain_pages_zone(unsigned in
unsigned long flags;
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
+ LIST_HEAD(dst);
+ int count;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
+ count = pcp->count;
+ if (count)
+ isolate_pcp_pages(count, pcp, &dst);
+
local_irq_restore(flags);
+
+ if (count)
+ free_pcppages_bulk(zone, count, &dst);
}
/*
@@ -2787,7 +2806,10 @@ static void free_unref_page_commit(struc
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
+ LIST_HEAD(dst);
+
+ isolate_pcp_pages(batch, pcp, &dst);
+ free_pcppages_bulk(zone, batch, &dst);
}
}

View File

@@ -0,0 +1,165 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 May 2018 15:24:21 +0200
Subject: [PATCH 2/4] Split IRQ-off and zone->lock while freeing pages from PCP
list #2
Split the IRQ-off section that accesses the PCP list from the zone->lock
section while freeing pages.
Introduce isolate_pcp_pages(), which moves the pages from the PCP list
onto a temporary list; the temporary list is then freed via
free_pcppages_bulk().
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
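For readers unfamiliar with the pattern, here is a minimal userspace sketch (not kernel code; pcp_lock, zone_lock, isolate_items and friends are illustrative names) of what the split buys: items are detached from the per-CPU cache under one lock, and only the already-detached list is processed under the second lock, so the two critical sections never nest.

/*
 * Userspace analogue of isolate_pcp_pages() + free_pcppages_bulk():
 * detach under pcp_lock (standing in for the IRQ-off section), then
 * free under zone_lock. Illustrative names only.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int val; };

static struct item *pcp_list;            /* stands in for pcp->lists[] */
static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;

/* Step 1: detach up to 'count' items while holding only pcp_lock. */
static struct item *isolate_items(int count)
{
	struct item *head = NULL;

	pthread_mutex_lock(&pcp_lock);
	while (count-- && pcp_list) {
		struct item *it = pcp_list;

		pcp_list = it->next;
		it->next = head;
		head = it;
	}
	pthread_mutex_unlock(&pcp_lock);
	return head;
}

/* Step 2: free the detached items while holding only zone_lock. */
static void free_items_bulk(struct item *head)
{
	pthread_mutex_lock(&zone_lock);
	while (head) {
		struct item *next = head->next;

		printf("freeing %d\n", head->val);
		free(head);
		head = next;
	}
	pthread_mutex_unlock(&zone_lock);
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct item *it = malloc(sizeof(*it));

		it->val = i;
		it->next = pcp_list;
		pcp_list = it;
	}
	free_items_bulk(isolate_items(4));
	return 0;
}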
---
mm/page_alloc.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 50 insertions(+), 10 deletions(-)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1105,8 +1105,8 @@ static inline void prefetch_buddy(struct
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pcppages_bulk(struct zone *zone, int count,
- struct list_head *head)
+static void free_pcppages_bulk(struct zone *zone, struct list_head *head,
+ bool zone_retry)
{
bool isolated_pageblocks;
struct page *page, *tmp;
@@ -1121,12 +1121,27 @@ static void free_pcppages_bulk(struct zo
*/
list_for_each_entry_safe(page, tmp, head, lru) {
int mt = get_pcppage_migratetype(page);
+
+ if (page_zone(page) != zone) {
+ /*
+ * free_unref_page_list() sorts pages by zone. If we end
+ * up with pages from a different NUMA nodes belonging
+ * to the same ZONE index then we need to redo with the
+ * correct ZONE pointer. Skip the page for now, redo it
+ * on the next iteration.
+ */
+ WARN_ON_ONCE(zone_retry == false);
+ if (zone_retry)
+ continue;
+ }
+
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
+ list_del(&page->lru);
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
}
@@ -2554,7 +2569,7 @@ void drain_zone_pages(struct zone *zone,
local_irq_restore(flags);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, &dst);
+ free_pcppages_bulk(zone, &dst, false);
}
#endif
@@ -2584,7 +2599,7 @@ static void drain_pages_zone(unsigned in
local_irq_restore(flags);
if (count)
- free_pcppages_bulk(zone, count, &dst);
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -2777,7 +2792,8 @@ static bool free_unref_page_prepare(stru
return true;
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn)
+static void free_unref_page_commit(struct page *page, unsigned long pfn,
+ struct list_head *dst)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
@@ -2806,10 +2822,8 @@ static void free_unref_page_commit(struc
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
- LIST_HEAD(dst);
- isolate_pcp_pages(batch, pcp, &dst);
- free_pcppages_bulk(zone, batch, &dst);
+ isolate_pcp_pages(batch, pcp, dst);
}
}
@@ -2820,13 +2834,17 @@ void free_unref_page(struct page *page)
{
unsigned long flags;
unsigned long pfn = page_to_pfn(page);
+ struct zone *zone = page_zone(page);
+ LIST_HEAD(dst);
if (!free_unref_page_prepare(page, pfn))
return;
local_irq_save(flags);
- free_unref_page_commit(page, pfn);
+ free_unref_page_commit(page, pfn, &dst);
local_irq_restore(flags);
+ if (!list_empty(&dst))
+ free_pcppages_bulk(zone, &dst, false);
}
/*
@@ -2837,6 +2855,11 @@ void free_unref_page_list(struct list_he
struct page *page, *next;
unsigned long flags, pfn;
int batch_count = 0;
+ struct list_head dsts[__MAX_NR_ZONES];
+ int i;
+
+ for (i = 0; i < __MAX_NR_ZONES; i++)
+ INIT_LIST_HEAD(&dsts[i]);
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
@@ -2849,10 +2872,12 @@ void free_unref_page_list(struct list_he
local_irq_save(flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
+ enum zone_type type;
set_page_private(page, 0);
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn);
+ type = page_zonenum(page);
+ free_unref_page_commit(page, pfn, &dsts[type]);
/*
* Guard against excessive IRQ disabled times when we get
@@ -2865,6 +2890,21 @@ void free_unref_page_list(struct list_he
}
}
local_irq_restore(flags);
+
+ for (i = 0; i < __MAX_NR_ZONES; ) {
+ struct page *page;
+ struct zone *zone;
+
+ if (list_empty(&dsts[i])) {
+ i++;
+ continue;
+ }
+
+ page = list_first_entry(&dsts[i], struct page, lru);
+ zone = page_zone(page);
+
+ free_pcppages_bulk(zone, &dsts[i], true);
+ }
}
/*

View File

@@ -0,0 +1,608 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 28 May 2018 15:24:22 +0200
Subject: [PATCH 3/4] mm/SLxB: change list_lock to raw_spinlock_t
The list_lock is used with IRQs off on RT. Make it a raw_spinlock_t,
otherwise the interrupts won't be disabled on -RT. The locking rules remain
the same on !RT.
This patch changes it for SLAB and SLUB since both share the same header
file for the struct kmem_cache_node definition.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/slab.c | 94 +++++++++++++++++++++++++++++++-------------------------------
mm/slab.h | 2 -
mm/slub.c | 50 ++++++++++++++++----------------
3 files changed, 73 insertions(+), 73 deletions(-)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct
parent->shared = NULL;
parent->alien = NULL;
parent->colour_next = 0;
- spin_lock_init(&parent->list_lock);
+ raw_spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
}
@@ -600,9 +600,9 @@ static noinline void cache_free_pfmemall
page_node = page_to_nid(page);
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -730,7 +730,7 @@ static void __drain_alien_cache(struct k
struct kmem_cache_node *n = get_node(cachep, node);
if (ac->avail) {
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
/*
* Stuff objects into the remote nodes shared array first.
* That way we could avoid the overhead of putting the objects
@@ -741,7 +741,7 @@ static void __drain_alien_cache(struct k
free_block(cachep, ac->entry, ac->avail, node, list);
ac->avail = 0;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
}
}
@@ -814,9 +814,9 @@ static int __cache_free_alien(struct kme
slabs_destroy(cachep, &list);
} else {
n = get_node(cachep, page_node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, &objp, 1, page_node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
@@ -857,10 +857,10 @@ static int init_cache_node(struct kmem_c
*/
n = get_node(cachep, node);
if (n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
cachep->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
return 0;
}
@@ -939,7 +939,7 @@ static int setup_kmem_cache_node(struct
goto fail;
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
if (n->shared && force_change) {
free_block(cachep, n->shared->entry,
n->shared->avail, node, &list);
@@ -957,7 +957,7 @@ static int setup_kmem_cache_node(struct
new_alien = NULL;
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
/*
@@ -996,7 +996,7 @@ static void cpuup_canceled(long cpu)
if (!n)
continue;
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
@@ -1009,7 +1009,7 @@ static void cpuup_canceled(long cpu)
}
if (!cpumask_empty(mask)) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto free_slab;
}
@@ -1023,7 +1023,7 @@ static void cpuup_canceled(long cpu)
alien = n->alien;
n->alien = NULL;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
kfree(shared);
if (alien) {
@@ -1207,7 +1207,7 @@ static void __init init_list(struct kmem
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
- spin_lock_init(&ptr->list_lock);
+ raw_spin_lock_init(&ptr->list_lock);
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->node[nodeid] = ptr;
@@ -1378,11 +1378,11 @@ slab_out_of_memory(struct kmem_cache *ca
for_each_kmem_cache_node(cachep, node, n) {
unsigned long total_slabs, free_slabs, free_objs;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
total_slabs = n->total_slabs;
free_slabs = n->free_slabs;
free_objs = n->free_objects;
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
node, total_slabs - free_slabs, total_slabs,
@@ -2175,7 +2175,7 @@ static void check_spinlock_acquired(stru
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}
@@ -2183,7 +2183,7 @@ static void check_spinlock_acquired_node
{
#ifdef CONFIG_SMP
check_irq_off();
- assert_spin_locked(&get_node(cachep, node)->list_lock);
+ assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}
@@ -2223,9 +2223,9 @@ static void do_drain(void *arg)
check_irq_off();
ac = cpu_cache_get(cachep);
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail = 0;
}
@@ -2243,9 +2243,9 @@ static void drain_cpu_caches(struct kmem
drain_alien_cache(cachep, n->alien);
for_each_kmem_cache_node(cachep, node, n) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, n->shared, node, true, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -2267,10 +2267,10 @@ static int drain_freelist(struct kmem_ca
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
p = n->slabs_free.prev;
if (p == &n->slabs_free) {
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
goto out;
}
@@ -2283,7 +2283,7 @@ static int drain_freelist(struct kmem_ca
* to the cache.
*/
n->free_objects -= cache->num;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slab_destroy(cache, page);
nr_freed++;
}
@@ -2731,7 +2731,7 @@ static void cache_grow_end(struct kmem_c
INIT_LIST_HEAD(&page->lru);
n = get_node(cachep, page_to_nid(page));
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
list_add_tail(&page->lru, &(n->slabs_free));
@@ -2741,7 +2741,7 @@ static void cache_grow_end(struct kmem_c
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num - page->active;
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
}
@@ -2909,7 +2909,7 @@ static struct page *get_first_slab(struc
{
struct page *page;
- assert_spin_locked(&n->list_lock);
+ assert_raw_spin_locked(&n->list_lock);
page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
if (!page) {
n->free_touched = 1;
@@ -2935,10 +2935,10 @@ static noinline void *cache_alloc_pfmema
if (!gfp_pfmemalloc_allowed(flags))
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, true);
if (!page) {
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return NULL;
}
@@ -2947,7 +2947,7 @@ static noinline void *cache_alloc_pfmema
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
@@ -3006,7 +3006,7 @@ static void *cache_alloc_refill(struct k
if (!n->free_objects && (!shared || !shared->avail))
goto direct_grow;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
shared = READ_ONCE(n->shared);
/* See if we can refill from the shared array */
@@ -3030,7 +3030,7 @@ static void *cache_alloc_refill(struct k
must_grow:
n->free_objects -= ac->avail;
alloc_done:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
direct_grow:
@@ -3255,7 +3255,7 @@ static void *____cache_alloc_node(struct
BUG_ON(!n);
check_irq_off();
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3273,12 +3273,12 @@ static void *____cache_alloc_node(struct
fixup_slab_list(cachep, n, page, &list);
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
return obj;
must_grow:
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
if (page) {
/* This slab isn't counted yet so don't update free_objects */
@@ -3454,7 +3454,7 @@ static void cache_flusharray(struct kmem
check_irq_off();
n = get_node(cachep, node);
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
if (n->shared) {
struct array_cache *shared_array = n->shared;
int max = shared_array->limit - shared_array->avail;
@@ -3483,7 +3483,7 @@ static void cache_flusharray(struct kmem
STATS_SET_FREEABLE(cachep, i);
}
#endif
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
@@ -3893,9 +3893,9 @@ static int __do_tune_cpucache(struct kme
node = cpu_to_mem(cpu);
n = get_node(cachep, node);
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
free_percpu(prev);
@@ -4020,9 +4020,9 @@ static void drain_array(struct kmem_cach
return;
}
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
drain_array_locked(cachep, ac, node, false, &list);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -4106,7 +4106,7 @@ void get_slabinfo(struct kmem_cache *cac
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
total_slabs += n->total_slabs;
free_slabs += n->free_slabs;
@@ -4115,7 +4115,7 @@ void get_slabinfo(struct kmem_cache *cac
if (n->shared)
shared_avail += n->shared->avail;
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
num_objs = total_slabs * cachep->num;
active_slabs = total_slabs - free_slabs;
@@ -4330,13 +4330,13 @@ static int leaks_show(struct seq_file *m
for_each_kmem_cache_node(cachep, node, n) {
check_irq_on();
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
list_for_each_entry(page, &n->slabs_full, lru)
handle_slab(x, cachep, page);
list_for_each_entry(page, &n->slabs_partial, lru)
handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
}
} while (!is_store_user_clean(cachep));
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -453,7 +453,7 @@ static inline void slab_post_alloc_hook(
* The slab lists for all objects.
*/
struct kmem_cache_node {
- spinlock_t list_lock;
+ raw_spinlock_t list_lock;
#ifdef CONFIG_SLAB
struct list_head slabs_partial; /* partial list first, better asm code */
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1167,7 +1167,7 @@ static noinline int free_debug_processin
unsigned long uninitialized_var(flags);
int ret = 0;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
slab_lock(page);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
@@ -1202,7 +1202,7 @@ static noinline int free_debug_processin
bulk_cnt, cnt);
slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
return ret;
@@ -1802,7 +1802,7 @@ static void *get_partial_node(struct kme
if (!n || !n->nr_partial)
return NULL;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
list_for_each_entry_safe(page, page2, &n->partial, lru) {
void *t;
@@ -1827,7 +1827,7 @@ static void *get_partial_node(struct kme
break;
}
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
return object;
}
@@ -2073,7 +2073,7 @@ static void deactivate_slab(struct kmem_
* that acquire_slab() will see a slab page that
* is frozen
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
} else {
m = M_FULL;
@@ -2084,7 +2084,7 @@ static void deactivate_slab(struct kmem_
* slabs from diagnostic functions will not see
* any frozen slabs.
*/
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
}
@@ -2119,7 +2119,7 @@ static void deactivate_slab(struct kmem_
goto redo;
if (lock)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
@@ -2154,10 +2154,10 @@ static void unfreeze_partials(struct kme
n2 = get_node(s, page_to_nid(page));
if (n != n2) {
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
n = n2;
- spin_lock(&n->list_lock);
+ raw_spin_lock(&n->list_lock);
}
do {
@@ -2186,7 +2186,7 @@ static void unfreeze_partials(struct kme
}
if (n)
- spin_unlock(&n->list_lock);
+ raw_spin_unlock(&n->list_lock);
while (discard_page) {
page = discard_page;
@@ -2355,10 +2355,10 @@ static unsigned long count_partial(struc
unsigned long x = 0;
struct page *page;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
x += get_count(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
@@ -2793,7 +2793,7 @@ static void __slab_free(struct kmem_cach
do {
if (unlikely(n)) {
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
prior = page->freelist;
@@ -2825,7 +2825,7 @@ static void __slab_free(struct kmem_cach
* Otherwise the list_lock will synchronize with
* other processors updating the list of slabs.
*/
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
}
}
@@ -2867,7 +2867,7 @@ static void __slab_free(struct kmem_cach
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return;
slab_empty:
@@ -2882,7 +2882,7 @@ static void __slab_free(struct kmem_cach
remove_full(s, n, page);
}
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
}
@@ -3269,7 +3269,7 @@ static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
n->nr_partial = 0;
- spin_lock_init(&n->list_lock);
+ raw_spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
@@ -3653,7 +3653,7 @@ static void free_partial(struct kmem_cac
struct page *page, *h;
BUG_ON(irqs_disabled());
- spin_lock_irq(&n->list_lock);
+ raw_spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
remove_partial(n, page);
@@ -3663,7 +3663,7 @@ static void free_partial(struct kmem_cac
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
- spin_unlock_irq(&n->list_lock);
+ raw_spin_unlock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &discard, lru)
discard_slab(s, page);
@@ -3936,7 +3936,7 @@ int __kmem_cache_shrink(struct kmem_cach
for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
INIT_LIST_HEAD(promote + i);
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
/*
* Build lists of slabs to discard or promote.
@@ -3967,7 +3967,7 @@ int __kmem_cache_shrink(struct kmem_cach
for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
list_splice(promote + i, &n->partial);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
list_for_each_entry_safe(page, t, &discard, lru)
@@ -4381,7 +4381,7 @@ static int validate_slab_node(struct kme
struct page *page;
unsigned long flags;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru) {
validate_slab_slab(s, page, map);
@@ -4403,7 +4403,7 @@ static int validate_slab_node(struct kme
s->name, count, atomic_long_read(&n->nr_slabs));
out:
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
return count;
}
@@ -4593,12 +4593,12 @@ static int list_locations(struct kmem_ca
if (!atomic_long_read(&n->nr_slabs))
continue;
- spin_lock_irqsave(&n->list_lock, flags);
+ raw_spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, lru)
process_slab(&t, s, page, alloc, map);
list_for_each_entry(page, &n->full, lru)
process_slab(&t, s, page, alloc, map);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ raw_spin_unlock_irqrestore(&n->list_lock, flags);
}
for (i = 0; i < t.count; i++) {

View File

@@ -0,0 +1,216 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 21 Jun 2018 17:29:19 +0200
Subject: [PATCH 4/4] mm/SLUB: delay giving back empty slubs to IRQ enabled
regions
__free_slab() is invoked with disabled interrupts which increases the
irq-off time while __free_pages() is doing the work.
Allow __free_slab() to be invoked with enabled interrupts and move
everything from interrupts-off invocations to a temporary per-CPU list
so it can be processed later.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
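A minimal userspace sketch of the deferral, assuming illustrative names (deferred_lock, free_delayed) rather than the kernel API: when the current context must not call the expensive free path, the object is parked on a list and released later from a context where it is safe.

/* Userspace analogue of the slub_free_list deferral above. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct slab { struct slab *next; };

static struct slab *deferred;                 /* stands in for slub_free_list */
static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_slab(struct slab *s, bool atomic_context)
{
	if (atomic_context) {
		/* Cannot run the expensive free path here: defer it. */
		pthread_mutex_lock(&deferred_lock);
		s->next = deferred;
		deferred = s;
		pthread_mutex_unlock(&deferred_lock);
	} else {
		free(s);
	}
}

static void free_delayed(void)
{
	pthread_mutex_lock(&deferred_lock);
	struct slab *list = deferred;
	deferred = NULL;
	pthread_mutex_unlock(&deferred_lock);

	while (list) {
		struct slab *next = list->next;

		free(list);
		list = next;
	}
}

int main(void)
{
	free_slab(malloc(sizeof(struct slab)), true);   /* deferred */
	free_slab(malloc(sizeof(struct slab)), false);  /* freed immediately */
	free_delayed();                                 /* drains the backlog */
	printf("done\n");
	return 0;
}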
---
mm/slub.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 69 insertions(+), 5 deletions(-)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1330,6 +1330,12 @@ static inline void dec_slabs_node(struct
#endif /* CONFIG_SLUB_DEBUG */
+struct slub_free_list {
+ raw_spinlock_t lock;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
+
/*
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
@@ -1684,6 +1690,16 @@ static void __free_slab(struct kmem_cach
__free_pages(page, order);
}
+static void free_delayed(struct list_head *h)
+{
+ while (!list_empty(h)) {
+ struct page *page = list_first_entry(h, struct page, lru);
+
+ list_del(&page->lru);
+ __free_slab(page->slab_cache, page);
+ }
+}
+
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page = container_of(h, struct page, rcu_head);
@@ -1695,6 +1711,12 @@ static void free_slab(struct kmem_cache
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
call_rcu(&page->rcu_head, rcu_free_slab);
+ } else if (irqs_disabled()) {
+ struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
+
+ raw_spin_lock(&f->lock);
+ list_add(&page->lru, &f->list);
+ raw_spin_unlock(&f->lock);
} else
__free_slab(s, page);
}
@@ -2223,14 +2245,21 @@ static void put_cpu_partial(struct kmem_
pobjects = oldpage->pobjects;
pages = oldpage->pages;
if (drain && pobjects > s->cpu_partial) {
+ struct slub_free_list *f;
unsigned long flags;
+ LIST_HEAD(tofree);
/*
* partial array is full. Move the existing
* set to the per node partial list.
*/
local_irq_save(flags);
unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock(&f->lock);
local_irq_restore(flags);
+ free_delayed(&tofree);
oldpage = NULL;
pobjects = 0;
pages = 0;
@@ -2300,7 +2329,22 @@ static bool has_cpu_slab(int cpu, void *
static void flush_all(struct kmem_cache *s)
{
+ LIST_HEAD(tofree);
+ int cpu;
+
on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+ for_each_online_cpu(cpu) {
+ struct slub_free_list *f;
+
+ if (!has_cpu_slab(cpu, s))
+ continue;
+
+ f = &per_cpu(slub_free_list, cpu);
+ raw_spin_lock_irq(&f->lock);
+ list_splice_init(&f->list, &tofree);
+ raw_spin_unlock_irq(&f->lock);
+ free_delayed(&tofree);
+ }
}
/*
@@ -2498,8 +2542,10 @@ static inline void *get_freelist(struct
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c,
+ struct list_head *to_free)
{
+ struct slub_free_list *f;
void *freelist;
struct page *page;
@@ -2555,6 +2601,13 @@ static void *___slab_alloc(struct kmem_c
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
+
+out:
+ f = this_cpu_ptr(&slub_free_list);
+ raw_spin_lock(&f->lock);
+ list_splice_init(&f->list, to_free);
+ raw_spin_unlock(&f->lock);
+
return freelist;
new_slab:
@@ -2570,7 +2623,7 @@ static void *___slab_alloc(struct kmem_c
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- return NULL;
+ goto out;
}
page = c->page;
@@ -2583,7 +2636,7 @@ static void *___slab_alloc(struct kmem_c
goto new_slab; /* Slab failed checks. Next slab needed */
deactivate_slab(s, page, get_freepointer(s, freelist), c);
- return freelist;
+ goto out;
}
/*
@@ -2595,6 +2648,7 @@ static void *__slab_alloc(struct kmem_ca
{
void *p;
unsigned long flags;
+ LIST_HEAD(tofree);
local_irq_save(flags);
#ifdef CONFIG_PREEMPT
@@ -2606,8 +2660,9 @@ static void *__slab_alloc(struct kmem_ca
c = this_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
local_irq_restore(flags);
+ free_delayed(&tofree);
return p;
}
@@ -3085,6 +3140,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
void **p)
{
struct kmem_cache_cpu *c;
+ LIST_HEAD(to_free);
int i;
/* memcg and kmem_cache debug support */
@@ -3108,7 +3164,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, &to_free);
if (unlikely(!p[i]))
goto error;
@@ -3120,6 +3176,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
}
c->tid = next_tid(c->tid);
local_irq_enable();
+ free_delayed(&to_free);
/* Clear memory outside IRQ disabled fastpath loop */
if (unlikely(flags & __GFP_ZERO)) {
@@ -3134,6 +3191,7 @@ int kmem_cache_alloc_bulk(struct kmem_ca
return i;
error:
local_irq_enable();
+ free_delayed(&to_free);
slab_post_alloc_hook(s, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
@@ -4180,6 +4238,12 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
+ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
+ }
if (debug_guardpage_minorder())
slub_max_order = 0;

View File

@@ -0,0 +1,232 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:37 -0500
Subject: mm: page_alloc: rt-friendly per-cpu pages
rt-friendly per-cpu pages: convert the irqs-off per-cpu locking
method into a preemptible, explicit-per-cpu-locks method.
Contains fixes from:
Peter Zijlstra <a.p.zijlstra@chello.nl>
Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
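The core of the conversion, as a hedged userspace analogue (pthread mutexes stand in for the local_irq_lock; none of these names are the kernel API): per-CPU data is guarded by an explicit per-CPU lock instead of by disabling interrupts, which also lets another CPU take that lock and operate on the data remotely.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct pcp_data {
	pthread_mutex_t lock;   /* stands in for the per-CPU pa_lock */
	long count;
};

static struct pcp_data pcp[NR_CPUS];

static void pcp_add(int cpu, long delta)
{
	pthread_mutex_lock(&pcp[cpu].lock);    /* was: local_irq_save(flags) */
	pcp[cpu].count += delta;
	pthread_mutex_unlock(&pcp[cpu].lock);  /* was: local_irq_restore(flags) */
}

/* A remote caller can drain another CPU's data by taking that CPU's lock. */
static long pcp_drain(int cpu)
{
	long val;

	pthread_mutex_lock(&pcp[cpu].lock);
	val = pcp[cpu].count;
	pcp[cpu].count = 0;
	pthread_mutex_unlock(&pcp[cpu].lock);
	return val;
}

int main(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		pthread_mutex_init(&pcp[i].lock, NULL);

	pcp_add(1, 5);
	printf("drained %ld from cpu1\n", pcp_drain(1));
	return 0;
}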
---
mm/page_alloc.c | 63 ++++++++++++++++++++++++++++++++++++++------------------
1 file changed, 43 insertions(+), 20 deletions(-)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -60,6 +60,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
+#include <linux/locallock.h>
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -291,6 +292,18 @@ EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+# define cpu_lock_irqsave(cpu, flags) \
+ local_lock_irqsave_on(pa_lock, flags, cpu)
+# define cpu_unlock_irqrestore(cpu, flags) \
+ local_unlock_irqrestore_on(pa_lock, flags, cpu)
+#else
+# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
+# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
+#endif
+
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1296,10 +1309,10 @@ static void __free_pages_ok(struct page
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2560,13 +2573,13 @@ void drain_zone_pages(struct zone *zone,
int to_drain, batch;
LIST_HEAD(dst);
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
isolate_pcp_pages(to_drain, pcp, &dst);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
if (to_drain > 0)
free_pcppages_bulk(zone, &dst, false);
@@ -2588,7 +2601,7 @@ static void drain_pages_zone(unsigned in
LIST_HEAD(dst);
int count;
- local_irq_save(flags);
+ cpu_lock_irqsave(cpu, flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
@@ -2596,7 +2609,7 @@ static void drain_pages_zone(unsigned in
if (count)
isolate_pcp_pages(count, pcp, &dst);
- local_irq_restore(flags);
+ cpu_unlock_irqrestore(cpu, flags);
if (count)
free_pcppages_bulk(zone, &dst, false);
@@ -2634,6 +2647,7 @@ void drain_local_pages(struct zone *zone
drain_pages(cpu);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
static void drain_local_pages_wq(struct work_struct *work)
{
/*
@@ -2647,6 +2661,7 @@ static void drain_local_pages_wq(struct
drain_local_pages(NULL);
preempt_enable();
}
+#endif
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
@@ -2713,7 +2728,14 @@ void drain_all_pages(struct zone *zone)
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
-
+#ifdef CONFIG_PREEMPT_RT_BASE
+ for_each_cpu(cpu, &cpus_with_pcps) {
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
+ }
+#else
for_each_cpu(cpu, &cpus_with_pcps) {
struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
INIT_WORK(work, drain_local_pages_wq);
@@ -2721,6 +2743,7 @@ void drain_all_pages(struct zone *zone)
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+#endif
mutex_unlock(&pcpu_drain_mutex);
}
@@ -2840,9 +2863,9 @@ void free_unref_page(struct page *page)
if (!free_unref_page_prepare(page, pfn))
return;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
free_unref_page_commit(page, pfn, &dst);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
if (!list_empty(&dst))
free_pcppages_bulk(zone, &dst, false);
}
@@ -2869,7 +2892,7 @@ void free_unref_page_list(struct list_he
set_page_private(page, pfn);
}
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
unsigned long pfn = page_private(page);
enum zone_type type;
@@ -2884,12 +2907,12 @@ void free_unref_page_list(struct list_he
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
batch_count = 0;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
}
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
for (i = 0; i < __MAX_NR_ZONES; ) {
struct page *page;
@@ -3038,7 +3061,7 @@ static struct page *rmqueue_pcplist(stru
struct page *page;
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, pcp, list);
@@ -3046,7 +3069,7 @@ static struct page *rmqueue_pcplist(stru
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return page;
}
@@ -3073,7 +3096,7 @@ struct page *rmqueue(struct zone *prefer
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
+ local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
do {
page = NULL;
@@ -3093,14 +3116,14 @@ struct page *rmqueue(struct zone *prefer
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
out:
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
return NULL;
}
@@ -8094,7 +8117,7 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
- local_irq_save(flags);
+ local_lock_irqsave(pa_lock, flags);
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
@@ -8103,7 +8126,7 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(pa_lock, flags);
}
#ifdef CONFIG_MEMORY_HOTREMOVE

View File

@@ -0,0 +1,199 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:51 -0500
Subject: mm/swap: Convert to percpu locked
Replace global locks (get_cpu + local_irq_save) with "local_locks()".
Currently there is one for "rotate" and one for "swap".
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/swap.h | 2 ++
mm/compaction.c | 6 ++++--
mm/page_alloc.c | 3 ++-
mm/swap.c | 38 ++++++++++++++++++++++----------------
4 files changed, 30 insertions(+), 19 deletions(-)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
+#include <linux/locallock.h>
#include <asm/page.h>
struct notifier_block;
@@ -331,6 +332,7 @@ extern unsigned long nr_free_pagecache_p
/* linux/mm/swap.c */
+DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
extern void lru_cache_add(struct page *);
extern void lru_cache_add_anon(struct page *page);
extern void lru_cache_add_file(struct page *page);
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1657,10 +1657,12 @@ static enum compact_result compact_zone(
block_start_pfn(cc->migrate_pfn, cc->order);
if (cc->last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
+ cpu = get_cpu_light();
+ local_lock_irq(swapvec_lock);
lru_add_drain_cpu(cpu);
+ local_unlock_irq(swapvec_lock);
drain_local_pages(zone);
- put_cpu();
+ put_cpu_light();
/* No more flushing until we migrate again */
cc->last_migrated_pfn = 0;
}
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7205,8 +7205,9 @@ void __init free_area_init(unsigned long
static int page_alloc_cpu_dead(unsigned int cpu)
{
-
+ local_lock_irq_on(swapvec_lock, cpu);
lru_add_drain_cpu(cpu);
+ local_unlock_irq_on(swapvec_lock, cpu);
drain_pages(cpu);
/*
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -33,6 +33,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/locallock.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
@@ -51,6 +52,8 @@ static DEFINE_PER_CPU(struct pagevec, lr
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
/*
* This path almost never happens for VM activity - pages are normally
@@ -253,11 +256,11 @@ void rotate_reclaimable_page(struct page
unsigned long flags;
get_page(page);
- local_irq_save(flags);
+ local_lock_irqsave(rotate_lock, flags);
pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
}
@@ -307,12 +310,13 @@ void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ activate_page_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_cpu_var(activate_page_pvecs);
+ put_locked_var(swapvec_lock, activate_page_pvecs);
}
}
@@ -339,7 +343,7 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
int i;
/*
@@ -361,7 +365,7 @@ static void __lru_cache_activate_page(st
}
}
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}
/*
@@ -403,12 +407,12 @@ EXPORT_SYMBOL(mark_page_accessed);
static void __lru_cache_add(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
+ put_locked_var(swapvec_lock, lru_add_pvec);
}
/**
@@ -586,9 +590,9 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_irq_save(flags);
+ local_lock_irqsave(rotate_lock, flags);
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(rotate_lock, flags);
}
pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -620,11 +624,12 @@ void deactivate_file_page(struct page *p
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_deactivate_file_pvecs);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- put_cpu_var(lru_deactivate_file_pvecs);
+ put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
}
}
@@ -639,19 +644,20 @@ void mark_page_lazyfree(struct page *pag
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+ struct pagevec *pvec = &get_locked_var(swapvec_lock,
+ lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- put_cpu_var(lru_lazyfree_pvecs);
+ put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(get_cpu());
- put_cpu();
+ lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
+ local_unlock_cpu(swapvec_lock);
}
static void lru_add_drain_per_cpu(struct work_struct *dummy)

View File

@@ -0,0 +1,102 @@
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 27 May 2016 15:03:28 +0200
Subject: [PATCH] mm: perform lru_add_drain_all() remotely
lru_add_drain_all() works by scheduling lru_add_drain_cpu() to run
on all CPUs that have non-empty LRU pagevecs and then waiting for
the scheduled work to complete. However, workqueue threads may never
have the chance to run on a CPU that's running a SCHED_FIFO task.
This causes lru_add_drain_all() to block forever.
This commit solves this problem by changing lru_add_drain_all()
to drain the LRU pagevecs of remote CPUs. This is done by grabbing
swapvec_lock and calling lru_add_drain_cpu().
PS: This is based on an idea and initial implementation by
Rik van Riel.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
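A userspace sketch of the remote-drain idea, again with illustrative names only: instead of queueing a work item on every CPU with pending pages and waiting for it, the caller takes each CPU's lock itself and flushes that CPU's pagevec directly.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

struct cpu_pvec {
	pthread_mutex_t lock;   /* stands in for swapvec_lock */
	int nr;                 /* pages sitting in this CPU's pagevec */
};

static struct cpu_pvec pvecs[NR_CPUS];

static void drain_cpu(int cpu)
{
	printf("cpu%d: flushed %d pages\n", cpu, pvecs[cpu].nr);
	pvecs[cpu].nr = 0;
}

/* RT variant: the caller drains remote CPUs itself, no workqueue needed. */
static void drain_all(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!pvecs[cpu].nr)
			continue;
		pthread_mutex_lock(&pvecs[cpu].lock);
		drain_cpu(cpu);
		pthread_mutex_unlock(&pvecs[cpu].lock);
	}
}

int main(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		pthread_mutex_init(&pvecs[i].lock, NULL);

	pvecs[2].nr = 7;
	drain_all();
	return 0;
}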
---
mm/swap.c | 36 ++++++++++++++++++++++++++++++------
1 file changed, 30 insertions(+), 6 deletions(-)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -590,9 +590,15 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
+#ifdef CONFIG_PREEMPT_RT_BASE
+ local_lock_irqsave_on(rotate_lock, flags, cpu);
+ pagevec_move_tail(pvec);
+ local_unlock_irqrestore_on(rotate_lock, flags, cpu);
+#else
local_lock_irqsave(rotate_lock, flags);
pagevec_move_tail(pvec);
local_unlock_irqrestore(rotate_lock, flags);
+#endif
}
pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -660,6 +666,16 @@ void lru_add_drain(void)
local_unlock_cpu(swapvec_lock);
}
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ local_lock_on(swapvec_lock, cpu);
+ lru_add_drain_cpu(cpu);
+ local_unlock_on(swapvec_lock, cpu);
+}
+
+#else
+
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
@@ -667,6 +683,16 @@ static void lru_add_drain_per_cpu(struct
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+ struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+ INIT_WORK(work, lru_add_drain_per_cpu);
+ queue_work_on(cpu, mm_percpu_wq, work);
+ cpumask_set_cpu(cpu, has_work);
+}
+#endif
+
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -691,21 +717,19 @@ void lru_add_drain_all(void)
cpumask_clear(&has_work);
for_each_online_cpu(cpu) {
- struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
- need_activate_page_drain(cpu)) {
- INIT_WORK(work, lru_add_drain_per_cpu);
- queue_work_on(cpu, mm_percpu_wq, work);
- cpumask_set_cpu(cpu, &has_work);
- }
+ need_activate_page_drain(cpu))
+ remote_lru_add_drain(cpu, &has_work);
}
+#ifndef CONFIG_PREEMPT_RT_BASE
for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
+#endif
mutex_unlock(&lock);
}

View File

@@ -0,0 +1,136 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:30:13 -0500
Subject: mm/vmstat: Protect per cpu variables with preempt disable on RT
Disable preemption on -RT for the vmstat code. On vanilla kernels the code runs
in IRQ-off regions, while on -RT it does not. "preempt_disable" ensures that the
same resource is not updated in parallel due to preemption.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
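The race being closed can be shown with a small userspace program (the mutex stands in for the preempt_disable_rt()/preempt_enable_rt() pair; names are illustrative): the counter update is a read-modify-write, and without the protected region two contexts racing on it lose updates.

#include <pthread.h>
#include <stdio.h>

static long vm_stat;                      /* stands in for a vmstat counter */
static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;

static void mod_state(long delta)
{
	pthread_mutex_lock(&stat_lock);   /* was: preempt_disable_rt() */
	long x = vm_stat;                 /* read   */
	x += delta;                       /* modify */
	vm_stat = x;                      /* write  */
	pthread_mutex_unlock(&stat_lock); /* was: preempt_enable_rt() */
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++)
		mod_state(1);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, NULL);
	pthread_create(&b, NULL, worker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("vm_stat = %ld (expected 200000)\n", vm_stat);
	return 0;
}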
---
include/linux/vmstat.h | 4 ++++
mm/vmstat.c | 12 ++++++++++++
2 files changed, 16 insertions(+)
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -54,7 +54,9 @@ DECLARE_PER_CPU(struct vm_event_state, v
*/
static inline void __count_vm_event(enum vm_event_item item)
{
+ preempt_disable_rt();
raw_cpu_inc(vm_event_states.event[item]);
+ preempt_enable_rt();
}
static inline void count_vm_event(enum vm_event_item item)
@@ -64,7 +66,9 @@ static inline void count_vm_event(enum v
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
+ preempt_disable_rt();
raw_cpu_add(vm_event_states.event[item], delta);
+ preempt_enable_rt();
}
static inline void count_vm_events(enum vm_event_item item, long delta)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -320,6 +320,7 @@ void __mod_zone_page_state(struct zone *
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -329,6 +330,7 @@ void __mod_zone_page_state(struct zone *
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -340,6 +342,7 @@ void __mod_node_page_state(struct pglist
long x;
long t;
+ preempt_disable_rt();
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -349,6 +352,7 @@ void __mod_node_page_state(struct pglist
x = 0;
}
__this_cpu_write(*p, x);
+ preempt_enable_rt();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -381,6 +385,7 @@ void __inc_zone_state(struct zone *zone,
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -389,6 +394,7 @@ void __inc_zone_state(struct zone *zone,
zone_page_state_add(v + overstep, zone, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -397,6 +403,7 @@ void __inc_node_state(struct pglist_data
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -405,6 +412,7 @@ void __inc_node_state(struct pglist_data
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
}
+ preempt_enable_rt();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -425,6 +433,7 @@ void __dec_zone_state(struct zone *zone,
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -433,6 +442,7 @@ void __dec_zone_state(struct zone *zone,
zone_page_state_add(v - overstep, zone, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -441,6 +451,7 @@ void __dec_node_state(struct pglist_data
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ preempt_disable_rt();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -449,6 +460,7 @@ void __dec_node_state(struct pglist_data
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
}
+ preempt_enable_rt();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)

View File

@@ -0,0 +1,68 @@
Subject: ARM: Initialize split page table locks for vector page
From: Frank Rowand <frank.rowand@am.sony.com>
Date: Sat, 1 Oct 2011 18:58:13 -0700
Without this patch, ARM can not use SPLIT_PTLOCK_CPUS if
PREEMPT_RT_FULL=y because vectors_user_mapping() creates a
VM_ALWAYSDUMP mapping of the vector page (address 0xffff0000), but no
ptl->lock has been allocated for the page. An attempt to coredump
that page will result in a kernel NULL pointer dereference when
follow_page() attempts to lock the page.
The call tree to the NULL pointer dereference is:
do_notify_resume()
get_signal_to_deliver()
do_coredump()
elf_core_dump()
get_dump_page()
__get_user_pages()
follow_page()
pte_offset_map_lock() <----- a #define
...
rt_spin_lock()
The underlying problem is exposed by mm-shrink-the-page-frame-to-rt-size.patch.
Signed-off-by: Frank Rowand <frank.rowand@am.sony.com>
Cc: Frank <Frank_Rowand@sonyusa.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/4E87C535.2030907@am.sony.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/arm/kernel/process.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -324,6 +324,30 @@ unsigned long arch_randomize_brk(struct
}
#ifdef CONFIG_MMU
+/*
+ * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
+ * initialized by pgtable_page_ctor() then a coredump of the vector page will
+ * fail.
+ */
+static int __init vectors_user_mapping_init_page(void)
+{
+ struct page *page;
+ unsigned long addr = 0xffff0000;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_k(addr);
+ pud = pud_offset(pgd, addr);
+ pmd = pmd_offset(pud, addr);
+ page = pmd_page(*(pmd));
+
+ pgtable_page_ctor(page);
+
+ return 0;
+}
+late_initcall(vectors_user_mapping_init_page);
+
#ifdef CONFIG_KUSER_HELPERS
/*
* The vectors page is always readable from user space for the

View File

@@ -0,0 +1,35 @@
Subject: mm: Enable SLUB for RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 25 Oct 2012 10:32:35 +0100
Avoid the memory allocation in the IRQ-off section.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: factor out everything except the kcalloc() workaround]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/slub.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3677,6 +3677,11 @@ static void list_slab_objects(struct kme
const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_PREEMPT_RT_BASE
+ /* XXX move out of irq-off section */
+ slab_err(s, page, text, s->name);
+#else
+
void *addr = page_address(page);
void *p;
unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
@@ -3698,6 +3703,7 @@ static void list_slab_objects(struct kme
slab_unlock(page);
kfree(map);
#endif
+#endif
}
/*

View File

@@ -0,0 +1,41 @@
Subject: slub: Enable irqs for __GFP_WAIT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 09 Jan 2013 12:08:15 +0100
SYSTEM_RUNNING might be too late for enabling interrupts. Allocations
with GFP_WAIT can happen before that, so check against SYSTEM_BOOTING instead.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
mm/slub.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1570,10 +1570,17 @@ static struct page *allocate_slab(struct
void *start, *p;
int idx, order;
bool shuffle;
+ bool enableirqs = false;
flags &= gfp_allowed_mask;
if (gfpflags_allow_blocking(flags))
+ enableirqs = true;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (system_state > SYSTEM_BOOTING)
+ enableirqs = true;
+#endif
+ if (enableirqs)
local_irq_enable();
flags |= s->allocflags;
@@ -1632,7 +1639,7 @@ static struct page *allocate_slab(struct
page->frozen = 1;
out:
- if (gfpflags_allow_blocking(flags))
+ if (enableirqs)
local_irq_disable();
if (!page)
return NULL;

View File

@@ -0,0 +1,47 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 15 Apr 2015 19:00:47 +0200
Subject: slub: Disable SLUB_CPU_PARTIAL
|BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:915
|in_atomic(): 1, irqs_disabled(): 0, pid: 87, name: rcuop/7
|1 lock held by rcuop/7/87:
| #0: (rcu_callback){......}, at: [<ffffffff8112c76a>] rcu_nocb_kthread+0x1ca/0x5d0
|Preemption disabled at:[<ffffffff811eebd9>] put_cpu_partial+0x29/0x220
|
|CPU: 0 PID: 87 Comm: rcuop/7 Tainted: G W 4.0.0-rt0+ #477
|Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
| 000000000007a9fc ffff88013987baf8 ffffffff817441c7 0000000000000007
| 0000000000000000 ffff88013987bb18 ffffffff810eee51 0000000000000000
| ffff88013fc10200 ffff88013987bb48 ffffffff8174a1c4 000000000007a9fc
|Call Trace:
| [<ffffffff817441c7>] dump_stack+0x4f/0x90
| [<ffffffff810eee51>] ___might_sleep+0x121/0x1b0
| [<ffffffff8174a1c4>] rt_spin_lock+0x24/0x60
| [<ffffffff811a689a>] __free_pages_ok+0xaa/0x540
| [<ffffffff811a729d>] __free_pages+0x1d/0x30
| [<ffffffff811eddd5>] __free_slab+0xc5/0x1e0
| [<ffffffff811edf46>] free_delayed+0x56/0x70
| [<ffffffff811eecfd>] put_cpu_partial+0x14d/0x220
| [<ffffffff811efc98>] __slab_free+0x158/0x2c0
| [<ffffffff811f0021>] kmem_cache_free+0x221/0x2d0
| [<ffffffff81204d0c>] file_free_rcu+0x2c/0x40
| [<ffffffff8112c7e3>] rcu_nocb_kthread+0x243/0x5d0
| [<ffffffff810e951c>] kthread+0xfc/0x120
| [<ffffffff8174abc8>] ret_from_fork+0x58/0x90
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
init/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1698,7 +1698,7 @@ config SLAB_FREELIST_HARDENED
config SLUB_CPU_PARTIAL
default y
- depends on SLUB && SMP
+ depends on SLUB && SMP && !PREEMPT_RT_FULL
bool "SLUB per cpu partial cache"
help
Per cpu partial caches accellerate objects allocation and freeing

View File

@@ -0,0 +1,68 @@
From: Yang Shi <yang.shi@windriver.com>
Subject: mm/memcontrol: Don't call schedule_work_on in preemption disabled context
Date: Wed, 30 Oct 2013 11:48:33 -0700
The following trace is triggered when running ltp oom test cases:
BUG: sleeping function called from invalid context at kernel/rtmutex.c:659
in_atomic(): 1, irqs_disabled(): 0, pid: 17188, name: oom03
Preemption disabled at:[<ffffffff8112ba70>] mem_cgroup_reclaim+0x90/0xe0
CPU: 2 PID: 17188 Comm: oom03 Not tainted 3.10.10-rt3 #2
Hardware name: Intel Corporation Calpella platform/MATXM-CORE-411-B, BIOS 4.6.3 08/18/2010
ffff88007684d730 ffff880070df9b58 ffffffff8169918d ffff880070df9b70
ffffffff8106db31 ffff88007688b4a0 ffff880070df9b88 ffffffff8169d9c0
ffff88007688b4a0 ffff880070df9bc8 ffffffff81059da1 0000000170df9bb0
Call Trace:
[<ffffffff8169918d>] dump_stack+0x19/0x1b
[<ffffffff8106db31>] __might_sleep+0xf1/0x170
[<ffffffff8169d9c0>] rt_spin_lock+0x20/0x50
[<ffffffff81059da1>] queue_work_on+0x61/0x100
[<ffffffff8112b361>] drain_all_stock+0xe1/0x1c0
[<ffffffff8112ba70>] mem_cgroup_reclaim+0x90/0xe0
[<ffffffff8112beda>] __mem_cgroup_try_charge+0x41a/0xc40
[<ffffffff810f1c91>] ? release_pages+0x1b1/0x1f0
[<ffffffff8106f200>] ? sched_exec+0x40/0xb0
[<ffffffff8112cc87>] mem_cgroup_charge_common+0x37/0x70
[<ffffffff8112e2c6>] mem_cgroup_newpage_charge+0x26/0x30
[<ffffffff8110af68>] handle_pte_fault+0x618/0x840
[<ffffffff8103ecf6>] ? unpin_current_cpu+0x16/0x70
[<ffffffff81070f94>] ? migrate_enable+0xd4/0x200
[<ffffffff8110cde5>] handle_mm_fault+0x145/0x1e0
[<ffffffff810301e1>] __do_page_fault+0x1a1/0x4c0
[<ffffffff8169c9eb>] ? preempt_schedule_irq+0x4b/0x70
[<ffffffff8169e3b7>] ? retint_kernel+0x37/0x40
[<ffffffff8103053e>] do_page_fault+0xe/0x10
[<ffffffff8169e4c2>] page_fault+0x22/0x30
So, to prevent schedule_work_on() from being called in a preempt-disabled
context, replace the get/put_cpu() pair with get/put_cpu_light().
Signed-off-by: Yang Shi <yang.shi@windriver.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/memcontrol.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2052,7 +2052,7 @@ static void drain_all_stock(struct mem_c
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ curcpu = get_cpu_light();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2072,7 +2072,7 @@ static void drain_all_stock(struct mem_c
}
css_put(&memcg->css);
}
- put_cpu();
+ put_cpu_light();
mutex_unlock(&percpu_charge_mutex);
}

View File

@@ -0,0 +1,116 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Subject: mm/memcontrol: Replace local_irq_disable with local locks
Date: Wed, 28 Jan 2015 17:14:16 +0100
There are a few local_irq_disable() sections which then take sleeping locks.
This patch converts them to local locks.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/memcontrol.c | 24 ++++++++++++++++--------
1 file changed, 16 insertions(+), 8 deletions(-)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -69,6 +69,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include <linux/locallock.h>
#include <linux/uaccess.h>
@@ -94,6 +95,8 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+static DEFINE_LOCAL_IRQ_LOCK(event_lock);
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -4859,12 +4862,12 @@ static int mem_cgroup_move_account(struc
ret = 0;
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
out_unlock:
unlock_page(page);
out:
@@ -5983,10 +5986,10 @@ void mem_cgroup_commit_charge(struct pag
commit_charge(page, memcg, lrucare);
- local_irq_disable();
+ local_lock_irq(event_lock);
mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
- local_irq_enable();
+ local_unlock_irq(event_lock);
if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -6055,7 +6058,7 @@ static void uncharge_batch(const struct
memcg_oom_recover(ug->memcg);
}
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
@@ -6063,7 +6066,7 @@ static void uncharge_batch(const struct
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
if (!mem_cgroup_is_root(ug->memcg))
css_put_many(&ug->memcg->css, nr_pages);
@@ -6226,10 +6229,10 @@ void mem_cgroup_migrate(struct page *old
commit_charge(newpage, memcg, false);
- local_irq_save(flags);
+ local_lock_irqsave(event_lock, flags);
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
memcg_check_events(memcg, newpage);
- local_irq_restore(flags);
+ local_unlock_irqrestore(event_lock, flags);
}
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -6421,6 +6424,7 @@ void mem_cgroup_swapout(struct page *pag
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
+ unsigned long flags;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
@@ -6466,13 +6470,17 @@ void mem_cgroup_swapout(struct page *pag
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
+ local_lock_irqsave(event_lock, flags);
+#ifndef CONFIG_PREEMPT_RT_BASE
VM_BUG_ON(!irqs_disabled());
+#endif
mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
-nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
css_put_many(&memcg->css, nr_entries);
+ local_unlock_irqrestore(event_lock, flags);
}
/**
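The local-lock pattern above is the standard RT substitution for local_irq_disable(): on !RT builds local_lock_irq() collapses to plain interrupt disabling, while on RT it takes a per-CPU sleeping lock so the section stays preemptible. A hedged usage sketch (the lock name and the counter update are illustrative, not taken from memcontrol.c):

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(example_event_lock);

static void stats_update(void)
{
	unsigned long flags;

	/* !RT: local_irq_save(); RT: per-CPU rtmutex-backed lock, preemptible */
	local_lock_irqsave(example_event_lock, flags);
	/* ... update per-CPU event counters here ... */
	local_unlock_irqrestore(example_event_lock, flags);
}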

View File

@@ -0,0 +1,196 @@
From: Mike Galbraith <umgwanakikbuti@gmail.com>
Date: Tue, 22 Mar 2016 11:16:09 +0100
Subject: [PATCH] mm/zsmalloc: copy with get_cpu_var() and locking
get_cpu_var() disables preemption and triggers a might_sleep() splat later.
This is replaced with get_locked_var().
The bit spinlocks are replaced with a proper mutex, which requires a slightly
larger struct to allocate.
Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
[bigeasy: replace the bitspin_lock() with a mutex, get_locked_var(). Mike then
fixed the size magic]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/zsmalloc.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 74 insertions(+), 6 deletions(-)
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -55,6 +55,7 @@
#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/locallock.h>
#define ZSPAGE_MAGIC 0x58
@@ -72,9 +73,22 @@
*/
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+struct zsmalloc_handle {
+ unsigned long addr;
+ struct mutex lock;
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -320,7 +334,7 @@ static void SetZsPageMovable(struct zs_p
static int create_cache(struct zs_pool *pool)
{
- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
0, 0, NULL);
if (!pool->handle_cachep)
return 1;
@@ -344,10 +358,27 @@ static void destroy_cache(struct zs_pool
static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ void *p;
+
+ p = kmem_cache_alloc(pool->handle_cachep,
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (p) {
+ struct zsmalloc_handle *zh = p;
+
+ mutex_init(&zh->lock);
+ }
+#endif
+ return (unsigned long)p;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
+}
+#endif
+
static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
kmem_cache_free(pool->handle_cachep, (void *)handle);
@@ -366,12 +397,18 @@ static void cache_free_zspage(struct zs_
static void record_obj(unsigned long handle, unsigned long obj)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ WRITE_ONCE(zh->addr, obj);
+#else
/*
* lsb of @obj represents handle lock while other bits
* represent object value the handle is pointing so
* updating shouldn't do store tearing.
*/
WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif
}
/* zpool driver */
@@ -453,6 +490,7 @@ MODULE_ALIAS("zpool-zsmalloc");
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
static bool is_zspage_isolated(struct zspage *zspage)
{
@@ -882,7 +920,13 @@ static unsigned long location_to_obj(str
static unsigned long handle_to_obj(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return zh->addr;
+#else
return *(unsigned long *)handle;
+#endif
}
static unsigned long obj_to_head(struct page *page, void *obj)
@@ -896,22 +940,46 @@ static unsigned long obj_to_head(struct
static inline int testpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_is_locked(&zh->lock);
+#else
return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static inline int trypin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_trylock(&zh->lock);
+#else
return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void pin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_lock(&zh->lock);
+#else
bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void unpin_tag(unsigned long handle)
{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+ return mutex_unlock(&zh->lock);
+#else
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
}
static void reset_page(struct page *page)
@@ -1337,7 +1405,7 @@ void *zs_map_object(struct zs_pool *pool
class = pool->size_class[class_idx];
off = (class->size * obj_idx) & ~PAGE_MASK;
- area = &get_cpu_var(zs_map_area);
+ area = &get_locked_var(zs_map_area_lock, zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1391,7 +1459,7 @@ void zs_unmap_object(struct zs_pool *poo
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ put_locked_var(zs_map_area_lock, zs_map_area);
migrate_read_unlock(zspage);
unpin_tag(handle);
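get_locked_var()/put_locked_var() used above combine the local lock with the per-CPU access that get_cpu_var()/put_cpu_var() provided. A rough sketch of what they expand to in the RT locallock API (from memory, simplified):

/* Illustrative expansion only; see include/linux/locallock.h in the RT tree. */
#define get_locked_var(lvar, var)		\
	(*({					\
		local_lock(lvar);		\
		this_cpu_ptr(&(var));		\
	}))

#define put_locked_var(lvar, var)	local_unlock(lvar)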

View File

@@ -0,0 +1,54 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 11 Dec 2018 21:53:43 +0100
Subject: [PATCH] x86/mm/pat: disable preemption __split_large_page() after
spin_lock()
Commit "x86/mm/pat: Disable preemption around __flush_tlb_all()" added a
warning if __flush_tlb_all() is invoked in preemptible context. On !RT
the warning does not trigger because a spin lock is acquired which
disables preemption. On RT the spin lock does not disable preemption and
so the warning is seen.
Disable preemption to avoid the warning in __flush_tlb_all().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/mm/pageattr.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -688,11 +688,17 @@ static int
spin_lock(&pgd_lock);
/*
+ * Keep preemption disabled after __flush_tlb_all() which expects not be
+ * preempted during the flush of the local TLB.
+ */
+ preempt_disable();
+ /*
* Check for races, another CPU might have split this page
* up for us already:
*/
tmp = _lookup_address_cpa(cpa, address, &level);
if (tmp != kpte) {
+ preempt_enable();
spin_unlock(&pgd_lock);
return 1;
}
@@ -726,6 +732,7 @@ static int
break;
default:
+ preempt_enable();
spin_unlock(&pgd_lock);
return 1;
}
@@ -764,6 +771,7 @@ static int
* going on.
*/
__flush_tlb_all();
+ preempt_enable();
spin_unlock(&pgd_lock);
return 0;
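The underlying issue is that on PREEMPT_RT_FULL a spin_lock() is backed by an rt_mutex and leaves preemption enabled, while raw_spinlock_t keeps the mainline semantics. A minimal illustration of that distinction (not taken from the patch; the lock names are hypothetical):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(hw_lock);	/* always a non-preemptible critical section */
static DEFINE_SPINLOCK(data_lock);	/* sleeping lock on RT, preemption stays on */

static void lock_semantics_example(void)
{
	raw_spin_lock(&hw_lock);	/* preemption disabled, also on RT */
	raw_spin_unlock(&hw_lock);

	spin_lock(&data_lock);		/* RT: may sleep; if the section really needs
					 * preemption off, it must say so explicitly,
					 * as __split_large_page() now does above */
	spin_unlock(&data_lock);
}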

View File

@@ -0,0 +1,165 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 25 Jan 2017 16:34:27 +0100
Subject: [PATCH] radix-tree: use local locks
The preload functionality uses per-CPU variables and preempt-disable to
ensure that it does not switch CPUs during its usage. This patch adds
local_locks() instead of preempt_disable() for the same purpose and to remain
remain preemptible on -RT.
Cc: stable-rt@vger.kernel.org
Reported-and-debugged-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/idr.h | 5 +----
include/linux/radix-tree.h | 7 ++-----
lib/radix-tree.c | 32 +++++++++++++++++++++++---------
3 files changed, 26 insertions(+), 18 deletions(-)
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -169,10 +169,7 @@ static inline bool idr_is_empty(const st
* Each idr_preload() should be matched with an invocation of this
* function. See idr_preload() for details.
*/
-static inline void idr_preload_end(void)
-{
- preempt_enable();
-}
+void idr_preload_end(void);
/**
* idr_for_each_entry() - Iterate over an IDR's elements of a given type.
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -330,6 +330,8 @@ unsigned int radix_tree_gang_lookup_slot
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
+void radix_tree_preload_end(void);
+
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
unsigned long index, unsigned int tag);
@@ -349,11 +351,6 @@ unsigned int radix_tree_gang_lookup_tag_
unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);
-static inline void radix_tree_preload_end(void)
-{
- preempt_enable();
-}
-
int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
int radix_tree_split(struct radix_tree_root *, unsigned long index,
unsigned new_order);
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,7 +38,7 @@
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
-
+#include <linux/locallock.h>
/* Number of nodes in fully populated tree of given height */
static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
@@ -87,6 +87,7 @@ struct radix_tree_preload {
struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
+static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
@@ -405,12 +406,13 @@ radix_tree_node_alloc(gfp_t gfp_mask, st
* succeed in getting a node here (and never reach
* kmem_cache_alloc)
*/
- rtp = this_cpu_ptr(&radix_tree_preloads);
+ rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
if (rtp->nr) {
ret = rtp->nodes;
rtp->nodes = ret->parent;
rtp->nr--;
}
+ put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
/*
* Update the allocation stack trace as this is more useful
* for debugging.
@@ -476,14 +478,14 @@ static __must_check int __radix_tree_pre
*/
gfp_mask &= ~__GFP_ACCOUNT;
- preempt_disable();
+ local_lock(radix_tree_preloads_lock);
rtp = this_cpu_ptr(&radix_tree_preloads);
while (rtp->nr < nr) {
- preempt_enable();
+ local_unlock(radix_tree_preloads_lock);
node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
if (node == NULL)
goto out;
- preempt_disable();
+ local_lock(radix_tree_preloads_lock);
rtp = this_cpu_ptr(&radix_tree_preloads);
if (rtp->nr < nr) {
node->parent = rtp->nodes;
@@ -525,7 +527,7 @@ int radix_tree_maybe_preload(gfp_t gfp_m
if (gfpflags_allow_blocking(gfp_mask))
return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
/* Preloading doesn't help anything with this gfp mask, skip it */
- preempt_disable();
+ local_lock(radix_tree_preloads_lock);
return 0;
}
EXPORT_SYMBOL(radix_tree_maybe_preload);
@@ -563,7 +565,7 @@ int radix_tree_maybe_preload_order(gfp_t
/* Preloading doesn't help anything with this gfp mask, skip it */
if (!gfpflags_allow_blocking(gfp_mask)) {
- preempt_disable();
+ local_lock(radix_tree_preloads_lock);
return 0;
}
@@ -597,6 +599,12 @@ int radix_tree_maybe_preload_order(gfp_t
return __radix_tree_preload(gfp_mask, nr_nodes);
}
+void radix_tree_preload_end(void)
+{
+ local_unlock(radix_tree_preloads_lock);
+}
+EXPORT_SYMBOL(radix_tree_preload_end);
+
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
struct radix_tree_node **nodep, unsigned long *maxindex)
{
@@ -2102,10 +2110,16 @@ EXPORT_SYMBOL(radix_tree_tagged);
void idr_preload(gfp_t gfp_mask)
{
if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
- preempt_disable();
+ local_lock(radix_tree_preloads_lock);
}
EXPORT_SYMBOL(idr_preload);
+void idr_preload_end(void)
+{
+ local_unlock(radix_tree_preloads_lock);
+}
+EXPORT_SYMBOL(idr_preload_end);
+
int ida_pre_get(struct ida *ida, gfp_t gfp)
{
/*
@@ -2114,7 +2128,7 @@ int ida_pre_get(struct ida *ida, gfp_t g
* to return to the ida_pre_get() step.
*/
if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE))
- preempt_enable();
+ local_unlock(radix_tree_preloads_lock);
if (!this_cpu_read(ida_bitmap)) {
struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp);
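For context, the preload API the patch adjusts is normally used as below; on RT the local lock now covers the window between preload and preload_end instead of a bare preempt_disable(). Hedged sketch only — my_tree_lock and the insert target are illustrative:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_tree_lock);		/* hypothetical tree lock */

static int add_item(struct radix_tree_root *root, unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* takes radix_tree_preloads_lock on RT */
	if (err)
		return err;

	spin_lock(&my_tree_lock);
	err = radix_tree_insert(root, index, item);
	spin_unlock(&my_tree_lock);

	radix_tree_preload_end();		/* releases the local lock */
	return err;
}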

View File

@@ -0,0 +1,166 @@
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Jul 2009 08:29:34 -0500
Subject: timers: Prepare for full preemption
When softirqs can be preempted we need to make sure that cancelling
the timer from the active thread can not deadlock vs. a running timer
callback. Add a waitqueue to resolve that.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/timer.h | 2 +-
kernel/sched/core.c | 9 +++++++--
kernel/time/timer.c | 45 +++++++++++++++++++++++++++++++++++++++++----
3 files changed, 49 insertions(+), 7 deletions(-)
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -172,7 +172,7 @@ extern void add_timer(struct timer_list
extern int try_to_del_timer_sync(struct timer_list *timer);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
extern int del_timer_sync(struct timer_list *timer);
#else
# define del_timer_sync(t) del_timer(t)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -496,11 +496,14 @@ void resched_cpu(int cpu)
*/
int get_nohz_timer_target(void)
{
- int i, cpu = smp_processor_id();
+ int i, cpu;
struct sched_domain *sd;
+ preempt_disable_rt();
+ cpu = smp_processor_id();
+
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
- return cpu;
+ goto preempt_en_rt;
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -519,6 +522,8 @@ int get_nohz_timer_target(void)
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
+preempt_en_rt:
+ preempt_enable_rt();
return cpu;
}
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/swait.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -197,6 +198,9 @@ EXPORT_SYMBOL(jiffies_64);
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ struct swait_queue_head wait_for_running_timer;
+#endif
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
@@ -1178,6 +1182,33 @@ void add_timer_on(struct timer_list *tim
}
EXPORT_SYMBOL_GPL(add_timer_on);
+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Wait for a running timer
+ */
+static void wait_for_running_timer(struct timer_list *timer)
+{
+ struct timer_base *base;
+ u32 tf = timer->flags;
+
+ if (tf & TIMER_MIGRATING)
+ return;
+
+ base = get_timer_base(tf);
+ swait_event_exclusive(base->wait_for_running_timer,
+ base->running_timer != timer);
+}
+
+# define wakeup_timer_waiters(b) swake_up_all(&(b)->wait_for_running_timer)
+#else
+static inline void wait_for_running_timer(struct timer_list *timer)
+{
+ cpu_relax();
+}
+
+# define wakeup_timer_waiters(b) do { } while (0)
+#endif
+
/**
* del_timer - deactivate a timer.
* @timer: the timer to be deactivated
@@ -1233,7 +1264,7 @@ int try_to_del_timer_sync(struct timer_l
}
EXPORT_SYMBOL(try_to_del_timer_sync);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1293,7 +1324,7 @@ int del_timer_sync(struct timer_list *ti
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
- cpu_relax();
+ wait_for_running_timer(timer);
}
}
EXPORT_SYMBOL(del_timer_sync);
@@ -1354,13 +1385,16 @@ static void expire_timers(struct timer_b
fn = timer->function;
- if (timer->flags & TIMER_IRQSAFE) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
+ timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn);
+ base->running_timer = NULL;
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn);
+ base->running_timer = NULL;
raw_spin_lock_irq(&base->lock);
}
}
@@ -1681,8 +1715,8 @@ static inline void __run_timers(struct t
while (levels--)
expire_timers(base, heads + levels);
}
- base->running_timer = NULL;
raw_spin_unlock_irq(&base->lock);
+ wakeup_timer_waiters(base);
}
/*
@@ -1927,6 +1961,9 @@ static void __init init_timer_cpu(int cp
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
+#ifdef CONFIG_PREEMPT_RT_FULL
+ init_swait_queue_head(&base->wait_for_running_timer);
+#endif
}
}
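The user-visible contract of del_timer_sync() does not change: callers still block until a running callback has finished. What changes on RT is that the wait sleeps on the swait queue instead of spinning in cpu_relax(), avoiding a priority-inversion livelock when a high-priority task cancels a timer whose callback runs in the preempted softirq thread. A typical caller, for illustration (my_dev and poll_timer are hypothetical):

#include <linux/timer.h>

struct my_dev {
	struct timer_list poll_timer;
	/* ... */
};

static void my_dev_shutdown(struct my_dev *d)
{
	/*
	 * Blocks until a concurrently running poll_timer callback is done.
	 * On RT this now sleeps on base->wait_for_running_timer rather than
	 * busy-waiting, so a SCHED_FIFO caller cannot starve the callback.
	 */
	del_timer_sync(&d->poll_timer);
}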

View File

@@ -0,0 +1,30 @@
Subject: x86: kvm Require const tsc for RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 06 Nov 2011 12:26:18 +0100
Non-constant TSC is a nightmare on bare metal already, but with
virtualization it becomes a complete disaster because the workarounds
are horrible latency-wise. That's also a prerequisite for running RT in
a guest on top of an RT host.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/kvm/x86.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6698,6 +6698,13 @@ int kvm_arch_init(void *opaque)
goto out;
}
+#ifdef CONFIG_PREEMPT_RT_FULL
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
+ return -EOPNOTSUPP;
+ }
+#endif
+
r = kvm_mmu_module_init();
if (r)
goto out_free_percpu;

View File

@@ -0,0 +1,108 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 4 Oct 2017 10:24:23 +0200
Subject: [PATCH] pci/switchtec: Don't use completion's wait queue
The poll callback uses the completion's wait_queue_head_t member and
hands it to poll_wait() so the poll() caller gets a wakeup once the
command has completed. This does not work on RT because we don't have a
wait_queue_head_t in our completion implementation. No other driver in
tree does it like that, so this is the only driver that breaks.
Instead of using the completion, use a waitqueue with a status flag, as
suggested by Logan.
I don't have the HW so I have no idea if it works as expected, so please
test it.
Cc: Kurt Schwemmer <kurt.schwemmer@microsemi.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/pci/switch/switchtec.c | 22 +++++++++++++---------
1 file changed, 13 insertions(+), 9 deletions(-)
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -43,10 +43,11 @@ struct switchtec_user {
enum mrpc_state state;
- struct completion comp;
+ wait_queue_head_t cmd_comp;
struct kref kref;
struct list_head list;
+ bool cmd_done;
u32 cmd;
u32 status;
u32 return_code;
@@ -68,7 +69,7 @@ static struct switchtec_user *stuser_cre
stuser->stdev = stdev;
kref_init(&stuser->kref);
INIT_LIST_HEAD(&stuser->list);
- init_completion(&stuser->comp);
+ init_waitqueue_head(&stuser->cmd_comp);
stuser->event_cnt = atomic_read(&stdev->event_cnt);
dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
@@ -151,7 +152,7 @@ static int mrpc_queue_cmd(struct switcht
kref_get(&stuser->kref);
stuser->read_len = sizeof(stuser->data);
stuser_set_state(stuser, MRPC_QUEUED);
- init_completion(&stuser->comp);
+ stuser->cmd_done = false;
list_add_tail(&stuser->list, &stdev->mrpc_queue);
mrpc_cmd_submit(stdev);
@@ -188,7 +189,8 @@ static void mrpc_complete_cmd(struct swi
stuser->read_len);
out:
- complete_all(&stuser->comp);
+ stuser->cmd_done = true;
+ wake_up_interruptible(&stuser->cmd_comp);
list_del_init(&stuser->list);
stuser_put(stuser);
stdev->mrpc_busy = 0;
@@ -458,10 +460,11 @@ static ssize_t switchtec_dev_read(struct
mutex_unlock(&stdev->mrpc_mutex);
if (filp->f_flags & O_NONBLOCK) {
- if (!try_wait_for_completion(&stuser->comp))
+ if (!READ_ONCE(stuser->cmd_done))
return -EAGAIN;
} else {
- rc = wait_for_completion_interruptible(&stuser->comp);
+ rc = wait_event_interruptible(stuser->cmd_comp,
+ stuser->cmd_done);
if (rc < 0)
return rc;
}
@@ -509,7 +512,7 @@ static __poll_t switchtec_dev_poll(struc
struct switchtec_dev *stdev = stuser->stdev;
__poll_t ret = 0;
- poll_wait(filp, &stuser->comp.wait, wait);
+ poll_wait(filp, &stuser->cmd_comp, wait);
poll_wait(filp, &stdev->event_wq, wait);
if (lock_mutex_and_test_alive(stdev))
@@ -517,7 +520,7 @@ static __poll_t switchtec_dev_poll(struc
mutex_unlock(&stdev->mrpc_mutex);
- if (try_wait_for_completion(&stuser->comp))
+ if (READ_ONCE(stuser->cmd_done))
ret |= EPOLLIN | EPOLLRDNORM;
if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
@@ -1041,7 +1044,8 @@ static void stdev_kill(struct switchtec_
/* Wake up and kill any users waiting on an MRPC request */
list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
- complete_all(&stuser->comp);
+ stuser->cmd_done = true;
+ wake_up_interruptible(&stuser->cmd_comp);
list_del_init(&stuser->list);
stuser_put(stuser);
}
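The conversion follows a generic recipe: replace struct completion with a wait_queue_head_t plus a boolean, wake waiters explicitly, and hand the wait queue to poll_wait(). A stripped-down sketch of that recipe (the names are illustrative, not the switchtec code):

#include <linux/wait.h>

struct cmd_ctx {
	wait_queue_head_t cmd_comp;
	bool cmd_done;
};

static void cmd_ctx_init(struct cmd_ctx *c)
{
	init_waitqueue_head(&c->cmd_comp);
	c->cmd_done = false;
}

static void cmd_finish(struct cmd_ctx *c)	/* completion side */
{
	WRITE_ONCE(c->cmd_done, true);
	wake_up_interruptible(&c->cmd_comp);
}

static int cmd_wait(struct cmd_ctx *c)		/* reader side, interruptible */
{
	return wait_event_interruptible(c->cmd_comp, READ_ONCE(c->cmd_done));
}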

View File

@@ -0,0 +1,32 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 28 Oct 2013 12:19:57 +0100
Subject: wait.h: include atomic.h
| CC init/main.o
|In file included from include/linux/mmzone.h:9:0,
| from include/linux/gfp.h:4,
| from include/linux/kmod.h:22,
| from include/linux/module.h:13,
| from init/main.c:15:
|include/linux/wait.h: In function 'wait_on_atomic_t':
|include/linux/wait.h:982:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration]
| if (atomic_read(val) == 0)
| ^
This pops up on ARM. Non-RT kernels get their atomic.h include via spinlock.h.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/wait.h | 1 +
1 file changed, 1 insertion(+)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -10,6 +10,7 @@
#include <asm/current.h>
#include <uapi/linux/wait.h>
+#include <linux/atomic.h>
typedef struct wait_queue_entry wait_queue_entry_t;

View File

@@ -0,0 +1,231 @@
From: Daniel Wagner <daniel.wagner@bmw-carit.de>
Date: Fri, 11 Jul 2014 15:26:11 +0200
Subject: work-simple: Simple work queue implementation
Provides a framework for enqueuing callbacks from irq context in a
PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
Based on wait-simple.
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
---
include/linux/swork.h | 24 ++++++
kernel/sched/Makefile | 2
kernel/sched/swork.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 198 insertions(+), 1 deletion(-)
--- /dev/null
+++ b/include/linux/swork.h
@@ -0,0 +1,24 @@
+#ifndef _LINUX_SWORK_H
+#define _LINUX_SWORK_H
+
+#include <linux/list.h>
+
+struct swork_event {
+ struct list_head item;
+ unsigned long flags;
+ void (*func)(struct swork_event *);
+};
+
+static inline void INIT_SWORK(struct swork_event *event,
+ void (*func)(struct swork_event *))
+{
+ event->flags = 0;
+ event->func = func;
+}
+
+bool swork_queue(struct swork_event *sev);
+
+int swork_get(void);
+void swork_put(void);
+
+#endif /* _LINUX_SWORK_H */
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -18,7 +18,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
+obj-y += wait.o wait_bit.o swait.o swork.o completion.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
--- /dev/null
+++ b/kernel/sched/swork.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
+ *
+ * Provides a framework for enqueuing callbacks from irq context
+ * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
+ */
+
+#include <linux/swait.h>
+#include <linux/swork.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+
+#define SWORK_EVENT_PENDING (1 << 0)
+
+static DEFINE_MUTEX(worker_mutex);
+static struct sworker *glob_worker;
+
+struct sworker {
+ struct list_head events;
+ struct swait_queue_head wq;
+
+ raw_spinlock_t lock;
+
+ struct task_struct *task;
+ int refs;
+};
+
+static bool swork_readable(struct sworker *worker)
+{
+ bool r;
+
+ if (kthread_should_stop())
+ return true;
+
+ raw_spin_lock_irq(&worker->lock);
+ r = !list_empty(&worker->events);
+ raw_spin_unlock_irq(&worker->lock);
+
+ return r;
+}
+
+static int swork_kthread(void *arg)
+{
+ struct sworker *worker = arg;
+
+ for (;;) {
+ swait_event_interruptible_exclusive(worker->wq,
+ swork_readable(worker));
+ if (kthread_should_stop())
+ break;
+
+ raw_spin_lock_irq(&worker->lock);
+ while (!list_empty(&worker->events)) {
+ struct swork_event *sev;
+
+ sev = list_first_entry(&worker->events,
+ struct swork_event, item);
+ list_del(&sev->item);
+ raw_spin_unlock_irq(&worker->lock);
+
+ WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
+ &sev->flags));
+ sev->func(sev);
+ raw_spin_lock_irq(&worker->lock);
+ }
+ raw_spin_unlock_irq(&worker->lock);
+ }
+ return 0;
+}
+
+static struct sworker *swork_create(void)
+{
+ struct sworker *worker;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (!worker)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&worker->events);
+ raw_spin_lock_init(&worker->lock);
+ init_swait_queue_head(&worker->wq);
+
+ worker->task = kthread_run(swork_kthread, worker, "kswork");
+ if (IS_ERR(worker->task)) {
+ kfree(worker);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return worker;
+}
+
+static void swork_destroy(struct sworker *worker)
+{
+ kthread_stop(worker->task);
+
+ WARN_ON(!list_empty(&worker->events));
+ kfree(worker);
+}
+
+/**
+ * swork_queue - queue swork
+ *
+ * Returns %false if @work was already on a queue, %true otherwise.
+ *
+ * The work is queued and processed on a random CPU
+ */
+bool swork_queue(struct swork_event *sev)
+{
+ unsigned long flags;
+
+ if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
+ return false;
+
+ raw_spin_lock_irqsave(&glob_worker->lock, flags);
+ list_add_tail(&sev->item, &glob_worker->events);
+ raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
+
+ swake_up_one(&glob_worker->wq);
+ return true;
+}
+EXPORT_SYMBOL_GPL(swork_queue);
+
+/**
+ * swork_get - get an instance of the sworker
+ *
+ * Returns a negative error code if the initialization of the worker did not
+ * work, %0 otherwise.
+ *
+ */
+int swork_get(void)
+{
+ struct sworker *worker;
+
+ mutex_lock(&worker_mutex);
+ if (!glob_worker) {
+ worker = swork_create();
+ if (IS_ERR(worker)) {
+ mutex_unlock(&worker_mutex);
+ return -ENOMEM;
+ }
+
+ glob_worker = worker;
+ }
+
+ glob_worker->refs++;
+ mutex_unlock(&worker_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(swork_get);
+
+/**
+ * swork_put - puts an instance of the sworker
+ *
+ * Will destroy the sworker thread. This function must not be called until all
+ * queued events have been completed.
+ */
+void swork_put(void)
+{
+ mutex_lock(&worker_mutex);
+
+ glob_worker->refs--;
+ if (glob_worker->refs > 0)
+ goto out;
+
+ swork_destroy(glob_worker);
+ glob_worker = NULL;
+out:
+ mutex_unlock(&worker_mutex);
+}
+EXPORT_SYMBOL_GPL(swork_put);
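A hedged sketch of how a driver would consume this API (the my_* names and the irq handler are hypothetical): take a reference to the kswork thread at init time, queue events from hard-irq context, and drop the reference on teardown.

#include <linux/interrupt.h>
#include <linux/swork.h>

static struct swork_event my_event;

static void my_event_fn(struct swork_event *sev)
{
	/* Runs in the kswork kthread: preemptible, may take sleeping locks. */
}

static irqreturn_t my_irq_handler(int irq, void *data)
{
	swork_queue(&my_event);		/* safe from hard-irq context on RT */
	return IRQ_HANDLED;
}

static int my_driver_init(void)
{
	int err = swork_get();		/* creates kswork on first use */

	if (err)
		return err;
	INIT_SWORK(&my_event, my_event_fn);
	return 0;
}

static void my_driver_exit(void)
{
	swork_put();			/* destroys kswork when the last user is gone */
}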

View File

@@ -0,0 +1,29 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 10 Sep 2018 18:00:31 +0200
Subject: [PATCH] work-simple: drop a shift statement in SWORK_EVENT_PENDING
Dan Carpenter reported
| smatch warnings:
|kernel/sched/swork.c:63 swork_kthread() warn: test_bit() takes a bit number
This is not a bug because we shift by zero (and use the same value in
both places).
Nevertheless I'm dropping that shift by zero to keep smatch quiet.
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/sched/swork.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/sched/swork.c
+++ b/kernel/sched/swork.c
@@ -12,7 +12,7 @@
#include <linux/spinlock.h>
#include <linux/export.h>
-#define SWORK_EVENT_PENDING (1 << 0)
+#define SWORK_EVENT_PENDING 1
static DEFINE_MUTEX(worker_mutex);
static struct sworker *glob_worker;
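The smatch complaint stems from mixing the two conventions for flag macros: the *_bit() helpers take a bit number, while open-coded mask operations take a shifted mask. A short illustration of the distinction (not from the patch):

#include <linux/bitops.h>

#define PENDING_BIT	0			/* bit number: for set_bit()/test_bit() */
#define PENDING_MASK	(1UL << PENDING_BIT)	/* mask: for &, |, ~ on the word itself */

static unsigned long example_flags;

static void mark_pending(void)
{
	set_bit(PENDING_BIT, &example_flags);	/* atomic, takes the bit number */
}

static bool is_pending(void)
{
	return example_flags & PENDING_MASK;	/* open-coded test uses the mask */
}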

Some files were not shown because too many files have changed in this diff.