port 5.11.4-rt

Signed-off-by: Tiejun Chen <tiejun.china@gmail.com>
Author: Tiejun Chen
Date:   2021-06-29 10:34:43 -07:00
parent 39ad5a1ab6
commit ab288c4526
400 files changed, 16201 insertions(+), 25888 deletions(-)


@@ -0,0 +1,77 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 30 Oct 2020 13:59:06 +0100
Subject: [PATCH] highmem: Don't disable preemption on RT in kmap_atomic()
Disabling preemption makes it impossible to acquire sleeping locks within
a kmap_atomic() section.
For PREEMPT_RT it is sufficient to disable migration.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/highmem-internal.h | 27 ++++++++++++++++++++++-----
1 file changed, 22 insertions(+), 5 deletions(-)
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -90,7 +90,11 @@ static inline void __kunmap_local(void *
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
+
pagefault_disable();
return __kmap_local_page_prot(page, prot);
}
@@ -102,7 +106,11 @@ static inline void *kmap_atomic(struct p
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
+
pagefault_disable();
return __kmap_local_pfn_prot(pfn, kmap_prot);
}
@@ -111,7 +119,10 @@ static inline void __kunmap_atomic(void
{
kunmap_local_indexed(addr);
pagefault_enable();
- preempt_enable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_enable();
+ else
+ preempt_enable();
}
unsigned int __nr_free_highpages(void);
@@ -184,7 +195,10 @@ static inline void __kunmap_local(void *
static inline void *kmap_atomic(struct page *page)
{
- preempt_disable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_disable();
+ else
+ preempt_disable();
pagefault_disable();
return page_address(page);
}
@@ -205,7 +219,10 @@ static inline void __kunmap_atomic(void
kunmap_flush_on_unmap(addr);
#endif
pagefault_enable();
- preempt_enable();
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ migrate_enable();
+ else
+ preempt_enable();
}
static inline unsigned int nr_free_highpages(void) { return 0; }
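
For reference, the entry/exit pattern this patch introduces at every
kmap_atomic() variant can be condensed into a pair of helpers. This is an
illustrative sketch only; the patch open-codes the pattern at each call site,
and the helper names here are invented:

	static inline void kmap_atomic_begin(void)
	{
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			migrate_disable();	/* pin to this CPU, stay preemptible */
		else
			preempt_disable();	/* classic non-RT behaviour */
		pagefault_disable();
	}

	static inline void kmap_atomic_end(void)
	{
		pagefault_enable();
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			migrate_enable();
		else
			preempt_enable();
	}

This keeps the mapping CPU-local (the per-CPU kmap slots still work) while
allowing sleeping locks to be acquired inside the section on PREEMPT_RT.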


@@ -0,0 +1,55 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 6 Dec 2020 22:40:07 +0100
Subject: [PATCH] timers: Move clearing of base::timer_running under base::lock
syzbot reported KCSAN data races vs. timer_base::timer_running being set to
NULL without holding base::lock in expire_timers().
This looks innocent and most reads are clearly not problematic but for a
non-RT kernel it's completely irrelevant whether the store happens before
or after taking the lock. For an RT kernel moving the store under the lock
requires an extra unlock/lock pair in the case that there is a waiter for
the timer. But that's not the end of the world and definitely not worth the
trouble of adding boatloads of comments and annotations to the code. Famous
last words...
Reported-by: syzbot+aa7c2385d46c5eba0b89@syzkaller.appspotmail.com
Reported-by: syzbot+abea4558531bae1ba9fe@syzkaller.appspotmail.com
Link: https://lkml.kernel.org/r/87lfea7gw8.fsf@nanos.tec.linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable-rt@vger.kernel.org
---
kernel/time/timer.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1263,8 +1263,10 @@ static inline void timer_base_unlock_exp
static void timer_sync_wait_running(struct timer_base *base)
{
if (atomic_read(&base->timer_waiters)) {
+ raw_spin_unlock_irq(&base->lock);
spin_unlock(&base->expiry_lock);
spin_lock(&base->expiry_lock);
+ raw_spin_lock_irq(&base->lock);
}
}
@@ -1455,14 +1457,14 @@ static void expire_timers(struct timer_b
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn, baseclk);
- base->running_timer = NULL;
raw_spin_lock(&base->lock);
+ base->running_timer = NULL;
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn, baseclk);
+ raw_spin_lock_irq(&base->lock);
base->running_timer = NULL;
timer_sync_wait_running(base);
- raw_spin_lock_irq(&base->lock);
}
}
}
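
Putting the two hunks together, the non-IRQSAFE expiry path now reads roughly
as follows (a condensed sketch, not verbatim kernel code):

	raw_spin_unlock_irq(&base->lock);
	call_timer_fn(timer, fn, baseclk);
	raw_spin_lock_irq(&base->lock);	/* retake base->lock first ...       */
	base->running_timer = NULL;	/* ... so the clear is under the lock */
	/*
	 * If a del_timer_sync() waiter exists, this drops and retakes both
	 * base->lock and expiry_lock once to let the waiter make progress.
	 */
	timer_sync_wait_running(base);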


@@ -0,0 +1,245 @@
From: Tian Tao <tiantao6@hisilicon.com>
Date: Sat, 13 Feb 2021 20:58:30 +1300
Subject: [PATCH 1/2] mm/zswap: add a flag to indicate if zpool can do sleep
map
Patch series "Fix the compatibility of zsmalloc and zswap".
The compatibility of zsmalloc and zswap was broken by commit 1ec3b5fe6eec
("mm/zswap: move to use crypto_acomp API for hardware acceleration").
Patch #1 adds a flag to zpool; zswap then uses it to determine whether zpool
drivers such as zbud/z3fold/zsmalloc will enter an atomic context after
mapping.
The difference between zbud/z3fold and zsmalloc is that zsmalloc requires
an atomic context since its map function holds a preempt-disabled lock,
while zbud/z3fold don't require an atomic context. So patch #2 sets the
flag sleep_mapped to true, indicating that zbud/z3fold can sleep after
mapping; zsmalloc doesn't support sleeping after mapping, so the flag is
not set for it.
This patch (of 2):
Add a flag to zpool, named "sleep_mapped", and set it to true for
zbud/z3fold but not for zsmalloc, so its default value is false. zswap
can then take the current path if the flag is true; if it's false, it
copies the data from src to a temporary buffer, unmaps the handle, takes
the mutex, and processes the buffer instead of src, to avoid a sleeping
function being called from atomic context.
[natechancellor@gmail.com: add return value in zswap_frontswap_load]
Link: https://lkml.kernel.org/r/20210121214804.926843-1-natechancellor@gmail.com
[tiantao6@hisilicon.com: fix potential memory leak]
Link: https://lkml.kernel.org/r/1611538365-51811-1-git-send-email-tiantao6@hisilicon.com
[colin.king@canonical.com: fix potential uninitialized pointer read on tmp]
Link: https://lkml.kernel.org/r/20210128141728.639030-1-colin.king@canonical.com
[tiantao6@hisilicon.com: fix variable 'entry' is uninitialized when used]
Link: https://lkml.kernel.org/r/1611223030-58346-1-git-send-email-tiantao6@hisilicon.com
Link: https://lkml.kernel.org/r/1611035683-12732-1-git-send-email-tiantao6@hisilicon.com
Link: https://lkml.kernel.org/r/1611035683-12732-2-git-send-email-tiantao6@hisilicon.com
[song.bao.hua@hisilicon.com: Rewrote changelog]
Fixes: 1ec3b5fe6e ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Vitaly Wool <vitaly.wool@konsulko.com>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/zpool.h | 3 ++
mm/zpool.c | 13 ++++++++++++
mm/zswap.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 62 insertions(+), 5 deletions(-)
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -73,6 +73,7 @@ u64 zpool_get_total_size(struct zpool *p
* @malloc: allocate mem from a pool.
* @free: free mem from a pool.
* @shrink: shrink the pool.
+ * @sleep_mapped: whether zpool driver can sleep during map.
* @map: map a handle.
* @unmap: unmap a handle.
* @total_size: get total size of a pool.
@@ -100,6 +101,7 @@ struct zpool_driver {
int (*shrink)(void *pool, unsigned int pages,
unsigned int *reclaimed);
+ bool sleep_mapped;
void *(*map)(void *pool, unsigned long handle,
enum zpool_mapmode mm);
void (*unmap)(void *pool, unsigned long handle);
@@ -112,5 +114,6 @@ void zpool_register_driver(struct zpool_
int zpool_unregister_driver(struct zpool_driver *driver);
bool zpool_evictable(struct zpool *pool);
+bool zpool_can_sleep_mapped(struct zpool *pool);
#endif
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -23,6 +23,7 @@ struct zpool {
void *pool;
const struct zpool_ops *ops;
bool evictable;
+ bool can_sleep_mapped;
struct list_head list;
};
@@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const ch
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
zpool->evictable = driver->shrink && ops && ops->evict;
+ zpool->can_sleep_mapped = driver->sleep_mapped;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool
return zpool->evictable;
}
+/**
+ * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped.
+ * @zpool: The zpool to test
+ *
+ * Returns: true if zpool can sleep; false otherwise.
+ */
+bool zpool_can_sleep_mapped(struct zpool *zpool)
+{
+ return zpool->can_sleep_mapped;
+}
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src;
+ u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
+ if (!zpool_can_sleep_mapped(pool)) {
+ tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ }
+
/* extract swpentry from data */
zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
swpentry = zhdr->swpentry; /* here */
@@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct
/* entry was invalidated */
spin_unlock(&tree->lock);
zpool_unmap_handle(pool, handle);
+ kfree(tmp);
return 0;
}
spin_unlock(&tree->lock);
@@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct
dlen = PAGE_SIZE;
src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+
+ zpool_unmap_handle(pool, handle);
+ }
+
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
@@ -1033,7 +1048,11 @@ static int zswap_writeback_entry(struct
spin_unlock(&tree->lock);
end:
- zpool_unmap_handle(pool, handle);
+ if (zpool_can_sleep_mapped(pool))
+ zpool_unmap_handle(pool, handle);
+ else
+ kfree(tmp);
+
return ret;
}
@@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned
struct zswap_entry *entry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src, *dst;
+ u8 *src, *dst, *tmp;
unsigned int dlen;
int ret;
@@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned
dst = kmap_atomic(page);
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
+ ret = 0;
goto freeentry;
}
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+ tmp = kmalloc(entry->length, GFP_ATOMIC);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto freeentry;
+ }
+ }
+
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
if (zpool_evictable(entry->pool->zpool))
src += sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ }
+
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
@@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned
ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
mutex_unlock(acomp_ctx->mutex);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ if (zpool_can_sleep_mapped(entry->pool->zpool))
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ else
+ kfree(tmp);
+
BUG_ON(ret);
freeentry:
@@ -1279,7 +1320,7 @@ static int zswap_frontswap_load(unsigned
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- return 0;
+ return ret;
}
/* frees an entry in zswap */
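
Taken together, the pattern that both zswap_writeback_entry() and
zswap_frontswap_load() now follow can be sketched as below (condensed and
hypothetical; `len' stands in for entry->length and error paths are omitted):

	u8 *src, *tmp = NULL;

	if (!zpool_can_sleep_mapped(zpool)) {
		/* GFP_ATOMIC: allocated before entering the mapped section */
		tmp = kmalloc(len, GFP_ATOMIC);
		if (!tmp)
			return -ENOMEM;
	}

	src = zpool_map_handle(zpool, handle, ZPOOL_MM_RO);
	if (!zpool_can_sleep_mapped(zpool)) {
		memcpy(tmp, src, len);		   /* copy out while mapped */
		src = tmp;
		zpool_unmap_handle(zpool, handle); /* leave atomic context */
	}

	/* ... sleeping is now allowed: mutex_lock(), crypto_acomp, ... */

	if (zpool_can_sleep_mapped(zpool))
		zpool_unmap_handle(zpool, handle);
	else
		kfree(tmp);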


@@ -0,0 +1,45 @@
From: Tian Tao <tiantao6@hisilicon.com>
Date: Sat, 13 Feb 2021 20:58:31 +1300
Subject: [PATCH 2/2] mm: set the sleep_mapped to true for zbud and z3fold
The zpool API now carries a flag indicating whether the zpool driver can
enter an atomic context after mapping. This patch sets it to true for
z3fold and zbud.
Link: https://lkml.kernel.org/r/1611035683-12732-3-git-send-email-tiantao6@hisilicon.com
[song.bao.hua@hisilicon.com: Rewrote changelog]
Fixes: 1ec3b5fe6e ("mm/zswap: move to use crypto_acomp API for hardware acceleration")
Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Reviewed-by: Vitaly Wool <vitaly.wool@konsulko.com>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Mike Galbraith <efault@gmx.de>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/z3fold.c | 1 +
mm/zbud.c | 1 +
2 files changed, 2 insertions(+)
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1778,6 +1778,7 @@ static u64 z3fold_zpool_total_size(void
static struct zpool_driver z3fold_zpool_driver = {
.type = "z3fold",
+ .sleep_mapped = true,
.owner = THIS_MODULE,
.create = z3fold_zpool_create,
.destroy = z3fold_zpool_destroy,
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *p
static struct zpool_driver zbud_zpool_driver = {
.type = "zbud",
+ .sleep_mapped = true,
.owner = THIS_MODULE,
.create = zbud_zpool_create,
.destroy = zbud_zpool_destroy,


@@ -0,0 +1,42 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 23 Jan 2021 21:10:26 +0100
Subject: [PATCH] blk-mq: Always complete remote completion requests in
softirq
Controllers with multiple queues have their IRQ handlers pinned to a
CPU. The core shouldn't need to complete the request on a remote CPU.
Remove this case and always raise the softirq to complete the request.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
block/blk-mq.c | 14 +-------------
1 file changed, 1 insertion(+), 13 deletions(-)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -628,19 +628,7 @@ static void __blk_mq_complete_request_re
{
struct request *rq = data;
- /*
- * For most of single queue controllers, there is only one irq vector
- * for handling I/O completion, and the only irq's affinity is set
- * to all possible CPUs. On most of ARCHs, this affinity means the irq
- * is handled on one specific CPU.
- *
- * So complete I/O requests in softirq context in case of single queue
- * devices to avoid degrading I/O performance due to irqsoff latency.
- */
- if (rq->q->nr_hw_queues == 1)
- blk_mq_trigger_softirq(rq);
- else
- rq->q->mq_ops->complete(rq);
+ blk_mq_trigger_softirq(rq);
}
static inline bool blk_mq_complete_need_ipi(struct request *rq)


@@ -0,0 +1,188 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 23 Jan 2021 21:10:27 +0100
Subject: [PATCH] blk-mq: Use llist_head for blk_cpu_done
With llist_head it is possible to avoid the locking (the irq-off region)
when items are added. This makes it possible to add items on a remote
CPU without additional locking.
llist_add() returns true if the list was previously empty. This can be
used to invoke the SMP function call / raise the softirq only if the first
item was added (otherwise it is already pending).
This simplifies the code a little and reduces the IRQ-off regions.
blk_mq_raise_softirq() needs a preempt-disable section to ensure the
request is enqueued on the same CPU on which the softirq is raised.
Some callers (USB-storage) invoke this path in preemptible context.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
block/blk-mq.c | 97 ++++++++++++++++++++-----------------------------
include/linux/blkdev.h | 2 -
2 files changed, 42 insertions(+), 57 deletions(-)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,7 +41,7 @@
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -567,68 +567,29 @@ void blk_mq_end_request(struct request *
}
EXPORT_SYMBOL(blk_mq_end_request);
-/*
- * Softirq action handler - move entries to local list and loop over them
- * while passing them to the queue registered handler.
- */
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+static void blk_complete_reqs(struct llist_head *list)
{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = this_cpu_ptr(&blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
+ struct request *rq, *next;
- rq = list_entry(local_list.next, struct request, ipi_list);
- list_del_init(&rq->ipi_list);
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
rq->q->mq_ops->complete(rq);
- }
}
-static void blk_mq_trigger_softirq(struct request *rq)
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
- struct list_head *list;
- unsigned long flags;
-
- local_irq_save(flags);
- list = this_cpu_ptr(&blk_cpu_done);
- list_add_tail(&rq->ipi_list, list);
-
- /*
- * If the list only contains our just added request, signal a raise of
- * the softirq. If there are already entries there, someone already
- * raised the irq but it hasn't run yet.
- */
- if (list->next == &rq->ipi_list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_restore(flags);
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}
static int blk_softirq_cpu_dead(unsigned int cpu)
{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- this_cpu_ptr(&blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
-
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
return 0;
}
-
static void __blk_mq_complete_request_remote(void *data)
{
- struct request *rq = data;
-
- blk_mq_trigger_softirq(rq);
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
}
static inline bool blk_mq_complete_need_ipi(struct request *rq)
@@ -657,6 +618,30 @@ static inline bool blk_mq_complete_need_
return cpu_online(rq->mq_ctx->cpu);
}
+static void blk_mq_complete_send_ipi(struct request *rq)
+{
+ struct llist_head *list;
+ unsigned int cpu;
+
+ cpu = rq->mq_ctx->cpu;
+ list = &per_cpu(blk_cpu_done, cpu);
+ if (llist_add(&rq->ipi_list, list)) {
+ INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
+ smp_call_function_single_async(cpu, &rq->csd);
+ }
+}
+
+static void blk_mq_raise_softirq(struct request *rq)
+{
+ struct llist_head *list;
+
+ preempt_disable();
+ list = this_cpu_ptr(&blk_cpu_done);
+ if (llist_add(&rq->ipi_list, list))
+ raise_softirq(BLOCK_SOFTIRQ);
+ preempt_enable();
+}
+
bool blk_mq_complete_request_remote(struct request *rq)
{
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
@@ -669,15 +654,15 @@ bool blk_mq_complete_request_remote(stru
return false;
if (blk_mq_complete_need_ipi(rq)) {
- INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
- smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
- } else {
- if (rq->q->nr_hw_queues > 1)
- return false;
- blk_mq_trigger_softirq(rq);
+ blk_mq_complete_send_ipi(rq);
+ return true;
}
- return true;
+ if (rq->q->nr_hw_queues == 1) {
+ blk_mq_raise_softirq(rq);
+ return true;
+ }
+ return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
@@ -3892,7 +3877,7 @@ static int __init blk_mq_init(void)
int i;
for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+ init_llist_head(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -153,7 +153,7 @@ struct request {
*/
union {
struct hlist_node hash; /* merge hash */
- struct list_head ipi_list;
+ struct llist_node ipi_list;
};
/*
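
The idiom the changelog relies on - only the producer that finds the list
empty arms the consumer - can be shown in a minimal sketch (hypothetical
names, not part of the patch):

	static DEFINE_PER_CPU(struct llist_head, done_list);

	static void queue_done(struct request *rq)
	{
		struct llist_head *list;

		/* preempt off: raise the softirq on the CPU we enqueue on */
		preempt_disable();
		list = this_cpu_ptr(&done_list);
		/*
		 * llist_add() returns true only when the list was empty,
		 * i.e. no softirq has been raised for this batch yet.
		 */
		if (llist_add(&rq->ipi_list, list))
			raise_softirq(BLOCK_SOFTIRQ);
		preempt_enable();
	}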


@@ -0,0 +1,79 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 9 Nov 2020 21:30:41 +0100
Subject: [PATCH 1/2] kthread: Move prio/affinity change into the newly created
thread
With threaded interrupts enabled, the nouveau driver reported the
following:
| Chain exists of:
| &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
| Possible unsafe locking scenario:
|
| CPU0 CPU1
| ---- ----
| lock(&cpuset_rwsem);
| lock(&device->mutex);
| lock(&cpuset_rwsem);
| lock(&mm->mmap_lock#2);
The device->mutex is nvkm_device::mutex.
Unblocking the lockchain at `cpuset_rwsem' is probably the easiest thing
to do.
Move the priority reset to the start of the newly created thread.
Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
kernel/kthread.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -243,6 +243,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
+ static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@@ -273,6 +274,13 @@ static int kthread(void *_create)
init_completion(&self->parked);
current->vfork_done = &self->exited;
+ /*
+ * The new thread inherited kthreadd's priority and CPU mask. Reset
+ * back to default in case they have been changed.
+ */
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
@@ -370,7 +378,6 @@ struct task_struct *__kthread_create_on_
}
task = create->result;
if (!IS_ERR(task)) {
- static const struct sched_param param = { .sched_priority = 0 };
char name[TASK_COMM_LEN];
/*
@@ -379,13 +386,6 @@ struct task_struct *__kthread_create_on_
*/
vsnprintf(name, sizeof(name), namefmt, args);
set_task_comm(task, name);
- /*
- * root may have changed our (kthreadd's) priority or CPU mask.
- * The kernel thread should not inherit these properties.
- */
- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task,
- housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
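
In effect the responsibility moves from the creator into the thread itself
(a condensed sketch of the new flow, derived from the patch):

	/* inside the new thread, before it reports itself as created: */
	static const struct sched_param param = { .sched_priority = 0 };

	sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
	set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));

The creator (__kthread_create_on_node()) now only sets the task name, so it
no longer reaches into scheduler state - and thus cpuset_rwsem - while its
caller may be holding locks such as device->mutex.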


@@ -0,0 +1,55 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Nov 2020 23:32:39 +0100
Subject: [PATCH 2/2] genirq: Move prio assignment into the newly created
thread
With threaded interrupts enabled, the nouveau driver reported the
following:
| Chain exists of:
| &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem
|
| Possible unsafe locking scenario:
|
| CPU0 CPU1
| ---- ----
| lock(&cpuset_rwsem);
| lock(&device->mutex);
| lock(&cpuset_rwsem);
| lock(&mm->mmap_lock#2);
The device->mutex is nvkm_device::mutex.
Unblocking the lockchain at `cpuset_rwsem' is probably the easiest thing
to do.
Move the priority assignment to the start of the newly created thread.
Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()")
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: Patch description]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de
---
kernel/irq/manage.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1225,6 +1225,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
+ sched_set_fifo(current);
+
if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@@ -1390,8 +1392,6 @@ setup_irq_thread(struct irqaction *new,
if (IS_ERR(t))
return PTR_ERR(t);
- sched_set_fifo(t);
-
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code


@@ -0,0 +1,123 @@
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Sun, 22 Nov 2020 20:19:04 +0000
Subject: [PATCH] notifier: Make atomic_notifiers use raw_spinlock
Booting a recent PREEMPT_RT kernel (v5.10-rc3-rt7-rebase) on my arm64 Juno
leads to the idle task blocking on an RT sleeping spinlock down some
notifier path:
[ 1.809101] BUG: scheduling while atomic: swapper/5/0/0x00000002
[ 1.809116] Modules linked in:
[ 1.809123] Preemption disabled at:
[ 1.809125] secondary_start_kernel (arch/arm64/kernel/smp.c:227)
[ 1.809146] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.10.0-rc3-rt7 #168
[ 1.809153] Hardware name: ARM Juno development board (r0) (DT)
[ 1.809158] Call trace:
[ 1.809160] dump_backtrace (arch/arm64/kernel/stacktrace.c:100 (discriminator 1))
[ 1.809170] show_stack (arch/arm64/kernel/stacktrace.c:198)
[ 1.809178] dump_stack (lib/dump_stack.c:122)
[ 1.809188] __schedule_bug (kernel/sched/core.c:4886)
[ 1.809197] __schedule (./arch/arm64/include/asm/preempt.h:18 kernel/sched/core.c:4913 kernel/sched/core.c:5040)
[ 1.809204] preempt_schedule_lock (kernel/sched/core.c:5365 (discriminator 1))
[ 1.809210] rt_spin_lock_slowlock_locked (kernel/locking/rtmutex.c:1072)
[ 1.809217] rt_spin_lock_slowlock (kernel/locking/rtmutex.c:1110)
[ 1.809224] rt_spin_lock (./include/linux/rcupdate.h:647 kernel/locking/rtmutex.c:1139)
[ 1.809231] atomic_notifier_call_chain_robust (kernel/notifier.c:71 kernel/notifier.c:118 kernel/notifier.c:186)
[ 1.809240] cpu_pm_enter (kernel/cpu_pm.c:39 kernel/cpu_pm.c:93)
[ 1.809249] psci_enter_idle_state (drivers/cpuidle/cpuidle-psci.c:52 drivers/cpuidle/cpuidle-psci.c:129)
[ 1.809258] cpuidle_enter_state (drivers/cpuidle/cpuidle.c:238)
[ 1.809267] cpuidle_enter (drivers/cpuidle/cpuidle.c:353)
[ 1.809275] do_idle (kernel/sched/idle.c:132 kernel/sched/idle.c:213 kernel/sched/idle.c:273)
[ 1.809282] cpu_startup_entry (kernel/sched/idle.c:368 (discriminator 1))
[ 1.809288] secondary_start_kernel (arch/arm64/kernel/smp.c:273)
Two points worth noting:
1) That this is conceptually the same issue as pointed out in:
313c8c16ee62 ("PM / CPU: replace raw_notifier with atomic_notifier")
2) Only the _robust() variant of atomic_notifier callchains suffers from
this.
AFAICT only the cpu_pm_notifier_chain really needs to be changed, but
singling it out would mean introducing a new (truly) non-blocking API. At
the same time, callers that are fine with any blocking within the call
chain should use blocking notifiers, so patching up all atomic_notifier's
doesn't seem *too* crazy to me.
Fixes: 70d932985757 ("notifier: Fix broken error handling pattern")
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201122201904.30940-1-valentin.schneider@arm.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/notifier.h | 6 +++---
kernel/notifier.c | 12 ++++++------
2 files changed, 9 insertions(+), 9 deletions(-)
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -58,7 +58,7 @@ struct notifier_block {
};
struct atomic_notifier_head {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct notifier_block __rcu *head;
};
@@ -78,7 +78,7 @@ struct srcu_notifier_head {
};
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
- spin_lock_init(&(name)->lock); \
+ raw_spin_lock_init(&(name)->lock); \
(name)->head = NULL; \
} while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \
@@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(stru
cleanup_srcu_struct(&(name)->srcu);
#define ATOMIC_NOTIFIER_INIT(name) { \
- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
.head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) { \
.rwsem = __RWSEM_INITIALIZER((name).rwsem), \
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struc
unsigned long flags;
int ret;
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_register(&nh->head, n);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
@@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(str
unsigned long flags;
int ret;
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
synchronize_rcu();
return ret;
}
@@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(st
* Musn't use RCU; because then the notifier list can
* change between the up and down traversal.
*/
- spin_lock_irqsave(&nh->lock, flags);
+ raw_spin_lock_irqsave(&nh->lock, flags);
ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
- spin_unlock_irqrestore(&nh->lock, flags);
+ raw_spin_unlock_irqrestore(&nh->lock, flags);
return ret;
}
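
For readers less familiar with PREEMPT_RT lock semantics: on RT a spinlock_t
becomes a sleeping rtmutex-based lock, while a raw_spinlock_t keeps the
spinning, IRQ-disabling behaviour, which is what makes it legal from the idle
task. A minimal illustrative sketch (not part of the patch):

	static DEFINE_SPINLOCK(sleeps_on_rt);	  /* rtmutex under RT   */
	static DEFINE_RAW_SPINLOCK(never_sleeps); /* spins on RT too    */

	static void called_from_cpuidle(void)
	{
		unsigned long flags;

		/*
		 * spin_lock_irqsave(&sleeps_on_rt, flags) may schedule on
		 * RT: exactly the "scheduling while atomic" splat above.
		 */
		raw_spin_lock_irqsave(&never_sleeps, flags); /* safe on RT and !RT */
		/* ... walk the notifier chain ... */
		raw_spin_unlock_irqrestore(&never_sleeps, flags);
	}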


@@ -0,0 +1,34 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 15 Dec 2020 15:16:45 +0100
Subject: [PATCH 1/5] rcu: Make RCU_BOOST default on CONFIG_PREEMPT_RT
On PREEMPT_RT kernels, RCU callbacks are deferred to the `rcuc' kthread.
This can stall RCU grace periods due to lengthy preemption not only of RCU
readers but also of 'rcuc' kthreads, either of which prevent grace periods
from completing, which can in turn result in OOM. Because PREEMPT_RT
kernels have more kthreads that can block grace periods, it is more
important for such kernels to enable RCU_BOOST.
This commit therefore makes RCU_BOOST the default on PREEMPT_RT.
RCU_BOOST can still be manually disabled if need be.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/rcu/Kconfig | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -188,8 +188,8 @@ config RCU_FAST_NO_HZ
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.


@@ -0,0 +1,57 @@
From: Scott Wood <swood@redhat.com>
Date: Tue, 15 Dec 2020 15:16:46 +0100
Subject: [PATCH 2/5] rcu: Unconditionally use rcuc threads on PREEMPT_RT
PREEMPT_RT systems have long used the rcutree.use_softirq kernel
boot parameter to avoid use of RCU_SOFTIRQ handlers, which can disrupt
real-time applications by invoking callbacks during return from interrupts
that arrived while executing time-critical code. This kernel boot
parameter instead runs RCU core processing in an 'rcuc' kthread, thus
allowing the scheduler to do its job of avoiding disrupting time-critical
code.
This commit therefore disables the rcutree.use_softirq kernel boot
parameter on PREEMPT_RT systems, thus forcing such systems to do RCU
core processing in 'rcuc' kthreads. This approach has long been in
use by users of the -rt patchset, and there have been no complaints.
There is therefore no way for the system administrator to override this
choice, at least without modifying and rebuilding the kernel.
Signed-off-by: Scott Wood <swood@redhat.com>
[bigeasy: Reword commit message]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[ paulmck: Update kernel-parameters.txt accordingly. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/admin-guide/kernel-parameters.txt | 4 ++++
kernel/rcu/tree.c | 4 +++-
2 files changed, 7 insertions(+), 1 deletion(-)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4092,6 +4092,10 @@
value, meaning that RCU_SOFTIRQ is used by default.
Specify rcutree.use_softirq=0 to use rcuc kthreads.
+ But note that CONFIG_PREEMPT_RT=y kernels disable
+ this kernel boot parameter, forcibly setting it
+ to zero.
+
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
tree. This is used by rcutorture, and might
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -100,8 +100,10 @@ static struct rcu_state rcu_state = {
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
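
The construct used here (and again in the next patch for
rcupdate.rcu_normal_after_boot) is a small idiom worth noting: derive the
default from the config and compile the module_param() out, so the choice
cannot be overridden on PREEMPT_RT. In sketch form:

	/* default follows the config ... */
	static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
	/* ... and the boot parameter only exists where the choice is open */
	#ifndef CONFIG_PREEMPT_RT
	module_param(use_softirq, bool, 0444);
	#endif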


@@ -0,0 +1,62 @@
From: Julia Cartwright <julia@ni.com>
Date: Tue, 15 Dec 2020 15:16:47 +0100
Subject: [PATCH 3/5] rcu: Enable rcu_normal_after_boot unconditionally for RT
Expedited RCU grace periods send IPIs to all non-idle CPUs, and thus can
disrupt time-critical code in real-time applications. However, there
is a portion of boot-time processing (presumably before any real-time
applications have started) where expedited RCU grace periods are the only
option. And so it is that experience with the -rt patchset indicates that
PREEMPT_RT systems should always set the rcupdate.rcu_normal_after_boot
kernel boot parameter.
This commit therefore makes the post-boot application environment safe
for real-time applications by making PREEMPT_RT systems disable the
rcupdate.rcu_normal_after_boot kernel boot parameter and acting as
if this parameter had been set. This means that post-boot calls to
synchronize_rcu_expedited() will be treated as if they were instead
calls to synchronize_rcu(), thus preventing the IPIs, and thus avoiding
disrupting real-time applications.
Suggested-by: Luiz Capitulino <lcapitulino@redhat.com>
Acked-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Julia Cartwright <julia@ni.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[ paulmck: Update kernel-parameters.txt accordingly. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/admin-guide/kernel-parameters.txt | 7 +++++++
kernel/rcu/update.c | 4 +++-
2 files changed, 10 insertions(+), 1 deletion(-)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4474,6 +4474,13 @@
only normal grace-period primitives. No effect
on CONFIG_TINY_RCU kernels.
+ But note that CONFIG_PREEMPT_RT=y kernels enables
+ this kernel boot parameter, forcibly setting
+ it to the value one, that is, converting any
+ post-boot attempt at an expedited RCU grace
+ period to instead use normal non-expedited
+ grace-period processing.
+
rcupdate.rcu_task_ipi_delay= [KNL]
Set time in jiffies during which RCU tasks will
avoid sending IPIs, starting with the beginning
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -56,8 +56,10 @@
#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(rcu_normal_after_boot, int, 0);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC


@@ -0,0 +1,28 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 15 Dec 2020 15:16:48 +0100
Subject: [PATCH 4/5] doc: Update RCU's requirements page about the PREEMPT_RT
wiki.
The PREEMPT_RT wiki moved from kernel.org to the Linux Foundation wiki.
The kernel.org wiki is read only.
This commit therefore updates the URL of the active PREEMPT_RT wiki.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/RCU/Design/Requirements/Requirements.rst | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -2319,7 +2319,7 @@ decides to throw at it.
The Linux kernel is used for real-time workloads, especially in
conjunction with the `-rt
-patchset <https://rt.wiki.kernel.org/index.php/Main_Page>`__. The
+patchset <https://wiki.linuxfoundation.org/realtime/>`__. The
real-time-latency response requirements are such that the traditional
approach of disabling preemption across RCU read-side critical sections
is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use


@@ -0,0 +1,233 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 15 Dec 2020 15:16:49 +0100
Subject: [PATCH 5/5] doc: Use CONFIG_PREEMPTION
CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT.
Both PREEMPT and PREEMPT_RT require the same functionality which today
depends on CONFIG_PREEMPT.
Update the documents and mention CONFIG_PREEMPTION. Spell out
CONFIG_PREEMPT_RT (instead of PREEMPT_RT) since it is an option now.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst | 4 -
Documentation/RCU/Design/Requirements/Requirements.rst | 24 +++++-----
Documentation/RCU/checklist.rst | 2
Documentation/RCU/rcubarrier.rst | 6 +-
Documentation/RCU/stallwarn.rst | 4 -
Documentation/RCU/whatisRCU.rst | 10 ++--
6 files changed, 25 insertions(+), 25 deletions(-)
--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
+++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
@@ -38,7 +38,7 @@ sections.
RCU-preempt Expedited Grace Periods
===================================
-``CONFIG_PREEMPT=y`` kernels implement RCU-preempt.
+``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt.
The overall flow of the handling of a given CPU by an RCU-preempt
expedited grace period is shown in the following diagram:
@@ -112,7 +112,7 @@ things.
RCU-sched Expedited Grace Periods
---------------------------------
-``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of
+``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. The overall flow of
the handling of a given CPU by an RCU-sched expedited grace period is
shown in the following diagram:
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -78,7 +78,7 @@ RCU treats a nested set as one big RCU r
Production-quality implementations of ``rcu_read_lock()`` and
``rcu_read_unlock()`` are extremely lightweight, and in fact have
exactly zero overhead in Linux kernels built for production use with
-``CONFIG_PREEMPT=n``.
+``CONFIG_PREEMPTION=n``.
This guarantee allows ordering to be enforced with extremely low
overhead to readers, for example:
@@ -1182,7 +1182,7 @@ and has become decreasingly so as memory
costs have plummeted. However, as I learned from Matt Mackall's
`bloatwatch <http://elinux.org/Linux_Tiny-FAQ>`__ efforts, memory
footprint is critically important on single-CPU systems with
-non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny
+non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny
RCU <https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com>`__
was born. Josh Triplett has since taken over the small-memory banner
with his `Linux kernel tinification <https://tiny.wiki.kernel.org/>`__
@@ -1498,7 +1498,7 @@ limitations.
Implementations of RCU for which ``rcu_read_lock()`` and
``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when
-``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there
+``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there
is no overhead. Except that if all these instances of
``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the
compiler, compilation will eventually fail due to exhausting memory,
@@ -1771,7 +1771,7 @@ implementation can be a no-op.
However, once the scheduler has spawned its first kthread, this early
boot trick fails for ``synchronize_rcu()`` (as well as for
-``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The
+``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPTION=y`` kernels. The
reason is that an RCU read-side critical section might be preempted,
which means that a subsequent ``synchronize_rcu()`` really does have to
wait for something, as opposed to simply returning immediately.
@@ -2040,7 +2040,7 @@ The compiler must not be permitted to tr
5 rcu_read_unlock();
6 do_something_with(v, user_v);
-If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel
+If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel
build, and if ``get_user()`` did page fault, the result would be a quiescent
state in the middle of an RCU read-side critical section. This misplaced
quiescent state could result in line 4 being a use-after-free access,
@@ -2322,7 +2322,7 @@ conjunction with the `-rt
patchset <https://wiki.linuxfoundation.org/realtime/>`__. The
real-time-latency response requirements are such that the traditional
approach of disabling preemption across RCU read-side critical sections
-is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use
+is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use
an RCU implementation that allows RCU read-side critical sections to be
preempted. This requirement made its presence known after users made it
clear that an earlier `real-time
@@ -2444,7 +2444,7 @@ includes ``rcu_read_lock_bh()``, ``rcu_r
``call_rcu_bh()``, ``rcu_barrier_bh()``, and
``rcu_read_lock_bh_held()``. However, the update-side APIs are now
simple wrappers for other RCU flavors, namely RCU-sched in
-CONFIG_PREEMPT=n kernels and RCU-preempt otherwise.
+CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise.
Sched Flavor (Historical)
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2462,11 +2462,11 @@ not have this property, given that any p
RCU read-side critical section can be a quiescent state. Therefore,
*RCU-sched* was created, which follows “classic” RCU in that an
RCU-sched grace period waits for pre-existing interrupt and NMI
-handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and
+handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and
RCU-sched APIs have identical implementations, while kernels built with
-``CONFIG_PREEMPT=y`` provide a separate implementation for each.
+``CONFIG_PREEMPTION=y`` provide a separate implementation for each.
-Note well that in ``CONFIG_PREEMPT=y`` kernels,
+Note well that in ``CONFIG_PREEMPTION=y`` kernels,
``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and
re-enable preemption, respectively. This means that if there was a
preemption attempt during the RCU-sched read-side critical section,
@@ -2629,10 +2629,10 @@ userspace execution also delimit tasks-R
The tasks-RCU API is quite compact, consisting only of
``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and
-``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines
+``rcu_barrier_tasks()``. In ``CONFIG_PREEMPTION=n`` kernels, trampolines
cannot be preempted, so these APIs map to ``call_rcu()``,
``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In
-``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these
+``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these
three APIs are therefore implemented by separate functions that check
for voluntary context switches.
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -214,7 +214,7 @@ over a rather long period of time, but i
the rest of the system.
7. As of v4.20, a given kernel implements only one RCU flavor,
- which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y.
+ which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y.
If the updater uses call_rcu() or synchronize_rcu(),
then the corresponding readers my use rcu_read_lock() and
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
--- a/Documentation/RCU/rcubarrier.rst
+++ b/Documentation/RCU/rcubarrier.rst
@@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronizat
of as a replacement for read-writer locking (among other things), but with
very low-overhead readers that are immune to deadlock, priority inversion,
and unbounded latency. RCU read-side critical sections are delimited
-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT
+by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION
kernels, generate no code whatsoever.
This means that RCU writers are unaware of the presence of concurrent
@@ -329,10 +329,10 @@ Answer: This cannot happen. The reason i
to smp_call_function() and further to smp_call_function_on_cpu(),
causing this latter to spin until the cross-CPU invocation of
rcu_barrier_func() has completed. This by itself would prevent
- a grace period from completing on non-CONFIG_PREEMPT kernels,
+ a grace period from completing on non-CONFIG_PREEMPTION kernels,
since each CPU must undergo a context switch (or other quiescent
state) before the grace period can complete. However, this is
- of no use in CONFIG_PREEMPT kernels.
+ of no use in CONFIG_PREEMPTION kernels.
Therefore, on_each_cpu() disables preemption across its call
to smp_call_function() and also across the local call to
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -25,7 +25,7 @@ So your kernel printed an RCU CPU stall
- A CPU looping with bottom halves disabled.
-- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
+- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel
without invoking schedule(). If the looping in the kernel is
really expected and desirable behavior, you might need to add
some calls to cond_resched().
@@ -44,7 +44,7 @@ So your kernel printed an RCU CPU stall
result in the ``rcu_.*kthread starved for`` console-log message,
which will include additional debugging information.
-- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+- A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might
happen to preempt a low-priority task in the middle of an RCU
read-side critical section. This is especially damaging if
that low-priority task is not permitted to run on any other CPU,
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -683,7 +683,7 @@ so there can be no deadlock cycle.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This section presents a "toy" RCU implementation that is based on
"classic RCU". It is also short on performance (but only for updates) and
-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT
+on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION
kernels. The definitions of rcu_dereference() and rcu_assign_pointer()
are the same as those shown in the preceding section, so they are omitted.
::
@@ -739,7 +739,7 @@ to that data item, so we can safely recl
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
+ CONFIG_PREEMPT_RT, where normal spinlocks can block???
:ref:`Answers to Quick Quiz <8_whatisRCU>`
@@ -1093,7 +1093,7 @@ the right tool for your job.
overhead is **negative**.
Answer:
- Imagine a single-CPU system with a non-CONFIG_PREEMPT
+ Imagine a single-CPU system with a non-CONFIG_PREEMPTION
kernel where a routing table is used by process-context
code, but can be updated by irq-context code (for example,
by an "ICMP REDIRECT" packet). The usual way of handling
@@ -1120,10 +1120,10 @@ the right tool for your job.
Quick Quiz #3:
If it is illegal to block in an RCU read-side
critical section, what the heck do you do in
- PREEMPT_RT, where normal spinlocks can block???
+ CONFIG_PREEMPT_RT, where normal spinlocks can block???
Answer:
- Just as PREEMPT_RT permits preemption of spinlock
+ Just as CONFIG_PREEMPT_RT permits preemption of spinlock
critical sections, it permits preemption of RCU
read-side critical sections. It also permits
spinlocks blocking while in RCU read-side critical

[File diff suppressed because it is too large]


@@ -0,0 +1,173 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:09 +0100
Subject: [PATCH 2/4] tracing: Inline tracing_gen_ctx_flags()
Inline tracing_gen_ctx_flags(). This makes it possible to have a single
ifdef on CONFIG_TRACE_IRQFLAGS_SUPPORT.
This requires moving `trace_flag_type' so tracing_gen_ctx_flags() can
use it.
Link: https://lkml.kernel.org/r/20210125194511.3924915-3-bigeasy@linutronix.de
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/20210125140323.6b1ff20c@gandalf.local.home
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/trace_events.h | 54 ++++++++++++++++++++++++++++++++++++++++---
kernel/trace/trace.c | 38 +-----------------------------
kernel/trace/trace.h | 19 ---------------
3 files changed, 53 insertions(+), 58 deletions(-)
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -160,9 +160,57 @@ static inline void tracing_generic_entry
entry->flags = trace_ctx >> 16;
}
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags);
-unsigned int tracing_gen_ctx(void);
-unsigned int tracing_gen_ctx_dec(void);
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);
+
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+ TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+ TRACE_FLAG_NMI = 0x40,
+};
+
+#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+ unsigned int irq_status = irqs_disabled_flags(irqflags) ?
+ TRACE_FLAG_IRQS_OFF : 0;
+ return tracing_gen_ctx_irq_test(irq_status);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+ unsigned long irqflags;
+
+ local_save_flags(irqflags);
+ return tracing_gen_ctx_flags(irqflags);
+}
+#else
+
+static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+{
+ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+static inline unsigned int tracing_gen_ctx(void)
+{
+ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT);
+}
+#endif
+
+static inline unsigned int tracing_gen_ctx_dec(void)
+{
+ unsigned int trace_ctx;
+
+ trace_ctx = tracing_gen_ctx();
+ /*
+ * Subtract one from the preeption counter if preemption is enabled,
+ * see trace_event_buffer_reserve()for details.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ trace_ctx--;
+ return trace_ctx;
+}
struct trace_event_file;
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2578,20 +2578,13 @@ enum print_line_t trace_handle_return(st
}
EXPORT_SYMBOL_GPL(trace_handle_return);
-unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
+unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
{
- unsigned int trace_flags = 0;
+ unsigned int trace_flags = irqs_status;
unsigned int pc;
pc = preempt_count();
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- if (irqs_disabled_flags(irqflags))
- trace_flags |= TRACE_FLAG_IRQS_OFF;
-#else
- trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT;
-#endif
-
if (pc & NMI_MASK)
trace_flags |= TRACE_FLAG_NMI;
if (pc & HARDIRQ_MASK)
@@ -2607,33 +2600,6 @@ unsigned int tracing_gen_ctx_flags(unsig
return (trace_flags << 16) | (pc & 0xff);
}
-unsigned int tracing_gen_ctx(void)
-{
- unsigned long irqflags;
-
-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
- local_save_flags(irqflags);
-#else
- irqflags = 0;
-#endif
- return tracing_gen_ctx_flags(irqflags);
-}
-
-unsigned int tracing_gen_ctx_dec(void)
-{
- unsigned int trace_ctx;
-
- trace_ctx = tracing_gen_ctx();
-
- /*
- * Subtract one from the preeption counter if preemption is enabled,
- * see trace_event_buffer_reserve()for details.
- */
- if (IS_ENABLED(CONFIG_PREEMPTION))
- trace_ctx--;
- return trace_ctx;
-}
-
struct ring_buffer_event *
trace_buffer_lock_reserve(struct trace_buffer *buffer,
int type,
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCHED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
- TRACE_FLAG_PREEMPT_RESCHED = 0x20,
- TRACE_FLAG_NMI = 0x40,
-};
-
#define TRACE_BUF_SIZE 1024
struct trace_array;
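
A worked note on the packed trace_ctx word used throughout this series:
tracing_gen_ctx_irq_test() above returns (trace_flags << 16) | (pc & 0xff),
and tracing_generic_entry_update() unpacks it again. In sketch form:

	unsigned int trace_ctx = (trace_flags << 16) | (pc & 0xff);

	/* unpacked in tracing_generic_entry_update(): */
	entry->preempt_count = trace_ctx & 0xff; /* low byte: preempt count  */
	entry->flags = trace_ctx >> 16;          /* high half: TRACE_FLAG_*  */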


@@ -0,0 +1,41 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:10 +0100
Subject: [PATCH 3/4] tracing: Use in_serving_softirq() to deduce softirq
status.
PREEMPT_RT does not report "serving softirq" because the tracing core
looks at the preemption counter while PREEMPT_RT does not update it
while processing softirqs in order to remain preemptible. The
information is stored somewhere else.
The in_serving_softirq() macro and the SOFTIRQ_OFFSET define still work,
but not via the preempt counter.
Use the in_serving_softirq() macro, which works on PREEMPT_RT. On !PREEMPT_RT
the compiler (gcc-10 / clang-11) is smart enough to optimize the
in_serving_softirq() related read of the preemption counter away.
The only difference I noticed when using in_serving_softirq() on
!PREEMPT_RT is that gcc-10 implemented tracing_gen_ctx_flags() as
reading FLAG, jmp _tracing_gen_ctx_flags(). Without in_serving_softirq()
it inlined _tracing_gen_ctx_flags() into tracing_gen_ctx_flags().
Link: https://lkml.kernel.org/r/20210125194511.3924915-4-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2589,8 +2589,7 @@ unsigned int tracing_gen_ctx_irq_test(un
trace_flags |= TRACE_FLAG_NMI;
if (pc & HARDIRQ_MASK)
trace_flags |= TRACE_FLAG_HARDIRQ;
-
- if (pc & SOFTIRQ_OFFSET)
+ if (in_serving_softirq())
trace_flags |= TRACE_FLAG_SOFTIRQ;
if (tif_need_resched())


@@ -0,0 +1,36 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Jan 2021 20:45:11 +0100
Subject: [PATCH 4/4] tracing: Remove NULL check from current in
tracing_generic_entry_update().
I can't imagine when or why `current' would return a NULL pointer. This
check was added in commit
72829bc3d63cd ("ftrace: move enums to ftrace.h and make helper function global")
but it doesn't give a hint as to why it was needed.
Assume `current' never returns a NULL pointer and remove the check.
Link: https://lkml.kernel.org/r/20210125194511.3924915-5-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/trace_events.h | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -152,10 +152,8 @@ static inline void tracing_generic_entry
unsigned short type,
unsigned int trace_ctx)
{
- struct task_struct *tsk = current;
-
entry->preempt_count = trace_ctx & 0xff;
- entry->pid = (tsk) ? tsk->pid : 0;
+ entry->pid = current->pid;
entry->type = type;
entry->flags = trace_ctx >> 16;
}

View File

@@ -0,0 +1,37 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 19 Feb 2021 17:51:07 +0100
Subject: [PATCH] powerpc/mm: Move the linear_mapping_mutex to the ifdef where
it is used
The mutex linear_mapping_mutex is defined at the top of the file while
its only two users are within the CONFIG_MEMORY_HOTPLUG block.
A compile without CONFIG_MEMORY_HOTPLUG set fails on PREEMPT_RT because
its mutex implementation is smart enough to realize that it is unused.
Move the definition of linear_mapping_mutex into the ifdef block where
it is used.
Fixes: 1f73ad3e8d755 ("powerpc/mm: print warning in arch_remove_linear_mapping()")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/powerpc/mm/mem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -54,7 +54,6 @@
#include <mm/mmu_decl.h>
-static DEFINE_MUTEX(linear_mapping_mutex);
unsigned long long memory_limit;
bool init_mem_is_free;
@@ -72,6 +71,7 @@ pgprot_t phys_mem_access_prot(struct fil
EXPORT_SYMBOL(phys_mem_access_prot);
#ifdef CONFIG_MEMORY_HOTPLUG
+static DEFINE_MUTEX(linear_mapping_mutex);
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)

View File

@@ -0,0 +1,49 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 17 Feb 2021 16:15:31 +0100
Subject: [PATCH 02/28] printk: limit second loop of syslog_print_all
The second loop of syslog_print_all() subtracts lengths that were
added in the first loop. With commit b031a684bfd0 ("printk: remove
logbuf_lock writer-protection of ringbuffer") it is possible that
records are (over)written during syslog_print_all(). This allows the
possibility of the second loop subtracting lengths that were never
added in the first loop.
This situation can result in syslog_print_all() filling the buffer
starting from a later record, even though there may have been room
to fit the earlier record(s) as well.
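
The bounded accounting can be sketched in plain C; the record lengths,
buffer size and loop bounds here are made up for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Hypothetical formatted lengths of records clear_seq..clear_seq+3. */
static const size_t rec_len[] = { 300, 300, 300, 300 };
#define NRECS (sizeof(rec_len) / sizeof(rec_len[0]))

int main(void)
{
	size_t size = 600;	/* user-provided buffer */
	size_t len = 0;
	uint64_t seq, max_seq;

	/* First loop: total length of all records seen right now. */
	for (seq = 0; seq < NRECS; seq++)
		len += rec_len[seq];

	/* Bound the second loop to what the first loop actually added. */
	max_seq = seq;

	/* Second loop: drop oldest records until the rest fits. */
	for (seq = 0; seq < NRECS; seq++) {
		if (len <= size || seq >= max_seq)
			break;
		len -= rec_len[seq];
	}

	printf("start at record %llu, %zu bytes to copy\n",
	       (unsigned long long)seq, len);
	return 0;
}

Without the max_seq bound, the second loop could subtract lengths of
records that were (over)written after the first loop ran and thus were
never added, making syslog_print_all() skip records that would still
have fit.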
Fixes: b031a684bfd0 ("printk: remove logbuf_lock writer-protection of ringbuffer")
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
---
kernel/printk/printk.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1494,6 +1494,7 @@ static int syslog_print_all(char __user
struct printk_info info;
unsigned int line_count;
struct printk_record r;
+ u64 max_seq;
char *text;
int len = 0;
u64 seq;
@@ -1512,9 +1513,15 @@ static int syslog_print_all(char __user
prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
len += get_record_print_text_size(&info, line_count, true, time);
+ /*
+ * Set an upper bound for the next loop to avoid subtracting lengths
+ * that were never added.
+ */
+ max_seq = seq;
+
/* move first record forward until length fits into the buffer */
prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
- if (len <= size)
+ if (len <= size || info.seq >= max_seq)
break;
len -= get_record_print_text_size(&info, line_count, true, time);
}

View File

@@ -0,0 +1,36 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 21 Dec 2020 11:19:39 +0106
Subject: [PATCH 03/28] printk: kmsg_dump: remove unused fields
struct kmsg_dumper still contains some fields that were used to
iterate the old ringbuffer. They are no longer used. Remove them
and update the struct documentation.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kmsg_dump.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -36,6 +36,9 @@ enum kmsg_dump_reason {
* through the record iterator
* @max_reason: filter for highest reason number that should be dumped
* @registered: Flag that specifies if this is already registered
+ * @active: Flag that specifies if this is currently dumping
+ * @cur_seq: Points to the oldest message to dump (private)
+ * @next_seq: Points after the newest message to dump (private)
*/
struct kmsg_dumper {
struct list_head list;
@@ -45,8 +48,6 @@ struct kmsg_dumper {
bool registered;
/* private state of the kmsg iterator */
- u32 cur_idx;
- u32 next_idx;
u64 cur_seq;
u64 next_seq;
};

View File

@@ -0,0 +1,136 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:41:56 +0106
Subject: [PATCH 04/28] printk: refactor kmsg_dump_get_buffer()
kmsg_dump_get_buffer() requires nearly the same logic as
syslog_print_all(), but uses different variable names and
does not make use of the ringbuffer loop macros. Modify
kmsg_dump_get_buffer() so that the implementation is as similar
to syslog_print_all() as possible.
A follow-up commit will move this common logic into a
separate helper function.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kmsg_dump.h | 2 -
kernel/printk/printk.c | 60 ++++++++++++++++++++++++----------------------
2 files changed, 33 insertions(+), 29 deletions(-)
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -62,7 +62,7 @@ bool kmsg_dump_get_line(struct kmsg_dump
char *line, size_t size, size_t *len);
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len);
+ char *buf, size_t size, size_t *len_out);
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper);
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3424,7 +3424,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* read.
*/
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
- char *buf, size_t size, size_t *len)
+ char *buf, size_t size, size_t *len_out)
{
struct printk_info info;
unsigned int line_count;
@@ -3432,12 +3432,10 @@ bool kmsg_dump_get_buffer(struct kmsg_du
unsigned long flags;
u64 seq;
u64 next_seq;
- size_t l = 0;
+ size_t len = 0;
bool ret = false;
bool time = printk_time;
- prb_rec_init_rd(&r, &info, buf, size);
-
if (!dumper->active || !buf || !size)
goto out;
@@ -3455,48 +3453,54 @@ bool kmsg_dump_get_buffer(struct kmsg_du
goto out;
}
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- while (prb_read_valid_info(prb, seq, &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
+ /*
+ * Find first record that fits, including all following records,
+ * into the user-provided buffer for this dump.
+ */
+
+ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= dumper->next_seq)
break;
- l += get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
}
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- while (l >= size && prb_read_valid_info(prb, seq,
- &info, &line_count)) {
- if (r.info->seq >= dumper->next_seq)
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is the best effort
+ * that prevents an infinite loop.
+ */
+ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
+ if (len < size || info.seq >= dumper->next_seq)
break;
- l -= get_record_print_text_size(&info, line_count, syslog, time);
- seq = r.info->seq + 1;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
}
- /* last message in next iteration */
+ /*
+ * Next kmsg_dump_get_buffer() invocation will dump block of
+ * older records stored right before this one.
+ */
next_seq = seq;
- /* actually read text into the buffer now */
- l = 0;
- while (prb_read_valid(prb, seq, &r)) {
+ prb_rec_init_rd(&r, &info, buf, size);
+
+ len = 0;
+ prb_for_each_record(seq, prb, seq, &r) {
if (r.info->seq >= dumper->next_seq)
break;
- l += record_print_text(&r, syslog, time);
-
- /* adjust record to store to remaining buffer space */
- prb_rec_init_rd(&r, &info, buf + l, size - l);
+ len += record_print_text(&r, syslog, time);
- seq = r.info->seq + 1;
+ /* Adjust record to store to remaining buffer space. */
+ prb_rec_init_rd(&r, &info, buf + len, size - len);
}
dumper->next_seq = next_seq;
ret = true;
logbuf_unlock_irqrestore(flags);
out:
- if (len)
- *len = l;
+ if (len_out)
+ *len_out = len;
return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);

View File

@@ -0,0 +1,140 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 13 Jan 2021 11:29:53 +0106
Subject: [PATCH 05/28] printk: consolidate
kmsg_dump_get_buffer/syslog_print_all code
The logic for finding records to fit into a buffer is the same for
kmsg_dump_get_buffer() and syslog_print_all(). Introduce a helper
function find_first_fitting_seq() to handle this logic.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
kernel/printk/printk.c | 87 ++++++++++++++++++++++++++++---------------------
1 file changed, 50 insertions(+), 37 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1421,6 +1421,50 @@ static size_t get_record_print_text_size
return ((prefix_len * line_count) + info->text_len + 1);
}
+/*
+ * Beginning with @start_seq, find the first record where it and all following
+ * records up to (but not including) @max_seq fit into @size.
+ *
+ * @max_seq is simply an upper bound and does not need to exist. If the caller
+ * does not require an upper bound, -1 can be used for @max_seq.
+ */
+static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
+ bool syslog, bool time)
+{
+ struct printk_info info;
+ unsigned int line_count;
+ size_t len = 0;
+ u64 seq;
+
+ /* Determine the size of the records up to @max_seq. */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (info.seq >= max_seq)
+ break;
+ len += get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ /*
+ * Adjust the upper bound for the next loop to avoid subtracting
+ * lengths that were never added.
+ */
+ if (seq < max_seq)
+ max_seq = seq;
+
+ /*
+ * Move first record forward until length fits into the buffer. Ignore
+ * newest messages that were not counted in the above cycle. Messages
+ * might appear and get lost in the meantime. This is a best effort
+ * that prevents an infinite loop that could occur with a retry.
+ */
+ prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
+ if (len <= size || info.seq >= max_seq)
+ break;
+ len -= get_record_print_text_size(&info, line_count, syslog, time);
+ }
+
+ return seq;
+}
+
static int syslog_print(char __user *buf, int size)
{
struct printk_info info;
@@ -1492,9 +1536,7 @@ static int syslog_print(char __user *buf
static int syslog_print_all(char __user *buf, int size, bool clear)
{
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
- u64 max_seq;
char *text;
int len = 0;
u64 seq;
@@ -1510,21 +1552,7 @@ static int syslog_print_all(char __user
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
- len += get_record_print_text_size(&info, line_count, true, time);
-
- /*
- * Set an upper bound for the next loop to avoid subtracting lengths
- * that were never added.
- */
- max_seq = seq;
-
- /* move first record forward until length fits into the buffer */
- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
- if (len <= size || info.seq >= max_seq)
- break;
- len -= get_record_print_text_size(&info, line_count, true, time);
- }
+ seq = find_first_fitting_seq(clear_seq, -1, size, true, time);
prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
@@ -3427,7 +3455,6 @@ bool kmsg_dump_get_buffer(struct kmsg_du
char *buf, size_t size, size_t *len_out)
{
struct printk_info info;
- unsigned int line_count;
struct printk_record r;
unsigned long flags;
u64 seq;
@@ -3455,26 +3482,12 @@ bool kmsg_dump_get_buffer(struct kmsg_du
/*
* Find first record that fits, including all following records,
- * into the user-provided buffer for this dump.
+ * into the user-provided buffer for this dump. Pass in size-1
+ * because this function (by way of record_print_text()) will
+ * not write more than size-1 bytes of text into @buf.
*/
-
- prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
- if (info.seq >= dumper->next_seq)
- break;
- len += get_record_print_text_size(&info, line_count, syslog, time);
- }
-
- /*
- * Move first record forward until length fits into the buffer. Ignore
- * newest messages that were not counted in the above cycle. Messages
- * might appear and get lost in the meantime. This is the best effort
- * that prevents an infinite loop.
- */
- prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) {
- if (len < size || info.seq >= dumper->next_seq)
- break;
- len -= get_record_print_text_size(&info, line_count, syslog, time);
- }
+ seq = find_first_fitting_seq(dumper->cur_seq, dumper->next_seq,
+ size - 1, syslog, time);
/*
* Next kmsg_dump_get_buffer() invocation will dump block of

View File

@@ -0,0 +1,88 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 10 Dec 2020 12:48:01 +0106
Subject: [PATCH 06/28] printk: introduce CONSOLE_LOG_MAX for improved
multi-line support
Instead of using "LOG_LINE_MAX + PREFIX_MAX" for temporary buffer
sizes, introduce CONSOLE_LOG_MAX. This represents the maximum size
that is allowed to be printed to the console for a single record.
Rather than setting CONSOLE_LOG_MAX to "LOG_LINE_MAX + PREFIX_MAX"
(1024), increase it to 4096. With a larger buffer size, multi-line
records that are nearly LOG_LINE_MAX in length will have a better
chance of being fully printed. (When formatting a record for the
console, each line of a multi-line record is prepended with a copy
of the prefix.)
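
A worked example using the size formula from
get_record_print_text_size() (the 20-line split is hypothetical):

/*
 * With CONFIG_PRINTK_CALLER: PREFIX_MAX = 48, so
 * LOG_LINE_MAX = 1024 - 48 = 976.
 *
 * A record with text_len = 976 spread over 20 lines formats to
 *
 *	prefix_len * line_count + text_len + 1
 *	  = 48 * 20 + 976 + 1 = 1937 bytes,
 *
 * which overflows the old 1024-byte "LOG_LINE_MAX + PREFIX_MAX"
 * buffer but fits easily in CONSOLE_LOG_MAX (4096).
 */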
Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
kernel/printk/printk.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -410,8 +410,13 @@ static u64 clear_seq;
#else
#define PREFIX_MAX 32
#endif
+
+/* the maximum size allowed to be reserved for a record */
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+/* the maximum size of a formatted record (i.e. with prefix added per line) */
+#define CONSOLE_LOG_MAX 4096
+
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -1472,11 +1477,11 @@ static int syslog_print(char __user *buf
char *text;
int len = 0;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
while (size > 0) {
size_t n;
@@ -1542,7 +1547,7 @@ static int syslog_print_all(char __user
u64 seq;
bool time;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -1554,7 +1559,7 @@ static int syslog_print_all(char __user
*/
seq = find_first_fitting_seq(clear_seq, -1, size, true, time);
- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
@@ -2187,8 +2192,7 @@ EXPORT_SYMBOL(printk);
#else /* CONFIG_PRINTK */
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
+#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
@@ -2506,7 +2510,7 @@ static inline int can_use_console(void)
void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
+ static char text[CONSOLE_LOG_MAX];
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;

View File

@@ -0,0 +1,140 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:41:58 +0106
Subject: [PATCH 07/28] printk: use seqcount_latch for clear_seq
kmsg_dump_rewind_nolock() locklessly reads @clear_seq. However,
this is not done atomically. Since @clear_seq is 64-bit, this
cannot be an atomic operation for all platforms. Therefore, use
a seqcount_latch to allow readers to always read a consistent
value.
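
To make the hazard concrete (values hypothetical):

/*
 * On a 32-bit machine a u64 load is two word-sized loads. If
 * @clear_seq is updated from 0x00000000ffffffff to
 * 0x0000000100000000 between the two loads, a lockless reader can
 * observe 0x00000001ffffffff or 0x0000000000000000 -- values that
 * were never stored. The seqcount_latch read loop retries until
 * both halves belong to the same update.
 */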
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 58 ++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 50 insertions(+), 8 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -402,8 +402,21 @@ static u64 console_seq;
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
+struct latched_seq {
+ seqcount_latch_t latch;
+ u64 val[2];
+};
+
+/*
+ * The next printk record to read after the last 'clear' command. There are
+ * two copies (updated with seqcount_latch) so that reads can locklessly
+ * access a valid value. Writers are synchronized by @logbuf_lock.
+ */
+static struct latched_seq clear_seq = {
+ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
+ .val[0] = 0,
+ .val[1] = 0,
+};
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -457,6 +470,31 @@ bool printk_percpu_data_ready(void)
return __printk_percpu_data_ready;
}
+/* Must be called under logbuf_lock. */
+static void latched_seq_write(struct latched_seq *ls, u64 val)
+{
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[0] = val;
+ raw_write_seqcount_latch(&ls->latch);
+ ls->val[1] = val;
+}
+
+/* Can be called from any context. */
+static u64 latched_seq_read_nolock(struct latched_seq *ls)
+{
+ unsigned int seq;
+ unsigned int idx;
+ u64 val;
+
+ do {
+ seq = raw_read_seqcount_latch(&ls->latch);
+ idx = seq & 0x1;
+ val = ls->val[idx];
+ } while (read_seqcount_latch_retry(&ls->latch, seq));
+
+ return val;
+}
+
/* Return log buffer address */
char *log_buf_addr_get(void)
{
@@ -801,7 +839,7 @@ static loff_t devkmsg_llseek(struct file
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->seq = clear_seq;
+ user->seq = latched_seq_read_nolock(&clear_seq);
break;
case SEEK_END:
/* after the last record */
@@ -960,6 +998,9 @@ void log_buf_vmcoreinfo_setup(void)
VMCOREINFO_SIZE(atomic_long_t);
VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
+
+ VMCOREINFO_STRUCT_SIZE(latched_seq);
+ VMCOREINFO_OFFSET(latched_seq, val);
}
#endif
@@ -1557,7 +1598,8 @@ static int syslog_print_all(char __user
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = find_first_fitting_seq(clear_seq, -1, size, true, time);
+ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
+ size, true, time);
prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);
@@ -1584,7 +1626,7 @@ static int syslog_print_all(char __user
}
if (clear)
- clear_seq = seq;
+ latched_seq_write(&clear_seq, seq);
logbuf_unlock_irq();
kfree(text);
@@ -1594,7 +1636,7 @@ static int syslog_print_all(char __user
static void syslog_clear(void)
{
logbuf_lock_irq();
- clear_seq = prb_next_seq(prb);
+ latched_seq_write(&clear_seq, prb_next_seq(prb));
logbuf_unlock_irq();
}
@@ -3336,7 +3378,7 @@ void kmsg_dump(enum kmsg_dump_reason rea
dumper->active = true;
logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
+ dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
dumper->next_seq = prb_next_seq(prb);
logbuf_unlock_irqrestore(flags);
@@ -3534,7 +3576,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
- dumper->cur_seq = clear_seq;
+ dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
dumper->next_seq = prb_next_seq(prb);
}

View File

@@ -0,0 +1,105 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 10 Dec 2020 15:33:40 +0106
Subject: [PATCH 08/28] printk: use atomic64_t for devkmsg_user.seq
@user->seq is indirectly protected by @logbuf_lock. Once @logbuf_lock
is removed, @user->seq will no longer be safe from an atomicity point
of view.
In preparation for the removal of @logbuf_lock, change it to
atomic64_t to provide this safety.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
kernel/printk/printk.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -662,7 +662,7 @@ static ssize_t msg_print_ext_body(char *
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
- u64 seq;
+ atomic64_t seq;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
@@ -763,7 +763,7 @@ static ssize_t devkmsg_read(struct file
return ret;
logbuf_lock_irq();
- if (!prb_read_valid(prb, user->seq, r)) {
+ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
logbuf_unlock_irq();
@@ -772,15 +772,15 @@ static ssize_t devkmsg_read(struct file
logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
- prb_read_valid(prb, user->seq, r));
+ prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
logbuf_lock_irq();
}
- if (r->info->seq != user->seq) {
+ if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
- user->seq = r->info->seq;
+ atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
logbuf_unlock_irq();
goto out;
@@ -791,7 +791,7 @@ static ssize_t devkmsg_read(struct file
&r->text_buf[0], r->info->text_len,
&r->info->dev_info);
- user->seq = r->info->seq + 1;
+ atomic64_set(&user->seq, r->info->seq + 1);
logbuf_unlock_irq();
if (len > count) {
@@ -831,7 +831,7 @@ static loff_t devkmsg_llseek(struct file
switch (whence) {
case SEEK_SET:
/* the first record */
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
break;
case SEEK_DATA:
/*
@@ -839,11 +839,11 @@ static loff_t devkmsg_llseek(struct file
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->seq = latched_seq_read_nolock(&clear_seq);
+ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
break;
case SEEK_END:
/* after the last record */
- user->seq = prb_next_seq(prb);
+ atomic64_set(&user->seq, prb_next_seq(prb));
break;
default:
ret = -EINVAL;
@@ -866,7 +866,7 @@ static __poll_t devkmsg_poll(struct file
logbuf_lock_irq();
if (prb_read_valid_info(prb, user->seq, &info, NULL)) {
/* return error when data has vanished underneath us */
- if (info.seq != user->seq)
+ if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
@@ -905,7 +905,7 @@ static int devkmsg_open(struct inode *in
&user->text_buf[0], sizeof(user->text_buf));
logbuf_lock_irq();
- user->seq = prb_first_valid_seq(prb);
+ atomic64_set(&user->seq, prb_first_valid_seq(prb));
logbuf_unlock_irq();
file->private_data = user;

View File

@@ -0,0 +1,152 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 10 Dec 2020 16:58:02 +0106
Subject: [PATCH 09/28] printk: add syslog_lock
The global variables @syslog_seq, @syslog_partial, @syslog_time
and write access to @clear_seq are protected by @logbuf_lock.
Once @logbuf_lock is removed, these variables will need their
own synchronization method. Introduce @syslog_lock for this
purpose.
@syslog_lock is a raw_spin_lock for now. This simplifies the
transition to removing @logbuf_lock. Once @logbuf_lock and the
safe buffers are removed, @syslog_lock can change to spin_lock.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 41 +++++++++++++++++++++++++++++++++++++----
1 file changed, 37 insertions(+), 4 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -390,8 +390,12 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
printk_safe_exit_irqrestore(flags); \
} while (0)
+/* syslog_lock protects syslog_* variables and write access to clear_seq. */
+static DEFINE_RAW_SPINLOCK(syslog_lock);
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
+/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
@@ -410,7 +414,7 @@ struct latched_seq {
/*
* The next printk record to read after the last 'clear' command. There are
* two copies (updated with seqcount_latch) so that reads can locklessly
- * access a valid value. Writers are synchronized by @logbuf_lock.
+ * access a valid value. Writers are synchronized by @syslog_lock.
*/
static struct latched_seq clear_seq = {
.latch = SEQCNT_LATCH_ZERO(clear_seq.latch),
@@ -470,7 +474,7 @@ bool printk_percpu_data_ready(void)
return __printk_percpu_data_ready;
}
-/* Must be called under logbuf_lock. */
+/* Must be called under syslog_lock. */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
raw_write_seqcount_latch(&ls->latch);
@@ -1529,7 +1533,9 @@ static int syslog_print(char __user *buf
size_t skip;
logbuf_lock_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
+ raw_spin_unlock(&syslog_lock);
logbuf_unlock_irq();
break;
}
@@ -1559,6 +1565,7 @@ static int syslog_print(char __user *buf
syslog_partial += n;
} else
n = 0;
+ raw_spin_unlock(&syslog_lock);
logbuf_unlock_irq();
if (!n)
@@ -1625,8 +1632,11 @@ static int syslog_print_all(char __user
break;
}
- if (clear)
+ if (clear) {
+ raw_spin_lock(&syslog_lock);
latched_seq_write(&clear_seq, seq);
+ raw_spin_unlock(&syslog_lock);
+ }
logbuf_unlock_irq();
kfree(text);
@@ -1636,10 +1646,24 @@ static int syslog_print_all(char __user
static void syslog_clear(void)
{
logbuf_lock_irq();
+ raw_spin_lock(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
+ raw_spin_unlock(&syslog_lock);
logbuf_unlock_irq();
}
+/* Return a consistent copy of @syslog_seq. */
+static u64 read_syslog_seq_irq(void)
+{
+ u64 seq;
+
+ raw_spin_lock_irq(&syslog_lock);
+ seq = syslog_seq;
+ raw_spin_unlock_irq(&syslog_lock);
+
+ return seq;
+}
+
int do_syslog(int type, char __user *buf, int len, int source)
{
struct printk_info info;
@@ -1663,8 +1687,9 @@ int do_syslog(int type, char __user *buf
return 0;
if (!access_ok(buf, len))
return -EFAULT;
+
error = wait_event_interruptible(log_wait,
- prb_read_valid(prb, syslog_seq, NULL));
+ prb_read_valid(prb, read_syslog_seq_irq(), NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1713,8 +1738,10 @@ int do_syslog(int type, char __user *buf
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
logbuf_lock_irq();
+ raw_spin_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
+ raw_spin_unlock(&syslog_lock);
logbuf_unlock_irq();
return 0;
}
@@ -1743,6 +1770,7 @@ int do_syslog(int type, char __user *buf
}
error -= syslog_partial;
}
+ raw_spin_unlock(&syslog_lock);
logbuf_unlock_irq();
break;
/* Size of the log buffer */
@@ -2992,7 +3020,12 @@ void register_console(struct console *ne
*/
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
+
+ /* Get a consistent copy of @syslog_seq. */
+ raw_spin_lock(&syslog_lock);
console_seq = syslog_seq;
+ raw_spin_unlock(&syslog_lock);
+
logbuf_unlock_irqrestore(flags);
}
console_unlock();

View File

@@ -0,0 +1,535 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 18 Dec 2020 11:40:08 +0000
Subject: [PATCH 10/28] printk: introduce a kmsg_dump iterator
Rather than storing the iterator information in the registered
kmsg_dumper structure, create a separate iterator structure. The
kmsg_dump_iter structure can reside on the stack of the caller,
thus allowing lockless use of the kmsg_dump functions.
This is in preparation for removal of @logbuf_lock.
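
A minimal sketch of a dumper written against the new callback
signature; the dumper name, buffer size and max_reason value are
illustrative, not taken from this series:

static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason,
			 struct kmsg_dumper_iter *iter)
{
	char line[256];
	size_t len;

	/* @iter lives on kmsg_dump()'s stack; no dumper state is touched. */
	while (kmsg_dump_get_line(iter, true, line, sizeof(line) - 1, &len)) {
		line[len] = '\0';
		/* hand the line to the backing store */
	}
}

static struct kmsg_dumper example_dumper = {
	.dump		= example_dump,
	.max_reason	= KMSG_DUMP_OOPS,
};

After kmsg_dump_register(&example_dumper), concurrent kmsg_dump()
invocations no longer race on shared cur_seq/next_seq state, since
each caller owns its own iterator.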
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/powerpc/kernel/nvram_64.c | 12 +++--
arch/powerpc/platforms/powernv/opal-kmsg.c | 3 -
arch/powerpc/xmon/xmon.c | 6 +-
arch/um/kernel/kmsg_dump.c | 5 +-
drivers/hv/vmbus_drv.c | 5 +-
drivers/mtd/mtdoops.c | 5 +-
fs/pstore/platform.c | 5 +-
include/linux/kmsg_dump.h | 43 ++++++++++---------
kernel/debug/kdb/kdb_main.c | 10 ++--
kernel/printk/printk.c | 65 +++++++++++++----------------
10 files changed, 84 insertions(+), 75 deletions(-)
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -73,7 +73,8 @@ static const char *nvram_os_partitions[]
};
static void oops_to_nvram(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason);
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter);
static struct kmsg_dumper nvram_kmsg_dumper = {
.dump = oops_to_nvram
@@ -643,7 +644,8 @@ void __init nvram_init_oops_partition(in
* partition. If that's too much, go back and capture uncompressed text.
*/
static void oops_to_nvram(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf;
static unsigned int oops_count = 0;
@@ -681,13 +683,13 @@ static void oops_to_nvram(struct kmsg_du
return;
if (big_oops_buf) {
- kmsg_dump_get_buffer(dumper, false,
+ kmsg_dump_get_buffer(iter, false,
big_oops_buf, big_oops_buf_sz, &text_len);
rc = zip_oops(text_len);
}
if (rc != 0) {
- kmsg_dump_rewind(dumper);
- kmsg_dump_get_buffer(dumper, false,
+ kmsg_dump_rewind(iter);
+ kmsg_dump_get_buffer(iter, false,
oops_data, oops_data_sz, &text_len);
err_type = ERR_TYPE_KERNEL_PANIC;
oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION);
--- a/arch/powerpc/platforms/powernv/opal-kmsg.c
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -20,7 +20,8 @@
* message, it just ensures that OPAL completely flushes the console buffer.
*/
static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
/*
* Outside of a panic context the pollers will continue to run,
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3005,7 +3005,7 @@ print_address(unsigned long addr)
static void
dump_log_buf(void)
{
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dumper_iter iter = { .active = 1 };
unsigned char buf[128];
size_t len;
@@ -3017,9 +3017,9 @@ dump_log_buf(void)
catch_memory_errors = 1;
sync();
- kmsg_dump_rewind_nolock(&dumper);
+ kmsg_dump_rewind_nolock(&iter);
xmon_start_pagination();
- while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) {
+ while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) {
buf[len] = '\0';
printf("%s", buf);
}
--- a/arch/um/kernel/kmsg_dump.c
+++ b/arch/um/kernel/kmsg_dump.c
@@ -7,7 +7,8 @@
#include <os.h>
static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
static char line[1024];
struct console *con;
@@ -30,7 +31,7 @@ static void kmsg_dumper_stdout(struct km
return;
printf("kmsg_dump:\n");
- while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) {
+ while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) {
line[len] = '\0';
printf("%s", line);
}
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1362,7 +1362,8 @@ static void vmbus_isr(void)
* buffer and call into Hyper-V to transfer the data.
*/
static void hv_kmsg_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
size_t bytes_written;
phys_addr_t panic_pa;
@@ -1377,7 +1378,7 @@ static void hv_kmsg_dump(struct kmsg_dum
* Write dump contents to the page. No need to synchronize; panic should
* be single-threaded.
*/
- kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE,
+ kmsg_dump_get_buffer(iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
if (bytes_written)
hyperv_report_panic_msg(panic_pa, bytes_written);
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -267,7 +267,8 @@ static void find_next_position(struct mt
}
static void mtdoops_do_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
struct mtdoops_context *cxt = container_of(dumper,
struct mtdoops_context, dump);
@@ -276,7 +277,7 @@ static void mtdoops_do_dump(struct kmsg_
if (reason == KMSG_DUMP_OOPS && !dump_oops)
return;
- kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
+ kmsg_dump_get_buffer(iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
record_size - MTDOOPS_HEADER_SIZE, NULL);
if (reason != KMSG_DUMP_OOPS) {
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -383,7 +383,8 @@ void pstore_record_init(struct pstore_re
* end of the buffer.
*/
static void pstore_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter)
{
unsigned long total = 0;
const char *why;
@@ -435,7 +436,7 @@ static void pstore_dump(struct kmsg_dump
dst_size -= header_size;
/* Write dump contents. */
- if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+ if (!kmsg_dump_get_buffer(iter, true, dst + header_size,
dst_size, &dump_size))
break;
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -30,43 +30,48 @@ enum kmsg_dump_reason {
};
/**
+ * struct kmsg_dumper_iter - iterator for kernel crash message dumper
+ * @active: Flag that specifies if this is currently dumping
+ * @cur_seq: Points to the oldest message to dump (private)
+ * @next_seq: Points after the newest message to dump (private)
+ */
+struct kmsg_dumper_iter {
+ bool active;
+ u64 cur_seq;
+ u64 next_seq;
+};
+
+/**
* struct kmsg_dumper - kernel crash message dumper structure
* @list: Entry in the dumper list (private)
* @dump: Call into dumping code which will retrieve the data with
* through the record iterator
* @max_reason: filter for highest reason number that should be dumped
* @registered: Flag that specifies if this is already registered
- * @active: Flag that specifies if this is currently dumping
- * @cur_seq: Points to the oldest message to dump (private)
- * @next_seq: Points after the newest message to dump (private)
*/
struct kmsg_dumper {
struct list_head list;
- void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason);
+ void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason,
+ struct kmsg_dumper_iter *iter);
enum kmsg_dump_reason max_reason;
- bool active;
bool registered;
-
- /* private state of the kmsg iterator */
- u64 cur_seq;
- u64 next_seq;
};
#ifdef CONFIG_PRINTK
void kmsg_dump(enum kmsg_dump_reason reason);
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog,
char *line, size_t size, size_t *len);
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
char *line, size_t size, size_t *len);
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog,
char *buf, size_t size, size_t *len_out);
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper);
+void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter);
-void kmsg_dump_rewind(struct kmsg_dumper *dumper);
+void kmsg_dump_rewind(struct kmsg_dumper_iter *dumper_iter);
int kmsg_dump_register(struct kmsg_dumper *dumper);
@@ -78,30 +83,30 @@ static inline void kmsg_dump(enum kmsg_d
{
}
-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper,
+static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter,
bool syslog, const char *line,
size_t size, size_t *len)
{
return false;
}
-static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
const char *line, size_t size, size_t *len)
{
return false;
}
-static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog,
char *buf, size_t size, size_t *len)
{
return false;
}
-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter)
{
}
-static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter)
{
}
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const cha
int adjust = 0;
int n = 0;
int skip = 0;
- struct kmsg_dumper dumper = { .active = 1 };
+ struct kmsg_dumper_iter iter = { .active = 1 };
size_t len;
char buf[201];
@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const cha
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+ kmsg_dump_rewind_nolock(&iter);
+ while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const cha
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&dumper);
- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind_nolock(&iter);
+ while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3389,6 +3389,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
+ struct kmsg_dumper_iter iter;
struct kmsg_dumper *dumper;
unsigned long flags;
@@ -3408,25 +3409,21 @@ void kmsg_dump(enum kmsg_dump_reason rea
continue;
/* initialize iterator with data about the stored records */
- dumper->active = true;
-
+ iter.active = true;
logbuf_lock_irqsave(flags);
- dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
- dumper->next_seq = prb_next_seq(prb);
+ iter.cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter.next_seq = prb_next_seq(prb);
logbuf_unlock_irqrestore(flags);
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ dumper->dump(dumper, reason, &iter);
}
rcu_read_unlock();
}
/**
* kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dumper iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3443,7 +3440,7 @@ void kmsg_dump(enum kmsg_dump_reason rea
*
* The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog,
char *line, size_t size, size_t *len)
{
struct printk_info info;
@@ -3454,16 +3451,16 @@ bool kmsg_dump_get_line_nolock(struct km
prb_rec_init_rd(&r, &info, line, size);
- if (!dumper->active)
+ if (!iter->active)
goto out;
/* Read text or count text lines? */
if (line) {
- if (!prb_read_valid(prb, dumper->cur_seq, &r))
+ if (!prb_read_valid(prb, iter->cur_seq, &r))
goto out;
l = record_print_text(&r, syslog, printk_time);
} else {
- if (!prb_read_valid_info(prb, dumper->cur_seq,
+ if (!prb_read_valid_info(prb, iter->cur_seq,
&info, &line_count)) {
goto out;
}
@@ -3472,7 +3469,7 @@ bool kmsg_dump_get_line_nolock(struct km
}
- dumper->cur_seq = r.info->seq + 1;
+ iter->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
@@ -3482,7 +3479,7 @@ bool kmsg_dump_get_line_nolock(struct km
/**
* kmsg_dump_get_line - retrieve one kmsg log line
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dumper iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3497,14 +3494,14 @@ bool kmsg_dump_get_line_nolock(struct km
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
char *line, size_t size, size_t *len)
{
unsigned long flags;
bool ret;
logbuf_lock_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
+ ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len);
logbuf_unlock_irqrestore(flags);
return ret;
@@ -3513,7 +3510,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
* kmsg_dump_get_buffer - copy kmsg log lines
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dumper iterator
* @syslog: include the "<4>" prefixes
* @buf: buffer to copy the line to
* @size: maximum size of the buffer
@@ -3530,7 +3527,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* A return value of FALSE indicates that there are no more records to
* read.
*/
-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog,
char *buf, size_t size, size_t *len_out)
{
struct printk_info info;
@@ -3542,19 +3539,19 @@ bool kmsg_dump_get_buffer(struct kmsg_du
bool ret = false;
bool time = printk_time;
- if (!dumper->active || !buf || !size)
+ if (!iter->active || !buf || !size)
goto out;
logbuf_lock_irqsave(flags);
- if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) {
- if (info.seq != dumper->cur_seq) {
+ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
+ if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
- dumper->cur_seq = info.seq;
+ iter->cur_seq = info.seq;
}
}
/* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
+ if (iter->cur_seq >= iter->next_seq) {
logbuf_unlock_irqrestore(flags);
goto out;
}
@@ -3565,7 +3562,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
* because this function (by way of record_print_text()) will
* not write more than size-1 bytes of text into @buf.
*/
- seq = find_first_fitting_seq(dumper->cur_seq, dumper->next_seq,
+ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
size - 1, syslog, time);
/*
@@ -3578,7 +3575,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
len = 0;
prb_for_each_record(seq, prb, seq, &r) {
- if (r.info->seq >= dumper->next_seq)
+ if (r.info->seq >= iter->next_seq)
break;
len += record_print_text(&r, syslog, time);
@@ -3587,7 +3584,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
prb_rec_init_rd(&r, &info, buf + len, size - len);
}
- dumper->next_seq = next_seq;
+ iter->next_seq = next_seq;
ret = true;
logbuf_unlock_irqrestore(flags);
out:
@@ -3599,7 +3596,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
* kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dumper iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
@@ -3607,26 +3604,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*
* The function is similar to kmsg_dump_rewind(), but grabs no locks.
*/
-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter)
{
- dumper->cur_seq = latched_seq_read_nolock(&clear_seq);
- dumper->next_seq = prb_next_seq(prb);
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
}
/**
* kmsg_dump_rewind - reset the iterator
- * @dumper: registered kmsg dumper
+ * @iter: kmsg dumper iterator
*
* Reset the dumper's iterator so that kmsg_dump_get_line() and
* kmsg_dump_get_buffer() can be called again and used multiple
* times within the same dumper.dump() callback.
*/
-void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+void kmsg_dump_rewind(struct kmsg_dumper_iter *iter)
{
unsigned long flags;
logbuf_lock_irqsave(flags);
- kmsg_dump_rewind_nolock(dumper);
+ kmsg_dump_rewind_nolock(iter);
logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

View File

@@ -0,0 +1,54 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 21 Dec 2020 11:10:03 +0106
Subject: [PATCH 11/28] um: synchronize kmsg_dumper
The kmsg_dumper can be called from any context and CPU, possibly
from multiple CPUs simultaneously. Since a static buffer is used
to retrieve the kernel logs, this buffer must be protected against
simultaneous dumping.
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/um/kernel/kmsg_dump.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/arch/um/kernel/kmsg_dump.c
+++ b/arch/um/kernel/kmsg_dump.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kmsg_dump.h>
+#include <linux/spinlock.h>
#include <linux/console.h>
#include <linux/string.h>
#include <shared/init.h>
@@ -10,8 +11,10 @@ static void kmsg_dumper_stdout(struct km
enum kmsg_dump_reason reason,
struct kmsg_dumper_iter *iter)
{
+ static DEFINE_SPINLOCK(lock);
static char line[1024];
struct console *con;
+ unsigned long flags;
size_t len = 0;
/* only dump kmsg when no console is available */
@@ -30,11 +33,16 @@ static void kmsg_dumper_stdout(struct km
if (con)
return;
+ if (!spin_trylock_irqsave(&lock, flags))
+ return;
+
printf("kmsg_dump:\n");
while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) {
line[len] = '\0';
printf("%s", line);
}
+
+ spin_unlock_irqrestore(&lock, flags);
}
static struct kmsg_dumper kmsg_dumper = {

View File

@@ -0,0 +1,475 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 26 Jan 2021 17:43:19 +0106
Subject: [PATCH 12/28] printk: remove logbuf_lock
Since the ringbuffer is lockless, there is no need for it to be
protected by @logbuf_lock. Remove @logbuf_lock.
This means that printk_nmi_direct and printk_safe_flush_on_panic()
no longer need to acquire any lock to run.
@console_seq, @exclusive_console_stop_seq, @console_dropped are
protected by @console_lock.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/internal.h | 4 -
kernel/printk/printk.c | 118 ++++++++++++++------------------------------
kernel/printk/printk_safe.c | 29 ++--------
3 files changed, 48 insertions(+), 103 deletions(-)
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -12,8 +12,6 @@
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-extern raw_spinlock_t logbuf_lock;
-
__printf(4, 0)
int vprintk_store(int facility, int level,
const struct dev_printk_info *dev_info,
@@ -59,7 +57,7 @@ void defer_console_output(void);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * In !PRINTK builds we still export console_sem
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -355,41 +355,6 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
-
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_RAW_SPINLOCK(syslog_lock);
@@ -401,6 +366,7 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
+/* All 3 protected by @console_sem. */
/* the next printk record to write to the console */
static u64 console_seq;
static u64 exclusive_console_stop_seq;
@@ -766,27 +732,27 @@ static ssize_t devkmsg_read(struct file
if (ret)
return ret;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
}
if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
goto out;
}
@@ -796,7 +762,7 @@ static ssize_t devkmsg_read(struct file
&r->info->dev_info);
atomic64_set(&user->seq, r->info->seq + 1);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -831,7 +797,7 @@ static loff_t devkmsg_llseek(struct file
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -852,7 +818,7 @@ static loff_t devkmsg_llseek(struct file
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -867,15 +833,15 @@ static __poll_t devkmsg_poll(struct file
poll_wait(file, &log_wait, wait);
- logbuf_lock_irq();
- if (prb_read_valid_info(prb, user->seq, &info, NULL)) {
+ printk_safe_enter_irq();
+ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return ret;
}
@@ -908,9 +874,9 @@ static int devkmsg_open(struct inode *in
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- logbuf_lock_irq();
+ printk_safe_enter_irq();
atomic64_set(&user->seq, prb_first_valid_seq(prb));
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -1532,11 +1498,11 @@ static int syslog_print(char __user *buf
size_t n;
size_t skip;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
raw_spin_lock(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
break;
}
if (r.info->seq != syslog_seq) {
@@ -1566,7 +1532,7 @@ static int syslog_print(char __user *buf
} else
n = 0;
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
if (!n)
break;
@@ -1600,7 +1566,7 @@ static int syslog_print_all(char __user
return -ENOMEM;
time = printk_time;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1621,12 +1587,12 @@ static int syslog_print_all(char __user
break;
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
+ printk_safe_enter_irq();
if (len < 0)
break;
@@ -1637,7 +1603,7 @@ static int syslog_print_all(char __user
latched_seq_write(&clear_seq, seq);
raw_spin_unlock(&syslog_lock);
}
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
kfree(text);
return len;
@@ -1645,11 +1611,11 @@ static int syslog_print_all(char __user
static void syslog_clear(void)
{
- logbuf_lock_irq();
+ printk_safe_enter_irq();
raw_spin_lock(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
}
/* Return a consistent copy of @syslog_seq. */
@@ -1737,12 +1703,12 @@ int do_syslog(int type, char __user *buf
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
+ printk_safe_enter_irq();
raw_spin_lock(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
return 0;
}
if (info.seq != syslog_seq) {
@@ -1771,7 +1737,7 @@ int do_syslog(int type, char __user *buf
error -= syslog_partial;
}
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irq();
+ printk_safe_exit_irq();
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -2627,7 +2593,6 @@ void console_unlock(void)
size_t len;
printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2671,7 +2636,6 @@ void console_unlock(void)
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
console_seq++;
- raw_spin_unlock(&logbuf_lock);
/*
* While actively printing out messages, if another printk()
@@ -2698,8 +2662,6 @@ void console_unlock(void)
console_locked = 0;
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
/*
@@ -2708,9 +2670,7 @@ void console_unlock(void)
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- raw_spin_lock(&logbuf_lock);
retry = prb_read_valid(prb, console_seq, NULL);
- raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
@@ -2777,9 +2737,9 @@ void console_flush_on_panic(enum con_flu
if (mode == CONSOLE_REPLAY_ALL) {
unsigned long flags;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
console_seq = prb_first_valid_seq(prb);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
}
console_unlock();
}
@@ -3008,7 +2968,7 @@ void register_console(struct console *ne
* console_unlock(); will print out the buffered messages
* for us.
*/
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -3026,7 +2986,7 @@ void register_console(struct console *ne
console_seq = syslog_seq;
raw_spin_unlock(&syslog_lock);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
}
console_unlock();
console_sysfs_notify();
@@ -3410,10 +3370,10 @@ void kmsg_dump(enum kmsg_dump_reason rea
/* initialize iterator with data about the stored records */
iter.active = true;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
iter.cur_seq = latched_seq_read_nolock(&clear_seq);
iter.next_seq = prb_next_seq(prb);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason, &iter);
@@ -3500,9 +3460,9 @@ bool kmsg_dump_get_line(struct kmsg_dump
unsigned long flags;
bool ret;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
return ret;
}
@@ -3542,7 +3502,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
if (!iter->active || !buf || !size)
goto out;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
@@ -3552,7 +3512,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
/* last entry */
if (iter->cur_seq >= iter->next_seq) {
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
goto out;
}
@@ -3586,7 +3546,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
iter->next_seq = next_seq;
ret = true;
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
out:
if (len_out)
*len_out = len;
@@ -3622,9 +3582,9 @@ void kmsg_dump_rewind(struct kmsg_dumper
{
unsigned long flags;
- logbuf_lock_irqsave(flags);
+ printk_safe_enter_irqsave(flags);
kmsg_dump_rewind_nolock(iter);
- logbuf_unlock_irqrestore(flags);
+ printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -16,7 +16,7 @@
#include "internal.h"
/*
- * printk() could not take logbuf_lock in NMI context. Instead,
+ * In NMI and safe mode, printk() avoids taking locks. Instead,
* it uses an alternative implementation that temporary stores
* the strings into a per-CPU buffer. The content of the buffer
* is later flushed into the main ring buffer via IRQ work.
@@ -266,18 +266,6 @@ void printk_safe_flush(void)
*/
void printk_safe_flush_on_panic(void)
{
- /*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
if (raw_spin_is_locked(&safe_read_lock)) {
if (num_online_cpus() > 1)
return;
@@ -319,9 +307,7 @@ void noinstr printk_nmi_exit(void)
* reordering.
*
* It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
+ * will store the messages into the main logbuf directly.
*/
void printk_nmi_direct_enter(void)
{
@@ -376,20 +362,21 @@ void __printk_safe_exit(void)
#endif
/*
- * Try to use the main logbuf even in NMI. But avoid calling console
+ * Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
+ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
+ unsigned long flags;
int len;
+ printk_safe_enter_irqsave(flags);
len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- raw_spin_unlock(&logbuf_lock);
+ printk_safe_exit_irqrestore(flags);
defer_console_output();
return len;
}
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
+ /* Use extra buffer in NMI. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);


@@ -0,0 +1,213 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 21 Dec 2020 10:27:58 +0106
Subject: [PATCH 13/28] printk: kmsg_dump: remove _nolock() variants
kmsg_dump_rewind() and kmsg_dump_get_line() are lockless, so there is
no need for _nolock() variants. Remove the _nolock() variants and
switch all of their callers to the regular functions.
The functions without _nolock() were chosen because they are already
exported to kernel modules.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
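For reference, the call pattern a dumper is left with is the one already
visible in the xmon hunk below; a minimal sketch with a hypothetical
callback and sink (only the iterator calls are from this patch):

extern void example_emit(const char *line, size_t len);	/* hypothetical sink */

static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason,
			 struct kmsg_dumper_iter *iter)
{
	static char line[256];
	size_t len;

	/* Reset the iterator to the oldest stored record. */
	kmsg_dump_rewind(iter);

	/* Walk the records; no locking is required by the caller. */
	while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len))
		example_emit(line, len);
}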
---
arch/powerpc/xmon/xmon.c | 4 +-
include/linux/kmsg_dump.h | 18 -------------
kernel/debug/kdb/kdb_main.c | 8 ++---
kernel/printk/printk.c | 60 +++++---------------------------------------
4 files changed, 15 insertions(+), 75 deletions(-)
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3017,9 +3017,9 @@ dump_log_buf(void)
catch_memory_errors = 1;
sync();
- kmsg_dump_rewind_nolock(&iter);
+ kmsg_dump_rewind(&iter);
xmon_start_pagination();
- while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) {
+ while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) {
buf[len] = '\0';
printf("%s", buf);
}
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -60,18 +60,13 @@ struct kmsg_dumper {
#ifdef CONFIG_PRINTK
void kmsg_dump(enum kmsg_dump_reason reason);
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog,
- char *line, size_t size, size_t *len);
-
bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
char *line, size_t size, size_t *len);
bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog,
char *buf, size_t size, size_t *len_out);
-void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter);
-
-void kmsg_dump_rewind(struct kmsg_dumper_iter *dumper_iter);
+void kmsg_dump_rewind(struct kmsg_dumper_iter *iter);
int kmsg_dump_register(struct kmsg_dumper *dumper);
@@ -83,13 +78,6 @@ static inline void kmsg_dump(enum kmsg_d
{
}
-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter,
- bool syslog, const char *line,
- size_t size, size_t *len)
-{
- return false;
-}
-
static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
const char *line, size_t size, size_t *len)
{
@@ -102,10 +90,6 @@ static inline bool kmsg_dump_get_buffer(
return false;
}
-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter)
-{
-}
-
static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter)
{
}
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const cha
kdb_set(2, setargs);
}
- kmsg_dump_rewind_nolock(&iter);
- while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL))
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL))
n++;
if (lines < 0) {
@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const cha
if (skip >= n || skip < 0)
return 0;
- kmsg_dump_rewind_nolock(&iter);
- while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) {
+ kmsg_dump_rewind(&iter);
+ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) {
if (skip) {
skip--;
continue;
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3382,7 +3382,7 @@ void kmsg_dump(enum kmsg_dump_reason rea
}
/**
- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
+ * kmsg_dump_get_line - retrieve one kmsg log line
* @iter: kmsg dumper iterator
* @syslog: include the "<4>" prefixes
* @line: buffer to copy the line to
@@ -3397,18 +3397,18 @@ void kmsg_dump(enum kmsg_dump_reason rea
*
* A return value of FALSE indicates that there are no more records to
* read.
- *
- * The function is similar to kmsg_dump_get_line(), but grabs no locks.
*/
-bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog,
- char *line, size_t size, size_t *len)
+bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
+ char *line, size_t size, size_t *len)
{
struct printk_info info;
unsigned int line_count;
struct printk_record r;
+ unsigned long flags;
size_t l = 0;
bool ret = false;
+ printk_safe_enter_irqsave(flags);
prb_rec_init_rd(&r, &info, line, size);
if (!iter->active)
@@ -3432,40 +3432,11 @@ bool kmsg_dump_get_line_nolock(struct km
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
+ printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
}
-
-/**
- * kmsg_dump_get_line - retrieve one kmsg log line
- * @iter: kmsg dumper iterator
- * @syslog: include the "<4>" prefixes
- * @line: buffer to copy the line to
- * @size: maximum size of the buffer
- * @len: length of line placed into buffer
- *
- * Start at the beginning of the kmsg buffer, with the oldest kmsg
- * record, and copy one record into the provided buffer.
- *
- * Consecutive calls will return the next available record moving
- * towards the end of the buffer with the youngest messages.
- *
- * A return value of FALSE indicates that there are no more records to
- * read.
- */
-bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog,
- char *line, size_t size, size_t *len)
-{
- unsigned long flags;
- bool ret;
-
- printk_safe_enter_irqsave(flags);
- ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len);
- printk_safe_exit_irqrestore(flags);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
/**
@@ -3555,22 +3526,6 @@ bool kmsg_dump_get_buffer(struct kmsg_du
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
/**
- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
- * @iter: kmsg dumper iterator
- *
- * Reset the dumper's iterator so that kmsg_dump_get_line() and
- * kmsg_dump_get_buffer() can be called again and used multiple
- * times within the same dumper.dump() callback.
- *
- * The function is similar to kmsg_dump_rewind(), but grabs no locks.
- */
-void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter)
-{
- iter->cur_seq = latched_seq_read_nolock(&clear_seq);
- iter->next_seq = prb_next_seq(prb);
-}
-
-/**
* kmsg_dump_rewind - reset the iterator
* @iter: kmsg dumper iterator
*
@@ -3583,7 +3538,8 @@ void kmsg_dump_rewind(struct kmsg_dumper
unsigned long flags;
printk_safe_enter_irqsave(flags);
- kmsg_dump_rewind_nolock(iter);
+ iter->cur_seq = latched_seq_read_nolock(&clear_seq);
+ iter->next_seq = prb_next_seq(prb);
printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);


@@ -0,0 +1,35 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 17 Feb 2021 18:23:16 +0100
Subject: [PATCH 14/28] printk: kmsg_dump: use kmsg_dump_rewind
kmsg_dump() open codes kmsg_dump_rewind(). Call
kmsg_dump_rewind() instead.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3351,7 +3351,6 @@ void kmsg_dump(enum kmsg_dump_reason rea
{
struct kmsg_dumper_iter iter;
struct kmsg_dumper *dumper;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
@@ -3370,10 +3369,7 @@ void kmsg_dump(enum kmsg_dump_reason rea
/* initialize iterator with data about the stored records */
iter.active = true;
- printk_safe_enter_irqsave(flags);
- iter.cur_seq = latched_seq_read_nolock(&clear_seq);
- iter.next_seq = prb_next_seq(prb);
- printk_safe_exit_irqrestore(flags);
+ kmsg_dump_rewind(&iter);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason, &iter);


@@ -0,0 +1,41 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 17 Feb 2021 18:28:05 +0100
Subject: [PATCH 15/28] printk: console: remove unnecessary safe buffer usage
Upon registering a console, safe buffers are activated when setting
up the sequence number to replay the log. However, this setup is
already protected by @console_sem and @syslog_lock. Remove the
unnecessary safe buffer usage.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
---
kernel/printk/printk.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2967,9 +2967,7 @@ void register_console(struct console *ne
/*
* console_unlock(); will print out the buffered messages
* for us.
- */
- printk_safe_enter_irqsave(flags);
- /*
+ *
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
* the already-registered consoles.
@@ -2982,11 +2980,9 @@ void register_console(struct console *ne
exclusive_console_stop_seq = console_seq;
/* Get a consistent copy of @syslog_seq. */
- raw_spin_lock(&syslog_lock);
+ raw_spin_lock_irqsave(&syslog_lock, flags);
console_seq = syslog_seq;
- raw_spin_unlock(&syslog_lock);
-
- printk_safe_exit_irqrestore(flags);
+ raw_spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();
console_sysfs_notify();

View File

@@ -0,0 +1,136 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 11 Dec 2020 00:55:25 +0106
Subject: [PATCH 16/28] printk: track/limit recursion
Limit printk() recursion to 1 level. This is enough to print a
stacktrace for the printk call, should a WARN or BUG occur.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
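The guard brackets every message store; a caller that cannot enter
simply drops its message. A condensed sketch of the resulting shape of
vprintk_store() (details elided, guard names as in the hunks below):

static int example_store(void)	/* stands in for vprintk_store() */
{
	unsigned long irqflags;
	int ret = 0;

	/* Only 1 level of recursion allowed; otherwise drop the message. */
	if (!printk_enter_irqsave(&irqflags))
		return 0;

	/* ... reserve, fill and commit the ringbuffer record ... */

	printk_exit_irqrestore(irqflags);
	return ret;
}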
---
kernel/printk/printk.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 71 insertions(+), 3 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1940,6 +1940,65 @@ static void call_console_drivers(const c
}
}
+#ifdef CONFIG_PRINTK_NMI
+#define NUM_RECURSION_CTX 2
+#else
+#define NUM_RECURSION_CTX 1
+#endif
+
+struct printk_recursion {
+ char count[NUM_RECURSION_CTX];
+};
+
+static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion);
+static char printk_recursion_count[NUM_RECURSION_CTX];
+
+static char *printk_recursion_counter(void)
+{
+ struct printk_recursion *rec;
+ char *count;
+
+ if (!printk_percpu_data_ready()) {
+ count = &printk_recursion_count[0];
+ } else {
+ rec = this_cpu_ptr(&percpu_printk_recursion);
+
+ count = &rec->count[0];
+ }
+
+#ifdef CONFIG_PRINTK_NMI
+ if (in_nmi())
+ count++;
+#endif
+
+ return count;
+}
+
+static bool printk_enter_irqsave(unsigned long *flags)
+{
+ char *count;
+
+ local_irq_save(*flags);
+ count = printk_recursion_counter();
+ /* Only 1 level of recursion allowed. */
+ if (*count > 1) {
+ local_irq_restore(*flags);
+ return false;
+ }
+ (*count)++;
+
+ return true;
+}
+
+static void printk_exit_irqrestore(unsigned long flags)
+{
+ char *count;
+
+ count = printk_recursion_counter();
+ (*count)--;
+ local_irq_restore(flags);
+}
+
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -2040,11 +2099,13 @@ int vprintk_store(int facility, int leve
struct prb_reserved_entry e;
enum log_flags lflags = 0;
struct printk_record r;
+ unsigned long irqflags;
u16 trunc_msg_len = 0;
char prefix_buf[8];
u16 reserve_size;
va_list args2;
u16 text_len;
+ int ret = 0;
u64 ts_nsec;
/*
@@ -2055,6 +2116,9 @@ int vprintk_store(int facility, int leve
*/
ts_nsec = local_clock();
+ if (!printk_enter_irqsave(&irqflags))
+ return 0;
+
/*
* The sprintf needs to come first since the syslog prefix might be
* passed in as a parameter. An extra byte must be reserved so that
@@ -2092,7 +2156,8 @@ int vprintk_store(int facility, int leve
prb_commit(&e);
}
- return text_len;
+ ret = text_len;
+ goto out;
}
}
@@ -2108,7 +2173,7 @@ int vprintk_store(int facility, int leve
prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
if (!prb_reserve(&e, prb, &r))
- return 0;
+ goto out;
}
/* fill message */
@@ -2130,7 +2195,10 @@ int vprintk_store(int facility, int leve
else
prb_final_commit(&e);
- return (text_len + trunc_msg_len);
+ ret = text_len + trunc_msg_len;
+out:
+ printk_exit_irqrestore(irqflags);
+ return ret;
}
asmlinkage int vprintk_emit(int facility, int level,


@@ -0,0 +1,854 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:00 +0106
Subject: [PATCH 17/28] printk: remove safe buffers
With @logbuf_lock removed, the high-level printk functions for
storing messages are lockless. Messages can be stored from any
context, so the NMI and safe buffers are no longer needed: remove
them. In NMI or safe contexts, store the message immediately but
still use irq_work to defer the console printing.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
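The special-context path then stores directly and only defers the
console kick. Condensed from the printk_safe.c hunks below:

static __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
	unsigned long flags;

	/* NMI and safe contexts: store now, print to consoles later. */
	if (this_cpu_read(printk_context) &
	    (PRINTK_NMI_DIRECT_CONTEXT_MASK |
	     PRINTK_NMI_CONTEXT_MASK |
	     PRINTK_SAFE_CONTEXT_MASK)) {
		int len;

		printk_safe_enter_irqsave(flags);
		len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
		printk_safe_exit_irqrestore(flags);

		/* Console drivers may take their own locks: use irq_work. */
		defer_console_output();
		return len;
	}

	/* No obstacles. */
	return vprintk_default(fmt, args);
}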
---
arch/powerpc/kernel/traps.c | 1
arch/powerpc/kernel/watchdog.c | 5
include/linux/printk.h | 10 -
kernel/kexec_core.c | 1
kernel/panic.c | 3
kernel/printk/internal.h | 2
kernel/printk/printk.c | 85 +---------
kernel/printk/printk_safe.c | 329 -----------------------------------------
lib/nmi_backtrace.c | 6
9 files changed, 17 insertions(+), 425 deletions(-)
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void)
extern void panic_flush_kmsg_end(void)
{
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(0);
debug_locks_off();
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -181,11 +181,6 @@ static void watchdog_smp_panic(int cpu,
wd_smp_unlock(&flags);
- printk_safe_flush();
- /*
- * printk_safe_flush() seems to require another print
- * before anything actually goes out to console.
- */
if (sysctl_hardlockup_all_cpu_backtrace)
trigger_allbutself_cpu_backtrace();
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -207,8 +207,6 @@ void __init setup_log_buf(int early);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
-extern void printk_safe_flush(void);
-extern void printk_safe_flush_on_panic(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
@@ -272,14 +270,6 @@ static inline void show_regs_print_info(
static inline void dump_stack(void)
{
}
-
-static inline void printk_safe_flush(void)
-{
-}
-
-static inline void printk_safe_flush_on_panic(void)
-{
-}
#endif
extern int kptr_restrict;
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -977,7 +977,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -247,7 +247,6 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -271,8 +270,6 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -23,7 +23,6 @@ int vprintk_store(int facility, int leve
void __printk_safe_enter(void);
void __printk_safe_exit(void);
-void printk_safe_init(void);
bool printk_percpu_data_ready(void);
#define printk_safe_enter_irqsave(flags) \
@@ -67,6 +66,5 @@ void defer_console_output(void);
#define printk_safe_enter_irq() local_irq_disable()
#define printk_safe_exit_irq() local_irq_enable()
-static inline void printk_safe_init(void) { }
static inline bool printk_percpu_data_ready(void) { return false; }
#endif /* CONFIG_PRINTK */
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -732,27 +732,22 @@ static ssize_t devkmsg_read(struct file
if (ret)
return ret;
- printk_safe_enter_irq();
if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- printk_safe_exit_irq();
goto out;
}
- printk_safe_exit_irq();
ret = wait_event_interruptible(log_wait,
prb_read_valid(prb, atomic64_read(&user->seq), r));
if (ret)
goto out;
- printk_safe_enter_irq();
}
if (r->info->seq != atomic64_read(&user->seq)) {
/* our last seen message is gone, return error and reset */
atomic64_set(&user->seq, r->info->seq);
ret = -EPIPE;
- printk_safe_exit_irq();
goto out;
}
@@ -762,7 +757,6 @@ static ssize_t devkmsg_read(struct file
&r->info->dev_info);
atomic64_set(&user->seq, r->info->seq + 1);
- printk_safe_exit_irq();
if (len > count) {
ret = -EINVAL;
@@ -797,7 +791,6 @@ static loff_t devkmsg_llseek(struct file
if (offset)
return -ESPIPE;
- printk_safe_enter_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -818,7 +811,6 @@ static loff_t devkmsg_llseek(struct file
default:
ret = -EINVAL;
}
- printk_safe_exit_irq();
return ret;
}
@@ -833,7 +825,6 @@ static __poll_t devkmsg_poll(struct file
poll_wait(file, &log_wait, wait);
- printk_safe_enter_irq();
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
/* return error when data has vanished underneath us */
if (info.seq != atomic64_read(&user->seq))
@@ -841,7 +832,6 @@ static __poll_t devkmsg_poll(struct file
else
ret = EPOLLIN|EPOLLRDNORM;
}
- printk_safe_exit_irq();
return ret;
}
@@ -874,9 +864,7 @@ static int devkmsg_open(struct inode *in
prb_rec_init_rd(&user->record, &user->info,
&user->text_buf[0], sizeof(user->text_buf));
- printk_safe_enter_irq();
atomic64_set(&user->seq, prb_first_valid_seq(prb));
- printk_safe_exit_irq();
file->private_data = user;
return 0;
@@ -1042,9 +1030,6 @@ static inline void log_buf_add_cpu(void)
static void __init set_percpu_data_ready(void)
{
- printk_safe_init();
- /* Make sure we set this flag only after printk_safe() init is done */
- barrier();
__printk_percpu_data_ready = true;
}
@@ -1142,8 +1127,6 @@ void __init setup_log_buf(int early)
new_descs, ilog2(new_descs_count),
new_infos);
- printk_safe_enter_irqsave(flags);
-
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
@@ -1159,8 +1142,6 @@ void __init setup_log_buf(int early)
*/
prb = &printk_rb_dynamic;
- printk_safe_exit_irqrestore(flags);
-
if (seq != prb_next_seq(&printk_rb_static)) {
pr_err("dropped %llu messages\n",
prb_next_seq(&printk_rb_static) - seq);
@@ -1498,11 +1479,9 @@ static int syslog_print(char __user *buf
size_t n;
size_t skip;
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ raw_spin_lock_irq(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ raw_spin_unlock_irq(&syslog_lock);
break;
}
if (r.info->seq != syslog_seq) {
@@ -1531,8 +1510,7 @@ static int syslog_print(char __user *buf
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ raw_spin_unlock_irq(&syslog_lock);
if (!n)
break;
@@ -1566,7 +1544,6 @@ static int syslog_print_all(char __user
return -ENOMEM;
time = printk_time;
- printk_safe_enter_irq();
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1587,23 +1564,20 @@ static int syslog_print_all(char __user
break;
}
- printk_safe_exit_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- printk_safe_enter_irq();
if (len < 0)
break;
}
if (clear) {
- raw_spin_lock(&syslog_lock);
+ raw_spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, seq);
- raw_spin_unlock(&syslog_lock);
+ raw_spin_unlock_irq(&syslog_lock);
}
- printk_safe_exit_irq();
kfree(text);
return len;
@@ -1611,11 +1585,9 @@ static int syslog_print_all(char __user
static void syslog_clear(void)
{
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ raw_spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ raw_spin_unlock_irq(&syslog_lock);
}
/* Return a consistent copy of @syslog_seq. */
@@ -1703,12 +1675,10 @@ int do_syslog(int type, char __user *buf
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- printk_safe_enter_irq();
- raw_spin_lock(&syslog_lock);
+ raw_spin_lock_irq(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ raw_spin_unlock_irq(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
@@ -1736,8 +1706,7 @@ int do_syslog(int type, char __user *buf
}
error -= syslog_partial;
}
- raw_spin_unlock(&syslog_lock);
- printk_safe_exit_irq();
+ raw_spin_unlock_irq(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -2207,7 +2176,6 @@ asmlinkage int vprintk_emit(int facility
{
int printed_len;
bool in_sched = false;
- unsigned long flags;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -2221,9 +2189,7 @@ asmlinkage int vprintk_emit(int facility
boot_delay_msec(level);
printk_delay();
- printk_safe_enter_irqsave(flags);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- printk_safe_exit_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
@@ -2615,7 +2581,6 @@ void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[CONSOLE_LOG_MAX];
- unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
struct printk_record r;
@@ -2660,7 +2625,6 @@ void console_unlock(void)
size_t ext_len = 0;
size_t len;
- printk_safe_enter_irqsave(flags);
skip:
if (!prb_read_valid(prb, console_seq, &r))
break;
@@ -2717,12 +2681,8 @@ void console_unlock(void)
call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
+ if (console_lock_spinning_disable_and_check())
return;
- }
-
- printk_safe_exit_irqrestore(flags);
if (do_cond_resched)
cond_resched();
@@ -2739,8 +2699,6 @@ void console_unlock(void)
* flush, no worries.
*/
retry = prb_read_valid(prb, console_seq, NULL);
- printk_safe_exit_irqrestore(flags);
-
if (retry && console_trylock())
goto again;
}
@@ -2802,13 +2760,8 @@ void console_flush_on_panic(enum con_flu
console_trylock();
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
+ if (mode == CONSOLE_REPLAY_ALL)
console_seq = prb_first_valid_seq(prb);
- printk_safe_exit_irqrestore(flags);
- }
console_unlock();
}
@@ -3464,11 +3417,9 @@ bool kmsg_dump_get_line(struct kmsg_dump
struct printk_info info;
unsigned int line_count;
struct printk_record r;
- unsigned long flags;
size_t l = 0;
bool ret = false;
- printk_safe_enter_irqsave(flags);
prb_rec_init_rd(&r, &info, line, size);
if (!iter->active)
@@ -3492,7 +3443,6 @@ bool kmsg_dump_get_line(struct kmsg_dump
iter->cur_seq = r.info->seq + 1;
ret = true;
out:
- printk_safe_exit_irqrestore(flags);
if (len)
*len = l;
return ret;
@@ -3523,7 +3473,6 @@ bool kmsg_dump_get_buffer(struct kmsg_du
{
struct printk_info info;
struct printk_record r;
- unsigned long flags;
u64 seq;
u64 next_seq;
size_t len = 0;
@@ -3533,7 +3482,6 @@ bool kmsg_dump_get_buffer(struct kmsg_du
if (!iter->active || !buf || !size)
goto out;
- printk_safe_enter_irqsave(flags);
if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
if (info.seq != iter->cur_seq) {
/* messages are gone, move to first available one */
@@ -3542,10 +3490,8 @@ bool kmsg_dump_get_buffer(struct kmsg_du
}
/* last entry */
- if (iter->cur_seq >= iter->next_seq) {
- printk_safe_exit_irqrestore(flags);
+ if (iter->cur_seq >= iter->next_seq)
goto out;
- }
/*
* Find first record that fits, including all following records,
@@ -3577,7 +3523,6 @@ bool kmsg_dump_get_buffer(struct kmsg_du
iter->next_seq = next_seq;
ret = true;
- printk_safe_exit_irqrestore(flags);
out:
if (len_out)
*len_out = len;
@@ -3595,12 +3540,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind(struct kmsg_dumper_iter *iter)
{
- unsigned long flags;
-
- printk_safe_enter_irqsave(flags);
iter->cur_seq = latched_seq_read_nolock(&clear_seq);
iter->next_seq = prb_next_seq(prb);
- printk_safe_exit_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -15,282 +15,9 @@
#include "internal.h"
-/*
- * In NMI and safe mode, printk() avoids taking locks. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examining current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
static DEFINE_PER_CPU(int, printk_context);
-static DEFINE_RAW_SPINLOCK(safe_read_lock);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_percpu_data_ready())
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&safe_read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&safe_read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- if (raw_spin_is_locked(&safe_read_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&safe_read_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
void noinstr printk_nmi_enter(void)
{
this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
@@ -305,9 +32,6 @@ void noinstr printk_nmi_exit(void)
* Marks a code that might produce many messages in NMI context
* and the risk of losing them is more critical than eventual
* reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will store the messages into the main logbuf directly.
*/
void printk_nmi_direct_enter(void)
{
@@ -320,27 +44,8 @@ void printk_nmi_direct_exit(void)
this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
}
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
#endif /* CONFIG_PRINTK_NMI */
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
/* Can be preempted by NMI. */
void __printk_safe_enter(void)
{
@@ -365,8 +70,10 @@ void __printk_safe_exit(void)
* Use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
*/
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) {
- unsigned long flags;
+ if (this_cpu_read(printk_context) &
+ (PRINTK_NMI_DIRECT_CONTEXT_MASK |
+ PRINTK_NMI_CONTEXT_MASK |
+ PRINTK_SAFE_CONTEXT_MASK)) {
int len;
printk_safe_enter_irqsave(flags);
@@ -376,34 +83,6 @@ void __printk_safe_exit(void)
return len;
}
- /* Use extra buffer in NMI. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
/* No obstacles. */
return vprintk_default(fmt, args);
}
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const
touch_softlockup_watchdog();
}
- /*
- * Force flush any remote buffers that might be stuck in IRQ context
- * and therefore could not run their irq_work.
- */
- printk_safe_flush();
-
clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}


@@ -0,0 +1,112 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 18 Feb 2021 17:37:41 +0100
Subject: [PATCH 18/28] printk: convert @syslog_lock to spin_lock
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -356,7 +356,7 @@ enum log_flags {
};
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
-static DEFINE_RAW_SPINLOCK(syslog_lock);
+static DEFINE_SPINLOCK(syslog_lock);
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
@@ -1479,9 +1479,9 @@ static int syslog_print(char __user *buf
size_t n;
size_t skip;
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
if (!prb_read_valid(prb, syslog_seq, &r)) {
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
break;
}
if (r.info->seq != syslog_seq) {
@@ -1510,7 +1510,7 @@ static int syslog_print(char __user *buf
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
if (!n)
break;
@@ -1574,9 +1574,9 @@ static int syslog_print_all(char __user
}
if (clear) {
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, seq);
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
}
kfree(text);
@@ -1585,9 +1585,9 @@ static int syslog_print_all(char __user
static void syslog_clear(void)
{
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
latched_seq_write(&clear_seq, prb_next_seq(prb));
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
}
/* Return a consistent copy of @syslog_seq. */
@@ -1595,9 +1595,9 @@ static u64 read_syslog_seq_irq(void)
{
u64 seq;
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
seq = syslog_seq;
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
return seq;
}
@@ -1675,10 +1675,10 @@ int do_syslog(int type, char __user *buf
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- raw_spin_lock_irq(&syslog_lock);
+ spin_lock_irq(&syslog_lock);
if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
/* No unread messages. */
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
return 0;
}
if (info.seq != syslog_seq) {
@@ -1706,7 +1706,7 @@ int do_syslog(int type, char __user *buf
}
error -= syslog_partial;
}
- raw_spin_unlock_irq(&syslog_lock);
+ spin_unlock_irq(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -3001,9 +3001,9 @@ void register_console(struct console *ne
exclusive_console_stop_seq = console_seq;
/* Get a consistent copy of @syslog_seq. */
- raw_spin_lock_irqsave(&syslog_lock, flags);
+ spin_lock_irqsave(&syslog_lock, flags);
console_seq = syslog_seq;
- raw_spin_unlock_irqrestore(&syslog_lock, flags);
+ spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();
console_sysfs_notify();


@@ -0,0 +1,154 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:01 +0106
Subject: [PATCH 19/28] console: add write_atomic interface
Add a write_atomic() callback to the console. This is an optional
function for console drivers. The function must be atomic (including
NMI-safe) for writing to the console.
Console drivers must still implement the write() callback. The
write_atomic() callback will only be used in special situations,
such as when the kernel panics.
Creating an NMI-safe write_atomic() that must synchronize with
write() requires a careful implementation of the console driver. To
aid with the implementation, a set of console_atomic_*() functions
is provided:
void console_atomic_lock(unsigned int *flags);
void console_atomic_unlock(unsigned int flags);
These functions synchronize using a processor-reentrant spinlock
(called a cpulock).
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
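A driver's write_atomic() callback is then expected to bracket its
hardware access with the cpulock. A minimal hypothetical sketch
(foo_port() and foo_putchar() stand in for driver specifics; the
locking calls are the ones added by this patch):

static struct uart_port *foo_port(struct console *co);		/* hypothetical */
static void foo_putchar(struct uart_port *port, int ch);	/* hypothetical */

static void foo_write_atomic(struct console *co,
			     const char *s, unsigned int count)
{
	unsigned int flags;

	/* Reentrant on the owning CPU, so safe even from NMI. */
	console_atomic_lock(&flags);

	uart_console_write(foo_port(co), s, count, foo_putchar);

	console_atomic_unlock(flags);
}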
---
include/linux/console.h | 4 +
kernel/printk/printk.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -140,6 +140,7 @@ static inline int con_debug_leave(void)
struct console {
char name[16];
void (*write)(struct console *, const char *, unsigned);
+ void (*write_atomic)(struct console *co, const char *s, unsigned int count);
int (*read)(struct console *, char *, unsigned);
struct tty_driver *(*device)(struct console *, int *);
void (*unblank)(void);
@@ -229,4 +230,7 @@ extern void console_init(void);
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);
+extern void console_atomic_lock(unsigned int *flags);
+extern void console_atomic_unlock(unsigned int flags);
+
#endif /* _LINUX_CONSOLE_H */
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3546,3 +3546,103 @@ void kmsg_dump_rewind(struct kmsg_dumper
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
+
+struct prb_cpulock {
+ atomic_t owner;
+ unsigned long __percpu *irqflags;
+};
+
+#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \
+static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \
+static struct prb_cpulock name = { \
+ .owner = ATOMIC_INIT(-1), \
+ .irqflags = &_##name##_percpu_irqflags, \
+}
+
+static bool __prb_trylock(struct prb_cpulock *cpu_lock,
+ unsigned int *cpu_store)
+{
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = get_cpu();
+
+ *cpu_store = atomic_read(&cpu_lock->owner);
+ /* memory barrier to ensure the current lock owner is visible */
+ smp_rmb();
+ if (*cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_save(*flags);
+ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner,
+ cpu_store, cpu)) {
+ return true;
+ }
+ local_irq_restore(*flags);
+ } else if (*cpu_store == cpu) {
+ return true;
+ }
+
+ put_cpu();
+ return false;
+}
+
+/*
+ * prb_lock: Perform a processor-reentrant spin lock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" pointer to store lock status information.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately. If lock is locked by another
+ * processor, this function spins until the calling processor becomes the
+ * owner.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store)
+{
+ for (;;) {
+ if (__prb_trylock(cpu_lock, cpu_store))
+ break;
+ cpu_relax();
+ }
+}
+
+/*
+ * prb_unlock: Perform a processor-reentrant spin unlock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" object storing lock status information.
+ *
+ * Release the lock. The calling processor must be the owner of the lock.
+ *
+ * It is safe to call this function from any context and state.
+ */
+static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store)
+{
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = atomic_read(&cpu_lock->owner);
+ atomic_set_release(&cpu_lock->owner, cpu_store);
+
+ if (cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_restore(*flags);
+ }
+
+ put_cpu();
+}
+
+DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
+
+void console_atomic_lock(unsigned int *flags)
+{
+ prb_lock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_lock);
+
+void console_atomic_unlock(unsigned int flags)
+{
+ prb_unlock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_unlock);


@@ -1,83 +1,99 @@
From: John Ogness <john.ogness@linutronix.de>
-Date: Tue, 12 Feb 2019 15:29:58 +0100
-Subject: [PATCH 20/25] serial: 8250: implement write_atomic
+Date: Mon, 30 Nov 2020 01:42:02 +0106
+Subject: [PATCH 20/28] serial: 8250: implement write_atomic
-Implement a non-sleeping NMI-safe write_atomic console function in
-order to support emergency printk messages.
+Implement a non-sleeping NMI-safe write_atomic() console function in
+order to support emergency console printing.
Since interrupts need to be disabled during transmit, all usage of
-the IER register was wrapped with access functions that use the
-console_atomic_lock function to synchronize register access while
-tracking the state of the interrupts. This was necessary because
-write_atomic is can be calling from an NMI context that has
-preempted write_atomic.
+the IER register is wrapped with access functions that use the
+console_atomic_lock() function to synchronize register access while
+tracking the state of the interrupts. This is necessary because
+write_atomic() can be called from an NMI context that has preempted
+write_atomic().
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
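The wrapper pattern is the same at every IER access site: take the
cpulock only when the port is a registered console, then perform the
plain register write. Condensed from the 8250.h hunk below:

static inline void serial8250_set_IER(struct uart_8250_port *up,
				      unsigned char ier)
{
	struct uart_port *port = &up->port;
	unsigned int flags;
	bool is_console = uart_console(port);

	/* Synchronize with write_atomic(), which may run in NMI context. */
	if (is_console)
		console_atomic_lock(&flags);

	serial_out(up, UART_IER, ier);

	if (is_console)
		console_atomic_unlock(flags);
}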
---
- drivers/tty/serial/8250/8250.h | 22 +++++
- drivers/tty/serial/8250/8250_core.c | 19 +++-
- drivers/tty/serial/8250/8250_dma.c | 4
- drivers/tty/serial/8250/8250_port.c | 154 ++++++++++++++++++++++++++----------
- include/linux/serial_8250.h | 5 +
- 5 files changed, 157 insertions(+), 47 deletions(-)
+ drivers/tty/serial/8250/8250.h | 47 ++++++++++++++++
+ drivers/tty/serial/8250/8250_core.c | 17 ++++--
+ drivers/tty/serial/8250/8250_fsl.c | 9 +++
+ drivers/tty/serial/8250/8250_ingenic.c | 7 ++
+ drivers/tty/serial/8250/8250_mtk.c | 29 +++++++++-
+ drivers/tty/serial/8250/8250_port.c | 92 ++++++++++++++++++++-------------
+ include/linux/serial_8250.h | 5 +
+ 7 files changed, 162 insertions(+), 44 deletions(-)
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -96,6 +96,10 @@ struct serial8250_config {
#define SERIAL8250_SHARE_IRQS 0
#endif
+void set_ier(struct uart_8250_port *up, unsigned char ier);
+void clear_ier(struct uart_8250_port *up);
+void restore_ier(struct uart_8250_port *up);
+
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \
{ \
.iobase = _base, \
@@ -139,6 +143,15 @@ static inline bool serial8250_set_THRI(s
return true;
@@ -130,12 +130,55 @@ static inline void serial_dl_write(struc
up->dl_write(up, value);
}
+static inline bool serial8250_set_THRI_sier(struct uart_8250_port *up)
+static inline void serial8250_set_IER(struct uart_8250_port *up,
+ unsigned char ier)
+{
+ if (up->ier & UART_IER_THRI)
+ return false;
+ up->ier |= UART_IER_THRI;
+ set_ier(up, up->ier);
+ return true;
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ serial_out(up, UART_IER, ier);
+
+ if (is_console)
+ console_atomic_unlock(flags);
+}
+
static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
+static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int clearval = 0;
+ unsigned int prior;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (up->capabilities & UART_CAP_UUE)
+ clearval = UART_IER_UUE;
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ prior = serial_port_in(port, UART_IER);
+ serial_port_out(port, UART_IER, clearval);
+
+ if (is_console)
+ console_atomic_unlock(flags);
+
+ return prior;
+}
+
static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
if (!(up->ier & UART_IER_THRI))
@@ -148,6 +161,15 @@ static inline bool serial8250_clear_THRI
if (up->ier & UART_IER_THRI)
return false;
up->ier |= UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
+static inline bool serial8250_clear_THRI_sier(struct uart_8250_port *up)
+{
+ if (!(up->ier & UART_IER_THRI))
+ return false;
+ up->ier &= ~UART_IER_THRI;
+ set_ier(up, up->ier);
+ return true;
+}
+
struct uart_8250_port *serial8250_get_port(int line);
@@ -144,7 +187,7 @@ static inline bool serial8250_clear_THRI
if (!(up->ier & UART_IER_THRI))
return false;
up->ier &= ~UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
void serial8250_rpm_get(struct uart_8250_port *p);
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -265,7 +265,7 @@ static void serial8250_timeout(struct ti
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
- unsigned int iir, ier = 0, lsr;
+ unsigned int iir, lsr;
unsigned long flags;
spin_lock_irqsave(&up->port.lock, flags);
@@ -274,10 +274,8 @@ static void serial8250_backup_timeout(st
* Must disable interrupts or else we risk racing with the interrupt
* based handler.
@@ -87,7 +103,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
- serial_out(up, UART_IER, 0);
- }
+ if (up->port.irq)
+ clear_ier(up);
+ ier = serial8250_clear_IER(up);
iir = serial_in(up, UART_IIR);
@@ -96,7 +112,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (up->port.irq)
- serial_out(up, UART_IER, ier);
+ restore_ier(up);
+ serial8250_set_IER(up, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
@@ -115,7 +131,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static void univ8250_console_write(struct console *co, const char *s,
unsigned int count)
{
@@ -663,6 +669,7 @@ static int univ8250_console_match(struct
@@ -671,6 +677,7 @@ static int univ8250_console_match(struct
static struct console univ8250_console = {
.name = "ttyS",
@@ -123,149 +139,141 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
.write = univ8250_console_write,
.device = uart_console_device,
.setup = univ8250_console_setup,
--- a/drivers/tty/serial/8250/8250_dma.c
+++ b/drivers/tty/serial/8250/8250_dma.c
@@ -35,7 +35,7 @@ static void __dma_tx_complete(void *para
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port
ret = serial8250_tx_dma(p);
if (ret)
- serial8250_set_THRI(p);
+ serial8250_set_THRI_sier(p);
/* Stop processing interrupts on input overrun */
if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
+ unsigned int ca_flags;
unsigned long delay;
+ bool is_console;
spin_unlock_irqrestore(&p->port.lock, flags);
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&ca_flags);
up->ier = port->serial_in(port, UART_IER);
+ if (is_console)
+ console_atomic_unlock(ca_flags);
+
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic
static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
{
+ unsigned int flags;
+ bool is_console;
int ier;
switch (offset) {
@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(stru
* If we have enabled modem status IRQs we should enable
* modem mode.
*/
+ is_console = uart_console(p);
+ if (is_console)
+ console_atomic_lock(&flags);
ier = p->serial_in(p, UART_IER);
+ if (is_console)
+ console_atomic_unlock(flags);
if (ier & UART_IER_MSI)
value |= UART_MCR_MDCE | UART_MCR_FCM;
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -213,12 +213,37 @@ static void mtk8250_shutdown(struct uart
static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier & (~mask));
+
+ if (is_console)
+ console_atomic_unlock(flags);
}
@@ -98,7 +98,7 @@ int serial8250_tx_dma(struct uart_8250_p
dma_async_issue_pending(dma->txchan);
if (dma->tx_err) {
dma->tx_err = 0;
- serial8250_clear_THRI(p);
+ serial8250_clear_THRI_sier(p);
}
return 0;
err:
static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+
+ if (uart_console(port))
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier | mask);
+
+ if (uart_console(port))
+ console_atomic_unlock(flags);
}
static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -721,7 +721,7 @@ static void serial8250_set_sleep(struct
@@ -757,7 +757,7 @@ static void serial8250_set_sleep(struct
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+ set_ier(p, sleep ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
@@ -1390,7 +1390,7 @@ static void serial8250_stop_rx(struct ua
@@ -1429,7 +1429,7 @@ static void serial8250_stop_rx(struct ua
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1408,7 +1408,7 @@ static void __do_stop_tx_rs485(struct ua
@@ -1459,7 +1459,7 @@ void serial8250_em485_stop_tx(struct uar
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_port_out(&p->port, UART_IER, p->ier);
+ set_ier(p, p->ier);
+ serial8250_set_IER(p, p->ier);
}
}
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t)
@@ -1459,7 +1459,7 @@ static void __stop_tx_rs485(struct uart_
static inline void __do_stop_tx(struct uart_8250_port *p)
{
- if (serial8250_clear_THRI(p))
+ if (serial8250_clear_THRI_sier(p))
serial8250_rpm_put_tx(p);
}
@@ -1509,7 +1509,7 @@ static inline void __start_tx(struct uar
if (up->dma && !up->dma->tx_dma(up))
return;
- if (serial8250_set_THRI(up)) {
+ if (serial8250_set_THRI_sier(up)) {
if (up->bugs & UART_BUG_TXEN) {
unsigned char lsr;
@@ -1616,7 +1616,7 @@ static void serial8250_disable_ms(struct
EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx);
@@ -1687,7 +1687,7 @@ static void serial8250_disable_ms(struct
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1632,7 +1632,7 @@ static void serial8250_enable_ms(struct
@@ -1703,7 +1703,7 @@ static void serial8250_enable_ms(struct
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1991,6 +1991,52 @@ static void wait_for_xmitr(struct uart_8
}
}
+static atomic_t ier_counter = ATOMIC_INIT(0);
+static atomic_t ier_value = ATOMIC_INIT(0);
+
+void set_ier(struct uart_8250_port *up, unsigned char ier)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+
+ console_atomic_lock(&flags);
+ if (atomic_read(&ier_counter) > 0)
+ atomic_set(&ier_value, ier);
+ else
+ serial_port_out(port, UART_IER, ier);
+ console_atomic_unlock(flags);
+}
+
+void clear_ier(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int ier_cleared = 0;
+ unsigned int flags;
+ unsigned int ier;
+
+ console_atomic_lock(&flags);
+ atomic_inc(&ier_counter);
+ ier = serial_port_in(port, UART_IER);
+ if (up->capabilities & UART_CAP_UUE)
+ ier_cleared = UART_IER_UUE;
+ if (ier != ier_cleared) {
+ serial_port_out(port, UART_IER, ier_cleared);
+ atomic_set(&ier_value, ier);
+ }
+ console_atomic_unlock(flags);
+}
+
+void restore_ier(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+
+ console_atomic_lock(&flags);
+ if (atomic_fetch_dec(&ier_counter) == 1)
+ serial_port_out(port, UART_IER, atomic_read(&ier_value));
+ console_atomic_unlock(flags);
+}
+
#ifdef CONFIG_CONSOLE_POLL
/*
* Console polling routines for writing and reading from the uart while
@@ -2022,18 +2068,10 @@ static int serial8250_get_poll_char(stru
static void serial8250_put_poll_char(struct uart_port *port,
unsigned char c)
{
- unsigned int ier;
struct uart_8250_port *up = up_to_u8250p(port);
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ clear_ier(up);
wait_for_xmitr(up, BOTH_EMPTY);
/*
@@ -2046,7 +2084,7 @@ static void serial8250_put_poll_char(str
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ restore_ier(up);
serial8250_rpm_put(up);
}
@@ -2358,7 +2396,7 @@ void serial8250_do_shutdown(struct uart_
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- serial_port_out(port, UART_IER, 0);
+ set_ier(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2643,7 +2681,7 @@ serial8250_do_set_termios(struct uart_po
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3107,7 +3145,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default
#ifdef CONFIG_SERIAL_8250_CONSOLE
{
struct uart_8250_port *up = up_to_u8250p(port);
@@ -3115,6 +3153,18 @@ static void serial8250_console_putchar(s
serial_port_out(port, UART_TX, ch);
}
/*
* Restore serial console when h/w power-off detected
*/
@@ -3136,6 +3186,42 @@ static void serial8250_console_restore(s
serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS);
}
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ bool locked;
+
+ console_atomic_lock(&flags);
+
+ /*
+ * If possible, keep any other CPUs from working with the
+ * UART until the atomic message is completed. This helps
+ * to keep the output more orderly.
+ */
+ locked = spin_trylock(&port->lock);
+
+ touch_nmi_watchdog();
+
+ clear_ier(up);
+
+ if (atomic_fetch_inc(&up->console_printing)) {
+ uart_console_write(port, "\n", 1,
+ atomic_dec(&up->console_printing);
+
+ wait_for_xmitr(up, BOTH_EMPTY);
+ restore_ier(up);
+
+ if (locked)
+ spin_unlock(&port->lock);
+
+ console_atomic_unlock(flags);
+}
/*
* Print a string to the serial port trying not to disturb
* any possible real use of the port...
@@ -3147,27 +3233,13 @@ void serial8250_console_write(struct uar
{
struct uart_port *port = &up->port;
unsigned long flags;
- unsigned int ier;
- int locked = 1;
touch_nmi_watchdog();
serial8250_rpm_get(up);
+ spin_lock_irqsave(&port->lock, flags);
- if (oops_in_progress)
- locked = spin_trylock_irqsave(&port->lock, flags);
- else
- * First save the IER then disable the interrupts
- */
- ier = serial_port_in(port, UART_IER);
-
- if (up->capabilities & UART_CAP_UUE)
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ clear_ier(up);
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
@@ -3175,14 +3247,16 @@ void serial8250_console_write(struct uar
up->canary = 0;
mdelay(port->rs485.delay_rts_before_send);
}
+ atomic_inc(&up->console_printing);
/*
* Finally, wait for transmitter to become empty
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
@@ -3326,8 +3347,7 @@ void serial8250_console_write(struct uar
if (em485->tx_stopped)
up->rs485_stop_tx(up);
}
-
- serial_port_out(port, UART_IER, ier);
+ restore_ier(up);
/*
* The receive handling will happen properly because the
@@ -3194,8 +3268,7 @@ void serial8250_console_write(struct uar
if (up->msr_saved_flags)
serial8250_modem_status(up);
- if (locked)
- spin_unlock_irqrestore(&port->lock, flags);
+ spin_unlock_irqrestore(&port->lock, flags);
serial8250_rpm_put(up);
}
@@ -3216,6 +3289,7 @@ static unsigned int probe_baud(struct ua
static unsigned int probe_baud(struct uart_port *port)
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
int baud = 9600;
int bits = 8;
int parity = 'n';
@@ -3224,6 +3298,8 @@ int serial8250_console_setup(struct uart
if (!port->iobase && !port->membase)
return -ENODEV;
#include <linux/serial_core.h>
#include <linux/serial_reg.h>
#include <linux/platform_device.h>
@@ -123,6 +124,8 @@ struct uart_8250_port {
#define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
unsigned char msr_saved_flags;
struct uart_8250_dma *dma;
const struct uart_8250_ops *ops;
@@ -174,6 +177,8 @@ void serial8250_init_port(struct uart_82
void serial8250_set_defaults(struct uart_8250_port *up);
void serial8250_console_write(struct uart_8250_port *up, const char *s,
unsigned int count);
+void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s,
+ unsigned int count);
int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
int serial8250_console_exit(struct uart_port *port);
extern void serial8250_set_isa_configurator(void (*v)
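Taken together, the hunks above funnel every IER access through the
set_ier()/clear_ier()/restore_ier() helpers so an atomic console writer
can mask the interrupt register without racing other CPUs. As a
standalone illustration of the save/mask/restore sequence those helpers
serialize (a sketch only; the example_* name is illustrative and
console_atomic_lock() is assumed from earlier in this series):

	/* Sketch: save the current IER, mask interrupts, and return the
	 * saved value so the caller can write it back once printing is
	 * done. This is the sequence the helpers above serialize.
	 */
	static unsigned char example_mask_ier(struct uart_8250_port *up)
	{
		struct uart_port *port = &up->port;
		unsigned int flags;
		unsigned char ier;

		console_atomic_lock(&flags);
		ier = serial_port_in(port, UART_IER);	/* save */
		serial_port_out(port, UART_IER, 0);	/* mask */
		console_atomic_unlock(flags);

		return ier;	/* restore later, as restore_ier() does */
	}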

View File

@@ -0,0 +1,82 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:03 +0106
Subject: [PATCH 21/28] printk: relocate printk_delay() and vprintk_default()
Move printk_delay() and vprintk_default() "as is" further up so that
they can be used by new functions in an upcoming commit.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 40 ++++++++++++++++++++--------------------
1 file changed, 20 insertions(+), 20 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1725,6 +1725,20 @@ SYSCALL_DEFINE3(syslog, int, type, char
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(void)
+{
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
+
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
+
/*
* Special console_lock variants that help to reduce the risk of soft-lockups.
* They allow to pass console_lock to another printk() call using a busy wait.
@@ -1968,20 +1982,6 @@ static void printk_exit_irqrestore(unsig
local_irq_restore(flags);
}
-int printk_delay_msec __read_mostly;
-
-static inline void printk_delay(void)
-{
- if (unlikely(printk_delay_msec)) {
- int m = printk_delay_msec;
-
- while (m--) {
- mdelay(1);
- touch_nmi_watchdog();
- }
- }
-}
-
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
@@ -2214,18 +2214,18 @@ asmlinkage int vprintk_emit(int facility
}
EXPORT_SYMBOL(vprintk_emit);
-asmlinkage int vprintk(const char *fmt, va_list args)
-{
- return vprintk_func(fmt, args);
-}
-EXPORT_SYMBOL(vprintk);
-
int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ return vprintk_func(fmt, args);
+}
+EXPORT_SYMBOL(vprintk);
+
/**
* printk - print a kernel message
* @fmt: format string

View File

@@ -0,0 +1,37 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:04 +0106
Subject: [PATCH 22/28] printk: combine boot_delay_msec() into printk_delay()
boot_delay_msec() is always called immediately before printk_delay()
so just combine the two.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1727,8 +1727,10 @@ SYSCALL_DEFINE3(syslog, int, type, char
int printk_delay_msec __read_mostly;
-static inline void printk_delay(void)
+static inline void printk_delay(int level)
{
+ boot_delay_msec(level);
+
if (unlikely(printk_delay_msec)) {
int m = printk_delay_msec;
@@ -2186,8 +2188,7 @@ asmlinkage int vprintk_emit(int facility
in_sched = true;
}
- boot_delay_msec(level);
- printk_delay();
+ printk_delay(level);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);

View File

@@ -0,0 +1,125 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:05 +0106
Subject: [PATCH 23/28] printk: change @console_seq to atomic64_t
In preparation for atomic printing, change @console_seq to atomic
so that it can be accessed without requiring @console_sem.
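As a rough illustration (not taken from the patch), an atomic64_t lets
a printing path snapshot and advance the position with plain atomic
operations, so the read side needs no lock:

	/* Sketch: lock-free snapshot/advance of a sequence counter. */
	static atomic64_t example_seq = ATOMIC64_INIT(0);

	static void example_advance(void)
	{
		u64 seq = atomic64_read(&example_seq);	/* no lock held */

		/* ... print the record at @seq ... */

		atomic64_set(&example_seq, seq + 1);
	}

Writers racing here are still serialized by the caller, which mirrors
the pattern console_unlock() follows below.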
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 34 +++++++++++++++++++---------------
1 file changed, 19 insertions(+), 15 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -366,12 +366,13 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* All 3 protected by @console_sem. */
-/* the next printk record to write to the console */
-static u64 console_seq;
+/* Both protected by @console_sem. */
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
+/* the next printk record to write to the console */
+static atomic64_t console_seq = ATOMIC64_INIT(0);
+
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
@@ -2270,7 +2271,7 @@ EXPORT_SYMBOL(printk);
#define prb_first_valid_seq(rb) 0
static u64 syslog_seq;
-static u64 console_seq;
+static atomic64_t console_seq = ATOMIC64_INIT(0);
static u64 exclusive_console_stop_seq;
static unsigned long console_dropped;
@@ -2585,6 +2586,7 @@ void console_unlock(void)
bool do_cond_resched, retry;
struct printk_info info;
struct printk_record r;
+ u64 seq;
if (console_suspended) {
up_console_sem();
@@ -2627,12 +2629,14 @@ void console_unlock(void)
size_t len;
skip:
- if (!prb_read_valid(prb, console_seq, &r))
+ seq = atomic64_read(&console_seq);
+ if (!prb_read_valid(prb, seq, &r))
break;
- if (console_seq != r.info->seq) {
- console_dropped += r.info->seq - console_seq;
- console_seq = r.info->seq;
+ if (seq != r.info->seq) {
+ console_dropped += r.info->seq - seq;
+ atomic64_set(&console_seq, r.info->seq);
+ seq = r.info->seq;
}
if (suppress_message_printing(r.info->level)) {
@@ -2641,13 +2645,13 @@ void console_unlock(void)
* directly to the console when we received it, and
* record that has level above the console loglevel.
*/
- console_seq++;
+ atomic64_set(&console_seq, seq + 1);
goto skip;
}
/* Output to all consoles once old messages replayed. */
if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
+ seq >= exclusive_console_stop_seq)) {
exclusive_console = NULL;
}
@@ -2668,7 +2672,7 @@ void console_unlock(void)
len = record_print_text(&r,
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time);
- console_seq++;
+ atomic64_set(&console_seq, seq + 1);
/*
* While actively printing out messages, if another printk()
@@ -2699,7 +2703,7 @@ void console_unlock(void)
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- retry = prb_read_valid(prb, console_seq, NULL);
+ retry = prb_read_valid(prb, atomic64_read(&console_seq), NULL);
if (retry && console_trylock())
goto again;
}
@@ -2762,7 +2766,7 @@ void console_flush_on_panic(enum con_flu
console_may_schedule = 0;
if (mode == CONSOLE_REPLAY_ALL)
- console_seq = prb_first_valid_seq(prb);
+ atomic64_set(&console_seq, prb_first_valid_seq(prb));
console_unlock();
}
@@ -2999,11 +3003,11 @@ void register_console(struct console *ne
* ignores console_lock.
*/
exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
+ exclusive_console_stop_seq = atomic64_read(&console_seq);
/* Get a consistent copy of @syslog_seq. */
spin_lock_irqsave(&syslog_lock, flags);
- console_seq = syslog_seq;
+ atomic64_set(&console_seq, syslog_seq);
spin_unlock_irqrestore(&syslog_lock, flags);
}
console_unlock();

View File

@@ -0,0 +1,298 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:06 +0106
Subject: [PATCH 24/28] printk: introduce kernel sync mode
When the kernel performs an OOPS, enter into "sync mode":
- only atomic consoles (write_atomic() callback) will print
- printing occurs within vprintk_store() instead of console_unlock()
CONSOLE_LOG_MAX is moved to printk.h to support the per-console
buffer used in sync mode.
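For context, a console participates in sync mode by providing the
write_atomic() callback used below. A hypothetical driver (the demo_*
names are illustrative, not from this series) would register roughly:

	/* Sketch: ->write_atomic() must be callable from any context
	 * and must not take sleeping locks.
	 */
	static void demo_write_atomic(struct console *con,
				      const char *s, unsigned int count)
	{
		demo_hw_putchars(s, count);	/* hypothetical helper */
	}

	static struct console demo_console = {
		.name		= "demo",
		.write		= demo_write,		/* normal path */
		.write_atomic	= demo_write_atomic,	/* sync mode */
		.flags		= CON_PRINTBUFFER,
		.index		= -1,
	};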
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 4 +
include/linux/printk.h | 6 ++
kernel/printk/printk.c | 133 +++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 137 insertions(+), 6 deletions(-)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -16,6 +16,7 @@
#include <linux/atomic.h>
#include <linux/types.h>
+#include <linux/printk.h>
struct vc_data;
struct console_font_op;
@@ -150,6 +151,9 @@ struct console {
short flags;
short index;
int cflag;
+#ifdef CONFIG_PRINTK
+ char sync_buf[CONSOLE_LOG_MAX];
+#endif
void *data;
struct console *next;
};
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -46,6 +46,12 @@ static inline const char *printk_skip_he
#define CONSOLE_EXT_LOG_MAX 8192
+/*
+ * The maximum size of a record formatted for console printing
+ * (i.e. with the prefix prepended to every line).
+ */
+#define CONSOLE_LOG_MAX 4096
+
/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -44,6 +44,7 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/clocksource.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
@@ -359,6 +360,9 @@ enum log_flags {
static DEFINE_SPINLOCK(syslog_lock);
#ifdef CONFIG_PRINTK
+/* Set to enable sync mode. Once set, it is never cleared. */
+static bool sync_mode;
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -398,9 +402,6 @@ static struct latched_seq clear_seq = {
/* the maximum size allowed to be reserved for a record */
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
-/* the maximum size of a formatted record (i.e. with prefix added per line) */
-#define CONSOLE_LOG_MAX 4096
-
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
@@ -1742,6 +1743,91 @@ static inline void printk_delay(int leve
}
}
+static bool kernel_sync_mode(void)
+{
+ return (oops_in_progress || sync_mode);
+}
+
+static bool console_can_sync(struct console *con)
+{
+ if (!(con->flags & CON_ENABLED))
+ return false;
+ if (con->write_atomic && kernel_sync_mode())
+ return true;
+ return false;
+}
+
+static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len)
+{
+ if (!(con->flags & CON_ENABLED))
+ return false;
+ if (con->write_atomic && kernel_sync_mode())
+ con->write_atomic(con, text, text_len);
+ else
+ return false;
+
+ return true;
+}
+
+static bool have_atomic_console(void)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic)
+ return true;
+ }
+ return false;
+}
+
+static bool print_sync(struct console *con, u64 *seq)
+{
+ struct printk_info info;
+ struct printk_record r;
+ size_t text_len;
+
+ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf));
+
+ if (!prb_read_valid(prb, *seq, &r))
+ return false;
+
+ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
+
+ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len))
+ return false;
+
+ *seq = r.info->seq;
+
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
+
+ if (text_len)
+ printk_delay(r.info->level);
+
+ return true;
+}
+
+static void print_sync_until(struct console *con, u64 seq)
+{
+ unsigned int flags;
+ u64 printk_seq;
+
+ console_atomic_lock(&flags);
+ for (;;) {
+ printk_seq = atomic64_read(&console_seq);
+ if (printk_seq >= seq)
+ break;
+ if (!print_sync(con, &printk_seq))
+ break;
+ atomic64_set(&console_seq, printk_seq + 1);
+ }
+ console_atomic_unlock(flags);
+}
+
/*
* Special console_lock variants that help to reduce the risk of soft-lockups.
* They allow to pass console_lock to another printk() call using a busy wait.
@@ -1916,6 +2002,8 @@ static void call_console_drivers(const c
if (!cpu_online(smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
+ if (kernel_sync_mode())
+ continue;
if (con->flags & CON_EXTENDED)
con->write(con, ext_text, ext_len);
else {
@@ -2070,6 +2158,7 @@ int vprintk_store(int facility, int leve
const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
enum log_flags lflags = 0;
+ bool final_commit = false;
struct printk_record r;
unsigned long irqflags;
u16 trunc_msg_len = 0;
@@ -2079,6 +2168,7 @@ int vprintk_store(int facility, int leve
u16 text_len;
int ret = 0;
u64 ts_nsec;
+ u64 seq;
/*
* Since the duration of printk() can vary depending on the message
@@ -2117,6 +2207,7 @@ int vprintk_store(int facility, int leve
if (lflags & LOG_CONT) {
prb_rec_init_wr(&r, reserve_size);
if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ seq = r.info->seq;
text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
facility, &lflags, fmt, args);
r.info->text_len += text_len;
@@ -2124,6 +2215,7 @@ int vprintk_store(int facility, int leve
if (lflags & LOG_NEWLINE) {
r.info->flags |= LOG_NEWLINE;
prb_final_commit(&e);
+ final_commit = true;
} else {
prb_commit(&e);
}
@@ -2148,6 +2240,8 @@ int vprintk_store(int facility, int leve
goto out;
}
+ seq = r.info->seq;
+
/* fill message */
text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args);
if (trunc_msg_len)
@@ -2162,13 +2256,25 @@ int vprintk_store(int facility, int leve
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* A message without a trailing newline can be continued. */
- if (!(lflags & LOG_NEWLINE))
+ if (!(lflags & LOG_NEWLINE)) {
prb_commit(&e);
- else
+ } else {
prb_final_commit(&e);
+ final_commit = true;
+ }
ret = text_len + trunc_msg_len;
out:
+ /* only the kernel may perform synchronous printing */
+ if (facility == 0 && final_commit) {
+ struct console *con;
+
+ for_each_console(con) {
+ if (console_can_sync(con))
+ print_sync_until(con, seq + 1);
+ }
+ }
+
printk_exit_irqrestore(irqflags);
return ret;
}
@@ -2264,12 +2370,13 @@ EXPORT_SYMBOL(printk);
#else /* CONFIG_PRINTK */
-#define CONSOLE_LOG_MAX 0
#define printk_time false
#define prb_read_valid(rb, seq, r) false
#define prb_first_valid_seq(rb) 0
+#define kernel_sync_mode() false
+
static u64 syslog_seq;
static atomic64_t console_seq = ATOMIC64_INIT(0);
static u64 exclusive_console_stop_seq;
@@ -2562,6 +2669,8 @@ static int have_callable_console(void)
*/
static inline int can_use_console(void)
{
+ if (kernel_sync_mode())
+ return false;
return cpu_online(raw_smp_processor_id()) || have_callable_console();
}
@@ -3374,6 +3483,18 @@ void kmsg_dump(enum kmsg_dump_reason rea
struct kmsg_dumper_iter iter;
struct kmsg_dumper *dumper;
+ if (!oops_in_progress) {
+ /*
+ * If atomic consoles are available, activate kernel sync mode
+ * to make sure any final messages are visible. The trailing
+ * printk message is important to flush any pending messages.
+ */
+ if (have_atomic_console()) {
+ sync_mode = true;
+ pr_info("enabled sync mode\n");
+ }
+ }
+
rcu_read_lock();
list_for_each_entry_rcu(dumper, &dump_list, list) {
enum kmsg_dump_reason max_reason = dumper->max_reason;

View File

@@ -0,0 +1,838 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:07 +0106
Subject: [PATCH 25/28] printk: move console printing to kthreads
Create a kthread for each console to perform console printing. Now
all console printing is fully asynchronous except for the boot
console and when the kernel enters sync mode (and there are atomic
consoles available).
The console_lock() and console_unlock() functions now only do what
their name says... locking and unlocking of the console.
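Condensed to its core, each printer thread is a wait-and-emit loop over
the ringbuffer (a simplified sketch; dropped-message accounting,
extended consoles and the sync-mode handoff of the real
printk_kthread_func() below are omitted):

	static int example_printer(void *data)
	{
		struct console *con = data;
		char text[CONSOLE_LOG_MAX];
		struct printk_info info;
		struct printk_record r;
		u64 seq;

		prb_rec_init_rd(&r, &info, text, sizeof(text));
		seq = atomic64_read(&con->printk_seq);

		for (;;) {
			wait_event_interruptible(log_wait,
				prb_read_valid(prb, seq, &r) ||
				kthread_should_stop());
			if (kthread_should_stop())
				break;
			if (!prb_read_valid(prb, seq, &r))
				continue;

			/* formatting simplified: write the raw text */
			con->write(con, &r.text_buf[0], r.info->text_len);
			atomic64_set(&con->printk_seq, ++seq);
		}

		return 0;
	}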
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 2
kernel/printk/printk.c | 625 ++++++++++++++----------------------------------
2 files changed, 186 insertions(+), 441 deletions(-)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -154,6 +154,8 @@ struct console {
#ifdef CONFIG_PRINTK
char sync_buf[CONSOLE_LOG_MAX];
#endif
+ atomic64_t printk_seq;
+ struct task_struct *thread;
void *data;
struct console *next;
};
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -44,6 +44,7 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/kthread.h>
#include <linux/clocksource.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -268,11 +269,6 @@ static void __up_console_sem(unsigned lo
static int console_locked, console_suspended;
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
* Array of consoles built from command line options (console=)
*/
@@ -356,10 +352,10 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
+#ifdef CONFIG_PRINTK
/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_SPINLOCK(syslog_lock);
-#ifdef CONFIG_PRINTK
/* Set to enable sync mode. Once set, it is never cleared. */
static bool sync_mode;
@@ -370,13 +366,6 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* Both protected by @console_sem. */
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
-
-/* the next printk record to write to the console */
-static atomic64_t console_seq = ATOMIC64_INIT(0);
-
struct latched_seq {
seqcount_latch_t latch;
u64 val[2];
@@ -1754,6 +1743,8 @@ static bool console_can_sync(struct cons
return false;
if (con->write_atomic && kernel_sync_mode())
return true;
+ if (con->write && (con->flags & CON_BOOT) && !con->thread)
+ return true;
return false;
}
@@ -1763,6 +1754,8 @@ static bool call_sync_console_driver(str
return false;
if (con->write_atomic && kernel_sync_mode())
con->write_atomic(con, text, text_len);
+ else if (con->write && (con->flags & CON_BOOT) && !con->thread)
+ con->write(con, text, text_len);
else
return false;
@@ -1818,202 +1811,16 @@ static void print_sync_until(struct cons
console_atomic_lock(&flags);
for (;;) {
- printk_seq = atomic64_read(&console_seq);
+ printk_seq = atomic64_read(&con->printk_seq);
if (printk_seq >= seq)
break;
if (!print_sync(con, &printk_seq))
break;
- atomic64_set(&console_seq, printk_seq + 1);
+ atomic64_set(&con->printk_seq, printk_seq + 1);
}
console_atomic_unlock(flags);
}
-/*
- * Special console_lock variants that help to reduce the risk of soft-lockups.
- * They allow to pass console_lock to another printk() call using a busy wait.
- */
-
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map console_owner_dep_map = {
- .name = "console_owner"
-};
-#endif
-
-static DEFINE_RAW_SPINLOCK(console_owner_lock);
-static struct task_struct *console_owner;
-static bool console_waiter;
-
-/**
- * console_lock_spinning_enable - mark beginning of code where another
- * thread might safely busy wait
- *
- * This basically converts console_lock into a spinlock. This marks
- * the section where the console_lock owner can not sleep, because
- * there may be a waiter spinning (like a spinlock). Also it must be
- * ready to hand over the lock at the end of the section.
- */
-static void console_lock_spinning_enable(void)
-{
- raw_spin_lock(&console_owner_lock);
- console_owner = current;
- raw_spin_unlock(&console_owner_lock);
-
- /* The waiter may spin on us after setting console_owner */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
-}
-
-/**
- * console_lock_spinning_disable_and_check - mark end of code where another
- * thread was able to busy wait and check if there is a waiter
- *
- * This is called at the end of the section where spinning is allowed.
- * It has two functions. First, it is a signal that it is no longer
- * safe to start busy waiting for the lock. Second, it checks if
- * there is a busy waiter and passes the lock rights to her.
- *
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
- *
- * Return: 1 if the lock rights were passed, 0 otherwise.
- */
-static int console_lock_spinning_disable_and_check(void)
-{
- int waiter;
-
- raw_spin_lock(&console_owner_lock);
- waiter = READ_ONCE(console_waiter);
- console_owner = NULL;
- raw_spin_unlock(&console_owner_lock);
-
- if (!waiter) {
- spin_release(&console_owner_dep_map, _THIS_IP_);
- return 0;
- }
-
- /* The waiter is now free to continue */
- WRITE_ONCE(console_waiter, false);
-
- spin_release(&console_owner_dep_map, _THIS_IP_);
-
- /*
- * Hand off console_lock to waiter. The waiter will perform
- * the up(). After this, the waiter is the console_lock owner.
- */
- mutex_release(&console_lock_dep_map, _THIS_IP_);
- return 1;
-}
-
-/**
- * console_trylock_spinning - try to get console_lock by busy waiting
- *
- * This allows to busy wait for the console_lock when the current
- * owner is running in specially marked sections. It means that
- * the current owner is running and cannot reschedule until it
- * is ready to lose the lock.
- *
- * Return: 1 if we got the lock, 0 othrewise
- */
-static int console_trylock_spinning(void)
-{
- struct task_struct *owner = NULL;
- bool waiter;
- bool spin = false;
- unsigned long flags;
-
- if (console_trylock())
- return 1;
-
- printk_safe_enter_irqsave(flags);
-
- raw_spin_lock(&console_owner_lock);
- owner = READ_ONCE(console_owner);
- waiter = READ_ONCE(console_waiter);
- if (!waiter && owner && owner != current) {
- WRITE_ONCE(console_waiter, true);
- spin = true;
- }
- raw_spin_unlock(&console_owner_lock);
-
- /*
- * If there is an active printk() writing to the
- * consoles, instead of having it write our data too,
- * see if we can offload that load from the active
- * printer, and do some printing ourselves.
- * Go into a spin only if there isn't already a waiter
- * spinning, and there is an active printer, and
- * that active printer isn't us (recursive printk?).
- */
- if (!spin) {
- printk_safe_exit_irqrestore(flags);
- return 0;
- }
-
- /* We spin waiting for the owner to release us */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
- /* Owner will clear console_waiter on hand off */
- while (READ_ONCE(console_waiter))
- cpu_relax();
- spin_release(&console_owner_dep_map, _THIS_IP_);
-
- printk_safe_exit_irqrestore(flags);
- /*
- * The owner passed the console lock to us.
- * Since we did not spin on console lock, annotate
- * this as a trylock. Otherwise lockdep will
- * complain.
- */
- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
-
- return 1;
-}
-
-/*
- * Call the console drivers, asking them to write out
- * log_buf[start] to log_buf[end - 1].
- * The console_lock must be held.
- */
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len)
-{
- static char dropped_text[64];
- size_t dropped_len = 0;
- struct console *con;
-
- trace_console_rcuidle(text, len);
-
- if (!console_drivers)
- return;
-
- if (console_dropped) {
- dropped_len = snprintf(dropped_text, sizeof(dropped_text),
- "** %lu printk messages dropped **\n",
- console_dropped);
- console_dropped = 0;
- }
-
- for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
- if (!(con->flags & CON_ENABLED))
- continue;
- if (!con->write)
- continue;
- if (!cpu_online(smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
- continue;
- if (kernel_sync_mode())
- continue;
- if (con->flags & CON_EXTENDED)
- con->write(con, ext_text, ext_len);
- else {
- if (dropped_len)
- con->write(con, dropped_text, dropped_len);
- con->write(con, text, len);
- }
- }
-}
-
#ifdef CONFIG_PRINTK_NMI
#define NUM_RECURSION_CTX 2
#else
@@ -2284,39 +2091,16 @@ asmlinkage int vprintk_emit(int facility
const char *fmt, va_list args)
{
int printed_len;
- bool in_sched = false;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
return 0;
- if (level == LOGLEVEL_SCHED) {
+ if (level == LOGLEVEL_SCHED)
level = LOGLEVEL_DEFAULT;
- in_sched = true;
- }
-
- printk_delay(level);
printed_len = vprintk_store(facility, level, dev_info, fmt, args);
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched) {
- /*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
- /*
- * Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
- */
- if (console_trylock_spinning())
- console_unlock();
- preempt_enable();
- }
-
wake_up_klogd();
return printed_len;
}
@@ -2368,38 +2152,158 @@ asmlinkage __visible int printk(const ch
}
EXPORT_SYMBOL(printk);
-#else /* CONFIG_PRINTK */
+static int printk_kthread_func(void *data)
+{
+ struct console *con = data;
+ unsigned long dropped = 0;
+ char *dropped_text = NULL;
+ struct printk_info info;
+ struct printk_record r;
+ char *ext_text = NULL;
+ size_t dropped_len;
+ int ret = -ENOMEM;
+ char *text = NULL;
+ char *write_text;
+ u64 printk_seq;
+ size_t len;
+ int error;
+ u64 seq;
-#define printk_time false
+ if (con->flags & CON_EXTENDED) {
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ if (!ext_text)
+ goto out;
+ }
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ dropped_text = kmalloc(64, GFP_KERNEL);
+ if (!text || !dropped_text)
+ goto out;
-#define prb_read_valid(rb, seq, r) false
-#define prb_first_valid_seq(rb) 0
+ if (con->flags & CON_EXTENDED)
+ write_text = ext_text;
+ else
+ write_text = text;
-#define kernel_sync_mode() false
+ seq = atomic64_read(&con->printk_seq);
-static u64 syslog_seq;
-static atomic64_t console_seq = ATOMIC64_INIT(0);
-static u64 exclusive_console_stop_seq;
-static unsigned long console_dropped;
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
+ for (;;) {
+ error = wait_event_interruptible(log_wait,
+ prb_read_valid(prb, seq, &r) || kthread_should_stop());
+
+ if (kthread_should_stop())
+ break;
+
+ if (error)
+ continue;
+
+ if (seq != r.info->seq) {
+ dropped += r.info->seq - seq;
+ seq = r.info->seq;
+ }
+
+ seq++;
+
+ if (!(con->flags & CON_ENABLED))
+ continue;
+
+ if (suppress_message_printing(r.info->level))
+ continue;
+
+ if (con->flags & CON_EXTENDED) {
+ len = info_print_ext_header(ext_text,
+ CONSOLE_EXT_LOG_MAX,
+ r.info);
+ len += msg_print_ext_body(ext_text + len,
+ CONSOLE_EXT_LOG_MAX - len,
+ &r.text_buf[0], r.info->text_len,
+ &r.info->dev_info);
+ } else {
+ len = record_print_text(&r,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time);
+ }
+
+ printk_seq = atomic64_read(&con->printk_seq);
+
+ console_lock();
+ console_may_schedule = 0;
-static size_t record_print_text(const struct printk_record *r,
- bool syslog, bool time)
+ if (kernel_sync_mode() && con->write_atomic) {
+ console_unlock();
+ break;
+ }
+
+ if (!(con->flags & CON_EXTENDED) && dropped) {
+ dropped_len = snprintf(dropped_text, 64,
+ "** %lu printk messages dropped **\n",
+ dropped);
+ dropped = 0;
+
+ con->write(con, dropped_text, dropped_len);
+ printk_delay(r.info->level);
+ }
+
+ con->write(con, write_text, len);
+ if (len)
+ printk_delay(r.info->level);
+
+ atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq);
+
+ console_unlock();
+ }
+out:
+ kfree(dropped_text);
+ kfree(text);
+ kfree(ext_text);
+ pr_info("%sconsole [%s%d]: printing thread stopped\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
+ return ret;
+}
+
+/* Must be called within console_lock(). */
+static void start_printk_kthread(struct console *con)
{
- return 0;
+ con->thread = kthread_run(printk_kthread_func, con,
+ "pr/%s%d", con->name, con->index);
+ if (IS_ERR(con->thread)) {
+ pr_err("%sconsole [%s%d]: unable to start printing thread\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
+ return;
+ }
+ pr_info("%sconsole [%s%d]: printing thread started\n",
+ (con->flags & CON_BOOT) ? "boot" : "",
+ con->name, con->index);
}
-static ssize_t info_print_ext_header(char *buf, size_t size,
- struct printk_info *info)
+
+/* protected by console_lock */
+static bool kthreads_started;
+
+/* Must be called within console_lock(). */
+static void console_try_thread(struct console *con)
{
- return 0;
+ if (kthreads_started) {
+ start_printk_kthread(con);
+ return;
+ }
+
+ /*
+ * The printing threads have not been started yet. If this console
+ * can print synchronously, print all unprinted messages.
+ */
+ if (console_can_sync(con))
+ print_sync_until(con, prb_next_seq(prb));
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *text, size_t text_len,
- struct dev_printk_info *dev_info) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
-static bool suppress_message_printing(int level) { return false; }
+
+#else /* CONFIG_PRINTK */
+
+#define prb_first_valid_seq(rb) 0
+#define prb_next_seq(rb) 0
+
+#define console_try_thread(con)
#endif /* CONFIG_PRINTK */
@@ -2644,36 +2548,6 @@ int is_console_locked(void)
}
EXPORT_SYMBOL(is_console_locked);
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(void)
-{
- if (kernel_sync_mode())
- return false;
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
-}
-
/**
* console_unlock - unlock the console system
*
@@ -2690,131 +2564,14 @@ static inline int can_use_console(void)
*/
void console_unlock(void)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[CONSOLE_LOG_MAX];
- bool do_cond_resched, retry;
- struct printk_info info;
- struct printk_record r;
- u64 seq;
-
if (console_suspended) {
up_console_sem();
return;
}
- prb_rec_init_rd(&r, &info, text, sizeof(text));
-
- /*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
- *
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the "again" goto label.
- */
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
-
- /*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
- */
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
- }
-
- for (;;) {
- size_t ext_len = 0;
- size_t len;
-
-skip:
- seq = atomic64_read(&console_seq);
- if (!prb_read_valid(prb, seq, &r))
- break;
-
- if (seq != r.info->seq) {
- console_dropped += r.info->seq - seq;
- atomic64_set(&console_seq, r.info->seq);
- seq = r.info->seq;
- }
-
- if (suppress_message_printing(r.info->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- atomic64_set(&console_seq, seq + 1);
- goto skip;
- }
-
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
-
- /*
- * Handle extended console text first because later
- * record_print_text() will modify the record buffer in-place.
- */
- if (nr_ext_console_drivers) {
- ext_len = info_print_ext_header(ext_text,
- sizeof(ext_text),
- r.info);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- &r.text_buf[0],
- r.info->text_len,
- &r.info->dev_info);
- }
- len = record_print_text(&r,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time);
- atomic64_set(&console_seq, seq + 1);
-
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
-
- stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
-
- if (console_lock_spinning_disable_and_check())
- return;
-
- if (do_cond_resched)
- cond_resched();
- }
-
console_locked = 0;
up_console_sem();
-
- /*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
- */
- retry = prb_read_valid(prb, atomic64_read(&console_seq), NULL);
- if (retry && console_trylock())
- goto again;
}
EXPORT_SYMBOL(console_unlock);
@@ -2864,18 +2621,20 @@ void console_unblank(void)
*/
void console_flush_on_panic(enum con_flush_mode mode)
{
- /*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
- */
- console_trylock();
+ struct console *c;
+ u64 seq;
+
+ if (!console_trylock())
+ return;
+
console_may_schedule = 0;
- if (mode == CONSOLE_REPLAY_ALL)
- atomic64_set(&console_seq, prb_first_valid_seq(prb));
+ if (mode == CONSOLE_REPLAY_ALL) {
+ seq = prb_first_valid_seq(prb);
+ for_each_console(c)
+ atomic64_set(&c->printk_seq, seq);
+ }
+
console_unlock();
}
@@ -3010,7 +2769,6 @@ static int try_enable_new_console(struct
*/
void register_console(struct console *newcon)
{
- unsigned long flags;
struct console *bcon = NULL;
int err;
@@ -3034,6 +2792,8 @@ void register_console(struct console *ne
}
}
+ newcon->thread = NULL;
+
if (console_drivers && console_drivers->flags & CON_BOOT)
bcon = console_drivers;
@@ -3098,27 +2858,12 @@ void register_console(struct console *ne
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- *
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = atomic64_read(&console_seq);
+ if (newcon->flags & CON_PRINTBUFFER)
+ atomic64_set(&newcon->printk_seq, 0);
+ else
+ atomic64_set(&newcon->printk_seq, prb_next_seq(prb));
- /* Get a consistent copy of @syslog_seq. */
- spin_lock_irqsave(&syslog_lock, flags);
- atomic64_set(&console_seq, syslog_seq);
- spin_unlock_irqrestore(&syslog_lock, flags);
- }
+ console_try_thread(newcon);
console_unlock();
console_sysfs_notify();
@@ -3192,6 +2937,9 @@ int unregister_console(struct console *c
console_unlock();
console_sysfs_notify();
+ if (console->thread && !IS_ERR(console->thread))
+ kthread_stop(console->thread);
+
if (console->exit)
res = console->exit(console);
@@ -3274,6 +3022,15 @@ static int __init printk_late_init(void)
unregister_console(con);
}
}
+
+#ifdef CONFIG_PRINTK
+ console_lock();
+ for_each_console(con)
+ start_printk_kthread(con);
+ kthreads_started = true;
+ console_unlock();
+#endif
+
ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
console_cpu_notify);
WARN_ON(ret < 0);
@@ -3289,7 +3046,6 @@ late_initcall(printk_late_init);
* Delayed printk version, for scheduler-internal messages:
*/
#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_OUTPUT 0x02
static DEFINE_PER_CPU(int, printk_pending);
@@ -3297,14 +3053,8 @@ static void wake_up_klogd_work_func(stru
{
int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
- }
-
if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
+ wake_up_interruptible_all(&log_wait);
}
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
@@ -3325,13 +3075,6 @@ void wake_up_klogd(void)
void defer_console_output(void)
{
- if (!printk_percpu_data_ready())
- return;
-
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
}
int vprintk_deferred(const char *fmt, va_list args)

View File

@@ -0,0 +1,407 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:08 +0106
Subject: [PATCH 26/28] printk: remove deferred printing
Since printing occurs either atomically or from the printing
kthread, there is no need for any deferring or tracking possible
recursion paths. Remove all printk context tracking.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/kernel/smp.c | 2 -
arch/powerpc/kexec/crash.c | 3 -
include/linux/hardirq.h | 2 -
include/linux/printk.h | 12 ------
kernel/printk/Makefile | 1
kernel/printk/internal.h | 70 -----------------------------------
kernel/printk/printk.c | 58 +++++++++++------------------
kernel/printk/printk_safe.c | 88 --------------------------------------------
kernel/trace/trace.c | 2 -
9 files changed, 22 insertions(+), 216 deletions(-)
delete mode 100644 kernel/printk/internal.h
delete mode 100644 kernel/printk/printk_safe.c
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -671,9 +671,7 @@ static void do_handle_IPI(int ipinr)
break;
case IPI_CPU_BACKTRACE:
- printk_nmi_enter();
nmi_cpu_backtrace(get_irq_regs());
- printk_nmi_exit();
break;
default:
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -311,9 +311,6 @@ void default_machine_crash_shutdown(stru
unsigned int i;
int (*old_handler)(struct pt_regs *regs);
- /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */
- printk_nmi_enter();
-
/*
* This function is only called after the system
* has panicked or is otherwise in a critical state.
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -115,7 +115,6 @@ extern void rcu_nmi_exit(void);
do { \
lockdep_off(); \
arch_nmi_enter(); \
- printk_nmi_enter(); \
BUG_ON(in_nmi() == NMI_MASK); \
__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
} while (0)
@@ -134,7 +133,6 @@ extern void rcu_nmi_exit(void);
do { \
BUG_ON(!in_nmi()); \
__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
- printk_nmi_exit(); \
arch_nmi_exit(); \
lockdep_on(); \
} while (0)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -155,18 +155,6 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-#ifdef CONFIG_PRINTK_NMI
-extern void printk_nmi_enter(void);
-extern void printk_nmi_exit(void);
-extern void printk_nmi_direct_enter(void);
-extern void printk_nmi_direct_exit(void);
-#else
-static inline void printk_nmi_enter(void) { }
-static inline void printk_nmi_exit(void) { }
-static inline void printk_nmi_direct_enter(void) { }
-static inline void printk_nmi_direct_exit(void) { }
-#endif /* PRINTK_NMI */
-
struct dev_printk_info;
#ifdef CONFIG_PRINTK
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
--- a/kernel/printk/internal.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * internal.h - printk internal definitions
- */
-#include <linux/percpu.h>
-
-#ifdef CONFIG_PRINTK
-
-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
-
-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
-
-__printf(4, 0)
-int vprintk_store(int facility, int level,
- const struct dev_printk_info *dev_info,
- const char *fmt, va_list args);
-
-__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-
-bool printk_percpu_data_ready(void);
-
-#define printk_safe_enter_irqsave(flags) \
- do { \
- local_irq_save(flags); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irqrestore(flags) \
- do { \
- __printk_safe_exit(); \
- local_irq_restore(flags); \
- } while (0)
-
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
-
-void defer_console_output(void);
-
-#else
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
-
-/*
- * In !PRINTK builds we still export console_sem
- * semaphore and some of console functions (console_unlock()/etc.), so
- * printk-safe must preserve the existing local IRQ guarantees.
- */
-#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
-
-static inline bool printk_percpu_data_ready(void) { return false; }
-#endif /* CONFIG_PRINTK */
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/kthread.h>
+#include <linux/kdb.h>
#include <linux/clocksource.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -60,7 +61,6 @@
#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
-#include "internal.h"
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -227,19 +227,7 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- int lock_failed;
- unsigned long flags;
-
- /*
- * Here and in __up_console_sem() we need to be in safe mode,
- * because spindump/WARN/etc from under console ->lock will
- * deadlock in printk()->down_trylock_console_sem() otherwise.
- */
- printk_safe_enter_irqsave(flags);
- lock_failed = down_trylock(&console_sem);
- printk_safe_exit_irqrestore(flags);
-
- if (lock_failed)
+ if (down_trylock(&console_sem))
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
@@ -248,13 +236,9 @@ static int __down_trylock_console_sem(un
static void __up_console_sem(unsigned long ip)
{
- unsigned long flags;
-
mutex_release(&console_lock_dep_map, ip);
- printk_safe_enter_irqsave(flags);
up(&console_sem);
- printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)
@@ -426,7 +410,7 @@ static struct printk_ringbuffer *prb = &
*/
static bool __printk_percpu_data_ready __read_mostly;
-bool printk_percpu_data_ready(void)
+static bool printk_percpu_data_ready(void)
{
return __printk_percpu_data_ready;
}
@@ -1060,7 +1044,6 @@ void __init setup_log_buf(int early)
struct printk_record r;
size_t new_descs_size;
size_t new_infos_size;
- unsigned long flags;
char *new_log_buf;
unsigned int free;
u64 seq;
@@ -1958,9 +1941,9 @@ static u16 printk_sprint(char *text, u16
}
__printf(4, 0)
-int vprintk_store(int facility, int level,
- const struct dev_printk_info *dev_info,
- const char *fmt, va_list args)
+static int vprintk_store(int facility, int level,
+ const struct dev_printk_info *dev_info,
+ const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
struct prb_reserved_entry e;
@@ -2106,11 +2089,22 @@ asmlinkage int vprintk_emit(int facility
}
EXPORT_SYMBOL(vprintk_emit);
-int vprintk_default(const char *fmt, va_list args)
+__printf(1, 0)
+static int vprintk_default(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
-EXPORT_SYMBOL_GPL(vprintk_default);
+
+__printf(1, 0)
+static int vprintk_func(const char *fmt, va_list args)
+{
+#ifdef CONFIG_KGDB_KDB
+ /* Allow to pass printk() to kdb but avoid a recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+ return vprintk_default(fmt, args);
+}
asmlinkage int vprintk(const char *fmt, va_list args)
{
@@ -3073,18 +3067,10 @@ void wake_up_klogd(void)
preempt_enable();
}
-void defer_console_output(void)
+__printf(1, 0)
+static int vprintk_deferred(const char *fmt, va_list args)
{
-}
-
-int vprintk_deferred(const char *fmt, va_list args)
-{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
int printk_deferred(const char *fmt, ...)
--- a/kernel/printk/printk_safe.c
+++ /dev/null
@@ -1,88 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * printk_safe.c - Safe printk for printk-deadlock-prone contexts
- */
-
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
-#include <linux/kdb.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/irq_work.h>
-#include <linux/printk.h>
-#include <linux/kprobes.h>
-
-#include "internal.h"
-
-static DEFINE_PER_CPU(int, printk_context);
-
-#ifdef CONFIG_PRINTK_NMI
-void noinstr printk_nmi_enter(void)
-{
- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-void noinstr printk_nmi_exit(void)
-{
- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
-{
- this_cpu_inc(printk_context);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
-{
- this_cpu_dec(printk_context);
-}
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
-#ifdef CONFIG_KGDB_KDB
- /* Allow to pass printk() to kdb but avoid a recursion. */
- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
- return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
-#endif
-
- /*
- * Use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if (this_cpu_read(printk_context) &
- (PRINTK_NMI_DIRECT_CONTEXT_MASK |
- PRINTK_NMI_CONTEXT_MASK |
- PRINTK_SAFE_CONTEXT_MASK)) {
- int len;
-
- printk_safe_enter_irqsave(flags);
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
- printk_safe_exit_irqrestore(flags);
- defer_console_output();
- return len;
- }
-
- /* No obstacles. */
- return vprintk_default(fmt, args);
-}
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9325,7 +9325,6 @@ void ftrace_dump(enum ftrace_dump_mode o
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -9405,7 +9404,6 @@ void ftrace_dump(enum ftrace_dump_mode o
atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);

View File

@@ -0,0 +1,67 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:09 +0106
Subject: [PATCH 27/28] printk: add console handover
If earlyprintk is used, a boot console will print directly to the
console immediately. The boot console will unregister itself as soon
as a non-boot console registers. However, the non-boot console does
not begin printing until its kthread has started. Since this happens
much later, there is a long pause in the console output. If the
ringbuffer is small, messages could even be dropped during the
pause.
Add a new CON_HANDOVER console flag to be used internally by printk
in order to track which non-boot console took over from a boot
console. If handover consoles have implemented write_atomic(), they
are allowed to print directly to the console until their kthread can
take over.
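The resulting check mirrors the hunks below: a handed-over console may
print synchronously only while it still has no printing thread
(illustrative condensation, not a function from the patch):

	/* Sketch: direct printing is allowed for a former boot console
	 * until its kthread is up.
	 */
	static bool example_can_print_directly(struct console *con)
	{
		return con->write_atomic &&
		       (con->flags & CON_HANDOVER) && !con->thread;
	}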
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 1 +
kernel/printk/printk.c | 8 +++++++-
2 files changed, 8 insertions(+), 1 deletion(-)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -137,6 +137,7 @@ static inline int con_debug_leave(void)
#define CON_ANYTIME (16) /* Safe to call when cpu is offline */
#define CON_BRL (32) /* Used for a braille device */
#define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */
+#define CON_HANDOVER (128) /* Device was previously a boot console. */
struct console {
char name[16];
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1726,6 +1726,8 @@ static bool console_can_sync(struct cons
return false;
if (con->write_atomic && kernel_sync_mode())
return true;
+ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread)
+ return true;
if (con->write && (con->flags & CON_BOOT) && !con->thread)
return true;
return false;
@@ -1737,6 +1739,8 @@ static bool call_sync_console_driver(str
return false;
if (con->write_atomic && kernel_sync_mode())
con->write_atomic(con, text, text_len);
+ else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread)
+ con->write_atomic(con, text, text_len);
else if (con->write && (con->flags & CON_BOOT) && !con->thread)
con->write(con, text, text_len);
else
@@ -2829,8 +2833,10 @@ void register_console(struct console *ne
* the real console are the same physical device, it's annoying to
* see the beginning boot messages twice
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
+ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
newcon->flags &= ~CON_PRINTBUFFER;
+ newcon->flags |= CON_HANDOVER;
+ }
/*
* Put this console in the list - keep the

View File

@@ -0,0 +1,198 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 30 Nov 2020 01:42:10 +0106
Subject: [PATCH 28/28] printk: add pr_flush()
Provide a function to allow waiting for console printers to catch
up to the latest logged message.
Use pr_flush() to give console printers a chance to finish in
critical situations if no atomic console is available. For now
pr_flush() is only used in the most common error paths:
panic(), print_oops_end_marker(), report_bug(), kmsg_dump().
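An illustrative usage sketch (hypothetical call site, not part of this
series): log a fatal condition, then allow up to one second of stalled
printing progress before continuing down the error path:

    pr_emerg("frobnicator: unrecoverable fault, stopping\n");
    /* wait at most 1000ms, resetting the timeout on forward progress */
    pr_flush(1000, true);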
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk.h | 2 +
kernel/panic.c | 28 ++++++++++-------
kernel/printk/printk.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
lib/bug.c | 1
4 files changed, 99 insertions(+), 11 deletions(-)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -481,6 +481,8 @@ extern int kptr_restrict;
no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif
+bool pr_flush(int timeout_ms, bool reset_on_progress);
+
/*
* ratelimited messages with local ratelimit_state,
* no local ratelimit_state used in the !PRINTK case
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,12 +177,28 @@ static void panic_print_sys_info(void)
void panic(const char *fmt, ...)
{
static char buf[1024];
+ va_list args2;
va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ console_verbose();
+ pr_emerg("Kernel panic - not syncing:\n");
+ va_start(args2, fmt);
+ va_copy(args, args2);
+ vprintk(fmt, args2);
+ va_end(args2);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
+ dump_stack();
+#endif
+ pr_flush(1000, true);
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -213,24 +229,13 @@ void panic(const char *fmt, ...)
if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
panic_smp_self_stop();
- console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
- pr_emerg("Kernel panic - not syncing: %s\n", buf);
-#ifdef CONFIG_DEBUG_BUGVERBOSE
- /*
- * Avoid nested stack-dumping if a panic occurs during oops processing
- */
- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
- dump_stack();
-#endif
-
/*
* If kgdb is enabled, give it a chance to run before we stop all
* the other CPUs or else we won't be able to debug processes left
@@ -552,6 +557,7 @@ static void print_oops_end_marker(void)
{
init_oops_id();
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
+ pr_flush(1000, true);
}
/*
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3228,6 +3228,12 @@ void kmsg_dump(enum kmsg_dump_reason rea
sync_mode = true;
pr_info("enabled sync mode\n");
}
+
+ /*
+ * Give the printing threads time to flush, allowing up to
+ * 1s of no printing forward progress before giving up.
+ */
+ pr_flush(1000, true);
}
rcu_read_lock();
@@ -3507,3 +3513,76 @@ void console_atomic_unlock(unsigned int
prb_unlock(&printk_cpulock, flags);
}
EXPORT_SYMBOL(console_atomic_unlock);
+
+static void pr_msleep(bool may_sleep, int ms)
+{
+ if (may_sleep) {
+ msleep(ms);
+ } else {
+ while (ms--)
+ udelay(1000);
+ }
+}
+
+/**
+ * pr_flush() - Wait for printing threads to catch up.
+ *
+ * @timeout_ms: The maximum time (in ms) to wait.
+ * @reset_on_progress: Reset the timeout if forward progress is seen.
+ *
+ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
+ * represents infinite waiting.
+ *
+ * If @reset_on_progress is true, the timeout will be reset whenever any
+ * printer has been seen to make some forward progress.
+ *
+ * Context: Any context.
+ * Return: true if all enabled printers are caught up.
+ */
+bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ int remaining = timeout_ms;
+ struct console *con;
+ u64 last_diff = 0;
+ bool may_sleep;
+ u64 printk_seq;
+ u64 diff;
+ u64 seq;
+
+ may_sleep = (preemptible() && !in_softirq());
+
+ seq = prb_next_seq(prb);
+
+ for (;;) {
+ diff = 0;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ printk_seq = atomic64_read(&con->printk_seq);
+ if (printk_seq < seq)
+ diff += seq - printk_seq;
+ }
+
+ if (diff != last_diff && reset_on_progress)
+ remaining = timeout_ms;
+
+ if (!diff || remaining == 0)
+ break;
+
+ if (remaining < 0) {
+ pr_msleep(may_sleep, 100);
+ } else if (remaining < 100) {
+ pr_msleep(may_sleep, remaining);
+ remaining = 0;
+ } else {
+ pr_msleep(may_sleep, 100);
+ remaining -= 100;
+ }
+
+ last_diff = diff;
+ }
+
+ return (diff == 0);
+}
+EXPORT_SYMBOL(pr_flush);
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -205,6 +205,7 @@ enum bug_trap_type report_bug(unsigned l
else
pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n",
(void *)bugaddr);
+ pr_flush(1000, true);
return BUG_TRAP_TYPE_BUG;
}

View File

@@ -0,0 +1,76 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 18 Feb 2021 18:31:26 +0100
Subject: [PATCH] kcov: Remove kcov include from sched.h and move it to its
users.
The recent addition of in_serving_softirq() to kcov.h results in
compile failure on PREEMPT_RT because it requires
task_struct::softirq_disable_cnt. This is not available if kcov.h is
included from sched.h.
There is no need to include kcov.h from sched.h. All users except those in
net/ already include the kcov.h header file.
Move the include of the kcov.h header from sched.h to its users.
Additionally include sched.h from kcov.h to ensure that everything
task_struct related is available.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kcov.h | 1 +
include/linux/sched.h | 1 -
net/core/skbuff.c | 1 +
net/mac80211/iface.c | 1 +
net/mac80211/rx.c | 1 +
5 files changed, 4 insertions(+), 1 deletion(-)
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_KCOV_H
#define _LINUX_KCOV_H
+#include <linux/sched.h>
#include <uapi/linux/kcov.h>
struct task_struct;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -14,7 +14,6 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
-#include <linux/kcov.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -60,6 +60,7 @@
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
+#include <linux/kcov.h>
#include <net/protocol.h>
#include <net/dst.h>
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -15,6 +15,7 @@
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
+#include <linux/kcov.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include "ieee80211_i.h"
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -17,6 +17,7 @@
#include <linux/etherdevice.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <linux/kcov.h>
#include <linux/bitops.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>

View File

@@ -0,0 +1,43 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 3 Jul 2018 18:19:48 +0200
Subject: [PATCH] cgroup: use irqsave in cgroup_rstat_flush_locked()
All callers of cgroup_rstat_flush_locked() acquire cgroup_rstat_lock
either with spin_lock_irq() or spin_lock_irqsave().
cgroup_rstat_flush_locked() itself acquires cgroup_rstat_cpu_lock which
is a raw_spin_lock. This lock is also acquired in cgroup_rstat_updated()
in IRQ context and therefore requires _irqsave() locking suffix in
cgroup_rstat_flush_locked().
Since there is no difference between spinlock_t and raw_spinlock_t
on !RT, lockdep does not complain here. On RT lockdep complains because
the interrupts were not disabled here and a deadlock is possible.
Acquire the raw_spin_lock_t with disabled interrupts.
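Condensed as an illustrative sketch, the resulting locking pattern
(cpu_lock as in the hunk below):

    unsigned long flags;

    /* Disabling interrupts makes the lock safe against the IRQ-context
     * taker in cgroup_rstat_updated().
     */
    raw_spin_lock_irqsave(cpu_lock, flags);
    /* ... pop updated cgroups and flush their stats ... */
    raw_spin_unlock_irqrestore(cpu_lock, flags);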
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/cgroup/rstat.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -149,8 +149,9 @@ static void cgroup_rstat_flush_locked(st
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -162,7 +163,7 @@ static void cgroup_rstat_flush_locked(st
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||

View File

@@ -20,7 +20,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/mm/workingset.c
+++ b/mm/workingset.c
-@@ -367,6 +367,8 @@ static struct list_lru shadow_nodes;
+@@ -430,6 +430,8 @@ static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
@@ -29,7 +29,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
-@@ -375,7 +377,8 @@ void workingset_update_node(struct xa_no
+@@ -438,7 +440,8 @@ void workingset_update_node(struct xa_no
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/

View File

@@ -0,0 +1,138 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 14 Aug 2020 18:53:34 +0200
Subject: [PATCH] shmem: Use raw_spinlock_t for ->stat_lock
Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is
per-CPU. Access here is serialized by disabling preemption. If the pool is
empty, it gets reloaded from `->next_ino'. Access here is serialized by
->stat_lock which is a spinlock_t and cannot be acquired with disabled
preemption.
One way around this would be to make the per-CPU ino_batch struct containing
the inode number a local_lock_t.
Another solution is to promote ->stat_lock to a raw_spinlock_t. The critical
sections are short. The mpol_put() should be moved outside of the critical
section to avoid invoking the destructor with disabled preemption.
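The deferred-put pattern from the shmem_reconfigure() hunk below, condensed
as a sketch:

    struct mempolicy *mpol = NULL;

    raw_spin_lock(&sbinfo->stat_lock);
    mpol = sbinfo->mpol;          /* take the old policy out ... */
    sbinfo->mpol = ctx->mpol;     /* ... new one transfers its initial ref */
    raw_spin_unlock(&sbinfo->stat_lock);
    mpol_put(mpol);               /* destructor now runs preemptible */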
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/shmem_fs.h | 2 +-
mm/shmem.c | 31 +++++++++++++++++--------------
2 files changed, 18 insertions(+), 15 deletions(-)
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -31,7 +31,7 @@ struct shmem_sb_info {
struct percpu_counter used_blocks; /* How many are allocated */
unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */
- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
+ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
umode_t mode; /* Mount mode for root directory */
unsigned char huge; /* Whether to try for hugepages */
kuid_t uid; /* Mount uid for root directory */
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct su
ino_t ino;
if (!(sb->s_flags & SB_KERNMOUNT)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_inodes) {
if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
@@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct su
}
*inop = ino;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
} else if (inop) {
/*
* __shmem_file_setup, one of our callers, is lock-free: it
@@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct su
* to worry about things like glibc compatibility.
*/
ino_t *next_ino;
+
next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
ino = *next_ino;
if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
ino = sbinfo->next_ino;
sbinfo->next_ino += SHMEM_INO_BATCH;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
if (unlikely(is_zero_ino(ino)))
ino++;
}
@@ -341,9 +342,9 @@ static void shmem_free_inode(struct supe
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -1479,10 +1480,10 @@ static struct mempolicy *shmem_get_sbmpo
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -3587,9 +3588,10 @@ static int shmem_reconfigure(struct fs_c
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
+ struct mempolicy *mpol = NULL;
const char *err;
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
@@ -3634,14 +3636,15 @@ static int shmem_reconfigure(struct fs_c
* Preserve previous mempolicy unless mpol remount option was specified.
*/
if (ctx->mpol) {
- mpol_put(sbinfo->mpol);
+ mpol = sbinfo->mpol;
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return invalfc(fc, "%s", err);
}
@@ -3758,7 +3761,7 @@ static int shmem_fill_super(struct super
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
spin_lock_init(&sbinfo->shrinklist_lock);

View File

@@ -0,0 +1,40 @@
Subject: net: Move lockdep where it belongs
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 08 Sep 2020 07:32:20 +0200
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
net/core/sock.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3050,12 +3050,11 @@ void lock_sock_nested(struct sock *sk, i
if (sk->sk_lock.owned)
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
@@ -3104,13 +3103,12 @@ bool lock_sock_fast(struct sock *sk) __a
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
__acquire(&sk->sk_lock.slock);
- local_bh_enable();
return true;
}
EXPORT_SYMBOL(lock_sock_fast);

View File

@@ -0,0 +1,99 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 12 Oct 2020 17:33:54 +0200
Subject: [PATCH] tcp: Remove superfluous BH-disable around listening_hash
Commit
9652dc2eb9e40 ("tcp: relax listening_hash operations")
removed the need to disable bottom half while acquiring
listening_hash.lock. There are still two callers left which disable
bottom half before the lock is acquired.
Drop local_bh_disable() around __inet_hash() which acquires
listening_hash->lock, invoke inet_ehash_nolisten() with disabled BH.
inet_unhash() conditionally acquires listening_hash->lock.
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/linux-rt-users/12d6f9879a97cd56c09fb53dee343cbb14f7f1f7.camel@gmx.de/
Link: https://lkml.kernel.org/r/X9CheYjuXWc75Spa@hirez.programming.kicks-ass.net
---
net/ipv4/inet_hashtables.c | 19 ++++++++++++-------
net/ipv6/inet6_hashtables.c | 5 +----
2 files changed, 13 insertions(+), 11 deletions(-)
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -635,7 +635,9 @@ int __inet_hash(struct sock *sk, struct
int err = 0;
if (sk->sk_state != TCP_LISTEN) {
+ local_bh_disable();
inet_ehash_nolisten(sk, osk, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
@@ -667,11 +669,8 @@ int inet_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}
@@ -682,17 +681,20 @@ void inet_unhash(struct sock *sk)
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb = NULL;
spinlock_t *lock;
+ bool state_listen;
if (sk_unhashed(sk))
return;
if (sk->sk_state == TCP_LISTEN) {
+ state_listen = true;
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
+ spin_lock(&ilb->lock);
} else {
+ state_listen = false;
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ spin_lock_bh(lock);
}
- spin_lock_bh(lock);
if (sk_unhashed(sk))
goto unlock;
@@ -705,7 +707,10 @@ void inet_unhash(struct sock *sk)
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
unlock:
- spin_unlock_bh(lock);
+ if (state_listen)
+ spin_unlock(&ilb->lock);
+ else
+ spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk)
{
int err = 0;
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
+ if (sk->sk_state != TCP_CLOSE)
err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
return err;
}

View File

@@ -0,0 +1,40 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 15 Feb 2021 18:44:12 +0100
Subject: [PATCH] smp: Wake ksoftirqd on PREEMPT_RT instead of do_softirq().
The softirq implementation on PREEMPT_RT does not provide do_softirq().
The other user of do_softirq() is replaced with a local_bh_disable()
+ enable() around the possible raise-softirq invocation. This cannot be
done here because migration_cpu_stop() is invoked with disabled
preemption.
Wake the softirq thread on PREEMPT_RT if there are any pending softirqs.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/smp.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -450,8 +450,18 @@ void flush_smp_call_function_from_idle(v
local_irq_save(flags);
flush_smp_call_function_queue(true);
- if (local_softirq_pending())
- do_softirq();
+
+ if (local_softirq_pending()) {
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ do_softirq();
+ } else {
+ struct task_struct *ksoftirqd = this_cpu_ksoftirqd();
+
+ if (ksoftirqd && ksoftirqd->state != TASK_RUNNING)
+ wake_up_process(ksoftirqd);
+ }
+ }
local_irq_restore(flags);
}

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:04 +0100
Subject: [PATCH 01/20] tasklets: Replace barrier() with cpu_relax() in
tasklet_unlock_wait()
A barrier() in a tight loop which waits for something to happen on a remote
CPU is a pointless exercise. Replace it with cpu_relax() which allows HT
siblings to make progress.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -677,7 +677,8 @@ static inline void tasklet_unlock(struct
static inline void tasklet_unlock_wait(struct tasklet_struct *t)
{
- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
+ while (test_bit(TASKLET_STATE_RUN, &t->state))
+ cpu_relax();
}
#else
#define tasklet_trylock(t) 1

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:05 +0100
Subject: [PATCH 02/20] tasklets: Use static inlines for stub implementations
Inlines exist for a reason.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -681,9 +681,9 @@ static inline void tasklet_unlock_wait(s
cpu_relax();
}
#else
-#define tasklet_trylock(t) 1
-#define tasklet_unlock_wait(t) do { } while (0)
-#define tasklet_unlock(t) do { } while (0)
+static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
+static inline void tasklet_unlock(struct tasklet_struct *t) { }
+static inline void tasklet_unlock_wait(struct tasklet_struct *t) { }
#endif
extern void __tasklet_schedule(struct tasklet_struct *t);

View File

@@ -0,0 +1,61 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:06 +0100
Subject: [PATCH 03/20] tasklets: Provide tasklet_disable_in_atomic()
Replacing the spin wait loops in tasklet_unlock_wait() with
wait_var_event() is not possible as a handful of tasklet_disable()
invocations are happening in atomic context. All other invocations are in
teardown paths which can sleep.
Provide tasklet_disable_in_atomic() and tasklet_unlock_spin_wait() to
convert the few atomic use cases over, which allows changing
tasklet_disable() and tasklet_unlock_wait() in a later step.
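An illustrative sketch of the intended conversion in atomic context (driver
and names are hypothetical; the sundance, ath9k, eni and firewire patches
later in this series are the real conversions):

    static void foo_tx_timeout(struct net_device *dev)
    {
        struct foo_priv *np = netdev_priv(dev);

        /* Timer callback: atomic context, so the spin-wait variant. */
        tasklet_disable_in_atomic(&np->tx_tasklet);
        /* ... reset the transmit ring ... */
        tasklet_enable(&np->tx_tasklet);
    }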
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -680,10 +680,21 @@ static inline void tasklet_unlock_wait(s
while (test_bit(TASKLET_STATE_RUN, &t->state))
cpu_relax();
}
+
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &t->state))
+ cpu_relax();
+}
#else
static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
static inline void tasklet_unlock(struct tasklet_struct *t) { }
static inline void tasklet_unlock_wait(struct tasklet_struct *t) { }
+static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { }
#endif
extern void __tasklet_schedule(struct tasklet_struct *t);
@@ -708,6 +719,17 @@ static inline void tasklet_disable_nosyn
smp_mb__after_atomic();
}
+/*
+ * Do not use in new code. Disabling tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+static inline void tasklet_disable_in_atomic(struct tasklet_struct *t)
+{
+ tasklet_disable_nosync(t);
+ tasklet_unlock_spin_wait(t);
+ smp_mb();
+}
+
static inline void tasklet_disable(struct tasklet_struct *t)
{
tasklet_disable_nosync(t);

View File

@@ -0,0 +1,26 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:07 +0100
Subject: [PATCH 04/20] tasklets: Use spin wait in tasklet_disable()
temporarily
To ease the transition use spin waiting in tasklet_disable() until all
usage sites from atomic context have been cleaned up.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -733,7 +733,8 @@ static inline void tasklet_disable_in_at
static inline void tasklet_disable(struct tasklet_struct *t)
{
tasklet_disable_nosync(t);
- tasklet_unlock_wait(t);
+ /* Spin wait until all atomic users are converted */
+ tasklet_unlock_spin_wait(t);
smp_mb();
}

View File

@@ -0,0 +1,81 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 Mar 2021 09:42:08 +0100
Subject: [PATCH 05/20] tasklets: Replace spin wait in tasklet_unlock_wait()
tasklet_unlock_wait() spin waits for TASKLET_STATE_RUN to be cleared. This
is wasting CPU cycles in a tight loop which is especially painful in a
guest when the CPU running the tasklet is scheduled out.
tasklet_unlock_wait() is invoked from tasklet_kill() which is used in
teardown paths and not performance critical at all. Replace the spin wait
with wait_var_event().
There are no users of tasklet_unlock_wait() which are invoked from atomic
contexts. The usage in tasklet_disable() has been replaced temporarily with
the spin waiting variant until the atomic users are fixed up and will be
converted to the sleep wait variant later.
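The wake/wait pairing on &t->state introduced below, condensed into one
sketch:

    /* unlock path: clear the RUN bit, then wake sleepers on &t->state */
    smp_mb__before_atomic();
    clear_bit(TASKLET_STATE_RUN, &t->state);
    smp_mb__after_atomic();
    wake_up_var(&t->state);

    /* wait path: sleep until the RUN bit is observed clear */
    wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));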
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 13 ++-----------
kernel/softirq.c | 18 ++++++++++++++++++
2 files changed, 20 insertions(+), 11 deletions(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -669,17 +669,8 @@ static inline int tasklet_trylock(struct
return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
}
-static inline void tasklet_unlock(struct tasklet_struct *t)
-{
- smp_mb__before_atomic();
- clear_bit(TASKLET_STATE_RUN, &(t)->state);
-}
-
-static inline void tasklet_unlock_wait(struct tasklet_struct *t)
-{
- while (test_bit(TASKLET_STATE_RUN, &t->state))
- cpu_relax();
-}
+void tasklet_unlock(struct tasklet_struct *t);
+void tasklet_unlock_wait(struct tasklet_struct *t);
/*
* Do not use in new code. Waiting for tasklets from atomic contexts is
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,7 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/wait_bit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -619,6 +620,23 @@ void tasklet_kill(struct tasklet_struct
}
EXPORT_SYMBOL(tasklet_kill);
+#ifdef CONFIG_SMP
+void tasklet_unlock(struct tasklet_struct *t)
+{
+ smp_mb__before_atomic();
+ clear_bit(TASKLET_STATE_RUN, &t->state);
+ smp_mb__after_atomic();
+ wake_up_var(&t->state);
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock);
+
+void tasklet_unlock_wait(struct tasklet_struct *t)
+{
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state));
+}
+EXPORT_SYMBOL_GPL(tasklet_unlock_wait);
+#endif
+
void __init softirq_init(void)
{
int cpu;

View File

@@ -0,0 +1,67 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 Mar 2021 09:42:09 +0100
Subject: [PATCH 06/20] tasklets: Replace spin wait in tasklet_kill()
tasklet_kill() spin waits for TASKLET_STATE_SCHED to be cleared, invoking
yield() from inside the loop. yield() is an ill-defined mechanism and the
result might still be wasting CPU cycles in a tight loop which is
especially painful in a guest when the CPU running the tasklet is scheduled
out.
tasklet_kill() is used in teardown paths and not performance critical at
all. Replace the spin wait with wait_var_event().
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/softirq.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -530,6 +530,16 @@ void __tasklet_hi_schedule(struct taskle
}
EXPORT_SYMBOL(__tasklet_hi_schedule);
+static inline bool tasklet_clear_sched(struct tasklet_struct *t)
+{
+ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) {
+ wake_up_var(&t->state);
+ return true;
+ }
+
+ return false;
+}
+
static void tasklet_action_common(struct softirq_action *a,
struct tasklet_head *tl_head,
unsigned int softirq_nr)
@@ -549,8 +559,7 @@ static void tasklet_action_common(struct
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
- if (!test_and_clear_bit(TASKLET_STATE_SCHED,
- &t->state))
+ if (!tasklet_clear_sched(t))
BUG();
if (t->use_callback)
t->callback(t);
@@ -610,13 +619,11 @@ void tasklet_kill(struct tasklet_struct
if (in_interrupt())
pr_notice("Attempt to kill tasklet from interrupt\n");
- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- do {
- yield();
- } while (test_bit(TASKLET_STATE_SCHED, &t->state));
- }
+ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
+ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state));
+
tasklet_unlock_wait(t);
- clear_bit(TASKLET_STATE_SCHED, &t->state);
+ tasklet_clear_sched(t);
}
EXPORT_SYMBOL(tasklet_kill);

View File

@@ -0,0 +1,100 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:10 +0100
Subject: [PATCH 07/20] tasklets: Prevent tasklet_unlock_spin_wait() deadlock
on RT
tasklet_unlock_spin_wait() spin waits for the TASKLET_STATE_RUN bit in
the tasklet state to be cleared. This works on !RT nicely because the
corresponding execution can only happen on a different CPU.
On RT softirq processing is preemptible, therefore a task preempting the
softirq processing thread can spin forever.
Prevent this by invoking local_bh_disable()/enable() inside the loop. In
case that the softirq processing thread was preempted by the current task,
current will block on the local lock which yields the CPU to the preempted
softirq processing thread. If the tasklet is processed on a different CPU
then the local_bh_disable()/enable() pair is just a waste of processor
cycles.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 12 ++----------
kernel/softirq.c | 28 +++++++++++++++++++++++++++-
2 files changed, 29 insertions(+), 11 deletions(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -663,7 +663,7 @@ enum
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
static inline int tasklet_trylock(struct tasklet_struct *t)
{
return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
@@ -671,16 +671,8 @@ static inline int tasklet_trylock(struct
void tasklet_unlock(struct tasklet_struct *t);
void tasklet_unlock_wait(struct tasklet_struct *t);
+void tasklet_unlock_spin_wait(struct tasklet_struct *t);
-/*
- * Do not use in new code. Waiting for tasklets from atomic contexts is
- * error prone and should be avoided.
- */
-static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t)
-{
- while (test_bit(TASKLET_STATE_RUN, &t->state))
- cpu_relax();
-}
#else
static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; }
static inline void tasklet_unlock(struct tasklet_struct *t) { }
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -614,6 +614,32 @@ void tasklet_init(struct tasklet_struct
}
EXPORT_SYMBOL(tasklet_init);
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * Do not use in new code. Waiting for tasklets from atomic contexts is
+ * error prone and should be avoided.
+ */
+void tasklet_unlock_spin_wait(struct tasklet_struct *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /*
+ * Prevent a live lock when current preempted soft
+ * interrupt processing or prevents ksoftirqd from
+ * running. If the tasklet runs on a different CPU
+ * then this has no effect other than doing the BH
+ * disable/enable dance for nothing.
+ */
+ local_bh_disable();
+ local_bh_enable();
+ } else {
+ cpu_relax();
+ }
+ }
+}
+EXPORT_SYMBOL(tasklet_unlock_spin_wait);
+#endif
+
void tasklet_kill(struct tasklet_struct *t)
{
if (in_interrupt())
@@ -627,7 +653,7 @@ void tasklet_kill(struct tasklet_struct
}
EXPORT_SYMBOL(tasklet_kill);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
void tasklet_unlock(struct tasklet_struct *t)
{
smp_mb__before_atomic();

View File

@@ -0,0 +1,79 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:11 +0100
Subject: [PATCH 08/20] net: jme: Replace link-change tasklet with work
The link change tasklet disables the tasklets for tx/rx processing while
updating hw parameters and then enables the tasklets again.
This update can also be pushed into a workqueue where it can be performed
in preemptible context. This allows tasklet_disable() to become sleepable.
Replace the linkch_task tasklet with a work.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/net/ethernet/jme.c | 10 +++++-----
drivers/net/ethernet/jme.h | 2 +-
2 files changed, 6 insertions(+), 6 deletions(-)
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -1265,9 +1265,9 @@ jme_stop_shutdown_timer(struct jme_adapt
jwrite32f(jme, JME_APMC, apmc);
}
-static void jme_link_change_tasklet(struct tasklet_struct *t)
+static void jme_link_change_work(struct work_struct *work)
{
- struct jme_adapter *jme = from_tasklet(jme, t, linkch_task);
+ struct jme_adapter *jme = container_of(work, struct jme_adapter, linkch_task);
struct net_device *netdev = jme->dev;
int rc;
@@ -1510,7 +1510,7 @@ jme_intr_msi(struct jme_adapter *jme, u3
* all other events are ignored
*/
jwrite32(jme, JME_IEVE, intrstat);
- tasklet_schedule(&jme->linkch_task);
+ schedule_work(&jme->linkch_task);
goto out_reenable;
}
@@ -1832,7 +1832,6 @@ jme_open(struct net_device *netdev)
jme_clear_pm_disable_wol(jme);
JME_NAPI_ENABLE(jme);
- tasklet_setup(&jme->linkch_task, jme_link_change_tasklet);
tasklet_setup(&jme->txclean_task, jme_tx_clean_tasklet);
tasklet_setup(&jme->rxclean_task, jme_rx_clean_tasklet);
tasklet_setup(&jme->rxempty_task, jme_rx_empty_tasklet);
@@ -1920,7 +1919,7 @@ jme_close(struct net_device *netdev)
JME_NAPI_DISABLE(jme);
- tasklet_kill(&jme->linkch_task);
+ cancel_work_sync(&jme->linkch_task);
tasklet_kill(&jme->txclean_task);
tasklet_kill(&jme->rxclean_task);
tasklet_kill(&jme->rxempty_task);
@@ -3035,6 +3034,7 @@ jme_init_one(struct pci_dev *pdev,
atomic_set(&jme->rx_empty, 1);
tasklet_setup(&jme->pcc_task, jme_pcc_tasklet);
+ INIT_WORK(&jme->linkch_task, jme_link_change_work);
jme->dpi.cur = PCC_P1;
jme->reg_ghc = 0;
--- a/drivers/net/ethernet/jme.h
+++ b/drivers/net/ethernet/jme.h
@@ -411,7 +411,7 @@ struct jme_adapter {
struct tasklet_struct rxempty_task;
struct tasklet_struct rxclean_task;
struct tasklet_struct txclean_task;
- struct tasklet_struct linkch_task;
+ struct work_struct linkch_task;
struct tasklet_struct pcc_task;
unsigned long flags;
u32 reg_txcs;

View File

@@ -0,0 +1,32 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:12 +0100
Subject: [PATCH 09/20] net: sundance: Use tasklet_disable_in_atomic().
tasklet_disable() is used in the timer callback. This might be disentangled,
but without access to the hardware that's a bit risky.
Replace it with tasklet_disable_in_atomic() so tasklet_disable() can be
changed to a sleep wait once all remaining atomic users are converted.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Denis Kirjanov <kda@linux-powerpc.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: netdev@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/net/ethernet/dlink/sundance.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/dlink/sundance.c
+++ b/drivers/net/ethernet/dlink/sundance.c
@@ -963,7 +963,7 @@ static void tx_timeout(struct net_device
unsigned long flag;
netif_stop_queue(dev);
- tasklet_disable(&np->tx_tasklet);
+ tasklet_disable_in_atomic(&np->tx_tasklet);
iowrite16(0, ioaddr + IntrEnable);
printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x "
"TxFrameId %2.2x,"

View File

@@ -0,0 +1,41 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:13 +0100
Subject: [PATCH 10/20] ath9k: Use tasklet_disable_in_atomic()
All callers of ath9k_beacon_ensure_primary_slot() are preemptible /
acquire a mutex except for this callchain:
spin_lock_bh(&sc->sc_pcu_lock);
ath_complete_reset()
-> ath9k_calculate_summary_state()
-> ath9k_beacon_ensure_primary_slot()
It's unclear how that can be disentangled, so use tasklet_disable_in_atomic()
for now. This allows tasklet_disable() to become sleepable once the
remaining atomic users are cleaned up.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: ath9k-devel@qca.qualcomm.com
Cc: Kalle Valo <kvalo@codeaurora.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: linux-wireless@vger.kernel.org
Cc: netdev@vger.kernel.org
Acked-by: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/net/wireless/ath/ath9k/beacon.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/wireless/ath/ath9k/beacon.c
+++ b/drivers/net/wireless/ath/ath9k/beacon.c
@@ -251,7 +251,7 @@ void ath9k_beacon_ensure_primary_slot(st
int first_slot = ATH_BCBUF;
int slot;
- tasklet_disable(&sc->bcon_tasklet);
+ tasklet_disable_in_atomic(&sc->bcon_tasklet);
/* Find first taken slot. */
for (slot = 0; slot < ATH_BCBUF; slot++) {

View File

@@ -0,0 +1,35 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:14 +0100
Subject: [PATCH 11/20] atm: eni: Use tasklet_disable_in_atomic() in the send()
callback
The atmdev_ops::send callback which calls tasklet_disable() is invoked with
bottom halves disabled from net_device_ops::ndo_start_xmit(). All other
invocations of tasklet_disable() in this driver happen in preemptible
context.
Change the send() call to use tasklet_disable_in_atomic() which allows
tasklet_disable() to be made sleepable once the remaining atomic context
usage sites are cleaned up.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chas Williams <3chas3@gmail.com>
Cc: linux-atm-general@lists.sourceforge.net
Cc: netdev@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/atm/eni.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/atm/eni.c
+++ b/drivers/atm/eni.c
@@ -2054,7 +2054,7 @@ static int eni_send(struct atm_vcc *vcc,
}
submitted++;
ATM_SKB(skb)->vcc = vcc;
- tasklet_disable(&ENI_DEV(vcc->dev)->task);
+ tasklet_disable_in_atomic(&ENI_DEV(vcc->dev)->task);
res = do_tx(skb);
tasklet_enable(&ENI_DEV(vcc->dev)->task);
if (res == enq_ok) return 0;

View File

@@ -0,0 +1,39 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:15 +0100
Subject: [PATCH 12/20] PCI: hv: Use tasklet_disable_in_atomic()
The hv_compose_msi_msg() callback in irq_chip::irq_compose_msi_msg is
invoked via irq_chip_compose_msi_msg(), which itself is always invoked from
atomic contexts from the guts of the interrupt core code.
There is no way to change this w/o rewriting the whole driver, so use
tasklet_disable_in_atomic(), which allows making tasklet_disable()
sleepable once the remaining atomic users are addressed.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Wei Liu <wei.liu@kernel.org>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-hyperv@vger.kernel.org
Cc: linux-pci@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/pci/controller/pci-hyperv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1458,7 +1458,7 @@ static void hv_compose_msi_msg(struct ir
* Prevents hv_pci_onchannelcallback() from running concurrently
* in the tasklet.
*/
- tasklet_disable(&channel->callback_event);
+ tasklet_disable_in_atomic(&channel->callback_event);
/*
* Since this function is called with IRQ locks held, can't

View File

@@ -0,0 +1,54 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:16 +0100
Subject: [PATCH 13/20] firewire: ohci: Use tasklet_disable_in_atomic() where
required
tasklet_disable() is invoked in several places. Some of them are in atomic
context which prevents a conversion of tasklet_disable() to a sleepable
function.
The atomic callchains are:
ar_context_tasklet()
ohci_cancel_packet()
tasklet_disable()
...
ohci_flush_iso_completions()
tasklet_disable()
The invocation of tasklet_disable() from at_context_flush() is always in
preemptible context.
Use tasklet_disable_in_atomic() for the two invocations in
ohci_cancel_packet() and ohci_flush_iso_completions().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: linux1394-devel@lists.sourceforge.net
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/firewire/ohci.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -2545,7 +2545,7 @@ static int ohci_cancel_packet(struct fw_
struct driver_data *driver_data = packet->driver_data;
int ret = -ENOENT;
- tasklet_disable(&ctx->tasklet);
+ tasklet_disable_in_atomic(&ctx->tasklet);
if (packet->ack != 0)
goto out;
@@ -3465,7 +3465,7 @@ static int ohci_flush_iso_completions(st
struct iso_context *ctx = container_of(base, struct iso_context, base);
int ret = 0;
- tasklet_disable(&ctx->context.tasklet);
+ tasklet_disable_in_atomic(&ctx->context.tasklet);
if (!test_and_set_bit_lock(0, &ctx->flushing_completions)) {
context_tasklet((unsigned long)&ctx->context);

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:42:17 +0100
Subject: [PATCH 14/20] tasklets: Switch tasklet_disable() to the sleep wait
variant
-- NOT FOR IMMEDIATE MERGING --
Now that all users of tasklet_disable() are invoked from sleepable context,
convert it to use tasklet_unlock_wait() which might sleep.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/interrupt.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -716,8 +716,7 @@ static inline void tasklet_disable_in_at
static inline void tasklet_disable(struct tasklet_struct *t)
{
tasklet_disable_nosync(t);
- /* Spin wait until all atomic users are converted */
- tasklet_unlock_spin_wait(t);
+ tasklet_unlock_wait(t);
smp_mb();
}

View File

@@ -0,0 +1,64 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:53 +0100
Subject: [PATCH 15/20] softirq: Add RT specific softirq accounting
RT requires the softirq processing and local bottom-half disabled regions to
be preemptible. Using the normal preempt-count based serialization is
therefore not possible because it implicitly disables preemption.
RT kernels use a per CPU local lock to serialize bottom halves. As
local_bh_disable() can nest, the lock can only be acquired on the outermost
invocation of local_bh_disable() and released when the nest count becomes
zero. Tasks which hold the local lock can be preempted, so it's required to
keep track of the nest count per task.
Add a RT only counter to task struct and adjust the relevant macros in
preempt.h.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hardirq.h | 1 +
include/linux/preempt.h | 6 +++++-
include/linux/sched.h | 3 +++
3 files changed, 9 insertions(+), 1 deletion(-)
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -6,6 +6,7 @@
#include <linux/preempt.h>
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
+#include <linux/sched.h>
#include <linux/vtime.h>
#include <asm/hardirq.h>
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -79,7 +79,11 @@
#define nmi_count() (preempt_count() & NMI_MASK)
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
-#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+#ifdef CONFIG_PREEMPT_RT
+# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK)
+#else
+# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
+#endif
#define irq_count() (nmi_count() | hardirq_count() | softirq_count())
/*
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1040,6 +1040,9 @@ struct task_struct {
int softirq_context;
int irq_config;
#endif
+#ifdef CONFIG_PREEMPT_RT
+ int softirq_disable_cnt;
+#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL

View File

@@ -0,0 +1,47 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:54 +0100
Subject: [PATCH 16/20] irqtime: Make accounting correct on RT
vtime_account_irq() and irqtime_account_irq() base their checks on
preempt_count(), which fails on RT because preempt_count() does not contain
the softirq accounting, which is separate on RT.
These checks do not need the full preempt count as they only operate on the
hard and softirq sections.
Use irq_count() instead, which provides the correct value on both RT and
non-RT kernels. The compiler is clever enough to fold the masking for !RT:
99b: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax
- 9a2: 25 ff ff ff 7f and $0x7fffffff,%eax
+ 9a2: 25 00 ff ff 00 and $0xffff00,%eax
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/sched/cputime.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_str
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;
- pc = preempt_count() - offset;
+ pc = irq_count() - offset;
/*
* We do not account for softirq time from ksoftirqd here.
@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struc
void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
- unsigned int pc = preempt_count() - offset;
+ unsigned int pc = irq_count() - offset;
if (pc & HARDIRQ_OFFSET) {
vtime_account_hardirq(tsk);

View File

@@ -0,0 +1,101 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:55 +0100
Subject: [PATCH 17/20] softirq: Move various protections into inline helpers
To allow reuse of the bulk of softirq processing code for RT and to avoid
#ifdeffery all over the place, split protections for various code sections
out into inline helpers so the RT variant can just replace them in one go.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/softirq.c | 39 ++++++++++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 7 deletions(-)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -205,6 +205,32 @@ void __local_bh_enable_ip(unsigned long
}
EXPORT_SYMBOL(__local_bh_enable_ip);
+static inline void softirq_handle_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+}
+
+static inline void softirq_handle_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ WARN_ON_ONCE(in_interrupt());
+}
+
+static inline void ksoftirqd_run_begin(void)
+{
+ local_irq_disable();
+}
+
+static inline void ksoftirqd_run_end(void)
+{
+ local_irq_enable();
+}
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return true;
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -317,7 +343,7 @@ asmlinkage __visible void __softirq_entr
pending = local_softirq_pending();
- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ softirq_handle_begin();
in_hardirq = lockdep_softirq_start();
account_softirq_enter(current);
@@ -368,8 +394,7 @@ asmlinkage __visible void __softirq_entr
account_softirq_exit(current);
lockdep_softirq_end(in_hardirq);
- __local_bh_enable(SOFTIRQ_OFFSET);
- WARN_ON_ONCE(in_interrupt());
+ softirq_handle_end();
current_restore_flags(old_flags, PF_MEMALLOC);
}
@@ -464,7 +489,7 @@ inline void raise_softirq_irqoff(unsigne
* Otherwise we wake up ksoftirqd to make sure we
* schedule the softirq soon.
*/
- if (!in_interrupt())
+ if (!in_interrupt() && should_wake_ksoftirqd())
wakeup_softirqd();
}
@@ -692,18 +717,18 @@ static int ksoftirqd_should_run(unsigned
static void run_ksoftirqd(unsigned int cpu)
{
- local_irq_disable();
+ ksoftirqd_run_begin();
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
- local_irq_enable();
+ ksoftirqd_run_end();
cond_resched();
return;
}
- local_irq_enable();
+ ksoftirqd_run_end();
}
#ifdef CONFIG_HOTPLUG_CPU

View File

@@ -0,0 +1,258 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:56 +0100
Subject: [PATCH 18/20] softirq: Make softirq control and processing RT aware
Provide a local lock based serialization for soft interrupts on RT which
allows the local_bh_disabled() sections and servicing soft interrupts to be
preemptible.
Provide the necessary inline helpers which allow reusing the bulk of the
softirq processing code.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/bottom_half.h | 2
kernel/softirq.c | 188 ++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 182 insertions(+), 8 deletions(-)
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -4,7 +4,7 @@
#include <linux/preempt.h>
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -13,6 +13,7 @@
#include <linux/kernel_stat.h>
#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/local_lock.h>
#include <linux/mm.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
@@ -101,20 +102,189 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_contex
#endif
/*
- * preempt_count and SOFTIRQ_OFFSET usage:
- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
- * softirq processing.
- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirqs_disabled_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ * processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
* on local_bh_disable or local_bh_enable.
+ *
* This lets us distinguish between whether we are currently processing
* softirq and whether we just have bh disabled.
*/
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and
+ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a
+ * softirq disabled section to be preempted.
+ *
+ * The per task counter is used for softirq_count(), in_softirq() and
+ * in_serving_softirqs() because these counts are only valid when the task
+ * holding softirq_ctrl::lock is running.
+ *
+ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that
+ * the task which is in a softirq disabled section is preempted or blocks.
+ */
+struct softirq_ctrl {
+ local_lock_t lock;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
+};
+
+void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
+{
+ unsigned long flags;
+ int newcnt;
+
+ WARN_ON_ONCE(in_hardirq());
+
+ /* First entry of a task into a BH disabled section? */
+ if (!current->softirq_disable_cnt) {
+ if (preemptible()) {
+ local_lock(&softirq_ctrl.lock);
+ /* Required to meet the RCU bottomhalf requirements. */
+ rcu_read_lock();
+ } else {
+ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+ }
+ }
+
+ /*
+ * Track the per CPU softirq disabled state. On RT this is per CPU
+ * state to allow preemption of bottom half disabled sections.
+ */
+ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ /*
+ * Reflect the result in the task state to prevent recursion on the
+ * local lock and to make softirq_count() & al work.
+ */
+ current->softirq_disable_cnt = newcnt;
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+ }
+}
+EXPORT_SYMBOL(__local_bh_disable_ip);
+
+static void __local_bh_enable(unsigned int cnt, bool unlock)
+{
+ unsigned long flags;
+ int newcnt;
+
+ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
+ this_cpu_read(softirq_ctrl.cnt));
+
+ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
+ raw_local_irq_save(flags);
+ lockdep_softirqs_on(_RET_IP_);
+ raw_local_irq_restore(flags);
+ }
+
+ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ current->softirq_disable_cnt = newcnt;
+
+ if (!newcnt && unlock) {
+ rcu_read_unlock();
+ local_unlock(&softirq_ctrl.lock);
+ }
+}
+
+void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
+{
+ bool preempt_on = preemptible();
+ unsigned long flags;
+ u32 pending;
+ int curcnt;
+
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
+
+ local_irq_save(flags);
+ curcnt = __this_cpu_read(softirq_ctrl.cnt);
+
+ /*
+ * If this is not reenabling soft interrupts, no point in trying to
+ * run pending ones.
+ */
+ if (curcnt != cnt)
+ goto out;
+
+ pending = local_softirq_pending();
+ if (!pending || ksoftirqd_running(pending))
+ goto out;
+
+ /*
+ * If this was called from non preemptible context, wake up the
+ * softirq daemon.
+ */
+ if (!preempt_on) {
+ wakeup_softirqd();
+ goto out;
+ }
+
+ /*
+ * Adjust softirq count to SOFTIRQ_OFFSET which makes
+ * in_serving_softirq() become true.
+ */
+ cnt = SOFTIRQ_OFFSET;
+ __local_bh_enable(cnt, false);
+ __do_softirq();
+
+out:
+ __local_bh_enable(cnt, preempt_on);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(__local_bh_enable_ip);
+
+/*
+ * Invoked from ksoftirqd_run() outside of the interrupt disabled section
+ * to acquire the per CPU local lock for reentrancy protection.
+ */
+static inline void ksoftirqd_run_begin(void)
+{
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ local_irq_disable();
+}
+
+/* Counterpart to ksoftirqd_run_begin() */
+static inline void ksoftirqd_run_end(void)
+{
+ __local_bh_enable(SOFTIRQ_OFFSET, true);
+ WARN_ON_ONCE(in_interrupt());
+ local_irq_enable();
+}
+
+static inline void softirq_handle_begin(void) { }
+static inline void softirq_handle_end(void) { }
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+static inline void invoke_softirq(void)
+{
+ if (should_wake_ksoftirqd())
+ wakeup_softirqd();
+}
+
+#else /* CONFIG_PREEMPT_RT */
-#ifdef CONFIG_TRACE_IRQFLAGS
/*
- * This is for softirq.c-internal use, where hardirqs are disabled
+ * This one is for softirq.c-internal use, where hardirqs are disabled
* legitimately:
*/
+#ifdef CONFIG_TRACE_IRQFLAGS
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -275,6 +445,8 @@ asmlinkage __visible void do_softirq(voi
local_irq_restore(flags);
}
+#endif /* !CONFIG_PREEMPT_RT */
+
/*
* We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
* but break the loop if need_resched() is set or after 2 ms.
@@ -379,8 +551,10 @@ asmlinkage __visible void __softirq_entr
pending >>= softirq_bit;
}
- if (__this_cpu_read(ksoftirqd) == current)
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ __this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
+
local_irq_disable();
pending = local_softirq_pending();
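
A shorthand illustration of the scheme above (not part of the patch; the
calls are real kernel API, the sequence is made up): on PREEMPT_RT, nested
BH disabled sections only bump the per CPU counter, and pending softirqs
run in task context when the outermost section ends.

	local_bh_disable();	/* first entry: takes softirq_ctrl.lock,
				   cnt becomes SOFTIRQ_DISABLE_OFFSET */
	local_bh_disable();	/* nested entry: only increments the counter */

	raise_softirq(TASKLET_SOFTIRQ);	/* marks the softirq pending */

	local_bh_enable();	/* inner exit: curcnt != cnt, nothing runs */
	local_bh_enable();	/* outermost exit: __do_softirq() runs the
				   pending work right here, in task context */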

View File

@@ -0,0 +1,73 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:57 +0100
Subject: [PATCH 19/20] tick/sched: Prevent false positive softirq pending
warnings on RT
On RT a task which has soft interrupts disabled can block on a lock and
schedule out to idle while soft interrupts are pending. This triggers the
warning in the NOHZ idle code, which complains about going idle with pending
soft interrupts. But as the task is blocked, soft interrupt processing is
temporarily blocked as well, which means that such a warning is a false
positive.
To prevent that, check the per CPU state which indicates that a scheduled
out task has soft interrupts disabled.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/bottom_half.h | 6 ++++++
kernel/softirq.c | 15 +++++++++++++++
kernel/time/tick-sched.c | 2 +-
3 files changed, 22 insertions(+), 1 deletion(-)
--- a/include/linux/bottom_half.h
+++ b/include/linux/bottom_half.h
@@ -32,4 +32,10 @@ static inline void local_bh_enable(void)
__local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
}
+#ifdef CONFIG_PREEMPT_RT
+extern bool local_bh_blocked(void);
+#else
+static inline bool local_bh_blocked(void) { return false; }
+#endif
+
#endif /* _LINUX_BH_H */
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -139,6 +139,21 @@ static DEFINE_PER_CPU(struct softirq_ctr
.lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
};
+/**
+ * local_bh_blocked() - Check for idle whether BH processing is blocked
+ *
+ * Returns false if the per CPU softirq_ctrl::cnt is 0, otherwise true.
+ *
+ * This is invoked from the idle task to guard against false positive
+ * softirq pending warnings, which would happen when the task which holds
+ * softirq_ctrl::lock was the only running task on the CPU and blocks on
+ * some other lock.
+ */
+bool local_bh_blocked(void)
+{
+ return __this_cpu_read(softirq_ctrl.cnt) != 0;
+}
+
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -973,7 +973,7 @@ static bool can_stop_idle_tick(int cpu,
if (unlikely(local_softirq_pending())) {
static int ratelimit;
- if (ratelimit < 10 &&
+ if (ratelimit < 10 && !local_bh_blocked() &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
(unsigned int) local_softirq_pending());
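
The guarded sequence, in shorthand (illustrative; some_lock is a made-up
sleeping spinlock on an RT kernel):

	/* Task A on CPU0 */
	local_bh_disable();		/* softirq_ctrl.cnt != 0 from here on */
	spin_lock(&some_lock);		/* contended: blocks and schedules out */

	/* CPU0 now goes idle with local_softirq_pending() != 0, but
	 * local_bh_blocked() is true, so the NOHZ warning stays silent. */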

View File

@@ -0,0 +1,28 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Mar 2021 09:55:58 +0100
Subject: [PATCH 20/20] rcu: Prevent false positive softirq warning on RT
Soft interrupt disabled sections can legitimately be preempted or schedule
out when blocking on a lock on RT enabled kernels, so the RCU preempt check
warning has to be disabled for RT kernels.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rcupdate.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -328,7 +328,8 @@ static inline void rcu_preempt_sleep_che
#define rcu_sleep_check() \
do { \
rcu_preempt_sleep_check(); \
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \
"Illegal context switch in RCU-bh read-side critical section"); \
RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \
"Illegal context switch in RCU-sched read-side critical section"); \

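In shorthand, the pattern which is legal on RT and must no longer trigger
the check (illustrative; some_lock is a made-up sleeping spinlock):

	rcu_read_lock_bh();	/* enters an RCU-bh read-side section */
	spin_lock(&some_lock);	/* sleeping lock on RT: may block and
				   schedule out - no longer warns */
	/* ... */
	spin_unlock(&some_lock);
	rcu_read_unlock_bh();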
View File

@@ -0,0 +1,256 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 2 Feb 2021 18:01:03 +0100
Subject: [PATCH 1/2] chelsio: cxgb: Replace the workqueue with threaded
interrupt
The external interrupt (F_PL_INTR_EXT) needs to be handled in a process
context and this is accomplished by utilizing a workqueue.
The process context can also be provided by a threaded interrupt instead
of a workqueue. The threaded interrupt can also be used later for other
interrupt related processing which requires a non-atomic context, without
adding yet another workqueue. free_irq() additionally ensures that the
thread has finished, a guarantee which is currently missing (the worker
could keep running after the module has been removed).
Save the pending flags in pending_thread_intr. Disable F_PL_INTR_EXT as an
interrupt source with the same mechanism that was used before the worker
was scheduled, and enable the interrupt again once
t1_elmer0_ext_intr_handler() is done.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/net/ethernet/chelsio/cxgb/common.h | 5 +--
drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 44 ++---------------------------
drivers/net/ethernet/chelsio/cxgb/sge.c | 33 +++++++++++++++++++--
drivers/net/ethernet/chelsio/cxgb/sge.h | 1
drivers/net/ethernet/chelsio/cxgb/subr.c | 26 +++++++++++------
5 files changed, 55 insertions(+), 54 deletions(-)
--- a/drivers/net/ethernet/chelsio/cxgb/common.h
+++ b/drivers/net/ethernet/chelsio/cxgb/common.h
@@ -238,7 +238,6 @@ struct adapter {
int msg_enable;
u32 mmio_len;
- struct work_struct ext_intr_handler_task;
struct adapter_params params;
/* Terminator modules. */
@@ -257,6 +256,7 @@ struct adapter {
/* guards async operations */
spinlock_t async_lock ____cacheline_aligned;
+ u32 pending_thread_intr;
u32 slow_intr_mask;
int t1powersave;
};
@@ -334,8 +334,7 @@ void t1_interrupts_enable(adapter_t *ada
void t1_interrupts_disable(adapter_t *adapter);
void t1_interrupts_clear(adapter_t *adapter);
int t1_elmer0_ext_intr_handler(adapter_t *adapter);
-void t1_elmer0_ext_intr(adapter_t *adapter);
-int t1_slow_intr_handler(adapter_t *adapter);
+irqreturn_t t1_slow_intr_handler(adapter_t *adapter);
int t1_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc);
const struct board_info *t1_get_board_info(unsigned int board_id);
--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
+++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
@@ -211,9 +211,10 @@ static int cxgb_up(struct adapter *adapt
t1_interrupts_clear(adapter);
adapter->params.has_msi = !disable_msi && !pci_enable_msi(adapter->pdev);
- err = request_irq(adapter->pdev->irq, t1_interrupt,
- adapter->params.has_msi ? 0 : IRQF_SHARED,
- adapter->name, adapter);
+ err = request_threaded_irq(adapter->pdev->irq, t1_interrupt,
+ t1_interrupt_thread,
+ adapter->params.has_msi ? 0 : IRQF_SHARED,
+ adapter->name, adapter);
if (err) {
if (adapter->params.has_msi)
pci_disable_msi(adapter->pdev);
@@ -916,41 +917,6 @@ static void mac_stats_task(struct work_s
spin_unlock(&adapter->work_lock);
}
-/*
- * Processes elmer0 external interrupts in process context.
- */
-static void ext_intr_task(struct work_struct *work)
-{
- struct adapter *adapter =
- container_of(work, struct adapter, ext_intr_handler_task);
-
- t1_elmer0_ext_intr_handler(adapter);
-
- /* Now reenable external interrupts */
- spin_lock_irq(&adapter->async_lock);
- adapter->slow_intr_mask |= F_PL_INTR_EXT;
- writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE);
- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA,
- adapter->regs + A_PL_ENABLE);
- spin_unlock_irq(&adapter->async_lock);
-}
-
-/*
- * Interrupt-context handler for elmer0 external interrupts.
- */
-void t1_elmer0_ext_intr(struct adapter *adapter)
-{
- /*
- * Schedule a task to handle external interrupts as we require
- * a process context. We disable EXT interrupts in the interim
- * and let the task reenable them when it's done.
- */
- adapter->slow_intr_mask &= ~F_PL_INTR_EXT;
- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA,
- adapter->regs + A_PL_ENABLE);
- schedule_work(&adapter->ext_intr_handler_task);
-}
-
void t1_fatal_err(struct adapter *adapter)
{
if (adapter->flags & FULL_INIT_DONE) {
@@ -1062,8 +1028,6 @@ static int init_one(struct pci_dev *pdev
spin_lock_init(&adapter->async_lock);
spin_lock_init(&adapter->mac_lock);
- INIT_WORK(&adapter->ext_intr_handler_task,
- ext_intr_task);
INIT_DELAYED_WORK(&adapter->stats_update_task,
mac_stats_task);
--- a/drivers/net/ethernet/chelsio/cxgb/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
@@ -1619,11 +1619,38 @@ int t1_poll(struct napi_struct *napi, in
return work_done;
}
+irqreturn_t t1_interrupt_thread(int irq, void *data)
+{
+ struct adapter *adapter = data;
+ u32 pending_thread_intr;
+
+ spin_lock_irq(&adapter->async_lock);
+ pending_thread_intr = adapter->pending_thread_intr;
+ adapter->pending_thread_intr = 0;
+ spin_unlock_irq(&adapter->async_lock);
+
+ if (!pending_thread_intr)
+ return IRQ_NONE;
+
+ if (pending_thread_intr & F_PL_INTR_EXT)
+ t1_elmer0_ext_intr_handler(adapter);
+
+ spin_lock_irq(&adapter->async_lock);
+ adapter->slow_intr_mask |= F_PL_INTR_EXT;
+
+ writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE);
+ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA,
+ adapter->regs + A_PL_ENABLE);
+ spin_unlock_irq(&adapter->async_lock);
+
+ return IRQ_HANDLED;
+}
+
irqreturn_t t1_interrupt(int irq, void *data)
{
struct adapter *adapter = data;
struct sge *sge = adapter->sge;
- int handled;
+ irqreturn_t handled;
if (likely(responses_pending(adapter))) {
writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE);
@@ -1645,10 +1672,10 @@ irqreturn_t t1_interrupt(int irq, void *
handled = t1_slow_intr_handler(adapter);
spin_unlock(&adapter->async_lock);
- if (!handled)
+ if (handled == IRQ_NONE)
sge->stats.unhandled_irqs++;
- return IRQ_RETVAL(handled != 0);
+ return handled;
}
/*
--- a/drivers/net/ethernet/chelsio/cxgb/sge.h
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.h
@@ -74,6 +74,7 @@ struct sge *t1_sge_create(struct adapter
int t1_sge_configure(struct sge *, struct sge_params *);
int t1_sge_set_coalesce_params(struct sge *, struct sge_params *);
void t1_sge_destroy(struct sge *);
+irqreturn_t t1_interrupt_thread(int irq, void *data);
irqreturn_t t1_interrupt(int irq, void *cookie);
int t1_poll(struct napi_struct *, int);
--- a/drivers/net/ethernet/chelsio/cxgb/subr.c
+++ b/drivers/net/ethernet/chelsio/cxgb/subr.c
@@ -210,7 +210,7 @@ static int fpga_phy_intr_handler(adapter
/*
* Slow path interrupt handler for FPGAs.
*/
-static int fpga_slow_intr(adapter_t *adapter)
+static irqreturn_t fpga_slow_intr(adapter_t *adapter)
{
u32 cause = readl(adapter->regs + A_PL_CAUSE);
@@ -238,7 +238,7 @@ static int fpga_slow_intr(adapter_t *ada
if (cause)
writel(cause, adapter->regs + A_PL_CAUSE);
- return cause != 0;
+ return cause == 0 ? IRQ_NONE : IRQ_HANDLED;
}
#endif
@@ -842,13 +842,14 @@ void t1_interrupts_clear(adapter_t* adap
/*
* Slow path interrupt handler for ASICs.
*/
-static int asic_slow_intr(adapter_t *adapter)
+static irqreturn_t asic_slow_intr(adapter_t *adapter)
{
u32 cause = readl(adapter->regs + A_PL_CAUSE);
+ irqreturn_t ret = IRQ_HANDLED;
cause &= adapter->slow_intr_mask;
if (!cause)
- return 0;
+ return IRQ_NONE;
if (cause & F_PL_INTR_SGE_ERR)
t1_sge_intr_error_handler(adapter->sge);
if (cause & F_PL_INTR_TP)
@@ -857,16 +858,25 @@ static int asic_slow_intr(adapter_t *ada
t1_espi_intr_handler(adapter->espi);
if (cause & F_PL_INTR_PCIX)
t1_pci_intr_handler(adapter);
- if (cause & F_PL_INTR_EXT)
- t1_elmer0_ext_intr(adapter);
+ if (cause & F_PL_INTR_EXT) {
+ /* Wake the threaded interrupt to handle external interrupts as
+ * we require a process context. We disable EXT interrupts in
+ * the interim and let the thread reenable them when it's done.
+ */
+ adapter->pending_thread_intr |= F_PL_INTR_EXT;
+ adapter->slow_intr_mask &= ~F_PL_INTR_EXT;
+ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA,
+ adapter->regs + A_PL_ENABLE);
+ ret = IRQ_WAKE_THREAD;
+ }
/* Clear the interrupts just processed. */
writel(cause, adapter->regs + A_PL_CAUSE);
readl(adapter->regs + A_PL_CAUSE); /* flush writes */
- return 1;
+ return ret;
}
-int t1_slow_intr_handler(adapter_t *adapter)
+irqreturn_t t1_slow_intr_handler(adapter_t *adapter)
{
#ifdef CONFIG_CHELSIO_T1_1G
if (!t1_is_asic(adapter))
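
The underlying pattern is the split handler of request_threaded_irq(); a
generic sketch (all my_* names are hypothetical, the API calls are real):

	#define MY_SLOW_EVENT	BIT(0)		/* hypothetical flag */

	struct my_dev {				/* hypothetical device */
		u32 pending;
	};

	static irqreturn_t my_hardirq(int irq, void *data)
	{
		struct my_dev *dev = data;

		if (!my_irq_pending(dev))	/* hypothetical helper */
			return IRQ_NONE;

		/* Mask the slow source and record it for the thread. */
		dev->pending |= MY_SLOW_EVENT;
		my_mask_slow_source(dev);	/* hypothetical helper */
		return IRQ_WAKE_THREAD;
	}

	static irqreturn_t my_thread_fn(int irq, void *data)
	{
		struct my_dev *dev = data;

		my_handle_slow_event(dev);	/* may sleep */
		my_unmask_slow_source(dev);
		return IRQ_HANDLED;
	}

	/* at probe time: */
	err = request_threaded_irq(irq, my_hardirq, my_thread_fn, 0,
				   "my_dev", dev);

free_irq() waits for both the hard handler and the thread to finish, which
is the lifetime guarantee the message above refers to.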

View File

@@ -0,0 +1,200 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 2 Feb 2021 18:01:04 +0100
Subject: [PATCH 2/2] chelsio: cxgb: Disable the card on error in threaded
interrupt
t1_fatal_err() is invoked from the interrupt handler. The bad part is
that it invokes (via t1_sge_stop()) del_timer_sync() and tasklet_kill().
Both functions must not be called from an interrupt because it is
possible that they will wait for the completion of the very timer/tasklet
they just interrupted.
In case of a fatal error, use t1_interrupts_disable() to disable all
interrupt sources and then wake the interrupt thread with
F_PL_INTR_SGE_ERR as the pending flag. The threaded interrupt will stop
the card via t1_sge_stop() and not re-enable the interrupts again.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/net/ethernet/chelsio/cxgb/common.h | 1
drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 10 -------
drivers/net/ethernet/chelsio/cxgb/sge.c | 20 ++++++++++++---
drivers/net/ethernet/chelsio/cxgb/sge.h | 2 -
drivers/net/ethernet/chelsio/cxgb/subr.c | 38 ++++++++++++++++++++---------
5 files changed, 44 insertions(+), 27 deletions(-)
--- a/drivers/net/ethernet/chelsio/cxgb/common.h
+++ b/drivers/net/ethernet/chelsio/cxgb/common.h
@@ -346,7 +346,6 @@ int t1_get_board_rev(adapter_t *adapter,
int t1_init_hw_modules(adapter_t *adapter);
int t1_init_sw_modules(adapter_t *adapter, const struct board_info *bi);
void t1_free_sw_modules(adapter_t *adapter);
-void t1_fatal_err(adapter_t *adapter);
void t1_link_changed(adapter_t *adapter, int port_id);
void t1_link_negotiated(adapter_t *adapter, int port_id, int link_stat,
int speed, int duplex, int pause);
--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
+++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c
@@ -917,16 +917,6 @@ static void mac_stats_task(struct work_s
spin_unlock(&adapter->work_lock);
}
-void t1_fatal_err(struct adapter *adapter)
-{
- if (adapter->flags & FULL_INIT_DONE) {
- t1_sge_stop(adapter->sge);
- t1_interrupts_disable(adapter);
- }
- pr_alert("%s: encountered fatal error, operation suspended\n",
- adapter->name);
-}
-
static const struct net_device_ops cxgb_netdev_ops = {
.ndo_open = cxgb_open,
.ndo_stop = cxgb_close,
--- a/drivers/net/ethernet/chelsio/cxgb/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.c
@@ -940,10 +940,11 @@ void t1_sge_intr_clear(struct sge *sge)
/*
* SGE 'Error' interrupt handler
*/
-int t1_sge_intr_error_handler(struct sge *sge)
+bool t1_sge_intr_error_handler(struct sge *sge)
{
struct adapter *adapter = sge->adapter;
u32 cause = readl(adapter->regs + A_SG_INT_CAUSE);
+ bool wake = false;
if (adapter->port[0].dev->hw_features & NETIF_F_TSO)
cause &= ~F_PACKET_TOO_BIG;
@@ -967,11 +968,14 @@ int t1_sge_intr_error_handler(struct sge
sge->stats.pkt_mismatch++;
pr_alert("%s: SGE packet mismatch\n", adapter->name);
}
- if (cause & SGE_INT_FATAL)
- t1_fatal_err(adapter);
+ if (cause & SGE_INT_FATAL) {
+ t1_interrupts_disable(adapter);
+ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR;
+ wake = true;
+ }
writel(cause, adapter->regs + A_SG_INT_CAUSE);
- return 0;
+ return wake;
}
const struct sge_intr_counts *t1_sge_get_intr_counts(const struct sge *sge)
@@ -1635,6 +1639,14 @@ irqreturn_t t1_interrupt_thread(int irq,
if (pending_thread_intr & F_PL_INTR_EXT)
t1_elmer0_ext_intr_handler(adapter);
+ /* This error is fatal, interrupts remain off */
+ if (pending_thread_intr & F_PL_INTR_SGE_ERR) {
+ pr_alert("%s: encountered fatal error, operation suspended\n",
+ adapter->name);
+ t1_sge_stop(adapter->sge);
+ return IRQ_HANDLED;
+ }
+
spin_lock_irq(&adapter->async_lock);
adapter->slow_intr_mask |= F_PL_INTR_EXT;
--- a/drivers/net/ethernet/chelsio/cxgb/sge.h
+++ b/drivers/net/ethernet/chelsio/cxgb/sge.h
@@ -82,7 +82,7 @@ netdev_tx_t t1_start_xmit(struct sk_buff
void t1_vlan_mode(struct adapter *adapter, netdev_features_t features);
void t1_sge_start(struct sge *);
void t1_sge_stop(struct sge *);
-int t1_sge_intr_error_handler(struct sge *);
+bool t1_sge_intr_error_handler(struct sge *sge);
void t1_sge_intr_enable(struct sge *);
void t1_sge_intr_disable(struct sge *);
void t1_sge_intr_clear(struct sge *);
--- a/drivers/net/ethernet/chelsio/cxgb/subr.c
+++ b/drivers/net/ethernet/chelsio/cxgb/subr.c
@@ -170,7 +170,7 @@ void t1_link_changed(adapter_t *adapter,
t1_link_negotiated(adapter, port_id, link_ok, speed, duplex, fc);
}
-static int t1_pci_intr_handler(adapter_t *adapter)
+static bool t1_pci_intr_handler(adapter_t *adapter)
{
u32 pcix_cause;
@@ -179,9 +179,13 @@ static int t1_pci_intr_handler(adapter_t
if (pcix_cause) {
pci_write_config_dword(adapter->pdev, A_PCICFG_INTR_CAUSE,
pcix_cause);
- t1_fatal_err(adapter); /* PCI errors are fatal */
+ /* PCI errors are fatal */
+ t1_interrupts_disable(adapter);
+ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR;
+ pr_alert("%s: PCI error encountered.\n", adapter->name);
+ return true;
}
- return 0;
+ return false;
}
#ifdef CONFIG_CHELSIO_T1_1G
@@ -213,10 +217,13 @@ static int fpga_phy_intr_handler(adapter
static irqreturn_t fpga_slow_intr(adapter_t *adapter)
{
u32 cause = readl(adapter->regs + A_PL_CAUSE);
+ irqreturn_t ret = IRQ_NONE;
cause &= ~F_PL_INTR_SGE_DATA;
- if (cause & F_PL_INTR_SGE_ERR)
- t1_sge_intr_error_handler(adapter->sge);
+ if (cause & F_PL_INTR_SGE_ERR) {
+ if (t1_sge_intr_error_handler(adapter->sge))
+ ret = IRQ_WAKE_THREAD;
+ }
if (cause & FPGA_PCIX_INTERRUPT_GMAC)
fpga_phy_intr_handler(adapter);
@@ -231,13 +238,18 @@ static irqreturn_t fpga_slow_intr(adapte
/* Clear TP interrupt */
writel(tp_cause, adapter->regs + FPGA_TP_ADDR_INTERRUPT_CAUSE);
}
- if (cause & FPGA_PCIX_INTERRUPT_PCIX)
- t1_pci_intr_handler(adapter);
+ if (cause & FPGA_PCIX_INTERRUPT_PCIX) {
+ if (t1_pci_intr_handler(adapter))
+ ret = IRQ_WAKE_THREAD;
+ }
/* Clear the interrupts just processed. */
if (cause)
writel(cause, adapter->regs + A_PL_CAUSE);
+ if (ret != IRQ_NONE)
+ return ret;
+
return cause == 0 ? IRQ_NONE : IRQ_HANDLED;
}
#endif
@@ -850,14 +862,18 @@ static irqreturn_t asic_slow_intr(adapte
cause &= adapter->slow_intr_mask;
if (!cause)
return IRQ_NONE;
- if (cause & F_PL_INTR_SGE_ERR)
- t1_sge_intr_error_handler(adapter->sge);
+ if (cause & F_PL_INTR_SGE_ERR) {
+ if (t1_sge_intr_error_handler(adapter->sge))
+ ret = IRQ_WAKE_THREAD;
+ }
if (cause & F_PL_INTR_TP)
t1_tp_intr_handler(adapter->tp);
if (cause & F_PL_INTR_ESPI)
t1_espi_intr_handler(adapter->espi);
- if (cause & F_PL_INTR_PCIX)
- t1_pci_intr_handler(adapter);
+ if (cause & F_PL_INTR_PCIX) {
+ if (t1_pci_intr_handler(adapter))
+ ret = IRQ_WAKE_THREAD;
+ }
if (cause & F_PL_INTR_EXT) {
/* Wake the threaded interrupt to handle external interrupts as
* we require a process context. We disable EXT interrupts in
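
Condensed from the hunks above, the fatal-error flow is now:

	/* hard interrupt context: cheap and atomic */
	t1_interrupts_disable(adapter);		/* mask every source */
	adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR;
	/* ... handler returns IRQ_WAKE_THREAD ... */

	/* threaded handler: may sleep */
	if (pending_thread_intr & F_PL_INTR_SGE_ERR) {
		t1_sge_stop(adapter->sge);	/* del_timer_sync() and
						   tasklet_kill() are safe here */
		return IRQ_HANDLED;		/* interrupts stay disabled */
	}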

View File

@@ -0,0 +1,86 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 29 Sep 2020 15:21:17 +0200
Subject: [PATCH 01/22] locking/rtmutex: Remove cruft
Most of this has been around since the very beginning. I'm not sure if
it was used while the rtmutex-deadlock-tester was around, but today it
seems to only waste memory:
- save_state: No users
- name: Assigned and printed if a deadlock was detected. I'm keeping it
but want to point out that lockdep has the same information.
- file + line: Printed if ::name was NULL. This is only used for
in-kernel locks so ::name shouldn't be NULL, and then ::file and
::line aren't used.
- magic: Assigned to NULL by rt_mutex_destroy().
Remove members of rt_mutex which are not used.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rtmutex.h | 7 ++-----
kernel/locking/rtmutex-debug.c | 7 +------
kernel/locking/rtmutex.c | 3 ---
kernel/locking/rtmutex_common.h | 1 -
4 files changed, 3 insertions(+), 15 deletions(-)
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -32,10 +32,7 @@ struct rt_mutex {
struct rb_root_cached waiters;
struct task_struct *owner;
#ifdef CONFIG_DEBUG_RT_MUTEXES
- int save_state;
- const char *name, *file;
- int line;
- void *magic;
+ const char *name;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -60,7 +57,7 @@ struct hrtimer_sleeper;
#ifdef CONFIG_DEBUG_RT_MUTEXES
# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
- , .name = #mutexname, .file = __FILE__, .line = __LINE__
+ , .name = #mutexname
# define rt_mutex_init(mutex) \
do { \
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -42,12 +42,7 @@ static void printk_task(struct task_stru
static void printk_lock(struct rt_mutex *lock, int print_owner)
{
- if (lock->name)
- printk(" [%p] {%s}\n",
- lock, lock->name);
- else
- printk(" [%p] {%s:%d}\n",
- lock, lock->file, lock->line);
+ printk(" [%p] {%s}\n", lock, lock->name);
if (print_owner && rt_mutex_owner(lock)) {
printk(".. ->owner: %p\n", lock->owner);
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1655,9 +1655,6 @@ void __sched rt_mutex_futex_unlock(struc
void rt_mutex_destroy(struct rt_mutex *lock)
{
WARN_ON(rt_mutex_is_locked(lock));
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- lock->magic = NULL;
-#endif
}
EXPORT_SYMBOL_GPL(rt_mutex_destroy);
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -30,7 +30,6 @@ struct rt_mutex_waiter {
struct task_struct *task;
struct rt_mutex *lock;
#ifdef CONFIG_DEBUG_RT_MUTEXES
- unsigned long ip;
struct pid *deadlock_task_pid;
struct rt_mutex *deadlock_lock;
#endif

View File

@@ -0,0 +1,294 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 29 Sep 2020 16:05:11 +0200
Subject: [PATCH 02/22] locking/rtmutex: Remove output from deadlock detector.
In commit
f5694788ad8da ("rt_mutex: Add lockdep annotations")
rtmutex gained lockdep annotation for rt_mutex_lock() and related
functions.
lockdep will see the locking order and may complain about a deadlock
before rtmutex' own mechanism gets a chance to detect it.
The rtmutex deadlock detector will only complain about locks acquired
with RT_MUTEX_MIN_CHAINWALK while a waiter is pending. That means it
works only for in-kernel locks because the futex interface always uses
RT_MUTEX_FULL_CHAINWALK.
The requirement for an active waiter limits the detector to actual
deadlocks and makes it impossible to report potential deadlocks the way
lockdep does.
It looks like lockdep is better suited for reporting deadlocks.
Remove rtmutex' debug print on deadlock detection.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rtmutex.h | 7 --
kernel/locking/rtmutex-debug.c | 97 ----------------------------------------
kernel/locking/rtmutex-debug.h | 11 ----
kernel/locking/rtmutex.c | 9 ---
kernel/locking/rtmutex.h | 7 --
kernel/locking/rtmutex_common.h | 4 -
6 files changed, 135 deletions(-)
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -31,9 +31,6 @@ struct rt_mutex {
raw_spinlock_t wait_lock;
struct rb_root_cached waiters;
struct task_struct *owner;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- const char *name;
-#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -56,8 +53,6 @@ struct hrtimer_sleeper;
#endif
#ifdef CONFIG_DEBUG_RT_MUTEXES
-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
- , .name = #mutexname
# define rt_mutex_init(mutex) \
do { \
@@ -67,7 +62,6 @@ do { \
extern void rt_mutex_debug_task_free(struct task_struct *tsk);
#else
-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL)
# define rt_mutex_debug_task_free(t) do { } while (0)
#endif
@@ -83,7 +77,6 @@ do { \
{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
, .waiters = RB_ROOT_CACHED \
, .owner = NULL \
- __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
__DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
#define DEFINE_RT_MUTEX(mutexname) \
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -32,105 +32,12 @@
#include "rtmutex_common.h"
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct rt_mutex *lock, int print_owner)
-{
- printk(" [%p] {%s}\n", lock, lock->name);
-
- if (print_owner && rt_mutex_owner(lock)) {
- printk(".. ->owner: %p\n", lock->owner);
- printk(".. held by: ");
- printk_task(rt_mutex_owner(lock));
- printk("\n");
- }
-}
-
void rt_mutex_debug_task_free(struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root));
DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
-/*
- * We fill out the fields in the waiter to store the information about
- * the deadlock. We print when we return. act_waiter can be NULL in
- * case of a remove waiter operation.
- */
-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *act_waiter,
- struct rt_mutex *lock)
-{
- struct task_struct *task;
-
- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
- return;
-
- task = rt_mutex_owner(act_waiter->lock);
- if (task && task != current) {
- act_waiter->deadlock_task_pid = get_pid(task_pid(task));
- act_waiter->deadlock_lock = lock;
- }
-}
-
-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
-{
- struct task_struct *task;
-
- if (!waiter->deadlock_lock || !debug_locks)
- return;
-
- rcu_read_lock();
- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
- if (!task) {
- rcu_read_unlock();
- return;
- }
-
- if (!debug_locks_off()) {
- rcu_read_unlock();
- return;
- }
-
- pr_warn("\n");
- pr_warn("============================================\n");
- pr_warn("WARNING: circular locking deadlock detected!\n");
- pr_warn("%s\n", print_tainted());
- pr_warn("--------------------------------------------\n");
- printk("%s/%d is deadlocking current task %s/%d\n\n",
- task->comm, task_pid_nr(task),
- current->comm, task_pid_nr(current));
-
- printk("\n1) %s/%d is trying to acquire this lock:\n",
- current->comm, task_pid_nr(current));
- printk_lock(waiter->lock, 1);
-
- printk("\n2) %s/%d is blocked on this lock:\n",
- task->comm, task_pid_nr(task));
- printk_lock(waiter->deadlock_lock, 1);
-
- debug_show_held_locks(current);
- debug_show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task_pid_nr(task));
- show_stack(task, NULL, KERN_DEFAULT);
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, task_pid_nr(current));
- dump_stack();
- debug_show_all_locks();
- rcu_read_unlock();
-
- printk("[ turning off deadlock detection."
- "Please report this trace. ]\n\n");
-}
-
void debug_rt_mutex_lock(struct rt_mutex *lock)
{
}
@@ -153,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
memset(waiter, 0x11, sizeof(*waiter));
- waiter->deadlock_task_pid = NULL;
}
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
- put_pid(waiter->deadlock_task_pid);
memset(waiter, 0x22, sizeof(*waiter));
}
@@ -168,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex
* Make sure we are not reinitializing a held lock:
*/
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
- lock->name = name;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
}
-
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -18,20 +18,9 @@ extern void debug_rt_mutex_unlock(struct
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
- struct rt_mutex_waiter *waiter,
- struct rt_mutex *lock);
-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
-# define debug_rt_mutex_reset_waiter(w) \
- do { (w)->deadlock_lock = NULL; } while (0)
static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
enum rtmutex_chainwalk walk)
{
return (waiter != NULL);
}
-
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- debug_rt_mutex_print_deadlock(w);
-}
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -597,7 +597,6 @@ static int rt_mutex_adjust_prio_chain(st
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
goto out_unlock_pi;
@@ -1189,8 +1188,6 @@ static int __sched
raw_spin_unlock_irq(&lock->wait_lock);
- debug_rt_mutex_print_deadlock(waiter);
-
schedule();
raw_spin_lock_irq(&lock->wait_lock);
@@ -1211,10 +1208,6 @@ static void rt_mutex_handle_deadlock(int
if (res != -EDEADLOCK || detect_deadlock)
return;
- /*
- * Yell lowdly and stop the task right here.
- */
- rt_mutex_print_deadlock(w);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
@@ -1763,8 +1756,6 @@ int __rt_mutex_start_proxy_lock(struct r
ret = 0;
}
- debug_rt_mutex_print_deadlock(waiter);
-
return ret;
}
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -19,15 +19,8 @@
#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
#define debug_rt_mutex_unlock(l) do { } while (0)
#define debug_rt_mutex_init(m, n, k) do { } while (0)
-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
-#define debug_rt_mutex_print_deadlock(w) do { } while (0)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
-{
- WARN(1, "rtmutex deadlock detected\n");
-}
-
static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
enum rtmutex_chainwalk walk)
{
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -29,10 +29,6 @@ struct rt_mutex_waiter {
struct rb_node pi_tree_entry;
struct task_struct *task;
struct rt_mutex *lock;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
- struct pid *deadlock_task_pid;
- struct rt_mutex *deadlock_lock;
-#endif
int prio;
u64 deadline;
};
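
For comparison, the kind of report lockdep gives that the rtmutex detector
cannot: it flags the inverted ordering as soon as both orders have been
observed, with no waiter required (lock_a/lock_b are made up):

	/* Thread 1 establishes the order A -> B: */
	rt_mutex_lock(&lock_a);
	rt_mutex_lock(&lock_b);
	rt_mutex_unlock(&lock_b);
	rt_mutex_unlock(&lock_a);

	/* Thread 2 later takes them as B -> A: */
	rt_mutex_lock(&lock_b);
	rt_mutex_lock(&lock_a);	/* lockdep reports the circular dependency
				   here although nobody is deadlocked yet */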

View File

@@ -0,0 +1,53 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 29 Sep 2020 16:32:49 +0200
Subject: [PATCH 03/22] locking/rtmutex: Move rt_mutex_init() outside of
CONFIG_DEBUG_RT_MUTEXES
rt_mutex_init() only initializes lockdep if CONFIG_DEBUG_RT_MUTEXES is
enabled. The static initializer (DEFINE_RT_MUTEX) does not have such a
restriction.
Move rt_mutex_init() outside of CONFIG_DEBUG_RT_MUTEXES.
Move the remaining functions in this CONFIG_DEBUG_RT_MUTEXES block to
the upper block.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rtmutex.h | 12 +++---------
1 file changed, 3 insertions(+), 9 deletions(-)
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -43,6 +43,7 @@ struct hrtimer_sleeper;
extern int rt_mutex_debug_check_no_locks_freed(const void *from,
unsigned long len);
extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task);
+ extern void rt_mutex_debug_task_free(struct task_struct *tsk);
#else
static inline int rt_mutex_debug_check_no_locks_freed(const void *from,
unsigned long len)
@@ -50,22 +51,15 @@ struct hrtimer_sleeper;
return 0;
}
# define rt_mutex_debug_check_no_locks_held(task) do { } while (0)
+# define rt_mutex_debug_task_free(t) do { } while (0)
#endif
-#ifdef CONFIG_DEBUG_RT_MUTEXES
-
-# define rt_mutex_init(mutex) \
+#define rt_mutex_init(mutex) \
do { \
static struct lock_class_key __key; \
__rt_mutex_init(mutex, __func__, &__key); \
} while (0)
- extern void rt_mutex_debug_task_free(struct task_struct *tsk);
-#else
-# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL)
-# define rt_mutex_debug_task_free(t) do { } while (0)
-#endif
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \
, .dep_map = { .name = #mutexname }
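
After the move both initialization forms set up the lockdep class
unconditionally; a small sketch (my_rtm/dyn_rtm are made up):

	/* static: works in any configuration */
	static DEFINE_RT_MUTEX(my_rtm);

	/* dynamic: rt_mutex_init() now always creates a lock class */
	struct rt_mutex dyn_rtm;

	rt_mutex_init(&dyn_rtm);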

View File

@@ -0,0 +1,89 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 7 Oct 2020 12:11:33 +0200
Subject: [PATCH 04/22] locking/rtmutex: Remove rt_mutex_timed_lock()
rt_mutex_timed_lock() has no callers since commit
c051b21f71d1f ("rtmutex: Confine deadlock logic to futex")
Remove rt_mutex_timed_lock().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rtmutex.h | 3 ---
kernel/locking/rtmutex.c | 46 ----------------------------------------------
2 files changed, 49 deletions(-)
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -99,9 +99,6 @@ extern void rt_mutex_lock(struct rt_mute
#endif
extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
-extern int rt_mutex_timed_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout);
-
extern int rt_mutex_trylock(struct rt_mutex *lock);
extern void rt_mutex_unlock(struct rt_mutex *lock);
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1406,21 +1406,6 @@ rt_mutex_fastlock(struct rt_mutex *lock,
}
static inline int
-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk,
- int (*slowfn)(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
-{
- if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- return 0;
-
- return slowfn(lock, state, timeout, chwalk);
-}
-
-static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
@@ -1528,37 +1513,6 @@ int __sched __rt_mutex_futex_trylock(str
}
/**
- * rt_mutex_timed_lock - lock a rt_mutex interruptible
- * the timeout structure is provided
- * by the caller
- *
- * @lock: the rt_mutex to be locked
- * @timeout: timeout structure or NULL (no timeout)
- *
- * Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -ETIMEDOUT when the timeout expired
- */
-int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
-{
- int ret;
-
- might_sleep();
-
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_MIN_CHAINWALK,
- rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, _RET_IP_);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
-
-/**
* rt_mutex_trylock - try to lock a rt_mutex
*
* @lock: the rt_mutex to be locked

View File

@@ -1,6 +1,7 @@
Subject: rtmutex: Handle the various new futex race conditions
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 10 Jun 2011 11:04:15 +0200
Subject: [PATCH 05/22] locking/rtmutex: Handle the various new futex race
conditions
RT opens a few new interesting race conditions in the rtmutex/futex
combo due to futex hash bucket lock being a 'sleeping' spinlock and
@@ -8,16 +9,16 @@ therefor not disabling preemption.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/futex.c | 77 ++++++++++++++++++++++++++++++++--------
kernel/futex.c | 78 ++++++++++++++++++++++++++++++++--------
kernel/locking/rtmutex.c | 36 +++++++++++++++---
kernel/locking/rtmutex_common.h | 2 +
3 files changed, 94 insertions(+), 21 deletions(-)
3 files changed, 95 insertions(+), 21 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2260,6 +2260,16 @@ static int futex_requeue(u32 __user *uad
@@ -2154,6 +2154,16 @@ static int futex_requeue(u32 __user *uad
*/
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
+ } else if (ret == -EAGAIN) {
+ /*
@@ -32,16 +33,16 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
} else if (ret) {
/*
* rt_mutex_start_proxy_lock() detected a
@@ -3315,7 +3325,7 @@ static int futex_wait_requeue_pi(u32 __u
@@ -3172,7 +3182,7 @@ static int futex_wait_requeue_pi(u32 __u
{
struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
+ struct futex_hash_bucket *hb, *hb2;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
int res, ret;
@@ -3367,20 +3377,55 @@ static int futex_wait_requeue_pi(u32 __u
@@ -3224,20 +3234,55 @@ static int futex_wait_requeue_pi(u32 __u
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -49,7 +50,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
- spin_unlock(&hb->lock);
- if (ret)
- goto out_put_keys;
- goto out;
+ /*
+ * On RT we must avoid races with requeue and trying to block
+ * on two mutexes (hb->lock and uaddr2's rtmutex) by
@@ -86,7 +87,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+ spin_unlock(&hb->lock);
+ if (ret)
+ goto out_put_keys;
+ goto out;
+ }
/*
@@ -108,7 +109,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/* Check if the requeue code acquired the second futex for us. */
if (!q.rt_waiter) {
@@ -3389,7 +3434,8 @@ static int futex_wait_requeue_pi(u32 __u
@@ -3246,14 +3291,16 @@ static int futex_wait_requeue_pi(u32 __u
* did a lock-steal - fix up the PI-state in that case.
*/
if (q.pi_state && (q.pi_state->owner != current)) {
@@ -116,18 +117,18 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
pi_state = q.pi_state;
@@ -3400,7 +3446,7 @@ static int futex_wait_requeue_pi(u32 __u
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
- spin_unlock(q.lock_ptr);
+ spin_unlock(&hb2->lock);
}
} else {
struct rt_mutex *pi_mutex;
@@ -3414,7 +3460,8 @@ static int futex_wait_requeue_pi(u32 __u
+
/*
* Adjust the return value. It's either -EFAULT or
* success (1) but the caller expects 0 for success.
@@ -3272,7 +3319,8 @@ static int futex_wait_requeue_pi(u32 __u
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
@@ -151,7 +152,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/*
* We can speed up the acquire/release, if there's no debugging state to be
* set up.
@@ -380,7 +385,8 @@ int max_lock_depth = 1024;
@@ -378,7 +383,8 @@ int max_lock_depth = 1024;
static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
{
@@ -161,7 +162,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
/*
@@ -516,7 +522,7 @@ static int rt_mutex_adjust_prio_chain(st
@@ -514,7 +520,7 @@ static int rt_mutex_adjust_prio_chain(st
* reached or the state of the chain has changed while we
* dropped the locks.
*/
@@ -170,7 +171,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
goto out_unlock_pi;
/*
@@ -950,6 +956,22 @@ static int task_blocks_on_rt_mutex(struc
@@ -947,6 +953,22 @@ static int task_blocks_on_rt_mutex(struc
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
@@ -193,7 +194,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
@@ -973,7 +995,7 @@ static int task_blocks_on_rt_mutex(struc
@@ -970,7 +992,7 @@ static int task_blocks_on_rt_mutex(struc
rt_mutex_enqueue_pi(owner, waiter);
rt_mutex_adjust_prio(owner);
@@ -202,7 +203,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
@@ -1069,7 +1091,7 @@ static void remove_waiter(struct rt_mute
@@ -1066,7 +1088,7 @@ static void remove_waiter(struct rt_mute
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
@@ -211,7 +212,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
lockdep_assert_held(&lock->wait_lock);
@@ -1095,7 +1117,8 @@ static void remove_waiter(struct rt_mute
@@ -1092,7 +1114,8 @@ static void remove_waiter(struct rt_mute
rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
@@ -221,7 +222,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
raw_spin_unlock(&owner->pi_lock);
@@ -1131,7 +1154,8 @@ void rt_mutex_adjust_pi(struct task_stru
@@ -1128,7 +1151,8 @@ void rt_mutex_adjust_pi(struct task_stru
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
@@ -233,7 +234,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -130,6 +130,8 @@ enum rtmutex_chainwalk {
@@ -125,6 +125,8 @@ enum rtmutex_chainwalk {
/*
* PI-futex support (proxy locking functions, etc.):
*/

View File

@@ -1,12 +1,11 @@
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 14 Jul 2015 14:26:34 +0200
Subject: futex: Fix bug on when a requeued RT task times out
Subject: [PATCH 06/22] futex: Fix bug on when a requeued RT task times out
Requeue with timeout causes a bug with PREEMPT_RT.
The bug comes from a timed-out condition.
TASK 1 TASK 2
------ ------
futex_wait_requeue_pi()
@@ -16,13 +15,12 @@ The bug comes from a timed out condition.
double_lock_hb();
raw_spin_lock(pi_lock);
if (current->pi_blocked_on) {
if (current->pi_blocked_on) {
} else {
current->pi_blocked_on = PI_WAKE_INPROGRESS;
run_spin_unlock(pi_lock);
spin_lock(hb->lock); <-- blocked!
plist_for_each_entry_safe(this) {
rt_mutex_start_proxy_lock();
task_blocks_on_rt_mutex();
@@ -45,7 +43,6 @@ Otherwise set it to a new flag PI_REQUEUE_INPROGRESS, which notifies
the proxy task that it is being requeued, and will handle things
appropriately.
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
@@ -65,7 +62,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
/*
@@ -1779,6 +1780,34 @@ int __rt_mutex_start_proxy_lock(struct r
@@ -1720,6 +1721,34 @@ int __rt_mutex_start_proxy_lock(struct r
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
@@ -102,7 +99,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
RT_MUTEX_FULL_CHAINWALK);
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -131,6 +131,7 @@ enum rtmutex_chainwalk {
@@ -126,6 +126,7 @@ enum rtmutex_chainwalk {
* PI-futex support (proxy locking functions, etc.):
*/
#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)

View File

@@ -1,13 +1,12 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 1 Apr 2017 12:50:59 +0200
Subject: [PATCH] rtmutex: Make lock_killable work
Subject: [PATCH 07/22] locking/rtmutex: Make lock_killable work
Locking an rt mutex killable does not work because signal handling is
restricted to TASK_INTERRUPTIBLE.
Use signal_pending_state() unconditionaly.
Use signal_pending_state() unconditionally.
Cc: stable-rt@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
@@ -16,7 +15,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1200,18 +1200,13 @@ static int __sched
@@ -1197,18 +1197,13 @@ static int __sched
if (try_to_take_rt_mutex(lock, current, waiter))
break;
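
From a caller's point of view the fix means a TASK_KILLABLE sleeper now
actually reacts to fatal signals; a hypothetical user:

	static int my_op(struct rt_mutex *m)
	{
		int ret = rt_mutex_lock_killable(m);

		if (ret)
			return ret;	/* -EINTR: fatal signal received */
		/* ... critical section ... */
		rt_mutex_unlock(m);
		return 0;
	}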

View File

@@ -1,6 +1,6 @@
Subject: spinlock: Split the lock types header
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 29 Jun 2011 19:34:01 +0200
Subject: [PATCH 08/22] locking/spinlock: Split the lock types header
Split raw_spinlock into its own file and the remaining spinlock_t into
its own non-RT header. The non-RT header will be replaced later by sleeping
@@ -8,11 +8,13 @@ spinlocks.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rwlock_types.h | 4 ++
include/linux/spinlock_types.h | 71 +-----------------------------------
include/linux/spinlock_types_nort.h | 33 ++++++++++++++++
include/linux/spinlock_types_raw.h | 55 +++++++++++++++++++++++++++
4 files changed, 94 insertions(+), 69 deletions(-)
include/linux/rwlock_types.h | 4 +
include/linux/spinlock_types.h | 87 ------------------------------------
include/linux/spinlock_types_nort.h | 39 ++++++++++++++++
include/linux/spinlock_types_raw.h | 65 ++++++++++++++++++++++++++
4 files changed, 110 insertions(+), 85 deletions(-)
create mode 100644 include/linux/spinlock_types_nort.h
create mode 100644 include/linux/spinlock_types_raw.h
--- a/include/linux/rwlock_types.h
+++ b/include/linux/rwlock_types.h
@@ -29,7 +31,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* and initializers
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -9,76 +9,9 @@
@@ -9,92 +9,9 @@
* Released under the General Public License (GPL).
*/
@@ -40,7 +42,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-#endif
+#include <linux/spinlock_types_raw.h>
-#include <linux/lockdep.h>
-#include <linux/lockdep_types.h>
-
-typedef struct raw_spinlock {
- arch_spinlock_t raw_lock;
@@ -58,8 +60,18 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-#define SPINLOCK_OWNER_INIT ((void *)-1L)
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
-# define RAW_SPIN_DEP_MAP_INIT(lockname) \
- .dep_map = { \
- .name = #lockname, \
- .wait_type_inner = LD_WAIT_SPIN, \
- }
-# define SPIN_DEP_MAP_INIT(lockname) \
- .dep_map = { \
- .name = #lockname, \
- .wait_type_inner = LD_WAIT_CONFIG, \
- }
-#else
-# define RAW_SPIN_DEP_MAP_INIT(lockname)
-# define SPIN_DEP_MAP_INIT(lockname)
-#endif
-
@@ -76,7 +88,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
- { \
- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
- SPIN_DEBUG_INIT(lockname) \
- SPIN_DEP_MAP_INIT(lockname) }
- RAW_SPIN_DEP_MAP_INIT(lockname) }
-
-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
@@ -97,11 +109,17 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
- };
-} spinlock_t;
-
-#define ___SPIN_LOCK_INITIALIZER(lockname) \
- { \
- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
- SPIN_DEBUG_INIT(lockname) \
- SPIN_DEP_MAP_INIT(lockname) }
-
-#define __SPIN_LOCK_INITIALIZER(lockname) \
- { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
- { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } }
-
-#define __SPIN_LOCK_UNLOCKED(lockname) \
- (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
- (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname)
-
-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+#include <linux/spinlock_types_nort.h>
@@ -110,7 +128,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- /dev/null
+++ b/include/linux/spinlock_types_nort.h
@@ -0,0 +1,33 @@
@@ -0,0 +1,39 @@
+#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
+#define __LINUX_SPINLOCK_TYPES_NORT_H
+
@@ -135,18 +153,24 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ };
+} spinlock_t;
+
+#define ___SPIN_LOCK_INITIALIZER(lockname) \
+{ \
+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
+ SPIN_DEBUG_INIT(lockname) \
+ SPIN_DEP_MAP_INIT(lockname) }
+
+#define __SPIN_LOCK_INITIALIZER(lockname) \
+ { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
+ { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } }
+
+#define __SPIN_LOCK_UNLOCKED(lockname) \
+ (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
+ (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname)
+
+#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
+
+#endif
--- /dev/null
+++ b/include/linux/spinlock_types_raw.h
@@ -0,0 +1,55 @@
@@ -0,0 +1,65 @@
+#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
+#define __LINUX_SPINLOCK_TYPES_RAW_H
+
@@ -158,7 +182,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+# include <linux/spinlock_types_up.h>
+#endif
+
+#include <linux/lockdep.h>
+#include <linux/lockdep_types.h>
+
+typedef struct raw_spinlock {
+ arch_spinlock_t raw_lock;
@@ -176,8 +200,18 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+#define SPINLOCK_OWNER_INIT ((void *)-1L)
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
+# define RAW_SPIN_DEP_MAP_INIT(lockname) \
+ .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_SPIN, \
+ }
+# define SPIN_DEP_MAP_INIT(lockname) \
+ .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_CONFIG, \
+ }
+#else
+# define RAW_SPIN_DEP_MAP_INIT(lockname)
+# define SPIN_DEP_MAP_INIT(lockname)
+#endif
+
@@ -191,14 +225,14 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+#endif
+
+#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
+ { \
+{ \
+ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
+ SPIN_DEBUG_INIT(lockname) \
+ SPIN_DEP_MAP_INIT(lockname) }
+ RAW_SPIN_DEP_MAP_INIT(lockname) }
+
+#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
+ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
+
+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
+#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
+
+#endif
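
The two flavours keep distinct lockdep wait types, which lets the wait-type
checks catch invalid nesting; illustrative (hw_lock/obj_lock are made up):

	static DEFINE_RAW_SPINLOCK(hw_lock);	/* LD_WAIT_SPIN: always spins */
	static DEFINE_SPINLOCK(obj_lock);	/* LD_WAIT_CONFIG: sleeps on RT */

	raw_spin_lock(&hw_lock);
	spin_lock(&obj_lock);	/* flagged with CONFIG_PROVE_RAW_LOCK_NESTING:
				   a lock that may sleep on RT taken inside an
				   always-spinning lock */
	spin_unlock(&obj_lock);
	raw_spin_unlock(&hw_lock);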

View File

@@ -1,6 +1,6 @@
Subject: rtmutex: Avoid include hell
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 29 Jun 2011 20:06:39 +0200
Subject: [PATCH 09/22] locking/rtmutex: Avoid include hell
Include only the required raw types. This avoids pulling in the
complete spinlock header which in turn requires rtmutex.h at some point.

View File

@@ -0,0 +1,26 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 14 Aug 2020 16:55:25 +0200
Subject: [PATCH 10/22] lockdep: Reduce header files in debug_locks.h
The inclusion of printk.h leads to a circular dependency if spinlock_t is
based on rt_mutex.
Include only atomic.h (xchg()) and cache.h (__read_mostly).
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/debug_locks.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -3,8 +3,7 @@
#define __LINUX_DEBUG_LOCKING_H
#include <linux/atomic.h>
-#include <linux/bug.h>
-#include <linux/printk.h>
+#include <linux/cache.h>
struct task_struct;

View File

@@ -0,0 +1,108 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 14 Aug 2020 17:08:41 +0200
Subject: [PATCH 11/22] locking: split out the rbtree definition
rtmutex.h needs the definition of rb_root_cached. By including kernel.h
we will get to spinlock.h, which requires rtmutex.h again.
Split out the required struct definition and move it into its own header
file which can be included by rtmutex.h.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rbtree.h | 27 +--------------------------
include/linux/rbtree_type.h | 31 +++++++++++++++++++++++++++++++
include/linux/rtmutex.h | 2 +-
3 files changed, 33 insertions(+), 27 deletions(-)
create mode 100644 include/linux/rbtree_type.h
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -19,19 +19,9 @@
#include <linux/kernel.h>
#include <linux/stddef.h>
+#include <linux/rbtree_type.h>
#include <linux/rcupdate.h>
-struct rb_node {
- unsigned long __rb_parent_color;
- struct rb_node *rb_right;
- struct rb_node *rb_left;
-} __attribute__((aligned(sizeof(long))));
- /* The alignment might seem pointless, but allegedly CRIS needs it */
-
-struct rb_root {
- struct rb_node *rb_node;
-};
-
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define RB_ROOT (struct rb_root) { NULL, }
@@ -112,21 +102,6 @@ static inline void rb_link_node_rcu(stru
typeof(*pos), field); 1; }); \
pos = n)
-/*
- * Leftmost-cached rbtrees.
- *
- * We do not cache the rightmost node based on footprint
- * size vs number of potential users that could benefit
- * from O(1) rb_last(). Just not worth it, users that want
- * this feature can always implement the logic explicitly.
- * Furthermore, users that want to cache both pointers may
- * find it a bit asymmetric, but that's ok.
- */
-struct rb_root_cached {
- struct rb_root rb_root;
- struct rb_node *rb_leftmost;
-};
-
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
/* Same as rb_first(), but O(1) */
--- /dev/null
+++ b/include/linux/rbtree_type.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_RBTREE_TYPE_H
+#define _LINUX_RBTREE_TYPE_H
+
+struct rb_node {
+ unsigned long __rb_parent_color;
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+/* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
+
+#endif
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -14,7 +14,7 @@
#define __LINUX_RT_MUTEX_H
#include <linux/linkage.h>
-#include <linux/rbtree.h>
+#include <linux/rbtree_type.h>
#include <linux/spinlock_types_raw.h>
extern int max_lock_depth; /* for sysctl */
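
For context, rb_root_cached (whose definition moves here) tracks the
leftmost node so that the first entry is available in O(1); a minimal
sketch (struct item and the insertion walk are made up, the rbtree calls
are real):

	struct item {
		struct rb_node node;
		u64 key;
	};

	struct rb_root_cached tree = RB_ROOT_CACHED;

	/* 'leftmost' comes from the usual insertion walk and tells the
	 * helper whether the new node became the smallest one: */
	rb_insert_color_cached(&it->node, &tree, leftmost);

	struct rb_node *first = rb_first_cached(&tree);	/* O(1) */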

View File

@@ -1,6 +1,6 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 16:14:22 +0200
Subject: rtmutex: Provide rt_mutex_slowlock_locked()
Subject: [PATCH 12/22] locking/rtmutex: Provide rt_mutex_slowlock_locked()
This is the inner-part of rt_mutex_slowlock(), required for rwsem-rt.
@@ -13,7 +13,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1243,35 +1243,16 @@ static void rt_mutex_handle_deadlock(int
@@ -1234,35 +1234,16 @@ static void rt_mutex_handle_deadlock(int
}
}
@@ -55,7 +55,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
set_current_state(state);
@@ -1279,16 +1260,16 @@ rt_mutex_slowlock(struct rt_mutex *lock,
@@ -1270,16 +1251,16 @@ rt_mutex_slowlock(struct rt_mutex *lock,
if (unlikely(timeout))
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
@@ -76,7 +76,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/*
@@ -1296,6 +1277,34 @@ rt_mutex_slowlock(struct rt_mutex *lock,
@@ -1287,6 +1268,34 @@ rt_mutex_slowlock(struct rt_mutex *lock,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
@@ -121,7 +121,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* This is the control structure for tasks blocked on a rt_mutex,
@@ -159,6 +160,12 @@ extern bool __rt_mutex_futex_unlock(stru
@@ -153,6 +154,12 @@ extern bool __rt_mutex_futex_unlock(stru
struct wake_q_head *wqh);
extern void rt_mutex_postunlock(struct wake_q_head *wake_q);

View File

@@ -1,20 +1,20 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 16:36:39 +0200
Subject: rtmutex: export lockdep-less version of rt_mutex's lock,
trylock and unlock
Subject: [PATCH 13/22] locking/rtmutex: export lockdep-less version of
rt_mutex's lock, trylock and unlock
Required for lock implementation ontop of rtmutex.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/locking/rtmutex.c | 59 ++++++++++++++++++++++++++--------------
kernel/locking/rtmutex.c | 54 ++++++++++++++++++++++++++++------------
kernel/locking/rtmutex_common.h | 3 ++
2 files changed, 42 insertions(+), 20 deletions(-)
2 files changed, 41 insertions(+), 16 deletions(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1493,12 +1493,33 @@ rt_mutex_fastunlock(struct rt_mutex *loc
@@ -1469,12 +1469,33 @@ rt_mutex_fastunlock(struct rt_mutex *loc
rt_mutex_postunlock(&wake_q);
}
@@ -40,7 +40,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
+ ret = __rt_mutex_lock_state(lock, state);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+
@@ -50,7 +50,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -1539,16 +1560,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
@@ -1515,16 +1536,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
@@ -61,31 +61,16 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
- if (ret)
- mutex_release(&lock->dep_map, 1, _RET_IP_);
- mutex_release(&lock->dep_map, _RET_IP_);
-
- return ret;
+ return rt_mutex_lock_state(lock, 0, TASK_INTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
@@ -1574,13 +1586,10 @@ int __sched __rt_mutex_futex_trylock(str
* Returns:
* 0 on success
* -EINTR when interrupted by a signal
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
{
- might_sleep();
-
- return rt_mutex_fastlock(lock, TASK_KILLABLE, rt_mutex_slowlock);
+ return rt_mutex_lock_state(lock, 0, TASK_KILLABLE);
@@ -1541,6 +1553,14 @@ int __sched __rt_mutex_futex_trylock(str
return __rt_mutex_slowtrylock(lock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
@@ -1615,6 +1624,14 @@ rt_mutex_timed_lock(struct rt_mutex *loc
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+int __sched __rt_mutex_trylock(struct rt_mutex *lock)
+{
@@ -98,7 +83,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/**
* rt_mutex_trylock - try to lock a rt_mutex
*
@@ -1630,10 +1647,7 @@ int __sched rt_mutex_trylock(struct rt_m
@@ -1556,10 +1576,7 @@ int __sched rt_mutex_trylock(struct rt_m
{
int ret;
@@ -110,7 +95,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (ret)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
@@ -1641,6 +1655,11 @@ int __sched rt_mutex_trylock(struct rt_m
@@ -1567,6 +1584,11 @@ int __sched rt_mutex_trylock(struct rt_m
}
EXPORT_SYMBOL_GPL(rt_mutex_trylock);
@@ -124,7 +109,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -162,6 +162,9 @@ extern bool __rt_mutex_futex_unlock(stru
@@ -156,6 +156,9 @@ extern bool __rt_mutex_futex_unlock(stru
extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
/* RW semaphore special interface */

View File

@@ -1,6 +1,7 @@
Subject: sched: Add saved_state for tasks blocked on sleeping locks
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 25 Jun 2011 09:21:04 +0200
Subject: [PATCH 14/22] sched: Add saved_state for tasks blocked on sleeping
locks
Spinlocks are state preserving in !RT. RT changes the state when a
task gets blocked on a lock. So we need to remember the state before
@@ -11,13 +12,13 @@ sleep is done, the saved state is restored.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched.h | 3 +++
kernel/sched/core.c | 42 +++++++++++++++++++++++++++++++++++++++---
kernel/sched/core.c | 34 ++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 1 +
3 files changed, 43 insertions(+), 3 deletions(-)
3 files changed, 36 insertions(+), 2 deletions(-)
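
A sketch of the mechanism (simplified; field and lock names as in the hunks below): before an rtmutex-based spinlock blocks, the current task state is parked in saved_state and restored once the lock is acquired, so a surrounding wait loop still sees the state it set.

/* Blocking on a "sleeping spinlock" (simplified): */
raw_spin_lock_irqsave(&current->pi_lock, flags);
current->saved_state = current->state;	/* e.g. TASK_INTERRUPTIBLE */
current->state = TASK_UNINTERRUPTIBLE;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);

/* ... sleep until the lock is released ... */

raw_spin_lock_irqsave(&current->pi_lock, flags);
current->state = current->saved_state;	/* restore the wait-loop state */
current->saved_state = TASK_RUNNING;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);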
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -631,6 +631,8 @@ struct task_struct {
@@ -655,6 +655,8 @@ struct task_struct {
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
@@ -26,7 +27,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/*
* This begins the randomizable portion of task_struct. Only
@@ -1679,6 +1681,7 @@ extern struct task_struct *find_get_task
@@ -1777,6 +1779,7 @@ extern struct task_struct *find_get_task
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
@@ -36,30 +37,20 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
#ifdef CONFIG_SMP
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2524,6 +2524,8 @@ try_to_wake_up(struct task_struct *p, un
@@ -3316,7 +3316,7 @@ try_to_wake_up(struct task_struct *p, un
int cpu, success = 0;
preempt_disable();
+
+#ifndef CONFIG_PREEMPT_RT
if (p == current) {
- if (p == current) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -2546,7 +2548,7 @@ try_to_wake_up(struct task_struct *p, un
trace_sched_wakeup(p);
goto out;
}
-
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -2555,8 +2557,27 @@ try_to_wake_up(struct task_struct *p, un
* == smp_processor_id()'. Together this means we can special
@@ -3346,8 +3346,26 @@ try_to_wake_up(struct task_struct *p, un
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(p->state & state))
- goto unlock;
+ if (!(p->state & state)) {
+ /*
+ * The task might be running due to a spinlock sleeper
@@ -72,8 +63,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ success = 1;
+ }
+ }
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ goto out_nostat;
goto unlock;
+ }
+ /*
+ * If this is a regular wakeup, then we can unconditionally
@@ -84,20 +74,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
trace_sched_waking(p);
@@ -2648,9 +2669,12 @@ try_to_wake_up(struct task_struct *p, un
ttwu_queue(p, cpu, wake_flags);
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+#ifndef CONFIG_PREEMPT_RT
out:
+#endif
if (success)
ttwu_stat(p, cpu, wake_flags);
+out_nostat:
preempt_enable();
return success;
@@ -2673,6 +2697,18 @@ int wake_up_process(struct task_struct *
@@ -3536,6 +3554,18 @@ int wake_up_process(struct task_struct *
}
EXPORT_SYMBOL(wake_up_process);
@@ -118,11 +95,11 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
return try_to_wake_up(p, state, 0);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1644,6 +1644,7 @@ static inline int task_on_rq_migrating(s
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
+#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
@@ -1751,6 +1751,7 @@ static inline int task_on_rq_migrating(s
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
+#define WF_LOCK_SLEEPER 0x80 /* Wakeup spinlock "sleeper" */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);

View File

@@ -1,28 +1,29 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 17:11:19 +0200
Subject: rtmutex: add sleeping lock implementation
Subject: [PATCH 15/22] locking/rtmutex: add sleeping lock implementation
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kernel.h | 5
include/linux/rtmutex.h | 21 +
include/linux/sched.h | 8
include/linux/preempt.h | 4
include/linux/rtmutex.h | 19 +
include/linux/sched.h | 7
include/linux/sched/wake_q.h | 13 +
include/linux/spinlock_rt.h | 156 +++++++++++++
include/linux/spinlock_types_rt.h | 48 ++++
include/linux/spinlock_rt.h | 155 +++++++++++++
include/linux/spinlock_types_rt.h | 38 +++
kernel/fork.c | 1
kernel/futex.c | 11
kernel/locking/rtmutex.c | 436 ++++++++++++++++++++++++++++++++++----
kernel/futex.c | 10
kernel/locking/rtmutex.c | 444 ++++++++++++++++++++++++++++++++++----
kernel/locking/rtmutex_common.h | 14 -
kernel/sched/core.c | 39 ++-
11 files changed, 694 insertions(+), 58 deletions(-)
12 files changed, 694 insertions(+), 55 deletions(-)
create mode 100644 include/linux/spinlock_rt.h
create mode 100644 include/linux/spinlock_types_rt.h
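
At a glance, the mapping this patch establishes (a simplified sketch; the full definitions are in the spinlock_rt.h and spinlock_types_rt.h hunks below): spinlock_t wraps an rtmutex, the lock operations keep preemption enabled and only pin the task to its CPU, and the _irq/_irqsave variants no longer disable interrupts since the lock may sleep anyway.

typedef struct spinlock {
	struct rt_mutex		lock;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	dep_map;
#endif
} spinlock_t;

#define spin_lock(lock)		rt_spin_lock(lock)
#define spin_unlock(lock)	rt_spin_unlock(lock)
#define spin_lock_irq(lock)	spin_lock(lock)
#define spin_unlock_irq(lock)	spin_unlock(lock)
#define spin_lock_irqsave(lock, flags)		\
	do {					\
		typecheck(unsigned long, flags);\
		flags = 0;			\
		spin_lock(lock);		\
	} while (0)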
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -227,6 +227,10 @@ extern void __cant_sleep(const char *fil
@@ -107,6 +107,10 @@ extern void __cant_migrate(const char *f
*/
# define might_sleep() \
do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
@@ -33,23 +34,31 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/**
* cant_sleep - annotation for functions that cannot sleep
*
@@ -258,6 +262,7 @@ extern void __cant_sleep(const char *fil
@@ -150,6 +154,7 @@ extern void __cant_migrate(const char *f
static inline void __might_sleep(const char *file, int line,
int preempt_offset) { }
# define might_sleep() do { might_resched(); } while (0)
+# define might_sleep_no_state_check() do { might_resched(); } while (0)
# define cant_sleep() do { } while (0)
# define cant_migrate() do { } while (0)
# define sched_annotate_sleep() do { } while (0)
# define non_block_start() do { } while (0)
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -121,7 +121,11 @@
/*
* The preempt_count offset after spin_lock()
*/
+#if !defined(CONFIG_PREEMPT_RT)
#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
+#else
+#define PREEMPT_LOCK_OFFSET 0
+#endif
/*
* The preempt_count offset needed for things like:
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -14,11 +14,15 @@
#define __LINUX_RT_MUTEX_H
#include <linux/linkage.h>
-#include <linux/rbtree.h>
#include <linux/spinlock_types_raw.h>
+#include <linux/rbtree.h>
@@ -19,6 +19,10 @@
extern int max_lock_depth; /* for sysctl */
@@ -60,46 +69,40 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/**
* The rt_mutex structure
*
@@ -31,8 +35,8 @@ struct rt_mutex {
@@ -31,6 +35,7 @@ struct rt_mutex {
raw_spinlock_t wait_lock;
struct rb_root_cached waiters;
struct task_struct *owner;
-#ifdef CONFIG_DEBUG_RT_MUTEXES
int save_state;
+#ifdef CONFIG_DEBUG_RT_MUTEXES
const char *name, *file;
int line;
void *magic;
@@ -82,16 +86,23 @@ do { \
+ int save_state;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -67,11 +72,19 @@ do { \
#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
#endif
-#define __RT_MUTEX_INITIALIZER(mutexname) \
- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
+#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
, .waiters = RB_ROOT_CACHED \
, .owner = NULL \
__DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
+ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
+
+#define __RT_MUTEX_INITIALIZER(mutexname) \
+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
+ , .save_state = 0 }
+
+#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
+ , .save_state = 1 }
#define DEFINE_RT_MUTEX(mutexname) \
struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
+#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
+ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
+ , .save_state = 1 }
+
/**
* rt_mutex_is_locked - is the mutex locked
* @lock: the mutex to be queried
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -140,6 +140,9 @@ struct task_group;
@@ -141,6 +141,9 @@ struct io_uring_task;
smp_store_mb(current->state, (state_value)); \
} while (0)
@@ -109,15 +112,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#define set_special_state(state_value) \
do { \
unsigned long flags; /* may shadow */ \
@@ -149,6 +152,7 @@ struct task_group;
current->state = (state_value); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
+
#else
/*
* set_current_state() includes a barrier so that the write of current->state
@@ -193,6 +197,9 @@ struct task_group;
@@ -194,6 +197,9 @@ struct io_uring_task;
#define set_current_state(state_value) \
smp_store_mb(current->state, (state_value))
@@ -127,7 +122,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* set_special_state() should be used for those states when the blocking task
* can not use the regular condition based wait-loop. In that case we must
@@ -950,6 +957,7 @@ struct task_struct {
@@ -1015,6 +1021,7 @@ struct task_struct {
raw_spinlock_t pi_lock;
struct wake_q_node wake_q;
@@ -158,7 +153,8 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#endif /* _LINUX_SCHED_WAKE_Q_H */
--- /dev/null
+++ b/include/linux/spinlock_rt.h
@@ -0,0 +1,156 @@
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_SPINLOCK_RT_H
+#define __LINUX_SPINLOCK_RT_H
+
@@ -180,10 +176,10 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+} while (0)
+
+extern void __lockfunc rt_spin_lock(spinlock_t *lock);
+extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
+extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
+extern void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock);
+extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
+extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
+extern void __lockfunc rt_spin_lock_unlock(spinlock_t *lock);
+extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
+extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
@@ -229,6 +225,12 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ rt_spin_lock_nested(lock, subclass); \
+ } while (0)
+
+# define spin_lock_nest_lock(lock, subclass) \
+ do { \
+ typecheck(struct lockdep_map *, &(subclass)->dep_map); \
+ rt_spin_lock_nest_lock(lock, &(subclass)->dep_map); \
+ } while (0)
+
+# define spin_lock_irqsave_nested(lock, flags, subclass) \
+ do { \
+ typecheck(unsigned long, flags); \
@@ -236,14 +238,15 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ rt_spin_lock_nested(lock, subclass); \
+ } while (0)
+#else
+# define spin_lock_nested(lock, subclass) spin_lock(lock)
+# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock)
+# define spin_lock_nested(lock, subclass) spin_lock(((void)(subclass), (lock)))
+# define spin_lock_nest_lock(lock, subclass) spin_lock(((void)(subclass), (lock)))
+# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(((void)(subclass), (lock)))
+
+# define spin_lock_irqsave_nested(lock, flags, subclass) \
+ do { \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ spin_lock(lock); \
+ spin_lock(((void)(subclass), (lock))); \
+ } while (0)
+#endif
+
@@ -254,20 +257,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ spin_lock(lock); \
+ } while (0)
+
+static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
+{
+ unsigned long flags = 0;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ flags = rt_spin_lock_trace_flags(lock);
+#else
+ spin_lock(lock); /* lock_local */
+#endif
+ return flags;
+}
+
+/* FIXME: we need rt_spin_lock_nest_lock */
+#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
+
+#define spin_unlock(lock) rt_spin_unlock(lock)
+
+#define spin_unlock_bh(lock) \
@@ -288,10 +277,15 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
+#define spin_trylock_irq(lock) spin_trylock(lock)
+
+#define spin_trylock_irqsave(lock, flags) \
+ rt_spin_trylock_irqsave(lock, &(flags))
+
+#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
+#define spin_trylock_irqsave(lock, flags) \
+({ \
+ int __locked; \
+ \
+ typecheck(unsigned long, flags); \
+ flags = 0; \
+ __locked = spin_trylock(lock); \
+ __locked; \
+})
+
+#ifdef CONFIG_GENERIC_LOCKBREAK
+# define spin_is_contended(lock) ((lock)->break_lock)
@@ -317,7 +311,8 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
--- /dev/null
+++ b/include/linux/spinlock_types_rt.h
@@ -0,0 +1,48 @@
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_SPINLOCK_TYPES_RT_H
+#define __LINUX_SPINLOCK_TYPES_RT_H
+
@@ -338,22 +333,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
+} spinlock_t;
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# define __RT_SPIN_INITIALIZER(name) \
+#define __RT_SPIN_INITIALIZER(name) \
+ { \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
+ .save_state = 1, \
+ .file = __FILE__, \
+ .line = __LINE__ , \
+ }
+#else
+# define __RT_SPIN_INITIALIZER(name) \
+ { \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
+ .save_state = 1, \
+ }
+#endif
+
+/*
+.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
+*/
@@ -368,7 +352,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -950,6 +950,7 @@ static struct task_struct *dup_task_stru
@@ -927,6 +927,7 @@ static struct task_struct *dup_task_stru
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
@@ -378,7 +362,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1573,6 +1573,7 @@ static int wake_futex_pi(u32 __user *uad
@@ -1497,6 +1497,7 @@ static int wake_futex_pi(u32 __user *uad
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
@@ -386,14 +370,15 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
int ret = 0;
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
@@ -1632,13 +1633,13 @@ static int wake_futex_pi(u32 __user *uad
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
@@ -1546,14 +1547,15 @@ static int wake_futex_pi(u32 __user *uad
* not fail.
*/
pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+ &wake_sleeper_q);
}
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+ &wake_sleeper_q);
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
@@ -403,7 +388,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
return ret;
}
@@ -2980,7 +2981,7 @@ static int futex_lock_pi(u32 __user *uad
@@ -2857,7 +2859,7 @@ static int futex_lock_pi(u32 __user *uad
goto no_block;
}
@@ -412,7 +397,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
@@ -3348,7 +3349,7 @@ static int futex_wait_requeue_pi(u32 __u
@@ -3203,7 +3205,7 @@ static int futex_wait_requeue_pi(u32 __u
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
@@ -435,7 +420,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*
* See Documentation/locking/rt-mutex-design.rst for details.
*/
@@ -235,7 +240,7 @@ static inline bool unlock_rt_mutex_safe(
@@ -233,7 +238,7 @@ static inline bool unlock_rt_mutex_safe(
* Only use with rt_mutex_waiter_{less,equal}()
*/
#define task_to_waiter(p) \
@@ -444,7 +429,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
@@ -275,6 +280,27 @@ rt_mutex_waiter_equal(struct rt_mutex_wa
@@ -273,6 +278,27 @@ rt_mutex_waiter_equal(struct rt_mutex_wa
return 1;
}
@@ -472,7 +457,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -379,6 +405,14 @@ static bool rt_mutex_cond_detect_deadloc
@@ -377,6 +403,14 @@ static bool rt_mutex_cond_detect_deadloc
return debug_rt_mutex_detect_deadlock(waiter, chwalk);
}
@@ -487,7 +472,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Max number of times we'll walk the boosting chain:
*/
@@ -703,13 +737,16 @@ static int rt_mutex_adjust_prio_chain(st
@@ -700,13 +734,16 @@ static int rt_mutex_adjust_prio_chain(st
* follow here. This is the end of the chain we are walking.
*/
if (!rt_mutex_owner(lock)) {
@@ -506,7 +491,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
@@ -810,9 +847,11 @@ static int rt_mutex_adjust_prio_chain(st
@@ -807,9 +844,11 @@ static int rt_mutex_adjust_prio_chain(st
* @task: The task which wants to acquire the lock
* @waiter: The waiter that is queued to the lock's wait tree if the
* callsite called task_blocked_on_lock(), otherwise NULL
@@ -520,7 +505,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
lockdep_assert_held(&lock->wait_lock);
@@ -848,12 +887,11 @@ static int try_to_take_rt_mutex(struct r
@@ -845,12 +884,11 @@ static int try_to_take_rt_mutex(struct r
*/
if (waiter) {
/*
@@ -536,7 +521,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* We can acquire the lock. Remove the waiter from the
* lock waiters tree.
@@ -871,14 +909,12 @@ static int try_to_take_rt_mutex(struct r
@@ -868,14 +906,12 @@ static int try_to_take_rt_mutex(struct r
*/
if (rt_mutex_has_waiters(lock)) {
/*
@@ -555,7 +540,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* The current top waiter stays enqueued. We
* don't have to change anything in the lock
@@ -925,6 +961,296 @@ static int try_to_take_rt_mutex(struct r
@@ -922,6 +958,289 @@ static int try_to_take_rt_mutex(struct r
return 1;
}
@@ -666,8 +651,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+ debug_rt_mutex_print_deadlock(waiter);
+
+ if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
+ schedule();
+
@@ -736,9 +719,9 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+void __lockfunc rt_spin_lock(spinlock_t *lock)
+{
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_spin_lock);
+
@@ -750,19 +733,28 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ migrate_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_spin_lock_nested);
+
+void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_spin_lock_nest_lock);
+#endif
+
+void __lockfunc rt_spin_unlock(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+ spin_release(&lock->dep_map, _RET_IP_);
+ migrate_enable();
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(rt_spin_unlock);
+
@@ -777,23 +769,22 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ * (like raw spinlocks do), we lock and unlock, to force the kernel to
+ * schedule if there's contention:
+ */
+void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
+void __lockfunc rt_spin_lock_unlock(spinlock_t *lock)
+{
+ spin_lock(lock);
+ spin_unlock(lock);
+}
+EXPORT_SYMBOL(rt_spin_unlock_wait);
+EXPORT_SYMBOL(rt_spin_lock_unlock);
+
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+ int ret;
+
+ migrate_disable();
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret)
+ if (ret) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock);
@@ -805,27 +796,14 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ local_bh_disable();
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ } else
+ migrate_disable();
+ } else {
+ local_bh_enable();
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+ int ret;
+
+ *flags = 0;
+ ret = __rt_mutex_trylock(&lock->lock);
+ if (ret) {
+ migrate_disable();
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_spin_trylock_irqsave);
+EXPORT_SYMBOL(rt_spin_trylock_bh);
+
+void
+__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key)
@@ -852,7 +830,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Task blocks on lock.
*
@@ -1038,6 +1364,7 @@ static int task_blocks_on_rt_mutex(struc
@@ -1035,6 +1354,7 @@ static int task_blocks_on_rt_mutex(struc
* Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
@@ -860,7 +838,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
@@ -1077,7 +1404,10 @@ static void mark_wakeup_next_waiter(stru
@@ -1074,7 +1394,10 @@ static void mark_wakeup_next_waiter(stru
* Pairs with preempt_enable() in rt_mutex_postunlock();
*/
preempt_disable();
@@ -872,7 +850,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
raw_spin_unlock(&current->pi_lock);
}
@@ -1161,21 +1491,22 @@ void rt_mutex_adjust_pi(struct task_stru
@@ -1158,21 +1481,22 @@ void rt_mutex_adjust_pi(struct task_stru
return;
}
next_lock = waiter->lock;
@@ -897,7 +875,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/**
@@ -1292,7 +1623,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
@@ -1283,7 +1607,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
unsigned long flags;
int ret = 0;
@@ -906,7 +884,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1365,7 +1696,8 @@ static inline int rt_mutex_slowtrylock(s
@@ -1356,7 +1680,8 @@ static inline int rt_mutex_slowtrylock(s
* Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
@@ -916,7 +894,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
unsigned long flags;
@@ -1419,7 +1751,7 @@ static bool __sched rt_mutex_slowunlock(
@@ -1410,7 +1735,7 @@ static bool __sched rt_mutex_slowunlock(
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
@@ -925,7 +903,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true; /* call rt_mutex_postunlock() */
@@ -1471,9 +1803,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
@@ -1447,9 +1772,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
/*
* Performs the wakeup of the top-waiter and re-enables preemption.
*/
@@ -938,7 +916,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* Pairs with preempt_disable() in rt_mutex_slowunlock() */
preempt_enable();
@@ -1482,15 +1816,17 @@ void rt_mutex_postunlock(struct wake_q_h
@@ -1458,15 +1785,17 @@ void rt_mutex_postunlock(struct wake_q_h
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -959,10 +937,10 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
@@ -1668,16 +2004,13 @@ void __sched __rt_mutex_unlock(struct rt
@@ -1597,16 +1926,13 @@ void __sched __rt_mutex_unlock(struct rt
void __sched rt_mutex_unlock(struct rt_mutex *lock)
{
mutex_release(&lock->dep_map, 1, _RET_IP_);
mutex_release(&lock->dep_map, _RET_IP_);
- rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+ __rt_mutex_unlock(lock);
}
@@ -980,7 +958,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
lockdep_assert_held(&lock->wait_lock);
@@ -1694,23 +2027,35 @@ bool __sched __rt_mutex_futex_unlock(str
@@ -1623,23 +1949,35 @@ bool __sched __rt_mutex_futex_unlock(str
* avoid inversion prior to the wakeup. preempt_disable()
* therein pairs with rt_mutex_postunlock().
*/
@@ -1019,7 +997,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/**
@@ -1749,7 +2094,7 @@ void __rt_mutex_init(struct rt_mutex *lo
@@ -1675,7 +2013,7 @@ void __rt_mutex_init(struct rt_mutex *lo
if (name && key)
debug_rt_mutex_init(lock, name, key);
}
@@ -1028,18 +1006,28 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/**
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
@@ -1944,6 +2289,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
@@ -1695,6 +2033,14 @@ void rt_mutex_init_proxy_locked(struct r
struct task_struct *proxy_owner)
{
+ struct task_struct *tsk = current;
int ret;
__rt_mutex_init(lock, NULL, NULL);
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /*
+ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI are
+ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping
+ * lock.
+ */
+ raw_spin_lock_init(&lock->wait_lock);
+#endif
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
}
@@ -1717,6 +2063,26 @@ void rt_mutex_proxy_unlock(struct rt_mut
rt_mutex_set_owner(lock, NULL);
}
raw_spin_lock_irq(&lock->wait_lock);
@@ -1955,6 +2301,24 @@ int rt_mutex_wait_proxy_lock(struct rt_m
* have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+static void fixup_rt_mutex_blocked(struct rt_mutex *lock)
+{
+ struct task_struct *tsk = current;
+ /*
+ * RT has a problem here when the wait got interrupted by a timeout
+ * or a signal. task->pi_blocked_on is still set. The task must
@@ -1052,35 +1040,54 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ * boosting chain of the rtmutex. That's correct because the task
+ * is no longer blocked on it.
+ */
+ if (ret) {
+ raw_spin_lock(&tsk->pi_lock);
+ tsk->pi_blocked_on = NULL;
+ raw_spin_unlock(&tsk->pi_lock);
+ }
+ raw_spin_lock(&tsk->pi_lock);
+ tsk->pi_blocked_on = NULL;
+ raw_spin_unlock(&tsk->pi_lock);
+}
+
/**
* __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
* @lock: the rt_mutex to take
@@ -1789,6 +2155,9 @@ int __rt_mutex_start_proxy_lock(struct r
ret = 0;
}
+ if (ret)
+ fixup_rt_mutex_blocked(lock);
+
return ret;
}
@@ -1878,6 +2247,9 @@ int rt_mutex_wait_proxy_lock(struct rt_m
* have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+ if (ret)
+ fixup_rt_mutex_blocked(lock);
+
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -30,6 +30,7 @@ struct rt_mutex_waiter {
struct rb_node pi_tree_entry;
@@ -31,6 +31,7 @@ struct rt_mutex_waiter {
struct task_struct *task;
struct rt_mutex *lock;
int prio;
+ bool savestate;
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
struct pid *deadlock_task_pid;
@@ -139,7 +140,7 @@ extern void rt_mutex_init_proxy_locked(s
u64 deadline;
};
@@ -133,7 +134,7 @@ extern struct task_struct *rt_mutex_next
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -157,9 +158,12 @@ extern int __rt_mutex_futex_trylock(stru
@@ -151,9 +152,12 @@ extern int __rt_mutex_futex_trylock(stru
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
@@ -1095,7 +1102,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* RW semaphore special interface */
extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
@@ -169,6 +173,10 @@ int __sched rt_mutex_slowlock_locked(str
@@ -163,6 +167,10 @@ int __sched rt_mutex_slowlock_locked(str
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk,
struct rt_mutex_waiter *waiter);
@@ -1108,7 +1115,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
# include "rtmutex-debug.h"
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -414,9 +414,15 @@ static bool set_nr_if_polling(struct tas
@@ -502,9 +502,15 @@ static bool set_nr_if_polling(struct tas
#endif
#endif
@@ -1126,7 +1133,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Atomically grab the task, if ->wake_q is !nil already it means
@@ -452,7 +458,13 @@ static bool __wake_q_add(struct wake_q_h
@@ -540,7 +546,13 @@ static bool __wake_q_add(struct wake_q_h
*/
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
@@ -1141,7 +1148,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
get_task_struct(task);
}
@@ -475,28 +487,39 @@ void wake_q_add(struct wake_q_head *head
@@ -563,28 +575,39 @@ void wake_q_add(struct wake_q_head *head
*/
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{

View File

@@ -1,10 +1,12 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed 02 Dec 2015 11:34:07 +0100
Subject: rtmutex: trylock is okay on -RT
Date: Wed, 2 Dec 2015 11:34:07 +0100
Subject: [PATCH 16/22] locking/rtmutex: Allow rt_mutex_trylock() on PREEMPT_RT
non-RT kernel could deadlock on rt_mutex_trylock() in softirq context. On
-RT we don't run softirqs in IRQ context but in thread context so it is
not a issue here.
A non-PREEMPT_RT kernel can deadlock on rt_mutex_trylock() in softirq
context.
On PREEMPT_RT the softirq context is handled in thread context. This
avoids the deadlock in the slow path and PI-boosting will be done on the
correct thread.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
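
The shape of the change (a sketch consistent with the hunk below; on PREEMPT_RT only hard interrupt and NMI context are excluded, since softirqs run in thread context there):

int __sched __rt_mutex_trylock(struct rt_mutex *lock)
{
#ifdef CONFIG_PREEMPT_RT
	if (WARN_ON_ONCE(in_irq() || in_nmi()))
#else
	if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
#endif
		return 0;

	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
}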
@@ -13,7 +15,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1962,7 +1962,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1884,7 +1884,11 @@ int __sched __rt_mutex_futex_trylock(str
int __sched __rt_mutex_trylock(struct rt_mutex *lock)
{

View File

@@ -1,19 +1,21 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 17:17:03 +0200
Subject: rtmutex: add mutex implementation based on rtmutex
Subject: [PATCH 17/22] locking/rtmutex: add mutex implementation based on
rtmutex
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/mutex_rt.h | 130 ++++++++++++++++++++++++++
kernel/locking/mutex-rt.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 353 insertions(+)
kernel/locking/mutex-rt.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 354 insertions(+)
create mode 100644 include/linux/mutex_rt.h
create mode 100644 kernel/locking/mutex-rt.c
--- /dev/null
+++ b/include/linux/mutex_rt.h
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_MUTEX_RT_H
+#define __LINUX_MUTEX_RT_H
+
@@ -44,7 +46,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
+extern void __lockfunc _mutex_lock(struct mutex *lock);
+extern void __lockfunc _mutex_lock_io(struct mutex *lock);
+extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass);
+extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
+extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
@@ -61,7 +62,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#define mutex_lock_killable(l) _mutex_lock_killable(l)
+#define mutex_trylock(l) _mutex_trylock(l)
+#define mutex_unlock(l) _mutex_unlock(l)
+#define mutex_lock_io(l) _mutex_lock_io(l);
+#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0);
+
+#define __mutex_owner(l) ((l)->lock.owner)
+
@@ -92,7 +93,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+# define mutex_lock_killable_nested(l, s) \
+ _mutex_lock_killable(l)
+# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
+# define mutex_lock_io_nested(l, s) _mutex_lock_io(l)
+# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s)
+#endif
+
+# define mutex_init(mutex) \
@@ -146,10 +147,9 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
--- /dev/null
+++ b/kernel/locking/mutex-rt.c
@@ -0,0 +1,223 @@
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
@@ -215,6 +215,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/hrtimer.h>
+#include <linux/blkdev.h>
+
+#include "rtmutex_common.h"
+
@@ -235,55 +236,24 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+}
+EXPORT_SYMBOL(__mutex_do_init);
+
+static int _mutex_lock_blk_flush(struct mutex *lock, int state)
+{
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+ return __rt_mutex_lock_state(&lock->lock, state);
+}
+
+void __lockfunc _mutex_lock(struct mutex *lock)
+{
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock);
+
+void __lockfunc _mutex_lock_io(struct mutex *lock)
+{
+ int token;
+
+ token = io_schedule_prepare();
+ _mutex_lock(lock);
+ io_schedule_finish(token);
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_io);
+
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass)
+{
+ int token;
@@ -297,10 +267,42 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+}
+EXPORT_SYMBOL_GPL(_mutex_lock_io_nested);
+
+int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible);
+
+int __lockfunc _mutex_lock_killable(struct mutex *lock)
+{
+ int ret;
+
+ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
+{
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nested);
+
+void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
+ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE);
+ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(_mutex_lock_nest_lock);
+
@@ -309,9 +311,9 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ int ret;
+
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE);
+ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
@@ -321,9 +323,9 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ int ret;
+
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE);
+ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE);
+ if (ret)
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
+ return ret;
+}
+EXPORT_SYMBOL(_mutex_lock_killable_nested);
@@ -342,7 +344,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+void __lockfunc _mutex_unlock(struct mutex *lock)
+{
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
+ mutex_release(&lock->dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(_mutex_unlock);

View File

@@ -1,6 +1,7 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 17:28:34 +0200
Subject: rtmutex: add rwsem implementation based on rtmutex
Subject: [PATCH 18/22] locking/rtmutex: add rwsem implementation based on
rtmutex
The RT specific R/W semaphore implementation restricts the number of readers
to one because a writer cannot block on multiple readers and inherit its
@@ -14,7 +15,7 @@ The single reader restricting is painful in various ways:
- Progress blocker for drivers which are carefully crafted to avoid the
potential reader/writer deadlock in mainline.
The analysis of the writer code pathes shows, that properly written RT tasks
The analysis of the writer code paths shows that properly written RT tasks
should not take them. Syscalls like mmap(), file access which take mmap sem
write locked have unbound latencies which are completely unrelated to mmap
sem. Other R/W sem users like graphics drivers are not suitable for RT tasks
@@ -41,15 +42,16 @@ the approach.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rwsem-rt.h | 68 ++++++++++
kernel/locking/rwsem-rt.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 361 insertions(+)
include/linux/rwsem-rt.h | 70 ++++++++++
kernel/locking/rwsem-rt.c | 318 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 388 insertions(+)
create mode 100644 include/linux/rwsem-rt.h
create mode 100644 kernel/locking/rwsem-rt.c
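
The core of the new scheme is a reader-bias counter next to the rtmutex: readers only touch the atomic counter and never block one another, while a writer owns the rtmutex and flips the bias. A sketch of the reader fast path (bias constants are assumptions here; the counter field matches the readers atomic used by this series, and the same loop shape appears in the rwlock patch that follows):

#define READER_BIAS	(1U << 31)
#define WRITER_BIAS	(1U << 30)

static int __down_read_trylock(struct rw_semaphore *sem)
{
	int r, old;

	/* Take a reader reference while the count is still reader-biased,
	 * i.e. negative; a writer has removed the bias, making it >= 0.
	 */
	for (r = atomic_read(&sem->readers); r < 0;) {
		old = atomic_cmpxchg(&sem->readers, r, r + 1);
		if (likely(old == r))
			return 1;
		r = old;
	}
	return 0;
}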
--- /dev/null
+++ b/include/linux/rwsem-rt.h
@@ -0,0 +1,68 @@
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _LINUX_RWSEM_RT_H
+#define _LINUX_RWSEM_RT_H
+
@@ -108,6 +110,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+}
+
+extern void __down_read(struct rw_semaphore *sem);
+extern int __down_read_interruptible(struct rw_semaphore *sem);
+extern int __down_read_killable(struct rw_semaphore *sem);
+extern int __down_read_trylock(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
@@ -120,13 +123,13 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
--- /dev/null
+++ b/kernel/locking/rwsem-rt.c
@@ -0,0 +1,293 @@
+/*
+ */
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/rwsem.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/signal.h>
+#include <linux/export.h>
+#include <linux/blkdev.h>
+
+#include "rtmutex_common.h"
+
@@ -211,6 +214,13 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ if (__down_read_trylock(sem))
+ return 0;
+
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
+ might_sleep();
+ raw_spin_lock_irq(&m->wait_lock);
+ /*
@@ -280,6 +290,17 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ WARN_ON_ONCE(ret);
+}
+
+int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ int ret;
+
+ ret = __down_read_common(sem, TASK_INTERRUPTIBLE);
+ if (likely(!ret))
+ return ret;
+ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret);
+ return -EINTR;
+}
+
+int __down_read_killable(struct rw_semaphore *sem)
+{
+ int ret;
@@ -333,6 +354,13 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ struct rt_mutex *m = &sem->rtmutex;
+ unsigned long flags;
+
+ /*
+ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too
+ * late if one of the callbacks needs to acquire a sleeping lock.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
+ /* Take the rtmutex as a first step */
+ if (__rt_mutex_lock_state(m, state))
+ return -EINTR;

View File

@@ -1,23 +1,26 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 17:18:06 +0200
Subject: rtmutex: add rwlock implementation based on rtmutex
Subject: [PATCH 19/22] locking/rtmutex: add rwlock implementation based on
rtmutex
The implementation is bias-based, similar to the rwsem implementation.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/rwlock_rt.h | 119 ++++++++++++
include/linux/rwlock_types_rt.h | 55 +++++
kernel/locking/rwlock-rt.c | 368 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 542 insertions(+)
include/linux/rwlock_rt.h | 109 +++++++++++++
include/linux/rwlock_types_rt.h | 56 ++++++
kernel/Kconfig.locks | 2
kernel/locking/rwlock-rt.c | 328 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 494 insertions(+), 1 deletion(-)
create mode 100644 include/linux/rwlock_rt.h
create mode 100644 include/linux/rwlock_types_rt.h
create mode 100644 kernel/locking/rwlock-rt.c
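
The writer side of the bias scheme, simplified (helper names here are assumptions; compare the rwlock-rt.c hunks below): the writer serializes on the rtmutex, removes the reader bias so new readers take the slow path, and then waits for the active readers to drain.

static void __write_rt_lock(struct rt_rw_lock *lock)
{
	struct rt_mutex *m = &lock->rtmutex;

	/* Take the rtmutex as a first step: one writer at a time. */
	__rt_spin_lock(m);

	/* Force new readers into the slow path. */
	atomic_sub(READER_BIAS, &lock->readers);

	for (;;) {
		raw_spin_lock_irq(&m->wait_lock);
		if (!atomic_read(&lock->readers)) {
			/* All readers drained; mark the lock write locked. */
			atomic_set(&lock->readers, WRITER_BIAS);
			raw_spin_unlock_irq(&m->wait_lock);
			return;
		}
		set_current_state(TASK_UNINTERRUPTIBLE);
		raw_spin_unlock_irq(&m->wait_lock);
		schedule();	/* woken by the last reader unlock */
	}
}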
--- /dev/null
+++ b/include/linux/rwlock_rt.h
@@ -0,0 +1,119 @@
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_RWLOCK_RT_H
+#define __LINUX_RWLOCK_RT_H
+
@@ -43,7 +46,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags)
+{
+ /* XXX ARCH_IRQ_ENABLED */
+ *flags = 0;
+ return rt_write_trylock(lock);
+}
@@ -126,20 +128,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ __rt_rwlock_init(rwl, #rwl, &__key); \
+} while (0)
+
+/*
+ * Internal functions made global for CPU pinning
+ */
+void __read_rt_lock(struct rt_rw_lock *lock);
+int __read_rt_trylock(struct rt_rw_lock *lock);
+void __write_rt_lock(struct rt_rw_lock *lock);
+int __write_rt_trylock(struct rt_rw_lock *lock);
+void __read_rt_unlock(struct rt_rw_lock *lock);
+void __write_rt_unlock(struct rt_rw_lock *lock);
+
+#endif
--- /dev/null
+++ b/include/linux/rwlock_types_rt.h
@@ -0,0 +1,55 @@
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __LINUX_RWLOCK_TYPES_RT_H
+#define __LINUX_RWLOCK_TYPES_RT_H
+
@@ -195,11 +188,21 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ } while (0)
+
+#endif
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
--- /dev/null
+++ b/kernel/locking/rwlock-rt.c
@@ -0,0 +1,368 @@
+/*
+ */
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include <linux/export.h>
+
@@ -262,7 +265,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ lock->rtmutex.save_state = 1;
+}
+
+int __read_rt_trylock(struct rt_rw_lock *lock)
+static int __read_rt_trylock(struct rt_rw_lock *lock)
+{
+ int r, old;
+
@@ -279,7 +282,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ return 0;
+}
+
+void __sched __read_rt_lock(struct rt_rw_lock *lock)
+static void __read_rt_lock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct rt_mutex_waiter waiter;
@@ -342,7 +345,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ debug_rt_mutex_free_waiter(&waiter);
+}
+
+void __read_rt_unlock(struct rt_rw_lock *lock)
+static void __read_rt_unlock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct task_struct *tsk;
@@ -378,7 +381,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ rt_spin_lock_slowunlock(m);
+}
+
+void __sched __write_rt_lock(struct rt_rw_lock *lock)
+static void __write_rt_lock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ struct task_struct *self = current;
@@ -422,7 +425,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ }
+}
+
+int __write_rt_trylock(struct rt_rw_lock *lock)
+static int __write_rt_trylock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ unsigned long flags;
@@ -442,7 +445,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ return 0;
+}
+
+void __write_rt_unlock(struct rt_rw_lock *lock)
+static void __write_rt_unlock(struct rt_rw_lock *lock)
+{
+ struct rt_mutex *m = &lock->rtmutex;
+ unsigned long flags;
@@ -451,43 +454,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ __write_unlock_common(lock, WRITER_BIAS, flags);
+}
+
+/* Map the reader biased implementation */
+static inline int do_read_rt_trylock(rwlock_t *rwlock)
+{
+ return __read_rt_trylock(rwlock);
+}
+
+static inline int do_write_rt_trylock(rwlock_t *rwlock)
+{
+ return __write_rt_trylock(rwlock);
+}
+
+static inline void do_read_rt_lock(rwlock_t *rwlock)
+{
+ __read_rt_lock(rwlock);
+}
+
+static inline void do_write_rt_lock(rwlock_t *rwlock)
+{
+ __write_rt_lock(rwlock);
+}
+
+static inline void do_read_rt_unlock(rwlock_t *rwlock)
+{
+ __read_rt_unlock(rwlock);
+}
+
+static inline void do_write_rt_unlock(rwlock_t *rwlock)
+{
+ __write_rt_unlock(rwlock);
+}
+
+static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name,
+ struct lock_class_key *key)
+{
+ __rwlock_biased_rt_init(rwlock, name, key);
+}
+
+int __lockfunc rt_read_can_lock(rwlock_t *rwlock)
+{
+ return atomic_read(&rwlock->readers) < 0;
@@ -505,12 +471,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+{
+ int ret;
+
+ migrate_disable();
+ ret = do_read_rt_trylock(rwlock);
+ if (ret)
+ ret = __read_rt_trylock(rwlock);
+ if (ret) {
+ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_read_trylock);
@@ -519,50 +484,49 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+{
+ int ret;
+
+ migrate_disable();
+ ret = do_write_rt_trylock(rwlock);
+ if (ret)
+ ret = __write_rt_trylock(rwlock);
+ if (ret) {
+ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+ migrate_disable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rt_write_trylock);
+
+void __lockfunc rt_read_lock(rwlock_t *rwlock)
+{
+ migrate_disable();
+ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_);
+ do_read_rt_lock(rwlock);
+ __read_rt_lock(rwlock);
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_read_lock);
+
+void __lockfunc rt_write_lock(rwlock_t *rwlock)
+{
+ migrate_disable();
+ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
+ do_write_rt_lock(rwlock);
+ __write_rt_lock(rwlock);
+ migrate_disable();
+}
+EXPORT_SYMBOL(rt_write_lock);
+
+void __lockfunc rt_read_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ do_read_rt_unlock(rwlock);
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ __read_rt_unlock(rwlock);
+}
+EXPORT_SYMBOL(rt_read_unlock);
+
+void __lockfunc rt_write_unlock(rwlock_t *rwlock)
+{
+ rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
+ do_write_rt_unlock(rwlock);
+ rwlock_release(&rwlock->dep_map, _RET_IP_);
+ migrate_enable();
+ __write_rt_unlock(rwlock);
+}
+EXPORT_SYMBOL(rt_write_unlock);
+
+void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
+{
+ do_rwlock_rt_init(rwlock, name, key);
+ __rwlock_biased_rt_init(rwlock, name, key);
+}
+EXPORT_SYMBOL(__rt_rwlock_init);

View File

@@ -1,30 +1,35 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 12 Oct 2017 17:31:14 +0200
Subject: rtmutex: wire up RT's locking
Subject: [PATCH 20/22] locking/rtmutex: wire up RT's locking
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/mutex.h | 20 +++++++++++++-------
include/linux/rwsem.h | 12 ++++++++++++
include/linux/spinlock.h | 12 +++++++++++-
include/linux/spinlock_api_smp.h | 4 +++-
include/linux/spinlock_types.h | 11 ++++++++---
kernel/locking/Makefile | 10 +++++++---
kernel/locking/rwsem.c | 7 +++++++
kernel/locking/spinlock.c | 7 +++++++
kernel/locking/spinlock_debug.c | 5 +++++
9 files changed, 73 insertions(+), 15 deletions(-)
include/linux/mutex.h | 26 ++++++++++++++++----------
include/linux/rwsem.h | 12 ++++++++++++
include/linux/spinlock.h | 12 +++++++++++-
include/linux/spinlock_api_smp.h | 4 +++-
include/linux/spinlock_types.h | 11 ++++++++---
include/linux/spinlock_types_up.h | 2 +-
kernel/Kconfig.preempt | 1 +
kernel/locking/Makefile | 10 +++++++---
kernel/locking/rwsem.c | 6 ++++++
kernel/locking/spinlock.c | 7 +++++++
kernel/locking/spinlock_debug.c | 5 +++++
11 files changed, 77 insertions(+), 19 deletions(-)
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -22,6 +22,17 @@
@@ -22,6 +22,20 @@
struct ww_acquire_ctx;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
+ , .dep_map = { .name = #lockname }
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
+ , .dep_map = { \
+ .name = #lockname, \
+ .wait_type_inner = LD_WAIT_SLEEP, \
+ }
+#else
+# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+#endif
@@ -36,13 +41,16 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Simple, straightforward mutexes with strict semantics:
*
@@ -108,13 +119,6 @@ do { \
@@ -119,16 +133,6 @@ do { \
__mutex_init((mutex), #mutex, &__key); \
} while (0)
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
- , .dep_map = { .name = #lockname }
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
- , .dep_map = { \
- .name = #lockname, \
- .wait_type_inner = LD_WAIT_SLEEP, \
- }
-#else
-# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
-#endif
@@ -50,7 +58,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#define __MUTEX_INITIALIZER(lockname) \
{ .owner = ATOMIC_LONG_INIT(0) \
, .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
@@ -210,4 +214,6 @@ enum mutex_trylock_recursive_enum {
@@ -224,4 +228,6 @@ enum mutex_trylock_recursive_enum {
extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
mutex_trylock_recursive(struct mutex *lock);
@@ -71,7 +79,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif
@@ -115,6 +120,13 @@ static inline int rwsem_is_contended(str
@@ -119,6 +124,13 @@ static inline int rwsem_is_contended(str
return !list_empty(&sem->wait_list);
}
@@ -87,7 +95,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*/
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -307,7 +307,11 @@ static inline void do_raw_spin_unlock(ra
@@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(ra
})
/* Include rwlock functions */
@@ -100,7 +108,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
@@ -318,6 +322,10 @@ static inline void do_raw_spin_unlock(ra
@@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(ra
# include <linux/spinlock_api_up.h>
#endif
@@ -111,7 +119,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* Map the spin_lock functions to the raw variants for PREEMPT_RT=n
*/
@@ -438,6 +446,8 @@ static __always_inline int spin_is_conte
@@ -454,6 +462,8 @@ static __always_inline int spin_is_conte
#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock)
@@ -151,6 +159,27 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
#endif /* __LINUX_SPINLOCK_TYPES_H */
--- a/include/linux/spinlock_types_up.h
+++ b/include/linux/spinlock_types_up.h
@@ -1,7 +1,7 @@
#ifndef __LINUX_SPINLOCK_TYPES_UP_H
#define __LINUX_SPINLOCK_TYPES_UP_H
-#ifndef __LINUX_SPINLOCK_TYPES_H
+#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H)
# error "please don't include this file directly"
#endif
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -59,6 +59,7 @@ config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
depends on EXPERT && ARCH_SUPPORTS_RT
select PREEMPTION
+ select RT_MUTEXES
help
This option turns the kernel into a real-time kernel by replacing
various locking primitives (spinlocks, rwlocks, etc.) with
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
@@ -160,9 +189,9 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+obj-y += semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -12,19 +12,23 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS
# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
@@ -15,19 +15,23 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS
CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
endif
@@ -198,15 +227,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#include "lock_events.h"
/*
@@ -1332,6 +1333,7 @@ static struct rw_semaphore *rwsem_downgr
return sem;
}
+
/*
* lock for reading
*/
@@ -1482,6 +1484,7 @@ static inline void __downgrade_write(str
@@ -1343,6 +1344,7 @@ static inline void __downgrade_write(str
if (tmp & RWSEM_FLAG_WAITERS)
rwsem_downgrade_wake(sem);
}
@@ -214,36 +235,26 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* lock for reading
@@ -1613,6 +1616,7 @@ void _down_write_nest_lock(struct rw_sem
}
EXPORT_SYMBOL(_down_write_nest_lock);
+#ifndef CONFIG_PREEMPT_RT
void down_read_non_owner(struct rw_semaphore *sem)
@@ -1506,7 +1508,9 @@ void down_read_non_owner(struct rw_semap
{
might_sleep();
@@ -1620,6 +1624,7 @@ void down_read_non_owner(struct rw_semap
__down_read(sem);
+#ifndef CONFIG_PREEMPT_RT
__rwsem_set_reader_owned(sem, NULL);
+#endif
}
EXPORT_SYMBOL(down_read_non_owner);
+#endif
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
@@ -1644,11 +1649,13 @@ int __sched down_write_killable_nested(s
}
EXPORT_SYMBOL(down_write_killable_nested);
@@ -1535,7 +1539,9 @@ EXPORT_SYMBOL(down_write_killable_nested
+#ifndef CONFIG_PREEMPT_RT
void up_read_non_owner(struct rw_semaphore *sem)
{
+#ifndef CONFIG_PREEMPT_RT
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+#endif
__up_read(sem);
}
EXPORT_SYMBOL(up_read_non_owner);
+#endif
#endif
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(loc
@@ -1,14 +1,50 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 12 Oct 2017 17:34:38 +0200
Subject: rtmutex: add ww_mutex addon for mutex-rt
Subject: [PATCH 21/22] locking/rtmutex: add ww_mutex addon for mutex-rt
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/locking/rtmutex.c | 271 ++++++++++++++++++++++++++++++++++++++--
include/linux/mutex.h | 8 -
include/linux/ww_mutex.h | 8 +
kernel/locking/rtmutex.c | 262 ++++++++++++++++++++++++++++++++++++++--
kernel/locking/rtmutex_common.h | 2
kernel/locking/rwsem-rt.c | 2
3 files changed, 261 insertions(+), 14 deletions(-)
5 files changed, 262 insertions(+), 20 deletions(-)
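For orientation, the wound/wait API being re-implemented on top of rtmutex here is consumed via the usual acquire/backoff pattern, roughly as in the kernel-style sketch below (demo_ww_class and lock_pair() are invented names for illustration; error handling is trimmed and this is not code from the patch):

	static DEFINE_WW_CLASS(demo_ww_class);

	/* Take two ww_mutexes of the same class; back off on -EDEADLK. */
	static void lock_pair(struct ww_mutex *a, struct ww_mutex *b)
	{
		struct ww_acquire_ctx ctx;

		ww_acquire_init(&ctx, &demo_ww_class);

		ww_mutex_lock(a, &ctx);	/* no lock held yet: cannot return -EDEADLK */
		while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
			/* We lost against an older context: drop what we
			 * hold, sleep on the contended lock, then retry. */
			ww_mutex_unlock(a);
			ww_mutex_lock_slow(b, &ctx);
			swap(a, b);
		}
		ww_acquire_done(&ctx);

		/* ... both locks held ... */

		ww_mutex_unlock(a);
		ww_mutex_unlock(b);
		ww_acquire_fini(&ctx);
	}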
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -82,14 +82,6 @@ struct mutex {
struct ww_class;
struct ww_acquire_ctx;
-struct ww_mutex {
- struct mutex base;
- struct ww_acquire_ctx *ctx;
-#ifdef CONFIG_DEBUG_MUTEXES
- struct ww_class *ww_class;
-#endif
-};
-
/*
* This is the control structure for tasks blocked on mutex,
* which resides on the blocked task's kernel stack:
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -28,6 +28,14 @@ struct ww_class {
unsigned int is_wait_die;
};
+struct ww_mutex {
+ struct mutex base;
+ struct ww_acquire_ctx *ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+ struct ww_class *ww_class;
+#endif
+};
+
struct ww_acquire_ctx {
struct task_struct *task;
unsigned long stamp;
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,6 +24,7 @@
@@ -19,7 +55,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#include "rtmutex_common.h"
@@ -1244,6 +1245,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init);
@@ -1234,6 +1235,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init);
#endif /* PREEMPT_RT */
@@ -60,7 +96,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static inline int
try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
@@ -1522,7 +1557,8 @@ void rt_mutex_init_waiter(struct rt_mute
@@ -1512,7 +1547,8 @@ void rt_mutex_init_waiter(struct rt_mute
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
@@ -70,7 +106,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
int ret = 0;
@@ -1540,6 +1576,12 @@ static int __sched
@@ -1530,6 +1566,12 @@ static int __sched
break;
}
@@ -82,8 +118,8 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1574,16 +1616,106 @@ static void rt_mutex_handle_deadlock(int
schedule();
@@ -1558,16 +1600,106 @@ static void rt_mutex_handle_deadlock(int
}
}
@@ -191,7 +227,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
set_current_state(state);
@@ -1593,14 +1725,24 @@ int __sched rt_mutex_slowlock_locked(str
@@ -1577,14 +1709,24 @@ int __sched rt_mutex_slowlock_locked(str
ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
@@ -219,7 +255,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/*
@@ -1617,7 +1759,8 @@ int __sched rt_mutex_slowlock_locked(str
@@ -1601,7 +1743,8 @@ int __sched rt_mutex_slowlock_locked(str
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
@@ -229,7 +265,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct rt_mutex_waiter waiter;
unsigned long flags;
@@ -1635,7 +1778,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
@@ -1619,7 +1762,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
@@ -239,7 +275,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1765,29 +1909,33 @@ static bool __sched rt_mutex_slowunlock(
@@ -1749,14 +1893,16 @@ static bool __sched rt_mutex_slowunlock(
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
@@ -258,26 +294,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- enum rtmutex_chainwalk chwalk))
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- return slowfn(lock, state, timeout, chwalk);
+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
}
static inline int
@@ -1832,7 +1980,7 @@ rt_mutex_fastunlock(struct rt_mutex *loc
@@ -1801,7 +1947,7 @@ rt_mutex_fastunlock(struct rt_mutex *loc
int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state)
{
might_sleep();
@@ -286,15 +303,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/**
@@ -1952,6 +2100,7 @@ rt_mutex_timed_lock(struct rt_mutex *loc
mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
RT_MUTEX_MIN_CHAINWALK,
+ NULL,
rt_mutex_slowlock);
if (ret)
mutex_release(&lock->dep_map, 1, _RET_IP_);
@@ -2321,7 +2470,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
@@ -2245,7 +2391,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
@@ -303,7 +312,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
@@ -2391,3 +2540,99 @@ bool rt_mutex_cleanup_proxy_lock(struct
@@ -2315,3 +2461,97 @@ bool rt_mutex_cleanup_proxy_lock(struct
return cleanup;
}
@@ -312,7 +321,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+ unsigned tmp;
+ unsigned int tmp;
+
+ if (ctx->deadlock_inject_countdown-- == 0) {
+ tmp = ctx->deadlock_inject_interval;
@@ -347,7 +356,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0,
+ ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
+ mutex_release(&lock->base.dep_map, _RET_IP_);
+ else if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
@@ -367,7 +376,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0,
+ ctx);
+ if (ret)
+ mutex_release(&lock->base.dep_map, 1, _RET_IP_);
+ mutex_release(&lock->base.dep_map, _RET_IP_);
+ else if (!ret && ctx && ctx->acquired > 1)
+ return ww_mutex_deadlock_injection(lock, ctx);
+
@@ -377,13 +386,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+ int nest = !!lock->ctx;
+
+ /*
+ * The unlocking fastpath is the 0->1 transition from 'locked'
+ * into 'unlocked' state:
+ */
+ if (nest) {
+ if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
@@ -392,7 +399,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ lock->ctx = NULL;
+ }
+
+ mutex_release(&lock->base.dep_map, nest, _RET_IP_);
+ mutex_release(&lock->base.dep_map, _RET_IP_);
+ __rt_mutex_unlock(&lock->base.lock);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
@@ -405,7 +412,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+#endif
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -165,6 +165,7 @@ extern void rt_mutex_postunlock(struct w
@@ -159,6 +159,7 @@ extern void rt_mutex_postunlock(struct w
struct wake_q_head *wake_sleeper_q);
/* RW semaphore special interface */
@@ -413,7 +420,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state);
extern int __rt_mutex_trylock(struct rt_mutex *lock);
@@ -172,6 +173,7 @@ extern void __rt_mutex_unlock(struct rt_
@@ -166,6 +167,7 @@ extern void __rt_mutex_unlock(struct rt_
int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk,
@@ -423,7 +430,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
struct rt_mutex_waiter *waiter,
--- a/kernel/locking/rwsem-rt.c
+++ b/kernel/locking/rwsem-rt.c
@@ -131,7 +131,7 @@ static int __sched __down_read_common(st
@@ -138,7 +138,7 @@ static int __sched __down_read_common(st
*/
rt_mutex_init_waiter(&waiter, false);
ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK,
@@ -0,0 +1,224 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 6 Oct 2020 13:07:17 +0200
Subject: [PATCH 22/22] locking/rtmutex: Use custom scheduling function for
spin-schedule()
PREEMPT_RT builds the rwsem, mutex, spinlock and rwlock typed locks on
top of an rtmutex lock. While blocked, task->pi_blocked_on is set
(tsk_is_pi_blocked()) and the task needs to schedule away while waiting.

The schedule process must distinguish between blocking on a regular
sleeping lock (rwsem and mutex) and an RT-only sleeping lock (spinlock
and rwlock):

- rwsem and mutex must flush block requests (blk_schedule_flush_plug())
  even if blocked on a lock. This cannot deadlock because it also
  happens on non-RT kernels.
  There should be a warning if the scheduling point is within an RCU
  read section.

- spinlock and rwlock must not flush block requests. This would
  deadlock if the flush callback attempts to acquire a lock which is
  already held.
  As with being preempted, there should be no warning if the
  scheduling point is within an RCU read section.

Add preempt_schedule_lock(), which is invoked if scheduling is required
while blocking on a PREEMPT_RT-only sleeping lock.

Remove tsk_is_pi_blocked() from the scheduler path; it is no longer
needed with the additional scheduler entry point.
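That split can be sketched as follows (illustrative only: block_on_lock_sketch() and its boolean parameter are invented names; in the patch itself the information travels through the new spinning_lock argument of __schedule()):

	/* Sketch, not part of this patch: how the two lock classes
	 * diverge when a blocked task has to schedule away. */
	static void block_on_lock_sketch(bool rt_spinning_lock)
	{
		if (rt_spinning_lock) {
			/* spinlock_t/rwlock_t on PREEMPT_RT: never flush
			 * the block plug (the flush callback might need a
			 * lock this task already holds) and raise no RCU
			 * warning, just as for an ordinary preemption. */
			preempt_schedule_lock();
		} else {
			/* rwsem/mutex: flush plugged I/O first, exactly as
			 * on a non-RT kernel, then block normally. */
			blk_schedule_flush_plug(current);
			schedule();
		}
	}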
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm64/include/asm/preempt.h | 3 +++
arch/x86/include/asm/preempt.h | 3 +++
include/asm-generic/preempt.h | 3 +++
include/linux/sched/rt.h | 8 --------
kernel/locking/rtmutex.c | 2 +-
kernel/locking/rwlock-rt.c | 2 +-
kernel/sched/core.c | 32 +++++++++++++++++++++-----------
7 files changed, 32 insertions(+), 21 deletions(-)
--- a/arch/arm64/include/asm/preempt.h
+++ b/arch/arm64/include/asm/preempt.h
@@ -81,6 +81,9 @@ static inline bool should_resched(int pr
#ifdef CONFIG_PREEMPTION
void preempt_schedule(void);
+#ifdef CONFIG_PREEMPT_RT
+void preempt_schedule_lock(void);
+#endif
#define __preempt_schedule() preempt_schedule()
void preempt_schedule_notrace(void);
#define __preempt_schedule_notrace() preempt_schedule_notrace()
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -103,6 +103,9 @@ static __always_inline bool should_resch
}
#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_RT
+ extern void preempt_schedule_lock(void);
+#endif
extern asmlinkage void preempt_schedule_thunk(void);
# define __preempt_schedule() \
asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -79,6 +79,9 @@ static __always_inline bool should_resch
}
#ifdef CONFIG_PREEMPTION
+#ifdef CONFIG_PREEMPT_RT
+extern void preempt_schedule_lock(void);
+#endif
extern asmlinkage void preempt_schedule(void);
#define __preempt_schedule() preempt_schedule()
extern asmlinkage void preempt_schedule_notrace(void);
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mut
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return tsk->pi_blocked_on != NULL;
-}
#else
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return false;
-}
#endif
extern void normalize_rt_tasks(void);
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1067,7 +1067,7 @@ void __sched rt_spin_lock_slowlock_locke
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
if (top_waiter != waiter || adaptive_wait(lock, lock_owner))
- schedule();
+ preempt_schedule_lock();
raw_spin_lock_irqsave(&lock->wait_lock, flags);
--- a/kernel/locking/rwlock-rt.c
+++ b/kernel/locking/rwlock-rt.c
@@ -211,7 +211,7 @@ static void __write_rt_lock(struct rt_rw
raw_spin_unlock_irqrestore(&m->wait_lock, flags);
if (atomic_read(&lock->readers) != 0)
- schedule();
+ preempt_schedule_lock();
raw_spin_lock_irqsave(&m->wait_lock, flags);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5006,7 +5006,7 @@ pick_next_task(struct rq *rq, struct tas
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(bool preempt, bool spinning_lock)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -5059,7 +5059,7 @@ static void __sched notrace __schedule(b
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = prev->state;
- if (!preempt && prev_state) {
+ if ((!preempt || spinning_lock) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
prev->state = TASK_RUNNING;
} else {
@@ -5143,7 +5143,7 @@ void __noreturn do_task_dead(void)
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(false, false);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -5176,9 +5176,6 @@ static inline void sched_submit_work(str
preempt_enable_no_resched();
}
- if (tsk_is_pi_blocked(tsk))
- return;
-
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -5204,7 +5201,7 @@ asmlinkage __visible void __sched schedu
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(false, false);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
@@ -5232,7 +5229,7 @@ void __sched schedule_idle(void)
*/
WARN_ON_ONCE(current->state);
do {
- __schedule(false);
+ __schedule(false, false);
} while (need_resched());
}
@@ -5285,7 +5282,7 @@ static void __sched notrace preempt_sche
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(true, false);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
@@ -5315,6 +5312,19 @@ asmlinkage __visible void __sched notrac
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace preempt_schedule_lock(void)
+{
+ do {
+ preempt_disable();
+ __schedule(true, true);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+NOKPROBE_SYMBOL(preempt_schedule_lock);
+EXPORT_SYMBOL(preempt_schedule_lock);
+#endif
+
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
*
@@ -5358,7 +5368,7 @@ asmlinkage __visible void __sched notrac
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(true, false);
exception_exit(prev_ctx);
preempt_latency_stop(1);
@@ -5387,7 +5397,7 @@ asmlinkage __visible void __sched preemp
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(true, false);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());